loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopllm/evaluators.py ADDED
@@ -0,0 +1,419 @@
1
+ """Built-in evaluators for scoring LLM outputs."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ import re
6
+ from typing import Any, Callable
7
+
8
+ from loopllm.engine import EvaluationResult
9
+
10
+
11
class ThresholdEvaluator:
    """Evaluator that delegates scoring to a callable and applies a pass/fail threshold.

    Args:
        scorer: A callable ``(output, context) -> float`` returning a score in [0, 1].
        threshold: Minimum score to pass.
        name: Human-readable evaluator name; used as the sub-score key and
            as the prefix of the deficiency message.
    """

    def __init__(
        self,
        scorer: Callable[[str, dict[str, Any]], float],
        threshold: float = 0.7,
        name: str = "threshold",
    ) -> None:
        self.scorer = scorer
        self.threshold = threshold
        self.name = name

    @staticmethod
    def _clamp_score(raw: float) -> float:
        """Clamp *raw* into [0, 1], mapping NaN to 0.0.

        ``max(0.0, min(1.0, nan))`` evaluates to 1.0 because every ordering
        comparison against NaN is false, so without the explicit check an
        undefined score would be reported as a perfect one.
        """
        if raw != raw:  # NaN is the only value that is not equal to itself
            return 0.0
        return max(0.0, min(1.0, raw))

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Score *output* and return pass/fail based on threshold.

        Args:
            output: The text to evaluate.
            context: Optional context dict passed to the scorer; an empty
                dict is substituted when it is ``None`` (or empty).

        Returns:
            :class:`EvaluationResult` with the score clamped to [0, 1]
            (NaN counts as 0.0) and one deficiency when it is below the
            threshold.
        """
        ctx = context or {}
        score = self._clamp_score(self.scorer(output, ctx))
        passed = score >= self.threshold
        deficiencies: list[str] = []
        if not passed:
            deficiencies.append(
                f"{self.name}: score {score:.2f} below threshold {self.threshold:.2f}"
            )
        return EvaluationResult(
            score=score,
            passed=passed,
            deficiencies=deficiencies,
            sub_scores={self.name: score},
        )
55
+
56
+
57
class RegexEvaluator:
    """Evaluator that checks for required and forbidden regex patterns.

    Patterns are compiled case-insensitively.

    Args:
        required: Patterns that must be present in the output.
        forbidden: Patterns that must NOT be present in the output.
    """

    def __init__(
        self,
        required: list[str] | None = None,
        forbidden: list[str] | None = None,
    ) -> None:
        # Snapshot the inputs once. Keeping a reference to the caller's list
        # (or consuming a one-shot iterable while compiling) would let the raw
        # and compiled sequences drift apart, and evaluate() would then
        # silently skip checks that are still counted in the denominator.
        self._required_raw = list(required) if required is not None else []
        self._forbidden_raw = list(forbidden) if forbidden is not None else []
        self.required = [re.compile(p, re.IGNORECASE) for p in self._required_raw]
        self.forbidden = [re.compile(p, re.IGNORECASE) for p in self._forbidden_raw]

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Check *output* against required/forbidden patterns.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with score = passing_checks / total_checks.
            The result passes only when every required pattern is found and
            no forbidden pattern is found; with no patterns configured the
            score is 1.0 and the result passes.
        """
        total_checks = len(self.required) + len(self.forbidden)
        if total_checks == 0:
            return EvaluationResult(score=1.0, passed=True)

        passing = 0
        deficiencies: list[str] = []

        for pattern, raw in zip(self.required, self._required_raw):
            if pattern.search(output):
                passing += 1
            else:
                deficiencies.append(f"Required pattern not found: {raw}")

        for pattern, raw in zip(self.forbidden, self._forbidden_raw):
            if pattern.search(output):
                deficiencies.append(f"Forbidden pattern found: {raw}")
            else:
                passing += 1

        score = passing / total_checks
        # Every failed check records exactly one deficiency, so "no
        # deficiencies" is the same as "all required present and no
        # forbidden found".
        passed = not deficiencies
        return EvaluationResult(
            score=score,
            passed=passed,
            deficiencies=deficiencies,
            sub_scores={"regex": score},
        )
116
+
117
+
118
class JSONSchemaEvaluator:
    """Evaluator that checks output is valid JSON matching a lightweight schema.

    Args:
        required_fields: Field names that must be present in the JSON object.
        field_types: Mapping of field names to expected Python types.
        must_be_object: If True, the top-level JSON value must be a dict.
    """

    def __init__(
        self,
        required_fields: list[str] | None = None,
        field_types: dict[str, type] | None = None,
        must_be_object: bool = True,
    ) -> None:
        self.required_fields = required_fields or []
        self.field_types = field_types or {}
        self.must_be_object = must_be_object

    def _score_fields(self, data: Any) -> tuple[dict[str, float], list[str]]:
        """Compute the field-level sub-scores and deficiencies for parsed *data*.

        A top-level value that is not a dict (possible only when
        ``must_be_object`` is False) is treated as having no fields, so
        required fields are reported missing instead of raising ``TypeError``
        on ``in`` / indexing.

        Returns:
            ``(sub_scores, deficiencies)`` where ``sub_scores`` holds
            ``fields_present`` and ``type_correct``. Missing-field messages
            come before type-mismatch messages.
        """
        fields = data if isinstance(data, dict) else {}
        scores: dict[str, float] = {}
        problems: list[str] = []

        # Required fields: fraction present (1.0 when none are configured).
        missing = [name for name in self.required_fields if name not in fields]
        if self.required_fields:
            found = len(self.required_fields) - len(missing)
            scores["fields_present"] = found / len(self.required_fields)
        else:
            scores["fields_present"] = 1.0
        problems.extend(f"Missing required field: {name}" for name in missing)

        # Types are only checked for fields that are actually present;
        # absent fields are already covered by the required-field check.
        checked = [(name, kind) for name, kind in self.field_types.items() if name in fields]
        wrong = [(name, kind) for name, kind in checked if not isinstance(fields[name], kind)]
        if checked:
            scores["type_correct"] = (len(checked) - len(wrong)) / len(checked)
        else:
            scores["type_correct"] = 1.0
        for name, kind in wrong:
            # getattr keeps the message usable if a tuple of types was given.
            expected = getattr(kind, "__name__", repr(kind))
            problems.append(
                f"Field '{name}' has type {type(fields[name]).__name__}, expected {expected}"
            )

        return scores, problems

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Validate *output* as JSON against the configured schema.

        Weighted sub-scores:
            - ``json_valid``: 0.2
            - ``is_object``: 0.1
            - ``fields_present``: 0.4
            - ``type_correct``: 0.3

        Args:
            output: The text to evaluate as JSON.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with weighted composite score. Output
            that cannot be parsed scores 0.0; a non-object value when
            ``must_be_object`` is True scores 0.2. Otherwise the result
            passes when the score is at least 0.7 and there are no
            deficiencies.
        """
        try:
            data = json.loads(output)
        except (json.JSONDecodeError, ValueError, TypeError):
            # TypeError covers non-string input such as None.
            return EvaluationResult(
                score=0.0,
                passed=False,
                deficiencies=["Invalid JSON"],
                sub_scores={"json_valid": 0.0, "is_object": 0.0, "fields_present": 0.0, "type_correct": 0.0},
            )

        sub_scores: dict[str, float] = {"json_valid": 1.0}

        if self.must_be_object and not isinstance(data, dict):
            sub_scores["is_object"] = 0.0
            sub_scores["fields_present"] = 0.0
            sub_scores["type_correct"] = 0.0
            return EvaluationResult(
                score=0.2 * sub_scores["json_valid"],
                passed=False,
                deficiencies=["Top-level value is not a JSON object"],
                sub_scores=sub_scores,
            )

        # Reaching here means the object requirement is satisfied (either the
        # value is a dict or the requirement is switched off).
        sub_scores["is_object"] = 1.0

        field_scores, deficiencies = self._score_fields(data)
        sub_scores.update(field_scores)

        score = (
            0.2 * sub_scores["json_valid"]
            + 0.1 * sub_scores["is_object"]
            + 0.4 * sub_scores["fields_present"]
            + 0.3 * sub_scores["type_correct"]
        )

        passed = score >= 0.7 and not deficiencies
        return EvaluationResult(
            score=score,
            passed=passed,
            deficiencies=deficiencies,
            sub_scores=sub_scores,
        )
222
+
223
+
224
class LengthEvaluator:
    """Evaluator that enforces character-count and word-count limits.

    Args:
        min_chars: Minimum number of characters required.
        max_chars: Maximum number of characters allowed.
        min_words: Minimum number of words required.
        max_words: Maximum number of words allowed.
    """

    def __init__(
        self,
        min_chars: int = 0,
        max_chars: int = 100_000,
        min_words: int = 0,
        max_words: int = 100_000,
    ) -> None:
        self.min_chars, self.max_chars = min_chars, max_chars
        self.min_words, self.max_words = min_words, max_words

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Compare the length of *output* with the configured bounds.

        Words are the whitespace-separated tokens of *output*.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` scoring 1.0 when every bound holds and
            0.3 as soon as any bound is violated, with one deficiency per
            violated bound.
        """
        n_chars = len(output)
        n_words = len(output.split())

        # (violated?, message) pairs, in the order they are reported.
        bound_checks = (
            (
                n_chars < self.min_chars,
                f"Too few characters: {n_chars} < minimum {self.min_chars}",
            ),
            (
                n_chars > self.max_chars,
                f"Too many characters: {n_chars} > maximum {self.max_chars}",
            ),
            (
                n_words < self.min_words,
                f"Too few words: {n_words} < minimum {self.min_words}",
            ),
            (
                n_words > self.max_words,
                f"Too many words: {n_words} > maximum {self.max_words}",
            ),
        )
        problems: list[str] = [message for violated, message in bound_checks if violated]

        within_bounds = not problems
        final_score = 1.0 if within_bounds else 0.3
        return EvaluationResult(
            score=final_score,
            passed=within_bounds,
            deficiencies=problems,
            sub_scores={"length": final_score},
        )
285
+
286
+
287
class ConsistencyEvaluator:
    """Evaluator that compares an output with the outputs of its dependencies.

    Keyword overlap stands in as a cheap proxy for semantic consistency:
    for each dependency, the fraction of its keywords that also occur in
    the evaluated output is measured.

    Args:
        dependency_outputs: List of output strings from dependency tasks.
        min_overlap: Minimum fraction of dependency keywords that must
            appear in the output.
    """

    # Common words that carry no signal and are dropped from keyword sets.
    _STOP_WORDS = frozenset({
        "that", "this", "with", "from", "have", "been", "will", "would",
        "could", "should", "which", "their", "there", "about", "after",
        "than", "them", "then", "were", "when", "what", "your", "also",
        "into", "each", "only", "other", "some", "such", "more", "very",
    })

    def __init__(
        self,
        dependency_outputs: list[str],
        min_overlap: float = 0.1,
    ) -> None:
        self.dependency_outputs = dependency_outputs
        self.min_overlap = min_overlap
        # Keyword sets are computed once, in the same order as the inputs.
        self._dep_keywords: list[set[str]] = [
            self._extract_keywords(text) for text in dependency_outputs
        ]

    @staticmethod
    def _extract_keywords(text: str) -> set[str]:
        """Return the lower-cased words of >= 4 chars in *text*, minus stop words."""
        candidates = re.findall(r"\b[a-zA-Z_]\w{3,}\b", text.lower())
        return set(candidates) - ConsistencyEvaluator._STOP_WORDS

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Measure how consistent *output* is with the dependency outputs.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` whose score is the mean per-dependency
            overlap. A dependency without keywords counts as fully
            consistent; with no dependencies at all the score is 1.0.
        """
        if not self._dep_keywords:
            return EvaluationResult(score=1.0, passed=True, sub_scores={"consistency": 1.0})

        own_keywords = self._extract_keywords(output)
        ratios: list[float] = []
        problems: list[str] = []

        for index, wanted in enumerate(self._dep_keywords):
            if wanted:
                ratio = len(own_keywords & wanted) / len(wanted)
                ratios.append(ratio)
                if ratio < self.min_overlap:
                    problems.append(
                        f"Low consistency with dependency {index}: "
                        f"{ratio:.1%} keyword overlap (min {self.min_overlap:.1%})"
                    )
            else:
                # Nothing to compare against for this dependency.
                ratios.append(1.0)

        mean_overlap = sum(ratios) / len(ratios) if ratios else 1.0
        ok = mean_overlap >= self.min_overlap and not problems
        return EvaluationResult(
            score=mean_overlap,
            passed=ok,
            deficiencies=problems,
            sub_scores={"consistency": mean_overlap},
        )
361
+
362
+
363
class CompletenessEvaluator:
    """Evaluator that verifies an output covers every required aspect.

    The aspects are free-text strings (e.g. quality criteria from an
    :class:`IntentSpec`); each one must be mentioned or addressed in the
    output.

    Args:
        required_aspects: List of strings that should be addressed.
    """

    def __init__(self, required_aspects: list[str]) -> None:
        self.required_aspects = required_aspects

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Score *output* by the share of required aspects it addresses.

        An aspect's keywords are its words of >= 4 chars. The aspect counts
        as addressed when at least half of those keywords occur in the
        output as case-insensitive substrings, or when it has no keywords.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with score = addressed / total aspects
            and one deficiency per unaddressed aspect.
        """
        aspects = self.required_aspects
        if not aspects:
            return EvaluationResult(score=1.0, passed=True, sub_scores={"completeness": 1.0})

        haystack = output.lower()
        covered = 0
        gaps: list[str] = []

        for aspect in aspects:
            terms = re.findall(r"\b[a-zA-Z_]\w{3,}\b", aspect.lower())
            hits = sum(1 for term in terms if term in haystack)
            # An aspect without keywords gives nothing to look for, so it is
            # accepted; otherwise half of its keywords must be present.
            if not terms or hits >= len(terms) * 0.5:
                covered += 1
            else:
                gaps.append(f"Aspect not addressed: {aspect}")

        ratio = covered / len(aspects)
        ok = ratio >= 0.8 and not gaps
        return EvaluationResult(
            score=ratio,
            passed=ok,
            deficiencies=gaps,
            sub_scores={"completeness": ratio},
        )
419
+
loopllm/guards.py ADDED
@@ -0,0 +1,254 @@
1
+ """Composable stop guards for adaptive agent loops."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ import time
6
+ from dataclasses import dataclass
7
+ from typing import TYPE_CHECKING, Protocol, runtime_checkable
8
+
9
+ from loopllm.adaptive_exit import BayesianExitCondition
10
+ from loopllm.engine import ExitConditionProtocol, ExitReason
11
+ from loopllm.priors import AdaptivePriors
12
+
13
+ if TYPE_CHECKING:
14
+ from loopllm.agent_loop import AgentLoopSession
15
+
16
# Default score delta below which PlateauGuard treats two steps as "no progress".
CONVERGENCE_DELTA = 0.01
# Default hard cap on agent-loop steps used by MaxStepsGuard.
MAX_STEPS_DEFAULT = 10
18
+
19
+
20
@dataclass
class GuardContext:
    """Snapshot of loop state handed to every guard's ``should_stop``."""

    # Session object the guards read limits from (and may record state on).
    session: AgentLoopSession
    # Step number of the iteration being judged.
    iteration: int
    # Score of the current step.
    current_score: float
    # Score history; PlateauGuard reads its last three entries.
    scores_so_far: list[float]
    # Text produced by the current step; empty when none is available.
    step_output: str = ""
29
+
30
+
31
@runtime_checkable
class AgentLoopGuard(Protocol):
    """Structural interface for a pluggable agent-loop stop condition.

    Any object with a compatible ``should_stop`` method qualifies; because
    the protocol is runtime-checkable, ``isinstance`` can be used to test
    for the method's presence.
    """

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a stop reason to halt the loop, or ``None`` to continue."""
        ...
36
+
37
+
38
class GuardStack:
    """Ordered collection of guards combined with OR semantics.

    Guards are consulted in list order and the first stop reason returned
    wins; guards after it are not consulted.
    """

    def __init__(self, guards: list[AgentLoopGuard]) -> None:
        self.guards = guards

    def evaluate(self, ctx: GuardContext) -> ExitReason | None:
        """Return the first non-``None`` stop reason, or ``None`` if no guard fires."""
        # The generator is lazy, so evaluation stops at the first guard that fires.
        verdicts = (guard.should_stop(ctx) for guard in self.guards)
        return next((verdict for verdict in verdicts if verdict is not None), None)
50
+
51
+
52
class ExitConditionAdapter:
    """Adapter exposing an :class:`ExitConditionProtocol` as an agent-loop guard."""

    def __init__(self, condition: ExitConditionProtocol) -> None:
        self._condition = condition

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Forward iteration, current score and score history to the wrapped condition."""
        wrapped = self._condition
        args = (ctx.iteration, ctx.current_score, ctx.scores_so_far)
        return wrapped.should_exit(*args)
64
+
65
+
66
class ScoreThresholdGuard:
    """Guard that fires once the current score reaches the session's quality threshold."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``quality_threshold`` reason when the score is >= the threshold."""
        goal_reached = ctx.current_score >= ctx.session.quality_threshold
        if not goal_reached:
            return None
        detail = (
            f"Goal reached: score={ctx.current_score:.3f} >= threshold "
            f"{ctx.session.quality_threshold:.2f} at step {ctx.iteration}"
        )
        return ExitReason(condition="quality_threshold", message=detail)
79
+
80
+
81
class PlateauGuard:
    """Guard that fires when the three most recent scores have stopped moving.

    Args:
        delta: Both consecutive score changes must be strictly smaller than
            this value for the guard to fire.
    """

    def __init__(self, delta: float = CONVERGENCE_DELTA) -> None:
        self.delta = delta

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``plateau`` reason when the last two score changes are both below ``delta``."""
        history = ctx.scores_so_far
        if len(history) < 3:
            # Two deltas need three data points.
            return None

        older, previous, latest = history[-3], history[-2], history[-1]
        delta1 = abs(latest - previous)
        delta2 = abs(previous - older)
        stalled = delta1 < self.delta and delta2 < self.delta
        if not stalled:
            return None

        detail = (
            f"Progress plateaued (last deltas {delta1:.4f}, {delta2:.4f} "
            f"< {self.delta:.4f}); further steps unlikely to help"
        )
        return ExitReason(condition="plateau", message=detail)
102
+
103
+
104
class BayesianGuard:
    """Guard that defers to learned priors to decide whether more steps are worthwhile.

    Args:
        priors: Object queried through ``should_continue`` and, when
            stopping, ``expected_improvement``.
    """

    def __init__(self, priors: AdaptivePriors) -> None:
        self._priors = priors

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return an ``adaptive_bayesian`` reason when the priors advise against continuing."""
        session = ctx.session
        if self._priors.should_continue(
            session.task_type,
            session.model_id,
            ctx.iteration,
            ctx.current_score,
            ctx.scores_so_far,
            quality_threshold=session.quality_threshold,
        ):
            return None

        # The expected improvement is only looked up to explain the stop.
        expected_delta, uncertainty = self._priors.expected_improvement(
            session.task_type, session.model_id, ctx.iteration
        )
        detail = (
            f"Bayesian stop at step {ctx.iteration}: "
            f"score={ctx.current_score:.3f}, "
            f"E[delta]={expected_delta:.3f}±{uncertainty:.3f}, "
            f"threshold={session.quality_threshold:.2f} (low expected ROI)"
        )
        return ExitReason(condition="adaptive_bayesian", message=detail)
134
+
135
+
136
class BudgetExhaustedGuard:
    """Guard that fires once the iteration count reaches the session's suggested budget."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``budget_exhausted`` reason when iteration >= ``suggested_budget``."""
        out_of_budget = ctx.iteration >= ctx.session.suggested_budget
        if not out_of_budget:
            return None
        detail = (
            f"Step budget exhausted ({ctx.iteration}/"
            f"{ctx.session.suggested_budget}); escalate or accept current result"
        )
        return ExitReason(condition="budget_exhausted", message=detail)
149
+
150
+
151
class MaxStepsGuard:
    """Safety net that stops the loop at a fixed step count.

    Args:
        max_steps: The guard fires once the iteration reaches this count.
    """

    def __init__(self, max_steps: int = MAX_STEPS_DEFAULT) -> None:
        self.max_steps = max_steps

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``max_steps`` reason when iteration >= ``max_steps``."""
        cap_reached = ctx.iteration >= self.max_steps
        if not cap_reached:
            return None
        return ExitReason(
            condition="max_steps",
            message=f"Hard step cap reached ({self.max_steps})",
        )
164
+
165
+
166
class TimeoutGuard:
    """Guard that fires when the session has run longer than ``max_wall_ms``.

    Elapsed time is ``time.perf_counter()`` minus ``session.started_at``.
    """

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``timeout`` reason once the elapsed milliseconds reach the limit.

        A limit of zero or less disables the guard.
        """
        session = ctx.session
        max_ms = session.max_wall_ms
        if max_ms <= 0:
            return None

        elapsed_seconds = time.perf_counter() - session.started_at
        elapsed_ms = elapsed_seconds * 1000.0
        timed_out = elapsed_ms >= max_ms
        if not timed_out:
            return None

        detail = f"Wall-clock timeout: {elapsed_ms:.0f}ms >= {max_ms:.0f}ms"
        return ExitReason(condition="timeout", message=detail)
182
+
183
+
184
class TokenBudgetGuard:
    """Guard that fires when the session's token usage reaches ``max_tokens``.

    Usage is the sum of the session's prompt and completion tokens.
    """

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``token_budget`` reason once total tokens >= the cap.

        A cap of zero or less disables the guard.
        """
        session = ctx.session
        cap = session.max_tokens
        if cap <= 0:
            return None

        total = session.prompt_tokens + session.completion_tokens
        over_budget = total >= cap
        if not over_budget:
            return None

        return ExitReason(
            condition="token_budget",
            message=f"Token budget exhausted ({total}/{cap})",
        )
198
+
199
+
200
class OutputRepeatGuard:
    """Guard that fires when a step's output recurs within a sliding window.

    Args:
        window: Number of most recent fingerprints inspected.
        min_repeats: Occurrences of the current fingerprint within the
            window (including the current step) needed to fire.
    """

    def __init__(self, window: int = 5, min_repeats: int = 2) -> None:
        self.window = window
        self.min_repeats = min_repeats

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Record the current output's fingerprint and check it for repetition.

        The fingerprint is appended to ``session.step_fingerprints`` on every
        call that has a non-empty ``step_output``; empty output is ignored.

        Returns:
            An ``output_repeat`` reason when the fingerprint occurs at least
            ``min_repeats`` times in the window, otherwise ``None``.
        """
        text = ctx.step_output
        if not text:
            return None

        # A truncated SHA-256 of the stripped text is the comparison key.
        digest = hashlib.sha256(text.strip().encode()).hexdigest()
        fingerprint = digest[:16]

        history = ctx.session.step_fingerprints
        history.append(fingerprint)
        recent = history[-self.window :]
        count = recent.count(fingerprint)
        if count < self.min_repeats:
            return None

        detail = (
            f"Step output repeated {count} times in last {len(recent)} "
            f"step(s); likely stuck in a loop"
        )
        return ExitReason(condition="output_repeat", message=detail)
224
+
225
+
226
def default_guard_stack(priors: AdaptivePriors, max_steps: int = MAX_STEPS_DEFAULT) -> GuardStack:
    """Assemble the default guard stack in the legacy ``_decide()`` order.

    Args:
        priors: Passed to :class:`BayesianGuard`.
        max_steps: Passed to :class:`MaxStepsGuard`.

    Returns:
        A :class:`GuardStack` whose guards are consulted in the order listed
        below; the first one to fire decides the stop reason.
    """
    ordered_guards: list[AgentLoopGuard] = [
        ScoreThresholdGuard(),
        PlateauGuard(),
        BayesianGuard(priors),
        BudgetExhaustedGuard(),
        MaxStepsGuard(max_steps),
        TimeoutGuard(),
        TokenBudgetGuard(),
        OutputRepeatGuard(),
    ]
    return GuardStack(ordered_guards)
238
+
239
+
240
def bayesian_exit_as_guard(
    priors: AdaptivePriors,
    task_type: str,
    model_id: str,
    quality_threshold: float,
) -> ExitConditionAdapter:
    """Build a :class:`BayesianExitCondition` and expose it as an agent-loop guard.

    All four arguments are forwarded unchanged, by keyword, to
    :class:`BayesianExitCondition`; the resulting condition is wrapped in
    an :class:`ExitConditionAdapter`.
    """
    condition = BayesianExitCondition(
        priors=priors,
        task_type=task_type,
        model_id=model_id,
        quality_threshold=quality_threshold,
    )
    return ExitConditionAdapter(condition)