loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/evaluators.py
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
"""Built-in evaluators for scoring LLM outputs."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Callable
|
|
7
|
+
|
|
8
|
+
from loopllm.engine import EvaluationResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ThresholdEvaluator:
    """Evaluator that delegates scoring to a callable and applies a pass/fail threshold.

    The value returned by the scorer is normalised into ``[0, 1]`` before it
    is compared with the threshold. A scorer that returns ``NaN`` is treated
    as having scored ``0.0`` so an undefined score can never be reported as
    a pass.

    Args:
        scorer: A callable ``(output, context) -> float`` returning a score in [0, 1].
        threshold: Minimum score to pass.
        name: Human-readable evaluator name.
    """

    def __init__(
        self,
        scorer: Callable[[str, dict[str, Any]], float],
        threshold: float = 0.7,
        name: str = "threshold",
    ) -> None:
        self.scorer = scorer
        self.threshold = threshold
        self.name = name

    @staticmethod
    def _clamp_unit(raw: float) -> float:
        """Clamp *raw* into ``[0.0, 1.0]``, mapping ``NaN`` to ``0.0``.

        Args:
            raw: The unnormalised score produced by the scorer.

        Returns:
            A float in the closed interval ``[0.0, 1.0]``.
        """
        value = float(raw)
        if value >= 1.0:
            return 1.0
        if value > 0.0:
            return value
        # Reached for negatives, zero and NaN: every ordering comparison with
        # NaN is false, so NaN falls through to the lowest score instead of
        # being promoted to 1.0 as ``max(0.0, min(1.0, nan))`` would do.
        return 0.0

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Score *output* and return pass/fail based on threshold.

        Args:
            output: The text to evaluate.
            context: Optional context dict passed to the scorer. An empty
                dict is supplied when omitted.

        Returns:
            :class:`EvaluationResult` with the clamped score, a single
            sub-score keyed by the evaluator name, and one deficiency
            message when the score is below the threshold.
        """
        ctx = context or {}
        score = self._clamp_unit(self.scorer(output, ctx))
        passed = score >= self.threshold
        deficiencies: list[str] = []
        if not passed:
            deficiencies.append(
                f"{self.name}: score {score:.2f} below threshold {self.threshold:.2f}"
            )
        return EvaluationResult(
            score=score,
            passed=passed,
            deficiencies=deficiencies,
            sub_scores={self.name: score},
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class RegexEvaluator:
    """Evaluator driven by lists of required and forbidden regex patterns.

    All patterns are compiled once, case-insensitively, at construction time.

    Args:
        required: Patterns that must be present in the output.
        forbidden: Patterns that must NOT be present in the output.
    """

    def __init__(
        self,
        required: list[str] | None = None,
        forbidden: list[str] | None = None,
    ) -> None:
        required_sources = required or []
        forbidden_sources = forbidden or []
        self.required = [re.compile(src, re.IGNORECASE) for src in required_sources]
        self.forbidden = [re.compile(src, re.IGNORECASE) for src in forbidden_sources]
        # The raw pattern strings are kept for readable deficiency messages.
        self._required_raw = required_sources
        self._forbidden_raw = forbidden_sources

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Check *output* against required/forbidden patterns.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with score = passing_checks / total_checks.
            The result passes only when every required pattern is found and
            no forbidden pattern is found. With no patterns configured the
            result is a perfect, passing score.
        """
        check_count = len(self.required) + len(self.forbidden)
        if not check_count:
            return EvaluationResult(score=1.0, passed=True)

        required_pairs = list(zip(self.required, self._required_raw))
        forbidden_pairs = list(zip(self.forbidden, self._forbidden_raw))

        absent = [src for rx, src in required_pairs if rx.search(output) is None]
        present = [src for rx, src in forbidden_pairs if rx.search(output) is not None]

        satisfied = (len(required_pairs) - len(absent)) + (len(forbidden_pairs) - len(present))
        ratio = satisfied / check_count

        # Required-pattern problems are reported before forbidden-pattern ones.
        problems = [f"Required pattern not found: {src}" for src in absent]
        problems.extend(f"Forbidden pattern found: {src}" for src in present)

        return EvaluationResult(
            score=ratio,
            passed=not absent and not present,
            deficiencies=problems,
            sub_scores={"regex": ratio},
        )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class JSONSchemaEvaluator:
    """Evaluator that checks output is valid JSON matching a lightweight schema.

    Args:
        required_fields: Field names that must be present in the JSON object.
        field_types: Mapping of field names to expected Python types.
        must_be_object: If True, the top-level JSON value must be a dict.
    """

    def __init__(
        self,
        required_fields: list[str] | None = None,
        field_types: dict[str, type] | None = None,
        must_be_object: bool = True,
    ) -> None:
        self.required_fields = required_fields or []
        self.field_types = field_types or {}
        self.must_be_object = must_be_object

    @staticmethod
    def _matches_type(value: Any, expected: Any) -> bool:
        """Return True when *value* satisfies the *expected* type."""
        # ``bool`` subclasses ``int`` in Python, so a plain isinstance check
        # would accept JSON ``true``/``false`` where an integer is required.
        if expected is int and isinstance(value, bool):
            return False
        return isinstance(value, expected)

    def _check_fields(self, data: Any) -> tuple[float, list[str]]:
        """Return the ``fields_present`` sub-score and its deficiency messages."""
        if not self.required_fields:
            return 1.0, []
        # A non-object value (possible when ``must_be_object`` is False) has
        # no fields at all; treating it as empty avoids ``in`` raising on
        # scalars and avoids list/substring membership being mistaken for a
        # field lookup.
        fields = data if isinstance(data, dict) else {}
        missing = [name for name in self.required_fields if name not in fields]
        total = len(self.required_fields)
        return (total - len(missing)) / total, [
            f"Missing required field: {name}" for name in missing
        ]

    def _check_types(self, data: Any) -> tuple[float, list[str]]:
        """Return the ``type_correct`` sub-score and its deficiency messages."""
        if not isinstance(data, dict):
            # Nothing to type-check on a non-object value.
            return 1.0, []
        # Only fields that are actually present are type-checked; absent
        # fields are the concern of the required-field check.
        checks = [(name, kind) for name, kind in self.field_types.items() if name in data]
        if not checks:
            return 1.0, []
        problems = [
            f"Field '{name}' has type {type(data[name]).__name__}, "
            f"expected {getattr(kind, '__name__', kind)}"
            for name, kind in checks
            if not self._matches_type(data[name], kind)
        ]
        return (len(checks) - len(problems)) / len(checks), problems

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Validate *output* as JSON against the configured schema.

        Weighted sub-scores:
        - ``json_valid``: 0.2
        - ``is_object``: 0.1
        - ``fields_present``: 0.4
        - ``type_correct``: 0.3

        Args:
            output: The text to evaluate as JSON.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with weighted composite score. The
            result passes only when the score is at least 0.7 and no
            deficiency was recorded. Unparseable input scores 0.0; a
            non-object value when ``must_be_object`` is set scores only the
            ``json_valid`` weight.
        """
        try:
            data = json.loads(output)
        except (json.JSONDecodeError, ValueError):
            return EvaluationResult(
                score=0.0,
                passed=False,
                deficiencies=["Invalid JSON"],
                sub_scores={
                    "json_valid": 0.0,
                    "is_object": 0.0,
                    "fields_present": 0.0,
                    "type_correct": 0.0,
                },
            )

        sub_scores: dict[str, float] = {"json_valid": 1.0}

        if self.must_be_object and not isinstance(data, dict):
            sub_scores["is_object"] = 0.0
            sub_scores["fields_present"] = 0.0
            sub_scores["type_correct"] = 0.0
            return EvaluationResult(
                score=0.2 * sub_scores["json_valid"],
                passed=False,
                deficiencies=["Top-level value is not a JSON object"],
                sub_scores=sub_scores,
            )

        # Either the value is an object or the object requirement is waived.
        sub_scores["is_object"] = 1.0

        fields_score, field_problems = self._check_fields(data)
        types_score, type_problems = self._check_types(data)
        sub_scores["fields_present"] = fields_score
        sub_scores["type_correct"] = types_score
        deficiencies = field_problems + type_problems

        score = (
            0.2 * sub_scores["json_valid"]
            + 0.1 * sub_scores["is_object"]
            + 0.4 * sub_scores["fields_present"]
            + 0.3 * sub_scores["type_correct"]
        )

        passed = score >= 0.7 and not deficiencies
        return EvaluationResult(
            score=score,
            passed=passed,
            deficiencies=deficiencies,
            sub_scores=sub_scores,
        )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
class LengthEvaluator:
    """Evaluator enforcing lower and upper bounds on characters and words.

    Args:
        min_chars: Minimum number of characters required.
        max_chars: Maximum number of characters allowed.
        min_words: Minimum number of words required.
        max_words: Maximum number of words allowed.
    """

    def __init__(
        self,
        min_chars: int = 0,
        max_chars: int = 100_000,
        min_words: int = 0,
        max_words: int = 100_000,
    ) -> None:
        self.min_chars, self.max_chars = min_chars, max_chars
        self.min_words, self.max_words = min_words, max_words

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Check *output* length against configured bounds.

        Words are counted by splitting on whitespace.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with score 1.0 if all bounds met, 0.3 otherwise.
            One deficiency message is produced per violated bound.
        """
        n_chars = len(output)
        n_words = len(output.split())

        # Each entry pairs "is this bound violated?" with its message; the
        # order here is the order in which deficiencies are reported.
        bound_checks = (
            (
                n_chars < self.min_chars,
                f"Too few characters: {n_chars} < minimum {self.min_chars}",
            ),
            (
                n_chars > self.max_chars,
                f"Too many characters: {n_chars} > maximum {self.max_chars}",
            ),
            (
                n_words < self.min_words,
                f"Too few words: {n_words} < minimum {self.min_words}",
            ),
            (
                n_words > self.max_words,
                f"Too many words: {n_words} > maximum {self.max_words}",
            ),
        )
        violations = [message for violated, message in bound_checks if violated]

        within_bounds = not violations
        if within_bounds:
            score = 1.0
        else:
            score = 0.3
        return EvaluationResult(
            score=score,
            passed=within_bounds,
            deficiencies=violations,
            sub_scores={"length": score},
        )
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class ConsistencyEvaluator:
    """Evaluator that compares an output with the outputs it depends on.

    Keyword overlap is used as a lightweight stand-in for semantic
    consistency: for each dependency, the fraction of its keywords that also
    occur in the evaluated output is measured.

    Args:
        dependency_outputs: List of output strings from dependency tasks.
        min_overlap: Minimum fraction of dependency keywords that must
            appear in the output.
    """

    def __init__(
        self,
        dependency_outputs: list[str],
        min_overlap: float = 0.1,
    ) -> None:
        self.dependency_outputs = dependency_outputs
        self.min_overlap = min_overlap
        # Keyword sets are computed once per dependency, up front.
        self._dep_keywords: list[set[str]] = [
            self._extract_keywords(dep_text) for dep_text in dependency_outputs
        ]

    @staticmethod
    def _extract_keywords(text: str) -> set[str]:
        """Return the lowercased words of 4+ characters in *text*, minus stop words."""
        stop_words = {
            "that", "this", "with", "from", "have", "been", "will", "would",
            "could", "should", "which", "their", "there", "about", "after",
            "than", "them", "then", "were", "when", "what", "your", "also",
            "into", "each", "only", "other", "some", "such", "more", "very",
        }
        candidates = set(re.findall(r"\b[a-zA-Z_]\w{3,}\b", text.lower()))
        return candidates - stop_words

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Check *output* for consistency with dependency outputs.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with overlap-based score: the mean of
            the per-dependency overlap fractions. The result passes when that
            mean reaches ``min_overlap`` and no single dependency fell below
            it. With no dependencies the result is a perfect, passing score.
        """
        if not self._dep_keywords:
            return EvaluationResult(score=1.0, passed=True, sub_scores={"consistency": 1.0})

        produced = self._extract_keywords(output)
        ratios: list[float] = []
        problems: list[str] = []

        for index, expected in enumerate(self._dep_keywords):
            if expected:
                ratio = len(produced & expected) / len(expected)
                if ratio < self.min_overlap:
                    problems.append(
                        f"Low consistency with dependency {index}: "
                        f"{ratio:.1%} keyword overlap (min {self.min_overlap:.1%})"
                    )
            else:
                # A dependency with no keywords cannot be contradicted.
                ratio = 1.0
            ratios.append(ratio)

        mean_overlap = sum(ratios) / len(ratios)
        return EvaluationResult(
            score=mean_overlap,
            passed=mean_overlap >= self.min_overlap and not problems,
            deficiencies=problems,
            sub_scores={"consistency": mean_overlap},
        )
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
class CompletenessEvaluator:
    """Evaluator that verifies an output covers every required aspect.

    Given a list of required aspects (e.g. quality criteria from an
    :class:`IntentSpec`), checks that each is mentioned or addressed in
    the output.

    Args:
        required_aspects: List of strings that should be addressed.
    """

    def __init__(self, required_aspects: list[str]) -> None:
        self.required_aspects = required_aspects

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Check *output* for completeness against required aspects.

        Each aspect is reduced to its keywords (words >= 4 chars) and counts
        as addressed when at least half of them occur, as case-insensitive
        substrings, in the output. An aspect with no keywords is always
        counted as addressed.

        Args:
            output: The text to evaluate.
            context: Unused; accepted for interface compatibility.

        Returns:
            :class:`EvaluationResult` with per-aspect scoring: the score is
            the fraction of aspects addressed, with one deficiency per
            aspect that was not.
        """
        if not self.required_aspects:
            return EvaluationResult(score=1.0, passed=True, sub_scores={"completeness": 1.0})

        haystack = output.lower()
        unmet: list[str] = []

        for aspect in self.required_aspects:
            terms = re.findall(r"\b[a-zA-Z_]\w{3,}\b", aspect.lower())
            hits = sum(term in haystack for term in terms)
            if terms and hits < len(terms) * 0.5:
                unmet.append(aspect)

        aspect_count = len(self.required_aspects)
        coverage = (aspect_count - len(unmet)) / aspect_count
        return EvaluationResult(
            score=coverage,
            passed=coverage >= 0.8 and not unmet,
            deficiencies=[f"Aspect not addressed: {aspect}" for aspect in unmet],
            sub_scores={"completeness": coverage},
        )
|
|
419
|
+
|
loopllm/guards.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Composable stop guards for adaptive agent loops."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import TYPE_CHECKING, Protocol, runtime_checkable
|
|
8
|
+
|
|
9
|
+
from loopllm.adaptive_exit import BayesianExitCondition
|
|
10
|
+
from loopllm.engine import ExitConditionProtocol, ExitReason
|
|
11
|
+
from loopllm.priors import AdaptivePriors
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from loopllm.agent_loop import AgentLoopSession
|
|
15
|
+
|
|
16
|
+
# Default score change below which PlateauGuard treats two consecutive steps as flat.
CONVERGENCE_DELTA = 0.01
|
|
17
|
+
# Default hard cap on steps, used by MaxStepsGuard and default_guard_stack.
MAX_STEPS_DEFAULT = 10
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class GuardContext:
    """Per-step state handed to every agent-loop guard.

    Attributes:
        session: The agent-loop session the step belongs to; guards read
            limits and counters from it.
        iteration: Index of the current step, as compared against step
            budgets by the guards.
        current_score: Score of the current step.
        scores_so_far: Score history for the session.
        step_output: Text produced by the current step; empty when none is
            available.
    """

    session: AgentLoopSession
    iteration: int
    current_score: float
    scores_so_far: list[float]
    step_output: str = ""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@runtime_checkable
class AgentLoopGuard(Protocol):
    """Structural interface for a pluggable agent-loop stop condition.

    Any object exposing a compatible ``should_stop`` method satisfies this
    protocol; because it is runtime-checkable, ``isinstance`` can be used to
    test for the method's presence.
    """

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return an :class:`ExitReason` to stop the loop, or ``None`` to continue."""
        ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GuardStack:
    """Ordered collection of guards with OR semantics: the first stop reason wins."""

    def __init__(self, guards: list[AgentLoopGuard]) -> None:
        self.guards = guards

    def evaluate(self, ctx: GuardContext) -> ExitReason | None:
        """Consult each guard in order and return the first non-``None`` reason.

        Guards after the one that fires are not consulted. Returns ``None``
        when no guard asks to stop.
        """
        # Lazy generator: each guard runs only if all earlier ones returned None.
        verdicts = (guard.should_stop(ctx) for guard in self.guards)
        return next((verdict for verdict in verdicts if verdict is not None), None)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ExitConditionAdapter:
    """Expose an :class:`ExitConditionProtocol` through the guard interface."""

    def __init__(self, condition: ExitConditionProtocol) -> None:
        self._condition = condition

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Forward the step index, current score and score history to the wrapped condition."""
        step = ctx.iteration
        score = ctx.current_score
        history = ctx.scores_so_far
        return self._condition.should_exit(step, score, history)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ScoreThresholdGuard:
    """Stop once the current score reaches the session's quality threshold."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``quality_threshold`` reason when the score is at or above the threshold."""
        reached = ctx.current_score >= ctx.session.quality_threshold
        if not reached:
            return None
        summary = (
            f"Goal reached: score={ctx.current_score:.3f} >= threshold "
            f"{ctx.session.quality_threshold:.2f} at step {ctx.iteration}"
        )
        return ExitReason(condition="quality_threshold", message=summary)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class PlateauGuard:
    """Stop when the three most recent scores have stopped moving."""

    def __init__(self, delta: float = CONVERGENCE_DELTA) -> None:
        self.delta = delta

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``plateau`` reason when both of the last two score changes are below ``delta``.

        Fewer than three recorded scores never trigger the guard.
        """
        history = ctx.scores_so_far
        if len(history) < 3:
            return None

        newest, middle, oldest = history[-1], history[-2], history[-3]
        latest_change = abs(newest - middle)
        earlier_change = abs(middle - oldest)

        flat = latest_change < self.delta and earlier_change < self.delta
        if not flat:
            return None
        summary = (
            f"Progress plateaued (last deltas {latest_change:.4f}, {earlier_change:.4f} "
            f"< {self.delta:.4f}); further steps unlikely to help"
        )
        return ExitReason(condition="plateau", message=summary)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class BayesianGuard:
    """Stop when the learned priors advise against taking another step."""

    def __init__(self, priors: AdaptivePriors) -> None:
        self._priors = priors

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Ask the priors whether to continue; return an ``adaptive_bayesian`` reason if not.

        The expected improvement and its uncertainty are queried only when
        stopping, to populate the message.
        """
        session = ctx.session
        keep_going = self._priors.should_continue(
            session.task_type,
            session.model_id,
            ctx.iteration,
            ctx.current_score,
            ctx.scores_so_far,
            quality_threshold=session.quality_threshold,
        )
        if keep_going:
            return None

        gain, spread = self._priors.expected_improvement(
            session.task_type, session.model_id, ctx.iteration
        )
        summary = (
            f"Bayesian stop at step {ctx.iteration}: "
            f"score={ctx.current_score:.3f}, "
            f"E[delta]={gain:.3f}±{spread:.3f}, "
            f"threshold={session.quality_threshold:.2f} (low expected ROI)"
        )
        return ExitReason(condition="adaptive_bayesian", message=summary)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class BudgetExhaustedGuard:
    """Stop once the step index reaches the session's suggested budget."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``budget_exhausted`` reason when ``iteration >= suggested_budget``."""
        if not ctx.iteration >= ctx.session.suggested_budget:
            return None
        summary = (
            f"Step budget exhausted ({ctx.iteration}/"
            f"{ctx.session.suggested_budget}); escalate or accept current result"
        )
        return ExitReason(condition="budget_exhausted", message=summary)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class MaxStepsGuard:
    """Unconditional upper bound on the number of agent-loop steps."""

    def __init__(self, max_steps: int = MAX_STEPS_DEFAULT) -> None:
        self.max_steps = max_steps

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``max_steps`` reason when ``iteration >= max_steps``."""
        cap_reached = ctx.iteration >= self.max_steps
        if not cap_reached:
            return None
        return ExitReason(
            condition="max_steps",
            message=f"Hard step cap reached ({self.max_steps})",
        )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class TimeoutGuard:
    """Stop when the time elapsed since ``session.started_at`` reaches ``max_wall_ms``."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``timeout`` reason when the elapsed time reaches the limit.

        A non-positive ``max_wall_ms`` disables the guard.
        """
        limit_ms = ctx.session.max_wall_ms
        if limit_ms <= 0:
            return None

        # NOTE(review): assumes started_at was taken from time.perf_counter()
        # so the two readings are comparable - confirm against the session.
        elapsed_seconds = time.perf_counter() - ctx.session.started_at
        elapsed_ms = elapsed_seconds * 1000.0
        if not elapsed_ms >= limit_ms:
            return None
        summary = f"Wall-clock timeout: {elapsed_ms:.0f}ms >= {limit_ms:.0f}ms"
        return ExitReason(condition="timeout", message=summary)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class TokenBudgetGuard:
    """Stop when prompt plus completion tokens reach the session's ``max_tokens``."""

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Return a ``token_budget`` reason when the token total reaches the cap.

        A non-positive ``max_tokens`` disables the guard.
        """
        session = ctx.session
        limit = session.max_tokens
        if limit <= 0:
            return None

        used = session.prompt_tokens + session.completion_tokens
        if not used >= limit:
            return None
        return ExitReason(
            condition="token_budget",
            message=f"Token budget exhausted ({used}/{limit})",
        )
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class OutputRepeatGuard:
    """Stop when an identical step output recurs within a sliding window."""

    def __init__(self, window: int = 5, min_repeats: int = 2) -> None:
        self.window = window
        self.min_repeats = min_repeats

    def should_stop(self, ctx: GuardContext) -> ExitReason | None:
        """Record the current output's fingerprint and check for repeats.

        The fingerprint is the first 16 hex characters of the SHA-256 of the
        stripped output. It is appended to ``session.step_fingerprints`` on
        every call with a non-empty output, so this method mutates the
        session. Empty output is ignored entirely.
        """
        text = ctx.step_output
        if not text:
            return None

        digest = hashlib.sha256(text.strip().encode()).hexdigest()
        fingerprint = digest[:16]

        history = ctx.session.step_fingerprints
        history.append(fingerprint)

        # Only the most recent `window` fingerprints (including this one) count.
        window_view = history[-self.window :]
        occurrences = window_view.count(fingerprint)
        if not occurrences >= self.min_repeats:
            return None
        summary = (
            f"Step output repeated {occurrences} times in last {len(window_view)} "
            f"step(s); likely stuck in a loop"
        )
        return ExitReason(condition="output_repeat", message=summary)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def default_guard_stack(priors: AdaptivePriors, max_steps: int = MAX_STEPS_DEFAULT) -> GuardStack:
    """Build the default guard stack mirroring legacy _decide() order.

    Args:
        priors: Learned priors handed to the Bayesian guard.
        max_steps: Hard step cap handed to the max-steps guard.

    Returns:
        A :class:`GuardStack` whose guards are consulted in the order listed
        below; the first one to return a reason decides the outcome.
    """
    ordered_guards: list[AgentLoopGuard] = [
        ScoreThresholdGuard(),
        PlateauGuard(),
        BayesianGuard(priors),
        BudgetExhaustedGuard(),
        MaxStepsGuard(max_steps),
        TimeoutGuard(),
        TokenBudgetGuard(),
        OutputRepeatGuard(),
    ]
    return GuardStack(ordered_guards)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def bayesian_exit_as_guard(
    priors: AdaptivePriors,
    task_type: str,
    model_id: str,
    quality_threshold: float,
) -> ExitConditionAdapter:
    """Wrap :class:`BayesianExitCondition` as an agent-loop guard.

    Args:
        priors: Learned priors passed to the exit condition.
        task_type: Task type passed to the exit condition.
        model_id: Model identifier passed to the exit condition.
        quality_threshold: Quality threshold passed to the exit condition.

    Returns:
        An :class:`ExitConditionAdapter` around the newly built condition.
    """
    condition = BayesianExitCondition(
        priors=priors,
        task_type=task_type,
        model_id=model_id,
        quality_threshold=quality_threshold,
    )
    return ExitConditionAdapter(condition)
|