loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/engine.py
ADDED
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
"""Core iterative refinement engine."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import time
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Protocol, runtime_checkable
|
|
7
|
+
|
|
8
|
+
import structlog
|
|
9
|
+
|
|
10
|
+
from loopllm.provider import LLMProvider
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@runtime_checkable
class Evaluator(Protocol):
    """Structural type for anything that can score an LLM output.

    The refinement loop only needs an ``evaluate`` method with the signature
    below; implementing classes do not have to inherit from this protocol.
    """

    def evaluate(
        self,
        output: str,
        context: dict[str, Any] | None = None,
    ) -> EvaluationResult:
        """Score *output*, optionally consulting *context*."""
        ...
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@runtime_checkable
class ExitConditionProtocol(Protocol):
    """Structural type for pluggable loop-exit checks.

    Any object exposing a matching ``should_exit`` method can be passed to
    :meth:`LoopedLLM.add_exit_condition`.
    """

    def should_exit(
        self,
        iteration: int,
        current_score: float,
        scores_so_far: list[float],
    ) -> ExitReason | None:
        """Return an :class:`ExitReason` to stop the loop, or ``None`` to continue."""
        ...
|
|
40
|
+
|
|
41
|
+
logger = structlog.get_logger(__name__)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class EvaluationResult:
    """Outcome of scoring a single LLM output.

    Attributes:
        score: Quality score, documented as lying in [0.0, 1.0].
        passed: True when the output meets the quality bar.
        deficiencies: Problems detected in the output.
        sub_scores: Component scores keyed by name.
        feedback: Human-readable summary. When left empty and deficiencies
            exist, it is derived from them after initialisation.
    """

    score: float
    passed: bool
    deficiencies: list[str] = field(default_factory=list)
    sub_scores: dict[str, float] = field(default_factory=dict)
    feedback: str = ""

    def __post_init__(self) -> None:
        # Keep explicit feedback; with no deficiencies there is nothing to summarise.
        if self.feedback or not self.deficiencies:
            return
        summary = "; ".join(self.deficiencies)
        self.feedback = "Issues found: " + summary
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class ExitReason:
    """Explains which exit condition stopped the refinement loop.

    Attributes:
        condition: Identifier of the condition that fired.
        message: Human-readable description of why it fired.
    """

    condition: str
    message: str
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class IterationRecord:
    """Snapshot of one pass through the refinement loop.

    Attributes:
        iteration: Index of the pass, starting at zero.
        prompt: Prompt that was sent to the LLM.
        output: Content of the LLM response.
        score: Score the evaluator assigned to the output.
        passed: Whether the evaluator considered the output passing.
        deficiencies: Problems the evaluator reported for this pass.
        latency_ms: Duration of this pass in milliseconds.
    """

    iteration: int
    prompt: str
    output: str
    score: float
    passed: bool
    deficiencies: list[str] = field(default_factory=list)
    latency_ms: float = 0.0
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
@dataclass
class LoopMetrics:
    """Summary statistics for one complete refinement run.

    Attributes:
        total_iterations: How many iterations were executed.
        best_score: Highest score seen in any iteration.
        final_score: Score of the last iteration.
        converged: Whether the run reached a passing result.
        exit_reason: Why the loop stopped.
        total_latency_ms: Total time spent, in milliseconds.
        score_trajectory: Scores in iteration order.
    """

    total_iterations: int
    best_score: float
    final_score: float
    converged: bool
    exit_reason: ExitReason
    total_latency_ms: float
    score_trajectory: list[float] = field(default_factory=list)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass
class RefinementResult:
    """What a refinement run hands back to the caller.

    Attributes:
        output: The best output, or the final one, across all iterations.
        metrics: Aggregate metrics for the run.
        iterations: Complete per-iteration history.
    """

    output: str
    metrics: LoopMetrics
    iterations: list[IterationRecord] = field(default_factory=list)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass
class LoopConfig:
    """Tunable settings for the refinement loop.

    Attributes:
        max_iterations: Upper bound on refinement iterations.
        quality_threshold: Lowest score regarded as acceptable.
        min_iterations: Iterations that must run before an early exit.
        convergence_delta: Stop when improvement stays below this value for
            two consecutive iterations.
        timeout_ms: Wall-clock budget for the whole run, in milliseconds.
        best_of: Return the best output when True, the last one otherwise.
    """

    max_iterations: int = 5
    quality_threshold: float = 0.8
    min_iterations: int = 1
    convergence_delta: float = 0.01
    timeout_ms: float = 30_000
    best_of: bool = True
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class CompositeEvaluator:
    """Combines multiple evaluators via weighted average.

    Args:
        evaluators: Non-empty list of evaluator objects, each with an
            ``evaluate`` method.
        weights: Optional per-evaluator weights, one per evaluator. They are
            normalised to sum to 1. Defaults to equal weights.
        pass_threshold: Minimum weighted score required for the combined
            result to pass. Defaults to 0.5.

    Raises:
        ValueError: If *evaluators* is empty, if *weights* does not have one
            entry per evaluator, or if the weights do not sum to a positive
            value.
    """

    def __init__(
        self,
        evaluators: list[Evaluator],
        weights: list[float] | None = None,
        *,
        pass_threshold: float = 0.5,
    ) -> None:
        if not evaluators:
            raise ValueError("CompositeEvaluator requires at least one evaluator")

        self.evaluators = evaluators
        self.pass_threshold = pass_threshold

        if weights is None:
            self.weights = [1.0 / len(evaluators)] * len(evaluators)
            return

        # zip() in evaluate() would silently drop the surplus side, leaving
        # weights that no longer sum to 1, so reject mismatches up front.
        if len(weights) != len(evaluators):
            raise ValueError(
                f"Expected {len(evaluators)} weights, got {len(weights)}"
            )
        total = sum(weights)
        if total <= 0:
            raise ValueError("Evaluator weights must sum to a positive value")
        self.weights = [w / total for w in weights]

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Evaluate *output* using all child evaluators and return weighted result.

        Args:
            output: The text to evaluate.
            context: Optional context dict passed to each evaluator.

        Returns:
            A single :class:`EvaluationResult` whose score is the weighted sum
            of the child scores, with all child deficiencies concatenated and
            child sub-scores merged (later evaluators overwrite equal keys).
            The result passes only when the weighted score reaches
            ``pass_threshold`` and no child reported a deficiency.
        """
        # Only substitute a fresh dict when no context was given, so a
        # caller-supplied (even empty) dict is the one the children receive.
        ctx = {} if context is None else context
        all_deficiencies: list[str] = []
        all_sub_scores: dict[str, float] = {}
        weighted_score = 0.0

        for evaluator, weight in zip(self.evaluators, self.weights):
            result = evaluator.evaluate(output, ctx)
            weighted_score += result.score * weight
            all_deficiencies.extend(result.deficiencies)
            all_sub_scores.update(result.sub_scores)

        passed = weighted_score >= self.pass_threshold and not all_deficiencies
        return EvaluationResult(
            score=weighted_score,
            passed=passed,
            deficiencies=all_deficiencies,
            sub_scores=all_sub_scores,
        )
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
class LoopedLLM:
    """Iterative refinement engine that loops an LLM call with evaluation.

    Each iteration sends a prompt to the provider, scores the response with
    the evaluator and, unless an exit condition fires, rebuilds the prompt
    from the reported deficiencies for the next attempt.

    Args:
        provider: The LLM provider to use for completions.
        config: Loop configuration. Uses defaults if not provided.
    """

    def __init__(self, provider: LLMProvider, config: LoopConfig | None = None) -> None:
        self.provider = provider
        self.config = config or LoopConfig()
        self._exit_conditions: list[ExitConditionProtocol] = []

    def add_exit_condition(self, condition: ExitConditionProtocol) -> None:
        """Register an additional exit condition (e.g. :class:`BayesianExitCondition`).

        Args:
            condition: An object satisfying :class:`ExitConditionProtocol`.
        """
        self._exit_conditions.append(condition)

    @staticmethod
    def _build_feedback_prompt(initial_prompt: str, result: EvaluationResult) -> str:
        """Return the next prompt: the initial prompt plus any deficiencies to fix."""
        if not result.deficiencies:
            return initial_prompt
        deficiency_lines = "\n".join(f"- {d}" for d in result.deficiencies)
        return (
            f"{initial_prompt}\n\n"
            f"Previous attempt scored {result.score:.2f}/1.0. Issues to fix:\n"
            f"{deficiency_lines}\n\n"
            f"Please address all issues in your next response."
        )

    def _score_exit_reason(
        self, completed: int, score: float, scores: list[float]
    ) -> ExitReason | None:
        """Return the first score-based exit reason that fires, or ``None``.

        Checks, in order: quality threshold, score plateau, then registered
        custom conditions. *completed* is the one-based iteration count.
        """
        cfg = self.config

        # Quality threshold
        if score >= cfg.quality_threshold:
            return ExitReason(
                "quality_threshold",
                f"Score {score:.2f} >= threshold {cfg.quality_threshold:.2f}"
                f" at iteration {completed}",
            )

        # Convergence delta: the last two score changes are both below the delta.
        if len(scores) >= 3:
            delta1 = abs(scores[-1] - scores[-2])
            delta2 = abs(scores[-2] - scores[-3])
            if delta1 < cfg.convergence_delta and delta2 < cfg.convergence_delta:
                return ExitReason(
                    "convergence",
                    f"Score plateaued (last deltas: {delta1:.4f}, {delta2:.4f}"
                    f" < {cfg.convergence_delta:.4f})",
                )

        # Bayesian / custom exit conditions; each gets its own copy of the scores.
        for cond in self._exit_conditions:
            reason = cond.should_exit(completed, score, list(scores))
            if reason is not None:
                return reason
        return None

    def refine(
        self,
        initial_prompt: str,
        evaluator: Evaluator,
        context: dict[str, Any] | None = None,
        model: str = "gpt-4o-mini",
    ) -> RefinementResult:
        """Run the iterative refinement loop.

        Args:
            initial_prompt: The initial prompt to send to the LLM.
            evaluator: An evaluator with an ``evaluate(output, context)`` method.
            context: Optional context dict passed to the evaluator.
            model: Model identifier for the LLM provider.

        Returns:
            :class:`RefinementResult` containing the best/final output and metrics.
            If the configuration allows no iterations at all, the output is an
            empty string, both scores are 0.0 and the exit reason is
            ``max_iterations``.
        """
        ctx = {} if context is None else context
        iterations: list[IterationRecord] = []
        scores: list[float] = []
        prompt = initial_prompt
        best_output = ""
        best_score = 0.0
        exit_reason: ExitReason | None = None
        loop_start = time.perf_counter()

        for i in range(self.config.max_iterations):
            iter_start = time.perf_counter()

            # 1. Call provider
            response = self.provider.complete(prompt, model)
            output = response.content

            # 2. Evaluate
            result = evaluator.evaluate(output, ctx)
            iter_latency = (time.perf_counter() - iter_start) * 1000.0

            # Track
            scores.append(result.score)
            iterations.append(
                IterationRecord(
                    iteration=i,
                    prompt=prompt,
                    output=output,
                    score=result.score,
                    passed=result.passed,
                    deficiencies=list(result.deficiencies),
                    latency_ms=iter_latency,
                )
            )

            # The first iteration always seeds the best result, so no sentinel
            # score can leak into the metrics or hide a low-scoring output.
            if len(iterations) == 1 or result.score > best_score:
                best_score = result.score
                best_output = output

            logger.debug(
                "loop_iteration",
                iteration=i,
                score=result.score,
                passed=result.passed,
                deficiencies=result.deficiencies,
            )

            # 3. Check exit conditions (only after min_iterations)
            if i + 1 >= self.config.min_iterations:
                exit_reason = self._score_exit_reason(i + 1, result.score, scores)
                if exit_reason is not None:
                    break

            # Timeout
            # NOTE(review): the flattened source does not show whether this
            # check was nested under the min_iterations guard; it is kept at
            # loop level here so the wall-clock budget always applies. Confirm.
            elapsed = (time.perf_counter() - loop_start) * 1000.0
            if elapsed >= self.config.timeout_ms:
                exit_reason = ExitReason(
                    "timeout",
                    f"Elapsed {elapsed:.0f}ms >= timeout {self.config.timeout_ms:.0f}ms",
                )
                break

            # 4. Build feedback prompt for next iteration
            prompt = self._build_feedback_prompt(initial_prompt, result)

        # If we exhausted iterations without another exit reason
        if exit_reason is None:
            exit_reason = ExitReason(
                "max_iterations",
                f"Reached maximum of {self.config.max_iterations} iterations",
            )

        total_latency = (time.perf_counter() - loop_start) * 1000.0
        if self.config.best_of or not iterations:
            # With no iterations there is no "last" record to read; fall back
            # to best_output, which is still the empty string in that case.
            final_output = best_output
        else:
            final_output = iterations[-1].output

        metrics = LoopMetrics(
            total_iterations=len(iterations),
            best_score=best_score,
            final_score=scores[-1] if scores else 0.0,
            # A run that never produced an output cannot have converged.
            converged=bool(iterations) and best_score >= self.config.quality_threshold,
            exit_reason=exit_reason,
            total_latency_ms=total_latency,
            score_trajectory=scores,
        )

        logger.info(
            "refinement_complete",
            total_iterations=metrics.total_iterations,
            best_score=metrics.best_score,
            exit_reason=exit_reason.condition,
        )

        return RefinementResult(
            output=final_output,
            metrics=metrics,
            iterations=iterations,
        )
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Build evaluators from type strings — shared by MCP and CDV step scoring."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from loopllm.engine import CompositeEvaluator
|
|
7
|
+
from loopllm.evaluators import (
|
|
8
|
+
CompletenessEvaluator,
|
|
9
|
+
JSONSchemaEvaluator,
|
|
10
|
+
LengthEvaluator,
|
|
11
|
+
RegexEvaluator,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_evaluator(
    evaluator_type: str = "length",
    **kwargs: Any,
) -> (
    LengthEvaluator
    | RegexEvaluator
    | JSONSchemaEvaluator
    | CompositeEvaluator
    | CompletenessEvaluator
):
    """Build an evaluator from a type string and optional config.

    Args:
        evaluator_type: One of ``length``, ``json``, ``regex``, ``composite``,
            or ``completeness``. Any other value falls back to a length
            evaluator.
        **kwargs: Evaluator-specific options (``required_patterns``,
            ``forbidden_patterns``, ``required_fields``, ``quality_criteria``,
            ``min_words``, ``max_words``, etc.).

    Returns:
        A concrete evaluator instance. For ``composite`` the result always
        contains a length evaluator, plus one evaluator per option group that
        was supplied.
    """
    max_words = kwargs.get("max_words", 10_000)

    if evaluator_type == "json":
        return JSONSchemaEvaluator(
            required_fields=kwargs.get("required_fields", []),
            field_types={},
        )

    if evaluator_type == "regex":
        return RegexEvaluator(
            required=kwargs.get("required_patterns", []),
            forbidden=kwargs.get("forbidden_patterns", []),
        )

    if evaluator_type == "completeness":
        criteria = kwargs.get("quality_criteria") or kwargs.get("required_aspects") or []
        return CompletenessEvaluator(required_aspects=list(criteria))

    if evaluator_type == "composite":
        evals: list[Any] = []

        criteria = kwargs.get("quality_criteria") or []
        if criteria:
            evals.append(CompletenessEvaluator(required_aspects=list(criteria)))

        if kwargs.get("required_fields"):
            evals.append(JSONSchemaEvaluator(required_fields=kwargs["required_fields"]))

        # Honour forbidden patterns here as the ``regex`` branch does;
        # previously they were silently dropped for composite evaluators.
        required_patterns = kwargs.get("required_patterns") or []
        forbidden_patterns = kwargs.get("forbidden_patterns") or []
        if required_patterns or forbidden_patterns:
            evals.append(
                RegexEvaluator(required=required_patterns, forbidden=forbidden_patterns)
            )

        # A length check is always included. When it is the only evaluator it
        # keeps the stand-alone default of 5 words; alongside others the
        # default drops to 1.
        default_min_words = 1 if evals else 5
        evals.append(
            LengthEvaluator(
                min_words=kwargs.get("min_words", default_min_words),
                max_words=max_words,
            )
        )
        return CompositeEvaluator(evaluators=evals)

    return LengthEvaluator(
        min_words=kwargs.get("min_words", 5),
        max_words=max_words,
    )
|