loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopllm/engine.py ADDED
@@ -0,0 +1,376 @@
1
+ """Core iterative refinement engine."""
2
+ from __future__ import annotations
3
+
4
+ import time
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Protocol, runtime_checkable
7
+
8
+ import structlog
9
+
10
+ from loopllm.provider import LLMProvider
11
+
12
+
13
@runtime_checkable
class Evaluator(Protocol):
    """Structural interface for objects that score an LLM output.

    No inheritance is needed: any object exposing an ``evaluate`` method
    with the signature below can be handed to the refinement loop.
    """

    def evaluate(
        self,
        output: str,
        context: dict[str, Any] | None = None,
    ) -> EvaluationResult:
        """Score *output*, optionally using *context*, and return the result."""
        ...
24
+
25
+
26
@runtime_checkable
class ExitConditionProtocol(Protocol):
    """Structural interface for user-supplied exit conditions.

    Any object implementing ``should_exit`` as declared below can be
    registered through :meth:`LoopedLLM.add_exit_condition`.
    """

    def should_exit(
        self,
        iteration: int,
        current_score: float,
        scores_so_far: list[float],
    ) -> ExitReason | None:
        """Return an exit reason to stop the loop, or ``None`` to continue."""
        ...
40
+
41
+ logger = structlog.get_logger(__name__)
42
+
43
+
44
@dataclass
class EvaluationResult:
    """Outcome of scoring a single LLM output.

    Attributes:
        score: Quality score, expected to lie in [0.0, 1.0] (not validated here).
        passed: Whether the output meets the quality bar.
        deficiencies: Issues found in the output.
        sub_scores: Named component scores.
        feedback: Human-readable summary. When left empty and deficiencies
            exist, it is generated from the deficiency list.
    """

    score: float
    passed: bool
    deficiencies: list[str] = field(default_factory=list)
    sub_scores: dict[str, float] = field(default_factory=dict)
    feedback: str = ""

    def __post_init__(self) -> None:
        # Explicit feedback always wins; with nothing to report it stays empty.
        if self.feedback or not self.deficiencies:
            return
        joined = "; ".join(self.deficiencies)
        self.feedback = f"Issues found: {joined}"
65
+
66
+
67
@dataclass
class ExitReason:
    """Explains why a refinement loop stopped.

    Attributes:
        condition: Name of the exit condition that fired.
        message: Human-readable explanation of the exit.
    """

    condition: str
    message: str
78
+
79
+
80
@dataclass
class IterationRecord:
    """Snapshot of one pass through the refinement loop.

    Attributes:
        iteration: Zero-based index of the iteration.
        prompt: Prompt that was sent to the LLM.
        output: Content of the LLM's response.
        score: Evaluation score assigned to the output.
        passed: Whether the evaluation passed.
        deficiencies: Issues the evaluator reported for this output.
        latency_ms: Duration of the iteration in milliseconds.
    """

    iteration: int
    prompt: str
    output: str
    score: float
    passed: bool
    deficiencies: list[str] = field(default_factory=list)
    latency_ms: float = 0.0
101
+
102
+
103
@dataclass
class LoopMetrics:
    """Summary statistics for a complete refinement run.

    Attributes:
        total_iterations: How many iterations were executed.
        best_score: Highest score seen across all iterations.
        final_score: Score of the last iteration.
        converged: Whether the run reached a passing result.
        exit_reason: Why the loop terminated.
        total_latency_ms: Total time spent, in milliseconds.
        score_trajectory: Scores in iteration order.
    """

    total_iterations: int
    best_score: float
    final_score: float
    converged: bool
    exit_reason: ExitReason
    total_latency_ms: float
    score_trajectory: list[float] = field(default_factory=list)
124
+
125
+
126
@dataclass
class RefinementResult:
    """What a refinement run hands back to the caller.

    Attributes:
        output: The best (or final) output across all iterations.
        metrics: Aggregate metrics for the run.
        iterations: Complete history of per-iteration records.
    """

    output: str
    metrics: LoopMetrics
    iterations: list[IterationRecord] = field(default_factory=list)
139
+
140
+
141
@dataclass
class LoopConfig:
    """Tunable settings for the refinement loop.

    Attributes:
        max_iterations: Upper bound on the number of refinement iterations.
        quality_threshold: Minimum score at which an output is acceptable.
        min_iterations: Iterations that must complete before an early exit.
        convergence_delta: Exit when the score changes by less than this for
            two consecutive iterations.
        timeout_ms: Maximum total wall-clock time, in milliseconds.
        best_of: When True return the best output; otherwise the last one.
    """

    max_iterations: int = 5
    quality_threshold: float = 0.8
    min_iterations: int = 1
    convergence_delta: float = 0.01
    timeout_ms: float = 30_000
    best_of: bool = True
160
+
161
+
162
class CompositeEvaluator:
    """Combines multiple evaluators via weighted average.

    Weights are normalised so they sum to 1.0. When no weights are given,
    every evaluator contributes equally.

    Args:
        evaluators: List of evaluator objects, each with an ``evaluate`` method.
        weights: Optional per-evaluator weights. Defaults to equal weights.

    Raises:
        ValueError: If ``weights`` does not have one entry per evaluator, or
            if the weights do not sum to a positive value.
    """

    def __init__(self, evaluators: list[Evaluator], weights: list[float] | None = None) -> None:
        self.evaluators = evaluators
        count = len(evaluators)

        if weights is None:
            # Equal weighting; an empty composite gets no weights instead of
            # dividing by zero.
            self.weights = [1.0 / count] * count if count else []
            return

        if len(weights) != count:
            # zip() in evaluate() would silently drop the surplus entries while
            # normalisation still counted them, skewing the final score.
            raise ValueError(
                f"Expected {count} weights (one per evaluator), got {len(weights)}"
            )

        total = sum(weights)
        if weights and total <= 0:
            raise ValueError("Evaluator weights must sum to a positive value")
        self.weights = [w / total for w in weights]

    def evaluate(self, output: str, context: dict[str, Any] | None = None) -> EvaluationResult:
        """Evaluate *output* using all child evaluators and return weighted result.

        Args:
            output: The text to evaluate.
            context: Optional context dict passed to each evaluator.

        Returns:
            A single :class:`EvaluationResult` whose score is the weighted sum
            of the child scores, with all deficiencies concatenated and all
            sub-scores merged (later evaluators overwrite duplicate keys).
            The result passes only when the weighted score is at least 0.5
            and no child reported a deficiency.
        """
        ctx = context or {}
        all_deficiencies: list[str] = []
        all_sub_scores: dict[str, float] = {}
        weighted_score = 0.0

        for evaluator, weight in zip(self.evaluators, self.weights):
            result = evaluator.evaluate(output, ctx)
            weighted_score += result.score * weight
            all_deficiencies.extend(result.deficiencies)
            all_sub_scores.update(result.sub_scores)

        passed = weighted_score >= 0.5 and not all_deficiencies
        return EvaluationResult(
            score=weighted_score,
            passed=passed,
            deficiencies=all_deficiencies,
            sub_scores=all_sub_scores,
        )
206
+
207
+
208
class LoopedLLM:
    """Iterative refinement engine that loops an LLM call with evaluation.

    Each iteration sends a prompt to the provider, scores the response with
    an evaluator and, unless an exit condition fires, builds a feedback
    prompt listing the reported deficiencies for the next attempt.

    Args:
        provider: The LLM provider to use for completions.
        config: Loop configuration. Uses defaults if not provided.
    """

    def __init__(self, provider: LLMProvider, config: LoopConfig | None = None) -> None:
        self.provider = provider
        self.config = config or LoopConfig()
        self._exit_conditions: list[ExitConditionProtocol] = []

    def add_exit_condition(self, condition: ExitConditionProtocol) -> None:
        """Register an additional exit condition (e.g. :class:`BayesianExitCondition`).

        Args:
            condition: An object satisfying :class:`ExitConditionProtocol`.
        """
        self._exit_conditions.append(condition)

    def _check_exit_conditions(
        self, completed: int, score: float, scores: list[float]
    ) -> ExitReason | None:
        """Return the first exit reason that applies after *completed* iterations.

        Checks run in a fixed order: quality threshold, score plateau, then
        the registered custom conditions. Custom conditions are not consulted
        once an earlier check has already fired.
        """
        cfg = self.config

        if score >= cfg.quality_threshold:
            return ExitReason(
                "quality_threshold",
                f"Score {score:.2f} >= threshold {cfg.quality_threshold:.2f}"
                f" at iteration {completed}",
            )

        # A plateau needs two consecutive small deltas, hence three scores.
        if len(scores) >= 3:
            delta1 = abs(scores[-1] - scores[-2])
            delta2 = abs(scores[-2] - scores[-3])
            if delta1 < cfg.convergence_delta and delta2 < cfg.convergence_delta:
                return ExitReason(
                    "convergence",
                    f"Score plateaued (last deltas: {delta1:.4f}, {delta2:.4f}"
                    f" < {cfg.convergence_delta:.4f})",
                )

        for cond in self._exit_conditions:
            # Hand each condition its own copy so it cannot mutate our history.
            reason = cond.should_exit(completed, score, list(scores))
            if reason is not None:
                return reason
        return None

    @staticmethod
    def _build_next_prompt(initial_prompt: str, result: EvaluationResult) -> str:
        """Return the next prompt: the original plus feedback on deficiencies, if any."""
        if not result.deficiencies:
            return initial_prompt
        deficiency_lines = "\n".join(f"- {d}" for d in result.deficiencies)
        return (
            f"{initial_prompt}\n\n"
            f"Previous attempt scored {result.score:.2f}/1.0. Issues to fix:\n"
            f"{deficiency_lines}\n\n"
            f"Please address all issues in your next response."
        )

    def refine(
        self,
        initial_prompt: str,
        evaluator: Evaluator,
        context: dict[str, Any] | None = None,
        model: str = "gpt-4o-mini",
    ) -> RefinementResult:
        """Run the iterative refinement loop.

        The quality-threshold, convergence and custom exit conditions are only
        consulted once ``min_iterations`` iterations have completed; the
        wall-clock timeout is checked after every iteration. If no iteration
        runs at all (``max_iterations`` <= 0), an empty output with zero
        scores is returned.

        Args:
            initial_prompt: The initial prompt to send to the LLM.
            evaluator: An evaluator with an ``evaluate(output, context)`` method.
            context: Optional context dict passed to the evaluator.
            model: Model identifier for the LLM provider.

        Returns:
            :class:`RefinementResult` containing the best/final output and metrics.
        """
        cfg = self.config
        ctx = context or {}
        iterations: list[IterationRecord] = []
        scores: list[float] = []
        prompt = initial_prompt
        best_output = ""
        best_score = -1.0  # sentinel below any valid score; never reported as-is
        exit_reason: ExitReason | None = None
        loop_start = time.perf_counter()

        for i in range(cfg.max_iterations):
            iter_start = time.perf_counter()

            # 1. Call provider
            response = self.provider.complete(prompt, model)
            output = response.content

            # 2. Evaluate
            result = evaluator.evaluate(output, ctx)
            iter_latency = (time.perf_counter() - iter_start) * 1000.0

            # Track
            scores.append(result.score)
            iterations.append(
                IterationRecord(
                    iteration=i,
                    prompt=prompt,
                    output=output,
                    score=result.score,
                    passed=result.passed,
                    deficiencies=list(result.deficiencies),
                    latency_ms=iter_latency,
                )
            )

            if result.score > best_score:
                best_score = result.score
                best_output = output

            logger.debug(
                "loop_iteration",
                iteration=i,
                score=result.score,
                passed=result.passed,
                deficiencies=result.deficiencies,
            )

            # 3. Check exit conditions (only after min_iterations)
            if i + 1 >= cfg.min_iterations:
                exit_reason = self._check_exit_conditions(i + 1, result.score, scores)
                if exit_reason is not None:
                    break

            # Timeout: a hard wall-clock limit, enforced regardless of min_iterations
            elapsed = (time.perf_counter() - loop_start) * 1000.0
            if elapsed >= cfg.timeout_ms:
                exit_reason = ExitReason(
                    "timeout",
                    f"Elapsed {elapsed:.0f}ms >= timeout {cfg.timeout_ms:.0f}ms",
                )
                break

            # 4. Build feedback prompt for next iteration
            prompt = self._build_next_prompt(initial_prompt, result)

        # If we exhausted iterations without another exit reason
        if exit_reason is None:
            exit_reason = ExitReason(
                "max_iterations",
                f"Reached maximum of {cfg.max_iterations} iterations",
            )

        total_latency = (time.perf_counter() - loop_start) * 1000.0

        # With no iterations there is no "last" record to index; fall back to
        # the (empty) best output instead of raising IndexError.
        if cfg.best_of or not iterations:
            final_output = best_output
        else:
            final_output = iterations[-1].output

        ran = bool(scores)
        metrics = LoopMetrics(
            total_iterations=len(iterations),
            # Do not leak the -1.0 sentinel when nothing was scored.
            best_score=best_score if ran else 0.0,
            final_score=scores[-1] if ran else 0.0,
            converged=ran and best_score >= cfg.quality_threshold,
            exit_reason=exit_reason,
            total_latency_ms=total_latency,
            score_trajectory=scores,
        )

        logger.info(
            "refinement_complete",
            total_iterations=metrics.total_iterations,
            best_score=metrics.best_score,
            exit_reason=exit_reason.condition,
        )

        return RefinementResult(
            output=final_output,
            metrics=metrics,
            iterations=iterations,
        )
@@ -0,0 +1,72 @@
1
+ """Build evaluators from type strings — shared by MCP and CDV step scoring."""
2
+ from __future__ import annotations
3
+
4
+ from typing import Any
5
+
6
+ from loopllm.engine import CompositeEvaluator
7
+ from loopllm.evaluators import (
8
+ CompletenessEvaluator,
9
+ JSONSchemaEvaluator,
10
+ LengthEvaluator,
11
+ RegexEvaluator,
12
+ )
13
+
14
+
15
def build_evaluator(
    evaluator_type: str = "length",
    **kwargs: Any,
) -> (
    LengthEvaluator
    | RegexEvaluator
    | JSONSchemaEvaluator
    | CompositeEvaluator
    | CompletenessEvaluator
):
    """Build an evaluator from a type string and optional config.

    Args:
        evaluator_type: One of ``length``, ``json``, ``regex``, ``composite``,
            or ``completeness``. Any other value falls back to ``length``.
        **kwargs: Evaluator-specific options (``required_patterns``,
            ``forbidden_patterns``, ``required_fields``, ``quality_criteria``
            or its alias ``required_aspects``, ``min_words``, ``max_words``).

    Returns:
        A concrete evaluator instance. For ``composite``, one child evaluator
        is added per option group that was supplied, always followed by a
        length check; ``min_words`` defaults to 1 when other children exist
        and to 5 when the length check stands alone.
    """
    if evaluator_type == "json":
        return JSONSchemaEvaluator(
            required_fields=kwargs.get("required_fields", []),
            field_types={},
        )

    if evaluator_type == "regex":
        return RegexEvaluator(
            required=kwargs.get("required_patterns", []),
            forbidden=kwargs.get("forbidden_patterns", []),
        )

    if evaluator_type == "completeness":
        criteria = kwargs.get("quality_criteria") or kwargs.get("required_aspects") or []
        return CompletenessEvaluator(required_aspects=list(criteria))

    if evaluator_type == "composite":
        evals: list[Any] = []

        # Accept the same two spellings as the standalone completeness type.
        criteria = kwargs.get("quality_criteria") or kwargs.get("required_aspects") or []
        if criteria:
            evals.append(CompletenessEvaluator(required_aspects=list(criteria)))

        if kwargs.get("required_fields"):
            evals.append(JSONSchemaEvaluator(required_fields=kwargs["required_fields"]))

        # Honour forbidden patterns too, matching the standalone regex type;
        # previously they were silently dropped in composite mode.
        required = kwargs.get("required_patterns") or []
        forbidden = kwargs.get("forbidden_patterns") or []
        if required or forbidden:
            regex_options: dict[str, Any] = {"required": required}
            if forbidden:
                regex_options["forbidden"] = forbidden
            evals.append(RegexEvaluator(**regex_options))

        # The length check is stricter when it is the only child.
        default_min_words = 1 if evals else 5
        evals.append(
            LengthEvaluator(
                min_words=kwargs.get("min_words", default_min_words),
                max_words=kwargs.get("max_words", 10_000),
            )
        )
        return CompositeEvaluator(evaluators=evals)

    return LengthEvaluator(
        min_words=kwargs.get("min_words", 5),
        max_words=kwargs.get("max_words", 10_000),
    )