loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/priors.py
ADDED
|
@@ -0,0 +1,604 @@
|
|
|
1
|
+
"""Bayesian meta-learning layer for adaptive loop depth prediction."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import math
|
|
6
|
+
import tempfile
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import structlog
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Primitive prior distributions
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class BetaPrior:
    """Beta(alpha, beta) belief over the success rate of a binary outcome.

    Attributes:
        alpha: Pseudo-count of successes (1.0 by default).
        beta: Pseudo-count of failures (1.0 by default).
    """

    alpha: float = 1.0
    beta: float = 1.0

    @property
    def mean(self) -> float:
        """Return the distribution mean, ``alpha / (alpha + beta)``."""
        total = self.alpha + self.beta
        return self.alpha / total

    @property
    def variance(self) -> float:
        """Return the distribution variance, ``ab / ((a+b)^2 (a+b+1))``."""
        a, b = self.alpha, self.beta
        numerator = a * b
        denominator = (a + b) ** 2 * (a + b + 1)
        return numerator / denominator

    @property
    def confidence(self) -> float:
        """Return a confidence figure derived from the observation count.

        The count is ``n = alpha + beta - 2``, i.e. the pseudo-counts minus
        the two that the default ``Beta(1, 1)`` prior starts with. When
        ``n <= 0`` the result is ``0.0``; otherwise it is the logistic
        function applied to ``n / (n + 10)``, which places every non-zero
        result between 0.5 and roughly 0.731.
        """
        observed = self.alpha + self.beta - 2
        if observed <= 0:
            return 0.0
        saturation = observed / (observed + 10)
        return 1 / (1 + math.exp(-saturation))

    def update(self, success: bool) -> None:
        """Fold one binary outcome into the pseudo-counts.

        Args:
            success: Truthy increments ``alpha``; falsy increments ``beta``.
        """
        if success:
            self.alpha += 1
            return
        self.beta += 1

    def prob_above(self, threshold: float) -> float:
        """Approximate ``P(X > threshold)`` with a normal approximation.

        The Beta distribution is replaced by a normal with the same mean and
        variance; the variance is floored at ``1e-10`` before taking the
        square root so the z-score never divides by zero.

        Args:
            threshold: Value the sample is compared against.

        Returns:
            Approximate probability that a sample exceeds *threshold*.
        """
        sigma = math.sqrt(max(self.variance, 1e-10))
        z_score = (threshold - self.mean) / sigma
        # Upper tail of the standard normal: 1 - Phi(z), expressed with erf.
        return 0.5 * (1 - math.erf(z_score / math.sqrt(2)))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class NormalPrior:
    """Running normal estimate fed by Welford updates or an exponential average.

    Attributes:
        mean: Current mean estimate.
        variance: Current variance estimate.
        n_observations: Number of values passed to :meth:`update`.
        _m2: Welford accumulator (sum of squared deviations); only touched
            when decay is disabled.
        decay: Exponential decay factor. Values below 1.0 switch
            :meth:`update` to an exponential moving average; 1.0 (the
            default) keeps Welford's algorithm.
    """

    mean: float = 0.0
    variance: float = 1.0
    n_observations: int = 0
    _m2: float = 0.0
    decay: float = 1.0

    @property
    def confidence(self) -> float:
        """Return ``n / (n + 10)``, multiplied by ``decay`` when decay < 1.0.

        Returns ``0.0`` while no observation has been recorded.
        """
        count = self.n_observations
        if count == 0:
            return 0.0
        saturation = count / (count + 10)
        return saturation * self.decay if self.decay < 1.0 else saturation

    @property
    def std(self) -> float:
        """Return the standard deviation, with the variance floored at 1e-10."""
        floored = max(self.variance, 1e-10)
        return math.sqrt(floored)

    def update(self, value: float) -> None:
        """Fold a new observation into the estimate.

        With ``decay < 1.0`` the mean and variance follow an exponential
        moving average. Otherwise Welford's online algorithm is used, and the
        variance is only replaced once at least two observations exist.

        Args:
            value: The observed value.
        """
        if self.decay < 1.0:
            self.n_observations += 1
            weight = 1 - self.decay
            self.mean = self.decay * self.mean + weight * value
            # The residual is measured against the already-updated mean.
            residual = value - self.mean
            self.variance = self.decay * self.variance + weight * residual * residual
            return

        self.n_observations += 1
        before = value - self.mean
        self.mean += before / self.n_observations
        after = value - self.mean
        self._m2 += before * after
        if self.n_observations >= 2:
            # Sample variance (n - 1 denominator).
            self.variance = self._m2 / (self.n_observations - 1)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# Per-iteration and per-task profiles
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
@dataclass
class IterationProfile:
    """Bundle of priors describing a single iteration depth.

    Every field gets its own freshly constructed prior per instance.

    Attributes:
        score: Belief about the quality score at this iteration
            (starts at mean 0.3, variance 0.1).
        score_delta: Belief about the improvement over the previous
            iteration (starts at mean 0.1, variance 0.05).
        converge_prob: Belief about having converged by this iteration
            (starts as a default ``BetaPrior``).
        latency_ms: Belief about the wall-clock time of this iteration
            (starts at mean 2000, variance 500_000).
    """

    score: NormalPrior = field(
        default_factory=lambda: NormalPrior(mean=0.3, variance=0.1)
    )
    score_delta: NormalPrior = field(
        default_factory=lambda: NormalPrior(mean=0.1, variance=0.05)
    )
    converge_prob: BetaPrior = field(default_factory=BetaPrior)
    latency_ms: NormalPrior = field(
        default_factory=lambda: NormalPrior(mean=2000, variance=500_000)
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
@dataclass
class TaskModelPrior:
    """Everything believed about one ``(task_type, model_id)`` pair.

    Attributes:
        task_type: Identifier for the class of task.
        model_id: Identifier for the LLM model.
        created_at: Creation timestamp string (empty by default).
        updated_at: Last-update timestamp string (empty by default).
        total_calls: Number of refinement runs observed.
        iterations: Per-iteration profiles keyed by iteration index.
        optimal_depth: Belief about the optimal number of iterations
            (starts at mean 3.0, variance 2.0).
        overall_converge_rate: Belief about the overall convergence rate.
        first_call_quality: Belief about the score of the first LLM call
            (starts at mean 0.4, variance 0.1).
    """

    task_type: str = ""
    model_id: str = ""
    created_at: str = ""
    updated_at: str = ""
    total_calls: int = 0
    iterations: dict[int, IterationProfile] = field(default_factory=dict)
    optimal_depth: NormalPrior = field(
        default_factory=lambda: NormalPrior(mean=3.0, variance=2.0)
    )
    overall_converge_rate: BetaPrior = field(default_factory=BetaPrior)
    first_call_quality: NormalPrior = field(
        default_factory=lambda: NormalPrior(mean=0.4, variance=0.1)
    )

    def get_iteration(self, k: int) -> IterationProfile:
        """Fetch the profile for iteration *k*, seeding it on first access.

        A missing profile is created, stored in ``iterations`` and returned.
        Its priors encode diminishing returns:

        - score mean = ``min(0.3 + 0.15*k, 0.9)``
        - score_delta mean = ``max(0.15 - 0.03*k, 0.01)``

        Args:
            k: Zero-based iteration index.

        Returns:
            The (possibly newly created) profile stored under *k*.
        """
        try:
            return self.iterations[k]
        except KeyError:
            pass

        seeded_score = min(0.3 + 0.15 * k, 0.9)
        seeded_delta = max(0.15 - 0.03 * k, 0.01)
        fresh = IterationProfile(
            score=NormalPrior(mean=seeded_score, variance=0.1),
            score_delta=NormalPrior(mean=seeded_delta, variance=0.05),
            converge_prob=BetaPrior(),
            latency_ms=NormalPrior(mean=2000, variance=500_000),
        )
        self.iterations[k] = fresh
        return fresh
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
@dataclass
class CallObservation:
    """Record of one finished refinement run.

    All fields are optional at construction time; list and dict fields
    default to a new empty container per instance.

    Attributes:
        task_type: Identifier for the task class.
        model_id: Identifier for the LLM model.
        scores: Quality score of each iteration, in order.
        latencies_ms: Latency of each iteration in milliseconds, in order.
        converged: Whether the loop reached an acceptable result.
        total_iterations: Number of iterations that were executed.
        max_iterations: Iteration cap that was configured (default 5).
        quality_threshold: Quality target that was configured (default 0.8).
        prompt_tokens: Total prompt tokens consumed.
        completion_tokens: Total completion tokens consumed.
        metadata: Arbitrary extra data.
    """

    # Identity of the run.
    task_type: str = ""
    model_id: str = ""

    # Per-iteration measurements.
    scores: list[float] = field(default_factory=list)
    latencies_ms: list[float] = field(default_factory=list)

    # Outcome and configuration.
    converged: bool = False
    total_iterations: int = 0
    max_iterations: int = 5
    quality_threshold: float = 0.8

    # Token accounting and free-form extras.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
# Main adaptive priors manager
|
|
252
|
+
# ---------------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class AdaptivePriors:
    """Bayesian meta-learning manager that learns optimal loop depth from observations.

    Beliefs are kept per ``(task_type, model_id)`` pair in an in-memory dict
    and can optionally be persisted as JSON.

    Args:
        store_path: Optional filesystem path for JSON persistence. If the
            file already exists it is loaded during construction.
        autosave_every: :meth:`observe` persists to *store_path* whenever the
            observed pair's ``total_calls`` is a multiple of this value.
            Values below 1 disable auto-saving; use :meth:`save` instead.
    """

    # Largest iteration depth the predictors will consider.
    _MAX_DEPTH = 10

    # Depths returned by predict_optimal_depth() before enough runs exist.
    _DEFAULT_DEPTHS = {
        "decompose": 4,
        "resolve": 2,
        "assemble": 3,
        "validate": 1,
    }

    # Class-level fallback so instances created without __init__ still
    # auto-save at the historical interval.
    autosave_every: int = 10

    def __init__(
        self, store_path: Path | None = None, autosave_every: int = 10
    ) -> None:
        self.store_path = store_path
        self.autosave_every = autosave_every
        self._priors: dict[str, TaskModelPrior] = {}
        if store_path and store_path.exists():
            self._load()

    # -- key helpers ---------------------------------------------------------

    @staticmethod
    def _key(task_type: str, model_id: str) -> str:
        """Build the dict/JSON key for a (task, model) pair."""
        return f"{task_type}::{model_id}"

    def _get_or_create(self, task_type: str, model_id: str) -> TaskModelPrior:
        """Return the prior for the pair, registering a fresh one if absent.

        A new prior gets the current UTC time (ISO format) as both
        ``created_at`` and ``updated_at``.
        """
        key = self._key(task_type, model_id)
        if key not in self._priors:
            now = datetime.now(timezone.utc).isoformat()
            self._priors[key] = TaskModelPrior(
                task_type=task_type,
                model_id=model_id,
                created_at=now,
                updated_at=now,
            )
        return self._priors[key]

    # -- public API ----------------------------------------------------------

    def predict_optimal_depth(
        self, task_type: str, model_id: str, cost_weight: float = 0.5
    ) -> int:
        """Predict the optimal number of refinement iterations.

        Returns task-type defaults when fewer than 5 runs have been observed.
        Otherwise walks iteration profiles 1..10 and keeps the last depth
        whose expected benefit still exceeds the per-iteration cost.

        NOTE(review): ``observe`` stores the delta of the zero-based score
        index ``k`` in profile ``k``, so profile ``k`` appears to describe
        the (k+1)-th call. Confirm that returning ``k`` rather than ``k + 1``
        is intended.

        Args:
            task_type: Identifier for the task class.
            model_id: Identifier for the LLM model.
            cost_weight: Weight given to cost vs. quality (0 = quality only).

        Returns:
            Recommended number of iterations, always within [1, 10].
        """
        prior = self._get_or_create(task_type, model_id)

        if prior.total_calls < 5:
            return self._DEFAULT_DEPTHS.get(task_type, 3)

        # The cost of one more iteration does not depend on k.
        cost = 0.02 + cost_weight * 0.08

        best_k = 1
        for k in range(1, self._MAX_DEPTH + 1):
            delta_prior = prior.get_iteration(k).score_delta
            expected_delta = delta_prior.mean
            # Low confidence adds an exploration bonus of up to 0.1.
            benefit = expected_delta + (1 - delta_prior.confidence) * 0.1
            if benefit > cost and expected_delta > 0.01:
                best_k = k
            else:
                break

        # best_k only ever takes values from range(1, _MAX_DEPTH + 1).
        return best_k

    def should_continue(
        self,
        task_type: str,
        model_id: str,
        current_iteration: int,
        current_score: float,
        scores_so_far: list[float],
        quality_threshold: float = 0.8,
    ) -> bool:
        """Decide whether the loop should continue beyond the current iteration.

        During cold start (fewer than 3 observed runs) this is a plain
        threshold check. Afterwards the expected deltas of the remaining
        iterations are summed and treated as a normal variable; continuing
        is recommended when the probability of closing the gap to
        *quality_threshold* exceeds 0.3.

        Args:
            task_type: Identifier for the task class.
            model_id: Identifier for the LLM model.
            current_iteration: Current iteration number (1-based).
            current_score: Score of the current iteration.
            scores_so_far: All scores observed so far in this run (accepted
                for interface compatibility; not read by this method).
            quality_threshold: Target quality level.

        Returns:
            True if continuing is recommended.
        """
        prior = self._get_or_create(task_type, model_id)

        # Cold start
        if prior.total_calls < 3:
            return current_score < quality_threshold

        gap = quality_threshold - current_score
        if gap <= 0:
            return False

        # Sum expected deltas over the remaining plausible iterations; at
        # least one future iteration is always considered.
        cumulative_delta = 0.0
        cumulative_var = 0.0
        max_remaining = self._MAX_DEPTH - current_iteration
        last = current_iteration + max(max_remaining, 1)
        for k in range(current_iteration, last):
            delta_prior = prior.get_iteration(k).score_delta
            cumulative_delta += delta_prior.mean
            cumulative_var += delta_prior.variance
            if cumulative_delta >= gap:
                break

        std = math.sqrt(max(cumulative_var, 1e-10))
        z = (gap - cumulative_delta) / std
        p_bridge_gap = 0.5 * (1 - math.erf(z / math.sqrt(2)))

        return p_bridge_gap > 0.3

    def expected_improvement(
        self, task_type: str, model_id: str, at_iteration: int
    ) -> tuple[float, float]:
        """Return the expected score improvement and its uncertainty at a given iteration.

        Args:
            task_type: Identifier for the task class.
            model_id: Identifier for the LLM model.
            at_iteration: Iteration depth to query.

        Returns:
            ``(mean_delta, std_delta)`` tuple.
        """
        prior = self._get_or_create(task_type, model_id)
        delta_prior = prior.get_iteration(at_iteration).score_delta
        return delta_prior.mean, delta_prior.std

    def suggest_config(
        self, task_type: str, model_id: str, cost_weight: float = 0.5
    ) -> dict[str, Any]:
        """Suggest a :class:`LoopConfig`-compatible dict based on learned beliefs.

        With at least 5 observed runs the threshold is the expected score at
        the predicted depth plus half a standard deviation, capped at 0.95;
        otherwise it is 0.8.

        Args:
            task_type: Identifier for the task class.
            model_id: Identifier for the LLM model.
            cost_weight: Weight given to cost vs. quality.

        Returns:
            Dict with ``max_iterations``, ``quality_threshold``, and ``metadata``.
        """
        prior = self._get_or_create(task_type, model_id)
        depth = self.predict_optimal_depth(task_type, model_id, cost_weight)

        if prior.total_calls >= 5:
            score_prior = prior.get_iteration(depth).score
            threshold = min(score_prior.mean + 0.5 * score_prior.std, 0.95)
        else:
            threshold = 0.8

        return {
            "max_iterations": depth,
            "quality_threshold": round(threshold, 3),
            "metadata": {
                "source": "adaptive_priors",
                "confidence": round(prior.optimal_depth.confidence, 3),
                "total_observations": prior.total_calls,
            },
        }

    def observe(self, observation: CallObservation) -> None:
        """Record a completed refinement run and update all priors.

        Triggers an auto-save when a store path is configured and the pair's
        ``total_calls`` reaches a multiple of ``autosave_every``.

        Args:
            observation: The observation to incorporate.
        """
        prior = self._get_or_create(observation.task_type, observation.model_id)
        prior.total_calls += 1
        prior.updated_at = datetime.now(timezone.utc).isoformat()

        # Overall convergence
        prior.overall_converge_rate.update(observation.converged)

        # First-call quality
        if observation.scores:
            prior.first_call_quality.update(observation.scores[0])

        # Per-iteration updates
        for k, score in enumerate(observation.scores):
            profile = prior.get_iteration(k)
            profile.score.update(score)
            if k > 0:
                profile.score_delta.update(score - observation.scores[k - 1])
            # Latencies may be shorter than scores; skip missing entries.
            if k < len(observation.latencies_ms):
                profile.latency_ms.update(observation.latencies_ms[k])
            profile.converge_prob.update(score >= observation.quality_threshold)

        # Optimal depth: first iteration where score >= threshold, falling
        # back to the number of iterations that were executed.
        opt_depth = observation.total_iterations
        for k, score in enumerate(observation.scores):
            if score >= observation.quality_threshold:
                opt_depth = k + 1
                break
        prior.optimal_depth.update(float(opt_depth))

        # Auto-save
        every = self.autosave_every
        if self.store_path and every >= 1 and prior.total_calls % every == 0:
            self._save()

    def save(self) -> None:
        """Persist all priors now; does nothing when no store path is set."""
        self._save()

    def report(self, task_type: str, model_id: str) -> dict[str, Any]:
        """Generate a human-readable summary of beliefs for a (task, model) pair.

        Args:
            task_type: Identifier for the task class.
            model_id: Identifier for the LLM model.

        Returns:
            Dict with summary statistics.
        """
        prior = self._get_or_create(task_type, model_id)
        iteration_summaries: dict[str, Any] = {}
        for k in sorted(prior.iterations):
            p = prior.iterations[k]
            iteration_summaries[f"iter_{k}"] = {
                "expected_score": round(p.score.mean, 3),
                "expected_delta": round(p.score_delta.mean, 3),
                "converge_prob": round(p.converge_prob.mean, 3),
                "latency_ms": round(p.latency_ms.mean, 1),
            }
        return {
            "task_type": task_type,
            "model_id": model_id,
            "total_calls": prior.total_calls,
            "optimal_depth": round(prior.optimal_depth.mean, 2),
            "converge_rate": round(prior.overall_converge_rate.mean, 3),
            "first_call_quality": round(prior.first_call_quality.mean, 3),
            "confidence": round(prior.optimal_depth.confidence, 3),
            "iterations": iteration_summaries,
        }

    def report_all(self) -> list[dict[str, Any]]:
        """Generate summaries for all tracked (task, model) combinations.

        Returns:
            List of report dicts, ordered by storage key.
        """
        results: list[dict[str, Any]] = []
        for key in sorted(self._priors):
            p = self._priors[key]
            results.append(self.report(p.task_type, p.model_id))
        return results

    # -- persistence ---------------------------------------------------------

    def _serialize_prior(self, prior: TaskModelPrior) -> dict[str, Any]:
        """Convert one TaskModelPrior into its JSON-ready dict."""
        iterations_data: dict[str, Any] = {}
        for k, profile in prior.iterations.items():
            # JSON object keys must be strings.
            iterations_data[str(k)] = {
                "score": self._serialize_normal(profile.score),
                "score_delta": self._serialize_normal(profile.score_delta),
                "converge_prob": self._serialize_beta(profile.converge_prob),
                "latency_ms": self._serialize_normal(profile.latency_ms),
            }
        return {
            "task_type": prior.task_type,
            "model_id": prior.model_id,
            "created_at": prior.created_at,
            "updated_at": prior.updated_at,
            "total_calls": prior.total_calls,
            "iterations": iterations_data,
            "optimal_depth": self._serialize_normal(prior.optimal_depth),
            "overall_converge_rate": self._serialize_beta(prior.overall_converge_rate),
            "first_call_quality": self._serialize_normal(prior.first_call_quality),
        }

    def _deserialize_prior(self, pdata: dict[str, Any]) -> TaskModelPrior:
        """Rebuild one TaskModelPrior from the dict written by _serialize_prior."""
        iterations: dict[int, IterationProfile] = {}
        for k_str, idata in pdata.get("iterations", {}).items():
            iterations[int(k_str)] = IterationProfile(
                score=self._deserialize_normal(idata["score"]),
                score_delta=self._deserialize_normal(idata["score_delta"]),
                converge_prob=self._deserialize_beta(idata["converge_prob"]),
                latency_ms=self._deserialize_normal(idata["latency_ms"]),
            )
        return TaskModelPrior(
            task_type=pdata["task_type"],
            model_id=pdata["model_id"],
            created_at=pdata["created_at"],
            updated_at=pdata["updated_at"],
            total_calls=pdata["total_calls"],
            iterations=iterations,
            optimal_depth=self._deserialize_normal(pdata["optimal_depth"]),
            overall_converge_rate=self._deserialize_beta(pdata["overall_converge_rate"]),
            first_call_quality=self._deserialize_normal(pdata["first_call_quality"]),
        )

    def _save(self) -> None:
        """Persist all priors to *store_path* as JSON.

        The data is written to a temporary file in the destination directory
        and then moved over *store_path*, so a failed write never leaves a
        truncated store behind. The temporary file is removed on any error.
        """
        if not self.store_path:
            return

        data: dict[str, Any] = {
            key: self._serialize_prior(prior) for key, prior in self._priors.items()
        }

        self.store_path.parent.mkdir(parents=True, exist_ok=True)
        fd, tmp_path = tempfile.mkstemp(
            dir=self.store_path.parent, suffix=".tmp"
        )
        try:
            # Explicit encoding keeps the file independent of the locale.
            with open(fd, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            Path(tmp_path).replace(self.store_path)
        except BaseException:
            Path(tmp_path).unlink(missing_ok=True)
            raise

    def _load(self) -> None:
        """Load priors from *store_path*, if it is set and exists.

        Entries are decoded completely before any of them is installed, so a
        malformed file raises without leaving a partially loaded state.
        """
        if not self.store_path or not self.store_path.exists():
            return

        with open(self.store_path, encoding="utf-8") as f:
            data = json.load(f)

        loaded = {key: self._deserialize_prior(pdata) for key, pdata in data.items()}
        self._priors.update(loaded)

    @staticmethod
    def _serialize_normal(p: NormalPrior) -> dict[str, Any]:
        """Return the NormalPrior fields as a plain dict."""
        return {
            "mean": p.mean,
            "variance": p.variance,
            "n_observations": p.n_observations,
            "_m2": p._m2,
            "decay": p.decay,
        }

    @staticmethod
    def _deserialize_normal(d: dict[str, Any]) -> NormalPrior:
        """Rebuild a NormalPrior from the dict written by _serialize_normal."""
        return NormalPrior(**d)

    @staticmethod
    def _serialize_beta(p: BetaPrior) -> dict[str, Any]:
        """Return the BetaPrior fields as a plain dict."""
        return {"alpha": p.alpha, "beta": p.beta}

    @staticmethod
    def _deserialize_beta(d: dict[str, Any]) -> BetaPrior:
        """Rebuild a BetaPrior from the dict written by _serialize_beta."""
        return BetaPrior(**d)
|
loopllm/provider.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""Base abstractions for LLM providers."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class LLMUsage:
    """Token counts for a single LLM call.

    Every counter defaults to zero.

    Attributes:
        prompt_tokens: Number of tokens in the prompt.
        completion_tokens: Number of tokens in the completion.
        total_tokens: Total tokens consumed.
    """

    # Tokens sent to the model.
    prompt_tokens: int = 0
    # Tokens produced by the model.
    completion_tokens: int = 0
    # Stored as given; not derived from the two counters above.
    total_tokens: int = 0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class LLMResponse:
    """Provider-independent result of one completion call.

    ``content`` and ``model`` are required; the remaining fields have
    defaults.

    Attributes:
        content: The text content of the completion.
        model: Model identifier used for the call.
        usage: Token usage statistics (a fresh ``LLMUsage`` by default).
        latency_ms: Round-trip latency in milliseconds (0.0 by default).
    """

    # Required fields.
    content: str
    model: str

    # Optional fields.
    usage: LLMUsage = field(default_factory=LLMUsage)
    latency_ms: float = 0.0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class LLMProvider(ABC):
    """Interface every LLM provider has to implement.

    Subclasses must supply both the ``name`` property and ``complete``;
    the base class itself cannot be instantiated.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the human-readable name of this provider."""

    @abstractmethod
    def complete(self, prompt: str, model: str, **kwargs: Any) -> LLMResponse:
        """Run *prompt* against *model* and return a normalised response.

        Args:
            prompt: Text sent to the model.
            model: Identifier of the model to call.
            **kwargs: Extra provider-specific options.

        Returns:
            The provider's answer as an ``LLMResponse``.
        """
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Built-in LLM provider implementations."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from loopllm.providers.agent import AgentExecutionRequired, AgentPassthroughProvider
|
|
5
|
+
from loopllm.providers.mock import MockLLMProvider
|
|
6
|
+
from loopllm.providers.ollama import OllamaProvider
|
|
7
|
+
from loopllm.providers.openrouter import OpenRouterProvider
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"AgentExecutionRequired",
|
|
11
|
+
"AgentPassthroughProvider",
|
|
12
|
+
"MockLLMProvider",
|
|
13
|
+
"OllamaProvider",
|
|
14
|
+
"OpenRouterProvider",
|
|
15
|
+
]
|