loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/plan_registry.py
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
"""Confidence-driven plan registry for scored task management.
|
|
2
|
+
|
|
3
|
+
Each plan tracks a rolling confidence score derived from:
|
|
4
|
+
- Prompt quality scores (from loopllm_intercept)
|
|
5
|
+
- Output scores (from loopllm_verify_output / evaluators)
|
|
6
|
+
|
|
7
|
+
When rolling_confidence drops below the plan's threshold, the registry
|
|
8
|
+
signals that the current task should be refined or the plan should be
|
|
9
|
+
replanned before proceeding.
|
|
10
|
+
|
|
11
|
+
This is the backbone for Shrimp-style task management where every action
|
|
12
|
+
is gated by accumulated evidence of quality.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import time
|
|
17
|
+
import uuid
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from enum import Enum
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TaskStatus(str, Enum):
    """Lifecycle states a task can be in.

    The ``str`` mixin makes every member compare equal to its raw string
    value, and ``TaskStatus("done")`` looks a member up by that value.
    """

    # Waiting to be handed out by the registry.
    PENDING = "pending"
    # Handed out and currently being worked on.
    IN_PROGRESS = "in_progress"
    # Scored at or above the plan's confidence threshold.
    DONE = "done"
    # Scored below the plan's confidence threshold.
    FAILED = "failed"
    # The plan's rolling confidence fell below its threshold.
    REPLANNING = "replanning"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class TaskRecord:
    """A single task entry in a plan.

    Attributes:
        id: Unique short identifier (defaults to 8 hex chars of a UUID4).
        title: Short description.
        description: Full task description / prompt.
        status: Current lifecycle status.
        prompt_score: Quality score of the task prompt (0–1), or None if unscored.
        output_score: Quality score of the task output (0–1), or None if unscored.
        confidence: Combined confidence for this task (weighted average of the
            available scores; 0.0 while nothing has been scored).
        replan_count: How many times this task has been re-attempted.
        created_at: Creation timestamp (``time.time()``).
        updated_at: Timestamp of the most recent update to this record.
        metadata: Arbitrary extra data (e.g. model used, latency).
    """

    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    title: str = ""
    description: str = ""
    status: TaskStatus = TaskStatus.PENDING
    prompt_score: float | None = None
    output_score: float | None = None
    confidence: float = 0.0
    replan_count: int = 0
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)
    metadata: dict[str, Any] = field(default_factory=dict)

    def update_confidence(
        self,
        prompt_weight: float = 0.35,
        output_weight: float = 0.65,
    ) -> None:
        """Recalculate ``confidence`` from the available scores.

        Prompt score has lower weight — it measures intent clarity.
        Output score has higher weight — it measures actual result quality.
        If only one score is available, that score IS the confidence; with
        no scores the confidence is reset to 0.0.  ``updated_at`` is always
        refreshed.

        Args:
            prompt_weight: Relative weight of ``prompt_score``.
            output_weight: Relative weight of ``output_score``.
        """
        p = self.prompt_score
        o = self.output_score
        if p is not None and o is not None:
            weighted_sum = p * prompt_weight + o * output_weight
            total_weight = prompt_weight + output_weight
            # Normalise so the result stays a true weighted average (and so
            # within the range of the two scores) even when the caller's
            # weights do not sum to 1.  The defaults sum to exactly 1.0, so
            # default behaviour is unchanged.
            if total_weight > 0:
                self.confidence = weighted_sum / total_weight
            else:
                self.confidence = weighted_sum
        elif p is not None:
            self.confidence = p
        elif o is not None:
            self.confidence = o
        else:
            self.confidence = 0.0
        self.updated_at = time.time()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class Plan:
    """A collection of ordered tasks with a rolling confidence score.

    Attributes:
        plan_id: Unique identifier (defaults to 12 hex chars of a UUID4).
        goal: The original high-level goal / prompt.
        tasks: Ordered list of task records.
        confidence_threshold: Minimum rolling confidence to continue without replanning.
        rolling_confidence: Weighted rolling average across all scored tasks.
        replan_count: Total number of replan events for this plan.
        created_at: Creation timestamp.
        metadata: Arbitrary extra data.
    """

    plan_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    goal: str = ""
    tasks: list[TaskRecord] = field(default_factory=list)
    confidence_threshold: float = 0.72
    rolling_confidence: float = 1.0  # starts optimistic
    replan_count: int = 0
    created_at: float = field(default_factory=time.time)
    metadata: dict[str, Any] = field(default_factory=dict)

    # -- task accessors ------------------------------------------------------

    def get_task(self, task_id: str) -> TaskRecord | None:
        """Return the first task whose ``id`` equals *task_id*, or None."""
        return next((t for t in self.tasks if t.id == task_id), None)

    def pending_tasks(self) -> list[TaskRecord]:
        """Return the tasks still in PENDING status, in plan order."""
        return [t for t in self.tasks if t.status == TaskStatus.PENDING]

    def done_tasks(self) -> list[TaskRecord]:
        """Return the tasks in DONE status, in plan order."""
        return [t for t in self.tasks if t.status in (TaskStatus.DONE,)]

    # -- confidence engine ---------------------------------------------------

    def recalculate_confidence(self, decay: float = 0.85) -> float:
        """Recalculate rolling confidence using exponential decay weighting.

        Weights follow list position: the last scored task in ``tasks`` has
        weight 1.0 and each earlier scored task is multiplied by ``decay``
        once more.  Tasks with no scores yet are skipped (they don't
        penalise the plan until scored); with nothing scored the rolling
        confidence is reset to the optimistic 1.0.

        Args:
            decay: Weight decay factor per task (0–1). Lower = more recency bias.

        Returns:
            Updated rolling_confidence value.
        """
        # "Scored" means a score was actually recorded.  Testing only
        # ``confidence > 0.0`` would silently drop a task that genuinely
        # scored 0.0 — the worst possible result — and leave the plan
        # looking healthy.  The ``confidence > 0.0`` clause is kept so that
        # records carrying only a confidence value still count.
        scored = [
            t
            for t in self.tasks
            if t.prompt_score is not None
            or t.output_score is not None
            or t.confidence > 0.0
        ]
        if not scored:
            self.rolling_confidence = 1.0
            return self.rolling_confidence

        # Exponential weighting: most recent task has weight 1.0, prior tasks decay
        weights = [decay ** (len(scored) - 1 - i) for i in range(len(scored))]
        total_weight = sum(weights)
        self.rolling_confidence = sum(
            t.confidence * w for t, w in zip(scored, weights)
        ) / total_weight
        return self.rolling_confidence

    def needs_replan(self) -> bool:
        """Return True if rolling confidence is below the threshold."""
        return self.rolling_confidence < self.confidence_threshold

    def to_dict(self) -> dict[str, Any]:
        """Serialise the plan and its tasks into a plain dict.

        Confidence values are rounded to 4 decimal places.  The plan's
        ``metadata`` is included (as a shallow copy) so that it survives a
        ``to_dict`` / ``from_dict`` round trip.
        """
        return {
            "plan_id": self.plan_id,
            "goal": self.goal,
            "rolling_confidence": round(self.rolling_confidence, 4),
            "confidence_threshold": self.confidence_threshold,
            "needs_replan": self.needs_replan(),
            "replan_count": self.replan_count,
            "created_at": self.created_at,
            "metadata": dict(self.metadata),
            "task_count": len(self.tasks),
            "tasks": [
                {
                    "id": t.id,
                    "title": t.title,
                    "description": t.description,
                    "status": t.status.value,
                    "prompt_score": t.prompt_score,
                    "output_score": t.output_score,
                    "confidence": round(t.confidence, 4),
                    "replan_count": t.replan_count,
                    "metadata": t.metadata,
                }
                for t in self.tasks
            ],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Plan":
        """Reconstruct a Plan from a serialised dict (e.g. loaded from store).

        Only ``plan_id`` (and ``id`` for each task) is required; every other
        key falls back to the dataclass default.  Metadata dicts are copied
        so the new plan never shares mutable state with *data*, and a
        missing or ``None`` metadata value becomes an empty dict.

        Raises:
            KeyError: If ``plan_id`` or a task ``id`` is missing.
            ValueError: If a task ``status`` is not a valid TaskStatus value.
        """
        plan = cls(
            plan_id=data["plan_id"],
            goal=data.get("goal", ""),
            confidence_threshold=data.get("confidence_threshold", 0.72),
            rolling_confidence=data.get("rolling_confidence", 1.0),
            replan_count=data.get("replan_count", 0),
            created_at=data.get("created_at", time.time()),
            metadata=dict(data.get("metadata") or {}),
        )
        for t in data.get("tasks", []):
            record = TaskRecord(
                id=t["id"],
                title=t.get("title", ""),
                description=t.get("description", ""),
                status=TaskStatus(t.get("status", "pending")),
                prompt_score=t.get("prompt_score"),
                output_score=t.get("output_score"),
                confidence=t.get("confidence", 0.0),
                replan_count=t.get("replan_count", 0),
                metadata=dict(t.get("metadata") or {}),
            )
            plan.tasks.append(record)
        return plan
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class PlanRegistry:
    """In-memory registry of active plans.

    Plans are held in a plain ``dict`` keyed by plan id.  No explicit
    locking is performed and the scoring methods mutate plan/task objects
    in place, so callers that use one registry from several threads must
    serialise access to a given plan themselves.

    Usage::

        registry = PlanRegistry()
        plan = registry.create(goal="build a parser", tasks=[...])
        registry.score_prompt(plan.plan_id, task_id, score=0.81)
        registry.score_output(plan.plan_id, task_id, score=0.74)
        status = registry.get_status(plan.plan_id)
        # status["needs_replan"] → True/False
    """

    def __init__(self) -> None:
        self._plans: dict[str, Plan] = {}

    @staticmethod
    def _clamp_score(score: float) -> float | None:
        """Clamp *score* into [0, 1]; return None when it is NaN."""
        # NaN is the only value that is not equal to itself.  It has to be
        # rejected explicitly: ``max(0.0, min(1.0, nan))`` evaluates to 1.0,
        # which would record an undefined score as a perfect one.
        if score != score:
            return None
        return max(0.0, min(1.0, score))

    # -- plan lifecycle ------------------------------------------------------

    def create(
        self,
        goal: str,
        tasks: list[dict[str, Any]],
        confidence_threshold: float = 0.72,
    ) -> Plan:
        """Create a new plan from a goal and list of task dicts.

        Each task dict should have at least: ``title``, ``description``.
        Optional: ``id``, ``metadata``.  A missing or empty ``id`` is
        replaced by a generated one, and ``metadata`` is copied so the task
        does not share a dict with the caller.

        Args:
            goal: High-level goal text.
            tasks: List of task attribute dicts.
            confidence_threshold: Minimum rolling confidence before replan.

        Returns:
            The created :class:`Plan`.
        """
        plan = Plan(goal=goal, confidence_threshold=confidence_threshold)
        for spec in tasks:
            plan.tasks.append(TaskRecord(
                id=spec.get("id") or uuid.uuid4().hex[:8],
                title=spec.get("title", ""),
                description=spec.get("description", ""),
                metadata=dict(spec.get("metadata") or {}),
            ))
        self._plans[plan.plan_id] = plan
        return plan

    def get(self, plan_id: str) -> Plan | None:
        """Return the plan registered under *plan_id*, or None."""
        return self._plans.get(plan_id)

    def delete(self, plan_id: str) -> bool:
        """Remove a plan; return True if it existed."""
        return self._plans.pop(plan_id, None) is not None

    def list_plans(self) -> list[dict[str, Any]]:
        """Return the serialised form of every registered plan."""
        return [p.to_dict() for p in self._plans.values()]

    def restore_from_store(self, store: Any) -> int:
        """Load all persisted plans from a LoopStore into this registry.

        Should be called once at server startup.  Skips plan IDs that are
        already in memory (idempotent).

        Args:
            store: A :class:`loopllm.store.LoopStore` instance; only its
                ``load_all_plans()`` method is used here.

        Returns:
            Number of plans loaded.
        """
        loaded = 0
        for plan_dict in store.load_all_plans():
            pid = plan_dict.get("plan_id")
            if pid and pid not in self._plans:
                self._plans[pid] = Plan.from_dict(plan_dict)
                loaded += 1
        return loaded

    # -- scoring API ---------------------------------------------------------

    def score_prompt(
        self,
        plan_id: str,
        task_id: str,
        score: float,
    ) -> dict[str, Any]:
        """Record a prompt quality score for a task.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            score: Prompt quality score; clamped into 0–1.

        Returns:
            Updated plan status dict, or ``{"error": ...}`` when the plan or
            task is unknown or the score is NaN.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        task = plan.get_task(task_id)
        if task is None:
            return {"error": f"Task not found: {task_id} in plan {plan_id}"}

        clamped = self._clamp_score(score)
        if clamped is None:
            return {"error": f"Invalid score: {score}"}

        task.prompt_score = clamped
        task.update_confidence()
        plan.recalculate_confidence()
        return plan.to_dict()

    def score_output(
        self,
        plan_id: str,
        task_id: str,
        score: float,
        mark_done: bool = True,
    ) -> dict[str, Any]:
        """Record an output quality score for a task.

        After scoring, if the plan's rolling confidence is below its
        threshold, both replan counters are incremented and the task is
        put into REPLANNING — overriding any DONE/FAILED status set via
        ``mark_done``.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            score: Output quality score; clamped into 0–1.
            mark_done: If True, mark the task DONE when its combined
                confidence reaches the plan threshold, FAILED otherwise.

        Returns:
            Updated plan status dict, or ``{"error": ...}`` when the plan or
            task is unknown or the score is NaN.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        task = plan.get_task(task_id)
        if task is None:
            return {"error": f"Task not found: {task_id} in plan {plan_id}"}

        clamped = self._clamp_score(score)
        if clamped is None:
            return {"error": f"Invalid score: {score}"}

        task.output_score = clamped
        task.update_confidence()
        plan.recalculate_confidence()

        if mark_done:
            if task.confidence >= plan.confidence_threshold:
                task.status = TaskStatus.DONE
            else:
                task.status = TaskStatus.FAILED

        # Trigger replan bookkeeping if needed
        if plan.needs_replan():
            plan.replan_count += 1
            task.replan_count += 1
            task.status = TaskStatus.REPLANNING

        return plan.to_dict()

    def mark_task(
        self,
        plan_id: str,
        task_id: str,
        status: str,
    ) -> dict[str, Any]:
        """Manually set a task's status.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            status: A :class:`TaskStatus` value such as ``"done"``.

        Returns:
            Updated plan status dict, or ``{"error": ...}`` when the plan,
            task or status value is unknown.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        task = plan.get_task(task_id)
        if task is None:
            return {"error": f"Task not found: {task_id}"}
        try:
            task.status = TaskStatus(status)
        except ValueError:
            return {"error": f"Unknown status: {status}"}
        task.updated_at = time.time()
        return plan.to_dict()

    def get_status(self, plan_id: str) -> dict[str, Any]:
        """Return the plan's serialised status, or an error dict if unknown."""
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        return plan.to_dict()

    def next_task(self, plan_id: str) -> dict[str, Any] | None:
        """Return the next pending task, or None if the plan is complete/blocked.

        The returned task is moved to IN_PROGRESS as a side effect.  None is
        also returned when *plan_id* is unknown.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return None
        pending = plan.pending_tasks()
        if not pending:
            return None
        t = pending[0]
        t.status = TaskStatus.IN_PROGRESS
        t.updated_at = time.time()
        return {
            "id": t.id,
            "title": t.title,
            "description": t.description,
            "replan_count": t.replan_count,
            "needs_replan": plan.needs_replan(),
            "rolling_confidence": round(plan.rolling_confidence, 4),
        }
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
# ---------------------------------------------------------------------------
# Module-wide singleton: one registry shared by every MCP tool call in this
# process
# ---------------------------------------------------------------------------

_registry: PlanRegistry = PlanRegistry()


def get_registry() -> PlanRegistry:
    """Hand back the shared, module-level :class:`PlanRegistry`."""
    return _registry
|