loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,412 @@
1
+ """Confidence-driven plan registry for scored task management.
2
+
3
+ Each plan tracks a rolling confidence score derived from:
4
+ - Prompt quality scores (from loopllm_intercept)
5
+ - Output scores (from loopllm_verify_output / evaluators)
6
+
7
+ When rolling_confidence drops below the plan's threshold, the registry
8
+ signals that the current task should be refined or the plan should be
9
+ replanned before proceeding.
10
+
11
+ This is the backbone for Shrimp-style task management where every action
12
+ is gated by accumulated evidence of quality.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import time
17
+ import uuid
18
+ from dataclasses import dataclass, field
19
+ from enum import Enum
20
+ from typing import Any
21
+
22
+
23
class TaskStatus(str, Enum):
    """Lifecycle states of a task within a plan.

    The ``str`` mix-in makes every member compare equal to its plain string
    value, so members can be built from, and compared against, raw strings.
    """

    # Default status of a freshly created task.
    PENDING = "pending"
    # Task has been handed out for execution.
    IN_PROGRESS = "in_progress"
    # Task finished with acceptable confidence.
    DONE = "done"
    # Task finished below the acceptable confidence.
    FAILED = "failed"
    # Task is flagged for another attempt because the plan needs replanning.
    REPLANNING = "replanning"
29
+
30
+
31
@dataclass
class TaskRecord:
    """One scored unit of work inside a plan.

    Attributes:
        id: Short unique identifier (first 8 hex characters of a UUID4 by
            default).
        title: Short description.
        description: Full task description / prompt.
        status: Current lifecycle status.
        prompt_score: Quality score of the task prompt (0–1), or ``None``
            while unscored.
        output_score: Quality score of the task output (0–1), or ``None``
            while unscored.
        confidence: Combined confidence derived from the available scores.
        replan_count: How many times this task has been re-attempted.
        created_at: Creation time as a ``time.time()`` timestamp.
        updated_at: Timestamp of the most recent modification.
        metadata: Arbitrary extra data (e.g. model used, latency).
    """

    id: str = field(default_factory=lambda: uuid.uuid4().hex[:8])
    title: str = ""
    description: str = ""
    status: TaskStatus = TaskStatus.PENDING
    prompt_score: float | None = None
    output_score: float | None = None
    confidence: float = 0.0
    replan_count: int = 0
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)
    metadata: dict[str, Any] = field(default_factory=dict)

    def update_confidence(
        self,
        prompt_weight: float = 0.35,
        output_weight: float = 0.65,
    ) -> None:
        """Refresh ``confidence`` from whichever scores are present.

        With both scores available the result is
        ``prompt_score * prompt_weight + output_score * output_weight``; the
        weights are applied as given and are not normalised.  With a single
        score, that score becomes the confidence unchanged.  With no scores
        the confidence is reset to ``0.0``.  ``updated_at`` is always
        refreshed.

        Args:
            prompt_weight: Weight of the prompt score (intent clarity).
            output_weight: Weight of the output score (result quality).
        """
        prompt, output = self.prompt_score, self.output_score

        if prompt is None and output is None:
            combined = 0.0
        elif output is None:
            combined = prompt
        elif prompt is None:
            combined = output
        else:
            combined = prompt * prompt_weight + output * output_weight

        self.confidence = combined
        self.updated_at = time.time()
81
+
82
+
83
@dataclass
class Plan:
    """A collection of ordered tasks with a rolling confidence score.

    Attributes:
        plan_id: Unique identifier (first 12 hex characters of a UUID4 by
            default).
        goal: The original high-level goal / prompt.
        tasks: Ordered list of task records.
        confidence_threshold: Minimum rolling confidence to continue without
            replanning.
        rolling_confidence: Recency-weighted average confidence across all
            scored tasks.
        replan_count: Total number of replan events for this plan.
        created_at: Creation time as a ``time.time()`` timestamp.
        metadata: Arbitrary extra data.
    """

    plan_id: str = field(default_factory=lambda: uuid.uuid4().hex[:12])
    goal: str = ""
    tasks: list[TaskRecord] = field(default_factory=list)
    confidence_threshold: float = 0.72
    rolling_confidence: float = 1.0  # starts optimistic
    replan_count: int = 0
    created_at: float = field(default_factory=time.time)
    metadata: dict[str, Any] = field(default_factory=dict)

    # -- task accessors ------------------------------------------------------

    def get_task(self, task_id: str) -> TaskRecord | None:
        """Return the first task whose ``id`` equals *task_id*, else ``None``."""
        return next((t for t in self.tasks if t.id == task_id), None)

    def pending_tasks(self) -> list[TaskRecord]:
        """Return all tasks still in ``PENDING`` status, in plan order."""
        return [t for t in self.tasks if t.status == TaskStatus.PENDING]

    def done_tasks(self) -> list[TaskRecord]:
        """Return all tasks in ``DONE`` status, in plan order."""
        return [t for t in self.tasks if t.status == TaskStatus.DONE]

    # -- confidence engine ---------------------------------------------------

    @staticmethod
    def _is_scored(task: TaskRecord) -> bool:
        """Return True if *task* carries any evidence of having been scored.

        A task counts as scored when either score is recorded, even if the
        resulting confidence is exactly ``0.0``.  A positive confidence with
        no recorded scores is still honoured so directly assigned
        confidences keep contributing.
        """
        return (
            task.prompt_score is not None
            or task.output_score is not None
            or task.confidence > 0.0
        )

    def recalculate_confidence(self, decay: float = 0.85) -> float:
        """Recalculate rolling confidence using exponential decay weighting.

        More recent task scores have higher weight.  Tasks with no scores
        yet are skipped (they don't penalise the plan until scored); a task
        that was scored ``0.0`` is *not* skipped and pulls the average down.
        With no scored tasks the rolling confidence resets to ``1.0``.

        Args:
            decay: Weight decay factor per task (0–1). Lower = more recency
                bias.

        Returns:
            Updated rolling_confidence value.
        """
        scored = [t for t in self.tasks if self._is_scored(t)]
        if not scored:
            self.rolling_confidence = 1.0
            return self.rolling_confidence

        # The last scored task gets weight decay**0 == 1.0; each earlier
        # task is discounted by one further factor of ``decay``.
        weights = [decay ** age for age in range(len(scored) - 1, -1, -1)]
        total_weight = sum(weights)
        self.rolling_confidence = sum(
            t.confidence * w for t, w in zip(scored, weights)
        ) / total_weight
        return self.rolling_confidence

    def needs_replan(self) -> bool:
        """Return True if rolling confidence is below the threshold."""
        return self.rolling_confidence < self.confidence_threshold

    def to_dict(self) -> dict[str, Any]:
        """Serialise the plan and its tasks into a plain dict.

        Confidence values are rounded to 4 decimal places.  Plan-level
        ``metadata`` and per-task timestamps are not included in the output.
        """
        return {
            "plan_id": self.plan_id,
            "goal": self.goal,
            "rolling_confidence": round(self.rolling_confidence, 4),
            "confidence_threshold": self.confidence_threshold,
            "needs_replan": self.needs_replan(),
            "replan_count": self.replan_count,
            "created_at": self.created_at,
            "task_count": len(self.tasks),
            "tasks": [
                {
                    "id": t.id,
                    "title": t.title,
                    "description": t.description,
                    "status": t.status.value,
                    "prompt_score": t.prompt_score,
                    "output_score": t.output_score,
                    "confidence": round(t.confidence, 4),
                    "replan_count": t.replan_count,
                    "metadata": t.metadata,
                }
                for t in self.tasks
            ],
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Plan":
        """Reconstruct a Plan from a serialised dict (e.g. loaded from store).

        Args:
            data: Mapping in the shape produced by :meth:`to_dict`.
                ``plan_id`` and each task's ``id`` are required; every other
                key falls back to the dataclass default.

        Returns:
            A new Plan populated with the stored tasks.

        Raises:
            KeyError: If ``plan_id`` or a task ``id`` is missing.
            ValueError: If a task ``status`` is not a known status value.
        """
        plan = cls(
            plan_id=data["plan_id"],
            goal=data.get("goal", ""),
            confidence_threshold=data.get("confidence_threshold", 0.72),
            rolling_confidence=data.get("rolling_confidence", 1.0),
            replan_count=data.get("replan_count", 0),
            created_at=data.get("created_at", time.time()),
        )
        for t in data.get("tasks", []):
            plan.tasks.append(
                TaskRecord(
                    id=t["id"],
                    title=t.get("title", ""),
                    description=t.get("description", ""),
                    status=TaskStatus(t.get("status", "pending")),
                    prompt_score=t.get("prompt_score"),
                    output_score=t.get("output_score"),
                    confidence=t.get("confidence", 0.0),
                    replan_count=t.get("replan_count", 0),
                    # Copy so the record never shares (or mutates) the
                    # caller's dict; a stored null becomes an empty dict.
                    metadata=dict(t.get("metadata") or {}),
                )
            )
        return plan
201
+
202
+
203
class PlanRegistry:
    """In-memory registry of active plans, keyed by plan ID.

    Plans live in a plain dict for the lifetime of the registry instance.
    Lookup failures and invalid scores are reported as ``{"error": ...}``
    dicts rather than raised.

    NOTE(review): no locking is performed here; confirm thread-safety
    expectations before calling from multiple threads.

    Usage::

        registry = PlanRegistry()
        plan = registry.create(goal="build a parser", tasks=[...])
        registry.score_prompt(plan.plan_id, task_id, score=0.81)
        registry.score_output(plan.plan_id, task_id, score=0.74)
        status = registry.get_status(plan.plan_id)
        # status["needs_replan"] → True/False
    """

    def __init__(self) -> None:
        self._plans: dict[str, Plan] = {}

    # -- internal helpers ----------------------------------------------------

    @staticmethod
    def _clamp_score(score: float) -> float | None:
        """Clamp *score* into [0, 1]; return ``None`` when it is NaN.

        ``max(0.0, min(1.0, nan))`` evaluates to ``1.0`` because every
        comparison against NaN is false, so NaN must be rejected explicitly
        or it would be recorded as a perfect score.
        """
        if score != score:  # NaN is the only value that is unequal to itself
            return None
        return max(0.0, min(1.0, score))

    def _locate(
        self,
        plan_id: str,
        task_id: str,
    ) -> tuple[Plan | None, TaskRecord | None, dict[str, Any] | None]:
        """Resolve a plan and one of its tasks.

        Returns:
            ``(plan, task, None)`` on success, otherwise a tuple whose last
            element is the error dict to hand back to the caller.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return None, None, {"error": f"Plan not found: {plan_id}"}
        task = plan.get_task(task_id)
        if task is None:
            return plan, None, {
                "error": f"Task not found: {task_id} in plan {plan_id}"
            }
        return plan, task, None

    # -- plan lifecycle ------------------------------------------------------

    def create(
        self,
        goal: str,
        tasks: list[dict[str, Any]],
        confidence_threshold: float = 0.72,
    ) -> Plan:
        """Create a new plan from a goal and list of task dicts.

        Each task dict should have at least: ``title``, ``description``.
        Optional: ``id``, ``metadata``.

        Args:
            goal: High-level goal text.
            tasks: List of task attribute dicts.
            confidence_threshold: Minimum rolling confidence before replan.

        Returns:
            The created :class:`Plan` (its ``plan_id`` is the registry key).
        """
        plan = Plan(goal=goal, confidence_threshold=confidence_threshold)
        for t in tasks:
            plan.tasks.append(TaskRecord(
                id=t.get("id", uuid.uuid4().hex[:8]),
                title=t.get("title", ""),
                description=t.get("description", ""),
                # Copy so later edits to the record's metadata never leak
                # back into the caller's input dict.
                metadata=dict(t.get("metadata") or {}),
            ))
        self._plans[plan.plan_id] = plan
        return plan

    def get(self, plan_id: str) -> Plan | None:
        """Return the plan registered under *plan_id*, or ``None``."""
        return self._plans.get(plan_id)

    def delete(self, plan_id: str) -> bool:
        """Remove a plan; return True if it existed."""
        return self._plans.pop(plan_id, None) is not None

    def list_plans(self) -> list[dict[str, Any]]:
        """Return the serialised form of every registered plan."""
        return [p.to_dict() for p in self._plans.values()]

    def restore_from_store(self, store: Any) -> int:
        """Load all persisted plans from a LoopStore into this registry.

        Should be called once at server startup.  Skips plan IDs that are
        already in memory (idempotent) and entries without a ``plan_id``.

        Args:
            store: Object exposing ``load_all_plans()`` that yields plan
                dicts (a :class:`loopllm.store.LoopStore` instance).

        Returns:
            Number of plans loaded.
        """
        loaded = 0
        for plan_dict in store.load_all_plans():
            pid = plan_dict.get("plan_id")
            if pid and pid not in self._plans:
                self._plans[pid] = Plan.from_dict(plan_dict)
                loaded += 1
        return loaded

    # -- scoring API ---------------------------------------------------------

    def score_prompt(
        self,
        plan_id: str,
        task_id: str,
        score: float,
    ) -> dict[str, Any]:
        """Record a prompt quality score for a task.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            score: Prompt quality score; clamped into 0–1.

        Returns:
            Updated plan status dict, or an ``{"error": ...}`` dict when the
            plan/task is unknown or the score is NaN.
        """
        plan, task, error = self._locate(plan_id, task_id)
        if error is not None:
            return error
        clamped = self._clamp_score(score)
        if clamped is None:
            return {"error": f"Invalid score: {score}"}

        task.prompt_score = clamped
        task.update_confidence()
        plan.recalculate_confidence()
        return plan.to_dict()

    def score_output(
        self,
        plan_id: str,
        task_id: str,
        score: float,
        mark_done: bool = True,
    ) -> dict[str, Any]:
        """Record an output quality score for a task.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            score: Output quality score; clamped into 0–1.
            mark_done: If True, set the task to DONE when its combined
                confidence reaches the plan threshold, otherwise FAILED.

        Returns:
            Updated plan status dict, or an ``{"error": ...}`` dict when the
            plan/task is unknown or the score is NaN.
        """
        plan, task, error = self._locate(plan_id, task_id)
        if error is not None:
            return error
        clamped = self._clamp_score(score)
        if clamped is None:
            return {"error": f"Invalid score: {score}"}

        task.output_score = clamped
        task.update_confidence()
        plan.recalculate_confidence()

        if mark_done:
            passed = task.confidence >= plan.confidence_threshold
            task.status = TaskStatus.DONE if passed else TaskStatus.FAILED

        # Replan bookkeeping runs regardless of ``mark_done`` and overrides
        # the DONE/FAILED status chosen above.
        if plan.needs_replan():
            plan.replan_count += 1
            task.replan_count += 1
            task.status = TaskStatus.REPLANNING

        return plan.to_dict()

    def mark_task(
        self,
        plan_id: str,
        task_id: str,
        status: str,
    ) -> dict[str, Any]:
        """Manually set a task's status.

        Args:
            plan_id: Plan identifier.
            task_id: Task identifier within the plan.
            status: One of the :class:`TaskStatus` string values.

        Returns:
            Updated plan status dict, or an ``{"error": ...}`` dict when the
            plan, task or status value is unknown.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        task = plan.get_task(task_id)
        if task is None:
            return {"error": f"Task not found: {task_id}"}
        try:
            task.status = TaskStatus(status)
        except ValueError:
            return {"error": f"Unknown status: {status}"}
        task.updated_at = time.time()
        return plan.to_dict()

    def get_status(self, plan_id: str) -> dict[str, Any]:
        """Return the serialised plan, or an error dict if it is unknown."""
        plan = self._plans.get(plan_id)
        if plan is None:
            return {"error": f"Plan not found: {plan_id}"}
        return plan.to_dict()

    def next_task(self, plan_id: str) -> dict[str, Any] | None:
        """Hand out the next pending task and mark it IN_PROGRESS.

        Returns:
            A summary dict for the first pending task, or ``None`` when the
            plan is unknown or has no pending tasks.
        """
        plan = self._plans.get(plan_id)
        if plan is None:
            return None
        pending = plan.pending_tasks()
        if not pending:
            return None
        t = pending[0]
        t.status = TaskStatus.IN_PROGRESS
        t.updated_at = time.time()
        return {
            "id": t.id,
            "title": t.title,
            "description": t.description,
            "replan_count": t.replan_count,
            "needs_replan": plan.needs_replan(),
            "rolling_confidence": round(plan.rolling_confidence, 4),
        }
401
+
402
+
403
+ # ---------------------------------------------------------------------------
404
+ # Process-level singleton — shared across all MCP tool calls
405
+ # ---------------------------------------------------------------------------
406
+
407
# Instantiated once when this module is first imported; every caller in the
# process receives this same object from get_registry().
_registry: PlanRegistry = PlanRegistry()


def get_registry() -> PlanRegistry:
    """Return the module-level :class:`PlanRegistry` singleton.

    The same instance is returned on every call.
    """
    return _registry