loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/tasks.py
ADDED
|
@@ -0,0 +1,599 @@
|
|
|
1
|
+
"""Task model and orchestrator for multi-step workflows.
|
|
2
|
+
|
|
3
|
+
Decomposes an :class:`IntentSpec` into a dependency-ordered graph
|
|
4
|
+
of subtasks, executes each through :class:`LoopedLLM`, and assembles
|
|
5
|
+
the final result.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import uuid
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from enum import Enum
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import structlog
|
|
16
|
+
|
|
17
|
+
from loopllm.elicitation import IntentRefiner, IntentSpec
|
|
18
|
+
from loopllm.engine import (
|
|
19
|
+
EvaluationResult,
|
|
20
|
+
LoopConfig,
|
|
21
|
+
LoopedLLM,
|
|
22
|
+
RefinementResult,
|
|
23
|
+
)
|
|
24
|
+
from loopllm.evaluators import LengthEvaluator
|
|
25
|
+
from loopllm.priors import AdaptivePriors, CallObservation
|
|
26
|
+
from loopllm.provider import LLMProvider
|
|
27
|
+
from loopllm.store import LoopStore
|
|
28
|
+
|
|
29
|
+
# Module-level structlog logger; the code below emits named events with keyword fields.
logger = structlog.get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Task state machine
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class TaskState(str, Enum):
    """Enumerates the lifecycle states a task can be in.

    Every member is also a ``str`` instance, so a member compares equal to
    its lowercase string value and can be looked up from that value with
    ``TaskState("pending")``.
    """

    # Not yet started / currently running.
    PENDING = "pending"
    IN_PROGRESS = "in_progress"

    # Finished states.
    COMPLETED = "completed"
    VERIFIED = "verified"
    FAILED = "failed"

    # Unable to proceed.
    BLOCKED = "blocked"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Task data model
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _new_task_id() -> str:
    """Return the first 12 hex characters of a freshly generated UUID4."""
    return uuid.uuid4().hex[:12]


@dataclass
class Task:
    """One unit of work inside a task plan.

    Attributes:
        id: Identifier for the task; defaults to a 12-character hex string
            derived from a random UUID.
        parent_id: Identifier of the parent task, or ``None`` for a root task.
        title: Short label for the task.
        description: Full instructions for the task.
        state: Where the task currently is in its lifecycle.
        dependencies: Identifiers of the tasks that have to finish first.
        intent_spec: Structured specification attached to the task, if any.
        result: Refinement result, filled in once the task has been executed.
        metadata: Free-form extra data.
    """

    # Identity
    id: str = field(default_factory=_new_task_id)
    parent_id: str | None = None

    # Human-readable content
    title: str = ""
    description: str = ""

    # Scheduling
    state: TaskState = TaskState.PENDING
    dependencies: list[str] = field(default_factory=list)

    # Inputs / outputs
    intent_spec: IntentSpec | None = None
    result: RefinementResult | None = None
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class TaskPlan:
    """An ordered collection of tasks with dependency information.

    Attributes:
        tasks: All tasks in the plan.
        dependency_graph: Mapping of task ID → list of dependency IDs.
        estimated_total_cost: Rough cost estimate (token-based).
        session_id: ID of the elicitation session that produced this plan.
    """

    tasks: list[Task] = field(default_factory=list)
    dependency_graph: dict[str, list[str]] = field(default_factory=dict)
    estimated_total_cost: float = 0.0
    session_id: str = ""

    def execution_order(self) -> list[Task]:
        """Return tasks in topological order respecting dependencies.

        Uses Kahn's algorithm. For each task the dependency list comes from
        ``dependency_graph`` when the task ID is present there, otherwise
        from the task's own ``dependencies`` attribute. Dependency IDs that
        do not belong to any task in the plan are ignored.

        Returns:
            Tasks sorted so that each task appears after all its dependencies.
            Tasks that become ready at the same time keep first-in,
            first-out order.

        Raises:
            ValueError: If the dependency graph contains a cycle.
        """
        task_map = {t.id: t for t in self.tasks}
        in_degree: dict[str, int] = {t.id: 0 for t in self.tasks}
        dependents: dict[str, list[str]] = {t.id: [] for t in self.tasks}

        for task in self.tasks:
            deps = self.dependency_graph.get(task.id, task.dependencies)
            for dep_id in deps:
                if dep_id in dependents:
                    dependents[dep_id].append(task.id)
                    in_degree[task.id] += 1

        # ``ready`` is both the FIFO work queue and the final ordering: a
        # cursor walks it while newly unblocked tasks are appended, which
        # avoids the O(n) cost of popping from the front of a list.
        ready = [tid for tid, deg in in_degree.items() if deg == 0]
        cursor = 0
        while cursor < len(ready):
            tid = ready[cursor]
            cursor += 1
            for next_id in dependents[tid]:
                in_degree[next_id] -= 1
                if in_degree[next_id] == 0:
                    ready.append(next_id)

        # Any task never reaching in-degree zero is part of (or behind) a cycle.
        if len(ready) != len(self.tasks):
            msg = "Dependency graph contains a cycle"
            raise ValueError(msg)

        return [task_map[tid] for tid in ready]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Prompt templates for task decomposition
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
_DECOMPOSE_PROMPT = """\
|
|
139
|
+
You are a task decomposition assistant. Given the following structured
|
|
140
|
+
specification, break it into discrete subtasks that can be executed
|
|
141
|
+
independently (with explicit dependencies where needed).
|
|
142
|
+
|
|
143
|
+
Specification:
|
|
144
|
+
- Task type: {task_type}
|
|
145
|
+
- Prompt: {refined_prompt}
|
|
146
|
+
- Constraints: {constraints}
|
|
147
|
+
- Quality criteria: {quality_criteria}
|
|
148
|
+
- Decomposition hints: {decomposition_hints}
|
|
149
|
+
- Estimated complexity: {estimated_complexity}
|
|
150
|
+
|
|
151
|
+
Produce a JSON array of subtask objects. Each subtask has:
|
|
152
|
+
- "title": short label (3-8 words)
|
|
153
|
+
- "description": detailed instructions for the subtask
|
|
154
|
+
- "dependencies": list of titles of subtasks this one depends on (empty for independent)
|
|
155
|
+
- "estimated_complexity": float 0.0-1.0
|
|
156
|
+
|
|
157
|
+
If the task is simple enough to do in one step, return a single-element array.
|
|
158
|
+
Return ONLY the JSON array.
|
|
159
|
+
"""
|
|
160
|
+
|
|
161
|
+
_ASSEMBLE_PROMPT = """\
|
|
162
|
+
You are a result assembler. Given the following subtask results,
|
|
163
|
+
combine them into a single coherent output that addresses the original
|
|
164
|
+
prompt.
|
|
165
|
+
|
|
166
|
+
Original prompt: {original_prompt}
|
|
167
|
+
|
|
168
|
+
Subtask results:
|
|
169
|
+
{subtask_results}
|
|
170
|
+
|
|
171
|
+
Produce a single, cohesive output that integrates all subtask results.
|
|
172
|
+
Do not repeat the prompt or explain what you did — just produce the final output.
|
|
173
|
+
"""
|
|
174
|
+
|
|
175
|
+
_VERIFY_PROMPT = """\
|
|
176
|
+
You are a quality verifier. Check whether the following output
|
|
177
|
+
addresses all the requirements from the original specification.
|
|
178
|
+
|
|
179
|
+
Specification:
|
|
180
|
+
- Prompt: {refined_prompt}
|
|
181
|
+
- Quality criteria: {quality_criteria}
|
|
182
|
+
|
|
183
|
+
Output to verify:
|
|
184
|
+
\"\"\"
|
|
185
|
+
{output}
|
|
186
|
+
\"\"\"
|
|
187
|
+
|
|
188
|
+
For each quality criterion, rate it 0.0-1.0.
|
|
189
|
+
Return a JSON object with:
|
|
190
|
+
- "overall_score": float 0.0-1.0
|
|
191
|
+
- "criteria_scores": object mapping criterion → score
|
|
192
|
+
- "issues": list of strings describing problems (empty if none)
|
|
193
|
+
|
|
194
|
+
Return ONLY the JSON object.
|
|
195
|
+
"""
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# ---------------------------------------------------------------------------
|
|
199
|
+
# TaskOrchestrator
|
|
200
|
+
# ---------------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class TaskOrchestrator:
    """Decompose, execute, and verify multi-step LLM tasks.

    Integrates :class:`IntentRefiner` for elicitation, :class:`LoopedLLM`
    for per-subtask refinement, and :class:`AdaptivePriors` for learning
    optimal decomposition strategies.

    Args:
        provider: LLM provider for all calls.
        priors: Adaptive priors for learning (created automatically if not given).
        store: Optional persistent store.
        refiner: Optional intent refiner (created automatically if not given).
        model: Default model to use.
    """

    def __init__(
        self,
        provider: LLMProvider,
        priors: AdaptivePriors | None = None,
        store: LoopStore | None = None,
        refiner: IntentRefiner | None = None,
        model: str = "gpt-4o-mini",
    ) -> None:
        self.provider = provider
        # Compare against None explicitly so that a caller-supplied object is
        # never swapped for a fresh default merely because it is falsy.
        self.priors = priors if priors is not None else AdaptivePriors()
        self.store = store
        self.refiner = (
            refiner
            if refiner is not None
            else IntentRefiner(provider=provider, priors=self.priors, model=model)
        )
        self.model = model

    # -- decomposition -------------------------------------------------------

    def plan(self, spec: IntentSpec) -> TaskPlan:
        """Decompose an :class:`IntentSpec` into a :class:`TaskPlan`.

        Uses the LLM to generate subtasks with dependency ordering.
        Simple tasks (complexity < 0.3 and no decomposition hints) are kept
        as a single task without calling the LLM. In both cases the tasks
        are saved to the store when one is configured.

        Args:
            spec: The structured specification.

        Returns:
            A :class:`TaskPlan` with the subtasks and their dependency graph.
        """
        if spec.estimated_complexity < 0.3 and not spec.decomposition_hints:
            # Simple tasks don't need decomposition.
            tasks = [self._single_task(spec)]
        else:
            decompose_prompt = _DECOMPOSE_PROMPT.format(
                task_type=spec.task_type,
                refined_prompt=spec.refined_prompt,
                constraints=json.dumps(spec.constraints),
                quality_criteria=json.dumps(spec.quality_criteria),
                decomposition_hints=json.dumps(spec.decomposition_hints),
                estimated_complexity=spec.estimated_complexity,
            )
            response = self.provider.complete(decompose_prompt, self.model)
            tasks = self._parse_tasks(response.content.strip(), spec)

        task_plan = TaskPlan(
            tasks=tasks,
            dependency_graph=self._build_dependency_graph(tasks),
        )

        # Persist on every path: execute() later calls update_task_state()
        # for each task, including the single task of the shortcut above.
        self._persist_tasks(tasks, spec)

        logger.info("task_plan_created", task_count=len(tasks))
        return task_plan

    @staticmethod
    def _single_task(spec: IntentSpec) -> Task:
        """Build the one-task fallback that runs the refined prompt as-is."""
        return Task(
            title="Execute task",
            description=spec.refined_prompt,
            intent_spec=spec,
        )

    def _persist_tasks(self, tasks: list[Task], spec: IntentSpec) -> None:
        """Save every task to the store; does nothing when no store is set."""
        if self.store is None:
            return
        for task in tasks:
            self.store.save_task({
                "id": task.id,
                "parent_id": task.parent_id,
                "title": task.title,
                "description": task.description,
                "state": task.state.value,
                "dependencies": task.dependencies,
                "spec": {
                    "task_type": spec.task_type,
                    "refined_prompt": task.description,
                } if task.intent_spec is not None else None,
            })

    def _sync_state(self, task: Task) -> None:
        """Write the task's current state to the store, if one is set."""
        if self.store is not None:
            self.store.update_task_state(task.id, task.state.value)

    # -- execution -----------------------------------------------------------

    def execute(
        self, plan: TaskPlan, model: str | None = None
    ) -> dict[str, RefinementResult]:
        """Execute all tasks in a plan in dependency order.

        Each task is refined using :class:`LoopedLLM`. Outputs of
        dependencies that produced a result are appended to the dependent
        task's prompt as context. A task whose refinement raises is marked
        ``FAILED``, the error text is stored in ``task.metadata["error"]``,
        and execution continues with the next task.

        Args:
            plan: The task plan to execute.
            model: Model override (defaults to ``self.model``).

        Returns:
            Dict mapping task ID to :class:`RefinementResult` for every task
            that completed; failed tasks have no entry.

        Raises:
            ValueError: If the plan's dependency graph contains a cycle.
        """
        model = model or self.model
        results: dict[str, RefinementResult] = {}

        depth = self.priors.predict_optimal_depth("orchestrated_subtask", model)
        config = LoopConfig(
            max_iterations=max(depth, 2),
            quality_threshold=0.75,
        )

        for task in plan.execution_order():
            task.state = TaskState.IN_PROGRESS
            self._sync_state(task)

            logger.info("executing_task", task_id=task.id, title=task.title)

            # Build context from dependency results. Dependencies without a
            # result (failed or unknown) are skipped, so the task still runs.
            dep_ids = plan.dependency_graph.get(task.id, task.dependencies)
            dep_context = "".join(
                f"\n--- Result from '{dep_id}' ---\n{results[dep_id].output}\n"
                for dep_id in dep_ids
                if dep_id in results
            )

            prompt = task.description
            if dep_context:
                prompt = (
                    f"{task.description}\n\n"
                    f"Context from previous steps:\n{dep_context}"
                )

            evaluator = LengthEvaluator(min_words=5, max_words=10_000)
            loop = LoopedLLM(provider=self.provider, config=config)

            # Only the refinement call decides success or failure; the
            # bookkeeping afterwards must not flip a completed task to FAILED.
            try:
                result = loop.refine(prompt, evaluator, model=model)
            except Exception as exc:
                logger.error("task_failed", task_id=task.id, error=str(exc))
                task.state = TaskState.FAILED
                task.metadata["error"] = str(exc)
            else:
                task.result = result
                task.state = TaskState.COMPLETED
                results[task.id] = result
                self._record_observation(result, config, model)

            self._sync_state(task)

        return results

    def _record_observation(
        self, result: RefinementResult, config: LoopConfig, model: str
    ) -> None:
        """Feed one subtask outcome to the priors; failures are only logged."""
        try:
            obs = CallObservation(
                task_type="orchestrated_subtask",
                model_id=model,
                scores=result.metrics.score_trajectory,
                latencies_ms=[it.latency_ms for it in result.iterations],
                converged=result.metrics.converged,
                total_iterations=result.metrics.total_iterations,
                max_iterations=config.max_iterations,
                quality_threshold=config.quality_threshold,
            )
            self.priors.observe(obs)
        except Exception as exc:
            # Learning is best-effort and must not affect the task outcome.
            logger.warning("prior_update_failed", error=str(exc))

    # -- verification --------------------------------------------------------

    def verify(
        self,
        spec: IntentSpec,
        output: str,
    ) -> EvaluationResult:
        """Verify a combined output against the original spec.

        Uses the LLM (with ``self.model``) to check quality criteria, then
        parses the structured response into an :class:`EvaluationResult`.

        Args:
            spec: The original specification.
            output: The assembled output to verify.

        Returns:
            An :class:`EvaluationResult` with per-criterion scores.
        """
        verify_prompt = _VERIFY_PROMPT.format(
            refined_prompt=spec.refined_prompt,
            quality_criteria=json.dumps(spec.quality_criteria),
            output=output,
        )

        response = self.provider.complete(verify_prompt, self.model)
        return self._parse_verification(response.content)

    # -- full pipeline -------------------------------------------------------

    def run(
        self,
        prompt: str,
        model: str | None = None,
        answer_func: Any | None = None,
    ) -> RefinementResult:
        """Run the full pipeline: elicit → plan → execute → assemble.

        This is the main entry point for end-to-end task processing.

        Args:
            prompt: The user's original prompt.
            model: Model override.
            answer_func: Optional function for interactive elicitation.

        Returns:
            The final :class:`RefinementResult`: the single subtask result
            when exactly one subtask produced a result, otherwise the
            assembled result.
        """
        model = model or self.model

        # Step 1: Elicit intent
        logger.info("pipeline_elicit", prompt=prompt[:80])
        session = self.refiner.run_session(prompt, answer_func=answer_func)
        spec = session.refined_spec or IntentSpec(
            original_prompt=prompt, refined_prompt=prompt
        )

        # Step 2: Plan
        logger.info("pipeline_plan", task_type=spec.task_type)
        plan = self.plan(spec)
        plan.session_id = session.session_id

        # Step 3: Execute
        logger.info("pipeline_execute", task_count=len(plan.tasks))
        results = self.execute(plan, model=model)

        # Step 4: Assemble
        if not results:
            # Every subtask failed; assembly will run on empty input.
            logger.warning("pipeline_no_results", task_count=len(plan.tasks))

        if len(results) == 1:
            # Single result — return directly
            final_result = next(iter(results.values()))
        else:
            final_result = self._assemble(spec, plan, results, model)

        # Step 5: Learn from outcome
        self.refiner.observe_outcome(
            session, final_score=final_result.metrics.best_score
        )

        logger.info(
            "pipeline_complete",
            tasks=len(plan.tasks),
            best_score=final_result.metrics.best_score,
        )
        return final_result

    # -- assembly ------------------------------------------------------------

    def _assemble(
        self,
        spec: IntentSpec,
        plan: TaskPlan,
        results: dict[str, RefinementResult],
        model: str,
    ) -> RefinementResult:
        """Assemble subtask results into a single output.

        Subtask outputs are listed in execution order under their titles and
        handed to a two-iteration refinement loop.
        """
        subtask_text = "".join(
            f"\n--- {task.title} ---\n{results[task.id].output}\n"
            for task in plan.execution_order()
            if task.id in results
        )

        assemble_prompt = _ASSEMBLE_PROMPT.format(
            original_prompt=spec.original_prompt,
            subtask_results=subtask_text,
        )

        evaluator = LengthEvaluator(min_words=10, max_words=10_000)
        config = LoopConfig(max_iterations=2, quality_threshold=0.8)
        loop = LoopedLLM(provider=self.provider, config=config)

        return loop.refine(assemble_prompt, evaluator, model=model)

    # -- parsing helpers -----------------------------------------------------

    @staticmethod
    def _extract_json_span(raw: str, opener: str, closer: str) -> str | None:
        """Return the text from the first ``opener`` to the last ``closer``.

        This trims prose or code fences the model may have put around the
        JSON payload. Returns ``None`` when no such span exists.
        """
        start = raw.find(opener)
        end = raw.rfind(closer)
        if start >= 0 and end > start:
            return raw[start : end + 1]
        return None

    @staticmethod
    def _dependency_titles(item: dict[str, Any]) -> list[str]:
        """Return the string entries of an item's ``dependencies`` field.

        A bare string is treated as a one-element list; a missing, null or
        otherwise malformed value yields an empty list.
        """
        deps = item.get("dependencies")
        if isinstance(deps, str):
            deps = [deps]
        if not isinstance(deps, list):
            return []
        return [dep for dep in deps if isinstance(dep, str)]

    @staticmethod
    def _coerce_score(value: Any, default: float) -> float:
        """Convert ``value`` to a float clamped to [0.0, 1.0].

        Returns ``default`` when the value cannot be converted or is NaN.
        """
        try:
            score = float(value)
        except (TypeError, ValueError, OverflowError):
            return default
        if score != score:  # NaN is the only value not equal to itself
            return default
        return max(0.0, min(1.0, score))

    @staticmethod
    def _normalize_issues(value: Any) -> list[str]:
        """Coerce the ``issues`` field into a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [str(issue) for issue in value]
        return [str(value)]

    def _parse_tasks(
        self, raw: str, spec: IntentSpec
    ) -> list[Task]:
        """Parse LLM decomposition response into Task objects.

        Falls back to a single task running ``spec.refined_prompt`` when the
        response holds no parsable JSON array or no object entries. Entries
        that are not JSON objects are skipped. Dependencies are given as
        titles and resolved to task IDs; unknown titles, self-references and
        repeats are dropped.

        Args:
            raw: The raw model response.
            spec: The specification the tasks are derived from.

        Returns:
            A non-empty list of tasks.
        """
        span = self._extract_json_span(raw.strip(), "[", "]")
        if span is None:
            # Can't parse — create a single task
            return [self._single_task(spec)]

        try:
            items = json.loads(span)
        except json.JSONDecodeError:
            return [self._single_task(spec)]
        if not isinstance(items, list):
            return [self._single_task(spec)]

        # Keep each task paired with the item it came from, so skipped
        # entries cannot shift dependencies onto the wrong task.
        parsed: list[tuple[dict[str, Any], Task]] = []
        title_to_id: dict[str, str] = {}

        for item in items:
            if not isinstance(item, dict):
                continue
            raw_title = item.get("title")
            raw_description = item.get("description")
            task = Task(
                title="Subtask" if raw_title is None else str(raw_title),
                description="" if raw_description is None else str(raw_description),
                intent_spec=spec,
                metadata={"estimated_complexity": item.get("estimated_complexity", 0.5)},
            )
            title_to_id[task.title] = task.id
            parsed.append((item, task))

        # Resolve title-based dependencies to IDs
        for item, task in parsed:
            for dep_title in self._dependency_titles(item):
                dep_id = title_to_id.get(dep_title)
                if dep_id is None or dep_id == task.id:
                    # A self-reference would make the plan unorderable.
                    continue
                if dep_id not in task.dependencies:
                    task.dependencies.append(dep_id)

        tasks = [task for _, task in parsed]
        return tasks or [self._single_task(spec)]

    def _build_dependency_graph(
        self, tasks: list[Task]
    ) -> dict[str, list[str]]:
        """Build a dependency graph from task objects.

        Returns:
            Mapping of task ID to a copy of that task's dependency ID list.
        """
        return {task.id: list(task.dependencies) for task in tasks}

    def _parse_verification(self, raw: str) -> EvaluationResult:
        """Parse LLM verification response into an EvaluationResult.

        A response without a parsable JSON object yields a failing result
        with score 0.5. A missing, non-numeric or NaN ``overall_score``
        falls back to 0.5, and other scores are clamped to [0.0, 1.0].
        Criterion scores that cannot be converted to float are omitted. The
        result passes only when the score is at least 0.7 and no issues were
        reported.
        """
        span = self._extract_json_span(raw.strip(), "{", "}")
        if span is None:
            return EvaluationResult(
                score=0.5, passed=False,
                deficiencies=["Could not parse verification response"],
            )

        try:
            data = json.loads(span)
        except json.JSONDecodeError:
            data = None
        if not isinstance(data, dict):
            return EvaluationResult(
                score=0.5, passed=False,
                deficiencies=["Invalid verification JSON"],
            )

        score = self._coerce_score(data.get("overall_score"), 0.5)
        issues = self._normalize_issues(data.get("issues"))

        sub_scores: dict[str, float] = {}
        raw_sub_scores = data.get("criteria_scores")
        if isinstance(raw_sub_scores, dict):
            for criterion, value in raw_sub_scores.items():
                try:
                    sub_scores[str(criterion)] = float(value)
                except (TypeError, ValueError, OverflowError):
                    continue

        return EvaluationResult(
            score=score,
            passed=score >= 0.7 and not issues,
            deficiencies=issues,
            sub_scores=sub_scores,
        )
|