loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopllm/mcp_server.py ADDED
@@ -0,0 +1,2657 @@
1
+ """MCP server exposing loop-llm tools to IDE agents.
2
+
3
+ Provides iterative refinement, intent elicitation, task orchestration,
4
+ Bayesian meta-learning, and prompt quality analysis as MCP tools for
5
+ VS Code Copilot, Cursor, and other MCP-compatible clients.
6
+
7
+ Usage::
8
+
9
+ loopllm mcp-server --provider ollama --model qwen2.5:0.5b
10
+ # or
11
+ python -m loopllm.mcp_server
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import os
17
+ import re
18
+ import time
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ from loopllm.agent_loop import AgentLoopController
23
+ from loopllm.elicitation import ElicitationSession, IntentRefiner, IntentSpec
24
+ from loopllm.engine import LoopConfig, LoopedLLM
25
+ from loopllm.evaluator_factory import build_evaluator
26
+ from loopllm.step_scorer import (
27
+ conservative_dual_verify,
28
+ legacy_self_report_score,
29
+ build_step_evaluator,
30
+ )
31
+ from loopllm.priors import CallObservation
32
+ from loopllm.provider import LLMProvider
33
+ from loopllm.plan_registry import get_registry
34
+ from loopllm.providers.agent import AgentPassthroughProvider
35
+ from loopllm.store import LoopStore, SQLiteBackedPriors
36
+ from loopllm.tasks import TaskOrchestrator
37
+
38
+ try:
39
+ from mcp.server.fastmcp import Context
40
+ except ImportError:
41
+ Context = Any # type: ignore[assignment,misc]
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Shared state — initialised once per MCP server process
45
+ # ---------------------------------------------------------------------------
46
+
47
# Slots filled in by ``_init_state()`` on first use; ``None`` means
# "not initialised yet".
_store: LoopStore | None = None
_priors: SQLiteBackedPriors | None = None
_provider: LLMProvider | None = None
_status_path: Path | None = None
_history_path: Path | None = None

# Model used when a tool call does not name one (``_init_state()`` may
# replace it with the LOOPLLM_MODEL environment value).
_default_model: str = "gpt-4o-mini"

# Created on demand by ``_get_agent_loop()``.
_agent_loop: AgentLoopController | None = None

# Session bookkeeping keyed by a string id.
_active_sessions: dict[str, dict[str, Any]] = {}

# Last per-dimension scores computed by _score_prompt_quality — used by SGD.
_last_prompt_dims: dict[str, float] = {}
57
+
58
+
59
def _init_state() -> None:
    """Lazily initialise the shared store, priors, provider and file paths.

    Configuration is read from the environment:

    * ``LOOPLLM_DB`` — database path (default ``~/.loopllm/store.db``).
    * ``LOOPLLM_MODEL`` — default model name (default ``gpt-4o-mini``).
    * ``LOOPLLM_PROVIDER`` — provider name (default ``agent``).

    The function is a no-op once ``_store`` has been set.

    Raises:
        ValueError: Propagated from ``_make_provider`` for an unknown or
            misconfigured provider. In that case no module state is
            modified, so a later call retries the initialisation.
    """
    global _store, _priors, _provider, _default_model, _status_path, _history_path  # noqa: PLW0603

    if _store is not None:
        return

    # Resolve the provider first: it is the step most likely to fail on bad
    # configuration, and failing here avoids opening the database at all.
    provider = _make_provider(os.environ.get("LOOPLLM_PROVIDER", "agent"))

    db_path = Path(os.environ.get("LOOPLLM_DB", str(Path.home() / ".loopllm" / "store.db")))
    db_path.parent.mkdir(parents=True, exist_ok=True)
    store = LoopStore(db_path=db_path)
    priors = SQLiteBackedPriors(store)

    # Publish everything only after every step succeeded.  ``_store`` doubles
    # as the "already initialised" flag, so it is assigned last: nobody can
    # observe a non-None store alongside a still-None provider or priors.
    _priors = priors
    _provider = provider
    _default_model = os.environ.get("LOOPLLM_MODEL", "gpt-4o-mini")
    _status_path = db_path.parent / "status.json"
    _history_path = db_path.parent / "prompt_history.json"
    _store = store
76
+
77
+
78
def _make_provider(name: str) -> LLMProvider:
    """Instantiate the LLM provider registered under *name*.

    Supported names are ``agent``, ``mock``, ``ollama`` and ``openrouter``.
    Provider modules other than the agent passthrough are imported only when
    requested.

    Raises:
        ValueError: If *name* is not a known provider, or if ``openrouter``
            is requested without ``OPENROUTER_API_KEY`` in the environment.
    """
    if name == "agent":
        return AgentPassthroughProvider()

    if name == "mock":
        from loopllm.providers.mock import MockLLMProvider

        canned = [
            '{"result": "initial attempt"}',
            '{"result": "improved", "details": "comprehensive", "quality": "high"}',
        ]
        return MockLLMProvider(responses=canned)

    if name == "ollama":
        from loopllm.providers.ollama import OllamaProvider

        return OllamaProvider(
            base_url=os.environ.get("OLLAMA_HOST", "http://localhost:11434"),
        )

    if name == "openrouter":
        from loopllm.providers.openrouter import OpenRouterProvider

        key = os.environ.get("OPENROUTER_API_KEY", "")
        if key:
            return OpenRouterProvider(api_key=key)
        raise ValueError("OPENROUTER_API_KEY env var is required for openrouter provider")

    raise ValueError(f"Unknown provider: {name}")
103
+
104
+
105
def _get_provider(provider_override: str | None = None) -> LLMProvider:
    """Return the provider to use for one call.

    A non-empty *provider_override* that differs from the ``LOOPLLM_PROVIDER``
    environment value (default ``agent``) yields a freshly built provider;
    otherwise the shared process-wide provider is returned.
    """
    _init_state()
    configured = os.environ.get("LOOPLLM_PROVIDER", "agent")
    if provider_override and provider_override != configured:
        return _make_provider(provider_override)
    shared = _provider
    assert shared is not None
    return shared
112
+
113
+
114
def _is_agent_mode(provider_override: str | None = None) -> bool:
    """Tell whether the provider resolved for this call is the agent passthrough."""
    active = _get_provider(provider_override)
    return isinstance(active, AgentPassthroughProvider)
117
+
118
+
119
def _build_agent_refine_response(
    prompt: str,
    evaluator_type: str,
    min_words: int,
    max_words: int,
    required_fields: list[str],
    required_patterns: list[str],
    max_iterations: int,
    quality_threshold: float,
) -> str:
    """Serialise a refinement request for the calling agent to carry out itself.

    Returns:
        A JSON document (2-space indent) with ``mode`` set to
        ``"agent_execute"``, natural-language ``instructions``, the original
        ``prompt``, the ``evaluator_criteria`` and the loop limits.
    """
    criteria: dict[str, Any] = {
        "evaluator_type": evaluator_type,
        "min_words": min_words,
        "max_words": max_words,
    }
    # Optional criteria are only emitted when they carry at least one entry.
    optional = (
        ("required_fields", required_fields),
        ("required_patterns", required_patterns),
    )
    for key, values in optional:
        if values:
            criteria[key] = values

    instructions = " ".join([
        "Generate a response to the prompt below.",
        "Self-evaluate your response against the evaluator_criteria",
        "and assign a quality score between 0.0 and 1.0.",
        f"If your score is below {quality_threshold}, revise and improve your response.",
        f"Repeat up to {max_iterations} times until your score is {quality_threshold} or higher.",
        "Return only your best response.",
    ])

    payload = {
        "mode": "agent_execute",
        "instructions": instructions,
        "prompt": prompt,
        "evaluator_criteria": criteria,
        "max_iterations": max_iterations,
        "quality_threshold": quality_threshold,
    }
    return json.dumps(payload, indent=2)
156
+
157
+
158
def _get_model(model_override: str | None = None) -> str:
    """Pick the model for one call: the override if truthy, else the default."""
    _init_state()
    if model_override:
        return model_override
    return _default_model
162
+
163
+
164
def _get_store() -> LoopStore:
    """Return the shared store, initialising module state on first use."""
    _init_state()
    store = _store
    assert store is not None
    return store
168
+
169
+
170
def _get_priors() -> SQLiteBackedPriors:
    """Return the shared priors object, initialising module state on first use."""
    _init_state()
    priors = _priors
    assert priors is not None
    return priors
174
+
175
+
176
def _get_agent_loop() -> AgentLoopController:
    """Return the single agent-loop controller, creating it over the shared priors if needed."""
    global _agent_loop  # noqa: PLW0603
    controller = _agent_loop
    if controller is None:
        controller = AgentLoopController(_get_priors())
        _agent_loop = controller
    return controller
182
+
183
+
184
def _build_evaluator(
    evaluator_type: str = "length",
    **kwargs: Any,
) -> Any:
    """Delegate to ``build_evaluator`` with the given type name and keyword options."""
    evaluator = build_evaluator(evaluator_type, **kwargs)
    return evaluator
190
+
191
+
192
def _result_to_dict(result: Any) -> dict[str, Any]:
    """Flatten a refinement result into a plain dict.

    Reads ``result.output`` plus the score, iteration, convergence and
    exit-reason fields found under ``result.metrics``.
    """
    metrics = result.metrics
    exit_reason = metrics.exit_reason
    flat: dict[str, Any] = {"output": result.output}
    flat["best_score"] = metrics.best_score
    flat["final_score"] = metrics.final_score
    flat["total_iterations"] = metrics.total_iterations
    flat["converged"] = metrics.converged
    flat["exit_reason"] = exit_reason.condition
    flat["exit_message"] = exit_reason.message
    flat["score_trajectory"] = metrics.score_trajectory
    return flat
204
+
205
+
206
+ # ---------------------------------------------------------------------------
207
+ # Prompt quality scoring (heuristic-first)
208
+ # ---------------------------------------------------------------------------
209
+
210
+
211
def _score_prompt_quality(prompt: str) -> dict[str, Any]:
    """Heuristically rate a prompt on five dimensions and combine them.

    The dimensions are specificity, constraint clarity, context completeness,
    ambiguity (higher is worse) and format specification, each in [0, 1].
    They are combined with learned weights when the store provides them and
    with built-in defaults otherwise.  As a side effect the raw dimension
    values are cached in ``_last_prompt_dims``.

    Returns:
        Dict with ``quality_score``, ``grade`` (A-F), ``gauge`` (text bar),
        ``dimensions``, ``word_count``, ``issues`` and ``suggestions``.
    """
    global _last_prompt_dims  # noqa: PLW0603

    tokens = prompt.split()
    n_words = len(tokens)
    lowered = prompt.lower()

    def count_hits(needles: tuple[str, ...]) -> int:
        # Number of needles that occur as a substring of the lowered prompt.
        return sum(1 for needle in needles if needle in lowered)

    def clamp01(value: float) -> float:
        return max(0.0, min(1.0, value))

    # --- Specificity: start perfect, penalise vague wording and brevity ---
    vague_hits = count_hits((
        "something", "stuff", "thing", "things", "whatever", "somehow",
        "do it", "make it", "fix it", "help me", "do this",
    ))
    specificity = 1.0
    specificity -= min(0.5, vague_hits * 0.15)
    if n_words < 5:
        specificity -= 0.3
    elif n_words < 10:
        specificity -= 0.15
    if n_words > 20:
        specificity += 0.1
    specificity = clamp01(specificity)

    # --- Constraint clarity: 0.2 per constraint keyword, capped at 1 ---
    constraint_hits = count_hits((
        "must", "should", "require", "need", "format", "length",
        "json", "csv", "return", "output", "include", "exclude",
        "type", "schema", "limit", "exactly", "at least", "at most",
        "no more than", "minimum", "maximum",
    ))
    constraint_clarity = min(1.0, constraint_hits * 0.2)

    # --- Context completeness: base 0.3 plus markers and length bonuses ---
    ctx_hits = count_hits((
        "because", "context", "background", "given that",
        "for example", "e.g.", "such as", "like this",
        "the goal is", "we need", "the purpose", "in order to",
    ))
    context_completeness = 0.3
    context_completeness += min(0.5, ctx_hits * 0.15)
    if n_words > 30:
        context_completeness += 0.1
    if n_words > 60:
        context_completeness += 0.1
    context_completeness = clamp01(context_completeness)

    # --- Ambiguity (0 = clear, 1 = highly ambiguous) ---
    ambiguity = 0.0
    if n_words < 15:
        pronouns = {"it", "this", "that", "they", "them", "those"}
        pronoun_hits = sum(1 for token in tokens if token.lower() in pronouns)
        ambiguity += min(0.4, pronoun_hits * 0.1)
    if all(mark not in prompt for mark in "?.!:"):
        ambiguity += 0.15
    if n_words < 5:
        ambiguity += 0.3
    if n_words < 8 and "?" in prompt:
        ambiguity += 0.15
    ambiguity = clamp01(ambiguity)

    # --- Format specification: 0.25 per format keyword, capped at 1 ---
    fmt_hits = count_hits((
        "json", "csv", "xml", "html", "markdown", "yaml", "list",
        "table", "code", "python", "javascript", "typescript",
        "function", "class", "paragraph", "bullet points", "steps",
    ))
    format_spec = min(1.0, fmt_hits * 0.25)

    # --- Weights: learned if the store has them, defaults otherwise ---
    fallback_weights: dict[str, float] = {
        "specificity": 0.25,
        "constraint_clarity": 0.20,
        "context_completeness": 0.20,
        "ambiguity": 0.20,
        "format_spec": 0.15,
    }
    try:
        _init_state()
        learned = _store.load_learned_weights() if _store else None
    except Exception:
        learned = None
    weights = fallback_weights if learned is None else learned

    # Cache raw dimension values for SGD step at feedback time.
    _last_prompt_dims = {
        "specificity": specificity,
        "constraint_clarity": constraint_clarity,
        "context_completeness": context_completeness,
        "ambiguity": ambiguity,
        "format_spec": format_spec,
    }

    # Ambiguity is a penalty, so it enters the composite inverted.
    composite = (
        weights["specificity"] * specificity
        + weights["constraint_clarity"] * constraint_clarity
        + weights["context_completeness"] * context_completeness
        + weights["ambiguity"] * (1.0 - ambiguity)
        + weights["format_spec"] * format_spec
    )
    composite = clamp01(composite)

    # --- Letter grade: first threshold the composite reaches ---
    grade = "F"
    for floor, letter in ((0.85, "A"), (0.70, "B"), (0.55, "C"), (0.40, "D")):
        if composite >= floor:
            grade = letter
            break

    # --- Issues & suggestions (a None issue means "suggestion only") ---
    findings = (
        (specificity < 0.5,
         "Prompt is vague — lacks specific details",
         "Add concrete details about what you need"),
        (constraint_clarity < 0.3,
         "No explicit constraints or requirements detected",
         "Specify output format, length, or quality requirements"),
        (context_completeness < 0.4,
         "Insufficient context provided",
         "Add background, examples, or explain the goal"),
        (ambiguity > 0.5,
         "High ambiguity — contains unclear references",
         "Replace pronouns (it, this, that) with specific nouns"),
        (format_spec < 0.3,
         None,
         "Consider specifying the desired output format"),
    )
    issues: list[str] = []
    suggestions: list[str] = []
    for triggered, issue, suggestion in findings:
        if not triggered:
            continue
        if issue is not None:
            issues.append(issue)
        suggestions.append(suggestion)

    # --- Ten-cell text gauge ---
    cells = int(composite * 10)
    bar = "\u2588" * cells + "\u2591" * (10 - cells)
    gauge_str = f"{bar} {int(composite * 100)}% [{grade}]"

    return {
        "quality_score": round(composite, 3),
        "grade": grade,
        "gauge": gauge_str,
        "dimensions": {
            name: round(value, 3) for name, value in (
                ("specificity", specificity),
                ("constraint_clarity", constraint_clarity),
                ("context_completeness", context_completeness),
                ("ambiguity", ambiguity),
                ("format_spec", format_spec),
            )
        },
        "word_count": n_words,
        "issues": issues,
        "suggestions": suggestions,
    }
366
+
367
+
368
def _classify_task_type(prompt: str) -> str:
    """Guess the task type of *prompt* from keywords alone (no LLM call).

    Rules are tried in a fixed order and the first match wins: substring
    keywords per category, then whole-word ``api`` / ``class``, then a bare
    ``write``.  Anything else is ``"general"``.
    """
    import re

    text = prompt.lower()

    # Ordered from most to least specific; plain substring matching.
    substring_rules: tuple[tuple[str, tuple[str, ...]], ...] = (
        ("creative_writing", ("write a story", "poem", "creative", "narrative",
                              "fiction", "blog post")),
        ("summarization", ("summarize", "summary", "tldr", "condense",
                           "shorten")),
        ("data_extraction", ("extract", "parse", "list all",
                             "pull out", "get all")),
        ("transformation", ("convert", "transform", "translate", "reformat",
                            "restructure", "refactor")),
        ("analysis", ("analyze", "analyse", "compare", "evaluate", "assess",
                      "review", "audit", "examine")),
        ("question_answering", ("what is", "how does", "explain", "why ",
                                "describe", "define")),
        ("code_generation", ("implement", "create a function", "build",
                             "code", "script", "program")),
    )
    for label, needles in substring_rules:
        for needle in needles:
            if needle in text:
                return label

    # Short words are matched on word boundaries so that, for example,
    # "api" does not fire inside "capital".
    if re.search(r"\bapi\b", text) or re.search(r"\bclass\b", text):
        return "code_generation"

    # A bare "write" (creative writing was already ruled out above).
    if "write" in text:
        return "code_generation"

    return "general"
411
+
412
+
413
def _estimate_complexity(prompt: str) -> float:
    """Score how complex a task looks, from 0.0 (trivial) to 1.0 (very complex).

    Contributions: prompt length, connective words, technical keywords,
    more than three commas, and additive phrases.
    """
    lowered = prompt.lower()
    tokens = prompt.split()

    connectives = {"and", "then", "also", "additionally", "plus"}
    n_connectives = sum(1 for token in tokens if token.lower() in connectives)

    technical_terms = (
        "api", "database", "auth", "deploy", "test", "migrate",
        "integrate", "concurrent", "async", "distributed",
        "microservice", "pipeline", "architecture",
    )
    n_technical = sum(1 for term in technical_terms if term in lowered)

    score = 0.0
    score += min(0.25, len(tokens) / 100)
    score += min(0.2, n_connectives * 0.05)
    score += min(0.3, n_technical * 0.1)
    if prompt.count(",") > 3:
        score += 0.1
    for phrase in ("in addition", "as well as", "furthermore"):
        if phrase in lowered:
            score += 0.1
            break
    return min(1.0, round(score, 3))
431
+
432
+
433
+ # ---------------------------------------------------------------------------
434
+ # Status file writer — enables near-real-time VS Code extension updates
435
+ # ---------------------------------------------------------------------------
436
+
437
+
438
def _write_status(tool_name: str, data: dict[str, Any]) -> None:
    """Write the latest tool status to the status file (best effort).

    The file holds a JSON object with ``timestamp`` (``time.time()``),
    ``tool`` and ``data``.  Nothing is written while ``_status_path`` is
    unset.  This function never raises on a failed write.

    Args:
        tool_name: Name recorded under ``tool``.
        data: Payload recorded under ``data``; values that JSON cannot
            represent are stringified.
    """
    if _status_path is None:
        return
    status = {
        "timestamp": time.time(),
        "tool": tool_name,
        "data": data,
    }
    try:
        # ``default=str`` only rescues unsupported *values*; unsupported dict
        # keys raise TypeError and circular structures raise ValueError.
        payload = json.dumps(status, indent=2, default=str)
        _status_path.write_text(payload, encoding="utf-8")
    except (OSError, TypeError, ValueError):
        pass  # Never crash on status write failure
451
+
452
+
453
def _append_history(record: dict[str, Any]) -> None:
    """Append one prompt record to the JSON history file (best effort).

    The file holds a JSON list.  A missing, unreadable, undecodable or
    non-list file is treated as an empty history.  Nothing is written while
    ``_history_path`` is unset.  This function never raises on a failed
    read or write.

    Args:
        record: Record to store.  It is modified in place: ``id`` is set to
            its 1-based position in the history and ``timestamp`` to the
            current local time (``%Y-%m-%dT%H:%M:%S``).
    """
    if _history_path is None:
        return
    try:
        history: list[dict[str, Any]] = []
        if _history_path.exists():
            try:
                loaded = json.loads(_history_path.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # ValueError covers both malformed JSON (JSONDecodeError) and
                # bytes that are not valid text (UnicodeDecodeError).
                loaded = []
            if isinstance(loaded, list):
                history = loaded
        record["id"] = len(history) + 1
        record["timestamp"] = time.strftime("%Y-%m-%dT%H:%M:%S")
        history.append(record)
        _history_path.write_text(
            json.dumps(history, indent=2, default=str), encoding="utf-8"
        )
    except (OSError, TypeError, ValueError):
        pass  # Never crash on history write failure
472
+
473
+
474
+ # ---------------------------------------------------------------------------
475
+ # Tool implementations (sync — FastMCP wraps them in threads)
476
+ # ---------------------------------------------------------------------------
477
+
478
+
479
def _tool_intercept(prompt: str) -> str:
    """Score a prompt and recommend which tool should handle it next.

    Routing, in priority order: quality below 0.4 → ``elicit``; complexity
    above 0.6 → ``decompose``; quality below 0.6 → ``elicit_then_refine``;
    otherwise ``refine``.  The analysis is recorded in the store, appended
    to the JSON history and written to the status file.

    Returns:
        JSON with ``route``, ``reason``, ``next_tool``, ``quality``,
        ``task_type``, ``complexity`` and ``prior_knowledge``.
    """
    store = _get_store()
    priors = _get_priors()
    model = _get_model()

    quality = _score_prompt_quality(prompt)
    task_type = _classify_task_type(prompt)
    complexity = _estimate_complexity(prompt)
    config = priors.suggest_config(task_type, model)

    score = quality["quality_score"]
    if score < 0.4:
        route, next_tool = "elicit", "loopllm_elicitation_start"
        reason = ("Prompt is too vague — clarifying questions will "
                  "significantly improve output quality")
    elif complexity > 0.6:
        route, next_tool = "decompose", "loopllm_plan_tasks"
        reason = (f"Complex task (complexity={complexity:.2f}) — "
                  "breaking into subtasks will produce better results")
    elif score < 0.6:
        route, next_tool = "elicit_then_refine", "loopllm_elicitation_start"
        reason = ("Prompt has gaps — quick elicitation then refinement "
                  "recommended")
    else:
        route, next_tool = "refine", "loopllm_refine"
        reason = "Prompt is clear enough — direct refinement loop"

    # Fields shared by the store record and the JSON history entry.
    dimensions = quality["dimensions"]
    shared_fields: dict[str, Any] = {
        "prompt_text": prompt[:500],
        "quality_score": score,
        "specificity": dimensions["specificity"],
        "constraint_clarity": dimensions["constraint_clarity"],
        "context_completeness": dimensions["context_completeness"],
        "ambiguity": dimensions["ambiguity"],
        "format_spec": dimensions["format_spec"],
        "task_type": task_type,
    }

    store.record_prompt({
        **shared_fields,
        "complexity": complexity,
        "route_chosen": route,
        "word_count": quality["word_count"],
        "grade": quality["grade"],
    })

    # Also write to JSON history for the VS Code extension
    _append_history({**shared_fields, "grade": quality["grade"]})

    _write_status("intercept", {
        "quality_score": score,
        "grade": quality["grade"],
        "gauge": quality["gauge"],
        "task_type": task_type,
        "route": route,
    })

    response = {
        "route": route,
        "reason": reason,
        "next_tool": next_tool,
        "quality": quality,
        "task_type": task_type,
        "complexity": complexity,
        "prior_knowledge": config,
    }
    return json.dumps(response, indent=2, default=str)
558
+
559
+
560
def _tool_prompt_stats(window: int = 50) -> str:
    """Return aggregate prompt statistics from the store as JSON.

    Adds a ``sparkline`` rendering of ``learning_curve`` (empty when there
    is no curve) and makes sure both ``avg_score`` and ``avg_quality`` are
    present when either one is.
    """
    stats = _get_store().get_prompt_stats(window=window)

    curve = stats.get("learning_curve", [])
    if curve:
        low, high = min(curve), max(curve)
        # A flat curve would divide by zero; fall back to a unit range.
        span = high - low if high > low else 1.0
        glyphs = " \u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588"
        stats["sparkline"] = "".join(
            glyphs[int((value - low) / span * 8)] for value in curve
        )
    else:
        stats["sparkline"] = ""

    # normalise key name: expose both avg_score (canonical) and avg_quality (legacy)
    has_quality = "avg_quality" in stats
    has_score = "avg_score" in stats
    if has_quality and not has_score:
        stats["avg_score"] = stats["avg_quality"]
    elif has_score and not has_quality:
        stats["avg_quality"] = stats["avg_score"]

    summary = {
        "total_prompts": stats.get("total_prompts", 0),
        "avg_score": stats.get("avg_score", 0),
        "trend": stats.get("trend", "no_data"),
    }
    _write_status("prompt_stats", summary)

    return json.dumps(stats, indent=2, default=str)
591
+
592
+
593
def _update_scoring_weights(rating: int, dims: dict[str, float]) -> dict[str, Any]:
    """Run one online SGD step on the scoring dimension weights.

    The 1-5 rating is mapped to a [0, 1] target, the squared-error gradient
    is taken w.r.t. each weight, and the updated weights are clipped to
    [0.05, 0.50] and then renormalised to sum to 1 before being persisted.
    Because renormalisation happens after clipping, a stored weight can end
    up slightly outside the clip range.

    A dimension that is absent from *dims* contributes nothing to the
    prediction and receives a zero gradient.

    Args:
        rating: User quality rating 1-5 (values outside are clamped).
        dims: Raw per-dimension scores from ``_score_prompt_quality``.

    Returns:
        ``{"skipped": True, "reason": ...}`` when *dims* is empty, otherwise
        a dict with the new ``weights``, the ``loss`` before the update and
        the running ``n_updates`` count.
    """
    if not dims:
        return {"skipped": True, "reason": "no dimension cache"}

    store = _get_store()
    current_weights = store.load_learned_weights() or {
        "specificity": 0.25,
        "constraint_clarity": 0.20,
        "context_completeness": 0.20,
        "ambiguity": 0.20,
        "format_spec": 0.15,
    }

    target = (max(1, min(5, rating)) - 1) / 4.0  # map [1, 5] → [0.0, 1.0]

    # Effective input per weight.  Ambiguity is a penalty, so it enters
    # inverted.  The same map feeds both the prediction and the gradient so
    # that a partially filled ``dims`` is handled identically in both.
    features = {
        k: (1.0 - dims[k]) if k == "ambiguity" else dims[k]
        for k in current_weights
        if k in dims
    }

    y_hat = sum(current_weights[k] * x for k, x in features.items())
    error = y_hat - target
    loss = error ** 2

    lr = 0.02  # learning rate
    new_weights: dict[str, float] = {}
    for k, w in current_weights.items():
        grad = 2.0 * error * features.get(k, 0.0)
        # Clip each weight so no dimension dominates or vanishes.
        new_weights[k] = max(0.05, min(0.50, w - lr * grad))

    # Renormalise so the weights sum to 1 (total >= 0.05 per weight, never 0).
    total = sum(new_weights.values())
    new_weights = {k: round(w / total, 6) for k, w in new_weights.items()}

    meta = store.get_learned_weight_meta()
    n_updates = meta["n_updates"] + 1
    store.save_learned_weights(new_weights, n_updates, loss)
    return {"weights": new_weights, "loss": round(loss, 6), "n_updates": n_updates}
647
+
648
+
649
def _tool_feedback(
    rating: int,
    task_type: str = "general",
    comment: str = "",
) -> str:
    """Record a 1-5 user rating and feed it back into the learning components.

    The rating is clamped to 1-5, passed to the priors as a single-iteration
    observation scored ``rating / 5``, and used for one weight-update step
    over the most recently cached prompt dimensions.

    Returns:
        JSON summary of what was recorded, including the weight update
        result and, when given, the comment.
    """
    priors = _get_priors()
    model = _get_model()

    clamped = max(1, min(5, rating))
    normalized = clamped / 5.0

    priors.observe(
        CallObservation(
            task_type=task_type,
            model_id=model,
            scores=[normalized],
            latencies_ms=[0.0],
            converged=normalized >= 0.8,
            total_iterations=1,
            max_iterations=1,
            quality_threshold=0.8,
        )
    )

    # --- Online SGD step: update scoring dimension weights. ---
    weight_update = _update_scoring_weights(clamped, _last_prompt_dims)

    summary: dict[str, Any] = {
        "recorded": True,
        "status": "ok",
        "rating": clamped,
        "normalized_score": normalized,
        "task_type": task_type,
        "model": model,
        "impact": ("Priors updated — future predictions for this task "
                   "type will be adjusted"),
        "weight_update": weight_update,
    }
    if comment:
        summary["comment"] = comment

    _write_status("feedback", {"rating": clamped, "task_type": task_type})

    return json.dumps(summary, indent=2)
696
+
697
+
698
def _tool_refine(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    max_iterations: int = 5,
    quality_threshold: float = 0.8,
    evaluator_type: str = "length",
    min_words: int = 5,
    max_words: int = 10000,
    required_fields: list[str] | None = None,
    required_patterns: list[str] | None = None,
) -> str:
    """Refine a prompt's output iteratively, or hand the loop to the agent.

    With the agent passthrough provider no generation happens here: the
    function returns an ``agent_execute`` instruction document.  With any
    other provider it runs the refinement loop, reports the run to the
    priors under task type ``mcp_refine`` and returns the result as JSON.
    """
    prov = _get_provider(provider)
    mod = _get_model(model)
    priors = _get_priors()

    fields = required_fields or []
    patterns = required_patterns or []

    # Agent passthrough: delegate generation to the calling IDE agent.
    if isinstance(prov, AgentPassthroughProvider):
        return _build_agent_refine_response(
            prompt=prompt,
            evaluator_type=evaluator_type,
            min_words=min_words,
            max_words=max_words,
            required_fields=fields,
            required_patterns=patterns,
            max_iterations=max_iterations,
            quality_threshold=quality_threshold,
        )

    evaluator = _build_evaluator(
        evaluator_type,
        min_words=min_words,
        max_words=max_words,
        required_fields=fields,
        required_patterns=patterns,
    )
    config = LoopConfig(
        max_iterations=max_iterations,
        quality_threshold=quality_threshold,
    )
    result = LoopedLLM(provider=prov, config=config).refine(prompt, evaluator, model=mod)
    metrics = result.metrics

    priors.observe(
        CallObservation(
            task_type="mcp_refine",
            model_id=mod,
            scores=metrics.score_trajectory,
            latencies_ms=[step.latency_ms for step in result.iterations],
            converged=metrics.converged,
            total_iterations=metrics.total_iterations,
            max_iterations=config.max_iterations,
            quality_threshold=config.quality_threshold,
        )
    )

    result_dict = _result_to_dict(result)

    _write_status("refine", {
        "best_score": metrics.best_score,
        "iterations": metrics.total_iterations,
        "converged": metrics.converged,
    })

    return json.dumps(result_dict, indent=2)
764
+
765
+
766
def _tool_run_pipeline(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    max_iterations: int = 5,
    quality_threshold: float = 0.8,
    skip_elicitation: bool = False,
) -> str:
    """Run the full pipeline: elicit -> decompose -> execute -> verify.

    With the agent passthrough provider the pipeline is described as an
    ``agent_execute`` instruction document for the caller to perform.
    Otherwise a ``TaskOrchestrator`` runs it and the result is returned as
    JSON.

    NOTE(review): ``skip_elicitation`` is accepted but never read in this
    function, and ``max_iterations`` / ``quality_threshold`` only affect the
    agent-mode instructions — confirm whether the orchestrator should
    receive them.
    """
    prov = _get_provider(provider)
    mod = _get_model(model)
    store = _get_store()
    priors = _get_priors()

    # Agent passthrough: return a structured pipeline prompt for the calling agent.
    if isinstance(prov, AgentPassthroughProvider):
        quality = _score_prompt_quality(prompt)
        task_type = _classify_task_type(prompt)
        complexity = _estimate_complexity(prompt)
        steps = [
            "Execute the following pipeline for the user's prompt:",
            "1. Elicit: identify any ambiguities and note clarifying questions.",
            f"2. Decompose: break into subtasks if complexity > 0.4 (detected: {complexity:.2f}).",
            "3. Execute: generate a high-quality response for each subtask.",
            f"4. Verify: self-evaluate the combined output against quality_threshold={quality_threshold}.",
            f"Iterate on any failing subtasks (max {max_iterations} iterations total).",
            "Return the final assembled result.",
        ]
        plan = {
            "mode": "agent_execute",
            "instructions": "\n".join(steps),
            "prompt": prompt,
            "task_type": task_type,
            "estimated_complexity": complexity,
            "prompt_quality": quality,
            "max_iterations": max_iterations,
            "quality_threshold": quality_threshold,
        }
        return json.dumps(plan, indent=2)

    orchestrator = TaskOrchestrator(
        provider=prov,
        priors=priors,
        store=store,
        model=mod,
    )
    result = orchestrator.run(prompt, model=mod, answer_func=None)
    metrics = result.metrics

    _write_status("pipeline", {
        "best_score": metrics.best_score,
        "iterations": metrics.total_iterations,
        "converged": metrics.converged,
    })

    return json.dumps(_result_to_dict(result), indent=2)
821
+
822
+
823
def _tool_classify_task(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
) -> str:
    """Return ``{"task_type": ...}`` as JSON for the given prompt.

    The agent passthrough provider uses the local keyword heuristic; any
    other provider classifies through an ``IntentRefiner``.
    """
    prov = _get_provider(provider)

    if isinstance(prov, AgentPassthroughProvider):
        # Agent / fast path: deterministic heuristic; no LLM call needed.
        task_type = _classify_task_type(prompt)
    else:
        mod = _get_model(model)
        priors = _get_priors()
        task_type = IntentRefiner(provider=prov, priors=priors, model=mod).classify_task(prompt)

    return json.dumps({"task_type": task_type})
839
+
840
+
841
# Static question templates used in agent mode (no LLM required).
# Each row is (question type, question text, fixed answer options or None).
_STATIC_QUESTION_TEMPLATES: dict[str, dict[str, Any]] = {
    question_type: {"text": text, "options": options}
    for question_type, text, options in (
        ("scope",
         "What exactly should the output cover? What are the scope boundaries?",
         None),
        ("format",
         "What output format is expected? (e.g., JSON, markdown, plain text, code)",
         ["JSON", "Markdown", "Plain text", "Code", "Other"]),
        ("constraints",
         "Are there any hard requirements, rules, or constraints to follow?",
         None),
        ("examples",
         "Can you give an example of the expected input and/or output?",
         None),
        ("audience",
         "Who will use this output? (e.g., developers, end-users, another system)",
         ["Developers", "End-users", "Another system/API", "General audience"]),
        ("priority",
         "If trade-offs are needed, what matters most?",
         ["Speed", "Accuracy", "Brevity", "Completeness"]),
        ("edge_cases",
         "How should edge cases, errors, or missing data be handled?",
         None),
    )
}

# Fallback order when no historical stats are available.
_STATIC_QUESTION_ORDER = [
    "scope",
    "format",
    "constraints",
    "examples",
    "audience",
    "priority",
    "edge_cases",
]
876
+
877
+
878
def _next_static_question(
    asked_types: set[str],
    max_questions: int,
    n_asked: int,
) -> dict[str, Any] | None:
    """Choose the next static clarifying question by Thompson Sampling.

    For each template type not yet in *asked_types*, one value is drawn from
    ``Beta(1 + positive_impact, 1 + negative_impact)`` using the per-type
    rows returned by the store's ``get_question_stats()``; the type with the
    largest draw is selected.  If the stats cannot be loaded, every type
    falls back to the uniform ``Beta(1, 1)`` prior.

    Args:
        asked_types: Question types that were already asked.
        max_questions: Upper bound on questions per session.
        n_asked: Number of questions asked so far.

    Returns:
        A dict with ``text``, ``question_type``, ``options`` and
        ``information_gain`` (the winning draw, rounded to 4 places), or
        ``None`` when the budget is used up or no unasked type remains.
    """
    import random  # stdlib — kept local, as in the original block

    if n_asked >= max_questions:
        return None

    remaining = [name for name in _STATIC_QUESTION_TEMPLATES if name not in asked_types]
    if not remaining:
        return None

    # Best effort: any failure while reading stats means "no history".
    try:
        history: dict[str, dict[str, Any]] = {
            record["question_type"]: record
            for record in _get_store().get_question_stats()
        }
    except Exception:
        history = {}

    draws: dict[str, float] = {}
    for name in remaining:
        record = history.get(name, {})
        successes = 1.0 + float(record.get("positive_impact", 0))
        failures = 1.0 + float(record.get("negative_impact", 0))
        draws[name] = random.betavariate(successes, failures)

    winner = max(draws, key=draws.get)
    template = _STATIC_QUESTION_TEMPLATES[winner]
    return {
        "text": template["text"],
        "question_type": winner,
        "options": template["options"],
        "information_gain": round(draws[winner], 4),
    }
921
+
922
+
923
def _tool_analyze_prompt(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    max_questions: int = 5,
) -> str:
    """Return ranked clarifying questions for *prompt* as a JSON array.

    Each element has ``question``, ``question_type``, ``options`` and
    ``information_gain``.

    * LLM-backed providers: the list comes from ``IntentRefiner.analyze``.
    * ``AgentPassthroughProvider``: question types are ranked from the
      ``issues`` reported by ``_score_prompt_quality`` (keyword match), the
      remaining types are appended in ``_STATIC_QUESTION_ORDER``, and the
      first *max_questions* are rendered from the static templates with a
      linearly decreasing ``information_gain`` (0.5, 0.45, ...).
    """
    backend = _get_provider(provider)

    if not isinstance(backend, AgentPassthroughProvider):
        model_name = _get_model(model)
        learned_priors = _get_priors()
        analyzer = IntentRefiner(
            provider=backend,
            priors=learned_priors,
            model=model_name,
            max_questions=max_questions,
        )
        rendered: list[dict[str, Any]] = []
        for item in analyzer.analyze(prompt):
            rendered.append({
                "question": item.text,
                "question_type": item.question_type,
                "options": item.options,
                "information_gain": round(item.information_gain, 4),
            })
        return json.dumps(rendered, indent=2)

    # Agent / fast path: derive the ranking from quality issues, no LLM.
    detected_issues = _score_prompt_quality(prompt).get("issues", [])
    keyword_to_type = (
        ("vague", "scope"),
        ("constraints", "constraints"),
        ("context", "examples"),
        ("ambiguity", "scope"),
        ("format", "format"),
    )
    ranked: list[str] = []
    for problem in detected_issues:
        lowered = problem.lower()
        for keyword, question_type in keyword_to_type:
            if keyword in lowered and question_type not in ranked:
                ranked.append(question_type)

    # Append every type that no issue claimed, in the default order.
    ranked += [qt for qt in _STATIC_QUESTION_ORDER if qt not in ranked]

    questions: list[dict[str, Any]] = []
    for position, question_type in enumerate(ranked[:max_questions]):
        template = _STATIC_QUESTION_TEMPLATES.get(question_type, {})
        questions.append({
            "question": template.get("text", question_type),
            "question_type": question_type,
            "options": template.get("options"),
            "information_gain": round(0.5 - position * 0.05, 4),
        })
    return json.dumps(questions, indent=2)
980
+
981
+
982
def _tool_elicitation_start(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    max_questions: int = 3,
) -> str:
    """Open an elicitation session and return its first question as JSON.

    The new ``ElicitationSession`` is registered in ``_active_sessions`` and
    persisted through the store's ``create_session``.  With an
    ``AgentPassthroughProvider`` the task type comes from
    ``_classify_task_type`` and the question from ``_next_static_question``;
    otherwise an ``IntentRefiner`` supplies both and is kept in the session
    state for later calls.

    Returns:
        JSON with ``session_id``, ``task_type``, ``question`` (``null`` when
        there is nothing to ask) and ``is_complete``.
    """
    backend = _get_provider(provider)
    model_name = _get_model(model)
    store = _get_store()
    learned_priors = _get_priors()

    session = ElicitationSession(original_prompt=prompt, model_id=model_name)
    agent_mode = isinstance(backend, AgentPassthroughProvider)

    if agent_mode:
        # Deterministic classification and a static question: no LLM call.
        session.task_type = _classify_task_type(prompt)
        pending = _next_static_question(set(), max_questions, 0)
        state: dict[str, Any] = {
            "session": session,
            "max_questions": max_questions,
            "agent_mode": True,
        }
    else:
        refiner = IntentRefiner(
            provider=backend,
            priors=learned_priors,
            model=model_name,
            max_questions=max_questions,
        )
        session.task_type = refiner.classify_task(prompt)
        pending = refiner.ask(session)
        state = {"session": session, "refiner": refiner}

    _active_sessions[session.session_id] = state
    store.create_session(
        session_id=session.session_id,
        original_prompt=prompt,
        task_type=session.task_type,
        model_id=model_name,
    )

    response: dict[str, Any] = {
        "session_id": session.session_id,
        "task_type": session.task_type,
    }

    question_payload: dict[str, Any] | None = None
    if agent_mode:
        if pending:
            from loopllm.elicitation import ClarifyingQuestion

            # Record the static question on the session so the next answer
            # can be attributed to its question type.
            session.questions_asked.append(
                ClarifyingQuestion(
                    text=pending["text"],
                    question_type=pending["question_type"],
                    options=pending["options"],
                    information_gain=pending["information_gain"],
                )
            )
            question_payload = pending
    elif pending is not None:
        question_payload = {
            "text": pending.text,
            "question_type": pending.question_type,
            "options": pending.options,
            "information_gain": round(pending.information_gain, 4),
        }
        session.questions_asked.append(pending)

    response["question"] = question_payload
    response["is_complete"] = question_payload is None
    return json.dumps(response, indent=2)
1065
+
1066
+
1067
def _tool_elicitation_answer(session_id: str, answer: str) -> str:
    """Store *answer* for the latest question and return the next one.

    The answer is filed under the question type of the last entry in
    ``session.questions_asked`` (if any).  The next question comes from
    ``_next_static_question`` in agent mode and from the session's
    ``IntentRefiner`` otherwise.  Answers and the asked questions are then
    written back through the store's ``update_session``.

    Returns:
        JSON with ``session_id``, ``question`` (or ``null``) and
        ``is_complete``; or ``{"error": ...}`` for an unknown session.
    """
    try:
        state = _active_sessions[session_id]
    except KeyError:
        return json.dumps({"error": f"Session not found: {session_id}"})

    session: ElicitationSession = state["session"]
    store = _get_store()

    if session.questions_asked:
        previous = session.questions_asked[-1]
        session.answers[previous.question_type] = answer

    response: dict[str, Any] = {"session_id": session_id}
    question_payload: dict[str, Any] | None = None

    if state.get("agent_mode"):
        # Agent mode: static questions only, no LLM needed.
        from loopllm.elicitation import ClarifyingQuestion

        already_asked = {item.question_type for item in session.questions_asked}
        budget = state.get("max_questions", 3)
        pending = _next_static_question(
            already_asked, budget, len(session.questions_asked)
        )
        if pending:
            session.questions_asked.append(
                ClarifyingQuestion(
                    text=pending["text"],
                    question_type=pending["question_type"],
                    options=pending["options"],
                    information_gain=pending["information_gain"],
                )
            )
            question_payload = pending
    else:
        refiner: IntentRefiner = state["refiner"]
        follow_up = refiner.ask(session)
        if follow_up is not None:
            question_payload = {
                "text": follow_up.text,
                "question_type": follow_up.question_type,
                "options": follow_up.options,
                "information_gain": round(follow_up.information_gain, 4),
            }
            session.questions_asked.append(follow_up)

    response["question"] = question_payload
    response["is_complete"] = question_payload is None

    asked_summary = []
    for item in session.questions_asked:
        asked_summary.append({"text": item.text, "type": item.question_type})
    store.update_session(
        session_id,
        answers=session.answers,
        questions=asked_summary,
    )

    return json.dumps(response, indent=2)
1127
+
1128
+
1129
def _tool_elicitation_finish(session_id: str) -> str:
    """Close an elicitation session and return its refined spec as JSON.

    Agent mode assembles the ``IntentSpec`` locally from the collected
    answers: ``constraints``/``format``/``scope`` answers become constraints
    and quality criteria, ``examples``/``audience`` are appended to the
    prompt as context, and ``priority`` is appended as a final sentence.
    Otherwise the session's ``IntentRefiner.refine`` builds the spec, or the
    original prompt is used unchanged when no answers were given.

    The spec is saved via the store's ``update_session`` and the session is
    removed from ``_active_sessions``.

    Returns:
        JSON with ``session_id`` and ``spec``; or ``{"error": ...}`` for an
        unknown session.
    """
    try:
        state = _active_sessions[session_id]
    except KeyError:
        return json.dumps({"error": f"Session not found: {session_id}"})

    session: ElicitationSession = state["session"]
    store = _get_store()

    if not state.get("agent_mode"):
        refiner: IntentRefiner = state["refiner"]
        if session.answers:
            spec = refiner.refine(session.original_prompt, session.answers)
        else:
            spec = IntentSpec(
                task_type=session.task_type,
                original_prompt=session.original_prompt,
                refined_prompt=session.original_prompt,
            )
    else:
        # Agent mode: synthesise the spec deterministically from answers.
        gathered = session.answers
        constraint_keys = ("constraints", "format", "scope")
        constraints = {
            key: value for key, value in gathered.items() if key in constraint_keys
        }

        quality_criteria = []
        if "format" in gathered:
            quality_criteria.append(f"Output must be in {gathered['format']} format")
        if "constraints" in gathered:
            quality_criteria.append(gathered["constraints"])
        if "scope" in gathered:
            quality_criteria.append(f"Scope: {gathered['scope']}")

        background = [
            f"{label}: {gathered[key]}"
            for key, label in (("examples", "Examples"), ("audience", "Audience"))
            if key in gathered
        ]
        context_str = ". ".join(background)

        if context_str:
            refined = f"{session.original_prompt}. {context_str}."
        else:
            refined = session.original_prompt
        if "priority" in gathered:
            refined += f" Prioritise: {gathered['priority']}."

        spec = IntentSpec(
            task_type=session.task_type,
            original_prompt=session.original_prompt,
            refined_prompt=refined,
            constraints=constraints,
            quality_criteria=quality_criteria,
            estimated_complexity=_estimate_complexity(refined),
        )

    session.refined_spec = spec

    spec_dict = {
        "task_type": spec.task_type,
        "refined_prompt": spec.refined_prompt,
        "constraints": spec.constraints,
        "quality_criteria": spec.quality_criteria,
        "decomposition_hints": spec.decomposition_hints,
        "estimated_complexity": spec.estimated_complexity,
    }
    store.update_session(session_id, spec=spec_dict)

    # The session is finished; drop the in-memory state.
    del _active_sessions[session_id]

    payload = {"session_id": session_id, "spec": spec_dict}
    return json.dumps(payload, indent=2)
1199
+
1200
+
1201
def _tool_plan_tasks(
    prompt: str,
    provider: str | None = None,
    model: str | None = None,
    estimated_complexity: float = 0.5,
) -> str:
    """Break *prompt* into a dependency-ordered task plan (JSON).

    With an ``AgentPassthroughProvider`` no plan is computed here; the
    response is an ``agent_execute`` envelope carrying decomposition
    instructions for the calling agent.  With any other provider a
    ``TaskOrchestrator`` plans an ``IntentSpec`` built from the prompt and
    the response lists ``task_count``, ``tasks`` and ``execution_order``.
    """
    backend = _get_provider(provider)

    if not isinstance(backend, AgentPassthroughProvider):
        model_name = _get_model(model)
        store = _get_store()
        learned_priors = _get_priors()

        intent = IntentSpec(
            original_prompt=prompt,
            refined_prompt=prompt,
            estimated_complexity=estimated_complexity,
        )
        planner = TaskOrchestrator(
            provider=backend, priors=learned_priors, store=store, model=model_name,
        )
        plan = planner.plan(intent)

        task_rows = []
        for task in plan.tasks:
            task_rows.append({
                "id": task.id,
                "title": task.title,
                "description": task.description,
                "state": task.state.value,
                "dependencies": task.dependencies,
            })
        ordered_ids = [task.id for task in plan.execution_order()]
        return json.dumps({
            "task_count": len(plan.tasks),
            "tasks": task_rows,
            "execution_order": ordered_ids,
        }, indent=2)

    # Agent passthrough: hand the decomposition back to the calling agent.
    task_type = _classify_task_type(prompt)
    instructions = " ".join([
        "Decompose the following task into an ordered list of subtasks.",
        "For each subtask provide: id (short unique string), title, description, "
        "and dependencies (list of ids that must complete first).",
        "Order them so they can be executed with dependencies satisfied.",
        "Return a JSON object with fields: task_count, tasks (array), execution_order (array of ids).",
    ])
    envelope = {
        "mode": "agent_execute",
        "instructions": instructions,
        "prompt": prompt,
        "task_type": task_type,
        "estimated_complexity": estimated_complexity,
    }
    return json.dumps(envelope, indent=2)
1257
+
1258
+
1259
def _tool_verify_output(
    output: str,
    original_prompt: str,
    quality_criteria: list[str] | None = None,
    provider: str | None = None,
    model: str | None = None,
) -> str:
    """Verify *output* against a prompt and optional quality criteria.

    Agent mode (``AgentPassthroughProvider``) runs a cheap keyword
    pre-check: a criterion passes when any of its words longer than three
    characters occurs (case-insensitively, as a substring) in the output.
    The score is the fraction of passing criteria, or ``0.9`` when no
    criteria were supplied; ``passed`` means ``score >= 0.7``.  The response
    also carries ``agent_execute`` instructions so the calling agent can do
    a deeper check.

    For any other provider the verdict comes from
    ``TaskOrchestrator.verify``.

    Args:
        output: Text to verify.
        original_prompt: Prompt the output was produced for.
        quality_criteria: Free-text criteria; ``None`` means no criteria.
        provider: Optional provider override.
        model: Optional model override.

    Returns:
        JSON with ``score``, ``passed``, ``deficiencies``, ``sub_scores``
        and ``feedback`` (plus ``mode``/``instructions`` in agent mode).
    """
    prov = _get_provider(provider)

    # Agent passthrough: score deterministically + ask agent for deeper check.
    if isinstance(prov, AgentPassthroughProvider):
        criteria = quality_criteria or []
        output_lower = output.lower()

        passed_criteria: list[str] = []
        deficiencies: list[str] = []
        for criterion in criteria:
            # Tokenise on word characters so attached punctuation does not
            # block a match: "Scope: REST API." yields "scope", "rest",
            # "api" rather than "scope:" and "api.".
            keywords = [
                word for word in re.findall(r"\w+", criterion.lower())
                if len(word) > 3
            ]
            if any(word in output_lower for word in keywords):
                passed_criteria.append(criterion)
            else:
                deficiencies.append(criterion)

        score = (len(passed_criteria) / len(criteria)) if criteria else 0.9
        agent_check = (
            f"Verify the following output against the original prompt and quality criteria.\n"
            f"Prompt: {original_prompt}\n"
            f"Criteria: {criteria}\n"
            f"Output: {output[:2000]}\n"
            f"List any deficiencies found and provide a quality score 0.0-1.0."
        )
        return json.dumps({
            "score": round(score, 3),
            "passed": score >= 0.7,
            "deficiencies": deficiencies,
            "sub_scores": {},
            "feedback": "Deterministic pre-check complete. Execute the instructions via the agent for deeper verification.",
            "mode": "agent_execute",
            "instructions": agent_check,
        }, indent=2)

    mod = _get_model(model)
    store = _get_store()
    priors = _get_priors()

    spec = IntentSpec(
        original_prompt=original_prompt,
        refined_prompt=original_prompt,
        quality_criteria=quality_criteria or [],
    )

    orchestrator = TaskOrchestrator(
        provider=prov, priors=priors, store=store, model=mod,
    )
    result = orchestrator.verify(spec, output)

    return json.dumps({
        "score": result.score,
        "passed": result.passed,
        "deficiencies": result.deficiencies,
        "sub_scores": result.sub_scores,
        "feedback": result.feedback,
    }, indent=2)
1318
+
1319
+
1320
def _tool_report(
    task_type: str | None = None,
    model_id: str | None = None,
) -> str:
    """Return learned priors and question-effectiveness stats as JSON.

    A single ``priors.report(task_type, model_id)`` entry is returned only
    when BOTH filters are given; in every other case (including when just
    one of them is set) the full ``priors.report_all()`` list is used.
    """
    learned_priors = _get_priors()
    store = _get_store()

    filtered = bool(task_type and model_id)
    if filtered:
        reports = [learned_priors.report(task_type, model_id)]
    else:
        reports = learned_priors.report_all()

    payload = {
        "priors": reports,
        "question_effectiveness": store.get_question_stats(),
    }
    return json.dumps(payload, indent=2, default=str)
1339
+
1340
+
1341
def _tool_suggest_config(
    task_type: str,
    model_id: str | None = None,
    cost_weight: float = 0.5,
) -> str:
    """Return the priors' suggested loop configuration as JSON.

    *model_id* falls back to ``_get_model()`` when empty or ``None``; the
    suggestion itself is whatever ``priors.suggest_config`` returns.
    """
    learned_priors = _get_priors()
    effective_model = model_id if model_id else _get_model()
    suggestion = learned_priors.suggest_config(task_type, effective_model, cost_weight)
    return json.dumps(suggestion, indent=2, default=str)
1351
+
1352
+
1353
+ # ---------------------------------------------------------------------------
1354
+ # Adaptive agent-loop tools — Bayesian stop/continue control for agent loops
1355
+ # ---------------------------------------------------------------------------
1356
+
1357
+
1358
def _tool_loop_start(
    goal: str,
    task_type: str = "general",
    model_id: str | None = None,
    quality_threshold: float | None = None,
    cost_weight: float = 0.5,
    evaluator_type: str = "composite",
    quality_criteria: list[str] | None = None,
    required_patterns: list[str] | None = None,
    required_fields: list[str] | None = None,
    max_wall_ms: float = 300_000.0,
    max_tokens: int = 0,
) -> str:
    """Start an adaptive agent-loop session and describe it as JSON.

    All arguments are forwarded to the agent-loop controller's ``start``.
    ``required_patterns`` and ``required_fields`` are passed on as extra
    evaluator keyword arguments only when non-empty, and *model_id* falls
    back to ``_get_model()``.

    Returns:
        JSON with the session id, the resolved settings reported by the
        controller, and a ``guidance`` string telling the agent how to
        drive ``loopllm_loop_step`` / ``loopllm_loop_end``.
    """
    controller = _get_agent_loop()
    effective_model = model_id or _get_model()

    # Only forward the optional evaluator inputs that were actually given.
    evaluator_extras: dict[str, Any] = {
        name: value
        for name, value in (
            ("required_patterns", required_patterns),
            ("required_fields", required_fields),
        )
        if value
    }

    session = controller.start(
        goal=goal,
        task_type=task_type,
        model_id=effective_model,
        quality_threshold=quality_threshold,
        cost_weight=cost_weight,
        evaluator_type=evaluator_type,
        quality_criteria=quality_criteria,
        max_wall_ms=max_wall_ms,
        max_tokens=max_tokens,
        **evaluator_extras,
    )

    guidance = (
        f"Run up to ~{session.suggested_budget} step(s). "
        "After each step, call loopllm_loop_step with step_output=<artifact> "
        "(test log, diff, summary). "
        "The server runs Conservative Dual-Verify (deterministic checks + separate critic) "
        "— do NOT pass your own score. "
        "Stop when the verdict says 'stop', then call loopllm_loop_end."
    )
    descriptor = {
        "session_id": session.session_id,
        "goal": session.goal,
        "task_type": session.task_type,
        "model_id": session.model_id,
        "suggested_budget": session.suggested_budget,
        "quality_threshold": round(session.quality_threshold, 3),
        "confidence": round(session.confidence, 3),
        "total_observations": session.total_observations,
        "evaluator_type": session.evaluator_type,
        "quality_criteria": session.quality_criteria,
        "guidance": guidance,
    }
    return json.dumps(descriptor, indent=2, default=str)
1415
+
1416
+
1417
async def _tool_loop_step(
    session_id: str,
    step_output: str = "",
    score: float | None = None,
    note: str = "",
    step_tokens: int = 0,
    ctx: Context[Any, Any, Any] | None = None,
) -> str:
    """Score one loop step and return the controller's verdict as JSON.

    Scoring source, in order of preference:

    1. *step_output* given: ``conservative_dual_verify`` scores the artifact
       with an evaluator built from the session's settings.
    2. only *score* given: ``legacy_self_report_score`` is used and a
       ``deprecation`` notice is attached.
    3. neither: an error object is returned.

    The resulting score is fed to the controller's ``step``; the scorer's
    metadata is merged into the verdict before serialisation.  Unknown
    sessions, scoring failures and controller ``ValueError``s are reported
    as ``{"error": ...}`` objects rather than raised.
    """
    controller = _get_agent_loop()
    try:
        session = controller.get_session(session_id)
    except KeyError as exc:
        return json.dumps({"error": str(exc)})

    if not step_output and score is None:
        return json.dumps({
            "error": "Provide step_output (preferred) or score (legacy).",
            "hint": (
                "Submit the step artifact (test output, diff, summary) as "
                "step_output. The server scores it via Conservative Dual-Verify."
            ),
        })

    if step_output:
        evaluator = build_step_evaluator(
            session.evaluator_type,
            session.quality_criteria,
            **session.evaluator_kwargs,
        )
        try:
            outcome = await conservative_dual_verify(
                step_output=step_output,
                goal=session.goal,
                quality_criteria=session.quality_criteria,
                evaluator=evaluator,
                ctx=ctx,
            )
        except Exception as exc:  # noqa: BLE001
            return json.dumps({
                "error": f"CDV scoring failed: {exc}",
                "hint": "Pass step_output with the step artifact for verified scoring.",
            })
        scoring_meta: dict[str, Any] = outcome.to_dict()
    else:
        # Legacy path: trust the caller's number but flag it as deprecated.
        outcome = legacy_self_report_score(score)
        scoring_meta = outcome.to_dict()
        scoring_meta["deprecation"] = (
            "Self-reported score is deprecated. Pass step_output for "
            "Conservative Dual-Verify scoring."
        )

    try:
        verdict = controller.step(
            session_id,
            outcome.final_score,
            note=note,
            step_output=step_output,
            step_tokens=step_tokens,
        )
    except (KeyError, ValueError) as exc:
        return json.dumps({"error": str(exc)})

    verdict.update(scoring_meta)
    return json.dumps(verdict, indent=2, default=str)
1486
+
1487
+
1488
def _tool_loop_end(session_id: str, converged: bool | None = None) -> str:
    """Close an agent-loop session and return the controller's summary.

    Delegates to the controller's ``end(session_id, converged)``.  A
    ``KeyError`` from the controller is reported as ``{"error": ...}``.
    """
    controller = _get_agent_loop()
    try:
        summary = controller.end(session_id, converged)
    except KeyError as exc:
        payload = json.dumps({"error": str(exc)})
    else:
        payload = json.dumps(summary, indent=2, default=str)
    return payload
1496
+
1497
+
1498
def _tool_loop_status(session_id: str) -> str:
    """Return the controller's status for an agent-loop session as JSON.

    A ``KeyError`` from the controller's ``status`` call is reported as
    ``{"error": ...}`` instead of propagating.
    """
    controller = _get_agent_loop()
    try:
        snapshot = controller.status(session_id)
    except KeyError as exc:
        payload = json.dumps({"error": str(exc)})
    else:
        payload = json.dumps(snapshot, indent=2, default=str)
    return payload
1506
+
1507
+
1508
def _tool_list_tasks(
    state: str | None = None,
    limit: int = 20,
) -> str:
    """Return tasks from the store as JSON.

    *state* and *limit* are passed straight to the store's ``get_tasks``.
    """
    rows = _get_store().get_tasks(state=state, limit=limit)
    return json.dumps(rows, indent=2, default=str)
1516
+
1517
+
1518
def _tool_show_task(task_id: str) -> str:
    """Return one stored task as JSON, or an error object if it is missing.

    The lookup is the store's ``get_task``; a ``None`` result is reported as
    ``{"error": "Task not found: <task_id>"}``.
    """
    record = _get_store().get_task(task_id)
    if record is not None:
        return json.dumps(record, indent=2, default=str)
    return json.dumps({"error": f"Task not found: {task_id}"})
1525
+
1526
+
1527
+ # ---------------------------------------------------------------------------
1528
+ # Plan Registry tools — confidence-driven task management
1529
+ # ---------------------------------------------------------------------------
1530
+
1531
+
1532
def _tool_plan_register(
    goal: str,
    tasks: list[dict[str, Any]],
    confidence_threshold: float = 0.72,
) -> str:
    """Register a new plan with the plan registry and persist it.

    The plan is created through ``registry.create`` and written to the
    store right away (``save_plan``) so it is available after a server
    restart.

    Args:
        goal: High-level goal text for the plan.
        tasks: Task dicts; each is expected to carry at least ``title`` and
            ``description`` (``id`` and ``metadata`` are optional).
        confidence_threshold: Threshold handed to the registry; per the
            original notes the plan is flagged ``needs_replan`` when its
            rolling confidence falls below it.

    Returns:
        The plan's ``to_dict()`` representation as a JSON string.
    """
    created = get_registry().create(
        goal=goal,
        tasks=tasks,
        confidence_threshold=confidence_threshold,
    )
    # Persist first, then serialise a fresh dict for the response.
    _get_store().save_plan(created.to_dict())
    return json.dumps(created.to_dict(), indent=2)
1564
+
1565
+
1566
def _tool_plan_update(
    plan_id: str,
    task_id: str,
    prompt_score: float | None = None,
    output_score: float | None = None,
    mark_done: bool = True,
) -> str:
    """Record prompt/output scores for a plan task and persist the plan.

    Each supplied score is applied through the registry (``score_prompt``
    first, then ``score_output``).  If either call returns a dict containing
    ``"error"``, that dict is returned immediately and nothing is persisted.
    When no score is supplied the registry's ``get_status`` is returned
    instead.

    Args:
        plan_id: The plan to update.
        task_id: The task within the plan.
        prompt_score: Prompt quality score (0–1), e.g. from loopllm_intercept.
        output_score: Output quality score (0–1), e.g. from
            loopllm_verify_output.
        mark_done: Forwarded to ``score_output``.

    Returns:
        JSON of the last registry result (or the status fallback).
    """
    registry = get_registry()

    scoring_steps = (
        (prompt_score, lambda: registry.score_prompt(plan_id, task_id, prompt_score)),
        (
            output_score,
            lambda: registry.score_output(
                plan_id, task_id, output_score, mark_done=mark_done
            ),
        ),
    )

    outcome: dict[str, Any] = {}
    for supplied, apply_score in scoring_steps:
        if supplied is None:
            continue
        outcome = apply_score()
        if "error" in outcome:
            return json.dumps(outcome, indent=2)

    if not outcome:
        outcome = registry.get_status(plan_id)

    # Persist the updated plan state, if the plan is known to the registry.
    store = _get_store()
    current = registry.get(plan_id)
    if current:
        store.save_plan(current.to_dict())

    return json.dumps(outcome, indent=2)
1620
+
1621
+
1622
def _tool_plan_list() -> str:
    """List all plans known to the registry with a confidence gauge.

    Plans saved by earlier server runs are first loaded through
    ``registry.restore_from_store``; every plan from ``list_plans`` is then
    summarised.

    Optional plan/task fields are read with defaults throughout, so a record
    without ``status``, ``title``, ``rolling_confidence`` or ``needs_replan``
    is summarised instead of raising ``KeyError``.  The gauge bar is clamped
    to 0–10 cells; the percentage and ``rolling_confidence`` show the raw
    value.

    Returns:
        JSON with ``total_plans`` and a ``plans`` list whose entries hold
        ``plan_id``, ``goal``, ``gauge``, ``rolling_confidence``,
        ``needs_replan``, ``confidence_threshold``, ``replan_count``,
        ``task_counts`` (by status), ``total_tasks`` and ``next_task`` (title
        of the first pending task, or ``null``).
    """
    registry = get_registry()
    store = _get_store()

    # Ensure any plans saved by previous server invocations are loaded
    registry.restore_from_store(store)

    summary = []
    for p in registry.list_plans():
        tasks = p.get("tasks", [])

        by_status: dict[str, int] = {}
        for t in tasks:
            s = t.get("status", "pending")
            by_status[s] = by_status.get(s, 0) + 1

        # Same default as the tally above: a task without a status counts
        # as pending.
        next_pending = next(
            (t.get("title") for t in tasks if t.get("status", "pending") == "pending"),
            None,
        )

        confidence = p.get("rolling_confidence", 1.0)
        # Clamp only the bar so it always renders exactly ten cells.
        filled = max(0, min(10, int(confidence * 10)))
        gauge = f"{'█' * filled}{'░' * (10 - filled)} {int(confidence * 100)}%"

        summary.append({
            "plan_id": p["plan_id"],
            "goal": p["goal"],
            "gauge": gauge,
            "rolling_confidence": confidence,
            "needs_replan": p.get("needs_replan", False),
            "confidence_threshold": p.get("confidence_threshold", 0.72),
            "replan_count": p.get("replan_count", 0),
            "task_counts": by_status,
            "total_tasks": len(tasks),
            "next_task": next_pending,
        })

    return json.dumps({
        "total_plans": len(summary),
        "plans": summary,
    }, indent=2)
1668
+
1669
+
1670
def _tool_plan_delete(plan_id: str) -> str:
    """Remove a plan from both the registry and the persistent store.

    Both deletions are always attempted; the plan counts as deleted when
    either of them reports success.

    Args:
        plan_id: The plan to delete.

    Returns:
        JSON with ``deleted``, ``plan_id`` and a human-readable ``message``.
    """
    registry = get_registry()
    store = _get_store()

    removed_from_memory = registry.delete(plan_id)
    removed_from_disk = store.delete_plan(plan_id)
    removed = removed_from_memory or removed_from_disk

    if removed:
        message = f"Plan {plan_id} deleted."
    else:
        message = f"Plan {plan_id} not found."

    return json.dumps(
        {"deleted": removed, "plan_id": plan_id, "message": message},
        indent=2,
    )
1693
+
1694
+
1695
def _tool_gauge(prompt: str) -> str:
    """Score *prompt* and return a visual quality gauge as JSON.

    A thin formatter over ``_score_prompt_quality``: the overall gauge and
    grade are passed through, and each dimension gets a ten-cell bar.
    Dimensions are listed from lowest to highest score so the weakest areas
    come first.

    Args:
        prompt: The prompt text to score.

    Returns:
        JSON with ``gauge``, ``grade``, ``score``, ``dimensions``,
        ``suggestions``, ``issues`` and ``word_count``.
    """
    report = _score_prompt_quality(prompt)
    overall = report["quality_score"]
    weakest_first = sorted(report["dimensions"].items(), key=lambda pair: pair[1])

    dimension_view = {}
    for name, value in weakest_first:
        cells = int(value * 10)
        dimension_view[name] = {
            "score": round(value, 3),
            "bar": "█" * cells + "░" * (10 - cells),
        }

    payload = {
        "gauge": report["gauge"],
        "grade": report["grade"],
        "score": round(overall, 3),
        "dimensions": dimension_view,
        "suggestions": report.get("suggestions", []),
        "issues": report.get("issues", []),
        "word_count": report.get("word_count", 0),
    }
    return json.dumps(payload, indent=2)
1732
+
1733
+
1734
def _tool_context_history(
    limit: int = 20,
    session_context: str | None = None,
    min_score: float | None = None,
) -> str:
    """Browse stored prompt-quality history with visual gauges.

    Rows come from the store's ``get_prompt_history``.  *min_score* is
    applied AFTER the store has applied *limit*, so fewer than *limit* rows
    may be returned.

    Gauge bars and sparkline cells are clamped to their valid range, and the
    average grade falls back to ``"F"`` when no threshold matches (negative
    or NaN average), so out-of-range scores cannot raise or pick the wrong
    glyph.

    Args:
        limit: Number of recent prompts to request (clamped to 1..100).
        session_context: Filter to a specific session context tag.
        min_score: Only keep prompts at or above this quality score (0-1).

    Returns:
        JSON with a ``history`` list (in the order the store returns rows)
        and an aggregate ``summary``.
    """
    store = _get_store()
    limit = max(1, min(limit, 100))
    rows = store.get_prompt_history(limit=limit, session_context=session_context)

    if min_score is not None:
        rows = [r for r in rows if r["quality_score"] >= min_score]

    spark_chars = " ▁▂▃▄▅▆▇█"
    # Lower bound of each letter grade, best first.
    grade_floors = [("A", 0.85), ("B", 0.70), ("C", 0.55), ("D", 0.40), ("F", 0.0)]

    def _gauge(score: float, grade: str) -> str:
        # Clamp the bar to ten cells; the percentage shows the raw value.
        filled = max(0, min(10, int(score * 10)))
        return f"{'█' * filled}{'░' * (10 - filled)} {int(score * 100)}% [{grade}]"

    def _spark(score: float) -> str:
        # Clamp both ends: a negative index would wrap to the end of the
        # string and select the tallest glyph.
        return spark_chars[max(0, min(8, int(score * 8)))]

    formatted = []
    for r in rows:
        text = r["prompt_text"]
        formatted.append({
            "id": r["id"],
            "timestamp": r["timestamp"],
            "prompt_preview": text[:80] + ("…" if len(text) > 80 else ""),
            "gauge": _gauge(r["quality_score"], r["grade"]),
            "grade": r["grade"],
            "score": round(r["quality_score"], 3),
            "task_type": r["task_type"],
            "route_chosen": r["route_chosen"],
            "session_context": r["session_context"],
            "dimensions": {
                "specificity": round(r["specificity"], 3),
                "constraint_clarity": round(r["constraint_clarity"], 3),
                "context_completeness": round(r["context_completeness"], 3),
                "ambiguity": round(r["ambiguity"], 3),
                "format_spec": round(r["format_spec"], 3),
            },
        })

    # Summary strip
    if rows:
        scores = [r["quality_score"] for r in rows]
        avg = sum(scores) / len(scores)
        # Reversed so the sparkline reads in the opposite order to ``history``.
        sparkline = "".join(_spark(s) for s in reversed(scores))
        grade_dist: dict[str, int] = {}
        for r in rows:
            grade_dist[r["grade"]] = grade_dist.get(r["grade"], 0) + 1
        avg_grade = next((g for g, lb in grade_floors if avg >= lb), "F")
        summary = {
            "total_shown": len(rows),
            "avg_score": round(avg, 3),
            "avg_gauge": _gauge(avg, avg_grade),
            "sparkline": sparkline,
            "grade_distribution": grade_dist,
        }
    else:
        summary = {"total_shown": 0, "avg_score": 0.0, "sparkline": ""}

    return json.dumps({
        "summary": summary,
        "history": formatted,
    }, indent=2)
1815
+
1816
+
1817
def _tool_context_clear(session_context: str | None = None) -> str:
    """Clear stored prompt history.

    Wipes all (or session-scoped) prompt history records from the local DB.
    Use this to reset your quality baseline at the start of a new project
    or when switching contexts.

    Args:
        session_context: If provided (including the empty string), only clear
            records with this session tag. If omitted, ALL prompt history is
            cleared.

    Returns:
        JSON with the count of records deleted and confirmation message.
    """
    store = _get_store()
    # One flag drives both the SQL and the reported scope. The empty string
    # is a valid tag: it deletes only rows tagged '' and must not be
    # reported as "all sessions".
    scoped = session_context is not None
    # NOTE(review): this reaches into the store's private _connection()
    # helper; a public delete method on the store would be cleaner.
    with store._connection() as conn:
        if scoped:
            cursor = conn.execute(
                "DELETE FROM prompt_history WHERE session_context = ?",
                (session_context,),
            )
        else:
            cursor = conn.execute("DELETE FROM prompt_history")
        conn.commit()
        deleted = cursor.rowcount

    scope = f"session '{session_context}'" if scoped else "all sessions"
    return json.dumps({
        "deleted": deleted,
        "scope": scope,
        "message": f"Cleared {deleted} prompt history record(s) from {scope}.",
    }, indent=2)
1849
+
1850
+
1851
def _tool_plan_next(plan_id: str) -> str:
    """Fetch the next task of a plan, or report that the plan is finished.

    Asks the plan registry for the next task. When one is returned, the
    plan is re-saved to the store (the registry call is expected to have
    moved the task to in-progress) and the task is returned with
    ``"done": false`` merged in. When none is returned, the plan's status
    is read and a completion payload is produced instead.

    Args:
        plan_id: The plan to query.

    Returns:
        JSON string: either the task's fields plus ``"done": false``, or
        ``{"done": true, "plan_id": ..., "rolling_confidence": ...}``.
    """
    registry = get_registry()
    upcoming = registry.next_task(plan_id)

    if upcoming is not None:
        # Persist the in_progress status change
        store = _get_store()
        plan = registry.get(plan_id)
        if plan:
            store.save_plan(plan.to_dict())
        payload = dict(upcoming)
        payload["done"] = False
        return json.dumps(payload, indent=2)

    # Nothing left to hand out: report completion with the final confidence.
    status = registry.get_status(plan_id)
    finished = {
        "done": True,
        "plan_id": plan_id,
        "rolling_confidence": status.get("rolling_confidence", 0.0),
    }
    return json.dumps(finished, indent=2)
1882
+
1883
+
1884
+ # ---------------------------------------------------------------------------
1885
+ # Mid-execution MCP sampling helpers
1886
+ # ---------------------------------------------------------------------------
1887
+
1888
+
1889
async def _sample_text(ctx: Any, prompt: str, max_tokens: int = 2048) -> str:
    """Run one ``ctx.sample()`` call and return its content as plain text.

    Args:
        ctx: Object exposing an awaitable ``sample(prompt, max_tokens=...)``.
        prompt: Text sent to the sampling call.
        max_tokens: Token cap forwarded to ``ctx.sample``.

    Returns:
        ``result.content.text`` when the content has a ``text`` attribute,
        otherwise ``str(result.content)``.
    """
    response = await ctx.sample(prompt, max_tokens=max_tokens)
    payload = response.content
    if not hasattr(payload, "text"):
        # Content without a text field: fall back to its string form.
        return str(payload)
    return payload.text
1894
+
1895
+
1896
async def _sampling_refine(
    ctx: Any,
    prompt: str,
    max_iterations: int,
    quality_threshold: float,
    evaluator_type: str,
    min_words: int,
    max_words: int,
    required_fields: list[str],
    required_patterns: list[str],
) -> str:
    """Iterative refinement loop executed entirely via MCP sampling calls.

    Each iteration samples a response, scores it with the evaluator built
    from ``evaluator_type`` and the length/field/pattern settings, and, if
    another attempt is warranted, appends the score and deficiencies to the
    original prompt for the next sampling call.

    The loop stops when the evaluator reports ``passed``, when the score
    reaches ``quality_threshold``, or after ``max_iterations`` attempts.

    Args:
        ctx: MCP context used for sampling (see ``_sample_text``).
        prompt: The original user prompt; every retry is built from it.
        max_iterations: Upper bound on sampling attempts.
        quality_threshold: Score at which the result counts as converged.
        evaluator_type: Evaluator selector passed to ``_build_evaluator``.
        min_words: Passed through to ``_build_evaluator``.
        max_words: Passed through to ``_build_evaluator``.
        required_fields: Passed through to ``_build_evaluator``.
        required_patterns: Passed through to ``_build_evaluator``.

    Returns:
        JSON string with ``output`` (best-scoring attempt), ``best_score``,
        ``converged``, ``iterations``, ``score_trajectory`` and ``via``.
    """
    evaluator = _build_evaluator(
        evaluator_type,
        min_words=min_words,
        max_words=max_words,
        required_fields=required_fields,
        required_patterns=required_patterns,
    )
    best_output = ""
    best_score = 0.0
    scores: list[float] = []
    current_prompt = prompt
    for i in range(max_iterations):
        output = await _sample_text(ctx, current_prompt, max_tokens=4096)
        ev = evaluator.evaluate(output)
        # Always adopt the first attempt: comparing against the 0.0 seed with
        # a strict ">" used to drop every output when all attempts scored 0.
        if not scores or ev.score > best_score:
            best_score = ev.score
            best_output = output
        scores.append(ev.score)
        # Stop once the caller's threshold is met as well; "converged" below
        # is defined by that same threshold.
        if ev.passed or ev.score >= quality_threshold or i == max_iterations - 1:
            break
        deficiency_str = "; ".join(ev.deficiencies) if ev.deficiencies else "low score"
        # Each retry restarts from the ORIGINAL prompt plus the latest feedback.
        current_prompt = (
            f"{prompt}\n\n"
            f"[Iteration {i + 1} score: {ev.score:.2f}. Issues: {deficiency_str}. "
            f"Please improve your response to address these issues.]"
        )
    return json.dumps({
        "output": best_output,
        "best_score": round(best_score, 3),
        "converged": best_score >= quality_threshold,
        "iterations": len(scores),
        "score_trajectory": [round(s, 3) for s in scores],
        "via": "mcp_sampling",
    }, indent=2)
1942
+
1943
+
1944
async def _sampling_run_pipeline(
    ctx: Any,
    prompt: str,
    max_iterations: int,
    quality_threshold: float,
    skip_elicitation: bool,
) -> str:
    """Full pipeline (elicit -> decompose -> execute -> verify) via MCP sampling.

    Stages:
        1. If elicitation is not skipped and the prompt quality score is
           below 0.6, sample a short list of clarifying assumptions and
           append it to the prompt.
        2. If the estimated complexity exceeds 0.5, sample a JSON array of
           subtasks.
        3. Sample one response per subtask (joined with blank lines), or one
           response for the whole prompt when there are no subtasks.
        4. Sample a 0.0-1.0 self-rating of the assembled output.

    Args:
        ctx: MCP context used for sampling (see ``_sample_text``).
        prompt: The user's prompt.
        max_iterations: Accepted for signature parity with the non-sampling
            pipeline; not used by this implementation.
        quality_threshold: Score at which the result counts as converged.
        skip_elicitation: When true, stage 1 is skipped.

    Returns:
        JSON string with ``output``, ``best_score``, ``converged``,
        ``iterations`` (number of sampling calls made), ``score_trajectory``,
        ``task_type``, ``subtasks`` and ``via``.
    """
    quality = _score_prompt_quality(prompt)
    task_type = _classify_task_type(prompt)
    complexity = _estimate_complexity(prompt)
    num_samples = 0

    # Stage 1: elicit clarifying assumptions when prompt quality is weak
    refined_prompt = prompt
    if not skip_elicitation and quality["quality_score"] < 0.6:
        elicit_text = await _sample_text(
            ctx,
            f"The user asked: '{prompt}'\n"
            f"Identify the 1-2 most important ambiguities (prompt score: "
            f"{quality['quality_score']:.2f}). State your clarifying assumptions "
            f"clearly, then proceed based on those assumptions.",
            max_tokens=400,
        )
        num_samples += 1
        refined_prompt = f"{prompt}\n\n[Clarifying assumptions: {elicit_text}]"

    # Stage 2: decompose into subtasks when complexity warrants it
    subtask_list: list[dict[str, Any]] = []
    if complexity > 0.5:
        decomp_text = await _sample_text(
            ctx,
            f"Decompose this task into 2-5 ordered subtasks:\n{refined_prompt}\n\n"
            f"Task type: {task_type}\n"
            f"Reply ONLY with a JSON array: "
            f'[{{"id":"t1","title":"...","description":"..."}}]',
            max_tokens=800,
        )
        num_samples += 1
        try:
            m = re.search(r"\[.*\]", decomp_text, re.DOTALL)
            parsed = json.loads(m.group()) if m else []
        except Exception:  # noqa: BLE001
            # Unparseable decomposition: run the prompt as a single task.
            parsed = []
        # Keep only object entries; a list of bare strings would otherwise
        # crash on t.get() in stage 3 after sampling calls were already spent.
        if isinstance(parsed, list):
            subtask_list = [t for t in parsed if isinstance(t, dict)]

    # Stage 3: execute each subtask (or the whole prompt if not decomposed)
    if subtask_list:
        parts: list[str] = []
        for t in subtask_list:
            part = await _sample_text(
                ctx,
                f"Task: {t.get('title', '')}\n{t.get('description', '')}\n\nContext: {refined_prompt}",
                max_tokens=2048,
            )
            num_samples += 1
            parts.append(part)
        output = "\n\n".join(parts)
    else:
        output = await _sample_text(ctx, refined_prompt, max_tokens=4096)
        num_samples += 1

    # Stage 4: ask the agent to self-rate the output. The response itself is
    # included (truncated) -- without it the rater has nothing to judge.
    verify_text = await _sample_text(
        ctx,
        f"Rate the quality of this response to the prompt '{refined_prompt[:200]}' "
        f"on a scale 0.0-1.0. Reply ONLY with a decimal number.\n\n"
        f"RESPONSE:\n{output[:3000]}",
        max_tokens=20,
    )
    num_samples += 1
    score_match = re.search(r"\d+\.?\d*", verify_text)
    # NOTE(review): an unparseable rating falls back to 0.85, which exceeds
    # the default 0.8 threshold -- confirm this optimistic default is intended.
    best_score = float(score_match.group()) if score_match else 0.85
    if best_score > 1.0:
        # Presumably the rater answered on a 0-10 scale; rescale.
        best_score = best_score / 10.0
    best_score = min(1.0, best_score)

    return json.dumps({
        "output": output,
        "best_score": round(best_score, 3),
        "converged": best_score >= quality_threshold,
        "iterations": num_samples,
        "score_trajectory": [round(best_score, 3)],
        "task_type": task_type,
        "subtasks": len(subtask_list) if subtask_list else 1,
        "via": "mcp_sampling",
    }, indent=2)
2032
+
2033
+
2034
async def _sampling_plan_tasks(
    ctx: Any,
    prompt: str,
    estimated_complexity: float,
) -> str:
    """Decompose a prompt into a structured task plan via MCP sampling.

    Samples a JSON object with ``tasks`` and ``execution_order``, then
    normalises every task to the keys ``id``, ``title``, ``description``,
    ``state`` (always ``"pending"``) and ``dependencies``.

    If the reply contains no usable tasks (no JSON object, malformed JSON,
    wrong shape, or an empty task list) the plan falls back to a single task
    that wraps the whole prompt.

    Args:
        ctx: MCP context used for sampling (see ``_sample_text``).
        prompt: The prompt to decompose.
        estimated_complexity: Complexity hint embedded in the sampling prompt.

    Returns:
        JSON string with ``task_count``, ``tasks``, ``execution_order`` and
        ``via``.
    """
    task_type = _classify_task_type(prompt)
    text = await _sample_text(
        ctx,
        f"Decompose this task into 2-6 ordered subtasks:\n{prompt}\n\n"
        f"Task type: {task_type}, Complexity: {estimated_complexity:.2f}\n\n"
        f"Reply ONLY with valid JSON:\n"
        f'{{"tasks":[{{"id":"t1","title":"...","description":"...","dependencies":[]}}],'
        f'"execution_order":["t1"]}}',
        max_tokens=1200,
    )
    try:
        m = re.search(r"\{.*\}", text, re.DOTALL)
        data = json.loads(m.group()) if m else {}
        tasks = [
            {
                "id": t.get("id", f"t{i}"),
                "title": t.get("title", ""),
                "description": t.get("description", ""),
                "state": "pending",
                "dependencies": t.get("dependencies", []),
            }
            for i, t in enumerate(data.get("tasks", []), 1)
        ]
        order = data.get("execution_order")
    except Exception:  # noqa: BLE001
        # Malformed JSON or unexpected shape: use the single-task fallback.
        tasks, order = [], None

    if not tasks:
        # Covers the "no JSON at all" reply too, which previously produced an
        # empty plan instead of the fallback.
        tasks = [{
            "id": "t1",
            "title": prompt[:60],
            "description": prompt,
            "state": "pending",
            "dependencies": [],
        }]
        order = ["t1"]
    elif order is None:
        # Derive the order from the normalised ids so that it always matches
        # the ids reported in "tasks".
        order = [t["id"] for t in tasks]

    return json.dumps({
        "task_count": len(tasks),
        "tasks": tasks,
        "execution_order": order,
        "via": "mcp_sampling",
    }, indent=2)
2073
+
2074
+
2075
async def _sampling_verify_output(
    ctx: Any,
    output: str,
    original_prompt: str,
    quality_criteria: list[str],
) -> str:
    """Verify output quality with a fast keyword pre-check then deep MCP sampling.

    The pre-check counts a criterion as met when any of its words longer
    than three characters occurs (case-insensitively) in the output. Its
    result is reported as ``sub_scores.keyword_match`` and serves as the
    fallback whenever the sampled verdict is missing or unusable.

    Args:
        ctx: MCP context used for sampling (see ``_sample_text``).
        output: The text to verify; only the first 3000 characters are sent
            to the sampling call.
        original_prompt: The prompt the output is supposed to answer.
        quality_criteria: Free-text criteria; may be empty.

    Returns:
        JSON string with ``score`` (clamped to 0-1), ``passed``,
        ``deficiencies``, ``sub_scores``, ``feedback`` and ``via``.
    """
    output_lower = output.lower()
    passed_criteria = [
        c for c in quality_criteria if any(
            word in output_lower for word in c.lower().split() if len(word) > 3
        )
    ]
    # With no criteria there is nothing to miss; 0.9 is the neutral default.
    fast_score = (len(passed_criteria) / len(quality_criteria)) if quality_criteria else 0.9
    fast_deficiencies = [c for c in quality_criteria if c not in passed_criteria]

    verify_result = await _sample_text(
        ctx,
        f"Verify the following output against the original prompt and quality criteria.\n\n"
        f"ORIGINAL PROMPT:\n{original_prompt}\n\n"
        f"OUTPUT:\n{output[:3000]}\n\n"
        f"QUALITY CRITERIA: {quality_criteria}\n\n"
        f"Reply ONLY with valid JSON:\n"
        f'{{"score":0.0,"passed":false,"deficiencies":["..."],"feedback":"..."}}',
        max_tokens=500,
    )

    verdict: dict[str, Any] | None = None
    m = re.search(r"\{.*\}", verify_result, re.DOTALL)
    if m:
        try:
            candidate = json.loads(m.group())
        except Exception:  # noqa: BLE001
            candidate = None
        if isinstance(candidate, dict):
            verdict = candidate

    if verdict is None:
        # No usable JSON: rely on the keyword pre-check and keep the raw
        # reply as feedback instead of discarding it.
        score = fast_score
        passed = fast_score >= 0.7
        deficiencies = fast_deficiencies
        feedback = verify_result[:300]
    else:
        try:
            score = float(verdict.get("score", fast_score))
        except (TypeError, ValueError):
            score = fast_score
        if score != score:  # NaN never equals itself
            score = fast_score
        # Keep the score inside the documented 0-1 range.
        score = max(0.0, min(1.0, score))
        raw_passed = verdict.get("passed")
        # Trust only a real JSON boolean; bool("false") would be True.
        passed = raw_passed if isinstance(raw_passed, bool) else score >= 0.7
        deficiencies = verdict.get("deficiencies", fast_deficiencies)
        feedback = verdict.get("feedback", "")

    return json.dumps({
        "score": round(score, 3),
        "passed": passed,
        "deficiencies": deficiencies,
        "sub_scores": {"keyword_match": round(fast_score, 3)},
        "feedback": feedback,
        "via": "mcp_sampling",
    }, indent=2)
2122
+
2123
+
2124
+ # ---------------------------------------------------------------------------
2125
+ # FastMCP registration
2126
+ # ---------------------------------------------------------------------------
2127
+
2128
+
2129
def create_mcp_server() -> Any:
    """Create and configure the FastMCP server with all tools registered.

    Every tool is a thin wrapper around a module-level ``_tool_*`` function.
    The async tools (refine, run_pipeline, plan_tasks, verify_output) first
    try the MCP-sampling implementation when the resolved provider is an
    ``AgentPassthroughProvider`` and a context is available; if that attempt
    raises, the failure is logged at DEBUG level and the regular ``_tool_*``
    implementation runs instead.

    Returns:
        The configured ``FastMCP`` instance.

    Raises:
        ImportError: If the ``mcp`` package is not installed.
    """
    import logging

    try:
        from mcp.server.fastmcp import FastMCP
    except ImportError as exc:
        raise ImportError(
            "The mcp package is required for the MCP server. "
            "Install it with: pip install loopllm[mcp]"
        ) from exc

    logger = logging.getLogger(__name__)

    def _log_sampling_fallback(tool_name: str) -> None:
        # Must be called from inside an ``except`` block: exc_info=True
        # attaches the exception currently being handled. DEBUG keeps the
        # default configuration silent while making failures discoverable.
        logger.debug(
            "MCP sampling failed for %s; falling back to the provider path",
            tool_name,
            exc_info=True,
        )

    mcp = FastMCP(
        name="loopllm",
        instructions=(
            "IMPORTANT: Before responding to ANY non-trivial user message, "
            "you MUST first call loopllm_intercept with the user's message. "
            "This tool analyzes prompt quality and returns routing guidance "
            "you should follow. Do NOT skip this step.\n\n"
            "The intercept tool returns a quality gauge, routing recommendation "
            "(elicit/refine/decompose), and suggestions. Follow its guidance.\n\n"
            "For multi-step / iterative tasks (where you plan → act → observe → "
            "repeat), drive the loop through loopllm_loop_start, then "
            "loopllm_loop_step after each step with step_output=<artifact> "
            "(test log, diff, summary). The server runs Conservative Dual-Verify: "
            "deterministic checks plus a separate critic call — do NOT self-grade. "
            "Honor the continue/stop verdict, then call loopllm_loop_end.\n\n"
            "After presenting results to the user, call loopllm_feedback with "
            "the user's quality rating (1-5) to improve future predictions.\n\n"
            "Periodically call loopllm_prompt_stats to show the user their "
            "prompting quality trend and learning curve."
        ),
    )

    # Restore any plans saved in previous server sessions
    try:
        get_registry().restore_from_store(_get_store())
    except Exception:  # noqa: BLE001
        # non-fatal: store may not exist yet
        logger.debug("Could not restore saved plans from the store", exc_info=True)

    # -- Routing & Prompt Engineering tools --

    @mcp.tool(
        name="loopllm_intercept",
        description=(
            "CALL THIS FIRST for any non-trivial request. Analyzes the user's "
            "prompt for quality (specificity, constraints, context, ambiguity, "
            "format) and returns: a quality gauge with score and grade, routing "
            "recommendation (elicit/refine/decompose), issues found, improvement "
            "suggestions, and task type classification. Logs to prompt history "
            "for learning curve tracking."
        ),
    )
    def intercept(prompt: str) -> str:
        return _tool_intercept(prompt)

    @mcp.tool(
        name="loopllm_prompt_stats",
        description=(
            "Show the user's prompting quality over time. Returns: total "
            "prompts analyzed, average quality score, trend direction "
            "(improving/declining/stable), learning curve sparkline, grade "
            "distribution, and weak/strong dimensions to improve."
        ),
    )
    def prompt_stats(window: int = 50) -> str:
        return _tool_prompt_stats(window)

    @mcp.tool(
        name="loopllm_feedback",
        description=(
            "Record the user's quality rating (1-5) for the last output. "
            "Updates Bayesian priors with human signal so the system learns "
            "what quality scores correspond to user satisfaction."
        ),
    )
    def feedback(
        rating: int,
        task_type: str = "general",
        comment: str = "",
    ) -> str:
        return _tool_feedback(rating, task_type, comment)

    # -- Core tools --

    @mcp.tool(
        name="loopllm_refine",
        description=(
            "Iteratively refine a prompt using MCP sampling to call the host agent "
            "mid-execution. Runs the score → rewrite → retry loop inline: each "
            "iteration calls ctx.sample(), evaluates with deterministic evaluators "
            "(length, regex, JSON schema), feeds deficiencies back into the next "
            "prompt, and repeats until quality_threshold is met or max_iterations "
            "is exhausted. Falls back to agent_execute if sampling is unavailable."
        ),
    )
    async def refine(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
        max_iterations: int = 5,
        quality_threshold: float = 0.8,
        evaluator_type: str = "length",
        min_words: int = 5,
        max_words: int = 10000,
        required_fields: list[str] | None = None,
        required_patterns: list[str] | None = None,
        ctx: Context[Any, Any, Any] | None = None,
    ) -> str:
        prov = _get_provider(provider)
        if isinstance(prov, AgentPassthroughProvider) and ctx is not None:
            try:
                return await _sampling_refine(
                    ctx, prompt, max_iterations, quality_threshold,
                    evaluator_type, min_words, max_words,
                    required_fields or [], required_patterns or [],
                )
            except Exception:  # noqa: BLE001
                # sampling not supported by this client; fall through
                _log_sampling_fallback("loopllm_refine")
        return _tool_refine(
            prompt, provider, model, max_iterations, quality_threshold,
            evaluator_type, min_words, max_words, required_fields, required_patterns,
        )

    @mcp.tool(
        name="loopllm_run_pipeline",
        description=(
            "Run the full loop-llm pipeline via MCP sampling: "
            "(1) elicit clarifying assumptions if prompt quality < 0.6, "
            "(2) decompose into subtasks if complexity > 0.5, "
            "(3) execute each subtask with a sampling call, "
            "(4) self-rate the assembled output. "
            "Each stage is a real mid-execution ctx.sample() call — "
            "not a deferred agent_execute instruction."
        ),
    )
    async def run_pipeline(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
        max_iterations: int = 5,
        quality_threshold: float = 0.8,
        skip_elicitation: bool = False,
        ctx: Context[Any, Any, Any] | None = None,
    ) -> str:
        prov = _get_provider(provider)
        if isinstance(prov, AgentPassthroughProvider) and ctx is not None:
            try:
                return await _sampling_run_pipeline(
                    ctx, prompt, max_iterations, quality_threshold, skip_elicitation,
                )
            except Exception:  # noqa: BLE001
                _log_sampling_fallback("loopllm_run_pipeline")
        return _tool_run_pipeline(
            prompt, provider, model, max_iterations, quality_threshold, skip_elicitation,
        )

    @mcp.tool(
        name="loopllm_classify_task",
        description=(
            "Classify a user prompt into a task type: code_generation, "
            "summarization, data_extraction, question_answering, creative_writing, "
            "analysis, transformation, or general."
        ),
    )
    def classify_task(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
    ) -> str:
        return _tool_classify_task(prompt, provider, model)

    @mcp.tool(
        name="loopllm_analyze_prompt",
        description=(
            "Analyze a prompt and generate clarifying questions ranked by "
            "expected information gain using Bayesian priors."
        ),
    )
    def analyze_prompt(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
        max_questions: int = 5,
    ) -> str:
        return _tool_analyze_prompt(prompt, provider, model, max_questions)

    # -- Elicitation session tools --

    @mcp.tool(
        name="loopllm_elicitation_start",
        description=(
            "Start a multi-turn elicitation session. Classifies the prompt, "
            "generates the first clarifying question, and returns a session_id."
        ),
    )
    def elicitation_start(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
        max_questions: int = 3,
    ) -> str:
        return _tool_elicitation_start(prompt, provider, model, max_questions)

    @mcp.tool(
        name="loopllm_elicitation_answer",
        description=(
            "Answer the current clarifying question in an elicitation session."
        ),
    )
    def elicitation_answer(session_id: str, answer: str) -> str:
        return _tool_elicitation_answer(session_id, answer)

    @mcp.tool(
        name="loopllm_elicitation_finish",
        description=(
            "Finish an elicitation session and synthesize an IntentSpec."
        ),
    )
    def elicitation_finish(session_id: str) -> str:
        return _tool_elicitation_finish(session_id)

    # -- Task orchestration tools --

    @mcp.tool(
        name="loopllm_plan_tasks",
        description=(
            "Decompose a prompt into subtasks with dependency ordering. "
            "Uses MCP sampling to call the host agent mid-execution and parse "
            "its JSON decomposition into a structured task plan."
        ),
    )
    async def plan_tasks(
        prompt: str,
        provider: str | None = None,
        model: str | None = None,
        estimated_complexity: float = 0.5,
        ctx: Context[Any, Any, Any] | None = None,
    ) -> str:
        prov = _get_provider(provider)
        if isinstance(prov, AgentPassthroughProvider) and ctx is not None:
            try:
                return await _sampling_plan_tasks(ctx, prompt, estimated_complexity)
            except Exception:  # noqa: BLE001
                _log_sampling_fallback("loopllm_plan_tasks")
        return _tool_plan_tasks(prompt, provider, model, estimated_complexity)

    @mcp.tool(
        name="loopllm_verify_output",
        description=(
            "Verify an output against the original prompt and quality criteria. "
            "Runs a fast deterministic keyword pre-check, then calls ctx.sample() "
            "mid-execution to ask the host agent for a deep quality assessment. "
            "Returns a combined score, pass/fail, deficiencies, and feedback."
        ),
    )
    async def verify_output(
        output: str,
        original_prompt: str,
        quality_criteria: list[str] | None = None,
        provider: str | None = None,
        model: str | None = None,
        ctx: Context[Any, Any, Any] | None = None,
    ) -> str:
        prov = _get_provider(provider)
        if isinstance(prov, AgentPassthroughProvider) and ctx is not None:
            try:
                return await _sampling_verify_output(
                    ctx, output, original_prompt, quality_criteria or [],
                )
            except Exception:  # noqa: BLE001
                _log_sampling_fallback("loopllm_verify_output")
        return _tool_verify_output(output, original_prompt, quality_criteria, provider, model)

    # -- Observability tools --

    @mcp.tool(
        name="loopllm_report",
        description=(
            "Show learned Bayesian priors and question effectiveness statistics."
        ),
    )
    def report(
        task_type: str | None = None,
        model_id: str | None = None,
    ) -> str:
        return _tool_report(task_type, model_id)

    @mcp.tool(
        name="loopllm_suggest_config",
        description=(
            "Get a suggested loop configuration based on learned beliefs."
        ),
    )
    def suggest_config(
        task_type: str,
        model_id: str | None = None,
        cost_weight: float = 0.5,
    ) -> str:
        return _tool_suggest_config(task_type, model_id, cost_weight)

    # -- Adaptive agent-loop tools --

    @mcp.tool(
        name="loopllm_loop_start",
        description=(
            "Begin an ADAPTIVE AGENT LOOP for a multi-step task. Returns a learned "
            "step budget, quality threshold, and a Conservative Dual-Verify recipe "
            "(evaluator_type, quality_criteria). After each step submit step_output "
            "to loopllm_loop_step — the server scores externally; do NOT self-grade."
        ),
    )
    def loop_start(
        goal: str,
        task_type: str = "general",
        model_id: str | None = None,
        quality_threshold: float | None = None,
        cost_weight: float = 0.5,
        evaluator_type: str = "composite",
        quality_criteria: list[str] | None = None,
        required_patterns: list[str] | None = None,
        required_fields: list[str] | None = None,
        max_wall_ms: float = 300_000.0,
        max_tokens: int = 0,
    ) -> str:
        return _tool_loop_start(
            goal,
            task_type,
            model_id,
            quality_threshold,
            cost_weight,
            evaluator_type,
            quality_criteria,
            required_patterns,
            required_fields,
            max_wall_ms,
            max_tokens,
        )

    @mcp.tool(
        name="loopllm_loop_step",
        description=(
            "Submit ONE agent-loop step artifact for Conservative Dual-Verify "
            "scoring. Pass session_id and step_output (test log, diff, summary). "
            "The server scores via deterministic evaluators (Channel A) and a "
            "separate critic sampling call (Channel B); final = min(A,B). Returns "
            "decision, channel_a_score, channel_b_score, deficiencies. Do NOT "
            "pass your own score unless step_output is unavailable (legacy)."
        ),
    )
    async def loop_step(
        session_id: str,
        step_output: str = "",
        score: float | None = None,
        note: str = "",
        step_tokens: int = 0,
        ctx: Context[Any, Any, Any] | None = None,
    ) -> str:
        return await _tool_loop_step(
            session_id, step_output, score, note, step_tokens, ctx
        )

    @mcp.tool(
        name="loopllm_loop_end",
        description=(
            "Close an agent loop and LEARN from it. Call once the loop stops. "
            "Records the run (step scores, whether it converged) into the "
            "Bayesian priors so future loops of this task_type get a better step "
            "budget. Returns a summary of what the system now believes "
            "(optimal_depth, converge_rate, confidence)."
        ),
    )
    def loop_end(session_id: str, converged: bool | None = None) -> str:
        return _tool_loop_end(session_id, converged)

    @mcp.tool(
        name="loopllm_loop_status",
        description=(
            "Inspect an active agent-loop session: steps used, suggested budget, "
            "score trajectory, and the last continue/stop verdict."
        ),
    )
    def loop_status(session_id: str) -> str:
        return _tool_loop_status(session_id)

    @mcp.tool(
        name="loopllm_list_tasks",
        description="List tasks from the persistent store.",
    )
    def list_tasks(
        state: str | None = None,
        limit: int = 20,
    ) -> str:
        return _tool_list_tasks(state, limit)

    @mcp.tool(
        name="loopllm_show_task",
        description="Show detailed information about a specific task by ID.",
    )
    def show_task(task_id: str) -> str:
        return _tool_show_task(task_id)

    # -- Plan Registry tools --

    @mcp.tool(
        name="loopllm_plan_register",
        description=(
            "Create a new confidence-tracked plan in the PlanRegistry. "
            "Pass a goal and list of tasks (each with title + description). "
            "Returns a plan_id to use with loopllm_plan_update and loopllm_plan_next. "
            "The plan tracks rolling_confidence aggregated from all task scores "
            "and flags needs_replan=true when confidence drops below the threshold."
        ),
    )
    def plan_register(
        goal: str,
        tasks: list[dict[str, Any]],
        confidence_threshold: float = 0.72,
    ) -> str:
        return _tool_plan_register(goal, tasks, confidence_threshold)

    @mcp.tool(
        name="loopllm_plan_update",
        description=(
            "Update a task's prompt_score and/or output_score, then recalculate "
            "the plan's rolling_confidence. "
            "Pass prompt_score from loopllm_intercept's quality_score field. "
            "Pass output_score from loopllm_verify_output's score field. "
            "Returns the updated plan with rolling_confidence and needs_replan flag. "
            "If needs_replan=true, refine the current task before calling loopllm_plan_next."
        ),
    )
    def plan_update(
        plan_id: str,
        task_id: str,
        prompt_score: float | None = None,
        output_score: float | None = None,
        mark_done: bool = True,
    ) -> str:
        return _tool_plan_update(plan_id, task_id, prompt_score, output_score, mark_done)

    @mcp.tool(
        name="loopllm_plan_next",
        description=(
            "Get the next pending task in a plan and mark it in_progress. "
            "Returns the task description, current rolling_confidence, and "
            "needs_replan flag. If needs_replan=true, run loopllm_refine on the "
            "task description before executing it. Returns done=true when all "
            "tasks are complete."
        ),
    )
    def plan_next(plan_id: str) -> str:
        return _tool_plan_next(plan_id)

    @mcp.tool(
        name="loopllm_plan_list",
        description=(
            "List all active plans with gauge, confidence, task counts by status, "
            "and next pending task. Gives a Shrimp-style overview of all ongoing work. "
            "Also restores any plans saved during previous server sessions from disk."
        ),
    )
    def plan_list() -> str:
        return _tool_plan_list()

    @mcp.tool(
        name="loopllm_plan_delete",
        description=(
            "Delete a plan from the registry and persistent store. "
            "Use when a plan is complete or abandoned."
        ),
    )
    def plan_delete(plan_id: str) -> str:
        return _tool_plan_delete(plan_id)

    @mcp.tool(
        name="loopllm_gauge",
        description=(
            "Instantly score a prompt and return a visual quality gauge. "
            "Lighter than loopllm_intercept — no routing, no DB write, no elicitation. "
            "Use this for a quick visual quality check of any prompt or draft. "
            "Returns a gauge like: ████████░░ 82% [A] plus per-dimension bars and suggestions."
        ),
    )
    def gauge(prompt: str) -> str:
        return _tool_gauge(prompt)

    @mcp.tool(
        name="loopllm_context_history",
        description=(
            "Browse your prompt quality history with visual gauges. "
            "Returns recent prompts with their scores, grades, and gauges so you can "
            "track how your prompting quality is evolving. Includes a sparkline summary. "
            "Optionally filter by session_context tag or minimum score."
        ),
    )
    def context_history(
        limit: int = 20,
        session_context: str | None = None,
        min_score: float | None = None,
    ) -> str:
        return _tool_context_history(limit, session_context, min_score)

    @mcp.tool(
        name="loopllm_context_clear",
        description=(
            "Clear stored prompt history. Wipes all (or session-scoped) prompt history "
            "from the local DB. Use this to reset your quality baseline at the start of "
            "a new project or when switching contexts. "
            "Omit session_context to clear everything."
        ),
    )
    def context_clear(session_context: str | None = None) -> str:
        return _tool_context_clear(session_context)

    return mcp
2643
+
2644
+
2645
+ # ---------------------------------------------------------------------------
2646
+ # Entry point
2647
+ # ---------------------------------------------------------------------------
2648
+
2649
+
2650
def main() -> None:
    """Build the MCP server and serve it over the stdio transport."""
    create_mcp_server().run(transport="stdio")


# Allow ``python -m loopllm.mcp_server`` to launch the server directly.
if __name__ == "__main__":
    main()