loopllm 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loopllm/__init__.py +69 -0
- loopllm/__main__.py +5 -0
- loopllm/adaptive_exit.py +78 -0
- loopllm/agent_loop.py +299 -0
- loopllm/cli.py +521 -0
- loopllm/elicitation.py +519 -0
- loopllm/engine.py +376 -0
- loopllm/evaluator_factory.py +72 -0
- loopllm/evaluators.py +419 -0
- loopllm/guards.py +254 -0
- loopllm/local_loop.py +273 -0
- loopllm/mcp_server.py +2657 -0
- loopllm/plan_registry.py +412 -0
- loopllm/priors.py +604 -0
- loopllm/provider.py +51 -0
- loopllm/providers/__init__.py +15 -0
- loopllm/providers/agent.py +64 -0
- loopllm/providers/mock.py +64 -0
- loopllm/providers/ollama.py +95 -0
- loopllm/providers/openrouter.py +101 -0
- loopllm/serve.py +297 -0
- loopllm/step_scorer.py +190 -0
- loopllm/store.py +1126 -0
- loopllm/tasks.py +599 -0
- loopllm-0.7.0.dist-info/METADATA +454 -0
- loopllm-0.7.0.dist-info/RECORD +29 -0
- loopllm-0.7.0.dist-info/WHEEL +4 -0
- loopllm-0.7.0.dist-info/entry_points.txt +3 -0
- loopllm-0.7.0.dist-info/licenses/LICENSE +21 -0
loopllm/step_scorer.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Conservative Dual-Verify (CDV) scoring for agent-loop steps.
|
|
2
|
+
|
|
3
|
+
Channel A: deterministic evaluators (regex, JSON, completeness).
|
|
4
|
+
Channel B: separate critic via MCP sampling (verifier hat).
|
|
5
|
+
Final score: min(channel_a, channel_b) — the stricter channel wins.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import re
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from loopllm.engine import EvaluationResult, Evaluator
|
|
15
|
+
from loopllm.evaluator_factory import build_evaluator
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class DualVerifyScore:
    """Outcome of scoring a single agent-loop step.

    Carries the final score, the per-channel scores that led to it, the
    pass/fail verdict and any deficiencies collected along the way.
    """

    # Score reported for the step.
    final_score: float
    # Score produced by Channel A.
    channel_a_score: float
    # Score produced by Channel B; ``None`` when Channel B did not run.
    channel_b_score: float | None
    # Pass/fail verdict for the step.
    passed: bool
    # Deficiency messages attached to the verdict.
    deficiencies: list[str] = field(default_factory=list)
    # Named sub-scores reported by Channel A.
    channel_a_sub_scores: dict[str, float] = field(default_factory=dict)
    # Free-text feedback from Channel B.
    channel_b_feedback: str = ""
    # Label identifying which scoring path produced this result.
    source: str = "conservative_dual_verify"

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view suitable for the MCP verdict JSON.

        Scores are rounded to four decimal places.  The ``channel_b_score``
        key is emitted only when a Channel B score is present.
        """
        payload: dict[str, Any] = {
            "score": round(self.final_score, 4),
            "channel_a_score": round(self.channel_a_score, 4),
            "score_source": self.source,
            "passed": self.passed,
            "deficiencies": self.deficiencies,
            "channel_a_sub_scores": self.channel_a_sub_scores,
            "channel_b_feedback": self.channel_b_feedback,
        }
        if self.channel_b_score is None:
            return payload
        payload["channel_b_score"] = round(self.channel_b_score, 4)
        return payload
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def keyword_criteria_score(output: str, quality_criteria: list[str]) -> tuple[float, list[str]]:
    """Cheap keyword heuristic that Channel B runs before sampling the critic.

    A criterion counts as satisfied when at least one of its
    whitespace-separated words longer than three characters occurs
    (case-insensitively, as a substring) in ``output``.

    Returns:
        ``(score, unmet)`` where ``score`` is the fraction of satisfied
        criteria and ``unmet`` lists the criteria that were not satisfied,
        in their original order.  With no criteria the result is
        ``(0.9, [])``.
    """
    if not quality_criteria:
        return 0.9, []

    haystack = output.lower()

    def _is_mentioned(criterion: str) -> bool:
        # Words of three characters or fewer are ignored entirely.
        for token in criterion.lower().split():
            if len(token) > 3 and token in haystack:
                return True
        return False

    unmet = [criterion for criterion in quality_criteria if not _is_mentioned(criterion)]
    total = len(quality_criteria)
    satisfied = total - len(unmet)
    return satisfied / total, unmet
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def score_channel_a(
    step_output: str,
    goal: str,
    evaluator: Evaluator,
) -> EvaluationResult:
    """Channel A: run the supplied evaluator over the step artifact.

    The goal is handed to the evaluator through its ``context`` mapping
    under the ``"goal"`` key; whatever the evaluator returns is passed
    straight back to the caller.
    """
    goal_context = {"goal": goal}
    return evaluator.evaluate(step_output, context=goal_context)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def sample_text(ctx: Any, prompt: str, max_tokens: int = 2048) -> str:
    """Await ``ctx.sample()`` and reduce its result to a plain string.

    The ``text`` attribute of the result's ``content`` is returned when it
    exists; otherwise the content is converted with ``str()``.
    """
    response = await ctx.sample(prompt, max_tokens=max_tokens)
    payload = response.content
    if hasattr(payload, "text"):
        return payload.text
    return str(payload)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def score_channel_b(
    step_output: str,
    goal: str,
    quality_criteria: list[str],
    ctx: Any,
) -> dict[str, Any]:
    """Channel B: critic sampling call in a separate verifier role.

    A keyword pre-check is computed first and serves as the fallback for
    anything the critic's reply does not supply.  The critic is then asked
    (via ``sample_text``) to answer with a JSON verdict, and the first
    ``{...}`` span of the reply is parsed.

    Args:
        step_output: Artifact produced by the step; only the first 3000
            characters are shown to the critic.
        goal: Goal the step is working towards.
        quality_criteria: Criteria used for the keyword pre-check and
            listed in the critic prompt.
        ctx: Object passed through to ``sample_text`` for sampling.

    Returns:
        A dict with keys ``score`` (clamped to ``[0.0, 1.0]``), ``passed``,
        ``deficiencies``, ``feedback`` and ``keyword_match`` (the keyword
        pre-check score).

    Exceptions raised while sampling propagate to the caller; only failures
    while interpreting the reply fall back to the keyword pre-check.
    """
    import math  # local import: only needed for the NaN guard below

    fast_score, fast_deficiencies = keyword_criteria_score(step_output, quality_criteria)

    verify_result = await sample_text(
        ctx,
        f"You are an independent verifier — NOT the agent that produced this output.\n\n"
        f"GOAL:\n{goal}\n\n"
        f"STEP OUTPUT:\n{step_output[:3000]}\n\n"
        f"QUALITY CRITERIA: {quality_criteria}\n\n"
        f"Rate how close this output is to achieving the goal on a scale 0.0-1.0.\n"
        f"Reply ONLY with valid JSON:\n"
        f'{{"score":0.0,"passed":false,"deficiencies":["..."],"feedback":"..."}}',
        max_tokens=500,
    )
    try:
        m = re.search(r"\{.*\}", verify_result, re.DOTALL)
        if m is None:
            # Treat a reply without any JSON object like a malformed one so
            # the critic's raw text is still surfaced as feedback.
            raise ValueError("verifier reply contains no JSON object")
        data = json.loads(m.group())

        score = float(data.get("score", fast_score))
        if math.isnan(score):
            # json.loads accepts NaN, and min(1.0, nan) evaluates to 1.0, so
            # without this guard a NaN reply would become a perfect score.
            raise ValueError("verifier returned a NaN score")
        score = max(0.0, min(1.0, score))

        # Only a real JSON boolean is trusted: bool("false") would be True.
        raw_passed = data.get("passed")
        passed = raw_passed if isinstance(raw_passed, bool) else score >= 0.7

        raw_deficiencies = data.get("deficiencies", fast_deficiencies)
        if isinstance(raw_deficiencies, str):
            # list("text") would split the message into single characters.
            deficiencies = [raw_deficiencies] if raw_deficiencies.strip() else []
        elif isinstance(raw_deficiencies, (list, tuple)):
            deficiencies = [str(item) for item in raw_deficiencies]
        else:
            # null or another non-list value: keep the critic's score and use
            # the keyword pre-check deficiencies instead of failing outright.
            deficiencies = list(fast_deficiencies)

        raw_feedback = data.get("feedback")
        feedback = "" if raw_feedback is None else str(raw_feedback)
    except Exception:  # noqa: BLE001
        # Best-effort: any problem interpreting the reply falls back to the
        # keyword pre-check, keeping a snippet of the raw reply as feedback.
        score = fast_score
        passed = fast_score >= 0.7
        deficiencies = fast_deficiencies
        feedback = verify_result[:300]

    return {
        "score": score,
        "passed": passed,
        "deficiencies": deficiencies,
        "feedback": feedback,
        "keyword_match": fast_score,
    }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
async def conservative_dual_verify(
    step_output: str,
    goal: str,
    quality_criteria: list[str],
    evaluator: Evaluator,
    ctx: Any | None,
) -> DualVerifyScore:
    """Score a step with Conservative Dual-Verify.

    Channel A always runs.  When ``ctx`` is ``None`` Channel B is skipped
    and the Channel A score and verdict are returned as-is with source
    ``"channel_a_only"``.  Otherwise Channel B runs as well, the final
    score is the lower of the two channel scores, and the step passes only
    if that score is at least 0.7 and no deficiencies were collected.
    """
    a_result = score_channel_a(step_output, goal, evaluator)
    a_deficiencies = list(a_result.deficiencies)

    if ctx is not None:
        b_result = await score_channel_b(step_output, goal, quality_criteria, ctx)
        combined_score = min(a_result.score, b_result["score"])

        # Channel A deficiencies come first; Channel B entries are appended
        # unless Channel A already reported the same item.
        merged = a_deficiencies + [
            item for item in b_result["deficiencies"] if item not in a_deficiencies
        ]

        # NOTE(review): the verdict is derived from the combined score and
        # the merged deficiencies only; neither channel's own ``passed``
        # flag is consulted here — confirm this is intended.
        verdict = combined_score >= 0.7 and not merged

        return DualVerifyScore(
            final_score=combined_score,
            channel_a_score=a_result.score,
            channel_b_score=b_result["score"],
            passed=verdict,
            deficiencies=merged,
            channel_a_sub_scores=dict(a_result.sub_scores),
            channel_b_feedback=b_result["feedback"],
            source="conservative_dual_verify",
        )

    # No sampling context available: report Channel A on its own.
    return DualVerifyScore(
        final_score=a_result.score,
        channel_a_score=a_result.score,
        channel_b_score=None,
        passed=a_result.passed,
        deficiencies=a_deficiencies,
        channel_a_sub_scores=dict(a_result.sub_scores),
        channel_b_feedback="",
        source="channel_a_only",
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def legacy_self_report_score(score: float) -> DualVerifyScore:
    """Wrap a legacy agent self-reported score (not CDV).

    The score is converted to ``float`` and clamped to ``[0.0, 1.0]``.  A
    NaN score is treated as ``0.0``.  The result is marked as passed when
    the clamped score is at least 0.7 and is labelled with source
    ``"legacy_self_report"``.

    Args:
        score: Score reported by the agent itself.

    Returns:
        A ``DualVerifyScore`` whose final and Channel A scores both hold the
        clamped value, with no Channel B score, deficiencies or feedback.

    Raises:
        TypeError, ValueError: If ``score`` cannot be converted to ``float``.
    """
    import math  # local import: only needed for the NaN guard below

    value = float(score)
    # min(1.0, nan) evaluates to 1.0, so an unguarded clamp would turn a NaN
    # self-report into a perfect, passing score.
    clamped = 0.0 if math.isnan(value) else max(0.0, min(1.0, value))
    return DualVerifyScore(
        final_score=clamped,
        channel_a_score=clamped,
        channel_b_score=None,
        passed=clamped >= 0.7,
        deficiencies=[],
        channel_a_sub_scores={},
        channel_b_feedback="",
        source="legacy_self_report",
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def build_step_evaluator(
    evaluator_type: str,
    quality_criteria: list[str],
    **kwargs: Any,
) -> Evaluator:
    """Create the evaluator that Channel A uses for an agent-loop session.

    Delegates to ``build_evaluator``, forwarding ``evaluator_type``
    positionally, ``quality_criteria`` by keyword, and every extra keyword
    argument unchanged.
    """
    channel_a_evaluator = build_evaluator(
        evaluator_type, quality_criteria=quality_criteria, **kwargs
    )
    return channel_a_evaluator
|