loopllm 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
loopllm/step_scorer.py ADDED
@@ -0,0 +1,190 @@
1
+ """Conservative Dual-Verify (CDV) scoring for agent-loop steps.
2
+
3
+ Channel A: deterministic evaluators (regex, JSON, completeness).
4
+ Channel B: separate critic via MCP sampling (verifier hat).
5
+ Final score: min(channel_a, channel_b) — the stricter channel wins.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ from loopllm.engine import EvaluationResult, Evaluator
15
+ from loopllm.evaluator_factory import build_evaluator
16
+
17
+
18
@dataclass
class DualVerifyScore:
    """Result of Conservative Dual-Verify scoring for one agent-loop step.

    Instances are plain value holders; ``to_dict`` turns one into the
    JSON-friendly payload used for the MCP verdict.
    """

    # Overall score reported for the step.
    final_score: float
    # Score produced by the deterministic evaluator channel.
    channel_a_score: float
    # Score produced by the critic channel; None when that channel did not run.
    channel_b_score: float | None
    # Overall pass/fail verdict for the step.
    passed: bool
    # Human-readable problems collected while scoring.
    deficiencies: list[str] = field(default_factory=list)
    # Per-check scores reported by the Channel A evaluator.
    channel_a_sub_scores: dict[str, float] = field(default_factory=dict)
    # Free-text feedback returned by the Channel B critic.
    channel_b_feedback: str = ""
    # Label identifying which scoring path produced this result.
    source: str = "conservative_dual_verify"

    def to_dict(self) -> dict[str, Any]:
        """Serialise for MCP verdict JSON.

        Scores are rounded to four decimal places. ``channel_b_score`` is
        only included when it is not ``None``. The list and dict values are
        copies, so mutating the returned payload never alters this instance.
        """
        out: dict[str, Any] = {
            "score": round(self.final_score, 4),
            "channel_a_score": round(self.channel_a_score, 4),
            "score_source": self.source,
            "passed": self.passed,
            # Copy the mutable containers instead of exposing internal state.
            "deficiencies": list(self.deficiencies),
            "channel_a_sub_scores": dict(self.channel_a_sub_scores),
            "channel_b_feedback": self.channel_b_feedback,
        }
        if self.channel_b_score is not None:
            out["channel_b_score"] = round(self.channel_b_score, 4)
        return out
45
+
46
+
47
def keyword_criteria_score(output: str, quality_criteria: list[str]) -> tuple[float, list[str]]:
    """Fast keyword pre-check used by Channel B.

    A criterion counts as met when at least one of its keywords appears in
    ``output`` (case-insensitive). Keywords are the criterion's words longer
    than three characters, matched as substrings. A criterion that has no
    such word (e.g. "Use API") falls back to its short words, which must
    match a whole word of the output so that tiny words do not match by
    accident inside longer ones.

    Args:
        output: Text produced by the step.
        quality_criteria: Criteria to look for in ``output``.

    Returns:
        ``(score, deficiencies)`` where ``score`` is the fraction of criteria
        met and ``deficiencies`` lists the unmet criteria in input order.
        With no criteria the result is ``(0.9, [])``.
    """
    if not quality_criteria:
        return 0.9, []

    output_lower = output.lower()
    output_tokens = set(re.findall(r"\w+", output_lower))

    fast_deficiencies: list[str] = []
    for criterion in quality_criteria:
        # Drop trailing sentence punctuation so "tests." still matches "tests".
        words = [w.rstrip(".,;:!?") for w in criterion.lower().split()]
        words = [w for w in words if w]
        long_words = [w for w in words if len(w) > 3]
        if long_words:
            met = any(w in output_lower for w in long_words)
        else:
            # Only short words available: require a whole-word match.
            met = any(w in output_tokens for w in words)
        if not met:
            fast_deficiencies.append(criterion)

    met_count = len(quality_criteria) - len(fast_deficiencies)
    return met_count / len(quality_criteria), fast_deficiencies
59
+
60
+
61
def score_channel_a(
    step_output: str,
    goal: str,
    evaluator: Evaluator,
) -> EvaluationResult:
    """Channel A: run the deterministic evaluator over the step artifact.

    The goal is handed to the evaluator as ``context["goal"]``; whatever the
    evaluator returns is passed back unchanged.
    """
    evaluation_context = {"goal": goal}
    verdict = evaluator.evaluate(step_output, context=evaluation_context)
    return verdict
71
+
72
+
73
async def sample_text(ctx: Any, prompt: str, max_tokens: int = 2048) -> str:
    """Call ctx.sample() and return plain text content.

    Args:
        ctx: Object exposing an awaitable ``sample(prompt, max_tokens=...)``
            whose result has a ``content`` attribute.
        prompt: Prompt forwarded to ``ctx.sample``.
        max_tokens: Token limit forwarded to ``ctx.sample``.

    Returns:
        ``content.text`` when it is present, otherwise ``str(content)``.
        The result is always a ``str``: a missing or ``None`` ``text`` falls
        back to ``str(content)`` and a non-string ``text`` is stringified,
        so callers can safely run string operations on it.
    """
    result = await ctx.sample(prompt, max_tokens=max_tokens)
    content = result.content
    text = getattr(content, "text", None)
    if text is None:
        return str(content)
    return text if isinstance(text, str) else str(text)
78
+
79
+
80
async def score_channel_b(
    step_output: str,
    goal: str,
    quality_criteria: list[str],
    ctx: Any,
) -> dict[str, Any]:
    """Channel B: critic sampling call in a separate verifier role.

    The critic is asked to reply with a JSON object holding ``score``,
    ``passed``, ``deficiencies`` and ``feedback``. Each field is validated on
    its own; a missing or malformed field falls back to the keyword pre-check
    (or to a value derived from the score) instead of discarding the rest of
    the reply.

    Args:
        step_output: Step artifact to verify (only the first 3000 characters
            are sent to the critic).
        goal: Goal the step is supposed to achieve.
        quality_criteria: Criteria shown to the critic and used for the
            keyword pre-check.
        ctx: Sampling context passed through to ``sample_text``.

    Returns:
        Dict with keys ``score`` (float in [0, 1]), ``passed`` (bool),
        ``deficiencies`` (list), ``feedback`` (str) and ``keyword_match``
        (the keyword pre-check score).
    """
    import math

    fast_score, fast_deficiencies = keyword_criteria_score(step_output, quality_criteria)

    verify_result = await sample_text(
        ctx,
        f"You are an independent verifier — NOT the agent that produced this output.\n\n"
        f"GOAL:\n{goal}\n\n"
        f"STEP OUTPUT:\n{step_output[:3000]}\n\n"
        f"QUALITY CRITERIA: {quality_criteria}\n\n"
        f"Rate how close this output is to achieving the goal on a scale 0.0-1.0.\n"
        f"Reply ONLY with valid JSON:\n"
        f'{{"score":0.0,"passed":false,"deficiencies":["..."],"feedback":"..."}}',
        max_tokens=500,
    )
    # Guard the string operations below against a non-string reply.
    if not isinstance(verify_result, str):
        verify_result = "" if verify_result is None else str(verify_result)

    data: dict[str, Any] | None = None
    m = re.search(r"\{.*\}", verify_result, re.DOTALL)
    if m:
        try:
            loaded = json.loads(m.group())
        except (ValueError, RecursionError):
            # Invalid JSON (JSONDecodeError is a ValueError) or absurd nesting.
            loaded = None
        if isinstance(loaded, dict):
            data = loaded

    if data is None:
        # No usable JSON object: rely on the keyword pre-check and keep the
        # raw reply (truncated) as feedback so the critic's text is not lost.
        return {
            "score": fast_score,
            "passed": fast_score >= 0.7,
            "deficiencies": fast_deficiencies,
            "feedback": verify_result[:300],
            "keyword_match": fast_score,
        }

    try:
        score = float(data.get("score", fast_score))
    except (TypeError, ValueError, OverflowError):
        score = fast_score
    # min()/max() do not order NaN, so a NaN score would otherwise be
    # "clamped" to a perfect 1.0.
    if math.isnan(score):
        score = fast_score
    score = max(0.0, min(1.0, score))

    # Only trust a real JSON boolean; bool("false") would be True.
    raw_passed = data.get("passed")
    passed = raw_passed if isinstance(raw_passed, bool) else score >= 0.7

    raw_deficiencies = data.get("deficiencies")
    if raw_deficiencies is None:
        deficiencies = list(fast_deficiencies)
    elif isinstance(raw_deficiencies, str):
        # A bare string is one deficiency, not a sequence of characters.
        deficiencies = [raw_deficiencies] if raw_deficiencies.strip() else []
    elif isinstance(raw_deficiencies, (list, tuple)):
        deficiencies = [d if isinstance(d, str) else str(d) for d in raw_deficiencies]
    else:
        deficiencies = [str(raw_deficiencies)]

    raw_feedback = data.get("feedback")
    feedback = "" if raw_feedback is None else str(raw_feedback)

    return {
        "score": score,
        "passed": passed,
        "deficiencies": deficiencies,
        "feedback": feedback,
        "keyword_match": fast_score,
    }
121
+
122
+
123
async def conservative_dual_verify(
    step_output: str,
    goal: str,
    quality_criteria: list[str],
    evaluator: Evaluator,
    ctx: Any | None,
) -> DualVerifyScore:
    """Run Conservative Dual-Verify: min(Channel A, Channel B).

    Args:
        step_output: Step artifact to score.
        goal: Goal the step is supposed to achieve.
        quality_criteria: Criteria forwarded to the Channel B critic.
        evaluator: Deterministic evaluator used for Channel A.
        ctx: Sampling context for Channel B, or ``None`` to skip Channel B.

    Returns:
        A ``DualVerifyScore``. Without ``ctx`` it mirrors Channel A alone
        (``source="channel_a_only"``). With ``ctx`` the final score is the
        lower of the two channel scores and the step passes only when that
        score is at least 0.7, no deficiencies were reported by either
        channel, and neither channel itself reported a failure.
    """
    channel_a = score_channel_a(step_output, goal, evaluator)
    deficiencies = list(channel_a.deficiencies)
    sub_scores = dict(channel_a.sub_scores)

    if ctx is None:
        return DualVerifyScore(
            final_score=channel_a.score,
            channel_a_score=channel_a.score,
            channel_b_score=None,
            passed=channel_a.passed,
            deficiencies=deficiencies,
            channel_a_sub_scores=sub_scores,
            channel_b_feedback="",
            source="channel_a_only",
        )

    channel_b = await score_channel_b(step_output, goal, quality_criteria, ctx)
    final = min(channel_a.score, channel_b["score"])

    # Merge deficiencies in order, skipping anything already listed
    # (including repeats within Channel B's own list).
    all_deficiencies = list(deficiencies)
    for item in channel_b["deficiencies"]:
        if item not in all_deficiencies:
            all_deficiencies.append(item)

    # The stricter channel wins for the verdict too: a failure reported by
    # either channel must not be overridden by the numeric threshold.
    passed = (
        final >= 0.7
        and not all_deficiencies
        and bool(channel_a.passed)
        and bool(channel_b.get("passed", True))
    )

    return DualVerifyScore(
        final_score=final,
        channel_a_score=channel_a.score,
        channel_b_score=channel_b["score"],
        passed=passed,
        deficiencies=all_deficiencies,
        channel_a_sub_scores=sub_scores,
        channel_b_feedback=channel_b["feedback"],
        source="conservative_dual_verify",
    )
163
+
164
+
165
def legacy_self_report_score(score: float) -> DualVerifyScore:
    """Wrap a legacy agent self-reported score (not CDV).

    Args:
        score: Self-reported score; anything ``float()`` accepts.

    Returns:
        A ``DualVerifyScore`` with ``source="legacy_self_report"`` whose
        final and Channel A scores are ``score`` clamped to [0, 1], with no
        Channel B score, and which passes when the clamped score is at least
        0.7. A NaN input is treated as 0.0.
    """
    import math

    value = float(score)
    # min()/max() do not order NaN: max(0.0, min(1.0, nan)) evaluates to 1.0,
    # which would turn an invalid report into a perfect passing score.
    clamped = 0.0 if math.isnan(value) else max(0.0, min(1.0, value))
    return DualVerifyScore(
        final_score=clamped,
        channel_a_score=clamped,
        channel_b_score=None,
        passed=clamped >= 0.7,
        deficiencies=[],
        channel_a_sub_scores={},
        channel_b_feedback="",
        source="legacy_self_report",
    )
178
+
179
+
180
def build_step_evaluator(
    evaluator_type: str,
    quality_criteria: list[str],
    **kwargs: Any,
) -> Evaluator:
    """Create the Channel A evaluator used by an agent-loop session.

    Delegates to ``build_evaluator``, passing ``quality_criteria`` by keyword
    followed by any extra keyword arguments.
    """
    factory_options: dict[str, Any] = {"quality_criteria": quality_criteria, **kwargs}
    step_evaluator = build_evaluator(evaluator_type, **factory_options)
    return step_evaluator