verifyloop 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
verifyloop/verifier.py ADDED
@@ -0,0 +1,390 @@
1
+ """Verify phase: check execution results with trained verification model.
2
+
3
+ This is THE KEY DIFFERENTIATOR. Unlike other agent frameworks that rely on
4
+ LLM prompts for verification, VerifyLoop uses a trained ReasonCritic model
5
+ when available, falling back to LLM-based verification otherwise.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import re
12
+ from typing import Any
13
+
14
+ import litellm
15
+
16
+ from verifyloop.models import (
17
+ ExecuteStep,
18
+ PlanStep,
19
+ TokenUsage,
20
+ VerifyCheckResult,
21
+ VerifyStep,
22
+ )
23
+
24
# System prompt for the general-purpose LLM verification path
# (sent by Verifier._verify_with_llm). It pins the JSON schema that
# Verifier._parse_verification_response reads back.
VERIFY_SYSTEM_PROMPT: str = """You are a verification agent. Given a plan, execution results, and the original task, verify whether the task was completed correctly.

Produce a JSON object:
{
"checks": ["Description of each verification check performed"],
"check_results": [
{"check": "description", "passed": true/false, "detail": "reasoning"}
],
"passed": true/false,
"confidence": 0.0-1.0,
"failures": ["List of failures if any"],
"fix_suggestions": ["Suggested fixes for each failure"]
}

Be thorough. Check:
1. Did every substep succeed?
2. Are the actual outputs consistent with what was expected?
3. Do files contain the expected content or structure?
4. Are there any syntax errors or obvious bugs in generated code?
5. Would the changes break existing functionality?

Respond ONLY with valid JSON, no markdown fences."""

# System prompt for the trained-model path
# (sent by Verifier._verify_with_trained_model). It requests the same JSON
# keys as VERIFY_SYSTEM_PROMPT so both paths share one parser.
REASON_CRITIC_PROMPT: str = """You are ReasonCritic, a specialized verification model trained to evaluate code changes and execution results.

Analyze the plan-execution pair for:
- Logical consistency between plan and execution
- Correctness of code changes (syntax, semantics)
- Completeness: were all substeps addressed?
- Edge cases: potential runtime errors
- Test coverage considerations

Output JSON:
{
"checks": ["verification checks performed"],
"check_results": [{"check": "...", "passed": bool, "detail": "..."}],
"passed": bool,
"confidence": float 0.0-1.0,
"failures": ["list of failures"],
"fix_suggestions": ["list of fix suggestions"]
}

Respond ONLY with valid JSON."""
67
+
68
+
69
class VerifierConfig:
    """Plain settings holder consumed by ``Verifier``.

    Every constructor argument is stored unchanged on an attribute of the
    same name; no validation or normalisation is performed here.
    """

    # Model identifier handed to ``litellm.acompletion`` for verification.
    verify_model: str
    # Stored as given; not read anywhere else in this module.
    confidence_threshold: float
    # Credentials / endpoint forwarded to every ``litellm.acompletion`` call.
    api_key: str | None
    api_base: str | None
    # When true, ``Verifier`` tries the trained-model path before the LLM path.
    prefer_trained_model: bool
    # Stored as given; not read anywhere else in this module.
    max_retries: int

    def __init__(
        self,
        verify_model: str = "reason-critic-7b",
        confidence_threshold: float = 0.8,
        api_key: str | None = None,
        api_base: str | None = None,
        prefer_trained_model: bool = True,
        max_retries: int = 2,
    ) -> None:
        # Model selection.
        self.verify_model = verify_model
        self.prefer_trained_model = prefer_trained_model

        # Connection details.
        self.api_key = api_key
        self.api_base = api_base

        # Tuning knobs.
        self.confidence_threshold = confidence_threshold
        self.max_retries = max_retries
85
+
86
+
87
class Verifier:
    """Check execution results, via a trained model when reachable, else an LLM.

    Model-backed checks (``verify_code_edits``, ``verify_bash_output``) send a
    prompt through ``litellm.acompletion`` and parse the JSON reply into a
    ``VerifyStep``. ``verify_file_state`` and ``verify_tests`` are purely
    local and report ``verification_model="local"``.

    Token counts reported by model calls are accumulated and exposed through
    ``token_usage``.
    """

    def __init__(self, config: VerifierConfig | None = None) -> None:
        """Create a verifier; a default ``VerifierConfig`` is used when none is given."""
        self.config = config or VerifierConfig()
        self._token_usage = TokenUsage()
        # Tri-state cache for the reachability probe:
        # None = not probed yet, True/False = result of the single probe.
        self._trained_model_available: bool | None = None

    @property
    def token_usage(self) -> TokenUsage:
        """Running total of token usage across all model calls made so far."""
        return self._token_usage

    async def verify_code_edits(
        self,
        plan: PlanStep,
        execute_steps: list[ExecuteStep],
    ) -> VerifyStep:
        """Ask the verification model whether ``execute_steps`` fulfil ``plan``.

        Builds a prompt from ``plan.description``, ``plan.substeps`` and a
        textual summary of each executed step, then runs model verification.
        """
        executed_summary = self._summarize_executions(execute_steps)
        prompt = self._build_verification_prompt(
            task=plan.description,
            plan_substeps=plan.substeps,
            executions=executed_summary,
        )
        return await self._run_verification(prompt, plan=plan, execute_steps=execute_steps)

    async def verify_bash_output(
        self,
        command: str,
        output: str,
        expected: str | None = None,
    ) -> VerifyStep:
        """Ask the verification model to judge the output of a shell command.

        Args:
            command: The command that was run (only quoted in the prompt).
            output: Captured output of the command.
            expected: Optional text the output is expected to contain.

        Returns:
            The model's verdict. The locally built check descriptions are
            placed ahead of the checks reported by the model.
        """
        checks = [f"Command '{command}' executed successfully"]
        if output.strip():
            checks.append("Command produced output")
        if expected:
            checks.append(f"Output matches expected: {expected[:100]}")

        prompt = f"Verify bash command execution:\n\nCommand: {command}\nOutput:\n{output}\n"
        if expected:
            prompt += f"\nExpected output contains: {expected}\n"

        return await self._run_verification(prompt, checks=checks)

    async def verify_file_state(
        self,
        file_path: str,
        expected_content: str | None = None,
        should_exist: bool = True,
    ) -> VerifyStep:
        """Check locally that a file exists (or not) and optionally contains text.

        No model is called; every result reports ``verification_model="local"``.

        Args:
            file_path: Path of the file to inspect.
            expected_content: If given and the file exists, this substring
                must occur in the file's text.
            should_exist: Whether the file is expected to be present.

        Returns:
            A passing step when all checks hold, otherwise a failing step
            with a failure description and a fix suggestion.
        """
        from pathlib import Path

        target = Path(file_path)
        checks = [f"File {file_path} {'exists' if should_exist else 'should not exist'}"]

        # Probe once so the existence and content checks see the same state.
        exists = target.exists()
        if exists != should_exist:
            return VerifyStep(
                checks=checks,
                passed=False,
                confidence=1.0,
                failures=[f"File {file_path} {'does not exist' if should_exist else 'exists unexpectedly'}"],
                fix_suggestions=[
                    f"Create the file {file_path}" if should_exist else f"Remove the file {file_path}"
                ],
                verification_model="local",
                used_trained_model=False,
            )

        if expected_content and exists:
            try:
                # errors="replace" keeps undecodable bytes from aborting the check.
                actual = target.read_text(encoding="utf-8", errors="replace")
            except OSError as exc:
                # e.g. the path is a directory or is not readable.
                checks.append("File could not be read")
                return VerifyStep(
                    checks=checks,
                    passed=False,
                    confidence=1.0,
                    failures=[f"File {file_path} could not be read: {exc}"],
                    fix_suggestions=[f"Ensure {file_path} is a readable regular file"],
                    verification_model="local",
                    used_trained_model=False,
                )
            if expected_content in actual:
                checks.append("File contains expected content")
            else:
                checks.append("File content mismatch")
                return VerifyStep(
                    checks=checks,
                    passed=False,
                    confidence=0.9,
                    failures=[f"File {file_path} does not contain expected content"],
                    fix_suggestions=[f"Edit {file_path} to include: {expected_content[:100]}..."],
                    verification_model="local",
                    used_trained_model=False,
                )

        return VerifyStep(
            checks=checks,
            passed=True,
            confidence=1.0,
            verification_model="local",
            used_trained_model=False,
        )

    async def verify_tests(
        self,
        test_command: str,
        working_dir: str = ".",
        timeout: float = 120,
    ) -> VerifyStep:
        """Run a test command locally and turn its exit status into a ``VerifyStep``.

        Args:
            test_command: Shell command line that runs the tests.
            working_dir: Directory the command is run in.
            timeout: Seconds to wait for the command to finish.

        Returns:
            A passing step when the exit code is 0; otherwise a failing step
            with failure lines and heuristic fix suggestions extracted from
            the combined stdout/stderr.

        Raises:
            asyncio.TimeoutError: If the command does not finish in time.
                The child process is killed before the error propagates.
        """
        import asyncio as _asyncio

        # NOTE(security): test_command is interpreted by the shell. Only pass
        # command lines from a trusted source.
        proc = await _asyncio.create_subprocess_shell(
            test_command,
            stdout=_asyncio.subprocess.PIPE,
            stderr=_asyncio.subprocess.PIPE,
            cwd=working_dir,
        )
        try:
            stdout, stderr = await _asyncio.wait_for(proc.communicate(), timeout=timeout)
        except _asyncio.TimeoutError:
            # wait_for only cancels communicate(); without an explicit kill
            # the child would keep running after we give up on it.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the check and the kill
                await proc.wait()
            raise

        passed = proc.returncode == 0
        combined = stdout.decode(errors="replace") + "\n" + stderr.decode(errors="replace")

        checks = [f"Test command: {test_command}", f"Exit code: {proc.returncode}"]
        failures: list[str] = []
        fix_suggestions: list[str] = []

        if passed:
            checks.append("All tests passed")
        else:
            checks.append("Tests failed")
            failures.append(f"Test suite returned exit code {proc.returncode}")
            failures.extend(self._extract_failure_lines(combined))
            fix_suggestions.extend(self._suggest_test_fixes(combined))

        return VerifyStep(
            checks=checks,
            passed=passed,
            confidence=1.0 if passed else 0.3,
            failures=failures,
            fix_suggestions=fix_suggestions,
            verification_model="local",
            used_trained_model=False,
        )

    async def _run_verification(
        self,
        prompt: str,
        plan: PlanStep | None = None,
        execute_steps: list[ExecuteStep] | None = None,
        checks: list[str] | None = None,
    ) -> VerifyStep:
        """Run ``prompt`` through the trained model, falling back to the LLM path.

        The trained-model path is tried only when the config prefers it and
        the reachability probe succeeded; if that call fails, the LLM path is
        used instead.

        Args:
            prompt: User message for the verification model.
            plan: Accepted for interface compatibility; currently unused.
            execute_steps: Accepted for interface compatibility; currently unused.
            checks: Caller-side check descriptions to place ahead of the
                checks reported by the model.
        """
        if self.config.prefer_trained_model and await self._check_trained_model():
            result = await self._verify_with_trained_model(prompt, checks=checks)
            if result is not None:
                return result

        return await self._verify_with_llm(prompt, checks=checks)

    async def _check_trained_model(self) -> bool:
        """Probe ``config.verify_model`` once and cache whether it answered.

        Any exception from the probe call is treated as "not available".
        """
        if self._trained_model_available is not None:
            return self._trained_model_available

        try:
            await litellm.acompletion(
                model=self.config.verify_model,
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=5,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )
        except Exception:
            # Deliberate best-effort probe: every failure means "use the fallback".
            self._trained_model_available = False
        else:
            self._trained_model_available = True
        return self._trained_model_available

    async def _verify_with_trained_model(
        self,
        prompt: str,
        checks: list[str] | None = None,
    ) -> VerifyStep | None:
        """Verify with ``config.verify_model`` using the ReasonCritic prompt.

        Returns:
            The parsed result, or ``None`` when the call (or handling of its
            response) raised, which tells the caller to use the LLM path.
        """
        try:
            response = await litellm.acompletion(
                model=self.config.verify_model,
                messages=[
                    {"role": "system", "content": REASON_CRITIC_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                max_tokens=2048,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )
            self._record_usage(response)
            return self._parse_verification_response(
                response.choices[0].message.content or "{}",
                used_trained_model=True,
                checks=checks,
            )
        except Exception:
            # Best-effort: any failure here falls through to the LLM path.
            return None

    async def _verify_with_llm(
        self,
        prompt: str,
        checks: list[str] | None = None,
    ) -> VerifyStep:
        """Verify with a general LLM using ``VERIFY_SYSTEM_PROMPT``.

        Uses ``config.verify_model`` when the reachability probe succeeded,
        otherwise ``"gpt-4o"``. The configured ``api_key``/``api_base`` are
        sent in both cases. Exceptions from the model call propagate.
        """
        fallback_model = self.config.verify_model
        if await self._check_trained_model() is False:
            fallback_model = "gpt-4o"

        response = await litellm.acompletion(
            model=fallback_model,
            messages=[
                {"role": "system", "content": VERIFY_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
            max_tokens=2048,
            api_key=self.config.api_key,
            api_base=self.config.api_base,
        )
        self._record_usage(response)
        return self._parse_verification_response(
            response.choices[0].message.content or "{}",
            used_trained_model=False,
            # Report the model that actually answered, not the configured one.
            model=fallback_model,
            checks=checks,
        )

    def _record_usage(self, response: Any) -> None:
        """Merge the token counts of ``response`` (zeros if absent) into the total."""
        usage = response.usage
        self._token_usage = self._token_usage.merge(
            TokenUsage(
                prompt_tokens=usage.prompt_tokens if usage else 0,
                completion_tokens=usage.completion_tokens if usage else 0,
                total_tokens=usage.total_tokens if usage else 0,
            )
        )

    def _parse_verification_response(
        self,
        content: str,
        used_trained_model: bool = False,
        model: str | None = None,
        checks: list[str] | None = None,
    ) -> VerifyStep:
        """Convert the model's reply text into a ``VerifyStep``.

        Args:
            content: Raw reply; an optional surrounding Markdown code fence
                is removed before JSON decoding.
            used_trained_model: Recorded on the result as-is.
            model: Name recorded as ``verification_model``; defaults to
                ``config.verify_model``.
            checks: Caller-side check descriptions placed ahead of the
                model's own ``checks``.

        Returns:
            The parsed step. A reply that is not a JSON object yields a
            failing step with confidence 0.0 instead of raising.
        """
        model_name = model or self.config.verify_model
        caller_checks = list(checks or [])

        try:
            data = json.loads(self._strip_code_fences(content))
        except json.JSONDecodeError:
            data = None

        # Valid JSON that is not an object (list, string, number) is just as
        # unusable as undecodable text, so both are reported the same way.
        if not isinstance(data, dict):
            return VerifyStep(
                checks=[*caller_checks, "Failed to parse verification response"],
                passed=False,
                confidence=0.0,
                failures=["Verification model returned invalid JSON"],
                fix_suggestions=["Retry verification"],
                verification_model=model_name,
                used_trained_model=used_trained_model,
            )

        check_results = [
            VerifyCheckResult(
                check=entry.get("check", ""),
                passed=entry.get("passed", False),
                detail=entry.get("detail", ""),
            )
            for entry in self._as_list(data.get("check_results"))
            if isinstance(entry, dict)  # ignore malformed entries
        ]

        return VerifyStep(
            checks=[*caller_checks, *self._as_list(data.get("checks"))],
            check_results=check_results,
            passed=data.get("passed", False),
            confidence=self._coerce_confidence(data.get("confidence", 0.0)),
            failures=self._as_list(data.get("failures")),
            fix_suggestions=self._as_list(data.get("fix_suggestions")),
            verification_model=model_name,
            used_trained_model=used_trained_model,
        )

    @staticmethod
    def _strip_code_fences(content: str) -> str:
        """Remove one leading ```` ``` ````/```` ```json ```` fence and one trailing fence."""
        text = re.sub(r"^```(?:json)?\s*", "", content.strip())
        return re.sub(r"\s*```$", "", text.strip())

    @staticmethod
    def _as_list(value: Any) -> list[Any]:
        """Normalise a JSON field to a list: ``None`` -> ``[]``, scalar -> ``[scalar]``."""
        if value is None:
            return []
        if isinstance(value, list):
            return list(value)
        return [value]

    @staticmethod
    def _coerce_confidence(value: Any) -> float:
        """Return ``value`` as a float clamped to [0.0, 1.0]; 0.0 if not numeric or NaN."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        if number != number:  # NaN is the only value unequal to itself
            return 0.0
        return min(1.0, max(0.0, number))

    def _summarize_executions(self, steps: list[ExecuteStep]) -> str:
        """Render each executed step as a short text block, joined by blank lines.

        Each block shows the 1-based index, tool, SUCCESS/FAILED status,
        arguments, the first 500 characters of the result, and the error
        when one is set.
        """
        lines = []
        for i, step in enumerate(steps, 1):
            status = "SUCCESS" if step.success else "FAILED"
            result_preview = step.result[:500] if step.result else "(no output)"
            error_info = f"\nError: {step.error}" if step.error else ""
            lines.append(
                f"Step {i} [{step.tool}] {status}:\n"
                f" Args: {step.arguments}\n"
                f" Result: {result_preview}{error_info}"
            )
        return "\n\n".join(lines)

    def _build_verification_prompt(
        self,
        task: str,
        plan_substeps: list[str],
        executions: str,
    ) -> str:
        """Assemble the user prompt from the task, numbered substeps and execution summary."""
        substep_text = "\n".join(f" {i}. {s}" for i, s in enumerate(plan_substeps, 1))
        return (
            f"Original Task: {task}\n\n"
            f"Plan Substeps:\n{substep_text}\n\n"
            f"Execution Results:\n{executions}\n\n"
            f"Verify whether the task was completed correctly and all substeps were addressed."
        )

    @staticmethod
    def _extract_failure_lines(output: str) -> list[str]:
        """Return up to 10 stripped lines (max 200 chars each) that start with a failure marker."""
        # "FAIL" also matches lines starting with "FAILED".
        prefixes = ("FAIL", "ERROR", "AssertionError")
        failures = [
            stripped[:200]
            for stripped in (line.strip() for line in output.splitlines())
            if stripped.startswith(prefixes)
        ]
        return failures[:10]

    @staticmethod
    def _suggest_test_fixes(output: str) -> list[str]:
        """Map well-known error names found in ``output`` to generic fix hints (at most 5)."""
        suggestions = []
        if "import" in output and "ModuleNotFoundError" in output:
            suggestions.append("Missing import — install the required package or add the module")
        if "AssertionError" in output:
            suggestions.append("Assertion failed — review expected vs actual values in the test")
        if "TypeError" in output:
            suggestions.append("Type error — check function signatures and argument types")
        if "NameError" in output:
            suggestions.append("Name error — variable or function not defined in scope")
        if not suggestions:
            suggestions.append("Review the failing test output for specific error details")
        return suggestions[:5]