verifyloop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- verifyloop/__init__.py +41 -0
- verifyloop/cli.py +186 -0
- verifyloop/executor.py +330 -0
- verifyloop/memory.py +197 -0
- verifyloop/models.py +146 -0
- verifyloop/pipeline.py +246 -0
- verifyloop/planner.py +190 -0
- verifyloop/recoverer.py +204 -0
- verifyloop/verifier.py +390 -0
- verifyloop-0.1.0.dist-info/METADATA +383 -0
- verifyloop-0.1.0.dist-info/RECORD +14 -0
- verifyloop-0.1.0.dist-info/WHEEL +4 -0
- verifyloop-0.1.0.dist-info/entry_points.txt +2 -0
- verifyloop-0.1.0.dist-info/licenses/LICENSE +21 -0
verifyloop/verifier.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""Verify phase: check execution results with trained verification model.
|
|
2
|
+
|
|
3
|
+
This is THE KEY DIFFERENTIATOR. Unlike other agent frameworks that rely on
|
|
4
|
+
LLM prompts for verification, VerifyLoop uses a trained ReasonCritic model
|
|
5
|
+
when available, falling back to LLM-based verification otherwise.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import litellm
|
|
15
|
+
|
|
16
|
+
from verifyloop.models import (
|
|
17
|
+
ExecuteStep,
|
|
18
|
+
PlanStep,
|
|
19
|
+
TokenUsage,
|
|
20
|
+
VerifyCheckResult,
|
|
21
|
+
VerifyStep,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
VERIFY_SYSTEM_PROMPT = """You are a verification agent. Given a plan, execution results, and the original task, verify whether the task was completed correctly.
|
|
25
|
+
|
|
26
|
+
Produce a JSON object:
|
|
27
|
+
{
|
|
28
|
+
"checks": ["Description of each verification check performed"],
|
|
29
|
+
"check_results": [
|
|
30
|
+
{"check": "description", "passed": true/false, "detail": "reasoning"}
|
|
31
|
+
],
|
|
32
|
+
"passed": true/false,
|
|
33
|
+
"confidence": 0.0-1.0,
|
|
34
|
+
"failures": ["List of failures if any"],
|
|
35
|
+
"fix_suggestions": ["Suggested fixes for each failure"]
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
Be thorough. Check:
|
|
39
|
+
1. Did every substep succeed?
|
|
40
|
+
2. Are the actual outputs consistent with what was expected?
|
|
41
|
+
3. Do files contain the expected content or structure?
|
|
42
|
+
4. Are there any syntax errors or obvious bugs in generated code?
|
|
43
|
+
5. Would the changes break existing functionality?
|
|
44
|
+
|
|
45
|
+
Respond ONLY with valid JSON, no markdown fences."""
|
|
46
|
+
|
|
47
|
+
REASON_CRITIC_PROMPT = """You are ReasonCritic, a specialized verification model trained to evaluate code changes and execution results.
|
|
48
|
+
|
|
49
|
+
Analyze the plan-execution pair for:
|
|
50
|
+
- Logical consistency between plan and execution
|
|
51
|
+
- Correctness of code changes (syntax, semantics)
|
|
52
|
+
- Completeness: were all substeps addressed?
|
|
53
|
+
- Edge cases: potential runtime errors
|
|
54
|
+
- Test coverage considerations
|
|
55
|
+
|
|
56
|
+
Output JSON:
|
|
57
|
+
{
|
|
58
|
+
"checks": ["verification checks performed"],
|
|
59
|
+
"check_results": [{"check": "...", "passed": bool, "detail": "..."}],
|
|
60
|
+
"passed": bool,
|
|
61
|
+
"confidence": float 0.0-1.0,
|
|
62
|
+
"failures": ["list of failures"],
|
|
63
|
+
"fix_suggestions": ["list of fix suggestions"]
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
Respond ONLY with valid JSON."""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class VerifierConfig:
    """Plain settings holder consumed by ``Verifier``.

    Attributes:
        verify_model: Model identifier passed to ``litellm.acompletion`` for
            both the availability probe and the verification calls.
        confidence_threshold: Stored as given. It is not read anywhere in this
            module — presumably consumed by the calling pipeline; verify
            against the caller.
        api_key: Optional API key forwarded to ``litellm.acompletion``.
        api_base: Optional API base URL forwarded to ``litellm.acompletion``.
        prefer_trained_model: When true, ``Verifier`` tries the trained-model
            path before the generic LLM path.
        max_retries: Stored as given. It is not read anywhere in this module —
            presumably consumed by the calling pipeline; verify against the
            caller.
    """

    def __init__(
        self,
        verify_model: str = "reason-critic-7b",
        confidence_threshold: float = 0.8,
        api_key: str | None = None,
        api_base: str | None = None,
        prefer_trained_model: bool = True,
        max_retries: int = 2,
    ) -> None:
        self.verify_model = verify_model
        self.confidence_threshold = confidence_threshold
        self.api_key = api_key
        self.api_base = api_base
        self.prefer_trained_model = prefer_trained_model
        self.max_retries = max_retries

    def __repr__(self) -> str:
        # Never echo the secret itself: configs end up in logs and tracebacks.
        masked_key = "'***'" if self.api_key else repr(self.api_key)
        return (
            f"{type(self).__name__}("
            f"verify_model={self.verify_model!r}, "
            f"confidence_threshold={self.confidence_threshold!r}, "
            f"api_key={masked_key}, "
            f"api_base={self.api_base!r}, "
            f"prefer_trained_model={self.prefer_trained_model!r}, "
            f"max_retries={self.max_retries!r})"
        )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Verifier:
    """Run the verify phase: judge execution results via a model or local checks.

    Model-backed checks (``verify_code_edits``, ``verify_bash_output``) first
    try the configured verify model with ``REASON_CRITIC_PROMPT`` and fall back
    to a generic LLM call with ``VERIFY_SYSTEM_PROMPT``. Local checks
    (``verify_file_state``, ``verify_tests``) never call a model.

    Token usage reported by the model calls is accumulated and exposed through
    the ``token_usage`` property.
    """

    # Model used by the generic LLM path when the configured verify model
    # did not answer the availability probe.
    _FALLBACK_MODEL = "gpt-4o"
    # Upper bound, in seconds, for a test command run by ``verify_tests``.
    _TEST_TIMEOUT_SECONDS = 120

    def __init__(self, config: VerifierConfig | None = None) -> None:
        """Create a verifier; a default ``VerifierConfig`` is used if none is given."""
        self.config = config or VerifierConfig()
        self._token_usage = TokenUsage()
        # Tri-state cache for the availability probe: None means "not probed yet".
        self._trained_model_available: bool | None = None

    @property
    def token_usage(self) -> TokenUsage:
        """Token usage accumulated over the verification calls made so far."""
        return self._token_usage

    async def verify_code_edits(
        self,
        plan: PlanStep,
        execute_steps: list[ExecuteStep],
    ) -> VerifyStep:
        """Verify a plan against the execution steps that were run for it.

        Builds a prompt from the plan description, its substeps and a summary
        of ``execute_steps``, then runs model-backed verification.
        """
        executed_summary = self._summarize_executions(execute_steps)
        prompt = self._build_verification_prompt(
            task=plan.description,
            plan_substeps=plan.substeps,
            executions=executed_summary,
        )
        return await self._run_verification(prompt, plan=plan, execute_steps=execute_steps)

    async def verify_bash_output(
        self,
        command: str,
        output: str,
        expected: str | None = None,
    ) -> VerifyStep:
        """Ask the verification model whether a bash command's output is acceptable.

        Args:
            command: The command line that was executed.
            output: The captured output of the command.
            expected: Optional text the output is expected to contain.
        """
        # NOTE(review): these local checks are handed to _run_verification,
        # which currently does not attach them to the returned VerifyStep.
        checks = [f"Command '{command}' executed successfully"]
        if output.strip():
            checks.append("Command produced output")
        if expected:
            checks.append(f"Output matches expected: {expected[:100]}")

        prompt = f"Verify bash command execution:\n\nCommand: {command}\nOutput:\n{output}\n"
        if expected:
            prompt += f"\nExpected output contains: {expected}\n"

        return await self._run_verification(prompt, checks=checks)

    async def verify_file_state(
        self,
        file_path: str,
        expected_content: str | None = None,
        should_exist: bool = True,
    ) -> VerifyStep:
        """Check locally that a file exists (or not) and optionally contains text.

        No model is called, so every returned step is labelled with the
        ``"local"`` verification model.

        Args:
            file_path: Path of the file to inspect.
            expected_content: Substring that must appear in the file. Only
                checked when it is non-empty and the file exists.
            should_exist: Whether the file is expected to be present.
        """
        from pathlib import Path

        target = Path(file_path)
        checks = [f"File {file_path} {'exists' if should_exist else 'should not exist'}"]
        exists = target.exists()

        if exists != should_exist:
            return VerifyStep(
                checks=checks,
                passed=False,
                confidence=1.0,
                failures=[f"File {file_path} {'does not exist' if should_exist else 'exists unexpectedly'}"],
                fix_suggestions=[
                    f"Create the file {file_path}" if should_exist else f"Remove the file {file_path}"
                ],
                verification_model="local",
                used_trained_model=False,
            )

        if expected_content and exists:
            try:
                # Explicit encoding keeps the check independent of the host
                # locale; undecodable bytes are replaced instead of raising.
                actual = target.read_text(encoding="utf-8", errors="replace")
            except OSError as exc:
                # A directory or unreadable file is a failed check, not a crash.
                checks.append("File could not be read")
                return VerifyStep(
                    checks=checks,
                    passed=False,
                    confidence=1.0,
                    failures=[f"File {file_path} could not be read: {exc}"],
                    fix_suggestions=[f"Ensure {file_path} is a readable regular file"],
                    verification_model="local",
                    used_trained_model=False,
                )

            if expected_content in actual:
                checks.append("File contains expected content")
            else:
                checks.append("File content mismatch")
                return VerifyStep(
                    checks=checks,
                    passed=False,
                    confidence=0.9,
                    failures=[f"File {file_path} does not contain expected content"],
                    fix_suggestions=[f"Edit {file_path} to include: {expected_content[:100]}..."],
                    verification_model="local",
                    used_trained_model=False,
                )

        return VerifyStep(
            checks=checks,
            passed=True,
            confidence=1.0,
            verification_model="local",
            used_trained_model=False,
        )

    async def verify_tests(
        self,
        test_command: str,
        working_dir: str = ".",
    ) -> VerifyStep:
        """Run a test command in a shell and turn its exit status into a VerifyStep.

        Args:
            test_command: Shell command line to execute. It is passed to the
                shell as-is, so it must come from a trusted source.
            working_dir: Directory the command is run in.

        Raises:
            asyncio.TimeoutError: If the command does not finish within
                ``_TEST_TIMEOUT_SECONDS``. The child process is killed and
                reaped before the exception propagates.
        """
        import asyncio as _asyncio

        proc = await _asyncio.create_subprocess_shell(
            test_command,
            stdout=_asyncio.subprocess.PIPE,
            stderr=_asyncio.subprocess.PIPE,
            cwd=working_dir,
        )
        try:
            stdout, stderr = await _asyncio.wait_for(
                proc.communicate(), timeout=self._TEST_TIMEOUT_SECONDS
            )
        except BaseException:
            # Timeout or cancellation: do not leave the child running behind us.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the check and the kill
                await proc.wait()
            raise

        passed = proc.returncode == 0
        output = stdout.decode(errors="replace")
        errors = stderr.decode(errors="replace")
        combined = output + "\n" + errors

        checks = [f"Test command: {test_command}", f"Exit code: {proc.returncode}"]
        failures: list[str] = []
        fix_suggestions: list[str] = []

        if passed:
            checks.append("All tests passed")
        else:
            checks.append("Tests failed")
            failures.append(f"Test suite returned exit code {proc.returncode}")
            failures.extend(self._extract_failure_lines(combined))
            fix_suggestions.extend(self._suggest_test_fixes(combined))

        return VerifyStep(
            checks=checks,
            passed=passed,
            confidence=1.0 if passed else 0.3,
            failures=failures,
            fix_suggestions=fix_suggestions,
            verification_model="local",
            used_trained_model=False,
        )

    async def _run_verification(
        self,
        prompt: str,
        plan: PlanStep | None = None,
        execute_steps: list[ExecuteStep] | None = None,
        checks: list[str] | None = None,
    ) -> VerifyStep:
        """Verify ``prompt`` with the trained model if possible, else the LLM path.

        NOTE(review): ``plan``, ``execute_steps`` and ``checks`` are accepted
        for the callers' benefit but are not used here; only ``prompt``
        influences the result.
        """
        if self.config.prefer_trained_model and await self._check_trained_model():
            result = await self._verify_with_trained_model(prompt)
            if result is not None:
                return result

        return await self._verify_with_llm(prompt)

    async def _check_trained_model(self) -> bool:
        """Probe once whether the configured verify model answers; cache the result."""
        if self._trained_model_available is not None:
            return self._trained_model_available

        try:
            await litellm.acompletion(
                model=self.config.verify_model,
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=5,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )
        except Exception:
            # Deliberate best-effort probe: any failure means "not available".
            self._trained_model_available = False
        else:
            self._trained_model_available = True
        return self._trained_model_available

    async def _verify_with_trained_model(self, prompt: str) -> VerifyStep | None:
        """Verify with the configured model and ``REASON_CRITIC_PROMPT``.

        Returns ``None`` when the call or the parsing fails, so the caller can
        fall back to the generic LLM path.
        """
        try:
            response = await litellm.acompletion(
                model=self.config.verify_model,
                messages=[
                    {"role": "system", "content": REASON_CRITIC_PROMPT},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.0,
                max_tokens=2048,
                api_key=self.config.api_key,
                api_base=self.config.api_base,
            )
            self._record_usage(response)
            return self._parse_verification_response(
                response.choices[0].message.content or "{}",
                used_trained_model=True,
            )
        except Exception:
            # Deliberate: a failing trained model must not abort verification.
            return None

    async def _verify_with_llm(self, prompt: str) -> VerifyStep:
        """Verify with ``VERIFY_SYSTEM_PROMPT`` on the generic LLM path.

        Uses the configured verify model when the availability probe
        succeeded, otherwise ``_FALLBACK_MODEL``. The returned step is
        labelled with the model that was actually called.
        """
        model = self.config.verify_model
        if not await self._check_trained_model():
            model = self._FALLBACK_MODEL

        response = await litellm.acompletion(
            model=model,
            messages=[
                {"role": "system", "content": VERIFY_SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            temperature=0.0,
            max_tokens=2048,
            api_key=self.config.api_key,
            api_base=self.config.api_base,
        )
        self._record_usage(response)
        return self._parse_verification_response(
            response.choices[0].message.content or "{}",
            used_trained_model=False,
            model_name=model,
        )

    def _record_usage(self, response: Any) -> None:
        """Add the usage reported on ``response`` (zeros if absent) to the running total."""
        usage = response.usage
        self._token_usage = self._token_usage.merge(
            TokenUsage(
                prompt_tokens=usage.prompt_tokens if usage else 0,
                completion_tokens=usage.completion_tokens if usage else 0,
                total_tokens=usage.total_tokens if usage else 0,
            )
        )

    def _parse_verification_response(
        self,
        content: str,
        used_trained_model: bool = False,
        model_name: str | None = None,
    ) -> VerifyStep:
        """Turn a model reply into a VerifyStep.

        The reply is untrusted text, so parsing fails closed: anything that is
        not a JSON object yields a failed step with zero confidence, ``passed``
        counts only when it is the JSON literal ``true``, and ``confidence`` is
        coerced to a float clamped to ``[0.0, 1.0]``.

        Args:
            content: Raw reply text; an optional surrounding markdown code
                fence is stripped before parsing.
            used_trained_model: Recorded on the returned step.
            model_name: Model that produced the reply; defaults to the
                configured verify model.
        """
        model_label = model_name or self.config.verify_model

        text = re.sub(r"^```(?:json)?\s*", "", content.strip())
        text = re.sub(r"\s*```$", "", text.strip())
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            data = None

        if not isinstance(data, dict):
            return VerifyStep(
                checks=["Failed to parse verification response"],
                passed=False,
                confidence=0.0,
                failures=["Verification model returned invalid JSON"],
                fix_suggestions=["Retry verification"],
                verification_model=model_label,
                used_trained_model=used_trained_model,
            )

        raw_results = data.get("check_results")
        if not isinstance(raw_results, list):
            raw_results = []
        check_results = [
            VerifyCheckResult(
                check=entry.get("check", ""),
                passed=entry.get("passed") is True,
                detail=entry.get("detail", ""),
            )
            for entry in raw_results
            if isinstance(entry, dict)  # skip malformed entries instead of raising
        ]

        return VerifyStep(
            checks=self._as_str_list(data.get("checks")),
            check_results=check_results,
            passed=data.get("passed") is True,
            confidence=self._coerce_confidence(data.get("confidence")),
            failures=self._as_str_list(data.get("failures")),
            fix_suggestions=self._as_str_list(data.get("fix_suggestions")),
            verification_model=model_label,
            used_trained_model=used_trained_model,
        )

    @staticmethod
    def _coerce_confidence(value: Any) -> float:
        """Return ``value`` as a float clamped to [0.0, 1.0]; 0.0 if unusable."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            return 0.0
        if confidence != confidence:  # NaN, which json.loads accepts
            return 0.0
        return min(1.0, max(0.0, confidence))

    @staticmethod
    def _as_str_list(value: Any) -> list[str]:
        """Normalize a JSON value to a list of strings (bare string -> one item)."""
        if isinstance(value, str):
            return [value]
        if isinstance(value, list):
            return [str(item) for item in value]
        return []

    def _summarize_executions(self, steps: list[ExecuteStep]) -> str:
        """Render execution steps as numbered text blocks for the prompt.

        Each block shows the tool, a SUCCESS/FAILED status, the arguments, the
        first 500 characters of the result and, if present, the error.
        """
        lines = []
        for i, step in enumerate(steps, 1):
            status = "SUCCESS" if step.success else "FAILED"
            result_preview = step.result[:500] if step.result else "(no output)"
            error_info = f"\nError: {step.error}" if step.error else ""
            lines.append(
                f"Step {i} [{step.tool}] {status}:\n"
                f" Args: {step.arguments}\n"
                f" Result: {result_preview}{error_info}"
            )
        return "\n\n".join(lines)

    def _build_verification_prompt(
        self,
        task: str,
        plan_substeps: list[str],
        executions: str,
    ) -> str:
        """Assemble the user prompt from the task, numbered substeps and results."""
        substep_text = "\n".join(f" {i+1}. {s}" for i, s in enumerate(plan_substeps))
        return (
            f"Original Task: {task}\n\n"
            f"Plan Substeps:\n{substep_text}\n\n"
            f"Execution Results:\n{executions}\n\n"
            f"Verify whether the task was completed correctly and all substeps were addressed."
        )

    @staticmethod
    def _extract_failure_lines(output: str) -> list[str]:
        """Return up to 10 failure-looking lines, each truncated to 200 chars.

        A line qualifies when, after stripping, it starts with "FAIL" (which
        also covers "FAILED"), "ERROR" or "AssertionError".
        """
        prefixes = ("FAIL", "ERROR", "AssertionError")
        failures = []
        for line in output.splitlines():
            stripped = line.strip()
            if stripped.startswith(prefixes):
                failures.append(stripped[:200])
        return failures[:10]

    @staticmethod
    def _suggest_test_fixes(output: str) -> list[str]:
        """Map well-known error names in test output to generic fix hints.

        Returns at most 5 suggestions; a generic hint is returned when no
        known error name is found.
        """
        suggestions = []
        if "import" in output and "ModuleNotFoundError" in output:
            suggestions.append("Missing import — install the required package or add the module")
        if "AssertionError" in output:
            suggestions.append("Assertion failed — review expected vs actual values in the test")
        if "TypeError" in output:
            suggestions.append("Type error — check function signatures and argument types")
        if "NameError" in output:
            suggestions.append("Name error — variable or function not defined in scope")
        if not suggestions:
            suggestions.append("Review the failing test output for specific error details")
        return suggestions[:5]
|