verifyloop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- verifyloop/__init__.py +41 -0
- verifyloop/cli.py +186 -0
- verifyloop/executor.py +330 -0
- verifyloop/memory.py +197 -0
- verifyloop/models.py +146 -0
- verifyloop/pipeline.py +246 -0
- verifyloop/planner.py +190 -0
- verifyloop/recoverer.py +204 -0
- verifyloop/verifier.py +390 -0
- verifyloop-0.1.0.dist-info/METADATA +383 -0
- verifyloop-0.1.0.dist-info/RECORD +14 -0
- verifyloop-0.1.0.dist-info/WHEEL +4 -0
- verifyloop-0.1.0.dist-info/entry_points.txt +2 -0
- verifyloop-0.1.0.dist-info/licenses/LICENSE +21 -0
verifyloop/memory.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Memory system: short-term (in-process) and long-term (persistent file) stores."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import aiofiles
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MemoryStore(ABC):
    """Abstract async key/value store whose entries are grouped by namespace."""

    @abstractmethod
    async def store(self, key: str, value: Any, namespace: str = "default") -> None:
        """Save *value* under *key* in *namespace*."""

    @abstractmethod
    async def retrieve(self, key: str, namespace: str = "default") -> Any | None:
        """Return the value saved under *key* in *namespace*, or ``None``."""

    @abstractmethod
    async def search(self, query: str, namespace: str = "default", limit: int = 10) -> list[dict[str, Any]]:
        """Return at most *limit* entries of *namespace* that match *query*."""

    @abstractmethod
    async def delete(self, key: str, namespace: str = "default") -> bool:
        """Remove *key* from *namespace*; the boolean reports the outcome."""

    @abstractmethod
    async def list_keys(self, namespace: str = "default") -> list[str]:
        """Return the keys held in *namespace*."""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class InMemoryStore(MemoryStore):
    """Process-local store backed by nested dictionaries; nothing is persisted."""

    def __init__(self) -> None:
        # namespace -> key -> entry ({"value", "stored_at", "access_count"})
        self._store: dict[str, dict[str, dict[str, Any]]] = {}

    def _ns(self, namespace: str) -> dict[str, dict[str, Any]]:
        """Return the bucket for *namespace*, creating it on first use."""
        return self._store.setdefault(namespace, {})

    async def store(self, key: str, value: Any, namespace: str = "default") -> None:
        """Save *value* under *key*, keeping the access count of any entry it replaces."""
        bucket = self._ns(namespace)
        previous = bucket.get(key, {})
        bucket[key] = {
            "value": value,
            "stored_at": datetime.now(timezone.utc).isoformat(),
            "access_count": previous.get("access_count", 0),
        }

    async def retrieve(self, key: str, namespace: str = "default") -> Any | None:
        """Return the stored value (bumping its access count), or ``None`` if absent."""
        record = self._ns(namespace).get(key)
        if record is None:
            return None
        record["access_count"] = record.get("access_count", 0) + 1
        return record["value"]

    async def search(self, query: str, namespace: str = "default", limit: int = 10) -> list[dict[str, Any]]:
        """Case-insensitive substring search over keys and stringified values.

        Matches are returned as ``{"key": ..., **entry}`` dicts, most-accessed
        first, truncated to *limit*.
        """
        needle = query.lower()
        matches = [
            {"key": key, **record}
            for key, record in self._ns(namespace).items()
            if needle in str(record.get("value", "")).lower() or needle in key.lower()
        ]
        matches.sort(key=lambda match: match.get("access_count", 0), reverse=True)
        return matches[:limit]

    async def delete(self, key: str, namespace: str = "default") -> bool:
        """Remove *key*; return ``True`` if it existed, ``False`` otherwise."""
        bucket = self._ns(namespace)
        if key not in bucket:
            return False
        del bucket[key]
        return True

    async def list_keys(self, namespace: str = "default") -> list[str]:
        """Return the keys currently held in *namespace*."""
        return list(self._ns(namespace))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class FileStore(MemoryStore):
    """Store that keeps one JSON file per namespace under ``base_dir``.

    Each namespace is read from disk at most once and then served from an
    in-process cache; every mutation rewrites the namespace file in full.
    """

    def __init__(self, base_dir: str = ".verifyloop_memory") -> None:
        """Remember *base_dir*; the directory is created lazily on first save."""
        self.base_dir = Path(base_dir)
        # namespace -> key -> entry; filled by _load_ns, refreshed by _save_ns.
        self._cache: dict[str, dict[str, dict[str, Any]]] = {}

    def _ns_path(self, namespace: str) -> Path:
        """Return the JSON file path that backs *namespace*."""
        # NOTE(review): *namespace* is interpolated into the file name unchecked,
        # so a value containing path separators would resolve outside base_dir —
        # confirm callers only pass trusted namespace names.
        return self.base_dir / f"{namespace}.json"

    async def _load_ns(self, namespace: str) -> dict[str, dict[str, Any]]:
        """Return the entries of *namespace*, reading its file on first access.

        A missing file yields an empty mapping. The returned object is always
        the very dict held in the cache, so callers that mutate it and the
        cache never drift apart.
        """
        if namespace in self._cache:
            return self._cache[namespace]
        try:
            # EAFP: opening directly avoids the exists()/open() race.
            async with aiofiles.open(self._ns_path(namespace), "r", encoding="utf-8") as f:
                data = json.loads(await f.read())
        except FileNotFoundError:
            data = {}
        self._cache[namespace] = data
        return data

    async def _save_ns(self, namespace: str, data: dict[str, dict[str, Any]]) -> None:
        """Cache *data* for *namespace* and write it to disk as indented JSON."""
        self._cache[namespace] = data
        path = self._ns_path(namespace)
        path.parent.mkdir(parents=True, exist_ok=True)
        async with aiofiles.open(path, "w", encoding="utf-8") as f:
            # default=str: values json cannot encode are written as their str().
            await f.write(json.dumps(data, indent=2, default=str))

    async def store(self, key: str, value: Any, namespace: str = "default") -> None:
        """Save *value* under *key* and persist the namespace.

        The access count of an entry being replaced is carried over.
        """
        data = await self._load_ns(namespace)
        data[key] = {
            "value": value,
            "stored_at": datetime.now(timezone.utc).isoformat(),
            "access_count": data.get(key, {}).get("access_count", 0),
        }
        await self._save_ns(namespace, data)

    async def retrieve(self, key: str, namespace: str = "default") -> Any | None:
        """Return the stored value, or ``None`` if *key* is absent.

        A hit increments the entry's access count and rewrites the file.
        """
        data = await self._load_ns(namespace)
        entry = data.get(key)
        if entry is None:
            return None
        entry["access_count"] = entry.get("access_count", 0) + 1
        await self._save_ns(namespace, data)
        return entry["value"]

    async def search(self, query: str, namespace: str = "default", limit: int = 10) -> list[dict[str, Any]]:
        """Case-insensitive substring search over keys and stringified values.

        Matches are returned as ``{"key": ..., **entry}`` dicts, most-accessed
        first, truncated to *limit*.
        """
        data = await self._load_ns(namespace)
        query_lower = query.lower()
        results = [
            {"key": key, **entry}
            for key, entry in data.items()
            if query_lower in str(entry.get("value", "")).lower() or query_lower in key.lower()
        ]
        results.sort(key=lambda r: r.get("access_count", 0), reverse=True)
        return results[:limit]

    async def delete(self, key: str, namespace: str = "default") -> bool:
        """Remove *key* and persist; return ``True`` only if it existed."""
        data = await self._load_ns(namespace)
        if key in data:
            del data[key]
            await self._save_ns(namespace, data)
            return True
        return False

    async def list_keys(self, namespace: str = "default") -> list[str]:
        """Return the keys currently held in *namespace*."""
        data = await self._load_ns(namespace)
        return list(data.keys())
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class ConversationContext:
    """Scratchpad of chat messages and file contents, mirrored to a memory store."""

    def __init__(self, memory: MemoryStore | None = None) -> None:
        """Create an empty context.

        Args:
            memory: Store that receives a copy of every file added through
                ``add_file_context``. An ``InMemoryStore`` is created when
                omitted.
        """
        # ``is None`` rather than truthiness, so a store object that happens
        # to evaluate as falsy is still honored.
        self.memory = InMemoryStore() if memory is None else memory
        self._messages: list[dict[str, str]] = []
        self._file_context: dict[str, str] = {}
        # Strong references to in-flight background saves: the event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._pending_saves: set[Any] = set()

    def add_message(self, role: str, content: str) -> None:
        """Append a ``{"role", "content"}`` message to the history."""
        self._messages.append({"role": role, "content": content})

    def get_messages(self) -> list[dict[str, str]]:
        """Return a shallow copy of the message history."""
        return list(self._messages)

    def add_file_context(self, file_path: str, content: str) -> None:
        """Remember *content* for *file_path* and mirror it to the memory store.

        The in-process mapping is updated first and always. The copy to
        ``self.memory`` (key ``file:<path>``, namespace ``files``) is best
        effort: inside a running event loop it is scheduled as a background
        task, otherwise it runs to completion on a private loop. Failures of
        that copy are logged, never raised.
        """
        self._file_context[file_path] = content
        if self.memory is None:
            return

        # Function-scope imports: this module does not import them at file level.
        import asyncio
        import logging

        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None  # called from synchronous code

        if loop is not None:
            task = loop.create_task(
                self.memory.store(f"file:{file_path}", content, namespace="files")
            )
            self._pending_saves.add(task)
            task.add_done_callback(self._on_save_done)
            return

        # A private loop leaves whatever loop the thread may have registered untouched.
        private_loop = asyncio.new_event_loop()
        try:
            private_loop.run_until_complete(
                self.memory.store(f"file:{file_path}", content, namespace="files")
            )
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not persist file context for %s", file_path, exc_info=True
            )
        finally:
            private_loop.close()

    def _on_save_done(self, task: Any) -> None:
        """Release a finished background save and log its failure, if any."""
        import logging

        self._pending_saves.discard(task)
        if task.cancelled():
            return
        error = task.exception()
        if error is not None:
            logging.getLogger(__name__).warning(
                "Could not persist file context", exc_info=error
            )

    def get_file_context(self, file_path: str) -> str | None:
        """Return the remembered content of *file_path*, or ``None``."""
        return self._file_context.get(file_path)

    def get_all_file_paths(self) -> list[str]:
        """Return the remembered file paths in insertion order."""
        return list(self._file_context.keys())

    def build_context_string(self, max_files: int = 5) -> str:
        """Summarize the context as text.

        The summary holds the last message (first 500 characters) followed by
        the first *max_files* files (first 300 characters each, with ``...``
        appended when truncated), joined by blank lines. An empty context
        yields ``""``.
        """
        parts: list[str] = []
        if self._messages:
            last_msg = self._messages[-1]
            parts.append(f"Last message: {last_msg.get('content', '')[:500]}")
        for path, content in list(self._file_context.items())[:max_files]:
            preview = content[:300] + "..." if len(content) > 300 else content
            parts.append(f"File {path}:\n{preview}")
        return "\n\n".join(parts)
|
verifyloop/models.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
"""Core data models for the VerifyLoop framework."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StepType(str, Enum):
    # Kind of entry recorded in a run's step log. Members also subclass
    # ``str``, so each compares equal to its plain string value.
    PLAN = "plan"
    EXECUTE = "execute"
    VERIFY = "verify"
    RECOVER = "recover"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class Step(BaseModel):
    # One entry of a run's step log.
    step_type: StepType
    content: str
    # Every instance gets its own empty list (never a shared default).
    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
    confidence: float = 0.0
    # Timezone-aware UTC time taken when the instance is created.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Substep(BaseModel):
    # A single tool invocation inside a plan: a description plus the tool
    # name and the arguments for that tool.
    description: str
    tool: str
    # Every instance gets its own empty dict.
    arguments: dict[str, Any] = Field(default_factory=dict)
    # presumably the position of the substep within its plan — TODO confirm
    order: int = 0
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class PlanStep(BaseModel):
    # A generated plan: free-text description, substeps as plain strings and
    # as structured ``Substep`` records, plus size/effort estimates.
    description: str
    substeps: list[str] = Field(default_factory=list)
    estimated_tools: list[str] = Field(default_factory=list)
    substep_details: list[Substep] = Field(default_factory=list)
    # Restricted to exactly these three labels.
    complexity: Literal["low", "medium", "high"] = "medium"
    context_tokens: int = 0
    estimated_duration_seconds: float = 0.0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ExecuteStep(BaseModel):
    # Outcome record of one tool execution.
    tool: str
    arguments: dict[str, Any] = Field(default_factory=dict)
    result: str = ""
    # Defaults to failure until explicitly marked successful.
    success: bool = False
    duration_seconds: float = 0.0
    exit_code: int | None = None
    error: str | None = None
    artifacts: dict[str, str] = Field(default_factory=dict)

    @property
    def failed(self) -> bool:
        """Return the negation of ``success``."""
        succeeded = self.success
        return not succeeded
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class VerifyCheckResult(BaseModel):
    # Result of one named verification check.
    check: str
    passed: bool
    # Optional free-text elaboration; empty by default.
    detail: str = ""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class VerifyStep(BaseModel):
    # Aggregate outcome of a verification pass.
    checks: list[str] = Field(default_factory=list)
    check_results: list[VerifyCheckResult] = Field(default_factory=list)
    # Defaults to "not passed" with zero confidence.
    passed: bool = False
    confidence: float = 0.0
    failures: list[str] = Field(default_factory=list)
    fix_suggestions: list[str] = Field(default_factory=list)
    verification_model: str = "reason-critic-7b"
    used_trained_model: bool = False
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class RecoverStep(BaseModel):
    # Record of one attempt to recover from a failure.
    original_error: str
    recovery_attempt: str = ""
    # Restricted to exactly these five labels.
    recovery_type: Literal["edit", "create", "retry", "simplify", "analyze"] = "edit"
    success: bool = False
    attempt_number: int = 1
    max_attempts: int = 3
    patched_arguments: dict[str, Any] = Field(default_factory=dict)

    @property
    def exhausted(self) -> bool:
        """Return ``True`` when the attempt budget is used up without success."""
        if self.attempt_number < self.max_attempts:
            return False
        return not self.success
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class RunStatus(str, Enum):
    # Lifecycle state of a run. Members also subclass ``str``, so each
    # compares equal to its plain string value.
    PENDING = "pending"
    PLANNING = "planning"
    EXECUTING = "executing"
    VERIFYING = "verifying"
    RECOVERING = "recovering"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class TokenUsage(BaseModel):
    # Token counters; all start at zero.
    prompt_tokens: int = 0
    completion_tokens: int = 0
    total_tokens: int = 0

    def merge(self, other: TokenUsage) -> TokenUsage:
        """Return a new ``TokenUsage`` holding the field-wise sums.

        Neither ``self`` nor *other* is modified.
        """
        summed = {
            name: getattr(self, name) + getattr(other, name)
            for name in ("prompt_tokens", "completion_tokens", "total_tokens")
        }
        return TokenUsage(**summed)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class AgentRun(BaseModel):
    # State and step log of one pipeline run for a task.
    task: str
    steps: list[Step] = Field(default_factory=list)
    status: RunStatus = RunStatus.PENDING
    token_usage: TokenUsage = Field(default_factory=TokenUsage)
    duration_seconds: float = 0.0
    iteration: int = 0
    max_iterations: int = 5
    # Timezone-aware UTC time taken when the instance is created.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
    )
    # Stays ``None`` until a completion time is assigned.
    completed_at: datetime | None = None
    metadata: dict[str, Any] = Field(default_factory=dict)

    def add_step(self, step: Step) -> None:
        """Append *step* to the end of the step log."""
        self.steps.append(step)

    def elapsed(self) -> float:
        """Return seconds since ``created_at``.

        Measured up to ``completed_at`` when that is set, otherwise up to the
        current UTC time.
        """
        end = self.completed_at if self.completed_at else datetime.now(timezone.utc)
        return (end - self.created_at).total_seconds()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class PipelineConfig(BaseModel):
    # Settings for one pipeline instance; every field has a default.

    # Model names.
    model: str = "gpt-4o"
    verify_model: str = "reason-critic-7b"

    # Loop limits and acceptance threshold.
    max_iterations: int = 5
    confidence_threshold: float = 0.8
    max_recovery_attempts: int = 3

    # Execution settings.
    working_dir: str = "."
    dry_run: bool = False
    interactive: bool = False
    sandbox: bool = False
    sandbox_image: str = "python:3.11-slim"

    # Every instance gets its own empty dict.
    callbacks: dict[str, Any] = Field(default_factory=dict)
|
verifyloop/pipeline.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
"""Full Pipeline: Plan → Execute → Verify → Recover loop."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Any, Callable, Coroutine
|
|
8
|
+
|
|
9
|
+
from verifyloop.executor import Executor
|
|
10
|
+
from verifyloop.memory import ConversationContext, InMemoryStore, MemoryStore
|
|
11
|
+
from verifyloop.models import (
|
|
12
|
+
AgentRun,
|
|
13
|
+
ExecuteStep,
|
|
14
|
+
PipelineConfig,
|
|
15
|
+
PlanStep,
|
|
16
|
+
RecoverStep,
|
|
17
|
+
RunStatus,
|
|
18
|
+
Step,
|
|
19
|
+
StepType,
|
|
20
|
+
TokenUsage,
|
|
21
|
+
VerifyStep,
|
|
22
|
+
)
|
|
23
|
+
from verifyloop.planner import PlanGenerator
|
|
24
|
+
from verifyloop.recoverer import Recoverer
|
|
25
|
+
from verifyloop.verifier import Verifier, VerifierConfig
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
CallbackFn = Callable[[str, dict[str, Any]], Coroutine[Any, Any, None]] | None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AgentPipeline:
    """Drive a task through the Plan -> Execute -> Verify -> Recover loop."""

    def __init__(self, config: PipelineConfig | None = None) -> None:
        """Build the planner, executor, verifier and recoverer from *config*.

        Args:
            config: Pipeline settings; ``PipelineConfig()`` is used when omitted.
        """
        self.config = config or PipelineConfig()
        self._planner = PlanGenerator(
            model=self.config.model,
        )
        self._executor = Executor(
            working_dir=self.config.working_dir,
            sandbox=self.config.sandbox,
            sandbox_image=self.config.sandbox_image,
        )
        self._verifier = Verifier(
            VerifierConfig(
                verify_model=self.config.verify_model,
                confidence_threshold=self.config.confidence_threshold,
            )
        )
        self._recoverer = Recoverer(
            model=self.config.model,
            max_recovery_attempts=self.config.max_recovery_attempts,
        )
        self._memory: MemoryStore = InMemoryStore()
        self._context = ConversationContext(self._memory)
        self._callbacks: list[CallbackFn] = []

    @property
    def token_usage(self) -> TokenUsage:
        """Return the merged token usage of planner, verifier and recoverer."""
        return (
            self._planner.token_usage
            .merge(self._verifier.token_usage)
            .merge(self._recoverer.token_usage)
        )

    def on_event(self, callback: CallbackFn) -> None:
        """Register *callback* to receive every event emitted by the pipeline."""
        self._callbacks.append(callback)

    async def _emit(self, event: str, data: dict[str, Any]) -> None:
        """Await every registered callback with ``(event, data)``.

        ``None`` entries are skipped. A callback that raises is logged and
        does not stop the remaining callbacks or the run.
        """
        import logging  # function-scope: not imported at file level

        for cb in self._callbacks:
            if cb is None:
                continue
            try:
                await cb(event, data)
            except Exception:
                logging.getLogger(__name__).warning(
                    "Event callback failed for %r", event, exc_info=True
                )

    def _finalize(self, run: AgentRun, status: RunStatus, start_time: float) -> AgentRun:
        """Stamp *run* with its terminal status, duration, end time and token usage."""
        run.status = status
        run.duration_seconds = time.monotonic() - start_time
        run.completed_at = datetime.now(timezone.utc)
        run.token_usage = self.token_usage
        return run

    async def run(
        self,
        task: str,
        context: str = "",
        max_iterations: int | None = None,
    ) -> AgentRun:
        """Run *task* until verification passes or the iteration budget ends.

        Args:
            task: Task description handed to the planner.
            context: Optional caller context for the planner. It is kept as a
                prefix of the context rebuilt after a failed iteration.
            max_iterations: Iteration budget; a falsy value (``None`` or ``0``)
                falls back to ``config.max_iterations``.

        Returns:
            The ``AgentRun`` with status ``COMPLETED`` or ``FAILED``. An
            exception raised inside the loop is not propagated: it is stored
            in ``run.metadata["error"]`` and reported via a ``run_error`` event.
        """
        max_iters = max_iterations or self.config.max_iterations
        run = AgentRun(
            task=task,
            max_iterations=max_iters,
            status=RunStatus.PENDING,
        )
        start_time = time.monotonic()
        # Kept separately because ``context`` is rebuilt after a failed iteration.
        caller_context = context

        try:
            await self._emit("run_start", {"task": task})

            for iteration in range(1, max_iters + 1):
                run.iteration = iteration
                await self._emit("iteration_start", {"iteration": iteration})

                # Phase 1: Plan
                run.status = RunStatus.PLANNING
                await self._emit("phase_start", {"phase": "plan", "iteration": iteration})

                plan = await self._planner.generate_plan(
                    task, context or self._context.build_context_string()
                )
                run.add_step(Step(
                    step_type=StepType.PLAN,
                    content=plan.description,
                    confidence=0.7,
                ))
                await self._emit("phase_complete", {
                    "phase": "plan",
                    "description": plan.description,
                    "substeps": plan.substeps,
                })

                if self.config.dry_run:
                    # Finish like every other exit: stamp the end time and token
                    # usage, and emit the terminal event listeners wait for.
                    self._finalize(run, RunStatus.COMPLETED, start_time)
                    await self._emit("run_complete", {
                        "status": "completed",
                        "iterations": iteration,
                        "dry_run": True,
                    })
                    return run

                # Phase 2: Execute
                run.status = RunStatus.EXECUTING
                await self._emit("phase_start", {"phase": "execute", "iteration": iteration})

                execute_steps: list[ExecuteStep] = []
                for substep in plan.substep_details:
                    if self.config.interactive:
                        proceed = await self._confirm_substep(substep)
                        if not proceed:
                            continue

                    step_result = await self._executor.execute(substep.tool, substep.arguments)
                    execute_steps.append(step_result)
                    run.add_step(Step(
                        step_type=StepType.EXECUTE,
                        content=f"{substep.tool}: {substep.description}",
                        tool_calls=[{"tool": substep.tool, "args": substep.arguments}],
                        confidence=1.0 if step_result.success else 0.0,
                    ))
                    await self._emit("step_complete", {
                        "tool": substep.tool,
                        "success": step_result.success,
                        "iteration": iteration,
                    })

                    # Successful reads feed the context used for later planning.
                    if substep.tool == "read" and step_result.success:
                        self._context.add_file_context(
                            substep.arguments.get("file_path", ""), step_result.result
                        )

                # Phase 3: Verify
                run.status = RunStatus.VERIFYING
                await self._emit("phase_start", {"phase": "verify", "iteration": iteration})

                verification = await self._verifier.verify_code_edits(plan, execute_steps)
                run.add_step(Step(
                    step_type=StepType.VERIFY,
                    content=f"Passed: {verification.passed}, Confidence: {verification.confidence:.2f}",
                    confidence=verification.confidence,
                ))
                await self._emit("phase_complete", {
                    "phase": "verify",
                    "passed": verification.passed,
                    "confidence": verification.confidence,
                    "failures": verification.failures,
                })

                if verification.passed and verification.confidence >= self.config.confidence_threshold:
                    self._finalize(run, RunStatus.COMPLETED, start_time)
                    await self._emit("run_complete", {"status": "completed", "iterations": iteration})
                    return run

                # Phase 4: Recover (if verification failed)
                run.status = RunStatus.RECOVERING
                await self._emit("phase_start", {"phase": "recover", "iteration": iteration})

                failure_messages = verification.failures or ["Verification failed"]
                all_errors = "; ".join(failure_messages)

                for recovery_attempt in range(1, self.config.max_recovery_attempts + 1):
                    recovery = await self._recoverer.recover(
                        error=all_errors,
                        context=self._context.build_context_string(),
                        attempt=recovery_attempt,
                        failed_step=execute_steps[-1] if execute_steps else None,
                    )
                    run.add_step(Step(
                        step_type=StepType.RECOVER,
                        content=f"Recovery attempt {recovery_attempt}: {recovery.recovery_attempt}",
                        confidence=0.5,
                    ))
                    await self._emit("recovery_attempt", {
                        "attempt": recovery_attempt,
                        "type": recovery.recovery_type,
                        "description": recovery.recovery_attempt,
                    })

                    if recovery.patched_arguments:
                        tool = recovery.patched_arguments.get("tool", "bash")
                        args = recovery.patched_arguments.get("arguments", {})
                        recovery_exec = await self._executor.execute(tool, args)
                        execute_steps.append(recovery_exec)

                        # Re-verify after recovery
                        recheck = await self._verifier.verify_code_edits(plan, execute_steps)
                        if recheck.passed and recheck.confidence >= self.config.confidence_threshold:
                            self._finalize(run, RunStatus.COMPLETED, start_time)
                            await self._emit("run_complete", {"status": "completed_after_recovery"})
                            return run

                    if recovery.exhausted:
                        break

                # Recovery didn't fix it — loop back for the next iteration with
                # the failures appended, keeping any caller-supplied context.
                context = self._context.build_context_string() + f"\nPrevious failures: {all_errors}"
                if caller_context:
                    context = f"{caller_context}\n{context}"

            self._finalize(run, RunStatus.FAILED, start_time)
            await self._emit("run_complete", {"status": "failed", "iterations": max_iters})
            return run

        except Exception as exc:
            # Top-level boundary: report the failure through the run object.
            self._finalize(run, RunStatus.FAILED, start_time)
            run.metadata["error"] = str(exc)
            await self._emit("run_error", {"error": str(exc)})
            return run

    async def _confirm_substep(self, substep: Any) -> bool:
        """Ask on the console whether *substep* should be executed.

        Returns ``True`` without asking when ``rich`` is not installed.
        """
        try:
            from rich.console import Console
            from rich.prompt import Confirm

            console = Console()
            console.print(f"\n[bold blue]Step:[/] {substep.description}")
            console.print(f"  [dim]Tool: {substep.tool}[/dim]")
            console.print(f"  [dim]Args: {substep.arguments}[/dim]")
            # NOTE(review): the prompt is a synchronous call inside a coroutine,
            # so the event loop presumably waits while the user answers — confirm.
            return Confirm.ask("Execute this step?", default=True)
        except ImportError:
            return True
|