borisxdave 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- boris.py +672 -0
- borisxdave-0.2.0.dist-info/METADATA +6 -0
- borisxdave-0.2.0.dist-info/RECORD +12 -0
- borisxdave-0.2.0.dist-info/WHEEL +5 -0
- borisxdave-0.2.0.dist-info/entry_points.txt +2 -0
- borisxdave-0.2.0.dist-info/top_level.txt +7 -0
- config.py +12 -0
- engine.py +684 -0
- git_manager.py +248 -0
- planner.py +161 -0
- prompts.py +687 -0
- state.py +103 -0
engine.py
ADDED
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
"""Boris engine - execution and monitoring (merged from executor + monitor)."""
|
|
2
|
+
import enum
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
import tempfile
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
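# Ask child Python processes (e.g. DaveLoop) to run unbuffered so their output streams in real time; setting this at runtime does not change buffering of the already-running interpreter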
|
|
15
|
+
os.environ.setdefault("PYTHONUNBUFFERED", "1")
|
|
16
|
+
|
|
17
|
+
from state import Milestone, UIMilestone
|
|
18
|
+
|
|
19
|
+
# True when running on Windows.
IS_WINDOWS = sys.platform == "win32"
logger = logging.getLogger("boris.engine")

# Regex to strip ANSI CSI escape sequences: ESC [ followed by parameter bytes
# (0x30-0x3F, which includes digits, ';' and '?'), optional intermediate bytes
# (0x20-0x2F) and one final byte (0x40-0x7E). The previous pattern only
# accepted digits and ';' as parameters, so private-mode sequences such as
# "\x1b[?25l" (hide cursor) leaked into the cleaned text.
_ANSI_RE = re.compile(r"\x1b\[[0-?]*[ -/]*[@-~]")

# Directories (relative to Boris install dir)
_BORIS_DIR = os.path.dirname(os.path.abspath(__file__))
_LOGS_DIR = os.path.join(_BORIS_DIR, "logs")
_BORIS_PROMPT_PATH = os.path.join(_BORIS_DIR, "boris_prompt.md")

# Resolve commands to full paths so shell=False works on Windows (.cmd files).
# Falls back to the bare name when the executable is not on PATH.
CLAUDE_CMD = shutil.which("claude") or "claude"

# Cached Boris system prompt (None until _load_boris_prompt() first runs)
_boris_prompt_cache = None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _load_boris_prompt() -> str:
    """Return Boris's management prompt from boris_prompt.md, cached after first load.

    The file content is stripped and stored in the module-level
    ``_boris_prompt_cache``. The prompt is optional: if the file is missing,
    unreadable, or not valid UTF-8, a warning is logged and an empty string is
    cached so callers simply proceed without it.
    """
    global _boris_prompt_cache
    if _boris_prompt_cache is not None:
        return _boris_prompt_cache
    try:
        with open(_BORIS_PROMPT_PATH, "r", encoding="utf-8") as f:
            _boris_prompt_cache = f.read().strip()
        logger.debug("Loaded Boris prompt: %d chars", len(_boris_prompt_cache))
    except FileNotFoundError:
        logger.warning("boris_prompt.md not found at %s", _BORIS_PROMPT_PATH)
        _boris_prompt_cache = ""
    except (OSError, UnicodeDecodeError) as exc:
        # Permission problems, a directory in place of the file, or a bad
        # encoding must not abort the run: fall back to "no prompt".
        logger.warning("Could not read boris_prompt.md at %s: %s", _BORIS_PROMPT_PATH, exc)
        _boris_prompt_cache = ""
    return _boris_prompt_cache
|
|
50
|
+
# DaveLoop executable: the full path when it is found on PATH, otherwise the
# bare command name.
DAVELOOP_CMD = shutil.which("daveloop") or "daveloop"

# Limits
# Iteration cap passed to DaveLoop when the caller does not supply one.
DEFAULT_MAX_ITERATIONS = 15
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _clean_output(text: str) -> str:
    """Return *text* with ANSI escape codes removed and reduced to plain ASCII.

    Characters outside ASCII (emoji, box-drawing glyphs, ...) are not dropped:
    the "replace" error handler substitutes a "?" for each of them.
    """
    without_ansi = _ANSI_RE.sub("", text)
    ascii_bytes = without_ansi.encode("ascii", errors="replace")
    return ascii_bytes.decode("ascii")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# --- Execution (from executor.py) ---
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class ExecutionResult:
    """Outcome of a single DaveLoop run as captured by Boris."""

    # Captured process output, or an error message when the run never started.
    output: str
    # Exit status reported for the run.
    exit_code: int
    # Whether the run was judged resolved by the code that built this result.
    resolved: bool
    # Path to Boris's log file for this run, when one is available.
    log_path: Optional[str] = None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _boris_commentary(reasoning: dict, step_num: int, accomplishments: list):
    """Print Boris's check-in for one DaveLoop reasoning block.

    The report lists the accomplishments collected so far (or a start-up note
    on the first check-in when there are none), followed by the non-empty
    KNOWN / HYPOTHESIS / NEXT entries of *reasoning*. Every line is flushed
    immediately.
    """
    known, hypothesis, next_action = (
        reasoning.get(key, "").strip() for key in ("KNOWN", "HYPOTHESIS", "NEXT")
    )

    report = ["", f" [Boris] === DaveLoop Check-in #{step_num} ==="]

    # What DaveLoop got done since the previous check-in.
    if accomplishments:
        report.append(" [Boris] Done so far:")
        report.extend(f" [Boris] - {item}" for item in accomplishments)
    elif step_num == 1:
        report.append(" [Boris] Starting up, gathering info.")

    # Current knowledge, working hypothesis and planned next step.
    labelled = (("Knows", known), ("Thinking", hypothesis), ("Next", next_action))
    report.extend(f" [Boris] {label}: {value}" for label, value in labelled if value)

    report.append(" [Boris] ===========================")
    report.append("")

    for entry in report:
        print(entry, flush=True)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# PASS / FAIL as stand-alone words (optionally inflected). Letters and "_" on
# either side disqualify a match, so "password", "bypass" or "failover" are
# no longer mistaken for test results.
_PASS_WORD_RE = re.compile(r"(?<![A-Za-z])PASS(?:ES|ED|ING)?(?![A-Za-z])", re.IGNORECASE)
_FAIL_WORD_RE = re.compile(r"(?<![A-Za-z])FAIL(?:S|ED|ING|URE|URES)?(?![A-Za-z])", re.IGNORECASE)


def _parse_accomplishment(clean_line: str) -> Optional[str]:
    """Parse a DaveLoop output line into a human-readable accomplishment, or None.

    Checks run in priority order: Write(...), Edit(...), Bash(...) tool calls,
    then pass/fail indicators, then iteration markers, then the
    "[DAVELOOP:RESOLVED]" signal. The first check that applies wins.

    Args:
        clean_line: One output line, expected to be already ANSI-stripped.

    Returns:
        A short description of what the line reports, or None when the line
        carries nothing worth tracking.
    """
    # File writes
    write_match = re.search(r"Write\((.+?)\)", clean_line)
    if write_match:
        return f"Created {write_match.group(1).strip()}"

    # File edits
    edit_match = re.search(r"Edit\((.+?)\)", clean_line)
    if edit_match:
        return f"Edited {edit_match.group(1).strip()}"

    # Bash commands
    bash_match = re.search(r"Bash\((.+?)\)", clean_line)
    if bash_match:
        cmd = bash_match.group(1).strip()
        cmd_lower = cmd.lower()
        if "test" in cmd_lower:  # also covers "pytest"
            return f"Ran tests: {cmd}"
        if "mkdir" in cmd_lower:
            return f"Created directory: {cmd}"
        if "pip install" in cmd_lower or "npm install" in cmd_lower:
            return f"Installed dependencies: {cmd}"
        return f"Ran: {cmd}"

    # Tool results - success/failure.
    # "? ?" is kept as a success marker exactly as before. It used to appear
    # in the failure test as well, but that occurrence could never be reached
    # because the success test always matched first.
    if "? ?" in clean_line or _PASS_WORD_RE.search(clean_line):
        return "Verification passed"
    if _FAIL_WORD_RE.search(clean_line) and "test" in clean_line.lower():
        return "Test failed - will retry"

    # DaveLoop iteration markers
    if "ITERATION" in clean_line.upper():
        iter_match = re.search(r"(\d+)", clean_line)
        if iter_match:
            return f"DaveLoop iteration {iter_match.group(1)}"

    # RESOLVED signal
    if "[DAVELOOP:RESOLVED]" in clean_line:
        return "DaveLoop reports: RESOLVED"

    return None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _check_off_rail(clean_line: str, milestone: "Milestone") -> Optional[str]:
    """Detect if DaveLoop is going off-rail. Returns interrupt message or None.

    Three heuristics are applied to the (already cleaned) output line:

    1. A ``Write(<path>)`` whose basename is not among the milestone's
       ``files_to_create`` / ``files_to_modify`` (skipped when the milestone
       lists no files at all).
    2. A phrase announcing that the whole project is being built.
    3. A mention of another milestone id (``M<number>``) on a line that also
       talks about building, implementing or creating.

    Args:
        clean_line: One cleaned output line from DaveLoop.
        milestone: Object exposing ``id``, ``title``, ``files_to_create`` and
            ``files_to_modify``.

    Returns:
        The text to send to DaveLoop as an interrupt, or None if the line
        looks in scope.
    """
    lower = clean_line.lower()

    def _base(path: str) -> str:
        # Treat "\" as a separator too, so Windows-style paths reduce to their
        # file name regardless of the platform Boris runs on.
        return os.path.basename(path.replace("\\", "/"))

    # Detect building wrong files. De-duplicate while keeping the milestone's
    # own order so the interrupt text is stable from run to run (joining a
    # set produced an arbitrary order).
    allowed_files = list(dict.fromkeys(
        list(milestone.files_to_create or []) + list(milestone.files_to_modify or [])
    ))
    if allowed_files:
        # Check for Write tool creating files outside scope
        write_match = re.search(r"Write\((.+?)\)", clean_line)
        if write_match:
            written_file = write_match.group(1).strip()
            # Normalize: strip path prefixes, compare basenames
            written_base = _base(written_file)
            allowed_bases = {_base(f) for f in allowed_files}
            if written_base and written_base not in allowed_bases:
                return (
                    f"wait - you are creating {written_file} which is outside the scope of "
                    f"{milestone.id}. Only touch: {', '.join(allowed_files)}. "
                    f"Focus on {milestone.id}: {milestone.title} only."
                )

    # Detect building entire project / other milestones
    off_rail_phrases = (
        "build the entire project",
        "build everything",
        "implement all milestones",
        "implement all features",
        "building the full",
    )
    if any(phrase in lower for phrase in off_rail_phrases):
        return (
            f"wait - you are going off-rail. You are only building {milestone.id}: "
            f"{milestone.title}. Do NOT build the entire project. "
            f"Focus ONLY on {milestone.id}."
        )

    # Detect mentioning other milestone IDs in action context. The action-word
    # test does not depend on the id, so evaluate it once.
    if "building" in lower or "implement" in lower or "creating" in lower:
        for m_id in re.findall(r"\bM\d+\b", clean_line):
            if m_id != milestone.id:
                return (
                    f"wait - you mentioned {m_id} but you should only be working on "
                    f"{milestone.id}: {milestone.title}. Stay in scope."
                )

    return None
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _send_interrupt(process, message: str, boris_log):
    """Send a text interrupt to DaveLoop's stdin.

    The message is written as one UTF-8 line. The interrupt is announced
    (console, logger, and *boris_log* when given) only after the write and
    flush succeeded, so the logs never claim an interrupt that was not
    delivered.

    Args:
        process: The running DaveLoop process; its ``stdin`` is used.
        message: Interrupt text, without trailing newline.
        boris_log: Open text file for Boris's log, or a falsy value.

    Returns:
        True if the message was written to stdin, False if stdin was
        unavailable or the write failed. (Earlier callers ignore the value.)
    """
    stdin = getattr(process, "stdin", None)
    if stdin is None or stdin.closed:
        logger.debug("Could not send interrupt to DaveLoop stdin")
        return False

    try:
        stdin.write(f"{message}\n".encode("utf-8"))
        stdin.flush()
    except (OSError, ValueError):
        # OSError covers BrokenPipeError; ValueError is raised when the pipe
        # was closed between the check above and the write.
        logger.debug("Could not send interrupt to DaveLoop stdin")
        return False

    log_msg = f"[Boris INTERRUPT] {message}"
    print(f"\n {log_msg}\n", flush=True)
    logger.warning(log_msg)
    if boris_log:
        try:
            boris_log.write(f"\n{log_msg}\n")
        except OSError:
            logger.debug("Could not record interrupt in Boris log")
    return True
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def run(prompt: str, project_dir: str, max_iterations: int = None,
        milestone: Milestone = None) -> ExecutionResult:
    """Spawn DaveLoop with the crafted prompt. Boris manages, DaveLoop builds.

    The prompt is written to a temporary file that is handed to DaveLoop and
    deleted afterwards. DaveLoop's combined stdout/stderr is echoed to this
    process's stdout, copied to a Boris log file and scanned line by line for
    accomplishments and reasoning blocks.

    If milestone is provided, Boris monitors DaveLoop's output in real-time
    and intervenes via text interrupt (at most MAX_INTERRUPTS times) if
    DaveLoop goes off-rail.

    Args:
        prompt: Full prompt text for DaveLoop.
        project_dir: Directory DaveLoop runs in and works on.
        max_iterations: Iteration cap; DEFAULT_MAX_ITERATIONS when None.
        milestone: Milestone used for off-rail checks, or None to disable them.

    Returns:
        An ExecutionResult. ``resolved`` is True when the output contains
        "[DAVELOOP:RESOLVED]". If the DaveLoop command (or the project
        directory) cannot be found, exit_code is 127; other subprocess errors
        yield exit_code 1.
    """
    max_iter = max_iterations if max_iterations is not None else DEFAULT_MAX_ITERATIONS
    temp_file = None
    boris_log = None
    process = None

    try:
        # Write prompt to temp file for DaveLoop
        temp_file = tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8"
        )
        temp_file.write(prompt)
        temp_file.close()

        # Boris spawns DaveLoop as the worker
        cmd = [
            DAVELOOP_CMD,
            "-f", temp_file.name,
            "-d", project_dir,
            "-m", str(max_iter),
            "-v",
        ]

        env = os.environ.copy()
        env["PYTHONIOENCODING"] = "utf-8"

        print(f" [Boris] Spawning DaveLoop: max_iter={max_iter}, project={project_dir}", flush=True)
        logger.info("Spawning DaveLoop: max_iter=%d, project=%s", max_iter, project_dir)
        logger.debug("Prompt length: %d chars", len(prompt))

        # Boris's own log for this execution
        log_path = _setup_log(project_dir)
        boris_log = open(log_path, "w", encoding="utf-8")
        boris_log.write("=== Boris Execution Log ===\n")
        boris_log.write(f"Timestamp: {datetime.now().isoformat()}\n")
        boris_log.write(f"Project: {project_dir}\n")
        boris_log.write(f"Max iterations: {max_iter}\n")
        boris_log.write(f"Prompt file: {temp_file.name}\n")
        boris_log.write(f"Prompt length: {len(prompt)} chars\n")
        boris_log.write("===========================\n\n")

        # Run DaveLoop with real-time streaming + stdin for interrupts
        # bufsize=0 disables OS-level pipe buffering for real-time output on Windows
        process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=project_dir,
            env=env,
            bufsize=0,
        )

        output_lines = []
        # Reasoning block parser state
        in_reasoning = False
        reasoning_lines = []
        reasoning_count = 0
        accomplishments = []  # tracks what DaveLoop did since last reasoning block
        all_accomplishments = []  # cumulative for the whole run
        interrupt_count = 0
        MAX_INTERRUPTS = 3  # After 3 interrupts, let DaveLoop finish and fail at verdict

        for raw_line in process.stdout:
            line = raw_line.decode("utf-8", errors="replace")
            sys.stdout.write(line)
            sys.stdout.flush()
            output_lines.append(line)
            boris_log.write(line)

            # Parse cleaned line for monitoring
            clean = _clean_output(line).strip()
            # ANSI-stripped but NOT ASCII-folded view of the line. The
            # box-drawing end marker below has to be tested against this:
            # _clean_output() turns every non-ASCII character into "?", so a
            # test against `clean` could never match.
            visible = _ANSI_RE.sub("", line).strip()

            # --- Track accomplishments between reasoning blocks ---
            if not in_reasoning:
                acc = _parse_accomplishment(clean)
                if acc:
                    accomplishments.append(acc)
                    all_accomplishments.append(acc)

            # --- Reasoning block detection ---
            if "REASONING" in clean and ("===" in clean or "---" in clean or "KNOWN" in clean):
                in_reasoning = True
                reasoning_lines = []
                continue

            if in_reasoning:
                # Check for end of reasoning block
                if clean.startswith(("===", "---")) or visible.startswith("└"):
                    in_reasoning = False
                    reasoning_count += 1
                    # Parse collected reasoning
                    reasoning = {}
                    for rl in reasoning_lines:
                        for key in ("KNOWN", "UNKNOWN", "HYPOTHESIS", "NEXT", "WHY"):
                            if rl.startswith(key + ":") or rl.startswith(key + " :"):
                                reasoning[key] = rl.split(":", 1)[1].strip()
                    if reasoning:
                        _boris_commentary(reasoning, reasoning_count, accomplishments)
                        # Reset per-block accomplishments once they have been
                        # reported; the cumulative list is kept.
                        accomplishments = []
                    reasoning_lines = []
                else:
                    reasoning_lines.append(clean)

            # --- Off-rail detection ---
            if milestone and interrupt_count < MAX_INTERRUPTS:
                interrupt_msg = _check_off_rail(clean, milestone)
                if interrupt_msg:
                    _send_interrupt(process, interrupt_msg, boris_log)
                    interrupt_count += 1
                    if interrupt_count >= MAX_INTERRUPTS:
                        warn = (
                            f"[Boris] Sent {MAX_INTERRUPTS} interrupts. "
                            f"DaveLoop keeps going off-rail. Will check at verdict."
                        )
                        print(f"\n {warn}\n", flush=True)
                        logger.warning(warn)
                        if boris_log:
                            boris_log.write(f"\n{warn}\n")

        # stdout reached EOF: nothing more will be sent, so release stdin
        # before waiting in case the child is still blocked reading it.
        if process.stdin is not None:
            try:
                process.stdin.close()
            except OSError:
                pass
        process.wait()
        output = "".join(output_lines)

        # Boris end-of-run summary
        if all_accomplishments:
            print(flush=True)
            print(" [Boris] === DaveLoop Run Complete ===", flush=True)
            print(f" [Boris] Total actions tracked: {len(all_accomplishments)}", flush=True)
            for a in all_accomplishments:
                print(f" [Boris] - {a}", flush=True)
            print(" [Boris] =============================", flush=True)
            print(flush=True)

        boris_log.write(f"\n=== DaveLoop exit code: {process.returncode} ===\n")
        if interrupt_count > 0:
            boris_log.write(f"Boris interrupts sent: {interrupt_count}\n")
        if all_accomplishments:
            boris_log.write(f"Tracked accomplishments: {len(all_accomplishments)}\n")
            for a in all_accomplishments:
                boris_log.write(f" - {a}\n")

        # Check if DaveLoop resolved the milestone
        resolved = "[DAVELOOP:RESOLVED]" in output

        # Also find DaveLoop's own log path
        daveloop_log = None
        log_match = re.search(r"Logs:\s*(\S+)", output)
        if log_match:
            daveloop_log = log_match.group(1)
            boris_log.write(f"DaveLoop log: {daveloop_log}\n")

        logger.info(
            "DaveLoop finished. Exit code: %d, Resolved: %s, Interrupts: %d",
            process.returncode, resolved, interrupt_count
        )

        return ExecutionResult(
            output=output,
            exit_code=process.returncode,
            resolved=resolved,
            log_path=log_path,
        )

    except FileNotFoundError:
        # Popen raises this both for a missing executable and (on POSIX) for a
        # missing working directory; report whichever actually applies.
        if not os.path.isdir(project_dir):
            return ExecutionResult(
                output=f"Error: project directory '{project_dir}' not found.",
                exit_code=127,
                resolved=False,
            )
        return ExecutionResult(
            output=f"Error: '{DAVELOOP_CMD}' not found. Install with: pip install daveloop",
            exit_code=127,
            resolved=False,
        )
    except subprocess.SubprocessError as e:
        return ExecutionResult(
            output=f"Subprocess error: {e}",
            exit_code=1,
            resolved=False,
        )
    finally:
        if process is not None:
            # Reached with a live child only when an exception (for example
            # KeyboardInterrupt) escaped the loop: do not leave it running.
            if process.poll() is None:
                try:
                    process.kill()
                    process.wait()
                except OSError:
                    pass
            for pipe in (process.stdin, process.stdout):
                if pipe is not None:
                    try:
                        pipe.close()
                    except OSError:
                        pass
        if temp_file is not None:
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
        if boris_log is not None:
            try:
                boris_log.close()
            except OSError:
                pass
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _setup_log(project_dir: str) -> str:
    """Return the path of a new timestamped log file for this execution.

    The logs directory is created when missing. The file itself is not
    created here, and *project_dir* is accepted but not used: the path always
    lies under the module's logs directory.
    """
    os.makedirs(_LOGS_DIR, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    return os.path.join(_LOGS_DIR, "boris_exec_" + stamp + ".log")
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# --- Monitoring (from monitor.py) ---
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
class Verdict(enum.Enum):
    """Possible outcomes of Boris's evaluation of a DaveLoop run."""

    # Each member's value is its own name as a string.
    RESOLVED = "RESOLVED"
    OFF_PLAN = "OFF_PLAN"
    FAILED = "FAILED"
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
@dataclass
class VerdictResult:
    """A verdict together with the explanation behind it."""

    # The outcome category.
    verdict: Verdict
    # Human-readable justification for the verdict.
    reason: str
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def check(result: ExecutionResult, milestone: Milestone) -> VerdictResult:
    """Turn an execution result into a verdict for *milestone*.

    A self-reported resolution is accepted as RESOLVED. Otherwise a non-zero
    exit code yields FAILED, with the last 20 output lines as the reason, and
    a zero exit code defers to the Claude-based analysis.
    """
    ms_id = milestone.id
    print(f" [Boris] Evaluating DaveLoop result for {ms_id}...", flush=True)

    if result.resolved:
        print(f" [Boris] DaveLoop self-reported RESOLVED for {ms_id}", flush=True)
        return VerdictResult(verdict=Verdict.RESOLVED, reason="DaveLoop reported resolution")

    if result.exit_code == 0:
        # Clean exit without a resolution marker: let Claude judge the output.
        print(f" [Boris] Running Claude verdict analysis for {ms_id}...", flush=True)
        return _analyze_with_claude(result.output, milestone)

    print(f" [Boris] DaveLoop exited with code {result.exit_code} for {ms_id}", flush=True)
    # Keep only the tail of long output for the failure reason.
    stripped = result.output.strip()
    all_lines = stripped.splitlines()
    if len(all_lines) > 20:
        tail = "\n".join(all_lines[-20:])
    else:
        tail = stripped
    return VerdictResult(
        verdict=Verdict.FAILED,
        reason=f"Process exited with code {result.exit_code}. Output tail:\n{tail}",
    )
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _analyze_with_claude(output: str, milestone: Milestone) -> VerdictResult:
    """Use Claude CLI to analyze whether output achieved the milestone.

    The cleaned output (last 300 lines at most) and the milestone's
    acceptance criteria are sent to the Claude CLI on stdin, preceded by
    Boris's management prompt when one is available. The first line of the
    reply is expected to carry one of RESOLVED / OFF_PLAN / FAILED; the
    remaining lines become the reason.

    Returns:
        The parsed verdict. FAILED is returned when the CLI cannot be started
        (including when it is not installed), times out, exits non-zero,
        replies with nothing, or the first line names no verdict.
    """
    criteria_text = "\n".join(f"- {c}" for c in (milestone.acceptance_criteria or []))
    # Clean and truncate output
    clean = _clean_output(output)
    output_lines = clean.strip().splitlines()
    truncated = "\n".join(output_lines[-300:]) if len(output_lines) > 300 else clean.strip()

    boris_brain = _load_boris_prompt()
    prompt = ""
    if boris_brain:
        prompt += boris_brain + "\n\n---\n\n"
    prompt += (
        f"Analyze this build output. Did it achieve these acceptance criteria:\n"
        f"{criteria_text}\n\n"
        f"Build output:\n{truncated}\n\n"
        f"Answer with exactly one of: RESOLVED, OFF_PLAN, FAILED. "
        f"Then explain why on the next line."
    )

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    try:
        result = subprocess.run(
            [CLAUDE_CMD, "-p", "--no-session-persistence"],
            input=prompt,
            capture_output=True,
            timeout=120,
            env=env,
            encoding="utf-8",
            errors="replace",
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: the executable is missing or cannot be started (this was
        # previously uncaught). SubprocessError includes TimeoutExpired.
        logger.warning("Claude verdict analysis could not run: %s", exc)
        return VerdictResult(
            verdict=Verdict.FAILED,
            reason="Could not analyze output",
        )

    if result.returncode != 0:
        return VerdictResult(
            verdict=Verdict.FAILED,
            reason="Could not analyze output",
        )

    response = (result.stdout or "").strip()
    lines = response.splitlines()
    if not lines:
        return VerdictResult(verdict=Verdict.FAILED, reason="Empty analysis response")

    first_line = lines[0].strip().upper()
    reason = "\n".join(lines[1:]).strip() if len(lines) > 1 else "No explanation provided"

    # Take the first verdict keyword that stands alone. A plain substring
    # test read "UNRESOLVED" as RESOLVED and let a later "RESOLVED" override
    # a leading "FAILED".
    keyword = re.search(r"(?<![A-Z_])(RESOLVED|OFF_PLAN|FAILED)(?![A-Z_])", first_line)
    if keyword is None:
        return VerdictResult(verdict=Verdict.FAILED, reason=reason)
    return VerdictResult(verdict=Verdict(keyword.group(1)), reason=reason)
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
# --- UI Testing (DaveLoop v1.4) ---
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
# Lower-case phrases that, in UI tester mode, indicate DaveLoop is building
# new functionality instead of testing the existing UI. Matched as substrings
# of the lower-cased output line.
_UI_OFF_RAIL_PHRASES = [
    "add new feature", "implement new", "create new endpoint",
    "add new screen", "build new component",
]
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def _parse_ui_accomplishment(clean_line: str, ui_milestone: "UIMilestone") -> Optional[str]:
    """Parse a DaveLoop output line into a UI-specific accomplishment, or None.

    Recognizes screenshot captures, Maestro/Playwright test runs and the
    "ISSUE FOUND:" / "FIX APPLIED:" markers. Marker text is recorded on
    ``ui_milestone.issues_found`` / ``ui_milestone.issues_fixed``, skipping
    entries that are already present (the same rule run_ui_test() applies
    when it scans the finished output).

    Falls back to the standard _parse_accomplishment for non-UI-specific lines.
    """
    clean = clean_line.strip()
    lower = clean.lower()

    # Screenshots
    if "screenshot" in lower and ("saved" in lower or "captured" in lower or "taken" in lower):
        return "Captured screenshot"

    # Playwright/Maestro test runs
    if "maestro test" in lower or "playwright test" in lower:
        return f"Ran UI test: {clean}"

    # Issue tracking
    issue_match = re.search(r"ISSUE FOUND:\s*(.+)", clean)
    if issue_match:
        issue = issue_match.group(1)
        if issue not in ui_milestone.issues_found:
            ui_milestone.issues_found.append(issue)
        return f"Found issue: {issue}"

    fix_match = re.search(r"FIX APPLIED:\s*(.+)", clean)
    if fix_match:
        fix = fix_match.group(1)
        if fix not in ui_milestone.issues_fixed:
            ui_milestone.issues_fixed.append(fix)
        return f"Fixed: {fix}"

    # Fall back to standard parsing
    return _parse_accomplishment(clean)
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def _check_ui_off_rail(clean_line: str, ui_milestone: UIMilestone) -> Optional[str]:
    """Return an interrupt message when a UI-test line talks about building new things.

    The lower-cased line is searched for any of the _UI_OFF_RAIL_PHRASES;
    None is returned when none of them occurs.
    """
    lowered = clean_line.lower()
    if not any(phrase in lowered for phrase in _UI_OFF_RAIL_PHRASES):
        return None

    return (
        f"wait - you are in UI TESTER MODE. You should be testing "
        f"{ui_milestone.id}: {ui_milestone.title}, not building new features. "
        f"Test existing UI and fix visual/UX issues only."
    )
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def run_ui_test(prompt: str, project_dir: str, max_iterations: int = None,
                ui_milestone: UIMilestone = None) -> ExecutionResult:
    """Spawn DaveLoop for UI testing. Thin wrapper around run().

    The UI milestone is wrapped in a plain Milestone with empty
    files_to_create / files_to_modify, so run() applies no file-scope
    restriction; run()'s remaining generic off-rail checks still apply.
    After the run, the output is scanned for "ISSUE FOUND:" and
    "FIX APPLIED:" markers, which are appended (without duplicates) to
    ``ui_milestone.issues_found`` / ``ui_milestone.issues_fixed``.

    NOTE: this function does not call _check_ui_off_rail or
    _parse_ui_accomplishment; live monitoring is whatever run() performs.

    Args:
        prompt: Full prompt text for DaveLoop.
        project_dir: Directory DaveLoop runs in.
        max_iterations: Iteration cap, forwarded to run().
        ui_milestone: UI milestone being tested; when None a generic "UI"
            adapter is used and no post-processing takes place.

    Returns:
        The ExecutionResult produced by run().
    """
    # Milestone-compatible adapter for run(). With both file lists empty,
    # run()'s off-rail check skips its file-scope test, so DaveLoop can touch
    # test files freely.
    adapter = Milestone(
        id=ui_milestone.id if ui_milestone else "UI",
        title=ui_milestone.title if ui_milestone else "UI Test",
        description=ui_milestone.description if ui_milestone else "",
        depends_on=[],
        acceptance_criteria=(ui_milestone.acceptance_criteria or []) if ui_milestone else [],
        files_to_create=[],  # No file restrictions in UI test mode
        files_to_modify=[],
    )

    result = run(prompt, project_dir, max_iterations, milestone=adapter)

    # Post-process: scan output for UI-specific markers
    if ui_milestone:
        for line in result.output.splitlines():
            clean = _clean_output(line).strip()
            issue_match = re.search(r"ISSUE FOUND:\s*(.+)", clean)
            if issue_match and issue_match.group(1) not in ui_milestone.issues_found:
                ui_milestone.issues_found.append(issue_match.group(1))
            fix_match = re.search(r"FIX APPLIED:\s*(.+)", clean)
            if fix_match and fix_match.group(1) not in ui_milestone.issues_fixed:
                ui_milestone.issues_fixed.append(fix_match.group(1))

    return result
|
|
611
|
+
|
|
612
|
+
|
|
613
|
+
def check_ui(result: ExecutionResult, ui_milestone: UIMilestone, test_tool: str) -> VerdictResult:
    """Turn a UI test execution result into a verdict for *ui_milestone*.

    A self-reported resolution is accepted as RESOLVED. Otherwise a non-zero
    exit code yields FAILED, with the last 20 output lines as the reason, and
    a zero exit code defers to the Claude-based UI analysis for *test_tool*.
    """
    ui_id = ui_milestone.id
    print(f" [Boris] Evaluating UI test result for {ui_id}...", flush=True)

    if result.resolved:
        print(f" [Boris] DaveLoop self-reported RESOLVED for {ui_id}", flush=True)
        return VerdictResult(verdict=Verdict.RESOLVED, reason="DaveLoop reported resolution")

    if result.exit_code == 0:
        # Clean exit without a resolution marker: let Claude judge the output.
        print(f" [Boris] Running Claude UI verdict analysis for {ui_id}...", flush=True)
        return _analyze_ui_verdict(result.output, ui_milestone, test_tool)

    print(f" [Boris] DaveLoop exited with code {result.exit_code} for {ui_id}", flush=True)
    stripped = result.output.strip()
    all_lines = stripped.splitlines()
    if len(all_lines) > 20:
        tail = "\n".join(all_lines[-20:])
    else:
        tail = stripped
    return VerdictResult(
        verdict=Verdict.FAILED,
        reason=f"Process exited with code {result.exit_code}. Output tail:\n{tail}",
    )
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def _analyze_ui_verdict(output: str, ui_milestone: UIMilestone, test_tool: str) -> VerdictResult:
    """Use Claude CLI to analyze whether UI testing output achieved the milestone.

    The cleaned output (last 300 lines at most), the acceptance criteria and
    the name of the test tool are sent to the Claude CLI on stdin. The first
    line of the reply is expected to carry one of RESOLVED / OFF_PLAN /
    FAILED; the remaining lines become the reason.

    Returns:
        The parsed verdict. FAILED is returned when the CLI cannot be started
        (including when it is not installed), times out, exits non-zero,
        replies with nothing, or the first line names no verdict.
    """
    criteria_text = "\n".join(f"- {c}" for c in (ui_milestone.acceptance_criteria or []))
    clean = _clean_output(output)
    output_lines = clean.strip().splitlines()
    truncated = "\n".join(output_lines[-300:]) if len(output_lines) > 300 else clean.strip()

    prompt = (
        f"Analyze this UI testing output. Did DaveLoop:\n"
        f"1. Run the specified tests using {test_tool}?\n"
        f"2. Find and report UI issues?\n"
        f"3. Fix the issues it found?\n"
        f"4. Achieve these acceptance criteria:\n{criteria_text}\n\n"
        f"Test output:\n{truncated}\n\n"
        f"Answer with exactly one of: RESOLVED, OFF_PLAN, FAILED.\n"
        f"Then explain why on the next line."
    )

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    try:
        result = subprocess.run(
            [CLAUDE_CMD, "-p", "--no-session-persistence"],
            input=prompt,
            capture_output=True,
            timeout=120,
            env=env,
            encoding="utf-8",
            errors="replace",
        )
    except (OSError, subprocess.SubprocessError) as exc:
        # OSError: the executable is missing or cannot be started (this was
        # previously uncaught). SubprocessError includes TimeoutExpired.
        logger.warning("Claude UI verdict analysis could not run: %s", exc)
        return VerdictResult(verdict=Verdict.FAILED, reason="Could not analyze UI output")

    if result.returncode != 0:
        return VerdictResult(verdict=Verdict.FAILED, reason="Could not analyze UI output")

    response = (result.stdout or "").strip()
    lines = response.splitlines()
    if not lines:
        return VerdictResult(verdict=Verdict.FAILED, reason="Empty analysis response")

    first_line = lines[0].strip().upper()
    reason = "\n".join(lines[1:]).strip() if len(lines) > 1 else "No explanation provided"

    # Take the first verdict keyword that stands alone. A plain substring
    # test read "UNRESOLVED" as RESOLVED and let a later "RESOLVED" override
    # a leading "FAILED".
    keyword = re.search(r"(?<![A-Z_])(RESOLVED|OFF_PLAN|FAILED)(?![A-Z_])", first_line)
    if keyword is None:
        return VerdictResult(verdict=Verdict.FAILED, reason=reason)
    return VerdictResult(verdict=Verdict(keyword.group(1)), reason=reason)
|