borisxdave 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
engine.py ADDED
@@ -0,0 +1,684 @@
1
+ """Boris engine - execution and monitoring (merged from executor + monitor)."""
2
+ import enum
3
+ import logging
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from typing import Optional
13
+
14
+ # Force unbuffered stdout for real-time output on Windows
15
+ os.environ.setdefault("PYTHONUNBUFFERED", "1")
16
+
17
+ from state import Milestone, UIMilestone
18
+
19
# True when the interpreter reports the "win32" platform.
IS_WINDOWS = sys.platform == "win32"
logger = logging.getLogger("boris.engine")

# Pattern for ANSI CSI escape sequences (e.g. colour codes) so they can be removed.
_ANSI_RE = re.compile(r"\x1b\[[0-9;]*[a-zA-Z]")

# Locations derived from the directory that contains this module.
_BORIS_DIR = os.path.dirname(os.path.abspath(__file__))
_LOGS_DIR = os.path.join(_BORIS_DIR, "logs")
_BORIS_PROMPT_PATH = os.path.join(_BORIS_DIR, "boris_prompt.md")

# Prefer the full executable path so shell=False works on Windows (.cmd files);
# fall back to the bare command name when it is not found on PATH.
CLAUDE_CMD = shutil.which("claude") or "claude"

# Boris system prompt cache; None means the prompt has not been loaded yet.
_boris_prompt_cache = None
35
+
36
+
37
def _load_boris_prompt() -> str:
    """Return the stripped text of boris_prompt.md, reading the file at most once.

    The result is stored in the module-level cache. When the file does not
    exist a warning is logged and an empty string is cached instead.
    """
    global _boris_prompt_cache
    if _boris_prompt_cache is None:
        try:
            with open(_BORIS_PROMPT_PATH, "r", encoding="utf-8") as prompt_file:
                prompt_text = prompt_file.read().strip()
        except FileNotFoundError:
            logger.warning("boris_prompt.md not found at %s", _BORIS_PROMPT_PATH)
            prompt_text = ""
        else:
            logger.debug("Loaded Boris prompt: %d chars", len(prompt_text))
        _boris_prompt_cache = prompt_text
    return _boris_prompt_cache
50
# Full path of the DaveLoop executable when it is on PATH, otherwise the bare name.
DAVELOOP_CMD = shutil.which("daveloop") or "daveloop"

# Limits
# Iteration cap handed to DaveLoop when the caller does not specify one.
DEFAULT_MAX_ITERATIONS = 15
54
+
55
+
56
def _clean_output(text: str) -> str:
    """Return *text* with ANSI escape codes removed and reduced to pure ASCII.

    Each non-ASCII character (emoji, box-drawing, ...) is turned into "?",
    which is the substitute produced by encoding with ``errors="replace"``.
    """
    without_ansi = _ANSI_RE.sub("", text)
    ascii_bytes = without_ansi.encode("ascii", errors="replace")
    return ascii_bytes.decode("ascii")
62
+
63
+
64
+ # --- Execution (from executor.py) ---
65
+
66
+
67
@dataclass
class ExecutionResult:
    """What Boris captured from one DaveLoop run."""

    # Text collected from the child process (or an error message).
    output: str
    # Exit status of the run.
    exit_code: int
    # Whether the run was reported as resolved.
    resolved: bool
    # Path of the Boris execution log, when one is available.
    log_path: Optional[str] = None
73
+
74
+
75
def _boris_commentary(reasoning: dict, step_num: int, accomplishments: list):
    """Print Boris's check-in for one DaveLoop reasoning block.

    Args:
        reasoning: Parsed reasoning fields; only KNOWN, HYPOTHESIS and NEXT
            are read, and missing keys count as empty.
        step_num: Ordinal of this check-in, shown in the header.
        accomplishments: Actions recorded since the previous check-in.
    """
    known = reasoning.get("KNOWN", "").strip()
    hypothesis = reasoning.get("HYPOTHESIS", "").strip()
    next_action = reasoning.get("NEXT", "").strip()

    # Assemble the whole report first, then emit it row by row.
    report = ["", f" [Boris] === DaveLoop Check-in #{step_num} ==="]

    # Progress since the last reasoning block (or a start-up note on step 1).
    if accomplishments:
        report.append(" [Boris] Done so far:")
        report.extend(f" [Boris] - {item}" for item in accomplishments)
    elif step_num == 1:
        report.append(" [Boris] Starting up, gathering info.")

    # Current knowledge and intentions; empty fields are skipped.
    if known:
        report.append(f" [Boris] Knows: {known}")
    if hypothesis:
        report.append(f" [Boris] Thinking: {hypothesis}")
    if next_action:
        report.append(f" [Boris] Next: {next_action}")

    report.append(" [Boris] ===========================")
    report.append("")

    for row in report:
        print(row, flush=True)
104
+
105
+
106
def _parse_accomplishment(clean_line: str) -> Optional[str]:
    """Turn one cleaned DaveLoop output line into a short accomplishment.

    Args:
        clean_line: A single output line, already stripped of ANSI codes.

    Returns:
        A human-readable description, or None when the line matches none of
        the recognised patterns. Patterns are tried in this order: Write,
        Edit, Bash tool calls; test failure; verification pass; iteration
        marker; the RESOLVED signal.
    """
    # File writes
    write_match = re.search(r"Write\((.+?)\)", clean_line)
    if write_match:
        return f"Created {write_match.group(1).strip()}"

    # File edits
    edit_match = re.search(r"Edit\((.+?)\)", clean_line)
    if edit_match:
        return f"Edited {edit_match.group(1).strip()}"

    # Bash commands
    bash_match = re.search(r"Bash\((.+?)\)", clean_line)
    if bash_match:
        cmd = bash_match.group(1).strip()
        cmd_lower = cmd.lower()
        # "test" also covers "pytest", so one substring check is enough.
        if "test" in cmd_lower:
            return f"Ran tests: {cmd}"
        if "mkdir" in cmd_lower:
            return f"Created directory: {cmd}"
        if "pip install" in cmd_lower or "npm install" in cmd_lower:
            return f"Installed dependencies: {cmd}"
        return f"Ran: {cmd}"

    upper = clean_line.upper()

    # Tool results. Failure is checked before success: a failing test line
    # often contains "pass" as well ("2 failed, 5 passed", "test_password"),
    # and must not be reported as a passed verification.
    if "FAIL" in upper and "test" in clean_line.lower():
        return "Test failed - will retry"
    if "? ?" in clean_line or "PASS" in upper:
        return "Verification passed"

    # DaveLoop iteration markers: report the first number on the line.
    if "ITERATION" in upper:
        iter_match = re.search(r"(\d+)", clean_line)
        if iter_match:
            return f"DaveLoop iteration {iter_match.group(1)}"

    # RESOLVED signal
    if "[DAVELOOP:RESOLVED]" in clean_line:
        return "DaveLoop reports: RESOLVED"

    return None
149
+
150
+
151
def _check_off_rail(clean_line: str, milestone: Milestone) -> Optional[str]:
    """Detect whether a DaveLoop output line shows work outside the milestone.

    Three checks run in order: a Write call on a file whose basename is not
    in the milestone's file lists, a phrase announcing a whole-project build,
    and a mention of another milestone ID alongside an action word.

    Args:
        clean_line: One cleaned output line.
        milestone: The milestone being built; its id, title, files_to_create
            and files_to_modify are read.

    Returns:
        The interrupt text to send to DaveLoop, or None when the line is fine.
    """
    lower = clean_line.lower()

    # Detect building wrong files (only when the milestone lists any files).
    allowed_files = set((milestone.files_to_create or []) + (milestone.files_to_modify or []))
    if allowed_files:
        write_match = re.search(r"Write\((.+?)\)", clean_line)
        if write_match:
            written_file = write_match.group(1).strip()
            # Compare basenames so differing path prefixes do not matter.
            written_base = os.path.basename(written_file)
            allowed_bases = {os.path.basename(f) for f in allowed_files}
            if written_base and written_base not in allowed_bases:
                # Sorted so the message is deterministic; set order is not.
                allowed_text = ", ".join(sorted(allowed_files))
                return (
                    f"wait - you are creating {written_file} which is outside the scope of "
                    f"{milestone.id}. Only touch: {allowed_text}. "
                    f"Focus on {milestone.id}: {milestone.title} only."
                )

    # Detect building entire project / other milestones
    off_rail_phrases = (
        "build the entire project",
        "build everything",
        "implement all milestones",
        "implement all features",
        "building the full",
    )
    if any(phrase in lower for phrase in off_rail_phrases):
        return (
            f"wait - you are going off-rail. You are only building {milestone.id}: "
            f"{milestone.title}. Do NOT build the entire project. "
            f"Focus ONLY on {milestone.id}."
        )

    # Detect other milestone IDs, but only when the line talks about doing work.
    if any(word in lower for word in ("building", "implement", "creating")):
        for m_id in re.findall(r"\bM\d+\b", clean_line):
            if m_id != milestone.id:
                return (
                    f"wait - you mentioned {m_id} but you should only be working on "
                    f"{milestone.id}: {milestone.title}. Stay in scope."
                )

    return None
198
+
199
+
200
def _send_interrupt(process, message: str, boris_log):
    """Write *message* as one UTF-8 line to DaveLoop's stdin and record it.

    Nothing happens when the process has no stdin or it is already closed.
    After a successful write the interrupt is printed, logged as a warning
    and, if *boris_log* is truthy, appended to it. OS-level write errors are
    logged at debug level and otherwise ignored.
    """
    try:
        child_stdin = process.stdin
        if not child_stdin or child_stdin.closed:
            return
        payload = f"{message}\n".encode("utf-8")
        child_stdin.write(payload)
        child_stdin.flush()

        log_msg = f"[Boris INTERRUPT] {message}"
        print(f"\n {log_msg}\n", flush=True)
        logger.warning(log_msg)
        if boris_log:
            boris_log.write(f"\n{log_msg}\n")
    except OSError:
        # BrokenPipeError is a subclass of OSError, so this covers both.
        logger.debug("Could not send interrupt to DaveLoop stdin")
213
+
214
+
215
def _parse_reasoning_fields(reasoning_lines: list) -> dict:
    """Collect ``KEY: value`` pairs from the lines of one reasoning block."""
    fields = {}
    for text in reasoning_lines:
        for key in ("KNOWN", "UNKNOWN", "HYPOTHESIS", "NEXT", "WHY"):
            if text.startswith((key + ":", key + " :")):
                fields[key] = text.split(":", 1)[1].strip()
    return fields


def _shutdown_process(process) -> None:
    """Ensure the child process is gone and its stdin/stdout pipes are closed."""
    try:
        # Still running here means monitoring was aborted by an exception;
        # do not leave DaveLoop working unattended.
        if process.poll() is None:
            process.kill()
            process.wait()
    except OSError:
        logger.debug("Could not stop DaveLoop process cleanly")
    for pipe in (process.stdin, process.stdout):
        if pipe is not None:
            try:
                pipe.close()
            except OSError:
                pass


def run(prompt: str, project_dir: str, max_iterations: Optional[int] = None,
        milestone: Optional[Milestone] = None) -> ExecutionResult:
    """Spawn DaveLoop with the crafted prompt. Boris manages, DaveLoop builds.

    The prompt is written to a temporary file that is passed to DaveLoop.
    DaveLoop's combined stdout/stderr is echoed live, written to a Boris log
    file and scanned line by line for accomplishments and reasoning blocks.
    If *milestone* is provided, each line is also checked for off-rail
    behaviour and up to three text interrupts are sent to DaveLoop's stdin.

    Args:
        prompt: Text handed to DaveLoop through a temporary file.
        project_dir: Working directory for DaveLoop (also passed via ``-d``).
        max_iterations: Iteration cap; DEFAULT_MAX_ITERATIONS when None.
        milestone: Optional milestone enabling off-rail monitoring.

    Returns:
        An ExecutionResult. ``resolved`` is True when the output contains
        "[DAVELOOP:RESOLVED]". A FileNotFoundError yields exit code 127 and
        any other subprocess error yields exit code 1, both unresolved.
    """
    max_iter = max_iterations if max_iterations is not None else DEFAULT_MAX_ITERATIONS
    temp_file = None
    boris_log = None
    process = None

    try:
        # Write prompt to temp file for DaveLoop
        temp_file = tempfile.NamedTemporaryFile(
            mode="w", suffix=".txt", delete=False, encoding="utf-8"
        )
        temp_file.write(prompt)
        temp_file.close()

        # Boris spawns DaveLoop as the worker
        cmd = [
            DAVELOOP_CMD,
            "-f", temp_file.name,
            "-d", project_dir,
            "-m", str(max_iter),
            "-v",
        ]

        env = os.environ.copy()
        env["PYTHONIOENCODING"] = "utf-8"

        print(f" [Boris] Spawning DaveLoop: max_iter={max_iter}, project={project_dir}", flush=True)
        logger.info("Spawning DaveLoop: max_iter=%d, project=%s", max_iter, project_dir)
        logger.debug("Prompt length: %d chars", len(prompt))

        # Boris's own log for this execution
        log_path = _setup_log(project_dir)
        boris_log = open(log_path, "w", encoding="utf-8")
        boris_log.write("=== Boris Execution Log ===\n")
        boris_log.write(f"Timestamp: {datetime.now().isoformat()}\n")
        boris_log.write(f"Project: {project_dir}\n")
        boris_log.write(f"Max iterations: {max_iter}\n")
        boris_log.write(f"Prompt file: {temp_file.name}\n")
        boris_log.write(f"Prompt length: {len(prompt)} chars\n")
        boris_log.write("===========================\n\n")

        # Run DaveLoop with real-time streaming + stdin for interrupts.
        # bufsize=0 disables pipe buffering for real-time output on Windows.
        process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=project_dir,
            env=env,
            bufsize=0,
        )

        output_lines = []
        # Reasoning block parser state
        in_reasoning = False
        reasoning_lines = []
        reasoning_count = 0
        accomplishments = []  # what DaveLoop did since the last reasoning block
        all_accomplishments = []  # cumulative for the whole run
        interrupt_count = 0
        MAX_INTERRUPTS = 3  # After 3 interrupts, let DaveLoop finish and fail at verdict

        for raw_line in process.stdout:
            line = raw_line.decode("utf-8", errors="replace")
            sys.stdout.write(line)
            sys.stdout.flush()
            output_lines.append(line)
            boris_log.write(line)

            # Parse cleaned line for monitoring
            clean = _clean_output(line).strip()

            # --- Track accomplishments between reasoning blocks ---
            if not in_reasoning:
                acc = _parse_accomplishment(clean)
                if acc:
                    accomplishments.append(acc)
                    all_accomplishments.append(acc)

            # --- Reasoning block detection ---
            if "REASONING" in clean and ("===" in clean or "---" in clean or "KNOWN" in clean):
                in_reasoning = True
                reasoning_lines = []
                # The header line itself is not checked for off-rail behaviour.
                continue

            if in_reasoning:
                # NOTE(review): clean is ASCII-only after _clean_output, so the
                # box-drawing marker below can never match; kept as written.
                if clean.startswith(("===", "---", "└")):
                    in_reasoning = False
                    reasoning_count += 1
                    reasoning = _parse_reasoning_fields(reasoning_lines)
                    if reasoning:
                        _boris_commentary(reasoning, reasoning_count, accomplishments)
                    # Reset per-block accomplishments, keep cumulative
                    accomplishments = []
                    reasoning_lines = []
                else:
                    reasoning_lines.append(clean)

            # --- Off-rail detection ---
            if milestone and interrupt_count < MAX_INTERRUPTS:
                interrupt_msg = _check_off_rail(clean, milestone)
                if interrupt_msg:
                    _send_interrupt(process, interrupt_msg, boris_log)
                    interrupt_count += 1
                    if interrupt_count >= MAX_INTERRUPTS:
                        warn = (
                            f"[Boris] Sent {MAX_INTERRUPTS} interrupts. "
                            f"DaveLoop keeps going off-rail. Will check at verdict."
                        )
                        print(f"\n {warn}\n", flush=True)
                        logger.warning(warn)
                        if boris_log:
                            boris_log.write(f"\n{warn}\n")

        process.wait()
        output = "".join(output_lines)

        # Boris end-of-run summary
        if all_accomplishments:
            print(flush=True)
            print(" [Boris] === DaveLoop Run Complete ===", flush=True)
            print(f" [Boris] Total actions tracked: {len(all_accomplishments)}", flush=True)
            for a in all_accomplishments:
                print(f" [Boris] - {a}", flush=True)
            print(" [Boris] =============================", flush=True)
            print(flush=True)

        boris_log.write(f"\n=== DaveLoop exit code: {process.returncode} ===\n")
        if interrupt_count > 0:
            boris_log.write(f"Boris interrupts sent: {interrupt_count}\n")
        if all_accomplishments:
            boris_log.write(f"Tracked accomplishments: {len(all_accomplishments)}\n")
            for a in all_accomplishments:
                boris_log.write(f" - {a}\n")

        # Check if DaveLoop resolved the milestone
        resolved = "[DAVELOOP:RESOLVED]" in output

        # Also record DaveLoop's own log path when it printed one
        log_match = re.search(r"Logs:\s*(\S+)", output)
        if log_match:
            daveloop_log = log_match.group(1)
            boris_log.write(f"DaveLoop log: {daveloop_log}\n")

        logger.info(
            "DaveLoop finished. Exit code: %d, Resolved: %s, Interrupts: %d",
            process.returncode, resolved, interrupt_count
        )

        return ExecutionResult(
            output=output,
            exit_code=process.returncode,
            resolved=resolved,
            log_path=log_path,
        )

    except FileNotFoundError:
        return ExecutionResult(
            output=f"Error: '{DAVELOOP_CMD}' not found. Install with: pip install daveloop",
            exit_code=127,
            resolved=False,
        )
    except subprocess.SubprocessError as e:
        return ExecutionResult(
            output=f"Subprocess error: {e}",
            exit_code=1,
            resolved=False,
        )
    finally:
        # Release the child and its pipes first, then the files it may use.
        if process is not None:
            _shutdown_process(process)
        if temp_file is not None:
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
        if boris_log is not None:
            try:
                boris_log.close()
            except OSError:
                pass
407
+
408
+
409
def _setup_log(project_dir: str) -> str:
    """Return a timestamped log file path inside the Boris logs directory.

    The logs directory is created when missing; the file itself is not
    created here. *project_dir* is accepted but not used.
    """
    os.makedirs(_LOGS_DIR, exist_ok=True)
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    log_name = f"boris_exec_{stamp}.log"
    return os.path.join(_LOGS_DIR, log_name)
414
+
415
+
416
+ # --- Monitoring (from monitor.py) ---
417
+
418
+
419
class Verdict(enum.Enum):
    """Closed set of outcomes Boris can assign to a DaveLoop run."""

    # The milestone is considered achieved.
    RESOLVED = "RESOLVED"
    # One of the three answers the verdict analysis may return.
    OFF_PLAN = "OFF_PLAN"
    # The run failed, or no other verdict could be established.
    FAILED = "FAILED"
423
+
424
+
425
@dataclass
class VerdictResult:
    """A verdict together with the explanation behind it."""

    # Outcome category.
    verdict: Verdict
    # Free-text justification for the verdict.
    reason: str
429
+
430
+
431
def check(result: ExecutionResult, milestone: Milestone) -> VerdictResult:
    """Turn an execution result into a verdict for *milestone*.

    A self-reported resolution wins outright. Otherwise a non-zero exit code
    is a failure whose reason carries the tail of the output (the last 20
    lines when there are more than 20). A clean exit without a resolution is
    handed to the Claude-based analysis.
    """
    ms_id = milestone.id
    print(f" [Boris] Evaluating DaveLoop result for {ms_id}...", flush=True)

    if result.resolved:
        print(f" [Boris] DaveLoop self-reported RESOLVED for {ms_id}", flush=True)
        return VerdictResult(verdict=Verdict.RESOLVED, reason="DaveLoop reported resolution")

    if result.exit_code == 0:
        # Clean exit but no explicit resolution: use Claude for nuanced analysis.
        print(f" [Boris] Running Claude verdict analysis for {ms_id}...", flush=True)
        return _analyze_with_claude(result.output, milestone)

    print(f" [Boris] DaveLoop exited with code {result.exit_code} for {ms_id}", flush=True)
    stripped_output = result.output.strip()
    out_lines = stripped_output.splitlines()
    if len(out_lines) > 20:
        tail = "\n".join(out_lines[-20:])
    else:
        tail = stripped_output
    return VerdictResult(
        verdict=Verdict.FAILED,
        reason=f"Process exited with code {result.exit_code}. Output tail:\n{tail}",
    )
452
+
453
+
454
def _analyze_with_claude(output: str, milestone: Milestone) -> VerdictResult:
    """Use the Claude CLI to judge whether *output* achieved the milestone.

    The output is cleaned and cut to its last 300 lines, combined with the
    milestone's acceptance criteria (and the Boris prompt when available) and
    piped to the Claude command. The first line of the answer selects the
    verdict; the remaining lines become the reason.

    Args:
        output: Raw DaveLoop output.
        milestone: Milestone whose acceptance_criteria are checked.

    Returns:
        A VerdictResult. FAILED is returned when the CLI cannot be started,
        times out, exits non-zero, answers nothing, or names no known verdict.
    """
    criteria_text = "\n".join(f"- {c}" for c in milestone.acceptance_criteria)
    # Clean and truncate output
    clean = _clean_output(output)
    output_lines = clean.strip().splitlines()
    truncated = "\n".join(output_lines[-300:]) if len(output_lines) > 300 else clean.strip()

    boris_brain = _load_boris_prompt()
    prompt = ""
    if boris_brain:
        prompt += boris_brain + "\n\n---\n\n"
    prompt += (
        f"Analyze this build output. Did it achieve these acceptance criteria:\n"
        f"{criteria_text}\n\n"
        f"Build output:\n{truncated}\n\n"
        f"Answer with exactly one of: RESOLVED, OFF_PLAN, FAILED. "
        f"Then explain why on the next line."
    )

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    try:
        result = subprocess.run(
            [CLAUDE_CMD, "-p", "--no-session-persistence"],
            input=prompt,
            capture_output=True,
            timeout=120,
            env=env,
            encoding="utf-8",
            errors="replace",
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing or non-executable CLI (FileNotFoundError);
        # SubprocessError covers TimeoutExpired.
        return VerdictResult(
            verdict=Verdict.FAILED,
            reason="Could not analyze output",
        )

    if result.returncode != 0:
        return VerdictResult(
            verdict=Verdict.FAILED,
            reason="Could not analyze output",
        )

    lines = result.stdout.strip().splitlines()
    if not lines:
        return VerdictResult(verdict=Verdict.FAILED, reason="Empty analysis response")

    first_line = lines[0].strip().upper()
    reason = "\n".join(lines[1:]).strip() if len(lines) > 1 else "No explanation provided"

    def _names(keyword: str) -> bool:
        # Require non-letter boundaries so "UNRESOLVED" is not read as RESOLVED.
        return re.search(rf"(?<![A-Z]){keyword}(?![A-Z])", first_line) is not None

    if _names("RESOLVED"):
        return VerdictResult(verdict=Verdict.RESOLVED, reason=reason)
    if _names("OFF_PLAN"):
        return VerdictResult(verdict=Verdict.OFF_PLAN, reason=reason)
    return VerdictResult(verdict=Verdict.FAILED, reason=reason)
512
+
513
+
514
+ # --- UI Testing (DaveLoop v1.4) ---
515
+
516
+
517
# Lower-case phrases that, in UI tester mode, signal DaveLoop is building
# new things instead of testing what already exists.
_UI_OFF_RAIL_PHRASES = [
    "add new feature",
    "implement new",
    "create new endpoint",
    "add new screen",
    "build new component",
]
525
+
526
+
527
def _parse_ui_accomplishment(clean_line: str, ui_milestone: UIMilestone) -> Optional[str]:
    """Describe a DaveLoop output line as a UI-testing accomplishment, or None.

    Recognises screenshots, Maestro/Playwright test runs, and the
    "ISSUE FOUND:" / "FIX APPLIED:" markers. The two markers also append
    their text to ``ui_milestone.issues_found`` / ``issues_fixed``. Any other
    line is delegated to the standard _parse_accomplishment.
    """
    text = clean_line.strip()
    lowered = text.lower()

    # Screenshots: the word plus any verb indicating one was produced.
    if "screenshot" in lowered and any(
        verb in lowered for verb in ("saved", "captured", "taken")
    ):
        return "Captured screenshot"

    # Playwright/Maestro test runs
    if any(tool in lowered for tool in ("maestro test", "playwright test")):
        return f"Ran UI test: {text}"

    # Issue tracking
    issue = re.search(r"ISSUE FOUND:\s*(.+)", text)
    if issue:
        issue_text = issue.group(1)
        ui_milestone.issues_found.append(issue_text)
        return f"Found issue: {issue_text}"

    fix = re.search(r"FIX APPLIED:\s*(.+)", text)
    if fix:
        fix_text = fix.group(1)
        ui_milestone.issues_fixed.append(fix_text)
        return f"Fixed: {fix_text}"

    # Nothing UI-specific matched: use the standard parser.
    return _parse_accomplishment(text)
556
+
557
+
558
def _check_ui_off_rail(clean_line: str, ui_milestone: UIMilestone) -> Optional[str]:
    """Return an interrupt message when the line shows building instead of testing.

    The lower-cased line is searched for any of the UI off-rail phrases;
    None is returned when none of them occurs.
    """
    lowered = clean_line.lower()
    if not any(phrase in lowered for phrase in _UI_OFF_RAIL_PHRASES):
        return None
    return (
        f"wait - you are in UI TESTER MODE. You should be testing "
        f"{ui_milestone.id}: {ui_milestone.title}, not building new features. "
        f"Test existing UI and fix visual/UX issues only."
    )
571
+
572
+
573
def run_ui_test(prompt: str, project_dir: str, max_iterations: int = None,
                ui_milestone: UIMilestone = None) -> ExecutionResult:
    """Spawn DaveLoop for UI testing by delegating to run().

    A Milestone is built from *ui_milestone* (or from placeholder values when
    it is missing) with empty file lists, so run() applies no file-scope
    restriction. After the run, the output is scanned for "ISSUE FOUND:" and
    "FIX APPLIED:" markers, and entries not yet present are appended to
    ``ui_milestone.issues_found`` / ``issues_fixed``.
    """
    # Values for the Milestone that run() monitors against.
    if ui_milestone:
        ms_id = ui_milestone.id
        ms_title = ui_milestone.title
        ms_description = ui_milestone.description
        ms_criteria = ui_milestone.acceptance_criteria
    else:
        ms_id = "UI"
        ms_title = "UI Test"
        ms_description = ""
        ms_criteria = []

    adapter = Milestone(
        id=ms_id,
        title=ms_title,
        description=ms_description,
        depends_on=[],
        acceptance_criteria=ms_criteria,
        files_to_create=[],  # No file restrictions in UI test mode
        files_to_modify=[],
    )

    result = run(prompt, project_dir, max_iterations, milestone=adapter)

    # Post-process: record UI-specific markers, skipping duplicates.
    if ui_milestone:
        for raw in result.output.splitlines():
            cleaned = _clean_output(raw).strip()

            found = re.search(r"ISSUE FOUND:\s*(.+)", cleaned)
            if found:
                issue_text = found.group(1)
                if issue_text not in ui_milestone.issues_found:
                    ui_milestone.issues_found.append(issue_text)

            fixed = re.search(r"FIX APPLIED:\s*(.+)", cleaned)
            if fixed:
                fix_text = fixed.group(1)
                if fix_text not in ui_milestone.issues_fixed:
                    ui_milestone.issues_fixed.append(fix_text)

    return result
611
+
612
+
613
def check_ui(result: ExecutionResult, ui_milestone: UIMilestone, test_tool: str) -> VerdictResult:
    """Turn a UI test execution result into a verdict for *ui_milestone*.

    A self-reported resolution wins outright. Otherwise a non-zero exit code
    is a failure whose reason carries the tail of the output (the last 20
    lines when there are more than 20). A clean exit without a resolution is
    handed to the Claude-based UI analysis together with *test_tool*.
    """
    ms_id = ui_milestone.id
    print(f" [Boris] Evaluating UI test result for {ms_id}...", flush=True)

    if result.resolved:
        print(f" [Boris] DaveLoop self-reported RESOLVED for {ms_id}", flush=True)
        return VerdictResult(verdict=Verdict.RESOLVED, reason="DaveLoop reported resolution")

    if result.exit_code == 0:
        # Clean exit but no explicit resolution: use Claude for a UI-specific verdict.
        print(f" [Boris] Running Claude UI verdict analysis for {ms_id}...", flush=True)
        return _analyze_ui_verdict(result.output, ui_milestone, test_tool)

    print(f" [Boris] DaveLoop exited with code {result.exit_code} for {ms_id}", flush=True)
    stripped_output = result.output.strip()
    out_lines = stripped_output.splitlines()
    if len(out_lines) > 20:
        tail = "\n".join(out_lines[-20:])
    else:
        tail = stripped_output
    return VerdictResult(
        verdict=Verdict.FAILED,
        reason=f"Process exited with code {result.exit_code}. Output tail:\n{tail}",
    )
633
+
634
+
635
def _analyze_ui_verdict(output: str, ui_milestone: UIMilestone, test_tool: str) -> VerdictResult:
    """Use the Claude CLI to judge whether UI testing *output* met the milestone.

    The output is cleaned and cut to its last 300 lines, combined with the
    milestone's acceptance criteria and the name of the test tool, and piped
    to the Claude command. The first line of the answer selects the verdict;
    the remaining lines become the reason.

    Args:
        output: Raw DaveLoop output.
        ui_milestone: UI milestone whose acceptance_criteria are checked.
        test_tool: Name of the UI test tool, quoted in the prompt.

    Returns:
        A VerdictResult. FAILED is returned when the CLI cannot be started,
        times out, exits non-zero, answers nothing, or names no known verdict.
    """
    criteria_text = "\n".join(f"- {c}" for c in ui_milestone.acceptance_criteria)
    clean = _clean_output(output)
    output_lines = clean.strip().splitlines()
    truncated = "\n".join(output_lines[-300:]) if len(output_lines) > 300 else clean.strip()

    prompt = (
        f"Analyze this UI testing output. Did DaveLoop:\n"
        f"1. Run the specified tests using {test_tool}?\n"
        f"2. Find and report UI issues?\n"
        f"3. Fix the issues it found?\n"
        f"4. Achieve these acceptance criteria:\n{criteria_text}\n\n"
        f"Test output:\n{truncated}\n\n"
        f"Answer with exactly one of: RESOLVED, OFF_PLAN, FAILED.\n"
        f"Then explain why on the next line."
    )

    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    try:
        result = subprocess.run(
            [CLAUDE_CMD, "-p", "--no-session-persistence"],
            input=prompt,
            capture_output=True,
            timeout=120,
            env=env,
            encoding="utf-8",
            errors="replace",
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing or non-executable CLI (FileNotFoundError);
        # SubprocessError covers TimeoutExpired.
        return VerdictResult(verdict=Verdict.FAILED, reason="Could not analyze UI output")

    if result.returncode != 0:
        return VerdictResult(verdict=Verdict.FAILED, reason="Could not analyze UI output")

    lines = result.stdout.strip().splitlines()
    if not lines:
        return VerdictResult(verdict=Verdict.FAILED, reason="Empty analysis response")

    first_line = lines[0].strip().upper()
    reason = "\n".join(lines[1:]).strip() if len(lines) > 1 else "No explanation provided"

    def _names(keyword: str) -> bool:
        # Require non-letter boundaries so "UNRESOLVED" is not read as RESOLVED.
        return re.search(rf"(?<![A-Z]){keyword}(?![A-Z])", first_line) is not None

    if _names("RESOLVED"):
        return VerdictResult(verdict=Verdict.RESOLVED, reason=reason)
    if _names("OFF_PLAN"):
        return VerdictResult(verdict=Verdict.OFF_PLAN, reason=reason)
    return VerdictResult(verdict=Verdict.FAILED, reason=reason)