daveloop 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ Metadata-Version: 2.4
2
+ Name: daveloop
3
+ Version: 1.0.0
4
+ Summary: Self-healing debug agent powered by Claude Code CLI
5
+ Home-page: https://github.com/davebruzil/DaveLoop
6
+ Author: Dave Bruzil
7
+ Keywords: debugging ai claude automation agent
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.7
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Software Development :: Debuggers
19
+ Classifier: Topic :: Software Development :: Quality Assurance
20
+ Requires-Python: >=3.7
21
+ Description-Content-Type: text/markdown
22
+ Dynamic: author
23
+ Dynamic: classifier
24
+ Dynamic: description
25
+ Dynamic: description-content-type
26
+ Dynamic: home-page
27
+ Dynamic: keywords
28
+ Dynamic: requires-python
29
+ Dynamic: summary
30
+
31
+ # DaveLoop: Autonomous Debugging Agent
32
+
33
+ DaveLoop is a Claude CLI-based debugging tool that iteratively solves complex bugs through multiple attempts. Rather than relying on single-shot fixes, it uses persistent context via the `--continue` flag to build solutions incrementally.
34
+
35
+ ## Installation
36
+
37
+ ### From Source (Current Method)
38
+
39
+ ```bash
40
+ git clone https://github.com/davebruzil/DaveLoop.git
41
+ cd DaveLoop
42
+ pip install -e .
43
+ ```
44
+
45
+ ### Via pip
46
+
47
+ Install the released package with:
48
+
49
+ ```bash
50
+ pip install daveloop
51
+ ```
52
+
53
+ ## Core Functionality
54
+
55
+ The tool operates through a systematic loop: users provide bug descriptions, Claude analyzes and attempts fixes, and if unsuccessful, the process repeats with accumulated context. The agent signals completion with `[DAVELOOP:RESOLVED]`, stops with `[DAVELOOP:BLOCKED]` when it needs human help, or pauses with `[DAVELOOP:CLARIFY]` to ask the user a question.
56
+
57
+ ## Key Capabilities
58
+
59
+ **4-Level Reasoning Protocol**: The system structures debugging through KNOWN facts, UNKNOWN gaps, HYPOTHESIS formulation, and concrete NEXT ACTIONs. This prevents random modifications and enforces methodical problem-solving.
60
+
61
+ **Persistent Memory**: Unlike isolated attempts, each iteration builds on previous findings through Claude's context continuation feature.
62
+
63
+ **Autonomous Operation**: The agent works without requiring manual permission prompts between iterations, enabling hands-free debugging sessions.
64
+
65
+ **Exit Signals**: Clear indicators show whether bugs are resolved, the agent is blocked, or iteration limits have been reached.
66
+
67
+ ## Usage
68
+
69
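+ Basic invocation requires a bug description (once the package is installed, the `daveloop` console command can be used in place of `python daveloop.py`):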
70
+ ```bash
71
+ python daveloop.py "your bug description here"
72
+ ```
73
+
74
+ The tool accepts detailed bug reports via files, custom working directories, and configurable iteration limits. Sessions automatically generate timestamped logs documenting the agent's reasoning and actions across all iterations.
75
+
76
+ ## Tested Domains
77
+
78
+ The agent has demonstrated effectiveness on security vulnerabilities, race conditions, multi-file refactors, and real-world benchmark problems from Django, Pytest, and SymPy codebases.
@@ -0,0 +1,7 @@
1
+ daveloop.py,sha256=qBaRmxB7pJrpxeJh1bvodMfkz3BJKFiV-i3UlOWSeBg,28348
2
+ daveloop_swebench.py,sha256=iD9AU3XRiMQpt7TknFNlvnmPCNp64V-JaTfqTFgsGBM,15996
3
+ daveloop-1.0.0.dist-info/METADATA,sha256=edBrqn1_7JCe0CKGVg91oQicbCJKA251oAObNjG2J4A,3190
4
+ daveloop-1.0.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
5
+ daveloop-1.0.0.dist-info/entry_points.txt,sha256=QcFAZgFrDfPtIikNQb7eW9DxOpBK7T-qWrKqbGAS9Ww,86
6
+ daveloop-1.0.0.dist-info/top_level.txt,sha256=36DiYt70m4DIK8t7IhV_y6hAzUIyeb5-qDUf3-gbDdg,27
7
+ daveloop-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ daveloop = daveloop:main
3
+ daveloop-swebench = daveloop_swebench:main
@@ -0,0 +1,2 @@
1
+ daveloop
2
+ daveloop_swebench
daveloop.py ADDED
@@ -0,0 +1,716 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DaveLoop - Self-Healing Debug Agent
4
+ Orchestrates Claude Code CLI in a feedback loop until bugs are resolved.
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ import os
10
+ import argparse
11
+ import threading
12
+ import time
13
+ import itertools
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+
17
+ # Configuration
18
+ MAX_ITERATIONS = 20
19
+
20
+ # Find prompt file - works both when running as script and when installed as package
21
def find_prompt_file():
    """Locate ``daveloop_prompt.md`` and return its path.

    Search order:
      1. The directory containing this module (development checkout).
      2. The location reported by ``importlib.resources`` for this module
         (pip install), but only if the file really exists there.
      3. Otherwise the path from step 1 is returned even though it does not
         exist, so callers can report where the file was expected.

    Returns:
        pathlib.Path: always a concrete ``Path``; it may not exist on disk.
    """
    prompt_name = "daveloop_prompt.md"

    # Try script directory first (for development)
    script_prompt = Path(__file__).parent / prompt_name
    if script_prompt.exists():
        return script_prompt

    # Try package resources (for pip install)
    anchor = __package__ or __name__.split('.')[0]
    try:
        import importlib.resources as pkg_resources

        if hasattr(pkg_resources, "files"):
            # Python 3.9+
            candidate = Path(str(pkg_resources.files(anchor) / prompt_name))
        else:
            # Python 3.7-3.8: copy the path out of the context manager as a
            # plain Path before the manager exits.
            with pkg_resources.path(anchor, prompt_name) as resolved:
                candidate = Path(str(resolved))
        if candidate.exists():
            return candidate
    except (ImportError, OSError, TypeError, ValueError, AttributeError):
        # The resource lookup can fail in several ways when the anchor is a
        # plain module or "__main__" rather than a package; none of them
        # should prevent the module from importing.
        pass

    # Fallback to script directory
    return script_prompt
45
+
46
# Filesystem locations, all resolved once at import time.
SCRIPT_DIR = Path(__file__).parent        # directory containing this module
PROMPT_FILE = find_prompt_file()          # system prompt shipped with DaveLoop
LOG_DIR = Path.cwd().joinpath("logs")     # logs live under the current working directory

# Marker strings searched for in Claude Code's output to decide how the loop ends.
SIGNAL_RESOLVED = "[DAVELOOP:RESOLVED]"
SIGNAL_BLOCKED = "[DAVELOOP:BLOCKED]"
SIGNAL_CLARIFY = "[DAVELOOP:CLARIFY]"
54
+
55
+ # ============================================================================
56
+ # ANSI Color Codes
57
+ # ============================================================================
58
class Colors:
    """Namespace of ANSI escape sequences used for terminal styling."""

    # --- Text attributes ---
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # --- Standard foreground colours (30-37) ---
    BLACK = "\033[30m"
    RED = "\033[31m"
    GREEN = "\033[32m"
    YELLOW = "\033[33m"
    BLUE = "\033[34m"
    MAGENTA = "\033[35m"
    CYAN = "\033[36m"
    WHITE = "\033[37m"

    # --- Bright foreground colours (91-97) ---
    BRIGHT_RED = "\033[91m"
    BRIGHT_GREEN = "\033[92m"
    BRIGHT_YELLOW = "\033[93m"
    BRIGHT_BLUE = "\033[94m"
    BRIGHT_MAGENTA = "\033[95m"
    BRIGHT_CYAN = "\033[96m"
    BRIGHT_WHITE = "\033[97m"

    # --- Background colours ---
    BG_BLACK = "\033[40m"
    BG_RED = "\033[41m"
    BG_GREEN = "\033[42m"
    BG_BLUE = "\033[44m"
    BG_MAGENTA = "\033[45m"
    BG_CYAN = "\033[46m"


# Short alias so f-strings elsewhere in the module stay readable.
C = Colors
91
+
92
+ # Enable ANSI and UTF-8 on Windows
93
if sys.platform == "win32":
    os.system("chcp 65001 >nul 2>&1")  # Set console to UTF-8
    os.system("")  # Enables ANSI escape sequences in Windows terminal
    import io

    # Re-wrap stdout and stderr as UTF-8 text streams, skipping any stream
    # that is already a UTF-8 TextIOWrapper or that exposes no raw buffer.
    for _stream_name in ("stdout", "stderr"):
        _stream = getattr(sys, _stream_name)
        _already_utf8 = isinstance(_stream, io.TextIOWrapper) and _stream.encoding == 'utf-8'
        if not _already_utf8 and hasattr(_stream, 'buffer'):
            setattr(
                sys,
                _stream_name,
                io.TextIOWrapper(_stream.buffer, encoding='utf-8', errors='replace'),
            )
104
+
105
+ # ============================================================================
106
+ # ASCII Art Banner
107
+ # ============================================================================
108
+ BANNER = f"""
109
+ {C.BRIGHT_BLUE}{C.BOLD}
110
+ ██████╗ █████╗ ██╗ ██╗███████╗██╗ ██████╗ ██████╗ ██████╗
111
+ ██╔══██╗██╔══██╗██║ ██║██╔════╝██║ ██╔═══██╗██╔═══██╗██╔══██╗
112
+ ██║ ██║███████║██║ ██║█████╗ ██║ ██║ ██║██║ ██║██████╔╝
113
+ ██║ ██║██╔══██║╚██╗ ██╔╝██╔══╝ ██║ ██║ ██║██║ ██║██╔═══╝
114
+ ██████╔╝██║ ██║ ╚████╔╝ ███████╗███████╗╚██████╔╝╚██████╔╝██║
115
+ ╚═════╝ ╚═╝ ╚═╝ ╚═══╝ ╚══════╝╚══════╝ ╚═════╝ ╚═════╝ ╚═╝
116
+ {C.RESET}
117
+ {C.BRIGHT_WHITE}{C.BOLD} Self-Healing Debug Agent{C.RESET}
118
+ {C.WHITE} Powered by Claude Code - Autonomous Mode{C.RESET}
119
+ """
120
+
121
+ # ============================================================================
122
+ # UI Components
123
+ # ============================================================================
124
def print_header_box(title: str, color: str = C.BRIGHT_BLUE):
    """Print ``title`` in bold with a rule of the same length underneath.

    Args:
        title: Text of the header.
        color: ANSI colour prefix applied to both the title and the rule.
    """
    underline = '─' * len(title)
    print(f"\n{color}{C.BOLD}{title}{C.RESET}")
    print(f"{color}{underline}{C.RESET}\n")
128
+
129
def print_section(title: str, color: str = C.BRIGHT_BLUE):
    """Print a section divider: the bold title followed by a matching rule.

    Args:
        title: Text of the section heading.
        color: ANSI colour prefix applied to both the title and the rule.
    """
    heading = f"\n{color}{C.BOLD}{title}{C.RESET}"
    rule = f"{color}{'─'*len(title)}{C.RESET}\n"
    for rendered in (heading, rule):
        print(rendered)
133
+
134
def print_status(label: str, value: str, color: str = C.WHITE):
    """Print one ``label: value`` status line.

    Args:
        label: Name of the field, rendered in white.
        value: Field value, rendered in ``color``.
        color: ANSI colour prefix for the value.
    """
    label_part = f"{C.WHITE}{label}:{C.RESET}"
    value_part = f"{color}{value}{C.RESET}"
    print(f" {label_part} {value_part}")
137
+
138
def print_iteration_header(iteration: int, max_iter: int):
    """Print the iteration header with a visual progress bar.

    Args:
        iteration: 1-based number of the iteration about to run.
        max_iter: Total number of iterations allowed.

    The progress fraction is clamped to the range 0..1 so the bar never
    overflows its width, and a non-positive ``max_iter`` is shown as a full
    bar instead of raising ``ZeroDivisionError``.
    """
    if max_iter > 0:
        progress = min(max(iteration / max_iter, 0.0), 1.0)
    else:
        progress = 1.0

    bar_width = 30
    filled = int(bar_width * progress)
    bar = f"{C.BLUE}{'█' * filled}{C.DIM}{'░' * (bar_width - filled)}{C.RESET}"

    iteration_text = f"ITERATION {iteration}/{max_iter}"
    percentage_text = f"{int(progress*100)}%"

    print(f"\n{C.BOLD}{C.WHITE}{iteration_text}{C.RESET} {bar} {C.BRIGHT_BLUE}{percentage_text}{C.RESET}\n")
149
+
150
def print_success_box(message: str):
    """Print the large "SUCCESS" banner followed by ``message``.

    Args:
        message: Detail line shown in white below the banner.
    """
    art_rows = (
        " ███████╗ ██╗ ██╗ ██████╗ ██████╗ ███████╗ ███████╗ ███████╗",
        " ██╔════╝ ██║ ██║ ██╔════╝ ██╔════╝ ██╔════╝ ██╔════╝ ██╔════╝",
        " ███████╗ ██║ ██║ ██║ ██║ █████╗ ███████╗ ███████╗",
        " ╚════██║ ██║ ██║ ██║ ██║ ██╔══╝ ╚════██║ ╚════██║",
        " ███████║ ╚██████╔╝ ╚██████╗ ╚██████╗ ███████╗ ███████║ ███████║",
        " ╚══════╝ ╚═════╝ ╚═════╝ ╚═════╝ ╚══════╝ ╚══════╝ ╚══════╝",
    )
    stars = f"{C.BRIGHT_YELLOW}★ ★ ★{C.RESET}"
    green_bold = f"{C.BRIGHT_GREEN}{C.BOLD}"

    # Switch to bright green/bold; the art rows below inherit that styling.
    print(f"\n{green_bold}")
    for row in art_rows:
        print(row)
    print()
    print(f" {stars}{green_bold} {C.BRIGHT_WHITE}BUG SUCCESSFULLY RESOLVED{C.RESET}{green_bold} {stars}")
    print()
    print(f" {C.WHITE}{message}{C.RESET}")
    print(f"{C.RESET}\n")
164
+
165
def print_error_box(message: str):
    """Print ``message`` as a bold red ``✗ ERROR:`` line surrounded by blank lines."""
    prefix = f"{C.BRIGHT_RED}{C.BOLD}✗ ERROR: "
    print(f"\n{prefix}{C.WHITE}{message}{C.RESET}\n")
168
+
169
def print_warning_box(message: str):
    """Print ``message`` as a bold yellow ``⚠ WARNING:`` line surrounded by blank lines."""
    prefix = f"{C.BRIGHT_YELLOW}{C.BOLD}⚠ WARNING: "
    print(f"\n{prefix}{C.WHITE}{message}{C.RESET}\n")
172
+
173
+ # ============================================================================
174
+ # Spinner Animation
175
+ # ============================================================================
176
class Spinner:
    """Animated spinner for showing work in progress.

    ``start()`` launches a background thread that redraws one line on stdout
    every 0.1 s; ``stop()`` ends the thread and prints a final status line.
    """

    def __init__(self, message: str = "Processing"):
        """Create a stopped spinner that will display ``message``."""
        self.message = message
        self.running = False
        self.thread = None
        self.frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
        self.start_time = None

    def spin(self):
        """Thread body: redraw the spinner line until ``running`` is cleared."""
        idx = 0
        while self.running:
            elapsed = time.time() - self.start_time
            frame = self.frames[idx % len(self.frames)]
            sys.stdout.write(f"\r {C.BRIGHT_CYAN}{frame}{C.RESET} {C.BOLD}{self.message}{C.RESET} {C.DIM}({elapsed:.0f}s){C.RESET} ")
            sys.stdout.flush()
            idx += 1
            time.sleep(0.1)

    def start(self):
        """Start the animation; calling it while already running is a no-op."""
        if self.running:
            # A second thread would fight the first one for the same line.
            return
        self.running = True
        self.start_time = time.time()
        # Daemon thread: an exception in the caller must not leave the
        # interpreter waiting on the animation loop at exit.
        self.thread = threading.Thread(target=self.spin, daemon=True)
        self.thread.start()

    def stop(self, final_message: str = None):
        """Stop the animation and print the closing status line.

        Args:
            final_message: Text to show instead of ``"<message> complete"``.

        Safe to call even if ``start()`` was never called; the elapsed time
        is then reported as 0.
        """
        self.running = False
        if self.thread:
            self.thread.join()
            self.thread = None
        elapsed = time.time() - self.start_time if self.start_time is not None else 0.0
        if final_message:
            sys.stdout.write(f"\r {C.GREEN}✓{C.RESET} {final_message} {C.DIM}({elapsed:.1f}s){C.RESET} \n")
        else:
            sys.stdout.write(f"\r {C.GREEN}✓{C.RESET} {self.message} complete {C.DIM}({elapsed:.1f}s){C.RESET} \n")
        sys.stdout.flush()
212
+
213
+ # ============================================================================
214
+ # Output Formatter
215
+ # ============================================================================
216
def format_claude_output(output: str) -> str:
    """Format Claude's output with colors and boxed sections.

    Args:
        output: Complete multi-line text produced by Claude.

    Returns:
        The text with reasoning blocks, verification blocks, fenced code and
        exit signals decorated with ANSI colours and box-drawing characters.

    A verification footer is drawn only for a ``====`` line that closes a
    verification block that was actually opened; any other long run of ``=``
    is passed through as ordinary content.
    """
    rule = '─' * 50
    # (label prefix, colour) pairs recognised inside a reasoning block.
    reasoning_labels = (
        ("KNOWN:", C.CYAN),
        ("UNKNOWN:", C.MAGENTA),
        ("HYPOTHESIS:", C.YELLOW),
        ("NEXT ACTION:", C.GREEN),
        ("WHY:", C.BLUE),
    )
    exit_signals = ("RESOLVED", "BLOCKED", "CLARIFY")

    formatted = []
    in_reasoning = False
    in_verification = False
    in_code = False

    for line in output.split('\n'):
        # Reasoning block
        if "=== DAVELOOP REASONING ===" in line:
            in_reasoning = True
            formatted.append(f"\n{C.BRIGHT_YELLOW}┌{rule}┐{C.RESET}")
            formatted.append(f"{C.BRIGHT_YELLOW}│{C.BOLD} 🧠 REASONING{C.RESET}")
            formatted.append(f"{C.BRIGHT_YELLOW}├{rule}┤{C.RESET}")
            continue
        if in_reasoning and "===========================" in line:
            in_reasoning = False
            formatted.append(f"{C.BRIGHT_YELLOW}└{rule}┘{C.RESET}\n")
            continue

        # Verification block
        if "=== VERIFICATION ===" in line:
            in_verification = True
            formatted.append(f"\n{C.BRIGHT_GREEN}┌{rule}┐{C.RESET}")
            formatted.append(f"{C.BRIGHT_GREEN}│{C.BOLD} ✓ VERIFICATION{C.RESET}")
            formatted.append(f"{C.BRIGHT_GREEN}├{rule}┤{C.RESET}")
            continue
        if in_verification and "====================" in line:
            in_verification = False
            formatted.append(f"{C.BRIGHT_GREEN}└{rule}┘{C.RESET}\n")
            continue

        # Code blocks: a fence line toggles the state and draws a frame edge.
        if line.strip().startswith("```"):
            in_code = not in_code
            if in_code:
                formatted.append(f"{C.DIM}┌─ code ────────────────────────────────{C.RESET}")
            else:
                formatted.append(f"{C.DIM}└───────────────────────────────────────{C.RESET}")
            continue

        # Reasoning labels
        if in_reasoning:
            for label, colour in reasoning_labels:
                if line.startswith(label):
                    formatted.append(
                        f"{C.BRIGHT_YELLOW}│{C.RESET} {colour}{label}{C.RESET}{line[len(label):]}"
                    )
                    break
            else:
                formatted.append(f"{C.BRIGHT_YELLOW}│{C.RESET} {line}")
            continue

        # Exit signals - dim them out, don't make prominent
        signal = next((name for name in exit_signals if f"[DAVELOOP:{name}]" in line), None)
        if signal is not None:
            formatted.append(f" {C.DIM}→ [Exit signal: {signal}]{C.RESET}")
            continue

        # Code content
        if in_code:
            formatted.append(f"{C.DIM}│{C.RESET} {C.WHITE}{line}{C.RESET}")
            continue

        # Regular content
        formatted.append(f" {line}")

    return '\n'.join(formatted)
291
+
292
+ # ============================================================================
293
+ # Core Functions
294
+ # ============================================================================
295
def load_prompt() -> str:
    """Return the DaveLoop system prompt text.

    Reads ``PROMPT_FILE`` as UTF-8 when it exists. Otherwise prints a warning
    and returns a short built-in prompt.
    """
    if not PROMPT_FILE.exists():
        print_warning_box(f"Prompt file not found: {PROMPT_FILE}")
        return "You are debugging. Fix the bug. Output [DAVELOOP:RESOLVED] when done."
    return PROMPT_FILE.read_text(encoding="utf-8")
302
+
303
+
304
def find_claude_cli():
    """Find the Claude CLI executable.

    Lookup order:
      1. ``CLAUDE_CLI_PATH`` environment variable, if it names an existing file.
      2. A list of common installation paths for the current platform.
      3. The executable search path (``shutil.which``).

    Returns:
        str | None: path to the executable, or ``None`` when nothing is found.

    Only regular files are accepted, so a directory is never returned as the
    executable. For the search-path case the resolved path reported by
    ``shutil.which`` is returned rather than the bare command name.
    """
    import platform
    import shutil

    # 1. Check environment variable (highest priority)
    env_path = os.environ.get('CLAUDE_CLI_PATH')
    if env_path and os.path.isfile(env_path):
        return env_path

    # 2. Try common installation paths
    is_windows = platform.system() == "Windows"
    if is_windows:
        common_paths = [
            os.path.expanduser("~\\AppData\\Local\\Programs\\claude\\claude.cmd"),
            os.path.expanduser("~\\AppData\\Roaming\\npm\\claude.cmd"),
            "C:\\Program Files\\Claude\\claude.cmd",
            "C:\\Program Files (x86)\\Claude\\claude.cmd",
        ]
    else:
        common_paths = [
            "/usr/local/bin/claude",
            "/usr/bin/claude",
            os.path.expanduser("~/.local/bin/claude"),
        ]
    for path in common_paths:
        if os.path.isfile(path):
            return path

    # 3. Check if it's in PATH; 4. shutil.which returns None when not found
    claude_name = "claude.cmd" if is_windows else "claude"
    return shutil.which(claude_name)
343
+
344
+
345
def _emit_stream_event(data, full_text):
    """Print one decoded stream-json event and collect its text in ``full_text``."""
    msg_type = data.get("type", "")

    if msg_type == "assistant":
        # Assistant text message
        content = data.get("message", {}).get("content", [])
        for block in content:
            if block.get("type") == "text":
                text = block.get("text", "")
                for line_text in text.split('\n'):
                    print(format_output_line(line_text))
                full_text.append(text)

    elif msg_type == "content_block_delta":
        # Streaming text delta
        delta = data.get("delta", {})
        if delta.get("type") == "text_delta":
            text = delta.get("text", "")
            print(text, end='')
            full_text.append(text)

    elif msg_type == "tool_use":
        tool_name = data.get("name", "unknown")
        print(f"\n {C.BLUE}🔧 Using tool: {tool_name}{C.RESET}")

    elif msg_type == "tool_result":
        print(f" {C.BLUE}✓ Tool completed{C.RESET}\n")

    elif msg_type == "result":
        # Final result
        text = data.get("result", "")
        if text:
            for line_text in text.split('\n'):
                print(format_output_line(line_text))
            full_text.append(text)

    elif msg_type == "error":
        error = data.get("error", {})
        # Tolerate an error payload that is a bare string rather than a dict.
        error_msg = error.get("message", "Unknown error") if isinstance(error, dict) else str(error)
        print(f" {C.RED}ERROR: {error_msg}{C.RESET}")


def _stream_claude_session(cmd, prompt, working_dir, timeout=600):
    """Run ``cmd``, feed it ``prompt`` on stdin and render its JSON stream live.

    Returns the collected text joined with newlines. Raises
    ``subprocess.TimeoutExpired`` if the process is still running after
    ``timeout`` seconds; the process is killed in that case.
    """
    import json

    process = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        encoding='utf-8',
        errors='replace',
        cwd=working_dir,
        bufsize=1  # Line buffered
    )

    stop_heartbeat = threading.Event()
    timed_out = threading.Event()
    start_time = time.time()

    def heartbeat():
        # Periodic "still alive" line while waiting for output.
        while not stop_heartbeat.is_set():
            elapsed = int(time.time() - start_time)
            print(f"\r {C.BLUE}[{elapsed}s elapsed...]{C.RESET} ", end='')
            sys.stdout.flush()
            stop_heartbeat.wait(3)

    def on_timeout():
        timed_out.set()
        process.kill()

    # Reading stdout blocks until the child closes it, so a wait() timeout
    # after the read loop can never fire. A timer enforces the limit instead.
    watchdog = threading.Timer(timeout, on_timeout)
    watchdog.daemon = True

    full_text = []
    try:
        watchdog.start()

        # Send prompt and close stdin
        process.stdin.write(prompt)
        process.stdin.close()

        threading.Thread(target=heartbeat, daemon=True).start()

        for raw_line in process.stdout:
            # Clear heartbeat line
            print(f"\r{' '*40}\r", end='')

            line = raw_line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                data = None

            if isinstance(data, dict):
                _emit_stream_event(data, full_text)
                sys.stdout.flush()
            else:
                # Not a JSON object, just print as-is
                print(f" {line}")
                full_text.append(line)

        process.wait()
    finally:
        # Always stop the helpers and reap the child, even on an exception.
        stop_heartbeat.set()
        watchdog.cancel()
        if process.poll() is None:
            process.kill()
            process.wait()
        if process.stdout:
            process.stdout.close()
        print(f"\r{' '*40}\r", end='')  # Clear final heartbeat

    if timed_out.is_set():
        raise subprocess.TimeoutExpired(cmd, timeout)
    return '\n'.join(full_text)


def run_claude_code(prompt: str, working_dir: str = None, continue_session: bool = False, stream: bool = True) -> str:
    """Execute Claude Code CLI with the given prompt.

    Args:
        prompt: Text sent to the CLI on stdin.
        working_dir: Directory the CLI runs in (``None`` = current directory).
        continue_session: When true, pass ``--continue`` to the CLI.
        stream: If True, output is printed in real-time and also returned;
            otherwise the CLI is run to completion and its output returned.

    Returns:
        The text produced by Claude. Failures are reported in-band as a
        string starting with ``[DAVELOOP:ERROR]`` or ``[DAVELOOP:TIMEOUT]``;
        this function does not raise for CLI problems.
    """
    claude_cmd = find_claude_cli()
    if not claude_cmd:
        error_msg = (
            "Claude CLI not found!\n\n"
            "Please install Claude Code CLI or set CLAUDE_CLI_PATH environment variable:\n"
            " Windows: set CLAUDE_CLI_PATH=C:\\path\\to\\claude.cmd\n"
            " Linux/Mac: export CLAUDE_CLI_PATH=/path/to/claude\n\n"
            "Install from: https://github.com/anthropics/claude-code"
        )
        print_error_box(error_msg)
        return "[DAVELOOP:ERROR] Claude CLI not found"

    cmd = [claude_cmd]

    if continue_session:
        cmd.append("--continue")

    cmd.extend(["-p", "--verbose", "--output-format", "stream-json", "--allowedTools", "Bash,Read,Write,Edit,Glob,Grep,Task"])

    try:
        if stream:
            # Stream output in real-time
            return _stream_claude_session(cmd, prompt, working_dir, timeout=600)

        # Non-streaming mode
        result = subprocess.run(
            cmd,
            input=prompt,
            capture_output=True,
            text=True,
            encoding='utf-8',
            errors='replace',
            cwd=working_dir,
            timeout=600
        )
        output = result.stdout
        if result.stderr:
            output += f"\n{C.RED}[STDERR]{C.RESET}\n{result.stderr}"
        return output

    except subprocess.TimeoutExpired:
        return "[DAVELOOP:TIMEOUT] Claude Code iteration timed out after 10 minutes"
    except FileNotFoundError:
        return "[DAVELOOP:ERROR] Claude Code CLI not found. Is it installed?"
    except Exception as e:
        return f"[DAVELOOP:ERROR] {str(e)}"
498
+
499
+
500
def format_output_line(line: str) -> str:
    """Return ``line`` decorated with ANSI colours for live display.

    Checks are applied in this order and the first match wins: reasoning
    header/footer markers, reasoning labels at the start of the line, exit
    signals anywhere in the line, code fences, then plain white text.
    """
    # Reasoning markers
    if "=== DAVELOOP REASONING ===" in line:
        rule = '─' * 50
        return f"\n{C.BRIGHT_BLUE}{rule}\n 🧠 REASONING\n{rule}{C.RESET}"
    if "===========================" in line:
        return f"{C.BRIGHT_BLUE}{'─'*50}{C.RESET}\n"

    # Reasoning labels: colour the label blue and the remainder white.
    for label in ("KNOWN:", "UNKNOWN:", "HYPOTHESIS:", "NEXT ACTION:", "WHY:"):
        if line.startswith(label):
            remainder = line[len(label):]
            return f" {C.BLUE}{label}{C.RESET}{C.WHITE}{remainder}{C.RESET}"

    # Exit signals are only dimmed here; the prominent success/error boxes
    # are printed by the caller once the iteration has finished.
    for name in ("RESOLVED", "BLOCKED", "CLARIFY"):
        if f"[DAVELOOP:{name}]" in line:
            return f" {C.DIM}→ [Exit signal detected: {name}]{C.RESET}"

    # Code blocks
    if line.strip().startswith("```"):
        return f"{C.BLUE}{'─'*40}{C.RESET}"

    # Default - white text
    return f" {C.WHITE}{line}{C.RESET}"
535
+
536
+
537
def check_exit_condition(output: str) -> "tuple[str, bool]":
    """Check if we should exit the loop.

    Args:
        output: Text returned by one Claude Code iteration.

    Returns:
        ``(signal, should_exit)`` where ``signal`` is one of ``"RESOLVED"``,
        ``"BLOCKED"``, ``"CLARIFY"``, ``"ERROR"``, ``"TIMEOUT"`` or
        ``"CONTINUE"``. A timeout is reported but does not end the loop.

    The return annotation is a string so that it is not evaluated when the
    function is defined: subscripting the built-in ``tuple`` needs
    Python 3.9+, while the package declares support for 3.7 and 3.8.
    """
    if SIGNAL_RESOLVED in output:
        return "RESOLVED", True
    if SIGNAL_BLOCKED in output:
        return "BLOCKED", True
    if SIGNAL_CLARIFY in output:
        return "CLARIFY", True
    if "[DAVELOOP:ERROR]" in output:
        return "ERROR", True
    if "[DAVELOOP:TIMEOUT]" in output:
        return "TIMEOUT", False
    return "CONTINUE", False
550
+
551
+
552
def save_log(iteration: int, content: str, session_id: str):
    """Write one iteration's output to ``LOG_DIR``.

    The file is named ``<session_id>_iteration_<NN>.log`` with the iteration
    number zero-padded to two digits; ``LOG_DIR`` is created if missing.
    """
    LOG_DIR.mkdir(exist_ok=True)
    file_name = f"{session_id}_iteration_{iteration:02d}.log"
    (LOG_DIR / file_name).write_text(content, encoding="utf-8")
557
+
558
+
559
+ # ============================================================================
560
+ # Main Entry Point
561
+ # ============================================================================
562
def main():
    """Command-line entry point: run the debug loop until an exit signal.

    Reads the bug description from the positional argument, ``--file`` or
    stdin, then calls Claude Code repeatedly (first call with the system
    prompt, later calls with ``--continue``) until the output contains an
    exit signal or the iteration limit is reached. Each iteration is logged
    under ``LOG_DIR``.

    Returns:
        int: process exit code, 0 when the bug was resolved and 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="DaveLoop - Self-Healing Debug Agent",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("bug", nargs="?", help="Bug description or error message")
    parser.add_argument("-f", "--file", help="Read bug description from file")
    parser.add_argument("-d", "--dir", help="Working directory for Claude Code")
    parser.add_argument("-m", "--max-iterations", type=int, default=MAX_ITERATIONS)
    parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")

    args = parser.parse_args()

    # Clear screen and show banner
    os.system('cls' if os.name == 'nt' else 'clear')
    print(BANNER)

    # Get bug description
    if args.file:
        try:
            bug_input = Path(args.file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            print_error_box(f"Could not read bug file {args.file}: {exc}")
            return 1
    elif args.bug:
        bug_input = args.bug
    else:
        print(f" {C.CYAN}Describe the bug (Ctrl+D or Ctrl+Z to finish):{C.RESET}")
        bug_input = sys.stdin.read().strip()

    if not bug_input.strip():
        print_error_box("No bug description provided")
        return 1

    # Setup
    session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    system_prompt = load_prompt()
    working_dir = args.dir or os.getcwd()

    # A missing working directory would otherwise surface from the
    # subprocess call as a misleading "CLI not found" error.
    if not os.path.isdir(working_dir):
        print_error_box(f"Working directory does not exist: {working_dir}")
        return 1

    # Session info
    print_header_box(f"SESSION: {session_id}", C.BRIGHT_BLUE)
    print_status("Working Directory", working_dir, C.WHITE)
    print_status("Max Iterations", str(args.max_iterations), C.WHITE)
    print_status("Context Mode", "PERSISTENT (--continue)", C.WHITE)
    print_status("System Prompt", f"{len(system_prompt)} chars loaded", C.WHITE)
    print()

    print_section("BUG REPORT", C.BRIGHT_RED)
    # Show at most the first 10 lines, each cut to 80 characters.
    bug_lines = bug_input.split('\n')
    for line in bug_lines[:10]:
        print(f" {C.RED}{line[:80]}{C.RESET}")
    if len(bug_lines) > 10:
        print(f" {C.RED}... ({len(bug_lines) - 10} more lines){C.RESET}")
    print()

    sys.stdout.flush()

    # Initial context
    context = f"""
## Bug Report

{bug_input}

## Instructions

Analyze this bug. Gather whatever logs/information you need to understand it.
Then fix it. Use the reasoning protocol before each action.
"""

    iteration_history = []

    for iteration in range(1, args.max_iterations + 1):
        print_iteration_header(iteration, args.max_iterations)

        # Only the first call carries the system prompt; later calls rely on
        # the CLI's --continue session for earlier context.
        if iteration == 1:
            full_prompt = f"{system_prompt}\n\n---\n\n{context}"
            continue_session = False
        else:
            full_prompt = context
            continue_session = True

        if args.verbose:
            print(f" {C.DIM}[DEBUG] Prompt: {len(full_prompt)} chars, continue={continue_session}{C.RESET}")

        # Show "Claude is working" indicator
        print(f"\n {C.BRIGHT_BLUE}▶ Claude is working...{C.RESET}\n")
        sys.stdout.flush()

        # Run Claude with real-time streaming output
        output = run_claude_code(full_prompt, working_dir, continue_session=continue_session, stream=True)

        print(f"\n {C.BLUE}✓ Iteration complete{C.RESET}\n")

        # Save log
        save_log(iteration, output, session_id)
        iteration_history.append(output)

        # Check exit condition
        signal, should_exit = check_exit_condition(output)

        if should_exit:
            if signal == "RESOLVED":
                print_success_box(f"Bug fixed in {iteration} iteration(s)!")
                print_status("Session", session_id, C.WHITE)
                print_status("Logs", str(LOG_DIR), C.WHITE)
                print()
                return 0
            elif signal == "CLARIFY":
                print_warning_box("Claude needs clarification")
                print(f"\n {C.BLUE}Your response:{C.RESET}")
                try:
                    human_input = input(f" {C.WHITE}> {C.RESET}")
                except EOFError:
                    # stdin is exhausted (e.g. the bug report was piped in),
                    # so nobody can answer the question.
                    print_error_box("No input available to answer Claude's clarification request")
                    return 1
                context = f"""
## Human Clarification

{human_input}

Continue debugging with this information. Use the reasoning protocol before each action.
"""
                continue
            elif signal == "BLOCKED":
                print_error_box(f"Claude is blocked - needs human help")
                print_status("Session", session_id, C.WHITE)
                print_status("Logs", str(LOG_DIR), C.WHITE)
                print()
                return 1
            else:
                print_error_box(f"Error occurred: {signal}")
                return 1

        # Prepare context for next iteration
        context = f"""
## Iteration {iteration + 1}

The bug is NOT yet resolved. You have full context from previous iterations.

Continue debugging. Analyze what happened, determine next steps, and proceed.
Use the reasoning protocol before each action.
"""

    # Max iterations reached
    print_warning_box(f"Max iterations ({args.max_iterations}) reached")
    print_status("Session", session_id, C.WHITE)
    print_status("Logs", str(LOG_DIR), C.WHITE)
    print()

    # Save summary
    summary_parts = [
        f"# DaveLoop Session {session_id}\n\n",
        f"Bug: {bug_input[:200]}...\n\n",
        f"Iterations: {args.max_iterations}\n\n",
        "## Iteration History\n\n",
    ]
    for i, hist in enumerate(iteration_history, 1):
        summary_parts.append(f"### Iteration {i}\n```\n{hist[:500]}...\n```\n\n")
    # The log directory does not exist yet if no iteration ran (e.g. -m 0).
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    (LOG_DIR / f"{session_id}_summary.md").write_text("".join(summary_parts), encoding="utf-8")

    return 1
713
+
714
+
715
+ if __name__ == "__main__":
716
+ sys.exit(main())
daveloop_swebench.py ADDED
@@ -0,0 +1,432 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DaveLoop SWE-bench Runner
4
+ Evaluates DaveLoop agent against SWE-bench benchmark tasks.
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ import os
10
+ import json
11
+ import argparse
12
+ import shutil
13
+ from pathlib import Path
14
+ from datetime import datetime
15
+ from typing import Dict, List, Optional
16
+ import tempfile
17
+
18
+ # Import DaveLoop components
19
+ from daveloop import (
20
+ Colors as C, print_header_box, print_section, print_status,
21
+ print_success_box, print_error_box, print_warning_box,
22
+ run_claude_code, check_exit_condition, SIGNAL_RESOLVED
23
+ )
24
+
25
# Configuration
# All paths below are anchored at the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent
# Directory that receives the per-session results JSON files.
RESULTS_DIR = SCRIPT_DIR / "swebench_results"
# Parent directory for the per-task repository clones.
WORK_DIR = SCRIPT_DIR / "swebench_work"
# Markdown file read as the system prompt; main() falls back to a built-in
# one-line prompt when this file does not exist.
PROMPT_FILE = SCRIPT_DIR / "daveloop_prompt.md"

# Default value of the --max-iterations command-line option.
MAX_ITERATIONS_PER_TASK = 10
32
+
33
# Startup banner printed once by main(): an ASCII-art logo followed by a
# subtitle line, wrapped in color codes taken from the imported Colors class
# (aliased as C).
BANNER = f"""
{C.BRIGHT_BLUE}{C.BOLD}
███████╗██╗ ██╗███████╗ ██████╗ ███████╗███╗ ██╗ ██████╗██╗ ██╗
██╔════╝██║ ██║██╔════╝ ██╔══██╗██╔════╝████╗ ██║██╔════╝██║ ██║
███████╗██║ █╗ ██║█████╗ █████╗██████╔╝█████╗ ██╔██╗ ██║██║ ███████║
╚════██║██║███╗██║██╔══╝ ╚════╝██╔══██╗██╔══╝ ██║╚██╗██║██║ ██╔══██║
███████║╚███╔███╔╝███████╗ ██████╔╝███████╗██║ ╚████║╚██████╗██║ ██║
╚══════╝ ╚══╝╚══╝ ╚══════╝ ╚═════╝ ╚══════╝╚═╝ ╚═══╝ ╚═════╝╚═╝ ╚═╝
{C.RESET}
{C.WHITE} DaveLoop × SWE-bench: Real-World Bug Benchmark{C.RESET}
"""
44
+
45
+ # ============================================================================
46
+ # SWE-bench Dataset Interface
47
+ # ============================================================================
48
+
49
class SWEBenchTask:
    """One SWE-bench instance, exposed as attributes copied from a raw record.

    Each attribute listed in ``_FIELDS`` is read from the record by key; a key
    that is missing from the record is stored as an empty string.
    """

    # Record keys mirrored onto the instance.
    _FIELDS = (
        'instance_id',
        'repo',
        'base_commit',
        'problem_statement',
        'hints_text',
        'patch',
        'test_patch',
        'version',
        'environment_setup_commit',
    )

    def __init__(self, data: Dict):
        """Copy the known fields out of ``data`` (any object with ``.get``)."""
        for field_name in self._FIELDS:
            setattr(self, field_name, data.get(field_name, ''))

    def __str__(self):
        """Return ``"<instance_id> (<repo>)"`` for log output."""
        return "{} ({})".format(self.instance_id, self.repo)
65
+
66
+
67
def load_swebench_dataset(dataset_name: str = "princeton-nlp/SWE-bench_Lite", split: str = "test", limit: Optional[int] = None) -> List[SWEBenchTask]:
    """Fetch a SWE-bench dataset through the ``datasets`` library.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        split: Dataset split passed to ``datasets.load_dataset``.
        limit: When truthy, keep only the first ``limit`` rows (capped at the
            dataset length). ``None`` or ``0`` keeps every row.

    Returns:
        One ``SWEBenchTask`` per selected dataset row.

    If the ``datasets`` package cannot be imported, or anything else fails
    while loading, an error box is printed and the process exits with
    status 1.
    """
    try:
        # Imported lazily so the rest of the script works without the package.
        from datasets import load_dataset

        print_section(f"Loading {dataset_name}", C.BRIGHT_CYAN)
        print(f" {C.CYAN}Downloading dataset from Hugging Face...{C.RESET}")

        rows = load_dataset(dataset_name, split=split)

        if limit:
            keep = min(limit, len(rows))
            rows = rows.select(range(keep))

        loaded = list(map(SWEBenchTask, rows))

        print(f" {C.GREEN}✓ Loaded {len(loaded)} tasks{C.RESET}\n")
        return loaded

    except ImportError:
        print_error_box("datasets library not installed. Run: pip install datasets")
        sys.exit(1)
    except Exception as exc:
        print_error_box(f"Failed to load dataset: {exc}")
        sys.exit(1)
91
+
92
+
93
def load_swebench_local(json_file: Path) -> List[SWEBenchTask]:
    """Read SWE-bench task records from a JSON file on disk.

    The file may contain either a list of task records or a single record;
    a single record is treated as a one-element list.

    Args:
        json_file: Path of the JSON file, opened as UTF-8 text.

    Returns:
        One ``SWEBenchTask`` per record.

    Any failure while reading or parsing is reported with an error box and
    the process exits with status 1.
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        records = payload if isinstance(payload, list) else [payload]
        loaded = [SWEBenchTask(record) for record in records]

        print(f" {C.GREEN}✓ Loaded {len(loaded)} tasks from {json_file}{C.RESET}\n")
        return loaded

    except Exception as exc:
        print_error_box(f"Failed to load local file: {exc}")
        sys.exit(1)
110
+
111
+
112
+ # ============================================================================
113
+ # Repository Setup
114
+ # ============================================================================
115
+
116
def remove_readonly(func, path, excinfo):
    """Error callback for ``shutil.rmtree``: make ``path`` writable and retry.

    Args:
        func: The operation to retry; it is called again with ``path``.
        path: Path whose mode is changed to ``stat.S_IWRITE`` before retrying.
        excinfo: Exception information supplied by the caller; not used.
    """
    from stat import S_IWRITE

    # Set the write bit first (read-only files are the Windows failure case
    # this helper targets), then repeat the failed call.
    os.chmod(path, S_IWRITE)
    func(path)
121
+
122
+
123
def safe_rmtree(path):
    """Delete a directory tree on a best-effort basis.

    Removal errors on individual entries are routed to ``remove_readonly``,
    which makes the entry writable and retries. Any exception that still
    escapes is reported as a warning on stdout instead of being raised.

    Args:
        path: Directory to remove.
    """
    try:
        shutil.rmtree(path, onerror=remove_readonly)
    except Exception as exc:
        warning = f"Warning: Could not fully clean directory: {exc}"
        print(f" {C.YELLOW}{warning}{C.RESET}")
129
+
130
+
131
def setup_task_repo(task: SWEBenchTask, work_dir: Path) -> Optional[Path]:
    """Clone the task's GitHub repository and check out its base commit.

    The checkout is created at ``work_dir/<repo with '/' -> '_'>/<instance_id>``.
    A directory that already exists at that path is removed first.

    Args:
        task: Task whose ``repo``, ``instance_id`` and ``base_commit`` are used.
        work_dir: Directory under which the per-task checkout is created.

    Returns:
        The path of the prepared repository, or ``None`` when git could not
        be started, the clone failed or timed out, or the checkout failed.
        An error box is printed for every ``None`` return.
    """
    def _git_stderr(err: subprocess.CalledProcessError) -> str:
        # git output is not guaranteed to be valid UTF-8; decode leniently so
        # a decoding error never hides the real git failure.
        return (err.stderr or b"").decode("utf-8", errors="replace")

    repo_name = task.repo.replace('/', '_')
    repo_path = work_dir / repo_name / task.instance_id

    print_section(f"Setting up: {task.instance_id}", C.BRIGHT_CYAN)

    # Clean existing directory
    if repo_path.exists():
        print(f" {C.YELLOW}Cleaning existing directory...{C.RESET}")
        safe_rmtree(repo_path)

    repo_path.mkdir(parents=True, exist_ok=True)

    # Clone repository
    repo_url = f"https://github.com/{task.repo}.git"
    print(f" {C.CYAN}Cloning {repo_url}...{C.RESET}")

    try:
        subprocess.run(
            ["git", "clone", "--quiet", repo_url, str(repo_path)],
            check=True,
            capture_output=True,
            timeout=300
        )
    except subprocess.CalledProcessError as e:
        print_error_box(f"Failed to clone repository: {_git_stderr(e)}")
        return None
    except subprocess.TimeoutExpired:
        print_error_box("Git clone timed out after 5 minutes")
        return None
    except OSError as e:
        # Raised when the git executable cannot be started (e.g. not
        # installed). Report it as a setup failure instead of crashing.
        print_error_box(f"Failed to run git: {e}")
        return None

    # Checkout base commit
    print(f" {C.CYAN}Checking out commit {task.base_commit[:8]}...{C.RESET}")
    try:
        subprocess.run(
            ["git", "checkout", task.base_commit],
            cwd=repo_path,
            check=True,
            capture_output=True
        )
    except subprocess.CalledProcessError as e:
        print_error_box(f"Failed to checkout commit: {_git_stderr(e)}")
        return None
    except OSError as e:
        print_error_box(f"Failed to run git: {e}")
        return None

    print(f" {C.GREEN}✓ Repository ready at {repo_path}{C.RESET}\n")
    return repo_path
178
+
179
+
180
+ # ============================================================================
181
+ # Task Execution
182
+ # ============================================================================
183
+
184
def create_task_prompt(task: SWEBenchTask) -> str:
    """Render the markdown prompt that describes one SWE-bench task.

    The prompt contains, in order: a title with the instance id, the
    repository name, the problem statement, an optional hints section
    (only when ``task.hints_text`` is non-empty) and the closing
    instructions that name the ``[DAVELOOP:RESOLVED]`` marker.

    Args:
        task: Object providing ``instance_id``, ``repo``,
            ``problem_statement`` and ``hints_text``.

    Returns:
        The assembled prompt text.
    """
    sections = [
        f"# SWE-bench Task: {task.instance_id}\n"
        "\n"
        "## Repository\n"
        f"{task.repo}\n"
        "\n"
        "## Problem Statement\n"
        f"{task.problem_statement}\n"
    ]

    if task.hints_text:
        sections.append(f"\n## Hints\n{task.hints_text}\n")

    sections.append(
        "\n"
        "\n"
        "## Your Task\n"
        "Analyze and fix the issue described above. Use the reasoning protocol before each action.\n"
        "When you believe the issue is resolved, run the tests and output [DAVELOOP:RESOLVED].\n"
    )

    return "".join(sections)
209
+
210
+
211
def run_task(task: SWEBenchTask, repo_path: Path, system_prompt: str, max_iterations: int = 10) -> Dict:
    """Drive the agent loop for one SWE-bench task and report the outcome.

    The first iteration sends the system prompt followed by the task prompt
    in a new session; each later iteration sends a short follow-up message
    with ``continue_session=True``. After every iteration the output is
    passed to ``check_exit_condition``.

    Args:
        task: The task to work on.
        repo_path: Checked-out repository; passed to ``run_claude_code`` as a
            string.
        system_prompt: Text placed before the task prompt on iteration 1.
        max_iterations: Upper bound on the number of iterations.

    Returns:
        A dict with ``instance_id``, ``repo``, ``resolved``, ``iterations``,
        ``error``, ``start_time`` and ``end_time``. ``error`` is ``None`` on
        success, ``"BLOCKED"`` or ``"ERROR"`` when that signal ended the run,
        and ``"MAX_ITERATIONS"`` when the loop ran out. Any other signal is
        ignored and the loop moves on to the next iteration.
    """
    result = {
        'instance_id': task.instance_id,
        'repo': task.repo,
        'resolved': False,
        'iterations': 0,
        'error': None,
        'start_time': datetime.now().isoformat(),
        'end_time': None
    }

    def _stamp_end():
        # Record the wall-clock time at which the task stopped.
        result['end_time'] = datetime.now().isoformat()

    print_header_box(f"RUNNING: {task.instance_id}", C.BRIGHT_MAGENTA)

    context = create_task_prompt(task)
    rule = '─' * 70

    for iteration in range(1, max_iterations + 1):
        result['iterations'] = iteration

        print(f"\n{C.BRIGHT_BLUE}{rule}")
        print(f" ITERATION {iteration}/{max_iterations}")
        print(f"{rule}{C.RESET}\n")

        # Only the opening message carries the system prompt; later messages
        # continue the existing session.
        continue_session = iteration != 1
        if continue_session:
            full_prompt = context
        else:
            full_prompt = f"{system_prompt}\n\n---\n\n{context}"

        print(f" {C.BRIGHT_BLUE}▶ Claude is working...{C.RESET}\n")
        output = run_claude_code(full_prompt, str(repo_path), continue_session=continue_session, stream=True)

        print(f"\n {C.BLUE}✓ Iteration complete{C.RESET}\n")

        signal, should_exit = check_exit_condition(output)

        if should_exit and signal == "RESOLVED":
            result['resolved'] = True
            _stamp_end()
            print_success_box(f"Task resolved in {iteration} iteration(s)!")
            return result

        if should_exit and signal in ("BLOCKED", "ERROR"):
            result['error'] = signal
            _stamp_end()
            print_error_box(f"Task failed: {signal}")
            return result

        # Follow-up message for the next iteration.
        context = (
            "\n"
            f"## Iteration {iteration + 1}\n"
            "\n"
            "The issue is NOT yet resolved. You have full context from previous iterations.\n"
            "Continue debugging and fixing the issue. Use the reasoning protocol before each action.\n"
        )

    # The loop ended without a terminating signal.
    result['error'] = 'MAX_ITERATIONS'
    _stamp_end()
    print_warning_box(f"Max iterations ({max_iterations}) reached without resolution")
    return result
277
+
278
+
279
+ # ============================================================================
280
+ # Evaluation & Reporting
281
+ # ============================================================================
282
+
283
def save_results(results: List[Dict], output_file: Path):
    """Write the result dicts to ``output_file`` as indented JSON.

    Args:
        results: JSON-serializable result dicts, one per task.
        output_file: Destination file; its parent directory is created when
            it does not exist yet.
    """
    # Create the directory the file is actually written to, so the function
    # also works for an output path outside the default results directory.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\n {C.GREEN}✓ Results saved to {output_file}{C.RESET}")
291
+
292
+
293
def print_summary(results: List[Dict]):
    """Print aggregate statistics for a list of task result dicts.

    Shows the total number of tasks, how many were resolved, how many carry
    an ``error`` value, and a per-error-type count when there are failures.

    Args:
        results: Result dicts; each must have a ``resolved`` key and may have
            an ``error`` key. An empty list is reported as 0 tasks / 0.0%.
    """
    total = len(results)
    resolved = sum(1 for r in results if r['resolved'])
    errors = [r['error'] for r in results if r.get('error')]
    failed = len(errors)

    def _percent(count):
        # With no results (e.g. the run was interrupted during the first
        # task) there is nothing to divide by; report 0.0% instead of
        # raising ZeroDivisionError.
        return count / total * 100 if total else 0.0

    print_header_box("EVALUATION SUMMARY", C.BRIGHT_GREEN)
    print_status("Total Tasks", str(total), C.WHITE)
    print_status("Resolved", f"{resolved} ({_percent(resolved):.1f}%)", C.GREEN)
    print_status("Failed", f"{failed} ({_percent(failed):.1f}%)", C.RED)
    print()

    # Breakdown by error type
    if failed > 0:
        print(f" {C.BLUE}│{C.RESET} {C.WHITE}Failure Breakdown:{C.RESET}")
        error_types = {}
        for error in errors:
            error_types[error] = error_types.get(error, 0) + 1

        for error_type, count in error_types.items():
            print(f" {C.BLUE}│{C.RESET} - {error_type}: {count}")
        print()
316
+
317
+
318
+ # ============================================================================
319
+ # Main Entry Point
320
+ # ============================================================================
321
+
322
def main():
    """Run the SWE-bench evaluation from the command line.

    Parses the arguments, loads the task list, then for every task clones
    its repository, runs the agent loop and records a result dict. The
    results are written to a timestamped JSON file under ``RESULTS_DIR`` and
    summarized on stdout. A Ctrl-C during a task stops the loop; results
    collected so far are still saved.

    Returns:
        ``0`` only when at least one result was recorded and every recorded
        task was resolved; ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(
        description="DaveLoop SWE-bench Evaluation",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument("-d", "--dataset",
                        default="princeton-nlp/SWE-bench_Lite",
                        help="Hugging Face dataset name (default: SWE-bench_Lite)")
    parser.add_argument("-f", "--file",
                        help="Load tasks from local JSON file instead of Hugging Face")
    parser.add_argument("-l", "--limit", type=int,
                        help="Limit number of tasks to run")
    parser.add_argument("-m", "--max-iterations", type=int,
                        default=MAX_ITERATIONS_PER_TASK,
                        help="Max iterations per task")
    parser.add_argument("-s", "--start-from", type=int, default=0,
                        help="Start from task index (0-based)")
    parser.add_argument("--keep-repos", action="store_true",
                        help="Keep cloned repositories after evaluation")

    args = parser.parse_args()

    # Clear screen and show banner
    os.system('cls' if os.name == 'nt' else 'clear')
    print(BANNER)

    # Load system prompt
    if PROMPT_FILE.exists():
        system_prompt = PROMPT_FILE.read_text(encoding="utf-8")
    else:
        print_warning_box(f"Prompt file not found: {PROMPT_FILE}")
        system_prompt = "You are a debugging agent. Fix bugs and output [DAVELOOP:RESOLVED] when done."

    # Load tasks
    if args.file:
        tasks = load_swebench_local(Path(args.file))
        # The local loader takes no limit, so apply --limit here; it is
        # applied before --start-from, matching the Hugging Face path.
        if args.limit:
            tasks = tasks[:max(args.limit, 0)]
    else:
        tasks = load_swebench_dataset(args.dataset, limit=args.limit)

    if args.start_from > 0:
        tasks = tasks[args.start_from:]
        print(f" {C.YELLOW}Starting from task index {args.start_from}{C.RESET}\n")

    # Setup work directory
    WORK_DIR.mkdir(exist_ok=True)

    # Run evaluation
    session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    results = []

    print_header_box(f"SESSION: {session_id}", C.BRIGHT_BLUE)
    print_status("Dataset", args.dataset if not args.file else args.file, C.WHITE)
    print_status("Tasks", str(len(tasks)), C.WHITE)
    print_status("Max Iterations/Task", str(args.max_iterations), C.WHITE)
    print_status("Work Directory", str(WORK_DIR), C.WHITE)
    print()

    for i, task in enumerate(tasks, 1):
        print(f"\n{C.BRIGHT_MAGENTA}{'='*70}")
        print(f" TASK {i}/{len(tasks)}: {task.instance_id}")
        print(f"{'='*70}{C.RESET}\n")

        # Setup repository
        repo_path = setup_task_repo(task, WORK_DIR)
        if not repo_path:
            results.append({
                'instance_id': task.instance_id,
                'repo': task.repo,
                'resolved': False,
                'error': 'SETUP_FAILED',
                'iterations': 0
            })
            continue

        # Run task
        try:
            result = run_task(task, repo_path, system_prompt, args.max_iterations)
            results.append(result)
        except KeyboardInterrupt:
            print_warning_box("Evaluation interrupted by user")
            break
        except Exception as e:
            print_error_box(f"Unexpected error: {e}")
            results.append({
                'instance_id': task.instance_id,
                'repo': task.repo,
                'resolved': False,
                'error': f'EXCEPTION: {str(e)}',
                'iterations': 0
            })
        finally:
            # Cleanup repository unless --keep-repos
            if not args.keep_repos and repo_path.exists():
                print(f" {C.DIM}Cleaning up repository...{C.RESET}")
                safe_rmtree(repo_path)

    # Save and display results
    output_file = RESULTS_DIR / f"results_{session_id}.json"
    save_results(results, output_file)
    if results:
        print_summary(results)
    else:
        # Nothing to summarize (empty task list, or interrupted during the
        # first task); skip the percentage report.
        print_warning_box("No tasks were evaluated")

    print(f"\n{C.BRIGHT_BLUE}{'='*70}{C.RESET}")
    print(f" {C.BOLD}Evaluation complete!{C.RESET}")
    print(f" {C.DIM}Results: {output_file}{C.RESET}")
    print(f"{C.BRIGHT_BLUE}{'='*70}{C.RESET}\n")

    # all([]) is True, so an empty result list must not count as success.
    return 0 if results and all(r['resolved'] for r in results) else 1
429
+
430
+
431
# Script entry point: run main() and use its integer return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())