devagent-cli 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ """
2
+ Sandbox Layer — copies project to isolated workspace before agent modifies anything.
3
+
4
+ Flow:
5
+ Real Project → Sandbox Copy → Agent Modifies Sandbox → Run Tests → Show Diff → Optional Apply
6
+
7
+ Safety features:
8
+ - Path validation (no escaping sandbox)
9
+ - Restricted to supported file types
10
+ - Diff preview before applying back
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+ import shutil
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from devagent.utils.config import IGNORE_DIRS, SUPPORTED_EXTENSIONS
21
+
22
+
23
+ class SandboxManager:
24
+ """Manages an isolated sandbox workspace for safe agent operations."""
25
+
26
+ def __init__(self, project_root: str, sandbox_dir: str | None = None):
27
+ self.project_root = os.path.abspath(project_root)
28
+ self.sandbox_dir = sandbox_dir or os.path.join(self.project_root, "sandbox_workspace")
29
+ self._active = False
30
+
31
+ def create(self) -> str:
32
+ """Create a sandbox copy of the project. Returns sandbox path."""
33
+ if os.path.exists(self.sandbox_dir):
34
+ shutil.rmtree(self.sandbox_dir, ignore_errors=True)
35
+
36
+ def _ignore(directory: str, contents: list[str]) -> list[str]:
37
+ ignored = []
38
+ for item in contents:
39
+ if item in IGNORE_DIRS or item == "sandbox_workspace":
40
+ ignored.append(item)
41
+ return ignored
42
+
43
+ shutil.copytree(self.project_root, self.sandbox_dir, ignore=_ignore)
44
+ self._active = True
45
+ print(f"[SANDBOX] Created at: {self.sandbox_dir}")
46
+ return self.sandbox_dir
47
+
48
+ def destroy(self) -> None:
49
+ """Remove the sandbox."""
50
+ if os.path.exists(self.sandbox_dir):
51
+ shutil.rmtree(self.sandbox_dir, ignore_errors=True)
52
+ self._active = False
53
+ print("[SANDBOX] Destroyed.")
54
+
55
+ def validate_path(self, path: str) -> bool:
56
+ """Ensure a path is within the sandbox (no directory traversal)."""
57
+ abs_path = os.path.abspath(path)
58
+ return abs_path.startswith(os.path.abspath(self.sandbox_dir))
59
+
60
+ def get_sandbox_path(self, relative_path: str) -> str:
61
+ """Convert a relative path to its sandbox equivalent."""
62
+ return os.path.join(self.sandbox_dir, relative_path)
63
+
64
+ def apply_to_project(self) -> dict[str, Any]:
65
+ """Copy sandbox changes back to the real project.
66
+
67
+ Returns a summary of what was applied.
68
+ """
69
+ if not self._active:
70
+ return {"status": "error", "message": "No active sandbox"}
71
+
72
+ applied: list[str] = []
73
+ errors: list[str] = []
74
+
75
+ for root, dirs, files in os.walk(self.sandbox_dir):
76
+ dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]
77
+ for f in files:
78
+ sandbox_file = os.path.join(root, f)
79
+ rel_path = os.path.relpath(sandbox_file, self.sandbox_dir)
80
+ real_file = os.path.join(self.project_root, rel_path)
81
+
82
+ try:
83
+ # Only apply supported file types
84
+ if Path(f).suffix in SUPPORTED_EXTENSIONS or f in {"conftest.py"}:
85
+ sandbox_content = Path(sandbox_file).read_text(encoding="utf-8", errors="replace")
86
+ real_content = ""
87
+ if os.path.exists(real_file):
88
+ real_content = Path(real_file).read_text(encoding="utf-8", errors="replace")
89
+
90
+ if sandbox_content != real_content:
91
+ Path(real_file).parent.mkdir(parents=True, exist_ok=True)
92
+ Path(real_file).write_text(sandbox_content, encoding="utf-8")
93
+ applied.append(rel_path)
94
+ except Exception as exc:
95
+ errors.append(f"{rel_path}: {exc}")
96
+
97
+ return {
98
+ "status": "success" if not errors else "partial",
99
+ "applied": applied,
100
+ "errors": errors,
101
+ }
102
+
103
+ @property
104
+ def is_active(self) -> bool:
105
+ return self._active and os.path.isdir(self.sandbox_dir)
devagent/app/state.py ADDED
@@ -0,0 +1,113 @@
1
+ """
2
+ Shared state object for the agent.
3
+ Single source of truth passed through every step of the ReAct loop.
4
+
5
+ Implements both short-term memory (runtime state) and slots for
6
+ long-term memory integration.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import copy
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+
15
+
16
@dataclass
class AgentState:
    """Mutable state shared across all agent components.

    Single source of truth passed through every step of the ReAct loop.
    Holds short-term memory (history, last LLM outputs) plus slots used by
    the planner, retriever, sandbox, and confidence scoring.
    """

    # --- task definition ---
    task: str = ""
    project_root: str = "."

    # --- iteration / progress tracking ---
    last_test_output: str = ""
    stagnant_steps: int = 0
    failing_functions: list[str] = field(default_factory=list)
    current_step: int = 0
    max_steps: int = 5
    status: str = "pending"  # pending | running | success | fail

    # --- file context ---
    current_file: str = ""
    current_file_content: str = ""

    # --- execution results ---
    test_output: str = ""
    test_exit_code: int = -1
    lint_output: str = ""

    # --- history (short-term memory) ---
    history: list[dict[str, Any]] = field(default_factory=list)

    # --- last LLM outputs ---
    last_thought: str = ""
    last_action: str = ""
    last_observation: str = ""
    last_code_fix: str = ""
    last_review: str = ""

    # --- attempts counter ---
    attempts: int = 0

    # --- retrieval context ---
    retrieved_chunks: list[Any] = field(default_factory=list)

    # --- planner output ---
    plan: dict[str, Any] = field(default_factory=dict)

    # --- patch tracking ---
    patches_applied: list[dict[str, Any]] = field(default_factory=list)

    # --- sandbox ---
    sandbox_active: bool = False
    working_root: str = ""  # actual root being modified (sandbox or real)

    # --- thoughts / observations for memory ---
    thoughts: list[str] = field(default_factory=list)
    actions: list[str] = field(default_factory=list)
    observations: list[str] = field(default_factory=list)

    # -- Trust & Confidence --
    confidence_score: float = 0.0
    confidence_reasons: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable snapshot of the current state.

        ``test_output`` is truncated to 500 characters to keep snapshots small.
        """
        return {
            "task": self.task,
            "project_root": self.project_root,
            "current_step": self.current_step,
            "max_steps": self.max_steps,
            "status": self.status,
            "current_file": self.current_file,
            "test_output": self.test_output[:500] if self.test_output else "",
            "test_exit_code": self.test_exit_code,
            "attempts": self.attempts,
            "history_length": len(self.history),
            "patches_applied": len(self.patches_applied),
            "sandbox_active": self.sandbox_active,
        }

    def snapshot(self) -> dict[str, Any]:
        """Backward-compatible alias for :meth:`to_dict`.

        Previously a verbatim copy of the ``to_dict`` dict literal; delegating
        removes the duplication so the two can never drift apart.
        """
        return self.to_dict()

    def clone(self) -> "AgentState":
        """Deep-copy for safe rollback."""
        return copy.deepcopy(self)
devagent/cli.py ADDED
@@ -0,0 +1,282 @@
1
+ """
2
+ DevAgent Professional CLI — The entry point for all agent operations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import time
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.table import Table
14
+
15
+ # Add package root to sys.path if running as script
16
+ if __name__ == "__main__":
17
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
18
+
19
+ from devagent.app.agent import Agent
20
+ from devagent.app.sandbox import SandboxManager
21
+ from devagent.tools.git_tools import is_git_repo, git_commit, git_push
22
+ from devagent.utils.config import AgentConfig, MODELS
23
+ from devagent import __version__
24
+
25
+ console = Console()
26
+
27
# ASCII banner printed at the start of every `devagent run` session.
BANNER = r"""
 ____ _ _
| _ \ _____ __/ \ __ _ ___ _ __ | |_
| | | |/ _ \ \ / / _ \ / _` |/ _ \ '_ \| __|
| |_| | __/\ V / ___ \ (_| | __/ | | | |_
|____/ \___| \_/_/ \_\__, |\___|_| |_|\__|
 |___/
"""

# NOTE(review): mid-module import; Progress/SpinnerColumn/TextColumn are not
# referenced anywhere visible in this file — confirm before moving or removing
# (PEP 8 wants imports at the top of the file).
from rich.progress import Progress, SpinnerColumn, TextColumn
37
+
38
def verify_ollama(model_name: str) -> bool:
    """Verify Ollama is running and *model_name* is available.

    Returns True when the local server responds and the model (with or
    without a ``:latest`` suffix) is present; prints actionable error
    messages and returns False otherwise.
    """
    import requests

    # 1. Check if server is reachable
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
        if response.status_code != 200:
            console.print("[bold red][ERROR][/bold red] Ollama server returned error.")
            return False
    except requests.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        console.print("[bold red][ERROR][/bold red] Could not connect to Ollama server.")
        console.print("Run: [bold cyan]ollama serve[/bold cyan] in a separate terminal.")
        return False

    # 2. Check if model is pulled
    try:
        tags = response.json().get("models", [])
        model_names = [m["name"] for m in tags]
        # Handle both "name" and "name:latest"
        if model_name not in model_names and f"{model_name}:latest" not in model_names:
            console.print(f"[bold red][ERROR][/bold red] Model [bold cyan]{model_name}[/bold cyan] not found.")
            console.print(f"Run: [bold cyan]ollama pull {model_name}[/bold cyan]")
            return False
    except Exception as e:
        # Best effort: a malformed tag payload only warns, it does not block.
        console.print(f"[bold yellow][WARN][/bold yellow] Could not verify model: {e}")

    return True
67
+
68
def cmd_run(args):
    """Implementation of 'devagent run' command.

    Orchestrates one full agent session: verify Ollama, optionally create a
    sandbox, run the agent, print a summary, apply sandbox changes, and
    optionally auto-commit. Returns a process exit code (0 on success).
    """
    if not verify_ollama(args.model):
        return 1

    config = AgentConfig.from_cli(args)
    root = os.path.abspath(config.project_root)

    if not os.path.isdir(root):
        console.print(f"[bold red][ERROR][/bold red] Project root not found: {root}")
        return 1

    # Set model (imported lazily; presumably to avoid import cost/cycles — confirm)
    import devagent.app.llm as llm_module
    llm_module.set_model(config.model)

    console.print(BANNER, style="cyan")
    console.print(Panel.fit(
        f"[bold]DevAgent v{__version__}[/bold]\n"
        f"Model: [green]{config.model}[/green]\n"
        f"Sandbox: [yellow]{'ON' if config.sandbox else 'OFF'}[/yellow]",
        title="Session Info", border_style="blue"
    ))

    # Sandbox setup: when enabled, the agent works on a copy, not the real tree.
    sandbox = None
    working_root = root
    if config.sandbox:
        sandbox = SandboxManager(root)
        working_root = sandbox.create()

    # Run agent
    agent = Agent(
        task=config.task,
        project_root=working_root,
        max_steps=config.max_steps,
    )

    start_time = time.time()
    final_state = agent.run()
    elapsed = time.time() - start_time

    # Save metrics
    # NOTE(review): metrics_path is never used afterwards — confirm whether
    # it should be printed or logged.
    metrics_path = agent.metrics.save(os.path.join(root, "logs"))

    # Print summary table
    table = Table(title="Execution Summary", box=None)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="bold white")

    table.add_row("Status", final_state.status.upper())
    table.add_row("Steps", f"{final_state.current_step}/{final_state.max_steps}")
    table.add_row("Time", f"{elapsed:.1f}s")

    # Confidence Score with color coding (green > 0.8, yellow > 0.5, else red)
    conf_color = "green" if final_state.confidence_score > 0.8 else "yellow" if final_state.confidence_score > 0.5 else "red"
    table.add_row("Confidence", f"[{conf_color}]{final_state.confidence_score * 100:.0f}%[/{conf_color}]")

    if final_state.current_file:
        table.add_row("Last File", final_state.current_file)
    table.add_row("Patches", str(len(final_state.patches_applied)))

    console.print("\n", table)

    # Show confidence reasons
    if final_state.confidence_reasons:
        console.print("\n[bold]Confidence Breakdown:[/bold]")
        for reason in final_state.confidence_reasons:
            console.print(f" [dim]• {reason}[/dim]")

    # Sandbox apply: only successful runs are copied back to the real tree.
    if sandbox and sandbox.is_active:
        if final_state.status == "success":
            if getattr(args, "interactive", False):
                console.print("\n[bold yellow][INTERACTIVE][/bold yellow] Reviewing changes...")
                # Show diff for each applied patch
                for i, patch in enumerate(final_state.patches_applied):
                    console.print(f"\n[bold]Patch #{i+1}[/bold] for [cyan]{patch.get('file', 'unknown')}[/cyan]:")
                    console.print(f"[dim]{patch.get('diff', 'No diff available')}[/dim]")

                choice = console.input("\nApply these changes to the real project? [y/N]: ").lower()
                if choice != 'y':
                    console.print("[bold red]Changes rejected.[/bold red] Sandbox will be destroyed.")
                    sandbox.destroy()
                    return 1

            console.print("\n[bold yellow][SANDBOX][/bold yellow] Applying changes to real project...")
            result = sandbox.apply_to_project()
            if result["applied"]:
                for f in result["applied"]:
                    console.print(f" [green]✓[/green] {f}")
            # NOTE(review): the sandbox is destroyed only on the success path
            # here — a failed run appears to leave the sandbox on disk; confirm
            # whether that is intended (e.g. for post-mortem inspection).
            sandbox.destroy()

    # Git operations
    if final_state.status == "success" and config.auto_commit:
        _handle_git(root, config)

    return 0 if final_state.status == "success" else 1
166
+
167
def _handle_git(root: str, config: AgentConfig) -> None:
    """Commit the agent's changes (and optionally push) when *root* is a git repo."""
    if not is_git_repo(root):
        return

    console.print("\n[bold blue][GIT][/bold blue] Committing changes...")
    # Keep the subject line short: first 50 chars of the task description.
    git_commit(root, f"agent: {config.task[:50]}")
    if config.auto_push:
        git_push(root)
175
+
176
def cmd_benchmark(args):
    """Implementation of 'devagent benchmark' command.

    Runs the full suite and returns exit code 0 when the pass rate is >= 80%.
    """
    from devagent.tools.benchmark_runner import run_benchmarks
    import devagent.app.llm as llm_module

    llm_module.set_model(args.model)

    # Benchmarks live next to the package: <package>/../benchmarks
    package_dir = os.path.dirname(os.path.abspath(__file__))
    suite_dir = os.path.abspath(os.path.join(package_dir, "..", "benchmarks"))

    console.print(Panel(f"Running benchmarks with [bold cyan]{args.model}[/bold cyan]", title="Benchmark Suite"))

    report = run_benchmarks(suite_dir, model=args.model, max_steps=args.max_steps)
    report.print_report()

    return 0 if report.pass_rate >= 80 else 1
192
+
193
def cmd_doctor(args):
    """Implementation of 'devagent doctor' command.

    Prints one status line per environment check (Python version, Ollama
    binary, FAISS availability). Always returns 0.
    """
    console.print("[bold cyan]DevAgent System Check[/bold cyan]\n")

    checks = []

    # Python Check
    checks.append(("[green]OK[/green]" if sys.version_info >= (3, 11) else "[red]FAIL[/red]", f"Python {sys.version_info.major}.{sys.version_info.minor}"))

    # Ollama Check
    import subprocess
    try:
        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
        checks.append(("[green]OK[/green]", "Ollama installed"))
    except (OSError, subprocess.CalledProcessError):
        # Narrowed from a bare `except:`: OSError covers a missing binary,
        # CalledProcessError a non-zero exit.
        checks.append(("[red]FAIL[/red]", "Ollama NOT found (run: ollama serve)"))

    # FAISS Check
    try:
        import faiss
        checks.append(("[green]OK[/green]", "FAISS available"))
    except ImportError:
        # Narrowed from a bare `except:` — only a missing module is expected.
        checks.append(("[yellow]WARN[/yellow]", "FAISS not found (keyword search fallback active)"))

    for status, msg in checks:
        console.print(f" {status} {msg}")

    return 0
221
+
222
def cmd_models(args):
    """Implementation of 'devagent models' command.

    Lists locally installed Ollama models and prints the recommended one.
    Always returns 0.
    """
    import subprocess
    console.print("[bold cyan]Installed Ollama Models[/bold cyan]\n")
    try:
        result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        console.print(result.stdout)
        console.print(f"\n[bold green]Recommended:[/bold green] {MODELS['primary']}")
    except OSError:
        # Narrowed from a bare `except:` (which also caught Ctrl-C); OSError
        # is what subprocess.run raises when the `ollama` binary is absent.
        console.print("[red][ERROR][/red] Could not list Ollama models.")
    return 0
233
+
234
def main():
    """CLI entry point: build the argument parser and dispatch to a command."""
    parser = argparse.ArgumentParser(description="DevAgent CLI — Professional local coding agent.")
    parser.add_argument("--version", action="version", version=f"DevAgent v{__version__}")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # run — the main agent loop
    run_parser = subparsers.add_parser("run", help="Run the agent on a coding task")
    run_parser.add_argument("--task", "-t", required=True, help="Task description")
    run_parser.add_argument("--root", "-r", default=".", help="Project root")
    run_parser.add_argument("--model", default=MODELS["primary"], help="Ollama model")
    run_parser.add_argument("--max-steps", "-m", type=int, default=5, help="Max iterations")
    run_parser.add_argument("--sandbox", action="store_true", help="Run in sandbox")
    run_parser.add_argument("--auto-commit", action="store_true", help="Auto-commit on success")
    run_parser.add_argument("--auto-push", action="store_true", help="Auto-push after commit")
    run_parser.add_argument("--interactive", "-i", action="store_true", help="Review changes before applying")
    run_parser.add_argument("--verbose", action="store_true", help="Verbose output")

    # benchmark — evaluation suite
    bench_parser = subparsers.add_parser("benchmark", help="Run benchmark suite")
    bench_parser.add_argument("--model", default=MODELS["primary"], help="Ollama model")
    bench_parser.add_argument("--max-steps", "-m", type=int, default=5, help="Max iterations")

    # option-less commands
    subparsers.add_parser("doctor", help="Check system health")
    subparsers.add_parser("models", help="List installed Ollama models")
    subparsers.add_parser("version", help="Show version")

    args = parser.parse_args()

    # Dispatch table instead of an if/elif chain; handlers return exit codes.
    handlers = {
        "run": cmd_run,
        "benchmark": cmd_benchmark,
        "doctor": cmd_doctor,
        "models": cmd_models,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        sys.exit(handler(args))
    elif args.command == "version":
        console.print(f"DevAgent CLI v{__version__}")
    else:
        parser.print_help()
280
+
281
# Allow running as a script (`python devagent/cli.py`) as well as via the
# installed console entry point.
if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ # tools — executable tool modules
@@ -0,0 +1,184 @@
1
+ """
2
+ Benchmark Runner — evaluates the agent against a suite of known bug scenarios.
3
+
4
+ Each benchmark is a self-contained project with:
5
+ - buggy source code
6
+ - test file
7
+ - expected behavior
8
+
9
+ Measures: pass rate, retries, execution time, model performance.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import shutil
17
+ import time
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+
23
@dataclass
class BenchmarkResult:
    """Result of a single benchmark run."""
    name: str = ""                  # benchmark suite directory name
    task: str = ""                  # task prompt handed to the agent
    passed: bool = False            # True when the run ended with status "success"
    steps_used: int = 0             # agent steps consumed (final_state.current_step)
    max_steps: int = 5              # iteration budget for the run
    execution_time_s: float = 0.0   # wall-clock duration of the agent run
    model: str = ""                 # model name the suite was run with
    error: str = ""                 # exception text when the run itself crashed
34
+
35
+
36
@dataclass
class BenchmarkReport:
    """Aggregated benchmark report.

    Holds every :class:`BenchmarkResult` and derives counts/rates from them.
    """

    results: "list[BenchmarkResult]" = field(default_factory=list)
    model: str = ""
    total_time_s: float = 0.0

    @property
    def total(self) -> int:
        """Number of benchmarks executed."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of passing benchmarks."""
        return len([r for r in self.results if r.passed])

    @property
    def failed(self) -> int:
        """Number of failing benchmarks."""
        return self.total - self.passed

    @property
    def pass_rate(self) -> float:
        """Pass percentage in [0, 100]; 0.0 when there are no results."""
        return (self.passed / (self.total or 1)) * 100

    def summary(self) -> dict[str, Any]:
        """Return a JSON-serialisable summary of the whole run."""
        per_result = [
            {
                "name": r.name,
                "passed": r.passed,
                "steps": r.steps_used,
                "time_s": round(r.execution_time_s, 2),
                "error": r.error[:200] if r.error else "",
            }
            for r in self.results
        ]
        return {
            "model": self.model,
            "total": self.total,
            "passed": self.passed,
            "failed": self.failed,
            "pass_rate": f"{self.pass_rate:.1f}%",
            "total_time_s": round(self.total_time_s, 2),
            "results": per_result,
        }

    def print_report(self) -> None:
        """Pretty-print the benchmark report."""
        rule = "=" * 60
        print("\n" + rule)
        print(" BENCHMARK REPORT")
        print(rule)
        print(f" Model: {self.model}")
        print(f" Total: {self.total}")
        print(f" Passed: {self.passed}")
        print(f" Failed: {self.failed}")
        print(f" Pass Rate: {self.pass_rate:.1f}%")
        print(f" Time: {self.total_time_s:.1f}s")
        print("-" * 60)
        for r in self.results:
            verdict = "PASS" if r.passed else "FAIL"
            print(f" [{verdict}] {r.name} ({r.steps_used} steps, {r.execution_time_s:.1f}s)")
        print(rule)

    def save(self, output_dir: str) -> str:
        """Write the summary as JSON into *output_dir*; return the file path."""
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / "benchmark_report.json"
        payload = json.dumps(self.summary(), indent=2, ensure_ascii=False)
        target.write_text(payload, encoding="utf-8")
        return str(target)
106
+
107
+
108
def discover_benchmarks(benchmarks_dir: str) -> list[dict[str, str]]:
    """Discover all benchmark suites in the benchmarks directory.

    A suite is any immediate sub-directory containing a ``task.txt``;
    suites are returned sorted by directory name.
    """
    base = Path(benchmarks_dir)
    if not base.is_dir():
        return []

    found: list[dict[str, str]] = []
    for candidate in sorted(base.iterdir()):
        if not candidate.is_dir():
            continue
        task_file = candidate / "task.txt"
        if not task_file.exists():
            continue
        found.append({
            "name": candidate.name,
            "path": str(candidate),
            "task": task_file.read_text(encoding="utf-8").strip(),
        })
    return found
128
+
129
+
130
def run_benchmarks(benchmarks_dir: str, model: str = "qwen2.5-coder:3b",
                   max_steps: int = 5) -> BenchmarkReport:
    """Run all benchmarks and return a report.

    Each suite is copied into a throw-away ``_tmp_<name>`` directory so the
    agent never mutates the benchmark fixtures themselves; the copy is always
    removed afterwards.

    Args:
        benchmarks_dir: Directory containing one sub-directory per suite.
        model: Model name recorded in the report and each result.
        max_steps: Iteration budget passed to every Agent run.
    """
    # Import here to avoid circular imports
    from devagent.app.agent import Agent

    report = BenchmarkReport(model=model)
    suites = discover_benchmarks(benchmarks_dir)

    if not suites:
        print("[BENCHMARK] No benchmark suites found.")
        return report

    print(f"\n[BENCHMARK] Found {len(suites)} benchmark suites.")
    start_time = time.time()

    for suite in suites:
        print(f"\n[BENCHMARK] Running: {suite['name']}")
        result = BenchmarkResult(
            name=suite["name"],
            task=suite["task"],
            max_steps=max_steps,
            model=model,
        )

        # Create a temp copy to avoid modifying the benchmark
        tmp_dir = os.path.join(benchmarks_dir, f"_tmp_{suite['name']}")
        try:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            shutil.copytree(suite["path"], tmp_dir)

            t0 = time.time()
            agent = Agent(
                task=suite["task"],
                project_root=tmp_dir,
                max_steps=max_steps,
            )
            final_state = agent.run()
            result.execution_time_s = time.time() - t0
            result.steps_used = final_state.current_step
            result.passed = final_state.status == "success"

        except Exception as exc:
            # A crashing agent run is recorded as a failed result; it must not
            # abort the remaining suites.
            result.error = str(exc)
            result.passed = False

        finally:
            # Always remove the temp copy, even when the run raised.
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir, ignore_errors=True)

        report.results.append(result)

    report.total_time_s = time.time() - start_time
    return report