devagent-cli 3.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ """
2
+ Sandbox Layer — copies project to isolated workspace before agent modifies anything.
3
+
4
+ Flow:
5
+ Real Project → Sandbox Copy → Agent Modifies Sandbox → Run Tests → Show Diff → Optional Apply
6
+
7
+ Safety features:
8
+ - Path validation (no escaping sandbox)
9
+ - Restricted to supported file types
10
+ - Diff preview before applying back
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+ import shutil
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ from devagent.utils.config import IGNORE_DIRS, SUPPORTED_EXTENSIONS
21
+
22
+
23
+ class SandboxManager:
24
+ """Manages an isolated sandbox workspace for safe agent operations."""
25
+
26
+ def __init__(self, project_root: str, sandbox_dir: str | None = None):
27
+ self.project_root = os.path.abspath(project_root)
28
+ self.sandbox_dir = sandbox_dir or os.path.join(self.project_root, "sandbox_workspace")
29
+ self._active = False
30
+
31
+ def create(self) -> str:
32
+ """Create a sandbox copy of the project. Returns sandbox path."""
33
+ if os.path.exists(self.sandbox_dir):
34
+ shutil.rmtree(self.sandbox_dir, ignore_errors=True)
35
+
36
+ def _ignore(directory: str, contents: list[str]) -> list[str]:
37
+ ignored = []
38
+ for item in contents:
39
+ if item in IGNORE_DIRS or item == "sandbox_workspace":
40
+ ignored.append(item)
41
+ return ignored
42
+
43
+ shutil.copytree(self.project_root, self.sandbox_dir, ignore=_ignore)
44
+ self._active = True
45
+ print(f"[SANDBOX] Created at: {self.sandbox_dir}")
46
+ return self.sandbox_dir
47
+
48
+ def destroy(self) -> None:
49
+ """Remove the sandbox."""
50
+ if os.path.exists(self.sandbox_dir):
51
+ shutil.rmtree(self.sandbox_dir, ignore_errors=True)
52
+ self._active = False
53
+ print("[SANDBOX] Destroyed.")
54
+
55
+ def validate_path(self, path: str) -> bool:
56
+ """Ensure a path is within the sandbox (no directory traversal)."""
57
+ abs_path = os.path.abspath(path)
58
+ return abs_path.startswith(os.path.abspath(self.sandbox_dir))
59
+
60
+ def get_sandbox_path(self, relative_path: str) -> str:
61
+ """Convert a relative path to its sandbox equivalent."""
62
+ return os.path.join(self.sandbox_dir, relative_path)
63
+
64
+ def apply_to_project(self) -> dict[str, Any]:
65
+ """Copy sandbox changes back to the real project.
66
+
67
+ Returns a summary of what was applied.
68
+ """
69
+ if not self._active:
70
+ return {"status": "error", "message": "No active sandbox"}
71
+
72
+ applied: list[str] = []
73
+ errors: list[str] = []
74
+
75
+ for root, dirs, files in os.walk(self.sandbox_dir):
76
+ dirs[:] = [d for d in dirs if d not in IGNORE_DIRS]
77
+ for f in files:
78
+ sandbox_file = os.path.join(root, f)
79
+ rel_path = os.path.relpath(sandbox_file, self.sandbox_dir)
80
+ real_file = os.path.join(self.project_root, rel_path)
81
+
82
+ try:
83
+ # Only apply supported file types
84
+ if Path(f).suffix in SUPPORTED_EXTENSIONS or f in {"conftest.py"}:
85
+ sandbox_content = Path(sandbox_file).read_text(encoding="utf-8", errors="replace")
86
+ real_content = ""
87
+ if os.path.exists(real_file):
88
+ real_content = Path(real_file).read_text(encoding="utf-8", errors="replace")
89
+
90
+ if sandbox_content != real_content:
91
+ Path(real_file).parent.mkdir(parents=True, exist_ok=True)
92
+ Path(real_file).write_text(sandbox_content, encoding="utf-8")
93
+ applied.append(rel_path)
94
+ except Exception as exc:
95
+ errors.append(f"{rel_path}: {exc}")
96
+
97
+ return {
98
+ "status": "success" if not errors else "partial",
99
+ "applied": applied,
100
+ "errors": errors,
101
+ }
102
+
103
+ @property
104
+ def is_active(self) -> bool:
105
+ return self._active and os.path.isdir(self.sandbox_dir)
devagent/app/state.py ADDED
@@ -0,0 +1,113 @@
1
+ """
2
+ Shared state object for the agent.
3
+ Single source of truth passed through every step of the ReAct loop.
4
+
5
+ Implements both short-term memory (runtime state) and slots for
6
+ long-term memory integration.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import copy
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+
15
+
16
@dataclass
class AgentState:
    """Mutable state shared across all agent components.

    Single source of truth passed through every step of the ReAct loop.
    Holds short-term memory (history, last LLM outputs) plus slots used by
    the planner, retriever, sandbox, and confidence scoring.
    """

    # --- task definition ---
    task: str = ""
    project_root: str = "."

    # --- iteration / progress tracking ---
    last_test_output: str = ""
    stagnant_steps: int = 0
    failing_functions: list[str] = field(default_factory=list)
    current_step: int = 0
    max_steps: int = 5
    status: str = "pending"  # pending | running | success | fail

    # --- file context ---
    current_file: str = ""
    current_file_content: str = ""

    # --- execution results ---
    test_output: str = ""
    test_exit_code: int = -1
    lint_output: str = ""

    # --- history (short-term memory) ---
    history: list[dict[str, Any]] = field(default_factory=list)

    # --- last LLM outputs ---
    last_thought: str = ""
    last_action: str = ""
    last_observation: str = ""
    last_code_fix: str = ""
    last_review: str = ""

    # --- attempts counter ---
    attempts: int = 0

    # --- retrieval context ---
    retrieved_chunks: list[Any] = field(default_factory=list)

    # --- planner output ---
    plan: dict[str, Any] = field(default_factory=dict)

    # --- patch tracking ---
    patches_applied: list[dict[str, Any]] = field(default_factory=list)

    # --- sandbox ---
    sandbox_active: bool = False
    working_root: str = ""  # actual root being modified (sandbox or real)

    # --- thoughts / observations for memory ---
    thoughts: list[str] = field(default_factory=list)
    actions: list[str] = field(default_factory=list)
    observations: list[str] = field(default_factory=list)

    # -- Trust & Confidence --
    confidence_score: float = 0.0
    confidence_reasons: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-serialisable snapshot of the current state.

        ``test_output`` is truncated to 500 characters to keep snapshots small.
        """
        return {
            "task": self.task,
            "project_root": self.project_root,
            "current_step": self.current_step,
            "max_steps": self.max_steps,
            "status": self.status,
            "current_file": self.current_file,
            "test_output": self.test_output[:500] if self.test_output else "",
            "test_exit_code": self.test_exit_code,
            "attempts": self.attempts,
            "history_length": len(self.history),
            "patches_applied": len(self.patches_applied),
            "sandbox_active": self.sandbox_active,
        }

    def snapshot(self) -> dict[str, Any]:
        """Backward-compatible alias for :meth:`to_dict`.

        Previously a verbatim copy of the ``to_dict`` dict literal; delegating
        removes the duplication so the two can never drift apart.
        """
        return self.to_dict()

    def clone(self) -> "AgentState":
        """Deep-copy for safe rollback."""
        return copy.deepcopy(self)
devagent/cli.py ADDED
@@ -0,0 +1,282 @@
1
+ """
2
+ DevAgent Professional CLI — The entry point for all agent operations.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import argparse
8
+ import sys
9
+ import os
10
+ import time
11
+ from rich.console import Console
12
+ from rich.panel import Panel
13
+ from rich.table import Table
14
+
15
+ # Add package root to sys.path if running as script
16
+ if __name__ == "__main__":
17
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
18
+
19
+ from devagent.app.agent import Agent
20
+ from devagent.app.sandbox import SandboxManager
21
+ from devagent.tools.git_tools import is_git_repo, git_commit, git_push
22
+ from devagent.utils.config import AgentConfig, MODELS
23
+ from devagent import __version__
24
+
25
+ console = Console()
26
+
27
# ASCII banner printed at the start of every `devagent run` session.
BANNER = r"""
 ____ _ _
| _ \ _____ __/ \ __ _ ___ _ __ | |_
| | | |/ _ \ \ / / _ \ / _` |/ _ \ '_ \| __|
| |_| | __/\ V / ___ \ (_| | __/ | | | |_
|____/ \___| \_/_/ \_\__, |\___|_| |_|\__|
 |___/
"""

# NOTE(review): mid-module import; Progress/SpinnerColumn/TextColumn are not
# referenced anywhere visible in this file — confirm before moving or removing
# (PEP 8 wants imports at the top of the file).
from rich.progress import Progress, SpinnerColumn, TextColumn
37
+
38
def verify_ollama(model_name: str) -> bool:
    """Verify Ollama is running and *model_name* is available.

    Returns True when the local server responds and the model (with or
    without a ``:latest`` suffix) is present; prints actionable error
    messages and returns False otherwise.
    """
    import requests

    # 1. Check if server is reachable
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=2)
        if response.status_code != 200:
            console.print("[bold red][ERROR][/bold red] Ollama server returned error.")
            return False
    except requests.RequestException:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        console.print("[bold red][ERROR][/bold red] Could not connect to Ollama server.")
        console.print("Run: [bold cyan]ollama serve[/bold cyan] in a separate terminal.")
        return False

    # 2. Check if model is pulled
    try:
        tags = response.json().get("models", [])
        model_names = [m["name"] for m in tags]
        # Handle both "name" and "name:latest"
        if model_name not in model_names and f"{model_name}:latest" not in model_names:
            console.print(f"[bold red][ERROR][/bold red] Model [bold cyan]{model_name}[/bold cyan] not found.")
            console.print(f"Run: [bold cyan]ollama pull {model_name}[/bold cyan]")
            return False
    except Exception as e:
        # Best effort: a malformed tag payload only warns, it does not block.
        console.print(f"[bold yellow][WARN][/bold yellow] Could not verify model: {e}")

    return True
67
+
68
def cmd_run(args):
    """Implementation of 'devagent run' command.

    Orchestrates one full agent session: verify Ollama, optionally create a
    sandbox, run the agent, print a summary, apply sandbox changes, and
    optionally auto-commit. Returns a process exit code (0 on success).
    """
    if not verify_ollama(args.model):
        return 1

    config = AgentConfig.from_cli(args)
    root = os.path.abspath(config.project_root)

    if not os.path.isdir(root):
        console.print(f"[bold red][ERROR][/bold red] Project root not found: {root}")
        return 1

    # Set model (imported lazily; presumably to avoid import cost/cycles — confirm)
    import devagent.app.llm as llm_module
    llm_module.set_model(config.model)

    console.print(BANNER, style="cyan")
    console.print(Panel.fit(
        f"[bold]DevAgent v{__version__}[/bold]\n"
        f"Model: [green]{config.model}[/green]\n"
        f"Sandbox: [yellow]{'ON' if config.sandbox else 'OFF'}[/yellow]",
        title="Session Info", border_style="blue"
    ))

    # Sandbox setup: when enabled, the agent works on a copy, not the real tree.
    sandbox = None
    working_root = root
    if config.sandbox:
        sandbox = SandboxManager(root)
        working_root = sandbox.create()

    # Run agent
    agent = Agent(
        task=config.task,
        project_root=working_root,
        max_steps=config.max_steps,
    )

    start_time = time.time()
    final_state = agent.run()
    elapsed = time.time() - start_time

    # Save metrics
    # NOTE(review): metrics_path is never used afterwards — confirm whether
    # it should be printed or logged.
    metrics_path = agent.metrics.save(os.path.join(root, "logs"))

    # Print summary table
    table = Table(title="Execution Summary", box=None)
    table.add_column("Metric", style="cyan")
    table.add_column("Value", style="bold white")

    table.add_row("Status", final_state.status.upper())
    table.add_row("Steps", f"{final_state.current_step}/{final_state.max_steps}")
    table.add_row("Time", f"{elapsed:.1f}s")

    # Confidence Score with color coding (green > 0.8, yellow > 0.5, else red)
    conf_color = "green" if final_state.confidence_score > 0.8 else "yellow" if final_state.confidence_score > 0.5 else "red"
    table.add_row("Confidence", f"[{conf_color}]{final_state.confidence_score * 100:.0f}%[/{conf_color}]")

    if final_state.current_file:
        table.add_row("Last File", final_state.current_file)
    table.add_row("Patches", str(len(final_state.patches_applied)))

    console.print("\n", table)

    # Show confidence reasons
    if final_state.confidence_reasons:
        console.print("\n[bold]Confidence Breakdown:[/bold]")
        for reason in final_state.confidence_reasons:
            console.print(f" [dim]• {reason}[/dim]")

    # Sandbox apply: only successful runs are copied back to the real tree.
    if sandbox and sandbox.is_active:
        if final_state.status == "success":
            if getattr(args, "interactive", False):
                console.print("\n[bold yellow][INTERACTIVE][/bold yellow] Reviewing changes...")
                # Show diff for each applied patch
                for i, patch in enumerate(final_state.patches_applied):
                    console.print(f"\n[bold]Patch #{i+1}[/bold] for [cyan]{patch.get('file', 'unknown')}[/cyan]:")
                    console.print(f"[dim]{patch.get('diff', 'No diff available')}[/dim]")

                choice = console.input("\nApply these changes to the real project? [y/N]: ").lower()
                if choice != 'y':
                    console.print("[bold red]Changes rejected.[/bold red] Sandbox will be destroyed.")
                    sandbox.destroy()
                    return 1

            console.print("\n[bold yellow][SANDBOX][/bold yellow] Applying changes to real project...")
            result = sandbox.apply_to_project()
            if result["applied"]:
                for f in result["applied"]:
                    console.print(f" [green]✓[/green] {f}")
            # NOTE(review): the sandbox is destroyed only on the success path
            # here — a failed run appears to leave the sandbox on disk; confirm
            # whether that is intended (e.g. for post-mortem inspection).
            sandbox.destroy()

    # Git operations
    if final_state.status == "success" and config.auto_commit:
        _handle_git(root, config)

    return 0 if final_state.status == "success" else 1
166
+
167
def _handle_git(root: str, config: AgentConfig) -> None:
    """Commit the agent's changes (and optionally push) when *root* is a git repo."""
    if not is_git_repo(root):
        return

    console.print("\n[bold blue][GIT][/bold blue] Committing changes...")
    # Keep the subject line short: first 50 chars of the task description.
    git_commit(root, f"agent: {config.task[:50]}")
    if config.auto_push:
        git_push(root)
175
+
176
def cmd_benchmark(args):
    """Implementation of 'devagent benchmark' command.

    Runs the full suite and returns exit code 0 when the pass rate is >= 80%.
    """
    from devagent.tools.benchmark_runner import run_benchmarks
    import devagent.app.llm as llm_module

    llm_module.set_model(args.model)

    # Benchmarks live next to the package: <package>/../benchmarks
    package_dir = os.path.dirname(os.path.abspath(__file__))
    suite_dir = os.path.abspath(os.path.join(package_dir, "..", "benchmarks"))

    console.print(Panel(f"Running benchmarks with [bold cyan]{args.model}[/bold cyan]", title="Benchmark Suite"))

    report = run_benchmarks(suite_dir, model=args.model, max_steps=args.max_steps)
    report.print_report()

    return 0 if report.pass_rate >= 80 else 1
192
+
193
def cmd_doctor(args):
    """Implementation of 'devagent doctor' command.

    Prints one status line per environment check (Python version, Ollama
    binary, FAISS availability). Always returns 0.
    """
    console.print("[bold cyan]DevAgent System Check[/bold cyan]\n")

    checks = []

    # Python Check
    checks.append(("[green]OK[/green]" if sys.version_info >= (3, 11) else "[red]FAIL[/red]", f"Python {sys.version_info.major}.{sys.version_info.minor}"))

    # Ollama Check
    import subprocess
    try:
        subprocess.run(["ollama", "--version"], capture_output=True, check=True)
        checks.append(("[green]OK[/green]", "Ollama installed"))
    except (OSError, subprocess.CalledProcessError):
        # Narrowed from a bare `except:`: OSError covers a missing binary,
        # CalledProcessError a non-zero exit.
        checks.append(("[red]FAIL[/red]", "Ollama NOT found (run: ollama serve)"))

    # FAISS Check
    try:
        import faiss
        checks.append(("[green]OK[/green]", "FAISS available"))
    except ImportError:
        # Narrowed from a bare `except:` — only a missing module is expected.
        checks.append(("[yellow]WARN[/yellow]", "FAISS not found (keyword search fallback active)"))

    for status, msg in checks:
        console.print(f" {status} {msg}")

    return 0
221
+
222
def cmd_models(args):
    """Implementation of 'devagent models' command.

    Lists locally installed Ollama models and prints the recommended one.
    Always returns 0.
    """
    import subprocess
    console.print("[bold cyan]Installed Ollama Models[/bold cyan]\n")
    try:
        result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        console.print(result.stdout)
        console.print(f"\n[bold green]Recommended:[/bold green] {MODELS['primary']}")
    except OSError:
        # Narrowed from a bare `except:` (which also caught Ctrl-C); OSError
        # is what subprocess.run raises when the `ollama` binary is absent.
        console.print("[red][ERROR][/red] Could not list Ollama models.")
    return 0
233
+
234
def main():
    """CLI entry point: build the argument parser and dispatch to a command."""
    parser = argparse.ArgumentParser(description="DevAgent CLI — Professional local coding agent.")
    parser.add_argument("--version", action="version", version=f"DevAgent v{__version__}")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # run — the main agent loop
    run_parser = subparsers.add_parser("run", help="Run the agent on a coding task")
    run_parser.add_argument("--task", "-t", required=True, help="Task description")
    run_parser.add_argument("--root", "-r", default=".", help="Project root")
    run_parser.add_argument("--model", default=MODELS["primary"], help="Ollama model")
    run_parser.add_argument("--max-steps", "-m", type=int, default=5, help="Max iterations")
    run_parser.add_argument("--sandbox", action="store_true", help="Run in sandbox")
    run_parser.add_argument("--auto-commit", action="store_true", help="Auto-commit on success")
    run_parser.add_argument("--auto-push", action="store_true", help="Auto-push after commit")
    run_parser.add_argument("--interactive", "-i", action="store_true", help="Review changes before applying")
    run_parser.add_argument("--verbose", action="store_true", help="Verbose output")

    # benchmark — evaluation suite
    bench_parser = subparsers.add_parser("benchmark", help="Run benchmark suite")
    bench_parser.add_argument("--model", default=MODELS["primary"], help="Ollama model")
    bench_parser.add_argument("--max-steps", "-m", type=int, default=5, help="Max iterations")

    # option-less commands
    subparsers.add_parser("doctor", help="Check system health")
    subparsers.add_parser("models", help="List installed Ollama models")
    subparsers.add_parser("version", help="Show version")

    args = parser.parse_args()

    # Dispatch table instead of an if/elif chain; handlers return exit codes.
    handlers = {
        "run": cmd_run,
        "benchmark": cmd_benchmark,
        "doctor": cmd_doctor,
        "models": cmd_models,
    }
    handler = handlers.get(args.command)
    if handler is not None:
        sys.exit(handler(args))
    elif args.command == "version":
        console.print(f"DevAgent CLI v{__version__}")
    else:
        parser.print_help()
280
+
281
# Allow running as a script (`python devagent/cli.py`) as well as via the
# installed console entry point.
if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ # tools — executable tool modules
@@ -0,0 +1,184 @@
1
+ """
2
+ Benchmark Runner — evaluates the agent against a suite of known bug scenarios.
3
+
4
+ Each benchmark is a self-contained project with:
5
+ - buggy source code
6
+ - test file
7
+ - expected behavior
8
+
9
+ Measures: pass rate, retries, execution time, model performance.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import shutil
17
+ import time
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+
23
@dataclass
class BenchmarkResult:
    """Result of a single benchmark run."""
    name: str = ""                  # benchmark suite directory name
    task: str = ""                  # task prompt handed to the agent
    passed: bool = False            # True when the run ended with status "success"
    steps_used: int = 0             # agent steps consumed (final_state.current_step)
    max_steps: int = 5              # iteration budget for the run
    execution_time_s: float = 0.0   # wall-clock duration of the agent run
    model: str = ""                 # model name the suite was run with
    error: str = ""                 # exception text when the run itself crashed
34
+
35
+
36
@dataclass
class BenchmarkReport:
    """Aggregated benchmark report.

    Holds every :class:`BenchmarkResult` and derives counts/rates from them.
    """

    results: "list[BenchmarkResult]" = field(default_factory=list)
    model: str = ""
    total_time_s: float = 0.0

    @property
    def total(self) -> int:
        """Number of benchmarks executed."""
        return len(self.results)

    @property
    def passed(self) -> int:
        """Number of passing benchmarks."""
        return len([r for r in self.results if r.passed])

    @property
    def failed(self) -> int:
        """Number of failing benchmarks."""
        return self.total - self.passed

    @property
    def pass_rate(self) -> float:
        """Pass percentage in [0, 100]; 0.0 when there are no results."""
        return (self.passed / (self.total or 1)) * 100

    def summary(self) -> dict[str, Any]:
        """Return a JSON-serialisable summary of the whole run."""
        per_result = [
            {
                "name": r.name,
                "passed": r.passed,
                "steps": r.steps_used,
                "time_s": round(r.execution_time_s, 2),
                "error": r.error[:200] if r.error else "",
            }
            for r in self.results
        ]
        return {
            "model": self.model,
            "total": self.total,
            "passed": self.passed,
            "failed": self.failed,
            "pass_rate": f"{self.pass_rate:.1f}%",
            "total_time_s": round(self.total_time_s, 2),
            "results": per_result,
        }

    def print_report(self) -> None:
        """Pretty-print the benchmark report."""
        rule = "=" * 60
        print("\n" + rule)
        print(" BENCHMARK REPORT")
        print(rule)
        print(f" Model: {self.model}")
        print(f" Total: {self.total}")
        print(f" Passed: {self.passed}")
        print(f" Failed: {self.failed}")
        print(f" Pass Rate: {self.pass_rate:.1f}%")
        print(f" Time: {self.total_time_s:.1f}s")
        print("-" * 60)
        for r in self.results:
            verdict = "PASS" if r.passed else "FAIL"
            print(f" [{verdict}] {r.name} ({r.steps_used} steps, {r.execution_time_s:.1f}s)")
        print(rule)

    def save(self, output_dir: str) -> str:
        """Write the summary as JSON into *output_dir*; return the file path."""
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / "benchmark_report.json"
        payload = json.dumps(self.summary(), indent=2, ensure_ascii=False)
        target.write_text(payload, encoding="utf-8")
        return str(target)
106
+
107
+
108
def discover_benchmarks(benchmarks_dir: str) -> list[dict[str, str]]:
    """Discover all benchmark suites in the benchmarks directory.

    A suite is any immediate sub-directory containing a ``task.txt``;
    suites are returned sorted by directory name.
    """
    base = Path(benchmarks_dir)
    if not base.is_dir():
        return []

    found: list[dict[str, str]] = []
    for candidate in sorted(base.iterdir()):
        if not candidate.is_dir():
            continue
        task_file = candidate / "task.txt"
        if not task_file.exists():
            continue
        found.append({
            "name": candidate.name,
            "path": str(candidate),
            "task": task_file.read_text(encoding="utf-8").strip(),
        })
    return found
128
+
129
+
130
def run_benchmarks(benchmarks_dir: str, model: str = "qwen2.5-coder:3b",
                   max_steps: int = 5) -> BenchmarkReport:
    """Run all benchmarks and return a report.

    Each suite is copied into a throw-away ``_tmp_<name>`` directory so the
    agent never mutates the benchmark fixtures themselves; the copy is always
    removed afterwards.

    Args:
        benchmarks_dir: Directory containing one sub-directory per suite.
        model: Model name recorded in the report and each result.
        max_steps: Iteration budget passed to every Agent run.
    """
    # Import here to avoid circular imports
    from devagent.app.agent import Agent

    report = BenchmarkReport(model=model)
    suites = discover_benchmarks(benchmarks_dir)

    if not suites:
        print("[BENCHMARK] No benchmark suites found.")
        return report

    print(f"\n[BENCHMARK] Found {len(suites)} benchmark suites.")
    start_time = time.time()

    for suite in suites:
        print(f"\n[BENCHMARK] Running: {suite['name']}")
        result = BenchmarkResult(
            name=suite["name"],
            task=suite["task"],
            max_steps=max_steps,
            model=model,
        )

        # Create a temp copy to avoid modifying the benchmark
        tmp_dir = os.path.join(benchmarks_dir, f"_tmp_{suite['name']}")
        try:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
            shutil.copytree(suite["path"], tmp_dir)

            t0 = time.time()
            agent = Agent(
                task=suite["task"],
                project_root=tmp_dir,
                max_steps=max_steps,
            )
            final_state = agent.run()
            result.execution_time_s = time.time() - t0
            result.steps_used = final_state.current_step
            result.passed = final_state.status == "success"

        except Exception as exc:
            # A crashing agent run is recorded as a failed result; it must not
            # abort the remaining suites.
            result.error = str(exc)
            result.passed = False

        finally:
            # Always remove the temp copy, even when the run raised.
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir, ignore_errors=True)

        report.results.append(result)

    report.total_time_s = time.time() - start_time
    return report