PyPI - benchrail - Versions diffs - 0.1.0__py3-none-any.whl - Mend

benchrail 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of benchrail might be problematic. Click here for more details.

Files changed (26)

benchrail/__init__.py +3 -0
benchrail/adapters/__init__.py +3 -0
benchrail/adapters/base.py +47 -0
benchrail/adapters/claude_code.py +95 -0
benchrail/adapters/codex.py +138 -0
benchrail/cli.py +22 -0
benchrail/commands/__init__.py +0 -0
benchrail/commands/run.py +97 -0
benchrail/commands/validate.py +104 -0
benchrail/commands/version.py +10 -0
benchrail/dto/__init__.py +16 -0
benchrail/dto/config.py +163 -0
benchrail/dto/manifest.py +34 -0
benchrail/dto/result.py +140 -0
benchrail/pricing.py +118 -0
benchrail/registry.py +40 -0
benchrail/runner/__init__.py +0 -0
benchrail/runner/docker.py +472 -0
benchrail/runner/local.py +277 -0
benchrail/runner/logging_util.py +264 -0
benchrail/runner/orchestrator.py +445 -0
benchrail/runner/worker.py +1150 -0
benchrail-0.1.0.dist-info/METADATA +395 -0
benchrail-0.1.0.dist-info/RECORD +26 -0
benchrail-0.1.0.dist-info/WHEEL +4 -0
benchrail-0.1.0.dist-info/entry_points.txt +2 -0

benchrail/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""benchrail package metadata."""
+__version__ = "0.1.0"

benchrail/adapters/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from benchrail.adapters.base import AgentRunResult, BaseAdapter
+__all__ = ["AgentRunResult", "BaseAdapter"]

benchrail/adapters/base.py ADDED Viewed

@@ -0,0 +1,47 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar
+@dataclass
+class AgentRunResult:
+    """Outcome of a single agent invocation, as returned by ``BaseAdapter.parse_result``.
+
+    The first four fields are required. Every other field has a default that
+    stands for "not reported" (``None``, or ``""`` for the session id).
+    """
+    # Process-level facts. Adapters normally pass these through from the
+    # arguments of ``parse_result``; ClaudeCodeAdapter may replace
+    # ``duration_ms`` with the duration the CLI itself reports.
+    exit_code: int
+    stdout: bytes
+    stderr: bytes
+    duration_ms: int
+    # Session/thread identifier parsed from the agent output; "" when absent.
+    agent_session_id: str = ""
+    # Usage counters; None means the adapter could not extract the value.
+    turns: int | None = None
+    input_tokens: int | None = None
+    output_tokens: int | None = None
+    cache_read_tokens: int | None = None
+    cache_creation_tokens: int | None = None
+    reasoning_tokens: int | None = None
+    # Cost figures. NOTE(review): units are presumably US dollars and provider
+    # credits, going by the field names — confirm against benchrail.pricing.
+    cost_usd: float | None = None
+    cost_credits: float | None = None
+    # Model identifier, when the adapter can determine one.
+    model: str | None = None
+class BaseAdapter(ABC):
+    """Common interface for agent CLIs: build an argv list and parse the run output.
+
+    Subclasses supply ``_base_command`` and ``parse_result``; ``FIXED_ARGS``
+    holds the arguments a subclass always passes to its executable.
+    """
+
+    FIXED_ARGS: ClassVar[list[str]] = []
+
+    def __init__(self, extra_args: list[str] | None = None) -> None:
+        """Store *extra_args*; a missing or empty value becomes a fresh empty list."""
+        self.extra_args: list[str] = extra_args if extra_args else []
+
+    def build_command(self, prompt: str, execution_mode: str = "local") -> list[str]:
+        """Return the full argv: base command, then ``extra_args``, then *prompt* last."""
+        head = self._base_command(execution_mode)
+        with_extras = head + self.extra_args
+        return with_extras + [prompt]
+
+    def auth_session_file(self) -> Path | None:
+        """Return the adapter's local auth/session file, or ``None`` if it has none.
+
+        The base implementation always returns ``None``.
+        """
+        return None
+
+    @abstractmethod
+    def _base_command(self, execution_mode: str) -> list[str]:
+        """Return the executable name and leading arguments for *execution_mode*."""
+
+    @abstractmethod
+    def parse_result(
+        self,
+        stdout: bytes,
+        stderr: bytes,
+        exit_code: int,
+        duration_ms: int,
+    ) -> AgentRunResult:
+        """Turn the raw process output into an ``AgentRunResult``."""

benchrail/adapters/claude_code.py ADDED Viewed

@@ -0,0 +1,95 @@
+import json
+from pathlib import Path
+from typing import ClassVar
+from benchrail.adapters.base import AgentRunResult, BaseAdapter
+def _as_dict(value: object) -> dict[str, object] | None:
+    """Return *value* unchanged when it is a ``dict``; otherwise ``None``."""
+    if isinstance(value, dict):
+        return value
+    return None
+def _as_int(value: object) -> int | None:
+    """Return *value* when it is a genuine ``int``; otherwise ``None``.
+
+    ``bool`` is rejected explicitly: it is a subclass of ``int``, so a JSON
+    ``true``/``false`` would otherwise be reported as a turn or token count.
+    """
+    if isinstance(value, bool) or not isinstance(value, int):
+        return None
+    return value
+def _as_float(value: object) -> float | None:
+    """Return *value* converted to ``float`` when it is an ``int`` or ``float``.
+
+    Anything else yields ``None``. ``bool`` is rejected explicitly because it
+    is a subclass of ``int`` and would otherwise turn JSON ``true`` into 1.0.
+    """
+    if isinstance(value, bool) or not isinstance(value, int | float):
+        return None
+    return float(value)
+def _as_str(value: object) -> str | None:
+    """Return *value* unchanged when it is a ``str``; otherwise ``None``."""
+    if isinstance(value, str):
+        return value
+    return None
+class ClaudeCodeAdapter(BaseAdapter):
+    """Adapter that runs the ``claude`` CLI and reads its JSON report from stdout."""
+
+    FIXED_ARGS: ClassVar[list[str]] = [
+        "--print",
+        "--output-format=json",
+        "--no-session-persistence",
+        "--dangerously-skip-permissions",
+    ]
+
+    def _base_command(self, execution_mode: str) -> list[str]:
+        """Return ``claude`` followed by ``FIXED_ARGS``; *execution_mode* is not consulted."""
+        return ["claude", *self.FIXED_ARGS]
+
+    def auth_session_file(self) -> Path | None:
+        """Return ``~/.claude/.credentials.json`` (existence is not checked here)."""
+        return Path.home() / ".claude" / ".credentials.json"
+
+    def parse_result(
+        self,
+        stdout: bytes,
+        stderr: bytes,
+        exit_code: int,
+        duration_ms: int,
+    ) -> AgentRunResult:
+        """Build an ``AgentRunResult`` from the CLI output.
+
+        Parsing is best-effort: when stdout is not a single JSON object, the
+        result carries only the values passed in and every parsed field keeps
+        its "not reported" default.
+
+        Args:
+            stdout: Raw stdout; expected to hold one JSON object.
+            stderr: Raw stderr, stored unchanged.
+            exit_code: Process exit code, stored unchanged.
+            duration_ms: Caller-measured duration; replaced by the payload's
+                integer ``duration_ms`` when one is present.
+        """
+        result = AgentRunResult(
+            exit_code=exit_code,
+            stdout=stdout,
+            stderr=stderr,
+            duration_ms=duration_ms,
+        )
+        # Only the decode step can legitimately fail on agent output, so the
+        # handler is scoped to it instead of wrapping the whole extraction.
+        # JSONDecodeError is a ValueError; RecursionError covers pathologically
+        # nested documents.
+        try:
+            payload = json.loads(stdout.decode("utf-8", errors="replace").strip())
+        except (ValueError, RecursionError):
+            return result
+
+        data = _as_dict(payload)
+        if data is None:
+            # Valid JSON but not an object: nothing to extract.
+            return result
+
+        result.agent_session_id = _as_str(data.get("session_id")) or ""
+        result.turns = _as_int(data.get("num_turns"))
+        result.cost_usd = _as_float(data.get("total_cost_usd"))
+
+        reported_duration = _as_int(data.get("duration_ms"))
+        if reported_duration is not None:
+            result.duration_ms = reported_duration
+
+        usage = _as_dict(data.get("usage")) or {}
+        result.input_tokens = _as_int(usage.get("input_tokens"))
+        result.output_tokens = _as_int(usage.get("output_tokens"))
+        result.cache_read_tokens = _as_int(usage.get("cache_read_input_tokens"))
+        result.cache_creation_tokens = _as_int(usage.get("cache_creation_input_tokens"))
+
+        # The first key of ``modelUsage`` is taken as the model name.
+        model_usage = _as_dict(data.get("modelUsage")) or {}
+        result.model = next(iter(model_usage), None)
+        return result

benchrail/adapters/codex.py ADDED Viewed

@@ -0,0 +1,138 @@
+import json
+from pathlib import Path
+from typing import ClassVar
+from benchrail.adapters.base import AgentRunResult, BaseAdapter
+from benchrail.pricing import calc_codex_cost, calc_codex_credits
+def _as_dict(value: object) -> dict[str, object] | None:
+    """Return *value* itself if it is a ``dict``, else ``None``."""
+    if not isinstance(value, dict):
+        return None
+    return value
+def _as_int(value: object, default: int = 0) -> int:
+    """Return *value* when it is a genuine ``int``; otherwise *default*.
+
+    ``bool`` is rejected explicitly: it is a subclass of ``int``, so a JSON
+    ``true`` would otherwise add 1 to an accumulated token count.
+    """
+    if isinstance(value, bool) or not isinstance(value, int):
+        return default
+    return value
+def _as_str(value: object, default: str = "") -> str:
+    """Return *value* itself if it is a ``str``, else *default*."""
+    if not isinstance(value, str):
+        return default
+    return value
+class CodexAdapter(BaseAdapter):
+    """Adapter that runs ``codex exec --json`` and aggregates its JSONL event stream."""
+
+    FIXED_ARGS: ClassVar[list[str]] = [
+        "exec",
+        "--json",
+        "--ephemeral",
+    ]
+
+    def _base_command(self, execution_mode: str) -> list[str]:
+        """Return ``codex`` + ``FIXED_ARGS`` plus defaults the user did not override.
+
+        If ``extra_args`` already contains the bypass flag, nothing is added.
+        In ``"docker"`` mode the bypass flag is appended. In any other mode the
+        workspace-write sandbox defaults are appended, each one only when
+        ``extra_args`` does not already set it (long or short spelling).
+        """
+        bypass = "--dangerously-bypass-approvals-and-sandbox"
+        args = ["codex", *self.FIXED_ARGS]
+        if self._has_flag(bypass):
+            return args
+        if execution_mode == "docker":
+            args.append(bypass)
+            return args
+        if not self._has_config_override("sandbox_workspace_write.network_access"):
+            args.extend(["-c", "sandbox_workspace_write.network_access=true"])
+        if not self._has_config_override("approval_policy"):
+            args.extend(["-c", 'approval_policy="never"'])
+        # The short aliases must be checked too; otherwise a user-supplied
+        # ``-s``/``-C`` would be joined by a conflicting injected default.
+        if not self._has_flag("--sandbox", "-s"):
+            args.extend(["--sandbox", "workspace-write"])
+        if not self._has_flag("--cd", "-C"):
+            args.extend(["--cd", "."])
+        return args
+
+    def _has_flag(self, name: str, *aliases: str) -> bool:
+        """Return True if ``extra_args`` has *name* or an alias, bare or as ``flag=value``."""
+        spellings = (name, *aliases)
+        prefixes = tuple(f"{flag}=" for flag in spellings)
+        return any(arg in spellings or arg.startswith(prefixes) for arg in self.extra_args)
+
+    def _has_config_override(self, key: str) -> bool:
+        """Return True if ``extra_args`` sets config *key* via ``-c``/``--config``.
+
+        Recognised forms: ``-c key=...``, ``--config key=...`` and
+        ``--config=key=...``.
+        """
+        wanted = f"{key}="
+        for i, arg in enumerate(self.extra_args):
+            if (
+                arg in {"-c", "--config"}
+                and i + 1 < len(self.extra_args)
+                and self.extra_args[i + 1].startswith(wanted)
+            ):
+                return True
+            if arg.startswith("--config="):
+                # Drop the "--config=" part; the rest is "key=value".
+                _, _, value = arg.partition("=")
+                if value.startswith(wanted):
+                    return True
+        return False
+
+    def _extract_model(self) -> str | None:
+        """Return the model named in ``extra_args`` via ``--model``/``-m``, else ``None``.
+
+        Both the separated (``--model X``) and attached (``--model=X``) forms
+        are recognised; the first match wins.
+        """
+        for i, arg in enumerate(self.extra_args):
+            if arg in ("--model", "-m"):
+                if i + 1 < len(self.extra_args):
+                    return self.extra_args[i + 1]
+                continue
+            for prefix in ("--model=", "-m="):
+                if arg.startswith(prefix):
+                    return arg[len(prefix) :]
+        return None
+
+    def auth_session_file(self) -> Path | None:
+        """Return ``~/.codex/auth.json`` (existence is not checked here)."""
+        return Path.home() / ".codex" / "auth.json"
+
+    def parse_result(
+        self,
+        stdout: bytes,
+        stderr: bytes,
+        exit_code: int,
+        duration_ms: int,
+    ) -> AgentRunResult:
+        """Aggregate the JSONL events on stdout into an ``AgentRunResult``.
+
+        Lines that are blank, are not valid JSON, or are not JSON objects are
+        skipped. ``thread.started`` supplies the session id; each
+        ``turn.completed`` adds one turn and its ``usage`` counters. Counters
+        that stay at zero are reported as ``None``. Costs are computed only
+        when a model was found in ``extra_args``.
+        """
+        session_id = ""
+        turns = 0
+        input_tokens = 0
+        output_tokens = 0
+        cache_read_tokens = 0
+        reasoning_tokens = 0
+        model = self._extract_model()
+
+        for raw_line in stdout.decode("utf-8", errors="replace").splitlines():
+            line = raw_line.strip()
+            if not line:
+                continue
+            # Decoding is the only step that can fail on agent output, so the
+            # handler is per line; one bad line never discards later events.
+            try:
+                payload = json.loads(line)
+            except (ValueError, RecursionError):
+                continue
+            event = _as_dict(payload)
+            if event is None:
+                continue
+            etype = _as_str(event.get("type"))
+            if etype == "thread.started":
+                session_id = _as_str(event.get("thread_id"))
+            elif etype == "turn.completed":
+                turns += 1
+                usage = _as_dict(event.get("usage")) or {}
+                input_tokens += _as_int(usage.get("input_tokens"))
+                output_tokens += _as_int(usage.get("output_tokens"))
+                cache_read_tokens += _as_int(usage.get("cached_input_tokens"))
+                reasoning_tokens += _as_int(usage.get("reasoning_output_tokens"))
+
+        cost_usd: float | None = None
+        cost_credits: float | None = None
+        if model:
+            cost_usd = calc_codex_cost(model, input_tokens, output_tokens, cache_read_tokens)
+            cost_credits = calc_codex_credits(model, input_tokens, output_tokens, cache_read_tokens)
+
+        return AgentRunResult(
+            exit_code=exit_code,
+            stdout=stdout,
+            stderr=stderr,
+            duration_ms=duration_ms,
+            agent_session_id=session_id,
+            turns=turns if turns > 0 else None,
+            input_tokens=input_tokens if input_tokens > 0 else None,
+            output_tokens=output_tokens if output_tokens > 0 else None,
+            cache_read_tokens=cache_read_tokens if cache_read_tokens > 0 else None,
+            reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
+            cost_usd=cost_usd,
+            cost_credits=cost_credits,
+            model=model,
+        )

benchrail/cli.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""CLI entry point: registers all subcommands."""
+import typer
+from benchrail.commands.run import run_cmd
+from benchrail.commands.validate import validate_cmd
+from benchrail.commands.version import version_cmd
+# Root Typer application for the ``benchrail`` command.
+app = typer.Typer(
+    add_completion=False,
+    no_args_is_help=True,
+    name="benchrail",
+    help="Reproducible benchmark harness for coding agents.",
+)
+# Register each subcommand under its CLI name, in the same order as before.
+for _cli_name, _handler in (
+    ("run", run_cmd),
+    ("validate", validate_cmd),
+    ("version", version_cmd),
+):
+    app.command(_cli_name)(_handler)
+if __name__ == "__main__":
+    app()

benchrail/commands/__init__.py ADDED Viewed

File without changes

benchrail/commands/run.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""CLI command: benchrail run"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Annotated
+import typer
+def _split_csv(raw: str | None) -> list[str] | None:
+    """Split a comma-separated option into trimmed, non-empty tokens.
+
+    Returns ``None`` (meaning "no filter") when *raw* is missing or holds no
+    usable token, so stray commas never become an empty-string id.
+    """
+    if not raw:
+        return None
+    tokens = [token.strip() for token in raw.split(",")]
+    return [token for token in tokens if token] or None
+
+
+def run_cmd(
+    dataset: Annotated[
+        Path,
+        typer.Option("--dataset", help="Path to dataset directory"),
+    ] = Path("dataset"),
+    mode: Annotated[
+        str,
+        typer.Option("--mode", help="Execution mode: local|docker"),
+    ] = "local",
+    workspace: Annotated[
+        Path,
+        typer.Option("--workspace", help="Workspace root directory (default: current dir)"),
+    ] = Path("."),
+    agents: Annotated[
+        str | None,
+        typer.Option("--agents", help="Comma-separated agent ids to run"),
+    ] = None,
+    instance_ids: Annotated[
+        str | None,
+        typer.Option("--instance_ids", help="Comma-separated instance ids to run"),
+    ] = None,
+    output: Annotated[
+        Path | None,
+        typer.Option("--output", help="Output directory for result files"),
+    ] = None,
+    logs: Annotated[
+        Path | None,
+        typer.Option("--logs", help="Directory for runner logs and step stdout/stderr files"),
+    ] = None,
+    run_id: Annotated[
+        str | None,
+        typer.Option("--run_id", help="Run identifier (auto-generated if omitted)"),
+    ] = None,
+    max_workers: Annotated[
+        int | None,
+        typer.Option("--workers", help="Maximum parallel workers"),
+    ] = None,
+    auth_session: Annotated[
+        bool,
+        typer.Option(
+            "--auth-session",
+            help="In docker mode, copy the local auth session file into the task container",
+        ),
+    ] = False,
+) -> None:
+    """Run benchmark on a dataset and evaluate results."""
+    # NOTE: the docstring above is kept short and unchanged because it is
+    # presumably what Typer shows as this command's help text.
+    #
+    # Exit codes: 2 for invalid options or a ConfigError, 1 for any other
+    # failure, otherwise the exit code returned by run_benchmark.
+
+    # Imported here rather than at module top, so the orchestrator is loaded
+    # only when this command actually runs.
+    from benchrail.runner.orchestrator import ConfigError, run_benchmark
+
+    # Validate the options before doing any work.
+    if mode not in ("local", "docker"):
+        typer.echo(f"Error: --mode must be 'local' or 'docker', got {mode!r}", err=True)
+        raise typer.Exit(2)
+    if not dataset.is_dir():
+        typer.echo(f"Error: --dataset does not point to a directory: {dataset}", err=True)
+        raise typer.Exit(2)
+    if not (dataset / "manifest.json").exists():
+        typer.echo(f"Error: manifest.json not found in {dataset}", err=True)
+        raise typer.Exit(2)
+    if max_workers is not None and max_workers <= 0:
+        typer.echo("Error: --workers must be a positive integer", err=True)
+        raise typer.Exit(2)
+
+    # Empty tokens (e.g. "a,,b" or a trailing comma) are dropped, not passed
+    # on as "" ids.
+    filter_agents = _split_csv(agents)
+    filter_instances = _split_csv(instance_ids)
+
+    try:
+        _, exit_code = run_benchmark(
+            dataset_path=dataset.resolve(),
+            workspace=workspace.resolve(),
+            mode=mode,
+            run_id=run_id,
+            filter_agents=filter_agents,
+            filter_instances=filter_instances,
+            output=output.resolve() if output else None,
+            logs=logs.resolve() if logs else None,
+            max_workers=max_workers,
+            auth_session=auth_session,
+        )
+    except ConfigError as e:
+        typer.echo(f"Configuration error: {e}", err=True)
+        raise typer.Exit(2) from None
+    except Exception as e:
+        # Top-level CLI boundary: report the failure and map it to exit code 1.
+        typer.echo(f"Fatal error: {e}", err=True)
+        raise typer.Exit(1) from None
+    raise typer.Exit(exit_code)

benchrail/commands/validate.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""CLI command: benchrail validate"""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Annotated
+import typer
+def _load_json_object(path: Path) -> dict[str, object]:
+    """Read *path* as UTF-8 JSON and return it, insisting on a top-level object.
+
+    Raises:
+        ValueError: If the document's top level is not a JSON object; the
+            message names only the file's base name.
+    """
+    raw_text = path.read_text(encoding="utf-8")
+    document = json.loads(raw_text)
+    if isinstance(document, dict):
+        return document
+    raise ValueError(f"{path.name} must contain a JSON object")
+def validate_cmd(
+    dataset: Annotated[
+        Path,
+        typer.Option("--dataset", help="Path to dataset directory"),
+    ] = Path("dataset"),
+) -> None:
+    """Validate dataset structure and all configuration files."""
+    # NOTE: the docstring above is kept short and unchanged because it is
+    # presumably what Typer shows as this command's help text.
+    #
+    # Problems are collected rather than raised so that one run reports all of
+    # them; the command exits with code 2 if any error was recorded.
+    from benchrail.dto.config import InstanceConfig
+    from benchrail.dto.manifest import Manifest
+    from benchrail.registry import AGENT_REGISTRY
+
+    errors: list[str] = []
+    warnings: list[str] = []
+
+    if not dataset.is_dir():
+        typer.echo(f"Error: dataset path does not exist: {dataset}", err=True)
+        raise typer.Exit(2)
+    manifest_file = dataset / "manifest.json"
+    if not manifest_file.exists():
+        typer.echo("Error: manifest.json not found", err=True)
+        raise typer.Exit(2)
+
+    # --- manifest.json ---
+    manifest: Manifest | None = None
+    try:
+        data = _load_json_object(manifest_file)
+        manifest = Manifest.model_validate(data)
+        typer.echo(f"  manifest.json  OK  ({len(manifest.agents)} agents)")
+    except Exception as e:
+        # Validation boundary: any read/parse/schema failure becomes a
+        # reported error instead of a traceback.
+        errors.append(f"manifest.json invalid: {e}")
+    if manifest:
+        for agent in manifest.agents:
+            if agent.agent not in AGENT_REGISTRY:
+                errors.append(
+                    f"manifest: agent type {agent.agent!r} (id={agent.id!r}) not in AGENT_REGISTRY"
+                )
+
+    # --- per-instance config.json ---
+    # An instance is any direct subdirectory that contains a config.json.
+    instance_count = 0
+    for item in sorted(dataset.iterdir()):
+        if not item.is_dir():
+            continue
+        config_file = item / "config.json"
+        if not config_file.exists():
+            continue
+        instance_count += 1
+        # Used to tell whether this instance recorded any error of its own.
+        errors_before = len(errors)
+        try:
+            data = _load_json_object(config_file)
+            config = InstanceConfig.model_validate(data)
+        except Exception as e:
+            errors.append(f"{item.name}/config.json invalid: {e}")
+            continue
+        if config.instance_id != item.name:
+            errors.append(
+                f"{item.name}: instance_id {config.instance_id!r} does not match directory name"
+            )
+        try:
+            config.resolve_patch_paths(item)
+        except ValueError as e:
+            errors.append(f"{item.name}: {e}")
+        try:
+            config.docker.resolve_dockerfile_path(item)
+        except ValueError as e:
+            errors.append(f"{item.name}: {e}")
+        # Report OK only when every check for this instance passed.
+        if len(errors) == errors_before:
+            typer.echo(f"  {item.name}/config.json  OK")
+
+    if instance_count == 0:
+        warnings.append("No instances found in dataset")
+    typer.echo(f"\nTotal: {instance_count} instance(s)")
+    if warnings:
+        typer.echo("")
+        for w in warnings:
+            typer.echo(f"  WARNING: {w}")
+    if errors:
+        typer.echo("")
+        for err in errors:
+            typer.echo(f"  ERROR: {err}", err=True)
+        raise typer.Exit(2)
+    typer.echo("\nDataset is valid.")

benchrail/commands/version.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""CLI command for printing the benchrail version."""
+import typer
+from benchrail import __version__
+def version_cmd() -> None:
+    """Print the installed benchrail version."""
+    # NOTE(review): the docstring is presumably what Typer shows as this
+    # command's help text, so it is left short and unchanged.
+    # Emits the ``__version__`` string imported from the ``benchrail`` package.
+    typer.echo(__version__)

benchrail/dto/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+from benchrail.dto.config import CheckCommand, HookConfig, HooksConfig, InstanceConfig
+from benchrail.dto.manifest import AgentEntry, Manifest
+from benchrail.dto.result import AgentStats, CheckResult, InstanceResult, RunResult
+__all__ = [
+    "AgentEntry",
+    "AgentStats",
+    "CheckCommand",
+    "CheckResult",
+    "HookConfig",
+    "HooksConfig",
+    "InstanceConfig",
+    "InstanceResult",
+    "Manifest",
+    "RunResult",
+]