benchrail 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of benchrail might be problematic. Click here for more details.

benchrail/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """benchrail package metadata."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from benchrail.adapters.base import AgentRunResult, BaseAdapter
2
+
3
+ __all__ = ["AgentRunResult", "BaseAdapter"]
@@ -0,0 +1,47 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import ClassVar
5
+
6
+
7
+ @dataclass
8
+ class AgentRunResult:
9
+ exit_code: int
10
+ stdout: bytes
11
+ stderr: bytes
12
+ duration_ms: int
13
+ agent_session_id: str = ""
14
+ turns: int | None = None
15
+ input_tokens: int | None = None
16
+ output_tokens: int | None = None
17
+ cache_read_tokens: int | None = None
18
+ cache_creation_tokens: int | None = None
19
+ reasoning_tokens: int | None = None
20
+ cost_usd: float | None = None
21
+ cost_credits: float | None = None
22
+ model: str | None = None
23
+
24
+
25
class BaseAdapter(ABC):
    """Abstract base class for agent adapters.

    A concrete adapter provides the command prefix (``_base_command``) and
    the conversion of captured process output into an ``AgentRunResult``
    (``parse_result``).
    """

    # Arguments a subclass always passes; empty at this level.
    FIXED_ARGS: ClassVar[list[str]] = []

    def __init__(self, extra_args: list[str] | None = None) -> None:
        """Remember caller-supplied arguments; a falsy value becomes ``[]``."""
        self.extra_args: list[str] = extra_args if extra_args else []

    def build_command(self, prompt: str, execution_mode: str = "local") -> list[str]:
        """Return the base command, then ``extra_args``, then ``prompt`` last."""
        prefix = self._base_command(execution_mode)
        tail = [prompt]
        return prefix + self.extra_args + tail

    def auth_session_file(self) -> Path | None:
        """Return the adapter's auth session file; this base version has none."""
        return None

    @abstractmethod
    def _base_command(self, execution_mode: str) -> list[str]:
        """Return the command prefix for the given execution mode."""
        ...

    @abstractmethod
    def parse_result(
        self,
        stdout: bytes,
        stderr: bytes,
        exit_code: int,
        duration_ms: int,
    ) -> AgentRunResult:
        """Build an ``AgentRunResult`` from the captured process outcome."""
        ...
@@ -0,0 +1,95 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import ClassVar
4
+
5
+ from benchrail.adapters.base import AgentRunResult, BaseAdapter
6
+
7
+
8
+ def _as_dict(value: object) -> dict[str, object] | None:
9
+ return value if isinstance(value, dict) else None
10
+
11
+
12
+ def _as_int(value: object) -> int | None:
13
+ return value if isinstance(value, int) else None
14
+
15
+
16
+ def _as_float(value: object) -> float | None:
17
+ return float(value) if isinstance(value, int | float) else None
18
+
19
+
20
+ def _as_str(value: object) -> str | None:
21
+ return value if isinstance(value, str) else None
22
+
23
+
24
class ClaudeCodeAdapter(BaseAdapter):
    """Adapter that invokes the ``claude`` executable and reads its JSON output."""

    FIXED_ARGS: ClassVar[list[str]] = [
        "--print",
        "--output-format=json",
        "--no-session-persistence",
        "--dangerously-skip-permissions",
    ]

    def _base_command(self, execution_mode: str) -> list[str]:
        """Return ``claude`` followed by ``FIXED_ARGS``; the mode is not consulted."""
        return ["claude"] + list(self.FIXED_ARGS)

    def auth_session_file(self) -> Path | None:
        """Return ``~/.claude/.credentials.json`` under the current home directory."""
        return Path.home().joinpath(".claude", ".credentials.json")

    def parse_result(
        self,
        stdout: bytes,
        stderr: bytes,
        exit_code: int,
        duration_ms: int,
    ) -> AgentRunResult:
        """Build an ``AgentRunResult`` from the process outcome.

        ``stdout`` is expected to hold a single JSON object. Any field that is
        missing or has an unexpected type is left at its default. When parsing
        fails, whatever was extracted before the failure is kept and the raw
        process data is still returned.
        """
        # Start from the raw process data; optional fields keep their defaults
        # until the JSON report supplies them.
        result = AgentRunResult(
            exit_code=exit_code,
            stdout=stdout,
            stderr=stderr,
            duration_ms=duration_ms,
        )

        try:
            decoded = stdout.decode("utf-8", errors="replace").strip()
            report = _as_dict(json.loads(decoded))
            if report is None:
                raise ValueError("Claude output must be a JSON object")

            result.agent_session_id = _as_str(report.get("session_id")) or ""
            result.turns = _as_int(report.get("num_turns"))
            result.cost_usd = _as_float(report.get("total_cost_usd"))

            # A well-typed duration in the report overrides the caller's value.
            if "duration_ms" in report:
                reported_duration = _as_int(report["duration_ms"])
                if reported_duration is not None:
                    result.duration_ms = reported_duration

            usage = _as_dict(report.get("usage")) or {}
            result.input_tokens = _as_int(usage.get("input_tokens"))
            result.output_tokens = _as_int(usage.get("output_tokens"))
            result.cache_read_tokens = _as_int(usage.get("cache_read_input_tokens"))
            result.cache_creation_tokens = _as_int(usage.get("cache_creation_input_tokens"))

            # The first key of "modelUsage" is taken as the model name.
            per_model = _as_dict(report.get("modelUsage")) or {}
            if per_model:
                result.model = next(iter(per_model), None)
        except Exception:
            # Best effort: unparsable output must not hide the raw run result.
            pass

        return result
@@ -0,0 +1,138 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import ClassVar
4
+
5
+ from benchrail.adapters.base import AgentRunResult, BaseAdapter
6
+ from benchrail.pricing import calc_codex_cost, calc_codex_credits
7
+
8
+
9
+ def _as_dict(value: object) -> dict[str, object] | None:
10
+ return value if isinstance(value, dict) else None
11
+
12
+
13
+ def _as_int(value: object, default: int = 0) -> int:
14
+ return value if isinstance(value, int) else default
15
+
16
+
17
+ def _as_str(value: object, default: str = "") -> str:
18
+ return value if isinstance(value, str) else default
19
+
20
+
21
class CodexAdapter(BaseAdapter):
    """Adapter that runs ``codex exec --json`` and aggregates its event stream.

    Options found in ``extra_args`` take precedence over the defaults this
    adapter would otherwise add. Both the long spellings and the short
    spellings ``-m`` (``--model``), ``-s`` (``--sandbox``) and ``-C``
    (``--cd``) are recognised, so a caller-supplied short option is neither
    ignored nor duplicated.
    """

    FIXED_ARGS: ClassVar[list[str]] = [
        "exec",
        "--json",
        "--ephemeral",
    ]

    def _base_command(self, execution_mode: str) -> list[str]:
        """Return the ``codex`` command prefix for ``execution_mode``.

        When the caller already passes the bypass flag, nothing is added. In
        ``"docker"`` mode the bypass flag is appended. In any other mode the
        network-access, approval-policy, sandbox and working-directory
        defaults are appended, each only if the caller did not set it.
        """
        args = ["codex", *self.FIXED_ARGS]
        bypass_flag = "--dangerously-bypass-approvals-and-sandbox"

        # The caller's explicit bypass wins in every mode.
        if self._has_flag(bypass_flag):
            return args

        if execution_mode == "docker":
            args.append(bypass_flag)
            return args

        if not self._has_config_override("sandbox_workspace_write.network_access"):
            args.extend(["-c", "sandbox_workspace_write.network_access=true"])
        if not self._has_config_override("approval_policy"):
            args.extend(["-c", 'approval_policy="never"'])
        # Also honour the short spellings so the option is not passed twice.
        if not self._has_flag("--sandbox", "-s"):
            args.extend(["--sandbox", "workspace-write"])
        if not self._has_flag("--cd", "-C"):
            args.extend(["--cd", "."])

        return args

    def _has_flag(self, name: str, *aliases: str) -> bool:
        """Return True when ``extra_args`` contains ``name`` or any of ``aliases``.

        A spelling matches either as a bare argument (``--flag``) or with an
        attached value (``--flag=value``).
        """
        spellings = (name, *aliases)
        return any(
            arg == spelling or arg.startswith(f"{spelling}=")
            for arg in self.extra_args
            for spelling in spellings
        )

    def _has_config_override(self, key: str) -> bool:
        """Return True when ``extra_args`` sets config ``key``.

        Recognised forms are ``-c key=...``, ``--config key=...`` and
        ``--config=key=...``.
        """
        for i, arg in enumerate(self.extra_args):
            if (
                arg in {"-c", "--config"}
                and i + 1 < len(self.extra_args)
                and self.extra_args[i + 1].startswith(f"{key}=")
            ):
                return True
            if arg.startswith("--config="):
                _, _, value = arg.partition("=")
                if value.startswith(f"{key}="):
                    return True
        return False

    def _extract_model(self) -> str | None:
        """Return the model named in ``extra_args``, or ``None`` when absent.

        Accepts ``--model NAME``, ``-m NAME``, ``--model=NAME`` and ``-m=NAME``.
        """
        for i, arg in enumerate(self.extra_args):
            if arg in {"--model", "-m"} and i + 1 < len(self.extra_args):
                return self.extra_args[i + 1]
            for prefix in ("--model=", "-m="):
                if arg.startswith(prefix):
                    return arg[len(prefix) :]
        return None

    def auth_session_file(self) -> Path | None:
        """Return ``~/.codex/auth.json`` under the current home directory."""
        return Path.home() / ".codex" / "auth.json"

    def parse_result(
        self,
        stdout: bytes,
        stderr: bytes,
        exit_code: int,
        duration_ms: int,
    ) -> AgentRunResult:
        """Build an ``AgentRunResult`` from newline-delimited JSON events.

        Each non-empty ``stdout`` line is parsed as JSON. Lines that are not
        valid JSON or not JSON objects are skipped. ``thread.started`` supplies
        the session id. Every ``turn.completed`` increments the turn count
        and adds its usage figures to the running totals. Totals that remain
        zero are reported as ``None``. Costs are computed only when a model
        was named in ``extra_args``.
        """
        session_id = ""
        turns = 0
        input_tokens = 0
        output_tokens = 0
        cache_read_tokens = 0
        reasoning_tokens = 0
        model = self._extract_model()

        try:
            text = stdout.decode("utf-8", errors="replace")
            for line in text.splitlines():
                line = line.strip()
                if not line:
                    continue
                try:
                    payload = json.loads(line)
                except json.JSONDecodeError:
                    continue
                event = _as_dict(payload)
                if event is None:
                    continue

                etype = _as_str(event.get("type"))
                if etype == "thread.started":
                    session_id = _as_str(event.get("thread_id"))
                elif etype == "turn.completed":
                    turns += 1
                    usage = _as_dict(event.get("usage")) or {}
                    input_tokens += _as_int(usage.get("input_tokens"))
                    output_tokens += _as_int(usage.get("output_tokens"))
                    cache_read_tokens += _as_int(usage.get("cached_input_tokens"))
                    reasoning_tokens += _as_int(usage.get("reasoning_output_tokens"))
        except Exception:
            # Best effort: keep whatever was accumulated before the failure so
            # the raw run result is still returned.
            pass

        cost_usd: float | None = None
        cost_credits: float | None = None
        if model:
            cost_usd = calc_codex_cost(model, input_tokens, output_tokens, cache_read_tokens)
            cost_credits = calc_codex_credits(model, input_tokens, output_tokens, cache_read_tokens)

        return AgentRunResult(
            exit_code=exit_code,
            stdout=stdout,
            stderr=stderr,
            duration_ms=duration_ms,
            agent_session_id=session_id,
            turns=turns if turns > 0 else None,
            input_tokens=input_tokens if input_tokens > 0 else None,
            output_tokens=output_tokens if output_tokens > 0 else None,
            cache_read_tokens=cache_read_tokens if cache_read_tokens > 0 else None,
            reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
            cost_usd=cost_usd,
            cost_credits=cost_credits,
            model=model,
        )
benchrail/cli.py ADDED
@@ -0,0 +1,22 @@
1
+ """CLI entry point: registers all subcommands."""
2
+
3
+ import typer
4
+
5
+ from benchrail.commands.run import run_cmd
6
+ from benchrail.commands.validate import validate_cmd
7
+ from benchrail.commands.version import version_cmd
8
+
9
# Root Typer application; the subcommands are attached right below.
app = typer.Typer(
    name="benchrail",
    help="Reproducible benchmark harness for coding agents.",
    no_args_is_help=True,
    add_completion=False,
)

# Subcommands, registered in this order: run, validate, version.
app.command(name="run")(run_cmd)
app.command(name="validate")(validate_cmd)
app.command(name="version")(version_cmd)


if __name__ == "__main__":
    # Allow running this module directly as a script.
    app()
File without changes
@@ -0,0 +1,97 @@
1
+ """CLI command: benchrail run"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Annotated
7
+
8
+ import typer
9
+
10
+
11
def run_cmd(
    dataset: Annotated[
        Path,
        typer.Option("--dataset", help="Path to dataset directory"),
    ] = Path("dataset"),
    mode: Annotated[
        str,
        typer.Option("--mode", help="Execution mode: local|docker"),
    ] = "local",
    workspace: Annotated[
        Path,
        typer.Option("--workspace", help="Workspace root directory (default: current dir)"),
    ] = Path("."),
    agents: Annotated[
        str | None,
        typer.Option("--agents", help="Comma-separated agent ids to run"),
    ] = None,
    instance_ids: Annotated[
        str | None,
        typer.Option("--instance_ids", help="Comma-separated instance ids to run"),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option("--output", help="Output directory for result files"),
    ] = None,
    logs: Annotated[
        Path | None,
        typer.Option("--logs", help="Directory for runner logs and step stdout/stderr files"),
    ] = None,
    run_id: Annotated[
        str | None,
        typer.Option("--run_id", help="Run identifier (auto-generated if omitted)"),
    ] = None,
    max_workers: Annotated[
        int | None,
        typer.Option("--workers", help="Maximum parallel workers"),
    ] = None,
    auth_session: Annotated[
        bool,
        typer.Option(
            "--auth-session",
            help="In docker mode, copy the local auth session file into the task container",
        ),
    ] = False,
) -> None:
    """Run benchmark on a dataset and evaluate results.

    Invalid options and ``ConfigError`` exit with status 2, any other
    exception exits with status 1, and a completed run exits with the
    status returned by ``run_benchmark``.
    """
    from benchrail.runner.orchestrator import ConfigError, run_benchmark

    # --- Option validation: each failure prints to stderr and exits with 2.
    if mode not in {"local", "docker"}:
        typer.echo(f"Error: --mode must be 'local' or 'docker', got {mode!r}", err=True)
        raise typer.Exit(2)

    if not dataset.is_dir():
        typer.echo(f"Error: --dataset does not point to a directory: {dataset}", err=True)
        raise typer.Exit(2)

    manifest_path = dataset / "manifest.json"
    if not manifest_path.exists():
        typer.echo(f"Error: manifest.json not found in {dataset}", err=True)
        raise typer.Exit(2)

    if max_workers is not None and max_workers <= 0:
        typer.echo("Error: --workers must be a positive integer", err=True)
        raise typer.Exit(2)

    # --- Comma-separated filters become lists of stripped tokens, or None.
    filter_agents = None
    if agents:
        filter_agents = [token.strip() for token in agents.split(",")]

    filter_instances = None
    if instance_ids:
        filter_instances = [token.strip() for token in instance_ids.split(",")]

    try:
        # Paths are resolved inside the try so a resolution failure is
        # reported through the same "Fatal error" path as a failed run.
        resolved_dataset = dataset.resolve()
        resolved_workspace = workspace.resolve()
        resolved_output = output.resolve() if output else None
        resolved_logs = logs.resolve() if logs else None
        _, exit_code = run_benchmark(
            dataset_path=resolved_dataset,
            workspace=resolved_workspace,
            mode=mode,
            run_id=run_id,
            filter_agents=filter_agents,
            filter_instances=filter_instances,
            output=resolved_output,
            logs=resolved_logs,
            max_workers=max_workers,
            auth_session=auth_session,
        )
    except ConfigError as config_error:
        typer.echo(f"Configuration error: {config_error}", err=True)
        raise typer.Exit(2) from None
    except Exception as failure:
        typer.echo(f"Fatal error: {failure}", err=True)
        raise typer.Exit(1) from None

    raise typer.Exit(exit_code)
@@ -0,0 +1,104 @@
1
+ """CLI command: benchrail validate"""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Annotated
8
+
9
+ import typer
10
+
11
+
12
+ def _load_json_object(path: Path) -> dict[str, object]:
13
+ payload = json.loads(path.read_text(encoding="utf-8"))
14
+ if not isinstance(payload, dict):
15
+ raise ValueError(f"{path.name} must contain a JSON object")
16
+ return payload
17
+
18
+
19
def validate_cmd(
    dataset: Annotated[
        Path,
        typer.Option("--dataset", help="Path to dataset directory"),
    ] = Path("dataset"),
) -> None:
    """Validate dataset structure and all configuration files.

    Exits with status 2 when the dataset directory or ``manifest.json`` is
    missing, or when at least one validation error was collected. Warnings
    alone do not change the exit status.
    """
    from benchrail.dto.config import InstanceConfig
    from benchrail.dto.manifest import Manifest
    from benchrail.registry import AGENT_REGISTRY

    problems: list[str] = []
    notices: list[str] = []

    # --- Preconditions: without these nothing else can be checked.
    if not dataset.is_dir():
        typer.echo(f"Error: dataset path does not exist: {dataset}", err=True)
        raise typer.Exit(2)

    manifest_path = dataset / "manifest.json"
    if not manifest_path.exists():
        typer.echo("Error: manifest.json not found", err=True)
        raise typer.Exit(2)

    # --- Manifest: a parse or validation failure is recorded, not fatal.
    manifest: Manifest | None = None
    try:
        manifest_payload = _load_json_object(manifest_path)
        manifest = Manifest.model_validate(manifest_payload)
        typer.echo(f" manifest.json OK ({len(manifest.agents)} agents)")
    except Exception as exc:
        problems.append(f"manifest.json invalid: {exc}")

    if manifest:
        for entry in manifest.agents:
            if entry.agent in AGENT_REGISTRY:
                continue
            problems.append(
                f"manifest: agent type {entry.agent!r} (id={entry.id!r}) not in AGENT_REGISTRY"
            )

    # --- Instances: every sub-directory that contains a config.json.
    instance_count = 0
    for item in sorted(dataset.iterdir()):
        config_path = item / "config.json"
        if not (item.is_dir() and config_path.exists()):
            continue

        instance_count += 1
        try:
            config_payload = _load_json_object(config_path)
            config = InstanceConfig.model_validate(config_payload)
        except Exception as exc:
            problems.append(f"{item.name}/config.json invalid: {exc}")
            continue

        if config.instance_id != item.name:
            problems.append(
                f"{item.name}: instance_id {config.instance_id!r} does not match directory name"
            )

        # The two path checks are independent; a failure of the first does
        # not prevent the second from running.
        try:
            config.resolve_patch_paths(item)
        except ValueError as exc:
            problems.append(f"{item.name}: {exc}")
        try:
            config.docker.resolve_dockerfile_path(item)
        except ValueError as exc:
            problems.append(f"{item.name}: {exc}")

        typer.echo(f" {item.name}/config.json OK")

    if not instance_count:
        notices.append("No instances found in dataset")

    # --- Report: totals, then warnings, then errors.
    typer.echo(f"\nTotal: {instance_count} instance(s)")

    if notices:
        typer.echo("")
        for notice in notices:
            typer.echo(f" WARNING: {notice}")

    if not problems:
        typer.echo("\nDataset is valid.")
        return

    typer.echo("")
    for problem in problems:
        typer.echo(f" ERROR: {problem}", err=True)
    raise typer.Exit(2)
@@ -0,0 +1,10 @@
1
+ """CLI command for printing the benchrail version."""
2
+
3
+ import typer
4
+
5
+ from benchrail import __version__
6
+
7
+
8
def version_cmd() -> None:
    """Echo the ``__version__`` string of the benchrail package."""
    installed_version = __version__
    typer.echo(installed_version)
@@ -0,0 +1,16 @@
1
+ from benchrail.dto.config import CheckCommand, HookConfig, HooksConfig, InstanceConfig
2
+ from benchrail.dto.manifest import AgentEntry, Manifest
3
+ from benchrail.dto.result import AgentStats, CheckResult, InstanceResult, RunResult
4
+
5
+ __all__ = [
6
+ "AgentEntry",
7
+ "AgentStats",
8
+ "CheckCommand",
9
+ "CheckResult",
10
+ "HookConfig",
11
+ "HooksConfig",
12
+ "InstanceConfig",
13
+ "InstanceResult",
14
+ "Manifest",
15
+ "RunResult",
16
+ ]