tokenjam-bench 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tjbench/__init__.py +6 -0
- tjbench/agent_pipeline.py +117 -0
- tjbench/agents/__init__.py +25 -0
- tjbench/agents/runner.py +66 -0
- tjbench/agents/swe_bench_tools.py +296 -0
- tjbench/agents/tools.py +63 -0
- tjbench/agents/trace.py +72 -0
- tjbench/agents/validation.py +68 -0
- tjbench/bench_meta.py +2 -0
- tjbench/benchmarks/__init__.py +65 -0
- tjbench/benchmarks/agent_base.py +37 -0
- tjbench/benchmarks/base.py +37 -0
- tjbench/benchmarks/gsm8k.py +48 -0
- tjbench/benchmarks/humaneval.py +58 -0
- tjbench/benchmarks/judged.py +74 -0
- tjbench/benchmarks/real_scenarios.py +144 -0
- tjbench/benchmarks/sample_agent.py +107 -0
- tjbench/benchmarks/samples.py +73 -0
- tjbench/benchmarks/scenario_lib.py +108 -0
- tjbench/benchmarks/scenario_suites.py +153 -0
- tjbench/benchmarks/scoring.py +63 -0
- tjbench/benchmarks/swe_bench_lite.py +288 -0
- tjbench/ci_benchmark.py +108 -0
- tjbench/cli.py +647 -0
- tjbench/cost.py +44 -0
- tjbench/dashboard.py +980 -0
- tjbench/deepeval_judge.py +137 -0
- tjbench/exec_sandbox.py +54 -0
- tjbench/history.py +290 -0
- tjbench/judge.py +116 -0
- tjbench/matrix.py +170 -0
- tjbench/models/__init__.py +7 -0
- tjbench/models/anthropic_agent_client.py +114 -0
- tjbench/models/anthropic_client.py +52 -0
- tjbench/models/base.py +30 -0
- tjbench/models/google_client.py +41 -0
- tjbench/models/mock_agent_client.py +129 -0
- tjbench/models/mock_client.py +73 -0
- tjbench/models/openai_client.py +42 -0
- tjbench/models/openai_compatible.py +208 -0
- tjbench/models/registry.py +50 -0
- tjbench/models/tool_calling.py +51 -0
- tjbench/pipeline.py +218 -0
- tjbench/recommend.py +28 -0
- tjbench/replay.py +139 -0
- tjbench/replay_pipeline.py +151 -0
- tjbench/report.py +172 -0
- tjbench/report_html.py +322 -0
- tjbench/stats.py +96 -0
- tjbench/version.py +43 -0
- tjbench/workflows/__init__.py +151 -0
- tjbench/workflows/agentic.py +119 -0
- tokenjam_bench-0.1.0.dist-info/METADATA +410 -0
- tokenjam_bench-0.1.0.dist-info/RECORD +58 -0
- tokenjam_bench-0.1.0.dist-info/WHEEL +4 -0
- tokenjam_bench-0.1.0.dist-info/entry_points.txt +2 -0
- tokenjam_bench-0.1.0.dist-info/licenses/LICENSE +22 -0
- tokenjam_bench-0.1.0.dist-info/licenses/NOTICE +7 -0
tjbench/__init__.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Agent proof pipeline.
|
|
2
|
+
|
|
3
|
+
Runs an agent benchmark on the ORIGINAL model and the CANDIDATE model (TokenJam's
|
|
4
|
+
recommendation), each via the multi-turn AgentRunner, scores each run on its
|
|
5
|
+
trace (answer correctness + tool-call validation incl. the safety gate), prices
|
|
6
|
+
the summed multi-turn token usage, and feeds the per-task outcomes into the SAME
|
|
7
|
+
assembler the single-shot path uses — so Wilson CIs, McNemar, and cost
|
|
8
|
+
validation apply unchanged.
|
|
9
|
+
|
|
10
|
+
This is the payoff of the keystone: agent benchmarks inherit all the statistical
|
|
11
|
+
rigor for free, and a candidate that takes an unsafe action fails the task even
|
|
12
|
+
when its answer text is correct.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from tjbench.agents.runner import AgentRunner
|
|
17
|
+
from tjbench.benchmarks import get_agent_benchmark
|
|
18
|
+
from tjbench.cost import price_completion
|
|
19
|
+
from tjbench.models.anthropic_agent_client import get_tool_calling_client
|
|
20
|
+
from tjbench.models.registry import parse_spec
|
|
21
|
+
from tjbench.pipeline import assemble_proof, resolve_candidate
|
|
22
|
+
from tjbench.report import TaskOutcome
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _run_agent_samples(client, provider, model, benchmark, task, registry,
                       samples, max_turns, temperature, max_tokens):
    """Run one task ``samples`` times against one model and aggregate the runs.

    Each repetition builds a fresh ``AgentRunner`` around ``client`` and
    ``registry``, runs it on ``task.task_id`` / ``task.prompt``, scores the
    resulting trace with ``benchmark.score`` and prices the trace's summed
    token usage with ``price_completion(provider, model, ...)``.

    Returns:
        A 4-tuple ``(passes, output_tokens, cost, detail)``: how many
        repetitions passed, the output tokens summed over all repetitions,
        the summed cost rounded to 8 decimal places, and the human-readable
        detail line of the last repetition ("" when no repetition ran).
    """
    pass_count = 0
    output_tokens = 0
    total_cost = 0.0
    detail = ""

    for _attempt in range(samples):
        trace = AgentRunner(
            client,
            registry,
            max_turns=max_turns,
            temperature=temperature,
            max_tokens=max_tokens,
        ).run(task.task_id, task.prompt)
        verdict = benchmark.score(task, trace)

        # Only the most recent repetition's detail is kept for the report.
        tools_used = trace.tool_sequence()
        detail = (f"tools={tools_used} stopped={trace.stopped_reason} "
                  f"turns={trace.num_turns} -> {verdict.detail}")

        pass_count += 1 if verdict.passed else 0
        output_tokens += trace.total_output_tokens
        # Price the whole multi-turn run, not just the final turn.
        total_cost += price_completion(provider, model, trace.as_completion())

    return pass_count, output_tokens, round(total_cost, 8), detail
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def run_agent_proof(
    *,
    benchmark_name: str,
    original_spec: str,
    candidate_spec: str | None = None,
    limit: int | None = None,
    samples: int = 1,
    temperature: float = 0.0,
    max_turns: int = 8,
    max_tokens: int = 1024,
    mock: bool = False,
    candidate_behavior: str = "ok",
    alpha: float = 0.05,
):
    """Run an agent-benchmark proof comparing an original model to a candidate.

    Args:
        benchmark_name: Name handed to ``get_agent_benchmark``.
        original_spec: Model spec of the original model (parsed by
            ``parse_spec`` into provider and model).
        candidate_spec: Model spec of the candidate. When ``None`` it is
            looked up with ``resolve_candidate(original_spec)``.
        limit: Forwarded to ``benchmark.tasks(limit=...)``.
        samples: Repetitions per task and per model; must be at least 1.
        temperature: Sampling temperature forwarded to every agent run.
        max_turns: Turn budget forwarded to every agent run.
        max_tokens: Token budget forwarded to every agent run.
        mock: Forwarded to ``get_tool_calling_client`` for both models and to
            ``assemble_proof``.
        candidate_behavior: Behavior passed to the candidate's client (the
            original always gets ``"ok"``); used offline to exercise the
            answer and safety gates (ok | wrong | unsafe).
        alpha: Forwarded to ``assemble_proof``.

    Returns:
        Whatever ``assemble_proof`` returns for the collected task outcomes.

    Raises:
        ValueError: If ``samples`` is below 1, or no candidate was given and
            none could be resolved for ``original_spec``.
    """
    if samples < 1:
        raise ValueError("samples must be >= 1")

    # Record where the candidate came from so the report can say so.
    if candidate_spec is not None:
        recommended_by = "explicit --candidate override"
    else:
        recommended_by = "tokenjam.DOWNGRADE_CANDIDATES"
        candidate_spec = resolve_candidate(original_spec)
        if candidate_spec is None:
            raise ValueError(
                f"TokenJam has no downgrade candidate for '{original_spec}'. "
                f"Pass --candidate explicitly to override."
            )

    orig_provider, orig_model = parse_spec(original_spec)
    cand_provider, cand_model = parse_spec(candidate_spec)

    # Offline: the original behaves correctly, while the candidate's behavior
    # is configurable (ok | wrong | unsafe) to exercise the answer + safety
    # gates.
    original = get_tool_calling_client(original_spec, mock=mock, behavior="ok")
    candidate = get_tool_calling_client(
        candidate_spec, mock=mock, behavior=candidate_behavior)

    benchmark = get_agent_benchmark(benchmark_name)
    registry = benchmark.tools()
    tasks = benchmark.tasks(limit=limit)

    outcomes: list[TaskOutcome] = []
    original_pass_total = 0
    candidate_pass_total = 0
    for task in tasks:
        # Both models see the same task, tools and run settings.
        o_passes, o_tokens, o_cost, o_detail = _run_agent_samples(
            original, orig_provider, orig_model, benchmark, task, registry,
            samples, max_turns, temperature, max_tokens)
        c_passes, c_tokens, c_cost, c_detail = _run_agent_samples(
            candidate, cand_provider, cand_model, benchmark, task, registry,
            samples, max_turns, temperature, max_tokens)

        original_pass_total += o_passes
        candidate_pass_total += c_passes

        outcome = TaskOutcome(
            task_id=task.task_id,
            samples=samples,
            original_passes=o_passes,
            candidate_passes=c_passes,
            original_cost_usd=o_cost,
            candidate_cost_usd=c_cost,
            original_output_tokens=o_tokens,
            candidate_output_tokens=c_tokens,
            original_detail=o_detail,
            candidate_detail=c_detail,
        )
        outcomes.append(outcome)

    return assemble_proof(
        outcomes,
        benchmark_name=benchmark_name,
        original_spec=original_spec,
        candidate_spec=candidate_spec,
        recommended_by=recommended_by,
        samples=samples,
        mock=mock,
        orig_provider=orig_provider,
        orig_model=orig_model,
        cand_provider=cand_provider,
        cand_model=cand_model,
        sample_pass_totals=(original_pass_total, candidate_pass_total),
        alpha=alpha,
    )
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Agent execution: the multi-turn AgentRunner, tools, trace, and validation.
|
|
2
|
+
|
|
3
|
+
This package is the keystone that turns the bench from an LLM benchmark into an
|
|
4
|
+
agent benchmark. It feeds the SAME proof machinery (stats + cost) as the
|
|
5
|
+
single-shot path — a trace yields a per-task pass/fail and a measured
|
|
6
|
+
multi-turn cost.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from tjbench.agents.runner import AgentRunner
|
|
11
|
+
from tjbench.agents.tools import Tool, ToolRegistry, ToolResult
|
|
12
|
+
from tjbench.agents.trace import AgentTrace, ToolCallRecord, TurnRecord
|
|
13
|
+
from tjbench.agents.validation import ToolValidation, validate_tools
|
|
14
|
+
|
|
15
|
+
# Public API of this package: the names re-exported from the runner, tools,
# trace, and validation submodules.
__all__ = [
    "AgentRunner",
    "Tool",
    "ToolRegistry",
    "ToolResult",
    "AgentTrace",
    "TurnRecord",
    "ToolCallRecord",
    "ToolValidation",
    "validate_tools",
]
|
tjbench/agents/runner.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""AgentRunner — the multi-turn loop. THE keystone.
|
|
2
|
+
|
|
3
|
+
It drives a tool-calling model against a tool registry until the model returns a
|
|
4
|
+
final answer or hits `max_turns`, recording every turn into an AgentTrace. This
|
|
5
|
+
one component is what unlocks agent benchmarks, multi-turn evaluation, tool-call
|
|
6
|
+
validation, and side-effect safety — all of which were blocked on the
|
|
7
|
+
single-shot `complete(prompt)` interface.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from tjbench.agents.tools import ToolRegistry
|
|
12
|
+
from tjbench.agents.trace import AgentTrace, ToolCallRecord, TurnRecord
|
|
13
|
+
from tjbench.models.tool_calling import ToolCallingClient
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class AgentRunner:
    """Drive a tool-calling client through a multi-turn conversation.

    The runner keeps asking the client for the next turn, executes any tools
    the turn requests through the registry, feeds the results back as
    ``tool`` messages, and records every turn in an ``AgentTrace``.
    """

    def __init__(self, client: ToolCallingClient, tools: ToolRegistry,
                 max_turns: int = 8, temperature: float = 0.0,
                 max_tokens: int = 1024) -> None:
        """Store the client, the tool registry and the per-run settings."""
        self.client, self.tools = client, tools
        self.max_turns = max_turns
        self.temperature = temperature
        self.max_tokens = max_tokens

    def run(self, task_id: str, prompt: str) -> AgentTrace:
        """Run the agent loop for one task and return its trace.

        The loop ends with ``stopped_reason == "final"`` as soon as a turn
        requests no tools (its text becomes ``final_text``), or with
        ``stopped_reason == "max_turns"`` once ``max_turns`` turns have all
        requested tools.
        """
        conversation: list[dict] = [{"role": "user", "content": prompt}]
        trace = AgentTrace(task_id=task_id)
        tool_specs = self.tools.specs()

        for turn_index in range(self.max_turns):
            reply = self.client.chat(
                conversation, tool_specs, self.temperature, self.max_tokens)
            is_final = not reply.wants_tools

            executed: list[ToolCallRecord] = []
            if not is_final:
                # The model asked for tool(s): execute each, record it, and
                # feed the result back into the conversation.
                conversation.append({
                    "role": "assistant",
                    "content": reply.text,
                    "tool_calls": reply.tool_calls,
                })
                for call in reply.tool_calls:
                    outcome = self.tools.execute(call.name, call.arguments)
                    executed.append(ToolCallRecord(
                        name=call.name,
                        arguments=call.arguments,
                        result=outcome.output,
                        is_error=outcome.is_error,
                    ))
                    conversation.append({
                        "role": "tool",
                        "tool_call_id": call.id,
                        "name": call.name,
                        "content": outcome.output,
                    })

            # Every turn is recorded, whether or not it called tools.
            trace.turns.append(TurnRecord(
                index=turn_index,
                assistant_text=reply.text,
                tool_calls=executed,
                input_tokens=reply.input_tokens,
                output_tokens=reply.output_tokens,
                cache_tokens=reply.cache_tokens,
            ))

            if is_final:
                trace.final_text = reply.text
                trace.stopped_reason = "final"
                return trace

        trace.stopped_reason = "max_turns"
        return trace
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""SWE-Bench tool implementations that operate on a real workspace.
|
|
2
|
+
|
|
3
|
+
These tools are designed to be bound to a specific task's file workspace
|
|
4
|
+
at runtime. They provide the core developer operations needed for
|
|
5
|
+
SWE-Bench: reading files, editing files, and running commands.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from tjbench.agents.tools import ToolResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class SWEBenchToolSet:
    """Collection of tools for SWE-Bench agent evaluation.

    Each tool operates on a specific workspace directory. The toolset
    is instantiated per-task and bound to the task's workspace.

    Every tool method takes the model-supplied argument dict and returns a
    ``ToolResult``; failures (missing arguments, bad paths, I/O errors) are
    reported with ``is_error=True`` instead of being raised.
    """

    def __init__(self, workspace: Path) -> None:
        """Bind the toolset to ``workspace``."""
        self.workspace = workspace
        # Cache of file contents keyed by resolved path string. It is dropped
        # whenever a shell command runs, because the command may have changed
        # files behind the cache's back.
        self._files: dict[str, str] = {}

    def _resolve_path(self, path: str) -> Path:
        """Resolve a path relative to the workspace.

        Raises:
            ValueError: If the resolved path lies outside the workspace.
        """
        root = self.workspace.resolve()
        resolved = (self.workspace / path).resolve()
        # Compare path components rather than string prefixes: a sibling such
        # as "<workspace>-evil" shares the string prefix but is outside the
        # workspace.
        try:
            resolved.relative_to(root)
        except ValueError:
            raise ValueError(f"Path {path} escapes workspace") from None
        return resolved

    def _read_file(self, path: Path) -> str:
        """Read a file, caching the result ("" for a missing file)."""
        str_path = str(path)
        if str_path not in self._files:
            if path.exists():
                self._files[str_path] = path.read_text(encoding="utf-8")
            else:
                self._files[str_path] = ""
        return self._files[str_path]

    def _write_file(self, path: Path, content: str) -> None:
        """Write a file (creating parent directories) and update the cache."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(content, encoding="utf-8")
        self._files[str(path)] = content

    # --- Tool implementations ---

    def view(self, args: dict[str, Any]) -> ToolResult:
        """View the contents of a file, prefixed with 1-based line numbers.

        Expects ``args["path"]``.
        """
        try:
            path = self._resolve_path(args["path"])
            if not path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' does not exist.",
                    is_error=True,
                )
            content = self._read_file(path)
            # Add line numbers for readability
            lines = content.split("\n")
            numbered = "\n".join(f"{i+1:4d} | {line}" for i, line in enumerate(lines))
            return ToolResult(output=f"File: {args['path']}\n{numbered}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def view_range(self, args: dict[str, Any]) -> ToolResult:
        """View a specific range of lines in a file.

        Expects ``args["path"]``, ``args["start"]`` and ``args["end"]``
        (1-indexed, inclusive). The range is clamped to the file's lines.
        """
        try:
            path = self._resolve_path(args["path"])
            start = args["start"]
            end = args["end"]

            if not path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' does not exist.",
                    is_error=True,
                )

            content = self._read_file(path)
            lines = content.split("\n")

            # Clamp to valid range
            start = max(1, start)
            end = min(len(lines), end)

            selected = lines[start - 1:end]
            numbered = "\n".join(f"{i+start:4d} | {line}" for i, line in enumerate(selected))
            return ToolResult(
                output=f"File: {args['path']} (lines {start}-{end})\n{numbered}"
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def str_replace(self, args: dict[str, Any]) -> ToolResult:
        """Replace an exact string in a file.

        Expects ``args["path"]``, ``args["old_str"]`` and ``args["new_str"]``.
        The replacement is refused unless ``old_str`` occurs exactly once.
        """
        try:
            path = self._resolve_path(args["path"])
            old_str = args["old_str"]
            new_str = args["new_str"]

            if not path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' does not exist.",
                    is_error=True,
                )

            content = self._read_file(path)

            if old_str not in content:
                return ToolResult(
                    output=f"Error: Could not find the exact string in {args['path']}. "
                           "Make sure the old_str matches exactly (including whitespace).",
                    is_error=True,
                )

            # An ambiguous match would make the edit target a guess.
            count = content.count(old_str)
            if count > 1:
                return ToolResult(
                    output=f"Error: Found {count} occurrences of the string. "
                           "Please use a more specific old_str that matches exactly once.",
                    is_error=True,
                )

            new_content = content.replace(old_str, new_str, 1)
            self._write_file(path, new_content)

            return ToolResult(
                output=f"Successfully replaced in {args['path']}. "
                       f"Changed {len(old_str)} chars to {len(new_str)} chars."
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def create(self, args: dict[str, Any]) -> ToolResult:
        """Create a new file with the given content.

        Expects ``args["path"]`` and ``args["content"]``. Refuses to
        overwrite an existing file.
        """
        try:
            path = self._resolve_path(args["path"])
            content = args["content"]

            if path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' already exists. Use str_replace to modify it.",
                    is_error=True,
                )

            self._write_file(path, content)
            return ToolResult(output=f"Created file: {args['path']}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def insert(self, args: dict[str, Any]) -> ToolResult:
        """Insert text after a specific line.

        Expects ``args["path"]``, ``args["line"]`` and ``args["new_str"]``.
        ``line`` is the 1-indexed line to insert after; 0 inserts at the top
        of the file.
        """
        try:
            path = self._resolve_path(args["path"])
            line = args["line"]
            new_str = args["new_str"]

            if not path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' does not exist.",
                    is_error=True,
                )

            content = self._read_file(path)
            lines = content.split("\n")

            if line < 0 or line > len(lines):
                return ToolResult(
                    output=f"Error: Line {line} is out of range (file has {len(lines)} lines).",
                    is_error=True,
                )

            lines.insert(line, new_str)
            self._write_file(path, "\n".join(lines))

            return ToolResult(output=f"Inserted after line {line} in {args['path']}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def bash(self, args: dict[str, Any]) -> ToolResult:
        """Run a shell command in the workspace.

        Expects ``args["command"]`` and an optional ``args["timeout"]`` in
        seconds (default 30). The result is an error when the exit code is
        non-zero or the command times out.

        NOTE(review): the command string is executed through the shell as
        given; isolation has to come from the environment this runs in.
        """
        try:
            command = args["command"]
            timeout = args.get("timeout", 30)
            if timeout is None:
                # An explicit null would otherwise disable the timeout.
                timeout = 30

            try:
                result = subprocess.run(
                    command,
                    shell=True,
                    cwd=self.workspace,
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                )
            finally:
                # The command may have created, edited or deleted files (even
                # when it failed or timed out), so cached contents can no
                # longer be trusted.
                self._files.clear()

            output = f"Exit code: {result.returncode}\n"
            if result.stdout:
                output += f"STDOUT:\n{result.stdout}\n"
            if result.stderr:
                output += f"STDERR:\n{result.stderr}\n"

            return ToolResult(
                output=output,
                is_error=result.returncode != 0,
            )
        except subprocess.TimeoutExpired:
            return ToolResult(
                output=f"Error: Command timed out after {timeout}s.",
                is_error=True,
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def get_tool_specs(self) -> list[dict[str, Any]]:
        """Return tool specifications for the agent.

        One entry per tool method, each with ``name``, ``description`` and a
        JSON-schema ``parameters`` object.
        """
        return [
            {
                "name": "view",
                "description": "View the contents of a file. Shows line numbers.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string", "description": "Path to the file (relative to workspace)"},
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "view_range",
                "description": "View a specific range of lines in a file.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "start": {"type": "integer", "description": "Start line (1-indexed)"},
                        "end": {"type": "integer", "description": "End line (1-indexed)"},
                    },
                    "required": ["path", "start", "end"],
                },
            },
            {
                "name": "str_replace",
                "description": "Replace an exact string in a file. The old_str must match exactly once.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "old_str": {"type": "string", "description": "Exact text to replace (must match exactly once)"},
                        "new_str": {"type": "string", "description": "Replacement text"},
                    },
                    "required": ["path", "old_str", "new_str"],
                },
            },
            {
                "name": "create",
                "description": "Create a new file with the given content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "content": {"type": "string"},
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "insert",
                "description": "Insert text after a specific line in a file.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "line": {"type": "integer", "description": "Line after which to insert (1-indexed)"},
                        "new_str": {"type": "string"},
                    },
                    "required": ["path", "line", "new_str"],
                },
            },
            {
                "name": "bash",
                "description": "Run a shell command in the workspace. Use for running tests, git, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {"type": "string", "description": "Shell command to run"},
                        "timeout": {"type": "integer", "default": 30, "description": "Timeout in seconds"},
                    },
                    "required": ["command"],
                },
            },
        ]
|
tjbench/agents/tools.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Tools an agent can call, and the registry that executes them.
|
|
2
|
+
|
|
3
|
+
A `Tool` carries a JSON-schema for its arguments (so it can be advertised to a
|
|
4
|
+
tool-calling model) and a `dangerous` flag. The flag is load-bearing for the
|
|
5
|
+
safety story from the review: a cheaper model can produce a correct-looking
|
|
6
|
+
final answer while taking a catastrophic action (delete vs read). Marking the
|
|
7
|
+
tool dangerous lets validation fail the task on the *action*, not the text.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Any, Callable
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class ToolResult:
    """Outcome of executing one tool call."""

    # Text produced by the tool (or the error message when the call failed).
    output: str
    # True when the call failed, e.g. an unknown tool or a tool that raised.
    is_error: bool = False
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class Tool:
    """A callable tool plus the metadata needed to advertise it to a model."""

    name: str
    description: str
    parameters: dict[str, Any]  # JSON Schema for the arguments object
    run: Callable[[dict[str, Any]], ToolResult]
    dangerous: bool = False  # has side effects worth flagging on misuse

    def spec(self) -> dict[str, Any]:
        """Return the provider-agnostic description of this tool.

        The result holds exactly the ``name``, ``description`` and
        ``parameters`` fields; ``run`` and ``dangerous`` are not advertised.
        """
        advertised = ("name", "description", "parameters")
        return {key: getattr(self, key) for key in advertised}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ToolRegistry:
    """Name-indexed collection of tools that can advertise and execute them."""

    def __init__(self, tools: list[Tool] | None = None) -> None:
        """Create the registry, registering ``tools`` in order when given."""
        self._tools: dict[str, Tool] = {}
        if tools:
            for entry in tools:
                self.register(entry)

    def register(self, tool: Tool) -> None:
        """Add ``tool`` under its name, replacing any tool of the same name."""
        self._tools[tool.name] = tool

    def names(self) -> list[str]:
        """Return the registered tool names in registration order."""
        return list(self._tools)

    def dangerous_names(self) -> set[str]:
        """Return the names of all tools flagged ``dangerous``."""
        flagged = set()
        for key, tool in self._tools.items():
            if tool.dangerous:
                flagged.add(key)
        return flagged

    def specs(self) -> list[dict[str, Any]]:
        """Return every tool's ``spec()`` in registration order."""
        advertised = []
        for tool in self._tools.values():
            advertised.append(tool.spec())
        return advertised

    def execute(self, name: str, arguments: dict[str, Any]) -> ToolResult:
        """Run the tool called ``name`` with ``arguments``.

        An unknown name or an exception raised by the tool is reported as an
        error ``ToolResult`` rather than propagated. Falsy ``arguments`` are
        replaced by an empty dict.
        """
        tool = self._tools.get(name)
        if tool is None:
            return ToolResult(output=f"unknown tool: {name}", is_error=True)

        payload = arguments if arguments else {}
        try:
            outcome = tool.run(payload)
        except Exception as exc:  # a tool raising is a tool error, not a crash
            return ToolResult(output=f"tool error: {exc}", is_error=True)
        return outcome
|
tjbench/agents/trace.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""The observable record of an agent run.
|
|
2
|
+
|
|
3
|
+
A single-shot completion has nothing to validate beyond its text. A multi-turn
|
|
4
|
+
agent run produces a *trace*: which tools were called, with what arguments, in
|
|
5
|
+
what order, whether they errored, the per-turn token usage, and the final
|
|
6
|
+
answer. The trace is what makes tool-call validation, multi-turn evaluation, and
|
|
7
|
+
side-effect safety possible — and summing tokens across turns is what makes the
|
|
8
|
+
cost number honest for agents (a model that loops 8× costs 8×).
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
from tjbench.models.base import Completion
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ToolCallRecord:
    """One executed tool call as stored in a trace."""

    # Name of the tool that was called.
    name: str
    # Arguments the call was made with.
    arguments: dict
    # Output text of the call.
    result: str
    # Whether the call was reported as an error.
    is_error: bool
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class TurnRecord:
    """One model turn of an agent run: its text, tool calls and token usage."""

    # Position of the turn within the run.
    index: int
    # Text the assistant produced in this turn.
    assistant_text: str
    # Tool calls made in this turn (empty when the turn made none).
    tool_calls: list[ToolCallRecord]
    # Token counts recorded for this turn.
    input_tokens: int
    output_tokens: int
    cache_tokens: int = 0
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class AgentTrace:
    """Record of a whole agent run: its turns, final answer and stop reason."""

    task_id: str
    turns: list[TurnRecord] = field(default_factory=list)
    final_text: str = ""
    stopped_reason: str = "final"  # "final" | "max_turns"

    def all_tool_calls(self) -> list[ToolCallRecord]:
        """Return every tool call of the run, in execution order."""
        calls = []
        for turn in self.turns:
            calls.extend(turn.tool_calls)
        return calls

    def tool_sequence(self) -> list[str]:
        """Return the ordered tool names across the whole run."""
        names = []
        for turn in self.turns:
            for call in turn.tool_calls:
                names.append(call.name)
        return names

    def _sum_over_turns(self, attribute: str) -> int:
        """Add up one per-turn token counter over all turns."""
        return sum(getattr(turn, attribute) for turn in self.turns)

    @property
    def num_turns(self) -> int:
        """Number of recorded turns."""
        return len(self.turns)

    @property
    def total_input_tokens(self) -> int:
        """Input tokens summed over all turns."""
        return self._sum_over_turns("input_tokens")

    @property
    def total_output_tokens(self) -> int:
        """Output tokens summed over all turns."""
        return self._sum_over_turns("output_tokens")

    @property
    def total_cache_tokens(self) -> int:
        """Cache tokens summed over all turns."""
        return self._sum_over_turns("cache_tokens")

    def as_completion(self) -> Completion:
        """Collapse the run into one ``Completion`` so the pricing path applies.

        The completion carries the final text and the token totals summed
        over every turn.
        """
        usage = {
            "input_tokens": self.total_input_tokens,
            "output_tokens": self.total_output_tokens,
            "cache_tokens": self.total_cache_tokens,
        }
        return Completion(text=self.final_text, **usage)
|