tokenjam-bench 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. tjbench/__init__.py +6 -0
  2. tjbench/agent_pipeline.py +117 -0
  3. tjbench/agents/__init__.py +25 -0
  4. tjbench/agents/runner.py +66 -0
  5. tjbench/agents/swe_bench_tools.py +296 -0
  6. tjbench/agents/tools.py +63 -0
  7. tjbench/agents/trace.py +72 -0
  8. tjbench/agents/validation.py +68 -0
  9. tjbench/bench_meta.py +2 -0
  10. tjbench/benchmarks/__init__.py +65 -0
  11. tjbench/benchmarks/agent_base.py +37 -0
  12. tjbench/benchmarks/base.py +37 -0
  13. tjbench/benchmarks/gsm8k.py +48 -0
  14. tjbench/benchmarks/humaneval.py +58 -0
  15. tjbench/benchmarks/judged.py +74 -0
  16. tjbench/benchmarks/real_scenarios.py +144 -0
  17. tjbench/benchmarks/sample_agent.py +107 -0
  18. tjbench/benchmarks/samples.py +73 -0
  19. tjbench/benchmarks/scenario_lib.py +108 -0
  20. tjbench/benchmarks/scenario_suites.py +153 -0
  21. tjbench/benchmarks/scoring.py +63 -0
  22. tjbench/benchmarks/swe_bench_lite.py +288 -0
  23. tjbench/ci_benchmark.py +108 -0
  24. tjbench/cli.py +647 -0
  25. tjbench/cost.py +44 -0
  26. tjbench/dashboard.py +980 -0
  27. tjbench/deepeval_judge.py +137 -0
  28. tjbench/exec_sandbox.py +54 -0
  29. tjbench/history.py +290 -0
  30. tjbench/judge.py +116 -0
  31. tjbench/matrix.py +170 -0
  32. tjbench/models/__init__.py +7 -0
  33. tjbench/models/anthropic_agent_client.py +114 -0
  34. tjbench/models/anthropic_client.py +52 -0
  35. tjbench/models/base.py +30 -0
  36. tjbench/models/google_client.py +41 -0
  37. tjbench/models/mock_agent_client.py +129 -0
  38. tjbench/models/mock_client.py +73 -0
  39. tjbench/models/openai_client.py +42 -0
  40. tjbench/models/openai_compatible.py +208 -0
  41. tjbench/models/registry.py +50 -0
  42. tjbench/models/tool_calling.py +51 -0
  43. tjbench/pipeline.py +218 -0
  44. tjbench/recommend.py +28 -0
  45. tjbench/replay.py +139 -0
  46. tjbench/replay_pipeline.py +151 -0
  47. tjbench/report.py +172 -0
  48. tjbench/report_html.py +322 -0
  49. tjbench/stats.py +96 -0
  50. tjbench/version.py +43 -0
  51. tjbench/workflows/__init__.py +151 -0
  52. tjbench/workflows/agentic.py +119 -0
  53. tokenjam_bench-0.1.0.dist-info/METADATA +410 -0
  54. tokenjam_bench-0.1.0.dist-info/RECORD +58 -0
  55. tokenjam_bench-0.1.0.dist-info/WHEEL +4 -0
  56. tokenjam_bench-0.1.0.dist-info/entry_points.txt +2 -0
  57. tokenjam_bench-0.1.0.dist-info/licenses/LICENSE +22 -0
  58. tokenjam_bench-0.1.0.dist-info/licenses/NOTICE +7 -0
tjbench/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """tokenjam-benchmark — the package.
2
+
3
+ All modules live here so the repo root stays clean and the installed `tjbench`
4
+ console script (`tjbench.cli:cli`) does not collide with same-named PyPI
5
+ packages (e.g. `cli`). Run from source via the root `run.py`.
6
+ """
tjbench/agent_pipeline.py ADDED
@@ -0,0 +1,117 @@
1
+ """Agent proof pipeline.
2
+
3
+ Runs an agent benchmark on the ORIGINAL model and the CANDIDATE model (TokenJam's
4
+ recommendation), each via the multi-turn AgentRunner, scores each run on its
5
+ trace (answer correctness + tool-call validation incl. the safety gate), prices
6
+ the summed multi-turn token usage, and feeds the per-task outcomes into the SAME
7
+ assembler the single-shot path uses — so Wilson CIs, McNemar, and cost
8
+ validation apply unchanged.
9
+
10
+ This is the payoff of the keystone: agent benchmarks inherit all the statistical
11
+ rigor for free, and a candidate that takes an unsafe action fails the task even
12
+ when its answer text is correct.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ from tjbench.agents.runner import AgentRunner
17
+ from tjbench.benchmarks import get_agent_benchmark
18
+ from tjbench.cost import price_completion
19
+ from tjbench.models.anthropic_agent_client import get_tool_calling_client
20
+ from tjbench.models.registry import parse_spec
21
+ from tjbench.pipeline import assemble_proof, resolve_candidate
22
+ from tjbench.report import TaskOutcome
23
+
24
+
25
+ def _run_agent_samples(client, provider, model, benchmark, task, registry,
26
+ samples, max_turns, temperature, max_tokens):
27
+ passes = 0
28
+ out_tok = 0
29
+ cost = 0.0
30
+ last_detail = ""
31
+ for _ in range(samples):
32
+ runner = AgentRunner(client, registry, max_turns=max_turns,
33
+ temperature=temperature, max_tokens=max_tokens)
34
+ trace = runner.run(task.task_id, task.prompt)
35
+ score = benchmark.score(task, trace)
36
+ seq = trace.tool_sequence()
37
+ last_detail = (
38
+ f"tools={seq} stopped={trace.stopped_reason} "
39
+ f"turns={trace.num_turns} -> {score.detail}"
40
+ )
41
+ if score.passed:
42
+ passes += 1
43
+ out_tok += trace.total_output_tokens
44
+ cost += price_completion(provider, model, trace.as_completion())
45
+ return passes, out_tok, round(cost, 8), last_detail
46
+
47
+
48
def run_agent_proof(
    *,
    benchmark_name: str,
    original_spec: str,
    candidate_spec: str | None = None,
    limit: int | None = None,
    samples: int = 1,
    temperature: float = 0.0,
    max_turns: int = 8,
    max_tokens: int = 1024,
    mock: bool = False,
    candidate_behavior: str = "ok",
    alpha: float = 0.05,
):
    """Run an agent-benchmark proof (original vs TokenJam's candidate).

    Every task of the named agent benchmark is run `samples` times on the
    original model and on the candidate model; the per-task outcomes are then
    handed to `assemble_proof`, whose result is returned.

    When `candidate_spec` is omitted it is looked up with `resolve_candidate`.

    Raises:
        ValueError: if `samples` is below 1, or if no candidate was given and
            `resolve_candidate` returns None for `original_spec`.
    """
    if samples < 1:
        raise ValueError("samples must be >= 1")

    if candidate_spec is not None:
        recommended_by = "explicit --candidate override"
    else:
        recommended_by = "tokenjam.DOWNGRADE_CANDIDATES"
        candidate_spec = resolve_candidate(original_spec)
        if candidate_spec is None:
            raise ValueError(
                f"TokenJam has no downgrade candidate for '{original_spec}'. "
                f"Pass --candidate explicitly to override."
            )

    orig_provider, orig_model = parse_spec(original_spec)
    cand_provider, cand_model = parse_spec(candidate_spec)

    # Offline: the original always behaves correctly; the candidate's behavior
    # is configurable (ok | wrong | unsafe) to exercise the answer + safety gates.
    original_client = get_tool_calling_client(original_spec, mock=mock, behavior="ok")
    candidate_client = get_tool_calling_client(
        candidate_spec, mock=mock, behavior=candidate_behavior)

    benchmark = get_agent_benchmark(benchmark_name)
    registry = benchmark.tools()
    tasks = benchmark.tasks(limit=limit)

    def sample(client, provider, model, task):
        # Same sampling settings for both sides so the comparison is paired.
        return _run_agent_samples(
            client, provider, model, benchmark, task, registry,
            samples, max_turns, temperature, max_tokens)

    outcomes: list[TaskOutcome] = []
    original_pass_total = 0
    candidate_pass_total = 0
    for task in tasks:
        # Original first, then candidate, for every task.
        o_pass, o_out, o_cost, o_detail = sample(
            original_client, orig_provider, orig_model, task)
        c_pass, c_out, c_cost, c_detail = sample(
            candidate_client, cand_provider, cand_model, task)

        original_pass_total += o_pass
        candidate_pass_total += c_pass
        outcomes.append(TaskOutcome(
            task_id=task.task_id,
            samples=samples,
            original_passes=o_pass,
            candidate_passes=c_pass,
            original_cost_usd=o_cost,
            candidate_cost_usd=c_cost,
            original_output_tokens=o_out,
            candidate_output_tokens=c_out,
            original_detail=o_detail,
            candidate_detail=c_detail,
        ))

    return assemble_proof(
        outcomes,
        benchmark_name=benchmark_name,
        original_spec=original_spec,
        candidate_spec=candidate_spec,
        recommended_by=recommended_by,
        samples=samples,
        mock=mock,
        orig_provider=orig_provider,
        orig_model=orig_model,
        cand_provider=cand_provider,
        cand_model=cand_model,
        sample_pass_totals=(original_pass_total, candidate_pass_total),
        alpha=alpha,
    )
tjbench/agents/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """Agent execution: the multi-turn AgentRunner, tools, trace, and validation.
2
+
3
+ This package is the keystone that turns the bench from an LLM benchmark into an
4
+ agent benchmark. It feeds the SAME proof machinery (stats + cost) as the
5
+ single-shot path — a trace yields a per-task pass/fail and a measured
6
+ multi-turn cost.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from tjbench.agents.runner import AgentRunner
11
+ from tjbench.agents.tools import Tool, ToolRegistry, ToolResult
12
+ from tjbench.agents.trace import AgentTrace, ToolCallRecord, TurnRecord
13
+ from tjbench.agents.validation import ToolValidation, validate_tools
14
+
15
+ __all__ = [
16
+ "AgentRunner",
17
+ "Tool",
18
+ "ToolRegistry",
19
+ "ToolResult",
20
+ "AgentTrace",
21
+ "TurnRecord",
22
+ "ToolCallRecord",
23
+ "ToolValidation",
24
+ "validate_tools",
25
+ ]
tjbench/agents/runner.py ADDED
@@ -0,0 +1,66 @@
1
+ """AgentRunner — the multi-turn loop. THE keystone.
2
+
3
+ It drives a tool-calling model against a tool registry until the model returns a
4
+ final answer or hits `max_turns`, recording every turn into an AgentTrace. This
5
+ one component is what unlocks agent benchmarks, multi-turn evaluation, tool-call
6
+ validation, and side-effect safety — all of which were blocked on the
7
+ single-shot `complete(prompt)` interface.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from tjbench.agents.tools import ToolRegistry
12
+ from tjbench.agents.trace import AgentTrace, ToolCallRecord, TurnRecord
13
+ from tjbench.models.tool_calling import ToolCallingClient
14
+
15
+
16
class AgentRunner:
    """Drives a tool-calling client against a tool registry, turn by turn."""

    def __init__(self, client: ToolCallingClient, tools: ToolRegistry,
                 max_turns: int = 8, temperature: float = 0.0,
                 max_tokens: int = 1024) -> None:
        """Store the client, the registry, and the per-run sampling limits."""
        self.client = client
        self.tools = tools
        self.max_turns = max_turns
        self.temperature = temperature
        self.max_tokens = max_tokens

    def run(self, task_id: str, prompt: str) -> AgentTrace:
        """Run the loop for one task and return its trace.

        The loop ends when the model replies without requesting tools
        (`stopped_reason == "final"`, `final_text` set to that reply) or after
        `max_turns` model calls (`stopped_reason == "max_turns"`, `final_text`
        left at its default). Every model call is recorded as one TurnRecord.
        """
        conversation: list[dict] = [{"role": "user", "content": prompt}]
        trace = AgentTrace(task_id=task_id)
        tool_specs = self.tools.specs()

        for step in range(self.max_turns):
            reply = self.client.chat(
                conversation, tool_specs, self.temperature, self.max_tokens)
            needs_tools = reply.wants_tools

            executed: list[ToolCallRecord] = []
            if needs_tools:
                # Echo the assistant's request, then run each tool and feed
                # its output back as a "tool" message.
                conversation.append({
                    "role": "assistant",
                    "content": reply.text,
                    "tool_calls": reply.tool_calls,
                })
                for call in reply.tool_calls:
                    outcome = self.tools.execute(call.name, call.arguments)
                    executed.append(ToolCallRecord(
                        name=call.name,
                        arguments=call.arguments,
                        result=outcome.output,
                        is_error=outcome.is_error,
                    ))
                    conversation.append({
                        "role": "tool",
                        "tool_call_id": call.id,
                        "name": call.name,
                        "content": outcome.output,
                    })

            trace.turns.append(TurnRecord(
                index=step,
                assistant_text=reply.text,
                tool_calls=executed,
                input_tokens=reply.input_tokens,
                output_tokens=reply.output_tokens,
                cache_tokens=reply.cache_tokens,
            ))

            if not needs_tools:
                # A reply with no tool request is the final answer.
                trace.final_text = reply.text
                trace.stopped_reason = "final"
                return trace

        trace.stopped_reason = "max_turns"
        return trace
tjbench/agents/swe_bench_tools.py ADDED
@@ -0,0 +1,296 @@
1
+ """SWE-Bench tool implementations that operate on a real workspace.
2
+
3
+ These tools are designed to be bound to a specific task's file workspace
4
+ at runtime. They provide the core developer operations needed for
5
+ SWE-Bench: reading files, editing files, and running commands.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import subprocess
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+ from tjbench.agents.tools import ToolResult
15
+
16
+
17
class SWEBenchToolSet:
    """Collection of tools for SWE-Bench agent evaluation.

    Each tool operates on a specific workspace directory. The toolset
    is instantiated per-task and bound to the task's workspace.

    Every tool method takes the model-supplied ``args`` dict and returns a
    ``ToolResult``. Failures (missing keys, paths outside the workspace, I/O
    errors) come back as ``is_error=True`` results rather than exceptions.
    """

    def __init__(self, workspace: Path) -> None:
        self.workspace = workspace
        # Last content read from / written to each absolute path. Kept as a
        # record only: reads always go to disk (see _read_file).
        self._files: dict[str, str] = {}

    def _resolve_path(self, path: str) -> Path:
        """Resolve `path` relative to the workspace.

        Raises:
            ValueError: if the resolved path lies outside the workspace.
        """
        root = self.workspace.resolve()
        resolved = (self.workspace / path).resolve()
        # Compare path components, not string prefixes: with a plain
        # startswith() check "/tmp/ws-evil/x" would pass for workspace "/tmp/ws".
        try:
            resolved.relative_to(root)
        except ValueError:
            raise ValueError(f"Path {path} escapes workspace") from None
        return resolved

    def _read_file(self, path: Path) -> str:
        """Return the file's current content ("" if it does not exist).

        The file is re-read on every call. The `bash` tool can change files
        behind this object's back, so serving a cached copy would show stale
        text and let a later edit overwrite the shell's changes.
        """
        content = path.read_text(encoding="utf-8") if path.exists() else ""
        self._files[str(path)] = content
        return content

    def _write_file(self, path: Path, content: str) -> None:
        """Write a file (creating parent directories) and record its content."""
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(content, encoding="utf-8")
        self._files[str(path)] = content

    def _missing_file(self, shown_path: str) -> ToolResult:
        """Build the shared "file does not exist" error result."""
        return ToolResult(
            output=f"Error: File '{shown_path}' does not exist.",
            is_error=True,
        )

    # --- Tool implementations ---

    def view(self, args: dict[str, Any]) -> ToolResult:
        """View the contents of a file, each line prefixed with its number."""
        try:
            path = self._resolve_path(args["path"])
            if not path.exists():
                return self._missing_file(args["path"])
            content = self._read_file(path)
            # Add line numbers for readability
            lines = content.split("\n")
            numbered = "\n".join(f"{i+1:4d} | {line}" for i, line in enumerate(lines))
            return ToolResult(output=f"File: {args['path']}\n{numbered}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def view_range(self, args: dict[str, Any]) -> ToolResult:
        """View lines `start`..`end` (1-indexed, inclusive) of a file.

        Out-of-range bounds are clamped to the file's first and last line.
        """
        try:
            path = self._resolve_path(args["path"])
            start = args["start"]
            end = args["end"]

            if not path.exists():
                return self._missing_file(args["path"])

            content = self._read_file(path)
            lines = content.split("\n")

            # Clamp to valid range
            start = max(1, start)
            end = min(len(lines), end)

            selected = lines[start - 1:end]
            numbered = "\n".join(f"{i+start:4d} | {line}" for i, line in enumerate(selected))
            return ToolResult(
                output=f"File: {args['path']} (lines {start}-{end})\n{numbered}"
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def str_replace(self, args: dict[str, Any]) -> ToolResult:
        """Replace an exact string in a file; it must occur exactly once."""
        try:
            path = self._resolve_path(args["path"])
            old_str = args["old_str"]
            new_str = args["new_str"]

            if not path.exists():
                return self._missing_file(args["path"])

            content = self._read_file(path)

            if old_str not in content:
                return ToolResult(
                    output=f"Error: Could not find the exact string in {args['path']}. "
                    "Make sure the old_str matches exactly (including whitespace).",
                    is_error=True,
                )

            # Refuse ambiguous edits: the caller must pin down a single site.
            count = content.count(old_str)
            if count > 1:
                return ToolResult(
                    output=f"Error: Found {count} occurrences of the string. "
                    "Please use a more specific old_str that matches exactly once.",
                    is_error=True,
                )

            new_content = content.replace(old_str, new_str, 1)
            self._write_file(path, new_content)

            return ToolResult(
                output=f"Successfully replaced in {args['path']}. "
                f"Changed {len(old_str)} chars to {len(new_str)} chars."
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def create(self, args: dict[str, Any]) -> ToolResult:
        """Create a new file with the given content; existing files are an error."""
        try:
            path = self._resolve_path(args["path"])
            content = args["content"]

            if path.exists():
                return ToolResult(
                    output=f"Error: File '{args['path']}' already exists. Use str_replace to modify it.",
                    is_error=True,
                )

            self._write_file(path, content)
            return ToolResult(output=f"Created file: {args['path']}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def insert(self, args: dict[str, Any]) -> ToolResult:
        """Insert text after line `line` (1-indexed); 0 inserts at the top."""
        try:
            path = self._resolve_path(args["path"])
            line = args["line"]
            new_str = args["new_str"]

            if not path.exists():
                return self._missing_file(args["path"])

            content = self._read_file(path)
            lines = content.split("\n")

            if line < 0 or line > len(lines):
                return ToolResult(
                    output=f"Error: Line {line} is out of range (file has {len(lines)} lines).",
                    is_error=True,
                )

            # List index `line` is the slot right after 1-indexed line `line`.
            lines.insert(line, new_str)
            self._write_file(path, "\n".join(lines))

            return ToolResult(output=f"Inserted after line {line} in {args['path']}")
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def bash(self, args: dict[str, Any]) -> ToolResult:
        """Run a shell command in the workspace.

        The result reports the exit code plus any stdout/stderr; a non-zero
        exit code or a timeout is flagged as an error.
        """
        try:
            command = args["command"]
            timeout = args.get("timeout", 30)

            # NOTE(security): the model-supplied string is executed by the
            # shell as-is; `cwd` is only a starting directory, not a sandbox.
            result = subprocess.run(
                command,
                shell=True,
                cwd=self.workspace,
                capture_output=True,
                text=True,
                timeout=timeout,
            )

            output = f"Exit code: {result.returncode}\n"
            if result.stdout:
                output += f"STDOUT:\n{result.stdout}\n"
            if result.stderr:
                output += f"STDERR:\n{result.stderr}\n"

            return ToolResult(
                output=output,
                is_error=result.returncode != 0,
            )
        except subprocess.TimeoutExpired:
            return ToolResult(
                output=f"Error: Command timed out after {timeout}s.",
                is_error=True,
            )
        except Exception as e:
            return ToolResult(output=f"Error: {e}", is_error=True)

    def get_tool_specs(self) -> list[dict[str, Any]]:
        """Return tool specifications (name, description, JSON-schema parameters)."""
        return [
            {
                "name": "view",
                "description": "View the contents of a file. Shows line numbers.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string", "description": "Path to the file (relative to workspace)"},
                    },
                    "required": ["path"],
                },
            },
            {
                "name": "view_range",
                "description": "View a specific range of lines in a file.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "start": {"type": "integer", "description": "Start line (1-indexed)"},
                        "end": {"type": "integer", "description": "End line (1-indexed)"},
                    },
                    "required": ["path", "start", "end"],
                },
            },
            {
                "name": "str_replace",
                "description": "Replace an exact string in a file. The old_str must match exactly once.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "old_str": {"type": "string", "description": "Exact text to replace (must match exactly once)"},
                        "new_str": {"type": "string", "description": "Replacement text"},
                    },
                    "required": ["path", "old_str", "new_str"],
                },
            },
            {
                "name": "create",
                "description": "Create a new file with the given content.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "content": {"type": "string"},
                    },
                    "required": ["path", "content"],
                },
            },
            {
                "name": "insert",
                "description": "Insert text after a specific line in a file.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                        "line": {"type": "integer", "description": "Line after which to insert (1-indexed)"},
                        "new_str": {"type": "string"},
                    },
                    "required": ["path", "line", "new_str"],
                },
            },
            {
                "name": "bash",
                "description": "Run a shell command in the workspace. Use for running tests, git, etc.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {"type": "string", "description": "Shell command to run"},
                        "timeout": {"type": "integer", "default": 30, "description": "Timeout in seconds"},
                    },
                    "required": ["command"],
                },
            },
        ]
tjbench/agents/tools.py ADDED
@@ -0,0 +1,63 @@
1
+ """Tools an agent can call, and the registry that executes them.
2
+
3
+ A `Tool` carries a JSON-schema for its arguments (so it can be advertised to a
4
+ tool-calling model) and a `dangerous` flag. The flag is load-bearing for the
5
+ safety story from the review: a cheaper model can produce a correct-looking
6
+ final answer while taking a catastrophic action (delete vs read). Marking the
7
+ tool dangerous lets validation fail the task on the *action*, not the text.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from typing import Any, Callable
13
+
14
+
15
@dataclass
class ToolResult:
    """Outcome of one tool execution as reported back to the model."""

    # Text handed back to the model (tool output or an error message).
    output: str
    # True when the call failed; defaults to a successful result.
    is_error: bool = False
19
+
20
+
21
@dataclass
class Tool:
    """A callable tool plus the metadata needed to advertise it to a model."""

    name: str
    description: str
    # JSON Schema for the arguments object
    parameters: dict[str, Any]
    # Invoked with the arguments dict; returns the ToolResult.
    run: Callable[[dict[str, Any]], ToolResult]
    # has side effects worth flagging on misuse
    dangerous: bool = False

    def spec(self) -> dict[str, Any]:
        """Provider-agnostic advertisement of this tool to a model.

        Only name, description and parameters are exposed; `run` and
        `dangerous` are not part of the spec.
        """
        advert: dict[str, Any] = {}
        advert["name"] = self.name
        advert["description"] = self.description
        advert["parameters"] = self.parameters
        return advert
36
+
37
+
38
class ToolRegistry:
    """Name-indexed collection of tools, with safe execution."""

    def __init__(self, tools: list[Tool] | None = None) -> None:
        """Create the registry, registering `tools` in order when given."""
        self._tools: dict[str, Tool] = {}
        if tools:
            for tool in tools:
                self.register(tool)

    def register(self, tool: Tool) -> None:
        """Add `tool` under its name, replacing any tool already using it."""
        self._tools[tool.name] = tool

    def names(self) -> list[str]:
        """Registered tool names, in registration order."""
        return [name for name in self._tools]

    def dangerous_names(self) -> set[str]:
        """Names of the registered tools flagged `dangerous`."""
        flagged: set[str] = set()
        for name, tool in self._tools.items():
            if tool.dangerous:
                flagged.add(name)
        return flagged

    def specs(self) -> list[dict[str, Any]]:
        """Specs of all registered tools, in registration order."""
        return [tool.spec() for tool in self._tools.values()]

    def execute(self, name: str, arguments: dict[str, Any]) -> ToolResult:
        """Run the named tool and return its result.

        An unknown name or an exception raised by the tool is reported as an
        error ToolResult instead of propagating. Falsy `arguments` are
        replaced by an empty dict.
        """
        tool = self._tools.get(name)
        if tool is not None:
            try:
                return tool.run(arguments or {})
            except Exception as exc:  # a tool raising is a tool error, not a crash
                return ToolResult(output=f"tool error: {exc}", is_error=True)
        return ToolResult(output=f"unknown tool: {name}", is_error=True)
tjbench/agents/trace.py ADDED
@@ -0,0 +1,72 @@
1
+ """The observable record of an agent run.
2
+
3
+ A single-shot completion has nothing to validate beyond its text. A multi-turn
4
+ agent run produces a *trace*: which tools were called, with what arguments, in
5
+ what order, whether they errored, the per-turn token usage, and the final
6
+ answer. The trace is what makes tool-call validation, multi-turn evaluation, and
7
+ side-effect safety possible — and summing tokens across turns is what makes the
8
+ cost number honest for agents (a model that loops 8× costs 8×).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass, field
13
+
14
+ from tjbench.models.base import Completion
15
+
16
+
17
@dataclass
class ToolCallRecord:
    """One executed tool call as stored in a trace."""

    # Tool name as requested by the model.
    name: str
    # Arguments object the tool was called with.
    arguments: dict
    # Output text the tool produced.
    result: str
    # Whether the execution was reported as an error.
    is_error: bool
23
+
24
+
25
@dataclass
class TurnRecord:
    """One model call within an agent run, with its tool calls and usage."""

    # Zero-based position of the turn within the run.
    index: int
    # Text the assistant produced on this turn.
    assistant_text: str
    # Tool calls executed on this turn (empty for a final answer).
    tool_calls: list[ToolCallRecord]
    # Token usage reported for this turn.
    input_tokens: int
    output_tokens: int
    cache_tokens: int = 0
33
+
34
+
35
@dataclass
class AgentTrace:
    """Full record of one agent run: its turns, final answer, and stop reason."""

    task_id: str
    turns: list[TurnRecord] = field(default_factory=list)
    final_text: str = ""
    # "final" | "max_turns"
    stopped_reason: str = "final"

    def tool_sequence(self) -> list[str]:
        """Ordered tool names across the whole run."""
        ordered: list[str] = []
        for turn in self.turns:
            for call in turn.tool_calls:
                ordered.append(call.name)
        return ordered

    def all_tool_calls(self) -> list[ToolCallRecord]:
        """Every tool call record of the run, flattened in turn order."""
        flattened: list[ToolCallRecord] = []
        for turn in self.turns:
            flattened.extend(turn.tool_calls)
        return flattened

    @property
    def num_turns(self) -> int:
        """Number of recorded turns."""
        return len(self.turns)

    @property
    def total_input_tokens(self) -> int:
        """Input tokens summed over all turns."""
        return sum(turn.input_tokens for turn in self.turns)

    @property
    def total_output_tokens(self) -> int:
        """Output tokens summed over all turns."""
        return sum(turn.output_tokens for turn in self.turns)

    @property
    def total_cache_tokens(self) -> int:
        """Cache tokens summed over all turns."""
        return sum(turn.cache_tokens for turn in self.turns)

    def as_completion(self) -> Completion:
        """Collapse the run's token usage so the existing pricing path applies."""
        usage = {
            "input_tokens": self.total_input_tokens,
            "output_tokens": self.total_output_tokens,
            "cache_tokens": self.total_cache_tokens,
        }
        return Completion(text=self.final_text, **usage)