agentsynth-ai 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. agentsynth/__init__.py +168 -0
  2. agentsynth/benchmarks/__init__.py +37 -0
  3. agentsynth/benchmarks/bfcl.py +123 -0
  4. agentsynth/benchmarks/tau_bench.py +71 -0
  5. agentsynth/benchmarks/tool_calling.py +275 -0
  6. agentsynth/cli.py +236 -0
  7. agentsynth/dedup.py +104 -0
  8. agentsynth/environments/__init__.py +16 -0
  9. agentsynth/environments/base.py +85 -0
  10. agentsynth/environments/mcp_env.py +199 -0
  11. agentsynth/environments/python_sandbox.py +93 -0
  12. agentsynth/environments/sql.py +153 -0
  13. agentsynth/evaluator.py +613 -0
  14. agentsynth/exporters.py +297 -0
  15. agentsynth/generator.py +867 -0
  16. agentsynth/hub.py +157 -0
  17. agentsynth/metrics.py +410 -0
  18. agentsynth/pipelines/__init__.py +8 -0
  19. agentsynth/pipelines/recipe.py +77 -0
  20. agentsynth/pipelines/runner.py +164 -0
  21. agentsynth/preferences.py +134 -0
  22. agentsynth/py.typed +0 -0
  23. agentsynth/schemas.py +262 -0
  24. agentsynth/tasks/__init__.py +7 -0
  25. agentsynth/tasks/taxonomy.py +193 -0
  26. agentsynth/training/__init__.py +19 -0
  27. agentsynth/training/datasets.py +71 -0
  28. agentsynth/utils.py +465 -0
  29. agentsynth/verification/__init__.py +37 -0
  30. agentsynth/verification/base.py +99 -0
  31. agentsynth/verification/ensemble.py +70 -0
  32. agentsynth/verification/rubrics.py +63 -0
  33. agentsynth/verification/verifiers.py +135 -0
  34. agentsynth_ai-0.2.0.dist-info/METADATA +589 -0
  35. agentsynth_ai-0.2.0.dist-info/RECORD +39 -0
  36. agentsynth_ai-0.2.0.dist-info/WHEEL +5 -0
  37. agentsynth_ai-0.2.0.dist-info/entry_points.txt +2 -0
  38. agentsynth_ai-0.2.0.dist-info/licenses/LICENSE +21 -0
  39. agentsynth_ai-0.2.0.dist-info/top_level.txt +1 -0
agentsynth/__init__.py ADDED
@@ -0,0 +1,168 @@
1
+ """AgentSynth: generate synthetic agent trajectories and score them with an
2
+ LLM-as-Judge loop. Runs offline against a mock by default.
3
+
4
+ >>> from agentsynth import AgentTrajectoryGenerator, TrajectoryEvaluator
5
+ >>> gen = AgentTrajectoryGenerator()
6
+ >>> traj = gen.generate("What's the weather in Paris?")
7
+ >>> ev = TrajectoryEvaluator().evaluate(traj)
8
+ >>> ev.overall, ev.passed
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ __version__ = "0.2.0"
14
+
15
+ from .benchmarks import (
16
+ BUILTIN_CASES,
17
+ BenchmarkCase,
18
+ BenchmarkReport,
19
+ agentsynth_model,
20
+ compare_models,
21
+ prompted_model,
22
+ report_table_md,
23
+ run_benchmark,
24
+ )
25
+ from .dedup import DedupResult, decontaminate, dedup_trajectories
26
+ from .environments import (
27
+ CompositeEnvironment,
28
+ Environment,
29
+ MCPEnvironment,
30
+ PythonSandbox,
31
+ SQLEnvironment,
32
+ )
33
+ from .evaluator import TrajectoryEvaluator
34
+ from .exporters import (
35
+ load_jsonl,
36
+ save_dataset,
37
+ to_adp,
38
+ to_jsonl,
39
+ to_sharegpt,
40
+ )
41
+ from .generator import AgentTrajectoryGenerator
42
+ from .hub import dataset_card, prepare_dataset_dir, push_dataset
43
+ from .metrics import compute_dataset_metrics, diversity_score
44
+ from .pipelines import Recipe, RunResult, load_recipe, make_environment, run_recipe
45
+ from .preferences import (
46
+ PreferencePair,
47
+ build_preference_pairs,
48
+ load_dpo_jsonl,
49
+ to_dpo_jsonl,
50
+ )
51
+ from .schemas import (
52
+ DEFAULT_RUBRIC_WEIGHTS,
53
+ RUBRIC_DIMENSIONS,
54
+ EvalResult,
55
+ RubricScores,
56
+ ToolSpec,
57
+ Trajectory,
58
+ TrajectoryStep,
59
+ )
60
+ from .tasks import SEED_TASKS, SeedTask, sample_tasks
61
+ from .training import build_dpo_dataset, build_sft_dataset, to_dpo_records, to_sft_records
62
+ from .utils import (
63
+ DEFAULT_TOOL_CATALOG,
64
+ LLMClient,
65
+ PythonREPL,
66
+ default_tool_catalog,
67
+ parse_tool_catalog,
68
+ )
69
+ from .verification import (
70
+ RUBRIC_PRESETS,
71
+ EnsembleEvaluator,
72
+ ExecutionVerifier,
73
+ ExpectedAnswerVerifier,
74
+ SafetyVerifier,
75
+ ToolArgVerifier,
76
+ VerificationResult,
77
+ Verifier,
78
+ batch_verify,
79
+ get_rubric,
80
+ rubric_names,
81
+ verify_trajectory,
82
+ )
83
+
84
+ __all__ = [
85
+ "__version__",
86
+ # schemas
87
+ "Trajectory",
88
+ "TrajectoryStep",
89
+ "ToolSpec",
90
+ "RubricScores",
91
+ "EvalResult",
92
+ "RUBRIC_DIMENSIONS",
93
+ "DEFAULT_RUBRIC_WEIGHTS",
94
+ # utils
95
+ "parse_tool_catalog",
96
+ "default_tool_catalog",
97
+ "DEFAULT_TOOL_CATALOG",
98
+ "PythonREPL",
99
+ "LLMClient",
100
+ # core
101
+ "AgentTrajectoryGenerator",
102
+ "TrajectoryEvaluator",
103
+ # metrics
104
+ "compute_dataset_metrics",
105
+ "diversity_score",
106
+ # exporters
107
+ "to_jsonl",
108
+ "to_sharegpt",
109
+ "to_adp",
110
+ "save_dataset",
111
+ "load_jsonl",
112
+ # environments
113
+ "Environment",
114
+ "CompositeEnvironment",
115
+ "SQLEnvironment",
116
+ "PythonSandbox",
117
+ "MCPEnvironment",
118
+ # tasks
119
+ "SeedTask",
120
+ "SEED_TASKS",
121
+ "sample_tasks",
122
+ # pipelines
123
+ "Recipe",
124
+ "RunResult",
125
+ "run_recipe",
126
+ "load_recipe",
127
+ "make_environment",
128
+ # verification
129
+ "Verifier",
130
+ "VerificationResult",
131
+ "verify_trajectory",
132
+ "batch_verify",
133
+ "ExecutionVerifier",
134
+ "ToolArgVerifier",
135
+ "SafetyVerifier",
136
+ "ExpectedAnswerVerifier",
137
+ "EnsembleEvaluator",
138
+ "RUBRIC_PRESETS",
139
+ "get_rubric",
140
+ "rubric_names",
141
+ # preferences (DPO)
142
+ "PreferencePair",
143
+ "build_preference_pairs",
144
+ "to_dpo_jsonl",
145
+ "load_dpo_jsonl",
146
+ # dedup / decontamination
147
+ "dedup_trajectories",
148
+ "decontaminate",
149
+ "DedupResult",
150
+ # training data prep
151
+ "to_sft_records",
152
+ "to_dpo_records",
153
+ "build_sft_dataset",
154
+ "build_dpo_dataset",
155
+ # benchmark
156
+ "BenchmarkCase",
157
+ "BenchmarkReport",
158
+ "BUILTIN_CASES",
159
+ "run_benchmark",
160
+ "compare_models",
161
+ "agentsynth_model",
162
+ "prompted_model",
163
+ "report_table_md",
164
+ # hub
165
+ "dataset_card",
166
+ "prepare_dataset_dir",
167
+ "push_dataset",
168
+ ]
agentsynth/benchmarks/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ """A self-contained function-calling benchmark, plus BFCL and τ-bench adapters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .bfcl import SAMPLE_BFCL, bfcl_case, load_bfcl, sample_cases
6
+ from .tau_bench import run_tau_bench, tau_bench_available
7
+ from .tool_calling import (
8
+ BUILTIN_CASES,
9
+ BenchmarkCase,
10
+ BenchmarkReport,
11
+ CaseResult,
12
+ agentsynth_model,
13
+ compare_models,
14
+ prompted_model,
15
+ report_table_md,
16
+ run_benchmark,
17
+ )
18
+
19
+ __all__ = [
20
+ "BenchmarkCase",
21
+ "CaseResult",
22
+ "BenchmarkReport",
23
+ "BUILTIN_CASES",
24
+ "run_benchmark",
25
+ "compare_models",
26
+ "agentsynth_model",
27
+ "prompted_model",
28
+ "report_table_md",
29
+ # BFCL
30
+ "load_bfcl",
31
+ "bfcl_case",
32
+ "sample_cases",
33
+ "SAMPLE_BFCL",
34
+ # tau-bench
35
+ "run_tau_bench",
36
+ "tau_bench_available",
37
+ ]
agentsynth/benchmarks/bfcl.py ADDED
@@ -0,0 +1,123 @@
1
+ """Load Berkeley Function-Calling Leaderboard (BFCL) cases into the harness.
2
+
3
+ BFCL ships JSONL: a questions file (`id`, `question`, `function`) and a
4
+ possible-answers file (`id`, `ground_truth`). This converts the pair into
5
+ `BenchmarkCase`s so any `model_fn` can be scored with `run_benchmark`.
6
+
7
+ The scoring here is a simplified tool + argument match (arguments accept the list
8
+ of allowed values BFCL gives), not BFCL's full AST checker. Use it for a quick
9
+ before/after signal; use BFCL's own scorer for the leaderboard.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from typing import Any, Dict, List, Optional, Tuple
16
+
17
+ from .tool_calling import BenchmarkCase
18
+
19
+
20
+ def _user_query(question: Any) -> str:
21
+ turns = question[0] if question and isinstance(question[0], list) else question
22
+ for msg in turns or []:
23
+ if isinstance(msg, dict) and msg.get("role") == "user":
24
+ return str(msg.get("content", ""))
25
+ return str(question)
26
+
27
+
28
+ def _normalize_tools(functions: Any) -> List[Dict[str, Any]]:
29
+ out: List[Dict[str, Any]] = []
30
+ for fn in functions or []:
31
+ params = dict(fn.get("parameters") or {})
32
+ if params.get("type") == "dict": # BFCL uses "dict"; JSON Schema uses "object"
33
+ params["type"] = "object"
34
+ out.append(
35
+ {"name": fn.get("name"), "description": fn.get("description", ""), "parameters": params}
36
+ )
37
+ return out
38
+
39
+
40
def bfcl_case(
    question_rec: Dict[str, Any], answer_rec: Optional[Dict[str, Any]] = None
) -> Optional[BenchmarkCase]:
    """Build a BenchmarkCase from one BFCL question record and its answer record.

    Only the first entry of ``ground_truth`` is used: its first key becomes the
    expected tool and the mapped value (when it is a dict) the expected
    arguments. Returns ``None`` when there is no usable ground truth, i.e. the
    answer record is missing, ``ground_truth`` is empty, or its first entry is
    not a non-empty dict.
    """
    truth = (answer_rec or {}).get("ground_truth") or []
    head = truth[0] if truth else None
    if not isinstance(head, dict) or not head:
        return None

    tool_name = next(iter(head))
    raw_args = head[tool_name]
    if not isinstance(raw_args, dict):
        raw_args = {}

    return BenchmarkCase(
        id=str(question_rec.get("id", "")),
        query=_user_query(question_rec.get("question")),
        expected_tool=tool_name,
        expected_args=raw_args,
        tools=_normalize_tools(question_rec.get("function")),
    )
57
+
58
+
59
+ def _read_jsonl(path: str) -> List[Dict[str, Any]]:
60
+ with open(path, encoding="utf-8") as fh:
61
+ return [json.loads(line) for line in fh if line.strip()]
62
+
63
+
64
def load_bfcl(questions_path: str, answers_path: Optional[str] = None) -> List[BenchmarkCase]:
    """Read BFCL question and possible-answer JSONL files and pair them by id.

    Answers are matched to questions on the string form of their ``id``.
    Questions for which ``bfcl_case`` returns ``None`` (no usable ground truth)
    are dropped, so calling this without ``answers_path`` yields an empty list.
    """
    question_recs = _read_jsonl(questions_path)

    answers_by_id: Dict[str, Dict[str, Any]] = {}
    if answers_path:
        for answer_rec in _read_jsonl(answers_path):
            answers_by_id[str(answer_rec.get("id"))] = answer_rec

    converted = (
        bfcl_case(question_rec, answers_by_id.get(str(question_rec.get("id"))))
        for question_rec in question_recs
    )
    return [case for case in converted if case is not None]
76
+
77
+
78
# Two (question record, possible-answer record) pairs in BFCL's on-disk shape,
# kept inline so tests and the offline demo need no data files.
SAMPLE_BFCL: List[Tuple[Dict[str, Any], Dict[str, Any]]] = [
    (
        {
            "id": "simple_0",
            "question": [[{"role": "user", "content": "What is the weather in Paris?"}]],
            "function": [
                {
                    "name": "get_weather",
                    "description": "Get the current weather for a city.",
                    "parameters": {
                        "type": "dict",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                }
            ],
        },
        {"id": "simple_0", "ground_truth": [{"get_weather": {"city": ["Paris", "paris"]}}]},
    ),
    (
        {
            "id": "simple_1",
            "question": [[{"role": "user", "content": "Compute 15 factorial."}]],
            "function": [
                {
                    "name": "math_factorial",
                    "description": "Compute the factorial of a number.",
                    "parameters": {
                        "type": "dict",
                        "properties": {"n": {"type": "integer"}},
                        "required": ["n"],
                    },
                }
            ],
        },
        {"id": "simple_1", "ground_truth": [{"math_factorial": {"n": [15]}}]},
    ),
]
117
+
118
+
119
def sample_cases() -> List[BenchmarkCase]:
    """Convert the inline ``SAMPLE_BFCL`` records into BenchmarkCases, dropping
    any pair that ``bfcl_case`` cannot convert."""
    built: List[BenchmarkCase] = []
    for question_rec, answer_rec in SAMPLE_BFCL:
        case = bfcl_case(question_rec, answer_rec)
        if case is not None:
            built.append(case)
    return built
121
+
122
+
123
+ __all__ = ["bfcl_case", "load_bfcl", "sample_cases", "SAMPLE_BFCL"]
agentsynth/benchmarks/tau_bench.py ADDED
@@ -0,0 +1,71 @@
1
+ """Run τ-bench against a model.
2
+
3
+ Unlike BFCL, τ-bench is multi-turn and agentic — a user simulator plus a domain
4
+ database — so it can't be scored from single tool calls. This is a thin bridge to
5
+ the official `tau-bench` package: it builds the environment and the package's
6
+ tool-calling agent and reports the pass rate.
7
+
8
+ Point `model` at your fine-tuned model served behind an OpenAI-compatible endpoint
9
+ (vLLM, TGI, …). Install the harness with:
10
+
11
+ pip install git+https://github.com/sierra-research/tau-bench
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from typing import Any, Dict, List, Optional
17
+
18
+
19
def tau_bench_available() -> bool:
    """Report whether the optional ``tau_bench`` package can be imported.

    Any exception raised while importing counts as "not available", not only
    ``ImportError``.
    """
    try:
        import tau_bench  # noqa: F401
    except Exception:
        return False
    return True
26
+
27
+
28
def run_tau_bench(
    model: str,
    provider: str = "openai",
    env_name: str = "retail",
    user_model: str = "gpt-4o",
    user_provider: str = "openai",
    task_split: str = "test",
    task_ids: Optional[List[int]] = None,
) -> Dict[str, Any]:
    """Run τ-bench tasks with the package's tool-calling agent and summarize them.

    Returns a dict with ``env``, ``n`` (tasks run), ``pass_rate`` (share of tasks
    whose reward is at least 0.999) and ``avg_reward``, both rounded to four
    decimals and ``0.0`` when no task was run. When ``task_ids`` is ``None``
    every task of the environment is run.

    Raises:
        ImportError: if the ``tau_bench`` modules cannot be imported.
    """
    try:
        from tau_bench.agents.tool_calling_agent import ToolCallingAgent
        from tau_bench.envs import get_env
    except Exception as exc:
        raise ImportError(
            "tau-bench is not installed: "
            "pip install git+https://github.com/sierra-research/tau-bench"
        ) from exc

    env = get_env(
        env_name,
        user_strategy="llm",
        user_model=user_model,
        user_provider=user_provider,
        task_split=task_split,
    )
    agent = ToolCallingAgent(
        tools_info=env.tools_info, wiki=env.wiki, model=model, provider=provider
    )

    if task_ids is None:
        ids = list(range(len(env.tasks)))
    else:
        ids = task_ids

    rewards: List[float] = []
    for task_index in ids:
        outcome = agent.solve(env=env, task_index=task_index)
        # A result object without a ``reward`` attribute counts as zero reward.
        rewards.append(float(getattr(outcome, "reward", 0.0)))

    n = len(rewards)
    if n:
        solved = sum(1 for reward in rewards if reward >= 0.999)
        pass_rate = round(solved / n, 4)
        avg_reward = round(sum(rewards) / n, 4)
    else:
        pass_rate = 0.0
        avg_reward = 0.0

    return {
        "env": env_name,
        "n": n,
        "pass_rate": pass_rate,
        "avg_reward": avg_reward,
    }
69
+
70
+
71
+ __all__ = ["tau_bench_available", "run_tau_bench"]
agentsynth/benchmarks/tool_calling.py ADDED
@@ -0,0 +1,275 @@
1
+ """A small function-calling benchmark.
2
+
3
+ Each case gives a query and the full tool catalog; the model has to pick the right
4
+ tool and supply sane arguments. Scoring is deliberately simple — tool match plus a
5
+ lenient arg check — so it's easy to reason about and produces a clear before/after
6
+ table. A "model" here is any callable `(query, tools) -> (tool_name, tool_args)`,
7
+ so you can score a real LLM, a fine-tuned model, or AgentSynth's own generator.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from typing import Any, Callable, Dict, List, Optional, Tuple
14
+
15
+ from pydantic import BaseModel, Field
16
+
17
+ from ..schemas import ToolSpec
18
+ from ..utils import DEFAULT_TOOL_CATALOG, extract_json, parse_tool_catalog
19
+
20
+ ModelFn = Callable[[str, List[ToolSpec]], Tuple[Optional[str], Dict[str, Any]]]
21
+
22
+
23
# One benchmark item: the user query, the tool the model is expected to call,
# and the arguments that call should carry.
class BenchmarkCase(BaseModel):
    id: str
    query: str
    expected_tool: str
    # An expected value of None means only the key's presence is checked.
    expected_args: Dict[str, Any] = Field(default_factory=dict)
    # Defaults to a fresh list of the full default catalog (with parameter
    # schemas), so the model must both pick the right tool and fill its arguments.
    tools: List[Dict[str, Any]] = Field(default_factory=lambda: list(DEFAULT_TOOL_CATALOG))

    def tool_specs(self) -> List[ToolSpec]:
        # Parse the raw tool dicts into ToolSpec objects.
        return parse_tool_catalog(self.tools)
34
+
35
+
36
# Outcome of scoring a single case.
class CaseResult(BaseModel):
    id: str
    expected_tool: str
    predicted_tool: Optional[str]  # None when the model produced no tool call
    tool_ok: bool  # predicted tool equals the expected tool
    args_ok: bool  # tool matched and the arguments passed the lenient check
    score: float  # 1.0 tool + args, 0.5 tool only, 0.0 otherwise
43
+
44
+
45
# Aggregate of a benchmark run: case count, mean accuracies and score, plus the
# per-case results.
class BenchmarkReport(BaseModel):
    n: int
    tool_accuracy: float
    arg_accuracy: float
    score: float
    results: List[CaseResult] = Field(default_factory=list)
51
+
52
+
53
# Built-in suite, written as (id, query, expected tool, expected args) rows.
# An expected-arg value of None means only the key's presence is checked.
BUILTIN_CASES: List[BenchmarkCase] = [
    BenchmarkCase(id=case_id, query=query, expected_tool=tool, expected_args=args)
    for case_id, query, tool, args in (
        (
            "weather_paris",
            "What's the weather in Paris right now?",
            "get_weather",
            {"city": "Paris"},
        ),
        (
            "weather_tokyo",
            "Is it raining in Tokyo today?",
            "get_weather",
            {"city": "Tokyo"},
        ),
        (
            "math_mult",
            "What is 23 times 7 plus 4?",
            "calculator",
            {"expression": None},
        ),
        (
            "math_tip",
            "Calculate an 18% tip on a $54 bill.",
            "calculator",
            {"expression": None},
        ),
        (
            "search_news",
            "Find recent news about open-source AI agents.",
            "web_search",
            {"query": None},
        ),
        (
            "search_fact",
            "Search the web for the population of Vietnam.",
            "web_search",
            {"query": None},
        ),
        (
            "file_csv",
            "Read the file data/report.csv and summarize it.",
            "read_file",
            {"path": None},
        ),
        (
            "file_notes",
            "Open notes.md and list the action items.",
            "read_file",
            {"path": None},
        ),
        (
            "sql_revenue",
            "Query the database for total revenue by region.",
            "sql_query",
            {"query": None},
        ),
        (
            "sql_count",
            "Run a SQL query to count rows in the sales table.",
            "sql_query",
            {"query": None},
        ),
        (
            "email_launch",
            "Send an email to team@example.com about the launch.",
            "send_email",
            {"to": None},
        ),
        (
            "email_summary",
            "Email a summary of the report to alex@example.com.",
            "send_email",
            {"to": None},
        ),
    )
]
127
+
128
+
129
+ def _args_ok(expected: Dict[str, Any], predicted: Dict[str, Any]) -> bool:
130
+ for key, value in (expected or {}).items():
131
+ if key not in (predicted or {}):
132
+ return False
133
+ if value is None:
134
+ continue # key presence is enough
135
+ pred = str(predicted[key]).strip().lower()
136
+ if isinstance(value, (list, tuple)): # any acceptable value (BFCL-style)
137
+ if pred not in [str(v).strip().lower() for v in value]:
138
+ return False
139
+ elif pred != str(value).strip().lower():
140
+ return False
141
+ return True
142
+
143
+
144
+ def _mean(values: List[float]) -> float:
145
+ return round(sum(values) / len(values), 4) if values else 0.0
146
+
147
+
148
def run_benchmark(
    model_fn: ModelFn, cases: Optional[List[BenchmarkCase]] = None
) -> BenchmarkReport:
    """Score a model on the function-calling cases.

    `model_fn(query, tools)` returns the `(tool_name, tool_args)` the model would call.

    Args:
        model_fn: the model under test.
        cases: cases to score. ``None`` selects ``BUILTIN_CASES``; an explicitly
            empty list is scored as given and produces an ``n == 0`` report
            rather than silently running the built-in suite.

    Returns:
        A BenchmarkReport with per-case results and their means. Each case
        scores 1.0 when tool and arguments match, 0.5 when only the tool
        matches, and 0.0 otherwise.
    """
    if cases is None:
        cases = BUILTIN_CASES
    results: List[CaseResult] = []
    for case in cases:
        try:
            predicted_tool, predicted_args = model_fn(case.query, case.tool_specs())
        except Exception:
            # Deliberate best effort: a model that raises (or returns something
            # that cannot be unpacked) is scored as a miss, not a crash.
            predicted_tool, predicted_args = None, {}
        # Models may hand back arbitrary parsed output; anything that is not a
        # tool-name string / argument dict is treated as "no usable call".
        if not isinstance(predicted_tool, str):
            predicted_tool = None
        if not isinstance(predicted_args, dict):
            predicted_args = {}
        tool_ok = predicted_tool == case.expected_tool
        args_ok = tool_ok and _args_ok(case.expected_args, predicted_args)
        score = 1.0 if (tool_ok and args_ok) else (0.5 if tool_ok else 0.0)
        results.append(
            CaseResult(
                id=case.id,
                expected_tool=case.expected_tool,
                predicted_tool=predicted_tool,
                tool_ok=tool_ok,
                args_ok=args_ok,
                score=score,
            )
        )
    return BenchmarkReport(
        n=len(results),
        tool_accuracy=_mean([1.0 if r.tool_ok else 0.0 for r in results]),
        arg_accuracy=_mean([1.0 if r.args_ok else 0.0 for r in results]),
        score=_mean([r.score for r in results]),
        results=results,
    )
182
+
183
+
184
def compare_models(
    before: ModelFn, after: ModelFn, cases: Optional[List[BenchmarkCase]] = None
) -> Dict[str, Any]:
    """Run two models on the same cases and return a before/after comparison.

    Args:
        before: baseline model.
        after: model to compare against the baseline.
        cases: cases to score; ``None`` selects ``BUILTIN_CASES``. An explicitly
            empty list is passed through unchanged instead of being replaced
            here.

    Returns:
        A dict with the two reports (``before``, ``after``), the ``after - before``
        deltas for tool accuracy, argument accuracy and overall score (each
        rounded to four decimals), and ``n``, the number of cases in the
        baseline report.
    """
    if cases is None:
        cases = BUILTIN_CASES
    before_report = run_benchmark(before, cases)
    after_report = run_benchmark(after, cases)
    return {
        "before": before_report,
        "after": after_report,
        "delta_tool_accuracy": round(after_report.tool_accuracy - before_report.tool_accuracy, 4),
        "delta_arg_accuracy": round(after_report.arg_accuracy - before_report.arg_accuracy, 4),
        "delta_score": round(after_report.score - before_report.score, 4),
        "n": before_report.n,
    }
198
+
199
+
200
def agentsynth_model(generator: Any, mode: str = "single_agent") -> ModelFn:
    """Wrap a trajectory generator as a benchmark model.

    The returned callable generates a trajectory for the query and reports the
    first tool call it contains, or ``(None, {})`` when it contains none.
    """

    def model_fn(query: str, tools: List[ToolSpec]) -> Tuple[Optional[str], Dict[str, Any]]:
        trajectory = generator.generate(query, tools=tools, mode=mode)
        calls = trajectory.tool_calls()
        if calls:
            # Missing or empty arguments are normalized to an empty dict.
            return calls[0].tool_name, calls[0].tool_args or {}
        return None, {}

    return model_fn
212
+
213
+
214
def prompted_model(complete_fn: Callable[[str], str]) -> ModelFn:
    """Build a benchmark model from a text-completion function `(prompt) -> text`.

    The model is shown the tool catalog as JSON, asked to answer with a single
    JSON tool call, and its reply is parsed with ``extract_json``. A reply that
    does not parse to a dict yields ``(None, {})``; a non-dict ``args`` value is
    replaced by ``{}``.
    """

    def model_fn(query: str, tools: List[ToolSpec]) -> Tuple[Optional[str], Dict[str, Any]]:
        catalog = []
        for spec in tools:
            catalog.append(
                {"name": spec.name, "description": spec.description, "parameters": spec.parameters}
            )
        tool_json = json.dumps(catalog)

        prompt = (
            "You can call exactly one tool to help the user.\n"
            f"Tools (JSON): {tool_json}\n\n"
            f"User: {query}\n"
            'Respond with ONLY a JSON object: {"tool": "<tool name>", "args": {<arguments>}}'
        )

        reply = extract_json(complete_fn(prompt))
        if not isinstance(reply, dict):
            return None, {}
        args = reply.get("args")
        if not isinstance(args, dict):
            args = {}
        return reply.get("tool"), args

    return model_fn
241
+
242
+
243
def report_table_md(comparison: Dict[str, Any]) -> str:
    """Render a before/after comparison as a markdown table.

    Args:
        comparison: a mapping with ``before`` and ``after`` reports (objects
            exposing ``tool_accuracy``, ``arg_accuracy`` and ``score``), the
            keys ``delta_tool_accuracy``, ``delta_score`` and ``n``, and
            optionally ``delta_arg_accuracy``.

    Returns:
        A title line, a blank line, and a four-column table (metric, before,
        after, Δ) with every value formatted as a percentage to one decimal.
        The argument-accuracy Δ is taken from ``delta_arg_accuracy`` when
        present and otherwise computed from the two reports, so that cell is
        never left blank.
    """
    before = comparison["before"]
    after = comparison["after"]

    def pct(x: float) -> str:
        return f"{x * 100:.1f}%"

    delta_arg = comparison.get("delta_arg_accuracy")
    if delta_arg is None:
        delta_arg = round(after.arg_accuracy - before.arg_accuracy, 4)

    lines = [
        f"Function-calling benchmark ({comparison['n']} cases)",
        "",
        "| Metric | Before | After | Δ |",
        "| --- | --- | --- | --- |",
        f"| Tool accuracy | {pct(before.tool_accuracy)} | {pct(after.tool_accuracy)} "
        f"| {pct(comparison['delta_tool_accuracy'])} |",
        f"| Arg accuracy | {pct(before.arg_accuracy)} | {pct(after.arg_accuracy)} "
        f"| {pct(delta_arg)} |",
        f"| Overall score | {pct(before.score)} | {pct(after.score)} "
        f"| {pct(comparison['delta_score'])} |",
    ]
    return "\n".join(lines)
263
+
264
+
265
+ __all__ = [
266
+ "BenchmarkCase",
267
+ "CaseResult",
268
+ "BenchmarkReport",
269
+ "BUILTIN_CASES",
270
+ "run_benchmark",
271
+ "compare_models",
272
+ "agentsynth_model",
273
+ "prompted_model",
274
+ "report_table_md",
275
+ ]