agentsynth-ai 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentsynth/__init__.py +168 -0
- agentsynth/benchmarks/__init__.py +37 -0
- agentsynth/benchmarks/bfcl.py +123 -0
- agentsynth/benchmarks/tau_bench.py +71 -0
- agentsynth/benchmarks/tool_calling.py +275 -0
- agentsynth/cli.py +236 -0
- agentsynth/dedup.py +104 -0
- agentsynth/environments/__init__.py +16 -0
- agentsynth/environments/base.py +85 -0
- agentsynth/environments/mcp_env.py +199 -0
- agentsynth/environments/python_sandbox.py +93 -0
- agentsynth/environments/sql.py +153 -0
- agentsynth/evaluator.py +613 -0
- agentsynth/exporters.py +297 -0
- agentsynth/generator.py +867 -0
- agentsynth/hub.py +157 -0
- agentsynth/metrics.py +410 -0
- agentsynth/pipelines/__init__.py +8 -0
- agentsynth/pipelines/recipe.py +77 -0
- agentsynth/pipelines/runner.py +164 -0
- agentsynth/preferences.py +134 -0
- agentsynth/py.typed +0 -0
- agentsynth/schemas.py +262 -0
- agentsynth/tasks/__init__.py +7 -0
- agentsynth/tasks/taxonomy.py +193 -0
- agentsynth/training/__init__.py +19 -0
- agentsynth/training/datasets.py +71 -0
- agentsynth/utils.py +465 -0
- agentsynth/verification/__init__.py +37 -0
- agentsynth/verification/base.py +99 -0
- agentsynth/verification/ensemble.py +70 -0
- agentsynth/verification/rubrics.py +63 -0
- agentsynth/verification/verifiers.py +135 -0
- agentsynth_ai-0.2.0.dist-info/METADATA +589 -0
- agentsynth_ai-0.2.0.dist-info/RECORD +39 -0
- agentsynth_ai-0.2.0.dist-info/WHEEL +5 -0
- agentsynth_ai-0.2.0.dist-info/entry_points.txt +2 -0
- agentsynth_ai-0.2.0.dist-info/licenses/LICENSE +21 -0
- agentsynth_ai-0.2.0.dist-info/top_level.txt +1 -0
agentsynth/__init__.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""AgentSynth: generate synthetic agent trajectories and score them with an
|
|
2
|
+
LLM-as-Judge loop. Runs offline against a mock by default.
|
|
3
|
+
|
|
4
|
+
>>> from agentsynth import AgentTrajectoryGenerator, TrajectoryEvaluator
|
|
5
|
+
>>> gen = AgentTrajectoryGenerator()
|
|
6
|
+
>>> traj = gen.generate("What's the weather in Paris?")
|
|
7
|
+
>>> ev = TrajectoryEvaluator().evaluate(traj)
|
|
8
|
+
>>> ev.overall, ev.passed
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
__version__ = "0.2.0"
|
|
14
|
+
|
|
15
|
+
from .benchmarks import (
|
|
16
|
+
BUILTIN_CASES,
|
|
17
|
+
BenchmarkCase,
|
|
18
|
+
BenchmarkReport,
|
|
19
|
+
agentsynth_model,
|
|
20
|
+
compare_models,
|
|
21
|
+
prompted_model,
|
|
22
|
+
report_table_md,
|
|
23
|
+
run_benchmark,
|
|
24
|
+
)
|
|
25
|
+
from .dedup import DedupResult, decontaminate, dedup_trajectories
|
|
26
|
+
from .environments import (
|
|
27
|
+
CompositeEnvironment,
|
|
28
|
+
Environment,
|
|
29
|
+
MCPEnvironment,
|
|
30
|
+
PythonSandbox,
|
|
31
|
+
SQLEnvironment,
|
|
32
|
+
)
|
|
33
|
+
from .evaluator import TrajectoryEvaluator
|
|
34
|
+
from .exporters import (
|
|
35
|
+
load_jsonl,
|
|
36
|
+
save_dataset,
|
|
37
|
+
to_adp,
|
|
38
|
+
to_jsonl,
|
|
39
|
+
to_sharegpt,
|
|
40
|
+
)
|
|
41
|
+
from .generator import AgentTrajectoryGenerator
|
|
42
|
+
from .hub import dataset_card, prepare_dataset_dir, push_dataset
|
|
43
|
+
from .metrics import compute_dataset_metrics, diversity_score
|
|
44
|
+
from .pipelines import Recipe, RunResult, load_recipe, make_environment, run_recipe
|
|
45
|
+
from .preferences import (
|
|
46
|
+
PreferencePair,
|
|
47
|
+
build_preference_pairs,
|
|
48
|
+
load_dpo_jsonl,
|
|
49
|
+
to_dpo_jsonl,
|
|
50
|
+
)
|
|
51
|
+
from .schemas import (
|
|
52
|
+
DEFAULT_RUBRIC_WEIGHTS,
|
|
53
|
+
RUBRIC_DIMENSIONS,
|
|
54
|
+
EvalResult,
|
|
55
|
+
RubricScores,
|
|
56
|
+
ToolSpec,
|
|
57
|
+
Trajectory,
|
|
58
|
+
TrajectoryStep,
|
|
59
|
+
)
|
|
60
|
+
from .tasks import SEED_TASKS, SeedTask, sample_tasks
|
|
61
|
+
from .training import build_dpo_dataset, build_sft_dataset, to_dpo_records, to_sft_records
|
|
62
|
+
from .utils import (
|
|
63
|
+
DEFAULT_TOOL_CATALOG,
|
|
64
|
+
LLMClient,
|
|
65
|
+
PythonREPL,
|
|
66
|
+
default_tool_catalog,
|
|
67
|
+
parse_tool_catalog,
|
|
68
|
+
)
|
|
69
|
+
from .verification import (
|
|
70
|
+
RUBRIC_PRESETS,
|
|
71
|
+
EnsembleEvaluator,
|
|
72
|
+
ExecutionVerifier,
|
|
73
|
+
ExpectedAnswerVerifier,
|
|
74
|
+
SafetyVerifier,
|
|
75
|
+
ToolArgVerifier,
|
|
76
|
+
VerificationResult,
|
|
77
|
+
Verifier,
|
|
78
|
+
batch_verify,
|
|
79
|
+
get_rubric,
|
|
80
|
+
rubric_names,
|
|
81
|
+
verify_trajectory,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
__all__ = [
|
|
85
|
+
"__version__",
|
|
86
|
+
# schemas
|
|
87
|
+
"Trajectory",
|
|
88
|
+
"TrajectoryStep",
|
|
89
|
+
"ToolSpec",
|
|
90
|
+
"RubricScores",
|
|
91
|
+
"EvalResult",
|
|
92
|
+
"RUBRIC_DIMENSIONS",
|
|
93
|
+
"DEFAULT_RUBRIC_WEIGHTS",
|
|
94
|
+
# utils
|
|
95
|
+
"parse_tool_catalog",
|
|
96
|
+
"default_tool_catalog",
|
|
97
|
+
"DEFAULT_TOOL_CATALOG",
|
|
98
|
+
"PythonREPL",
|
|
99
|
+
"LLMClient",
|
|
100
|
+
# core
|
|
101
|
+
"AgentTrajectoryGenerator",
|
|
102
|
+
"TrajectoryEvaluator",
|
|
103
|
+
# metrics
|
|
104
|
+
"compute_dataset_metrics",
|
|
105
|
+
"diversity_score",
|
|
106
|
+
# exporters
|
|
107
|
+
"to_jsonl",
|
|
108
|
+
"to_sharegpt",
|
|
109
|
+
"to_adp",
|
|
110
|
+
"save_dataset",
|
|
111
|
+
"load_jsonl",
|
|
112
|
+
# environments
|
|
113
|
+
"Environment",
|
|
114
|
+
"CompositeEnvironment",
|
|
115
|
+
"SQLEnvironment",
|
|
116
|
+
"PythonSandbox",
|
|
117
|
+
"MCPEnvironment",
|
|
118
|
+
# tasks
|
|
119
|
+
"SeedTask",
|
|
120
|
+
"SEED_TASKS",
|
|
121
|
+
"sample_tasks",
|
|
122
|
+
# pipelines
|
|
123
|
+
"Recipe",
|
|
124
|
+
"RunResult",
|
|
125
|
+
"run_recipe",
|
|
126
|
+
"load_recipe",
|
|
127
|
+
"make_environment",
|
|
128
|
+
# verification
|
|
129
|
+
"Verifier",
|
|
130
|
+
"VerificationResult",
|
|
131
|
+
"verify_trajectory",
|
|
132
|
+
"batch_verify",
|
|
133
|
+
"ExecutionVerifier",
|
|
134
|
+
"ToolArgVerifier",
|
|
135
|
+
"SafetyVerifier",
|
|
136
|
+
"ExpectedAnswerVerifier",
|
|
137
|
+
"EnsembleEvaluator",
|
|
138
|
+
"RUBRIC_PRESETS",
|
|
139
|
+
"get_rubric",
|
|
140
|
+
"rubric_names",
|
|
141
|
+
# preferences (DPO)
|
|
142
|
+
"PreferencePair",
|
|
143
|
+
"build_preference_pairs",
|
|
144
|
+
"to_dpo_jsonl",
|
|
145
|
+
"load_dpo_jsonl",
|
|
146
|
+
# dedup / decontamination
|
|
147
|
+
"dedup_trajectories",
|
|
148
|
+
"decontaminate",
|
|
149
|
+
"DedupResult",
|
|
150
|
+
# training data prep
|
|
151
|
+
"to_sft_records",
|
|
152
|
+
"to_dpo_records",
|
|
153
|
+
"build_sft_dataset",
|
|
154
|
+
"build_dpo_dataset",
|
|
155
|
+
# benchmark
|
|
156
|
+
"BenchmarkCase",
|
|
157
|
+
"BenchmarkReport",
|
|
158
|
+
"BUILTIN_CASES",
|
|
159
|
+
"run_benchmark",
|
|
160
|
+
"compare_models",
|
|
161
|
+
"agentsynth_model",
|
|
162
|
+
"prompted_model",
|
|
163
|
+
"report_table_md",
|
|
164
|
+
# hub
|
|
165
|
+
"dataset_card",
|
|
166
|
+
"prepare_dataset_dir",
|
|
167
|
+
"push_dataset",
|
|
168
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""A self-contained function-calling benchmark, plus BFCL and τ-bench adapters."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .bfcl import SAMPLE_BFCL, bfcl_case, load_bfcl, sample_cases
|
|
6
|
+
from .tau_bench import run_tau_bench, tau_bench_available
|
|
7
|
+
from .tool_calling import (
|
|
8
|
+
BUILTIN_CASES,
|
|
9
|
+
BenchmarkCase,
|
|
10
|
+
BenchmarkReport,
|
|
11
|
+
CaseResult,
|
|
12
|
+
agentsynth_model,
|
|
13
|
+
compare_models,
|
|
14
|
+
prompted_model,
|
|
15
|
+
report_table_md,
|
|
16
|
+
run_benchmark,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"BenchmarkCase",
|
|
21
|
+
"CaseResult",
|
|
22
|
+
"BenchmarkReport",
|
|
23
|
+
"BUILTIN_CASES",
|
|
24
|
+
"run_benchmark",
|
|
25
|
+
"compare_models",
|
|
26
|
+
"agentsynth_model",
|
|
27
|
+
"prompted_model",
|
|
28
|
+
"report_table_md",
|
|
29
|
+
# BFCL
|
|
30
|
+
"load_bfcl",
|
|
31
|
+
"bfcl_case",
|
|
32
|
+
"sample_cases",
|
|
33
|
+
"SAMPLE_BFCL",
|
|
34
|
+
# tau-bench
|
|
35
|
+
"run_tau_bench",
|
|
36
|
+
"tau_bench_available",
|
|
37
|
+
]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Load Berkeley Function-Calling Leaderboard (BFCL) cases into the harness.
|
|
2
|
+
|
|
3
|
+
BFCL ships JSONL: a questions file (`id`, `question`, `function`) and a
|
|
4
|
+
possible-answers file (`id`, `ground_truth`). This converts the pair into
|
|
5
|
+
`BenchmarkCase`s so any `model_fn` can be scored with `run_benchmark`.
|
|
6
|
+
|
|
7
|
+
The scoring here is a simplified tool + argument match (arguments accept the list
|
|
8
|
+
of allowed values BFCL gives), not BFCL's full AST checker. Use it for a quick
|
|
9
|
+
before/after signal; use BFCL's own scorer for the leaderboard.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
from .tool_calling import BenchmarkCase
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _user_query(question: Any) -> str:
    """Extract the first user message's text from a BFCL `question` field.

    Accepted shapes:
      * a list of turns, each a list of ``{"role", "content"}`` messages
        (only the first turn is inspected),
      * a flat list of messages,
      * a single message dict,
      * a plain string, which is returned unchanged.

    Returns:
        The content of the first message whose role is ``"user"``. A missing
        or empty question yields ``""``; any other unrecognised input falls
        back to ``str(question)``.
    """
    # A missing/empty question must not leak "None" or "[]" into the prompt.
    if question is None or (isinstance(question, (str, list, tuple, dict)) and not question):
        return ""
    if isinstance(question, str):
        return question

    if isinstance(question, dict):
        # A bare message; indexing it with [0] would raise KeyError.
        turns: Any = [question]
    elif isinstance(question, (list, tuple)):
        turns = question[0] if isinstance(question[0], list) else question
    else:
        return str(question)

    for msg in turns:
        if isinstance(msg, dict) and msg.get("role") == "user":
            return str(msg.get("content", ""))
    return str(question)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _normalize_tools(functions: Any) -> List[Dict[str, Any]]:
    """Convert BFCL function descriptions into tool-catalog dictionaries.

    Each output entry has the keys ``name``, ``description`` and
    ``parameters``. The top-level parameter type ``"dict"`` is rewritten to
    the JSON Schema spelling ``"object"``; nested schemas are left untouched.
    The input records are not mutated (the parameters mapping is shallow
    copied before the rewrite).

    Args:
        functions: A list of function dicts, a single function dict (treated
            as a one-element list), or ``None``.

    Returns:
        One normalized dict per function dict found; entries that are not
        dicts are skipped.
    """
    if isinstance(functions, dict):
        # Iterating a dict would yield its keys, not a function record.
        functions = [functions]

    out: List[Dict[str, Any]] = []
    for fn in functions or []:
        if not isinstance(fn, dict):
            continue  # malformed entry: nothing to normalize
        params = dict(fn.get("parameters") or {})
        if params.get("type") == "dict":  # BFCL uses "dict"; JSON Schema uses "object"
            params["type"] = "object"
        out.append(
            {"name": fn.get("name"), "description": fn.get("description", ""), "parameters": params}
        )
    return out
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def bfcl_case(
    question_rec: Dict[str, Any], answer_rec: Optional[Dict[str, Any]] = None
) -> Optional[BenchmarkCase]:
    """Build a BenchmarkCase from a BFCL question record and its answer record.

    Only the first ground-truth entry is used: its first key is the expected
    tool name and its value (when it is a dict) the expected arguments.

    Returns:
        The converted case, or ``None`` when there is no usable ground truth
        (missing/empty list, or a first entry that is not a non-empty dict).
    """
    truths = (answer_rec or {}).get("ground_truth") or []
    if not truths:
        return None

    head = truths[0]
    if not isinstance(head, dict) or not head:
        return None

    tool_name = next(iter(head))
    raw_args = head[tool_name]
    return BenchmarkCase(
        id=str(question_rec.get("id", "")),
        query=_user_query(question_rec.get("question")),
        expected_tool=tool_name,
        expected_args=raw_args if isinstance(raw_args, dict) else {},
        tools=_normalize_tools(question_rec.get("function")),
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _read_jsonl(path: str) -> List[Dict[str, Any]]:
    """Read a JSON Lines file and return one parsed value per non-blank line.

    Args:
        path: Location of the JSONL file.

    Returns:
        The decoded records, in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with ``<path>:<line number>`` so the offending record
            can be located.
        OSError: If the file cannot be opened.
    """
    records: List[Dict[str, Any]] = []
    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with open(path, encoding="utf-8-sig") as fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Same exception type as before, with file/line context added.
                raise json.JSONDecodeError(
                    f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos
                ) from exc
    return records
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def load_bfcl(questions_path: str, answers_path: Optional[str] = None) -> List[BenchmarkCase]:
    """Read BFCL question (and optionally possible-answer) JSONL into cases.

    Questions are paired with answers by the string form of their ``id``.
    Questions for which `bfcl_case` returns ``None`` (no usable ground
    truth) are left out of the result.
    """
    question_recs = _read_jsonl(questions_path)

    answers_by_id: Dict[str, Dict[str, Any]] = {}
    if answers_path:
        for answer in _read_jsonl(answers_path):
            answers_by_id[str(answer.get("id"))] = answer

    converted = (
        bfcl_case(rec, answers_by_id.get(str(rec.get("id")))) for rec in question_recs
    )
    return [case for case in converted if case is not None]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# A couple of BFCL-format records, for tests and an offline demo.
# Each entry is a (question record, possible-answer record) pair sharing one `id`.
# In the answer record, every argument maps to the list of values accepted for it.
SAMPLE_BFCL: List[Tuple[Dict[str, Any], Dict[str, Any]]] = [
    (
        {
            "id": "simple_0",
            "question": [[{"role": "user", "content": "What is the weather in Paris?"}]],
            "function": [
                {
                    "name": "get_weather",
                    "description": "Get the current weather for a city.",
                    "parameters": {
                        "type": "dict",
                        "properties": {"city": {"type": "string"}},
                        "required": ["city"],
                    },
                }
            ],
        },
        {"id": "simple_0", "ground_truth": [{"get_weather": {"city": ["Paris", "paris"]}}]},
    ),
    (
        {
            "id": "simple_1",
            "question": [[{"role": "user", "content": "Compute 15 factorial."}]],
            "function": [
                {
                    "name": "math_factorial",
                    "description": "Compute the factorial of a number.",
                    "parameters": {
                        "type": "dict",
                        "properties": {"n": {"type": "integer"}},
                        "required": ["n"],
                    },
                }
            ],
        },
        {"id": "simple_1", "ground_truth": [{"math_factorial": {"n": [15]}}]},
    ),
]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def sample_cases() -> List[BenchmarkCase]:
    """Convert the bundled `SAMPLE_BFCL` pairs into BenchmarkCases.

    Pairs that `bfcl_case` cannot convert (it returns ``None``) are dropped.
    """
    built: List[BenchmarkCase] = []
    for question_rec, answer_rec in SAMPLE_BFCL:
        case = bfcl_case(question_rec, answer_rec)
        if case is not None:
            built.append(case)
    return built
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
__all__ = ["bfcl_case", "load_bfcl", "sample_cases", "SAMPLE_BFCL"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Run τ-bench against a model.
|
|
2
|
+
|
|
3
|
+
Unlike BFCL, τ-bench is multi-turn and agentic — a user simulator plus a domain
|
|
4
|
+
database — so it can't be scored from single tool calls. This is a thin bridge to
|
|
5
|
+
the official `tau-bench` package: it builds the environment and the package's
|
|
6
|
+
tool-calling agent and reports the pass rate.
|
|
7
|
+
|
|
8
|
+
Point `model` at your fine-tuned model served behind an OpenAI-compatible endpoint
|
|
9
|
+
(vLLM, TGI, …). Install the harness with:
|
|
10
|
+
|
|
11
|
+
pip install git+https://github.com/sierra-research/tau-bench
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Any, Dict, List, Optional
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def tau_bench_available() -> bool:
    """Tell whether the optional `tau_bench` package imports successfully.

    Any exception raised while importing it counts as "not available".
    """
    installed = True
    try:
        import tau_bench  # noqa: F401
    except Exception:
        installed = False
    return installed
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def run_tau_bench(
    model: str,
    provider: str = "openai",
    env_name: str = "retail",
    user_model: str = "gpt-4o",
    user_provider: str = "openai",
    task_split: str = "test",
    task_ids: Optional[List[int]] = None,
) -> Dict[str, Any]:
    """Run τ-bench tasks with the package's tool-calling agent and summarise them.

    Requires the `tau-bench` package and API keys for the agent + user-simulator models.

    Args:
        model: Model name handed to the τ-bench tool-calling agent.
        provider: Provider name handed to the agent.
        env_name: τ-bench environment to build.
        user_model: Model used for the LLM user simulator.
        user_provider: Provider for the user-simulator model.
        task_split: Task split passed to the environment.
        task_ids: Task indices to run; every task in the environment when ``None``.

    Returns:
        ``{"env", "n", "pass_rate", "avg_reward"}``. A task counts as passed
        when its reward is at least 0.999; both rates are rounded to four
        decimals and are 0.0 when no task was run.

    Raises:
        ImportError: If the τ-bench modules cannot be imported.
    """
    try:
        from tau_bench.agents.tool_calling_agent import ToolCallingAgent
        from tau_bench.envs import get_env
    except Exception as exc:
        raise ImportError(
            "tau-bench is not installed: "
            "pip install git+https://github.com/sierra-research/tau-bench"
        ) from exc

    env = get_env(
        env_name,
        user_strategy="llm",
        user_model=user_model,
        user_provider=user_provider,
        task_split=task_split,
    )
    agent = ToolCallingAgent(
        tools_info=env.tools_info, wiki=env.wiki, model=model, provider=provider
    )

    selected = list(range(len(env.tasks))) if task_ids is None else task_ids
    rewards: List[float] = []
    for task_index in selected:
        outcome = agent.solve(env=env, task_index=task_index)
        # A result without a `reward` attribute is counted as reward 0.0.
        rewards.append(float(getattr(outcome, "reward", 0.0)))

    total = len(rewards)
    if total:
        passed = sum(1 for reward in rewards if reward >= 0.999)
        pass_rate = round(passed / total, 4)
        avg_reward = round(sum(rewards) / total, 4)
    else:
        pass_rate = 0.0
        avg_reward = 0.0

    return {
        "env": env_name,
        "n": total,
        "pass_rate": pass_rate,
        "avg_reward": avg_reward,
    }
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
__all__ = ["tau_bench_available", "run_tau_bench"]
|
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""A small function-calling benchmark.
|
|
2
|
+
|
|
3
|
+
Each case gives a query and the full tool catalog; the model has to pick the right
|
|
4
|
+
tool and supply sane arguments. Scoring is deliberately simple — tool match plus a
|
|
5
|
+
lenient arg check — so it's easy to reason about and produces a clear before/after
|
|
6
|
+
table. A "model" here is any callable `(query, tools) -> (tool_name, tool_args)`,
|
|
7
|
+
so you can score a real LLM, a fine-tuned model, or AgentSynth's own generator.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple
|
|
14
|
+
|
|
15
|
+
from pydantic import BaseModel, Field
|
|
16
|
+
|
|
17
|
+
from ..schemas import ToolSpec
|
|
18
|
+
from ..utils import DEFAULT_TOOL_CATALOG, extract_json, parse_tool_catalog
|
|
19
|
+
|
|
20
|
+
# A benchmark "model": called as `(query, tools)`, it returns the
# `(tool_name, tool_args)` it would call; `tool_name` may be None.
ModelFn = Callable[[str, List[ToolSpec]], Tuple[Optional[str], Dict[str, Any]]]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BenchmarkCase(BaseModel):
    """One function-calling test case: a query and the tool call expected for it."""

    # Identifier copied into the per-case result.
    id: str
    # User request handed to the model.
    query: str
    # Name of the tool the model is expected to call.
    expected_tool: str
    expected_args: Dict[str, Any] = Field(default_factory=dict)  # value None => key presence only
    # The full default catalog (with parameter schemas), so the model must both pick the
    # right tool and fill its arguments.
    tools: List[Dict[str, Any]] = Field(default_factory=lambda: list(DEFAULT_TOOL_CATALOG))

    def tool_specs(self) -> List[ToolSpec]:
        """Return `tools` converted by `parse_tool_catalog`."""
        return parse_tool_catalog(self.tools)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CaseResult(BaseModel):
    """Outcome of scoring one benchmark case."""

    # Identifier of the case this result belongs to.
    id: str
    # Tool the case expected.
    expected_tool: str
    # Tool the model chose; None when it chose none or the model call failed.
    predicted_tool: Optional[str]
    # True when the predicted tool equals the expected tool.
    tool_ok: bool
    # True when the tool matched and the argument check passed.
    args_ok: bool
    # 1.0 for tool + args, 0.5 for tool only, 0.0 otherwise (as set by run_benchmark).
    score: float
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BenchmarkReport(BaseModel):
    """Aggregate result of one benchmark run."""

    # Number of cases scored.
    n: int
    # Mean of per-case tool matches.
    tool_accuracy: float
    # Mean of per-case argument matches.
    arg_accuracy: float
    # Mean of the per-case scores.
    score: float
    # Per-case details, in the order the cases were run.
    results: List[CaseResult] = Field(default_factory=list)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Built-in cases: two per tool. They use the default tool catalog (no `tools`
# override). An expected argument of None only requires the key to be present.
BUILTIN_CASES: List[BenchmarkCase] = [
    # get_weather: the city value itself is compared.
    BenchmarkCase(
        id="weather_paris",
        query="What's the weather in Paris right now?",
        expected_tool="get_weather",
        expected_args={"city": "Paris"},
    ),
    BenchmarkCase(
        id="weather_tokyo",
        query="Is it raining in Tokyo today?",
        expected_tool="get_weather",
        expected_args={"city": "Tokyo"},
    ),
    # calculator
    BenchmarkCase(
        id="math_mult",
        query="What is 23 times 7 plus 4?",
        expected_tool="calculator",
        expected_args={"expression": None},
    ),
    BenchmarkCase(
        id="math_tip",
        query="Calculate an 18% tip on a $54 bill.",
        expected_tool="calculator",
        expected_args={"expression": None},
    ),
    # web_search
    BenchmarkCase(
        id="search_news",
        query="Find recent news about open-source AI agents.",
        expected_tool="web_search",
        expected_args={"query": None},
    ),
    BenchmarkCase(
        id="search_fact",
        query="Search the web for the population of Vietnam.",
        expected_tool="web_search",
        expected_args={"query": None},
    ),
    # read_file
    BenchmarkCase(
        id="file_csv",
        query="Read the file data/report.csv and summarize it.",
        expected_tool="read_file",
        expected_args={"path": None},
    ),
    BenchmarkCase(
        id="file_notes",
        query="Open notes.md and list the action items.",
        expected_tool="read_file",
        expected_args={"path": None},
    ),
    # sql_query
    BenchmarkCase(
        id="sql_revenue",
        query="Query the database for total revenue by region.",
        expected_tool="sql_query",
        expected_args={"query": None},
    ),
    BenchmarkCase(
        id="sql_count",
        query="Run a SQL query to count rows in the sales table.",
        expected_tool="sql_query",
        expected_args={"query": None},
    ),
    # send_email
    BenchmarkCase(
        id="email_launch",
        query="Send an email to team@example.com about the launch.",
        expected_tool="send_email",
        expected_args={"to": None},
    ),
    BenchmarkCase(
        id="email_summary",
        query="Email a summary of the report to alex@example.com.",
        expected_tool="send_email",
        expected_args={"to": None},
    ),
]
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _args_ok(expected: Dict[str, Any], predicted: Dict[str, Any]) -> bool:
    """Leniently check predicted tool arguments against the expected ones.

    For every expected key:
      * ``None`` means the key only has to be present;
      * a list/tuple is a set of acceptable values (BFCL-style); if it
        contains ``""`` the argument is optional and may be omitted;
      * any other value must match.

    Values are compared as stripped, lower-cased strings. Extra predicted
    keys are ignored.

    Args:
        expected: Expected arguments; ``None`` or empty accepts anything.
        predicted: Arguments produced by the model. Anything that is not a
            dict is treated as "no arguments".

    Returns:
        True when every expected key is satisfied.
    """
    # `in` on a raw JSON string is a substring test and would let malformed
    # output pass key-presence checks (or crash on indexing), so only a real
    # dict is trusted.
    if not isinstance(predicted, dict):
        predicted = {}

    for key, value in (expected or {}).items():
        allowed = None
        if isinstance(value, (list, tuple)):  # any acceptable value (BFCL-style)
            allowed = [str(v).strip().lower() for v in value]

        if key not in predicted:
            if allowed is not None and "" in allowed:
                continue  # optional argument left out
            return False
        if value is None:
            continue  # key presence is enough

        pred = str(predicted[key]).strip().lower()
        if allowed is not None:
            if pred not in allowed:
                return False
        elif pred != str(value).strip().lower():
            return False
    return True
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _mean(values: List[float]) -> float:
    """Return the arithmetic mean rounded to four decimals, or 0.0 for no values."""
    if not values:
        return 0.0
    total = sum(values)
    return round(total / len(values), 4)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def run_benchmark(
    model_fn: ModelFn, cases: Optional[List[BenchmarkCase]] = None
) -> BenchmarkReport:
    """Score a model on the function-calling cases.

    `model_fn(query, tools)` returns the `(tool_name, tool_args)` the model would call.

    Per case the score is 1.0 when tool and arguments match, 0.5 when only
    the tool matches and 0.0 otherwise. A case whose `model_fn` call raises
    is scored as a miss and the failure is logged, so one bad case does not
    abort the run.

    Args:
        model_fn: The model under test.
        cases: Cases to run; the built-in cases when ``None`` or empty.

    Returns:
        A report with the per-case results and their means.
    """
    import logging  # local import: the module does not otherwise need logging

    log = logging.getLogger(__name__)
    cases = cases or BUILTIN_CASES
    results: List[CaseResult] = []
    for case in cases:
        try:
            predicted_tool, predicted_args = model_fn(case.query, case.tool_specs())
        except Exception:
            # Best-effort scoring is kept, but the failure is no longer silent.
            log.warning(
                "model_fn failed on benchmark case %r; scoring it as a miss",
                case.id,
                exc_info=True,
            )
            predicted_tool, predicted_args = None, {}

        # Malformed model output must not crash the run after the try block:
        # a non-string tool can never match, and non-dict args carry no keys.
        if not isinstance(predicted_tool, str):
            predicted_tool = None
        if not isinstance(predicted_args, dict):
            predicted_args = {}

        tool_ok = predicted_tool == case.expected_tool
        args_ok = tool_ok and _args_ok(case.expected_args, predicted_args)
        score = 1.0 if (tool_ok and args_ok) else (0.5 if tool_ok else 0.0)
        results.append(
            CaseResult(
                id=case.id,
                expected_tool=case.expected_tool,
                predicted_tool=predicted_tool,
                tool_ok=tool_ok,
                args_ok=args_ok,
                score=score,
            )
        )
    return BenchmarkReport(
        n=len(results),
        tool_accuracy=_mean([1.0 if r.tool_ok else 0.0 for r in results]),
        arg_accuracy=_mean([1.0 if r.args_ok else 0.0 for r in results]),
        score=_mean([r.score for r in results]),
        results=results,
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def compare_models(
    before: ModelFn, after: ModelFn, cases: Optional[List[BenchmarkCase]] = None
) -> Dict[str, Any]:
    """Run two models on the same cases and return a before/after comparison.

    Args:
        before: Baseline model.
        after: Model to compare against the baseline.
        cases: Cases to run; the built-in cases when ``None`` or empty.

    Returns:
        A dict with the two reports (``before``, ``after``), the deltas
        ``delta_tool_accuracy``, ``delta_arg_accuracy`` and ``delta_score``
        (after minus before, rounded to four decimals) and ``n``, the number
        of cases in the baseline report.
    """
    cases = cases or BUILTIN_CASES
    before_report = run_benchmark(before, cases)
    after_report = run_benchmark(after, cases)
    return {
        "before": before_report,
        "after": after_report,
        "delta_tool_accuracy": round(after_report.tool_accuracy - before_report.tool_accuracy, 4),
        # Reported alongside the other two metrics so all three have a delta.
        "delta_arg_accuracy": round(after_report.arg_accuracy - before_report.arg_accuracy, 4),
        "delta_score": round(after_report.score - before_report.score, 4),
        "n": before_report.n,
    }
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def agentsynth_model(generator: Any, mode: str = "single_agent") -> ModelFn:
    """Wrap a trajectory generator as a benchmark model.

    The returned callable generates a trajectory for the query (passing the
    tools and `mode` through) and reports the first tool call it contains,
    or ``(None, {})`` when the trajectory makes no tool call.
    """

    def model_fn(query: str, tools: List[ToolSpec]) -> Tuple[Optional[str], Dict[str, Any]]:
        tool_calls = generator.generate(query, tools=tools, mode=mode).tool_calls()
        if tool_calls:
            first = tool_calls[0]
            # Missing/empty arguments are reported as an empty dict.
            return first.tool_name, first.tool_args or {}
        return None, {}

    return model_fn
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def prompted_model(complete_fn: Callable[[str], str]) -> ModelFn:
    """Wrap a text-completion function `(prompt) -> text` as a benchmark model.

    The returned callable lists the tools as JSON in a prompt, asks for a
    single JSON tool call and parses the reply with `extract_json`. It
    yields ``(None, {})`` when the reply does not parse to a dict, and an
    empty argument dict when ``"args"`` is not a dict.
    """

    def model_fn(query: str, tools: List[ToolSpec]) -> Tuple[Optional[str], Dict[str, Any]]:
        catalog = []
        for spec in tools:
            catalog.append(
                {"name": spec.name, "description": spec.description, "parameters": spec.parameters}
            )
        tool_json = json.dumps(catalog)

        prompt = "".join(
            (
                "You can call exactly one tool to help the user.\n",
                f"Tools (JSON): {tool_json}\n\n",
                f"User: {query}\n",
                'Respond with ONLY a JSON object: {"tool": "<tool name>", "args": {<arguments>}}',
            )
        )

        reply = extract_json(complete_fn(prompt))
        if not isinstance(reply, dict):
            return None, {}
        raw_args = reply.get("args")
        if not isinstance(raw_args, dict):
            raw_args = {}
        return reply.get("tool"), raw_args

    return model_fn
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def report_table_md(comparison: Dict[str, Any]) -> str:
    """Render a before/after comparison as a markdown table.

    Args:
        comparison: A mapping with the two reports under ``"before"`` and
            ``"after"`` (objects exposing ``tool_accuracy``,
            ``arg_accuracy`` and ``score``), the deltas
            ``"delta_tool_accuracy"`` and ``"delta_score"``, and the case
            count ``"n"``. ``"delta_arg_accuracy"`` is optional; when it is
            absent the delta is computed from the two reports.

    Returns:
        A title line, a blank line and a four-column markdown table with one
        row per metric, all values formatted as percentages.
    """
    before = comparison["before"]
    after = comparison["after"]

    def pct(x: float) -> str:
        return f"{x * 100:.1f}%"

    # Comparisons built without an arg-accuracy delta still get the cell filled.
    delta_arg = comparison.get("delta_arg_accuracy")
    if delta_arg is None:
        delta_arg = round(after.arg_accuracy - before.arg_accuracy, 4)

    lines = [
        f"Function-calling benchmark ({comparison['n']} cases)",
        "",
        "| Metric | Before | After | Δ |",
        "| --- | --- | --- | --- |",
        f"| Tool accuracy | {pct(before.tool_accuracy)} | {pct(after.tool_accuracy)} "
        f"| {pct(comparison['delta_tool_accuracy'])} |",
        f"| Arg accuracy | {pct(before.arg_accuracy)} | {pct(after.arg_accuracy)} "
        f"| {pct(delta_arg)} |",
        f"| Overall score | {pct(before.score)} | {pct(after.score)} "
        f"| {pct(comparison['delta_score'])} |",
    ]
    return "\n".join(lines)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
__all__ = [
|
|
266
|
+
"BenchmarkCase",
|
|
267
|
+
"CaseResult",
|
|
268
|
+
"BenchmarkReport",
|
|
269
|
+
"BUILTIN_CASES",
|
|
270
|
+
"run_benchmark",
|
|
271
|
+
"compare_models",
|
|
272
|
+
"agentsynth_model",
|
|
273
|
+
"prompted_model",
|
|
274
|
+
"report_table_md",
|
|
275
|
+
]
|