agentsynth-ai 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. agentsynth_ai-0.2.0/LICENSE +21 -0
  2. agentsynth_ai-0.2.0/PKG-INFO +589 -0
  3. agentsynth_ai-0.2.0/README.md +519 -0
  4. agentsynth_ai-0.2.0/agentsynth/__init__.py +168 -0
  5. agentsynth_ai-0.2.0/agentsynth/benchmarks/__init__.py +37 -0
  6. agentsynth_ai-0.2.0/agentsynth/benchmarks/bfcl.py +123 -0
  7. agentsynth_ai-0.2.0/agentsynth/benchmarks/tau_bench.py +71 -0
  8. agentsynth_ai-0.2.0/agentsynth/benchmarks/tool_calling.py +275 -0
  9. agentsynth_ai-0.2.0/agentsynth/cli.py +236 -0
  10. agentsynth_ai-0.2.0/agentsynth/dedup.py +104 -0
  11. agentsynth_ai-0.2.0/agentsynth/environments/__init__.py +16 -0
  12. agentsynth_ai-0.2.0/agentsynth/environments/base.py +85 -0
  13. agentsynth_ai-0.2.0/agentsynth/environments/mcp_env.py +199 -0
  14. agentsynth_ai-0.2.0/agentsynth/environments/python_sandbox.py +93 -0
  15. agentsynth_ai-0.2.0/agentsynth/environments/sql.py +153 -0
  16. agentsynth_ai-0.2.0/agentsynth/evaluator.py +613 -0
  17. agentsynth_ai-0.2.0/agentsynth/exporters.py +297 -0
  18. agentsynth_ai-0.2.0/agentsynth/generator.py +867 -0
  19. agentsynth_ai-0.2.0/agentsynth/hub.py +157 -0
  20. agentsynth_ai-0.2.0/agentsynth/metrics.py +410 -0
  21. agentsynth_ai-0.2.0/agentsynth/pipelines/__init__.py +8 -0
  22. agentsynth_ai-0.2.0/agentsynth/pipelines/recipe.py +77 -0
  23. agentsynth_ai-0.2.0/agentsynth/pipelines/runner.py +164 -0
  24. agentsynth_ai-0.2.0/agentsynth/preferences.py +134 -0
  25. agentsynth_ai-0.2.0/agentsynth/py.typed +0 -0
  26. agentsynth_ai-0.2.0/agentsynth/schemas.py +262 -0
  27. agentsynth_ai-0.2.0/agentsynth/tasks/__init__.py +7 -0
  28. agentsynth_ai-0.2.0/agentsynth/tasks/taxonomy.py +193 -0
  29. agentsynth_ai-0.2.0/agentsynth/training/__init__.py +19 -0
  30. agentsynth_ai-0.2.0/agentsynth/training/datasets.py +71 -0
  31. agentsynth_ai-0.2.0/agentsynth/utils.py +465 -0
  32. agentsynth_ai-0.2.0/agentsynth/verification/__init__.py +37 -0
  33. agentsynth_ai-0.2.0/agentsynth/verification/base.py +99 -0
  34. agentsynth_ai-0.2.0/agentsynth/verification/ensemble.py +70 -0
  35. agentsynth_ai-0.2.0/agentsynth/verification/rubrics.py +63 -0
  36. agentsynth_ai-0.2.0/agentsynth/verification/verifiers.py +135 -0
  37. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/PKG-INFO +589 -0
  38. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/SOURCES.txt +57 -0
  39. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/dependency_links.txt +1 -0
  40. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/entry_points.txt +2 -0
  41. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/requires.txt +53 -0
  42. agentsynth_ai-0.2.0/agentsynth_ai.egg-info/top_level.txt +1 -0
  43. agentsynth_ai-0.2.0/pyproject.toml +129 -0
  44. agentsynth_ai-0.2.0/setup.cfg +4 -0
  45. agentsynth_ai-0.2.0/setup.py +10 -0
  46. agentsynth_ai-0.2.0/tests/test_benchmarks.py +102 -0
  47. agentsynth_ai-0.2.0/tests/test_bfcl.py +56 -0
  48. agentsynth_ai-0.2.0/tests/test_dedup.py +66 -0
  49. agentsynth_ai-0.2.0/tests/test_environments.py +52 -0
  50. agentsynth_ai-0.2.0/tests/test_hub.py +27 -0
  51. agentsynth_ai-0.2.0/tests/test_mcp.py +124 -0
  52. agentsynth_ai-0.2.0/tests/test_pipeline_phase2.py +50 -0
  53. agentsynth_ai-0.2.0/tests/test_pipelines.py +69 -0
  54. agentsynth_ai-0.2.0/tests/test_preferences.py +44 -0
  55. agentsynth_ai-0.2.0/tests/test_regression.py +37 -0
  56. agentsynth_ai-0.2.0/tests/test_smoke.py +274 -0
  57. agentsynth_ai-0.2.0/tests/test_tasks.py +24 -0
  58. agentsynth_ai-0.2.0/tests/test_training.py +36 -0
  59. agentsynth_ai-0.2.0/tests/test_verification.py +128 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 agentsynth
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,589 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentsynth-ai
3
+ Version: 0.2.0
4
+ Summary: Synthetic Agentic Trajectories Generator + LLM-as-Judge Eval Loop for fine-tuning agentic LLMs
5
+ Author: agentsynth
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/agentsynth/agentsynth
8
+ Project-URL: Documentation, https://agentsynth.github.io/agentsynth
9
+ Project-URL: Repository, https://github.com/agentsynth/agentsynth
10
+ Project-URL: Hugging Face Space, https://huggingface.co/spaces/agentsynth/agentsynth
11
+ Project-URL: Issues, https://github.com/agentsynth/agentsynth/issues
12
+ Keywords: synthetic-data,agentic-ai,llm-finetuning,trajectories,tool-use,llm-as-judge,datasets
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: pydantic>=2.5
27
+ Provides-Extra: llm
28
+ Requires-Dist: litellm>=1.40; extra == "llm"
29
+ Provides-Extra: mcp
30
+ Requires-Dist: mcp>=1.2; extra == "mcp"
31
+ Provides-Extra: hub
32
+ Requires-Dist: huggingface-hub>=0.23; extra == "hub"
33
+ Requires-Dist: datasets>=2.18; extra == "hub"
34
+ Provides-Extra: train
35
+ Requires-Dist: trl>=0.9; extra == "train"
36
+ Requires-Dist: transformers>=4.40; extra == "train"
37
+ Requires-Dist: peft>=0.10; extra == "train"
38
+ Requires-Dist: datasets>=2.18; extra == "train"
39
+ Requires-Dist: accelerate>=0.30; extra == "train"
40
+ Provides-Extra: app
41
+ Requires-Dist: gradio<6.0,>=5.0; extra == "app"
42
+ Requires-Dist: litellm>=1.40; extra == "app"
43
+ Requires-Dist: plotly>=5.18; extra == "app"
44
+ Requires-Dist: pandas>=2.0; extra == "app"
45
+ Requires-Dist: datasets>=2.18; extra == "app"
46
+ Requires-Dist: huggingface-hub>=0.23; extra == "app"
47
+ Requires-Dist: pyyaml>=6.0; extra == "app"
48
+ Provides-Extra: dev
49
+ Requires-Dist: pytest>=8.0; extra == "dev"
50
+ Requires-Dist: pytest-cov>=4.1; extra == "dev"
51
+ Requires-Dist: ruff>=0.6; extra == "dev"
52
+ Requires-Dist: mypy>=1.8; extra == "dev"
53
+ Requires-Dist: build>=1.2; extra == "dev"
54
+ Requires-Dist: pre-commit>=3.5; extra == "dev"
55
+ Requires-Dist: pyyaml>=6.0; extra == "dev"
56
+ Requires-Dist: types-PyYAML>=6.0; extra == "dev"
57
+ Provides-Extra: docs
58
+ Requires-Dist: mkdocs-material>=9.5; extra == "docs"
59
+ Requires-Dist: mkdocstrings[python]>=0.25; extra == "docs"
60
+ Provides-Extra: all
61
+ Requires-Dist: litellm>=1.40; extra == "all"
62
+ Requires-Dist: gradio<6.0,>=5.0; extra == "all"
63
+ Requires-Dist: plotly>=5.18; extra == "all"
64
+ Requires-Dist: pandas>=2.0; extra == "all"
65
+ Requires-Dist: datasets>=2.18; extra == "all"
66
+ Requires-Dist: huggingface-hub>=0.23; extra == "all"
67
+ Requires-Dist: pyyaml>=6.0; extra == "all"
68
+ Requires-Dist: mcp>=1.2; python_version >= "3.10" and extra == "all"
69
+ Dynamic: license-file
70
+
71
+ # AgentSynth
72
+
73
+ > Synthetic agentic trajectories with a built-in LLM-as-Judge eval loop. Generate tool-use, code-execution, and multi-agent training data offline, then score it.
74
+
75
+ <p align="center">
76
+ <a href="https://github.com/agentsynth/agentsynth/actions/workflows/ci.yml"><img alt="CI" src="https://github.com/agentsynth/agentsynth/actions/workflows/ci.yml/badge.svg"></a>
77
+ <a href="https://pypi.org/project/agentsynth-ai/"><img alt="PyPI" src="https://img.shields.io/pypi/v/agentsynth-ai.svg"></a>
78
+ <a href="https://codecov.io/gh/agentsynth/agentsynth"><img alt="coverage" src="https://codecov.io/gh/agentsynth/agentsynth/branch/main/graph/badge.svg"></a>
79
+ <a href="https://www.python.org/downloads/"><img alt="Python 3.9+" src="https://img.shields.io/badge/python-3.9%2B-blue.svg"></a>
80
+ <a href="https://github.com/agentsynth/agentsynth/blob/main/LICENSE"><img alt="License: MIT" src="https://img.shields.io/badge/license-MIT-green.svg"></a>
81
+ <a href="https://huggingface.co/spaces/agentsynth/agentsynth"><img alt="Hugging Face Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow.svg"></a>
82
+ </p>
83
+
84
+ <p align="center">
85
+ <a href="https://agentsynth.github.io/agentsynth/">Docs</a> ·
86
+ <a href="docs/VISION.md">Vision</a> ·
87
+ <a href="docs/ARCHITECTURE.md">Architecture</a> ·
88
+ <a href="ROADMAP.md">Roadmap</a> ·
89
+ <a href="CONTRIBUTING.md">Contributing</a>
90
+ </p>
91
+
92
+ ---
93
+
94
+ ## What it is
95
+
96
+ AgentSynth generates multi-turn agent trajectories — tool-use, grounded code-execution, and multi-agent collaboration traces — and scores each one with an LLM-as-Judge eval loop. The output is training data for fine-tuning agentic LLMs, built without harvesting real conversations.
97
+
98
+ What it's good for:
99
+
100
+ - Bootstrapping an agentic dataset when you have no production traffic, or can't use what you have.
101
+ - Running entirely offline. Mock generation and evaluation are deterministic and need no API keys or network access.
102
+ - Swapping in a real LLM when you want richer generation and a sharper judge. Claude, Grok, Groq, and OpenAI are all supported through [LiteLLM](https://github.com/BerriAI/litellm).
103
+ - Filtering before you train. A weighted 6-dimension rubric scores every trajectory (two further metrics, pass@1 and diversity, are computed at the dataset level), so you can keep the high-signal subset and drop the rest.
104
+ - Exporting straight into a training pipeline: JSONL, ShareGPT, and ADP formats load into Hugging Face / TRL / Unsloth / Axolotl without conversion.
105
+
106
+ Runs are reproducible. Any randomness in the mock paths comes from a stable hash seed, so identical inputs produce identical trajectories.
107
+
108
+ ---
109
+
110
+ ## Live demo
111
+
112
+ Try it in the browser: [AgentSynth on Hugging Face Spaces](https://huggingface.co/spaces/agentsynth/agentsynth).
113
+
114
+ Generate a trajectory, watch the judge score it across the rubric dimensions, then export the dataset — all from the Gradio UI.
115
+
116
+ ---
117
+
118
+ ## Features
119
+
120
+ Core capabilities:
121
+
122
+ - Synthetic trajectory generation in single-agent, multi-agent, and code-execution modes.
123
+ - An LLM-as-Judge eval loop built on a weighted 6-dimension rubric, with a deterministic mock fallback.
124
+ - Dataset metrics: aggregate pass@1, per-dimension averages, and trajectory diversity.
125
+ - Export to JSONL, ShareGPT, or ADP in a single call.
126
+
127
+ The eval loop scores six per-trajectory dimensions — task completion, tool correctness, trajectory faithfulness, reasoning coherence / plan adherence, efficiency, and safety — plus two dataset-level metrics, overall pass@1 and diversity.
128
+
129
+ ---
130
+
131
+ ## Install
132
+
133
+ ```bash
134
+ # Core library (offline mock generation + eval, exporters, metrics)
135
+ pip install agentsynth-ai
136
+
137
+ # With the Gradio web UI
138
+ pip install "agentsynth-ai[app]"
139
+
140
+ # For running the Hugging Face Space (pins everything the app needs)
141
+ pip install -r requirements.txt
142
+ ```
143
+
144
+ The core library targets Python 3.9+. The Gradio app wants 3.10+, so use that interpreter if you plan to run the UI locally or on Spaces.
145
+
146
+ Calling a real LLM also needs LiteLLM — install it with `pip install "agentsynth-ai[llm]"` (it is also included in the `[app]` extra) — plus the relevant provider key. See [Using a real LLM-as-Judge](#using-a-real-llm-as-judge).
147
+
148
+ ---
149
+
150
+ ## Quickstart
151
+
152
+ ```python
153
+ from agentsynth import AgentTrajectoryGenerator, TrajectoryEvaluator, to_jsonl
154
+
155
+ # 1. Create a generator — offline deterministic mock mode by default.
156
+ gen = AgentTrajectoryGenerator()
157
+
158
+ # 2. Generate a multi-step agent trajectory for a query.
159
+ traj = gen.generate("What's the weather in Paris, and is it warmer than Berlin?")
160
+
161
+ print(f"{traj.num_steps()} steps, tools used: {traj.tool_names_used()}")
162
+ print("final answer:", traj.final_answer)
163
+
164
+ # 3. Evaluate it with the built-in LLM-as-Judge (also mock by default).
165
+ result = TrajectoryEvaluator().evaluate(traj)
166
+ print(f"overall = {result.overall:.3f} passed = {result.passed}")
167
+ print(result.scores.as_dict()) # all 6 rubric dimensions in [0, 1]
168
+
169
+ # 4. Export a training-ready dataset.
170
+ to_jsonl([traj], "agent_data.jsonl")
171
+ ```
172
+
173
+ No keys, no network. Set `AGENTSYNTH_FORCE_MOCK=1` to pin offline behavior even when provider keys are present.
174
+
175
+ ---
176
+
177
+ ## Worked examples
178
+
179
+ ### 1) Single-agent tool use with a custom tool catalog
180
+
181
+ Pass your own tools through `parse_tool_catalog`. It accepts any JSON-Schema function-calling shape, including raw OpenAI `tools` blocks:
182
+
183
+ ```python
184
+ from agentsynth import AgentTrajectoryGenerator, TrajectoryEvaluator, parse_tool_catalog
185
+
186
+ # A custom catalog — a list of tool dicts (OpenAI/Anthropic style also accepted).
187
+ my_tools = parse_tool_catalog([
188
+ {
189
+ "name": "stock_price",
190
+ "description": "Look up the latest stock price for a ticker symbol.",
191
+ "parameters": {
192
+ "type": "object",
193
+ "properties": {"ticker": {"type": "string", "description": "e.g. 'AAPL'"}},
194
+ "required": ["ticker"],
195
+ },
196
+ },
197
+ {
198
+ "name": "currency_convert",
199
+ "description": "Convert an amount from one currency to another.",
200
+ "parameters": {
201
+ "type": "object",
202
+ "properties": {
203
+ "amount": {"type": "number"},
204
+ "from_ccy": {"type": "string"},
205
+ "to_ccy": {"type": "string"},
206
+ },
207
+ "required": ["amount", "from_ccy", "to_ccy"],
208
+ },
209
+ },
210
+ ])
211
+
212
+ gen = AgentTrajectoryGenerator(tools=my_tools)
213
+ traj = gen.generate(
214
+ "How much is 100 shares of AAPL worth in euros?",
215
+ mode="single_agent",
216
+ domain="finance",
217
+ )
218
+
219
+ for step in traj.steps:
220
+ print(step.short())
221
+
222
+ result = TrajectoryEvaluator().evaluate(traj)
223
+ print(f"tool_correctness = {result.scores.tool_correctness:.2f}")
224
+ ```
225
+
226
+ ### 2) Code-execution trace (grounded REPL output)
227
+
228
+ In `code_execution` mode, the emitted Python actually runs through the sandboxed `PythonREPL`. That means `code_output` is captured stdout, not something the model made up:
229
+
230
+ ```python
231
+ from agentsynth import AgentTrajectoryGenerator
232
+
233
+ gen = AgentTrajectoryGenerator()
234
+ traj = gen.generate(
235
+ "Compute the mean and standard deviation of [4, 8, 15, 16, 23, 42].",
236
+ mode="code_execution",
237
+ domain="data_analysis",
238
+ )
239
+
240
+ for step in traj.steps:
241
+ if step.step_type == "code_execution":
242
+ print("CODE:\n", step.code)
243
+ print("OUTPUT (grounded, from the REPL):\n", step.code_output)
244
+
245
+ print("ANSWER:", traj.final_answer)
246
+ ```
247
+
248
+ You can drive the same REPL directly to ground your own snippets:
249
+
250
+ ```python
251
+ from agentsynth import PythonREPL
252
+
253
+ repl = PythonREPL()
254
+ print(repl.run("import statistics\nstatistics.pstdev([4, 8, 15, 16, 23, 42])"))
255
+ # -> 12.315302134607444 (real stdout; only whitelisted numeric/data imports allowed)
256
+ ```
257
+
258
+ ### 3) Multi-agent batch + dataset metrics
259
+
260
+ Generate a batch, set `vary_modes=True` to mix single-agent, multi-agent, and code-execution traces, then evaluate and aggregate:
261
+
262
+ ```python
263
+ from agentsynth import (
264
+ AgentTrajectoryGenerator,
265
+ TrajectoryEvaluator,
266
+ compute_dataset_metrics,
267
+ save_dataset,
268
+ )
269
+
270
+ queries = [
271
+ "Plan a 3-day trip to Tokyo on a $1500 budget.",
272
+ "Summarize last quarter's sales from the analytics DB and email the team.",
273
+ "Find the 10th Fibonacci number and explain the recurrence.",
274
+ "What's the weather in Reykjavik and should I pack a coat?",
275
+ ]
276
+
277
+ gen = AgentTrajectoryGenerator()
278
+ trajectories = gen.generate_batch(queries, vary_modes=True) # mixes modes per query
279
+
280
+ evaluator = TrajectoryEvaluator()
281
+ results = [evaluator.evaluate(t) for t in trajectories]
282
+
283
+ # Aggregate quality metrics across the dataset (pass@1, per-dim averages, diversity).
284
+ metrics = compute_dataset_metrics(trajectories, results)
285
+ print(metrics)
286
+
287
+ # Ship it (format inferred from the extension).
288
+ save_dataset(trajectories, "dataset.jsonl")
289
+ ```
290
+
291
+ ### 4) Grounded execution with environments and recipes
292
+
293
+ Attach an environment and the tool calls run for real — `sql_query` hits an
294
+ in-memory SQLite database, `python` runs in an isolated subprocess — so the
295
+ observations are actual output, not templated text:
296
+
297
+ ```python
298
+ from agentsynth import AgentTrajectoryGenerator
299
+ from agentsynth.environments import SQLEnvironment
300
+
301
+ gen = AgentTrajectoryGenerator(environment=SQLEnvironment())
302
+ traj = gen.generate("Which product sold the most units?")
303
+
304
+ for step in traj.steps:
305
+ if step.step_type == "observation":
306
+ print(step.observation) # a real query result, e.g. "Widget | 2931 ... (3 rows)"
307
+ ```
308
+
309
+ A `Recipe` wraps a whole run — generate (optionally concurrent), evaluate,
310
+ compute metrics, export — and loads from YAML:
311
+
312
+ ```python
313
+ from agentsynth import Recipe, run_recipe
314
+
315
+ result = run_recipe(Recipe(
316
+ query="Which region had the highest revenue, and how did products compare?",
317
+ num_trajectories=12,
318
+ vary_modes=True,
319
+ environment="sql+python", # real SQLite + subprocess Python
320
+ export_format="jsonl",
321
+ export_path="dataset.jsonl",
322
+ max_workers=4,
323
+ ))
324
+ print(result.metrics["pass_rate"], "->", result.output_path)
325
+ ```
326
+
327
+ Or run a recipe file: `run_recipe(load_recipe("recipes/analytics_sql.yaml"))`.
328
+
329
+ ### 5) Verify trajectories and build DPO pairs
330
+
331
+ Verification re-runs what it can instead of trusting the model — a `code_execution`
332
+ step only passes if its code reproduces the recorded output:
333
+
334
+ ```python
335
+ from agentsynth import AgentTrajectoryGenerator, verify_trajectory
336
+
337
+ traj = AgentTrajectoryGenerator().generate("compute the mean of 4, 8, 15, 16, 23, 42",
338
+ mode="code_execution")
339
+ result = verify_trajectory(traj) # tool args + execution + safety checks
340
+ print(result.verified, result.detail) # True 'tool_args: ok; execution: ok; safety: ok'
341
+ ```
342
+
343
+ Turn scored trajectories into preference pairs for DPO:
344
+
345
+ ```python
346
+ from agentsynth import (
347
+ AgentTrajectoryGenerator, TrajectoryEvaluator, build_preference_pairs, to_dpo_jsonl,
348
+ )
349
+
350
+ pairs = build_preference_pairs(
351
+ AgentTrajectoryGenerator(), TrajectoryEvaluator(),
352
+ "analyze sales by region and email a summary", k=8,
353
+ )
354
+ to_dpo_jsonl(pairs, "prefs.jsonl") # {"prompt", "chosen", "rejected", "margin"} per line
355
+ ```
356
+
357
+ Recipes can do it all at once — `Recipe(..., verify=True, dedup=True, rubric="strict")`
358
+ adds verification, near-duplicate removal, and a stricter judge to the run.
359
+
360
+ ### 6) Generate against a real MCP server
361
+
362
+ Point AgentSynth at any [Model Context Protocol](https://modelcontextprotocol.io)
363
+ server and its tools become a live environment — calls run against the server, so the
364
+ observations are real:
365
+
366
+ ```python
367
+ import sys
368
+ from agentsynth import AgentTrajectoryGenerator
369
+ from agentsynth.environments import MCPEnvironment
370
+
371
+ # A local stdio server here; pass url=... for an HTTP/SSE server instead.
372
+ env = MCPEnvironment(command=sys.executable, args=["examples/mcp_server.py"])
373
+ gen = AgentTrajectoryGenerator(environment=env)
374
+
375
+ traj = gen.generate("reverse some text and count its words")
376
+ print(traj.tool_names_used()) # tools discovered from the MCP server
377
+ env.close()
378
+ ```
379
+
380
+ Needs `pip install "agentsynth-ai[mcp]"` (Python 3.10+).
381
+
382
+ ---
383
+
384
+ ## Run the app locally
385
+
386
+ ```bash
387
+ pip install "agentsynth-ai[app]"
388
+ python app.py
389
+ ```
390
+
391
+ Run this from a clone of the repository: `app.py` sits at the repo root and is not shipped in the PyPI package. Open the printed local URL (usually `http://127.0.0.1:7860`). The UI generates trajectories, shows them step by step, runs the judge, renders the metrics dashboard, and downloads the dataset in any supported format. No keys required.
392
+
393
+ ---
394
+
395
+ ## Fine-tune and benchmark
396
+
397
+ The point of the data is to make a model better. AgentSynth ships the harness to prove
398
+ it: dataset prep, fine-tune scripts (TRL SFT + DPO, Unsloth-friendly), a built-in
399
+ function-calling benchmark, and a one-command reproduction. The fine-tune needs a GPU;
400
+ everything else runs on CPU.
401
+
402
+ ```bash
403
+ # generate data, score a model, dry-run the trainer — all offline
404
+ python scripts/make_dataset.py --n 500 --vary-modes --verify --dedup --out data
405
+ python scripts/run_benchmark.py --model mock
406
+ python scripts/train_sft.py --data data/train.jsonl --dry-run
407
+ ```
408
+
409
+ `run_benchmark.py --before <base> --after <finetuned>` prints the before/after table.
410
+ Full walkthrough in [docs/BENCHMARK.md](docs/BENCHMARK.md).
411
+
412
+ ---
413
+
414
+ ## Using a real LLM-as-Judge
415
+
416
+ Generation and evaluation both default to deterministic mock mode. Set any of the provider keys below and AgentSynth upgrades to a real model, auto-detected through [LiteLLM](https://github.com/BerriAI/litellm). It picks a fast, cheap default for whichever key it finds first.
417
+
418
+ | Provider | Env var | Default model used |
419
+ |------------|---------------------|-------------------------------------|
420
+ | Anthropic | `ANTHROPIC_API_KEY` | `claude-3-5-haiku-latest` |
421
+ | xAI (Grok) | `XAI_API_KEY` | `xai/grok-2-latest` |
422
+ | Groq | `GROQ_API_KEY` | `groq/llama-3.3-70b-versatile` |
423
+ | OpenAI | `OPENAI_API_KEY` | `gpt-4o-mini` |
424
+
425
+ ```bash
426
+ export ANTHROPIC_API_KEY="sk-ant-..."
427
+ python app.py # generation + judge now use Claude
428
+ ```
429
+
430
+ ```python
431
+ from agentsynth import AgentTrajectoryGenerator, TrajectoryEvaluator
432
+
433
+ # Or pin a model explicitly:
434
+ gen = AgentTrajectoryGenerator(model="claude-3-5-haiku-latest")
435
+ ev = TrajectoryEvaluator(model="gpt-4o-mini")
436
+ ```
437
+
438
+ If LiteLLM isn't installed, no key is set, or a request fails, AgentSynth falls back to mock instead of crashing. Set `AGENTSYNTH_FORCE_MOCK=1` to force offline mode regardless of which keys are present.
439
+
440
+ ---
441
+
442
+ ## Dataset formats
443
+
444
+ AgentSynth exports three trainer-friendly shapes, all compatible with Hugging Face Datasets, TRL, Unsloth, and Axolotl.
445
+
446
+ ### JSONL
447
+
448
+ One JSON object per line, holding the full structured trajectory — steps, tools, scores, metadata. Good for archival and custom loaders.
449
+
450
+ ```json
451
+ {"id": "a1b2c3d4e5f6", "query": "What's the weather in Paris?", "mode": "single_agent", "domain": "general", "tools": [{"name": "get_weather", "description": "Get the current weather for a given city.", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}}], "steps": [{"step_type": "thought", "thought": "I should look up the weather for Paris."}, {"step_type": "tool_call", "tool_name": "get_weather", "tool_args": {"city": "Paris"}}, {"step_type": "observation", "observation": "Paris: 18C, partly cloudy."}, {"step_type": "final_answer", "content": "It's 18C and partly cloudy in Paris."}], "final_answer": "It's 18C and partly cloudy in Paris.", "success": true, "generator_model": "mock"}
452
+ ```
453
+
454
+ ### ShareGPT
455
+
456
+ The familiar `{"conversations": [{"from": "human"/"gpt"/"tool", "value": ...}]}` chat format that Axolotl and Unsloth chat-SFT recipes read natively. Built from each trajectory's `to_messages()` rendering, with user / assistant / tool roles and assistant tool-calls preserved.
457
+
458
+ ### ADP (Agent Data Protocol)
459
+
460
+ A normalized agent-centric schema that keeps thoughts, tool calls, observations, and code-execution as first-class typed steps. Reach for this when you're training a tool-using or multi-agent policy and want the full trajectory structure rather than a flattened chat log.
461
+
462
+ ```python
463
+ from agentsynth import to_jsonl, to_sharegpt, to_adp
464
+
465
+ to_jsonl(trajectories, "data.jsonl") # structured trajectories
466
+ to_sharegpt(trajectories, "data_sg.json") # chat SFT
467
+ to_adp(trajectories, "data_adp.json") # agent-protocol records
468
+ ```
469
+
470
+ ---
471
+
472
+ ## Quality metrics
473
+
474
+ `TrajectoryEvaluator` scores every trajectory. It produces six rubric dimensions per trajectory, each in `[0, 1]`, and combines them into a weighted overall score. `compute_dataset_metrics` adds two more at the dataset level.
475
+
476
+ | Metric | Scope | What it measures |
477
+ |--------|-------|------------------|
478
+ | **Task Completion** | per-traj | Did the trajectory actually solve the user's query? *(weight 0.30)* |
479
+ | **Tool Correctness** | per-traj | Were the right tools called with valid, well-typed arguments? *(weight 0.20)* |
480
+ | **Trajectory Faithfulness** | per-traj | Is the final answer grounded in the observations / tool outputs (no hallucination)? *(weight 0.15)* |
481
+ | **Reasoning Coherence / Plan Adherence** | per-traj | Do the steps follow a logical plan, and does execution match it? *(weight 0.15)* |
482
+ | **Efficiency** | per-traj | Was the goal reached without redundant or wasted steps? *(weight 0.10)* |
483
+ | **Safety** | per-traj | Does the trajectory avoid unsafe tool use or harmful content? *(weight 0.10)* |
484
+ | **Overall pass@1** | dataset | Fraction of trajectories whose weighted overall clears the pass threshold. |
485
+ | **Diversity** | dataset | How varied the dataset is across tool-usage signatures, modes, and domains. |
486
+
487
+ The six per-trajectory weights live in `DEFAULT_RUBRIC_WEIGHTS` and sum to `1.0`. Pass your own to `RubricScores.weighted_overall(weights=...)` to re-balance.
488
+
489
+ ```python
490
+ from agentsynth import TrajectoryEvaluator, diversity_score
491
+
492
+ result = TrajectoryEvaluator().evaluate(traj)
493
+ print(result.flat()) # trajectory_id, overall, passed, judge_model + 6 dims
494
+ print(result.explanation) # human-readable judge rationale
495
+
496
+ print("dataset diversity:", diversity_score(trajectories))
497
+ ```
498
+
499
+ ---
500
+
501
+ ## Project structure
502
+
503
+ ```text
504
+ AgentSynth/
505
+ ├── agentsynth/
506
+ │ ├── schemas.py # Pydantic models (Trajectory, ToolSpec, EvalResult, …)
507
+ │ ├── utils.py # tool-catalog parsing, PythonREPL, LLMClient (LiteLLM)
508
+ │ ├── generator.py # AgentTrajectoryGenerator (mock + LLM-backed)
509
+ │ ├── evaluator.py # TrajectoryEvaluator — LLM-as-Judge eval loop
510
+ │ ├── metrics.py # dataset metrics + Plotly dashboards
511
+ │ ├── exporters.py # JSONL / ShareGPT / ADP / Parquet
512
+ │ ├── preferences.py # DPO preference pairs
513
+ │ ├── dedup.py # near-duplicate removal + decontamination
514
+ │ ├── hub.py # push datasets to the Hugging Face Hub
515
+ │ ├── cli.py # the `agentsynth` CLI
516
+ │ ├── environments/ # SQL, Python, MCP, composite — run tool calls for real
517
+ │ ├── tasks/ # seed-task taxonomy
518
+ │ ├── pipelines/ # Recipe + run_recipe (generate → verify → export)
519
+ │ ├── verification/ # verifiers, judge ensemble, rubric presets
520
+ │ ├── benchmarks/ # function-calling benchmark + before/after reporting
521
+ │ └── training/ # SFT / DPO dataset builders
522
+ ├── app.py # Gradio web UI (Hugging Face Space entrypoint)
523
+ ├── scripts/ # make_dataset / train_sft / train_dpo / run_benchmark
524
+ ├── examples/ # sample datasets + a demo MCP server
525
+ ├── docs/ # VISION, ARCHITECTURE, BENCHMARK, MANIFESTO
526
+ ├── tests/ # pytest suite
527
+ ├── pyproject.toml # packaging / metadata
528
+ └── README.md
529
+ ```
530
+
531
+ ---
532
+
533
+ ## Deploy to Hugging Face Spaces
534
+
535
+ 1. Create a Space and pick the Gradio SDK.
536
+ 2. Push this repo to the Space. The entrypoint is `app.py`.
537
+ 3. `requirements.txt` is auto-detected and installed, so there's no extra build config.
538
+ 4. Optional: to enable a real LLM judge, add a provider key (for example `ANTHROPIC_API_KEY`) under Settings → Repository secrets. Without one, the Space stays in deterministic mock mode.
539
+ 5. CPU Basic hardware is enough. Generation and the mock judge need no GPU.
540
+
541
+ ---
542
+
543
+ ## Roadmap
544
+
545
+ - [ ] More agent personas & domain-specific tool packs.
546
+ - [x] Configurable rubric presets (strict / lenient / safety-focused).
547
+ - [ ] Difficulty-aware curriculum sampling for batches.
548
+ - [x] Direct `datasets.Dataset` / `push_to_hub` export helper.
549
+ - [x] Pairwise / preference (DPO-style) trajectory generation.
550
+ - [ ] Streaming generation progress in the Gradio UI.
551
+
552
+ ---
553
+
554
+ ## Contributing
555
+
556
+ Contributions are welcome.
557
+
558
+ 1. Fork the repo and create a feature branch.
559
+ 2. Keep changes Python 3.9-compatible and add or extend tests under `tests/`.
560
+ 3. Run the suite: `pytest`.
561
+ 4. Open a PR with a clear description.
562
+
563
+ Bug reports, new tool catalogs, and additional export formats all make good first contributions.
564
+
565
+ ---
566
+
567
+ ## License
568
+
569
+ MIT. See [`LICENSE`](LICENSE) for details.
570
+
571
+ ---
572
+
573
+ ## Citation
574
+
575
+ If you use AgentSynth in your research or product, please cite it:
576
+
577
+ ```bibtex
578
+ @software{agentsynth2026,
579
+ title = {AgentSynth: Synthetic Agentic Trajectories Generator with an LLM-as-Judge Evaluation Loop},
580
+ author = {agentsynth and Contributors},
581
+ year = {2026},
582
+ url = {https://github.com/agentsynth/agentsynth},
583
+ note = {Open-source library for generating and evaluating synthetic agent trajectories}
584
+ }
585
+ ```
586
+
587
+ ---
588
+
589
+ <sub>Suggested GitHub topics: `synthetic-data` · `agentic-ai` · `llm-finetuning` · `trajectories` · `tool-use` · `llm-as-judge`</sub>