synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. synth_ai/__init__.py +1 -1
  2. synth_ai/cli/balance.py +3 -15
  3. synth_ai/config/base_url.py +47 -0
  4. synth_ai/http.py +102 -0
  5. synth_ai/inference/__init__.py +7 -0
  6. synth_ai/inference/client.py +20 -0
  7. synth_ai/jobs/client.py +246 -0
  8. synth_ai/learning/__init__.py +24 -0
  9. synth_ai/learning/client.py +149 -0
  10. synth_ai/learning/config.py +43 -0
  11. synth_ai/learning/constants.py +29 -0
  12. synth_ai/learning/ft_client.py +59 -0
  13. synth_ai/learning/health.py +43 -0
  14. synth_ai/learning/jobs.py +205 -0
  15. synth_ai/learning/rl_client.py +256 -0
  16. synth_ai/learning/sse.py +58 -0
  17. synth_ai/learning/validators.py +48 -0
  18. synth_ai/lm/core/main_v3.py +13 -0
  19. synth_ai/lm/core/synth_models.py +48 -0
  20. synth_ai/lm/core/vendor_clients.py +9 -6
  21. synth_ai/lm/vendors/core/openai_api.py +31 -3
  22. synth_ai/lm/vendors/openai_standard.py +45 -14
  23. synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
  24. synth_ai/lm/vendors/synth_client.py +372 -28
  25. synth_ai/rl/__init__.py +30 -0
  26. synth_ai/rl/contracts.py +32 -0
  27. synth_ai/rl/env_keys.py +137 -0
  28. synth_ai/rl/secrets.py +19 -0
  29. synth_ai/scripts/verify_rewards.py +100 -0
  30. synth_ai/task/__init__.py +10 -0
  31. synth_ai/task/contracts.py +120 -0
  32. synth_ai/task/health.py +28 -0
  33. synth_ai/task/validators.py +12 -0
  34. synth_ai/tracing_v3/hooks.py +3 -1
  35. synth_ai/tracing_v3/session_tracer.py +123 -2
  36. synth_ai/tracing_v3/turso/manager.py +218 -0
  37. synth_ai/tracing_v3/turso/models.py +53 -0
  38. synth_ai-0.2.4.dev8.dist-info/METADATA +635 -0
  39. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/RECORD +43 -25
  40. synth_ai/tui/__init__.py +0 -1
  41. synth_ai/tui/__main__.py +0 -13
  42. synth_ai/tui/cli/__init__.py +0 -1
  43. synth_ai/tui/cli/query_experiments.py +0 -164
  44. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  45. synth_ai/tui/dashboard.py +0 -340
  46. synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
  47. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/WHEEL +0 -0
  48. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/entry_points.txt +0 -0
  49. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/licenses/LICENSE +0 -0
  50. {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,635 @@
1
+ Metadata-Version: 2.4
2
+ Name: synth-ai
3
+ Version: 0.2.4.dev8
4
+ Summary: Software for aiding the best and multiplying the will - Core AI functionality and tracing
5
+ Author-email: Synth AI <josh@usesynth.ai>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/synth-laboratories/synth-ai
8
+ Project-URL: Repository, https://github.com/synth-laboratories/synth-ai
9
+ Project-URL: Issues, https://github.com/synth-laboratories/synth-ai/issues
10
+ Requires-Python: >=3.11
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: pydantic>=2.0.0
14
+ Requires-Dist: python-dotenv>=1.0.1
15
+ Requires-Dist: requests>=2.32.3
16
+ Requires-Dist: urllib3>=2.3.0
17
+ Requires-Dist: tqdm>=4.66.4
18
+ Requires-Dist: jsonschema>=4.23.0
19
+ Requires-Dist: backoff>=2.0.0
20
+ Requires-Dist: typing_extensions>=4.0.0
21
+ Requires-Dist: openai>=1.99.0
22
+ Requires-Dist: anthropic>=0.42.0
23
+ Requires-Dist: langfuse<3.0.0,>=2.53.9
24
+ Requires-Dist: opentelemetry-api<1.27.0,>=1.26.0
25
+ Requires-Dist: opentelemetry-sdk<1.27.0,>=1.26.0
26
+ Requires-Dist: diskcache>=5.6.3
27
+ Requires-Dist: groq>=0.30.0
28
+ Requires-Dist: google-genai>=1.26.0
29
+ Requires-Dist: together>=1.5.21
30
+ Requires-Dist: mistralai>=1.9.2
31
+ Requires-Dist: fastapi>=0.115.12
32
+ Requires-Dist: uvicorn>=0.34.2
33
+ Requires-Dist: numpy>=2.2.3
34
+ Requires-Dist: networkx>=3.4.2
35
+ Requires-Dist: redis>=6.2.0
36
+ Requires-Dist: duckdb>=1.0.0
37
+ Requires-Dist: pandas>=2.2.3
38
+ Requires-Dist: ty>=0.0.1a5
39
+ Requires-Dist: toml>=0.10.2
40
+ Requires-Dist: sqlalchemy>=2.0.42
41
+ Requires-Dist: aiosqlite>=0.21.0
42
+ Requires-Dist: greenlet>=3.2.3
43
+ Requires-Dist: libsql>=0.1.8
44
+ Requires-Dist: pynacl>=1.5.0
45
+ Requires-Dist: google-api-core>=2.25.1
46
+ Requires-Dist: google-generativeai>=0.8.5
47
+ Requires-Dist: crafter>=1.8.3
48
+ Requires-Dist: click>=8.1.0
49
+ Requires-Dist: textual>=1.1.0
50
+ Requires-Dist: openai-harmony>=0.0.1
51
+ Requires-Dist: asyncpg>=0.30.0
52
+ Requires-Dist: aiohttp>=3.8.0
53
+ Requires-Dist: datasets>=4.0.0
54
+ Requires-Dist: transformers>=4.56.1
55
+ Provides-Extra: dev
56
+ Requires-Dist: build>=1.2.2.post1; extra == "dev"
57
+ Requires-Dist: twine>=4.0.0; extra == "dev"
58
+ Requires-Dist: keyring>=24.0.0; extra == "dev"
59
+ Requires-Dist: pytest>=8.3.3; extra == "dev"
60
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == "dev"
61
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
62
+ Requires-Dist: pyright>=1.1.350; extra == "dev"
63
+ Requires-Dist: coverage[toml]>=7.3.0; extra == "dev"
64
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
65
+ Provides-Extra: research
66
+ Requires-Dist: crafter>=1.8.3; extra == "research"
67
+ Requires-Dist: datasets>=4.0.0; extra == "research"
68
+ Provides-Extra: all
69
+ Requires-Dist: crafter>=1.8.3; extra == "all"
70
+ Requires-Dist: datasets>=4.0.0; extra == "all"
71
+ Dynamic: license-file
72
+
73
+ # Synth AI
74
+
75
+ Modern Compound AI System Development
76
+
77
+ **Comprehensive AI Framework for Language Models, Environments, and Observability**
78
+
79
+ [![Python](https://img.shields.io/badge/python-3.11+-blue)](https://www.python.org/)
80
+ [![License](https://img.shields.io/badge/license-MIT-green)](LICENSE)
81
+ [![PyPI](https://img.shields.io/badge/PyPI-0.2.4.dev8-orange)](https://pypi.org/project/synth-ai/)
82
+ ![Coverage](https://img.shields.io/badge/coverage-0.0%25-red)
83
+ ![Tests](https://img.shields.io/badge/tests-17%2F17%20passing-brightgreen)
84
+
85
+ A unified framework combining language model capabilities, synthetic environments, and comprehensive tracing for building and evaluating AI agents.
86
+
87
+ ## 🚀 Quick Start
88
+
89
+ ### Installation
90
+
91
+ ```bash
92
+ # Basic installation
93
+ pip install synth-ai
94
+
95
+ # With research environments (includes game environments)
96
+ pip install synth-ai[research]
97
+
98
+ # Full installation with all optional extras
99
+ pip install synth-ai[all]
100
+ ```
101
+
102
+ ### Spinning Up
103
+
104
+ Start the Synth AI service daemon (includes sqld database + environment service):
105
+
106
+ ```bash
107
+ # Start both the database daemon (port 8080) and the environment service (port 8901)
108
+ uvx synth-ai serve
109
+ ```
110
+
111
+ #### Service Command Options
112
+
113
+ ```bash
114
+ uvx synth-ai serve [OPTIONS]
115
+ ```
116
+
117
+ **Available Options:**
118
+ - `--db-file` - Database file path (default: "synth_ai.db")
119
+ - `--sqld-port` - Port for sqld HTTP interface (default: 8080)
120
+ - `--env-port` - Port for environment service (default: 8901)
121
+ - `--no-sqld` - Skip starting sqld database daemon
122
+ - `--no-env` - Skip starting environment service
123
+
124
+ **Examples:**
125
+ ```bash
126
+ # Start with custom ports
127
+ uvx synth-ai serve --sqld-port 8081 --env-port 8902
128
+
129
+ # Start only the environment service
130
+ uvx synth-ai serve --no-sqld
131
+
132
+ # Start only the database service
133
+ uvx synth-ai serve --no-env
134
+ ```
135
+
136
+ #### What the Serve Command Provides
137
+
138
+ **sqld Database Service (port 8080)**
139
+ - Local SQLite-compatible database server with HTTP API
140
+ - Automatically downloads and installs sqld binary if needed
141
+ - Provides persistent storage for agent interactions and traces
142
+
143
+ **Environment Service (port 8901)**
144
+ - FastAPI service for managing AI environments and tasks
145
+ - Built-in environments: Crafter, Sokoban, MiniGrid, TicTacToe, Verilog, NetHack, Enron
146
+ - RESTful API for environment initialization, stepping, and termination
147
+ - Dynamic environment registry for custom environments
148
+
149
+ In another terminal, run your first example:
150
+
151
+ ```bash
152
+ # Run a Crafter agent demo with Gemini
153
+ ./examples/run_crafter_demo.sh
154
+ ```
155
+
156
+ This will:
157
+ - Start the sqld database daemon with HTTP API on port 8080
158
+ - Launch the environment service API on port 8901
159
+ - Run a reactive agent in the Crafter environment using Gemini 1.5 Flash
160
+
161
+ #### Demos (Eval + Finetuning)
162
+
163
+ You can run interactive demos from the repo without remembering exact commands:
164
+
165
+ ```bash
166
+ # Lists all available demos under examples/, then prompts you to choose
167
+ uvx synth-ai demo
168
+ ```
169
+
170
+ Today this includes:
171
+ - Eval demo: `examples/evals/run_demo.sh`
172
+ - Prompts for models, episodes, etc.
173
+ - Runs Crafter rollouts with v3 tracing, then analyzes and filters traces
174
+ - Writes a JSONL like `ft_data/evals_filtered.jsonl` for downstream use
175
+ - Finetuning demo: `examples/finetuning/synth_qwen/run_demo.sh`
176
+ - Guides you through: rollouts → filter v3 traces → prepare SFT JSONL
177
+ - Pair with `uvpm examples.finetuning.synth_qwen.sft_kickoff` to start an SFT job when ready
178
+
179
+ Notes:
180
+ - Ensure the service is running (`uvx synth-ai serve`) so v3 traces are recorded locally.
181
+ - Set API configuration for finetuning:
182
+ - `export LEARNING_V2_BASE_URL="http://localhost:8000/api"` (or your proxy)
183
+ - `export SYNTH_API_KEY="sk_live_..."`
184
+ - v3 trace data is stored under `traces/v3/synth_ai.db/` by default. Inspect with `uvx synth-ai traces`.
185
+ - LM tracing: all model calls (prompts, outputs, tool calls, token usage, latency, cost) are automatically captured via v3 tracing and stored locally; inspect with `uvx synth-ai traces`.
186
+
187
+ ### One-Command Demos
188
+
189
+ Quickly browse and launch interactive demos under `examples/`:
190
+
191
+ ```bash
192
+ uvx synth-ai demo
193
+ ```
194
+
195
+ This lists all `run_demo.sh` scripts found in the repo (e.g., eval comparisons, finetuning flows) and lets you pick one to run.
196
+
197
+
198
+ ## Changelog (migrated from CHANGELOG.md)
199
+
200
+ # Changelog
201
+
202
+ All notable changes to this project are documented in this file.
203
+
204
+ ## [0.2.4.dev6] - 2025-08-18
205
+ - Added: Wordle environment for simple RL testing, including engine, environment, taskset, and curated instances with helper generator script.
206
+ - Added: Wordle README and unit/integration tests (integrity checks and gameplay coverage).
207
+ - Added: Service routes and example wiring to expose Wordle tasks through the environment service.
208
+ - Changed: Bumped development version to 0.2.4.dev6 and published to PyPI.
209
+
210
+ [0.2.4.dev6]: https://github.com/synth-laboratories/synth-ai/releases/tag/v0.2.4.dev6
211
+
212
+
213
+ ## Crafter Notes (migrated from crafter.md)
214
+
215
+ uvpm src.synth_env.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash-latest
216
+ uv run uvicorn src.synth_env.service.app:app --host 0.0.0.0 --port 8901
217
+ CRAFTER
218
+
219
+ gemini-1.5-flash-8b
220
+ collect_sapling: 20 times (K=0.1, contribution=0.304)
221
+
222
+ gpt-4.1-nano
223
+ collect_drink: 8 times (K=0.1, contribution=0.220)
224
+ collect_sapling: 2 times (K=0.1, contribution=0.110)
225
+ collect_wood: 12 times (K=1.0, contribution=2.565)
226
+
227
+ gpt-4o-mini
228
+ collect_drink: 1 times (K=0.1, contribution=0.069)
229
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
230
+ collect_wood: 7 times (K=1.0, contribution=2.079)
231
+ eat_cow: 2 times (K=1.0, contribution=1.099)
232
+
233
+ gemini-1.5-flash
234
+ collect_drink: 5 times (K=0.1, contribution=0.179)
235
+ collect_sapling: 10 times (K=0.1, contribution=0.240)
236
+ collect_wood: 12 times (K=1.0, contribution=2.565)
237
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
238
+ eat_cow: 1 times (K=1.0, contribution=0.693)
239
+ make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
240
+ place_table: 1 times (K=3.0, contribution=2.079)
241
+
242
+ gpt-4.1-mini
243
+ collect_coal: 1 times (K=3.0, contribution=2.079)
244
+ collect_drink: 7 times (K=0.1, contribution=0.208)
245
+ collect_sapling: 16 times (K=0.1, contribution=0.283)
246
+ collect_stone: 1 times (K=1.0, contribution=0.693)
247
+ collect_wood: 17 times (K=1.0, contribution=2.890)
248
+ eat_cow: 3 times (K=1.0, contribution=1.386)
249
+ make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
250
+ place_table: 1 times (K=3.0, contribution=2.079)
251
+
252
+ gemini-2.5-flash
253
+ collect_coal: 5 times (K=3.0, contribution=5.375)
254
+ collect_drink: 7 times (K=0.1, contribution=0.208)
255
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
256
+ collect_stone: 9 times (K=1.0, contribution=2.303)
257
+ collect_wood: 18 times (K=1.0, contribution=2.944)
258
+ eat_cow: 1 times (K=1.0, contribution=0.693)
259
+ make_stone_pickaxe: 2 times (K=10.0, contribution=10.986)
260
+ make_wood_pickaxe: 13 times (K=3.0, contribution=7.917)
261
+ place_furnace: 2 times (K=10.0, contribution=10.986)
262
+ place_plant: 1 times (K=0.1, contribution=0.069)
263
+ place_table: 17 times (K=3.0, contribution=8.671)
264
+ wake_up: 2 times (K=0.1, contribution=0.110)
265
+
266
+ gemini-2.5-pro
267
+ collect_coal: 3 times (K=3.0, contribution=4.159)
268
+ collect_drink: 4 times (K=0.1, contribution=0.161)
269
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
270
+ collect_stone: 6 times (K=1.0, contribution=1.946)
271
+ collect_wood: 18 times (K=1.0, contribution=2.944)
272
+ make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
273
+ make_wood_pickaxe: 10 times (K=3.0, contribution=7.194)
274
+ place_furnace: 3 times (K=10.0, contribution=13.863)
275
+ place_table: 18 times (K=3.0, contribution=8.833)
276
+ wake_up: 3 times (K=0.1, contribution=0.139)
277
+
278
+ gpt-4.1
279
+ collect_coal: 1 times (K=3.0, contribution=2.079)
280
+ collect_drink: 3 times (K=0.1, contribution=0.139)
281
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
282
+ collect_stone: 7 times (K=1.0, contribution=2.079)
283
+ collect_wood: 19 times (K=1.0, contribution=2.996)
284
+ defeat_skeleton: 1 times (K=1.0, contribution=0.693)
285
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
286
+ eat_cow: 3 times (K=1.0, contribution=1.386)
287
+ make_stone_pickaxe: 4 times (K=10.0, contribution=16.094)
288
+ make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
289
+ place_table: 17 times (K=3.0, contribution=8.671)
290
+ wake_up: 3 times (K=0.1, contribution=0.139)
291
+
292
+ claude-sonnet-4
293
+ collect_coal: 1 times (K=3.0, contribution=2.079)
294
+ collect_drink: 2 times (K=0.1, contribution=0.110)
295
+ collect_sapling: 11 times (K=0.1, contribution=0.248)
296
+ collect_stone: 4 times (K=1.0, contribution=1.609)
297
+ collect_wood: 15 times (K=1.0, contribution=2.773)
298
+ eat_cow: 4 times (K=1.0, contribution=1.609)
299
+ make_wood_pickaxe: 8 times (K=3.0, contribution=6.592)
300
+ place_plant: 1 times (K=0.1, contribution=0.069)
301
+ place_table: 13 times (K=3.0, contribution=7.917)
302
+ wake_up: 1 times (K=0.1, contribution=0.069)
303
+
304
+ gemini-2.5-flash-lite
305
+ collect_drink: 8 times (K=0.1, contribution=0.220)
306
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
307
+ collect_stone: 2 times (K=1.0, contribution=1.099)
308
+ collect_wood: 17 times (K=1.0, contribution=2.890)
309
+ eat_cow: 3 times (K=1.0, contribution=1.386)
310
+ make_wood_pickaxe: 7 times (K=3.0, contribution=6.238)
311
+ place_plant: 1 times (K=0.1, contribution=0.069)
312
+ place_table: 11 times (K=3.0, contribution=7.455)
313
+ wake_up: 6 times (K=0.1, contribution=0.195)
314
+
315
+ o4-mini
316
+ collect_coal: 7 times (K=3.0, contribution=6.238)
317
+ collect_drink: 5 times (K=0.1, contribution=0.179)
318
+ collect_iron: 1 times (K=10.0, contribution=6.931)
319
+ collect_sapling: 9 times (K=0.1, contribution=0.230)
320
+ collect_stone: 15 times (K=1.0, contribution=2.773)
321
+ collect_wood: 19 times (K=1.0, contribution=2.996)
322
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
323
+ make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
324
+ make_stone_sword: 1 times (K=10.0, contribution=6.931)
325
+ make_wood_pickaxe: 19 times (K=3.0, contribution=8.987)
326
+ place_furnace: 5 times (K=10.0, contribution=17.918)
327
+ place_plant: 3 times (K=0.1, contribution=0.139)
328
+ place_table: 19 times (K=3.0, contribution=8.987)
329
+ wake_up: 3 times (K=0.1, contribution=0.139)
330
+
331
+ o3-mini
332
+ collect_coal: 3 times (K=3.0, contribution=4.159)
333
+ collect_drink: 7 times (K=0.1, contribution=0.208)
334
+ collect_sapling: 10 times (K=0.1, contribution=0.240)
335
+ collect_stone: 5 times (K=1.0, contribution=1.792)
336
+ collect_wood: 17 times (K=1.0, contribution=2.890)
337
+ eat_cow: 8 times (K=1.0, contribution=2.197)
338
+ make_stone_pickaxe: 1 times (K=10.0, contribution=6.931)
339
+ make_wood_pickaxe: 9 times (K=3.0, contribution=6.908)
340
+ place_table: 13 times (K=3.0, contribution=7.917)
341
+ wake_up: 11 times (K=0.1, contribution=0.248)
342
+
343
+ qwen/qwen3-32b
344
+ collect_coal: 3 times (K=3.0, contribution=4.159)
345
+ collect_drink: 6 times (K=0.1, contribution=0.195)
346
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
347
+ collect_stone: 8 times (K=1.0, contribution=2.197)
348
+ collect_wood: 20 times (K=1.0, contribution=3.045)
349
+ eat_cow: 5 times (K=1.0, contribution=1.792)
350
+ make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
351
+ make_wood_pickaxe: 15 times (K=3.0, contribution=8.318)
352
+ place_furnace: 3 times (K=10.0, contribution=13.863)
353
+ place_plant: 2 times (K=0.1, contribution=0.110)
354
+ place_table: 18 times (K=3.0, contribution=8.833)
355
+ wake_up: 13 times (K=0.1, contribution=0.264)
356
+
357
+ o3
358
+ collect_coal: 6 times (K=3.0, contribution=5.838)
359
+ collect_drink: 1 times (K=0.1, contribution=0.069)
360
+ collect_iron: 2 times (K=10.0, contribution=10.986)
361
+ collect_sapling: 11 times (K=0.1, contribution=0.248)
362
+ collect_stone: 9 times (K=1.0, contribution=2.303)
363
+ collect_wood: 19 times (K=1.0, contribution=2.996)
364
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
365
+ eat_cow: 1 times (K=1.0, contribution=0.693)
366
+ make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
367
+ make_stone_sword: 3 times (K=10.0, contribution=13.863)
368
+ make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
369
+ make_wood_sword: 6 times (K=3.0, contribution=5.838)
370
+ place_furnace: 4 times (K=10.0, contribution=16.094)
371
+ place_plant: 5 times (K=0.1, contribution=0.179)
372
+ place_table: 15 times (K=3.0, contribution=8.318)
373
+ wake_up: 12 times (K=0.1, contribution=0.256)
374
+
375
+ uv run python src/synth_env/examples/crafter_classic/agent_demos/test_crafter_react_agent.py --config src/evals/configs/crafter.toml
376
+
377
+ episodes = 20 # Number of episodes to run
378
+ max_steps = 50 # Maximum steps per episode
379
+ seed = 42 # Random seed for reproducibility
380
+ difficulty = "easy" # Difficulty mode
381
+
382
+
383
+ groq models
384
+ - meta-llama/llama-4-scout-17b-16e-instruct
385
+ - meta-llama/llama-4-maverick-17b-128e-instruct
386
+ - qwen/qwen3-32b
387
+
388
+
389
+ CRAFTER
390
+ 50 steps
391
+ | Model | Episodes | Mean Score | Avg Achievements | Unique Achievements | Shaped Reward | Mean K-Score |
392
+ |------------------|----------|------------|------------------|---------------------|---------------|--------------|
393
+ | qwen-2.5-0.5b | 10 | 1.00 | 1.00 | 1 | 0.240 | 0.024 |
394
+ | g-1.5-flash-8b | 20 | 1.00 | 1.00 | 1 | 0.304 | 0.015 |
395
+ | L4-scout-17b | 20 | 0.20 | 0.20 | 4 | 1.525 | 0.076 |
396
+ | gpt-4.1-nano | 20 | 1.10 | 1.10 | 3 | 2.895 | 0.145 |
397
+ | gpt-4o-mini | 20 | 1.25 | 1.25 | 4 | 3.525 | 0.176 |
398
+ | L3.1-8b-groq | 20 | 1.45 | 1.45 | 4 | 3.552 | 0.178 |
399
+ | L4-maverick-17b | 20 | 2.20 | 2.20 | 6 | 7.087 | 0.354 |
400
+ | L3.3-70b-groq | 20 | 2.15 | 2.15 | 6 | 7.188 | 0.359 |
401
+ | gemini-1.5-flash | 20 | 1.55 | 1.55 | 7 | 8.529 | 0.426 |
402
+ | deepseek-chat | 20 | 1.85 | 1.85 | 7 | 9.458 | 0.473 |
403
+ | gpt-4.1-mini | 20 | 2.35 | 2.35 | 8 | 11.699 | 0.585 |
404
+ | gpt-5-nano | 20 | 2.85 | ???? | 13 | ?????? | ?????? |
405
+ | groq/kimi-k2 | 20 | 3.05 | 3.05 | 8 | 17.952 | 0.898 |
406
+ | g-2.5-flash-lite | 20 | 3.50 | 3.50 | 9 | 19.829 | 0.991 |
407
+ | claude-sonnet-4 | 20 | 3.00 | 3.00 | 10 | 23.077 | 1.154 |
408
+ | gpt-5-mini | 20 | 3.85 | ???? | 15 | ?????? | ????? |
409
+ | o3-mini | 20 | 4.20 | 4.20 | 10 | 33.491 | 1.675 |
410
+ | gpt-4.1 | 20 | 4.40 | 4.40 | 12 | 43.371 | 2.169 |
411
+ | gemini-2.5-flash | 19 | 4.68 | 4.68 | 12 | 50.520 | 2.659 |
412
+ | gemini-2.5-pro | 20 | 4.00 | 4.00 | 10 | 53.358 | 2.668 |
413
+ | qwen/qwen3-32b | 20 | 5.40 | 5.40 | 15 | 56.894 | 2.845 |
414
+ | o4-mini | 20 | 5.70 | 5.70 | 14 | 83.936 | 4.197 |
415
+ | o3 | 20 | 5.80 | 5.80 | 16 | 97.293 | 4.865 |
416
+
417
+ *o3 had trajectories terminated early
418
+
419
+ 300 steps
420
+ | gemini-1.5-flash | 20 | 1.50 | 1.50 | 6 | 7.440 | 0.372 |
421
+ | g-2.5-flash-lite | 20 | 4.90 | 4.90 | 10 | 24.713 | 1.236 |
422
+ | kimi-k2-instruct | 20 | 4.45 | 4.45 | 12 | 45.834 | 2.292 |
423
+ | qwen/qwen3-32b | 20 | 6.25 | 6.25 | 14 | 55.396 | 2.770 |
424
+
425
+ 50 steps, 100 traj
426
+ | qwen/qwen3-32b | 93 | 4.74 | 4.74 | 14 | 94.806 | 1.019 |
+
+ uvpm src.synth_env.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash-latest
427
+ uv run uvicorn src.synth_env.service.app:app --host 0.0.0.0 --port 8901
428
+ CRAFTER
429
+
430
+ gemini-1.5-flash-8b
431
+ collect_sapling: 20 times (K=0.1, contribution=0.304)
432
+
433
+ gpt-4.1-nano
434
+ collect_drink: 8 times (K=0.1, contribution=0.220)
435
+ collect_sapling: 2 times (K=0.1, contribution=0.110)
436
+ collect_wood: 12 times (K=1.0, contribution=2.565)
437
+
438
+ gpt-4o-mini
439
+ collect_drink: 1 times (K=0.1, contribution=0.069)
440
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
441
+ collect_wood: 7 times (K=1.0, contribution=2.079)
442
+ eat_cow: 2 times (K=1.0, contribution=1.099)
443
+
444
+ gemini-1.5-flash
445
+ collect_drink: 5 times (K=0.1, contribution=0.179)
446
+ collect_sapling: 10 times (K=0.1, contribution=0.240)
447
+ collect_wood: 12 times (K=1.0, contribution=2.565)
448
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
449
+ eat_cow: 1 times (K=1.0, contribution=0.693)
450
+ make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
451
+ place_table: 1 times (K=3.0, contribution=2.079)
452
+
453
+ gpt-4.1-mini
454
+ collect_coal: 1 times (K=3.0, contribution=2.079)
455
+ collect_drink: 7 times (K=0.1, contribution=0.208)
456
+ collect_sapling: 16 times (K=0.1, contribution=0.283)
457
+ collect_stone: 1 times (K=1.0, contribution=0.693)
458
+ collect_wood: 17 times (K=1.0, contribution=2.890)
459
+ eat_cow: 3 times (K=1.0, contribution=1.386)
460
+ make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
461
+ place_table: 1 times (K=3.0, contribution=2.079)
462
+
463
+ gemini-2.5-flash
464
+ collect_coal: 5 times (K=3.0, contribution=5.375)
465
+ collect_drink: 7 times (K=0.1, contribution=0.208)
466
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
467
+ collect_stone: 9 times (K=1.0, contribution=2.303)
468
+ collect_wood: 18 times (K=1.0, contribution=2.944)
469
+ eat_cow: 1 times (K=1.0, contribution=0.693)
470
+ make_stone_pickaxe: 2 times (K=10.0, contribution=10.986)
471
+ make_wood_pickaxe: 13 times (K=3.0, contribution=7.917)
472
+ place_furnace: 2 times (K=10.0, contribution=10.986)
473
+ place_plant: 1 times (K=0.1, contribution=0.069)
474
+ place_table: 17 times (K=3.0, contribution=8.671)
475
+ wake_up: 2 times (K=0.1, contribution=0.110)
476
+
477
+ gemini-2.5-pro
478
+ collect_coal: 3 times (K=3.0, contribution=4.159)
479
+ collect_drink: 4 times (K=0.1, contribution=0.161)
480
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
481
+ collect_stone: 6 times (K=1.0, contribution=1.946)
482
+ collect_wood: 18 times (K=1.0, contribution=2.944)
483
+ make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
484
+ make_wood_pickaxe: 10 times (K=3.0, contribution=7.194)
485
+ place_furnace: 3 times (K=10.0, contribution=13.863)
486
+ place_table: 18 times (K=3.0, contribution=8.833)
487
+ wake_up: 3 times (K=0.1, contribution=0.139)
488
+
489
+ gpt-4.1
490
+ collect_coal: 1 times (K=3.0, contribution=2.079)
491
+ collect_drink: 3 times (K=0.1, contribution=0.139)
492
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
493
+ collect_stone: 7 times (K=1.0, contribution=2.079)
494
+ collect_wood: 19 times (K=1.0, contribution=2.996)
495
+ defeat_skeleton: 1 times (K=1.0, contribution=0.693)
496
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
497
+ eat_cow: 3 times (K=1.0, contribution=1.386)
498
+ make_stone_pickaxe: 4 times (K=10.0, contribution=16.094)
499
+ make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
500
+ place_table: 17 times (K=3.0, contribution=8.671)
501
+ wake_up: 3 times (K=0.1, contribution=0.139)
502
+
503
+ claude-sonnet-4
504
+ collect_coal: 1 times (K=3.0, contribution=2.079)
505
+ collect_drink: 2 times (K=0.1, contribution=0.110)
506
+ collect_sapling: 11 times (K=0.1, contribution=0.248)
507
+ collect_stone: 4 times (K=1.0, contribution=1.609)
508
+ collect_wood: 15 times (K=1.0, contribution=2.773)
509
+ eat_cow: 4 times (K=1.0, contribution=1.609)
510
+ make_wood_pickaxe: 8 times (K=3.0, contribution=6.592)
511
+ place_plant: 1 times (K=0.1, contribution=0.069)
512
+ place_table: 13 times (K=3.0, contribution=7.917)
513
+ wake_up: 1 times (K=0.1, contribution=0.069)
514
+
515
+ gemini-2.5-flash-lite
516
+ collect_drink: 8 times (K=0.1, contribution=0.220)
517
+ collect_sapling: 15 times (K=0.1, contribution=0.277)
518
+ collect_stone: 2 times (K=1.0, contribution=1.099)
519
+ collect_wood: 17 times (K=1.0, contribution=2.890)
520
+ eat_cow: 3 times (K=1.0, contribution=1.386)
521
+ make_wood_pickaxe: 7 times (K=3.0, contribution=6.238)
522
+ place_plant: 1 times (K=0.1, contribution=0.069)
523
+ place_table: 11 times (K=3.0, contribution=7.455)
524
+ wake_up: 6 times (K=0.1, contribution=0.195)
525
+
526
+ o4-mini
527
+ collect_coal: 7 times (K=3.0, contribution=6.238)
528
+ collect_drink: 5 times (K=0.1, contribution=0.179)
529
+ collect_iron: 1 times (K=10.0, contribution=6.931)
530
+ collect_sapling: 9 times (K=0.1, contribution=0.230)
531
+ collect_stone: 15 times (K=1.0, contribution=2.773)
532
+ collect_wood: 19 times (K=1.0, contribution=2.996)
533
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
534
+ make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
535
+ make_stone_sword: 1 times (K=10.0, contribution=6.931)
536
+ make_wood_pickaxe: 19 times (K=3.0, contribution=8.987)
537
+ place_furnace: 5 times (K=10.0, contribution=17.918)
538
+ place_plant: 3 times (K=0.1, contribution=0.139)
539
+ place_table: 19 times (K=3.0, contribution=8.987)
540
+ wake_up: 3 times (K=0.1, contribution=0.139)
541
+
542
+ o3-mini
543
+ collect_coal: 3 times (K=3.0, contribution=4.159)
544
+ collect_drink: 7 times (K=0.1, contribution=0.208)
545
+ collect_sapling: 10 times (K=0.1, contribution=0.240)
546
+ collect_stone: 5 times (K=1.0, contribution=1.792)
547
+ collect_wood: 17 times (K=1.0, contribution=2.890)
548
+ eat_cow: 8 times (K=1.0, contribution=2.197)
549
+ make_stone_pickaxe: 1 times (K=10.0, contribution=6.931)
550
+ make_wood_pickaxe: 9 times (K=3.0, contribution=6.908)
551
+ place_table: 13 times (K=3.0, contribution=7.917)
552
+ wake_up: 11 times (K=0.1, contribution=0.248)
553
+
554
+ qwen/qwen3-32b
555
+ collect_coal: 3 times (K=3.0, contribution=4.159)
556
+ collect_drink: 6 times (K=0.1, contribution=0.195)
557
+ collect_sapling: 12 times (K=0.1, contribution=0.256)
558
+ collect_stone: 8 times (K=1.0, contribution=2.197)
559
+ collect_wood: 20 times (K=1.0, contribution=3.045)
560
+ eat_cow: 5 times (K=1.0, contribution=1.792)
561
+ make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
562
+ make_wood_pickaxe: 15 times (K=3.0, contribution=8.318)
563
+ place_furnace: 3 times (K=10.0, contribution=13.863)
564
+ place_plant: 2 times (K=0.1, contribution=0.110)
565
+ place_table: 18 times (K=3.0, contribution=8.833)
566
+ wake_up: 13 times (K=0.1, contribution=0.264)
567
+
568
+ o3
569
+ collect_coal: 6 times (K=3.0, contribution=5.838)
570
+ collect_drink: 1 times (K=0.1, contribution=0.069)
571
+ collect_iron: 2 times (K=10.0, contribution=10.986)
572
+ collect_sapling: 11 times (K=0.1, contribution=0.248)
573
+ collect_stone: 9 times (K=1.0, contribution=2.303)
574
+ collect_wood: 19 times (K=1.0, contribution=2.996)
575
+ defeat_zombie: 1 times (K=1.0, contribution=0.693)
576
+ eat_cow: 1 times (K=1.0, contribution=0.693)
577
+ make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
578
+ make_stone_sword: 3 times (K=10.0, contribution=13.863)
579
+ make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
580
+ make_wood_sword: 6 times (K=3.0, contribution=5.838)
581
+ place_furnace: 4 times (K=10.0, contribution=16.094)
582
+ place_plant: 5 times (K=0.1, contribution=0.179)
583
+ place_table: 15 times (K=3.0, contribution=8.318)
584
+ wake_up: 12 times (K=0.1, contribution=0.256)
585
+
586
+ uv run python src/synth_env/examples/crafter_classic/agent_demos/test_crafter_react_agent.py --config src/evals/configs/crafter.toml
587
+
588
+ episodes = 20 # Number of episodes to run
589
+ max_steps = 50 # Maximum steps per episode
590
+ seed = 42 # Random seed for reproducibility
591
+ difficulty = "easy" # Difficulty mode
592
+
593
+
594
+ groq models
595
+ - meta-llama/llama-4-scout-17b-16e-instruct
596
+ - meta-llama/llama-4-maverick-17b-128e-instruct
597
+ - qwen/qwen3-32b
598
+
599
+
600
+ CRAFTER
601
+ 50 steps
602
+ | Model | Episodes | Mean Score | Avg Achievements | Unique Achievements | Shaped Reward | Mean K-Score |
603
+ |------------------|----------|------------|------------------|---------------------|---------------|--------------|
604
+ | qwen-2.5-0.5b | 10 | 1.00 | 1.00 | 1 | 0.240 | 0.024 |
605
+ | g-1.5-flash-8b | 20 | 1.00 | 1.00 | 1 | 0.304 | 0.015 |
606
+ | L4-scout-17b | 20 | 0.20 | 0.20 | 4 | 1.525 | 0.076 |
607
+ | gpt-4.1-nano | 20 | 1.10 | 1.10 | 3 | 2.895 | 0.145 |
608
+ | gpt-4o-mini | 20 | 1.25 | 1.25 | 4 | 3.525 | 0.176 |
609
+ | L3.1-8b-groq | 20 | 1.45 | 1.45 | 4 | 3.552 | 0.178 |
610
+ | L4-maverick-17b | 20 | 2.20 | 2.20 | 6 | 7.087 | 0.354 |
611
+ | L3.3-70b-groq | 20 | 2.15 | 2.15 | 6 | 7.188 | 0.359 |
612
+ | gemini-1.5-flash | 20 | 1.55 | 1.55 | 7 | 8.529 | 0.426 |
613
+ | deepseek-chat | 20 | 1.85 | 1.85 | 7 | 9.458 | 0.473 |
614
+ | gpt-4.1-mini | 20 | 2.35 | 2.35 | 8 | 11.699 | 0.585 |
615
+ | groq/kimi-k2 | 20 | 3.05 | 3.05 | 8 | 17.952 | 0.898 |
616
+ | g-2.5-flash-lite | 20 | 3.50 | 3.50 | 9 | 19.829 | 0.991 |
617
+ | claude-sonnet-4 | 20 | 3.00 | 3.00 | 10 | 23.077 | 1.154 |
618
+ | o3-mini | 20 | 4.20 | 4.20 | 10 | 33.491 | 1.675 |
619
+ | gpt-4.1 | 20 | 4.40 | 4.40 | 12 | 43.371 | 2.169 |
620
+ | gemini-2.5-flash | 19 | 4.68 | 4.68 | 12 | 50.520 | 2.659 |
621
+ | gemini-2.5-pro | 20 | 4.00 | 4.00 | 10 | 53.358 | 2.668 |
622
+ | qwen/qwen3-32b | 20 | 5.40 | 5.40 | 12 | 56.894 | 2.845 |
623
+ | o4-mini | 20 | 5.70 | 5.70 | 14 | 83.936 | 4.197 |
624
+ | o3 | 20 | 5.80 | 5.80 | 16 | 97.293 | 4.865 |
625
+
626
+ *o3 had trajectories terminated early
627
+
628
+ 300 steps
629
+ | gemini-1.5-flash | 20 | 1.50 | 1.50 | 6 | 7.440 | 0.372 |
630
+ | g-2.5-flash-lite | 20 | 4.90 | 4.90 | 10 | 24.713 | 1.236 |
631
+ | kimi-k2-instruct | 20 | 4.45 | 4.45 | 12 | 45.834 | 2.292 |
632
+ | qwen/qwen3-32b | 20 | 6.25 | 6.25 | 14 | 55.396 | 2.770 |
633
+
634
+ 50 steps, 100 traj
635
+ | qwen/qwen3-32b | 93 | 4.74 | 4.74 | 14 | 94.806 | 1.019 |