synth-ai 0.2.4.dev7__py3-none-any.whl → 0.2.4.dev8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- synth_ai/__init__.py +1 -1
- synth_ai/cli/balance.py +3 -15
- synth_ai/config/base_url.py +47 -0
- synth_ai/http.py +102 -0
- synth_ai/inference/__init__.py +7 -0
- synth_ai/inference/client.py +20 -0
- synth_ai/jobs/client.py +246 -0
- synth_ai/learning/__init__.py +24 -0
- synth_ai/learning/client.py +149 -0
- synth_ai/learning/config.py +43 -0
- synth_ai/learning/constants.py +29 -0
- synth_ai/learning/ft_client.py +59 -0
- synth_ai/learning/health.py +43 -0
- synth_ai/learning/jobs.py +205 -0
- synth_ai/learning/rl_client.py +256 -0
- synth_ai/learning/sse.py +58 -0
- synth_ai/learning/validators.py +48 -0
- synth_ai/lm/core/main_v3.py +13 -0
- synth_ai/lm/core/synth_models.py +48 -0
- synth_ai/lm/core/vendor_clients.py +9 -6
- synth_ai/lm/vendors/core/openai_api.py +31 -3
- synth_ai/lm/vendors/openai_standard.py +45 -14
- synth_ai/lm/vendors/supported/custom_endpoint.py +12 -2
- synth_ai/lm/vendors/synth_client.py +372 -28
- synth_ai/rl/__init__.py +30 -0
- synth_ai/rl/contracts.py +32 -0
- synth_ai/rl/env_keys.py +137 -0
- synth_ai/rl/secrets.py +19 -0
- synth_ai/scripts/verify_rewards.py +100 -0
- synth_ai/task/__init__.py +10 -0
- synth_ai/task/contracts.py +120 -0
- synth_ai/task/health.py +28 -0
- synth_ai/task/validators.py +12 -0
- synth_ai/tracing_v3/hooks.py +3 -1
- synth_ai/tracing_v3/session_tracer.py +123 -2
- synth_ai/tracing_v3/turso/manager.py +218 -0
- synth_ai/tracing_v3/turso/models.py +53 -0
- synth_ai-0.2.4.dev8.dist-info/METADATA +635 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/RECORD +43 -25
- synth_ai/tui/__init__.py +0 -1
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -340
- synth_ai-0.2.4.dev7.dist-info/METADATA +0 -193
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.4.dev7.dist-info → synth_ai-0.2.4.dev8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,635 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: synth-ai
|
3
|
+
Version: 0.2.4.dev8
|
4
|
+
Summary: Software for aiding the best and multiplying the will - Core AI functionality and tracing
|
5
|
+
Author-email: Synth AI <josh@usesynth.ai>
|
6
|
+
License-Expression: MIT
|
7
|
+
Project-URL: Homepage, https://github.com/synth-laboratories/synth-ai
|
8
|
+
Project-URL: Repository, https://github.com/synth-laboratories/synth-ai
|
9
|
+
Project-URL: Issues, https://github.com/synth-laboratories/synth-ai/issues
|
10
|
+
Requires-Python: >=3.11
|
11
|
+
Description-Content-Type: text/markdown
|
12
|
+
License-File: LICENSE
|
13
|
+
Requires-Dist: pydantic>=2.0.0
|
14
|
+
Requires-Dist: python-dotenv>=1.0.1
|
15
|
+
Requires-Dist: requests>=2.32.3
|
16
|
+
Requires-Dist: urllib3>=2.3.0
|
17
|
+
Requires-Dist: tqdm>=4.66.4
|
18
|
+
Requires-Dist: jsonschema>=4.23.0
|
19
|
+
Requires-Dist: backoff>=2.0.0
|
20
|
+
Requires-Dist: typing_extensions>=4.0.0
|
21
|
+
Requires-Dist: openai>=1.99.0
|
22
|
+
Requires-Dist: anthropic>=0.42.0
|
23
|
+
Requires-Dist: langfuse<3.0.0,>=2.53.9
|
24
|
+
Requires-Dist: opentelemetry-api<1.27.0,>=1.26.0
|
25
|
+
Requires-Dist: opentelemetry-sdk<1.27.0,>=1.26.0
|
26
|
+
Requires-Dist: diskcache>=5.6.3
|
27
|
+
Requires-Dist: groq>=0.30.0
|
28
|
+
Requires-Dist: google-genai>=1.26.0
|
29
|
+
Requires-Dist: together>=1.5.21
|
30
|
+
Requires-Dist: mistralai>=1.9.2
|
31
|
+
Requires-Dist: fastapi>=0.115.12
|
32
|
+
Requires-Dist: uvicorn>=0.34.2
|
33
|
+
Requires-Dist: numpy>=2.2.3
|
34
|
+
Requires-Dist: networkx>=3.4.2
|
35
|
+
Requires-Dist: redis>=6.2.0
|
36
|
+
Requires-Dist: duckdb>=1.0.0
|
37
|
+
Requires-Dist: pandas>=2.2.3
|
38
|
+
Requires-Dist: ty>=0.0.1a5
|
39
|
+
Requires-Dist: toml>=0.10.2
|
40
|
+
Requires-Dist: sqlalchemy>=2.0.42
|
41
|
+
Requires-Dist: aiosqlite>=0.21.0
|
42
|
+
Requires-Dist: greenlet>=3.2.3
|
43
|
+
Requires-Dist: libsql>=0.1.8
|
44
|
+
Requires-Dist: pynacl>=1.5.0
|
45
|
+
Requires-Dist: google-api-core>=2.25.1
|
46
|
+
Requires-Dist: google-generativeai>=0.8.5
|
47
|
+
Requires-Dist: crafter>=1.8.3
|
48
|
+
Requires-Dist: click>=8.1.0
|
49
|
+
Requires-Dist: textual>=1.1.0
|
50
|
+
Requires-Dist: openai-harmony>=0.0.1
|
51
|
+
Requires-Dist: asyncpg>=0.30.0
|
52
|
+
Requires-Dist: aiohttp>=3.8.0
|
53
|
+
Requires-Dist: datasets>=4.0.0
|
54
|
+
Requires-Dist: transformers>=4.56.1
|
55
|
+
Provides-Extra: dev
|
56
|
+
Requires-Dist: build>=1.2.2.post1; extra == "dev"
|
57
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
58
|
+
Requires-Dist: keyring>=24.0.0; extra == "dev"
|
59
|
+
Requires-Dist: pytest>=8.3.3; extra == "dev"
|
60
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == "dev"
|
61
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
62
|
+
Requires-Dist: pyright>=1.1.350; extra == "dev"
|
63
|
+
Requires-Dist: coverage[toml]>=7.3.0; extra == "dev"
|
64
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
65
|
+
Provides-Extra: research
|
66
|
+
Requires-Dist: crafter>=1.8.3; extra == "research"
|
67
|
+
Requires-Dist: datasets>=4.0.0; extra == "research"
|
68
|
+
Provides-Extra: all
|
69
|
+
Requires-Dist: crafter>=1.8.3; extra == "all"
|
70
|
+
Requires-Dist: datasets>=4.0.0; extra == "all"
|
71
|
+
Dynamic: license-file
|
72
|
+
|
73
|
+
# Synth AI
|
74
|
+
|
75
|
+
Modern Compound AI System Development
|
76
|
+
|
77
|
+
**Comprehensive AI Framework for Language Models, Environments, and Observability**
|
78
|
+
|
79
|
+
[Python](https://www.python.org/)
|
80
|
+
[License: MIT](LICENSE)
|
81
|
+
[PyPI](https://pypi.org/project/synth-ai/)
|
82
|
+

|
83
|
+

|
84
|
+
|
85
|
+
A unified framework combining language model capabilities, synthetic environments, and comprehensive tracing for building and evaluating AI agents.
|
86
|
+
|
87
|
+
## 🚀 Quick Start
|
88
|
+
|
89
|
+
### Installation
|
90
|
+
|
91
|
+
```bash
|
92
|
+
# Basic installation
|
93
|
+
pip install synth-ai
|
94
|
+
|
95
|
+
# With research environments (includes game environments)
|
96
|
+
pip install synth-ai[research]
|
97
|
+
|
98
|
+
# Full installation with all providers
|
99
|
+
pip install synth-ai[all]
|
100
|
+
```
|
101
|
+
|
102
|
+
### Spinning Up
|
103
|
+
|
104
|
+
Start the Synth AI service daemon (includes sqld database + environment service):
|
105
|
+
|
106
|
+
```bash
|
107
|
+
# Start both database daemon (port 8080) and environment service (port 8901)
|
108
|
+
uvx synth-ai serve
|
109
|
+
```
|
110
|
+
|
111
|
+
#### Service Command Options
|
112
|
+
|
113
|
+
```bash
|
114
|
+
uvx synth-ai serve [OPTIONS]
|
115
|
+
```
|
116
|
+
|
117
|
+
**Available Options:**
|
118
|
+
- `--db-file` - Database file path (default: "synth_ai.db")
|
119
|
+
- `--sqld-port` - Port for sqld HTTP interface (default: 8080)
|
120
|
+
- `--env-port` - Port for environment service (default: 8901)
|
121
|
+
- `--no-sqld` - Skip starting sqld database daemon
|
122
|
+
- `--no-env` - Skip starting environment service
|
123
|
+
|
124
|
+
**Examples:**
|
125
|
+
```bash
|
126
|
+
# Start with custom ports
|
127
|
+
uvx synth-ai serve --sqld-port 8081 --env-port 8902
|
128
|
+
|
129
|
+
# Start only the environment service
|
130
|
+
uvx synth-ai serve --no-sqld
|
131
|
+
|
132
|
+
# Start only the database service
|
133
|
+
uvx synth-ai serve --no-env
|
134
|
+
```
|
135
|
+
|
136
|
+
#### What the Serve Command Provides
|
137
|
+
|
138
|
+
**sqld Database Service (port 8080)**
|
139
|
+
- Local SQLite-compatible database server with HTTP API
|
140
|
+
- Automatically downloads and installs sqld binary if needed
|
141
|
+
- Provides persistent storage for agent interactions and traces
|
142
|
+
|
143
|
+
**Environment Service (port 8901)**
|
144
|
+
- FastAPI service for managing AI environments and tasks
|
145
|
+
- Built-in environments: Crafter, Sokoban, MiniGrid, TicTacToe, Verilog, NetHack, Enron
|
146
|
+
- RESTful API for environment initialization, stepping, and termination
|
147
|
+
- Dynamic environment registry for custom environments
|
148
|
+
|
149
|
+
In another terminal, run your first example:
|
150
|
+
|
151
|
+
```bash
|
152
|
+
# Run a Crafter agent demo with Gemini
|
153
|
+
./examples/run_crafter_demo.sh
|
154
|
+
```
|
155
|
+
|
156
|
+
This will:
|
157
|
+
- Start the sqld database daemon with HTTP API on port 8080
|
158
|
+
- Launch the environment service API on port 8901
|
159
|
+
- Run a reactive agent in the Crafter environment using Gemini 1.5 Flash
|
160
|
+
|
161
|
+
#### Demos (Eval + Finetuning)
|
162
|
+
|
163
|
+
You can run interactive demos from the repo without remembering exact commands:
|
164
|
+
|
165
|
+
```bash
|
166
|
+
# Lists all available demos under examples/, then prompts you to choose
|
167
|
+
uvx synth-ai demo
|
168
|
+
```
|
169
|
+
|
170
|
+
Today this includes:
|
171
|
+
- Eval demo: `examples/evals/run_demo.sh`
|
172
|
+
- Prompts for models, episodes, etc.
|
173
|
+
- Runs Crafter rollouts with v3 tracing, then analyzes and filters traces
|
174
|
+
- Writes a JSONL like `ft_data/evals_filtered.jsonl` for downstream use
|
175
|
+
- Finetuning demo: `examples/finetuning/synth_qwen/run_demo.sh`
|
176
|
+
- Guides you through: rollouts → filter v3 traces → prepare SFT JSONL
|
177
|
+
- Pair with `uvpm examples.finetuning.synth_qwen.sft_kickoff` to start an SFT job when ready
|
178
|
+
|
179
|
+
Notes:
|
180
|
+
- Ensure the service is running (`uvx synth-ai serve`) so v3 traces are recorded locally.
|
181
|
+
- Set API configuration for finetuning:
|
182
|
+
- `export LEARNING_V2_BASE_URL="http://localhost:8000/api"` (or your proxy)
|
183
|
+
- `export SYNTH_API_KEY="sk_live_..."`
|
184
|
+
- v3 trace data is stored under `traces/v3/synth_ai.db/` by default. Inspect with `uvx synth-ai traces`.
|
185
|
+
- LM tracing: all model calls (prompts, outputs, tool calls, token usage, latency, cost) are automatically captured via v3 tracing and stored locally; inspect with `uvx synth-ai traces`.
|
186
|
+
|
187
|
+
### One-Command Demos
|
188
|
+
|
189
|
+
Quickly browse and launch interactive demos under `examples/`:
|
190
|
+
|
191
|
+
```bash
|
192
|
+
uvx synth-ai demo
|
193
|
+
```
|
194
|
+
|
195
|
+
This lists all `run_demo.sh` scripts found in the repo (e.g., eval comparisons, finetuning flows) and lets you pick one to run.
|
196
|
+
|
197
|
+
|
198
|
+
## Changelog (migrated from CHANGELOG.md)
|
199
|
+
|
200
|
+
# Changelog
|
201
|
+
|
202
|
+
All notable changes to this project are documented in this file.
|
203
|
+
|
204
|
+
## [0.2.4.dev6] - 2025-08-18
|
205
|
+
- Added: Wordle environment for simple RL testing, including engine, environment, taskset, and curated instances with helper generator script.
|
206
|
+
- Added: Wordle README and unit/integration tests (integrity checks and gameplay coverage).
|
207
|
+
- Added: Service routes and example wiring to expose Wordle tasks through the environment service.
|
208
|
+
- Changed: Bumped development version to 0.2.4.dev6 and published to PyPI.
|
209
|
+
|
210
|
+
[0.2.4.dev6]: https://github.com/synth-laboratories/synth-ai/releases/tag/v0.2.4.dev6
|
211
|
+
|
212
|
+
|
213
|
+
## Crafter Notes (migrated from crafter.md)
|
214
|
+
|
215
|
+
uvpm src.synth_env.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash-latest
|
216
|
+
uv run uvicorn src.synth_env.service.app:app --host 0.0.0.0 --port 8901
|
217
|
+
CRAFTER
|
218
|
+
|
219
|
+
gemini-1.5-flash-8b
|
220
|
+
collect_sapling: 20 times (K=0.1, contribution=0.304)
|
221
|
+
|
222
|
+
gpt-4.1-nano
|
223
|
+
collect_drink: 8 times (K=0.1, contribution=0.220)
|
224
|
+
collect_sapling: 2 times (K=0.1, contribution=0.110)
|
225
|
+
collect_wood: 12 times (K=1.0, contribution=2.565)
|
226
|
+
|
227
|
+
gpt-4o-mini
|
228
|
+
collect_drink: 1 times (K=0.1, contribution=0.069)
|
229
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
230
|
+
collect_wood: 7 times (K=1.0, contribution=2.079)
|
231
|
+
eat_cow: 2 times (K=1.0, contribution=1.099)
|
232
|
+
|
233
|
+
gemini-1.5-flash
|
234
|
+
collect_drink: 5 times (K=0.1, contribution=0.179)
|
235
|
+
collect_sapling: 10 times (K=0.1, contribution=0.240)
|
236
|
+
collect_wood: 12 times (K=1.0, contribution=2.565)
|
237
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
238
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
239
|
+
make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
|
240
|
+
place_table: 1 times (K=3.0, contribution=2.079)
|
241
|
+
|
242
|
+
gpt-4.1-mini
|
243
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
244
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
245
|
+
collect_sapling: 16 times (K=0.1, contribution=0.283)
|
246
|
+
collect_stone: 1 times (K=1.0, contribution=0.693)
|
247
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
248
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
249
|
+
make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
|
250
|
+
place_table: 1 times (K=3.0, contribution=2.079)
|
251
|
+
|
252
|
+
gemini-2.5-flash
|
253
|
+
collect_coal: 5 times (K=3.0, contribution=5.375)
|
254
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
255
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
256
|
+
collect_stone: 9 times (K=1.0, contribution=2.303)
|
257
|
+
collect_wood: 18 times (K=1.0, contribution=2.944)
|
258
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
259
|
+
make_stone_pickaxe: 2 times (K=10.0, contribution=10.986)
|
260
|
+
make_wood_pickaxe: 13 times (K=3.0, contribution=7.917)
|
261
|
+
place_furnace: 2 times (K=10.0, contribution=10.986)
|
262
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
263
|
+
place_table: 17 times (K=3.0, contribution=8.671)
|
264
|
+
wake_up: 2 times (K=0.1, contribution=0.110)
|
265
|
+
|
266
|
+
gemini-2.5-pro
|
267
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
268
|
+
collect_drink: 4 times (K=0.1, contribution=0.161)
|
269
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
270
|
+
collect_stone: 6 times (K=1.0, contribution=1.946)
|
271
|
+
collect_wood: 18 times (K=1.0, contribution=2.944)
|
272
|
+
make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
|
273
|
+
make_wood_pickaxe: 10 times (K=3.0, contribution=7.194)
|
274
|
+
place_furnace: 3 times (K=10.0, contribution=13.863)
|
275
|
+
place_table: 18 times (K=3.0, contribution=8.833)
|
276
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
277
|
+
|
278
|
+
gpt-4.1
|
279
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
280
|
+
collect_drink: 3 times (K=0.1, contribution=0.139)
|
281
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
282
|
+
collect_stone: 7 times (K=1.0, contribution=2.079)
|
283
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
284
|
+
defeat_skeleton: 1 times (K=1.0, contribution=0.693)
|
285
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
286
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
287
|
+
make_stone_pickaxe: 4 times (K=10.0, contribution=16.094)
|
288
|
+
make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
|
289
|
+
place_table: 17 times (K=3.0, contribution=8.671)
|
290
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
291
|
+
|
292
|
+
claude-sonnet-4
|
293
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
294
|
+
collect_drink: 2 times (K=0.1, contribution=0.110)
|
295
|
+
collect_sapling: 11 times (K=0.1, contribution=0.248)
|
296
|
+
collect_stone: 4 times (K=1.0, contribution=1.609)
|
297
|
+
collect_wood: 15 times (K=1.0, contribution=2.773)
|
298
|
+
eat_cow: 4 times (K=1.0, contribution=1.609)
|
299
|
+
make_wood_pickaxe: 8 times (K=3.0, contribution=6.592)
|
300
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
301
|
+
place_table: 13 times (K=3.0, contribution=7.917)
|
302
|
+
wake_up: 1 times (K=0.1, contribution=0.069)
|
303
|
+
|
304
|
+
gemini-2.5-flash-lite
|
305
|
+
collect_drink: 8 times (K=0.1, contribution=0.220)
|
306
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
307
|
+
collect_stone: 2 times (K=1.0, contribution=1.099)
|
308
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
309
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
310
|
+
make_wood_pickaxe: 7 times (K=3.0, contribution=6.238)
|
311
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
312
|
+
place_table: 11 times (K=3.0, contribution=7.455)
|
313
|
+
wake_up: 6 times (K=0.1, contribution=0.195)
|
314
|
+
|
315
|
+
o4-mini
|
316
|
+
collect_coal: 7 times (K=3.0, contribution=6.238)
|
317
|
+
collect_drink: 5 times (K=0.1, contribution=0.179)
|
318
|
+
collect_iron: 1 times (K=10.0, contribution=6.931)
|
319
|
+
collect_sapling: 9 times (K=0.1, contribution=0.230)
|
320
|
+
collect_stone: 15 times (K=1.0, contribution=2.773)
|
321
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
322
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
323
|
+
make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
|
324
|
+
make_stone_sword: 1 times (K=10.0, contribution=6.931)
|
325
|
+
make_wood_pickaxe: 19 times (K=3.0, contribution=8.987)
|
326
|
+
place_furnace: 5 times (K=10.0, contribution=17.918)
|
327
|
+
place_plant: 3 times (K=0.1, contribution=0.139)
|
328
|
+
place_table: 19 times (K=3.0, contribution=8.987)
|
329
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
330
|
+
|
331
|
+
o3-mini
|
332
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
333
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
334
|
+
collect_sapling: 10 times (K=0.1, contribution=0.240)
|
335
|
+
collect_stone: 5 times (K=1.0, contribution=1.792)
|
336
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
337
|
+
eat_cow: 8 times (K=1.0, contribution=2.197)
|
338
|
+
make_stone_pickaxe: 1 times (K=10.0, contribution=6.931)
|
339
|
+
make_wood_pickaxe: 9 times (K=3.0, contribution=6.908)
|
340
|
+
place_table: 13 times (K=3.0, contribution=7.917)
|
341
|
+
wake_up: 11 times (K=0.1, contribution=0.248)
|
342
|
+
|
343
|
+
qwen/qwen3-32b
|
344
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
345
|
+
collect_drink: 6 times (K=0.1, contribution=0.195)
|
346
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
347
|
+
collect_stone: 8 times (K=1.0, contribution=2.197)
|
348
|
+
collect_wood: 20 times (K=1.0, contribution=3.045)
|
349
|
+
eat_cow: 5 times (K=1.0, contribution=1.792)
|
350
|
+
make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
|
351
|
+
make_wood_pickaxe: 15 times (K=3.0, contribution=8.318)
|
352
|
+
place_furnace: 3 times (K=10.0, contribution=13.863)
|
353
|
+
place_plant: 2 times (K=0.1, contribution=0.110)
|
354
|
+
place_table: 18 times (K=3.0, contribution=8.833)
|
355
|
+
wake_up: 13 times (K=0.1, contribution=0.264)
|
356
|
+
|
357
|
+
o3
|
358
|
+
collect_coal: 6 times (K=3.0, contribution=5.838)
|
359
|
+
collect_drink: 1 times (K=0.1, contribution=0.069)
|
360
|
+
collect_iron: 2 times (K=10.0, contribution=10.986)
|
361
|
+
collect_sapling: 11 times (K=0.1, contribution=0.248)
|
362
|
+
collect_stone: 9 times (K=1.0, contribution=2.303)
|
363
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
364
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
365
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
366
|
+
make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
|
367
|
+
make_stone_sword: 3 times (K=10.0, contribution=13.863)
|
368
|
+
make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
|
369
|
+
make_wood_sword: 6 times (K=3.0, contribution=5.838)
|
370
|
+
place_furnace: 4 times (K=10.0, contribution=16.094)
|
371
|
+
place_plant: 5 times (K=0.1, contribution=0.179)
|
372
|
+
place_table: 15 times (K=3.0, contribution=8.318)
|
373
|
+
wake_up: 12 times (K=0.1, contribution=0.256)
|
374
|
+
|
375
|
+
uv run python src/synth_env/examples/crafter_classic/agent_demos/test_crafter_react_agent.py --config src/evals/configs/crafter.toml
|
376
|
+
|
377
|
+
episodes = 20 # Number of episodes to run
|
378
|
+
max_steps = 50 # Maximum steps per episode
|
379
|
+
seed = 42 # Random seed for reproducibility
|
380
|
+
difficulty = "easy" # Difficulty mode
|
381
|
+
|
382
|
+
|
383
|
+
groq models
|
384
|
+
- meta-llama/llama-4-scout-17b-16e-instruct
|
385
|
+
- meta-llama/llama-4-maverick-17b-128e-instruct
|
386
|
+
- qwen/qwen3-32b
|
387
|
+
|
388
|
+
|
389
|
+
CRAFTER
|
390
|
+
50 steps
|
391
|
+
| Model | Episodes | Mean Score | Avg Achievements | Unique Achievements | Shaped Reward | Mean K-Score |
|
392
|
+
|------------------|----------|------------|------------------|---------------------|---------------|--------------|
|
393
|
+
| qwen-2.5-0.5b | 10 | 1.00 | 1.00 | 1 | 0.240 | 0.024 |
|
394
|
+
| g-1.5-flash-8b | 20 | 1.00 | 1.00 | 1 | 0.304 | 0.015 |
|
395
|
+
| L4-scout-17b | 20 | 0.20 | 0.20 | 4 | 1.525 | 0.076 |
|
396
|
+
| gpt-4.1-nano | 20 | 1.10 | 1.10 | 3 | 2.895 | 0.145 |
|
397
|
+
| gpt-4o-mini | 20 | 1.25 | 1.25 | 4 | 3.525 | 0.176 |
|
398
|
+
| L3.1-8b-groq | 20 | 1.45 | 1.45 | 4 | 3.552 | 0.178 |
|
399
|
+
| L4-maverick-17b | 20 | 2.20 | 2.20 | 6 | 7.087 | 0.354 |
|
400
|
+
| L3.3-70b-groq | 20 | 2.15 | 2.15 | 6 | 7.188 | 0.359 |
|
401
|
+
| gemini-1.5-flash | 20 | 1.55 | 1.55 | 7 | 8.529 | 0.426 |
|
402
|
+
| deepseek-chat | 20 | 1.85 | 1.85 | 7 | 9.458 | 0.473 |
|
403
|
+
| gpt-4.1-mini | 20 | 2.35 | 2.35 | 8 | 11.699 | 0.585 |
|
404
|
+
| gpt-5-nano       | 20       | 2.85       | ????             | 13                  | ??????        | ??????       |
|
405
|
+
| groq/kimi-k2 | 20 | 3.05 | 3.05 | 8 | 17.952 | 0.898 |
|
406
|
+
| g-2.5-flash-lite | 20 | 3.50 | 3.50 | 9 | 19.829 | 0.991 |
|
407
|
+
| claude-sonnet-4 | 20 | 3.00 | 3.00 | 10 | 23.077 | 1.154 |
|
408
|
+
| gpt-5-mini | 20 | 3.85 | ???? | 15 | ?????? | ????? |
|
409
|
+
| o3-mini | 20 | 4.20 | 4.20 | 10 | 33.491 | 1.675 |
|
410
|
+
| gpt-4.1 | 20 | 4.40 | 4.40 | 12 | 43.371 | 2.169 |
|
411
|
+
| gemini-2.5-flash | 19 | 4.68 | 4.68 | 12 | 50.520 | 2.659 |
|
412
|
+
| gemini-2.5-pro | 20 | 4.00 | 4.00 | 10 | 53.358 | 2.668 |
|
413
|
+
| qwen/qwen3-32b   | 20       | 5.40       | 5.40             | 12                  | 56.894        | 2.845        |
|
414
|
+
| o4-mini | 20 | 5.70 | 5.70 | 14 | 83.936 | 4.197 |
|
415
|
+
| o3 | 20 | 5.80 | 5.80 | 16 | 97.293 | 4.865 |
|
416
|
+
|
417
|
+
*o3 had trajectories terminated early
|
418
|
+
|
419
|
+
300 steps
|
420
|
+
| gemini-1.5-flash | 20 | 1.50 | 1.50 | 6 | 7.440 | 0.372 |
|
421
|
+
| g-2.5-flash-lite | 20 | 4.90 | 4.90 | 10 | 24.713 | 1.236 |
|
422
|
+
| kimi-k2-instruct | 20 | 4.45 | 4.45 | 12 | 45.834 | 2.292 |
|
423
|
+
| qwen/qwen3-32b | 20 | 6.25 | 6.25 | 14 | 55.396 | 2.770 |
|
424
|
+
|
425
|
+
50 steps, 100 traj
|
426
|
+
| qwen/qwen3-32b   | 93       | 4.74       | 4.74             | 14                  | 94.806        | 1.019        |

uvpm src.synth_env.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash-latest
|
427
|
+
uv run uvicorn src.synth_env.service.app:app --host 0.0.0.0 --port 8901
|
428
|
+
CRAFTER
|
429
|
+
|
430
|
+
gemini-1.5-flash-8b
|
431
|
+
collect_sapling: 20 times (K=0.1, contribution=0.304)
|
432
|
+
|
433
|
+
gpt-4.1-nano
|
434
|
+
collect_drink: 8 times (K=0.1, contribution=0.220)
|
435
|
+
collect_sapling: 2 times (K=0.1, contribution=0.110)
|
436
|
+
collect_wood: 12 times (K=1.0, contribution=2.565)
|
437
|
+
|
438
|
+
gpt-4o-mini
|
439
|
+
collect_drink: 1 times (K=0.1, contribution=0.069)
|
440
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
441
|
+
collect_wood: 7 times (K=1.0, contribution=2.079)
|
442
|
+
eat_cow: 2 times (K=1.0, contribution=1.099)
|
443
|
+
|
444
|
+
gemini-1.5-flash
|
445
|
+
collect_drink: 5 times (K=0.1, contribution=0.179)
|
446
|
+
collect_sapling: 10 times (K=0.1, contribution=0.240)
|
447
|
+
collect_wood: 12 times (K=1.0, contribution=2.565)
|
448
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
449
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
450
|
+
make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
|
451
|
+
place_table: 1 times (K=3.0, contribution=2.079)
|
452
|
+
|
453
|
+
gpt-4.1-mini
|
454
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
455
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
456
|
+
collect_sapling: 16 times (K=0.1, contribution=0.283)
|
457
|
+
collect_stone: 1 times (K=1.0, contribution=0.693)
|
458
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
459
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
460
|
+
make_wood_pickaxe: 1 times (K=3.0, contribution=2.079)
|
461
|
+
place_table: 1 times (K=3.0, contribution=2.079)
|
462
|
+
|
463
|
+
gemini-2.5-flash
|
464
|
+
collect_coal: 5 times (K=3.0, contribution=5.375)
|
465
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
466
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
467
|
+
collect_stone: 9 times (K=1.0, contribution=2.303)
|
468
|
+
collect_wood: 18 times (K=1.0, contribution=2.944)
|
469
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
470
|
+
make_stone_pickaxe: 2 times (K=10.0, contribution=10.986)
|
471
|
+
make_wood_pickaxe: 13 times (K=3.0, contribution=7.917)
|
472
|
+
place_furnace: 2 times (K=10.0, contribution=10.986)
|
473
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
474
|
+
place_table: 17 times (K=3.0, contribution=8.671)
|
475
|
+
wake_up: 2 times (K=0.1, contribution=0.110)
|
476
|
+
|
477
|
+
gemini-2.5-pro
|
478
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
479
|
+
collect_drink: 4 times (K=0.1, contribution=0.161)
|
480
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
481
|
+
collect_stone: 6 times (K=1.0, contribution=1.946)
|
482
|
+
collect_wood: 18 times (K=1.0, contribution=2.944)
|
483
|
+
make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
|
484
|
+
make_wood_pickaxe: 10 times (K=3.0, contribution=7.194)
|
485
|
+
place_furnace: 3 times (K=10.0, contribution=13.863)
|
486
|
+
place_table: 18 times (K=3.0, contribution=8.833)
|
487
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
488
|
+
|
489
|
+
gpt-4.1
|
490
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
491
|
+
collect_drink: 3 times (K=0.1, contribution=0.139)
|
492
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
493
|
+
collect_stone: 7 times (K=1.0, contribution=2.079)
|
494
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
495
|
+
defeat_skeleton: 1 times (K=1.0, contribution=0.693)
|
496
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
497
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
498
|
+
make_stone_pickaxe: 4 times (K=10.0, contribution=16.094)
|
499
|
+
make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
|
500
|
+
place_table: 17 times (K=3.0, contribution=8.671)
|
501
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
502
|
+
|
503
|
+
claude-sonnet-4
|
504
|
+
collect_coal: 1 times (K=3.0, contribution=2.079)
|
505
|
+
collect_drink: 2 times (K=0.1, contribution=0.110)
|
506
|
+
collect_sapling: 11 times (K=0.1, contribution=0.248)
|
507
|
+
collect_stone: 4 times (K=1.0, contribution=1.609)
|
508
|
+
collect_wood: 15 times (K=1.0, contribution=2.773)
|
509
|
+
eat_cow: 4 times (K=1.0, contribution=1.609)
|
510
|
+
make_wood_pickaxe: 8 times (K=3.0, contribution=6.592)
|
511
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
512
|
+
place_table: 13 times (K=3.0, contribution=7.917)
|
513
|
+
wake_up: 1 times (K=0.1, contribution=0.069)
|
514
|
+
|
515
|
+
gemini-2.5-flash-lite
|
516
|
+
collect_drink: 8 times (K=0.1, contribution=0.220)
|
517
|
+
collect_sapling: 15 times (K=0.1, contribution=0.277)
|
518
|
+
collect_stone: 2 times (K=1.0, contribution=1.099)
|
519
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
520
|
+
eat_cow: 3 times (K=1.0, contribution=1.386)
|
521
|
+
make_wood_pickaxe: 7 times (K=3.0, contribution=6.238)
|
522
|
+
place_plant: 1 times (K=0.1, contribution=0.069)
|
523
|
+
place_table: 11 times (K=3.0, contribution=7.455)
|
524
|
+
wake_up: 6 times (K=0.1, contribution=0.195)
|
525
|
+
|
526
|
+
o4-mini
|
527
|
+
collect_coal: 7 times (K=3.0, contribution=6.238)
|
528
|
+
collect_drink: 5 times (K=0.1, contribution=0.179)
|
529
|
+
collect_iron: 1 times (K=10.0, contribution=6.931)
|
530
|
+
collect_sapling: 9 times (K=0.1, contribution=0.230)
|
531
|
+
collect_stone: 15 times (K=1.0, contribution=2.773)
|
532
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
533
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
534
|
+
make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
|
535
|
+
make_stone_sword: 1 times (K=10.0, contribution=6.931)
|
536
|
+
make_wood_pickaxe: 19 times (K=3.0, contribution=8.987)
|
537
|
+
place_furnace: 5 times (K=10.0, contribution=17.918)
|
538
|
+
place_plant: 3 times (K=0.1, contribution=0.139)
|
539
|
+
place_table: 19 times (K=3.0, contribution=8.987)
|
540
|
+
wake_up: 3 times (K=0.1, contribution=0.139)
|
541
|
+
|
542
|
+
o3-mini
|
543
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
544
|
+
collect_drink: 7 times (K=0.1, contribution=0.208)
|
545
|
+
collect_sapling: 10 times (K=0.1, contribution=0.240)
|
546
|
+
collect_stone: 5 times (K=1.0, contribution=1.792)
|
547
|
+
collect_wood: 17 times (K=1.0, contribution=2.890)
|
548
|
+
eat_cow: 8 times (K=1.0, contribution=2.197)
|
549
|
+
make_stone_pickaxe: 1 times (K=10.0, contribution=6.931)
|
550
|
+
make_wood_pickaxe: 9 times (K=3.0, contribution=6.908)
|
551
|
+
place_table: 13 times (K=3.0, contribution=7.917)
|
552
|
+
wake_up: 11 times (K=0.1, contribution=0.248)
|
553
|
+
|
554
|
+
qwen/qwen3-32b
|
555
|
+
collect_coal: 3 times (K=3.0, contribution=4.159)
|
556
|
+
collect_drink: 6 times (K=0.1, contribution=0.195)
|
557
|
+
collect_sapling: 12 times (K=0.1, contribution=0.256)
|
558
|
+
collect_stone: 8 times (K=1.0, contribution=2.197)
|
559
|
+
collect_wood: 20 times (K=1.0, contribution=3.045)
|
560
|
+
eat_cow: 5 times (K=1.0, contribution=1.792)
|
561
|
+
make_stone_pickaxe: 3 times (K=10.0, contribution=13.863)
|
562
|
+
make_wood_pickaxe: 15 times (K=3.0, contribution=8.318)
|
563
|
+
place_furnace: 3 times (K=10.0, contribution=13.863)
|
564
|
+
place_plant: 2 times (K=0.1, contribution=0.110)
|
565
|
+
place_table: 18 times (K=3.0, contribution=8.833)
|
566
|
+
wake_up: 13 times (K=0.1, contribution=0.264)
|
567
|
+
|
568
|
+
o3
|
569
|
+
collect_coal: 6 times (K=3.0, contribution=5.838)
|
570
|
+
collect_drink: 1 times (K=0.1, contribution=0.069)
|
571
|
+
collect_iron: 2 times (K=10.0, contribution=10.986)
|
572
|
+
collect_sapling: 11 times (K=0.1, contribution=0.248)
|
573
|
+
collect_stone: 9 times (K=1.0, contribution=2.303)
|
574
|
+
collect_wood: 19 times (K=1.0, contribution=2.996)
|
575
|
+
defeat_zombie: 1 times (K=1.0, contribution=0.693)
|
576
|
+
eat_cow: 1 times (K=1.0, contribution=0.693)
|
577
|
+
make_stone_pickaxe: 7 times (K=10.0, contribution=20.794)
|
578
|
+
make_stone_sword: 3 times (K=10.0, contribution=13.863)
|
579
|
+
make_wood_pickaxe: 14 times (K=3.0, contribution=8.124)
|
580
|
+
make_wood_sword: 6 times (K=3.0, contribution=5.838)
|
581
|
+
place_furnace: 4 times (K=10.0, contribution=16.094)
|
582
|
+
place_plant: 5 times (K=0.1, contribution=0.179)
|
583
|
+
place_table: 15 times (K=3.0, contribution=8.318)
|
584
|
+
wake_up: 12 times (K=0.1, contribution=0.256)
|
585
|
+
|
586
|
+
uv run python src/synth_env/examples/crafter_classic/agent_demos/test_crafter_react_agent.py --config src/evals/configs/crafter.toml
|
587
|
+
|
588
|
+
episodes = 20 # Number of episodes to run
|
589
|
+
max_steps = 50 # Maximum steps per episode
|
590
|
+
seed = 42 # Random seed for reproducibility
|
591
|
+
difficulty = "easy" # Difficulty mode
|
592
|
+
|
593
|
+
|
594
|
+
groq models
|
595
|
+
- meta-llama/llama-4-scout-17b-16e-instruct
|
596
|
+
- meta-llama/llama-4-maverick-17b-128e-instruct
|
597
|
+
- qwen/qwen3-32b
|
598
|
+
|
599
|
+
|
600
|
+
CRAFTER
|
601
|
+
50 steps
|
602
|
+
| Model | Episodes | Mean Score | Avg Achievements | Unique Achievements | Shaped Reward | Mean K-Score |
|
603
|
+
|------------------|----------|------------|------------------|---------------------|---------------|--------------|
|
604
|
+
| qwen-2.5-0.5b | 10 | 1.00 | 1.00 | 1 | 0.240 | 0.024 |
|
605
|
+
| g-1.5-flash-8b | 20 | 1.00 | 1.00 | 1 | 0.304 | 0.015 |
|
606
|
+
| L4-scout-17b | 20 | 0.20 | 0.20 | 4 | 1.525 | 0.076 |
|
607
|
+
| gpt-4.1-nano | 20 | 1.10 | 1.10 | 3 | 2.895 | 0.145 |
|
608
|
+
| gpt-4o-mini | 20 | 1.25 | 1.25 | 4 | 3.525 | 0.176 |
|
609
|
+
| L3.1-8b-groq | 20 | 1.45 | 1.45 | 4 | 3.552 | 0.178 |
|
610
|
+
| L4-maverick-17b | 20 | 2.20 | 2.20 | 6 | 7.087 | 0.354 |
|
611
|
+
| L3.3-70b-groq | 20 | 2.15 | 2.15 | 6 | 7.188 | 0.359 |
|
612
|
+
| gemini-1.5-flash | 20 | 1.55 | 1.55 | 7 | 8.529 | 0.426 |
|
613
|
+
| deepseek-chat | 20 | 1.85 | 1.85 | 7 | 9.458 | 0.473 |
|
614
|
+
| gpt-4.1-mini | 20 | 2.35 | 2.35 | 8 | 11.699 | 0.585 |
|
615
|
+
| groq/kimi-k2 | 20 | 3.05 | 3.05 | 8 | 17.952 | 0.898 |
|
616
|
+
| g-2.5-flash-lite | 20 | 3.50 | 3.50 | 9 | 19.829 | 0.991 |
|
617
|
+
| claude-sonnet-4 | 20 | 3.00 | 3.00 | 10 | 23.077 | 1.154 |
|
618
|
+
| o3-mini | 20 | 4.20 | 4.20 | 10 | 33.491 | 1.675 |
|
619
|
+
| gpt-4.1 | 20 | 4.40 | 4.40 | 12 | 43.371 | 2.169 |
|
620
|
+
| gemini-2.5-flash | 19 | 4.68 | 4.68 | 12 | 50.520 | 2.659 |
|
621
|
+
| gemini-2.5-pro | 20 | 4.00 | 4.00 | 10 | 53.358 | 2.668 |
|
622
|
+
| qwen/qwen3-32b | 20 | 5.40 | 5.40 | 12 | 56.894 | 2.845 |
|
623
|
+
| o4-mini | 20 | 5.70 | 5.70 | 14 | 83.936 | 4.197 |
|
624
|
+
| o3 | 20 | 5.80 | 5.80 | 16 | 97.293 | 4.865 |
|
625
|
+
|
626
|
+
*o3 had trajectories terminated early
|
627
|
+
|
628
|
+
300 steps
|
629
|
+
| gemini-1.5-flash | 20 | 1.50 | 1.50 | 6 | 7.440 | 0.372 |
|
630
|
+
| g-2.5-flash-lite | 20 | 4.90 | 4.90 | 10 | 24.713 | 1.236 |
|
631
|
+
| kimi-k2-instruct | 20 | 4.45 | 4.45 | 12 | 45.834 | 2.292 |
|
632
|
+
| qwen/qwen3-32b | 20 | 6.25 | 6.25 | 14 | 55.396 | 2.770 |
|
633
|
+
|
634
|
+
50 steps, 100 traj
|
635
|
+
| qwen/qwen3-32b | 93 | 4.74 | 4.74 | 14 | 94.806 | 1.019 |
|