synth-ai 0.2.9.dev2__py3-none-any.whl → 0.2.9.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +17 -0
- examples/common_old/backend.py +21 -0
- examples/crafter_debug_render.py +180 -0
- examples/evals_old/README.md +98 -0
- examples/evals_old/__init__.py +6 -0
- examples/evals_old/compare_models.py +1037 -0
- examples/evals_old/example_log.md +145 -0
- examples/evals_old/run_demo.sh +126 -0
- examples/evals_old/trace_analysis.py +270 -0
- examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
- examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
- examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
- examples/finetuning_old/synth_qwen_v1/README.md +68 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
- examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
- examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
- examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
- examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
- examples/finetuning_old/synth_qwen_v1/util.py +147 -0
- examples/rl/README.md +169 -0
- examples/rl/configs/eval_base_qwen.toml +15 -0
- examples/rl/configs/eval_rl_qwen.toml +11 -0
- examples/rl/configs/rl_from_base_qwen.toml +35 -0
- examples/rl/configs/rl_from_base_qwen17.toml +74 -0
- examples/rl/configs/rl_from_ft_qwen.toml +35 -0
- examples/rl/download_dataset.py +64 -0
- examples/rl/run_eval.py +435 -0
- examples/rl/run_rl_and_save.py +94 -0
- examples/rl/task_app/README.md +22 -0
- {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
- examples/rl/task_app/math_task_app.py +107 -0
- examples/rl_old/task_app.py +962 -0
- examples/run_crafter_demo.sh +10 -0
- examples/warming_up_to_rl/analyze_trace_db.py +420 -0
- examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
- examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
- examples/warming_up_to_rl/export_trace_sft.py +541 -0
- examples/warming_up_to_rl/groq_test.py +88 -0
- examples/warming_up_to_rl/manage_secrets.py +127 -0
- examples/warming_up_to_rl/old/event_rewards.md +234 -0
- examples/warming_up_to_rl/old/notes.md +73 -0
- examples/warming_up_to_rl/readme.md +172 -0
- examples/warming_up_to_rl/run_eval.py +434 -0
- examples/warming_up_to_rl/run_fft_and_save.py +309 -0
- examples/warming_up_to_rl/run_local_rollout.py +188 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
- examples/warming_up_to_rl/run_rl_and_save.py +101 -0
- examples/warming_up_to_rl/run_rollout_remote.py +129 -0
- examples/warming_up_to_rl/task_app/README.md +38 -0
- {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +58 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
- synth_ai/api/train/config_finder.py +18 -18
- synth_ai/api/train/env_resolver.py +28 -1
- synth_ai/cli/task_apps.py +264 -55
- synth_ai/demo_registry.py +7 -7
- synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +54 -0
- synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +165 -0
- synth_ai/task/apps/__init__.py +54 -13
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/METADATA +1 -1
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/RECORD +112 -13
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/top_level.txt +1 -0
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
#!/bin/bash

# Run Crafter agent and analyze semantic map word distribution.
# This script demonstrates semantic analysis of agent observations.
# Output: Markdown tables and JSON data (no plotting dependencies).
#
# The demo directory below is given relative to the current working
# directory, so the script must be started from a directory that contains
# the synth_ai/ package tree.

echo "🔍 Analyzing semantic map words from Crafter agent..."
echo "Make sure the synth-ai service is running: uvx synth-ai serve"
echo ""

# Abort if the demo directory is missing; otherwise python would be launched
# from the wrong directory and fail with a confusing "file not found".
cd synth_ai/environments/examples/crafter_classic/agent_demos/ || {
    echo "Could not enter synth_ai/environments/examples/crafter_classic/agent_demos/" >&2
    exit 1
}

# Run the semantic analysis (markdown output only).
# Propagate the analysis exit status instead of always reporting success.
python analyze_semantic_words_markdown.py --model gemini-1.5-flash --episodes 3 --max-turns 30 || {
    status=$?
    echo "Semantic analysis failed (exit status ${status})." >&2
    exit "${status}"
}

echo ""
echo "✅ Analysis complete! Check the generated markdown report and JSON files."
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from synth_ai.config.base_url import get_backend_from_env, PROD_BASE_URL_DEFAULT
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Production backend URL: PROD_BASE_URL_DEFAULT with any trailing "/" removed
# and "/api" appended.
DEFAULT_PROD_BACKEND = f"{PROD_BASE_URL_DEFAULT.rstrip('/')}/api"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def resolve_backend_url() -> str:
    """Return the backend base URL, normalized to end with ``/api``.

    The base URL is whatever ``get_backend_from_env()`` reports as the first
    element of its result (the second element is ignored here). Trailing
    slashes are stripped, and ``/api`` is appended unless the URL already
    ends with it.
    """
    backend, _ignored = get_backend_from_env()
    trimmed = backend.rstrip("/")
    if trimmed.endswith("/api"):
        return trimmed
    return trimmed + "/api"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
if __name__ == "__main__":
|
|
20
|
+
print(resolve_backend_url())
|
|
21
|
+
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Quick local Crafter observation inspector.
|
|
4
|
+
|
|
5
|
+
- Initializes a CrafterClassic env via local service (default http://localhost:8901)
|
|
6
|
+
- Fetches one observation
|
|
7
|
+
- Renders a 7x7 semantic view around the player with best-effort item names
|
|
8
|
+
- Prints status (health/food/energy), inventory, and achievements
|
|
9
|
+
|
|
10
|
+
Run:
|
|
11
|
+
uv run python examples/crafter_debug_render.py --base-url http://localhost:8901 --seed 1
|
|
12
|
+
"""
|
|
13
|
+
import argparse
|
|
14
|
+
import math
|
|
15
|
+
import os
|
|
16
|
+
from typing import Any, Dict, List
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def try_import_crafter_mapping():
    """Build an id -> lowercase-name table from a throwaway ``crafter.Env``.

    Returns:
        A list indexed by semantic id, where each entry is the lowercase name
        of the material or object registered under that id ("void" for ids
        that neither registry assigns), or ``None`` when ``crafter`` cannot be
        imported or anything else goes wrong while probing the environment.
    """
    try:
        import crafter  # type: ignore

        probe = crafter.Env()
        try:
            materials = probe._world._mat_ids
            objects = probe._sem_view._obj_ids
            size = max(max(materials.values()), max(objects.values())) + 1
            table = ["void"] * size
            # Materials first, then objects, so an object id wins on overlap.
            for registry in (materials, objects):
                for key, idx in registry.items():
                    # Keys may be classes (use their name) or plain values.
                    text = key.__name__ if hasattr(key, "__name__") else str(key)
                    table[idx] = text.lower()
            return table
        finally:
            # Best-effort cleanup: a failing close() must not mask the result.
            try:
                probe.close()
            except Exception:
                pass
    except Exception:
        return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def format_semantic_map_view(obs: Dict[str, Any], view_size: int = 7) -> str:
    """Render a text window of the semantic map centered on the player.

    Args:
        obs: Observation dict. The map is read from the first truthy value
            among the keys "semantic_map", "sem_map" and "map"; it may be a
            list of lists or a flat list whose length is a perfect square.
            "player_position" (optional) supplies the window center;
            otherwise the middle of the grid is used.
        view_size: Nominal window width/height; the window extends
            ``max(1, view_size // 2)`` cells on each side of the center.

    Returns:
        One line per window row (cell names separated by spaces) followed by
        a legend line, or a short explanatory message when the map is
        missing, empty, or in an unrecognized format.
    """
    unrecognized = "Semantic map format not recognized"

    raw = obs.get("semantic_map") or obs.get("sem_map") or obs.get("map")
    if raw is None:
        return "No semantic map available"
    if not isinstance(raw, list):
        return unrecognized

    # Normalize to a 2D grid.
    cells: List[List[int]]
    if raw and isinstance(raw[0], list):
        cells = raw
    else:
        try:
            side = int(math.sqrt(len(raw)))
            if side == 0 or side * side != len(raw):
                return unrecognized
            cells = [raw[start : start + side] for start in range(0, side * side, side)]
        except Exception:
            return unrecognized

    height = len(cells)
    width = len(cells[0]) if height > 0 else 0
    if not (height and width):
        return "Empty semantic map"

    # Preferred names come from crafter itself when it is importable.
    labels = try_import_crafter_mapping()
    # Best-effort names for common ids when no crafter mapping is available.
    fallback_labels = {
        0: "grass",
        1: "stone",
        2: "stone",
        3: "tree",
        4: "coal",
        5: "iron",
        6: "water",
        7: "zombie",
        14: "wood",
    }
    # Names that are never listed in the legend.
    unremarkable = {"grass", "you", "0"}

    # Window center: reported player position, else the middle of the grid.
    center = obs.get("player_position") or [height // 2, width // 2]
    try:
        cx, cy = int(center[0]), int(center[1])
    except Exception:
        cx, cy = height // 2, width // 2

    reach = max(1, view_size // 2)
    offsets = range(-reach, reach + 1)
    rendered: List[str] = []
    seen: set[str] = set()
    for dy in offsets:
        line: List[str] = []
        for dx in offsets:
            if dx == 0 and dy == 0:
                line.append("you")
                continue
            gx, gy = cx + dx, cy + dy
            if not (0 <= gx < height and 0 <= gy < width):
                line.append("void")
                continue
            # The grid is indexed [x][y]; unreadable cells become id -1.
            try:
                code = int(cells[gx][gy])
            except Exception:
                code = -1
            if labels and 0 <= code < len(labels):
                label = labels[code]
            else:
                label = fallback_labels.get(code, str(code))
            line.append(label)
            if label not in unremarkable:
                seen.add(label)
        rendered.append(" ".join(line))

    summary = "No notable items visible"
    if seen:
        summary = f"Visible items: {', '.join(sorted(seen))}"
    return "\n".join(rendered) + "\n" + summary
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
async def main():
    """Initialize a CrafterClassic env, print two observations, then terminate it.

    Command-line options:
        --base-url: Service root URL (default: the CRAFTER_BASE_URL environment
            variable, or http://localhost:8901).
        --seed: Integer seed sent in the initialize request (default: 1).

    Raises:
        httpx.HTTPStatusError: If the initialize or step request returns an
            error status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-url", default=os.getenv("CRAFTER_BASE_URL", "http://localhost:8901"))
    parser.add_argument("--seed", type=int, default=1)
    args = parser.parse_args()

    async with httpx.AsyncClient(timeout=30.0) as client:
        init = await client.post(
            f"{args.base_url}/env/CrafterClassic/initialize",
            json={"config": {"difficulty": "easy", "seed": args.seed}},
        )
        init.raise_for_status()
        data = init.json()
        env_id = data["env_id"]

        # From here on the remote env exists, so everything runs under
        # try/finally: a failed step or a rendering error must still
        # terminate the env instead of leaving it running on the service.
        try:
            obs = data["observation"]

            print("=== INITIAL OBSERVATION ===")
            print(format_semantic_map_view(obs, view_size=7))
            inv = obs.get("inventory", {})
            ach = obs.get("achievements_status", {})
            print("\n=== STATUS ===")
            print(f"Health: {obs.get('health', 10)}/10")
            print(f"Hunger: {obs.get('food', 10)}/10")
            print(f"Energy: {obs.get('energy', 10)}/10")
            # Only list inventory entries with a truthy count.
            inv_items = ", ".join([f"{k}: {v}" for k, v in inv.items() if v]) if isinstance(inv, dict) else str(inv)
            print(f"Inventory: {inv_items if inv_items else 'empty'}")
            if isinstance(ach, dict):
                unlocked = sum(1 for v in ach.values() if v)
                print(f"Achievements: {unlocked}/{len(ach)} unlocked")

            # Take one step right (action id 2) to get a new obs
            step = await client.post(
                f"{args.base_url}/env/CrafterClassic/step",
                json={
                    "env_id": env_id,
                    "action": {"tool_calls": [{"tool": "interact", "args": {"action": 2}}]},
                },
            )
            step.raise_for_status()
            sdata = step.json()
            sobs = sdata["observation"]
            print("\n=== NEXT OBSERVATION (after move_right) ===")
            print(format_semantic_map_view(sobs, view_size=7))
        finally:
            # Cleanup is best-effort: a failed terminate must not hide the
            # original error (or turn a successful run into a failure).
            try:
                await client.post(f"{args.base_url}/env/CrafterClassic/terminate", json={"env_id": env_id})
            except Exception:
                pass
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
if __name__ == "__main__":
|
|
176
|
+
import asyncio
|
|
177
|
+
|
|
178
|
+
asyncio.run(main())
|
|
179
|
+
|
|
180
|
+
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Crafter Model Comparison Cookbook
|
|
2
|
+
|
|
3
|
+
This cookbook demonstrates how to run parallel experiments comparing different language models on the Crafter environment, with robust timeout handling and performance analysis.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
This cookbook runs episodes of the Crafter game environment with different language models (e.g., gpt-5-nano and Qwen/Qwen3-32B-Instruct) in parallel, collecting performance metrics and analyzing the results.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Parallel episode execution**: Runs multiple episodes simultaneously for faster experimentation
|
|
12
|
+
- **Timeout handling**:
|
|
13
|
+
- Turn-level timeout (20s per LLM call)
|
|
14
|
+
- Episode-level timeout (180s total)
|
|
15
|
+
- Action execution timeout (5s)
|
|
16
|
+
- **Progress tracking**: Real-time progress bars showing steps across all episodes
|
|
17
|
+
- **Performance comparison**: Analyzes achievements, invalid action rates, and model usage statistics
|
|
18
|
+
- **Deterministic seeding**: Uses consecutive seeds for reproducible experiments
|
|
19
|
+
|
|
20
|
+
## Prerequisites
|
|
21
|
+
|
|
22
|
+
1. Ensure the Crafter environment service is running:
|
|
23
|
+
```bash
|
|
24
|
+
cd synth-ai/
|
|
25
|
+
bash serve.sh
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
2. Set up the API key for your model provider (Synth, OpenAI, or another provider, as needed):
|
|
29
|
+
```bash
|
|
30
|
+
export OPENAI_API_KEY="your-api-key"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
Run the comparison script (gpt-5-nano vs Qwen/Qwen3-32B-Instruct):
|
|
36
|
+
```bash
|
|
37
|
+
uvpm examples.evals_old.compare_models --episodes 5 --max-turns 100 --difficulty easy \
|
|
38
|
+
--models "gpt-5-nano" "Qwen/Qwen3-32B-Instruct"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Or with custom parameters:
|
|
42
|
+
```bash
|
|
43
|
+
python compare_models.py \
|
|
44
|
+
--episodes 10 \
|
|
45
|
+
--max-turns 100 \
|
|
46
|
+
--difficulty easy \
|
|
47
|
+
--models "gpt-5-nano" "Qwen/Qwen3-32B-Instruct" \
|
|
48
|
+
--base-seed 1000 \
|
|
49
|
+
--turn-timeout 30.0 \
|
|
50
|
+
--episode-timeout 300.0
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Parameters
|
|
54
|
+
|
|
55
|
+
- `--episodes`: Number of episodes per model (default: 5)
|
|
56
|
+
- `--max-turns`: Maximum turns per episode (default: 50)
|
|
57
|
+
- `--difficulty`: Game difficulty - easy, medium, hard (default: easy)
|
|
58
|
+
- `--models`: Models to test (default: gpt-4o-mini gpt-4.1-mini)
|
|
59
|
+
- `--base-seed`: Starting seed for episodes (default: 1000)
|
|
60
|
+
- `--turn-timeout`: Timeout per turn in seconds (default: 20.0)
|
|
61
|
+
- `--episode-timeout`: Total timeout per episode in seconds (default: 180.0)
|
|
62
|
+
|
|
63
|
+
## Output
|
|
64
|
+
|
|
65
|
+
The script produces:
|
|
66
|
+
1. Real-time progress bars showing episode execution
|
|
67
|
+
2. Performance summary table comparing models
|
|
68
|
+
3. Achievement frequency analysis
|
|
69
|
+
4. Model usage statistics (filtered to current experiment only)
|
|
70
|
+
5. JSON file with detailed results
|
|
71
|
+
|
|
72
|
+
## Example Output (abridged)
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
📊 Analysis Results:
|
|
76
|
+
================================================================================
|
|
77
|
+
|
|
78
|
+
📈 Model Performance Summary:
|
|
79
|
+
Model Avg Achievements Max Achievements Invalid Rate Success Rate
|
|
80
|
+
--------------------------------------------------------------------------------------
|
|
81
|
+
gpt-5-nano 1.60 ± 1.10 4 1.20% 100.00%
|
|
82
|
+
Qwen/Qwen3-32B-Inst 1.40 ± 1.05 3 1.80% 100.00%
|
|
83
|
+
|
|
84
|
+
🏆 Achievement Frequencies:
|
|
85
|
+
Achievement gpt-5-na qwen3-32
|
|
86
|
+
-----------------------------------------------
|
|
87
|
+
collect_drink 1/5 ( 20%) 3/5 ( 60%)
|
|
88
|
+
collect_sapling 2/5 ( 40%) 2/5 ( 40%)
|
|
89
|
+
collect_wood 4/5 ( 80%) 2/5 ( 40%)
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Implementation Details
|
|
93
|
+
|
|
94
|
+
The comparison uses:
|
|
95
|
+
- Async/await for parallel episode execution
|
|
96
|
+
- Session-based tracing with v3 architecture
|
|
97
|
+
- Structured output tools for consistent LLM interactions
|
|
98
|
+
- SQLite database for tracking model usage and costs
|