synth-ai 0.2.9.dev2__py3-none-any.whl → 0.2.9.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (112) hide show
  1. examples/analyze_semantic_words.sh +17 -0
  2. examples/common_old/backend.py +21 -0
  3. examples/crafter_debug_render.py +180 -0
  4. examples/evals_old/README.md +98 -0
  5. examples/evals_old/__init__.py +6 -0
  6. examples/evals_old/compare_models.py +1037 -0
  7. examples/evals_old/example_log.md +145 -0
  8. examples/evals_old/run_demo.sh +126 -0
  9. examples/evals_old/trace_analysis.py +270 -0
  10. examples/finetuning_old/_backup_synth_qwen/config.toml +29 -0
  11. examples/finetuning_old/_backup_synth_qwen/example_log.md +324 -0
  12. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +60 -0
  13. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +239 -0
  14. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +109 -0
  15. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +1924 -0
  16. examples/finetuning_old/_backup_synth_qwen/readme.md +49 -0
  17. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +114 -0
  18. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +195 -0
  19. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +118 -0
  20. examples/finetuning_old/synth_qwen_v1/README.md +68 -0
  21. examples/finetuning_old/synth_qwen_v1/filter_traces.py +60 -0
  22. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +239 -0
  23. examples/finetuning_old/synth_qwen_v1/finetune.py +46 -0
  24. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +71 -0
  25. examples/finetuning_old/synth_qwen_v1/infer.py +37 -0
  26. examples/finetuning_old/synth_qwen_v1/poll.py +44 -0
  27. examples/finetuning_old/synth_qwen_v1/prepare_data.py +35 -0
  28. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +109 -0
  29. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +1932 -0
  30. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +207 -0
  31. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +232 -0
  32. examples/finetuning_old/synth_qwen_v1/upload_data.py +34 -0
  33. examples/finetuning_old/synth_qwen_v1/util.py +147 -0
  34. examples/rl/README.md +169 -0
  35. examples/rl/configs/eval_base_qwen.toml +15 -0
  36. examples/rl/configs/eval_rl_qwen.toml +11 -0
  37. examples/rl/configs/rl_from_base_qwen.toml +35 -0
  38. examples/rl/configs/rl_from_base_qwen17.toml +74 -0
  39. examples/rl/configs/rl_from_ft_qwen.toml +35 -0
  40. examples/rl/download_dataset.py +64 -0
  41. examples/rl/run_eval.py +435 -0
  42. examples/rl/run_rl_and_save.py +94 -0
  43. examples/rl/task_app/README.md +22 -0
  44. {synth_ai/task/apps → examples/rl/task_app}/math_single_step.py +8 -8
  45. examples/rl/task_app/math_task_app.py +107 -0
  46. examples/rl_old/task_app.py +962 -0
  47. examples/run_crafter_demo.sh +10 -0
  48. examples/warming_up_to_rl/analyze_trace_db.py +420 -0
  49. examples/warming_up_to_rl/configs/crafter_fft.toml +48 -0
  50. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +54 -0
  51. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +20 -0
  52. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +13 -0
  53. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +23 -0
  54. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +73 -0
  55. examples/warming_up_to_rl/configs/rl_from_ft.toml +56 -0
  56. examples/warming_up_to_rl/export_trace_sft.py +541 -0
  57. examples/warming_up_to_rl/groq_test.py +88 -0
  58. examples/warming_up_to_rl/manage_secrets.py +127 -0
  59. examples/warming_up_to_rl/old/event_rewards.md +234 -0
  60. examples/warming_up_to_rl/old/notes.md +73 -0
  61. examples/warming_up_to_rl/readme.md +172 -0
  62. examples/warming_up_to_rl/run_eval.py +434 -0
  63. examples/warming_up_to_rl/run_fft_and_save.py +309 -0
  64. examples/warming_up_to_rl/run_local_rollout.py +188 -0
  65. examples/warming_up_to_rl/run_local_rollout_modal.py +160 -0
  66. examples/warming_up_to_rl/run_local_rollout_parallel.py +342 -0
  67. examples/warming_up_to_rl/run_local_rollout_traced.py +372 -0
  68. examples/warming_up_to_rl/run_rl_and_save.py +101 -0
  69. examples/warming_up_to_rl/run_rollout_remote.py +129 -0
  70. examples/warming_up_to_rl/task_app/README.md +38 -0
  71. {synth_ai/task/apps → examples/warming_up_to_rl/task_app}/grpo_crafter.py +7 -7
  72. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +165 -0
  73. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  74. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  75. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +145 -0
  76. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1271 -0
  77. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  78. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +429 -0
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +442 -0
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +96 -0
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +302 -0
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +202 -0
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +512 -0
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +102 -0
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +985 -0
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +197 -0
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1749 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +217 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +160 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +146 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +58 -0
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +61 -0
  98. synth_ai/api/train/config_finder.py +18 -18
  99. synth_ai/api/train/env_resolver.py +28 -1
  100. synth_ai/cli/task_apps.py +264 -55
  101. synth_ai/demo_registry.py +7 -7
  102. synth_ai/demos/demo_task_apps/crafter/__init__.py +1 -0
  103. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +54 -0
  104. synth_ai/demos/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
  105. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +165 -0
  106. synth_ai/task/apps/__init__.py +54 -13
  107. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/METADATA +1 -1
  108. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/RECORD +112 -13
  109. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/top_level.txt +1 -0
  110. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/WHEEL +0 -0
  111. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/entry_points.txt +0 -0
  112. {synth_ai-0.2.9.dev2.dist-info → synth_ai-0.2.9.dev4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,145 @@
1
+ joshuapurtell@Mac synth-ai % bash examples/evals/run_demo.sh
2
+ Models to compare (space-separated) [gpt-5-nano gpt-4.1-nano]:
3
+ Models: gpt-5-nano gpt-4.1-nano
4
+ Episodes per model [3]: 5
5
+ Max turns per episode [5]: 5
6
+ Parallelism per model (concurrency) [5]: 5
7
+ Difficulty [easy]:
8
+ Running comparison: episodes=5, max_turns=5, difficulty=easy, concurrency=5
9
+ Detected SYNTH_API_KEY (sk_liv...ac95). Use this key? [Y/n]: n
10
+ Use SYNTH_API_KEY_PROD (sk_liv...a2a4)? [y/N]: Y
11
+ [PATCH] Attempting to apply Crafter deterministic patch...
12
+ [PATCH] Patching crafter.Env._balance_object...
13
+ [PATCH] crafter.Env._balance_object patched.
14
+ [PATCH] Attempting to apply Crafter serialization patch v3...
15
+ [PATCH] Adding enhanced save/load methods to crafter.Env...
16
+ [PATCH] crafter.Env.save() and load() methods added (v3).
17
+ [PATCH] Crafter serialization patch v3 complete.
18
+ [PATCH] Attempting to apply simplified Crafter world configuration patch...
19
+ [PATCH] Simplified Crafter world configuration patch complete.
20
+ [PATCH] Available configs: easy, normal, hard, peaceful
21
+ ✅ Loaded 8 Crafter achievement hooks (Easy, Medium, Hard)
22
+ 🎮 Crafter Multi-Model Experiment
23
+ ==================================================
24
+ Experiment ID: crafter_multi_model_20250808_170152
25
+ Models: gpt-5-nano, gpt-4.1-nano
26
+ Episodes per model: 5
27
+ Max turns per episode: 5
28
+ Difficulty: easy
29
+ Seeds: 1000 to 1004
30
+ Turn timeout: 20.0s
31
+ Episode timeout: 180.0s
32
+ Save traces: True
33
+ Database URL: sqlite+aiosqlite:////Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data
34
+ ==================================================
35
+ ✅ Crafter service is running
36
+
37
+ Running 5 episodes for gpt-5-nano in parallel...
38
+
39
+ gpt-5-nano | ep1: 0%| | 0/5 [00:00<?, ?turn/s]
40
+ Running 5 episodes for gpt-4.1-nano in parallel... | 0/5 [00:00<?, ?turn/s]
41
+ gpt-5-nano | ep3: 0%| | 0/5 [00:00<?, ?turn/s]
42
+ gpt-4.1-nano | ep3: 100%|██████████████████████████████████████████████| 5/5 [00:09<00:00, 1.95s/turn, ach=1]
43
+ gpt-4.1-nano | ep2: 80%|████████████████████████████████████▊ | 4/5 [00:10<00:02, 2.64s/turn, ach=2]
44
+ gpt-4.1-nano | ep4: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 2.32s/turn, ach=0]
45
+ gpt-4.1-nano | ep5: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 2.37s/turn, ach=2]
46
+ gpt-5-nano | ep1: 20%|█████████▌ | 1/5 [00:21<01:24, 21.13s/turn, ach=0 ⏰ Turn 3 timed out for episode 0 after 20.0s | 2/5 [00:25<00:38, 12.83s/turn, ach=0]
47
+ gpt-4.1-nano | ep1: 60%|███████████████████████████▌ | 3/5 [00:28<00:19, 9.62s/turn, ach=1]
48
+ gpt-5-nano | ep3: 100%|████████████████████████████████████████████████| 5/5 [01:00<00:00, 12.05s/turn, ach=1]
49
+ gpt-5-nano | ep2: 100%|████████████████████████████████████████████████| 5/5 [01:07<00:00, 13.56s/turn, ach=2]
50
+ ⏰ Turn 4 timed out for episode 3 after 20.0s██████████████████████| 5/5 [01:07<00:00, 14.04s/turn, ach=2]
51
+ gpt-5-nano | ep4: 80%|██████████████████████████████████████▍ | 4/5 [01:08<00:17, 17.02s/turn, ach=0]
52
+ gpt-5-nano | ep5: 100%|████████████████████████████████████████████████| 5/5 [01:13<00:00, 14.71s/turn, ach=1]
53
+ gpt-5-nano | ep1: 100%|████████████████████████████████████████████████| 5/5 [01:19<00:00, 15.83s/turn, ach=1]
54
+ gpt-4.1-nano | ep5: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 1.68s/turn, ach=2]
55
+ 📊 Analysis Results:
56
+ ================================================================================:13<00:00, 14.26s/turn, ach=1]
57
+
58
+ 📈 Model Performance Summary:
59
+ Model Avg Achievements Max Achievements Invalid Rate Success Rate
60
+ --------------------------------------------------------------------------------------
61
+ gpt-4.1-nano 1.20 ± 0.75 2 0.00% 100.00%
62
+ gpt-5-nano 1.00 ± 0.63 2 0.00% 100.00%
63
+
64
+ 🏆 Achievement Frequencies:
65
+
66
+ Achievement gpt-4.1-nano gpt-5-nano
67
+ -----------------------------------------------
68
+ collect_drink 2/5 ( 40%) 0/5 ( 0%)
69
+ collect_sapling 1/5 ( 20%) 2/5 ( 40%)
70
+ collect_wood 3/5 ( 60%) 2/5 ( 40%)
71
+ place_plant 0/5 ( 0%) 1/5 ( 20%)
72
+
73
+ 💰 Model Usage Statistics from Current Experiment:
74
+ Model Provider Usage Count Avg Latency (ms) Total Cost
75
+ ------------------------------------------------------------------------
76
+ gpt-5-nano openai 221 13006.57 $0.0000
77
+ gpt-4.1-nano openai 161 950.12 $0.0000
78
+
79
+ 💾 Detailed results saved to: /Users/joshuapurtell/Documents/GitHub/synth-ai/temp/crafter_experiment_results_20250808_170312.json
80
+
81
+ ✅ Experiment complete!
82
+ Using v3 traces DB: /Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data
83
+ \nAvailable achievements (session counts):
84
+ [PATCH] Attempting to apply Crafter deterministic patch...
85
+ [PATCH] Patching crafter.Env._balance_object...
86
+ [PATCH] crafter.Env._balance_object patched.
87
+ [PATCH] Attempting to apply Crafter serialization patch v3...
88
+ [PATCH] Adding enhanced save/load methods to crafter.Env...
89
+ [PATCH] crafter.Env.save() and load() methods added (v3).
90
+ [PATCH] Crafter serialization patch v3 complete.
91
+ [PATCH] Attempting to apply simplified Crafter world configuration patch...
92
+ [PATCH] Simplified Crafter world configuration patch complete.
93
+ [PATCH] Available configs: easy, normal, hard, peaceful
94
+ Achievements present (session counts):
95
+ - collect_drink: 44
96
+ - collect_sapling: 62
97
+ - collect_wood: 74
98
+ - defeat_skeleton: 4
99
+ - defeat_zombie: 2
100
+ - eat_cow: 2
101
+ - place_plant: 8
102
+ - place_table: 3
103
+ \nEnter achievements to filter by (space-separated), or press Enter for 'collect_wood':
104
+
105
+ Optionally restrict to models (space-separated), or press Enter to include all:
106
+
107
+ \nRunning: uv run python -m examples.evals.trace_analysis filter --db "/Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data" --achievements collect_wood --output ft_data/evals_filtered.jsonl
108
+ [PATCH] Attempting to apply Crafter deterministic patch...
109
+ [PATCH] Patching crafter.Env._balance_object...
110
+ [PATCH] crafter.Env._balance_object patched.
111
+ [PATCH] Attempting to apply Crafter serialization patch v3...
112
+ [PATCH] Adding enhanced save/load methods to crafter.Env...
113
+ [PATCH] crafter.Env.save() and load() methods added (v3).
114
+ [PATCH] Crafter serialization patch v3 complete.
115
+ [PATCH] Attempting to apply simplified Crafter world configuration patch...
116
+ [PATCH] Simplified Crafter world configuration patch complete.
117
+ [PATCH] Available configs: easy, normal, hard, peaceful
118
+ ✅ Wrote 74 examples from 74 sessions → ft_data/evals_filtered.jsonl
119
+ \nRunning: uv run python -m examples.evals.trace_analysis stats --db "/Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data" --achievements collect_wood
120
+ [PATCH] Attempting to apply Crafter deterministic patch...
121
+ [PATCH] Patching crafter.Env._balance_object...
122
+ [PATCH] crafter.Env._balance_object patched.
123
+ [PATCH] Attempting to apply Crafter serialization patch v3...
124
+ [PATCH] Adding enhanced save/load methods to crafter.Env...
125
+ [PATCH] crafter.Env.save() and load() methods added (v3).
126
+ [PATCH] Crafter serialization patch v3 complete.
127
+ [PATCH] Attempting to apply simplified Crafter world configuration patch...
128
+ [PATCH] Simplified Crafter world configuration patch complete.
129
+ [PATCH] Available configs: easy, normal, hard, peaceful
130
+ Matched sessions (any of: collect_wood )
131
+ n=74 avg_reward=0.76 stddev=1.00
132
+ avg_first_unlock_step=4.7 stddev=4.6
133
+ Others
134
+ n=224 avg_reward=0.21 stddev=0.51
135
+
136
+ Achievement frequency by session (matched vs others):
137
+ - collect_drink: matched 25/74 (33.8%), others 19/224 (8.5%)
138
+ - collect_sapling: matched 21/74 (28.4%), others 41/224 (18.3%)
139
+ - place_table: matched 3/74 (4.1%), others 0/224 (0.0%)
140
+ - eat_cow: matched 2/74 (2.7%), others 0/224 (0.0%)
141
+ - place_plant: matched 3/74 (4.1%), others 5/224 (2.2%)
142
+ - defeat_skeleton: matched 2/74 (2.7%), others 2/224 (0.9%)
143
+ - defeat_zombie: matched 0/74 (0.0%), others 2/224 (0.9%)
144
+ \nDone. See ft_data/evals_filtered.jsonl and v3 DB for deeper analysis.
145
+ joshuapurtell@Mac synth-ai %
@@ -0,0 +1,126 @@
1
#!/bin/bash

# Interactive mini-demo: run a small Crafter model comparison (defaults:
# gpt-5-nano vs gpt-4.1-nano), then analyze the resulting v3 traces.

# Fail fast. Enabled before the `cd` below so that a failed directory change
# aborts the script instead of letting the rest of it run from the wrong place.
set -euo pipefail

# Get the directory where this script is located
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Change to the synth-ai root directory (two levels above this script)
cd "$SCRIPT_DIR/../.."

# Load env (prefer local .env at repo root).
# `set -a` exports every variable the file defines; nounset is switched off
# while sourcing (presumably because .env may reference unset variables).
set +u
set -a
if [ -f ".env" ]; then source ".env"; fi
set +a
set -u
20
+
21
# Ensure an API key is present and exported.
#
# Resolution order:
#   1. SYNTH_API_KEY from the environment, if the user confirms it.
#   2. SYNTH_API_KEY_PROD from the environment, if the user opts in.
#   3. A key typed at a hidden prompt (repeated until non-empty).
#
# Exports SYNTH_API_KEY, and mirrors it to OPENAI_API_KEY when that is unset.
# Returns 1 when no key can be obtained because stdin is closed.
ensure_api_key() {
  local current_key="${SYNTH_API_KEY:-}"
  if [ -n "$current_key" ]; then
    # Only show the first 6 and last 4 characters of the key.
    local preview="${current_key:0:6}...${current_key: -4}"
    read -r -p "Detected SYNTH_API_KEY ($preview). Use this key? [Y/n]: " USE_CUR || true
    USE_CUR=${USE_CUR:-Y}
    if [[ ! "$USE_CUR" =~ ^[Yy]$ ]]; then
      current_key=""
    fi
  fi

  if [ -z "$current_key" ] && [ -n "${SYNTH_API_KEY_PROD:-}" ]; then
    local prod_prev="${SYNTH_API_KEY_PROD:0:6}...${SYNTH_API_KEY_PROD: -4}"
    read -r -p "Use SYNTH_API_KEY_PROD ($prod_prev)? [y/N]: " USE_PROD || true
    if [[ "${USE_PROD:-}" =~ ^[Yy]$ ]]; then
      current_key="$SYNTH_API_KEY_PROD"
    fi
  fi

  local read_ok
  while [ -z "$current_key" ]; do
    echo
    read_ok=1
    # -r keeps backslashes in the key intact; -s hides the typed key.
    read -r -s -p "Enter your SYNTH_API_KEY: " KEY_IN || read_ok=0
    echo
    if [ -n "${KEY_IN:-}" ]; then
      current_key="$KEY_IN"
    elif [ "$read_ok" -eq 0 ]; then
      # stdin is closed (e.g. non-interactive run): re-prompting would loop forever.
      echo "A valid SYNTH_API_KEY is required to continue." >&2
      return 1
    else
      echo "A valid SYNTH_API_KEY is required to continue."
    fi
  done

  export SYNTH_API_KEY="$current_key"
  if [ -z "${OPENAI_API_KEY:-}" ]; then
    export OPENAI_API_KEY="$SYNTH_API_KEY"
    echo "OPENAI_API_KEY set from SYNTH_API_KEY."
  fi
}
58
+
59
# Interactive prompts (with sensible defaults).
# `|| true` keeps `set -e` from aborting when a prompt hits EOF; the default is used instead.
MODELS_DEFAULT="gpt-5-nano gpt-4.1-nano"
read -r -p "Models to compare (space-separated) [${MODELS_DEFAULT}]: " MODELS_INPUT || true
MODELS=${MODELS_INPUT:-$MODELS_DEFAULT}
echo "Models: ${MODELS}"

read -r -p "Episodes per model [3]: " EPISODES_INPUT || true
EPISODES=${EPISODES_INPUT:-3}

read -r -p "Max turns per episode [5]: " MAX_TURNS_INPUT || true
MAX_TURNS=${MAX_TURNS_INPUT:-5}

read -r -p "Parallelism per model (concurrency) [5]: " CONCURRENCY_INPUT || true
CONCURRENCY=${CONCURRENCY_INPUT:-5}

read -r -p "Difficulty [easy]: " DIFFICULTY_INPUT || true
DIFFICULTY=${DIFFICULTY_INPUT:-easy}

echo "Running comparison: episodes=${EPISODES}, max_turns=${MAX_TURNS}, difficulty=${DIFFICULTY}, concurrency=${CONCURRENCY}"

# Ensure key before running rollouts
ensure_api_key

# NOTE(review): the commands below address examples/evals/..., while this script is
# shipped under examples/evals_old/ in this release - confirm the paths still resolve.
# ${MODELS} is intentionally unquoted so each space-separated model is its own argument.
uv run python examples/evals/compare_models.py \
  --episodes "${EPISODES}" \
  --max-turns "${MAX_TURNS}" \
  --difficulty "${DIFFICULTY}" \
  --models ${MODELS} \
  --base-seed 1000 \
  --turn-timeout 20.0 \
  --episode-timeout 180.0 \
  --concurrency "${CONCURRENCY}" \
  --quiet

# Derive v3 sqld internal DB path for quick analysis
DB_PATH="$PWD/traces/v3/synth_ai.db/dbs/default/data"
export DB_PATH
echo "Using v3 traces DB: $DB_PATH"

# printf is used wherever a leading blank line is wanted: bash's builtin `echo`
# does not interpret "\n" by default and would print the two characters literally.
printf '\nAvailable achievements (session counts):\n'
uv run python -m examples.evals.trace_analysis list --db "$DB_PATH"

printf "\nEnter achievements to filter by (space-separated), or press Enter for 'collect_wood':\n"
read -r ACH || true
ACH=${ACH:-collect_wood}

echo "Optionally restrict to models (space-separated), or press Enter to include all:"
read -r MODELS_FILTER || true

# $ACH and $MODELS_FILTER are intentionally unquoted below so that each
# space-separated name is passed as a separate argument.
mkdir -p ft_data
if [ -n "$MODELS_FILTER" ]; then
  printf '\nRunning: uv run python -m examples.evals.trace_analysis filter --db "%s" --achievements %s --output ft_data/evals_filtered.jsonl --models %s\n' "$DB_PATH" "$ACH" "$MODELS_FILTER"
  uv run python -m examples.evals.trace_analysis filter --db "$DB_PATH" --achievements $ACH --output ft_data/evals_filtered.jsonl --models $MODELS_FILTER
else
  printf '\nRunning: uv run python -m examples.evals.trace_analysis filter --db "%s" --achievements %s --output ft_data/evals_filtered.jsonl\n' "$DB_PATH" "$ACH"
  uv run python -m examples.evals.trace_analysis filter --db "$DB_PATH" --achievements $ACH --output ft_data/evals_filtered.jsonl
fi

# Show stats comparing filtered vs others (including achievement frequencies)
if [ -n "$MODELS_FILTER" ]; then
  printf '\nRunning: uv run python -m examples.evals.trace_analysis stats --db "%s" --achievements %s --models %s\n' "$DB_PATH" "$ACH" "$MODELS_FILTER"
  uv run python -m examples.evals.trace_analysis stats --db "$DB_PATH" --achievements $ACH --models $MODELS_FILTER
else
  printf '\nRunning: uv run python -m examples.evals.trace_analysis stats --db "%s" --achievements %s\n' "$DB_PATH" "$ACH"
  uv run python -m examples.evals.trace_analysis stats --db "$DB_PATH" --achievements $ACH
fi

printf '\nDone. See ft_data/evals_filtered.jsonl and v3 DB for deeper analysis.\n'
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Trace analysis utilities for Crafter v3 traces (sqld/Turso).
4
+
5
+ Subcommands:
6
+ - list: List achievements present in the database and counts
7
+ - filter: Filter sessions by required achievements and export OpenAI-format JSONL
8
+ - stats: Compare rewards and achievement frequencies for filtered vs. others
9
+
10
+ Usage examples:
11
+ uv run python -m examples.evals.trace_analysis list --db traces/v3/synth_ai.db/dbs/default/data
12
+ uv run python -m examples.evals.trace_analysis filter --db traces/v3/synth_ai.db/dbs/default/data \
13
+ --achievements collect_wood --output ft_data/evals_collect_wood.jsonl
14
+ """
15
+
16
+ import argparse
17
+ import asyncio
18
+ import json
19
+ import json as pyjson
20
+ import math
21
+ from pathlib import Path
22
+
23
+ from synth_ai.environments.examples.crafter_classic.agent_demos.crafter_modal_ft.filter_traces_sft_turso import (
24
+ FinetuningDataExtractorV3,
25
+ )
26
+
27
+
28
def build_db_url(path: str) -> str:
    """Turn a database location into a ``sqlite+aiosqlite`` URL.

    Values that already start with ``sqlite+`` are returned unchanged; anything
    else is treated as a file path and prefixed with ``sqlite+aiosqlite:///``.
    """
    is_already_url = path.startswith("sqlite+")
    return path if is_already_url else "sqlite+aiosqlite:///" + path
32
+
33
+
34
async def cmd_list(db_path: str) -> None:
    """Print each achievement found in the trace DB with its session count.

    Args:
        db_path: Path to the trace database file, or a full ``sqlite+...`` URL.
    """
    tally: dict[str, int] = {}
    async with FinetuningDataExtractorV3(build_db_url(db_path)) as extractor:
        all_sessions = await extractor.get_all_sessions()
        for _, session in all_sessions.iterrows():
            unlocked = await extractor.get_session_achievements(session["session_id"])
            # A falsy result (e.g. None) is treated as "no achievements".
            for achievement in unlocked or []:
                tally[achievement] = tally.get(achievement, 0) + 1

        print("Achievements present (session counts):")
        for achievement in sorted(tally):
            print(f" - {achievement}: {tally[achievement]}")
47
+
48
+
49
async def cmd_filter(
    db_path: str, achievements: list[str], output: str, models: list[str] | None = None
) -> None:
    """Export sessions that unlocked any requested achievement as OpenAI-format JSONL.

    Args:
        db_path: Path to the trace database file, or a full ``sqlite+...`` URL.
        achievements: Achievement names; a session is kept when it has at least one.
        output: Destination JSONL path; missing parent directories are created.
        models: Optional model names. When given, sessions whose ``cais`` events
            mention none of these models are skipped.
    """
    model_query = """
        SELECT DISTINCT model_name
        FROM events
        WHERE session_id = :session_id
        AND event_type = 'cais'
        AND model_name IS NOT NULL
        """
    wanted: set[str] = set(achievements)

    async with FinetuningDataExtractorV3(build_db_url(db_path)) as extractor:
        all_sessions = await extractor.get_all_sessions()
        selected: list[str] = []
        for _, session in all_sessions.iterrows():
            session_id = session["session_id"]

            if models:
                # Restrict to sessions containing any of the requested models.
                model_df = await extractor.db_manager.query_traces(
                    model_query, {"session_id": session_id}
                )
                has_rows = model_df is not None and not model_df.empty
                names_in_session = model_df["model_name"].tolist() if has_rows else []
                if all(m not in names_in_session for m in models):
                    continue

            unlocked = await extractor.get_session_achievements(session_id) or []
            if wanted.intersection(unlocked):
                selected.append(session_id)

        examples = await extractor.extract_openai_format(selected)
        Path(output).parent.mkdir(parents=True, exist_ok=True)
        with open(output, "w") as sink:
            # One JSON document per line.
            sink.writelines(json.dumps(example) + "\n" for example in examples)
        print(f"✅ Wrote {len(examples)} examples from {len(selected)} sessions → {output}")
87
+
88
+
89
async def _first_achievement_step(
    ex: FinetuningDataExtractorV3, session_id: str, required: set[str]
) -> int | None:
    """Return the ``message_time`` at which a required achievement first shows as unlocked.

    The session's ``environment`` events are scanned in ascending ``message_time``
    order. For each event, ``system_state_after`` (decoded from JSON when it is a
    string) is inspected at ``public_state.achievements_status``.

    Args:
        ex: Open extractor; only ``ex.db_manager.query_traces`` is used here.
        session_id: Session whose events are scanned.
        required: Achievement names of interest; any one of them counts.

    Returns:
        ``message_time`` of the first qualifying event as an ``int`` (``0`` when
        that value is missing or falsy), or ``None`` when the session has no
        environment events or none reports a required achievement as unlocked.
    """
    query = """
        SELECT message_time, system_state_after
        FROM events
        WHERE session_id = :session_id
        AND event_type = 'environment'
        ORDER BY message_time ASC
        """
    events = await ex.db_manager.query_traces(query, {"session_id": session_id})
    if events is None or events.empty:
        return None

    for _, event in events.iterrows():
        state = event.get("system_state_after")
        if isinstance(state, str):
            try:
                state = pyjson.loads(state)
            except ValueError:
                # Malformed JSON (JSONDecodeError is a ValueError): skip this event.
                continue
        if not isinstance(state, dict):
            continue

        status = (state.get("public_state") or {}).get("achievements_status") or {}
        if not isinstance(status, dict):
            continue

        # The first event with any required achievement unlocked decides the result,
        # so no bookkeeping of previously seen achievements is needed.
        if any(unlocked and name in required for name, unlocked in status.items()):
            return int(event.get("message_time") or 0)
    return None
119
+
120
+
121
def _mean(values: list[float]) -> float:
    """Return the arithmetic mean of ``values``, or ``0.0`` when it is empty."""
    if not values:
        return 0.0
    return sum(values) / len(values)
123
+
124
+
125
def _stddev(values: list[float]) -> float:
    """Return the population standard deviation of ``values`` (``0.0`` when empty).

    The variance divides by ``len(values)``, not ``len(values) - 1``.
    """
    count = len(values)
    if count == 0:
        return 0.0
    center = _mean(values)
    squared_deviations = [(v - center) * (v - center) for v in values]
    return math.sqrt(sum(squared_deviations) / count)
131
+
132
+
133
async def cmd_stats(db_path: str, achievements: list[str], models: list[str] | None = None) -> None:
    """Print reward and achievement statistics for matching sessions versus all others.

    A session "matches" when it unlocked at least one of ``achievements``. For both
    groups the session count, mean reward and reward standard deviation are printed;
    for matched sessions the first-unlock step statistics are printed as well. A
    final table compares how often every other achievement occurs in each group,
    showing the ten largest percentage differences.

    Args:
        db_path: Path to the trace database file, or a full ``sqlite+...`` URL.
        achievements: Achievement names to match (any match).
        models: Optional model names. When given, sessions whose ``cais`` events
            mention none of these models are ignored entirely.
    """
    model_query = """
        SELECT DISTINCT model_name
        FROM events
        WHERE session_id = :session_id
        AND event_type = 'cais'
        AND model_name IS NOT NULL
        """
    wanted: set[str] = set(achievements)
    # Buckets are keyed by "did the session match": True -> matched, False -> others.
    rewards: dict[bool, list[float]] = {True: [], False: []}
    tallies: dict[bool, dict[str, int]] = {True: {}, False: {}}
    unlock_steps: list[int] = []

    async with FinetuningDataExtractorV3(build_db_url(db_path)) as extractor:
        all_sessions = await extractor.get_all_sessions()

        for _, session in all_sessions.iterrows():
            session_id = session["session_id"]

            if models:
                model_df = await extractor.db_manager.query_traces(
                    model_query, {"session_id": session_id}
                )
                has_rows = model_df is not None and not model_df.empty
                names_in_session = model_df["model_name"].tolist() if has_rows else []
                if all(m not in names_in_session for m in models):
                    continue

            unlocked = await extractor.get_session_achievements(session_id) or []
            metrics = await extractor.get_session_metrics(session_id)
            reward = float(metrics.get("total_reward", 0.0))

            is_match = bool(wanted & set(unlocked))
            rewards[is_match].append(reward)
            if is_match:
                first_step = await _first_achievement_step(extractor, session_id, wanted)
                if first_step is not None:
                    unlock_steps.append(first_step)

            group_tally = tallies[is_match]
            for achievement in unlocked:
                group_tally[achievement] = group_tally.get(achievement, 0) + 1

        matched_rewards, other_rewards = rewards[True], rewards[False]
        matched_tally, other_tally = tallies[True], tallies[False]
        # Exactly one reward is recorded per session, so these are the group sizes.
        matched_total, other_total = len(matched_rewards), len(other_rewards)

        print("Matched sessions (any of:", ", ".join(sorted(wanted)), ")")
        print(
            f" n={matched_total} avg_reward={_mean(matched_rewards):.2f} stddev={_stddev(matched_rewards):.2f}"
        )
        if unlock_steps:
            steps = [float(s) for s in unlock_steps]
            print(f" avg_first_unlock_step={_mean(steps):.1f} stddev={_stddev(steps):.1f}")
        else:
            print(" avg_first_unlock_step=n/a (no unlocks recorded)")
        print("Others")
        print(
            f" n={other_total} avg_reward={_mean(other_rewards):.2f} stddev={_stddev(other_rewards):.2f}"
        )

        # Achievement frequency comparison (by session presence), excluding the
        # achievements that were used for matching.
        candidates = sorted((set(matched_tally) | set(other_tally)) - wanted)
        if candidates and (matched_total > 0 or other_total > 0):
            print("\nAchievement frequency by session (matched vs others):")
            # The absolute percentage difference leads each tuple so sorting ranks by it.
            ranked: list[tuple[float, str, int, float, int, float]] = []
            for name in candidates:
                m_n = matched_tally.get(name, 0)
                o_n = other_tally.get(name, 0)
                m_pct = (m_n / matched_total * 100.0) if matched_total else 0.0
                o_pct = (o_n / other_total * 100.0) if other_total else 0.0
                ranked.append((abs(m_pct - o_pct), name, m_n, m_pct, o_n, o_pct))

            # Show top 10 differences
            ranked.sort(reverse=True)
            for _, name, m_n, m_pct, o_n, o_pct in ranked[:10]:
                print(
                    f" - {name}: matched {m_n}/{matched_total} ({m_pct:.1f}%), others {o_n}/{other_total} ({o_pct:.1f}%)"
                )
223
+
224
+
225
def main() -> None:
    """Parse the command line and run the chosen subcommand.

    Subcommands are ``list``, ``filter`` and ``stats``; each one is executed to
    completion with ``asyncio.run``.
    """
    db_help = "Path to sqld internal data file or full sqlite+aiosqlite URL"
    models_help = "Optional model names to include (any match)"

    parser = argparse.ArgumentParser(description="Crafter v3 trace analysis")
    subparsers = parser.add_subparsers(dest="command", required=True)

    list_parser = subparsers.add_parser("list", help="List achievements present in DB")
    list_parser.add_argument("--db", required=True, help=db_help)

    filter_parser = subparsers.add_parser(
        "filter", help="Filter sessions by achievements and export JSONL"
    )
    filter_parser.add_argument("--db", required=True, help=db_help)
    filter_parser.add_argument(
        "--achievements",
        nargs="+",
        required=True,
        help="Required achievements (any match keeps session)",
    )
    filter_parser.add_argument("--output", required=True, help="Output JSONL path")
    filter_parser.add_argument("--models", nargs="*", help=models_help)

    stats_parser = subparsers.add_parser("stats", help="Show summary stats for filtered vs others")
    stats_parser.add_argument("--db", required=True, help=db_help)
    stats_parser.add_argument(
        "--achievements", nargs="+", required=True, help="Achievements to match (any match)"
    )
    stats_parser.add_argument("--models", nargs="*", help=models_help)

    args = parser.parse_args()

    # An empty --models list is normalised to None, meaning "no model restriction".
    if args.command == "list":
        asyncio.run(cmd_list(args.db))
    elif args.command == "filter":
        asyncio.run(cmd_filter(args.db, args.achievements, args.output, args.models or None))
    elif args.command == "stats":
        asyncio.run(cmd_stats(args.db, args.achievements, args.models or None))
267
+
268
+
269
+ if __name__ == "__main__":
270
+ main()
@@ -0,0 +1,29 @@
1
+ # Centralized configuration for Synth Qwen Crafter workflows
2
+
3
+ [rollouts]
4
+ model = "Qwen/Qwen3-4B-Instruct-2507"
5
+ episodes = 5
6
+ max_steps = 30
7
+ difficulty = "easy"
8
+ temperature = 0.4
9
+ max_tokens = 2048
10
+ tool_choice = "required"
11
+
12
+ [traces]
13
+ sqld_db_path = "traces/v3/synth_ai.db"
14
+
15
+ [filter]
16
+ # For v3 sqld traces, use the internal data file under the sqld directory
17
+ db_path = "traces/v3/synth_ai.db/dbs/default/data"
18
+ required_achievements = ["collect_wood"]
19
+ min_total_reward = 1.0
20
+ max_cost = 10.0
21
+ max_tokens = 100000
22
+ output_jsonl = "ft_data/qwen4b_crafter_sft_collect_wood.jsonl"
23
+
24
+ [sft]
25
+ base_model = "Qwen/Qwen3-4B-Instruct-2507"
26
+ training_jsonl = "ft_data/qwen4b_crafter_sft_collect_wood.jsonl"
27
+ n_epochs = 1
28
+ batch_size = 4
29
+ upload_to_wasabi = true