synth-ai 0.2.9.dev7__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (323) hide show
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +8 -11
  3. examples/dev/qwen3_32b_qlora_4xh100.toml +40 -0
  4. examples/multi_step/crafter_rl_lora.md +29 -0
  5. examples/qwen_coder/README.md +102 -0
  6. examples/qwen_coder/_shared.py +113 -0
  7. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  8. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  9. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  10. examples/qwen_coder/generate_dataset.py +98 -0
  11. examples/qwen_coder/infer_ft_smoke.py +65 -0
  12. examples/qwen_coder/infer_prod_proxy.py +73 -0
  13. examples/qwen_coder/infer_via_synth.py +87 -0
  14. examples/qwen_coder/scripts/infer_coder.sh +19 -0
  15. examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
  16. examples/qwen_coder/sft_full_17b.py +103 -0
  17. examples/qwen_coder/sft_lora_30b.py +110 -0
  18. examples/qwen_coder/subset_jsonl.py +39 -0
  19. examples/qwen_coder/todos.md +38 -0
  20. examples/qwen_coder/validate_jsonl.py +60 -0
  21. examples/rl/run_eval.py +36 -37
  22. examples/rl/run_rl_and_save.py +5 -5
  23. examples/rl/task_app/math_single_step.py +65 -43
  24. examples/rl/task_app/math_task_app.py +3 -3
  25. examples/sft/README.md +139 -0
  26. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  27. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  28. examples/sft/evaluate.py +117 -0
  29. examples/sft/export_dataset.py +117 -0
  30. examples/sft/generate_traces.py +162 -0
  31. examples/swe/__init__.py +12 -0
  32. examples/swe/task_app/README.md +105 -0
  33. examples/swe/task_app/__init__.py +2 -0
  34. examples/swe/task_app/grpo_swe_mini.py +571 -0
  35. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  36. examples/swe/task_app/hosted/README.md +173 -0
  37. examples/swe/task_app/hosted/__init__.py +5 -0
  38. examples/swe/task_app/hosted/branching.py +143 -0
  39. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  40. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  41. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  42. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  43. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  44. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  45. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  46. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  47. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  50. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  51. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  52. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  53. examples/swe/task_app/hosted/hosted_app.py +204 -0
  54. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  55. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  56. examples/swe/task_app/hosted/main.py +100 -0
  57. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  58. examples/swe/task_app/hosted/registry.py +195 -0
  59. examples/swe/task_app/hosted/rollout.py +1869 -0
  60. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  61. examples/swe/task_app/hosted/storage/volume.py +211 -0
  62. examples/swe/task_app/hosted/test_agents.py +161 -0
  63. examples/swe/task_app/hosted/test_service.py +137 -0
  64. examples/swe/task_app/hosted/utils.py +62 -0
  65. examples/vlm/PROPOSAL.md +53 -0
  66. examples/vlm/README.md +68 -0
  67. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  68. examples/vlm/crafter_image_only_agent.py +207 -0
  69. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  70. examples/vlm/filter_image_rows.py +63 -0
  71. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  72. examples/warming_up_to_rl/analyze_trace_db.py +5 -5
  73. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  74. examples/warming_up_to_rl/export_trace_sft.py +78 -21
  75. examples/warming_up_to_rl/groq_test.py +4 -4
  76. examples/warming_up_to_rl/manage_secrets.py +13 -18
  77. examples/warming_up_to_rl/run_eval.py +42 -44
  78. examples/warming_up_to_rl/run_fft_and_save.py +11 -16
  79. examples/warming_up_to_rl/run_local_rollout.py +1 -3
  80. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -4
  81. examples/warming_up_to_rl/run_local_rollout_parallel.py +1 -4
  82. examples/warming_up_to_rl/run_local_rollout_traced.py +3 -5
  83. examples/warming_up_to_rl/run_rl_and_save.py +5 -6
  84. examples/warming_up_to_rl/run_rollout_remote.py +8 -10
  85. examples/warming_up_to_rl/task_app/README.md +6 -2
  86. examples/warming_up_to_rl/task_app/grpo_crafter.py +234 -35
  87. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +2 -3
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +131 -114
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +101 -41
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +73 -51
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +14 -6
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +16 -16
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +32 -34
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +94 -31
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +303 -203
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +328 -225
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +13 -13
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +1 -0
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  105. synth_ai/api/models/supported.py +376 -0
  106. synth_ai/api/train/builders.py +128 -21
  107. synth_ai/api/train/cli.py +80 -64
  108. synth_ai/api/train/config_finder.py +7 -2
  109. synth_ai/api/train/env_resolver.py +1 -1
  110. synth_ai/api/train/pollers.py +2 -1
  111. synth_ai/api/train/supported_algos.py +139 -0
  112. synth_ai/api/train/task_app.py +1 -2
  113. synth_ai/api/train/utils.py +13 -44
  114. synth_ai/cli/__init__.py +8 -0
  115. synth_ai/cli/_modal_wrapper.py +28 -0
  116. synth_ai/cli/_typer_patch.py +49 -0
  117. synth_ai/cli/balance.py +1 -2
  118. synth_ai/cli/calc.py +1 -1
  119. synth_ai/cli/demo.py +2 -1
  120. synth_ai/cli/recent.py +2 -2
  121. synth_ai/cli/rl_demo.py +2 -1
  122. synth_ai/cli/root.py +11 -13
  123. synth_ai/cli/status.py +2 -2
  124. synth_ai/cli/task_apps.py +529 -179
  125. synth_ai/cli/traces.py +6 -4
  126. synth_ai/cli/watch.py +12 -18
  127. synth_ai/demo_registry.py +1 -1
  128. synth_ai/demos/core/cli.py +36 -43
  129. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  130. synth_ai/demos/demo_task_apps/core.py +17 -25
  131. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +3 -4
  132. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  133. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -4
  134. synth_ai/demos/demo_task_apps/math/modal_task_app.py +16 -18
  135. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
  136. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  137. synth_ai/environments/reproducibility/tree.py +2 -5
  138. synth_ai/environments/service/app.py +11 -12
  139. synth_ai/environments/service/core_routes.py +4 -7
  140. synth_ai/environments/stateful/engine.py +1 -1
  141. synth_ai/environments/tasks/core.py +1 -0
  142. synth_ai/environments/tasks/filters.py +5 -6
  143. synth_ai/environments/tasks/utils.py +4 -5
  144. synth_ai/handshake.py +9 -9
  145. synth_ai/http.py +1 -1
  146. synth_ai/http_client.py +18 -10
  147. synth_ai/inference/client.py +15 -5
  148. synth_ai/jobs/client.py +78 -83
  149. synth_ai/learning/__init__.py +41 -6
  150. synth_ai/learning/algorithms.py +14 -0
  151. synth_ai/learning/client.py +91 -24
  152. synth_ai/learning/config.py +2 -38
  153. synth_ai/learning/ft_client.py +4 -59
  154. synth_ai/learning/health.py +5 -6
  155. synth_ai/learning/jobs.py +31 -47
  156. synth_ai/{rl → learning/rl}/__init__.py +14 -4
  157. synth_ai/learning/rl/client.py +267 -0
  158. synth_ai/learning/rl/config.py +31 -0
  159. synth_ai/{rl → learning/rl}/contracts.py +5 -8
  160. synth_ai/{rl → learning/rl}/env_keys.py +39 -15
  161. synth_ai/learning/rl/secrets.py +13 -0
  162. synth_ai/learning/rl_client.py +2 -281
  163. synth_ai/learning/sft/__init__.py +29 -0
  164. synth_ai/learning/sft/client.py +68 -0
  165. synth_ai/learning/sft/config.py +270 -0
  166. synth_ai/learning/sft/data.py +295 -0
  167. synth_ai/learning/sse.py +25 -24
  168. synth_ai/learning/validators.py +25 -28
  169. synth_ai/lm/__init__.py +21 -47
  170. synth_ai/task/__init__.py +25 -27
  171. synth_ai/task/apps/__init__.py +7 -8
  172. synth_ai/task/auth.py +8 -8
  173. synth_ai/task/client.py +14 -14
  174. synth_ai/task/contracts.py +36 -35
  175. synth_ai/task/datasets.py +6 -5
  176. synth_ai/task/errors.py +10 -10
  177. synth_ai/task/health.py +17 -9
  178. synth_ai/task/json.py +58 -23
  179. synth_ai/task/proxy.py +13 -9
  180. synth_ai/task/rubrics.py +16 -15
  181. synth_ai/task/server.py +12 -12
  182. synth_ai/task/tracing_utils.py +4 -4
  183. synth_ai/task/vendors.py +5 -6
  184. synth_ai/tracing_v3/__init__.py +2 -0
  185. synth_ai/tracing_v3/abstractions.py +21 -4
  186. synth_ai/tracing_v3/decorators.py +18 -16
  187. synth_ai/tracing_v3/hooks.py +5 -5
  188. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  189. synth_ai/tracing_v3/session_tracer.py +40 -14
  190. synth_ai/tracing_v3/storage/base.py +85 -0
  191. synth_ai/tracing_v3/storage/config.py +21 -8
  192. synth_ai/tracing_v3/storage/factory.py +10 -7
  193. synth_ai/tracing_v3/storage/utils.py +4 -2
  194. synth_ai/tracing_v3/turso/daemon.py +7 -2
  195. synth_ai/tracing_v3/turso/models.py +2 -2
  196. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  197. synth_ai/tracing_v3/utils.py +4 -4
  198. synth_ai/v0/api/__init__.py +8 -0
  199. synth_ai/v0/api/models/__init__.py +8 -0
  200. synth_ai/v0/api/models/supported.py +8 -0
  201. synth_ai/v0/config/__init__.py +15 -0
  202. synth_ai/v0/config/base_url.py +12 -0
  203. synth_ai/v0/lm/__init__.py +51 -0
  204. synth_ai/{lm → v0/lm}/caching/ephemeral.py +2 -2
  205. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  206. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  207. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  208. synth_ai/{lm → v0/lm}/config.py +6 -1
  209. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  210. synth_ai/{lm → v0/lm}/core/main.py +6 -6
  211. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  212. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -14
  213. synth_ai/{lm → v0/lm}/core/vendor_clients.py +2 -2
  214. synth_ai/{lm → v0/lm}/overrides.py +2 -2
  215. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  216. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  217. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  218. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  219. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +9 -9
  220. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  221. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  222. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +10 -10
  223. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +8 -8
  224. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +2 -2
  225. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +3 -3
  226. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  227. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  228. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  229. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  230. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  231. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  232. synth_ai/{lm → v0/lm}/vendors/synth_client.py +1 -1
  233. synth_ai/v0/tracing_v3/__init__.py +10 -0
  234. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  235. synth_ai/v0/tracing_v3/decorators.py +3 -0
  236. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  237. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  238. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/METADATA +10 -7
  239. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/RECORD +269 -233
  240. examples/common_old/backend.py +0 -20
  241. examples/evals_old/README.md +0 -98
  242. examples/evals_old/__init__.py +0 -6
  243. examples/evals_old/compare_models.py +0 -1038
  244. examples/evals_old/example_log.md +0 -145
  245. examples/evals_old/run_demo.sh +0 -126
  246. examples/evals_old/trace_analysis.py +0 -270
  247. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  248. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  249. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  250. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -243
  251. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  252. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  253. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  254. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  255. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  256. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -119
  257. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  258. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  259. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -243
  260. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  261. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  262. examples/finetuning_old/synth_qwen_v1/infer.py +0 -36
  263. examples/finetuning_old/synth_qwen_v1/poll.py +0 -46
  264. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  265. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  266. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1933
  267. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -210
  268. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -237
  269. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  270. examples/finetuning_old/synth_qwen_v1/util.py +0 -152
  271. examples/rl_old/task_app.py +0 -1131
  272. synth_ai/experimental/synth_oss.py +0 -445
  273. synth_ai/learning/filtering.py +0 -0
  274. synth_ai/learning/offline/dpo.py +0 -0
  275. synth_ai/learning/offline/providers.py +0 -7
  276. synth_ai/learning/offline/sft.py +0 -0
  277. synth_ai/learning/offline/shared.py +0 -0
  278. synth_ai/learning/online/grpo.py +0 -0
  279. synth_ai/learning/online/irft.py +0 -0
  280. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  281. synth_ai/learning/prompts/gepa.py +0 -0
  282. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -211
  283. synth_ai/learning/prompts/mipro.py +0 -289
  284. synth_ai/learning/prompts/random_search.py +0 -249
  285. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  286. synth_ai/learning/prompts/run_random_search_banking77.py +0 -329
  287. synth_ai/rl/secrets.py +0 -19
  288. synth_ai/scripts/verify_rewards.py +0 -100
  289. synth_ai/tracing/__init__.py +0 -30
  290. synth_ai/tracing_v1/__init__.py +0 -33
  291. synth_ai/tracing_v3/turso/__init__.py +0 -25
  292. synth_ai/tracing_v3/turso/manager.py +0 -838
  293. synth_ai/zyk/__init__.py +0 -30
  294. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  295. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  296. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  297. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  298. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  299. /synth_ai/{lm → v0/lm}/core/exceptions.py +0 -0
  300. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  301. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  302. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  303. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  304. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  305. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  306. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  307. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  308. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  309. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  310. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  311. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  312. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  313. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  314. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  315. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  316. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  317. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  318. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  319. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  320. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/WHEEL +0 -0
  321. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/entry_points.txt +0 -0
  322. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/licenses/LICENSE +0 -0
  323. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/top_level.txt +0 -0
@@ -1,1038 +0,0 @@
1
- #!/usr/bin/env python3
2
- # ruff: noqa: E402
3
- """
4
- Comprehensive script to run Crafter rollouts for multiple models and compare their performance.
5
- Updated to use tracing_v3 with async architecture.
6
-
7
- Runs experiments for:
8
- - gpt-4o-mini
9
- - gpt-4.1-mini
10
- - gpt-4.1-nano
11
- - gemini-1.5-flash
12
- - gemini-2.5-flash-lite
13
- - qwen3/32b
14
-
15
- Analyzes and compares:
16
- - Invalid action rates
17
- - Achievement frequencies by step
18
- - Achievement counts across models
19
- - Performance metrics
20
- - Cost analysis
21
- """
22
-
23
- import os
24
- import sys
25
- from pathlib import Path
26
-
27
- # Ensure repository root is on sys.path before importing synth_ai
28
- synth_ai_root = Path(__file__).parent.parent.parent
29
- sys.path.insert(0, str(synth_ai_root))
30
-
31
- # Disable v1 logging to see v3 tracing clearly
32
- os.environ["LANGFUSE_ENABLED"] = "false"
33
- os.environ["SYNTH_LOGGING"] = "false"
34
-
35
- import argparse
36
- import asyncio
37
- import contextlib
38
- import json
39
- import logging
40
- import random
41
- import time
42
- from collections import defaultdict
43
- from datetime import datetime
44
- from typing import Any
45
-
46
- import httpx
47
- import numpy as np
48
- import pandas as pd
49
-
50
- # Import enhanced LM with v3 tracing
51
- from synth_ai.lm.core.main_v3 import LM
52
- from synth_ai.tracing_v3.abstractions import (
53
- EnvironmentEvent,
54
- RuntimeEvent,
55
- SessionEventMarkovBlanketMessage,
56
- TimeRecord,
57
- )
58
- from synth_ai.tracing_v3.db_config import get_default_db_config
59
- from synth_ai.tracing_v3.decorators import set_turn_number
60
- from synth_ai.tracing_v3.session_tracer import SessionTracer
61
- from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager
62
- from tqdm import tqdm
63
-
64
- # Disable httpx logging for cleaner output
65
- logging.getLogger("httpx").setLevel(logging.WARNING)
66
-
67
- # Import Crafter hooks
68
- try:
69
- from synth_ai.environments.examples.crafter_classic.trace_hooks_v3 import CRAFTER_HOOKS
70
-
71
- print(f"✅ Loaded {len(CRAFTER_HOOKS.hooks)} Crafter achievement hooks (Easy, Medium, Hard)")
72
- except ImportError:
73
- print("Warning: Could not import CRAFTER_HOOKS for v3")
74
- from synth_ai.tracing_v3.hooks import HookManager
75
-
76
- CRAFTER_HOOKS = HookManager()
77
-
78
- # Global buckets for sessions
79
- _SESSIONS: dict[str, tuple[str, object]] = {} # session_id -> (experiment_id, trace)
80
-
81
- # Configuration
82
- MODELS_TO_TEST = [
83
- "gpt-5-nano",
84
- "gpt-4.1-nano",
85
- ]
86
-
87
- # Service URLs (modify these based on your setup)
88
- CRAFTER_SERVICE_URL = "http://localhost:8901"
89
-
90
- # Database configuration - uses the centralized config which matches serve.sh
91
- db_config = get_default_db_config()
92
- DATABASE_URL = db_config.database_url
93
-
94
- # Retry configuration for HTTP requests
95
- MAX_RETRIES = 3
96
- BASE_DELAY = 0.1
97
- MAX_DELAY = 2.0
98
- HTTP_TIMEOUT = 30.0
99
-
100
-
101
- class ExperimentConfig:
102
- """Configuration for the multi-model experiment."""
103
-
104
- def __init__(self):
105
- self.num_episodes = 10 # Number of episodes per model
106
- self.max_turns = 100 # Max turns per episode
107
- self.difficulty = "easy"
108
- self.save_traces = True
109
- self.verbose = True
110
- self.quiet = False # Default to verbose mode
111
- self.enable_v3_tracing = True
112
- self.v3_trace_dir = "./traces/v3/crafter_comparison"
113
- self.crafter_service_url = CRAFTER_SERVICE_URL
114
- self.database_url = DATABASE_URL
115
- self.base_seed = 1000 # Base seed for episode generation
116
- self.turn_timeout = 30.0 # Timeout per turn in seconds
117
- self.episode_timeout = 300.0 # Total timeout per episode in seconds
118
- self.concurrency = 5 # Max concurrent episodes per model
119
-
120
-
121
- async def retry_http_request(client: httpx.AsyncClient, method: str, url: str, **kwargs) -> Any:
122
- """Retry HTTP requests with exponential backoff and jitter."""
123
- last_exception = None
124
-
125
- for attempt in range(MAX_RETRIES):
126
- try:
127
- if attempt > 0:
128
- delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
129
- jitter = random.uniform(0, 0.1 * delay)
130
- total_delay = delay + jitter
131
- await asyncio.sleep(total_delay)
132
-
133
- response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
134
-
135
- if response.status_code < 500:
136
- return response
137
-
138
- last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
139
-
140
- except httpx.ConnectError as e:
141
- last_exception = Exception(f"Connection failed to {url}: {e}")
142
- if attempt < MAX_RETRIES - 1:
143
- await asyncio.sleep(1.0 * (2**attempt))
144
- except httpx.ReadError as e:
145
- last_exception = e
146
- if attempt < MAX_RETRIES - 1:
147
- read_error_delay = min(1.0 * (2**attempt), 5.0)
148
- await asyncio.sleep(read_error_delay)
149
- except Exception as e:
150
- last_exception = e
151
-
152
- print(f" ❌ HTTP request failed after {MAX_RETRIES} attempts: {method} {url}")
153
- print(f" ❌ Error: {type(last_exception).__name__}: {str(last_exception)[:200]}")
154
- raise last_exception
155
-
156
-
157
- # Crafter action mapping
158
- CRAFTER_ACTIONS = {
159
- "noop": 0,
160
- "move_left": 1,
161
- "move_right": 2,
162
- "move_up": 3,
163
- "move_down": 4,
164
- "do": 5,
165
- "sleep": 6,
166
- "place_stone": 7,
167
- "place_table": 8,
168
- "place_furnace": 9,
169
- "place_plant": 10,
170
- "make_wood_pickaxe": 11,
171
- "make_stone_pickaxe": 12,
172
- "make_iron_pickaxe": 13,
173
- "make_wood_sword": 14,
174
- "make_stone_sword": 15,
175
- "make_iron_sword": 16,
176
- "eat_cow": 17,
177
- "eat_plant": 18,
178
- }
179
-
180
- # Create reverse mapping for validation
181
- INT_TO_ACTION_STRING = {v: k for k, v in CRAFTER_ACTIONS.items()}
182
-
183
-
184
- def compress_observation_for_trace(obs: dict[str, Any]) -> str:
185
- """Compress observation data for storage in traces."""
186
- try:
187
- return json.dumps(
188
- {
189
- "inv": {k: v for k, v in obs.get("inventory", {}).items() if v > 0},
190
- "nearby": obs.get("nearby", []),
191
- "hp": obs.get("status", {}).get("health", 0),
192
- "food": obs.get("status", {}).get("food", 0),
193
- "ach": sum(1 for v in obs.get("achievements_status", {}).values() if v),
194
- },
195
- separators=(",", ":"),
196
- )
197
- except Exception as e:
198
- return f'{{"error": "{str(e)}"}}'
199
-
200
-
201
- def create_message(
202
- content: str, message_type: str, system_id: str, turn: int
203
- ) -> SessionEventMarkovBlanketMessage:
204
- """Create a SessionEventMarkovBlanketMessage representing cross-boundary communication."""
205
- return SessionEventMarkovBlanketMessage(
206
- content=content,
207
- message_type=message_type,
208
- metadata={"system_id": system_id, "turn": turn},
209
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
210
- )
211
-
212
-
213
- async def run_episode(
214
- config: ExperimentConfig,
215
- model_name: str,
216
- episode_num: int,
217
- experiment_id: str,
218
- pbar: tqdm | None = None,
219
- ) -> dict[str, Any]:
220
- """Run a single episode with a specific model using v3 tracing."""
221
- # Create a new session tracer for this episode
222
- session_tracer = SessionTracer(hooks=CRAFTER_HOOKS, db_url=config.database_url)
223
-
224
- # Start session with metadata
225
- session_id = await session_tracer.start_session(
226
- metadata={
227
- "model": model_name,
228
- "episode": episode_num,
229
- "experiment_id": experiment_id,
230
- "difficulty": config.difficulty,
231
- }
232
- )
233
-
234
- # Started tracing session (output disabled for clean UI)
235
-
236
- # Store session in global bucket
237
- _SESSIONS[session_id] = (experiment_id, session_tracer)
238
-
239
- # Initialize LM with session tracer
240
- lm = LM(
241
- vendor="openai",
242
- model=model_name,
243
- temperature=0.1, # Low temperature for more consistent gameplay
244
- session_tracer=session_tracer,
245
- system_id=f"crafter_agent_{model_name}",
246
- enable_v3_tracing=True,
247
- )
248
-
249
- # Create HTTP client
250
- async with httpx.AsyncClient() as client:
251
- try:
252
- # Initialize environment with consecutive seed
253
- seed = (
254
- config.base_seed + episode_num
255
- ) # Base seed + episode number for consecutive seeds
256
- request_data = {"config": {"difficulty": config.difficulty, "seed": seed}}
257
- init_response = await retry_http_request(
258
- client,
259
- "POST",
260
- f"{config.crafter_service_url}/env/CrafterClassic/initialize",
261
- json=request_data,
262
- )
263
- init_data = init_response.json()
264
-
265
- # Debug the response format (removed for clean output)
266
-
267
- # Handle different possible response formats
268
- if "instance_id" in init_data:
269
- instance_id = init_data["instance_id"]
270
- elif "env_id" in init_data:
271
- instance_id = init_data["env_id"]
272
- elif "id" in init_data:
273
- instance_id = init_data["id"]
274
- else:
275
- # If none of the expected keys exist, print the response and raise a clear error
276
- print(f"❌ Unexpected response format from Crafter service: {init_data}")
277
- raise KeyError(
278
- f"Could not find environment ID in response. Available keys: {list(init_data.keys())}"
279
- )
280
-
281
- # Get initial observation (from initialize response)
282
- obs = init_data["observation"]
283
-
284
- prev_obs = obs
285
- done = False
286
- invalid_actions = 0
287
- total_actions = 0
288
- episode_start_time = time.time()
289
-
290
- for turn in range(config.max_turns):
291
- if done:
292
- break
293
-
294
- # Check episode timeout
295
- if time.time() - episode_start_time > config.episode_timeout:
296
- print(f" ⏰ Episode {episode_num} timed out after {config.episode_timeout}s")
297
- done = True
298
- break
299
-
300
- # Progress bar will be updated at end of turn
301
-
302
- set_turn_number(turn)
303
-
304
- # Start timestep for this turn
305
- await session_tracer.start_timestep(f"turn_{turn}")
306
-
307
- # Prepare context for the agent
308
- inventory_str = ", ".join(
309
- [f"{k}: {v}" for k, v in obs.get("inventory", {}).items() if v > 0]
310
- )
311
- if not inventory_str:
312
- inventory_str = "empty"
313
-
314
- nearby_str = ", ".join(obs.get("nearby", []))
315
- if not nearby_str:
316
- nearby_str = "nothing"
317
-
318
- status = obs.get("status", {})
319
- health = status.get("health", 0)
320
- hunger = status.get("food", 0)
321
-
322
- # Get more detailed game state
323
- position = obs.get("position", [0, 0])
324
- achievements = obs.get("achievements_status", {})
325
- unlocked = [name for name, status in achievements.items() if status]
326
- achievements_str = ", ".join(unlocked) if unlocked else "none"
327
-
328
- # Get semantic map if available
329
- semantic_map = obs.get("semantic_map", None)
330
- map_str = ""
331
- if semantic_map is not None:
332
- # Simple 5x5 view around player
333
- try:
334
- px, py = position
335
- view_size = 5
336
- half = view_size // 2
337
- map_lines = []
338
- for dy in range(-half, half + 1):
339
- row = []
340
- for dx in range(-half, half + 1):
341
- x, y = px + dx, py + dy
342
- if dx == 0 and dy == 0:
343
- row.append("@") # Player
344
- elif 0 <= x < len(semantic_map) and 0 <= y < len(semantic_map[0]):
345
- cell = semantic_map[x][y]
346
- # Map common items
347
- if cell == 0:
348
- row.append(".") # Empty/grass
349
- elif cell == 1:
350
- row.append("T") # Tree
351
- elif cell == 2:
352
- row.append("S") # Stone
353
- elif cell == 3:
354
- row.append("C") # Cow
355
- elif cell == 4:
356
- row.append("W") # Water
357
- else:
358
- row.append("?")
359
- else:
360
- row.append("#") # Out of bounds
361
- map_lines.append(" ".join(row))
362
- map_str = "\nMap (5x5 view, @ = you):\n" + "\n".join(map_lines)
363
- except Exception:
364
- map_str = "\nMap view unavailable"
365
-
366
- # Create agent prompt
367
- prompt = f"""Game State (Turn {turn}):
368
- - Position: {position}
369
- - Health: {health}/9
370
- - Hunger: {hunger}/9
371
- - Inventory: {inventory_str}
372
- - Nearby objects: {nearby_str}
373
- - Achievements unlocked: {achievements_str}
374
- {map_str}
375
-
376
- Choose your next actions based on what you see. Use the 'interact' tool with a list of action IDs.
377
-
378
- Tips:
379
- - Look at the map! T=tree (wood), S=stone, C=cow (food), W=water
380
- - To collect resources: move to them (actions 1-4) then use action 5 (do)
381
- - To craft: place table (8) first, then craft tools (11-16)
382
- - If hungry and see cow (C), move to it and eat (17)
383
-
384
- What actions do you want to take?"""
385
-
386
- # Send observation as message
387
- obs_msg = create_message(
388
- f"Observation: {compress_observation_for_trace(obs)}",
389
- "system",
390
- f"crafter_env_{instance_id}",
391
- turn,
392
- )
393
- await session_tracer.record_message(
394
- content=obs_msg.content,
395
- message_type=obs_msg.message_type,
396
- event_time=obs_msg.time_record.event_time,
397
- message_time=obs_msg.time_record.message_time,
398
- metadata=obs_msg.metadata,
399
- )
400
-
401
- # Get action from LM with tools (with timeout)
402
- try:
403
- # Define the interact tool for Crafter
404
- from pydantic import BaseModel, Field
405
- from synth_ai.lm.tools.base import BaseTool
406
-
407
- class InteractArgs(BaseModel):
408
- actions: list[int] = Field(..., description="List of action IDs to execute")
409
-
410
- interact_tool = BaseTool(
411
- name="interact",
412
- arguments=InteractArgs,
413
- description="Execute actions in the Crafter game",
414
- )
415
-
416
- # Create system message that explains available actions
417
- action_list = "\n".join(
418
- [f"{action_id}: {action}" for action, action_id in CRAFTER_ACTIONS.items()]
419
- )
420
- system_message = f"""You are an agent playing Crafter, a 2D survival game. Your goal is to survive and unlock achievements.
421
-
422
- You MUST use the 'interact' tool to execute actions. The tool takes a list of action IDs.
423
-
424
- Action ID mapping:
425
- {action_list}
426
-
427
- Strategy tips:
428
- - Start by collecting wood (move to trees and use action 5)
429
- - Place a crafting table (action 8) to unlock crafting recipes
430
- - Craft tools to collect resources more efficiently
431
- - Eat when hungry, sleep when tired
432
- - Explore to find different resources
433
-
434
- IMPORTANT: Always use the 'interact' tool with a list of action IDs. For example: interact(actions=[2, 2, 5]) to move right twice and collect."""
435
-
436
- # Get actions from LM using tools with timeout
437
- try:
438
- action_response = await asyncio.wait_for(
439
- lm.respond_async(
440
- system_message=system_message,
441
- user_message=prompt,
442
- tools=[interact_tool],
443
- turn_number=turn,
444
- ),
445
- timeout=config.turn_timeout,
446
- )
447
- except TimeoutError:
448
- print(
449
- f" ā° Turn {turn} timed out for episode {episode_num} after {config.turn_timeout}s"
450
- )
451
- action_response = None
452
- done = True
453
- break
454
-
455
- # Debug: print response (removed for clean output)
456
-
457
- # Extract tool calls from response
458
- if hasattr(action_response, "tool_calls") and action_response.tool_calls:
459
- tool_calls = action_response.tool_calls
460
-
461
- # Process each tool call
462
- for tool_call in tool_calls:
463
- if tool_call.get("function", {}).get("name") == "interact":
464
- # Extract actions from the tool call
465
- import json
466
-
467
- args = json.loads(
468
- tool_call.get("function", {}).get("arguments", "{}")
469
- )
470
- actions = args.get("actions", [])
471
-
472
- if not actions:
473
- # If no actions provided, use noop
474
- actions = [0]
475
-
476
- # Execute each action separately
477
- for i, action_id in enumerate(actions):
478
- # Capture BEFORE frame
479
- frame_before_b64 = None
480
- try:
481
- fr = await retry_http_request(
482
- client,
483
- "GET",
484
- f"{config.crafter_service_url}/env/CrafterClassic/frame",
485
- params={"env_id": instance_id},
486
- )
487
- if fr.status_code == 200:
488
- frame_before_b64 = fr.json().get("image_base64")
489
- except Exception:
490
- frame_before_b64 = None
491
- total_actions += 1
492
-
493
- # Validate action ID
494
- if action_id not in INT_TO_ACTION_STRING:
495
- # Invalid action logging removed for clean output
496
- action_id = 0
497
- invalid_actions += 1
498
-
499
- # Send action to Crafter service with timeout
500
- try:
501
- step_response = await asyncio.wait_for(
502
- retry_http_request(
503
- client,
504
- "POST",
505
- f"{config.crafter_service_url}/env/CrafterClassic/step",
506
- json={
507
- "env_id": instance_id,
508
- "action": {
509
- "tool_calls": [
510
- {
511
- "tool": "interact",
512
- "args": {"action": action_id},
513
- }
514
- ]
515
- },
516
- },
517
- ),
518
- timeout=5.0, # 5 second timeout for individual action
519
- )
520
- except TimeoutError:
521
- print(
522
- f" ā° Action execution timed out in episode {episode_num}"
523
- )
524
- done = True
525
- break
526
-
527
- if step_response.status_code != 200:
528
- print(
529
- f" āŒ Step failed: {step_response.status_code} - {step_response.text}"
530
- )
531
- done = True
532
- break
533
-
534
- step_data = step_response.json()
535
-
536
- # Extract data from response
537
- new_obs = step_data["observation"]
538
- reward = step_data["reward"]
539
- done = step_data["done"]
540
-
541
- # Record runtime event for action
542
- action_name = INT_TO_ACTION_STRING.get(action_id, "unknown")
543
- runtime_event = RuntimeEvent(
544
- system_instance_id=f"crafter_env_{instance_id}",
545
- time_record=TimeRecord(
546
- event_time=time.time(), message_time=turn
547
- ),
548
- actions=[action_id],
549
- metadata={
550
- "action_name": action_name,
551
- "valid": action_name != "noop" or invalid_actions == 0,
552
- },
553
- )
554
- await session_tracer.record_event(runtime_event)
555
-
556
- # Capture AFTER frame
557
- frame_after_b64 = None
558
- try:
559
- fr = await retry_http_request(
560
- client,
561
- "GET",
562
- f"{config.crafter_service_url}/env/CrafterClassic/frame",
563
- params={"env_id": instance_id},
564
- )
565
- if fr.status_code == 200:
566
- frame_after_b64 = fr.json().get("image_base64")
567
- except Exception:
568
- frame_after_b64 = None
569
-
570
- # Save frames to assets and compute URIs
571
- before_uri = None
572
- after_uri = None
573
- try:
574
- if frame_before_b64:
575
- import base64
576
- from pathlib import Path
577
-
578
- assets_dir = Path("traces/v3/assets") / session_id
579
- assets_dir.mkdir(parents=True, exist_ok=True)
580
- before_path = assets_dir / f"{turn}_{i}_before.png"
581
- with open(before_path, "wb") as f:
582
- f.write(base64.b64decode(frame_before_b64))
583
- before_uri = str(before_path)
584
- if frame_after_b64:
585
- import base64
586
- from pathlib import Path
587
-
588
- assets_dir = Path("traces/v3/assets") / session_id
589
- assets_dir.mkdir(parents=True, exist_ok=True)
590
- after_path = assets_dir / f"{turn}_{i}_after.png"
591
- with open(after_path, "wb") as f:
592
- f.write(base64.b64decode(frame_after_b64))
593
- after_uri = str(after_path)
594
- except Exception:
595
- before_uri = None
596
- after_uri = None
597
-
598
- # Record environment event with visuals
599
- env_event = EnvironmentEvent(
600
- system_instance_id=f"crafter_env_{instance_id}",
601
- time_record=TimeRecord(
602
- event_time=time.time(), message_time=turn
603
- ),
604
- reward=reward,
605
- terminated=done,
606
- system_state_before={
607
- "observation": prev_obs,
608
- **(
609
- {"visuals": {"frame_uri": before_uri}}
610
- if before_uri
611
- else {}
612
- ),
613
- },
614
- system_state_after={
615
- "observation": new_obs,
616
- "public_state": {
617
- "achievements_status": new_obs.get(
618
- "achievements_status", {}
619
- )
620
- },
621
- **(
622
- {"visuals": {"frame_uri": after_uri}}
623
- if after_uri
624
- else {}
625
- ),
626
- },
627
- )
628
- await session_tracer.record_event(env_event)
629
-
630
- # Update for next turn
631
- prev_obs = obs
632
- obs = new_obs
633
-
634
- if done:
635
- break
636
-
637
- # Per-episode progress updated once per turn (not per action)
638
- else:
639
- # No tool calls provided, use noop
640
- action_id = 0
641
- total_actions += 1
642
- invalid_actions += 1
643
-
644
- # Send noop action with timeout
645
- try:
646
- step_response = await asyncio.wait_for(
647
- retry_http_request(
648
- client,
649
- "POST",
650
- f"{config.crafter_service_url}/env/CrafterClassic/step",
651
- json={
652
- "env_id": instance_id,
653
- "action": {
654
- "tool_calls": [
655
- {"tool": "interact", "args": {"action": action_id}}
656
- ]
657
- },
658
- },
659
- ),
660
- timeout=5.0, # 5 second timeout
661
- )
662
- except TimeoutError:
663
- print(f" ā° Noop action timed out in episode {episode_num}")
664
- done = True
665
- break
666
-
667
- if step_response.status_code != 200:
668
- print(
669
- f" āŒ Step failed: {step_response.status_code} - {step_response.text}"
670
- )
671
- done = True
672
- else:
673
- step_data = step_response.json()
674
- new_obs = step_data["observation"]
675
- reward = step_data["reward"]
676
- done = step_data["done"]
677
-
678
- # Update observation
679
- prev_obs = obs
680
- obs = new_obs
681
-
682
- # End timestep
683
- await session_tracer.end_timestep(f"turn_{turn}")
684
- # Update per-episode progress bar once per turn
685
- if pbar is not None:
686
- current_achievements = sum(
687
- 1 for v in obs.get("achievements_status", {}).values() if v
688
- )
689
- pbar.set_postfix({"ach": current_achievements})
690
- pbar.update(1)
691
-
692
- except Exception as e:
693
- print(f" āŒ Environment step error: {e}")
694
- done = True
695
-
696
- # Progress bar updated per turn above
697
-
698
- # Calculate invalid action rate
699
- invalid_rate = invalid_actions / total_actions if total_actions > 0 else 0
700
-
701
- # Calculate achievements
702
- final_achievements = obs.get("achievements_status", {})
703
- total_achievements = sum(1 for v in final_achievements.values() if v)
704
-
705
- # Terminate environment
706
- try:
707
- await retry_http_request(
708
- client,
709
- "POST",
710
- f"{config.crafter_service_url}/env/CrafterClassic/terminate",
711
- json={"env_id": instance_id},
712
- )
713
- except Exception as e:
714
- print(f" āš ļø Failed to terminate environment: {e}")
715
-
716
- # End session
717
- await session_tracer.end_session(save=config.save_traces)
718
- # Close the tracer for this episode
719
- await session_tracer.close()
720
-
721
- return {
722
- "model": model_name,
723
- "episode": episode_num,
724
- "total_achievements": total_achievements,
725
- "achievements": final_achievements,
726
- "invalid_action_rate": invalid_rate,
727
- "total_actions": total_actions,
728
- "invalid_actions": invalid_actions,
729
- "session_id": session_id,
730
- }
731
-
732
- except Exception as e:
733
- print(f" āŒ Episode failed: {e}")
734
- import traceback
735
-
736
- traceback.print_exc()
737
-
738
- # End session even if failed
739
- await session_tracer.end_session(save=config.save_traces)
740
- # Close the tracer for this episode
741
- await session_tracer.close()
742
-
743
- return {
744
- "model": model_name,
745
- "episode": episode_num,
746
- "total_achievements": 0,
747
- "achievements": {},
748
- "invalid_action_rate": 1.0,
749
- "total_actions": 0,
750
- "invalid_actions": 0,
751
- "session_id": session_id,
752
- "error": str(e),
753
- }
754
-
755
-
756
async def run_model_experiment(
    config: ExperimentConfig, model_name: str, experiment_id: str, position_base: int = 0
) -> list[dict[str, Any]]:
    """Run all episodes for one model concurrently, each with its own stacked progress bar.

    Args:
        config: Experiment settings; this function reads ``num_episodes``,
            ``max_turns`` and ``concurrency``.
        model_name: Model identifier, forwarded to ``run_episode`` and used in bar labels.
        experiment_id: Experiment identifier, forwarded to ``run_episode``.
        position_base: Terminal row offset of the first progress bar; episode
            ``i`` is drawn at ``position_base + i``.

    Returns:
        One result dict per episode, in episode order.
    """
    episode_count = config.num_episodes

    # Create every bar up front so each episode owns a fixed terminal row.
    episode_bars = []
    for slot in range(episode_count):
        episode_bars.append(
            tqdm(
                total=config.max_turns,
                desc=f"{model_name} | ep{slot + 1}",
                unit="turn",
                leave=True,
                position=position_base + slot,
            )
        )

    try:
        # At most `concurrency` episodes (never fewer than one) hold the gate at once.
        gate = asyncio.Semaphore(max(1, int(config.concurrency)))

        async def _limited_run(ep_idx: int):
            async with gate:
                bar = episode_bars[ep_idx]
                try:
                    return await run_episode(config, model_name, ep_idx, experiment_id, bar)
                finally:
                    # Close the bar as soon as its episode finishes, success or not.
                    bar.close()

        # gather() returns results in the order the coroutines were passed in.
        results = await asyncio.gather(*(_limited_run(ep) for ep in range(episode_count)))

        # Show aggregate numbers on the last bar; episodes carrying an "error" key are excluded.
        ok_runs = [outcome for outcome in results if "error" not in outcome]
        if ok_runs and episode_bars:
            n_ok = len(ok_runs)
            avg_ach = sum(outcome["total_achievements"] for outcome in ok_runs) / n_ok
            avg_inv = sum(outcome["invalid_action_rate"] for outcome in ok_runs) / n_ok
            summary = {"avg_ach": f"{avg_ach:.1f}", "inv_rate": f"{avg_inv:.1%}"}
            with contextlib.suppress(Exception):
                episode_bars[-1].set_postfix(summary)

        return results
    finally:
        # Safety net: make sure no bar is left open, ignoring any error from closing.
        for bar in episode_bars:
            with contextlib.suppress(Exception):
                bar.close()
810
-
811
-
812
async def analyze_results(config: ExperimentConfig, all_results: dict[str, list[dict[str, Any]]]):
    """Print a cross-model comparison and export the results to a JSON file.

    Args:
        config: Experiment settings; this function reads ``database_url``,
            ``num_episodes``, ``max_turns`` and ``difficulty``.
        all_results: Mapping of model name to that model's episode result
            dicts. An episode dict containing an ``"error"`` key is treated
            as failed and excluded from the statistics.

    Side effects:
        Prints summary tables to stdout, queries model usage through
        ``AsyncSQLTraceManager``, and writes
        ``crafter_experiment_results_<timestamp>.json`` into the directory
        named by ``SYNTH_OUTPUT_DIR`` (default ``temp``).
    """
    import os
    from pathlib import Path

    print("\n📊 Analysis Results:")
    print("=" * 80)

    # Initialize database manager
    db_manager = AsyncSQLTraceManager(config.database_url)
    await db_manager.initialize()

    try:
        # Filter out failed episodes once per model; every section below reuses this.
        valid_by_model = {
            model: [r for r in results if "error" not in r]
            for model, results in all_results.items()
        }

        # Basic statistics by model (models without a successful episode are omitted).
        model_stats = {}
        for model, valid_results in valid_by_model.items():
            if not valid_results:
                continue
            achievements = [r["total_achievements"] for r in valid_results]
            invalid_rates = [r["invalid_action_rate"] for r in valid_results]
            model_stats[model] = {
                # float() keeps the exported statistics as plain Python floats.
                "avg_achievements": float(np.mean(achievements)),
                "std_achievements": float(np.std(achievements)),
                "max_achievements": max(achievements),
                "avg_invalid_rate": float(np.mean(invalid_rates)),
                "success_rate": len(valid_results) / len(all_results[model]),
            }

        # Print model comparison, best average first.
        print("\n📈 Model Performance Summary:")
        print(
            f"{'Model':<20} {'Avg Achievements':<18} {'Max Achievements':<18} {'Invalid Rate':<15} {'Success Rate':<15}"
        )
        print("-" * 86)

        for model, stats in sorted(
            model_stats.items(), key=lambda x: x[1]["avg_achievements"], reverse=True
        ):
            print(
                f"{model:<20} {stats['avg_achievements']:>6.2f} ± {stats['std_achievements']:>4.2f} "
                f"{stats['max_achievements']:>16} {stats['avg_invalid_rate']:>12.2%} {stats['success_rate']:>12.2%}"
            )

        # Achievement frequency analysis
        print("\n🏆 Achievement Frequencies:")
        achievement_counts = defaultdict(lambda: defaultdict(int))

        for model, valid_results in valid_by_model.items():
            for result in valid_results:
                for achievement, unlocked in result["achievements"].items():
                    if unlocked:
                        achievement_counts[model][achievement] += 1

        # Get all unique achievements
        all_achievements = set()
        for model_achievements in achievement_counts.values():
            all_achievements.update(model_achievements)

        # Print achievement table
        if all_achievements:
            sorted_models = sorted(all_results)
            # Number of successful episodes per model: the denominator of each cell.
            totals = {model: len(valid_by_model[model]) for model in sorted_models}

            # Each data cell (" ccc/ttt (ppp%)") is 15 characters wide, so the
            # header cells and the rule use the same width to keep columns aligned.
            print(
                f"\n{'Achievement':<25}"
                + "".join(f" {model[:8]:>14}" for model in sorted_models)
            )
            print("-" * (25 + 15 * len(sorted_models)))

            for achievement in sorted(all_achievements):
                line = f"{achievement:<25}"
                for model in sorted_models:
                    count = achievement_counts[model].get(achievement, 0)
                    total = totals[model]
                    pct = (count / total * 100) if total > 0 else 0
                    line += f" {count:>3}/{total:<3} ({pct:>3.0f}%)"
                print(line)

        # Query model usage from database - filter to only show models used in this experiment
        print("\n💰 Model Usage Statistics from Current Experiment:")
        model_usage_df = await db_manager.get_model_usage()

        if model_usage_df is not None and not model_usage_df.empty:
            experiment_models = set(all_results.keys())
            filtered_df = model_usage_df[model_usage_df["model_name"].isin(experiment_models)]

            if not filtered_df.empty:
                print(
                    f"{'Model':<20} {'Provider':<10} {'Usage Count':<12} {'Avg Latency (ms)':<18} {'Total Cost':<12}"
                )
                print("-" * 72)
                for _, usage_row in filtered_df.iterrows():
                    avg_latency = usage_row["avg_latency_ms"]
                    # A missing latency is shown as "N/A" in the same column width.
                    if pd.notna(avg_latency):
                        latency_cell = f"{avg_latency:<18.2f}"
                    else:
                        latency_cell = f"{'N/A':<18}"
                    print(
                        f"{usage_row['model_name']:<20} {usage_row['provider'] or 'N/A':<10} {usage_row['usage_count']:<12} "
                        f"{latency_cell} ${usage_row['total_cost_usd']:<11.4f}"
                    )

        # Export detailed results under a temp/ directory (git-ignored)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = Path(os.getenv("SYNTH_OUTPUT_DIR", "temp")).resolve()
        out_dir.mkdir(parents=True, exist_ok=True)
        results_path = out_dir / f"crafter_experiment_results_{timestamp}.json"

        report = {
            "config": {
                "num_episodes": config.num_episodes,
                "max_turns": config.max_turns,
                "difficulty": config.difficulty,
                "models": list(all_results.keys()),
            },
            "results": all_results,
            "statistics": model_stats,
            "timestamp": timestamp,
        }
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)

        print(f"\n💾 Detailed results saved to: {results_path}")

    finally:
        # Always release the database connection, even if analysis failed midway.
        await db_manager.close()
944
-
945
-
946
async def main():
    """Parse CLI options, run every requested model, then analyze the results.

    Steps:
        1. Build an ``ExperimentConfig`` from the command-line arguments.
        2. Check the Crafter service's ``/health`` endpoint and return early
           if it is unreachable or does not answer with HTTP 200.
        3. Run ``run_model_experiment`` for all models concurrently.
        4. Pass the collected results to ``analyze_results``.
    """
    parser = argparse.ArgumentParser(description="Run Crafter experiments with multiple models")
    parser.add_argument("--episodes", type=int, default=5, help="Number of episodes per model")
    parser.add_argument("--max-turns", type=int, default=100, help="Maximum turns per episode")
    parser.add_argument(
        "--difficulty", choices=["easy", "medium", "hard"], default="easy", help="Game difficulty"
    )
    parser.add_argument("--models", nargs="+", default=MODELS_TO_TEST, help="Models to test")
    parser.add_argument("--no-save", action="store_true", help="Don't save traces to database")
    parser.add_argument("--quiet", action="store_true", help="Reduce output verbosity")
    parser.add_argument("--db-url", default=DATABASE_URL, help="Database URL for tracing")
    parser.add_argument(
        "--concurrency", type=int, default=5, help="Max concurrent rollouts per model"
    )
    parser.add_argument(
        "--base-seed",
        type=int,
        default=1000,
        help="Base seed for episodes (episodes use base_seed+episode_num)",
    )
    parser.add_argument(
        "--turn-timeout", type=float, default=30.0, help="Timeout per turn in seconds"
    )
    parser.add_argument(
        "--episode-timeout", type=float, default=300.0, help="Total timeout per episode in seconds"
    )

    args = parser.parse_args()

    # Results are keyed by model name, so a repeated name would run the same
    # experiment twice and overwrite the first result set. Keep the first
    # occurrence of each name, preserving the order given on the command line.
    models = list(dict.fromkeys(args.models))

    # Create configuration
    config = ExperimentConfig()
    config.num_episodes = args.episodes
    config.max_turns = args.max_turns
    config.difficulty = args.difficulty
    config.save_traces = not args.no_save
    config.verbose = not args.quiet
    config.quiet = args.quiet
    config.database_url = args.db_url
    config.base_seed = args.base_seed
    config.turn_timeout = args.turn_timeout
    config.episode_timeout = args.episode_timeout
    config.concurrency = max(1, int(args.concurrency))

    # Generate experiment ID
    experiment_id = f"crafter_multi_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    print("🎮 Crafter Multi-Model Experiment")
    print("=" * 50)
    print(f"Experiment ID: {experiment_id}")
    print(f"Models: {', '.join(models)}")
    print(f"Episodes per model: {config.num_episodes}")
    print(f"Max turns per episode: {config.max_turns}")
    print(f"Difficulty: {config.difficulty}")
    print(f"Seeds: {config.base_seed} to {config.base_seed + config.num_episodes - 1}")
    print(f"Turn timeout: {config.turn_timeout}s")
    print(f"Episode timeout: {config.episode_timeout}s")
    print(f"Save traces: {config.save_traces}")
    print(f"Database URL: {config.database_url}")
    print("=" * 50)

    # Check Crafter service before starting any episode.
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{config.crafter_service_url}/health", timeout=5.0)
            if response.status_code != 200:
                print(f"❌ Crafter service not healthy at {config.crafter_service_url}")
                return
    except Exception as e:
        print(f"❌ Cannot connect to Crafter service at {config.crafter_service_url}: {e}")
        print("Please ensure the Crafter service is running.")
        return

    print("✅ Crafter service is running")

    # Run experiments for each model in parallel with stacked per-episode progress bars.
    # Each model gets its own band of terminal rows, with one spare row between bands.
    model_tasks = [
        run_model_experiment(
            config, model, experiment_id, position_base=idx * (config.num_episodes + 1)
        )
        for idx, model in enumerate(models)
    ]
    results_list = await asyncio.gather(*model_tasks)

    # gather() returns one result per task in submission order, so the lengths
    # always match; strict=True turns any future mismatch into an error.
    all_results = dict(zip(models, results_list, strict=True))

    # Analyze and compare results
    await analyze_results(config, all_results)

    print("\n✅ Experiment complete!")
1035
-
1036
-
1037
# Script entry point: build the main() coroutine and run it on a new event loop.
if __name__ == "__main__":
    entry = main()
    asyncio.run(entry)