synth-ai 0.2.9.dev7__py3-none-any.whl → 0.2.9.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic; see the registry's advisory page for this release for more details.

Files changed (327)
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +8 -11
  3. examples/qwen_coder/README.md +102 -0
  4. examples/qwen_coder/_shared.py +113 -0
  5. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  6. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  7. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  8. examples/qwen_coder/generate_dataset.py +98 -0
  9. examples/qwen_coder/infer_ft_smoke.py +64 -0
  10. examples/qwen_coder/infer_prod_proxy.py +73 -0
  11. examples/qwen_coder/infer_via_synth.py +87 -0
  12. examples/qwen_coder/scripts/infer_coder.sh +18 -0
  13. examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
  14. examples/qwen_coder/sft_full_17b.py +103 -0
  15. examples/qwen_coder/sft_lora_30b.py +110 -0
  16. examples/qwen_coder/subset_jsonl.py +38 -0
  17. examples/qwen_coder/validate_jsonl.py +59 -0
  18. examples/rl/run_eval.py +36 -37
  19. examples/rl/run_rl_and_save.py +5 -5
  20. examples/rl/task_app/math_single_step.py +65 -43
  21. examples/rl/task_app/math_task_app.py +3 -3
  22. examples/sft/README.md +139 -0
  23. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  24. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  25. examples/sft/evaluate.py +117 -0
  26. examples/sft/export_dataset.py +117 -0
  27. examples/sft/generate_traces.py +162 -0
  28. examples/swe/__init__.py +12 -0
  29. examples/swe/task_app/README.md +105 -0
  30. examples/swe/task_app/__init__.py +2 -0
  31. examples/swe/task_app/grpo_swe_mini.py +571 -0
  32. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  33. examples/swe/task_app/hosted/README.md +173 -0
  34. examples/swe/task_app/hosted/__init__.py +5 -0
  35. examples/swe/task_app/hosted/branching.py +143 -0
  36. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  37. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  38. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  39. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  40. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  41. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  42. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  43. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  44. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  45. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  46. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  47. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  50. examples/swe/task_app/hosted/hosted_app.py +204 -0
  51. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  52. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  53. examples/swe/task_app/hosted/main.py +100 -0
  54. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  55. examples/swe/task_app/hosted/registry.py +195 -0
  56. examples/swe/task_app/hosted/rollout.py +1869 -0
  57. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  58. examples/swe/task_app/hosted/storage/volume.py +211 -0
  59. examples/swe/task_app/hosted/test_agents.py +161 -0
  60. examples/swe/task_app/hosted/test_service.py +137 -0
  61. examples/swe/task_app/hosted/utils.py +62 -0
  62. examples/vlm/README.md +68 -0
  63. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  64. examples/vlm/crafter_image_only_agent.py +207 -0
  65. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  66. examples/vlm/filter_image_rows.py +63 -0
  67. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  68. examples/warming_up_to_rl/analyze_trace_db.py +5 -5
  69. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  70. examples/warming_up_to_rl/export_trace_sft.py +78 -21
  71. examples/warming_up_to_rl/groq_test.py +4 -4
  72. examples/warming_up_to_rl/manage_secrets.py +13 -18
  73. examples/warming_up_to_rl/run_eval.py +42 -44
  74. examples/warming_up_to_rl/run_fft_and_save.py +11 -16
  75. examples/warming_up_to_rl/run_local_rollout.py +1 -3
  76. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -4
  77. examples/warming_up_to_rl/run_local_rollout_parallel.py +1 -4
  78. examples/warming_up_to_rl/run_local_rollout_traced.py +3 -5
  79. examples/warming_up_to_rl/run_rl_and_save.py +5 -6
  80. examples/warming_up_to_rl/run_rollout_remote.py +8 -10
  81. examples/warming_up_to_rl/task_app/README.md +6 -2
  82. examples/warming_up_to_rl/task_app/grpo_crafter.py +234 -35
  83. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +2 -3
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +131 -114
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +101 -41
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +73 -51
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +14 -6
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +16 -16
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +32 -34
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +94 -31
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +303 -203
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +328 -225
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +13 -13
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +1 -0
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  101. synth/__init__.py +14 -0
  102. synth_ai/__init__.py +26 -4
  103. synth_ai/api/models/supported.py +376 -0
  104. synth_ai/api/train/builders.py +128 -21
  105. synth_ai/api/train/cli.py +80 -64
  106. synth_ai/api/train/config_finder.py +7 -2
  107. synth_ai/api/train/env_resolver.py +1 -1
  108. synth_ai/api/train/pollers.py +2 -1
  109. synth_ai/api/train/supported_algos.py +139 -0
  110. synth_ai/api/train/task_app.py +1 -2
  111. synth_ai/api/train/utils.py +13 -44
  112. synth_ai/cli/__init__.py +8 -0
  113. synth_ai/cli/_modal_wrapper.py +28 -0
  114. synth_ai/cli/_typer_patch.py +49 -0
  115. synth_ai/cli/balance.py +1 -2
  116. synth_ai/cli/calc.py +1 -1
  117. synth_ai/cli/demo.py +2 -1
  118. synth_ai/cli/recent.py +2 -2
  119. synth_ai/cli/rl_demo.py +2 -1
  120. synth_ai/cli/root.py +11 -13
  121. synth_ai/cli/status.py +2 -2
  122. synth_ai/cli/task_apps.py +529 -179
  123. synth_ai/cli/traces.py +6 -4
  124. synth_ai/cli/watch.py +12 -18
  125. synth_ai/demo_registry.py +1 -1
  126. synth_ai/demos/core/cli.py +36 -43
  127. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  128. synth_ai/demos/demo_task_apps/core.py +17 -25
  129. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +3 -4
  130. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  131. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -4
  132. synth_ai/demos/demo_task_apps/math/modal_task_app.py +16 -18
  133. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
  134. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  135. synth_ai/environments/reproducibility/tree.py +2 -5
  136. synth_ai/environments/service/app.py +11 -12
  137. synth_ai/environments/service/core_routes.py +4 -7
  138. synth_ai/environments/stateful/engine.py +1 -1
  139. synth_ai/environments/tasks/core.py +1 -0
  140. synth_ai/environments/tasks/filters.py +5 -6
  141. synth_ai/environments/tasks/utils.py +4 -5
  142. synth_ai/handshake.py +9 -9
  143. synth_ai/http.py +1 -1
  144. synth_ai/http_client.py +18 -10
  145. synth_ai/inference/client.py +15 -5
  146. synth_ai/jobs/client.py +78 -83
  147. synth_ai/learning/__init__.py +41 -6
  148. synth_ai/learning/algorithms.py +14 -0
  149. synth_ai/learning/client.py +91 -24
  150. synth_ai/learning/config.py +2 -38
  151. synth_ai/learning/ft_client.py +4 -59
  152. synth_ai/learning/health.py +5 -6
  153. synth_ai/learning/jobs.py +31 -47
  154. synth_ai/{rl → learning/rl}/__init__.py +14 -4
  155. synth_ai/learning/rl/client.py +267 -0
  156. synth_ai/learning/rl/config.py +31 -0
  157. synth_ai/{rl → learning/rl}/contracts.py +5 -8
  158. synth_ai/{rl → learning/rl}/env_keys.py +39 -15
  159. synth_ai/learning/rl/secrets.py +13 -0
  160. synth_ai/learning/rl_client.py +2 -281
  161. synth_ai/learning/sft/__init__.py +29 -0
  162. synth_ai/learning/sft/client.py +68 -0
  163. synth_ai/learning/sft/config.py +270 -0
  164. synth_ai/learning/sft/data.py +295 -0
  165. synth_ai/learning/sse.py +25 -24
  166. synth_ai/learning/validators.py +25 -28
  167. synth_ai/lm/__init__.py +21 -47
  168. synth_ai/main.py +4 -0
  169. synth_ai/task/__init__.py +25 -27
  170. synth_ai/task/apps/__init__.py +7 -8
  171. synth_ai/task/auth.py +8 -8
  172. synth_ai/task/client.py +14 -14
  173. synth_ai/task/contracts.py +36 -35
  174. synth_ai/task/datasets.py +6 -5
  175. synth_ai/task/errors.py +10 -10
  176. synth_ai/task/health.py +17 -9
  177. synth_ai/task/json.py +58 -23
  178. synth_ai/task/proxy.py +13 -9
  179. synth_ai/task/rubrics.py +16 -15
  180. synth_ai/task/server.py +12 -12
  181. synth_ai/task/tracing_utils.py +4 -4
  182. synth_ai/task/vendors.py +5 -6
  183. synth_ai/tracing_v3/__init__.py +2 -0
  184. synth_ai/tracing_v3/abstractions.py +21 -4
  185. synth_ai/tracing_v3/decorators.py +18 -16
  186. synth_ai/tracing_v3/hooks.py +5 -5
  187. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  188. synth_ai/tracing_v3/session_tracer.py +40 -14
  189. synth_ai/tracing_v3/storage/base.py +85 -0
  190. synth_ai/tracing_v3/storage/config.py +21 -8
  191. synth_ai/tracing_v3/storage/factory.py +10 -7
  192. synth_ai/tracing_v3/storage/utils.py +4 -2
  193. synth_ai/tracing_v3/turso/daemon.py +7 -2
  194. synth_ai/tracing_v3/turso/models.py +2 -2
  195. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  196. synth_ai/tracing_v3/utils.py +4 -4
  197. synth_ai/v0/api/__init__.py +8 -0
  198. synth_ai/v0/api/models/__init__.py +8 -0
  199. synth_ai/v0/api/models/supported.py +8 -0
  200. synth_ai/v0/config/__init__.py +15 -0
  201. synth_ai/v0/config/base_url.py +12 -0
  202. synth_ai/v0/lm/__init__.py +51 -0
  203. synth_ai/{lm → v0/lm}/caching/ephemeral.py +2 -2
  204. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  205. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  206. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  207. synth_ai/{lm → v0/lm}/config.py +6 -1
  208. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  209. synth_ai/{lm → v0/lm}/core/main.py +6 -6
  210. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  211. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -14
  212. synth_ai/{lm → v0/lm}/core/vendor_clients.py +2 -2
  213. synth_ai/{lm → v0/lm}/overrides.py +2 -2
  214. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  215. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  216. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  217. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  218. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +9 -9
  219. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  220. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  221. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +10 -10
  222. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +8 -8
  223. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +2 -2
  224. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +3 -3
  225. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  226. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  227. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  228. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  229. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  230. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  231. synth_ai/{lm → v0/lm}/vendors/synth_client.py +1 -1
  232. synth_ai/v0/tracing_v3/__init__.py +10 -0
  233. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  234. synth_ai/v0/tracing_v3/decorators.py +3 -0
  235. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  236. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  237. synth_ai-0.2.9.dev8.dist-info/METADATA +191 -0
  238. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/RECORD +268 -238
  239. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/top_level.txt +1 -0
  240. examples/common_old/backend.py +0 -20
  241. examples/evals_old/README.md +0 -98
  242. examples/evals_old/__init__.py +0 -6
  243. examples/evals_old/compare_models.py +0 -1038
  244. examples/evals_old/example_log.md +0 -145
  245. examples/evals_old/run_demo.sh +0 -126
  246. examples/evals_old/trace_analysis.py +0 -270
  247. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  248. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  249. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  250. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -243
  251. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  252. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  253. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  254. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  255. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  256. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -119
  257. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  258. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  259. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -243
  260. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  261. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  262. examples/finetuning_old/synth_qwen_v1/infer.py +0 -36
  263. examples/finetuning_old/synth_qwen_v1/poll.py +0 -46
  264. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  265. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  266. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1933
  267. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -210
  268. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -237
  269. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  270. examples/finetuning_old/synth_qwen_v1/util.py +0 -152
  271. examples/rl_old/task_app.py +0 -1131
  272. examples/warming_up_to_rl/old/event_rewards.md +0 -234
  273. examples/warming_up_to_rl/old/notes.md +0 -73
  274. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  275. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  276. synth_ai/experimental/synth_oss.py +0 -445
  277. synth_ai/learning/filtering.py +0 -0
  278. synth_ai/learning/offline/dpo.py +0 -0
  279. synth_ai/learning/offline/providers.py +0 -7
  280. synth_ai/learning/offline/sft.py +0 -0
  281. synth_ai/learning/offline/shared.py +0 -0
  282. synth_ai/learning/online/grpo.py +0 -0
  283. synth_ai/learning/online/irft.py +0 -0
  284. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  285. synth_ai/learning/prompts/gepa.py +0 -0
  286. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -211
  287. synth_ai/learning/prompts/mipro.py +0 -289
  288. synth_ai/learning/prompts/random_search.py +0 -249
  289. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  290. synth_ai/learning/prompts/run_random_search_banking77.py +0 -329
  291. synth_ai/rl/secrets.py +0 -19
  292. synth_ai/scripts/verify_rewards.py +0 -100
  293. synth_ai/tracing/__init__.py +0 -30
  294. synth_ai/tracing_v1/__init__.py +0 -33
  295. synth_ai/tracing_v3/turso/__init__.py +0 -25
  296. synth_ai/tracing_v3/turso/manager.py +0 -838
  297. synth_ai/zyk/__init__.py +0 -30
  298. synth_ai-0.2.9.dev7.dist-info/METADATA +0 -131
  299. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  300. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  301. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  302. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  303. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  304. /synth_ai/{lm → v0/lm}/core/exceptions.py +0 -0
  305. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  306. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  307. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  308. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  309. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  310. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  311. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  312. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  313. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  314. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  315. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  316. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  317. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  318. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  319. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  320. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  321. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  322. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  323. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  324. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  325. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/WHEEL +0 -0
  326. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/entry_points.txt +0 -0
  327. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/licenses/LICENSE +0 -0
@@ -1,1933 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Test script to run ReAct agents against Crafter environment using LM class with Synth backend.
4
- This demonstrates using the LM class with Synth models through native integration.
5
-
6
- This version uses the new tracing_v3 system with async Turso/SQLite backend.
7
- """
8
-
9
- import argparse
10
- import asyncio
11
- import contextlib
12
- from contextlib import asynccontextmanager
13
- import glob
14
- import itertools
15
- import json
16
- import logging
17
- import os
18
- import random
19
- import sys
20
- import time
21
- import uuid
22
- from collections import defaultdict
23
- from datetime import datetime
24
- from pathlib import Path
25
- from typing import Any
26
-
27
- import httpx
28
- import numpy as np
29
- import toml
30
- import yaml
31
- from httpx import AsyncClient
32
- from tqdm import tqdm
33
-
34
- from synth_ai.config.base_url import get_backend_from_env
35
-
36
-
37
- def _resolve_backend_default() -> str:
38
- base, _ = get_backend_from_env()
39
- base = base.rstrip("/")
40
- return base if base.endswith("/api") else f"{base}/api"
41
-
42
-
43
- # Disable httpx logging immediately
44
- logging.getLogger("httpx").setLevel(logging.ERROR)
45
- logging.getLogger("httpcore").setLevel(logging.ERROR)
46
-
47
-
48
- # Configure logging to suppress noisy third-party logs when in quiet mode
49
- def setup_logging(quiet_mode: bool = False):
50
- """Setup logging configuration."""
51
- if quiet_mode:
52
- # Suppress most third-party logging in quiet mode
53
- logging.getLogger("httpx").setLevel(logging.ERROR)
54
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.ERROR)
55
- logging.getLogger("synth_ai.tracing_v3.turso").setLevel(logging.ERROR)
56
- logging.getLogger("sqlalchemy").setLevel(logging.ERROR)
57
- logging.getLogger("aiosqlite").setLevel(logging.ERROR)
58
- # Suppress httpcore as well (used by httpx)
59
- logging.getLogger("httpcore").setLevel(logging.ERROR)
60
- else:
61
- # Normal logging levels
62
- logging.getLogger("httpx").setLevel(logging.ERROR) # Always suppress httpx logs
63
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.INFO)
64
-
65
-
66
- # Set default logging to avoid noisy logs during import
67
- setup_logging(quiet_mode=True)
68
-
69
- # Setup environment
70
- sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
71
-
72
- # Disable v1 logging to see v3 tracing clearly
73
- os.environ["LANGFUSE_ENABLED"] = "false"
74
- os.environ["SYNTH_LOGGING"] = "false"
75
-
76
- from synth_ai.lm.config import SynthConfig # noqa: E402
77
-
78
- # Import Synth warmup utilities
79
- from synth_ai.lm.warmup import warmup_synth_model # noqa: E402
80
-
81
- # Import session tracer for v3 tracing
82
- from synth_ai.tracing_v3 import SessionTracer # noqa: E402
83
- from synth_ai.tracing_v3.abstractions import ( # noqa: E402
84
- EnvironmentEvent,
85
- RuntimeEvent,
86
- SessionEventMarkovBlanketMessage,
87
- TimeRecord,
88
- )
89
-
90
- # Import Crafter hooks for v3
91
- from synth_ai.tracing_v3.hooks import HookManager # noqa: E402
92
- from synth_ai.tracing_v3.turso.daemon import SqldDaemon # noqa: E402
93
-
94
- # create_experiment_context will be defined as a helper function below
95
- from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager # noqa: E402
96
-
97
- # Create a custom hook manager without default print statements
98
- QUIET_HOOKS = HookManager()
99
-
100
- # Import LM components (v3 version if available)
101
- try:
102
- from synth_ai.lm.core.main_v3 import LM # noqa: E402
103
- except ImportError:
104
- from synth_ai.lm.core.main_v2 import LM # noqa: E402
105
-
106
- # Configuration constants
107
- HTTP_TIMEOUT = (
108
- 30.0 # Increased from 10.0 for better handling of concurrent load and LM response times
109
- )
110
- MAX_RETRIES = 3
111
- RETRY_DELAY = 1.0
112
-
113
-
114
- # Use the backend
115
- @asynccontextmanager
116
- async def _noop_async_context():
117
- yield
118
-
119
-
120
- async def create_experiment_context(
121
- db_manager: AsyncSQLTraceManager, experiment_name: str, description: str
122
- ) -> dict[str, Any]:
123
- """Create an experiment context for v3 tracing."""
124
- experiment_id = f"exp_{uuid.uuid4().hex[:12]}"
125
- await db_manager.create_experiment(
126
- experiment_id=experiment_id, name=experiment_name, description=description, configuration={}
127
- )
128
- return {
129
- "experiment_id": experiment_id,
130
- "experiment_name": experiment_name,
131
- "description": description,
132
- }
133
-
134
-
135
- def cleanup_old_files():
136
- """Clean up old trace files and result files to keep directory clean."""
137
- # Remove old JSON result files (keep only the latest 5)
138
- result_files = glob.glob("crafter_lm_synth_results_*.json")
139
- if len(result_files) > 5:
140
- # Sort by modification time and keep only the latest 5
141
- result_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
142
- for old_file in result_files[5:]:
143
- try:
144
- os.remove(old_file)
145
- print(f"šŸ—‘ļø Cleaned up old result file: {old_file}")
146
- except OSError:
147
- pass
148
-
149
-
150
- def _load_env_from_monorepo() -> dict:
151
- """Load environment variables from monorepo/.env.local if present."""
152
- env_file = (
153
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent / "monorepo/.env.local"
154
- )
155
- env_vars = {}
156
-
157
- if env_file.exists():
158
- with open(env_file) as f:
159
- for line in f:
160
- line = line.strip()
161
- if line and not line.startswith("#") and "=" in line:
162
- key, value = line.split("=", 1)
163
- # Remove quotes if present
164
- value = value.strip().strip('"').strip("'")
165
- env_vars[key] = value
166
-
167
- return env_vars
168
-
169
-
170
- def _load_testing_yaml_api_key() -> str | None:
171
- """Load SYNTH_API_KEY from monorepo/tests/prod/testing_info.yaml if present."""
172
- # First try the new env vars from monorepo/.env.local
173
- env_vars = _load_env_from_monorepo()
174
-
175
- # Try production key first, then test key
176
- if "SYNTH_API_KEY_PROD" in env_vars:
177
- return env_vars["SYNTH_API_KEY_PROD"]
178
- elif "SYNTH_API_KEY_TEST" in env_vars:
179
- return env_vars["SYNTH_API_KEY_TEST"]
180
-
181
- # Fallback to the old YAML method
182
- yaml_path = (
183
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent
184
- / "monorepo/tests/prod/testing_info.yaml"
185
- )
186
- if yaml_path.exists():
187
- with open(yaml_path) as f:
188
- data = yaml.safe_load(f)
189
- return data.get("SYNTH_API_KEY")
190
- return None
191
-
192
-
193
- def setup_synth_environment():
194
- """Setup environment variables for Synth/Modal endpoints.
195
-
196
- Resolution order for the base URL:
197
- 1. Explicit environment variables (SYNTH_BASE_URL or MODAL_BASE_URL)
198
- 2. PROD_API_URL env var used in production integration tests
199
- 3. Synth production default (PROD_BASE_URL_DEFAULT)
200
-
201
- The API key is resolved from the matching *_API_KEY env vars or, if not
202
- present, from the shared testing_info.yaml used by the prod tests.
203
- """
204
- # Load environment variables from monorepo/.env.local
205
- env_vars = _load_env_from_monorepo()
206
-
207
- synth_base_url = (
208
- os.getenv("SYNTH_BASE_URL")
209
- or os.getenv("MODAL_BASE_URL")
210
- or os.getenv("PROD_API_URL")
211
- or env_vars.get("SYNTH_BASE_URL_PROD") # Use production URL from .env.local
212
- or _resolve_backend_default()
213
- )
214
-
215
- synth_api_key = os.getenv("SYNTH_API_KEY") or _load_testing_yaml_api_key()
216
-
217
- # # --- Validate API key format ---
218
- # if synth_api_key:
219
- # VALID_PREFIXES = ("sk-", "sk_live_", "sk_test_")
220
- # if not any(synth_api_key.startswith(p) for p in VALID_PREFIXES):
221
- # truncated = synth_api_key[:8] if len(synth_api_key) >= 8 else synth_api_key
222
- # expected_formats = " or ".join(VALID_PREFIXES)
223
- # raise ValueError(
224
- # f"Invalid API key format. Expected prefix {expected_formats}. Provided key begins with '{truncated}'."
225
- # )
226
- # else:
227
- # raise ValueError(
228
- # "SYNTH_API_KEY or MODAL_API_KEY must be provided via environment variables or testing_info.yaml"
229
- # )
230
-
231
- # Ensure trailing /v1 for OpenAI-compatible endpoints
232
- if not synth_base_url.endswith("/v1"):
233
- synth_base_url = synth_base_url.rstrip("/") + "/v1"
234
- synth_base_url = synth_base_url.rstrip("/")
235
-
236
- # Propagate to OpenAI SDK env vars expected by LM class
237
- os.environ["OPENAI_API_BASE"] = synth_base_url
238
- os.environ["OPENAI_BASE_URL"] = synth_base_url
239
- os.environ["OPENAI_API_KEY"] = synth_api_key
240
-
241
- return synth_base_url, synth_api_key
242
-
243
-
244
- async def retry_http_request(client: AsyncClient, method: str, url: str, **kwargs) -> Any:
245
- """Retry HTTP requests with exponential backoff and jitter."""
246
- last_exception = None
247
-
248
- for attempt in range(MAX_RETRIES):
249
- try:
250
- if attempt > 0:
251
- delay = min(RETRY_DELAY * (2 ** (attempt - 1)), RETRY_DELAY * 2) # Use RETRY_DELAY
252
- jitter = random.uniform(0, 0.1 * delay)
253
- total_delay = delay + jitter
254
- await asyncio.sleep(total_delay)
255
-
256
- response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
257
-
258
- if response.status_code < 500:
259
- return response
260
-
261
- last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
262
-
263
- except httpx.ReadError as e:
264
- last_exception = e
265
- if attempt < MAX_RETRIES - 1:
266
- read_error_delay = min(1.0 * (2**attempt), 5.0)
267
- await asyncio.sleep(read_error_delay)
268
- except Exception as e:
269
- last_exception = e
270
-
271
- print(
272
- f" āŒ HTTP request failed after {MAX_RETRIES} attempts: {type(last_exception).__name__}: {str(last_exception)[:200]}"
273
- )
274
- raise last_exception
275
-
276
-
277
- def create_message(
278
- content: Any, message_type: str, origin_system_id: Any, turn: int
279
- ) -> SessionEventMarkovBlanketMessage:
280
- """Create a message with origin system ID embedded in content."""
281
- # Map custom message types to valid v3 message types
282
- type_mapping = {
283
- "observation": "system", # Map observation to system message
284
- "user": "user",
285
- "assistant": "assistant",
286
- "system": "system",
287
- "tool_use": "tool_use",
288
- "tool_result": "tool_result",
289
- }
290
-
291
- return SessionEventMarkovBlanketMessage(
292
- content=json.dumps({"origin_system_id": str(origin_system_id), "payload": content}),
293
- message_type=type_mapping.get(message_type, "system"), # Default to system
294
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
295
- )
296
-
297
-
298
- def compress_observation_for_trace(obs: dict[str, Any]) -> dict[str, Any]:
299
- """Compress observation for trace storage to avoid huge trace files."""
300
- compressed = obs.copy()
301
-
302
- # Compress semantic map if present
303
- if "semantic_map" in compressed:
304
- del compressed["semantic_map"]
305
-
306
- # Compress other large fields
307
- if "rgb" in compressed:
308
- del compressed["rgb"]
309
-
310
- return compressed
311
-
312
-
313
- def format_semantic_map_view_v2(obs: dict[str, Any], view_size: int = 7) -> str:
314
- """Format a semantic map view around the player with normal names using real Crafter mapping."""
315
- # Get semantic map
316
- semantic_map = obs.get("semantic_map")
317
- if semantic_map is None:
318
- return "No semantic map available"
319
-
320
- # Convert to numpy array if needed
321
- sem_arr = np.asarray(semantic_map)
322
- if sem_arr.ndim == 1:
323
- # Assuming square map, reshape
324
- size = int(np.sqrt(sem_arr.size))
325
- sem_arr = sem_arr.reshape(size, size)
326
-
327
- # Get player position
328
- player_pos = obs.get("player_position", [sem_arr.shape[0] // 2, sem_arr.shape[1] // 2])
329
- px, py = int(player_pos[0]), int(player_pos[1])
330
-
331
- # Get real crafter semantic mapping directly from crafter library
332
- import crafter
333
-
334
- dummyenv = crafter.Env()
335
- try:
336
- max_id = (
337
- max(max(dummyenv._world._mat_ids.values()), max(dummyenv._sem_view._obj_ids.values()))
338
- + 1
339
- )
340
- id_to_item = ["void"] * max_id
341
- for name, ind in itertools.chain(
342
- dummyenv._world._mat_ids.items(), dummyenv._sem_view._obj_ids.items()
343
- ):
344
- clean = (
345
- name.__name__
346
- if hasattr(name, "__name__")
347
- else (str(name) if name is not None else "none")
348
- )
349
- id_to_item[ind] = clean.lower()
350
- finally:
351
- with contextlib.suppress(AttributeError, Exception):
352
- dummyenv.close()
353
-
354
- # Create view
355
- half = view_size // 2
356
- lines = []
357
- visible_items = set()
358
-
359
- for dy in range(-half, half + 1):
360
- row = []
361
- for dx in range(-half, half + 1):
362
- x, y = px + dx, py + dy
363
-
364
- if dx == 0 and dy == 0:
365
- row.append("you") # Player
366
- elif 0 <= x < sem_arr.shape[0] and 0 <= y < sem_arr.shape[1]:
367
- val = int(sem_arr[x, y])
368
- # Use the real crafter mapping
369
- item_name = id_to_item[val] if val < len(id_to_item) else f"unknown_{val}"
370
- row.append(item_name)
371
- if item_name not in ["grass", "you", "void"]:
372
- visible_items.add(item_name)
373
- else:
374
- row.append("void") # Out of bounds
375
-
376
- lines.append(" ".join(row))
377
-
378
- # Add legend of visible items
379
- legend = (
380
- f"Visible items: {', '.join(sorted(visible_items))}"
381
- if visible_items
382
- else "No special items visible (mostly grass)"
383
- )
384
-
385
- return "\n".join(lines) + "\n" + legend
386
-
387
-
388
- def get_openai_tools():
389
- """Get OpenAI-compatible tool definitions for Synth models."""
390
- return [
391
- {
392
- "type": "function",
393
- "function": {
394
- "name": "interact",
395
- "description": "Perform actions in the Crafter environment.",
396
- "parameters": {
397
- "type": "object",
398
- "properties": {
399
- "actions": {
400
- "type": "array",
401
- "items": {"type": "string"},
402
- "description": "List of actions to perform in sequence (e.g., ['move_right', 'move_right', 'do']). Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop",
403
- },
404
- "reasoning": {
405
- "type": "string",
406
- "description": "Reasoning for these actions",
407
- },
408
- },
409
- "required": ["actions", "reasoning"],
410
- },
411
- },
412
- },
413
- {
414
- "type": "function",
415
- "function": {
416
- "name": "terminate",
417
- "description": "End the episode when finished or no progress can be made.",
418
- "parameters": {
419
- "type": "object",
420
- "properties": {
421
- "reason": {"type": "string", "description": "Reason for termination"}
422
- },
423
- "required": ["reason"],
424
- },
425
- },
426
- },
427
- ]
428
-
429
-
430
- # --- Configuration Class ---
431
- class CrafterConfig:
432
- """Configuration for Crafter evaluation with Synth backend."""
433
-
434
- def __init__(self, config_path: str | None = None):
435
- # Default values
436
- self.model_name: str | None = None
437
- self.num_instances = 1
438
- self.max_turns = 2
439
- self.difficulty = "easy"
440
- self.service_base_url = "http://localhost:8901"
441
- self.service_timeout = 30.0
442
- self.seed = 42
443
- self.save_traces = True
444
- self.save_detailed_results = True
445
- self.verbose = False
446
- self.quiet = False # Add quiet mode support
447
- self.analyze_traces = False
448
-
449
- # V3 tracing settings
450
- self.enable_v3_tracing = True
451
- # Standardize to a single shared v3 DB by default; allow env override
452
- self.v3_trace_dir = os.getenv("SYNTH_TRACES_ROOT", "./traces/v3")
453
- # Use shared DB path unless explicitly overridden via env or config
454
- self.turso_db_path = os.getenv(
455
- "SQLD_DB_PATH", os.path.join(self.v3_trace_dir, "synth_ai.db")
456
- )
457
- self.start_sqld_daemon = True # Whether to start sqld daemon
458
- self.auto_cleanup = True # Clean up old files automatically
459
-
460
- # Synth-specific settings
461
- self.warmup_model = True
462
- self.warmup_max_attempts = 30
463
- self.warmup_timeout = 60.0 # Default timeout in seconds
464
- self.use_synth_backend = True # Flag to indicate Synth backend
465
-
466
- # Load from TOML if provided
467
- if config_path and os.path.exists(config_path):
468
- self.load_from_toml(config_path)
469
-
470
- def load_from_toml(self, config_path: str):
471
- """Load configuration from TOML file."""
472
- config = toml.load(config_path)
473
-
474
- eval_config = config.get("eval", {})
475
- self.model_name = eval_config.get("model_name", self.model_name)
476
- self.num_instances = eval_config.get("episodes", self.num_instances)
477
- self.max_turns = eval_config.get("max_steps", self.max_turns)
478
- self.difficulty = eval_config.get("difficulty", self.difficulty)
479
- self.seed = eval_config.get("seed", self.seed)
480
-
481
- service_config = config.get("service", {})
482
- self.service_base_url = service_config.get("base_url", self.service_base_url)
483
- self.service_timeout = service_config.get("timeout", self.service_timeout)
484
-
485
- output_config = config.get("output", {})
486
- self.save_traces = output_config.get("save_traces", self.save_traces)
487
- self.save_detailed_results = output_config.get(
488
- "save_detailed_results", self.save_detailed_results
489
- )
490
-
491
- # V3 tracing config
492
- tracing_config = config.get("tracing_v3", {})
493
- self.enable_v3_tracing = tracing_config.get("enabled", self.enable_v3_tracing)
494
- self.v3_trace_dir = tracing_config.get("trace_dir", self.v3_trace_dir)
495
- self.turso_db_path = tracing_config.get("db_path", self.turso_db_path)
496
- self.start_sqld_daemon = tracing_config.get("start_daemon", self.start_sqld_daemon)
497
- self.auto_cleanup = tracing_config.get("auto_cleanup", self.auto_cleanup)
498
-
499
- # Synth config
500
- synth_config = config.get("synth", {})
501
- self.warmup_model = synth_config.get("warmup_model", self.warmup_model)
502
- self.warmup_max_attempts = synth_config.get("warmup_max_attempts", self.warmup_max_attempts)
503
- self.warmup_timeout = synth_config.get("warmup_timeout", self.warmup_timeout)
504
- self.use_synth_backend = synth_config.get("use_synth_backend", self.use_synth_backend)
505
-
506
-
507
- # --- Base ReAct Agent using LM with Synth ---
508
- class BaseReActAgentWithLMSynth:
509
- """Base ReAct agent using LM class configured for Synth backend."""
510
-
511
- def __init__(
512
- self,
513
- model_name: str,
514
- max_turns: int = 20,
515
- verbose: bool = False,
516
- tracer: SessionTracer | None = None,
517
- episode_id: int = 0,
518
- quiet: bool = False,
519
- model_params: dict[str, Any] | None = None,
520
- ):
521
- self.model_name = model_name
522
- self.max_turns = max_turns
523
- self.verbose = verbose
524
- self.quiet = quiet
525
- self.history = []
526
- self.system_name = "base-react-agent-lm-synth"
527
- self.tools = get_openai_tools()
528
- self.tracer = tracer
529
- self.system_id = f"{self.system_name}_{uuid.uuid4()}"
530
- self.episode_id = episode_id
531
-
532
- # Default model parameters
533
- default_model_params = {
534
- "temperature": 0.7,
535
- "max_tokens": 512,
536
- "top_p": 1.0,
537
- "frequency_penalty": 0.0,
538
- "presence_penalty": 0.0,
539
- "tool_choice": "auto",
540
- }
541
-
542
- # Merge user-provided parameters with defaults
543
- self.model_params = {**default_model_params, **(model_params or {})}
544
-
545
- # Setup Synth environment variables
546
- setup_synth_environment()
547
-
548
- # Create LM instance with synth provider and configurable parameters
549
- self.lm = LM(
550
- model_name=model_name,
551
- formatting_model_name=model_name,
552
- temperature=self.model_params["temperature"],
553
- synth_logging=False, # Disable v1 tracing
554
- provider="synth", # Use synth provider
555
- session_tracer=tracer,
556
- system_id=self.system_id,
557
- enable_v3_tracing=True,
558
- # Pass additional model parameters
559
- max_tokens=self.model_params["max_tokens"],
560
- top_p=self.model_params["top_p"],
561
- frequency_penalty=self.model_params["frequency_penalty"],
562
- presence_penalty=self.model_params["presence_penalty"],
563
- # Qwen3 think mode (propagated by vendor to chat_template_kwargs)
564
- enable_thinking=self.model_params.get("enable_thinking"),
565
- # Forward arbitrary extra_body to vendor for features like
566
- # stop_after_tool_calls. The runner sets this to 1.
567
- extra_body=self.model_params.get("extra_body"),
568
- )
569
-
570
- # Agent state tracking
571
- self.agent_state = {
572
- "message_history": [],
573
- "steps_taken": 0,
574
- "steps_remaining": max_turns,
575
- "total_tokens_used": 0,
576
- "tool_calls_made": 0,
577
- "current_turn": 0,
578
- "last_failure": None, # Track last failure for prompting
579
- "recent_tool_calls": [],
580
- }
581
-
582
- async def decide(self, obs: str, system_message: str, turn: int) -> dict[str, Any]:
583
- """Get agent decision based on observation using LM class with Synth backend."""
584
- # Update agent state
585
- self.agent_state["current_turn"] = turn
586
- self.agent_state["steps_taken"] = turn
587
- self.agent_state["steps_remaining"] = self.max_turns - turn
588
-
589
- # Include last 3 tool calls (reasoning and actions) to provide short action history
590
- recent_calls = self.agent_state.get("recent_tool_calls", [])
591
- recent_tail = recent_calls[-3:] if isinstance(recent_calls, list) else []
592
- if recent_tail:
593
- lines = ["\nRecent tool calls (last 3):"]
594
- for entry in recent_tail:
595
- tnum = entry.get("turn")
596
- name = entry.get("name")
597
- reasoning = entry.get("reasoning")
598
- actions = entry.get("actions")
599
- actions_str = ", ".join(actions) if isinstance(actions, list) else ""
600
- lines.append(
601
- f"- Turn {tnum}: {name} — reasoning: {reasoning}; actions: {actions_str}"
602
- )
603
- obs_with_history = f"{obs}\n" + "\n".join(lines)
604
- else:
605
- obs_with_history = obs
606
-
607
- # Create conversation context with unique episode ID to prevent caching
608
- context = (
609
- f"Episode {self.episode_id} - Turn {turn + 1}/{self.max_turns}\n\n{obs_with_history}"
610
- )
611
-
612
- # Build messages in OpenAI format for tools
613
- # Augment the system message if the previous turn failed to produce a tool call
614
- local_system_message = system_message
615
- last_failure = self.agent_state.get("last_failure")
616
- if last_failure:
617
- local_system_message = (
618
- f"{system_message}\n\nIMPORTANT: In the previous turn, no valid tool call was returned. "
619
- f"Error: {last_failure}. You MUST respond with a single function tool call in the OpenAI tools format."
620
- )
621
- messages = [
622
- {"role": "system", "content": local_system_message},
623
- {"role": "user", "content": context},
624
- ]
625
-
626
- # Add to message history
627
- self.agent_state["message_history"].extend(messages)
628
-
629
- # Truncate history if too long
630
- max_history_length = 20
631
- if len(self.agent_state["message_history"]) > max_history_length:
632
- self.agent_state["message_history"] = [
633
- self.agent_state["message_history"][0]
634
- ] + self.agent_state["message_history"][-(max_history_length - 1) :]
635
-
636
- try:
637
- llm_start = time.time()
638
-
639
- # Optionally print full prompt on final turn when verbose
640
- if self.verbose and turn == self.max_turns - 1:
641
- print("\nšŸ” FINAL TURN PROMPT:")
642
- print("=" * 80)
643
- print(f"System: {local_system_message[:200]}...")
644
- print(f"\nUser message:\n{context}")
645
- print("=" * 80)
646
-
647
- # Debug: Print request info only when verbose
648
- if self.verbose:
649
- print(f"\nšŸ” DEBUG: LM call details (turn {turn})")
650
- print(f" Model: {self.model_name}")
651
- print(" Provider: synth")
652
- print(f" Messages: {len(messages)} messages")
653
- print(f" Tools: {len(self.tools) if self.tools else 0} tools")
654
- if self.tools:
655
- print(
656
- f" Tool 0 name: {self.tools[0].get('function', {}).get('name', 'unknown')}"
657
- )
658
- print(f" Tools structure: {json.dumps(self.tools[0], indent=4)[:300]}...")
659
-
660
- # Call LM with turn number for v3 tracing
661
- # The LM class should handle Synth routing internally
662
- if self.verbose:
663
- print(
664
- f"šŸ” DEBUG: LM sampling params => max_tokens={self.model_params.get('max_tokens')} temp={self.model_params.get('temperature')} top_p={self.model_params.get('top_p')} tool_choice={self.model_params.get('tool_choice')}"
665
- )
666
-
667
- # Optional full input logging (system, user, tools). Enable with CRAFTER_LOG_FULL_INPUTS=1
668
- _log_full_inputs = os.getenv("CRAFTER_LOG_FULL_INPUTS", "0").lower() in (
669
- "1",
670
- "true",
671
- "yes",
672
- "on",
673
- )
674
- # if _log_full_inputs:
675
- # print("\n" + "=" * 80)
676
- # print(f"FULL LM INPUT (turn {turn})")
677
- # print("-" * 80)
678
- # print("System message:\n" + local_system_message)
679
- # print("\nUser message:\n" + context)
680
- # print("\nMessages JSON:")
681
- # print(json.dumps(messages, indent=2))
682
- # print("\nTools definition:")
683
- # print(json.dumps(self.tools, indent=2))
684
- # print("\nSampling/tool params:")
685
- # print(
686
- # json.dumps(
687
- # {
688
- # "tool_choice": self.model_params.get("tool_choice"),
689
- # "extra_body": self.model_params.get("extra_body"),
690
- # "temperature": self.model_params.get("temperature"),
691
- # "max_tokens": self.model_params.get("max_tokens"),
692
- # "top_p": self.model_params.get("top_p"),
693
- # "frequency_penalty": self.model_params.get("frequency_penalty"),
694
- # "presence_penalty": self.model_params.get("presence_penalty"),
695
- # },
696
- # indent=2,
697
- # )
698
- # )
699
- # print("=" * 80)
700
-
701
- response = await self.lm.respond_async(
702
- messages=messages,
703
- turn_number=turn,
704
- # Pass tools in the format expected by LM class
705
- tools=self.tools,
706
- max_tokens=self.model_params["max_tokens"],
707
- tool_choice=self.model_params.get("tool_choice", "auto"),
708
- # Pass extra_body per call to ensure backend receives stop_after_tool_calls
709
- extra_body=self.model_params.get("extra_body"),
710
- )
711
-
712
- llm_end = time.time()
713
-
714
- # Minimal output: show only tool_call presence, number of actions, and tokens
715
- completion_tokens = None
716
- prompt_tokens = None
717
- toks_per_sec = None
718
- if hasattr(response, "usage") and isinstance(response.usage, dict):
719
- completion_tokens = response.usage.get("completion_tokens")
720
- prompt_tokens = response.usage.get("prompt_tokens")
721
- # Compute tokens/sec if we have duration and completion tokens
722
- try:
723
- if completion_tokens is not None:
724
- duration_s = max(1e-6, (llm_end - llm_start))
725
- toks_per_sec = round(float(completion_tokens) / duration_s, 2)
726
- except Exception:
727
- toks_per_sec = None
728
-
729
- # Parse the response to extract tool calls
730
- raw_response = response.raw_response
731
- decision: dict[str, Any]
732
-
733
- if hasattr(response, "tool_calls") and response.tool_calls:
734
- tool_call = response.tool_calls[0]
735
- parsed_decision = None
736
- fn = tool_call.get("function") if isinstance(tool_call, dict) else None
737
- if isinstance(fn, dict) and ("name" in fn):
738
- name = fn.get("name", "interact")
739
- args_raw = fn.get("arguments", "{}")
740
- try:
741
- import json as _json
742
-
743
- args = (
744
- _json.loads(args_raw) if isinstance(args_raw, str) else (args_raw or {})
745
- )
746
- if isinstance(args, dict):
747
- parsed_decision = {"name": name, "parameters": args}
748
- except Exception as _e:
749
- parsed_decision = {"name": name, "parameters": {"arguments": args_raw}}
750
- if (
751
- not parsed_decision
752
- and isinstance(tool_call, dict)
753
- and ("name" in tool_call or "parameters" in tool_call)
754
- ):
755
- parsed_decision = {
756
- "name": tool_call.get("name", "interact"),
757
- "parameters": tool_call.get("parameters", {}),
758
- }
759
- if parsed_decision:
760
- decision = parsed_decision
761
- try:
762
- pname = decision.get("name")
763
- pparams = (
764
- decision.get("parameters", {}) if isinstance(decision, dict) else {}
765
- )
766
- preason = pparams.get("reasoning") if isinstance(pparams, dict) else None
767
- pacts = pparams.get("actions") if isinstance(pparams, dict) else None
768
- entry = {
769
- "turn": turn,
770
- "name": pname,
771
- "reasoning": preason,
772
- "actions": pacts if isinstance(pacts, list) else [],
773
- }
774
- self.agent_state["recent_tool_calls"].append(entry)
775
- if len(self.agent_state["recent_tool_calls"]) > 10:
776
- self.agent_state["recent_tool_calls"] = self.agent_state[
777
- "recent_tool_calls"
778
- ][-10:]
779
- except Exception:
780
- pass
781
- # Clear failure flag on success
782
- if self.agent_state.get("last_failure"):
783
- self.agent_state["last_failure"] = None
784
- params = decision.get("parameters", {}) if isinstance(decision, dict) else {}
785
- actions = params.get("actions", []) if isinstance(params, dict) else []
786
- num_actions = len(actions) if isinstance(actions, list) else 0
787
- # Store metrics for tqdm postfix update in run_episode
788
- self.agent_state["last_metrics"] = {
789
- "tc": 1,
790
- "act": num_actions,
791
- "tok": completion_tokens,
792
- "in": prompt_tokens,
793
- "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
794
- }
795
- else:
796
- # Unrecognized tool_calls structure: do nothing, record failure
797
- failure_msg = "Unrecognized tool_calls structure"
798
- self.agent_state["last_failure"] = failure_msg
799
- decision = {
800
- "name": "interact",
801
- "parameters": {"actions": [], "reasoning": failure_msg},
802
- }
803
- if self.verbose:
804
- print(f"šŸ” DEBUG: {failure_msg}")
805
- else:
806
- # No tool calls: do nothing, record failure for next prompt
807
- failure_msg = "No valid tool_calls in assistant message"
808
- self.agent_state["last_failure"] = failure_msg
809
- decision = {
810
- "name": "interact",
811
- "parameters": {"actions": [], "reasoning": failure_msg},
812
- }
813
- # Store metrics for tqdm postfix update in run_episode
814
- self.agent_state["last_metrics"] = {
815
- "tc": 0,
816
- "act": 0,
817
- "tok": completion_tokens,
818
- "in": prompt_tokens,
819
- "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
820
- }
821
-
822
- # Update agent state
823
- self.agent_state["tool_calls_made"] += 1
824
-
825
- # Add assistant response to history
826
- assistant_message = {"role": "assistant", "content": raw_response}
827
- self.agent_state["message_history"].append(assistant_message)
828
-
829
- if self.verbose:
830
- print(f"šŸ¤– LM Response (turn {turn}): {json.dumps(decision, indent=2)}")
831
- print(f"šŸ“Š Response time: {llm_end - llm_start:.2f}s")
832
- except Exception as e:
833
- print(f"āŒ Error in LM decide: {e}")
834
- import traceback
835
-
836
- traceback.print_exc()
837
- # Record failure and do nothing this turn
838
- failure_msg = f"Exception during decide: {str(e)}"
839
- self.agent_state["last_failure"] = failure_msg
840
- decision = {"name": "interact", "parameters": {"actions": [], "reasoning": failure_msg}}
841
-
842
- return decision
843
-
844
- def _parse_tool_response(self, raw_response: str) -> dict[str, Any]:
845
- """Parse raw LM response to extract tool calls."""
846
- # Try to parse JSON if present
847
- try:
848
- # Look for JSON in the response
849
- import re
850
-
851
- json_match = re.search(r"\{.*\}", raw_response, re.DOTALL)
852
- if json_match:
853
- data = json.loads(json_match.group())
854
- if "name" in data:
855
- return data
856
- elif "function" in data:
857
- return {
858
- "name": data["function"].get("name", "interact"),
859
- "parameters": data["function"].get("arguments", {}),
860
- }
861
- except Exception:
862
- pass
863
-
864
- # Fallback to text parsing
865
- if "terminate" in raw_response.lower():
866
- return {"name": "terminate", "parameters": {"reason": "Agent decided to terminate"}}
867
-
868
- # Try to extract actions from the response
869
- actions = []
870
- action_keywords = [
871
- "move_up",
872
- "move_down",
873
- "move_left",
874
- "move_right",
875
- "do",
876
- "sleep",
877
- "place_stone",
878
- "place_table",
879
- "place_furnace",
880
- "place_plant",
881
- "make_wood_pickaxe",
882
- "make_stone_pickaxe",
883
- "make_iron_pickaxe",
884
- "make_wood_sword",
885
- "make_stone_sword",
886
- "make_iron_sword",
887
- ]
888
-
889
- for keyword in action_keywords:
890
- if keyword in raw_response.lower():
891
- actions.append(keyword)
892
-
893
- if not actions:
894
- actions = ["do"] # Default action
895
-
896
- return {
897
- "name": "interact",
898
- "parameters": {
899
- "actions": actions, # Return as array of actions
900
- "reasoning": "Parsed from response",
901
- },
902
- }
903
-
904
- def get_system_message(self) -> str:
905
- """Return system message for agent. Override in subclasses."""
906
- return """You are an AI agent playing Crafter. Use the available tools to interact with the environment.
907
-
908
- CRITICAL RULE: You MUST provide MULTIPLE actions (2-5) in EVERY interact() tool call!
909
-
910
- The 'interact' function accepts a LIST of 1-5 actions. ALWAYS provide 2-5 actions for efficiency.
911
-
912
- GOOD Examples (what you SHOULD do):
913
- āœ“ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
914
- āœ“ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate to stone and mine it")
915
- āœ“ interact(actions=["place_table", "make_wood_pickaxe", "move_left"], reasoning="Craft and continue exploring")
916
-
917
- BAD Examples (what you should AVOID):
918
- āœ— interact(actions=["move_right"], reasoning="Move right") - TOO FEW ACTIONS!
919
- āœ— interact(actions=["do"], reasoning="Collect") - TOO FEW ACTIONS!
920
-
921
- REMEMBER: Single actions waste time. Always plan 2-5 actions ahead and execute them together!"""
922
-
923
- def format_observation(self, obs: dict[str, Any]) -> str:
924
- """Format observation for agent. Override in subclasses."""
925
- return str(obs)
926
-
927
-
928
- # --- Crafter-specific ReAct Agent ---
929
- class CrafterReActAgentWithLMSynth(BaseReActAgentWithLMSynth):
930
- """Crafter-specific ReAct agent with enhanced prompting for Synth models."""
931
-
932
- def get_system_message(self) -> str:
933
- """Return Crafter-specific system message optimized for Synth models."""
934
- override = os.getenv("CRAFTER_SYSTEM_PROMPT")
935
- if override:
936
- return override
937
- return """You are CrafterAgent playing Crafter survival environment. Your goal is to unlock as many achievements as possible while staying alive.
938
-
939
- You will see a semantic map view showing your surroundings. Use this to navigate toward resources.
940
-
941
- Key mechanics:
942
- • 'do' action: collect wood from trees, stone from deposits, food from cows/plants
943
- • 'do' does nothing on grass/water - move to find resources first
944
- • Craft progression: wood → table → wood_pickaxe → stone → stone_pickaxe → iron tools
945
- • Sleep when energy low to restore and unlock wake_up achievement
946
- • Use semantic map view to navigate toward resources you can see
947
-
948
- Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop
949
-
950
- KEY ACHIEVEMENTS TO UNLOCK:
951
- Basic Resource Collection (PRIORITY #1):
952
- - collect_wood: Move NEXT TO a tree, then use action="do" to collect wood
953
- - collect_stone: Move NEXT TO stone, then use action="do" (requires wood_pickaxe in inventory)
954
- - collect_coal: Move NEXT TO coal, then use action="do" (requires stone_pickaxe)
955
- - collect_iron: Move NEXT TO iron, then use action="do" (requires stone_pickaxe)
956
- - collect_diamond: Move NEXT TO diamond, then use action="do" (requires iron_pickaxe)
957
-
958
- Tool Crafting (enables resource collection):
959
- - make_wood_pickaxe: Use action="make_wood_pickaxe" when you have wood (unlocks ability to mine stone)
960
- - make_stone_pickaxe: Use action="make_stone_pickaxe" when you have wood and stone (unlocks coal/iron mining)
961
- - make_iron_pickaxe: Use action="make_iron_pickaxe" when you have wood, coal, and iron (unlocks diamond mining)
962
-
963
- Weapon Crafting (for defense):
964
- - make_wood_sword: Use action="make_wood_sword" when you have wood
965
- - make_stone_sword: Use action="make_stone_sword" when you have wood and stone
966
- - make_iron_sword: Use action="make_iron_sword" when you have wood, coal, and iron
967
-
968
- Survival Actions:
969
- - eat_plant: Use action="eat_plant" when food < 9 and you see a plant nearby
970
- - eat_cow: Move NEXT TO cow, use action="do" to kill it, then action="eat_cow"
971
- - collect_drink: Move NEXT TO water, then use action="drink" when drink < 9
972
- - sleep: Use action="sleep" when energy < 5 (restores energy to 9)
973
-
974
- Building/Placing:
975
- - place_table: Use action="place_table" when you have wood (enables advanced crafting)
976
- - place_furnace: Use action="place_furnace" when you have stone (for smelting)
977
- - place_plant: Use action="place_plant" when you have sapling (grows into tree)
978
- - place_stone: Use action="place_stone" when you have stone (creates barrier)
979
-
980
- Combat:
981
- - defeat_zombie: Move NEXT TO zombie, then use action="do" repeatedly to attack
982
- - defeat_skeleton: Move NEXT TO skeleton, then use action="do" repeatedly to attack
983
-
984
- CRITICAL: The action="do" is your INTERACTION button! Use it when adjacent to:
985
- - Trees → get wood
986
- - Stone/Coal/Iron/Diamond → mine resources (need appropriate pickaxe)
987
- - Enemies → attack them
988
- - Cows → kill for food
989
-
990
- Simple Strategy:
991
- 1. Look for resources (trees, stones) in the semantic map
992
- 2. Move toward the nearest resource
993
- 3. When adjacent to a resource, use action="do" to collect it
994
- 4. If you have wood, try action="make_wood_pickaxe"
995
- 5. Repeat: find resources, move to them, use "do"
996
-
997
- Critical Gameplay Tips:
998
- - You must be ADJACENT (one tile away) to objects to interact with them
999
- - Use "do" when next to: trees (for wood), stone (for stone), coal, iron, diamond
1000
- - Use "do" to attack zombies/skeletons when adjacent
1001
- - First priority: Find a tree, move next to it, then use "do" to collect wood
1002
- - Wood is essential for crafting your first pickaxe
1003
- - With wood_pickaxe you can mine stone, with stone_pickaxe you can mine iron, etc.
1004
-
1005
- CRITICAL INSTRUCTION: You MUST ALWAYS provide MULTIPLE actions (2-5) in EVERY interact() tool call!
1006
-
1007
- The 'interact' function accepts a LIST of 1-5 actions. NEVER use single actions - always plan 2-5 actions ahead!
1008
-
1009
- MANDATORY action sequences (ALWAYS use multiple):
1010
- āœ“ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
1011
- āœ“ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate and collect")
1012
- āœ“ interact(actions=["place_table", "make_wood_pickaxe", "move_left", "move_left"], reasoning="Craft and explore")
1013
- āœ“ interact(actions=["do", "move_right", "do", "move_right", "do"], reasoning="Collect multiple resources")
1014
-
1015
- FORBIDDEN (NEVER do this):
1016
- āœ— interact(actions=["move_right"], ...) - WRONG! Too few actions!
1017
- āœ— interact(actions=["do"], ...) - WRONG! Too few actions!
1018
-
1019
- RULE: If you use less than 2 actions, you are playing inefficiently. Always think 2-5 steps ahead!
1020
-
1021
- Key Strategy:
1022
- 1. Plan a sequence of moves to reach resources
1023
- 2. Execute multiple moves in one tool call (e.g., ["move_right", "move_right", "move_up"])
1024
- 3. When adjacent to a resource, use "do" to collect it
1025
- 4. Chain crafting actions together (e.g., ["place_table", "make_wood_pickaxe"])
1026
-
1027
- Remember:
1028
- - Use "do" when ADJACENT to trees (for wood), stones, or other resources
1029
- - Collect wood FIRST before trying to craft anything
1030
- - Be efficient - use multiple actions per tool call!
1031
- - Focus on unlocking achievements by collecting resources and crafting items."""
1032
-
1033
- def format_observation(self, obs: dict[str, Any]) -> str:
1034
- """Format Crafter observation with semantic map view."""
1035
- # Get semantic map view
1036
- semantic_view = format_semantic_map_view_v2(obs, view_size=7)
1037
-
1038
- # Extract key information
1039
- inventory = obs.get("inventory", {})
1040
- # Try both possible keys for achievements
1041
- achievements = obs.get("achievements_status", obs.get("achievements_info", {}))
1042
- health = obs.get("health", 10)
1043
- food = obs.get("food", 10)
1044
- drink = obs.get("drink", 10)
1045
- energy = obs.get("energy", 10)
1046
-
1047
- # Count achievements
1048
- achieved = sum(1 for v in achievements.values() if v)
1049
- total_achievements = len(achievements)
1050
-
1051
- # Format inventory (only show non-zero items)
1052
- inv_items = []
1053
- for item, count in inventory.items():
1054
- if count > 0:
1055
- inv_items.append(f"{item}: {count}")
1056
- inv_str = ", ".join(inv_items) if inv_items else "empty"
1057
-
1058
- # List unlocked achievements
1059
- unlocked = [k for k, v in achievements.items() if v]
1060
- unlocked_str = ", ".join(unlocked) if unlocked else "none"
1061
-
1062
- # Recent achievements (from info if available)
1063
- recent_str = ""
1064
-
1065
- suppress_reminder = os.getenv("CRAFTER_SUPPRESS_OBS_REMINDER")
1066
- base = (
1067
- f"=== SEMANTIC MAP VIEW (7x7) ===\n"
1068
- f"{semantic_view}\n\n"
1069
- f"=== STATUS ===\n"
1070
- f"Health: {health}/10 | Food: {food}/10 | Drink: {drink}/10 | Energy: {energy}/10\n"
1071
- f"Inventory: {inv_str}\n"
1072
- f"Achievements: {achieved}/{total_achievements} unlocked\n"
1073
- f"Unlocked: {unlocked_str}\n"
1074
- f"{recent_str}\n\n"
1075
- # f"What do you see in the map? What actions should you take? "
1076
- )
1077
- if suppress_reminder:
1078
- return base
1079
- return (
1080
- base
1081
- # + "\n\nREMINDER: You MUST provide 2-5 actions in your interact() tool call. Plan multiple steps ahead!\n"
1082
- # + 'Example: interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")'
1083
- )
1084
-
1085
-
1086
- async def run_episode(
1087
- episode_id: int,
1088
- config: CrafterConfig,
1089
- session_tracer: SessionTracer | None = None,
1090
- progress_bar: tqdm | None = None,
1091
- quiet: bool = False,
1092
- model_params: dict[str, Any] | None = None,
1093
- ):
1094
- """Run a single episode."""
1095
- episode_start_time = time.time()
1096
-
1097
- # Create agent - always disable verbose for cleaner output
1098
- agent = CrafterReActAgentWithLMSynth(
1099
- model_name=config.model_name,
1100
- max_turns=config.max_turns,
1101
- verbose=False, # Always disable verbose logging in agent
1102
- tracer=session_tracer,
1103
- episode_id=episode_id,
1104
- quiet=True, # Always use quiet mode for agent
1105
- model_params=model_params,
1106
- )
1107
-
1108
- # Initialize environment
1109
- async with AsyncClient(base_url=config.service_base_url) as client:
1110
- try:
1111
- # Initialize environment with unique seed for each episode
1112
- # Use simple sequential seeds: 1, 2, 3, 4, etc.
1113
- episode_seed = episode_id + 1 # Start from 1 instead of 0
1114
-
1115
- init_response = await retry_http_request(
1116
- client,
1117
- "POST",
1118
- "/env/CrafterClassic/initialize",
1119
- json={"config": {"difficulty": config.difficulty, "seed": episode_seed}},
1120
- )
1121
-
1122
- init_data = init_response.json()
1123
- instance_id = init_data["env_id"]
1124
- obs = init_data["observation"]
1125
-
1126
- # Start initial timestep and send initial observation as message
1127
- if session_tracer:
1128
- async with session_tracer.timestep("init", turn_number=0):
1129
- obs_msg = create_message(
1130
- compress_observation_for_trace(obs),
1131
- "observation",
1132
- f"crafter_env_{instance_id}",
1133
- 0,
1134
- )
1135
- await session_tracer.record_message(
1136
- content=obs_msg.content, message_type=obs_msg.message_type
1137
- )
1138
-
1139
- # Run episode
1140
- episode_reward = 0
1141
- termination_reason = None
1142
- step_results = []
1143
- consecutive_no_tool_calls = 0
1144
-
1145
- # Create progress bar for this episode
1146
- episode_progress = tqdm(
1147
- total=config.max_turns,
1148
- desc=f"Episode {episode_id}",
1149
- position=episode_id,
1150
- leave=True,
1151
- ncols=100,
1152
- )
1153
-
1154
- for turn in range(config.max_turns):
1155
- episode_progress.update(1)
1156
-
1157
- # Use timestep context for this turn
1158
- timestep_name = f"turn_{turn + 1}"
1159
- async with (
1160
- session_tracer.timestep(timestep_name, turn_number=turn + 1)
1161
- if session_tracer
1162
- else _noop_async_context()
1163
- ):
1164
- # Get agent decision
1165
- obs_formatted = agent.format_observation(obs)
1166
- system_msg = agent.get_system_message()
1167
-
1168
- decision = await agent.decide(obs_formatted, system_msg, turn)
1169
- # Update tqdm postfix with latest metrics from agent
1170
- try:
1171
- metrics = agent.agent_state.get("last_metrics")
1172
- if isinstance(metrics, dict):
1173
- episode_progress.set_postfix(metrics, refresh=False)
1174
- except Exception:
1175
- pass
1176
-
1177
- # Handle termination
1178
- if decision["name"] == "terminate":
1179
- termination_reason = decision["parameters"]["reason"]
1180
- break
1181
-
1182
- # Detect consecutive no-tool-call responses and abort after 3
1183
- decision_params = (
1184
- decision.get("parameters") if isinstance(decision, dict) else None
1185
- )
1186
- decision_actions = (
1187
- decision_params.get("actions", [])
1188
- if isinstance(decision_params, dict)
1189
- else []
1190
- )
1191
- if (
1192
- decision.get("name") == "interact"
1193
- and isinstance(decision_actions, list)
1194
- and len(decision_actions) == 0
1195
- ):
1196
- consecutive_no_tool_calls += 1
1197
- print(f"šŸ” DEBUG: consecutive_no_tool_calls={consecutive_no_tool_calls}")
1198
- else:
1199
- consecutive_no_tool_calls = 0
1200
- if consecutive_no_tool_calls >= 3:
1201
- # Gracefully end the episode without recording this problematic turn
1202
- termination_reason = "no_tool_calls_abort"
1203
- break
1204
-
1205
- # Execute actions in sequence
1206
- actions = (
1207
- decision["parameters"].get("actions", [])
1208
- if isinstance(decision.get("parameters"), dict)
1209
- else []
1210
- )
1211
-
1212
- # Ensure control variables are defined even if no actions are taken this turn
1213
- done = False
1214
- reward = 0.0
1215
- info = {}
1216
-
1217
- # Define action mapping
1218
- crafter_action_map = {
1219
- "noop": 0,
1220
- "move_left": 1,
1221
- "move_right": 2,
1222
- "move_up": 3,
1223
- "move_down": 4,
1224
- "do": 5,
1225
- "sleep": 6,
1226
- "place_stone": 7,
1227
- "place_table": 8,
1228
- "place_furnace": 9,
1229
- "place_plant": 10,
1230
- "make_wood_pickaxe": 11,
1231
- "make_stone_pickaxe": 12,
1232
- "make_iron_pickaxe": 13,
1233
- "make_wood_sword": 14,
1234
- "make_stone_sword": 15,
1235
- "make_iron_sword": 16,
1236
- }
1237
-
1238
- # Execute each action in the sequence (may be empty)
1239
- for action in actions:
1240
- # Convert action name to integer
1241
- action_int = crafter_action_map.get(action, 0) # Default to noop
1242
-
1243
- # Get state before action
1244
- state_before = {"observation": obs} if "obs" in locals() else {}
1245
- prev_obs = obs.copy()
1246
-
1247
- # Step environment
1248
- step_response = await retry_http_request(
1249
- client,
1250
- "POST",
1251
- "/env/CrafterClassic/step",
1252
- json={
1253
- "env_id": instance_id,
1254
- "action": {
1255
- "tool_calls": [
1256
- {"tool": "interact", "args": {"action": action_int}}
1257
- ]
1258
- },
1259
- },
1260
- )
1261
- step_data = step_response.json()
1262
-
1263
- # Check if response has expected structure
1264
- if "observation" not in step_data:
1265
- print(
1266
- f"\nāŒ Error: Missing observation in step response. Keys: {list(step_data.keys())}"
1267
- )
1268
- if "error" in step_data:
1269
- print(f" Error message: {step_data['error']}")
1270
- # Try to recover or break
1271
- break
1272
-
1273
- obs = step_data["observation"]
1274
- reward = step_data.get("reward", 0) # Default to 0 if None
1275
- done = step_data.get("done", False) # Default to False if None
1276
- info = step_data.get("info", {})
1277
-
1278
- # Calculate achievement reward if not provided by service
1279
- if (reward == 0 or reward is None) and (
1280
- "achievements_status" in obs and "achievements_status" in prev_obs
1281
- ):
1282
- prev_achievements = prev_obs["achievements_status"]
1283
- curr_achievements = obs["achievements_status"]
1284
- new_unlocks = sum(
1285
- 1
1286
- for k in curr_achievements
1287
- if curr_achievements.get(k) and not prev_achievements.get(k)
1288
- )
1289
- if new_unlocks > 0:
1290
- reward = float(new_unlocks) # +1 for each new achievement
1291
-
1292
- if reward is not None:
1293
- episode_reward += reward
1294
-
1295
- # Record step result
1296
- step_results.append(
1297
- {
1298
- "turn": turn,
1299
- "action": action,
1300
- "reward": reward,
1301
- "done": done,
1302
- "info": info,
1303
- }
1304
- )
1305
-
1306
- # Record environment event for hooks to catch
1307
- if session_tracer:
1308
- # Create environment event with state transition
1309
- env_event = EnvironmentEvent(
1310
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1311
- system_instance_id=f"crafter_env_{instance_id}",
1312
- system_state_before={"public_state": prev_obs},
1313
- system_state_after={"public_state": obs},
1314
- reward=reward, # This now includes calculated achievement rewards
1315
- terminated=done,
1316
- metadata={"action": action, "action_int": action_int, "info": info},
1317
- )
1318
- await session_tracer.record_event(env_event)
1319
-
1320
- # Also record runtime event for invalid action detection
1321
- runtime_event = RuntimeEvent(
1322
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1323
- system_instance_id=f"crafter_runtime_{instance_id}",
1324
- actions=[action_int],
1325
- metadata={
1326
- "action_name": action,
1327
- "action_int": action_int,
1328
- "reward": reward,
1329
- "state_before": state_before,
1330
- "state_after": {"observation": obs},
1331
- },
1332
- )
1333
- await session_tracer.record_event(runtime_event)
1334
-
1335
- if done:
1336
- break
1337
-
1338
- # After all actions (or none), send final observation message
1339
- if session_tracer:
1340
- obs_msg = create_message(
1341
- compress_observation_for_trace(obs),
1342
- "observation",
1343
- f"crafter_env_{instance_id}",
1344
- turn + 1,
1345
- )
1346
- await session_tracer.record_message(
1347
- content=obs_msg.content, message_type=obs_msg.message_type
1348
- )
1349
-
1350
- if done:
1351
- break
1352
-
1353
- # Close progress bar
1354
- episode_progress.close()
1355
-
1356
- # Terminate instance
1357
- terminate_response = await retry_http_request(
1358
- client, "POST", "/env/CrafterClassic/terminate", json={"env_id": instance_id}
1359
- )
1360
-
1361
- except Exception as e:
1362
- if "episode_progress" in locals():
1363
- episode_progress.close()
1364
- print(f"\nāŒ Episode {episode_id} failed: {e}")
1365
- if config.verbose:
1366
- import traceback
1367
-
1368
- traceback.print_exc()
1369
- return {
1370
- "episode_id": episode_id,
1371
- "error": str(e),
1372
- "duration": time.time() - episode_start_time,
1373
- }
1374
-
1375
- # Extract final achievements
1376
- final_achievements = []
1377
- if obs and "achievements_status" in obs:
1378
- final_achievements = [k for k, v in obs["achievements_status"].items() if v]
1379
-
1380
- # Return results
1381
- return {
1382
- "episode_id": episode_id,
1383
- "total_reward": episode_reward,
1384
- "steps": len(step_results),
1385
- "termination_reason": termination_reason,
1386
- "duration": time.time() - episode_start_time,
1387
- "step_results": step_results,
1388
- "achievements_unlocked": final_achievements,
1389
- }
1390
-
1391
-
1392
- # --- Main ---
1393
- async def main():
1394
- """Main entry point with v3 tracing."""
1395
- parser = argparse.ArgumentParser(description="Run Crafter evaluation with LM Synth backend")
1396
- parser.add_argument("--config", type=str, help="Path to TOML config file")
1397
- parser.add_argument("--model", type=str, help="Model name (overrides config)")
1398
- parser.add_argument("--episodes", type=int, help="Number of episodes (overrides config)")
1399
- parser.add_argument("--max-steps", type=int, help="Max steps per episode (overrides config)")
1400
- parser.add_argument(
1401
- "--difficulty", type=str, choices=["easy", "normal", "hard"], help="Difficulty override"
1402
- )
1403
- parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
1404
- parser.add_argument("--quiet", action="store_true", help="Suppress most output except results")
1405
- parser.add_argument("--no-traces", action="store_true", help="Disable trace saving")
1406
- parser.add_argument("--analyze", action="store_true", help="Analyze traces after running")
1407
- parser.add_argument("--skip-warmup", action="store_true", help="Skip model warmup")
1408
- parser.add_argument(
1409
- "--no-daemon",
1410
- action="store_true",
1411
- help="Don't start sqld daemon (assumes it's already running)",
1412
- )
1413
-
1414
- # Qwen3 thinking mode flags (mutually exclusive)
1415
- think_group = parser.add_mutually_exclusive_group()
1416
- think_group.add_argument(
1417
- "--think",
1418
- dest="enable_thinking",
1419
- action="store_true",
1420
- help="Enable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=True)",
1421
- )
1422
- think_group.add_argument(
1423
- "--no-think",
1424
- dest="enable_thinking",
1425
- action="store_false",
1426
- help="Disable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=False)",
1427
- )
1428
- parser.set_defaults(enable_thinking=None)
1429
-
1430
- # Model parameter arguments
1431
- parser.add_argument(
1432
- "--temperature",
1433
- type=float,
1434
- default=0.7,
1435
- help="Temperature for model responses (default: 0.7)",
1436
- )
1437
- parser.add_argument(
1438
- "--max-tokens", type=int, default=512, help="Maximum tokens to generate (default: 512)"
1439
- )
1440
- parser.add_argument(
1441
- "--top-p", type=float, default=1.0, help="Top-p sampling parameter (default: 1.0)"
1442
- )
1443
- parser.add_argument(
1444
- "--frequency-penalty", type=float, default=0.0, help="Frequency penalty (default: 0.0)"
1445
- )
1446
- parser.add_argument(
1447
- "--presence-penalty", type=float, default=0.0, help="Presence penalty (default: 0.0)"
1448
- )
1449
- parser.add_argument(
1450
- "--tool-choice",
1451
- type=str,
1452
- choices=["auto", "required", "none"],
1453
- default="auto",
1454
- help="Tool choice mode (default: auto)",
1455
- )
1456
-
1457
- args = parser.parse_args()
1458
-
1459
- # Load configuration
1460
- config = CrafterConfig(args.config)
1461
-
1462
- # Setup Synth environment variables
1463
- setup_synth_environment()
1464
-
1465
- # Clean up old files to keep directory clean
1466
- if config.auto_cleanup:
1467
- cleanup_old_files()
1468
-
1469
- # Apply command-line overrides
1470
- if args.model:
1471
- config.model_name = args.model
1472
- if args.episodes:
1473
- config.num_instances = args.episodes
1474
- if args.max_steps:
1475
- config.max_turns = args.max_steps
1476
- if args.difficulty:
1477
- config.difficulty = args.difficulty
1478
- if args.verbose:
1479
- config.verbose = True
1480
- if args.quiet:
1481
- config.quiet = True
1482
- if not args.verbose: # Don't show this if verbose is also on
1483
- print("šŸ”‡ Quiet mode enabled - suppressing verbose logs")
1484
- else:
1485
- config.quiet = False
1486
- if args.no_daemon:
1487
- config.start_sqld_daemon = False
1488
-
1489
- # Environment overrides for model parameters (fail-fast on bad values)
1490
- env_temp = os.getenv("CRAFTER_TEMPERATURE")
1491
- if env_temp is not None:
1492
- args.temperature = float(env_temp)
1493
- env_max_tok = os.getenv("CRAFTER_MAX_TOKENS")
1494
- if env_max_tok is not None:
1495
- args.max_tokens = int(env_max_tok)
1496
- env_tool_choice = os.getenv("CRAFTER_TOOL_CHOICE")
1497
- if env_tool_choice is not None:
1498
- if env_tool_choice not in {"auto", "required", "none"}:
1499
- raise ValueError(f"Invalid CRAFTER_TOOL_CHOICE: {env_tool_choice}")
1500
- args.tool_choice = env_tool_choice
1501
- env_top_p = os.getenv("CRAFTER_TOP_P")
1502
- if env_top_p is not None:
1503
- args.top_p = float(env_top_p)
1504
- env_freq_pen = os.getenv("CRAFTER_FREQUENCY_PENALTY")
1505
- if env_freq_pen is not None:
1506
- args.frequency_penalty = float(env_freq_pen)
1507
- env_pres_pen = os.getenv("CRAFTER_PRESENCE_PENALTY")
1508
- if env_pres_pen is not None:
1509
- args.presence_penalty = float(env_pres_pen)
1510
-
1511
- # Resolve stop-after-tool-calls from environment (wrapper sets this)
1512
- try:
1513
- _satc = int(os.getenv("CRAFTER_STOP_AFTER_TOOL_CALLS", "1"))
1514
- except Exception:
1515
- _satc = 1
1516
- _extra_body = {"stop_after_tool_calls": _satc} if _satc and _satc > 0 else {}
1517
-
1518
- # Create model parameters dictionary from command line arguments
1519
- model_params = {
1520
- "temperature": args.temperature,
1521
- "max_tokens": args.max_tokens,
1522
- "top_p": args.top_p,
1523
- "frequency_penalty": args.frequency_penalty,
1524
- "presence_penalty": args.presence_penalty,
1525
- "tool_choice": args.tool_choice,
1526
- # Request early stop after N tool call blocks to avoid spillover
1527
- "extra_body": _extra_body,
1528
- }
1529
- # Optionally carry thinking mode through to LM config
1530
- if args.enable_thinking is not None:
1531
- model_params["enable_thinking"] = args.enable_thinking
1532
-
1533
- # Configure logging based on quiet mode
1534
- setup_logging(quiet_mode=config.quiet)
1535
-
1536
- # Display configuration (only if not in quiet mode)
1537
- if not config.quiet:
1538
- print("šŸŽ® Crafter ReAct Agent Evaluation (LM with Synth Backend - v3)")
1539
- print(f"Model: {config.model_name}")
1540
- print("Model Parameters:")
1541
- print(f" Temperature: {model_params['temperature']}")
1542
- print(f" Max Tokens: {model_params['max_tokens']}")
1543
- print(f" Top-p: {model_params['top_p']}")
1544
- print(f" Frequency Penalty: {model_params['frequency_penalty']}")
1545
- print(f" Presence Penalty: {model_params['presence_penalty']}")
1546
- print(f"Service: {config.service_base_url}")
1547
- print(f"Instances: {config.num_instances}")
1548
- print(f"Max Turns: {config.max_turns}")
1549
- print(f"Difficulty: {config.difficulty}")
1550
- print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1551
- print("=" * 50)
1552
-
1553
- if args.no_traces:
1554
- config.save_traces = False
1555
- config.enable_v3_tracing = False
1556
- if args.analyze:
1557
- config.analyze_traces = True
1558
- if args.skip_warmup:
1559
- config.warmup_model = False
1560
-
1561
- # Ensure model is specified
1562
- if not config.model_name:
1563
- parser.error("Model name must be specified via --model or config file")
1564
-
1565
- # Test service health
1566
- async with AsyncClient(base_url=config.service_base_url) as client:
1567
- try:
1568
- health_resp = await retry_http_request(client, "GET", "/health")
1569
- health_data = health_resp.json()
1570
- print(f"āœ… Crafter service is healthy: {health_data}")
1571
- except Exception as e:
1572
- print(f"āŒ Failed to connect to Crafter service: {e}")
1573
- return
1574
-
1575
- # Warm up the model if requested
1576
- if config.warmup_model and not args.skip_warmup:
1577
- print(f"\nšŸ”„ Warming up {config.model_name} on Synth backend...")
1578
- try:
1579
- synth_base_url = os.getenv("SYNTH_BASE_URL") # or os.getenv('MODAL_BASE_URL')
1580
- synth_api_key = os.getenv("SYNTH_API_KEY") # or os.getenv('MODAL_API_KEY')
1581
- if synth_base_url and synth_api_key:
1582
- synth_config = SynthConfig(
1583
- base_url=synth_base_url,
1584
- api_key=synth_api_key,
1585
- timeout=config.warmup_timeout, # Use configurable timeout
1586
- )
1587
- warmed = await warmup_synth_model(config.model_name, synth_config)
1588
- if warmed:
1589
- print("āœ… Model warmed up successfully!")
1590
- else:
1591
- print("āš ļø Warmup did not complete; continuing anyway...")
1592
- else:
1593
- print("āš ļø Missing SYNTH_BASE_URL or SYNTH_API_KEY, skipping warmup")
1594
- except Exception as e:
1595
- print(f"āš ļø Warmup failed: {e}")
1596
- print("Continuing anyway...")
1597
-
1598
- # Set up v3 tracing if enabled
1599
- trace_manager = None
1600
- experiment_ctx = None
1601
- sqld_daemon = None
1602
-
1603
- if config.enable_v3_tracing:
1604
- # Create trace directory first
1605
- os.makedirs(config.v3_trace_dir, exist_ok=True)
1606
-
1607
- # Start sqld daemon if requested
1608
- if config.start_sqld_daemon:
1609
- print("\nšŸš€ Starting sqld daemon for v3 tracing...")
1610
- sqld_daemon = SqldDaemon(db_path=config.turso_db_path)
1611
- sqld_daemon.__enter__() # Start the daemon
1612
- await asyncio.sleep(2) # Give it time to start
1613
- print("āœ… sqld daemon started")
1614
-
1615
- # Initialize trace manager with proper URL format
1616
- # If SQLD_DB_PATH is a directory managed by sqld, use its data file
1617
- _db_path = config.turso_db_path
1618
- if os.path.isdir(_db_path):
1619
- _candidate = os.path.join(_db_path, "dbs", "default", "data")
1620
- if os.path.exists(_candidate):
1621
- _db_path = _candidate
1622
- db_url = f"sqlite+aiosqlite:///{os.path.abspath(_db_path)}"
1623
- trace_manager = AsyncSQLTraceManager(db_url=db_url)
1624
- await trace_manager.initialize()
1625
-
1626
- # Create experiment context
1627
- experiment_ctx = await create_experiment_context(
1628
- db_manager=trace_manager,
1629
- experiment_name=f"crafter_lm_synth_{config.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
1630
- description=f"Crafter LM Synth experiment with {config.model_name} on {config.difficulty} difficulty, using LM class with v3 tracing",
1631
- )
1632
-
1633
- print(f"\nšŸ“Š V3 Tracing enabled. Traces will be saved to: {config.turso_db_path}")
1634
- print(f" Experiment: {experiment_ctx['experiment_name']}")
1635
-
1636
- # Run episodes with bounded concurrency using asyncio.Semaphore
1637
- # Control concurrency with env var CRAFTER_CONCURRENCY (default 5)
1638
- try:
1639
- _conc_str = os.getenv("CRAFTER_CONCURRENCY")
1640
- max_concurrency = int(_conc_str) if _conc_str else 5
1641
- except Exception:
1642
- max_concurrency = 5
1643
- concurrency_limiter = asyncio.Semaphore(max_concurrency)
1644
-
1645
- print(f"\nšŸš€ Running {config.num_instances} episodes (concurrency={max_concurrency})...")
1646
-
1647
- episode_seeds = [] # Track seeds used for each episode
1648
-
1649
- # Prepare episode tasks
1650
- episode_tasks = []
1651
- session_ids = []
1652
-
1653
- for i in range(config.num_instances):
1654
- # Calculate episode seed for logging (simple sequential: 1, 2, 3, etc)
1655
- episode_seed = i + 1
1656
- episode_seeds.append(episode_seed)
1657
-
1658
- # Create session tracer for this episode if v3 tracing is enabled
1659
- session_tracer = None
1660
- if config.enable_v3_tracing and trace_manager:
1661
- session_tracer = SessionTracer(hooks=QUIET_HOOKS) # Use quiet hooks
1662
- session_tracer.db = trace_manager # Use existing manager
1663
- session_tracer._initialized = True
1664
-
1665
- # Generate session ID
1666
- session_id = f"crafter_episode_{i}_{uuid.uuid4().hex[:8]}"
1667
- session_ids.append(session_id)
1668
-
1669
- # Create episode task with proper session context
1670
- async def run_episode_with_session(ep_id, cfg, tracer, pb, quiet, sess_id, model_params):
1671
- if tracer:
1672
- async with tracer.session(
1673
- session_id=sess_id,
1674
- metadata={
1675
- "episode_id": ep_id,
1676
- "experiment_id": experiment_ctx["experiment_id"]
1677
- if experiment_ctx
1678
- else None,
1679
- },
1680
- ):
1681
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1682
- else:
1683
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1684
-
1685
- # Freeze per-iteration values to avoid late-binding bugs in closures
1686
- this_tracer = session_tracer
1687
- this_session_id = session_ids[i] if session_ids else None
1688
-
1689
- async def _limited_episode(ep_idx=i, tracer=this_tracer, sess_id=this_session_id):
1690
- async with concurrency_limiter:
1691
- return await run_episode_with_session(
1692
- ep_idx, config, tracer, None, args.quiet, sess_id, model_params
1693
- )
1694
-
1695
- episode_task = _limited_episode()
1696
- episode_tasks.append(episode_task)
1697
-
1698
- print("\nšŸ“¤ Starting episodes...")
1699
- start_time = time.time()
1700
-
1701
- # Run all episodes in parallel and fail fast on first error
1702
- try:
1703
- results = await asyncio.gather(*episode_tasks, return_exceptions=False)
1704
- except Exception as e:
1705
- print(f"\nāŒ Run aborted due to error: {e}")
1706
- # Ensure resources are cleaned up before exiting
1707
- if trace_manager:
1708
- await trace_manager.close()
1709
- if sqld_daemon:
1710
- sqld_daemon.__exit__(None, None, None)
1711
- print("\nāœ… Stopped sqld daemon")
1712
- raise
1713
-
1714
- end_time = time.time()
1715
- parallel_time = end_time - start_time
1716
-
1717
- print(f"\nāœ… Completed {len(episode_tasks)} episodes in {parallel_time:.2f} seconds")
1718
-
1719
- # Process results and handle any exceptions
1720
- successful_results = []
1721
- failed_results = []
1722
-
1723
- for i, result in enumerate(results):
1724
- if isinstance(result, Exception):
1725
- print(f"āŒ Episode {i} failed: {result}")
1726
- failed_results.append({"episode_id": i, "error": str(result)})
1727
- else:
1728
- successful_results.append(result)
1729
-
1730
- # Link session to experiment if tracing enabled
1731
- if (
1732
- config.enable_v3_tracing
1733
- and trace_manager
1734
- and experiment_ctx
1735
- and i < len(session_ids)
1736
- ):
1737
- await trace_manager.link_session_to_experiment(
1738
- session_ids[i], experiment_ctx["experiment_id"]
1739
- )
1740
-
1741
- # Use successful results for analysis
1742
- results = successful_results + failed_results
1743
-
1744
- # Analyze results
1745
- print("\n" + "=" * 50)
1746
- print("šŸ“Š EVALUATION RESULTS")
1747
- print("=" * 50)
1748
-
1749
- successful_episodes = [r for r in results if "error" not in r]
1750
- failed_episodes = [r for r in results if "error" in r]
1751
-
1752
- if successful_episodes:
1753
- total_reward = sum(r["total_reward"] for r in successful_episodes)
1754
- total_steps = sum(r["steps"] for r in successful_episodes)
1755
- avg_reward = total_reward / len(successful_episodes)
1756
- avg_steps = total_steps / len(successful_episodes)
1757
-
1758
- print(f"Episodes completed: {len(successful_episodes)}/{config.num_instances}")
1759
- print(f"Failed episodes: {len(failed_episodes)}")
1760
- print(f"Total reward: {total_reward:.2f}")
1761
- print(f"Average reward per episode: {avg_reward:.2f}")
1762
- print(f"Total steps: {total_steps}")
1763
- print(f"Average steps per episode: {avg_steps:.2f}")
1764
-
1765
- # Show seeds used
1766
- if episode_seeds:
1767
- print("\nSeeds used:")
1768
- for i, seed in enumerate(episode_seeds[: len(successful_episodes)]):
1769
- print(f" Episode {i}: seed {seed}")
1770
-
1771
- # Extract unique achievements
1772
- all_achievements = set()
1773
- achievement_counts = defaultdict(int)
1774
-
1775
- for result in successful_episodes:
1776
- # Use the achievements_unlocked field we added
1777
- if "achievements_unlocked" in result:
1778
- for achievement in result["achievements_unlocked"]:
1779
- all_achievements.add(achievement)
1780
- achievement_counts[achievement] += 1
1781
-
1782
- # Extract and count all actions from successful episodes
1783
- action_counts = defaultdict(int)
1784
- total_actions = 0
1785
-
1786
- for result in successful_episodes:
1787
- if "step_results" in result:
1788
- for step in result["step_results"]:
1789
- if "action" in step:
1790
- action_counts[step["action"]] += 1
1791
- total_actions += 1
1792
-
1793
- print(f"Unique achievements unlocked: {len(all_achievements)}")
1794
- if all_achievements:
1795
- print("\nAchievements unlocked:")
1796
- for achievement, count in sorted(achievement_counts.items()):
1797
- print(
1798
- f" - {achievement}: {count} episodes ({count / len(successful_episodes) * 100:.1f}%)"
1799
- )
1800
-
1801
- # Display action counts
1802
- if action_counts:
1803
- print(f"\nAction counts (total: {total_actions}):")
1804
- for action, count in sorted(action_counts.items(), key=lambda x: x[1], reverse=True):
1805
- percentage = count / total_actions * 100 if total_actions > 0 else 0
1806
- print(f" - {action}: {count} ({percentage:.1f}%)")
1807
- else:
1808
- print("No successful episodes completed.")
1809
-
1810
- # Save detailed results
1811
- if config.save_detailed_results and config.enable_v3_tracing and trace_manager:
1812
- # For v3, results are automatically saved in the database
1813
- print(f"\nšŸ’¾ Results available in Turso database: {config.turso_db_path}")
1814
- print(f" Experiment ID: {experiment_ctx['experiment_id']}")
1815
- print(" Use the filter_traces_sft_turso.py script to extract fine-tuning data")
1816
- elif config.save_detailed_results:
1817
- # Fallback to JSON if no tracing - write under temp/ (git-ignored)
1818
- from pathlib import Path
1819
-
1820
- out_dir = Path(os.getenv("SYNTH_OUTPUT_DIR", "temp")).resolve()
1821
- out_dir.mkdir(parents=True, exist_ok=True)
1822
- results_path = (
1823
- out_dir / f"crafter_lm_synth_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
1824
- )
1825
- with open(results_path, "w") as f:
1826
- json.dump(
1827
- {
1828
- "config": {
1829
- "model": config.model_name,
1830
- "episodes": config.num_instances,
1831
- "max_steps": config.max_turns,
1832
- "difficulty": config.difficulty,
1833
- "backend": "synth",
1834
- "tracing": "v3",
1835
- },
1836
- "results": results,
1837
- "summary": {
1838
- "successful_episodes": len(successful_episodes),
1839
- "failed_episodes": len(failed_episodes),
1840
- "total_reward": total_reward if successful_episodes else 0,
1841
- "avg_reward": avg_reward if successful_episodes else 0,
1842
- "unique_achievements": list(all_achievements)
1843
- if successful_episodes
1844
- else [],
1845
- },
1846
- },
1847
- f,
1848
- indent=2,
1849
- )
1850
- print(f"\nšŸ’¾ Detailed results saved to: {results_path}")
1851
-
1852
- # Print a markdown row compatible with Environments/crafter.md tables
1853
- if successful_episodes:
1854
- # Columns: | model | trajectories | avg achievements | adj score | unique | steps sum | avg steps |
1855
- model_label = config.model_name.replace("/", "/")
1856
- trajectories = len(successful_episodes)
1857
- avg_ach = avg_reward # our reward == achievements unlocked per episode
1858
-
1859
- # Compute weighted scores (shaped and K-Score) from final achievements across episodes
1860
- # K coefficients taken from crafter.md (representative weights)
1861
- k_weights = {
1862
- "collect_drink": 0.1,
1863
- "collect_sapling": 0.1,
1864
- "wake_up": 0.1,
1865
- "collect_wood": 1.0,
1866
- "collect_stone": 1.0,
1867
- "eat_cow": 1.0,
1868
- "defeat_zombie": 1.0,
1869
- "defeat_skeleton": 1.0,
1870
- "make_wood_pickaxe": 3.0,
1871
- "place_table": 3.0,
1872
- "collect_coal": 3.0,
1873
- "make_stone_pickaxe": 10.0,
1874
- "place_furnace": 10.0,
1875
- "collect_iron": 10.0,
1876
- "make_stone_sword": 10.0,
1877
- "make_wood_sword": 3.0,
1878
- "place_plant": 0.1,
1879
- }
1880
-
1881
- # Aggregate final achievements across successful episodes
1882
- from collections import Counter
1883
-
1884
- ach_counter: Counter[str] = Counter()
1885
- for ep in successful_episodes:
1886
- for name in ep.get("achievements_unlocked", []):
1887
- ach_counter[name] += 1
1888
-
1889
- shaped_total = 0.0
1890
- for name, count in ach_counter.items():
1891
- k = k_weights.get(name, 1.0)
1892
- shaped_total += k * count
1893
-
1894
- # Shaped reward per episode average
1895
- shaped_reward_avg = shaped_total / trajectories if trajectories > 0 else 0.0
1896
- k_score_avg = shaped_reward_avg / 20.0 # normalize roughly to match table scale
1897
-
1898
- # unique = len(all_achievements) # unused
1899
- steps_sum = total_steps
1900
- avg_steps_md = avg_steps
1901
- print("\nMarkdown row:")
1902
- print(
1903
- f"| {model_label:<15} | {trajectories:7d} | {avg_ach:8.2f} | {shaped_reward_avg:13.3f} | {k_score_avg:12.3f} | {steps_sum:12.3f} | {avg_steps_md:8.3f} |"
1904
- )
1905
-
1906
- # Cleanup
1907
- if trace_manager:
1908
- await trace_manager.close()
1909
-
1910
- if sqld_daemon:
1911
- sqld_daemon.__exit__(None, None, None)
1912
- print("\nāœ… Stopped sqld daemon")
1913
-
1914
-
1915
- if __name__ == "__main__":
1916
- asyncio.run(main())
1917
-
1918
-
1919
- # === SEMANTIC MAP VIEW (15x15) ===
1920
- # stone coal iron coal coal coal coal
1921
- # stone stone iron coal coal coal coal
1922
- # stone stone zombie coal coal iron iron
1923
- # stone stone stone you stone iron iron
1924
- # stone stone stone stone stone stone stone
1925
- # stone stone stone stone stone stone stone
1926
- # stone stone stone stone stone stone stone
1927
- # Visible items: coal, iron, stone, zombie
1928
-
1929
- # === STATUS ===
1930
- # Health: 10/10 | Food: 10/10 | Drink: 10/10 | Energy: 10/10
1931
- # Inventory: health: 9, food: 7, drink: 7, energy: 9, wood: 1, wood_pickaxe: 1
1932
- # Achievements: 4/22 unlocked
1933
- # Unlocked: collect_wood, make_wood_pickaxe, place_table, wake_up