synth-ai 0.2.9.dev7__py3-none-any.whl → 0.2.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic.

Files changed (323)
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +8 -11
  3. examples/dev/qwen3_32b_qlora_4xh100.toml +40 -0
  4. examples/multi_step/crafter_rl_lora.md +29 -0
  5. examples/qwen_coder/README.md +102 -0
  6. examples/qwen_coder/_shared.py +113 -0
  7. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  8. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  9. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  10. examples/qwen_coder/generate_dataset.py +98 -0
  11. examples/qwen_coder/infer_ft_smoke.py +65 -0
  12. examples/qwen_coder/infer_prod_proxy.py +73 -0
  13. examples/qwen_coder/infer_via_synth.py +87 -0
  14. examples/qwen_coder/scripts/infer_coder.sh +19 -0
  15. examples/qwen_coder/scripts/train_coder_30b.sh +22 -0
  16. examples/qwen_coder/sft_full_17b.py +103 -0
  17. examples/qwen_coder/sft_lora_30b.py +110 -0
  18. examples/qwen_coder/subset_jsonl.py +39 -0
  19. examples/qwen_coder/todos.md +38 -0
  20. examples/qwen_coder/validate_jsonl.py +60 -0
  21. examples/rl/run_eval.py +36 -37
  22. examples/rl/run_rl_and_save.py +5 -5
  23. examples/rl/task_app/math_single_step.py +65 -43
  24. examples/rl/task_app/math_task_app.py +3 -3
  25. examples/sft/README.md +139 -0
  26. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  27. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  28. examples/sft/evaluate.py +117 -0
  29. examples/sft/export_dataset.py +117 -0
  30. examples/sft/generate_traces.py +162 -0
  31. examples/swe/__init__.py +12 -0
  32. examples/swe/task_app/README.md +105 -0
  33. examples/swe/task_app/__init__.py +2 -0
  34. examples/swe/task_app/grpo_swe_mini.py +571 -0
  35. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  36. examples/swe/task_app/hosted/README.md +173 -0
  37. examples/swe/task_app/hosted/__init__.py +5 -0
  38. examples/swe/task_app/hosted/branching.py +143 -0
  39. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  40. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  41. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  42. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  43. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  44. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  45. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  46. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  47. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  50. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  51. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  52. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  53. examples/swe/task_app/hosted/hosted_app.py +204 -0
  54. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  55. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  56. examples/swe/task_app/hosted/main.py +100 -0
  57. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  58. examples/swe/task_app/hosted/registry.py +195 -0
  59. examples/swe/task_app/hosted/rollout.py +1869 -0
  60. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  61. examples/swe/task_app/hosted/storage/volume.py +211 -0
  62. examples/swe/task_app/hosted/test_agents.py +161 -0
  63. examples/swe/task_app/hosted/test_service.py +137 -0
  64. examples/swe/task_app/hosted/utils.py +62 -0
  65. examples/vlm/PROPOSAL.md +53 -0
  66. examples/vlm/README.md +68 -0
  67. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  68. examples/vlm/crafter_image_only_agent.py +207 -0
  69. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  70. examples/vlm/filter_image_rows.py +63 -0
  71. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  72. examples/warming_up_to_rl/analyze_trace_db.py +5 -5
  73. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  74. examples/warming_up_to_rl/export_trace_sft.py +78 -21
  75. examples/warming_up_to_rl/groq_test.py +4 -4
  76. examples/warming_up_to_rl/manage_secrets.py +13 -18
  77. examples/warming_up_to_rl/run_eval.py +42 -44
  78. examples/warming_up_to_rl/run_fft_and_save.py +11 -16
  79. examples/warming_up_to_rl/run_local_rollout.py +1 -3
  80. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -4
  81. examples/warming_up_to_rl/run_local_rollout_parallel.py +1 -4
  82. examples/warming_up_to_rl/run_local_rollout_traced.py +3 -5
  83. examples/warming_up_to_rl/run_rl_and_save.py +5 -6
  84. examples/warming_up_to_rl/run_rollout_remote.py +8 -10
  85. examples/warming_up_to_rl/task_app/README.md +6 -2
  86. examples/warming_up_to_rl/task_app/grpo_crafter.py +234 -35
  87. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +2 -3
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +131 -114
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +101 -41
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +73 -51
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +14 -6
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +16 -16
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +32 -34
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +94 -31
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +303 -203
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +328 -225
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +13 -13
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +1 -0
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  105. synth_ai/api/models/supported.py +376 -0
  106. synth_ai/api/train/builders.py +128 -21
  107. synth_ai/api/train/cli.py +80 -64
  108. synth_ai/api/train/config_finder.py +7 -2
  109. synth_ai/api/train/env_resolver.py +1 -1
  110. synth_ai/api/train/pollers.py +2 -1
  111. synth_ai/api/train/supported_algos.py +139 -0
  112. synth_ai/api/train/task_app.py +1 -2
  113. synth_ai/api/train/utils.py +13 -44
  114. synth_ai/cli/__init__.py +8 -0
  115. synth_ai/cli/_modal_wrapper.py +28 -0
  116. synth_ai/cli/_typer_patch.py +49 -0
  117. synth_ai/cli/balance.py +1 -2
  118. synth_ai/cli/calc.py +1 -1
  119. synth_ai/cli/demo.py +2 -1
  120. synth_ai/cli/recent.py +2 -2
  121. synth_ai/cli/rl_demo.py +2 -1
  122. synth_ai/cli/root.py +11 -13
  123. synth_ai/cli/status.py +2 -2
  124. synth_ai/cli/task_apps.py +529 -179
  125. synth_ai/cli/traces.py +6 -4
  126. synth_ai/cli/watch.py +12 -18
  127. synth_ai/demo_registry.py +1 -1
  128. synth_ai/demos/core/cli.py +36 -43
  129. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  130. synth_ai/demos/demo_task_apps/core.py +17 -25
  131. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +3 -4
  132. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  133. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -4
  134. synth_ai/demos/demo_task_apps/math/modal_task_app.py +16 -18
  135. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
  136. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  137. synth_ai/environments/reproducibility/tree.py +2 -5
  138. synth_ai/environments/service/app.py +11 -12
  139. synth_ai/environments/service/core_routes.py +4 -7
  140. synth_ai/environments/stateful/engine.py +1 -1
  141. synth_ai/environments/tasks/core.py +1 -0
  142. synth_ai/environments/tasks/filters.py +5 -6
  143. synth_ai/environments/tasks/utils.py +4 -5
  144. synth_ai/handshake.py +9 -9
  145. synth_ai/http.py +1 -1
  146. synth_ai/http_client.py +18 -10
  147. synth_ai/inference/client.py +15 -5
  148. synth_ai/jobs/client.py +78 -83
  149. synth_ai/learning/__init__.py +41 -6
  150. synth_ai/learning/algorithms.py +14 -0
  151. synth_ai/learning/client.py +91 -24
  152. synth_ai/learning/config.py +2 -38
  153. synth_ai/learning/ft_client.py +4 -59
  154. synth_ai/learning/health.py +5 -6
  155. synth_ai/learning/jobs.py +31 -47
  156. synth_ai/{rl → learning/rl}/__init__.py +14 -4
  157. synth_ai/learning/rl/client.py +267 -0
  158. synth_ai/learning/rl/config.py +31 -0
  159. synth_ai/{rl → learning/rl}/contracts.py +5 -8
  160. synth_ai/{rl → learning/rl}/env_keys.py +39 -15
  161. synth_ai/learning/rl/secrets.py +13 -0
  162. synth_ai/learning/rl_client.py +2 -281
  163. synth_ai/learning/sft/__init__.py +29 -0
  164. synth_ai/learning/sft/client.py +68 -0
  165. synth_ai/learning/sft/config.py +270 -0
  166. synth_ai/learning/sft/data.py +295 -0
  167. synth_ai/learning/sse.py +25 -24
  168. synth_ai/learning/validators.py +25 -28
  169. synth_ai/lm/__init__.py +21 -47
  170. synth_ai/task/__init__.py +25 -27
  171. synth_ai/task/apps/__init__.py +7 -8
  172. synth_ai/task/auth.py +8 -8
  173. synth_ai/task/client.py +14 -14
  174. synth_ai/task/contracts.py +36 -35
  175. synth_ai/task/datasets.py +6 -5
  176. synth_ai/task/errors.py +10 -10
  177. synth_ai/task/health.py +17 -9
  178. synth_ai/task/json.py +58 -23
  179. synth_ai/task/proxy.py +13 -9
  180. synth_ai/task/rubrics.py +16 -15
  181. synth_ai/task/server.py +12 -12
  182. synth_ai/task/tracing_utils.py +4 -4
  183. synth_ai/task/vendors.py +5 -6
  184. synth_ai/tracing_v3/__init__.py +2 -0
  185. synth_ai/tracing_v3/abstractions.py +21 -4
  186. synth_ai/tracing_v3/decorators.py +18 -16
  187. synth_ai/tracing_v3/hooks.py +5 -5
  188. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  189. synth_ai/tracing_v3/session_tracer.py +40 -14
  190. synth_ai/tracing_v3/storage/base.py +85 -0
  191. synth_ai/tracing_v3/storage/config.py +21 -8
  192. synth_ai/tracing_v3/storage/factory.py +10 -7
  193. synth_ai/tracing_v3/storage/utils.py +4 -2
  194. synth_ai/tracing_v3/turso/daemon.py +7 -2
  195. synth_ai/tracing_v3/turso/models.py +2 -2
  196. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  197. synth_ai/tracing_v3/utils.py +4 -4
  198. synth_ai/v0/api/__init__.py +8 -0
  199. synth_ai/v0/api/models/__init__.py +8 -0
  200. synth_ai/v0/api/models/supported.py +8 -0
  201. synth_ai/v0/config/__init__.py +15 -0
  202. synth_ai/v0/config/base_url.py +12 -0
  203. synth_ai/v0/lm/__init__.py +51 -0
  204. synth_ai/{lm → v0/lm}/caching/ephemeral.py +2 -2
  205. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  206. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  207. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  208. synth_ai/{lm → v0/lm}/config.py +6 -1
  209. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  210. synth_ai/{lm → v0/lm}/core/main.py +6 -6
  211. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  212. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -14
  213. synth_ai/{lm → v0/lm}/core/vendor_clients.py +2 -2
  214. synth_ai/{lm → v0/lm}/overrides.py +2 -2
  215. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  216. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  217. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  218. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  219. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +9 -9
  220. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  221. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  222. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +10 -10
  223. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +8 -8
  224. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +2 -2
  225. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +3 -3
  226. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  227. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  228. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  229. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  230. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  231. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  232. synth_ai/{lm → v0/lm}/vendors/synth_client.py +1 -1
  233. synth_ai/v0/tracing_v3/__init__.py +10 -0
  234. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  235. synth_ai/v0/tracing_v3/decorators.py +3 -0
  236. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  237. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  238. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/METADATA +10 -7
  239. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/RECORD +269 -233
  240. examples/common_old/backend.py +0 -20
  241. examples/evals_old/README.md +0 -98
  242. examples/evals_old/__init__.py +0 -6
  243. examples/evals_old/compare_models.py +0 -1038
  244. examples/evals_old/example_log.md +0 -145
  245. examples/evals_old/run_demo.sh +0 -126
  246. examples/evals_old/trace_analysis.py +0 -270
  247. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  248. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  249. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  250. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -243
  251. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  252. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  253. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  254. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  255. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  256. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -119
  257. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  258. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  259. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -243
  260. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  261. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  262. examples/finetuning_old/synth_qwen_v1/infer.py +0 -36
  263. examples/finetuning_old/synth_qwen_v1/poll.py +0 -46
  264. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  265. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  266. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1933
  267. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -210
  268. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -237
  269. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  270. examples/finetuning_old/synth_qwen_v1/util.py +0 -152
  271. examples/rl_old/task_app.py +0 -1131
  272. synth_ai/experimental/synth_oss.py +0 -445
  273. synth_ai/learning/filtering.py +0 -0
  274. synth_ai/learning/offline/dpo.py +0 -0
  275. synth_ai/learning/offline/providers.py +0 -7
  276. synth_ai/learning/offline/sft.py +0 -0
  277. synth_ai/learning/offline/shared.py +0 -0
  278. synth_ai/learning/online/grpo.py +0 -0
  279. synth_ai/learning/online/irft.py +0 -0
  280. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  281. synth_ai/learning/prompts/gepa.py +0 -0
  282. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -211
  283. synth_ai/learning/prompts/mipro.py +0 -289
  284. synth_ai/learning/prompts/random_search.py +0 -249
  285. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  286. synth_ai/learning/prompts/run_random_search_banking77.py +0 -329
  287. synth_ai/rl/secrets.py +0 -19
  288. synth_ai/scripts/verify_rewards.py +0 -100
  289. synth_ai/tracing/__init__.py +0 -30
  290. synth_ai/tracing_v1/__init__.py +0 -33
  291. synth_ai/tracing_v3/turso/__init__.py +0 -25
  292. synth_ai/tracing_v3/turso/manager.py +0 -838
  293. synth_ai/zyk/__init__.py +0 -30
  294. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  295. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  296. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  297. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  298. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  299. /synth_ai/{lm → v0/lm}/core/exceptions.py +0 -0
  300. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  301. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  302. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  303. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  304. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  305. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  306. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  307. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  308. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  309. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  310. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  311. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  312. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  313. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  314. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  315. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  316. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  317. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  318. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  319. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  320. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/WHEEL +0 -0
  321. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/entry_points.txt +0 -0
  322. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/licenses/LICENSE +0 -0
  323. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.10.dist-info}/top_level.txt +0 -0
examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py
@@ -1,1924 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Test script to run ReAct agents against Crafter environment using LM class with Synth backend.
4
- This demonstrates using the LM class with Synth models through native integration.
5
-
6
- This version uses the new tracing_v3 system with async Turso/SQLite backend.
7
- """
8
-
9
- import argparse
10
- import asyncio
11
- import contextlib
12
- from contextlib import asynccontextmanager
13
- import glob
14
- import itertools
15
- import json
16
- import logging
17
- import os
18
- import random
19
- import sys
20
- import time
21
- import uuid
22
- from collections import defaultdict
23
- from datetime import datetime
24
- from pathlib import Path
25
- from typing import Any
26
-
27
- import httpx
28
- import numpy as np
29
- import toml
30
- import yaml
31
- from httpx import AsyncClient
32
- from tqdm import tqdm
33
-
34
- # Disable httpx logging immediately
35
- logging.getLogger("httpx").setLevel(logging.ERROR)
36
- logging.getLogger("httpcore").setLevel(logging.ERROR)
37
-
38
-
39
- # Configure logging to suppress noisy third-party logs when in quiet mode
40
- def setup_logging(quiet_mode: bool = False):
41
- """Setup logging configuration."""
42
- if quiet_mode:
43
- # Suppress most third-party logging in quiet mode
44
- logging.getLogger("httpx").setLevel(logging.ERROR)
45
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.ERROR)
46
- logging.getLogger("synth_ai.tracing_v3.turso").setLevel(logging.ERROR)
47
- logging.getLogger("sqlalchemy").setLevel(logging.ERROR)
48
- logging.getLogger("aiosqlite").setLevel(logging.ERROR)
49
- # Suppress httpcore as well (used by httpx)
50
- logging.getLogger("httpcore").setLevel(logging.ERROR)
51
- else:
52
- # Normal logging levels
53
- logging.getLogger("httpx").setLevel(logging.ERROR) # Always suppress httpx logs
54
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.INFO)
55
-
56
-
57
- # Set default logging to avoid noisy logs during import
58
- setup_logging(quiet_mode=True)
59
-
60
- # Setup environment
61
- sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
62
-
63
- # Disable v1 logging to see v3 tracing clearly
64
- os.environ["LANGFUSE_ENABLED"] = "false"
65
- os.environ["SYNTH_LOGGING"] = "false"
66
-
67
- from synth_ai.lm.config import SynthConfig # noqa: E402
68
-
69
- # Import Synth warmup utilities
70
- from synth_ai.lm.warmup import warmup_synth_model # noqa: E402
71
-
72
- # Import session tracer for v3 tracing
73
- from synth_ai.tracing_v3 import SessionTracer # noqa: E402
74
- from synth_ai.tracing_v3.abstractions import ( # noqa: E402
75
- EnvironmentEvent,
76
- RuntimeEvent,
77
- SessionEventMarkovBlanketMessage,
78
- TimeRecord,
79
- )
80
-
81
- # Import Crafter hooks for v3
82
- from synth_ai.tracing_v3.hooks import HookManager # noqa: E402
83
- from synth_ai.tracing_v3.turso.daemon import SqldDaemon # noqa: E402
84
-
85
- # create_experiment_context will be defined as a helper function below
86
- from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager # noqa: E402
87
-
88
- # Create a custom hook manager without default print statements
89
- QUIET_HOOKS = HookManager()
90
-
91
- # Import LM components (v3 version if available)
92
- try:
93
- from synth_ai.lm.core.main_v3 import LM # noqa: E402
94
- except ImportError:
95
- from synth_ai.lm.core.main_v2 import LM # noqa: E402
96
-
97
- # Configuration constants
98
- HTTP_TIMEOUT = (
99
- 30.0 # Increased from 10.0 for better handling of concurrent load and LM response times
100
- )
101
- MAX_RETRIES = 3
102
- RETRY_DELAY = 1.0
103
-
104
-
105
- # Use the backend
106
- @asynccontextmanager
107
- async def _noop_async_context():
108
- yield
109
-
110
-
111
- async def create_experiment_context(
112
- db_manager: AsyncSQLTraceManager, experiment_name: str, description: str
113
- ) -> dict[str, Any]:
114
- """Create an experiment context for v3 tracing."""
115
- experiment_id = f"exp_{uuid.uuid4().hex[:12]}"
116
- await db_manager.create_experiment(
117
- experiment_id=experiment_id, name=experiment_name, description=description, configuration={}
118
- )
119
- return {
120
- "experiment_id": experiment_id,
121
- "experiment_name": experiment_name,
122
- "description": description,
123
- }
124
-
125
-
126
- def cleanup_old_files():
127
- """Clean up old trace files and result files to keep directory clean."""
128
- # Remove old JSON result files (keep only the latest 5)
129
- result_files = glob.glob("crafter_lm_synth_results_*.json")
130
- if len(result_files) > 5:
131
- # Sort by modification time and keep only the latest 5
132
- result_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
133
- for old_file in result_files[5:]:
134
- try:
135
- os.remove(old_file)
136
- print(f"🗑️ Cleaned up old result file: {old_file}")
137
- except OSError:
138
- pass
139
-
140
-
141
- def _load_env_from_monorepo() -> dict:
142
- """Load environment variables from monorepo/.env.local if present."""
143
- env_file = (
144
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent / "monorepo/.env.local"
145
- )
146
- env_vars = {}
147
-
148
- if env_file.exists():
149
- with open(env_file) as f:
150
- for line in f:
151
- line = line.strip()
152
- if line and not line.startswith("#") and "=" in line:
153
- key, value = line.split("=", 1)
154
- # Remove quotes if present
155
- value = value.strip().strip('"').strip("'")
156
- env_vars[key] = value
157
-
158
- return env_vars
159
-
160
-
161
- def _load_testing_yaml_api_key() -> str | None:
162
- """Load SYNTH_API_KEY from monorepo/tests/prod/testing_info.yaml if present."""
163
- # First try the new env vars from monorepo/.env.local
164
- env_vars = _load_env_from_monorepo()
165
-
166
- # Try production key first, then test key
167
- if "SYNTH_API_KEY_PROD" in env_vars:
168
- return env_vars["SYNTH_API_KEY_PROD"]
169
- elif "SYNTH_API_KEY_TEST" in env_vars:
170
- return env_vars["SYNTH_API_KEY_TEST"]
171
-
172
- # Fallback to the old YAML method
173
- yaml_path = (
174
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent
175
- / "monorepo/tests/prod/testing_info.yaml"
176
- )
177
- if yaml_path.exists():
178
- with open(yaml_path) as f:
179
- data = yaml.safe_load(f)
180
- return data.get("SYNTH_API_KEY")
181
- return None
182
-
183
-
184
- def setup_synth_environment():
185
- """Setup environment variables for Synth/Modal endpoints.
186
-
187
- Resolution order for the base URL:
188
- 1. Explicit environment variables (SYNTH_BASE_URL or MODAL_BASE_URL)
189
- 2. PROD_API_URL env var used in production integration tests
190
- 3. Hard-coded production constant (https://agent-learning.onrender.com)
191
-
192
- The API key is resolved from the matching *_API_KEY env vars or, if not
193
- present, from the shared testing_info.yaml used by the prod tests.
194
- """
195
- # Load environment variables from monorepo/.env.local
196
- env_vars = _load_env_from_monorepo()
197
-
198
- synth_base_url = (
199
- os.getenv("SYNTH_BASE_URL")
200
- or os.getenv("MODAL_BASE_URL")
201
- or os.getenv("PROD_API_URL")
202
- or env_vars.get("SYNTH_BASE_URL_PROD") # Use production URL from .env.local
203
- or "https://agent-learning.onrender.com/api"
204
- )
205
-
206
- synth_api_key = os.getenv("SYNTH_API_KEY") or _load_testing_yaml_api_key()
207
-
208
- # # --- Validate API key format ---
209
- # if synth_api_key:
210
- # VALID_PREFIXES = ("sk-", "sk_live_", "sk_test_")
211
- # if not any(synth_api_key.startswith(p) for p in VALID_PREFIXES):
212
- # truncated = synth_api_key[:8] if len(synth_api_key) >= 8 else synth_api_key
213
- # expected_formats = " or ".join(VALID_PREFIXES)
214
- # raise ValueError(
215
- # f"Invalid API key format. Expected prefix {expected_formats}. Provided key begins with '{truncated}'."
216
- # )
217
- # else:
218
- # raise ValueError(
219
- # "SYNTH_API_KEY or MODAL_API_KEY must be provided via environment variables or testing_info.yaml"
220
- # )
221
-
222
- # Ensure trailing /v1 for OpenAI-compatible endpoints
223
- if not synth_base_url.endswith("/v1"):
224
- synth_base_url = synth_base_url.rstrip("/") + "/v1"
225
- synth_base_url = synth_base_url.rstrip("/")
226
-
227
- # Propagate to OpenAI SDK env vars expected by LM class
228
- os.environ["OPENAI_API_BASE"] = synth_base_url
229
- os.environ["OPENAI_BASE_URL"] = synth_base_url
230
- os.environ["OPENAI_API_KEY"] = synth_api_key
231
-
232
- return synth_base_url, synth_api_key
233
-
234
-
235
- async def retry_http_request(client: AsyncClient, method: str, url: str, **kwargs) -> Any:
236
- """Retry HTTP requests with exponential backoff and jitter."""
237
- last_exception = None
238
-
239
- for attempt in range(MAX_RETRIES):
240
- try:
241
- if attempt > 0:
242
- delay = min(RETRY_DELAY * (2 ** (attempt - 1)), RETRY_DELAY * 2) # Use RETRY_DELAY
243
- jitter = random.uniform(0, 0.1 * delay)
244
- total_delay = delay + jitter
245
- await asyncio.sleep(total_delay)
246
-
247
- response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
248
-
249
- if response.status_code < 500:
250
- return response
251
-
252
- last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
253
-
254
- except httpx.ReadError as e:
255
- last_exception = e
256
- if attempt < MAX_RETRIES - 1:
257
- read_error_delay = min(1.0 * (2**attempt), 5.0)
258
- await asyncio.sleep(read_error_delay)
259
- except Exception as e:
260
- last_exception = e
261
-
262
- print(
263
- f" āŒ HTTP request failed after {MAX_RETRIES} attempts: {type(last_exception).__name__}: {str(last_exception)[:200]}"
264
- )
265
- raise last_exception
266
-
267
-
268
- def create_message(
269
- content: Any, message_type: str, origin_system_id: Any, turn: int
270
- ) -> SessionEventMarkovBlanketMessage:
271
- """Create a message with origin system ID embedded in content."""
272
- # Map custom message types to valid v3 message types
273
- type_mapping = {
274
- "observation": "system", # Map observation to system message
275
- "user": "user",
276
- "assistant": "assistant",
277
- "system": "system",
278
- "tool_use": "tool_use",
279
- "tool_result": "tool_result",
280
- }
281
-
282
- return SessionEventMarkovBlanketMessage(
283
- content=json.dumps({"origin_system_id": str(origin_system_id), "payload": content}),
284
- message_type=type_mapping.get(message_type, "system"), # Default to system
285
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
286
- )
287
-
288
-
289
- def compress_observation_for_trace(obs: dict[str, Any]) -> dict[str, Any]:
290
- """Compress observation for trace storage to avoid huge trace files."""
291
- compressed = obs.copy()
292
-
293
- # Compress semantic map if present
294
- if "semantic_map" in compressed:
295
- del compressed["semantic_map"]
296
-
297
- # Compress other large fields
298
- if "rgb" in compressed:
299
- del compressed["rgb"]
300
-
301
- return compressed
302
-
303
-
304
- def format_semantic_map_view_v2(obs: dict[str, Any], view_size: int = 7) -> str:
305
- """Format a semantic map view around the player with normal names using real Crafter mapping."""
306
- # Get semantic map
307
- semantic_map = obs.get("semantic_map")
308
- if semantic_map is None:
309
- return "No semantic map available"
310
-
311
- # Convert to numpy array if needed
312
- sem_arr = np.asarray(semantic_map)
313
- if sem_arr.ndim == 1:
314
- # Assuming square map, reshape
315
- size = int(np.sqrt(sem_arr.size))
316
- sem_arr = sem_arr.reshape(size, size)
317
-
318
- # Get player position
319
- player_pos = obs.get("player_position", [sem_arr.shape[0] // 2, sem_arr.shape[1] // 2])
320
- px, py = int(player_pos[0]), int(player_pos[1])
321
-
322
- # Get real crafter semantic mapping directly from crafter library
323
- import crafter
324
-
325
- dummyenv = crafter.Env()
326
- try:
327
- max_id = (
328
- max(max(dummyenv._world._mat_ids.values()), max(dummyenv._sem_view._obj_ids.values()))
329
- + 1
330
- )
331
- id_to_item = ["void"] * max_id
332
- for name, ind in itertools.chain(
333
- dummyenv._world._mat_ids.items(), dummyenv._sem_view._obj_ids.items()
334
- ):
335
- clean = (
336
- name.__name__
337
- if hasattr(name, "__name__")
338
- else (str(name) if name is not None else "none")
339
- )
340
- id_to_item[ind] = clean.lower()
341
- finally:
342
- with contextlib.suppress(AttributeError, Exception):
343
- dummyenv.close()
344
-
345
- # Create view
346
- half = view_size // 2
347
- lines = []
348
- visible_items = set()
349
-
350
- for dy in range(-half, half + 1):
351
- row = []
352
- for dx in range(-half, half + 1):
353
- x, y = px + dx, py + dy
354
-
355
- if dx == 0 and dy == 0:
356
- row.append("you") # Player
357
- elif 0 <= x < sem_arr.shape[0] and 0 <= y < sem_arr.shape[1]:
358
- val = int(sem_arr[x, y])
359
- # Use the real crafter mapping
360
- item_name = id_to_item[val] if val < len(id_to_item) else f"unknown_{val}"
361
- row.append(item_name)
362
- if item_name not in ["grass", "you", "void"]:
363
- visible_items.add(item_name)
364
- else:
365
- row.append("void") # Out of bounds
366
-
367
- lines.append(" ".join(row))
368
-
369
- # Add legend of visible items
370
- legend = (
371
- f"Visible items: {', '.join(sorted(visible_items))}"
372
- if visible_items
373
- else "No special items visible (mostly grass)"
374
- )
375
-
376
- return "\n".join(lines) + "\n" + legend
377
-
378
-
379
- def get_openai_tools():
380
- """Get OpenAI-compatible tool definitions for Synth models."""
381
- return [
382
- {
383
- "type": "function",
384
- "function": {
385
- "name": "interact",
386
- "description": "Perform actions in the Crafter environment.",
387
- "parameters": {
388
- "type": "object",
389
- "properties": {
390
- "actions": {
391
- "type": "array",
392
- "items": {"type": "string"},
393
- "description": "List of actions to perform in sequence (e.g., ['move_right', 'move_right', 'do']). Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop",
394
- },
395
- "reasoning": {
396
- "type": "string",
397
- "description": "Reasoning for these actions",
398
- },
399
- },
400
- "required": ["actions", "reasoning"],
401
- },
402
- },
403
- },
404
- {
405
- "type": "function",
406
- "function": {
407
- "name": "terminate",
408
- "description": "End the episode when finished or no progress can be made.",
409
- "parameters": {
410
- "type": "object",
411
- "properties": {
412
- "reason": {"type": "string", "description": "Reason for termination"}
413
- },
414
- "required": ["reason"],
415
- },
416
- },
417
- },
418
- ]
419
-
420
-
421
- # --- Configuration Class ---
422
- class CrafterConfig:
423
- """Configuration for Crafter evaluation with Synth backend."""
424
-
425
- def __init__(self, config_path: str | None = None):
426
- # Default values
427
- self.model_name: str | None = None
428
- self.num_instances = 1
429
- self.max_turns = 2
430
- self.difficulty = "easy"
431
- self.service_base_url = "http://localhost:8901"
432
- self.service_timeout = 30.0
433
- self.seed = 42
434
- self.save_traces = True
435
- self.save_detailed_results = True
436
- self.verbose = False
437
- self.quiet = False # Add quiet mode support
438
- self.analyze_traces = False
439
-
440
- # V3 tracing settings
441
- self.enable_v3_tracing = True
442
- # Standardize to a single shared v3 DB by default; allow env override
443
- self.v3_trace_dir = os.getenv("SYNTH_TRACES_ROOT", "./traces/v3")
444
- # Use shared DB path unless explicitly overridden via env or config
445
- self.turso_db_path = os.getenv(
446
- "SQLD_DB_PATH", os.path.join(self.v3_trace_dir, "synth_ai.db")
447
- )
448
- self.start_sqld_daemon = True # Whether to start sqld daemon
449
- self.auto_cleanup = True # Clean up old files automatically
450
-
451
- # Synth-specific settings
452
- self.warmup_model = True
453
- self.warmup_max_attempts = 30
454
- self.warmup_timeout = 60.0 # Default timeout in seconds
455
- self.use_synth_backend = True # Flag to indicate Synth backend
456
-
457
- # Load from TOML if provided
458
- if config_path and os.path.exists(config_path):
459
- self.load_from_toml(config_path)
460
-
461
- def load_from_toml(self, config_path: str):
462
- """Load configuration from TOML file."""
463
- config = toml.load(config_path)
464
-
465
- eval_config = config.get("eval", {})
466
- self.model_name = eval_config.get("model_name", self.model_name)
467
- self.num_instances = eval_config.get("episodes", self.num_instances)
468
- self.max_turns = eval_config.get("max_steps", self.max_turns)
469
- self.difficulty = eval_config.get("difficulty", self.difficulty)
470
- self.seed = eval_config.get("seed", self.seed)
471
-
472
- service_config = config.get("service", {})
473
- self.service_base_url = service_config.get("base_url", self.service_base_url)
474
- self.service_timeout = service_config.get("timeout", self.service_timeout)
475
-
476
- output_config = config.get("output", {})
477
- self.save_traces = output_config.get("save_traces", self.save_traces)
478
- self.save_detailed_results = output_config.get(
479
- "save_detailed_results", self.save_detailed_results
480
- )
481
-
482
- # V3 tracing config
483
- tracing_config = config.get("tracing_v3", {})
484
- self.enable_v3_tracing = tracing_config.get("enabled", self.enable_v3_tracing)
485
- self.v3_trace_dir = tracing_config.get("trace_dir", self.v3_trace_dir)
486
- self.turso_db_path = tracing_config.get("db_path", self.turso_db_path)
487
- self.start_sqld_daemon = tracing_config.get("start_daemon", self.start_sqld_daemon)
488
- self.auto_cleanup = tracing_config.get("auto_cleanup", self.auto_cleanup)
489
-
490
- # Synth config
491
- synth_config = config.get("synth", {})
492
- self.warmup_model = synth_config.get("warmup_model", self.warmup_model)
493
- self.warmup_max_attempts = synth_config.get("warmup_max_attempts", self.warmup_max_attempts)
494
- self.warmup_timeout = synth_config.get("warmup_timeout", self.warmup_timeout)
495
- self.use_synth_backend = synth_config.get("use_synth_backend", self.use_synth_backend)
496
-
497
-
498
- # --- Base ReAct Agent using LM with Synth ---
499
- class BaseReActAgentWithLMSynth:
500
- """Base ReAct agent using LM class configured for Synth backend."""
501
-
502
- def __init__(
503
- self,
504
- model_name: str,
505
- max_turns: int = 20,
506
- verbose: bool = False,
507
- tracer: SessionTracer | None = None,
508
- episode_id: int = 0,
509
- quiet: bool = False,
510
- model_params: dict[str, Any] | None = None,
511
- ):
512
- self.model_name = model_name
513
- self.max_turns = max_turns
514
- self.verbose = verbose
515
- self.quiet = quiet
516
- self.history = []
517
- self.system_name = "base-react-agent-lm-synth"
518
- self.tools = get_openai_tools()
519
- self.tracer = tracer
520
- self.system_id = f"{self.system_name}_{uuid.uuid4()}"
521
- self.episode_id = episode_id
522
-
523
- # Default model parameters
524
- default_model_params = {
525
- "temperature": 0.7,
526
- "max_tokens": 512,
527
- "top_p": 1.0,
528
- "frequency_penalty": 0.0,
529
- "presence_penalty": 0.0,
530
- "tool_choice": "auto",
531
- }
532
-
533
- # Merge user-provided parameters with defaults
534
- self.model_params = {**default_model_params, **(model_params or {})}
535
-
536
- # Setup Synth environment variables
537
- setup_synth_environment()
538
-
539
- # Create LM instance with synth provider and configurable parameters
540
- self.lm = LM(
541
- model_name=model_name,
542
- formatting_model_name=model_name,
543
- temperature=self.model_params["temperature"],
544
- synth_logging=False, # Disable v1 tracing
545
- provider="synth", # Use synth provider
546
- session_tracer=tracer,
547
- system_id=self.system_id,
548
- enable_v3_tracing=True,
549
- # Pass additional model parameters
550
- max_tokens=self.model_params["max_tokens"],
551
- top_p=self.model_params["top_p"],
552
- frequency_penalty=self.model_params["frequency_penalty"],
553
- presence_penalty=self.model_params["presence_penalty"],
554
- # Qwen3 think mode (propagated by vendor to chat_template_kwargs)
555
- enable_thinking=self.model_params.get("enable_thinking"),
556
- # Forward arbitrary extra_body to vendor for features like
557
- # stop_after_tool_calls. The runner sets this to 1.
558
- extra_body=self.model_params.get("extra_body"),
559
- )
560
-
561
- # Agent state tracking
562
- self.agent_state = {
563
- "message_history": [],
564
- "steps_taken": 0,
565
- "steps_remaining": max_turns,
566
- "total_tokens_used": 0,
567
- "tool_calls_made": 0,
568
- "current_turn": 0,
569
- "last_failure": None, # Track last failure for prompting
570
- "recent_tool_calls": [],
571
- }
572
-
573
- async def decide(self, obs: str, system_message: str, turn: int) -> dict[str, Any]:
574
- """Get agent decision based on observation using LM class with Synth backend."""
575
- # Update agent state
576
- self.agent_state["current_turn"] = turn
577
- self.agent_state["steps_taken"] = turn
578
- self.agent_state["steps_remaining"] = self.max_turns - turn
579
-
580
- # Include last 3 tool calls (reasoning and actions) to provide short action history
581
- recent_calls = self.agent_state.get("recent_tool_calls", [])
582
- recent_tail = recent_calls[-3:] if isinstance(recent_calls, list) else []
583
- if recent_tail:
584
- lines = ["\nRecent tool calls (last 3):"]
585
- for entry in recent_tail:
586
- tnum = entry.get("turn")
587
- name = entry.get("name")
588
- reasoning = entry.get("reasoning")
589
- actions = entry.get("actions")
590
- actions_str = ", ".join(actions) if isinstance(actions, list) else ""
591
- lines.append(
592
- f"- Turn {tnum}: {name} — reasoning: {reasoning}; actions: {actions_str}"
593
- )
594
- obs_with_history = f"{obs}\n" + "\n".join(lines)
595
- else:
596
- obs_with_history = obs
597
-
598
- # Create conversation context with unique episode ID to prevent caching
599
- context = (
600
- f"Episode {self.episode_id} - Turn {turn + 1}/{self.max_turns}\n\n{obs_with_history}"
601
- )
602
-
603
- # Build messages in OpenAI format for tools
604
- # Augment the system message if the previous turn failed to produce a tool call
605
- local_system_message = system_message
606
- last_failure = self.agent_state.get("last_failure")
607
- if last_failure:
608
- local_system_message = (
609
- f"{system_message}\n\nIMPORTANT: In the previous turn, no valid tool call was returned. "
610
- f"Error: {last_failure}. You MUST respond with a single function tool call in the OpenAI tools format."
611
- )
612
- messages = [
613
- {"role": "system", "content": local_system_message},
614
- {"role": "user", "content": context},
615
- ]
616
-
617
- # Add to message history
618
- self.agent_state["message_history"].extend(messages)
619
-
620
- # Truncate history if too long
621
- max_history_length = 20
622
- if len(self.agent_state["message_history"]) > max_history_length:
623
- self.agent_state["message_history"] = [
624
- self.agent_state["message_history"][0]
625
- ] + self.agent_state["message_history"][-(max_history_length - 1) :]
626
-
627
- try:
628
- llm_start = time.time()
629
-
630
- # Optionally print full prompt on final turn when verbose
631
- if self.verbose and turn == self.max_turns - 1:
632
- print("\nšŸ” FINAL TURN PROMPT:")
633
- print("=" * 80)
634
- print(f"System: {local_system_message[:200]}...")
635
- print(f"\nUser message:\n{context}")
636
- print("=" * 80)
637
-
638
- # Debug: Print request info only when verbose
639
- if self.verbose:
640
- print(f"\nšŸ” DEBUG: LM call details (turn {turn})")
641
- print(f" Model: {self.model_name}")
642
- print(" Provider: synth")
643
- print(f" Messages: {len(messages)} messages")
644
- print(f" Tools: {len(self.tools) if self.tools else 0} tools")
645
- if self.tools:
646
- print(
647
- f" Tool 0 name: {self.tools[0].get('function', {}).get('name', 'unknown')}"
648
- )
649
- print(f" Tools structure: {json.dumps(self.tools[0], indent=4)[:300]}...")
650
-
651
- # Call LM with turn number for v3 tracing
652
- # The LM class should handle Synth routing internally
653
- if self.verbose:
654
- print(
655
- f"šŸ” DEBUG: LM sampling params => max_tokens={self.model_params.get('max_tokens')} temp={self.model_params.get('temperature')} top_p={self.model_params.get('top_p')} tool_choice={self.model_params.get('tool_choice')}"
656
- )
657
-
658
- # Optional full input logging (system, user, tools). Enable with CRAFTER_LOG_FULL_INPUTS=1
659
- _log_full_inputs = os.getenv("CRAFTER_LOG_FULL_INPUTS", "0").lower() in (
660
- "1",
661
- "true",
662
- "yes",
663
- "on",
664
- )
665
- # if _log_full_inputs:
666
- # print("\n" + "=" * 80)
667
- # print(f"FULL LM INPUT (turn {turn})")
668
- # print("-" * 80)
669
- # print("System message:\n" + local_system_message)
670
- # print("\nUser message:\n" + context)
671
- # print("\nMessages JSON:")
672
- # print(json.dumps(messages, indent=2))
673
- # print("\nTools definition:")
674
- # print(json.dumps(self.tools, indent=2))
675
- # print("\nSampling/tool params:")
676
- # print(
677
- # json.dumps(
678
- # {
679
- # "tool_choice": self.model_params.get("tool_choice"),
680
- # "extra_body": self.model_params.get("extra_body"),
681
- # "temperature": self.model_params.get("temperature"),
682
- # "max_tokens": self.model_params.get("max_tokens"),
683
- # "top_p": self.model_params.get("top_p"),
684
- # "frequency_penalty": self.model_params.get("frequency_penalty"),
685
- # "presence_penalty": self.model_params.get("presence_penalty"),
686
- # },
687
- # indent=2,
688
- # )
689
- # )
690
- # print("=" * 80)
691
-
692
- response = await self.lm.respond_async(
693
- messages=messages,
694
- turn_number=turn,
695
- # Pass tools in the format expected by LM class
696
- tools=self.tools,
697
- max_tokens=self.model_params["max_tokens"],
698
- tool_choice=self.model_params.get("tool_choice", "auto"),
699
- # Pass extra_body per call to ensure backend receives stop_after_tool_calls
700
- extra_body=self.model_params.get("extra_body"),
701
- )
702
-
703
- llm_end = time.time()
704
-
705
- # Minimal output: show only tool_call presence, number of actions, and tokens
706
- completion_tokens = None
707
- prompt_tokens = None
708
- toks_per_sec = None
709
- if hasattr(response, "usage") and isinstance(response.usage, dict):
710
- completion_tokens = response.usage.get("completion_tokens")
711
- prompt_tokens = response.usage.get("prompt_tokens")
712
- # Compute tokens/sec if we have duration and completion tokens
713
- try:
714
- if completion_tokens is not None:
715
- duration_s = max(1e-6, (llm_end - llm_start))
716
- toks_per_sec = round(float(completion_tokens) / duration_s, 2)
717
- except Exception:
718
- toks_per_sec = None
719
-
720
- # Parse the response to extract tool calls
721
- raw_response = response.raw_response
722
- decision: dict[str, Any]
723
-
724
- if hasattr(response, "tool_calls") and response.tool_calls:
725
- tool_call = response.tool_calls[0]
726
- parsed_decision = None
727
- fn = tool_call.get("function") if isinstance(tool_call, dict) else None
728
- if isinstance(fn, dict) and ("name" in fn):
729
- name = fn.get("name", "interact")
730
- args_raw = fn.get("arguments", "{}")
731
- try:
732
- import json as _json
733
-
734
- args = (
735
- _json.loads(args_raw) if isinstance(args_raw, str) else (args_raw or {})
736
- )
737
- if isinstance(args, dict):
738
- parsed_decision = {"name": name, "parameters": args}
739
- except Exception as _e:
740
- parsed_decision = {"name": name, "parameters": {"arguments": args_raw}}
741
- if (
742
- not parsed_decision
743
- and isinstance(tool_call, dict)
744
- and ("name" in tool_call or "parameters" in tool_call)
745
- ):
746
- parsed_decision = {
747
- "name": tool_call.get("name", "interact"),
748
- "parameters": tool_call.get("parameters", {}),
749
- }
750
- if parsed_decision:
751
- decision = parsed_decision
752
- try:
753
- pname = decision.get("name")
754
- pparams = (
755
- decision.get("parameters", {}) if isinstance(decision, dict) else {}
756
- )
757
- preason = pparams.get("reasoning") if isinstance(pparams, dict) else None
758
- pacts = pparams.get("actions") if isinstance(pparams, dict) else None
759
- entry = {
760
- "turn": turn,
761
- "name": pname,
762
- "reasoning": preason,
763
- "actions": pacts if isinstance(pacts, list) else [],
764
- }
765
- self.agent_state["recent_tool_calls"].append(entry)
766
- if len(self.agent_state["recent_tool_calls"]) > 10:
767
- self.agent_state["recent_tool_calls"] = self.agent_state[
768
- "recent_tool_calls"
769
- ][-10:]
770
- except Exception:
771
- pass
772
- # Clear failure flag on success
773
- if self.agent_state.get("last_failure"):
774
- self.agent_state["last_failure"] = None
775
- params = decision.get("parameters", {}) if isinstance(decision, dict) else {}
776
- actions = params.get("actions", []) if isinstance(params, dict) else []
777
- num_actions = len(actions) if isinstance(actions, list) else 0
778
- # Store metrics for tqdm postfix update in run_episode
779
- self.agent_state["last_metrics"] = {
780
- "tc": 1,
781
- "act": num_actions,
782
- "tok": completion_tokens,
783
- "in": prompt_tokens,
784
- "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
785
- }
786
- else:
787
- # Unrecognized tool_calls structure: do nothing, record failure
788
- failure_msg = "Unrecognized tool_calls structure"
789
- self.agent_state["last_failure"] = failure_msg
790
- decision = {
791
- "name": "interact",
792
- "parameters": {"actions": [], "reasoning": failure_msg},
793
- }
794
- if self.verbose:
795
- print(f"šŸ” DEBUG: {failure_msg}")
796
- else:
797
- # No tool calls: do nothing, record failure for next prompt
798
- failure_msg = "No valid tool_calls in assistant message"
799
- self.agent_state["last_failure"] = failure_msg
800
- decision = {
801
- "name": "interact",
802
- "parameters": {"actions": [], "reasoning": failure_msg},
803
- }
804
- # Store metrics for tqdm postfix update in run_episode
805
- self.agent_state["last_metrics"] = {
806
- "tc": 0,
807
- "act": 0,
808
- "tok": completion_tokens,
809
- "in": prompt_tokens,
810
- "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
811
- }
812
-
813
- # Update agent state
814
- self.agent_state["tool_calls_made"] += 1
815
-
816
- # Add assistant response to history
817
- assistant_message = {"role": "assistant", "content": raw_response}
818
- self.agent_state["message_history"].append(assistant_message)
819
-
820
- if self.verbose:
821
- print(f"šŸ¤– LM Response (turn {turn}): {json.dumps(decision, indent=2)}")
822
- print(f"šŸ“Š Response time: {llm_end - llm_start:.2f}s")
823
- except Exception as e:
824
- print(f"āŒ Error in LM decide: {e}")
825
- import traceback
826
-
827
- traceback.print_exc()
828
- # Record failure and do nothing this turn
829
- failure_msg = f"Exception during decide: {str(e)}"
830
- self.agent_state["last_failure"] = failure_msg
831
- decision = {"name": "interact", "parameters": {"actions": [], "reasoning": failure_msg}}
832
-
833
- return decision
834
-
835
- def _parse_tool_response(self, raw_response: str) -> dict[str, Any]:
836
- """Parse raw LM response to extract tool calls."""
837
- # Try to parse JSON if present
838
- try:
839
- # Look for JSON in the response
840
- import re
841
-
842
- json_match = re.search(r"\{.*\}", raw_response, re.DOTALL)
843
- if json_match:
844
- data = json.loads(json_match.group())
845
- if "name" in data:
846
- return data
847
- elif "function" in data:
848
- return {
849
- "name": data["function"].get("name", "interact"),
850
- "parameters": data["function"].get("arguments", {}),
851
- }
852
- except Exception:
853
- pass
854
-
855
- # Fallback to text parsing
856
- if "terminate" in raw_response.lower():
857
- return {"name": "terminate", "parameters": {"reason": "Agent decided to terminate"}}
858
-
859
- # Try to extract actions from the response
860
- actions = []
861
- action_keywords = [
862
- "move_up",
863
- "move_down",
864
- "move_left",
865
- "move_right",
866
- "do",
867
- "sleep",
868
- "place_stone",
869
- "place_table",
870
- "place_furnace",
871
- "place_plant",
872
- "make_wood_pickaxe",
873
- "make_stone_pickaxe",
874
- "make_iron_pickaxe",
875
- "make_wood_sword",
876
- "make_stone_sword",
877
- "make_iron_sword",
878
- ]
879
-
880
- for keyword in action_keywords:
881
- if keyword in raw_response.lower():
882
- actions.append(keyword)
883
-
884
- if not actions:
885
- actions = ["do"] # Default action
886
-
887
- return {
888
- "name": "interact",
889
- "parameters": {
890
- "actions": actions, # Return as array of actions
891
- "reasoning": "Parsed from response",
892
- },
893
- }
894
-
895
- def get_system_message(self) -> str:
896
- """Return system message for agent. Override in subclasses."""
897
- return """You are an AI agent playing Crafter. Use the available tools to interact with the environment.
898
-
899
- CRITICAL RULE: You MUST provide MULTIPLE actions (2-5) in EVERY interact() tool call!
900
-
901
- The 'interact' function accepts a LIST of 1-5 actions. ALWAYS provide 2-5 actions for efficiency.
902
-
903
- GOOD Examples (what you SHOULD do):
904
- ✓ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
905
- ✓ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate to stone and mine it")
906
- ✓ interact(actions=["place_table", "make_wood_pickaxe", "move_left"], reasoning="Craft and continue exploring")
907
-
908
- BAD Examples (what you should AVOID):
909
- ✗ interact(actions=["move_right"], reasoning="Move right") - TOO FEW ACTIONS!
910
- ✗ interact(actions=["do"], reasoning="Collect") - TOO FEW ACTIONS!
911
-
912
- REMEMBER: Single actions waste time. Always plan 2-5 actions ahead and execute them together!"""
913
-
914
- def format_observation(self, obs: dict[str, Any]) -> str:
915
- """Format observation for agent. Override in subclasses."""
916
- return str(obs)
917
-
918
-
919
- # --- Crafter-specific ReAct Agent ---
920
- class CrafterReActAgentWithLMSynth(BaseReActAgentWithLMSynth):
921
- """Crafter-specific ReAct agent with enhanced prompting for Synth models."""
922
-
923
- def get_system_message(self) -> str:
924
- """Return Crafter-specific system message optimized for Synth models."""
925
- override = os.getenv("CRAFTER_SYSTEM_PROMPT")
926
- if override:
927
- return override
928
- return """You are CrafterAgent playing Crafter survival environment. Your goal is to unlock as many achievements as possible while staying alive.
929
-
930
- You will see a semantic map view showing your surroundings. Use this to navigate toward resources.
931
-
932
- Key mechanics:
933
- • 'do' action: collect wood from trees, stone from deposits, food from cows/plants
934
- • 'do' does nothing on grass/water - move to find resources first
935
- • Craft progression: wood → table → wood_pickaxe → stone → stone_pickaxe → iron tools
936
- • Sleep when energy low to restore and unlock wake_up achievement
937
- • Use semantic map view to navigate toward resources you can see
938
-
939
- Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop
940
-
941
- KEY ACHIEVEMENTS TO UNLOCK:
942
- Basic Resource Collection (PRIORITY #1):
943
- - collect_wood: Move NEXT TO a tree, then use action="do" to collect wood
944
- - collect_stone: Move NEXT TO stone, then use action="do" (requires wood_pickaxe in inventory)
945
- - collect_coal: Move NEXT TO coal, then use action="do" (requires stone_pickaxe)
946
- - collect_iron: Move NEXT TO iron, then use action="do" (requires stone_pickaxe)
947
- - collect_diamond: Move NEXT TO diamond, then use action="do" (requires iron_pickaxe)
948
-
949
- Tool Crafting (enables resource collection):
950
- - make_wood_pickaxe: Use action="make_wood_pickaxe" when you have wood (unlocks ability to mine stone)
951
- - make_stone_pickaxe: Use action="make_stone_pickaxe" when you have wood and stone (unlocks coal/iron mining)
952
- - make_iron_pickaxe: Use action="make_iron_pickaxe" when you have wood, coal, and iron (unlocks diamond mining)
953
-
954
- Weapon Crafting (for defense):
955
- - make_wood_sword: Use action="make_wood_sword" when you have wood
956
- - make_stone_sword: Use action="make_stone_sword" when you have wood and stone
957
- - make_iron_sword: Use action="make_iron_sword" when you have wood, coal, and iron
958
-
959
- Survival Actions:
960
- - eat_plant: Use action="eat_plant" when food < 9 and you see a plant nearby
961
- - eat_cow: Move NEXT TO cow, use action="do" to kill it, then action="eat_cow"
962
- - collect_drink: Move NEXT TO water, then use action="drink" when drink < 9
963
- - sleep: Use action="sleep" when energy < 5 (restores energy to 9)
964
-
965
- Building/Placing:
966
- - place_table: Use action="place_table" when you have wood (enables advanced crafting)
967
- - place_furnace: Use action="place_furnace" when you have stone (for smelting)
968
- - place_plant: Use action="place_plant" when you have sapling (grows into tree)
969
- - place_stone: Use action="place_stone" when you have stone (creates barrier)
970
-
971
- Combat:
972
- - defeat_zombie: Move NEXT TO zombie, then use action="do" repeatedly to attack
973
- - defeat_skeleton: Move NEXT TO skeleton, then use action="do" repeatedly to attack
974
-
975
- CRITICAL: The action="do" is your INTERACTION button! Use it when adjacent to:
976
- - Trees → get wood
977
- - Stone/Coal/Iron/Diamond → mine resources (need appropriate pickaxe)
978
- - Enemies → attack them
979
- - Cows → kill for food
980
-
981
- Simple Strategy:
982
- 1. Look for resources (trees, stones) in the semantic map
983
- 2. Move toward the nearest resource
984
- 3. When adjacent to a resource, use action="do" to collect it
985
- 4. If you have wood, try action="make_wood_pickaxe"
986
- 5. Repeat: find resources, move to them, use "do"
987
-
988
- Critical Gameplay Tips:
989
- - You must be ADJACENT (one tile away) to objects to interact with them
990
- - Use "do" when next to: trees (for wood), stone (for stone), coal, iron, diamond
991
- - Use "do" to attack zombies/skeletons when adjacent
992
- - First priority: Find a tree, move next to it, then use "do" to collect wood
993
- - Wood is essential for crafting your first pickaxe
994
- - With wood_pickaxe you can mine stone, with stone_pickaxe you can mine iron, etc.
995
-
996
- CRITICAL INSTRUCTION: You MUST ALWAYS provide MULTIPLE actions (2-5) in EVERY interact() tool call!
997
-
998
- The 'interact' function accepts a LIST of 1-5 actions. NEVER use single actions - always plan 2-5 actions ahead!
999
-
1000
- MANDATORY action sequences (ALWAYS use multiple):
1001
- āœ“ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
1002
- āœ“ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate and collect")
1003
- āœ“ interact(actions=["place_table", "make_wood_pickaxe", "move_left", "move_left"], reasoning="Craft and explore")
1004
- āœ“ interact(actions=["do", "move_right", "do", "move_right", "do"], reasoning="Collect multiple resources")
1005
-
1006
- FORBIDDEN (NEVER do this):
1007
- āœ— interact(actions=["move_right"], ...) - WRONG! Too few actions!
1008
- āœ— interact(actions=["do"], ...) - WRONG! Too few actions!
1009
-
1010
- RULE: If you use fewer than 2 actions, you are playing inefficiently. Always think 2-5 steps ahead!
1011
-
1012
- Key Strategy:
1013
- 1. Plan a sequence of moves to reach resources
1014
- 2. Execute multiple moves in one tool call (e.g., ["move_right", "move_right", "move_up"])
1015
- 3. When adjacent to a resource, use "do" to collect it
1016
- 4. Chain crafting actions together (e.g., ["place_table", "make_wood_pickaxe"])
1017
-
1018
- Remember:
1019
- - Use "do" when ADJACENT to trees (for wood), stones, or other resources
1020
- - Collect wood FIRST before trying to craft anything
1021
- - Be efficient - use multiple actions per tool call!
1022
- - Focus on unlocking achievements by collecting resources and crafting items."""
1023
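For orientation, the tool call the prompt above asks for maps onto the decision dict consumed by run_episode below; a minimal sketch of a conforming call (values illustrative, the exact schema comes from the agent's tool definitions earlier in this file):

    example_decision = {
        "name": "interact",
        "parameters": {
            "actions": ["move_right", "move_right", "do"],  # 2-5 actions per call
            "reasoning": "Move to the tree and collect wood",
        },
    }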
-
1024
- def format_observation(self, obs: dict[str, Any]) -> str:
1025
- """Format Crafter observation with semantic map view."""
1026
- # Get semantic map view
1027
- semantic_view = format_semantic_map_view_v2(obs, view_size=7)
1028
-
1029
- # Extract key information
1030
- inventory = obs.get("inventory", {})
1031
- # Try both possible keys for achievements
1032
- achievements = obs.get("achievements_status", obs.get("achievements_info", {}))
1033
- health = obs.get("health", 10)
1034
- food = obs.get("food", 10)
1035
- drink = obs.get("drink", 10)
1036
- energy = obs.get("energy", 10)
1037
-
1038
- # Count achievements
1039
- achieved = sum(1 for v in achievements.values() if v)
1040
- total_achievements = len(achievements)
1041
-
1042
- # Format inventory (only show non-zero items)
1043
- inv_items = []
1044
- for item, count in inventory.items():
1045
- if count > 0:
1046
- inv_items.append(f"{item}: {count}")
1047
- inv_str = ", ".join(inv_items) if inv_items else "empty"
1048
-
1049
- # List unlocked achievements
1050
- unlocked = [k for k, v in achievements.items() if v]
1051
- unlocked_str = ", ".join(unlocked) if unlocked else "none"
1052
-
1053
- # Recent achievements (from info if available)
1054
- recent_str = ""
1055
-
1056
- suppress_reminder = os.getenv("CRAFTER_SUPPRESS_OBS_REMINDER")
1057
- base = (
1058
- f"=== SEMANTIC MAP VIEW (7x7) ===\n"
1059
- f"{semantic_view}\n\n"
1060
- f"=== STATUS ===\n"
1061
- f"Health: {health}/10 | Food: {food}/10 | Drink: {drink}/10 | Energy: {energy}/10\n"
1062
- f"Inventory: {inv_str}\n"
1063
- f"Achievements: {achieved}/{total_achievements} unlocked\n"
1064
- f"Unlocked: {unlocked_str}\n"
1065
- f"{recent_str}\n\n"
1066
- # f"What do you see in the map? What actions should you take? "
1067
- )
1068
- if suppress_reminder:
1069
- return base
1070
- return (
1071
- base
1072
- # + "\n\nREMINDER: You MUST provide 2-5 actions in your interact() tool call. Plan multiple steps ahead!\n"
1073
- # + 'Example: interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")'
1074
- )
1075
-
1076
-
1077
- async def run_episode(
1078
- episode_id: int,
1079
- config: CrafterConfig,
1080
- session_tracer: SessionTracer | None = None,
1081
- progress_bar: tqdm | None = None,
1082
- quiet: bool = False,
1083
- model_params: dict[str, Any] | None = None,
1084
- ):
1085
- """Run a single episode."""
1086
- episode_start_time = time.time()
1087
-
1088
- # Create agent - always disable verbose for cleaner output
1089
- agent = CrafterReActAgentWithLMSynth(
1090
- model_name=config.model_name,
1091
- max_turns=config.max_turns,
1092
- verbose=False, # Always disable verbose logging in agent
1093
- tracer=session_tracer,
1094
- episode_id=episode_id,
1095
- quiet=True, # Always use quiet mode for agent
1096
- model_params=model_params,
1097
- )
1098
-
1099
- # Initialize environment
1100
- async with AsyncClient(base_url=config.service_base_url) as client:
1101
- try:
1102
- # Initialize environment with unique seed for each episode
1103
- # Use simple sequential seeds: 1, 2, 3, 4, etc.
1104
- episode_seed = episode_id + 1 # Start from 1 instead of 0
1105
-
1106
- init_response = await retry_http_request(
1107
- client,
1108
- "POST",
1109
- "/env/CrafterClassic/initialize",
1110
- json={"config": {"difficulty": config.difficulty, "seed": episode_seed}},
1111
- )
1112
-
1113
- init_data = init_response.json()
1114
- instance_id = init_data["env_id"]
1115
- obs = init_data["observation"]
1116
-
1117
- # Start initial timestep and send initial observation as message
1118
- if session_tracer:
1119
- async with session_tracer.timestep("init", turn_number=0):
1120
- obs_msg = create_message(
1121
- compress_observation_for_trace(obs),
1122
- "observation",
1123
- f"crafter_env_{instance_id}",
1124
- 0,
1125
- )
1126
- await session_tracer.record_message(
1127
- content=obs_msg.content, message_type=obs_msg.message_type
1128
- )
1129
-
1130
- # Run episode
1131
- episode_reward = 0
1132
- termination_reason = None
1133
- step_results = []
1134
- consecutive_no_tool_calls = 0
1135
-
1136
- # Create progress bar for this episode
1137
- episode_progress = tqdm(
1138
- total=config.max_turns,
1139
- desc=f"Episode {episode_id}",
1140
- position=episode_id,
1141
- leave=True,
1142
- ncols=100,
1143
- )
1144
-
1145
- for turn in range(config.max_turns):
1146
- episode_progress.update(1)
1147
-
1148
- # Use timestep context for this turn
1149
- timestep_name = f"turn_{turn + 1}"
1150
- async with (
1151
- session_tracer.timestep(timestep_name, turn_number=turn + 1)
1152
- if session_tracer
1153
- else _noop_async_context()
1154
- ):
1155
- # Get agent decision
1156
- obs_formatted = agent.format_observation(obs)
1157
- system_msg = agent.get_system_message()
1158
-
1159
- decision = await agent.decide(obs_formatted, system_msg, turn)
1160
- # Update tqdm postfix with latest metrics from agent
1161
- try:
1162
- metrics = agent.agent_state.get("last_metrics")
1163
- if isinstance(metrics, dict):
1164
- episode_progress.set_postfix(metrics, refresh=False)
1165
- except Exception:
1166
- pass
1167
-
1168
- # Handle termination
1169
- if decision["name"] == "terminate":
1170
- termination_reason = decision["parameters"]["reason"]
1171
- break
1172
-
1173
- # Detect consecutive no-tool-call responses and abort after 3
1174
- decision_params = (
1175
- decision.get("parameters") if isinstance(decision, dict) else None
1176
- )
1177
- decision_actions = (
1178
- decision_params.get("actions", [])
1179
- if isinstance(decision_params, dict)
1180
- else []
1181
- )
1182
- if (
1183
- decision.get("name") == "interact"
1184
- and isinstance(decision_actions, list)
1185
- and len(decision_actions) == 0
1186
- ):
1187
- consecutive_no_tool_calls += 1
1188
- print(f"šŸ” DEBUG: consecutive_no_tool_calls={consecutive_no_tool_calls}")
1189
- else:
1190
- consecutive_no_tool_calls = 0
1191
- if consecutive_no_tool_calls >= 3:
1192
- # Gracefully end the episode without recording this problematic turn
1193
- termination_reason = "no_tool_calls_abort"
1194
- break
1195
-
1196
- # Execute actions in sequence
1197
- actions = (
1198
- decision["parameters"].get("actions", [])
1199
- if isinstance(decision.get("parameters"), dict)
1200
- else []
1201
- )
1202
-
1203
- # Ensure control variables are defined even if no actions are taken this turn
1204
- done = False
1205
- reward = 0.0
1206
- info = {}
1207
-
1208
- # Define action mapping
1209
- crafter_action_map = {
1210
- "noop": 0,
1211
- "move_left": 1,
1212
- "move_right": 2,
1213
- "move_up": 3,
1214
- "move_down": 4,
1215
- "do": 5,
1216
- "sleep": 6,
1217
- "place_stone": 7,
1218
- "place_table": 8,
1219
- "place_furnace": 9,
1220
- "place_plant": 10,
1221
- "make_wood_pickaxe": 11,
1222
- "make_stone_pickaxe": 12,
1223
- "make_iron_pickaxe": 13,
1224
- "make_wood_sword": 14,
1225
- "make_stone_sword": 15,
1226
- "make_iron_sword": 16,
1227
- }
1228
-
1229
- # Execute each action in the sequence (may be empty)
1230
- for action in actions:
1231
- # Convert action name to integer
1232
- action_int = crafter_action_map.get(action, 0) # Default to noop
1233
-
1234
- # Get state before action
1235
- state_before = {"observation": obs} if "obs" in locals() else {}
1236
- prev_obs = obs.copy()
1237
-
1238
- # Step environment
1239
- step_response = await retry_http_request(
1240
- client,
1241
- "POST",
1242
- "/env/CrafterClassic/step",
1243
- json={
1244
- "env_id": instance_id,
1245
- "action": {
1246
- "tool_calls": [
1247
- {"tool": "interact", "args": {"action": action_int}}
1248
- ]
1249
- },
1250
- },
1251
- )
1252
- step_data = step_response.json()
1253
-
1254
- # Check if response has expected structure
1255
- if "observation" not in step_data:
1256
- print(
1257
- f"\nāŒ Error: Missing observation in step response. Keys: {list(step_data.keys())}"
1258
- )
1259
- if "error" in step_data:
1260
- print(f" Error message: {step_data['error']}")
1261
- # Try to recover or break
1262
- break
1263
-
1264
- obs = step_data["observation"]
1265
- reward = step_data.get("reward", 0)  # Default to 0 if missing; None is handled below
1266
- done = step_data.get("done", False)  # Default to False if missing
1267
- info = step_data.get("info", {})
1268
-
1269
- # Calculate achievement reward if not provided by service
1270
- if (reward == 0 or reward is None) and (
1271
- "achievements_status" in obs and "achievements_status" in prev_obs
1272
- ):
1273
- prev_achievements = prev_obs["achievements_status"]
1274
- curr_achievements = obs["achievements_status"]
1275
- new_unlocks = sum(
1276
- 1
1277
- for k in curr_achievements
1278
- if curr_achievements.get(k) and not prev_achievements.get(k)
1279
- )
1280
- if new_unlocks > 0:
1281
- reward = float(new_unlocks) # +1 for each new achievement
1282
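A quick illustration of the fallback above (illustrative values):

    # prev_obs["achievements_status"] = {"collect_wood": True, "place_table": False, "make_wood_pickaxe": False}
    # obs["achievements_status"]      = {"collect_wood": True, "place_table": True,  "make_wood_pickaxe": True}
    # new_unlocks = 2  ->  reward = 2.0 (one point per newly unlocked achievement)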
-
1283
- if reward is not None:
1284
- episode_reward += reward
1285
-
1286
- # Record step result
1287
- step_results.append(
1288
- {
1289
- "turn": turn,
1290
- "action": action,
1291
- "reward": reward,
1292
- "done": done,
1293
- "info": info,
1294
- }
1295
- )
1296
-
1297
- # Record environment event for hooks to catch
1298
- if session_tracer:
1299
- # Create environment event with state transition
1300
- env_event = EnvironmentEvent(
1301
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1302
- system_instance_id=f"crafter_env_{instance_id}",
1303
- system_state_before={"public_state": prev_obs},
1304
- system_state_after={"public_state": obs},
1305
- reward=reward, # This now includes calculated achievement rewards
1306
- terminated=done,
1307
- metadata={"action": action, "action_int": action_int, "info": info},
1308
- )
1309
- await session_tracer.record_event(env_event)
1310
-
1311
- # Also record runtime event for invalid action detection
1312
- runtime_event = RuntimeEvent(
1313
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1314
- system_instance_id=f"crafter_runtime_{instance_id}",
1315
- actions=[action_int],
1316
- metadata={
1317
- "action_name": action,
1318
- "action_int": action_int,
1319
- "reward": reward,
1320
- "state_before": state_before,
1321
- "state_after": {"observation": obs},
1322
- },
1323
- )
1324
- await session_tracer.record_event(runtime_event)
1325
-
1326
- if done:
1327
- break
1328
-
1329
- # After all actions (or none), send final observation message
1330
- if session_tracer:
1331
- obs_msg = create_message(
1332
- compress_observation_for_trace(obs),
1333
- "observation",
1334
- f"crafter_env_{instance_id}",
1335
- turn + 1,
1336
- )
1337
- await session_tracer.record_message(
1338
- content=obs_msg.content, message_type=obs_msg.message_type
1339
- )
1340
-
1341
- if done:
1342
- break
1343
-
1344
- # Close progress bar
1345
- episode_progress.close()
1346
-
1347
- # Terminate instance
1348
- terminate_response = await retry_http_request(
1349
- client, "POST", "/env/CrafterClassic/terminate", json={"env_id": instance_id}
1350
- )
1351
-
1352
- except Exception as e:
1353
- if "episode_progress" in locals():
1354
- episode_progress.close()
1355
- print(f"\nāŒ Episode {episode_id} failed: {e}")
1356
- if config.verbose:
1357
- import traceback
1358
-
1359
- traceback.print_exc()
1360
- return {
1361
- "episode_id": episode_id,
1362
- "error": str(e),
1363
- "duration": time.time() - episode_start_time,
1364
- }
1365
-
1366
- # Extract final achievements
1367
- final_achievements = []
1368
- if obs and "achievements_status" in obs:
1369
- final_achievements = [k for k, v in obs["achievements_status"].items() if v]
1370
-
1371
- # Return results
1372
- return {
1373
- "episode_id": episode_id,
1374
- "total_reward": episode_reward,
1375
- "steps": len(step_results),
1376
- "termination_reason": termination_reason,
1377
- "duration": time.time() - episode_start_time,
1378
- "step_results": step_results,
1379
- "achievements_unlocked": final_achievements,
1380
- }
1381
-
1382
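For readers skimming the diff, a standalone invocation of run_episode looks roughly like the sketch below; it assumes the Crafter service is already running at config.service_base_url and that CrafterConfig (defined earlier in this file) accepts an optional config path. The model id is a placeholder.

    import asyncio

    async def _smoke_test() -> None:
        cfg = CrafterConfig(None)             # defaults; or CrafterConfig("path/to/config.toml")
        cfg.model_name = "Qwen/Qwen3-4B"      # placeholder model id
        cfg.max_turns = 5
        result = await run_episode(0, cfg, session_tracer=None, quiet=True)
        print(result.get("total_reward"), result.get("achievements_unlocked"))

    # asyncio.run(_smoke_test())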
-
1383
- # --- Main ---
1384
- async def main():
1385
- """Main entry point with v3 tracing."""
1386
- parser = argparse.ArgumentParser(description="Run Crafter evaluation with LM Synth backend")
1387
- parser.add_argument("--config", type=str, help="Path to TOML config file")
1388
- parser.add_argument("--model", type=str, help="Model name (overrides config)")
1389
- parser.add_argument("--episodes", type=int, help="Number of episodes (overrides config)")
1390
- parser.add_argument("--max-steps", type=int, help="Max steps per episode (overrides config)")
1391
- parser.add_argument(
1392
- "--difficulty", type=str, choices=["easy", "normal", "hard"], help="Difficulty override"
1393
- )
1394
- parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
1395
- parser.add_argument("--quiet", action="store_true", help="Suppress most output except results")
1396
- parser.add_argument("--no-traces", action="store_true", help="Disable trace saving")
1397
- parser.add_argument("--analyze", action="store_true", help="Analyze traces after running")
1398
- parser.add_argument("--skip-warmup", action="store_true", help="Skip model warmup")
1399
- parser.add_argument(
1400
- "--no-daemon",
1401
- action="store_true",
1402
- help="Don't start sqld daemon (assumes it's already running)",
1403
- )
1404
-
1405
- # Qwen3 thinking mode flags (mutually exclusive)
1406
- think_group = parser.add_mutually_exclusive_group()
1407
- think_group.add_argument(
1408
- "--think",
1409
- dest="enable_thinking",
1410
- action="store_true",
1411
- help="Enable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=True)",
1412
- )
1413
- think_group.add_argument(
1414
- "--no-think",
1415
- dest="enable_thinking",
1416
- action="store_false",
1417
- help="Disable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=False)",
1418
- )
1419
- parser.set_defaults(enable_thinking=None)
1420
-
1421
- # Model parameter arguments
1422
- parser.add_argument(
1423
- "--temperature",
1424
- type=float,
1425
- default=0.7,
1426
- help="Temperature for model responses (default: 0.7)",
1427
- )
1428
- parser.add_argument(
1429
- "--max-tokens", type=int, default=512, help="Maximum tokens to generate (default: 512)"
1430
- )
1431
- parser.add_argument(
1432
- "--top-p", type=float, default=1.0, help="Top-p sampling parameter (default: 1.0)"
1433
- )
1434
- parser.add_argument(
1435
- "--frequency-penalty", type=float, default=0.0, help="Frequency penalty (default: 0.0)"
1436
- )
1437
- parser.add_argument(
1438
- "--presence-penalty", type=float, default=0.0, help="Presence penalty (default: 0.0)"
1439
- )
1440
- parser.add_argument(
1441
- "--tool-choice",
1442
- type=str,
1443
- choices=["auto", "required", "none"],
1444
- default="auto",
1445
- help="Tool choice mode (default: auto)",
1446
- )
1447
-
1448
- args = parser.parse_args()
1449
-
1450
- # Load configuration
1451
- config = CrafterConfig(args.config)
1452
-
1453
- # Setup Synth environment variables
1454
- setup_synth_environment()
1455
-
1456
- # Clean up old files to keep directory clean
1457
- if config.auto_cleanup:
1458
- cleanup_old_files()
1459
-
1460
- # Apply command-line overrides
1461
- if args.model:
1462
- config.model_name = args.model
1463
- if args.episodes:
1464
- config.num_instances = args.episodes
1465
- if args.max_steps:
1466
- config.max_turns = args.max_steps
1467
- if args.difficulty:
1468
- config.difficulty = args.difficulty
1469
- if args.verbose:
1470
- config.verbose = True
1471
- if args.quiet:
1472
- config.quiet = True
1473
- if not args.verbose: # Don't show this if verbose is also on
1474
- print("šŸ”‡ Quiet mode enabled - suppressing verbose logs")
1475
- else:
1476
- config.quiet = False
1477
- if args.no_daemon:
1478
- config.start_sqld_daemon = False
1479
-
1480
- # Environment overrides for model parameters (fail-fast on bad values)
1481
- env_temp = os.getenv("CRAFTER_TEMPERATURE")
1482
- if env_temp is not None:
1483
- args.temperature = float(env_temp)
1484
- env_max_tok = os.getenv("CRAFTER_MAX_TOKENS")
1485
- if env_max_tok is not None:
1486
- args.max_tokens = int(env_max_tok)
1487
- env_tool_choice = os.getenv("CRAFTER_TOOL_CHOICE")
1488
- if env_tool_choice is not None:
1489
- if env_tool_choice not in {"auto", "required", "none"}:
1490
- raise ValueError(f"Invalid CRAFTER_TOOL_CHOICE: {env_tool_choice}")
1491
- args.tool_choice = env_tool_choice
1492
- env_top_p = os.getenv("CRAFTER_TOP_P")
1493
- if env_top_p is not None:
1494
- args.top_p = float(env_top_p)
1495
- env_freq_pen = os.getenv("CRAFTER_FREQUENCY_PENALTY")
1496
- if env_freq_pen is not None:
1497
- args.frequency_penalty = float(env_freq_pen)
1498
- env_pres_pen = os.getenv("CRAFTER_PRESENCE_PENALTY")
1499
- if env_pres_pen is not None:
1500
- args.presence_penalty = float(env_pres_pen)
1501
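Usage sketch for the overrides above (values illustrative; the script name is a placeholder):

    # CRAFTER_TEMPERATURE=0.2 CRAFTER_TOOL_CHOICE=required CRAFTER_MAX_TOKENS=1024 \
    #     python <this_script>.py --model <model-id> --episodes 3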
-
1502
- # Resolve stop-after-tool-calls from environment (wrapper sets this)
1503
- try:
1504
- _satc = int(os.getenv("CRAFTER_STOP_AFTER_TOOL_CALLS", "1"))
1505
- except Exception:
1506
- _satc = 1
1507
- _extra_body = {"stop_after_tool_calls": _satc} if _satc and _satc > 0 else {}
1508
-
1509
- # Create model parameters dictionary from command line arguments
1510
- model_params = {
1511
- "temperature": args.temperature,
1512
- "max_tokens": args.max_tokens,
1513
- "top_p": args.top_p,
1514
- "frequency_penalty": args.frequency_penalty,
1515
- "presence_penalty": args.presence_penalty,
1516
- "tool_choice": args.tool_choice,
1517
- # Request early stop after N tool call blocks to avoid spillover
1518
- "extra_body": _extra_body,
1519
- }
1520
- # Optionally carry thinking mode through to LM config
1521
- if args.enable_thinking is not None:
1522
- model_params["enable_thinking"] = args.enable_thinking
1523
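How these settings typically land in the request: with an OpenAI-style client, extra_body fields are merged into the JSON body alongside the standard sampling parameters. The exact plumbing lives in the LM class used by the agent earlier in this file; the sketch below is only illustrative, and stop_after_tool_calls is assumed to be a Synth-specific field.

    request_body = {
        "model": "Qwen/Qwen3-4B",                         # placeholder model id
        "messages": [{"role": "user", "content": "..."}],
        "temperature": 0.7,
        "max_tokens": 512,
        "top_p": 1.0,
        "tool_choice": "auto",
        "stop_after_tool_calls": 1,                       # assumed backend-specific field carried via extra_body
    }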
-
1524
- # Configure logging based on quiet mode
1525
- setup_logging(quiet_mode=config.quiet)
1526
-
1527
- # Display configuration (only if not in quiet mode)
1528
- if not config.quiet:
1529
- print("šŸŽ® Crafter ReAct Agent Evaluation (LM with Synth Backend - v3)")
1530
- print(f"Model: {config.model_name}")
1531
- print("Model Parameters:")
1532
- print(f" Temperature: {model_params['temperature']}")
1533
- print(f" Max Tokens: {model_params['max_tokens']}")
1534
- print(f" Top-p: {model_params['top_p']}")
1535
- print(f" Frequency Penalty: {model_params['frequency_penalty']}")
1536
- print(f" Presence Penalty: {model_params['presence_penalty']}")
1537
- print(f"Service: {config.service_base_url}")
1538
- print(f"Instances: {config.num_instances}")
1539
- print(f"Max Turns: {config.max_turns}")
1540
- print(f"Difficulty: {config.difficulty}")
1541
- print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1542
- print("=" * 50)
1543
-
1544
- if args.no_traces:
1545
- config.save_traces = False
1546
- config.enable_v3_tracing = False
1547
- if args.analyze:
1548
- config.analyze_traces = True
1549
- if args.skip_warmup:
1550
- config.warmup_model = False
1551
-
1552
- # Ensure model is specified
1553
- if not config.model_name:
1554
- parser.error("Model name must be specified via --model or config file")
1555
-
1556
- # Test service health
1557
- async with AsyncClient(base_url=config.service_base_url) as client:
1558
- try:
1559
- health_resp = await retry_http_request(client, "GET", "/health")
1560
- health_data = health_resp.json()
1561
- print(f"āœ… Crafter service is healthy: {health_data}")
1562
- except Exception as e:
1563
- print(f"āŒ Failed to connect to Crafter service: {e}")
1564
- return
1565
-
1566
- # Warm up the model if requested
1567
- if config.warmup_model and not args.skip_warmup:
1568
- print(f"\nšŸ”„ Warming up {config.model_name} on Synth backend...")
1569
- try:
1570
- synth_base_url = os.getenv("SYNTH_BASE_URL") # or os.getenv('MODAL_BASE_URL')
1571
- synth_api_key = os.getenv("SYNTH_API_KEY") # or os.getenv('MODAL_API_KEY')
1572
- if synth_base_url and synth_api_key:
1573
- synth_config = SynthConfig(
1574
- base_url=synth_base_url,
1575
- api_key=synth_api_key,
1576
- timeout=config.warmup_timeout, # Use configurable timeout
1577
- )
1578
- warmed = await warmup_synth_model(config.model_name, synth_config)
1579
- if warmed:
1580
- print("āœ… Model warmed up successfully!")
1581
- else:
1582
- print("āš ļø Warmup did not complete; continuing anyway...")
1583
- else:
1584
- print("āš ļø Missing SYNTH_BASE_URL or SYNTH_API_KEY, skipping warmup")
1585
- except Exception as e:
1586
- print(f"āš ļø Warmup failed: {e}")
1587
- print("Continuing anyway...")
1588
-
1589
- # Set up v3 tracing if enabled
1590
- trace_manager = None
1591
- experiment_ctx = None
1592
- sqld_daemon = None
1593
-
1594
- if config.enable_v3_tracing:
1595
- # Create trace directory first
1596
- os.makedirs(config.v3_trace_dir, exist_ok=True)
1597
-
1598
- # Start sqld daemon if requested
1599
- if config.start_sqld_daemon:
1600
- print("\nšŸš€ Starting sqld daemon for v3 tracing...")
1601
- sqld_daemon = SqldDaemon(db_path=config.turso_db_path)
1602
- sqld_daemon.__enter__() # Start the daemon
1603
- await asyncio.sleep(2) # Give it time to start
1604
- print("āœ… sqld daemon started")
1605
-
1606
- # Initialize trace manager with proper URL format
1607
- # If SQLD_DB_PATH is a directory managed by sqld, use its data file
1608
- _db_path = config.turso_db_path
1609
- if os.path.isdir(_db_path):
1610
- _candidate = os.path.join(_db_path, "dbs", "default", "data")
1611
- if os.path.exists(_candidate):
1612
- _db_path = _candidate
1613
- db_url = f"sqlite+aiosqlite:///{os.path.abspath(_db_path)}"
1614
- trace_manager = AsyncSQLTraceManager(db_url=db_url)
1615
- await trace_manager.initialize()
1616
-
1617
- # Create experiment context
1618
- experiment_ctx = await create_experiment_context(
1619
- db_manager=trace_manager,
1620
- experiment_name=f"crafter_lm_synth_{config.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
1621
- description=f"Crafter LM Synth experiment with {config.model_name} on {config.difficulty} difficulty, using LM class with v3 tracing",
1622
- )
1623
-
1624
- print(f"\nšŸ“Š V3 Tracing enabled. Traces will be saved to: {config.turso_db_path}")
1625
- print(f" Experiment: {experiment_ctx['experiment_name']}")
1626
-
1627
- # Run episodes with bounded concurrency using asyncio.Semaphore
1628
- # Control concurrency with env var CRAFTER_CONCURRENCY (default 5)
1629
- try:
1630
- _conc_str = os.getenv("CRAFTER_CONCURRENCY")
1631
- max_concurrency = int(_conc_str) if _conc_str else 5
1632
- except Exception:
1633
- max_concurrency = 5
1634
- concurrency_limiter = asyncio.Semaphore(max_concurrency)
1635
-
1636
- print(f"\nšŸš€ Running {config.num_instances} episodes (concurrency={max_concurrency})...")
1637
-
1638
- episode_seeds = [] # Track seeds used for each episode
1639
-
1640
- # Prepare episode tasks
1641
- episode_tasks = []
1642
- session_ids = []
1643
-
1644
- for i in range(config.num_instances):
1645
- # Calculate episode seed for logging (simple sequential: 1, 2, 3, etc)
1646
- episode_seed = i + 1
1647
- episode_seeds.append(episode_seed)
1648
-
1649
- # Create session tracer for this episode if v3 tracing is enabled
1650
- session_tracer = None
1651
- if config.enable_v3_tracing and trace_manager:
1652
- session_tracer = SessionTracer(hooks=QUIET_HOOKS) # Use quiet hooks
1653
- session_tracer.db = trace_manager # Use existing manager
1654
- session_tracer._initialized = True
1655
-
1656
- # Generate session ID
1657
- session_id = f"crafter_episode_{i}_{uuid.uuid4().hex[:8]}"
1658
- session_ids.append(session_id)
1659
-
1660
- # Create episode task with proper session context
1661
- async def run_episode_with_session(ep_id, cfg, tracer, pb, quiet, sess_id, model_params):
1662
- if tracer:
1663
- async with tracer.session(
1664
- session_id=sess_id,
1665
- metadata={
1666
- "episode_id": ep_id,
1667
- "experiment_id": experiment_ctx["experiment_id"]
1668
- if experiment_ctx
1669
- else None,
1670
- },
1671
- ):
1672
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1673
- else:
1674
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1675
-
1676
- # Freeze per-iteration values to avoid late-binding bugs in closures
1677
- this_tracer = session_tracer
1678
- this_session_id = session_ids[i] if session_ids else None
1679
-
1680
- async def _limited_episode(ep_idx=i, tracer=this_tracer, sess_id=this_session_id):
1681
- async with concurrency_limiter:
1682
- return await run_episode_with_session(
1683
- ep_idx, config, tracer, None, args.quiet, sess_id, model_params
1684
- )
1685
-
1686
- episode_task = _limited_episode()
1687
- episode_tasks.append(episode_task)
1688
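The default-argument freeze above guards against Python's late binding of closure variables; a minimal illustration:

    # funcs = [lambda: i for i in range(3)]       # all three return 2: i is looked up at call time
    # funcs = [lambda i=i: i for i in range(3)]   # returns 0, 1, 2: the value is captured per iteration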
-
1689
- print("\nšŸ“¤ Starting episodes...")
1690
- start_time = time.time()
1691
-
1692
- # Run all episodes in parallel and fail fast on first error
1693
- try:
1694
- results = await asyncio.gather(*episode_tasks, return_exceptions=False)
1695
- except Exception as e:
1696
- print(f"\nāŒ Run aborted due to error: {e}")
1697
- # Ensure resources are cleaned up before exiting
1698
- if trace_manager:
1699
- await trace_manager.close()
1700
- if sqld_daemon:
1701
- sqld_daemon.__exit__(None, None, None)
1702
- print("\nāœ… Stopped sqld daemon")
1703
- raise
1704
-
1705
- end_time = time.time()
1706
- parallel_time = end_time - start_time
1707
-
1708
- print(f"\nāœ… Completed {len(episode_tasks)} episodes in {parallel_time:.2f} seconds")
1709
-
1710
- # Process results (gather ran with return_exceptions=False, so exceptions propagate; the isinstance check below is defensive)
1711
- successful_results = []
1712
- failed_results = []
1713
-
1714
- for i, result in enumerate(results):
1715
- if isinstance(result, Exception):
1716
- print(f"āŒ Episode {i} failed: {result}")
1717
- failed_results.append({"episode_id": i, "error": str(result)})
1718
- else:
1719
- successful_results.append(result)
1720
-
1721
- # Link session to experiment if tracing enabled
1722
- if (
1723
- config.enable_v3_tracing
1724
- and trace_manager
1725
- and experiment_ctx
1726
- and i < len(session_ids)
1727
- ):
1728
- await trace_manager.link_session_to_experiment(
1729
- session_ids[i], experiment_ctx["experiment_id"]
1730
- )
1731
-
1732
- # Use successful results for analysis
1733
- results = successful_results + failed_results
1734
-
1735
- # Analyze results
1736
- print("\n" + "=" * 50)
1737
- print("šŸ“Š EVALUATION RESULTS")
1738
- print("=" * 50)
1739
-
1740
- successful_episodes = [r for r in results if "error" not in r]
1741
- failed_episodes = [r for r in results if "error" in r]
1742
-
1743
- if successful_episodes:
1744
- total_reward = sum(r["total_reward"] for r in successful_episodes)
1745
- total_steps = sum(r["steps"] for r in successful_episodes)
1746
- avg_reward = total_reward / len(successful_episodes)
1747
- avg_steps = total_steps / len(successful_episodes)
1748
-
1749
- print(f"Episodes completed: {len(successful_episodes)}/{config.num_instances}")
1750
- print(f"Failed episodes: {len(failed_episodes)}")
1751
- print(f"Total reward: {total_reward:.2f}")
1752
- print(f"Average reward per episode: {avg_reward:.2f}")
1753
- print(f"Total steps: {total_steps}")
1754
- print(f"Average steps per episode: {avg_steps:.2f}")
1755
-
1756
- # Show seeds used
1757
- if episode_seeds:
1758
- print("\nSeeds used:")
1759
- for i, seed in enumerate(episode_seeds[: len(successful_episodes)]):
1760
- print(f" Episode {i}: seed {seed}")
1761
-
1762
- # Extract unique achievements
1763
- all_achievements = set()
1764
- achievement_counts = defaultdict(int)
1765
-
1766
- for result in successful_episodes:
1767
- # Use the achievements_unlocked field we added
1768
- if "achievements_unlocked" in result:
1769
- for achievement in result["achievements_unlocked"]:
1770
- all_achievements.add(achievement)
1771
- achievement_counts[achievement] += 1
1772
-
1773
- # Extract and count all actions from successful episodes
1774
- action_counts = defaultdict(int)
1775
- total_actions = 0
1776
-
1777
- for result in successful_episodes:
1778
- if "step_results" in result:
1779
- for step in result["step_results"]:
1780
- if "action" in step:
1781
- action_counts[step["action"]] += 1
1782
- total_actions += 1
1783
-
1784
- print(f"Unique achievements unlocked: {len(all_achievements)}")
1785
- if all_achievements:
1786
- print("\nAchievements unlocked:")
1787
- for achievement, count in sorted(achievement_counts.items()):
1788
- print(
1789
- f" - {achievement}: {count} episodes ({count / len(successful_episodes) * 100:.1f}%)"
1790
- )
1791
-
1792
- # Display action counts
1793
- if action_counts:
1794
- print(f"\nAction counts (total: {total_actions}):")
1795
- for action, count in sorted(action_counts.items(), key=lambda x: x[1], reverse=True):
1796
- percentage = count / total_actions * 100 if total_actions > 0 else 0
1797
- print(f" - {action}: {count} ({percentage:.1f}%)")
1798
- else:
1799
- print("No successful episodes completed.")
1800
-
1801
- # Save detailed results
1802
- if config.save_detailed_results and config.enable_v3_tracing and trace_manager:
1803
- # For v3, results are automatically saved in the database
1804
- print(f"\nšŸ’¾ Results available in Turso database: {config.turso_db_path}")
1805
- print(f" Experiment ID: {experiment_ctx['experiment_id']}")
1806
- print(" Use the filter_traces_sft_turso.py script to extract fine-tuning data")
1807
- elif config.save_detailed_results:
1808
- # Fallback to JSON if no tracing - write under temp/ (git-ignored)
1809
- from pathlib import Path
1810
-
1811
- out_dir = Path(os.getenv("SYNTH_OUTPUT_DIR", "temp")).resolve()
1812
- out_dir.mkdir(parents=True, exist_ok=True)
1813
- results_path = (
1814
- out_dir / f"crafter_lm_synth_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
1815
- )
1816
- with open(results_path, "w") as f:
1817
- json.dump(
1818
- {
1819
- "config": {
1820
- "model": config.model_name,
1821
- "episodes": config.num_instances,
1822
- "max_steps": config.max_turns,
1823
- "difficulty": config.difficulty,
1824
- "backend": "synth",
1825
- "tracing": "v3",
1826
- },
1827
- "results": results,
1828
- "summary": {
1829
- "successful_episodes": len(successful_episodes),
1830
- "failed_episodes": len(failed_episodes),
1831
- "total_reward": total_reward if successful_episodes else 0,
1832
- "avg_reward": avg_reward if successful_episodes else 0,
1833
- "unique_achievements": list(all_achievements)
1834
- if successful_episodes
1835
- else [],
1836
- },
1837
- },
1838
- f,
1839
- indent=2,
1840
- )
1841
- print(f"\nšŸ’¾ Detailed results saved to: {results_path}")
1842
-
1843
- # Print a markdown row compatible with Environments/crafter.md tables
1844
- if successful_episodes:
1845
- # Columns: | model | trajectories | avg achievements | shaped reward | K-score | steps sum | avg steps |
1846
- model_label = config.model_name  # printed as-is in the markdown row
1847
- trajectories = len(successful_episodes)
1848
- avg_ach = avg_reward # our reward == achievements unlocked per episode
1849
-
1850
- # Compute weighted scores (shaped and K-Score) from final achievements across episodes
1851
- # K coefficients taken from crafter.md (representative weights)
1852
- k_weights = {
1853
- "collect_drink": 0.1,
1854
- "collect_sapling": 0.1,
1855
- "wake_up": 0.1,
1856
- "collect_wood": 1.0,
1857
- "collect_stone": 1.0,
1858
- "eat_cow": 1.0,
1859
- "defeat_zombie": 1.0,
1860
- "defeat_skeleton": 1.0,
1861
- "make_wood_pickaxe": 3.0,
1862
- "place_table": 3.0,
1863
- "collect_coal": 3.0,
1864
- "make_stone_pickaxe": 10.0,
1865
- "place_furnace": 10.0,
1866
- "collect_iron": 10.0,
1867
- "make_stone_sword": 10.0,
1868
- "make_wood_sword": 3.0,
1869
- "place_plant": 0.1,
1870
- }
1871
-
1872
- # Aggregate final achievements across successful episodes
1873
- from collections import Counter
1874
-
1875
- ach_counter: Counter[str] = Counter()
1876
- for ep in successful_episodes:
1877
- for name in ep.get("achievements_unlocked", []):
1878
- ach_counter[name] += 1
1879
-
1880
- shaped_total = 0.0
1881
- for name, count in ach_counter.items():
1882
- k = k_weights.get(name, 1.0)
1883
- shaped_total += k * count
1884
-
1885
- # Shaped reward per episode average
1886
- shaped_reward_avg = shaped_total / trajectories if trajectories > 0 else 0.0
1887
- k_score_avg = shaped_reward_avg / 20.0 # normalize roughly to match table scale
1888
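Worked example of the scoring above (numbers purely illustrative):

    # 3 successful episodes, each unlocking collect_wood (1.0), place_table (3.0), make_wood_pickaxe (3.0)
    # shaped_total      = 3 * (1.0 + 3.0 + 3.0) = 21.0
    # shaped_reward_avg = 21.0 / 3               = 7.0
    # k_score_avg       = 7.0 / 20.0             = 0.35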
-
1889
- # unique = len(all_achievements) # unused
1890
- steps_sum = total_steps
1891
- avg_steps_md = avg_steps
1892
- print("\nMarkdown row:")
1893
- print(
1894
- f"| {model_label:<15} | {trajectories:7d} | {avg_ach:8.2f} | {shaped_reward_avg:13.3f} | {k_score_avg:12.3f} | {steps_sum:12.3f} | {avg_steps_md:8.3f} |"
1895
- )
1896
-
1897
- # Cleanup
1898
- if trace_manager:
1899
- await trace_manager.close()
1900
-
1901
- if sqld_daemon:
1902
- sqld_daemon.__exit__(None, None, None)
1903
- print("\nāœ… Stopped sqld daemon")
1904
-
1905
-
1906
- if __name__ == "__main__":
1907
- asyncio.run(main())
1908
-
1909
-
1910
- # === SEMANTIC MAP VIEW (15x15) ===
1911
- # stone coal iron coal coal coal coal
1912
- # stone stone iron coal coal coal coal
1913
- # stone stone zombie coal coal iron iron
1914
- # stone stone stone you stone iron iron
1915
- # stone stone stone stone stone stone stone
1916
- # stone stone stone stone stone stone stone
1917
- # stone stone stone stone stone stone stone
1918
- # Visible items: coal, iron, stone, zombie
1919
-
1920
- # === STATUS ===
1921
- # Health: 10/10 | Food: 10/10 | Drink: 10/10 | Energy: 10/10
1922
- # Inventory: health: 9, food: 7, drink: 7, energy: 9, wood: 1, wood_pickaxe: 1
1923
- # Achievements: 4/22 unlocked
1924
- # Unlocked: collect_wood, make_wood_pickaxe, place_table, wake_up