synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (353)
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +23 -17
  3. examples/qwen_coder/README.md +102 -0
  4. examples/qwen_coder/_shared.py +113 -0
  5. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  6. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  7. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  8. examples/qwen_coder/generate_dataset.py +98 -0
  9. examples/qwen_coder/infer_ft_smoke.py +64 -0
  10. examples/qwen_coder/infer_prod_proxy.py +73 -0
  11. examples/qwen_coder/infer_via_synth.py +87 -0
  12. examples/qwen_coder/scripts/infer_coder.sh +18 -0
  13. examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
  14. examples/qwen_coder/sft_full_17b.py +103 -0
  15. examples/qwen_coder/sft_lora_30b.py +110 -0
  16. examples/qwen_coder/subset_jsonl.py +38 -0
  17. examples/qwen_coder/validate_jsonl.py +59 -0
  18. examples/rl/configs/eval_base_qwen.toml +1 -1
  19. examples/rl/configs/rl_from_base_qwen17.toml +1 -1
  20. examples/rl/download_dataset.py +26 -10
  21. examples/rl/run_eval.py +53 -52
  22. examples/rl/run_rl_and_save.py +29 -12
  23. examples/rl/task_app/math_single_step.py +180 -41
  24. examples/rl/task_app/math_task_app.py +14 -6
  25. examples/sft/README.md +139 -0
  26. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  27. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  28. examples/sft/evaluate.py +117 -0
  29. examples/sft/export_dataset.py +117 -0
  30. examples/sft/generate_traces.py +162 -0
  31. examples/swe/__init__.py +12 -0
  32. examples/swe/task_app/README.md +105 -0
  33. examples/swe/task_app/__init__.py +2 -0
  34. examples/swe/task_app/grpo_swe_mini.py +571 -0
  35. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  36. examples/swe/task_app/hosted/README.md +173 -0
  37. examples/swe/task_app/hosted/__init__.py +5 -0
  38. examples/swe/task_app/hosted/branching.py +143 -0
  39. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  40. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  41. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  42. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  43. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  44. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  45. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  46. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  47. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  50. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  51. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  52. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  53. examples/swe/task_app/hosted/hosted_app.py +204 -0
  54. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  55. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  56. examples/swe/task_app/hosted/main.py +100 -0
  57. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  58. examples/swe/task_app/hosted/registry.py +195 -0
  59. examples/swe/task_app/hosted/rollout.py +1869 -0
  60. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  61. examples/swe/task_app/hosted/storage/volume.py +211 -0
  62. examples/swe/task_app/hosted/test_agents.py +161 -0
  63. examples/swe/task_app/hosted/test_service.py +137 -0
  64. examples/swe/task_app/hosted/utils.py +62 -0
  65. examples/vlm/README.md +68 -0
  66. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  67. examples/vlm/crafter_image_only_agent.py +207 -0
  68. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  69. examples/vlm/filter_image_rows.py +63 -0
  70. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  71. examples/warming_up_to_rl/analyze_trace_db.py +12 -10
  72. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  73. examples/warming_up_to_rl/export_trace_sft.py +218 -36
  74. examples/warming_up_to_rl/groq_test.py +15 -8
  75. examples/warming_up_to_rl/manage_secrets.py +29 -25
  76. examples/warming_up_to_rl/readme.md +9 -2
  77. examples/warming_up_to_rl/run_eval.py +137 -61
  78. examples/warming_up_to_rl/run_fft_and_save.py +131 -60
  79. examples/warming_up_to_rl/run_local_rollout.py +88 -39
  80. examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
  81. examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
  82. examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
  83. examples/warming_up_to_rl/run_rl_and_save.py +35 -12
  84. examples/warming_up_to_rl/run_rollout_remote.py +44 -19
  85. examples/warming_up_to_rl/task_app/README.md +6 -2
  86. examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
  87. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  105. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
  106. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
  107. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
  108. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  109. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
  110. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  111. synth/__init__.py +14 -0
  112. synth_ai/__init__.py +20 -4
  113. synth_ai/api/models/supported.py +376 -0
  114. synth_ai/api/train/builders.py +157 -26
  115. synth_ai/api/train/cli.py +213 -57
  116. synth_ai/api/train/config_finder.py +65 -5
  117. synth_ai/api/train/env_resolver.py +33 -15
  118. synth_ai/api/train/pollers.py +13 -4
  119. synth_ai/api/train/supported_algos.py +139 -0
  120. synth_ai/api/train/task_app.py +5 -3
  121. synth_ai/api/train/utils.py +33 -48
  122. synth_ai/cli/__init__.py +19 -4
  123. synth_ai/cli/_modal_wrapper.py +28 -0
  124. synth_ai/cli/_typer_patch.py +49 -0
  125. synth_ai/cli/balance.py +2 -3
  126. synth_ai/cli/calc.py +1 -1
  127. synth_ai/cli/demo.py +21 -6
  128. synth_ai/cli/recent.py +2 -2
  129. synth_ai/cli/rl_demo.py +77 -17
  130. synth_ai/cli/root.py +116 -39
  131. synth_ai/cli/status.py +2 -2
  132. synth_ai/cli/task_apps.py +1709 -243
  133. synth_ai/cli/traces.py +7 -4
  134. synth_ai/cli/turso.py +73 -0
  135. synth_ai/cli/watch.py +12 -18
  136. synth_ai/core/experiment.py +0 -2
  137. synth_ai/demo_registry.py +68 -31
  138. synth_ai/demos/core/cli.py +516 -194
  139. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  140. synth_ai/demos/demo_task_apps/core.py +64 -28
  141. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
  142. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
  143. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  144. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  145. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
  146. synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
  147. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
  148. synth_ai/environments/examples/bandit/engine.py +12 -4
  149. synth_ai/environments/examples/bandit/taskset.py +4 -4
  150. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  151. synth_ai/environments/reproducibility/tree.py +5 -6
  152. synth_ai/environments/service/app.py +11 -12
  153. synth_ai/environments/service/core_routes.py +10 -9
  154. synth_ai/environments/stateful/engine.py +1 -1
  155. synth_ai/environments/tasks/core.py +1 -0
  156. synth_ai/environments/tasks/filters.py +5 -6
  157. synth_ai/environments/tasks/utils.py +4 -5
  158. synth_ai/evals/base.py +0 -2
  159. synth_ai/handshake.py +11 -9
  160. synth_ai/http.py +1 -1
  161. synth_ai/http_client.py +43 -11
  162. synth_ai/inference/__init__.py +0 -2
  163. synth_ai/inference/client.py +20 -6
  164. synth_ai/jobs/client.py +103 -78
  165. synth_ai/learning/__init__.py +41 -6
  166. synth_ai/learning/algorithms.py +14 -0
  167. synth_ai/learning/client.py +121 -29
  168. synth_ai/learning/config.py +2 -40
  169. synth_ai/learning/constants.py +0 -2
  170. synth_ai/learning/ft_client.py +4 -56
  171. synth_ai/learning/health.py +13 -7
  172. synth_ai/learning/jobs.py +43 -47
  173. synth_ai/{rl → learning/rl}/__init__.py +14 -5
  174. synth_ai/learning/rl/client.py +267 -0
  175. synth_ai/learning/rl/config.py +31 -0
  176. synth_ai/{rl → learning/rl}/contracts.py +5 -10
  177. synth_ai/{rl → learning/rl}/env_keys.py +45 -16
  178. synth_ai/learning/rl/secrets.py +13 -0
  179. synth_ai/learning/rl_client.py +2 -253
  180. synth_ai/learning/sft/__init__.py +29 -0
  181. synth_ai/learning/sft/client.py +68 -0
  182. synth_ai/learning/sft/config.py +270 -0
  183. synth_ai/learning/sft/data.py +295 -0
  184. synth_ai/learning/sse.py +25 -26
  185. synth_ai/learning/validators.py +25 -24
  186. synth_ai/lm/__init__.py +21 -47
  187. synth_ai/task/__init__.py +26 -27
  188. synth_ai/task/apps/__init__.py +18 -19
  189. synth_ai/task/auth.py +35 -23
  190. synth_ai/task/client.py +15 -13
  191. synth_ai/task/contracts.py +37 -35
  192. synth_ai/task/datasets.py +9 -6
  193. synth_ai/task/errors.py +11 -10
  194. synth_ai/task/health.py +17 -11
  195. synth_ai/task/json.py +58 -24
  196. synth_ai/task/proxy.py +15 -14
  197. synth_ai/task/rubrics.py +22 -15
  198. synth_ai/task/server.py +43 -17
  199. synth_ai/task/tracing_utils.py +12 -7
  200. synth_ai/task/validators.py +0 -1
  201. synth_ai/task/vendors.py +5 -7
  202. synth_ai/tracing_v3/__init__.py +2 -0
  203. synth_ai/tracing_v3/abstractions.py +21 -4
  204. synth_ai/tracing_v3/db_config.py +26 -1
  205. synth_ai/tracing_v3/decorators.py +18 -15
  206. synth_ai/tracing_v3/examples/basic_usage.py +3 -2
  207. synth_ai/tracing_v3/hooks.py +6 -4
  208. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  209. synth_ai/tracing_v3/replica_sync.py +1 -0
  210. synth_ai/tracing_v3/session_tracer.py +63 -16
  211. synth_ai/tracing_v3/storage/base.py +89 -1
  212. synth_ai/tracing_v3/storage/config.py +21 -8
  213. synth_ai/tracing_v3/storage/factory.py +10 -8
  214. synth_ai/tracing_v3/storage/utils.py +4 -2
  215. synth_ai/tracing_v3/turso/daemon.py +7 -2
  216. synth_ai/tracing_v3/turso/models.py +5 -2
  217. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  218. synth_ai/tracing_v3/utils.py +4 -3
  219. synth_ai/v0/api/__init__.py +8 -0
  220. synth_ai/v0/api/models/__init__.py +8 -0
  221. synth_ai/v0/api/models/supported.py +8 -0
  222. synth_ai/v0/config/__init__.py +15 -0
  223. synth_ai/v0/config/base_url.py +12 -0
  224. synth_ai/v0/lm/__init__.py +51 -0
  225. synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
  226. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  227. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  228. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  229. synth_ai/{lm → v0/lm}/config.py +6 -1
  230. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  231. synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
  232. synth_ai/{lm → v0/lm}/core/main.py +19 -7
  233. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  234. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
  235. synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
  236. synth_ai/{lm → v0/lm}/overrides.py +4 -4
  237. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  238. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  239. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  240. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  241. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
  242. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  243. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  244. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
  245. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
  246. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
  247. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
  248. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  249. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  250. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  251. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  252. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  253. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  254. synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
  255. synth_ai/v0/tracing/upload.py +32 -135
  256. synth_ai/v0/tracing_v3/__init__.py +10 -0
  257. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  258. synth_ai/v0/tracing_v3/decorators.py +3 -0
  259. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  260. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  261. synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
  262. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -264
  263. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
  264. examples/common_old/backend.py +0 -21
  265. examples/evals_old/README.md +0 -98
  266. examples/evals_old/__init__.py +0 -6
  267. examples/evals_old/compare_models.py +0 -1037
  268. examples/evals_old/example_log.md +0 -145
  269. examples/evals_old/run_demo.sh +0 -126
  270. examples/evals_old/trace_analysis.py +0 -270
  271. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  272. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  273. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  274. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
  275. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  276. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  277. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  278. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  279. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  280. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
  281. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  282. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  283. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
  284. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  285. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  286. examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
  287. examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
  288. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  289. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  290. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
  291. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
  292. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
  293. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  294. examples/finetuning_old/synth_qwen_v1/util.py +0 -147
  295. examples/rl_old/task_app.py +0 -962
  296. examples/warming_up_to_rl/old/event_rewards.md +0 -234
  297. examples/warming_up_to_rl/old/notes.md +0 -73
  298. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
  299. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  300. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  301. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  302. synth_ai/experimental/synth_oss.py +0 -446
  303. synth_ai/install_sqld.sh +0 -40
  304. synth_ai/learning/filtering.py +0 -0
  305. synth_ai/learning/offline/dpo.py +0 -0
  306. synth_ai/learning/offline/providers.py +0 -7
  307. synth_ai/learning/offline/sft.py +0 -0
  308. synth_ai/learning/offline/shared.py +0 -0
  309. synth_ai/learning/online/grpo.py +0 -0
  310. synth_ai/learning/online/irft.py +0 -0
  311. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  312. synth_ai/learning/prompts/gepa.py +0 -0
  313. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  314. synth_ai/learning/prompts/mipro.py +0 -289
  315. synth_ai/learning/prompts/random_search.py +0 -246
  316. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  317. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  318. synth_ai/rl/secrets.py +0 -19
  319. synth_ai/scripts/verify_rewards.py +0 -100
  320. synth_ai/tracing/__init__.py +0 -30
  321. synth_ai/tracing_v1/__init__.py +0 -33
  322. synth_ai/tracing_v3/turso/__init__.py +0 -25
  323. synth_ai/tracing_v3/turso/manager.py +0 -774
  324. synth_ai/zyk/__init__.py +0 -30
  325. synth_ai-0.2.9.dev4.dist-info/METADATA +0 -131
  326. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  327. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  328. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  329. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  330. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  331. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  332. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  333. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  334. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  335. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  336. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  337. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  338. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  339. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  340. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  341. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  342. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  343. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  344. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  345. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  346. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  347. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  348. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  349. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  350. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  351. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
  352. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
  353. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
@@ -1,1924 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Test script to run ReAct agents against Crafter environment using LM class with Synth backend.
4
- This demonstrates using the LM class with Synth models through native integration.
5
-
6
- This version uses the new tracing_v3 system with async Turso/SQLite backend.
7
- """
8
- import argparse
9
- import asyncio
10
- import contextlib
11
- from contextlib import asynccontextmanager
12
- import glob
13
- import itertools
14
- import json
15
- import logging
16
- import os
17
- import random
18
- import sys
19
- import time
20
- import uuid
21
- from collections import defaultdict
22
- from datetime import datetime
23
- from pathlib import Path
24
- from typing import Any
25
-
26
- import httpx
27
- import numpy as np
28
- import toml
29
- import yaml
30
- from httpx import AsyncClient
31
- from tqdm import tqdm
32
-
33
- # Disable httpx logging immediately
34
- logging.getLogger("httpx").setLevel(logging.ERROR)
35
- logging.getLogger("httpcore").setLevel(logging.ERROR)
36
-
37
-
38
- # Configure logging to suppress noisy third-party logs when in quiet mode
39
- def setup_logging(quiet_mode: bool = False):
40
- """Setup logging configuration."""
41
- if quiet_mode:
42
- # Suppress most third-party logging in quiet mode
43
- logging.getLogger("httpx").setLevel(logging.ERROR)
44
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.ERROR)
45
- logging.getLogger("synth_ai.tracing_v3.turso").setLevel(logging.ERROR)
46
- logging.getLogger("sqlalchemy").setLevel(logging.ERROR)
47
- logging.getLogger("aiosqlite").setLevel(logging.ERROR)
48
- # Suppress httpcore as well (used by httpx)
49
- logging.getLogger("httpcore").setLevel(logging.ERROR)
50
- else:
51
- # Normal logging levels
52
- logging.getLogger("httpx").setLevel(logging.ERROR) # Always suppress httpx logs
53
- logging.getLogger("synth_ai.tracing_v3").setLevel(logging.INFO)
54
-
55
-
56
- # Set default logging to avoid noisy logs during import
57
- setup_logging(quiet_mode=True)
58
-
59
- # Setup environment
60
- sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
61
-
62
- # Disable v1 logging to see v3 tracing clearly
63
- os.environ["LANGFUSE_ENABLED"] = "false"
64
- os.environ["SYNTH_LOGGING"] = "false"
65
-
66
- from synth_ai.lm.config import SynthConfig # noqa: E402
67
-
68
- # Import Synth warmup utilities
69
- from synth_ai.lm.warmup import warmup_synth_model # noqa: E402
70
-
71
- # Import session tracer for v3 tracing
72
- from synth_ai.tracing_v3 import SessionTracer # noqa: E402
73
- from synth_ai.tracing_v3.abstractions import ( # noqa: E402
74
- EnvironmentEvent,
75
- RuntimeEvent,
76
- SessionEventMarkovBlanketMessage,
77
- TimeRecord,
78
- )
79
-
80
- # Import Crafter hooks for v3
81
- from synth_ai.tracing_v3.hooks import HookManager # noqa: E402
82
- from synth_ai.tracing_v3.turso.daemon import SqldDaemon # noqa: E402
83
-
84
- # create_experiment_context will be defined as a helper function below
85
- from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager # noqa: E402
86
-
87
- # Create a custom hook manager without default print statements
88
- QUIET_HOOKS = HookManager()
89
-
90
- # Import LM components (v3 version if available)
91
- try:
92
- from synth_ai.lm.core.main_v3 import LM # noqa: E402
93
- except ImportError:
94
- from synth_ai.lm.core.main_v2 import LM # noqa: E402
95
-
96
- # Configuration constants
97
- HTTP_TIMEOUT = (
98
- 30.0 # Increased from 10.0 for better handling of concurrent load and LM response times
99
- )
100
- MAX_RETRIES = 3
101
- RETRY_DELAY = 1.0
102
-
103
- # Use the backend
104
- @asynccontextmanager
105
- async def _noop_async_context():
106
- yield
107
-
108
-
109
-
110
- async def create_experiment_context(
111
- db_manager: AsyncSQLTraceManager, experiment_name: str, description: str
112
- ) -> dict[str, Any]:
113
- """Create an experiment context for v3 tracing."""
114
- experiment_id = f"exp_{uuid.uuid4().hex[:12]}"
115
- await db_manager.create_experiment(
116
- experiment_id=experiment_id, name=experiment_name, description=description, configuration={}
117
- )
118
- return {
119
- "experiment_id": experiment_id,
120
- "experiment_name": experiment_name,
121
- "description": description,
122
- }
123
-
124
-
125
- def cleanup_old_files():
126
- """Clean up old trace files and result files to keep directory clean."""
127
- # Remove old JSON result files (keep only the latest 5)
128
- result_files = glob.glob("crafter_lm_synth_results_*.json")
129
- if len(result_files) > 5:
130
- # Sort by modification time and keep only the latest 5
131
- result_files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
132
- for old_file in result_files[5:]:
133
- try:
134
- os.remove(old_file)
135
- print(f"🗑️ Cleaned up old result file: {old_file}")
136
- except OSError:
137
- pass
138
-
139
-
140
- def _load_env_from_monorepo() -> dict:
141
- """Load environment variables from monorepo/.env.local if present."""
142
- env_file = (
143
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent / "monorepo/.env.local"
144
- )
145
- env_vars = {}
146
-
147
- if env_file.exists():
148
- with open(env_file) as f:
149
- for line in f:
150
- line = line.strip()
151
- if line and not line.startswith("#") and "=" in line:
152
- key, value = line.split("=", 1)
153
- # Remove quotes if present
154
- value = value.strip().strip('"').strip("'")
155
- env_vars[key] = value
156
-
157
- return env_vars
158
-
159
-
160
- def _load_testing_yaml_api_key() -> str | None:
161
- """Load SYNTH_API_KEY from monorepo/tests/prod/testing_info.yaml if present."""
162
- # First try the new env vars from monorepo/.env.local
163
- env_vars = _load_env_from_monorepo()
164
-
165
- # Try production key first, then test key
166
- if "SYNTH_API_KEY_PROD" in env_vars:
167
- return env_vars["SYNTH_API_KEY_PROD"]
168
- elif "SYNTH_API_KEY_TEST" in env_vars:
169
- return env_vars["SYNTH_API_KEY_TEST"]
170
-
171
- # Fallback to the old YAML method
172
- yaml_path = (
173
- Path(__file__).resolve().parent.parent.parent.parent.parent.parent
174
- / "monorepo/tests/prod/testing_info.yaml"
175
- )
176
- if yaml_path.exists():
177
- with open(yaml_path) as f:
178
- data = yaml.safe_load(f)
179
- return data.get("SYNTH_API_KEY")
180
- return None
181
-
182
-
183
- def setup_synth_environment():
184
- """Setup environment variables for Synth/Modal endpoints.
185
-
186
- Resolution order for the base URL:
187
- 1. Explicit environment variables (SYNTH_BASE_URL or MODAL_BASE_URL)
188
- 2. PROD_API_URL env var used in production integration tests
189
- 3. Hard-coded production constant (https://agent-learning.onrender.com)
190
-
191
- The API key is resolved from the matching *_API_KEY env vars or, if not
192
- present, from the shared testing_info.yaml used by the prod tests.
193
- """
194
- # Load environment variables from monorepo/.env.local
195
- env_vars = _load_env_from_monorepo()
196
-
197
- synth_base_url = (
198
- os.getenv("SYNTH_BASE_URL")
199
- or os.getenv("MODAL_BASE_URL")
200
- or os.getenv("PROD_API_URL")
201
- or env_vars.get("SYNTH_BASE_URL_PROD") # Use production URL from .env.local
202
- or "https://agent-learning.onrender.com/api"
203
- )
204
-
205
- synth_api_key = os.getenv("SYNTH_API_KEY") or _load_testing_yaml_api_key()
206
-
207
- # # --- Validate API key format ---
208
- # if synth_api_key:
209
- # VALID_PREFIXES = ("sk-", "sk_live_", "sk_test_")
210
- # if not any(synth_api_key.startswith(p) for p in VALID_PREFIXES):
211
- # truncated = synth_api_key[:8] if len(synth_api_key) >= 8 else synth_api_key
212
- # expected_formats = " or ".join(VALID_PREFIXES)
213
- # raise ValueError(
214
- # f"Invalid API key format. Expected prefix {expected_formats}. Provided key begins with '{truncated}'."
215
- # )
216
- # else:
217
- # raise ValueError(
218
- # "SYNTH_API_KEY or MODAL_API_KEY must be provided via environment variables or testing_info.yaml"
219
- # )
220
-
221
- # Ensure trailing /v1 for OpenAI-compatible endpoints
222
- if not synth_base_url.endswith("/v1"):
223
- synth_base_url = synth_base_url.rstrip("/") + "/v1"
224
- synth_base_url = synth_base_url.rstrip("/")
225
-
226
- # Propagate to OpenAI SDK env vars expected by LM class
227
- os.environ["OPENAI_API_BASE"] = synth_base_url
228
- os.environ["OPENAI_BASE_URL"] = synth_base_url
229
- os.environ["OPENAI_API_KEY"] = synth_api_key
230
-
231
- return synth_base_url, synth_api_key
232
-
233
-
234
- async def retry_http_request(client: AsyncClient, method: str, url: str, **kwargs) -> Any:
235
- """Retry HTTP requests with exponential backoff and jitter."""
236
- last_exception = None
237
-
238
- for attempt in range(MAX_RETRIES):
239
- try:
240
- if attempt > 0:
241
- delay = min(RETRY_DELAY * (2 ** (attempt - 1)), RETRY_DELAY * 2) # Use RETRY_DELAY
242
- jitter = random.uniform(0, 0.1 * delay)
243
- total_delay = delay + jitter
244
- await asyncio.sleep(total_delay)
245
-
246
- response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
247
-
248
- if response.status_code < 500:
249
- return response
250
-
251
- last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
252
-
253
- except httpx.ReadError as e:
254
- last_exception = e
255
- if attempt < MAX_RETRIES - 1:
256
- read_error_delay = min(1.0 * (2**attempt), 5.0)
257
- await asyncio.sleep(read_error_delay)
258
- except Exception as e:
259
- last_exception = e
260
-
261
- print(
262
- f" ❌ HTTP request failed after {MAX_RETRIES} attempts: {type(last_exception).__name__}: {str(last_exception)[:200]}"
263
- )
264
- raise last_exception
265
-
266
-
267
- def create_message(
268
- content: Any, message_type: str, origin_system_id: Any, turn: int
269
- ) -> SessionEventMarkovBlanketMessage:
270
- """Create a message with origin system ID embedded in content."""
271
- # Map custom message types to valid v3 message types
272
- type_mapping = {
273
- "observation": "system", # Map observation to system message
274
- "user": "user",
275
- "assistant": "assistant",
276
- "system": "system",
277
- "tool_use": "tool_use",
278
- "tool_result": "tool_result",
279
- }
280
-
281
- return SessionEventMarkovBlanketMessage(
282
- content=json.dumps({"origin_system_id": str(origin_system_id), "payload": content}),
283
- message_type=type_mapping.get(message_type, "system"), # Default to system
284
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
285
- )
286
-
287
-
288
- def compress_observation_for_trace(obs: dict[str, Any]) -> dict[str, Any]:
289
- """Compress observation for trace storage to avoid huge trace files."""
290
- compressed = obs.copy()
291
-
292
- # Compress semantic map if present
293
- if "semantic_map" in compressed:
294
- del compressed["semantic_map"]
295
-
296
- # Compress other large fields
297
- if "rgb" in compressed:
298
- del compressed["rgb"]
299
-
300
- return compressed
301
-
302
-
303
# Lazily built semantic-id -> name table read from the crafter library.
# Populated on first use by _crafter_id_to_item(); None until then.
_CRAFTER_ID_TO_ITEM = None


def _crafter_id_to_item() -> list[str]:
    """Return the semantic-id -> lowercase-name table from the crafter library.

    The table is read once from a throwaway ``crafter.Env`` and then reused,
    so callers do not pay for building an environment on every call.
    """
    global _CRAFTER_ID_TO_ITEM
    if _CRAFTER_ID_TO_ITEM is not None:
        return _CRAFTER_ID_TO_ITEM

    # Imported lazily so the module stays importable without crafter installed.
    import crafter

    dummyenv = crafter.Env()
    try:
        mat_ids = dummyenv._world._mat_ids
        obj_ids = dummyenv._sem_view._obj_ids
        max_id = max(max(mat_ids.values()), max(obj_ids.values())) + 1
        id_to_item = ["void"] * max_id
        for name, ind in itertools.chain(mat_ids.items(), obj_ids.items()):
            # Keys are either classes/functions (use __name__), plain values, or None.
            clean = (
                name.__name__
                if hasattr(name, "__name__")
                else (str(name) if name is not None else "none")
            )
            id_to_item[ind] = clean.lower()
    finally:
        # Best-effort cleanup: the env may not expose close() or may fail in it.
        with contextlib.suppress(Exception):
            dummyenv.close()

    _CRAFTER_ID_TO_ITEM = id_to_item
    return id_to_item


def format_semantic_map_view_v2(
    obs: dict[str, Any], view_size: int = 7, id_to_item: list[str] | None = None
) -> str:
    """Format a semantic map view around the player using readable item names.

    Args:
        obs: Observation dict. Reads ``semantic_map`` (2-D grid, or a flat
            sequence that is reshaped to a square) and optionally
            ``player_position``; the player defaults to the map centre.
        view_size: Side length of the square window rendered around the player.
        id_to_item: Optional id -> name table. When omitted, the table is taken
            from the crafter library (built once and cached).

    Returns:
        One text row per map row (space-separated names, ``you`` for the player,
        ``void`` outside the map) followed by a legend line listing the visible
        items other than grass. Returns ``"No semantic map available"`` when the
        observation has no semantic map.
    """
    semantic_map = obs.get("semantic_map")
    if semantic_map is None:
        return "No semantic map available"

    sem_arr = np.asarray(semantic_map)
    if sem_arr.ndim == 1:
        # Flat input: assume a square map and restore its 2-D shape.
        size = int(np.sqrt(sem_arr.size))
        sem_arr = sem_arr.reshape(size, size)

    player_pos = obs.get("player_position", [sem_arr.shape[0] // 2, sem_arr.shape[1] // 2])
    px, py = int(player_pos[0]), int(player_pos[1])

    if id_to_item is None:
        id_to_item = _crafter_id_to_item()

    half = view_size // 2
    lines = []
    visible_items = set()

    for dy in range(-half, half + 1):
        row = []
        for dx in range(-half, half + 1):
            x, y = px + dx, py + dy

            if dx == 0 and dy == 0:
                row.append("you")  # Player
            elif 0 <= x < sem_arr.shape[0] and 0 <= y < sem_arr.shape[1]:
                val = int(sem_arr[x, y])
                # Reject negative ids explicitly: a bare index would silently
                # wrap around and report an unrelated item.
                item_name = id_to_item[val] if 0 <= val < len(id_to_item) else f"unknown_{val}"
                row.append(item_name)
                if item_name not in ("grass", "you", "void"):
                    visible_items.add(item_name)
            else:
                row.append("void")  # Out of bounds

        lines.append(" ".join(row))

    legend = (
        f"Visible items: {', '.join(sorted(visible_items))}"
        if visible_items
        else "No special items visible (mostly grass)"
    )

    return "\n".join(lines) + "\n" + legend
376
-
377
-
378
def get_openai_tools():
    """Return the OpenAI-style tool schemas offered to Synth models.

    Two function tools are defined: ``interact`` (run a list of actions, with
    reasoning) and ``terminate`` (end the episode, with a reason). A fresh list
    is built on every call.
    """

    def function_tool(name, description, properties, required):
        # Wrap one function definition in the {"type": "function", ...} envelope.
        return {
            "type": "function",
            "function": {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required,
                },
            },
        }

    interact_properties = {
        "actions": {
            "type": "array",
            "items": {"type": "string"},
            "description": "List of actions to perform in sequence (e.g., ['move_right', 'move_right', 'do']). Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop",
        },
        "reasoning": {
            "type": "string",
            "description": "Reasoning for these actions",
        },
    }
    terminate_properties = {
        "reason": {"type": "string", "description": "Reason for termination"}
    }

    interact_tool = function_tool(
        "interact",
        "Perform actions in the Crafter environment.",
        interact_properties,
        ["actions", "reasoning"],
    )
    terminate_tool = function_tool(
        "terminate",
        "End the episode when finished or no progress can be made.",
        terminate_properties,
        ["reason"],
    )
    return [interact_tool, terminate_tool]
418
-
419
-
420
- # --- Configuration Class ---
421
class CrafterConfig:
    """Settings for a Crafter evaluation run against the Synth backend.

    Every option starts from a built-in default; the trace directory and DB
    path defaults can come from the ``SYNTH_TRACES_ROOT`` and ``SQLD_DB_PATH``
    environment variables. If *config_path* names an existing TOML file, values
    found there override the defaults.
    """

    def __init__(self, config_path: str | None = None):
        # --- Evaluation defaults ---
        self.model_name: str | None = None
        self.num_instances = 1
        self.max_turns = 2
        self.difficulty = "easy"
        self.seed = 42

        # --- Environment service ---
        self.service_base_url = "http://localhost:8901"
        self.service_timeout = 30.0

        # --- Output / logging ---
        self.save_traces = True
        self.save_detailed_results = True
        self.verbose = False
        self.quiet = False  # Quiet mode support
        self.analyze_traces = False

        # --- V3 tracing: one shared DB by default, overridable via env ---
        self.enable_v3_tracing = True
        trace_root = os.getenv("SYNTH_TRACES_ROOT", "./traces/v3")
        self.v3_trace_dir = trace_root
        shared_db_path = os.path.join(trace_root, "synth_ai.db")
        self.turso_db_path = os.getenv("SQLD_DB_PATH", shared_db_path)
        self.start_sqld_daemon = True  # Whether to start sqld daemon
        self.auto_cleanup = True  # Clean up old files automatically

        # --- Synth-specific settings ---
        self.warmup_model = True
        self.warmup_max_attempts = 30
        self.warmup_timeout = 60.0  # Default timeout in seconds
        self.use_synth_backend = True  # Flag to indicate Synth backend

        # A missing or empty path is ignored and the defaults above stand.
        if config_path and os.path.exists(config_path):
            self.load_from_toml(config_path)

    def load_from_toml(self, config_path: str):
        """Override current settings with values from the TOML file at *config_path*.

        Keys absent from the file leave the corresponding attribute unchanged.
        """
        config = toml.load(config_path)

        # (TOML section, key inside the section, attribute to set)
        field_map = (
            ("eval", "model_name", "model_name"),
            ("eval", "episodes", "num_instances"),
            ("eval", "max_steps", "max_turns"),
            ("eval", "difficulty", "difficulty"),
            ("eval", "seed", "seed"),
            ("service", "base_url", "service_base_url"),
            ("service", "timeout", "service_timeout"),
            ("output", "save_traces", "save_traces"),
            ("output", "save_detailed_results", "save_detailed_results"),
            ("tracing_v3", "enabled", "enable_v3_tracing"),
            ("tracing_v3", "trace_dir", "v3_trace_dir"),
            ("tracing_v3", "db_path", "turso_db_path"),
            ("tracing_v3", "start_daemon", "start_sqld_daemon"),
            ("tracing_v3", "auto_cleanup", "auto_cleanup"),
            ("synth", "warmup_model", "warmup_model"),
            ("synth", "warmup_max_attempts", "warmup_max_attempts"),
            ("synth", "warmup_timeout", "warmup_timeout"),
            ("synth", "use_synth_backend", "use_synth_backend"),
        )
        for section_name, key, attr in field_map:
            section = config.get(section_name, {})
            setattr(self, attr, section.get(key, getattr(self, attr)))
495
-
496
-
497
- # --- Base ReAct Agent using LM with Synth ---
498
class BaseReActAgentWithLMSynth:
    """Base ReAct agent that asks a Synth-backed ``LM`` for one tool call per turn.

    Subclasses customise prompting by overriding :meth:`get_system_message` and
    :meth:`format_observation`. Per-episode bookkeeping (message history,
    recent tool calls, last failure, last metrics) lives in ``agent_state``.
    """

    # Actions recognised by the free-text fallback parser. Mirrors the action
    # list advertised in the ``interact`` tool description, including ``noop``.
    _ACTION_KEYWORDS = (
        "move_up",
        "move_down",
        "move_left",
        "move_right",
        "do",
        "sleep",
        "place_stone",
        "place_table",
        "place_furnace",
        "place_plant",
        "make_wood_pickaxe",
        "make_stone_pickaxe",
        "make_iron_pickaxe",
        "make_wood_sword",
        "make_stone_sword",
        "make_iron_sword",
        "noop",
    )

    def __init__(
        self,
        model_name: str,
        max_turns: int = 20,
        verbose: bool = False,
        tracer: SessionTracer | None = None,
        episode_id: int = 0,
        quiet: bool = False,
        model_params: dict[str, Any] | None = None,
    ):
        """Create the agent and its underlying ``LM`` instance.

        Args:
            model_name: Model identifier passed to ``LM``.
            max_turns: Number of turns in an episode (used for prompts/state).
            verbose: Print request/response debug output.
            tracer: Optional session tracer forwarded to ``LM``.
            episode_id: Episode number embedded in each prompt.
            quiet: Stored on the instance; not read by this class.
            model_params: Sampling/tool parameters merged over the defaults.
        """
        self.model_name = model_name
        self.max_turns = max_turns
        self.verbose = verbose
        self.quiet = quiet
        self.history = []
        self.system_name = "base-react-agent-lm-synth"
        self.tools = get_openai_tools()
        self.tracer = tracer
        self.system_id = f"{self.system_name}_{uuid.uuid4()}"
        self.episode_id = episode_id

        # Default model parameters
        default_model_params = {
            "temperature": 0.7,
            "max_tokens": 512,
            "top_p": 1.0,
            "frequency_penalty": 0.0,
            "presence_penalty": 0.0,
            "tool_choice": "auto",
        }

        # User-provided parameters win over the defaults.
        self.model_params = {**default_model_params, **(model_params or {})}

        # Setup Synth environment variables
        setup_synth_environment()

        # Create LM instance with synth provider and configurable parameters
        self.lm = LM(
            model_name=model_name,
            formatting_model_name=model_name,
            temperature=self.model_params["temperature"],
            synth_logging=False,  # Disable v1 tracing
            provider="synth",  # Use synth provider
            session_tracer=tracer,
            system_id=self.system_id,
            enable_v3_tracing=True,
            # Pass additional model parameters
            max_tokens=self.model_params["max_tokens"],
            top_p=self.model_params["top_p"],
            frequency_penalty=self.model_params["frequency_penalty"],
            presence_penalty=self.model_params["presence_penalty"],
            # Qwen3 think mode (propagated by vendor to chat_template_kwargs)
            enable_thinking=self.model_params.get("enable_thinking"),
            # Forward arbitrary extra_body to vendor for features like
            # stop_after_tool_calls. The runner sets this to 1.
            extra_body=self.model_params.get("extra_body"),
        )

        # Agent state tracking
        self.agent_state = {
            "message_history": [],
            "steps_taken": 0,
            "steps_remaining": max_turns,
            "total_tokens_used": 0,
            "tool_calls_made": 0,
            "current_turn": 0,
            "last_failure": None,  # Track last failure for prompting
            "recent_tool_calls": [],
        }

    def _with_recent_tool_calls(self, obs: str) -> str:
        """Append a short summary of the last three tool calls to *obs*."""
        recent_calls = self.agent_state.get("recent_tool_calls", [])
        recent_tail = recent_calls[-3:] if isinstance(recent_calls, list) else []
        if not recent_tail:
            return obs

        lines = ["\nRecent tool calls (last 3):"]
        for entry in recent_tail:
            actions = entry.get("actions")
            actions_str = ", ".join(actions) if isinstance(actions, list) else ""
            lines.append(
                f"- Turn {entry.get('turn')}: {entry.get('name')} — reasoning: "
                f"{entry.get('reasoning')}; actions: {actions_str}"
            )
        return f"{obs}\n" + "\n".join(lines)

    def _log_request(
        self, messages: list[dict[str, Any]], system_message: str, context: str, turn: int
    ) -> None:
        """Print request debug output; only called when ``self.verbose`` is set."""
        # Show the full prompt on the final turn only.
        if turn == self.max_turns - 1:
            print("\n🔍 FINAL TURN PROMPT:")
            print("=" * 80)
            print(f"System: {system_message[:200]}...")
            print(f"\nUser message:\n{context}")
            print("=" * 80)

        print(f"\n🔍 DEBUG: LM call details (turn {turn})")
        print(f" Model: {self.model_name}")
        print(" Provider: synth")
        print(f" Messages: {len(messages)} messages")
        print(f" Tools: {len(self.tools) if self.tools else 0} tools")
        if self.tools:
            print(f" Tool 0 name: {self.tools[0].get('function', {}).get('name', 'unknown')}")
            print(f" Tools structure: {json.dumps(self.tools[0], indent=4)[:300]}...")
        print(
            f"🔍 DEBUG: LM sampling params => max_tokens={self.model_params.get('max_tokens')} temp={self.model_params.get('temperature')} top_p={self.model_params.get('top_p')} tool_choice={self.model_params.get('tool_choice')}"
        )

    @staticmethod
    def _decision_from_tool_call(tool_call: Any) -> dict[str, Any] | None:
        """Convert one tool call into ``{"name", "parameters"}``, or ``None``.

        Accepts the OpenAI shape (``{"function": {"name", "arguments"}}``, with
        arguments as a JSON string or a dict) and a flat ``{"name",
        "parameters"}`` shape.
        """
        if not isinstance(tool_call, dict):
            return None

        fn = tool_call.get("function")
        if isinstance(fn, dict) and "name" in fn:
            name = fn.get("name", "interact")
            args_raw = fn.get("arguments", "{}")
            try:
                args = json.loads(args_raw) if isinstance(args_raw, str) else (args_raw or {})
            except ValueError:
                # Malformed JSON: keep the raw text so the caller can inspect it.
                return {"name": name, "parameters": {"arguments": args_raw}}
            if isinstance(args, dict):
                return {"name": name, "parameters": args}

        if "name" in tool_call or "parameters" in tool_call:
            return {
                "name": tool_call.get("name", "interact"),
                "parameters": tool_call.get("parameters", {}),
            }
        return None

    def _remember_tool_call(self, decision: dict[str, Any], turn: int) -> None:
        """Record *decision* in ``recent_tool_calls``, keeping the last ten."""
        params = decision.get("parameters", {})
        params = params if isinstance(params, dict) else {}
        actions = params.get("actions")
        entry = {
            "turn": turn,
            "name": decision.get("name"),
            "reasoning": params.get("reasoning"),
            "actions": actions if isinstance(actions, list) else [],
        }
        recent = self.agent_state["recent_tool_calls"]
        recent.append(entry)
        self.agent_state["recent_tool_calls"] = recent[-10:]

    @staticmethod
    def _metrics(
        tool_calls: int,
        num_actions: int,
        completion_tokens: Any,
        prompt_tokens: Any,
        toks_per_sec: float | None,
    ) -> dict[str, Any]:
        """Build the metrics dict stored in ``agent_state["last_metrics"]``."""
        return {
            "tc": tool_calls,
            "act": num_actions,
            "tok": completion_tokens,
            "in": prompt_tokens,
            "tps": f"{toks_per_sec}" if toks_per_sec is not None else "-",
        }

    async def decide(self, obs: str, system_message: str, turn: int) -> dict[str, Any]:
        """Ask the LM for this turn's decision.

        Args:
            obs: Formatted observation text.
            system_message: System prompt for the model.
            turn: Zero-based turn index.

        Returns:
            ``{"name": ..., "parameters": ...}``. When the model returns no
            usable tool call, or the call raises, an ``interact`` decision with
            an empty action list is returned and the failure is stored in
            ``agent_state["last_failure"]`` so the next prompt can mention it.
        """
        state = self.agent_state
        state["current_turn"] = turn
        state["steps_taken"] = turn
        state["steps_remaining"] = self.max_turns - turn

        # The episode ID in the context keeps prompts unique, preventing caching.
        obs_with_history = self._with_recent_tool_calls(obs)
        context = (
            f"Episode {self.episode_id} - Turn {turn + 1}/{self.max_turns}\n\n{obs_with_history}"
        )

        # Augment the system message if the previous turn failed to produce a tool call
        local_system_message = system_message
        last_failure = state.get("last_failure")
        if last_failure:
            local_system_message = (
                f"{system_message}\n\nIMPORTANT: In the previous turn, no valid tool call was returned. "
                f"Error: {last_failure}. You MUST respond with a single function tool call in the OpenAI tools format."
            )
        messages = [
            {"role": "system", "content": local_system_message},
            {"role": "user", "content": context},
        ]

        state["message_history"].extend(messages)

        # Truncate history if too long: keep the first entry plus the newest ones.
        max_history_length = 20
        if len(state["message_history"]) > max_history_length:
            state["message_history"] = [state["message_history"][0]] + state["message_history"][
                -(max_history_length - 1) :
            ]

        # Initialised up front so the failure path can still report metrics.
        completion_tokens = None
        prompt_tokens = None
        toks_per_sec = None

        try:
            llm_start = time.time()

            if self.verbose:
                self._log_request(messages, local_system_message, context, turn)

            response = await self.lm.respond_async(
                messages=messages,
                turn_number=turn,
                # Pass tools in the format expected by LM class
                tools=self.tools,
                max_tokens=self.model_params["max_tokens"],
                tool_choice=self.model_params.get("tool_choice", "auto"),
                # Pass extra_body per call to ensure backend receives stop_after_tool_calls
                extra_body=self.model_params.get("extra_body"),
            )

            llm_end = time.time()

            usage = getattr(response, "usage", None)
            if isinstance(usage, dict):
                completion_tokens = usage.get("completion_tokens")
                prompt_tokens = usage.get("prompt_tokens")
                if completion_tokens is not None:
                    try:
                        duration_s = max(1e-6, (llm_end - llm_start))
                        toks_per_sec = round(float(completion_tokens) / duration_s, 2)
                    except (TypeError, ValueError):
                        toks_per_sec = None

            raw_response = response.raw_response

            tool_calls = getattr(response, "tool_calls", None)
            if tool_calls:
                parsed_decision = self._decision_from_tool_call(tool_calls[0])
                failure_msg = None if parsed_decision else "Unrecognized tool_calls structure"
            else:
                parsed_decision = None
                failure_msg = "No valid tool_calls in assistant message"

            if parsed_decision:
                decision = parsed_decision
                self._remember_tool_call(decision, turn)
                state["last_failure"] = None  # Clear failure flag on success
                state["tool_calls_made"] += 1  # Count only real tool calls
                params = decision.get("parameters", {})
                actions = params.get("actions", []) if isinstance(params, dict) else []
                num_actions = len(actions) if isinstance(actions, list) else 0
                tool_call_count = 1
            else:
                # Do nothing this turn and record the failure for the next prompt.
                state["last_failure"] = failure_msg
                decision = {
                    "name": "interact",
                    "parameters": {"actions": [], "reasoning": failure_msg},
                }
                num_actions = 0
                tool_call_count = 0
                if self.verbose:
                    print(f"🔍 DEBUG: {failure_msg}")

            # Store metrics for tqdm postfix update in run_episode. Set on every
            # path so a failed turn never shows the previous turn's numbers.
            state["last_metrics"] = self._metrics(
                tool_call_count, num_actions, completion_tokens, prompt_tokens, toks_per_sec
            )

            # Add assistant response to history
            state["message_history"].append({"role": "assistant", "content": raw_response})

            if self.verbose:
                print(f"🤖 LM Response (turn {turn}): {json.dumps(decision, indent=2)}")
                print(f"📊 Response time: {llm_end - llm_start:.2f}s")
        except Exception as e:
            # Deliberate best-effort boundary: one failed LM call must not abort
            # the episode, so report it and return a no-op decision.
            print(f"❌ Error in LM decide: {e}")
            import traceback

            traceback.print_exc()
            failure_msg = f"Exception during decide: {str(e)}"
            state["last_failure"] = failure_msg
            state["last_metrics"] = self._metrics(
                0, 0, completion_tokens, prompt_tokens, toks_per_sec
            )
            decision = {"name": "interact", "parameters": {"actions": [], "reasoning": failure_msg}}

        return decision

    def _parse_tool_response(self, raw_response: str) -> dict[str, Any]:
        """Extract a tool call from free-form LM text.

        Tries, in order: an embedded JSON object with a ``name`` or ``function``
        key; the word "terminate"; then known action names matched as whole
        words. Falls back to a single ``do`` action when nothing is recognised.
        """
        import re

        json_match = re.search(r"\{.*\}", raw_response, re.DOTALL)
        if json_match:
            try:
                data = json.loads(json_match.group())
                if "name" in data:
                    return data
                if "function" in data:
                    return {
                        "name": data["function"].get("name", "interact"),
                        "parameters": data["function"].get("arguments", {}),
                    }
            except (ValueError, AttributeError, TypeError):
                # Not usable JSON; fall through to text parsing.
                pass

        lowered = raw_response.lower()
        if "terminate" in lowered:
            return {"name": "terminate", "parameters": {"reason": "Agent decided to terminate"}}

        # Whole-word matching: a plain substring test would find "do" inside
        # "move_down", "undo" or "don't" and add a spurious action.
        actions = [
            keyword
            for keyword in self._ACTION_KEYWORDS
            if re.search(rf"\b{re.escape(keyword)}\b", lowered)
        ]
        if not actions:
            actions = ["do"]  # Default action

        return {
            "name": "interact",
            "parameters": {
                "actions": actions,  # Return as array of actions
                "reasoning": "Parsed from response",
            },
        }

    def get_system_message(self) -> str:
        """Return system message for agent. Override in subclasses."""
        return """You are an AI agent playing Crafter. Use the available tools to interact with the environment.

CRITICAL RULE: You MUST provide MULTIPLE actions (2-5) in EVERY interact() tool call!

The 'interact' function accepts a LIST of 1-5 actions. ALWAYS provide 2-5 actions for efficiency.

GOOD Examples (what you SHOULD do):
✓ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
✓ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate to stone and mine it")
✓ interact(actions=["place_table", "make_wood_pickaxe", "move_left"], reasoning="Craft and continue exploring")

BAD Examples (what you should AVOID):
✗ interact(actions=["move_right"], reasoning="Move right") - TOO FEW ACTIONS!
✗ interact(actions=["do"], reasoning="Collect") - TOO FEW ACTIONS!

REMEMBER: Single actions waste time. Always plan 2-5 actions ahead and execute them together!"""

    def format_observation(self, obs: dict[str, Any]) -> str:
        """Format observation for agent. Override in subclasses."""
        return str(obs)
916
-
917
-
918
- # --- Crafter-specific ReAct Agent ---
919
class CrafterReActAgentWithLMSynth(BaseReActAgentWithLMSynth):
    """Crafter-specific ReAct agent with enhanced prompting for Synth models."""

    def get_system_message(self) -> str:
        """Return the Crafter system prompt.

        The ``CRAFTER_SYSTEM_PROMPT`` environment variable, when set to a
        non-empty value, replaces the built-in prompt entirely.
        """
        override = os.getenv("CRAFTER_SYSTEM_PROMPT")
        if override:
            return override
        return """You are CrafterAgent playing Crafter survival environment. Your goal is to unlock as many achievements as possible while staying alive.

You will see a semantic map view showing your surroundings. Use this to navigate toward resources.

Key mechanics:
• 'do' action: collect wood from trees, stone from deposits, food from cows/plants
• 'do' does nothing on grass/water - move to find resources first
• Craft progression: wood → table → wood_pickaxe → stone → stone_pickaxe → iron tools
• Sleep when energy low to restore and unlock wake_up achievement
• Use semantic map view to navigate toward resources you can see

Available actions: move_left, move_right, move_up, move_down, do, sleep, place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword, noop

KEY ACHIEVEMENTS TO UNLOCK:
Basic Resource Collection (PRIORITY #1):
- collect_wood: Move NEXT TO a tree, then use action="do" to collect wood
- collect_stone: Move NEXT TO stone, then use action="do" (requires wood_pickaxe in inventory)
- collect_coal: Move NEXT TO coal, then use action="do" (requires stone_pickaxe)
- collect_iron: Move NEXT TO iron, then use action="do" (requires stone_pickaxe)
- collect_diamond: Move NEXT TO diamond, then use action="do" (requires iron_pickaxe)

Tool Crafting (enables resource collection):
- make_wood_pickaxe: Use action="make_wood_pickaxe" when you have wood (unlocks ability to mine stone)
- make_stone_pickaxe: Use action="make_stone_pickaxe" when you have wood and stone (unlocks coal/iron mining)
- make_iron_pickaxe: Use action="make_iron_pickaxe" when you have wood, coal, and iron (unlocks diamond mining)

Weapon Crafting (for defense):
- make_wood_sword: Use action="make_wood_sword" when you have wood
- make_stone_sword: Use action="make_stone_sword" when you have wood and stone
- make_iron_sword: Use action="make_iron_sword" when you have wood, coal, and iron

Survival Actions:
- eat_plant: Use action="eat_plant" when food < 9 and you see a plant nearby
- eat_cow: Move NEXT TO cow, use action="do" to kill it, then action="eat_cow"
- collect_drink: Move NEXT TO water, then use action="drink" when drink < 9
- sleep: Use action="sleep" when energy < 5 (restores energy to 9)

Building/Placing:
- place_table: Use action="place_table" when you have wood (enables advanced crafting)
- place_furnace: Use action="place_furnace" when you have stone (for smelting)
- place_plant: Use action="place_plant" when you have sapling (grows into tree)
- place_stone: Use action="place_stone" when you have stone (creates barrier)

Combat:
- defeat_zombie: Move NEXT TO zombie, then use action="do" repeatedly to attack
- defeat_skeleton: Move NEXT TO skeleton, then use action="do" repeatedly to attack

CRITICAL: The action="do" is your INTERACTION button! Use it when adjacent to:
- Trees → get wood
- Stone/Coal/Iron/Diamond → mine resources (need appropriate pickaxe)
- Enemies → attack them
- Cows → kill for food

Simple Strategy:
1. Look for resources (trees, stones) in the semantic map
2. Move toward the nearest resource
3. When adjacent to a resource, use action="do" to collect it
4. If you have wood, try action="make_wood_pickaxe"
5. Repeat: find resources, move to them, use "do"

Critical Gameplay Tips:
- You must be ADJACENT (one tile away) to objects to interact with them
- Use "do" when next to: trees (for wood), stone (for stone), coal, iron, diamond
- Use "do" to attack zombies/skeletons when adjacent
- First priority: Find a tree, move next to it, then use "do" to collect wood
- Wood is essential for crafting your first pickaxe
- With wood_pickaxe you can mine stone, with stone_pickaxe you can mine iron, etc.

CRITICAL INSTRUCTION: You MUST ALWAYS provide MULTIPLE actions (2-5) in EVERY interact() tool call!

The 'interact' function accepts a LIST of 1-5 actions. NEVER use single actions - always plan 2-5 actions ahead!

MANDATORY action sequences (ALWAYS use multiple):
✓ interact(actions=["move_right", "move_right", "do"], reasoning="Move to tree and collect wood")
✓ interact(actions=["move_up", "move_up", "move_right", "do"], reasoning="Navigate and collect")
✓ interact(actions=["place_table", "make_wood_pickaxe", "move_left", "move_left"], reasoning="Craft and explore")
✓ interact(actions=["do", "move_right", "do", "move_right", "do"], reasoning="Collect multiple resources")

FORBIDDEN (NEVER do this):
✗ interact(actions=["move_right"], ...) - WRONG! Too few actions!
✗ interact(actions=["do"], ...) - WRONG! Too few actions!

RULE: If you use less than 2 actions, you are playing inefficiently. Always think 2-5 steps ahead!

Key Strategy:
1. Plan a sequence of moves to reach resources
2. Execute multiple moves in one tool call (e.g., ["move_right", "move_right", "move_up"])
3. When adjacent to a resource, use "do" to collect it
4. Chain crafting actions together (e.g., ["place_table", "make_wood_pickaxe"])

Remember:
- Use "do" when ADJACENT to trees (for wood), stones, or other resources
- Collect wood FIRST before trying to craft anything
- Be efficient - use multiple actions per tool call!
- Focus on unlocking achievements by collecting resources and crafting items."""

    def format_observation(self, obs: dict[str, Any]) -> str:
        """Format a Crafter observation as a semantic map view plus a status block.

        Reads ``inventory``, ``achievements_status`` (falling back to
        ``achievements_info``), ``health``, ``food``, ``drink`` and ``energy``
        from *obs*; the four vitals default to 10 when absent. A ``None``
        inventory or achievements value is treated as empty.
        """
        semantic_view = format_semantic_map_view_v2(obs, view_size=7)

        # `or {}` guards against keys that are present but set to None.
        inventory = obs.get("inventory") or {}
        # Try both possible keys for achievements
        achievements = obs.get("achievements_status", obs.get("achievements_info", {})) or {}
        health = obs.get("health", 10)
        food = obs.get("food", 10)
        drink = obs.get("drink", 10)
        energy = obs.get("energy", 10)

        unlocked = [name for name, done in achievements.items() if done]
        achieved = len(unlocked)
        total_achievements = len(achievements)
        unlocked_str = ", ".join(unlocked) if unlocked else "none"

        # Only show items the player actually holds.
        inv_items = [f"{item}: {count}" for item, count in inventory.items() if count > 0]
        inv_str = ", ".join(inv_items) if inv_items else "empty"

        # The trailing blank lines are the slot where a "recent achievements"
        # line used to be interpolated; they are kept so the prompt text is
        # unchanged.
        return (
            f"=== SEMANTIC MAP VIEW (7x7) ===\n"
            f"{semantic_view}\n\n"
            f"=== STATUS ===\n"
            f"Health: {health}/10 | Food: {food}/10 | Drink: {drink}/10 | Energy: {energy}/10\n"
            f"Inventory: {inv_str}\n"
            f"Achievements: {achieved}/{total_achievements} unlocked\n"
            f"Unlocked: {unlocked_str}\n"
            "\n\n"
        )
1074
-
1075
-
1076
- async def run_episode(
1077
- episode_id: int,
1078
- config: CrafterConfig,
1079
- session_tracer: SessionTracer | None = None,
1080
- progress_bar: tqdm | None = None,
1081
- quiet: bool = False,
1082
- model_params: dict[str, Any] | None = None,
1083
- ):
1084
- """Run a single episode."""
1085
- episode_start_time = time.time()
1086
-
1087
- # Create agent - always disable verbose for cleaner output
1088
- agent = CrafterReActAgentWithLMSynth(
1089
- model_name=config.model_name,
1090
- max_turns=config.max_turns,
1091
- verbose=False, # Always disable verbose logging in agent
1092
- tracer=session_tracer,
1093
- episode_id=episode_id,
1094
- quiet=True, # Always use quiet mode for agent
1095
- model_params=model_params,
1096
- )
1097
-
1098
- # Initialize environment
1099
- async with AsyncClient(base_url=config.service_base_url) as client:
1100
- try:
1101
- # Initialize environment with unique seed for each episode
1102
- # Use simple sequential seeds: 1, 2, 3, 4, etc.
1103
- episode_seed = episode_id + 1 # Start from 1 instead of 0
1104
-
1105
- init_response = await retry_http_request(
1106
- client,
1107
- "POST",
1108
- "/env/CrafterClassic/initialize",
1109
- json={"config": {"difficulty": config.difficulty, "seed": episode_seed}},
1110
- )
1111
-
1112
- init_data = init_response.json()
1113
- instance_id = init_data["env_id"]
1114
- obs = init_data["observation"]
1115
-
1116
- # Start initial timestep and send initial observation as message
1117
- if session_tracer:
1118
- async with session_tracer.timestep("init", turn_number=0):
1119
- obs_msg = create_message(
1120
- compress_observation_for_trace(obs),
1121
- "observation",
1122
- f"crafter_env_{instance_id}",
1123
- 0,
1124
- )
1125
- await session_tracer.record_message(
1126
- content=obs_msg.content, message_type=obs_msg.message_type
1127
- )
1128
-
1129
- # Run episode
1130
- episode_reward = 0
1131
- termination_reason = None
1132
- step_results = []
1133
- consecutive_no_tool_calls = 0
1134
-
1135
- # Create progress bar for this episode
1136
- episode_progress = tqdm(
1137
- total=config.max_turns,
1138
- desc=f"Episode {episode_id}",
1139
- position=episode_id,
1140
- leave=True,
1141
- ncols=100,
1142
- )
1143
-
1144
- for turn in range(config.max_turns):
1145
- episode_progress.update(1)
1146
-
1147
- # Use timestep context for this turn
1148
- timestep_name = f"turn_{turn + 1}"
1149
- async with (
1150
- session_tracer.timestep(timestep_name, turn_number=turn + 1)
1151
- if session_tracer
1152
- else _noop_async_context()
1153
- ):
1154
- # Get agent decision
1155
- obs_formatted = agent.format_observation(obs)
1156
- system_msg = agent.get_system_message()
1157
-
1158
- decision = await agent.decide(obs_formatted, system_msg, turn)
1159
- # Update tqdm postfix with latest metrics from agent
1160
- try:
1161
- metrics = agent.agent_state.get("last_metrics")
1162
- if isinstance(metrics, dict):
1163
- episode_progress.set_postfix(metrics, refresh=False)
1164
- except Exception:
1165
- pass
1166
-
1167
- # Handle termination
1168
- if decision["name"] == "terminate":
1169
- termination_reason = decision["parameters"]["reason"]
1170
- break
1171
-
1172
- # Detect consecutive no-tool-call responses and abort after 3
1173
- decision_params = (
1174
- decision.get("parameters") if isinstance(decision, dict) else None
1175
- )
1176
- decision_actions = (
1177
- decision_params.get("actions", [])
1178
- if isinstance(decision_params, dict)
1179
- else []
1180
- )
1181
- if (
1182
- decision.get("name") == "interact"
1183
- and isinstance(decision_actions, list)
1184
- and len(decision_actions) == 0
1185
- ):
1186
- consecutive_no_tool_calls += 1
1187
- print(f"🔍 DEBUG: consecutive_no_tool_calls={consecutive_no_tool_calls}")
1188
- else:
1189
- consecutive_no_tool_calls = 0
1190
- if consecutive_no_tool_calls >= 3:
1191
- # Gracefully end the episode without recording this problematic turn
1192
- termination_reason = "no_tool_calls_abort"
1193
- break
1194
-
1195
- # Execute actions in sequence
1196
- actions = (
1197
- decision["parameters"].get("actions", [])
1198
- if isinstance(decision.get("parameters"), dict)
1199
- else []
1200
- )
1201
-
1202
- # Ensure control variables are defined even if no actions are taken this turn
1203
- done = False
1204
- reward = 0.0
1205
- info = {}
1206
-
1207
- # Define action mapping
1208
- crafter_action_map = {
1209
- "noop": 0,
1210
- "move_left": 1,
1211
- "move_right": 2,
1212
- "move_up": 3,
1213
- "move_down": 4,
1214
- "do": 5,
1215
- "sleep": 6,
1216
- "place_stone": 7,
1217
- "place_table": 8,
1218
- "place_furnace": 9,
1219
- "place_plant": 10,
1220
- "make_wood_pickaxe": 11,
1221
- "make_stone_pickaxe": 12,
1222
- "make_iron_pickaxe": 13,
1223
- "make_wood_sword": 14,
1224
- "make_stone_sword": 15,
1225
- "make_iron_sword": 16,
1226
- }
1227
-
1228
- # Execute each action in the sequence (may be empty)
1229
- for action in actions:
1230
- # Convert action name to integer
1231
- action_int = crafter_action_map.get(action, 0) # Default to noop
1232
-
1233
- # Get state before action
1234
- state_before = {"observation": obs} if "obs" in locals() else {}
1235
- prev_obs = obs.copy()
1236
-
1237
- # Step environment
1238
- step_response = await retry_http_request(
1239
- client,
1240
- "POST",
1241
- "/env/CrafterClassic/step",
1242
- json={
1243
- "env_id": instance_id,
1244
- "action": {
1245
- "tool_calls": [
1246
- {"tool": "interact", "args": {"action": action_int}}
1247
- ]
1248
- },
1249
- },
1250
- )
1251
- step_data = step_response.json()
1252
-
1253
- # Check if response has expected structure
1254
- if "observation" not in step_data:
1255
- print(
1256
- f"\n❌ Error: Missing observation in step response. Keys: {list(step_data.keys())}"
1257
- )
1258
- if "error" in step_data:
1259
- print(f" Error message: {step_data['error']}")
1260
- # Try to recover or break
1261
- break
1262
-
1263
- obs = step_data["observation"]
1264
- reward = step_data.get("reward", 0) # Default to 0 if None
1265
- done = step_data.get("done", False) # Default to False if None
1266
- info = step_data.get("info", {})
1267
-
1268
- # Calculate achievement reward if not provided by service
1269
- if (
1270
- (reward == 0 or reward is None)
1271
- and ("achievements_status" in obs and "achievements_status" in prev_obs)
1272
- ):
1273
- prev_achievements = prev_obs["achievements_status"]
1274
- curr_achievements = obs["achievements_status"]
1275
- new_unlocks = sum(
1276
- 1
1277
- for k in curr_achievements
1278
- if curr_achievements.get(k) and not prev_achievements.get(k)
1279
- )
1280
- if new_unlocks > 0:
1281
- reward = float(new_unlocks) # +1 for each new achievement
1282
-
1283
- if reward is not None:
1284
- episode_reward += reward
1285
-
1286
- # Record step result
1287
- step_results.append(
1288
- {
1289
- "turn": turn,
1290
- "action": action,
1291
- "reward": reward,
1292
- "done": done,
1293
- "info": info,
1294
- }
1295
- )
1296
-
1297
- # Record environment event for hooks to catch
1298
- if session_tracer:
1299
- # Create environment event with state transition
1300
- env_event = EnvironmentEvent(
1301
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1302
- system_instance_id=f"crafter_env_{instance_id}",
1303
- system_state_before={"public_state": prev_obs},
1304
- system_state_after={"public_state": obs},
1305
- reward=reward, # This now includes calculated achievement rewards
1306
- terminated=done,
1307
- metadata={"action": action, "action_int": action_int, "info": info},
1308
- )
1309
- await session_tracer.record_event(env_event)
1310
-
1311
- # Also record runtime event for invalid action detection
1312
- runtime_event = RuntimeEvent(
1313
- time_record=TimeRecord(event_time=time.time(), message_time=turn),
1314
- system_instance_id=f"crafter_runtime_{instance_id}",
1315
- actions=[action_int],
1316
- metadata={
1317
- "action_name": action,
1318
- "action_int": action_int,
1319
- "reward": reward,
1320
- "state_before": state_before,
1321
- "state_after": {"observation": obs},
1322
- },
1323
- )
1324
- await session_tracer.record_event(runtime_event)
1325
-
1326
- if done:
1327
- break
1328
-
1329
- # After all actions (or none), send final observation message
1330
- if session_tracer:
1331
- obs_msg = create_message(
1332
- compress_observation_for_trace(obs),
1333
- "observation",
1334
- f"crafter_env_{instance_id}",
1335
- turn + 1,
1336
- )
1337
- await session_tracer.record_message(
1338
- content=obs_msg.content, message_type=obs_msg.message_type
1339
- )
1340
-
1341
- if done:
1342
- break
1343
-
1344
- # Close progress bar
1345
- episode_progress.close()
1346
-
1347
- # Terminate instance
1348
- terminate_response = await retry_http_request(
1349
- client, "POST", "/env/CrafterClassic/terminate", json={"env_id": instance_id}
1350
- )
1351
-
1352
- except Exception as e:
1353
- if "episode_progress" in locals():
1354
- episode_progress.close()
1355
- print(f"\n❌ Episode {episode_id} failed: {e}")
1356
- if config.verbose:
1357
- import traceback
1358
-
1359
- traceback.print_exc()
1360
- return {
1361
- "episode_id": episode_id,
1362
- "error": str(e),
1363
- "duration": time.time() - episode_start_time,
1364
- }
1365
-
1366
- # Extract final achievements
1367
- final_achievements = []
1368
- if obs and "achievements_status" in obs:
1369
- final_achievements = [k for k, v in obs["achievements_status"].items() if v]
1370
-
1371
- # Return results
1372
- return {
1373
- "episode_id": episode_id,
1374
- "total_reward": episode_reward,
1375
- "steps": len(step_results),
1376
- "termination_reason": termination_reason,
1377
- "duration": time.time() - episode_start_time,
1378
- "step_results": step_results,
1379
- "achievements_unlocked": final_achievements,
1380
- }
1381
-
1382
-
1383
- # --- Main ---
1384
- async def main():
1385
- """Main entry point with v3 tracing."""
1386
- parser = argparse.ArgumentParser(description="Run Crafter evaluation with LM Synth backend")
1387
- parser.add_argument("--config", type=str, help="Path to TOML config file")
1388
- parser.add_argument("--model", type=str, help="Model name (overrides config)")
1389
- parser.add_argument("--episodes", type=int, help="Number of episodes (overrides config)")
1390
- parser.add_argument("--max-steps", type=int, help="Max steps per episode (overrides config)")
1391
- parser.add_argument(
1392
- "--difficulty", type=str, choices=["easy", "normal", "hard"], help="Difficulty override"
1393
- )
1394
- parser.add_argument("--verbose", action="store_true", help="Enable verbose output")
1395
- parser.add_argument("--quiet", action="store_true", help="Suppress most output except results")
1396
- parser.add_argument("--no-traces", action="store_true", help="Disable trace saving")
1397
- parser.add_argument("--analyze", action="store_true", help="Analyze traces after running")
1398
- parser.add_argument("--skip-warmup", action="store_true", help="Skip model warmup")
1399
- parser.add_argument(
1400
- "--no-daemon",
1401
- action="store_true",
1402
- help="Don't start sqld daemon (assumes it's already running)",
1403
- )
1404
-
1405
- # Qwen3 thinking mode flags (mutually exclusive)
1406
- think_group = parser.add_mutually_exclusive_group()
1407
- think_group.add_argument(
1408
- "--think",
1409
- dest="enable_thinking",
1410
- action="store_true",
1411
- help="Enable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=True)",
1412
- )
1413
- think_group.add_argument(
1414
- "--no-think",
1415
- dest="enable_thinking",
1416
- action="store_false",
1417
- help="Disable Qwen3 thinking mode (chat_template_kwargs.enable_thinking=False)",
1418
- )
1419
- parser.set_defaults(enable_thinking=None)
1420
-
1421
- # Model parameter arguments
1422
- parser.add_argument(
1423
- "--temperature",
1424
- type=float,
1425
- default=0.7,
1426
- help="Temperature for model responses (default: 0.7)",
1427
- )
1428
- parser.add_argument(
1429
- "--max-tokens", type=int, default=512, help="Maximum tokens to generate (default: 512)"
1430
- )
1431
- parser.add_argument(
1432
- "--top-p", type=float, default=1.0, help="Top-p sampling parameter (default: 1.0)"
1433
- )
1434
- parser.add_argument(
1435
- "--frequency-penalty", type=float, default=0.0, help="Frequency penalty (default: 0.0)"
1436
- )
1437
- parser.add_argument(
1438
- "--presence-penalty", type=float, default=0.0, help="Presence penalty (default: 0.0)"
1439
- )
1440
- parser.add_argument(
1441
- "--tool-choice",
1442
- type=str,
1443
- choices=["auto", "required", "none"],
1444
- default="auto",
1445
- help="Tool choice mode (default: auto)",
1446
- )
1447
-
1448
- args = parser.parse_args()
1449
-
1450
- # Load configuration
1451
- config = CrafterConfig(args.config)
1452
-
1453
- # Setup Synth environment variables
1454
- setup_synth_environment()
1455
-
1456
- # Clean up old files to keep directory clean
1457
- if config.auto_cleanup:
1458
- cleanup_old_files()
1459
-
1460
- # Apply command-line overrides
1461
- if args.model:
1462
- config.model_name = args.model
1463
- if args.episodes:
1464
- config.num_instances = args.episodes
1465
- if args.max_steps:
1466
- config.max_turns = args.max_steps
1467
- if args.difficulty:
1468
- config.difficulty = args.difficulty
1469
- if args.verbose:
1470
- config.verbose = True
1471
- if args.quiet:
1472
- config.quiet = True
1473
- if not args.verbose: # Don't show this if verbose is also on
1474
- print("🔇 Quiet mode enabled - suppressing verbose logs")
1475
- else:
1476
- config.quiet = False
1477
- if args.no_daemon:
1478
- config.start_sqld_daemon = False
1479
-
1480
- # Environment overrides for model parameters (fail-fast on bad values)
1481
- env_temp = os.getenv("CRAFTER_TEMPERATURE")
1482
- if env_temp is not None:
1483
- args.temperature = float(env_temp)
1484
- env_max_tok = os.getenv("CRAFTER_MAX_TOKENS")
1485
- if env_max_tok is not None:
1486
- args.max_tokens = int(env_max_tok)
1487
- env_tool_choice = os.getenv("CRAFTER_TOOL_CHOICE")
1488
- if env_tool_choice is not None:
1489
- if env_tool_choice not in {"auto", "required", "none"}:
1490
- raise ValueError(f"Invalid CRAFTER_TOOL_CHOICE: {env_tool_choice}")
1491
- args.tool_choice = env_tool_choice
1492
- env_top_p = os.getenv("CRAFTER_TOP_P")
1493
- if env_top_p is not None:
1494
- args.top_p = float(env_top_p)
1495
- env_freq_pen = os.getenv("CRAFTER_FREQUENCY_PENALTY")
1496
- if env_freq_pen is not None:
1497
- args.frequency_penalty = float(env_freq_pen)
1498
- env_pres_pen = os.getenv("CRAFTER_PRESENCE_PENALTY")
1499
- if env_pres_pen is not None:
1500
- args.presence_penalty = float(env_pres_pen)
1501
-
1502
- # Resolve stop-after-tool-calls from environment (wrapper sets this)
1503
- try:
1504
- _satc = int(os.getenv("CRAFTER_STOP_AFTER_TOOL_CALLS", "1"))
1505
- except Exception:
1506
- _satc = 1
1507
- _extra_body = {"stop_after_tool_calls": _satc} if _satc and _satc > 0 else {}
1508
-
1509
- # Create model parameters dictionary from command line arguments
1510
- model_params = {
1511
- "temperature": args.temperature,
1512
- "max_tokens": args.max_tokens,
1513
- "top_p": args.top_p,
1514
- "frequency_penalty": args.frequency_penalty,
1515
- "presence_penalty": args.presence_penalty,
1516
- "tool_choice": args.tool_choice,
1517
- # Request early stop after N tool call blocks to avoid spillover
1518
- "extra_body": _extra_body,
1519
- }
1520
- # Optionally carry thinking mode through to LM config
1521
- if args.enable_thinking is not None:
1522
- model_params["enable_thinking"] = args.enable_thinking
1523
-
1524
- # Configure logging based on quiet mode
1525
- setup_logging(quiet_mode=config.quiet)
1526
-
1527
- # Display configuration (only if not in quiet mode)
1528
- if not config.quiet:
1529
- print("🎮 Crafter ReAct Agent Evaluation (LM with Synth Backend - v3)")
1530
- print(f"Model: {config.model_name}")
1531
- print("Model Parameters:")
1532
- print(f" Temperature: {model_params['temperature']}")
1533
- print(f" Max Tokens: {model_params['max_tokens']}")
1534
- print(f" Top-p: {model_params['top_p']}")
1535
- print(f" Frequency Penalty: {model_params['frequency_penalty']}")
1536
- print(f" Presence Penalty: {model_params['presence_penalty']}")
1537
- print(f"Service: {config.service_base_url}")
1538
- print(f"Instances: {config.num_instances}")
1539
- print(f"Max Turns: {config.max_turns}")
1540
- print(f"Difficulty: {config.difficulty}")
1541
- print(f"Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
1542
- print("=" * 50)
1543
-
1544
- if args.no_traces:
1545
- config.save_traces = False
1546
- config.enable_v3_tracing = False
1547
- if args.analyze:
1548
- config.analyze_traces = True
1549
- if args.skip_warmup:
1550
- config.warmup_model = False
1551
-
1552
- # Ensure model is specified
1553
- if not config.model_name:
1554
- parser.error("Model name must be specified via --model or config file")
1555
-
1556
- # Test service health
1557
- async with AsyncClient(base_url=config.service_base_url) as client:
1558
- try:
1559
- health_resp = await retry_http_request(client, "GET", "/health")
1560
- health_data = health_resp.json()
1561
- print(f"✅ Crafter service is healthy: {health_data}")
1562
- except Exception as e:
1563
- print(f"❌ Failed to connect to Crafter service: {e}")
1564
- return
1565
-
1566
- # Warm up the model if requested
1567
- if config.warmup_model and not args.skip_warmup:
1568
- print(f"\n🔄 Warming up {config.model_name} on Synth backend...")
1569
- try:
1570
- synth_base_url = os.getenv("SYNTH_BASE_URL") # or os.getenv('MODAL_BASE_URL')
1571
- synth_api_key = os.getenv("SYNTH_API_KEY") # or os.getenv('MODAL_API_KEY')
1572
- if synth_base_url and synth_api_key:
1573
- synth_config = SynthConfig(
1574
- base_url=synth_base_url,
1575
- api_key=synth_api_key,
1576
- timeout=config.warmup_timeout, # Use configurable timeout
1577
- )
1578
- warmed = await warmup_synth_model(config.model_name, synth_config)
1579
- if warmed:
1580
- print("✅ Model warmed up successfully!")
1581
- else:
1582
- print("⚠️ Warmup did not complete; continuing anyway...")
1583
- else:
1584
- print("⚠️ Missing SYNTH_BASE_URL or SYNTH_API_KEY, skipping warmup")
1585
- except Exception as e:
1586
- print(f"⚠️ Warmup failed: {e}")
1587
- print("Continuing anyway...")
1588
-
1589
- # Set up v3 tracing if enabled
1590
- trace_manager = None
1591
- experiment_ctx = None
1592
- sqld_daemon = None
1593
-
1594
- if config.enable_v3_tracing:
1595
- # Create trace directory first
1596
- os.makedirs(config.v3_trace_dir, exist_ok=True)
1597
-
1598
- # Start sqld daemon if requested
1599
- if config.start_sqld_daemon:
1600
- print("\n🚀 Starting sqld daemon for v3 tracing...")
1601
- sqld_daemon = SqldDaemon(db_path=config.turso_db_path)
1602
- sqld_daemon.__enter__() # Start the daemon
1603
- await asyncio.sleep(2) # Give it time to start
1604
- print("✅ sqld daemon started")
1605
-
1606
- # Initialize trace manager with proper URL format
1607
- # If SQLD_DB_PATH is a directory managed by sqld, use its data file
1608
- _db_path = config.turso_db_path
1609
- if os.path.isdir(_db_path):
1610
- _candidate = os.path.join(_db_path, "dbs", "default", "data")
1611
- if os.path.exists(_candidate):
1612
- _db_path = _candidate
1613
- db_url = f"sqlite+aiosqlite:///{os.path.abspath(_db_path)}"
1614
- trace_manager = AsyncSQLTraceManager(db_url=db_url)
1615
- await trace_manager.initialize()
1616
-
1617
- # Create experiment context
1618
- experiment_ctx = await create_experiment_context(
1619
- db_manager=trace_manager,
1620
- experiment_name=f"crafter_lm_synth_{config.model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
1621
- description=f"Crafter LM Synth experiment with {config.model_name} on {config.difficulty} difficulty, using LM class with v3 tracing",
1622
- )
1623
-
1624
- print(f"\n📊 V3 Tracing enabled. Traces will be saved to: {config.turso_db_path}")
1625
- print(f" Experiment: {experiment_ctx['experiment_name']}")
1626
-
1627
- # Run episodes with bounded concurrency using asyncio.Semaphore
1628
- # Control concurrency with env var CRAFTER_CONCURRENCY (default 5)
1629
- try:
1630
- _conc_str = os.getenv("CRAFTER_CONCURRENCY")
1631
- max_concurrency = int(_conc_str) if _conc_str else 5
1632
- except Exception:
1633
- max_concurrency = 5
1634
- concurrency_limiter = asyncio.Semaphore(max_concurrency)
1635
-
1636
- print(f"\n🚀 Running {config.num_instances} episodes (concurrency={max_concurrency})...")
1637
-
1638
- episode_seeds = [] # Track seeds used for each episode
1639
-
1640
- # Prepare episode tasks
1641
- episode_tasks = []
1642
- session_ids = []
1643
-
1644
- for i in range(config.num_instances):
1645
- # Calculate episode seed for logging (simple sequential: 1, 2, 3, etc)
1646
- episode_seed = i + 1
1647
- episode_seeds.append(episode_seed)
1648
-
1649
- # Create session tracer for this episode if v3 tracing is enabled
1650
- session_tracer = None
1651
- if config.enable_v3_tracing and trace_manager:
1652
- session_tracer = SessionTracer(hooks=QUIET_HOOKS) # Use quiet hooks
1653
- session_tracer.db = trace_manager # Use existing manager
1654
- session_tracer._initialized = True
1655
-
1656
- # Generate session ID
1657
- session_id = f"crafter_episode_{i}_{uuid.uuid4().hex[:8]}"
1658
- session_ids.append(session_id)
1659
-
1660
- # Create episode task with proper session context
1661
- async def run_episode_with_session(ep_id, cfg, tracer, pb, quiet, sess_id, model_params):
1662
- if tracer:
1663
- async with tracer.session(
1664
- session_id=sess_id,
1665
- metadata={
1666
- "episode_id": ep_id,
1667
- "experiment_id": experiment_ctx["experiment_id"]
1668
- if experiment_ctx
1669
- else None,
1670
- },
1671
- ):
1672
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1673
- else:
1674
- return await run_episode(ep_id, cfg, tracer, pb, quiet, model_params)
1675
-
1676
- # Freeze per-iteration values to avoid late-binding bugs in closures
1677
- this_tracer = session_tracer
1678
- this_session_id = session_ids[i] if session_ids else None
1679
-
1680
- async def _limited_episode(ep_idx=i, tracer=this_tracer, sess_id=this_session_id):
1681
- async with concurrency_limiter:
1682
- return await run_episode_with_session(
1683
- ep_idx, config, tracer, None, args.quiet, sess_id, model_params
1684
- )
1685
-
1686
- episode_task = _limited_episode()
1687
- episode_tasks.append(episode_task)
1688
-
1689
- print("\n📤 Starting episodes...")
1690
- start_time = time.time()
1691
-
1692
- # Run all episodes in parallel and fail fast on first error
1693
- try:
1694
- results = await asyncio.gather(*episode_tasks, return_exceptions=False)
1695
- except Exception as e:
1696
- print(f"\n❌ Run aborted due to error: {e}")
1697
- # Ensure resources are cleaned up before exiting
1698
- if trace_manager:
1699
- await trace_manager.close()
1700
- if sqld_daemon:
1701
- sqld_daemon.__exit__(None, None, None)
1702
- print("\n✅ Stopped sqld daemon")
1703
- raise
1704
-
1705
- end_time = time.time()
1706
- parallel_time = end_time - start_time
1707
-
1708
- print(f"\n✅ Completed {len(episode_tasks)} episodes in {parallel_time:.2f} seconds")
1709
-
1710
- # Process results and handle any exceptions
1711
- successful_results = []
1712
- failed_results = []
1713
-
1714
- for i, result in enumerate(results):
1715
- if isinstance(result, Exception):
1716
- print(f"❌ Episode {i} failed: {result}")
1717
- failed_results.append({"episode_id": i, "error": str(result)})
1718
- else:
1719
- successful_results.append(result)
1720
-
1721
- # Link session to experiment if tracing enabled
1722
- if (
1723
- config.enable_v3_tracing
1724
- and trace_manager
1725
- and experiment_ctx
1726
- and i < len(session_ids)
1727
- ):
1728
- await trace_manager.link_session_to_experiment(
1729
- session_ids[i], experiment_ctx["experiment_id"]
1730
- )
1731
-
1732
- # Use successful results for analysis
1733
- results = successful_results + failed_results
1734
-
1735
- # Analyze results
1736
- print("\n" + "=" * 50)
1737
- print("📊 EVALUATION RESULTS")
1738
- print("=" * 50)
1739
-
1740
- successful_episodes = [r for r in results if "error" not in r]
1741
- failed_episodes = [r for r in results if "error" in r]
1742
-
1743
- if successful_episodes:
1744
- total_reward = sum(r["total_reward"] for r in successful_episodes)
1745
- total_steps = sum(r["steps"] for r in successful_episodes)
1746
- avg_reward = total_reward / len(successful_episodes)
1747
- avg_steps = total_steps / len(successful_episodes)
1748
-
1749
- print(f"Episodes completed: {len(successful_episodes)}/{config.num_instances}")
1750
- print(f"Failed episodes: {len(failed_episodes)}")
1751
- print(f"Total reward: {total_reward:.2f}")
1752
- print(f"Average reward per episode: {avg_reward:.2f}")
1753
- print(f"Total steps: {total_steps}")
1754
- print(f"Average steps per episode: {avg_steps:.2f}")
1755
-
1756
- # Show seeds used
1757
- if episode_seeds:
1758
- print("\nSeeds used:")
1759
- for i, seed in enumerate(episode_seeds[: len(successful_episodes)]):
1760
- print(f" Episode {i}: seed {seed}")
1761
-
1762
- # Extract unique achievements
1763
- all_achievements = set()
1764
- achievement_counts = defaultdict(int)
1765
-
1766
- for result in successful_episodes:
1767
- # Use the achievements_unlocked field we added
1768
- if "achievements_unlocked" in result:
1769
- for achievement in result["achievements_unlocked"]:
1770
- all_achievements.add(achievement)
1771
- achievement_counts[achievement] += 1
1772
-
1773
- # Extract and count all actions from successful episodes
1774
- action_counts = defaultdict(int)
1775
- total_actions = 0
1776
-
1777
- for result in successful_episodes:
1778
- if "step_results" in result:
1779
- for step in result["step_results"]:
1780
- if "action" in step:
1781
- action_counts[step["action"]] += 1
1782
- total_actions += 1
1783
-
1784
- print(f"Unique achievements unlocked: {len(all_achievements)}")
1785
- if all_achievements:
1786
- print("\nAchievements unlocked:")
1787
- for achievement, count in sorted(achievement_counts.items()):
1788
- print(
1789
- f" - {achievement}: {count} episodes ({count / len(successful_episodes) * 100:.1f}%)"
1790
- )
1791
-
1792
- # Display action counts
1793
- if action_counts:
1794
- print(f"\nAction counts (total: {total_actions}):")
1795
- for action, count in sorted(action_counts.items(), key=lambda x: x[1], reverse=True):
1796
- percentage = count / total_actions * 100 if total_actions > 0 else 0
1797
- print(f" - {action}: {count} ({percentage:.1f}%)")
1798
- else:
1799
- print("No successful episodes completed.")
1800
-
1801
- # Save detailed results
1802
- if config.save_detailed_results and config.enable_v3_tracing and trace_manager:
1803
- # For v3, results are automatically saved in the database
1804
- print(f"\n💾 Results available in Turso database: {config.turso_db_path}")
1805
- print(f" Experiment ID: {experiment_ctx['experiment_id']}")
1806
- print(" Use the filter_traces_sft_turso.py script to extract fine-tuning data")
1807
- elif config.save_detailed_results:
1808
- # Fallback to JSON if no tracing - write under temp/ (git-ignored)
1809
- from pathlib import Path
1810
-
1811
- out_dir = Path(os.getenv("SYNTH_OUTPUT_DIR", "temp")).resolve()
1812
- out_dir.mkdir(parents=True, exist_ok=True)
1813
- results_path = (
1814
- out_dir / f"crafter_lm_synth_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
1815
- )
1816
- with open(results_path, "w") as f:
1817
- json.dump(
1818
- {
1819
- "config": {
1820
- "model": config.model_name,
1821
- "episodes": config.num_instances,
1822
- "max_steps": config.max_turns,
1823
- "difficulty": config.difficulty,
1824
- "backend": "synth",
1825
- "tracing": "v3",
1826
- },
1827
- "results": results,
1828
- "summary": {
1829
- "successful_episodes": len(successful_episodes),
1830
- "failed_episodes": len(failed_episodes),
1831
- "total_reward": total_reward if successful_episodes else 0,
1832
- "avg_reward": avg_reward if successful_episodes else 0,
1833
- "unique_achievements": list(all_achievements)
1834
- if successful_episodes
1835
- else [],
1836
- },
1837
- },
1838
- f,
1839
- indent=2,
1840
- )
1841
- print(f"\n💾 Detailed results saved to: {results_path}")
1842
-
1843
- # Print a markdown row compatible with Environments/crafter.md tables
1844
- if successful_episodes:
1845
- # Columns: | model | trajectories | avg achievements | adj score | unique | steps sum | avg steps |
1846
- model_label = config.model_name.replace("/", "/")
1847
- trajectories = len(successful_episodes)
1848
- avg_ach = avg_reward # our reward == achievements unlocked per episode
1849
-
1850
- # Compute weighted scores (shaped and K-Score) from final achievements across episodes
1851
- # K coefficients taken from crafter.md (representative weights)
1852
- k_weights = {
1853
- "collect_drink": 0.1,
1854
- "collect_sapling": 0.1,
1855
- "wake_up": 0.1,
1856
- "collect_wood": 1.0,
1857
- "collect_stone": 1.0,
1858
- "eat_cow": 1.0,
1859
- "defeat_zombie": 1.0,
1860
- "defeat_skeleton": 1.0,
1861
- "make_wood_pickaxe": 3.0,
1862
- "place_table": 3.0,
1863
- "collect_coal": 3.0,
1864
- "make_stone_pickaxe": 10.0,
1865
- "place_furnace": 10.0,
1866
- "collect_iron": 10.0,
1867
- "make_stone_sword": 10.0,
1868
- "make_wood_sword": 3.0,
1869
- "place_plant": 0.1,
1870
- }
1871
-
1872
- # Aggregate final achievements across successful episodes
1873
- from collections import Counter
1874
-
1875
- ach_counter: Counter[str] = Counter()
1876
- for ep in successful_episodes:
1877
- for name in ep.get("achievements_unlocked", []):
1878
- ach_counter[name] += 1
1879
-
1880
- shaped_total = 0.0
1881
- for name, count in ach_counter.items():
1882
- k = k_weights.get(name, 1.0)
1883
- shaped_total += k * count
1884
-
1885
- # Shaped reward per episode average
1886
- shaped_reward_avg = shaped_total / trajectories if trajectories > 0 else 0.0
1887
- k_score_avg = shaped_reward_avg / 20.0 # normalize roughly to match table scale
1888
-
1889
- # unique = len(all_achievements) # unused
1890
- steps_sum = total_steps
1891
- avg_steps_md = avg_steps
1892
- print("\nMarkdown row:")
1893
- print(
1894
- f"| {model_label:<15} | {trajectories:7d} | {avg_ach:8.2f} | {shaped_reward_avg:13.3f} | {k_score_avg:12.3f} | {steps_sum:12.3f} | {avg_steps_md:8.3f} |"
1895
- )
1896
-
1897
- # Cleanup
1898
- if trace_manager:
1899
- await trace_manager.close()
1900
-
1901
- if sqld_daemon:
1902
- sqld_daemon.__exit__(None, None, None)
1903
- print("\n✅ Stopped sqld daemon")
1904
-
1905
-
1906
- if __name__ == "__main__":
1907
- asyncio.run(main())
1908
-
1909
-
1910
- # === SEMANTIC MAP VIEW (15x15) ===
1911
- # stone coal iron coal coal coal coal
1912
- # stone stone iron coal coal coal coal
1913
- # stone stone zombie coal coal iron iron
1914
- # stone stone stone you stone iron iron
1915
- # stone stone stone stone stone stone stone
1916
- # stone stone stone stone stone stone stone
1917
- # stone stone stone stone stone stone stone
1918
- # Visible items: coal, iron, stone, zombie
1919
-
1920
- # === STATUS ===
1921
- # Health: 10/10 | Food: 10/10 | Drink: 10/10 | Energy: 10/10
1922
- # Inventory: health: 9, food: 7, drink: 7, energy: 9, wood: 1, wood_pickaxe: 1
1923
- # Achievements: 4/22 unlocked
1924
- # Unlocked: collect_wood, make_wood_pickaxe, place_table, wake_up