synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,415 @@
1
+ # Pokemon Red Image-Only Evaluation Guide
2
+
3
+ This guide shows you how to run Pokemon Red evaluations with **image-only input** (no text observations) and save traces and rewards to a **Turso database**.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. **OpenAI API Key**: Set `OPENAI_API_KEY` in your `.env` file
8
+ 2. **UV Package Manager**: Already installed if you can run `uv run`
9
+ 3. **Pokemon Red ROM**: Place in `synth_ai/environments/examples/red/roms/pokemon_red.gb`
10
+ 4. **Synth AI Repository**: Clone and set up per main README
11
+
12
+ ## Quick Start
13
+
14
+ ### 1. Run Image-Only Evaluation (10 Rollouts)
15
+
16
+ ```bash
17
+ cd /path/to/synth-ai  # root of your synth-ai checkout
18
+
19
+ # Set up environment for Turso tracing
20
+ export TASKAPP_TRACING_ENABLED=1
21
+ export TURSO_NATIVE=1
22
+ export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
23
+
24
+ # Run evaluation with image-only input
25
+ uv run synth-ai eval pokemon_red \
26
+ --config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
27
+ ```
28
+
29
+ **Expected output**:
30
+ - 10 rollouts complete
31
+ - Most will stay in Red's bedroom (challenging task!)
32
+ - All traces and rewards saved to `traces/v3/pokemon_red_eval.db`
33
+
34
+ ### 2. Check Results
35
+
36
+ ```bash
37
+ # View database
38
+ ls -lh traces/v3/pokemon_red_eval.db # Should be ~192KB
39
+
40
+ # Count sessions
41
+ sqlite3 traces/v3/pokemon_red_eval.db \
42
+ "SELECT COUNT(*) FROM session_traces;"
43
+
44
+ # View all rollouts
45
+ sqlite3 -header -column traces/v3/pokemon_red_eval.db \
46
+ "SELECT
47
+ session_id,
48
+ total_reward,
49
+ achievements_count,
50
+ json_extract(reward_metadata, '\$.final_map') as map,
51
+ json_extract(reward_metadata, '\$.party_count') as party
52
+ FROM outcome_rewards
53
+ ORDER BY total_reward DESC;"
54
+ ```
55
+
56
+ ### 3. Query Statistics
57
+
58
+ ```bash
59
+ # Get summary stats
60
+ sqlite3 traces/v3/pokemon_red_eval.db \
61
+ "SELECT
62
+ 'Total rollouts' as metric, COUNT(*) as value FROM outcome_rewards
63
+ UNION ALL
64
+ SELECT
65
+ 'With rewards', COUNT(*) FROM outcome_rewards WHERE total_reward > 0
66
+ UNION ALL
67
+ SELECT
68
+ 'Average reward', ROUND(AVG(total_reward), 2) FROM outcome_rewards;"
69
+ ```
70
+
71
+ ## Configuration File
72
+
73
+ **Location**: `examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml`
74
+
75
+ ```toml
76
+ [eval]
77
+ app_id = "pokemon_red"
78
+ model = "gpt-4o-mini-2024-07-18"
79
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # 10 rollouts
80
+ max_turns = 10
81
+ concurrency = 1
82
+ env_name = "pokemon_red"
83
+ policy_name = "pokemon_red_policy"
84
+ trace_format = "full"
85
+ return_trace = true
86
+
87
+ [eval.env_config]
88
+ max_steps_per_episode = 10
89
+
90
+ [eval.policy_config]
91
+ provider = "openai"
92
+ model = "gpt-4o-mini-2024-07-18"
93
+ inference_url = "https://api.openai.com"
94
+ temperature = 0.7
95
+ top_p = 0.95
96
+ max_tokens = 512
97
+ use_vision = true # Enable vision mode
98
+ image_only_mode = true # Send ONLY images (no text)
99
+ max_llm_calls = 10
100
+ ```
101
+
102
+ ### Key Configuration Options
103
+
104
+ | Option | Description | Values |
105
+ |--------|-------------|--------|
106
+ | `use_vision` | Enable vision/image input | `true` / `false` |
107
+ | `image_only_mode` | Send only images (no text) | `true` / `false` |
108
+ | `seeds` | Which seeds to run | Array of integers |
109
+ | `max_turns` | Max policy calls per rollout | Integer (10-100) |
110
+ | `concurrency` | Parallel rollouts | 1-3 recommended |
111
+
112
+ ## Customization
113
+
114
+ ### Run More Steps (Recommended for Pokemon Red)
115
+
116
+ Pokemon Red needs more steps to make progress:
117
+
118
+ ```toml
119
+ [eval.env_config]
120
+ max_steps_per_episode = 500 # Full Pallet Town sequence
121
+
122
+ [eval.policy_config]
123
+ max_llm_calls = 100 # Allow more LLM decisions
124
+ ```
125
+
126
+ ### Enable Text + Images (Recommended)
127
+
128
+ Image-only is very challenging for Pokemon Red. Try multimodal:
129
+
130
+ ```toml
131
+ [eval.policy_config]
132
+ use_vision = true
133
+ image_only_mode = false # Send both text AND images
134
+ ```
135
+
136
+ This gives the model both:
137
+ - Base64-encoded PNG frames (160x144 Game Boy screen)
138
+ - Text state (HP, position, party, inventory, etc.)
139
+
140
+ ### Use Better Model
141
+
142
+ ```toml
143
+ [eval]
144
+ model = "gpt-4o-2024-08-06" # Full GPT-4o
145
+
146
+ [eval.policy_config]
147
+ model = "gpt-4o-2024-08-06"
148
+ temperature = 0.7 # Same as the default config; raise it for more exploration
149
+ ```
150
+
151
+ ### Run More Episodes
152
+
153
+ ```toml
154
+ [eval]
155
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] # 20 rollouts
156
+ ```
157
+
158
+ ## Database Schema
159
+
160
+ ### outcome_rewards Table
161
+
162
+ ```sql
163
+ CREATE TABLE outcome_rewards (
164
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
165
+ session_id VARCHAR NOT NULL,
166
+ total_reward INTEGER NOT NULL,
167
+ achievements_count INTEGER NOT NULL, -- Milestone events
168
+ total_steps INTEGER NOT NULL,
169
+ created_at DATETIME NOT NULL,
170
+ reward_metadata TEXT, -- JSON with map_id, party_count, badges, etc.
171
+ FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
172
+ );
173
+ ```
174
+
175
+ ### Example Queries
176
+
177
+ ```sql
178
+ -- Get statistics
179
+ SELECT
180
+ COUNT(*) as total,
181
+ SUM(CASE WHEN total_reward > 0 THEN 1 ELSE 0 END) as with_rewards,
182
+ AVG(total_reward) as avg_reward,
183
+ MAX(total_reward) as max_reward,
184
+ MAX(achievements_count) as max_achievements
185
+ FROM outcome_rewards;
186
+
187
+ -- Find rollouts that made progress
188
+ SELECT
189
+ session_id,
190
+ total_reward,
191
+ achievements_count,
192
+ json_extract(reward_metadata, '$.final_map') as final_map,
193
+ json_extract(reward_metadata, '$.party_count') as party_count,
194
+ json_extract(reward_metadata, '$.badges') as badges
195
+ FROM outcome_rewards
196
+ WHERE total_reward > 0 OR achievements_count > 0
197
+ ORDER BY total_reward DESC;
198
+
199
+ -- Join with session traces
200
+ SELECT
201
+ st.session_id,
202
+ st.created_at,
203
+ st.num_timesteps,
204
+ orw.total_reward,
205
+ orw.achievements_count,
206
+ json_extract(orw.reward_metadata, '$.milestone_events') as milestones
207
+ FROM session_traces st
208
+ INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
209
+ WHERE orw.total_reward > 0
210
+ ORDER BY orw.total_reward DESC;
211
+
212
+ -- Check which maps were reached
213
+ SELECT
214
+ json_extract(reward_metadata, '$.final_map') as map_id,
215
+ COUNT(*) as count
216
+ FROM outcome_rewards
217
+ GROUP BY map_id
218
+ ORDER BY count DESC;
219
+ ```
220
+
221
+ ## Understanding Maps
222
+
223
+ **Common Map IDs**:
224
+ - `38`: Red's bedroom (starting location)
225
+ - `0`: Pallet Town (outside)
226
+ - `37`: Red's house downstairs
227
+ - `40`: Oak's Lab
228
+
229
+ **Goal**: Progress from Map 38 → 37 → 0 → 40 (get starter Pokemon)
230
+
231
+ ## Pallet Town Milestones
232
+
233
+ The `PalletTownProgressionCompositeReward` tracks these milestones:
234
+
235
+ | Milestone | Reward | Description |
236
+ |-----------|--------|-------------|
237
+ | Leave bedroom | +20 | Go downstairs |
238
+ | Exit house | +30 | Enter Pallet Town |
239
+ | Find Oak's lab | +40 | Discover and enter lab |
240
+ | Talk to Oak | +50 | First dialogue |
241
+ | Get starter | +100 | Receive your first Pokémon |
242
+ | Enter first battle | +75 | Battle rival |
243
+ | Win battle | +150 | Defeat rival |
244
+
245
+ **Total possible**: ~600+ points
246
+
247
+ ## Typical Results
248
+
249
+ **Expected Performance** (10 rollouts, 10 steps, image-only):
250
+
251
+ ```
252
+ Total rollouts: 10
253
+ Rollouts with rewards: 0 (0%) ← Expected! Task is hard
254
+ Average reward: 0.0
255
+ Final map: 38 (Red's bedroom)
256
+ ```
257
+
258
+ **Why Zero Rewards?**
259
+ - 10 steps is too few for Pokemon Red
260
+ - Image-only mode is very challenging (no HP/inventory text)
261
+ - Needs navigation + NPC interaction
262
+
263
+ **To Get Non-Zero Rewards**:
264
+ 1. Increase `max_steps_per_episode` to 100-500
265
+ 2. Enable multimodal: `image_only_mode = false`
266
+ 3. Use full GPT-4o: `model = "gpt-4o-2024-08-06"`
267
+
268
+ ## Troubleshooting
269
+
270
+ ### No Database Created
271
+
272
+ **Issue**: `traces/v3/pokemon_red_eval.db` doesn't exist or is 0 bytes
273
+
274
+ **Fix**: Ensure environment variables are set:
275
+ ```bash
276
+ export TASKAPP_TRACING_ENABLED=1
277
+ export TURSO_NATIVE=1
278
+ export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
279
+ ```
280
+
281
+ ### ROM Not Found
282
+
283
+ **Issue**: `FileNotFoundError: pokemon_red.gb`
284
+
285
+ **Fix**: Place ROM at:
286
+ ```bash
287
+ synth_ai/environments/examples/red/roms/pokemon_red.gb
288
+ ```
289
+
290
+ Or set environment variable:
291
+ ```bash
292
+ export POKEMON_RED_ROM_PATH="/path/to/pokemon_red.gb"
293
+ ```
294
+
295
+ ### 401 Unauthorized Error
296
+
297
+ **Issue**: OpenAI API returns 401
298
+
299
+ **Fix**: Check your `.env` file:
300
+ ```bash
301
+ # .env
302
+ OPENAI_API_KEY=sk-proj-...your-key-here...
303
+ ```
304
+
305
+ ### All Rewards are Zero
306
+
307
+ **Issue**: Agents aren't making progress (expected with image-only + 10 steps)
308
+
309
+ **Solutions**:
310
+
311
+ 1. **Increase steps**:
312
+ ```toml
313
+ [eval.env_config]
314
+ max_steps_per_episode = 100
315
+
316
+ [eval.policy_config]
317
+ max_llm_calls = 100
318
+ ```
319
+
320
+ 2. **Enable text observations**:
321
+ ```toml
322
+ [eval.policy_config]
323
+ image_only_mode = false # Send both image AND text
324
+ ```
325
+
326
+ 3. **Use better model**:
327
+ ```toml
328
+ [eval]
329
+ model = "gpt-4o-2024-08-06"
330
+ ```
331
+
332
+ ### PyBoy Not Installed
333
+
334
+ **Issue**: `ModuleNotFoundError: No module named 'pyboy'`
335
+
336
+ **Fix**:
337
+ ```bash
338
+ uv add pyboy
339
+ ```
340
+
341
+ ## Advanced: Export to CSV
342
+
343
+ ```bash
344
+ # Export all rollouts to CSV
345
+ sqlite3 -header -csv traces/v3/pokemon_red_eval.db \
346
+ "SELECT
347
+ session_id,
348
+ total_reward,
349
+ achievements_count,
350
+ json_extract(reward_metadata, '$.final_map') as final_map,
351
+ json_extract(reward_metadata, '$.party_count') as party_count,
352
+ json_extract(reward_metadata, '$.badges') as badges,
353
+ json_extract(reward_metadata, '$.milestone_events') as milestones
354
+ FROM outcome_rewards
355
+ ORDER BY total_reward DESC" \
356
+ > pokemon_red_rewards.csv
357
+ ```
358
+
359
+ ## Files Overview
360
+
361
+ ```
362
+ examples/task_apps/pokemon_red/
363
+ ├── eval_image_only_gpt4o.toml # Config file
364
+ ├── EVAL_IMAGE_ONLY_COMPLETE.md # Implementation details
365
+ ├── EVAL_IMAGE_ONLY_STATUS.md # Status document
366
+ ├── README_IMAGE_ONLY_EVAL.md # This file
367
+ ├── task_app.py # Main task app
368
+ │ ├── Image-only mode logic
369
+ │ ├── SessionTracer integration
370
+ │ ├── OpenAI API authentication
371
+ │ └── Reward computation
372
+ └── pallet_town_rl_config.toml # RL training config (reference)
373
+ ```
374
+
375
+ ## Recommended Settings for Success
376
+
377
+ For best chance of non-zero rewards:
378
+
379
+ ```toml
380
+ [eval]
381
+ model = "gpt-4o-2024-08-06" # Full GPT-4o
382
+ seeds = [0, 1, 2, 3, 4] # 5 rollouts
383
+ max_turns = 100 # Allow more decisions
384
+
385
+ [eval.env_config]
386
+ max_steps_per_episode = 500 # Full episode
387
+
388
+ [eval.policy_config]
389
+ provider = "openai"
390
+ model = "gpt-4o-2024-08-06"
391
+ inference_url = "https://api.openai.com"
392
+ temperature = 0.7
393
+ max_tokens = 512
394
+ use_vision = true # Enable vision
395
+ image_only_mode = false # Send text too (multimodal)
396
+ max_llm_calls = 100
397
+ ```
398
+
399
+ ## See Also
400
+
401
+ - `EVAL_IMAGE_ONLY_COMPLETE.md` - Full implementation details
402
+ - `pallet_town_rl_config.toml` - RL training configuration
403
+ - `../crafter/README_IMAGE_ONLY_EVAL.md` - Crafter version
404
+
405
+ ## Summary
406
+
407
+ 1. ✅ Set environment variables for Turso tracing
408
+ 2. ✅ Run `uv run synth-ai eval pokemon_red --config ...`
409
+ 3. ✅ Check database: `traces/v3/pokemon_red_eval.db`
410
+ 4. ✅ Query rewards: `SELECT * FROM outcome_rewards`
411
+ 5. ✅ For non-zero rewards: increase steps + use multimodal + better model
412
+
413
+ Pokemon Red is challenging - don't be discouraged by zero rewards with image-only + 10 steps! 🎮
414
+
415
+
@@ -0,0 +1,3 @@
1
+ """Pokémon Red task app example package."""
2
+
3
+
@@ -0,0 +1,29 @@
1
+ # Evaluation config for Pokemon Red with image-only input
2
+ # This config uses GPT-4o mini with only image data (no text observations)
3
+
4
+ [eval]
5
+ app_id = "pokemon_red"
6
+ model = "gpt-4o-mini-2024-07-18"
7
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
8
+ max_turns = 10
9
+ concurrency = 1 # Keep low initially to avoid issues
10
+ env_name = "pokemon_red"
11
+ policy_name = "pokemon_red_policy"
12
+ trace_format = "full"
13
+ return_trace = true
14
+
15
+ [eval.env_config]
16
+ max_steps_per_episode = 10
17
+
18
+ [eval.policy_config]
19
+ provider = "openai"
20
+ model = "gpt-4o-mini-2024-07-18"
21
+ inference_url = "https://api.openai.com"
22
+ temperature = 0.7
23
+ top_p = 0.95
24
+ max_tokens = 512
25
+ use_vision = true
26
+ image_only_mode = true
27
+ max_llm_calls = 10
28
+
29
+
@@ -0,0 +1,225 @@
1
+ #!/usr/bin/env python3
2
+ """Evaluate GPT-5-nano policy on Pokemon Red Pallet Town progression.
3
+
4
+ Runs 10 parallel rollouts and reports rewards in a table.
5
+ """
6
+ import asyncio
7
+ import os
8
+ from typing import Any
9
+
10
+ import httpx
11
+ from dotenv import load_dotenv
12
+ from tabulate import tabulate
13
+
14
+
15
# Load environment variables before any setting below is read from them.
load_dotenv()

# Evaluation settings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = "gpt-5-nano"
NUM_EPISODES = 10
# 10 policy calls per episode (each may return 5-10 actions)
MAX_STEPS_PER_EPISODE = 10
TASK_APP_URL = "http://127.0.0.1:8913"
24
+
25
+
26
async def run_single_rollout(
    client: httpx.AsyncClient,
    episode_id: int,
) -> dict[str, Any]:
    """Run one policy-driven rollout against the task app and summarise it.

    Args:
        client: Async HTTP client used to POST to ``{TASK_APP_URL}/rollout``.
        episode_id: Episode number; embedded in the run id and the
            environment instance id sent with the request.

    Returns:
        A dict that always contains ``episode_id`` and ``status``. When
        ``status`` is ``"success"`` it also holds ``total_reward``,
        ``outcome_score``, ``num_steps``, ``final_map``, ``party_count``,
        ``badges``, ``num_milestones``, ``reward_components`` and
        ``milestone_events``. Otherwise (``"error"`` or ``"timeout"``) it
        holds an ``error`` message. HTTP and parsing failures are reported
        through this dict rather than raised.
    """
    # One "policy" op per step asks the task app to run LLM inference for it.
    rollout_request = {
        "run_id": f"eval_episode_{episode_id:03d}",
        "env": {"instance_id": f"pallet_town_{episode_id:03d}"},
        "ops": ["policy"] * MAX_STEPS_PER_EPISODE,
        "policy": {
            "type": "llm",
            "model": MODEL,
            "config": {
                "model": MODEL,
                "temperature": 0.7,
                "max_tokens": 500,
            },
        },
    }

    try:
        response = await client.post(
            f"{TASK_APP_URL}/rollout",
            json=rollout_request,
            timeout=300.0,  # 5 minutes per rollout
        )
        response.raise_for_status()
        result = response.json()

        # `or` fallbacks treat a present-but-null/empty field like a missing one.
        trajectories = result.get("trajectories") or []
        if not trajectories:
            return {
                "episode_id": episode_id,
                "status": "error",
                "error": "No trajectories returned",
            }

        steps = trajectories[0].get("steps") or []
        # The first entry is the initial observation; clamp so an empty
        # trajectory is reported as 0 steps rather than -1.
        num_steps = max(len(steps) - 1, 0)

        metrics = result.get("metrics") or {}
        # An empty or null "episode_returns" would otherwise fail on [0].
        episode_returns = metrics.get("episode_returns") or [0.0]
        total_reward = episode_returns[0]
        outcome_score = metrics.get("outcome_score")
        if outcome_score is None:
            # Keep the value numeric so callers can format it.
            outcome_score = 0.0
        details = metrics.get("details") or {}

        milestone_events = details.get("milestone_events") or []

        return {
            "episode_id": episode_id,
            "status": "success",
            "total_reward": total_reward,
            "outcome_score": outcome_score,
            "num_steps": num_steps,
            "final_map": details.get("final_map", -1),
            "party_count": details.get("party_count", 0),
            "badges": details.get("badges", 0),
            "num_milestones": len(milestone_events),
            "reward_components": details.get("reward_components") or [],
            "milestone_events": milestone_events,
        }

    except httpx.TimeoutException:
        return {
            "episode_id": episode_id,
            "status": "timeout",
            "error": "Rollout timed out after 5 minutes",
        }
    except Exception as e:
        # Deliberately broad: one failed episode must not abort the batch.
        return {
            "episode_id": episode_id,
            "status": "error",
            "error": str(e),
        }
110
+
111
+
112
async def main():
    """Evaluate the policy over NUM_EPISODES concurrent rollouts and print a report.

    Prints the run configuration, then checks the task app's ``/health``
    endpoint and that ``OPENAI_API_KEY`` is set, returning early with a hint
    if either check fails. Otherwise it runs all rollouts concurrently and
    prints a per-episode table, aggregate statistics, the best episode's
    milestones, and any failed episodes.
    """
    print("=" * 80)
    print("POKÉMON RED - POLICY EVALUATION")
    print("=" * 80)
    print()
    print("Task: Pallet Town Progression")
    print(f"Policy: {MODEL}")
    print(f"Episodes: {NUM_EPISODES}")
    print(f"Max steps per episode: {MAX_STEPS_PER_EPISODE}")
    print(f"Server: {TASK_APP_URL}")
    print()

    # The client must stay open for the health check and for every rollout
    # that shares it, so all network work happens inside this context.
    async with httpx.AsyncClient() as client:
        # Check server health
        try:
            response = await client.get(f"{TASK_APP_URL}/health", timeout=5.0)
            response.raise_for_status()
            print("✓ Server is healthy")
        except Exception as e:
            print(f"❌ Server not responding: {e}")
            print(" Start it with: uv run -m synth_ai task-app serve pokemon_red --port 8913")
            return

        # Check API key
        if not OPENAI_API_KEY:
            print("❌ OPENAI_API_KEY not found in environment")
            print(" Make sure .env file contains OPENAI_API_KEY")
            return
        print(f"✓ API key loaded (sk_env...{OPENAI_API_KEY[-4:]})")
        print()

        # Run rollouts in parallel
        print(f"🎮 Running {NUM_EPISODES} episodes in parallel...")
        print()

        results = await asyncio.gather(
            *(
                run_single_rollout(client, episode_id)
                for episode_id in range(1, NUM_EPISODES + 1)
            )
        )

    # Separate successful and failed results
    successful = [r for r in results if r.get("status") == "success"]
    failed = [r for r in results if r.get("status") != "success"]

    # Print summary table
    print()
    print("=" * 80)
    print("RESULTS SUMMARY")
    print("=" * 80)
    print()

    if successful:
        headers = [
            "Episode",
            "Reward",
            "Steps",
            "Final Map",
            "Party",
            "Badges",
            "Milestones",
            "Outcome Score",
        ]
        table_data = [
            [
                r["episode_id"],
                f"{r['total_reward']:.1f}",
                r["num_steps"],
                f"Map{r['final_map']}",
                r["party_count"],
                r["badges"],
                r["num_milestones"],
                f"{r['outcome_score']:.3f}",
            ]
            for r in successful
        ]

        print(tabulate(table_data, headers=headers, tablefmt="grid"))
        print()

        # Print statistics
        rewards = [r["total_reward"] for r in successful]
        steps = [r["num_steps"] for r in successful]
        outcome_scores = [r["outcome_score"] for r in successful]

        print("Statistics:")
        print(f" Mean reward: {sum(rewards) / len(rewards):.2f}")
        print(f" Max reward: {max(rewards):.2f}")
        print(f" Min reward: {min(rewards):.2f}")
        print(f" Mean steps: {sum(steps) / len(steps):.1f}")
        print(f" Mean outcome score: {sum(outcome_scores) / len(outcome_scores):.4f}")
        print()

        # Print milestone breakdown for best episode
        best_episode = max(successful, key=lambda r: r["total_reward"])
        print(f"Best Episode (#{best_episode['episode_id']}):")
        print(f" Total reward: {best_episode['total_reward']:.1f}")
        print(f" Steps taken: {best_episode['num_steps']}")
        print(" Milestones achieved:")
        for milestone in best_episode["milestone_events"]:
            # Tolerate partially populated events so a missing key cannot
            # crash the report after every rollout has already finished.
            step = milestone.get("step", "?")
            description = milestone.get("description", "")
            reward = milestone.get("reward", 0.0)
            print(f" Step {step}: {description} (+{reward:.1f})")
        print()

    if failed:
        print(f"Failed episodes: {len(failed)}")
        for r in failed:
            print(f" Episode {r['episode_id']}: {r.get('error', 'Unknown error')}")
        print()


if __name__ == "__main__":
    asyncio.run(main())
225
+