synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,811 @@
1
+ # PokéAgent Challenge: RPG Speedrunning Agent in Pokémon Emerald
2
+
3
+ ![PokéAgent Challenge: RPG Speedrunning Agent in Pokémon Emerald](emerald.png)
4
+
5
+ An AI agent that plays Pokémon Emerald using vision-language models to perceive the game environment, plan actions, and execute gameplay strategies. This is a **starter kit** designed to be easily customizable for different VLMs and agent behaviors.
6
+
7
+ ## Table of Contents
8
+
9
+ - [Overview](#overview)
10
+ - [Features](#features)
11
+ - [Directory Structure](#directory-structure)
12
+ - [Requirements](#requirements)
13
+ - [Installation](#installation)
14
+ - [1. Clone the Repository](#1-clone-the-repository)
15
+ - [2. Install uv and Set Up Environment](#2-install-uv-and-set-up-environment)
16
+ - [3. Install mgba System Library (Required for Python bindings)](#3-install-mgba-system-library-required-for-python-bindings)
17
+ - [4. Install Python Dependencies](#4-install-python-dependencies)
17
+ - [5. Set up Game ROM](#5-set-up-game-rom)
20
+ - [VLM Backend Setup](#vlm-backend-setup)
21
+ - [OpenAI](#-openai-gpt-4v-o3-mini-etc)
22
+ - [OpenRouter](#-openrouter-access-to-many-models)
23
+ - [Google Gemini](#-google-gemini)
24
+ - [Local HuggingFace Models](#-local-huggingface-models)
25
+ - [Auto Backend Detection](#-auto-backend-detection)
26
+ - [Running the Agent](#running-the-agent)
27
+ - [Command Line Options](#command-line-options)
28
+ - [Customizing Agent Behavior](#customizing-agent-behavior-prompt-editing-guide)
29
+ - [Advanced Configuration](#advanced-configuration)
30
+ - [Troubleshooting](#troubleshooting)
31
+ - [Submission Instructions](#submission-instructions)
32
+ - [Citation](#citation)
33
+ - [License](#license)
34
+
35
+ ## Overview
36
+
37
+ This project implements an AI agent capable of playing Pokémon Emerald on a Game Boy Advance emulator. The agent uses a vision-language model (VLM) to analyze game frames, understand the current game state, and make intelligent decisions to progress through the game.
38
+
39
+ The system is built with a modular architecture that separates perception, planning, memory, and action execution into distinct components that communicate through a message-passing system.
40
+
41
+ ## Features
42
+
43
+ - **Multiple VLM Backends**: Support for OpenAI, OpenRouter, Google Gemini, and local HuggingFace models
44
+ - **Vision-based game perception**: Uses VLMs to analyze and understand game frames
45
+ - **Strategic planning**: Develops high-level plans based on game observations
46
+ - **Memory management**: Maintains context about the game state and progress
47
+ - **Intelligent action selection**: Chooses appropriate GBA button inputs based on the current situation
48
+ - **Advanced Map System**: Location-based persistent maps with portal coordinate tracking
49
+ - **Spatial Navigation**: Bidirectional portal connections show exact transition coordinates between locations
50
+ - **NPC Detection**: Real-time NPC detection and display on maps to help avoid blocked movement
51
+ - **Movement Memory**: Tracks failed movements and NPC interactions for better navigation
52
+ - **LLM-Controlled Pathfinding**: Intelligent pathfinding decisions made directly by the language model
53
+ - **Checkpoint Persistence**: Maps and connections persist across game sessions via the checkpoint system
54
+ - **Web interface**: Visualize the agent's thought process and game state in real-time
55
+ - **Modular architecture**: Easily extendable with new capabilities
56
+ - **Customizable prompts**: Easy-to-edit prompt system for different agent behaviors
57
+
58
+ ## Directory Structure
59
+
60
+ ```
61
+ pokeagent-speedrun/
62
+ ├── README.md
63
+ ├── requirements.txt
64
+ ├── run.py # Main AI agent implementation (direct emulator integration)
65
+ ├── server/ # Server components (multiprocess mode)
66
+ │ ├── __init__.py
67
+ │ ├── app.py # FastAPI server for multiprocess mode
68
+ │ ├── frame_server.py # Frame streaming server
69
+ │ └── stream.html # Web interface for streaming
70
+ ├── agent/ # Four-module agent architecture (EDIT THESE FILES TO CUSTOMIZE BEHAVIOR)
71
+ │ ├── __init__.py
72
+ │ ├── system_prompt.py # Main system prompt
73
+ │ ├── perception.py # Perception module + prompts
74
+ │ ├── planning.py # Planning module + prompts
75
+ │ ├── memory.py # Memory module + prompts
76
+ │ ├── action.py # Action module + prompts
77
+ │ └── simple.py # Simple mode implementation (bypasses four-module architecture)
78
+ ├── utils/
79
+ │ ├── __init__.py
80
+ │ ├── vlm.py # VLM backend implementations (OpenAI, Gemini, local models)
81
+ │ ├── helpers.py # Helper functions
82
+ │ ├── state_formatter.py # Game state formatting utilities
83
+ │ ├── anticheat.py # Anti-cheat tracking and verification
84
+ │ ├── llm_logger.py # Comprehensive LLM interaction logging
85
+ │ ├── ocr_dialogue.py # OCR-based dialogue detection
86
+ │ ├── map_formatter.py # Map visualization and formatting
87
+ │ ├── map_stitcher.py # Map stitching utilities
88
+ │ ├── map_visualizer.py # Map visualization tools
89
+ │ ├── headless_recorder.py # Video recording capabilities
90
+ │ └── get_local_ip.py # Network utilities
91
+ ├── pokemon_env/ # Pokémon environment wrapper (mGBA integration)
92
+ │ ├── __init__.py
93
+ │ ├── emulator.py # Core emulator integration
94
+ │ ├── memory_reader.py # Game state memory reading (DO NOT MODIFY)
95
+ │ ├── emerald_utils.py # Pokémon Emerald specific utilities
96
+ │ ├── enums.py # Game enumerations
97
+ │ ├── types.py # Type definitions
98
+ │ └── utils.py # Environment utilities
99
+ ├── tests/ # Test suite and validation
100
+ │ ├── run_tests.py # Main test runner
101
+ │ ├── states/ # Test save states with ground truth data
102
+ │ ├── ground_truth/ # Reference data for validation
103
+ │ └── test_*.py # Individual test files
104
+ ├── Emerald-GBAdvance/ # Game ROM and save states
105
+ │ ├── rom.gba # Pokémon Emerald ROM (not included)
106
+ │ └── *.state # Various starting save states
107
+ ├── llm_logs/ # LLM interaction logs (auto-generated)
108
+ └── *.mp4 # Video recordings (auto-generated with --record)
109
+ ```
110
+
111
+ ## Requirements
112
+
113
+ - Python 3.10 - 3.11
114
+ - Pokémon Emerald ROM (not included - obtain legally)
115
+ - One of the supported VLM backends (see the [VLM Backend Setup](#vlm-backend-setup) section)
116
+
117
+ ## Installation
118
+
119
+ ### 1. Clone the Repository
120
+
121
+ ```bash
122
+ git clone https://github.com/sethkarten/pokeagent-speedrun
123
+ cd pokeagent-speedrun
124
+ ```
125
+
126
+ ### 2. Install uv and Set Up Environment
127
+
128
+ ```bash
129
+ # Install uv if not already installed
130
+ curl -LsSf https://astral.sh/uv/install.sh | sh
131
+
132
+ # Create virtual environment and install dependencies
133
+ uv sync
134
+
135
+ # Activate the virtual environment
136
+ source .venv/bin/activate
137
+ ```
138
+
139
+ ### 3. Install mgba System Library (Required for Python bindings)
140
+
141
+ Download and install the official Ubuntu package from the [mGBA downloads page](https://mgba.io/downloads.html):
142
+
143
+ Example for Ubuntu 20.04 (Focal):
144
+ ```bash
145
+ wget https://github.com/mgba-emu/mgba/releases/download/0.10.5/mGBA-0.10.5-ubuntu64-focal.tar.xz
146
+ tar -xf mGBA-0.10.5-ubuntu64-focal.tar.xz
147
+ sudo dpkg -i mGBA-0.10.5-ubuntu64-focal/libmgba.deb
148
+ ```
149
+
150
+ macOS (x86_64) instructions:
151
+ ```bash
152
+ # On Apple Silicon (M-series) Macs, first start an x86_64 shell for backwards compatibility: arch -x86_64 /bin/zsh
153
+ brew install mgba
154
+ ```
155
+
156
+ ### 4. Install Python Dependencies
157
+
158
+ The dependencies are automatically installed when you run `uv sync` in step 2.
159
+
160
+ If you need to reinstall or update dependencies:
161
+
162
+ ```bash
163
+ uv sync
164
+ ```
165
+
166
+ For development dependencies:
167
+
168
+ ```bash
169
+ uv sync --dev
170
+ ```
171
+
172
+ ### 5. Set up Game ROM
173
+
174
+ **Important**: You must obtain a Pokémon Emerald ROM file legally (e.g., dump from your own cartridge).
175
+
176
+ 1. Place your ROM file in the `Emerald-GBAdvance/` directory and rename it to `rom.gba`:
177
+ ```
178
+ pokeagent-speedrun/
179
+ └── Emerald-GBAdvance/
180
+ └── rom.gba # Your Pokémon Emerald ROM file here
181
+ ```
182
+
183
+ 2. Ensure it's a valid Pokémon Emerald ROM. The SHA-1 hash should be `f3ae088181bf583e55daf962a92bb46f4f1d07b7` for the US English version.
184
+
185
+ ## VLM Backend Setup
186
+
187
+ The agent supports multiple VLM backends. Choose one based on your needs:
188
+
189
+ ### 🔸 OpenAI (GPT-4V, o3-mini, etc.)
190
+
191
+ **Best for: Quick setup, reliable performance**
192
+
193
+ 1. Set environment variable:
194
+ ```bash
195
+ export OPENAI_API_KEY="your-api-key-here"
196
+ ```
197
+
198
+ 2. Run agent:
199
+ ```bash
200
+ python run.py --backend openai --model-name "gpt-4o"
201
+ ```
202
+
203
+ Supported models: `gpt-4o`, `gpt-4-turbo`, `o3-mini`, etc.
204
+
205
+ ### 🔸 OpenRouter (Access to many models)
206
+
207
+ **Best for: Trying different models, cost optimization**
208
+
209
+ 1. Set environment variable:
210
+ ```bash
211
+ export OPENROUTER_API_KEY="your-api-key-here"
212
+ ```
213
+
214
+ 2. Run agent:
215
+ ```bash
216
+ python run.py --backend openrouter --model-name "anthropic/claude-3.5-sonnet"
217
+ ```
218
+
219
+ Supported models: `anthropic/claude-3.5-sonnet`, `google/gemini-pro-vision`, `openai/gpt-4o`, etc.
220
+
221
+ ### 🔸 Google Gemini
222
+
223
+ **Best for: Google ecosystem integration**
224
+
225
+ 1. Set environment variable:
226
+ ```bash
227
+ export GEMINI_API_KEY="your-api-key-here"
228
+ # OR
229
+ export GOOGLE_API_KEY="your-api-key-here"
230
+ ```
231
+
232
+ 2. Run agent:
233
+ ```bash
234
+ python run.py --backend gemini --model-name "gemini-2.5-flash"
235
+ ```
236
+
237
+ Supported models: `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.5-flash-lite`, etc.
238
+
239
+ ### 🔸 Local HuggingFace Models
240
+
241
+ **Best for: Privacy, no API costs, customization**
242
+
243
+ 1. Install additional dependencies:
244
+ ```bash
245
+ pip install torch transformers bitsandbytes accelerate
246
+ ```
247
+
248
+ 2. Run agent:
249
+ ```bash
250
+ python run.py --backend local --model-name "Qwen/Qwen2-VL-2B-Instruct"
251
+ ```
252
+
253
+ Supported models: `Qwen/Qwen2-VL-2B-Instruct`, `Qwen/Qwen2-VL-7B-Instruct`, `microsoft/Phi-3.5-vision-instruct`, `llava-hf/llava-1.5-7b-hf`, etc.
254
+
255
+ ## Running the Agent
256
+
257
+ `run.py` runs the emulator and agent in a single process, providing better integration and real-time control.
258
+
259
+ ### Quick Start
260
+
261
+ ```bash
262
+ # Start with default settings (Gemini backend, agent mode)
263
+ python run.py
264
+
265
+ # OpenAI example
266
+ python run.py --backend openai --model-name "gpt-4o"
267
+
268
+ # Local model example
269
+ python run.py --backend local --model-name "Qwen/Qwen2-VL-2B-Instruct"
270
+ ```
271
+
272
+ ### Starting from Saved States
273
+
274
+ ```bash
275
+ # Load from a saved state
276
+ python run.py --load-state Emerald-GBAdvance/start.state --backend gemini --model-name gemini-2.5-flash
277
+
278
+ # Load from test states
279
+ python run.py --load-state tests/states/torchic.state --backend gemini --model-name gemini-2.5-flash
280
+ ```
281
+
282
+ ### Advanced Options
283
+
284
+ ```bash
285
+ # Start in manual mode (keyboard control)
286
+ python run.py --manual
287
+
288
+ # Enable auto agent (agent acts continuously)
289
+ python run.py --agent-auto
290
+
291
+ # Run without display window (headless)
292
+ python run.py --headless --agent-auto
293
+
294
+ # Custom port for web interface
295
+ python run.py --port 8080
296
+
297
+ # Video recording (saves MP4 file with timestamp)
298
+ python run.py --record --agent-auto
299
+
300
+ # Simple mode (lightweight processing, frame + LLM only, skips perception/planning/memory)
301
+ python run.py --simple --agent-auto
302
+
303
+ # Disable OCR dialogue detection (forces overworld state, no dialogue processing)
304
+ python run.py --no-ocr --agent-auto
305
+
306
+ # Combine multiple features (recommended for production runs)
307
+ python run.py --record --simple --no-ocr --agent-auto --backend gemini
308
+ ```
309
+
310
+ ### Debug Controls
311
+
312
+ When running with display (default):
313
+ - **M**: Display comprehensive state (exactly what the LLM sees)
314
+ - **Shift+M**: Display map visualization
315
+ - **S**: Save screenshot
316
+ - **Tab**: Toggle agent/manual mode
317
+ - **A**: Toggle auto agent mode
318
+ - **1/2**: Save/Load state
319
+ - **Space**: Trigger single agent step
320
+ - **Arrow Keys/WASD**: Manual movement
321
+ - **X/Z**: A/B buttons
322
+
323
+ ### Web Interface
324
+
325
+ The agent automatically starts a web server at `http://localhost:8000/stream` (or custom port) that serves the game stream and agent status in real-time.
326
+
327
+ #### Other Options
328
+
329
+ ```bash
330
+ # With additional debugging options
331
+ python run.py \
332
+ --backend openai \
333
+ --model-name "gpt-4o" \
334
+ --debug-state # Enable detailed state logging
335
+ ```
336
+
337
+ ### 3. Monitor the Agent
338
+
339
+ - **Web Interface**: View game state at `http://localhost:8000/stream`
340
+ - **Logs**: Monitor agent decisions in the terminal
341
+ - **Debug**: Use `--debug-state` flag for detailed state information
342
+
343
+ ## Feature Documentation
344
+
345
+ ### 🎬 Video Recording (`--record`)
346
+
347
+ Automatically records gameplay to MP4 files with timestamps.
348
+
349
+ **How it works:**
350
+ - Records at 30 FPS (intelligent frame skipping from 120 FPS emulator)
351
+ - Files saved as `pokegent_recording_YYYYMMDD_HHMMSS.mp4`
352
+ - Works in both direct and multiprocess modes
353
+ - Automatically cleaned up on graceful shutdown
354
+
355
+ **Usage:**
356
+ ```bash
357
+ # Recording gameplay to MP4
358
+ python run.py --record --agent-auto
359
+ ```
360
+
361
+ ### ⚡ Simple Mode (`--simple`)
362
+
363
+ Lightweight processing mode that bypasses the four-module agent architecture.
364
+
365
+ **Benefits:**
366
+ - 3-5x faster processing (skips perception/planning/memory modules)
367
+ - Direct frame + state → VLM → action pipeline
368
+ - Ideal for rapid prototyping and resource-constrained environments
369
+ - Maintains action history (last 20 actions)
370
+
371
+ **Usage:**
372
+ ```bash
373
+ # Simple mode for fast iterations
374
+ python run.py --simple --agent-auto
375
+
376
+ # Combined with other features
377
+ python run.py --simple --record --agent-auto
378
+ ```
379
+
380
+ ### 🔇 No OCR Mode (`--no-ocr`)
381
+
382
+ Completely disables dialogue detection and forces overworld state.
383
+
384
+ **When to use:**
385
+ - When dialogue detection is unreliable or causing issues
386
+ - For speedrunning where dialogue should be skipped quickly
387
+ - To ensure the agent never gets stuck in dialogue states
388
+ - When OCR processing is consuming too many resources
389
+
390
+ **Usage:**
391
+ ```bash
392
+ # Disable all dialogue detection
393
+ python run.py --no-ocr --agent-auto
394
+
395
+ # Recommended for production speedruns
396
+ python run.py --no-ocr --simple --agent-auto
397
+ ```
398
+
399
+ ### 🔄 Architecture
400
+
401
+ The agent uses a multiprocess architecture for improved stability and performance:
402
+
403
+ **Components:**
404
+ - **Server Process**: Runs emulator, pygame display, handles game state (automatically launched by run.py)
405
+ - **Client Process**: Runs agent decision-making, sends actions via HTTP
406
+ - **Communication**: RESTful API between processes
407
+
408
+ **Advantages:**
409
+ - **Improved Stability**: Isolates emulator from agent crashes
410
+ - **Better Performance**: Eliminates memory corruption from multithreading
411
+ - **Resource Separation**: Agent and emulator can use different CPU cores
412
+
413
+ ### 🧭 Navigation & Pathfinding System
414
+
415
+ The agent includes an intelligent navigation system that helps with spatial reasoning:
416
+
417
+ **Movement Preview System:**
418
+ - Shows immediate results of directional actions (UP, DOWN, LEFT, RIGHT)
419
+ - Displays target coordinates and tile information for each direction
420
+ - Handles special terrain like ledges (only walkable in arrow direction)
421
+
422
+ **NPC Detection & Avoidance:**
423
+ - Real-time NPC detection from game memory displays NPCs as `N` markers on maps
424
+ - Visual frame analysis allows LLM to identify NPCs not shown on maps
425
+ - Movement memory system tracks locations where movement failed (usually NPCs/obstacles)
426
+
427
+ **LLM-Controlled Pathfinding:**
428
+ - All pathfinding decisions made directly by the language model for maximum flexibility
429
+ - Movement preview provides the LLM with complete information about movement consequences
430
+ - No automatic pathfinding algorithms - the LLM plans routes step-by-step based on current state
431
+
432
+ **Map Features:**
433
+ - `P` = Player position
434
+ - `N` = NPC/Trainer location
435
+ - `?` = Unexplored areas at map edges (only shown for walkable boundaries)
436
+ - `#` = Walls/obstacles, `~` = Tall grass, `.` = Walkable paths
437
+ - Directional arrows (`↑↓←→`) = Ledges (one-way movement)
438
+
439
+ This system provides the LLM with complete spatial awareness while maintaining flexibility in navigation decisions.
440
+
441
+ ### 🚀 Recommended Production Setup
442
+
443
+ For the most stable and efficient agent runs:
444
+
445
+ ```bash
446
+ python run.py \
447
+ --record \
448
+ --simple \
449
+ --no-ocr \
450
+ --agent-auto \
451
+ --backend gemini \
452
+ --model-name gemini-2.5-flash \
453
+ --load-state your_starting_state.state
454
+ ```
455
+
456
+ This combination provides:
457
+ - ✅ Maximum stability (multiprocess architecture)
458
+ - ✅ Video evidence (automatic recording)
459
+ - ✅ Fast processing (simple mode)
460
+ - ✅ No dialogue hanging (no-ocr)
461
+ - ✅ Continuous operation (agent-auto)
462
+ - ✅ Intelligent navigation (movement preview + NPC detection)
463
+
464
+ ## Command Line Options
465
+
466
+ ```bash
467
+ python run.py [OPTIONS]
468
+
469
+ Basic Options:
470
+ --rom PATH Path to Pokemon Emerald ROM (default: Emerald-GBAdvance/rom.gba)
471
+ --load-state PATH Load from a saved state file
472
+ --load-checkpoint Load from checkpoint.state and checkpoint_milestones.json
473
+ --backend TEXT VLM backend (openai/openrouter/gemini/local/auto, default: gemini)
474
+ --model-name TEXT Model name (default: gemini-2.5-flash)
475
+ --port INTEGER Server port for web interface (default: 8000)
476
+
477
+ Mode Options:
478
+ --headless Run without PyGame display window
479
+ --agent-auto Enable automatic agent actions on startup
480
+ --manual Start in manual mode instead of agent mode
481
+
482
+ Feature Options:
483
+ --record Record video of gameplay (saves MP4 with timestamp)
484
+ --simple Simple mode: frame + LLM only (skips perception/planning/memory)
485
+ --no-ocr Disable OCR dialogue detection (forces overworld state)
486
+
487
+ VLM Options:
488
+ --vlm-port INTEGER Port for Ollama server (default: 11434)
489
+ ```
490
+
491
+ ## Customizing Agent Behavior (Prompt Editing Guide)
492
+
493
+ This starter kit is designed to be easily customizable. Here's how to edit the agent's behavior:
494
+
495
+ ### 🎯 Main System Prompt
496
+
497
+ **File: `agent/system_prompt.py`**
498
+
499
+ This is the core personality of your agent. Edit this to change the overall behavior:
500
+
501
+ ```python
502
+ # Current system prompt
503
+ system_prompt = """
504
+ You are an AI agent playing Pokémon Emerald on a Game Boy Advance emulator...
505
+ """
506
+
507
+ # Example: Speedrunner personality
508
+ system_prompt = """
509
+ You are an expert Pokémon Emerald speedrunner. Your goal is to beat the game as quickly as possible using optimal strategies, routing, and tricks. Always think about efficiency and time-saving strategies.
510
+ """
511
+
512
+ # Example: Casual player personality
513
+ system_prompt = """
514
+ You are a casual Pokémon player exploring Emerald for fun. You enjoy catching different Pokémon, talking to NPCs, and thoroughly exploring each area. Take your time and enjoy the experience.
515
+ """
516
+ ```
517
+
518
+ ### 🔍 Perception Module Prompts
519
+
520
+ **File: `agent/perception.py`**
521
+
522
+ Control how the agent observes and interprets the game state:
523
+
524
+ ```python
525
+ # Find and edit the perception_prompt around line 24
526
+ perception_prompt = f"""
527
+ ★★★ VISUAL ANALYSIS TASK ★★★
528
+
529
+ You are the agent, actively playing Pokemon Emerald...
530
+ """
531
+
532
+ # Example customization for battle focus:
533
+ perception_prompt = f"""
534
+ ★★★ BATTLE-FOCUSED VISUAL ANALYSIS ★★★
535
+
536
+ You are a competitive Pokemon battler. Pay special attention to:
537
+ - Pokemon types and weaknesses
538
+ - Move effectiveness and damage calculations
539
+ - Status conditions and stat changes
540
+ - Switching opportunities
541
+ ...
542
+ """
543
+ ```
544
+
545
+ ### 🧠 Planning Module Prompts
546
+
547
+ **File: `agent/planning.py`**
548
+
549
+ Modify strategic planning behavior:
550
+
551
+ ```python
552
+ # Find the planning_prompt around line 55
553
+ planning_prompt = f"""
554
+ ★★★ STRATEGIC PLANNING TASK ★★★
555
+
556
+ You are the agent playing Pokemon Emerald with a speedrunning mindset...
557
+ """
558
+
559
+ # Example: Exploration-focused planning
560
+ planning_prompt = f"""
561
+ ★★★ EXPLORATION PLANNING TASK ★★★
562
+
563
+ You are a curious explorer who wants to discover everything in Pokemon Emerald:
564
+ 1. DISCOVERY GOALS: What new areas, Pokemon, or secrets can you find?
565
+ 2. COLLECTION OBJECTIVES: What Pokemon should you catch or items should you collect?
566
+ 3. INTERACTION STRATEGY: Which NPCs should you talk to for lore and tips?
567
+ ...
568
+ """
569
+ ```
570
+
571
+ ### 🎮 Action Module Prompts
572
+
573
+ **File: `agent/action.py`**
574
+
575
+ Control decision-making and button inputs:
576
+
577
+ ```python
578
+ # Find the action_prompt around line 69
579
+ action_prompt = f"""
580
+ ★★★ ACTION DECISION TASK ★★★
581
+
582
+ You are the agent playing Pokemon Emerald with a speedrunning mindset...
583
+ """
584
+
585
+ # Example: Cautious player style
586
+ action_prompt = f"""
587
+ ★★★ CAREFUL ACTION DECISIONS ★★★
588
+
589
+ You are a careful player who wants to avoid risks:
590
+ - Always heal Pokemon before they reach critical HP
591
+ - Avoid wild Pokemon encounters when possible
592
+ - Stock up on items before challenging gyms
593
+ - Save frequently at Pokemon Centers
594
+ ...
595
+ """
596
+ ```
597
+
598
+ ### 🧵 Memory Module Behavior
599
+
600
+ **File: `agent/memory.py`**
601
+
602
+ Customize what the agent remembers and prioritizes:
603
+
604
+ ```python
605
+ # Edit the memory_step function around line 70
606
+ # Add custom key events tracking:
607
+
608
+ # Example: Track more specific events
609
+ if 'new_pokemon_caught' in state:
610
+ key_events.append(f"Caught new Pokemon: {state['new_pokemon_caught']}")
611
+
612
+ if 'item_found' in state:
613
+ key_events.append(f"Found item: {state['item_found']}")
614
+ ```
615
+
616
+ ### 🎨 Example: Creating a "Nuzlocke Challenge" Agent
617
+
618
+ Create a specialized agent for Nuzlocke rules:
619
+
620
+ 1. **Edit `agent/system_prompt.py`**:
621
+ ```python
622
+ system_prompt = """
623
+ You are playing Pokemon Emerald under strict Nuzlocke rules:
624
+ 1. You may only catch the first Pokemon in each area
625
+ 2. If a Pokemon faints, it's considered "dead" and must be released
626
+ 3. You must nickname all caught Pokemon
627
+ 4. Play very cautiously to avoid losing Pokemon
628
+ """
629
+ ```
630
+
631
+ 2. **Edit action prompts** to be more cautious about battles
632
+ 3. **Edit memory** to track "living" vs "dead" Pokemon
633
+ 4. **Edit perception** to emphasize Pokemon health monitoring
634
+
635
+ ### 🔧 Testing Your Changes
636
+
637
+ 1. Make your prompt edits
638
+ 2. Restart the agent: `python run.py --backend your-backend --model-name your-model`
639
+ 3. Monitor the logs to see how behavior changes
640
+ 4. Use `--debug-state` flag for detailed insights
641
+
642
+ ### 💡 Prompt Engineering Tips
643
+
644
+ - **Be specific**: Instead of "play well", say "prioritize type advantages and stat buffs"
645
+ - **Use examples**: Show the agent exactly what you want with concrete examples
646
+ - **Test iteratively**: Make small changes and observe the effects
647
+ - **Use sections**: Break complex prompts into clear sections with headers
648
+ - **Consider context**: Remember the agent sees game state, not just the screen
649
+
650
+ ## Advanced Configuration
651
+
652
+ ### Environment Variables
653
+
654
+ ```bash
655
+ # VLM API Keys
656
+ export OPENAI_API_KEY="your-openai-key"
657
+ export OPENROUTER_API_KEY="your-openrouter-key"
658
+ export GEMINI_API_KEY="your-gemini-key"
659
+
660
+ # Optional: make project modules importable (fixes "Module not found" errors)
661
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
662
+ ```
663
+
664
+ ### Local Model Optimization
665
+
666
+ For better performance with local models:
667
+
668
+ ```bash
669
+ # Use local models with appropriate hardware
670
+ python run.py --backend local --model-name "Qwen/Qwen2-VL-2B-Instruct"
671
+ ```
672
+
673
+ ## Troubleshooting
674
+
675
+ ### Common Issues
676
+
677
+ 1. **"Module not found" errors**:
678
+ ```bash
679
+ uv sync
680
+ export PYTHONPATH="${PYTHONPATH}:$(pwd)"
681
+ ```
682
+
683
+ 2. **Out of memory with local models**:
684
+ ```bash
685
+ # Try a smaller model or use cloud-based VLMs
686
+ python run.py --backend gemini --model-name "gemini-2.5-flash"
687
+ ```
688
+
689
+ 3. **Web interface connection issues**:
690
+ - Ensure run.py is running
691
+ - Check that the specified port (default 8000) is available
692
+ - Try accessing http://localhost:8000/stream directly
693
+
694
+ 4. **API rate limits**:
695
+ - Use OpenRouter for better rate limits
696
+ - Switch to local models for unlimited usage
697
+
698
+ ### Performance Tips
699
+
700
+ - **OpenAI**: Fastest for quick prototyping
701
+ - **Local models**: Best for extended runs, no API costs
702
+ - **Debug mode**: Use `--debug-state` only when needed (verbose output)
703
+
704
+ ## Fair Use and Modification Guidelines
705
+
706
+ ### ✅ Allowed Modifications
707
+
708
+ You are encouraged to modify and improve the agent in the following ways:
709
+
710
+ - **Agent Behavior**: Edit prompts in the `agent/` directory to change how the agent thinks and acts, or add new planning, memory, or training components
711
+ - **VLM Backends**: Add new VLM backends or modify existing ones in `utils/vlm.py`
712
+ - **Error Handling**: Improve error handling, retry logic, and fallback mechanisms
713
+ - **Logging and Debugging**: Enhance logging, add debugging tools, and improve observability
714
+ - **Testing**: Add new tests, improve test coverage, and enhance the testing framework
715
+ - **Documentation**: Update README, add comments, and improve code documentation
716
+ - **Performance**: Optimize code performance, add caching, and improve efficiency
717
+ - **UI/UX**: Enhance the web interface, add new visualizations, and improve user experience
718
+ - **Utilities**: Add helper functions, improve state formatting, and enhance utility modules
719
+
720
+ ### ❌ Restricted Modifications
721
+
722
+ The following modifications are **NOT ALLOWED** for competitive submissions:
723
+
724
+ - **Memory Reading**: Do not modify `pokemon_env/memory_reader.py` or any memory reading logic (e.g., do not read additional memory addresses that are not already being read). You are free to use the information already provided however you please (e.g., use the provided map, OR ignore the provided map and use the VLM for mapping).
725
+ - **State Observation**: Do not change how game state is extracted or interpreted from memory
726
+ - **Emulator Core**: Do not modify the mGBA emulator integration or core emulation logic
727
+ - **Anti-Cheat Bypass**: Do not attempt to bypass or modify the anti-cheat verification system
728
+ - **Game State Manipulation**: Do not directly manipulate game memory or state outside of normal button inputs
729
+
730
+ ### 🎯 What This Means
731
+
732
+ - **Focus on AI/ML**: Improve the agent's decision-making, planning, and reasoning
733
+ - **Enhance Infrastructure**: Make the system more robust, debuggable, and maintainable
734
+ - **Preserve Fairness**: Keep the core game state observation system unchanged for fair competition
735
+
736
+ ## Submission Instructions
737
+
738
+ Ready to compete in the PokéAgent Challenge? Follow these submission guidelines to participate in Track 2.
739
+
740
+ ### 🎯 Submission Overview
741
+
742
+ - **Objective**: Achieve maximum game completion in Pokémon Emerald under time constraints
743
+ - **Method**: Agents must interact exclusively through the custom Pokémon Emerald emulator API
744
+ - **Flexibility**: Use any method, as long as the final action comes from a neural network
745
+ - **Anti-cheat**: All submissions undergo verification to ensure fair competition
746
+
747
+ ### 📋 Submission Requirements
748
+
749
+ Your submission must include **all three** of the following components:
750
+
751
+ #### 1. **Code Archive**
752
+ - ZIP or TAR.GZ file containing your complete agent implementation
753
+ - Include all dependencies and a clear README with setup instructions
754
+ - Ensure your code is reproducible and well-documented
755
+
756
+ #### 2. **Action & State Logs**
757
+ - Detailed logs automatically created by this starter kit during your agent's run
758
+ - These logs are generated when you run `python run.py` and include:
759
+ - All agent actions and decisions with timestamps
760
+ - Game state information at each step with cryptographic hashes
761
+ - Performance metrics and decision timing analysis
762
+ - Anti-cheat verification data for submission validation
763
+ - LLM interaction logs for debugging and transparency
764
+
765
+ #### 3. **Video Evidence**
766
+ - YouTube link to a screen recording showing your complete speedrun
767
+ - Must show the entire run from start to finish
768
+ - Video should clearly demonstrate your agent's performance and final game state
769
+
770
+ ### 🏆 Evaluation Criteria
771
+
772
+ Your submission will be evaluated on:
773
+
774
+ 1. **Milestone Completion**: Percentage of game milestones accomplished (primary metric)
775
+ 2. **Completion Time**: Time taken to complete achieved milestones (secondary metric)
776
+ 3. **Reproducibility**: Clear documentation and reproducible results
777
+
778
+ ### 📝 How to Submit
779
+
780
+ Submit your complete package through the official Google Form:
781
+
782
+ **🔗 [Submit Here: https://forms.gle/nFciH9DrT4RKC1vt9](https://forms.gle/nFciH9DrT4RKC1vt9)**
783
+
784
+ ### 💡 Tips for Success
785
+
786
+ - **Test thoroughly**: Ensure your agent runs reliably for extended periods
787
+ - **Document everything**: Clear setup instructions help with reproducibility
788
+ - **Optimize for milestones**: Focus on completing key game objectives rather than perfect play
789
+ - **Monitor logs**: Use the generated logs to debug and improve your agent's performance
790
+ - **Record quality video**: Clear, uninterrupted footage helps with verification
791
+
792
+ The submission process emphasizes both performance (how much of the game you complete and how quickly) and transparency (providing logs and video evidence for verification).
793
+
794
+ ## Citation
795
+
796
+ If you use this codebase in your research, please cite:
797
+
798
+ ```bibtex
799
+ @inproceedings{karten2025pokeagent,
800
+ title = {The PokeAgent Challenge: Competitive and Long-Context Learning at Scale},
801
+ author = {Karten, Seth and Grigsby, Jake and Milani, Stephanie and Vodrahalli, Kiran
802
+ and Zhang, Amy and Fang, Fei and Zhu, Yuke and Jin, Chi},
803
+ booktitle = {NeurIPS Competition Track},
804
+ year = {2025},
805
+ month = apr,
806
+ }
807
+ ```
808
+
809
+ ## License
810
+
811
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. Make sure to comply with the terms of service of any VLM APIs you use.