synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (229)
  1. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  2. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
  3. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  4. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  5. examples/multi_step/crafter_rl_lora.md +51 -10
  6. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  7. examples/multi_step/task_app_config_notes.md +7 -1
  8. examples/swe/task_app/grpo_swe_mini.py +55 -26
  9. examples/swe/task_app/hosted/rollout.py +40 -0
  10. examples/swe/task_app/hosted/test_service.py +5 -6
  11. examples/task_apps/TESTING.md +275 -0
  12. examples/task_apps/__init__.py +0 -0
  13. examples/task_apps/crafter/__init__.py +0 -0
  14. examples/task_apps/crafter/task_app/__init__.py +2 -0
  15. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
  16. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  17. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  18. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  19. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
  20. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
  21. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  22. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  78. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  79. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  80. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  81. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  82. examples/task_apps/enron/__init__.py +1 -0
  83. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  84. examples/task_apps/enron/task_app/README.md +14 -0
  85. examples/task_apps/enron/task_app/__init__.py +1 -0
  86. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  87. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  88. examples/task_apps/enron/tests/__init__.py +2 -0
  89. examples/task_apps/enron/tests/conftest.py +115 -0
  90. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  91. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  92. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  93. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  94. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  95. examples/task_apps/math/__init__.py +0 -0
  96. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  97. examples/task_apps/pokemon_battle/__init__.py +2 -0
  98. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  99. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  100. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  101. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  102. examples/task_apps/pokemon_red/README.md +357 -0
  103. examples/task_apps/pokemon_red/__init__.py +3 -0
  104. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  105. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  106. examples/task_apps/pokemon_red/task_app.py +606 -0
  107. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  108. examples/task_apps/sokoban/README.md +307 -0
  109. examples/task_apps/sokoban/__init__.py +3 -0
  110. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  111. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  112. examples/task_apps/sokoban/task_app.py +1058 -0
  113. examples/task_apps/sokoban/tests/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/conftest.py +113 -0
  115. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  116. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  117. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  118. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  119. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  120. examples/task_apps/verilog/__init__.py +1 -0
  121. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  122. examples/task_apps/verilog/task_app/README.md +12 -0
  123. examples/task_apps/verilog/task_app/__init__.py +1 -0
  124. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  125. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  126. examples/task_apps/verilog/tests/__init__.py +2 -0
  127. examples/task_apps/verilog/tests/conftest.py +115 -0
  128. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  129. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  130. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  131. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  132. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  133. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  134. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  135. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
  136. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
  137. examples/warming_up_to_rl/run_eval.py +127 -18
  138. examples/workflows/__init__.py +0 -0
  139. examples/workflows/math_rl/__init__.py +0 -0
  140. examples/workflows/math_rl/download_dataset.py +80 -0
  141. synth_ai/__init__.py +41 -1
  142. synth_ai/api/train/builders.py +73 -29
  143. synth_ai/api/train/cli.py +12 -6
  144. synth_ai/api/train/configs/__init__.py +44 -0
  145. synth_ai/api/train/configs/rl.py +134 -0
  146. synth_ai/api/train/configs/sft.py +95 -0
  147. synth_ai/api/train/configs/shared.py +24 -0
  148. synth_ai/api/train/env_resolver.py +5 -2
  149. synth_ai/api/train/supported_algos.py +10 -5
  150. synth_ai/api/train/utils.py +7 -4
  151. synth_ai/cli/__init__.py +7 -51
  152. synth_ai/cli/_storage.py +4 -3
  153. synth_ai/cli/_validate_task_app.py +11 -0
  154. synth_ai/cli/balance.py +4 -3
  155. synth_ai/cli/calc.py +2 -2
  156. synth_ai/cli/demo.py +49 -43
  157. synth_ai/cli/legacy_root_backup.py +1 -1
  158. synth_ai/cli/rl_demo.py +86 -106
  159. synth_ai/cli/root.py +0 -97
  160. synth_ai/cli/task_apps.py +1710 -186
  161. synth_ai/demos/core/cli.py +121 -159
  162. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  163. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  164. synth_ai/environments/examples/enron/engine.py +7 -2
  165. synth_ai/environments/examples/enron/environment.py +68 -0
  166. synth_ai/environments/examples/red/engine.py +27 -0
  167. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  168. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  169. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  170. synth_ai/environments/examples/red/environment.py +60 -0
  171. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  172. synth_ai/environments/examples/verilog/engine.py +30 -4
  173. synth_ai/evals/__init__.py +15 -0
  174. synth_ai/evals/client.py +82 -0
  175. synth_ai/evals/types.py +42 -0
  176. synth_ai/jobs/client.py +16 -4
  177. synth_ai/judge_schemas.py +127 -0
  178. synth_ai/py.typed +0 -0
  179. synth_ai/task/__init__.py +14 -5
  180. synth_ai/task/contracts.py +124 -38
  181. synth_ai/task/proxy.py +48 -56
  182. synth_ai/task/rubrics/__init__.py +53 -0
  183. synth_ai/task/rubrics/loaders.py +133 -0
  184. synth_ai/task/rubrics/models.py +57 -0
  185. synth_ai/task/rubrics/scoring.py +113 -0
  186. synth_ai/task/rubrics/strict.py +149 -0
  187. synth_ai/task/server.py +8 -7
  188. synth_ai/task/validators.py +269 -6
  189. synth_ai/tracing_v3/decorators.py +7 -3
  190. synth_ai/tracing_v3/replica_sync.py +4 -4
  191. synth_ai/tracing_v3/serialization.py +130 -0
  192. synth_ai/tracing_v3/trace_utils.py +317 -0
  193. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  194. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  195. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
  196. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
  197. synth_ai/task/rubrics.py +0 -219
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  214. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  215. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  216. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  217. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  218. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  219. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  222. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  223. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  224. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  225. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  226. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  227. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  228. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  229. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,191 @@
1
+ """
2
+ Test script for Pallet Town Progression Rewards
3
+
4
+ This script demonstrates the reward function by simulating
5
+ a sequence of states representing the ideal Pallet Town progression.
6
+ """
7
+
8
+ import asyncio
9
+ from synth_ai.environments.examples.red.engine_helpers.reward_library.pallet_town_progression import (
10
+ PalletTownProgressionCompositeReward,
11
+ )
12
+
13
+
14
+ async def main():
15
+ """Simulate a perfect Pallet Town run and show rewards"""
16
+
17
+ reward_fn = PalletTownProgressionCompositeReward()
18
+ total_reward = 0.0
19
+
20
+ print("=" * 70)
21
+ print("PALLET TOWN PROGRESSION - REWARD SIMULATION")
22
+ print("=" * 70)
23
+ print()
24
+
25
+ # Step 1: Start in bedroom (Map 1)
26
+ state1 = {
27
+ "map_id": 1,
28
+ "player_x": 3,
29
+ "player_y": 4,
30
+ "party_count": 0,
31
+ "in_battle": False,
32
+ "text_box_active": False,
33
+ "battle_outcome": 0,
34
+ "enemy_hp_current": 0,
35
+ "enemy_hp_max": 0,
36
+ "enemy_hp_percentage": 0.0,
37
+ }
38
+ action1 = {
39
+ "prev_map_id": 1,
40
+ "prev_party_count": 0,
41
+ "prev_in_battle": False,
42
+ "prev_text_box_active": False,
43
+ "prev_enemy_hp_current": 0,
44
+ "prev_enemy_hp_percentage": 0.0,
45
+ }
46
+
47
+ # Step 2: Go downstairs (Map 1 -> Map 2)
48
+ state2 = {**state1, "map_id": 2, "player_y": 8}
49
+ action2 = {**action1, "prev_map_id": 1}
50
+
51
+ r = await reward_fn.score(state2, action2)
52
+ total_reward += r
53
+ print(f"✓ Leave bedroom (Map 1→2): +{r:.0f} points")
54
+
55
+ # Step 3: Exit house (Map 2 -> Map 0)
56
+ state3 = {**state2, "map_id": 0, "player_x": 5, "player_y": 7}
57
+ action3 = {**action2, "prev_map_id": 2}
58
+
59
+ r = await reward_fn.score(state3, action3)
60
+ total_reward += r
61
+ print(f"✓ Exit house to Pallet Town (Map 2→0): +{r:.0f} points")
62
+
63
+ # Step 4: Navigate to and enter Oak's Lab (Map 0 -> Map 3)
64
+ state4 = {**state3, "map_id": 3, "player_x": 4, "player_y": 11}
65
+ action4 = {**action3, "prev_map_id": 0}
66
+
67
+ r = await reward_fn.score(state4, action4)
68
+ total_reward += r
69
+ print(f"✓ Find and enter Oak's Lab (Map 0→3): +{r:.0f} points")
70
+
71
+ # Step 5: Talk to Oak (text box appears)
72
+ state5 = {**state4, "text_box_active": True}
73
+ action5 = {**action4, "prev_text_box_active": False}
74
+
75
+ r = await reward_fn.score(state5, action5)
76
+ total_reward += r
77
+ print(f"✓ Talk to Professor Oak: +{r:.0f} points")
78
+
79
+ # Step 6: Receive starter Pokemon (party count 0 -> 1)
80
+ state6 = {
81
+ **state5,
82
+ "party_count": 1,
83
+ "party_pokemon": [
84
+ {
85
+ "species_id": 4, # Charmander
86
+ "level": 5,
87
+ "hp_current": 20,
88
+ "hp_max": 20,
89
+ "hp_percentage": 100.0,
90
+ }
91
+ ],
92
+ }
93
+ action6 = {**action5, "prev_party_count": 0}
94
+
95
+ r = await reward_fn.score(state6, action6)
96
+ total_reward += r
97
+ print(f"✓ Receive starter Pokemon: +{r:.0f} points")
98
+
99
+ # Step 7: Enter first battle
100
+ state7 = {**state6, "in_battle": True, "text_box_active": False,
101
+ "enemy_hp_current": 20, "enemy_hp_max": 20, "enemy_hp_percentage": 100.0}
102
+ action7 = {**action6, "prev_in_battle": False, "prev_text_box_active": True}
103
+
104
+ r = await reward_fn.score(state7, action7)
105
+ total_reward += r
106
+ print(f"✓ Enter first battle with rival: +{r:.0f} points")
107
+
108
+ # Step 8-12: Deal damage (5 attacks)
109
+ print()
110
+ print("Battle sequence:")
111
+ for i in range(5):
112
+ prev_hp = 20 - (i * 4)
113
+ curr_hp = 20 - ((i + 1) * 4)
114
+ state_dmg = {
115
+ **state7,
116
+ "enemy_hp_current": curr_hp,
117
+ "enemy_hp_percentage": (curr_hp / 20) * 100,
118
+ }
119
+ action_dmg = {
120
+ **action7,
121
+ "prev_in_battle": True,
122
+ "prev_enemy_hp_current": prev_hp,
123
+ "prev_enemy_hp_percentage": (prev_hp / 20) * 100,
124
+ }
125
+
126
+ r = await reward_fn.score(state_dmg, action_dmg)
127
+ total_reward += r
128
+
129
+ # Check for half HP and low HP milestones
130
+ if r > 5: # Got bonus reward
131
+ if (prev_hp / 20) >= 0.5 and (curr_hp / 20) < 0.5:
132
+ print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} (+5) + Half HP bonus (+25) = +{r:.0f}")
133
+ elif (prev_hp / 20) >= 0.25 and (curr_hp / 20) < 0.25:
134
+ print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} (+5) + Low HP bonus (+35) = +{r:.0f}")
135
+ else:
136
+ print(f" → Attack {i+1}: Enemy HP {prev_hp}→{curr_hp} +{r:.0f} points")
137
+
138
+ print()
139
+
140
+ # Step 13: Win battle
141
+ state13 = {
142
+ **state7,
143
+ "in_battle": False,
144
+ "battle_outcome": 1, # Win
145
+ "enemy_hp_current": 0,
146
+ "enemy_hp_percentage": 0.0,
147
+ "battle_turn": 4,
148
+ "party_pokemon": [
149
+ {
150
+ "species_id": 4,
151
+ "level": 5,
152
+ "hp_current": 15, # 75% HP
153
+ "hp_max": 20,
154
+ "hp_percentage": 75.0,
155
+ }
156
+ ],
157
+ }
158
+ action13 = {
159
+ **action7,
160
+ "prev_in_battle": True,
161
+ "prev_enemy_hp_current": 0,
162
+ }
163
+
164
+ r = await reward_fn.score(state13, action13)
165
+ total_reward += r
166
+ print(f"✓ Win first battle: +{r:.0f} points")
167
+
168
+ # Step 14: Exit lab with Pokemon (Map 3 -> Map 0)
169
+ state14 = {**state13, "map_id": 0, "player_x": 5, "player_y": 11}
170
+ action14 = {**action13, "prev_map_id": 3}
171
+
172
+ r = await reward_fn.score(state14, action14)
173
+ total_reward += r
174
+ print(f"✓ Exit Oak's Lab with Pokemon (Map 3→0): +{r:.0f} points")
175
+
176
+ print()
177
+ print("=" * 70)
178
+ print(f"TOTAL REWARD: {total_reward:.0f} points")
179
+ print("=" * 70)
180
+ print()
181
+ print("Breakdown by category:")
182
+ print(" Navigation: 150 points (bedroom, house, lab, exit)")
183
+ print(" Story: 150 points (talk to Oak, get Pokemon)")
184
+ print(" Battle: 335 points (enter, damage, milestones, win)")
185
+ print(" Efficiency: ~100 points (battle speed, health, navigation)")
186
+ print()
187
+
188
+
189
+ if __name__ == "__main__":
190
+ asyncio.run(main())
191
+
@@ -0,0 +1,307 @@
1
+ # Sokoban Task App
2
+
3
+ A task app for training and evaluating LLM agents on Sokoban puzzles.
4
+
5
+ Sokoban is a classic puzzle game where the player must push boxes onto target locations. It's a good benchmark for spatial reasoning, planning, and sequential decision-making.
6
+
7
+ ## Features
8
+
9
+ - 🎮 Multiple difficulty levels (easy, medium, hard)
10
+ - 🤖 LLM policy support (GPT-5-mini, Qwen)
11
+ - 📊 Supports both RL training and evaluation rollouts
12
+ - 🎯 Rich observations with ASCII grid visualization
13
+ - ⚡ Batched actions (up to 8 actions per LLM call)
14
+
15
+ ## Quick Start
16
+
17
+ ### 1. Start the Server
18
+
19
+ ```bash
20
+ cd /path/to/synth-ai
21
+
22
+ # Start the Sokoban task app on port 8911
23
+ uvx synth-ai task-app serve sokoban --port 8911
24
+ ```
25
+
26
+ The server will be available at `http://localhost:8911`.
27
+
28
+ ### 2. Run a Test Rollout
29
+
30
+ #### Option A: Using GPT-5-mini
31
+
32
+ ```bash
33
+ export OPENAI_API_KEY="your-api-key"
34
+
35
+ python3 << 'EOF'
36
+ import httpx
37
+ import asyncio
38
+
39
+ async def test_gpt5mini():
40
+ async with httpx.AsyncClient(timeout=600.0) as client: # Longer timeout
41
+ print("🎮 Testing with GPT-5-mini (slower due to reasoning tokens)...\n")
42
+
43
+ response = await client.post(
44
+ "http://localhost:8911/rollout",
45
+ json={
46
+ "run_id": "test_gpt5mini",
47
+ "env": {"seed": 123, "config": {"difficulty": "easy", "max_steps": 100}},
48
+ "ops": ["policy"] * 5, # Fewer calls due to slowness
49
+ "policy": {
50
+ "config": {
51
+ "provider": "openai",
52
+ "model": "gpt-5-mini",
53
+ "max_actions_per_call": 8
54
+ }
55
+ }
56
+ },
57
+ headers={"Authorization": "Bearer sk_env_your_key_here"}
58
+ )
59
+
60
+ result = response.json()
61
+ traj = result["trajectories"][0]
62
+ final = traj["final"]["observation"]
63
+
64
+ print(f"Boxes: {final['boxes_on_target']}/{final['num_boxes']}")
65
+ print(f"Steps: {final['steps_taken']}")
66
+
67
+ asyncio.run(test_gpt5mini())
68
+ EOF
69
+ ```
70
+
71
+ #### Option B: Using Qwen via Groq (Fast & Cheap)
72
+
73
+ ```bash
74
+ export GROQ_API_KEY="your-groq-key"
75
+
76
+ python3 << 'EOF'
77
+ import httpx
78
+ import asyncio
79
+
80
+ async def test_qwen():
81
+ async with httpx.AsyncClient(timeout=300.0) as client:
82
+ response = await client.post(
83
+ "http://localhost:8911/rollout",
84
+ json={
85
+ "run_id": "test_qwen",
86
+ "env": {"seed": 123, "config": {"difficulty": "easy", "max_steps": 100}},
87
+ "ops": ["policy"] * 15,
88
+ "policy": {
89
+ "config": {
90
+ "provider": "groq",
91
+ "model": "qwen-2.5-7b",
92
+ "max_actions_per_call": 8
93
+ }
94
+ }
95
+ },
96
+ headers={"Authorization": "Bearer sk_env_your_key_here"}
97
+ )
98
+
99
+ result = response.json()
100
+ traj = result["trajectories"][0]
101
+ final = traj["final"]["observation"]
102
+
103
+ print(f"Result: {'✅ SOLVED!' if final['boxes_on_target'] == final['num_boxes'] else '❌ Not solved'}")
104
+ print(f"Boxes: {final['boxes_on_target']}/{final['num_boxes']}")
105
+
106
+ asyncio.run(test_qwen())
107
+ EOF
108
+ ```
109
+
110
+ ## Configuration Options
111
+
112
+ ### Environment Config
113
+
114
+ ```python
115
+ {
116
+ "seed": 123, # Random seed for puzzle generation
117
+ "config": {
118
+ "difficulty": "easy", # "easy", "medium", or "hard"
119
+ "max_steps": 100 # Maximum steps before truncation
120
+ }
121
+ }
122
+ ```
123
+
124
+ ### Policy Config
125
+
126
+ ```python
127
+ {
128
+ "provider": "openai", # "openai" or "groq"
129
+ "model": "gpt-5-mini", # Model name
130
+ "max_actions_per_call": 8, # Actions per policy call (1-8)
131
+ "temperature": 0.7, # Temperature (optional)
132
+ "max_completion_tokens": 4000 # Max tokens (optional)
133
+ }
134
+ ```
135
+
136
+ ## Model Recommendations
137
+
138
+ | Model | Status | Speed | Notes |
139
+ |-------|--------|-------|-------|
140
+ | **gpt-5-mini** | ✅ Recommended | Slow (30-50s/call) | Uses 1500-2750 reasoning tokens per call |
141
+ | **gpt-5** | ❌ Not supported | N/A | Tool calling is unreliable (returns empty responses) |
142
+ | **gpt-5-nano** | ❌ Not supported | N/A | Tool calling is unreliable (returns empty responses) |
143
+ | **qwen-2.5-7b** (Groq) | ✅ Works | Very fast | Cheap and fast alternative |
144
+
145
+ ### Why is GPT-5-mini slow?
146
+
147
+ GPT-5-mini uses extensive internal reasoning (1500-2750 reasoning tokens per call) before generating actions. While this could lead to better puzzle-solving, it makes each policy call take 30-50 seconds.
148
+
149
+ Example usage breakdown:
150
+ ```jsonc
151
+ {
152
+ "usage": {
153
+ "completion_tokens": 2465,
154
+ "reasoning_tokens": 2432, // Deep thinking!
155
+ "prompt_tokens": 470
156
+ }
157
+ }
158
+ ```
159
+
160
+ ## Observation Format
161
+
162
+ Each observation includes:
163
+
164
+ ```python
165
+ {
166
+ "room_text": str, # ASCII visualization of the puzzle
167
+ "player_position": [x, y], # Player coordinates
168
+ "boxes_on_target": int, # Number of boxes on target squares
169
+ "num_boxes": int, # Total number of boxes
170
+ "steps_taken": int, # Steps taken so far
171
+ "max_steps": int, # Maximum allowed steps
172
+ "last_action": str, # Last action taken
173
+ "reward_last": float, # Reward from last step
174
+ "total_reward": float, # Cumulative reward
175
+ "terminated": bool, # Puzzle solved?
176
+ "truncated": bool # Max steps reached?
177
+ }
178
+ ```
179
+
180
+ ### ASCII Legend
181
+
182
+ - `P` = Player
183
+ - `O` = Box
184
+ - `X` = Target square
185
+ - `@` = Box on target
186
+ - `+` = Player on target
187
+ - `#` = Wall
188
+ - `_` = Floor
189
+
190
+ ## Action Space
191
+
192
+ The agent uses the `interact_many` tool to execute multiple actions in sequence:
193
+
194
+ ```python
195
+ {
196
+ "tool": "interact_many",
197
+ "args": {
198
+ "actions": [0, 1, 2, 3] # 0=left, 1=up, 2=right, 3=down
199
+ }
200
+ }
201
+ ```
202
+
203
+ Or with string names:
204
+ ```python
205
+ {
206
+ "actions": ["left", "up", "right", "down"]
207
+ }
208
+ ```
209
+
210
+ ## Training with RL
211
+
212
+ The Sokoban task app supports RL training. Example config:
213
+
214
+ ```toml
215
+ # sokoban_rl_config.toml
216
+ [task_app]
217
+ url = "http://localhost:8911"
218
+ auth_token = "sk_env_your_key_here"
219
+
220
+ [rl]
221
+ algorithm = "grpo"
222
+ num_episodes = 1000
223
+ batch_size = 32
224
+
225
+ [policy]
226
+ provider = "groq"
227
+ model = "qwen-2.5-7b"
228
+ max_actions_per_call = 8
229
+
230
+ [env]
231
+ difficulty = "easy"
232
+ max_steps = 100
233
+ ```
234
+
235
+ Run training:
236
+ ```bash
237
+ uvx synth-ai train --config sokoban_rl_config.toml
238
+ ```
239
+
240
+ ## Debugging
241
+
242
+ ### Check server health
243
+ ```bash
244
+ curl http://localhost:8911/health
245
+ ```
246
+
247
+ ### View server logs
248
+ ```bash
249
+ # If running with nohup
250
+ tail -f nohup_sokoban.log
251
+
252
+ # Filter for important logs
253
+ tail -f nohup_sokoban.log | grep -E "extract|debug|error"
254
+ ```
255
+
256
+ ### Test with explicit actions
257
+ ```python
258
+ # Instead of "policy", provide explicit actions
259
+ "ops": [
260
+ {"button": "right", "count": 3},
261
+ {"button": "down", "count": 2}
262
+ ]
263
+ ```
264
+
265
+ ## Troubleshooting
266
+
267
+ ### Empty responses from LLM
268
+ - **GPT-5/GPT-5-nano**: These models don't support tool calling reliably. Use GPT-5-mini instead.
269
+ - **Timeout errors**: GPT-5-mini is slow. Increase client timeout to 600+ seconds or use fewer policy calls.
270
+
271
+ ### Puzzle not solving
272
+ - Try more policy calls (15-30)
273
+ - Use a different seed
274
+ - Try "easy" difficulty first
275
+ - Check if the agent is stuck in a loop (repeating same actions)
276
+
277
+ ### Server won't start
278
+ ```bash
279
+ # Check if port is in use
280
+ lsof -i :8911
281
+
282
+ # Kill existing process
283
+ kill -9 $(lsof -ti :8911)
284
+
285
+ # Restart
286
+ uvx synth-ai task-app serve sokoban --port 8911
287
+ ```
288
+
289
+ ## Examples
290
+
291
+ See the `examples/workflows/` directory for:
292
+ - RL training scripts
293
+ - Evaluation scripts
294
+ - Multi-episode parallel evaluation
295
+
296
+ ## Contributing
297
+
298
+ To add new features:
299
+ 1. Edit `task_app.py` for core logic
300
+ 2. Update `_base_task_info()` for new observation/action specs
301
+ 3. Modify `rollout_executor()` for custom rollout behavior
302
+ 4. Add tests in `tests/integration/`
303
+
304
+ ## License
305
+
306
+ MIT
307
+
@@ -0,0 +1,3 @@
1
+ """Sokoban task app example package."""
2
+
3
+
@@ -0,0 +1,16 @@
1
+ # Evaluation config for running Groq Qwen/Qwen3-32B against the Sokoban task app.
2
+
3
+ provider = "groq"
4
+ task_app_url = "http://127.0.0.1:8911"
5
+ model = "qwen/qwen3-32b"
6
+ seeds = [123]
7
+ max_turns = 60
8
+ concurrency = 1
9
+
10
+ [policy]
11
+ provider = "groq"
12
+ model = "qwen/qwen3-32b"
13
+ temperature = 0.2
14
+ top_p = 0.95
15
+ max_tokens = 8000
16
+ max_actions_per_call = 4
@@ -0,0 +1,16 @@
1
+ # Evaluation config for running OpenAI GPT-5 against the Sokoban task app.
2
+
3
+ provider = "openai"
4
+ task_app_url = "http://127.0.0.1:8911"
5
+ model = "gpt-5"
6
+ seeds = [123]
7
+ max_turns = 60
8
+ concurrency = 1
9
+
10
+ [policy]
11
+ provider = "openai"
12
+ model = "gpt-5"
13
+ temperature = 0.2
14
+ top_p = 0.9
15
+ max_completion_tokens = 4000
16
+ max_actions_per_call = 4