synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -12,7 +12,7 @@ variety = "gspo"
12
12
  # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
13
13
  task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
14
14
  # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
15
- judge_url = "https://YOUR-BACKEND-ENDPOINT/api"
15
+ judge_url = "https://synth-backend-dev-docker.onrender.com/api"
16
16
 
17
17
  [compute]
18
18
  gpu_type = "H200"
@@ -101,6 +101,9 @@ verify_every_k = 0
101
101
 
102
102
  [rubric]
103
103
  enabled = true
104
+ model = "openai/gpt-oss-120b"
105
+ api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
106
+ api_key_env = "OPENAI_API_KEY"
104
107
  # Blend the hosted judge scores with environment returns inside the trainer.
105
108
  [rubric.weights]
106
109
  env = 0.2
@@ -110,10 +113,18 @@ outcome = 0.4
110
113
  [rubric.event]
111
114
  # Hosted judge rubric for per-decision progress scoring.
112
115
  rubric_id = "crafter/event@v1"
116
+ criteria = [
117
+ { key = "progress.unique_achievements", weight = 0.9, description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0.", aggregation = "weighted_sum" },
118
+ { key = "process.intent_alignment", weight = 0.1, description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock.", aggregation = "weighted_sum" },
119
+ ]
113
120
 
114
121
  [rubric.outcome]
115
122
  # Hosted judge rubric for final trajectory scoring.
116
123
  rubric_id = "crafter/outcome@v1"
124
+ criteria = [
125
+ { key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
126
+ { key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
127
+ ]
117
128
 
118
129
  [judge]
119
130
  type = "gemini" # or "groq" when routing to Groq-hosted judges
@@ -60,34 +60,55 @@ try:
60
60
  HAS_HOSTED = True
61
61
  except Exception:
62
62
  try: # pragma: no cover - optional dependency path
63
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.branching import ( # type: ignore
64
- router as branching_router,
63
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.branching import ( # type: ignore
64
+ BranchingEnvironmentConfig,
65
65
  )
66
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.environment_routes import ( # type: ignore # noqa: E501
67
- router as environment_router,
66
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.environment_routes import ( # type: ignore # noqa: E501
67
+ CrafterEnvironmentRoutes,
68
68
  )
69
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.policy_routes import ( # type: ignore
70
- router as policy_router,
69
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.policy_routes import ( # type: ignore
70
+ PolicyRoutes,
71
71
  )
72
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import ( # type: ignore
72
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import ( # type: ignore
73
+ RolloutPayload,
74
+ )
75
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
76
+ EnvironmentConfig,
77
+ )
78
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
79
+ PolicyConfig,
80
+ )
81
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
82
+ RolloutRequest,
83
+ )
84
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
85
+ RolloutResponse,
86
+ )
87
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
88
+ RunSpec,
89
+ )
90
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.rollout import (
91
+ ToolUse,
92
+ )
93
+ from examples.task_apps.crafter.task_app.hosted.rollout import ( # type: ignore
73
94
  RolloutEnvSpec as LegacyRolloutEnvSpec,
74
95
  )
75
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
96
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
76
97
  RolloutPolicySpec as LegacyRolloutPolicySpec,
77
98
  )
78
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
99
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
79
100
  RolloutRecordConfig as LegacyRolloutRecordConfig,
80
101
  )
81
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
102
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
82
103
  RolloutRequest as LegacyRolloutRequest,
83
104
  )
84
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
105
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
85
106
  RolloutResponse as LegacyRolloutResponse,
86
107
  )
87
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
108
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
88
109
  RolloutSafetyConfig as LegacyRolloutSafetyConfig,
89
110
  )
90
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.rollout import (
111
+ from examples.task_apps.crafter.task_app.hosted.rollout import (
91
112
  execute_rollout as legacy_execute_rollout,
92
113
  )
93
114
  HAS_HOSTED = True
@@ -264,7 +285,7 @@ def build_dataset() -> tuple[TaskDatasetRegistry, MiniSweDataset]:
264
285
  def _base_task_info(dataset: MiniSweDataset) -> TaskInfo:
265
286
  return TaskInfo(
266
287
  task={"id": "swe_mini", "name": "mini-SWE Tasks", "version": "0.1.0"},
267
- environments=["swe-mini"],
288
+ environment="swe-mini",
268
289
  action_space={
269
290
  "type": "tool",
270
291
  "tools": ["run_command", "submit_patch"],
@@ -292,11 +313,6 @@ def _base_task_info(dataset: MiniSweDataset) -> TaskInfo:
292
313
  },
293
314
  "tool": {"name": "run_command", "parallel_tool_calls": False},
294
315
  },
295
- capabilities={
296
- "supports_rollout": True,
297
- "supports_env_lifecycle": True,
298
- "requires_api_key_header": True,
299
- },
300
316
  limits={"max_ops": 2000, "max_time_s": 7200},
301
317
  )
302
318
 
@@ -348,18 +364,31 @@ def provide_task_instances(
348
364
  dataset: MiniSweDataset, base_info: TaskInfo, seeds: Sequence[int]
349
365
  ) -> Iterable[TaskInfo]:
350
366
  infos: list[TaskInfo] = []
367
+ base_observation = getattr(base_info, "observation", None)
368
+ if hasattr(base_observation, "model_dump"):
369
+ base_observation_data = base_observation.model_dump()
370
+ elif isinstance(base_observation, dict):
371
+ base_observation_data = dict(base_observation)
372
+ else:
373
+ base_observation_data = {}
374
+
351
375
  for seed in seeds:
352
376
  instance = dataset.sample_by_index(int(seed))
353
377
  infos.append(
354
378
  TaskInfo(
355
379
  task=base_info.task,
356
- environments=base_info.environments,
380
+ environment=base_info.environment,
357
381
  action_space=base_info.action_space,
358
- observation={**base_info.observation, "instance_id": instance["instance_id"]},
359
- dataset={**base_info.dataset, "instance_id": instance["instance_id"]},
382
+ observation={
383
+ **base_observation_data,
384
+ "instance_id": instance["instance_id"],
385
+ },
386
+ dataset={
387
+ **base_info.dataset.model_dump(),
388
+ "instance_id": instance["instance_id"],
389
+ },
360
390
  rubric=base_info.rubric,
361
391
  inference=base_info.inference,
362
- capabilities=base_info.capabilities,
363
392
  limits=base_info.limits,
364
393
  )
365
394
  )
@@ -397,10 +426,10 @@ def build_config() -> TaskAppConfig:
397
426
  HostedTaskAppCls = HostedTaskApp
398
427
  except Exception:
399
428
  try:
400
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.hosted_app import ( # type: ignore
401
- TaskApp as HostedTaskApp,
429
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.hosted_app import ( # type: ignore
430
+ create_app,
402
431
  )
403
- HostedTaskAppCls = HostedTaskApp
432
+ HostedTaskAppCls = create_app
404
433
  except Exception as exc: # pragma: no cover - optional dependency path
405
434
  logger.warning("Unable to import HostedTaskApp for swe-mini: %s", exc)
406
435
  if HostedTaskAppCls is not None:
@@ -1238,6 +1238,15 @@ async def execute_rollout(
1238
1238
  )
1239
1239
 
1240
1240
  # Build partial trajectory and return HTTP 200
1241
+ # Extract inference_url from policy meta (best effort)
1242
+ inference_url = None
1243
+ if policy_handle is not None:
1244
+ try:
1245
+ policy_snapshot = policy_handle.snapshot()
1246
+ inference_url = policy_snapshot.get("config", {}).get("inference_url")
1247
+ except Exception:
1248
+ pass
1249
+
1241
1250
  trajectory = RolloutTrajectory(
1242
1251
  env_id=env_id,
1243
1252
  policy_id=policy_id,
@@ -1249,6 +1258,7 @@ async def execute_rollout(
1249
1258
  "at_op": op,
1250
1259
  },
1251
1260
  length=len(trajectory_steps),
1261
+ inference_url=inference_url, # NEW: Required for trace correlation
1252
1262
  decision_samples=decision_samples if step_rewards_active else None,
1253
1263
  )
1254
1264
  metrics = RolloutMetrics(
@@ -1369,6 +1379,15 @@ async def execute_rollout(
1369
1379
  },
1370
1380
  )
1371
1381
  trajectory_steps.append(term_step)
1382
+ # Extract inference_url from policy meta (best effort)
1383
+ inference_url = None
1384
+ if policy_handle is not None:
1385
+ try:
1386
+ policy_snapshot = policy_handle.snapshot()
1387
+ inference_url = policy_snapshot.get("config", {}).get("inference_url")
1388
+ except Exception:
1389
+ pass
1390
+
1372
1391
  trajectory = RolloutTrajectory(
1373
1392
  env_id=env_id,
1374
1393
  policy_id=policy_id,
@@ -1379,6 +1398,7 @@ async def execute_rollout(
1379
1398
  "at_op": op,
1380
1399
  },
1381
1400
  length=len(trajectory_steps),
1401
+ inference_url=inference_url, # NEW: Required for trace correlation
1382
1402
  decision_samples=decision_samples if step_rewards_active else None,
1383
1403
  )
1384
1404
  metrics = RolloutMetrics(
@@ -1460,6 +1480,15 @@ async def execute_rollout(
1460
1480
  )
1461
1481
  trajectory_steps.append(term_step)
1462
1482
  # Build partial response
1483
+ # Extract inference_url from policy meta (best effort)
1484
+ inference_url = None
1485
+ if policy_handle is not None:
1486
+ try:
1487
+ policy_snapshot = policy_handle.snapshot()
1488
+ inference_url = policy_snapshot.get("config", {}).get("inference_url")
1489
+ except Exception:
1490
+ pass
1491
+
1463
1492
  trajectory = RolloutTrajectory(
1464
1493
  env_id=env_id,
1465
1494
  policy_id=policy_id,
@@ -1471,6 +1500,7 @@ async def execute_rollout(
1471
1500
  "at_op": op,
1472
1501
  },
1473
1502
  length=len(trajectory_steps),
1503
+ inference_url=inference_url, # NEW: Required for trace correlation
1474
1504
  decision_samples=decision_samples if step_rewards_active else None,
1475
1505
  )
1476
1506
  metrics = RolloutMetrics(
@@ -1688,12 +1718,22 @@ async def execute_rollout(
1688
1718
  timing_final.setdefault("overhead_ms", 0.0)
1689
1719
 
1690
1720
  # Build trajectory
1721
+ # Extract inference_url from policy meta
1722
+ inference_url = None
1723
+ if policy_handle is not None:
1724
+ try:
1725
+ policy_snapshot = policy_handle.snapshot()
1726
+ inference_url = policy_snapshot.get("config", {}).get("inference_url")
1727
+ except Exception:
1728
+ pass
1729
+
1691
1730
  trajectory = RolloutTrajectory(
1692
1731
  env_id=env_id,
1693
1732
  policy_id=policy_id,
1694
1733
  steps=trajectory_steps,
1695
1734
  final={"observation": _summarize_observation_for_storage(env_handle, current_obs)},
1696
1735
  length=len(trajectory_steps),
1736
+ inference_url=inference_url, # NEW: Required for trace correlation
1697
1737
  decision_samples=decision_samples if step_rewards_active else None,
1698
1738
  )
1699
1739
 
@@ -1,15 +1,14 @@
1
1
  #!/usr/bin/env python3
2
- """
3
- Simple test script for the GRPO Synth Envs Hosted Service.
4
-
5
- Run this after starting the service with:
6
- python main.py
7
- """
2
+ """Manual smoke script for the GRPO Synth Envs Hosted Service."""
8
3
 
9
4
  import asyncio
10
5
  import json
11
6
 
12
7
  import httpx
8
+ import pytest
9
+
10
+
11
+ pytestmark = pytest.mark.skip(reason="Requires running hosted service on localhost:8000")
13
12
 
14
13
 
15
14
  async def test_service():
@@ -0,0 +1,275 @@
1
+ # Task App Testing Guide
2
+
3
+ This document describes how to run tests for the task apps in this directory.
4
+
5
+ ## Overview
6
+
7
+ Each task app has unit and integration tests following a consistent pattern inspired by the customer environment tests in `customers/`.
8
+
9
+ ## Test Structure
10
+
11
+ ```
12
+ examples/task_apps/<app_name>/tests/
13
+ ├── __init__.py
14
+ ├── integration/
15
+ │ ├── __init__.py
16
+ │ └── test_<app>_eval.py # Server startup + eval tests
17
+ └── unit/
18
+ ├── __init__.py
19
+ └── test_<app>_*.py # Environment, scoring, dataset tests
20
+ ```
21
+
22
+ ## Running Tests
23
+
24
+ ### Prerequisites
25
+
26
+ ```bash
27
+ # Install test dependencies
28
+ uv sync --dev
29
+
30
+ # Set required environment variables
31
+ export GROQ_API_KEY="your-groq-key"
32
+ export OPENAI_API_KEY="your-openai-key" # For Sokoban
33
+ ```
34
+
35
+ ### Run All Tests for a Task App
36
+
37
+ ```bash
38
+ # Verilog
39
+ pytest examples/task_apps/verilog/tests/ -v
40
+
41
+ # Enron
42
+ pytest examples/task_apps/enron/tests/ -v
43
+
44
+ # Sokoban
45
+ pytest examples/task_apps/sokoban/tests/ -v
46
+ ```
47
+
48
+ ### Run Only Unit Tests (Fast)
49
+
50
+ ```bash
51
+ # Runs quickly, no server startup required
52
+ pytest examples/task_apps/verilog/tests/unit/ -v
53
+ pytest examples/task_apps/enron/tests/unit/ -v
54
+ pytest examples/task_apps/sokoban/tests/unit/ -v
55
+ ```
56
+
57
+ ### Run Only Integration Tests
58
+
59
+ ```bash
60
+ # Slower, starts servers and runs evals
61
+ pytest examples/task_apps/verilog/tests/integration/ -v
62
+ pytest examples/task_apps/enron/tests/integration/ -v
63
+ pytest examples/task_apps/sokoban/tests/integration/ -v
64
+ ```
65
+
66
+ ### Run All Task App Tests
67
+
68
+ ```bash
69
+ # Run everything
70
+ pytest examples/task_apps/*/tests/ -v
71
+
72
+ # Skip slow tests
73
+ pytest examples/task_apps/*/tests/ -v -m "not slow"
74
+ ```
75
+
76
+ ## Test Categories
77
+
78
+ ### Unit Tests
79
+
80
+ **Purpose**: Test individual components in isolation
81
+ - Environment initialization
82
+ - Reward calculation
83
+ - Tool implementations
84
+ - State management
85
+
86
+ **Characteristics**:
87
+ - Fast (< 1 second each)
88
+ - No external dependencies
89
+ - No server startup
90
+ - No API calls
91
+
92
+ **Examples**:
93
+ - `test_verilog_scoring.py`: Tests reward components (compile, simulate, submit)
94
+ - `test_enron_environment.py`: Tests search, answer, reward calculation
95
+ - `test_sokoban_environment.py`: Tests actions, rewards, truncation
96
+
97
+ ### Integration Tests
98
+
99
+ **Purpose**: Test the full system end-to-end
100
+ - Server startup
101
+ - Health/info endpoints
102
+ - Full evaluation runs
103
+ - **Rollout execution** (manual and policy-driven)
104
+
105
+ **Characteristics**:
106
+ - Slower (30-300 seconds)
107
+ - Requires server startup
108
+ - May require API keys
109
+ - Tests real workflows
110
+
111
+ **Examples**:
112
+ - `test_verilog_eval.py`: Starts server, runs Groq eval with Qwen3-32B
113
+ - `test_verilog_rollout.py`: **Manual & policy rollouts via /rollout endpoint**
114
+ - `test_enron_eval.py`: Starts server, runs Groq eval
115
+ - `test_enron_rollout.py`: **Manual & policy rollouts, auth testing**
116
+ - `test_sokoban_eval.py`: Starts server, tests manual rollout
117
+ - `test_sokoban_rollout.py`: **6 rollout tests (manual, policy, difficulties, limits)**
118
+
119
+ ## What Each Test Validates
120
+
121
+ ### Verilog Tests
122
+
123
+ **Unit Tests** (4 tests):
124
+ - ✅ Compile success gives +0.1 reward
125
+ - ✅ Simulation pass gives +1.0 reward
126
+ - ✅ Submit success gives +10.0 reward
127
+ - ✅ Submit checks last simulation output correctly
128
+
129
+ **Integration Tests** (5 tests):
130
+ - ✅ Server starts and responds to /health
131
+ - ✅ /task_info returns valid Verilog task metadata
132
+ - ✅ Full eval with Qwen3-32B completes successfully
133
+ - ✅ **Manual rollout** with explicit write/compile/simulate/submit
134
+ - ✅ **Policy rollout** using Groq/Qwen3-32B (verifies LLM integration)
135
+
136
+ ### Enron Tests
137
+
138
+ **Unit Tests** (3 tests, covering the 5 checks below):
139
+ - ✅ search_emails tool works correctly
140
+ - ✅ answer_question tool calculates rewards
141
+ - ✅ Exact answer match gives high reward (>0.9)
142
+ - ✅ Partial answer match gives medium reward (>0.5)
143
+ - ✅ Wrong answer gives low reward (<0.5)
144
+
145
+ **Integration Tests** (6 tests):
146
+ - ✅ Server starts and responds to /health
147
+ - ✅ /task_info returns valid Enron task metadata
148
+ - ✅ Full eval with Qwen3-32B completes successfully
149
+ - ✅ **Manual rollout** with explicit search/read/answer actions
150
+ - ✅ **Policy rollout** using Groq/Qwen3-32B
151
+ - ✅ **Authentication** enforcement (rejects requests without auth header)
152
+
153
+ ### Sokoban Tests
154
+
155
+ **Unit Tests** (3 tests):
156
+ - ✅ Module imports work correctly
157
+ - ✅ Reward components exist (goal achieved, step penalty)
158
+ - ✅ Engine creation with different difficulty levels
159
+
160
+ **Integration Tests** (9 tests):
161
+ - ✅ Server starts and responds to /health
162
+ - ✅ /task_info returns valid Sokoban task metadata
163
+ - ✅ **Manual rollout** with movement actions (left/right/up/down)
164
+ - ✅ **Policy rollout** with OpenAI GPT-5-mini (may skip if slow)
165
+ - ✅ **All difficulty levels** (easy/medium/hard) work correctly
166
+ - ✅ **Max steps limit** enforcement (stops at configured limit)
167
+ - ✅ **Puzzle completion detection** (terminated=True when solved)
168
+ - ✅ Truncation on max_steps
169
+ - ✅ Response structure validation
170
+
171
+ ## Debugging Test Failures
172
+
173
+ ### Server Won't Start
174
+
175
+ ```bash
176
+ # Check if port is already in use
177
+ lsof -i :<port>
178
+
179
+ # Start the server manually in the foreground to inspect its logs
180
+ uv run -m synth_ai task-app serve <app_name> --port 8999
181
+
182
+ # Check environment variables
183
+ echo $GROQ_API_KEY
184
+ echo $OPENAI_API_KEY
185
+ ```
186
+
187
+ ### Tests Timeout
188
+
189
+ ```bash
190
+ # Run with more verbose output
191
+ pytest <test_file> -v -s
192
+
193
+ # Fail any test that runs longer than 60 seconds (requires the pytest-timeout plugin)
194
+ pytest <test_file> -v --timeout=60
195
+ ```
196
+
197
+ ### Import Errors
198
+
199
+ ```bash
200
+ # Ensure you're in the right directory
201
+ cd /path/to/synth-ai
202
+
203
+ # Reinstall dependencies
204
+ uv sync --dev
205
+ ```
206
+
207
+ ## CI/CD Integration
208
+
209
+ These tests can be run in CI with:
210
+
211
+ ```yaml
212
+ # .github/workflows/test-task-apps.yml
213
+ - name: Run task app tests
214
+ env:
215
+ GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
216
+ OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
217
+ run: |
218
+ # Unit tests (fast, always run)
219
+ pytest examples/task_apps/*/tests/unit/ -v
220
+
221
+ # Integration tests (slower, only on main)
222
+ if [ "$GITHUB_REF" = "refs/heads/main" ]; then
223
+ pytest examples/task_apps/*/tests/integration/ -v --timeout=300
224
+ fi
225
+ ```
226
+
227
+ ## Adding Tests for New Task Apps
228
+
229
+ When creating a new task app, follow this pattern:
230
+
231
+ 1. **Create test structure**:
232
+ ```bash
233
+ mkdir -p examples/task_apps/<new_app>/tests/{unit,integration}
234
+ touch examples/task_apps/<new_app>/tests/__init__.py
235
+ touch examples/task_apps/<new_app>/tests/unit/__init__.py
236
+ touch examples/task_apps/<new_app>/tests/integration/__init__.py
237
+ ```
238
+
239
+ 2. **Create unit tests** (`tests/unit/test_<app>_*.py`):
240
+ - Test environment initialization
241
+ - Test reward calculation
242
+ - Test tool implementations
243
+ - Test edge cases
244
+
245
+ 3. **Create integration tests** (`tests/integration/test_<app>_eval.py`):
246
+ - Copy from an existing integration test
247
+ - Update app name, port, config path
248
+ - Add app-specific endpoint tests
249
+
250
+ 4. **Add to CI**:
251
+ - Update CI config to include new tests
252
+ - Ensure required env vars are set
253
+
254
+ ## Test Coverage Goals
255
+
256
+ - Unit test coverage: >80%
257
+ - Integration test coverage: 100% of critical paths
258
+ - All public APIs have at least one integration test
259
+ - All reward components have unit tests
260
+
261
+ ## Common Issues
262
+
263
+ ### "Task app terminated immediately"
264
+ - Check that the app name is correct
265
+ - Verify the app is registered in `synth_ai/task/apps.py`
266
+ - Check recent changes to the app code
267
+
268
+ ### "GROQ_API_KEY must be set"
269
+ - Set the environment variable
270
+ - Or skip the test: `pytest -k "not groq"`
271
+
272
+ ### "Config file not found"
273
+ - Ensure eval config exists in task app directory
274
+ - Check the path in the test matches actual location
275
+
File without changes
File without changes
@@ -0,0 +1,2 @@
1
+ """Crafter task app implementation."""
2
+
@@ -68,7 +68,7 @@ def _resolve_repo_root() -> Path:
68
68
  def _resolve_task_app_root(repo_root: Path) -> Path:
69
69
  """Locate the task_app directory even when the module is copied to a temp mount."""
70
70
 
71
- preferred = (repo_root / "examples" / "warming_up_to_rl" / "task_app").resolve()
71
+ preferred = (repo_root / "examples" / "task_apps" / "crafter" / "task_app").resolve()
72
72
  if preferred.is_dir():
73
73
  return preferred
74
74
 
@@ -81,7 +81,7 @@ def _resolve_task_app_root(repo_root: Path) -> Path:
81
81
  if (candidate / "synth_envs_hosted").is_dir():
82
82
  return candidate
83
83
 
84
- fallback = Path("/opt/synth_ai_repo/examples/warming_up_to_rl/task_app")
84
+ fallback = Path("/opt/synth_ai_repo/examples/task_apps/crafter/task_app")
85
85
  if fallback.is_dir():
86
86
  return fallback.resolve()
87
87
 
@@ -306,13 +306,16 @@ def build_dataset() -> tuple[TaskDatasetRegistry, CrafterDataset]:
306
306
  def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
307
307
  return TaskInfo(
308
308
  task={"id": "crafter_classic", "name": "Crafter Classic", "version": "1.0.0"},
309
- environments=["crafter"],
309
+ environment="crafter",
310
310
  action_space={
311
311
  "type": "discrete",
312
+ "description": f"Discrete action space with {len(crafter_constants.actions)} actions including movement, crafting, and interaction",
312
313
  "size": len(crafter_constants.actions),
313
314
  "actions": list(crafter_constants.actions),
314
315
  },
315
316
  observation={
317
+ "type": "dict",
318
+ "description": "RGB frame (64x64x3) plus inventory counts, achievements, and semantic map patches",
316
319
  "summary": "RGB frame plus inventory, achievements, and semantic map patches.",
317
320
  "keys": ["image", "inventory", "achievements", "semantic_map_patch7"],
318
321
  "image_shape": [64, 64, 3],
@@ -336,11 +339,6 @@ def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
336
339
  },
337
340
  "tool": {"name": "interact", "parallel_tool_calls": False},
338
341
  },
339
- capabilities={
340
- "supports_rollout": True,
341
- "supports_env_lifecycle": True,
342
- "requires_api_key_header": True,
343
- },
344
342
  limits={"max_ops": 100000, "max_time_s": 3600},
345
343
  )
346
344
 
@@ -366,29 +364,36 @@ def provide_task_instances(
366
364
  dataset: CrafterDataset, base_info: TaskInfo, seeds: Sequence[int]
367
365
  ) -> Iterable[TaskInfo]:
368
366
  infos: list[TaskInfo] = []
367
+ base_observation = getattr(base_info, "observation", None)
368
+ if hasattr(base_observation, "model_dump"):
369
+ observation_template = base_observation.model_dump()
370
+ elif isinstance(base_observation, dict):
371
+ observation_template = dict(base_observation)
372
+ else:
373
+ observation_template = {}
374
+
369
375
  for seed_value in seeds:
370
376
  summary = dataset.describe_seed(seed_value)
371
377
  infos.append(
372
378
  TaskInfo(
373
379
  task=base_info.task,
374
- environments=base_info.environments,
380
+ environment=base_info.environment,
375
381
  action_space=base_info.action_space,
376
382
  observation={
377
- **base_info.observation,
383
+ **observation_template,
378
384
  "seed": seed_value,
379
385
  "traits": summary["traits"],
380
386
  "inventory": summary["inventory"],
381
387
  "player_position": summary["player_position"],
382
388
  },
383
389
  dataset={
384
- **base_info.dataset,
390
+ **base_info.dataset.model_dump(),
385
391
  "seed": seed_value,
386
392
  "difficulty": summary["difficulty"],
387
393
  "config": summary["config"],
388
394
  },
389
395
  rubric=base_info.rubric,
390
396
  inference=base_info.inference,
391
- capabilities=base_info.capabilities,
392
397
  limits=base_info.limits,
393
398
  )
394
399
  )
@@ -659,7 +664,7 @@ register_task_app(
659
664
  # Mount repo root so local modules resolve when deployed on Modal
660
665
  (str(REPO_ROOT), "/opt/synth_ai_repo"),
661
666
  (str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
662
- (str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/warming_up_to_rl/task_app"),
667
+ (str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/task_apps/crafter/task_app"),
663
668
  ),
664
669
  secret_names=("groq-api-key", "openai-api-key"),
665
670
  memory=16384,
@@ -1,7 +1,7 @@
1
1
  """Compatibility wrapper for the GRPO Crafter task app.
2
2
 
3
3
  This module now delegates to the TaskAppConfig defined in the colocated example at
4
- `examples/warming_up_to_rl/task_app/grpo_crafter.py`. It is kept for legacy usage
4
+ `examples/task_apps/crafter/task_app/grpo_crafter.py`. It is kept for legacy usage
5
5
  (running the file directly or targeting `fastapi_app` from external tooling). Prefer using
6
6
  `uvx synth-ai serve grpo-crafter` for local development and testing.
7
7
  """