synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291):
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,4 @@
1
+ # Sokoban task app tests
2
+
3
+
4
+
@@ -0,0 +1,113 @@
1
+ """Shared fixtures for Sokoban tests."""
2
+ import os
3
+ import socket
4
+ import subprocess
5
+ from subprocess import TimeoutExpired
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Iterator
9
+
10
+ import pytest
11
+
12
+ requests = pytest.importorskip("requests")
13
+
14
+
15
def _which(executable: str) -> bool:
    """Return True when *executable* can be resolved as a runnable program on PATH.

    Delegates to ``shutil.which`` so that only executable files count as a
    match (a hand-rolled "does the path exist" scan would also accept
    directories and non-executable files).
    """
    import shutil  # local import: keeps this helper self-contained

    return shutil.which(executable) is not None
20
+
21
+
22
def _find_free_port() -> int:
    """Ask the OS for an unused TCP port on the loopback interface.

    Binding to port 0 lets the kernel choose the port; the socket is closed
    again before the chosen port number is returned.
    """
    listener = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with listener:
        listener.bind(("127.0.0.1", 0))
        _host, port = listener.getsockname()
        return port
26
+
27
+
28
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Poll ``GET {base_url}/info`` until the Sokoban server answers.

    A 200 response means the server is ready. A 400 or 401 is also accepted:
    the server is up and merely wants authentication. Any other status, or a
    failed request, is retried every half second.

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If the server does not answer acceptably within *timeout*.
    """
    ready_statuses = (200, 400, 401)
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            # Server not accepting connections yet; retry after the pause.
            pass
        else:
            if resp.status_code in ready_statuses:
                return
        # Pause on every unsuccessful attempt, including unexpected status
        # codes, so a half-started server is not hammered in a tight loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
43
+
44
+
45
@pytest.fixture(scope="module")
def sokoban_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Sokoban task app in a subprocess and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve sokoban``
    on a free loopback port. The requesting tests are skipped when ``uv`` is
    not on PATH, when the process exits right after launch, or when the
    server never becomes ready.

    Server output is written to a log file in a pytest temp directory instead
    of a pipe: nothing reads the output while the tests run, so a pipe could
    fill up and block the server.

    Yields:
        The ``http://127.0.0.1:<port>`` base URL of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    log_path = tmp_path_factory.mktemp("sokoban") / "server.log"

    env = os.environ.copy()
    # Test API key handed to the spawned server; the Sokoban integration tests
    # send the same value as their bearer token, so keep the two in sync.
    # NOTE(review): this literal ships inside the package - confirm it is a
    # throwaway credential and not a real secret.
    env["ENVIRONMENT_API_KEY"] = "sk_env_30c78a787bac223c716918181209f263"
    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "sokoban",
        "--port",
        str(port),
        "--no-reload",
    ]

    def _log_tail(limit: int = 20) -> str:
        # Last *limit* lines of whatever the server has written so far.
        captured = log_path.read_text(encoding="utf-8", errors="replace")
        return "\n".join(captured.strip().splitlines()[-limit:])

    def _stop(process: subprocess.Popen, grace: float) -> None:
        # Terminate politely, escalate to kill, and always reap the child.
        if process.poll() is not None:
            return
        process.terminate()
        try:
            process.wait(timeout=grace)
        except TimeoutExpired:
            process.kill()
            process.wait()

    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing. Best effort: the process may
            # already have exited or closed its stdin.
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                pass

            # Give the process a moment so an immediate crash is detected
            # before polling starts.
            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                # Stop the server first so the log is complete when read.
                _stop(proc, grace=10)
                pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")

            yield base_url
        finally:
            _stop(proc, grace=5)
            if proc.stdin:
                try:
                    proc.stdin.close()
                except OSError:
                    pass
113
+
@@ -0,0 +1,4 @@
1
+ # Integration tests for Sokoban task app
2
+
3
+
4
+
@@ -0,0 +1,57 @@
1
+ """Integration tests for Sokoban task app with evaluation."""
2
+ import pytest
3
+
4
+ requests = pytest.importorskip("requests")
5
+
6
+ # sokoban_server fixture is in conftest.py
7
+ # Use the actual ENVIRONMENT_API_KEY from .env
8
+ AUTH_HEADER = {"Authorization": "Bearer sk_env_30c78a787bac223c716918181209f263"}
9
+
10
+
11
@pytest.mark.slow
def test_sokoban_server_health(sokoban_server: str) -> None:
    """Probe ``/health`` and accept either a 200 or a 400 response."""
    health_url = f"{sokoban_server}/health"
    status = requests.get(health_url, timeout=5.0).status_code
    # The request carries no auth header; a 400 is tolerated alongside 200.
    assert status in (200, 400), f"Unexpected status: {status}"
16
+
17
+
18
def test_sokoban_task_info(sokoban_server: str) -> None:
    """Fetch ``/task_info`` and check that it identifies the Sokoban task."""
    info_url = f"{sokoban_server}/task_info"
    response = requests.get(info_url, timeout=5.0)
    assert response.status_code == 200

    payload = response.json()
    assert "task" in payload
    task_id = payload["task"]["id"]
    assert task_id == "sokoban"
25
+
26
+
27
@pytest.mark.fast
def test_sokoban_manual_rollout(sokoban_server: str) -> None:
    """Post a scripted (no-LLM) rollout and check the shape of the response."""
    # Action encoding: 0=left, 1=up, 2=right, 3=down.
    scripted_actions = [0, 2, 2, 3]  # left, right, right, down
    env_spec = {"seed": 0, "config": {"difficulty": "easy", "max_steps": 50}}
    policy_spec = {
        "policy_name": "manual",
        "config": {"provider": "noop", "actions": scripted_actions},
    }
    request_body = {
        "run_id": "test_manual",
        "env": env_spec,
        "ops": [],
        "policy": policy_spec,
    }

    response = requests.post(
        f"{sokoban_server}/rollout",
        json=request_body,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert response.status_code == 200

    result = response.json()
    assert "trajectories" in result
    assert len(result["trajectories"]) > 0
    assert "metrics" in result
56
+ assert "metrics" in data
57
+
@@ -0,0 +1,198 @@
1
+ """Integration test for Sokoban rollouts via /rollout endpoint."""
2
+ import os
3
+ import pytest
4
+
5
+ requests = pytest.importorskip("requests")
6
+
7
+ # Use the actual ENVIRONMENT_API_KEY from .env
8
+ AUTH_HEADER = {"Authorization": "Bearer sk_env_30c78a787bac223c716918181209f263"}
9
+
10
+
11
@pytest.mark.slow
def test_sokoban_manual_rollout(sokoban_server: str) -> None:
    """Post a scripted rollout and verify the trajectory and per-step structure."""
    # Action encoding: 0=left, 1=up, 2=right, 3=down.
    actions = [0, 2, 2, 3, 3, 0]
    rollout_payload = {
        "run_id": "test_manual_sokoban",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 20}},
        "ops": [],  # Not used for manual actions in Sokoban
        "policy": {
            "policy_name": "manual",
            "config": {
                "provider": "noop",
                "actions": actions,  # Pass actions via policy.config
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=30.0,
    )

    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"
    data = resp.json()

    # Verify response structure
    assert "trajectories" in data
    assert len(data["trajectories"]) > 0
    assert "metrics" in data

    trajectory = data["trajectories"][0]
    assert "steps" in trajectory
    steps = trajectory["steps"]

    # Expect at least one recorded step per scripted action. The threshold is
    # derived from the action list so the two cannot drift apart; a trajectory
    # that also carries an extra initial-observation step still passes.
    assert len(steps) >= len(actions)

    # Every step must expose an observation and a reward, either on the step
    # itself or as ``reward_last`` inside the observation.
    for step in steps:
        assert "obs" in step
        assert "reward" in step or "reward_last" in step.get("obs", {})
53
+
54
+
55
@pytest.mark.slow
def test_sokoban_policy_rollout_with_openai(sokoban_server: str) -> None:
    """Run a two-call policy rollout backed by OpenAI ``gpt-5-mini``.

    Skipped unless a non-empty ``OPENAI_API_KEY`` is present in the environment.
    """
    # An empty value is as unusable as a missing one, so skip in both cases.
    if not os.environ.get("OPENAI_API_KEY"):
        pytest.skip("OPENAI_API_KEY required for this test")

    rollout_payload = {
        "run_id": "test_policy_sokoban",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 10}},
        "ops": ["policy", "policy"],  # 2 policy calls
        "policy": {
            "policy_name": "gpt-5-mini",
            "config": {
                "provider": "openai",
                "model": "gpt-5-mini",
                "max_tokens": 512,
            },
        },
    }

    resp = requests.post(
        f"{sokoban_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=180.0,  # GPT-5-mini can be slow
    )

    # The model may or may not cope with Sokoban, so a 500 from the rollout is
    # tolerated; any other non-200 status is a failure.
    assert resp.status_code in (200, 500), f"Unexpected status: {resp.status_code}"

    if resp.status_code == 200:
        data = resp.json()
        assert "trajectories" in data
        assert "metrics" in data
89
+
90
+
91
@pytest.mark.fast
def test_sokoban_difficulty_levels(sokoban_server: str) -> None:
    """Issue a short scripted rollout at each difficulty and expect trajectories back."""
    rollout_url = f"{sokoban_server}/rollout"
    for difficulty in ("easy", "medium", "hard"):
        env_spec = {"seed": 0, "config": {"difficulty": difficulty, "max_steps": 10}}
        policy_spec = {
            "config": {
                "provider": "noop",
                "actions": [2, 3, 0],  # right, down, left
            },
        }
        request_body = {
            "run_id": f"test_difficulty_{difficulty}",
            "env": env_spec,
            "ops": [],
            "policy": policy_spec,
        }

        response = requests.post(
            rollout_url,
            json=request_body,
            headers=AUTH_HEADER,
            timeout=30.0,
        )
        assert response.status_code == 200, f"Rollout failed for {difficulty}: {response.text}"

        # Only the basic response shape is checked here.
        result = response.json()
        assert "trajectories" in result
        assert len(result["trajectories"]) > 0
120
+
121
+
122
@pytest.mark.fast
def test_sokoban_max_steps_limit(sokoban_server: str) -> None:
    """Request more actions than ``max_steps`` allows and check the cap is honoured."""
    max_steps = 5
    oversized_script = [0] * 20  # 20 requested actions against a 5-step budget
    request_body = {
        "run_id": "test_max_steps",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": max_steps}},
        "ops": [],
        "policy": {"config": {"provider": "noop", "actions": oversized_script}},
    }

    response = requests.post(
        f"{sokoban_server}/rollout",
        json=request_body,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert response.status_code == 200

    steps = response.json()["trajectories"][0]["steps"]

    # One extra entry is allowed on top of max_steps for the initial observation.
    step_cap = max_steps + 1
    assert len(steps) <= step_cap, f"Expected <= {step_cap} steps, got {len(steps)}"

    # When the trajectory exceeds max_steps, the last observation must be
    # flagged as truncated.
    last_obs = steps[-1].get("obs", {})
    if len(steps) > max_steps:
        assert last_obs.get("truncated") is True
158
+
159
+
160
@pytest.mark.fast
def test_sokoban_completion_detection(sokoban_server: str) -> None:
    """Check that the final step reports termination and box-progress fields.

    The scripted moves are not expected to solve the puzzle; the test checks
    the reported structure and, if the puzzle does end up solved, that it is
    marked as finished.
    """
    request_body = {
        "run_id": "test_completion",
        "env": {"seed": 0, "config": {"difficulty": "easy", "max_steps": 50}},
        "ops": [],
        "policy": {
            "config": {
                "provider": "noop",
                "actions": [2, 3, 0, 1, 2],  # arbitrary moves
            },
        },
    }

    response = requests.post(
        f"{sokoban_server}/rollout",
        json=request_body,
        headers=AUTH_HEADER,
        timeout=30.0,
    )
    assert response.status_code == 200

    last_step = response.json()["trajectories"][0]["steps"][-1]
    last_obs = last_step.get("obs", {})

    # Termination and progress fields must be present.
    assert "terminated" in last_obs or "done" in last_step
    assert "boxes_on_target" in last_obs
    assert "num_boxes" in last_obs

    # With every box on a target, the episode must be flagged as finished.
    solved = last_obs.get("boxes_on_target") == last_obs.get("num_boxes")
    if solved:
        assert last_obs.get("terminated") is True or last_step.get("done") is True
198
+
@@ -0,0 +1,4 @@
1
+ # Unit tests for Sokoban task app
2
+
3
+
4
+
@@ -0,0 +1,114 @@
1
+ """Unit tests for Sokoban environment and rewards."""
2
+ import pytest
3
+
4
+
5
@pytest.mark.fast
def test_sokoban_module_imports():
    """Verify the Sokoban modules import and expose their main classes."""
    from synth_ai.environments.examples.sokoban import environment, engine

    # (module, attribute) pairs that must be present after import.
    expected = (
        (environment, "SokobanEnvironment"),
        (engine, "SokobanEngine"),
    )
    for module, class_name in expected:
        assert hasattr(module, class_name)
12
+
13
+
14
@pytest.mark.asyncio
async def test_sokoban_reward_components():
    """Exercise the Sokoban reward components on hand-built public states.

    Builds an engine from a minimal task instance, checks that it carries a
    reward stack, then scores the goal and step-penalty components directly
    against an unsolved and a solved state.
    """
    from synth_ai.environments.examples.sokoban.engine import (
        SokobanEngine,
        SokobanGoalAchievedComponent,
        SokobanStepPenaltyComponent,
        SokobanPublicState,
    )
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    # Smallest task instance that is enough to construct an engine.
    impetus = Impetus(instructions="Test")
    intent = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task = TaskInstance(
        id="test",
        impetus=impetus,
        intent=intent,
        metadata={"difficulty": "easy", "max_steps": 50, "seed": 0},
        is_reproducible=False,
        initial_engine_snapshot=None,
    )

    sokoban_engine = SokobanEngine(task)

    # The engine must expose a non-empty reward stack attribute.
    assert hasattr(sokoban_engine, "reward_stack")
    assert sokoban_engine.reward_stack is not None

    # Components are also scored on their own, outside the engine.
    goal_reward = SokobanGoalAchievedComponent()
    penalty = SokobanStepPenaltyComponent()

    import numpy as np

    def make_state(*, boxes_on_target, num_steps, last_action_name):
        """Build a one-box mock public state; only the progress fields vary."""
        return SokobanPublicState(
            dim_room=(3, 3),
            room_fixed=np.array([[0]]),
            room_state=np.array([[0]]),
            player_position=(0, 0),
            boxes_on_target=boxes_on_target,
            num_steps=num_steps,
            max_steps=50,
            last_action_name=last_action_name,
            num_boxes=1,
            error_info=None,
        )

    # Unsolved puzzle: the goal component should award nothing.
    state = make_state(boxes_on_target=0, num_steps=0, last_action_name="NONE")
    reward1 = await goal_reward.score(state, {"action": 0})
    assert reward1 == 0.0

    # Solved puzzle (the single box is on its target): reward must be positive.
    state_complete = make_state(
        boxes_on_target=1, num_steps=10, last_action_name="RIGHT"
    )
    reward_complete = await goal_reward.score(state_complete, {"action": 0})
    assert reward_complete > 0

    # Step penalty should be negative but small, strictly between -1 and 0.
    penalty_reward = await penalty.score(state, {"action": 0})
    assert penalty_reward < 0
    assert penalty_reward > -1
90
+
91
+
92
def test_sokoban_difficulty_settings():
    """Check that an engine can be built for each difficulty in task metadata."""
    from synth_ai.environments.examples.sokoban.engine import SokobanEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    def build_task(level):
        """Return a minimal task instance whose metadata carries `level`."""
        impetus = Impetus(instructions="Test")
        intent = Intent(
            rubric={"goal": "test"},
            gold_trajectories=None,
            gold_state_diff={},
            deterministic_eval_functions=[],
        )
        return TaskInstance(
            id="test",
            impetus=impetus,
            intent=intent,
            metadata={"difficulty": level, "max_steps": 50, "seed": 0},
            is_reproducible=False,
            initial_engine_snapshot=None,
        )

    # Construction alone is the check; no rollout is performed.
    for level in ("easy", "medium", "hard"):
        engine = SokobanEngine(build_task(level))
        assert engine is not None
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,24 @@
1
+ # Verilog Eval Config for Groq Qwen3-32B
2
+ # Quick eval to test Verilog task app before RL training
3
+
4
+ [task_app]
5
+ # Update this with your Modal URL after deployment
6
+ url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
7
+
8
+ [eval]
9
+ num_episodes = 3 # Quick test with 3 seeds
10
+ seeds = [0, 1, 2]
11
+ max_steps = 15 # More steps for Verilog compilation chains
12
+
13
+ [policy]
14
+ provider = "groq"
15
+ model = "qwen/qwen3-32b"
16
+ temperature = 0.2
17
+ max_tokens = 768
18
+ inference_url = "https://api.groq.com/openai/v1/chat/completions"
19
+
20
+ [env]
21
+ difficulty = "medium" # Can be "easy", "medium", or "hard"
22
+
23
+
24
+
@@ -0,0 +1,5 @@
1
+ [filter]
2
+ db = "traces/v3/synth_ai.db"
3
+ output = "ft_data/verilog_sft.jsonl"
4
+ min_official_score = 0.01
5
+
@@ -0,0 +1,12 @@
1
+ # GRPO Verilog Task App
2
+
3
+ This example mirrors the Crafter task app layout while targeting the Verilog
4
+ hardware synthesis environment under `synth_ai.environments.examples.verilog`.
5
+ The `grpo_verilog.py` module builds a lightweight dataset from the VerilogEval
6
+ spec-to-RTL benchmark and wires up a minimal task-app configuration. The
7
+ companion `grpo_verilog_task_app.py` acts as a compatibility wrapper for direct
8
+ FastAPI execution or Modal deployment.
9
+
10
+ The rollout bridge currently surfaces the initial observation for the selected
11
+ task instance, providing a scaffold for future extensions that integrate the
12
+ full hosted environment workflow and policy orchestration similar to Crafter.