synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,146 @@
1
+ """Compatibility wrapper for the GRPO Enron task app.
2
+
3
+ This mirrors the structure of the Crafter task app wrapper while delegating
4
+ all configuration to the colocated `grpo_enron.py` module. Normal usage should
5
+ prefer invoking `uvx synth-ai serve grpo-enron`, but this module remains for
6
+ direct execution or importing the FastAPI app object.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ from pathlib import Path
13
+
14
+ from fastapi.exceptions import RequestValidationError
15
+ from fastapi.responses import JSONResponse
16
+ from starlette.requests import Request
17
+ from synth_ai.task.apps import ModalDeploymentConfig, registry
18
+ from synth_ai.task.auth import is_api_key_header_authorized, normalize_environment_api_key
19
+ from synth_ai.task.server import TaskAppConfig, create_task_app, run_task_app
20
+
21
+ from .grpo_enron import build_config
22
+
23
# Identifier used to look this task app up in the synth_ai task-app registry.
APP_ID = "grpo-enron"
24
+
25
+
26
def _build_base_config() -> TaskAppConfig:
    """Build the base TaskAppConfig by delegating to ``build_config``.

    Construction is deferred to call time so importing this module stays
    cheap.
    """
    config = build_config()
    return config
29
+
30
+
31
# Modal deployment settings and .env files are taken from the registry entry
# for APP_ID when the lookup succeeds; otherwise the empty defaults stay.
MODAL_DEPLOYMENT: ModalDeploymentConfig | None = None
ENV_FILES: tuple[str, ...] = ()
try:
    _REGISTERED_ENTRY = registry.get(APP_ID)
except Exception:  # pragma: no cover - registry unavailable in some contexts
    pass
else:
    MODAL_DEPLOYMENT = _REGISTERED_ENTRY.modal
    ENV_FILES = tuple(_REGISTERED_ENTRY.env_files)
39
+
40
+
41
def build_task_app_config() -> TaskAppConfig:
    """Build the base config and hand back a clone of it."""
    return _build_base_config().clone()
45
+
46
+
47
def fastapi_app():
    """Return the FastAPI application for Modal or other ASGI hosts.

    The app produced by ``create_task_app`` is customised in two ways:

    * The default ``GET /health`` and ``GET /health/rollout`` routes are
      dropped and replaced with "soft auth" versions. They answer 503 when
      ENVIRONMENT_API_KEY is missing, and 200 otherwise, reporting whether
      the request was authorized instead of rejecting it.
    * ``RequestValidationError`` is logged (best effort) and returned as a
      compact 422 payload carrying at most the first five errors.
    """
    # Local import: fastapi is already a dependency of this module; this keeps
    # the file-level import block untouched.
    from fastapi.encoders import jsonable_encoder

    app = create_task_app(build_task_app_config())

    # Replace default health endpoints so we can permit soft auth failures and log 422s.
    replaced_paths = {"/health", "/health/rollout"}
    app.router.routes = [
        route
        for route in app.router.routes
        if not (
            getattr(route, "path", None) in replaced_paths
            and "GET" in (getattr(route, "methods", set()) or set())
        )
    ]

    def _log_env_key_prefix(source: str, env_key: str | None) -> str | None:
        """Print and return the first half of *env_key*; ``None`` if it is empty."""
        if not env_key:
            return None
        # NOTE(review): this writes half of the secret to stdout, and the
        # health handlers below echo it to callers that failed authorization.
        # Behaviour is kept for compatibility with existing diagnostics, but
        # consider shortening the prefix or dropping it from the response.
        prefix = env_key[: max(1, len(env_key) // 2)]
        print(f"[{source}] expected ENVIRONMENT_API_KEY prefix: {prefix}")
        return prefix

    def _soft_auth_health(source: str, request: Request, authorized_body: dict):
        """Shared body of both health endpoints.

        Returns a 503 response when no ENVIRONMENT_API_KEY is configured,
        *authorized_body* when the request carries an accepted key, and a 200
        response flagged ``authorized: False`` otherwise.
        """
        env_key = normalize_environment_api_key()
        if not env_key:
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "detail": "Missing ENVIRONMENT_API_KEY"},
            )
        if is_api_key_header_authorized(request):
            return authorized_body
        prefix = _log_env_key_prefix(source, env_key)
        content: dict[str, object] = {"status": "healthy", "authorized": False}
        if prefix:
            content["expected_api_key_prefix"] = prefix
        return JSONResponse(status_code=200, content=content)

    @app.get("/health")
    async def health(request: Request):
        return _soft_auth_health(
            "health", request, {"status": "healthy", "authorized": True}
        )

    @app.get("/health/rollout")
    async def health_rollout(request: Request):
        return _soft_auth_health(
            "health/rollout", request, {"ok": True, "authorized": True}
        )

    @app.exception_handler(RequestValidationError)
    async def _on_validation_error(request: Request, exc: RequestValidationError):
        # Logging is deliberately best effort: a failure while building the
        # snapshot must never mask the 422 returned to the client.
        try:
            hdr = request.headers
            snapshot = {
                "path": str(request.url.path),
                "have_x_api_key": bool(hdr.get("x-api-key")),
                "have_x_api_keys": bool(hdr.get("x-api-keys")),
                "have_authorization": bool(hdr.get("authorization")),
                "errors": exc.errors()[:5],
            }
            print("[422] validation", snapshot, flush=True)
        except Exception:
            pass
        # Validation errors may embed values that json.dumps cannot serialise;
        # jsonable_encoder converts them so the handler itself does not fail
        # with a 500.
        return JSONResponse(
            status_code=422,
            content={"status": "invalid", "detail": jsonable_encoder(exc.errors()[:5])},
        )

    return app
121
+
122
+
123
if __name__ == "__main__":
    # Script entry point: read CLI flags, gather .env files, start the app.
    cli = argparse.ArgumentParser(description="Run the Enron task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8102)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # The dev env file is looked up relative to this file's location and is
    # only used when it actually exists; --env-file entries follow it.
    dev_env_path = Path(__file__).resolve().parents[4] / "backend" / ".env.dev"
    dotenv_paths: list[str] = []
    if dev_env_path.exists():
        dotenv_paths.append(str(dev_env_path))
    dotenv_paths += options.env_file or []

    run_task_app(
        build_task_app_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=dotenv_paths,
    )
@@ -0,0 +1,4 @@
1
+ # Enron task app tests
2
+
3
+
4
+
@@ -0,0 +1,115 @@
1
+ """Shared fixtures for Enron tests."""
2
+ import os
3
+ import socket
4
+ import subprocess
5
+ from subprocess import TimeoutExpired
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Iterator
9
+
10
+ import pytest
11
+
12
+ requests = pytest.importorskip("requests")
13
+
14
+
15
+ def _which(executable: str) -> bool:
16
+ return any(
17
+ (Path(path) / executable).exists()
18
+ for path in os.getenv("PATH", "").split(os.pathsep)
19
+ )
20
+
21
+
22
+ def _find_free_port() -> int:
23
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
24
+ sock.bind(("127.0.0.1", 0))
25
+ return sock.getsockname()[1]
26
+
27
+
28
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Wait for the Enron server to become ready.

    Polls ``{base_url}/info`` every half second. The server counts as ready
    when it answers 200, or 400/401 (it is up but wants authentication).

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no acceptable response arrives before *timeout*.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Try /info first (no auth required if --insecure)
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except Exception:
            # Connection refused / timed out while the server is still booting.
            pass
        else:
            if resp.status_code in (200, 400, 401):
                return
        # Sleep after every attempt, not only after exceptions, so an
        # unexpected status (e.g. 404 or 503) cannot turn into a busy loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
43
+
44
+
45
@pytest.fixture(scope="module")
def enron_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Enron task app server for testing and yield its base URL.

    The test module is skipped when ``uv`` is not on PATH, when
    ``GROQ_API_KEY`` is unset, when the server exits right after launch, or
    when it never becomes ready. The subprocess is always stopped and reaped
    on teardown.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed tests")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    tmp_path = tmp_path_factory.mktemp("enron")
    trace_dir = tmp_path / "traces"
    trace_dir.mkdir(parents=True, exist_ok=True)
    log_path = tmp_path / "server.log"

    def _log_tail(max_lines: int = 20) -> str:
        """Return the last *max_lines* lines the server wrote, or ""."""
        try:
            text = log_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return ""
        return "\n".join(text.strip().splitlines()[-max_lines:])

    env = os.environ.copy()
    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-enron",
        "--port",
        str(port),
        "--no-reload",
    ]
    # Server output goes to a file rather than a pipe: nobody reads a pipe
    # while the tests run, so a chatty server would eventually block on a
    # full pipe buffer. The child keeps its own handle after we close ours.
    with log_path.open("wb") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=env,
            stdin=subprocess.PIPE,
        )

    # Send "n" to decline tracing
    try:
        if proc.stdin:
            proc.stdin.write("n\n")
            proc.stdin.flush()
    except (OSError, ValueError):
        # The process may already have exited or closed stdin; the liveness
        # check below reports that case.
        pass

    try:
        time.sleep(2)
        if proc.poll() is not None:
            pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

        _wait_for_server(base_url)
        yield base_url
    except RuntimeError as e:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except TimeoutExpired:
            proc.kill()
            proc.wait()
        pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")
    finally:
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except TimeoutExpired:
                proc.kill()
                proc.wait()  # reap the killed child so no zombie is left
        if proc.stdin:
            try:
                proc.stdin.close()
            except OSError:
                pass
115
+
@@ -0,0 +1,4 @@
1
+ # Integration tests for Enron task app
2
+
3
+
4
+
@@ -0,0 +1,179 @@
1
+ """Integration tests for Enron task app with Groq evaluation."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import socket
6
+ import subprocess
7
+ from subprocess import TimeoutExpired
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Iterator
11
+
12
+ import pytest
13
+
14
+ requests = pytest.importorskip("requests")
15
+
16
+
17
# Directory containing this test module.
HERE = Path(__file__).resolve().parent
# Two levels above this directory; the eval config is looked up there.
TASK_APP_ROOT = HERE.parents[1]
CONFIG_PATH = TASK_APP_ROOT.joinpath("eval_groq_qwen32.toml")
20
+
21
+
22
+ def _which(executable: str) -> bool:
23
+ return any(
24
+ (Path(path) / executable).exists()
25
+ for path in os.getenv("PATH", "").split(os.pathsep)
26
+ )
27
+
28
+
29
+ def _find_free_port() -> int:
30
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
31
+ sock.bind(("127.0.0.1", 0))
32
+ return sock.getsockname()[1]
33
+
34
+
35
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Poll ``{base_url}/info`` until it answers 200 or *timeout* elapses.

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no 200 response was received before the deadline.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            pass  # server not accepting connections yet; retry below
        else:
            if resp.status_code == 200:
                return
        # Back off after every unsuccessful attempt, including non-200
        # replies, so a server that is up but not ready is not hammered.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
46
+
47
+
48
@pytest.fixture
def enron_server(tmp_path: Path) -> Iterator[str]:
    """Start the Enron task app server for one test and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve grpo-enron``
    on a free loopback port. Its combined stdout/stderr goes to a log file
    under *tmp_path* instead of an unread pipe, so the server cannot stall on
    a full pipe buffer while the test is running.

    The fixture skips (rather than fails) when ``uv`` is not on PATH, when
    ``GROQ_API_KEY`` is unset, when the process exits right after launch, or
    when the server does not become ready before ``_wait_for_server`` gives up.

    Yields:
        The base URL (``http://127.0.0.1:<port>``) of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed evals")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    # NOTE(review): this directory is created but never handed to the server.
    trace_dir = tmp_path / "traces"
    trace_dir.mkdir(parents=True, exist_ok=True)
    log_path = tmp_path / "task_app.log"

    def _log_tail(limit: int = 20) -> str:
        """Return the last *limit* lines the server has written so far."""
        try:
            text = log_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return ""
        return "\n".join(text.strip().splitlines()[-limit:])

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-enron",
        "--port",
        str(port),
        "--no-reload",
    ]
    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                # The child already went away; the poll() check below reports it.
                pass

            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except TimeoutExpired:
                    proc.kill()
                    proc.wait()
                pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")

            yield base_url
        finally:
            if proc.poll() is None:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except TimeoutExpired:
                    proc.kill()
                    proc.wait()  # reap the killed child so no zombie is left
            if proc.stdin:
                try:
                    proc.stdin.close()
                except OSError:
                    pass  # pipe already broken because the child is gone
117
+
118
+
119
@pytest.mark.slow
def test_enron_server_health(enron_server: str) -> None:
    """Check that ``/health`` on the Enron server answers with 200 or 400."""
    health_url = f"{enron_server}/health"
    resp = requests.get(health_url, timeout=5.0)
    accepted = (200, 400)
    assert resp.status_code in accepted, f"Unexpected status: {resp.status_code}"
124
+
125
+
126
@pytest.mark.slow
def test_enron_task_info(enron_server: str) -> None:
    """Check that ``/task_info`` returns the Enron task descriptor."""
    response = requests.get(f"{enron_server}/task_info", timeout=5.0)
    assert response.status_code == 200

    payload = response.json()
    assert "task" in payload
    task = payload["task"]
    assert task["id"] == "enron_email_qa"
134
+
135
+
136
@pytest.mark.slow
def test_enron_eval_with_groq(enron_server: str) -> None:
    """Run ``synth_ai eval grpo-enron`` against the running server.

    Skipped when the eval config file is absent. The test fails if the eval
    command exits non-zero, and otherwise checks its output for completion,
    success and score markers.
    """
    if not CONFIG_PATH.exists():
        pytest.skip(f"Config file not found: {CONFIG_PATH}")

    eval_cmd = ["uv", "run", "-m", "synth_ai", "eval", "grpo-enron"]
    eval_cmd += ["--config", str(CONFIG_PATH)]
    eval_cmd += ["--url", enron_server]
    eval_cmd += ["--model", "qwen/qwen3-32b"]
    eval_cmd += ["--seeds", "0"]

    completed = subprocess.run(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # fold stderr into the captured output
        text=True,
        env=os.environ.copy(),
        check=False,
        timeout=300,
    )
    output = completed.stdout

    if completed.returncode != 0:
        pytest.fail(f"Eval failed with return code {completed.returncode}:\n{output}")

    # Success indicators printed by the eval command.
    assert "Eval complete" in output
    assert "1 ok, 0 failed" in output or "status=200" in output

    # Some score-related marker must be reported.
    lowered = output.lower()
    assert "official" in lowered or "mean_return" in lowered
177
+
178
+
179
+
@@ -0,0 +1,135 @@
1
+ """Integration test for Enron rollouts via /rollout endpoint."""
2
+ import os
3
+ import pytest
4
+
5
+ requests = pytest.importorskip("requests")
6
+
7
# Read the environment API key from the ENVIRONMENT_API_KEY variable instead of
# hard-coding a credential in a file that ships with the package. When the
# variable is unset the bearer token is empty and authenticated requests are
# expected to be rejected by the server.
AUTH_HEADER = {"Authorization": f"Bearer {os.environ.get('ENVIRONMENT_API_KEY', '')}"}
9
+
10
+
11
@pytest.mark.slow
def test_enron_manual_rollout(enron_server: str) -> None:
    """Post a scripted search-then-answer rollout and check the response shape."""
    search_op = {
        "tool": "search_emails",
        "args": {
            "inbox": "test@enron.com",
            "keywords": ["test", "question"],
            "max_results": 5,
        },
    }
    answer_op = {
        "tool": "answer_question",
        "args": {"answer": "This is a test answer"},
    }
    rollout_payload = {
        "run_id": "test_manual_enron",
        "env": {"seed": 0},
        "ops": [search_op, answer_op],
        "policy": {
            "policy_name": "manual",
            "config": {"provider": "noop"},
        },
    }

    resp = requests.post(
        f"{enron_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=60.0,
    )
    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"
    body = resp.json()

    # Top-level structure of the rollout response.
    assert "trajectories" in body
    assert len(body["trajectories"]) > 0
    assert "metrics" in body
    assert "trace" in body

    # A trace with a session_trace entry must be attached.
    trace = body["trace"]
    assert trace is not None
    assert "session_trace" in trace

    first_trajectory = body["trajectories"][0]
    assert "steps" in first_trajectory

    # At least one step is expected in the trajectory.
    assert len(first_trajectory["steps"]) > 0
62
+
63
+
64
@pytest.mark.slow
def test_enron_policy_rollout(enron_server: str) -> None:
    """Run a policy-driven Enron rollout through Groq and check the response.

    Skipped when ``GROQ_API_KEY`` is not set.
    """
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY required for this test")

    rollout_payload = {
        "run_id": "test_policy_enron",
        "env": {"seed": 0},
        "ops": [],  # Empty ops means use policy
        "policy": {
            "policy_name": "qwen-groq",
            "config": {
                "provider": "groq",
                "model": "qwen/qwen3-32b",
                "temperature": 0.2,
                "max_tokens": 1024,
            },
        },
    }

    resp = requests.post(
        f"{enron_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=180.0,  # Enron can be slow with multiple tool calls
    )

    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"
    data = resp.json()

    # Verify response structure
    assert "trajectories" in data
    # Guard the index below so an empty list fails as an assertion rather
    # than an IndexError (mirrors the manual rollout test).
    assert len(data["trajectories"]) > 0
    assert "metrics" in data
    assert "trace" in data

    trajectory = data["trajectories"][0]
    assert "steps" in trajectory

    # Check that steps were taken
    assert len(trajectory["steps"]) > 0

    # Verify metrics
    metrics = data["metrics"]
    assert "episode_returns" in metrics or "mean_return" in metrics

    # Check that we got some reward (could be negative for search penalty)
    if "episode_returns" in metrics and len(metrics["episode_returns"]) > 0:
        # Just verify it's a number
        assert isinstance(metrics["episode_returns"][0], (int, float))
114
+
115
+
116
@pytest.mark.fast
def test_enron_rollout_with_auth(enron_server: str) -> None:
    """Check that ``/rollout`` rejects a request sent without credentials."""
    unauthenticated_payload = {
        "run_id": "test_auth",
        "env": {"seed": 0},
        "ops": [],
        "policy": {"config": {"provider": "noop"}},
    }

    # Deliberately omit the Authorization header.
    rollout_url = f"{enron_server}/rollout"
    resp = requests.post(rollout_url, json=unauthenticated_payload, timeout=10.0)

    # Any of 400, 401 or 403 counts as an auth rejection.
    rejected_codes = (400, 401, 403)
    assert resp.status_code in rejected_codes, f"Expected auth error, got {resp.status_code}"
135
+
@@ -0,0 +1,4 @@
1
+ # Unit tests for Enron task app
2
+
3
+
4
+