synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (291) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,145 @@
1
+ """Compatibility wrapper for the GRPO Verilog task app.
2
+
3
+ This mirrors the Crafter task app wrapper while delegating configuration to
4
+ `grpo_verilog.py`. Normal usage should prefer `uvx synth-ai serve grpo-verilog`,
5
+ but the module remains for direct execution or importing the FastAPI app.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ from pathlib import Path
12
+
13
+ from fastapi.exceptions import RequestValidationError
14
+ from fastapi.responses import JSONResponse
15
+ from starlette.requests import Request
16
+ from synth_ai.task.apps import ModalDeploymentConfig, registry
17
+ from synth_ai.task.auth import is_api_key_header_authorized, normalize_environment_api_key
18
+ from synth_ai.task.server import TaskAppConfig, create_task_app, run_task_app
19
+
20
+ from .grpo_verilog import build_config
21
+
22
+ APP_ID = "grpo-verilog"
23
+
24
+
25
def _build_base_config() -> TaskAppConfig:
    """Build the base ``TaskAppConfig`` by delegating to ``build_config``.

    Wrapped in a function so the config is only constructed when a caller
    asks for it, keeping heavy work out of module import.
    """
    base_config = build_config()
    return base_config
28
+
29
+
30
# Start from "nothing registered" defaults, then override them with the
# registry entry for APP_ID when the lookup succeeds.
MODAL_DEPLOYMENT: ModalDeploymentConfig | None = None
ENV_FILES: tuple[str, ...] = ()
try:
    _REGISTERED_ENTRY = registry.get(APP_ID)
except Exception:  # pragma: no cover - registry unavailable in some contexts
    # Keep the defaults assigned above.
    pass
else:
    MODAL_DEPLOYMENT = _REGISTERED_ENTRY.modal
    ENV_FILES = tuple(_REGISTERED_ENTRY.env_files)
38
+
39
+
40
def build_task_app_config() -> TaskAppConfig:
    """Return a fresh TaskAppConfig for this wrapper.

    The base config is rebuilt on every call and its ``clone()`` is returned.
    """
    return _build_base_config().clone()
44
+
45
+
46
def fastapi_app():
    """Return the FastAPI application for Modal or other ASGI hosts.

    The app is created from ``build_task_app_config()`` and then customised:

    * the stock ``GET /health`` and ``GET /health/rollout`` routes are removed
      and replaced with versions that answer 200 even when the caller's API
      key header is not authorized ("soft" auth failure), and 503 when no
      ``ENVIRONMENT_API_KEY`` is configured;
    * request validation errors are logged and returned as a compact 422 body
      containing at most the first five errors.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import section. fastapi is already a dependency of this module.
    from fastapi.encoders import jsonable_encoder

    app = create_task_app(build_task_app_config())

    # Replace default health endpoints so we can permit soft auth failures and log 422s.
    overridden_paths = {"/health", "/health/rollout"}
    app.router.routes = [
        route
        for route in app.router.routes
        if not (
            getattr(route, "path", None) in overridden_paths
            and "GET" in (getattr(route, "methods", set()) or set())
        )
    ]

    def _log_env_key_prefix(source: str, env_key: str | None) -> str | None:
        """Print and return the first half (at least one char) of ``env_key``."""
        if not env_key:
            return None
        prefix = env_key[: max(1, len(env_key) // 2)]
        print(f"[{source}] expected ENVIRONMENT_API_KEY prefix: {prefix}")
        return prefix

    def _health_response(source: str, request: Request, authorized_body: dict):
        """Shared implementation of both health endpoints.

        Returns 503 when no environment key is configured, ``authorized_body``
        when the request's API key header is authorized, and otherwise a 200
        "healthy but unauthorized" payload.
        """
        env_key = normalize_environment_api_key()
        if not env_key:
            return JSONResponse(
                status_code=503,
                content={"status": "unhealthy", "detail": "Missing ENVIRONMENT_API_KEY"},
            )
        if is_api_key_header_authorized(request):
            return authorized_body
        # NOTE(review): this discloses half of the expected key to an
        # unauthorized caller (and to stdout). Preserved because it is part of
        # the existing response contract - confirm this is acceptable outside
        # local debugging.
        prefix = _log_env_key_prefix(source, env_key)
        content: dict = {"status": "healthy", "authorized": False}
        if prefix:
            content["expected_api_key_prefix"] = prefix
        return JSONResponse(status_code=200, content=content)

    @app.get("/health")
    async def health(request: Request):
        return _health_response("health", request, {"status": "healthy", "authorized": True})

    @app.get("/health/rollout")
    async def health_rollout(request: Request):
        return _health_response("health/rollout", request, {"ok": True, "authorized": True})

    @app.exception_handler(RequestValidationError)
    async def _on_validation_error(request: Request, exc: RequestValidationError):
        try:
            hdr = request.headers
            snapshot = {
                "path": str(request.url.path),
                "have_x_api_key": bool(hdr.get("x-api-key")),
                "have_x_api_keys": bool(hdr.get("x-api-keys")),
                "have_authorization": bool(hdr.get("authorization")),
                "errors": exc.errors()[:5],
            }
            print("[422] validation", snapshot, flush=True)
        except Exception:
            # Logging is best-effort; it must never replace the 422 response.
            pass
        # Validation error dicts may carry objects that json.dumps cannot
        # handle (for example an exception instance under "ctx"); encode them
        # first so the handler itself cannot fail with a 500.
        detail = jsonable_encoder(exc.errors()[:5])
        return JSONResponse(
            status_code=422,
            content={"status": "invalid", "detail": detail},
        )

    return app
120
+
121
+
122
if __name__ == "__main__":
    # Command-line entry point: parse options, collect .env files, start the app.
    cli = argparse.ArgumentParser(description="Run the Verilog task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8103)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # backend/.env.dev under the ancestor at parents[4] of this file is loaded
    # first, but only when it actually exists; --env-file entries follow it.
    dev_env_path = Path(__file__).resolve().parents[4] / "backend" / ".env.dev"
    selected_env_files = []
    if dev_env_path.exists():
        selected_env_files.append(str(dev_env_path))
    selected_env_files += options.env_file or []

    run_task_app(
        build_task_app_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=selected_env_files,
    )
@@ -0,0 +1,4 @@
1
+ # Verilog task app tests
2
+
3
+
4
+
@@ -0,0 +1,115 @@
1
+ """Shared fixtures for Verilog tests."""
2
+ import os
3
+ import socket
4
+ import subprocess
5
+ from subprocess import TimeoutExpired
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Iterator
9
+
10
+ import pytest
11
+
12
+ requests = pytest.importorskip("requests")
13
+
14
+
15
def _which(executable: str) -> bool:
    """Return True when ``executable`` can be found as a runnable program on PATH.

    Uses ``shutil.which`` rather than a bare existence check, so directories
    and files without execute permission that merely share the name are not
    reported as available.
    """
    # Local import keeps this helper self-contained.
    import shutil

    return shutil.which(executable) is not None
20
+
21
+
22
def _find_free_port() -> int:
    """Ask the OS for a currently unused TCP port on 127.0.0.1 and return it.

    The probing socket is closed before returning, so the port is free (but
    not reserved) when the caller receives it.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # Port 0 lets the OS choose any available port.
        probe.bind(("127.0.0.1", 0))
        _host, chosen_port = probe.getsockname()
        return chosen_port
    finally:
        probe.close()
26
+
27
+
28
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Wait for the Verilog server to become ready.

    Polls ``GET {base_url}/info`` until it answers 200, or 400/401 (the server
    is up but wants auth, which is good enough for readiness).

    Args:
        base_url: Root URL of the task app, without a trailing slash.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If the server did not become ready within ``timeout``.
    """
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # Try /info first (no auth required if --insecure)
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            # Connection refused / timed out: the server is not listening yet.
            pass
        else:
            if resp.status_code in (200, 400, 401):
                return
        # Back off after every unsuccessful attempt. Previously only failed
        # connections slept, so any other status spun in a tight request loop.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
43
+
44
+
45
@pytest.fixture(scope="module")
def verilog_server(tmp_path_factory: pytest.TempPathFactory) -> Iterator[str]:
    """Start the Verilog task app server for testing.

    Launches ``uv run -m synth_ai task-app serve grpo-verilog`` on a free
    local port, waits until it answers, and yields its base URL. The test
    module is skipped when ``uv`` or ``GROQ_API_KEY`` is missing, or when the
    server exits early or never becomes ready.

    Server output goes to a log file in the temporary directory rather than a
    pipe: nothing reads a pipe while the tests run, so a talkative server
    could fill it and block.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed tests")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    tmp_path = tmp_path_factory.mktemp("verilog")
    # Created as before; this fixture does not pass the directory to the server.
    trace_dir = tmp_path / "traces"
    trace_dir.mkdir(parents=True, exist_ok=True)
    log_path = tmp_path / "server.log"

    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-verilog",
        "--port",
        str(port),
        "--no-reload",
    ]

    def _log_tail(max_lines: int = 20) -> str:
        """Return the last ``max_lines`` lines of the server log, or ''."""
        try:
            text = log_path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            return ""
        return "\n".join(text.strip().splitlines()[-max_lines:])

    def _stop(proc: subprocess.Popen) -> None:
        """Terminate the server, escalate to kill, and always reap it."""
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except TimeoutExpired:
                proc.kill()
                proc.wait()
        if proc.stdin:
            try:
                proc.stdin.close()
            except OSError:
                pass

    with log_path.open("w", encoding="utf-8") as log_file:
        proc = subprocess.Popen(
            cmd,
            stdout=log_file,
            stderr=subprocess.STDOUT,
            text=True,
            env=os.environ.copy(),
            stdin=subprocess.PIPE,
        )
        try:
            # Send "n" to decline tracing
            try:
                if proc.stdin:
                    proc.stdin.write("n\n")
                    proc.stdin.flush()
            except (OSError, ValueError):
                # The process may already have exited; handled just below.
                pass

            # Give the process a moment so an immediate crash is reported
            # with its output instead of a generic readiness timeout.
            time.sleep(2)
            if proc.poll() is not None:
                pytest.skip(f"Task app terminated immediately:\n{_log_tail()}")

            try:
                _wait_for_server(base_url)
            except RuntimeError as e:
                _stop(proc)
                pytest.skip(f"Task app failed to start: {e}\n{_log_tail()}")

            yield base_url
        finally:
            _stop(proc)
115
+
@@ -0,0 +1,4 @@
1
+ # Integration tests for Verilog task app
2
+
3
+
4
+
@@ -0,0 +1,181 @@
1
+ """Integration tests for Verilog task app with Groq evaluation."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ import socket
6
+ import subprocess
7
+ from subprocess import TimeoutExpired
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Iterator
11
+
12
+ import pytest
13
+
14
+ requests = pytest.importorskip("requests")
15
+
16
+
17
+ HERE = Path(__file__).resolve().parent
18
+ TASK_APP_ROOT = HERE.parents[1]
19
+ CONFIG_PATH = TASK_APP_ROOT / "eval_groq_qwen32b.toml"
20
+
21
+
22
+ def _which(executable: str) -> bool:
23
+ return any(
24
+ (Path(path) / executable).exists()
25
+ for path in os.getenv("PATH", "").split(os.pathsep)
26
+ )
27
+
28
+
29
+ def _find_free_port() -> int:
30
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
31
+ sock.bind(("127.0.0.1", 0))
32
+ return sock.getsockname()[1]
33
+
34
+
35
def _wait_for_server(base_url: str, timeout: float = 60.0) -> None:
    """Wait for the Verilog server to become ready.

    Polls ``GET {base_url}/info`` until it answers with HTTP 200.

    Args:
        base_url: Root URL of the task app, without a trailing path.
        timeout: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If no 200 response is seen before the timeout elapses.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            resp = requests.get(f"{base_url}/info", timeout=2.0)
        except requests.RequestException:
            pass  # server not accepting connections yet
        else:
            if resp.status_code == 200:
                return
        # Pause after every unsuccessful attempt, including non-200 responses,
        # so the loop never spins against a server that is still starting up.
        time.sleep(0.5)
    raise RuntimeError(f"Task app at {base_url} did not become ready")
46
+
47
+
48
@pytest.fixture
def verilog_server(tmp_path: Path) -> Iterator[str]:
    """Start the Verilog task app server for testing and yield its base URL.

    The server is launched with ``uv run -m synth_ai task-app serve grpo-verilog``
    on a free loopback port. The fixture skips (rather than fails) when ``uv`` is
    not on PATH, when ``GROQ_API_KEY`` is unset, when the process exits within the
    first two seconds, or when the readiness probe gives up.

    Yields:
        The ``http://127.0.0.1:<port>`` base URL of the running server.
    """
    if not _which("uv"):
        pytest.skip("uv executable not found on PATH")
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY must be set for Groq-backed evals")

    port = _find_free_port()
    base_url = f"http://127.0.0.1:{port}"
    trace_dir = tmp_path / "traces"
    trace_dir.mkdir(parents=True, exist_ok=True)

    env = os.environ.copy()
    cmd = [
        "uv",
        "run",
        "-m",
        "synth_ai",
        "task-app",
        "serve",
        "grpo-verilog",
        "--port",
        str(port),
        "--no-reload",
    ]
    # NOTE(review): stdout is a pipe that is only read on the failure paths; a very
    # chatty server could fill the pipe buffer during a long test run.
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        env=env,
        stdin=subprocess.PIPE,  # Auto-answer tracing prompt
    )

    # Send "n" to decline tracing
    try:
        if proc.stdin:
            proc.stdin.write("n\n")
            proc.stdin.flush()
    except (OSError, ValueError):
        # The process already went away or closed its stdin; the liveness check
        # below reports that case with the captured output.
        pass

    try:
        # Check if process died immediately
        time.sleep(2)
        if proc.poll() is not None:
            stdout_capture, _ = proc.communicate(timeout=2)
            tail = "\n".join(stdout_capture.strip().splitlines()[-20:]) if stdout_capture else ""
            pytest.skip(f"Task app terminated immediately:\n{tail}")

        # Only the readiness probe is guarded by ``except RuntimeError``. Keeping the
        # ``yield`` outside of it means a RuntimeError raised by a test that uses
        # this fixture is no longer rewritten into a "failed to start" skip.
        try:
            _wait_for_server(base_url)
        except RuntimeError as e:
            proc.terminate()
            try:
                stdout_capture, _ = proc.communicate(timeout=10)
            except TimeoutExpired:
                proc.kill()
                stdout_capture, _ = proc.communicate()
            tail = "\n".join((stdout_capture or "").strip().splitlines()[-20:])
            pytest.skip(f"Task app failed to start: {e}\n{tail}")

        yield base_url
    finally:
        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=5)
            except TimeoutExpired:
                proc.kill()
                proc.wait()  # reap the killed child so it does not linger
        # Release the pipe file descriptors held by the Popen object.
        for stream in (proc.stdin, proc.stdout):
            if stream is not None and not stream.closed:
                try:
                    stream.close()
                except OSError:
                    pass
118
+
119
+
120
@pytest.mark.slow
def test_verilog_server_health(verilog_server: str) -> None:
    """Check that GET /health on the running server answers with 200 or 400."""
    # The health endpoint requires auth; this request sends none, so a 400
    # (auth failed) is accepted alongside 200.
    health_url = f"{verilog_server}/health"
    response = requests.get(health_url, timeout=5.0)
    status = response.status_code
    assert status in (200, 400), f"Unexpected status: {status}"
126
+
127
+
128
@pytest.mark.slow
def test_verilog_task_info(verilog_server: str) -> None:
    """Check that GET /task_info returns a payload describing the verilog task."""
    info_url = f"{verilog_server}/task_info"
    response = requests.get(info_url, timeout=5.0)
    assert response.status_code == 200

    payload = response.json()
    assert "task" in payload
    task_section = payload["task"]
    assert task_section["id"] == "verilog"
136
+
137
+
138
@pytest.mark.slow
def test_verilog_eval_with_groq(verilog_server: str) -> None:
    """Run the ``synth_ai eval`` CLI for a single seed against the fixture server.

    Skips when the eval config file is missing; fails when the CLI exits
    non-zero or its combined output lacks the expected success markers.
    """
    if not CONFIG_PATH.exists():
        pytest.skip(f"Config file not found: {CONFIG_PATH}")

    eval_cmd = ["uv", "run", "-m", "synth_ai", "eval", "grpo-verilog"]
    eval_cmd += ["--config", str(CONFIG_PATH)]
    eval_cmd += ["--url", verilog_server]
    eval_cmd += ["--model", "qwen/qwen3-32b"]
    eval_cmd += ["--seeds", "0"]  # Just test one seed

    completed = subprocess.run(
        eval_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # stderr is folded into stdout
        text=True,
        env=os.environ.copy(),
        check=False,
        timeout=300,  # 5 minutes max
    )
    output = completed.stdout

    if completed.returncode != 0:
        pytest.fail(f"Eval failed with return code {completed.returncode}:\n{output}")

    # Check for success indicators
    assert "Eval complete" in output
    assert "1 ok, 0 failed" in output or "status=200" in output

    # Check that we got a meaningful outcome score
    lowered = output.lower()
    assert "outcome" in lowered or "mean_return" in lowered
179
+
180
+
181
+
@@ -0,0 +1,55 @@
1
+ """Integration test for Verilog rollouts via /rollout endpoint."""
2
+ import os
3
+ import pytest
4
+
5
+ requests = pytest.importorskip("requests")
6
+
7
# Take the environment API key from the ENVIRONMENT_API_KEY variable so that no
# credential is committed to the repository. When it is unset the header carries
# an empty token and the test below skips.
AUTH_HEADER = {"Authorization": f"Bearer {os.environ.get('ENVIRONMENT_API_KEY', '')}"}


@pytest.mark.slow
def test_verilog_policy_rollout(verilog_server: str) -> None:
    """Test a Verilog rollout using Groq policy.

    Posts a rollout request with no explicit ops to ``/rollout`` and checks the
    shape of the response. Skips when ``GROQ_API_KEY`` or
    ``ENVIRONMENT_API_KEY`` is not available in the environment.
    """
    if "GROQ_API_KEY" not in os.environ:
        pytest.skip("GROQ_API_KEY required for this test")
    if not os.environ.get("ENVIRONMENT_API_KEY"):
        pytest.skip("ENVIRONMENT_API_KEY required for this test")

    rollout_payload = {
        "run_id": "test_policy_verilog",
        "env": {"seed": 0},
        "ops": [],  # Empty ops means use policy for all steps
        "policy": {
            "policy_name": "qwen-groq",
            "config": {
                "provider": "groq",
                "model": "qwen/qwen3-32b",
                "max_steps": 5,  # Limit steps for test
            },
        },
    }

    resp = requests.post(
        f"{verilog_server}/rollout",
        json=rollout_payload,
        headers=AUTH_HEADER,
        timeout=120.0,
    )

    assert resp.status_code == 200, f"Rollout failed: {resp.status_code} {resp.text}"
    data = resp.json()

    # Verify response structure
    assert "trajectories" in data
    assert "metrics" in data
    assert "trace" in data

    # Fail with a readable message instead of an IndexError on an empty list.
    assert data["trajectories"], "Rollout returned no trajectories"
    trajectory = data["trajectories"][0]
    assert "steps" in trajectory

    # Check that at least one step was taken
    assert len(trajectory["steps"]) > 0

    # Verify metrics
    metrics = data["metrics"]
    assert "episode_returns" in metrics or "mean_return" in metrics
55
+
@@ -0,0 +1,4 @@
1
+ # Unit tests for Verilog task app
2
+
3
+
4
+
@@ -0,0 +1,118 @@
1
+ """Unit tests for Verilog scoring and rewards."""
2
+ import pytest
3
+
4
+ from synth_ai.environments.examples.verilog.engine import (
5
+ VerilogCompileSuccessComponent,
6
+ VerilogSimulationPassComponent,
7
+ VerilogSubmitSuccessComponent,
8
+ VerilogPublicState,
9
+ )
10
+
11
+
12
@pytest.mark.asyncio
async def test_compile_success_reward():
    """Check the compile component: 0.1 for a clean compile, 0.0 otherwise."""
    scorer = VerilogCompileSuccessComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # Successful compile (returncode 0)
    ok_reward = await scorer.score(public_state, {"type": "compile", "returncode": 0})
    assert ok_reward == 0.1

    # Failed compile (returncode != 0)
    failed_reward = await scorer.score(public_state, {"type": "compile", "returncode": 1})
    assert failed_reward == 0.0

    # Non-compile action
    unrelated_reward = await scorer.score(public_state, {"type": "write_file"})
    assert unrelated_reward == 0.0
32
+
33
+
34
@pytest.mark.asyncio
async def test_simulation_pass_reward():
    """Check the simulation component: 1.0 for a passing simulate, 0.0 otherwise."""
    scorer = VerilogSimulationPassComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # Passing simulation
    passed_reward = await scorer.score(public_state, {"type": "simulate", "passed": True})
    assert passed_reward == 1.0

    # Failing simulation
    failed_reward = await scorer.score(public_state, {"type": "simulate", "passed": False})
    assert failed_reward == 0.0

    # Non-simulate action
    unrelated_reward = await scorer.score(public_state, {"type": "compile"})
    assert unrelated_reward == 0.0
54
+
55
+
56
@pytest.mark.asyncio
async def test_submit_success_reward():
    """Check the submit component: 10.0 for a passing submit, 0.0 otherwise."""
    scorer = VerilogSubmitSuccessComponent()
    public_state = VerilogPublicState(files={}, build_dir="/tmp", task_completed=False)

    # Successful submission (tests passed)
    passed_reward = await scorer.score(public_state, {"type": "submit", "passed": True})
    assert passed_reward == 10.0

    # Failed submission (tests didn't pass)
    failed_reward = await scorer.score(public_state, {"type": "submit", "passed": False})
    assert failed_reward == 0.0

    # Non-submit action
    unrelated_reward = await scorer.score(public_state, {"type": "compile"})
    assert unrelated_reward == 0.0
76
+
77
+
78
@pytest.mark.asyncio
async def test_submit_checks_simulation_output():
    """Check that ``submit()`` reports pass/fail from the last simulation output."""
    from synth_ai.environments.examples.verilog.engine import VerilogEngine
    from synth_ai.environments.tasks.core import TaskInstance, Impetus, Intent

    # Build the smallest task instance the engine constructor is given here.
    impetus = Impetus(instructions="Test")
    intent = Intent(
        rubric={"goal": "test"},
        gold_trajectories=None,
        gold_state_diff={},
        deterministic_eval_functions=[],
    )
    task = TaskInstance(
        id="test",
        impetus=impetus,
        intent=intent,
        metadata=None,
        is_reproducible=False,
        initial_engine_snapshot=None,
    )
    task.snapshot_dir = None  # Will be set by engine

    engine = VerilogEngine(task)

    # Case 1: nothing has been simulated yet
    first = await engine.submit()
    assert first["passed"] is False
    assert "No simulation run yet" in first["detail"]

    # Case 2: the recorded simulation output reports success
    engine._last_simulate_output = "Mismatches: 0 in 100 samples\nALL_TESTS_PASSED"
    second = await engine.submit()
    assert second["passed"] is True
    assert "All tests passed" in second["detail"]

    # Case 3: the recorded simulation output reports mismatches
    engine._last_simulate_output = "Mismatches: 5 in 100 samples\nErrors detected"
    third = await engine.submit()
    assert third["passed"] is False
    assert "Tests failed" in third["detail"]
118
+
@@ -28,10 +28,10 @@ from pathlib import Path
28
28
  from typing import Any
29
29
  from uuid import uuid4
30
30
 
31
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.environment import (
32
- CrafterEnvironmentWrapper,
31
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.environment import (
32
+ CrafterEnvironment,
33
33
  )
34
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
34
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
35
35
  from openai import OpenAI
36
36
  from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
37
37
  from synth_ai.environments.examples.crafter_classic.taskset import (
@@ -140,7 +140,7 @@ async def _run_episode(
140
140
  ) -> EpisodeResult:
141
141
  task_instance = _build_task_instance(seed)
142
142
  env = CrafterClassicEnvironment(task_instance)
143
- wrapper = CrafterEnvironmentWrapper(env, seed=seed)
143
+ wrapper = CrafterEnvironment(env, seed=seed)
144
144
  policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
145
145
  await policy.initialize({"use_tools": True, "model": model})
146
146
 
@@ -24,10 +24,10 @@ from pathlib import Path
24
24
  from typing import Any
25
25
  from uuid import uuid4
26
26
 
27
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.environment import (
28
- CrafterEnvironmentWrapper,
27
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.environment import (
28
+ CrafterEnvironment,
29
29
  )
30
- from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
30
+ from examples.task_apps.crafter.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
31
31
  from openai import AsyncOpenAI
32
32
  from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
33
33
  from synth_ai.environments.examples.crafter_classic.taskset import (
@@ -142,7 +142,7 @@ async def _run_episode(
142
142
  async with semaphore:
143
143
  task_instance = _build_task_instance(seed)
144
144
  env = CrafterClassicEnvironment(task_instance)
145
- wrapper = CrafterEnvironmentWrapper(env, seed=seed)
145
+ wrapper = CrafterEnvironment(env, seed=seed)
146
146
 
147
147
  policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
148
148
  await policy.initialize({"use_tools": True, "model": model})
@@ -47,8 +47,10 @@ async def run(args: argparse.Namespace) -> None:
47
47
 
48
48
  inference_url = args.inference_url or f"{args.base_url.rstrip('/')}/proxy/groq"
49
49
 
50
+ from synth_ai.task.contracts import RolloutMode
50
51
  request = RolloutRequest(
51
52
  run_id=args.run_id,
53
+ mode=RolloutMode.EVAL,
52
54
  env=RolloutEnvSpec(env_name="crafter", seed=args.seed, config={"seed": args.seed}),
53
55
  policy=RolloutPolicySpec(
54
56
  policy_name="groq-smoke",