synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (291)
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +56 -26
  19. examples/swe/task_app/hosted/rollout.py +42 -0
  20. examples/swe/task_app/hosted/test_service.py +5 -6
  21. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  22. examples/task_apps/TESTING.md +275 -0
  23. examples/task_apps/__init__.py +0 -0
  24. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  25. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  26. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  27. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  28. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  29. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  30. examples/task_apps/crafter/__init__.py +0 -0
  31. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  32. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  33. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  34. examples/task_apps/crafter/task_app/__init__.py +5 -0
  35. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
  36. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  37. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  38. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
  39. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  40. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  41. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
  42. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
  43. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  44. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
  45. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  78. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  79. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  80. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  81. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  82. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  83. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  84. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  85. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  86. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  87. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  88. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  89. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  90. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  91. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  92. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  93. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  94. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  95. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  96. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  97. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  98. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  99. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  100. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  101. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  102. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  103. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  104. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  105. examples/task_apps/enron/__init__.py +1 -0
  106. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  107. examples/task_apps/enron/filter_sft.toml +5 -0
  108. examples/task_apps/enron/task_app/README.md +14 -0
  109. examples/task_apps/enron/task_app/__init__.py +1 -0
  110. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  111. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  112. examples/task_apps/enron/tests/__init__.py +4 -0
  113. examples/task_apps/enron/tests/conftest.py +115 -0
  114. examples/task_apps/enron/tests/integration/__init__.py +4 -0
  115. examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
  116. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  117. examples/task_apps/enron/tests/unit/__init__.py +4 -0
  118. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  119. examples/task_apps/math/__init__.py +0 -0
  120. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  121. examples/task_apps/pokemon_battle/__init__.py +2 -0
  122. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  123. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  124. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  125. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  126. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  127. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  128. examples/task_apps/pokemon_red/README.md +357 -0
  129. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  130. examples/task_apps/pokemon_red/__init__.py +3 -0
  131. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  132. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  133. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
  134. examples/task_apps/pokemon_red/task_app.py +799 -0
  135. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
  136. examples/task_apps/sokoban/README.md +307 -0
  137. examples/task_apps/sokoban/__init__.py +3 -0
  138. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  139. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  140. examples/task_apps/sokoban/filter_sft.toml +5 -0
  141. examples/task_apps/sokoban/task_app.py +1058 -0
  142. examples/task_apps/sokoban/tests/__init__.py +4 -0
  143. examples/task_apps/sokoban/tests/conftest.py +113 -0
  144. examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
  145. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  146. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  147. examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
  148. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  149. examples/task_apps/verilog/__init__.py +1 -0
  150. examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
  151. examples/task_apps/verilog/filter_sft.toml +5 -0
  152. examples/task_apps/verilog/task_app/README.md +12 -0
  153. examples/task_apps/verilog/task_app/__init__.py +1 -0
  154. examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
  155. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  156. examples/task_apps/verilog/tests/__init__.py +4 -0
  157. examples/task_apps/verilog/tests/conftest.py +115 -0
  158. examples/task_apps/verilog/tests/integration/__init__.py +4 -0
  159. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
  160. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  161. examples/task_apps/verilog/tests/unit/__init__.py +4 -0
  162. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  163. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  164. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  165. examples/warming_up_to_rl/groq_test.py +2 -0
  166. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  167. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  168. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  169. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  170. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  171. examples/workflows/__init__.py +0 -0
  172. examples/workflows/math_rl/__init__.py +0 -0
  173. examples/workflows/math_rl/download_dataset.py +80 -0
  174. synth_ai/__init__.py +2 -2
  175. synth_ai/api/models/supported.py +1 -0
  176. synth_ai/api/train/builders.py +25 -11
  177. synth_ai/api/train/cli.py +12 -6
  178. synth_ai/api/train/configs/__init__.py +10 -10
  179. synth_ai/api/train/configs/rl.py +5 -4
  180. synth_ai/api/train/configs/sft.py +4 -3
  181. synth_ai/api/train/env_resolver.py +5 -2
  182. synth_ai/api/train/supported_algos.py +10 -5
  183. synth_ai/api/train/utils.py +7 -4
  184. synth_ai/cli/__init__.py +48 -59
  185. synth_ai/cli/_modal_wrapper.py +3 -2
  186. synth_ai/cli/_storage.py +4 -3
  187. synth_ai/cli/_validate_task_app.py +11 -0
  188. synth_ai/cli/balance.py +4 -3
  189. synth_ai/cli/calc.py +2 -2
  190. synth_ai/cli/demo.py +14 -7
  191. synth_ai/cli/legacy_root_backup.py +1 -1
  192. synth_ai/cli/recent.py +1 -1
  193. synth_ai/cli/rl_demo.py +8 -7
  194. synth_ai/cli/root.py +0 -97
  195. synth_ai/cli/status.py +1 -1
  196. synth_ai/cli/task_apps.py +1922 -190
  197. synth_ai/cli/traces.py +1 -1
  198. synth_ai/cli/tui.py +57 -0
  199. synth_ai/cli/turso.py +1 -1
  200. synth_ai/cli/watch.py +1 -1
  201. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
  202. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  203. synth_ai/environments/examples/enron/engine.py +7 -2
  204. synth_ai/environments/examples/enron/environment.py +68 -0
  205. synth_ai/environments/examples/red/engine.py +27 -0
  206. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  207. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  208. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  209. synth_ai/environments/examples/red/environment.py +60 -0
  210. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  211. synth_ai/environments/examples/verilog/engine.py +104 -12
  212. synth_ai/evals/client.py +58 -61
  213. synth_ai/jobs/client.py +16 -4
  214. synth_ai/judge_schemas.py +9 -9
  215. synth_ai/py.typed +0 -0
  216. synth_ai/task/__init__.py +24 -5
  217. synth_ai/task/apps/__init__.py +1 -0
  218. synth_ai/task/config.py +257 -0
  219. synth_ai/task/contracts.py +138 -39
  220. synth_ai/task/proxy.py +48 -56
  221. synth_ai/task/rubrics/__init__.py +56 -0
  222. synth_ai/task/rubrics/loaders.py +152 -0
  223. synth_ai/task/rubrics/models.py +57 -0
  224. synth_ai/task/rubrics/scoring.py +116 -0
  225. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  226. synth_ai/task/server.py +8 -7
  227. synth_ai/task/trace_correlation_helpers.py +315 -0
  228. synth_ai/task/validators.py +413 -6
  229. synth_ai/tracing_v3/abstractions.py +3 -3
  230. synth_ai/tracing_v3/decorators.py +7 -3
  231. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  232. synth_ai/tracing_v3/replica_sync.py +4 -4
  233. synth_ai/tracing_v3/serialization.py +5 -5
  234. synth_ai/tracing_v3/session_tracer.py +16 -6
  235. synth_ai/tracing_v3/storage/base.py +29 -29
  236. synth_ai/tracing_v3/storage/config.py +3 -3
  237. synth_ai/tracing_v3/trace_utils.py +317 -0
  238. synth_ai/tracing_v3/turso/daemon.py +8 -7
  239. synth_ai/tracing_v3/turso/native_manager.py +66 -43
  240. synth_ai/tracing_v3/utils.py +3 -3
  241. synth_ai/tui/__init__.py +5 -0
  242. synth_ai/tui/__main__.py +13 -0
  243. synth_ai/tui/cli/__init__.py +1 -0
  244. synth_ai/tui/cli/query_experiments.py +164 -0
  245. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  246. synth_ai/tui/dashboard.py +906 -0
  247. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
  248. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
  249. examples/agora_ex/README_MoE.md +0 -224
  250. examples/agora_ex/__init__.py +0 -7
  251. examples/agora_ex/agora_ex.py +0 -65
  252. examples/agora_ex/agora_ex_task_app.py +0 -590
  253. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  254. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  255. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  256. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  257. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  258. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  259. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  260. synth_ai/rubrics/__init__.py +0 -22
  261. synth_ai/task/rubrics.py +0 -219
  262. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  263. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  264. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  265. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  266. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  267. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  268. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  269. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  270. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  271. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  272. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  273. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  274. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  275. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  276. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  277. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  278. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  279. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  280. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  281. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  282. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  283. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  284. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  285. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  286. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  287. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  288. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  289. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  290. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  291. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,77 @@
+ # Verilog RL with LoRA (Qwen3-0.6B)
+
+ ## Quick Start
+
+ 1. **Deploy Verilog Task App**:
+ ```bash
+ cd synth-ai
+ uvx synth-ai modal-serve grpo-verilog
+ ```
+ Note the Modal URL and update `task_url` in `verilog_rl_lora.toml`.
+
+ 2. **Run Training**:
+ ```bash
+ uvx synth-ai rl run --config examples/multi_step/configs/verilog_rl_lora.toml
+ ```
+
+ ## Configuration Overview
+
+ ### **Key Adaptations from Crafter**:
+
+ - **Model**: `Qwen/Qwen3-0.6B` (✅ proven in SFT configs)
+ - **Environment**: `verilog` instead of `crafter`
+ - **Steps**: 15 turns (vs Crafter's 10) for compilation workflows
+ - **Rewards**: Adjusted for sparser Verilog rewards (0.5 vs 1.0 indicator_lambda)
+ - **Rubrics**: Verilog-specific judging criteria
+
+ ### **Hardware Requirements** (Standard RL setup):
+ - ✅ **2x H100 GPUs** (vLLM inference + LoRA training split)
+ - ✅ **No tensor parallelism** needed for 0.6B model
+ - ✅ **4x faster inference** than 32B model
+ - ✅ **Same compute pattern** as Crafter (just smaller model)
+
+ ### **Expected Workflow**:
+ 1. Agent writes Verilog code (`write_file`)
+ 2. Compiles to check syntax (`compile`)
+ 3. Simulates to verify behavior (`simulate`)
+ 4. Submits if tests pass (`submit`)
+ 5. **Rewards**: +1.0 for compilation success, +10.0 for passing tests
+
+ ## Rubric Design
+
+ ### **Event Rewards** (per decision):
+ - **Compilation Success**: 70% weight (1.0 for success, 0.0 for errors)
+ - **Process Efficiency**: 30% weight (penalizes redundant operations)
+
+ ### **Outcome Rewards** (final score):
+ - **Tests Passed**: 80% weight (full credit when all tests pass)
+ - **Design Quality**: 20% weight (code clarity, documentation)
+
+ ## Troubleshooting
+
+ ### **If training fails**:
+ 1. Check Modal URL in `task_url` field
+ 2. Verify `GROQ_API_KEY` for inference
+ 3. Ensure `OPENAI_API_KEY` for judging
+
+ ### **Memory issues** (unlikely with 0.6B):
+ - Reduce `batch_size` to 2
+ - Set `gradient_accumulation_steps = 2`
+ - Verify 2x GPU split is working (vLLM on GPU 0, training on GPU 1)
+
+ ### **Slow training**:
+ - Increase `episodes_per_batch` to 6-8
+ - Check network latency to Modal task app
+
+ ## Expected Results
+
+ - **Convergence**: Should learn basic compilation workflow in 1-2 hours
+ - **Success Rate**: 20-40% initial test pass rate (improves with training)
+ - **Learning**: Agent learns to debug compilation errors and write correct Verilog
+
+ ## Next Steps
+
+ 1. **Monitor reward progression** in training logs
+ 2. **Adjust rubrics** if agent struggles with compilation errors
+ 3. **Scale to 8B model** once 0.6B baseline works
+ 4. **Add domain-specific fine-tuning** for Verilog syntax
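
The rubric weights in the file above (70/30 for event rewards, 80/20 for outcome rewards) reduce to simple weighted sums. A minimal sketch of that arithmetic, assuming a linear blend; the function names are illustrative, not the hosted judge's actual scoring code:

```python
# Illustrative only: combine the rubric weights from README_verilog_rl.md.
# The 0.7/0.3 and 0.8/0.2 splits come from the file; the linear form is an assumption.

def event_score(compile_success: float, process_efficiency: float) -> float:
    """Per-decision score: 70% compilation success, 30% process efficiency."""
    return 0.7 * compile_success + 0.3 * process_efficiency

def outcome_score(tests_passed: float, design_quality: float) -> float:
    """Final score: 80% tests passed, 20% design quality."""
    return 0.8 * tests_passed + 0.2 * design_quality

if __name__ == "__main__":
    # A decision that compiles cleanly but repeats work, then a run passing all tests.
    print(event_score(1.0, 0.5))    # 0.85
    print(outcome_score(1.0, 0.6))  # 0.92
```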
@@ -0,0 +1,90 @@
+ # Verilog Reward Structure (Normalized to 1.0)
+
+ ## Overview
+ All rewards in the Verilog task app are normalized so the maximum possible reward is **1.0**.
+
+ ## Reward Components
+
+ ### 1. Step Penalty: **-0.001** per step
+ - Applied to every action taken
+ - Encourages efficient solutions
+ - Normalized from `-0.01` (original)
+
+ ### 2. Compile Success: **+0.01**
+ - Awarded when `iverilog` compilation succeeds (returncode 0)
+ - Validates syntax correctness
+ - Normalized from `+0.1` (original)
+
+ ### 3. Simulation Pass: **+0.1**
+ - Awarded when `vvp` simulation passes all tests
+ - Validates behavioral correctness
+ - Normalized from `+1.0` (original)
+
+ ### 4. Submit Success: **+1.0** (maximum reward)
+ - Awarded when final submission passes all verification tests
+ - This is the goal state
+ - Normalized from `+10.0` (original)
+
+ ## Typical Reward Trajectories
+
+ ### ✅ Optimal Path (3 steps)
+ ```
+ Step 1: write_file → -0.001
+ Step 2: compile (success) → +0.01 - 0.001 = +0.009
+ Step 3: simulate (pass) → +0.1 - 0.001 = +0.099
+ Total: ~0.107
+ ```
+
+ ### ✅ Good Path (4 steps with submit)
+ ```
+ Step 1: write_file → -0.001
+ Step 2: compile (success) → +0.009
+ Step 3: simulate (pass) → +0.099
+ Step 4: submit (success) → +1.0 - 0.001 = +0.999
+ Total: ~1.106
+ ```
+ *Note: Can exceed 1.0 if intermediate rewards stack with final submit*
+
+ ### ❌ Failure Path (compilation errors)
+ ```
+ Step 1: write_file → -0.001
+ Step 2: compile (fail) → -0.001
+ Step 3: write_file (fix) → -0.001
+ Step 4: compile (success) → +0.009
+ Step 5: simulate (pass) → +0.099
+ Total: ~0.105
+ ```
+
+ ## Implementation Details
+
+ ### Location
+ - **Reward components**: `synth_ai/environments/examples/verilog/engine.py`
+ - `VerilogCompileSuccessComponent`: +0.01
+ - `VerilogSimulationPassComponent`: +0.1
+ - `VerilogSubmitSuccessComponent`: +1.0
+ - `VerilogStepPenaltyComponent`: -0.001
+
+ ### Normalization Ratio
+ All rewards were divided by **10.0** to normalize:
+ - Original max: ~10.0
+ - Normalized max: ~1.0
+ - Ratio: 10.0
+
+ ## Why Normalize?
+
+ 1. **Consistency**: Makes it easier to compare rewards across different task types
+ 2. **RL Training**: Standard reward scales improve learning stability
+ 3. **Interpretability**: Rewards as percentages (0.0 to 1.0) are intuitive
+ 4. **Judge Compatibility**: Rubric scores typically range 0-1, making blending easier
+
+ ## Testing
+ ```bash
+ # Run eval to verify normalized rewards
+ uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
+ ```
+
+ Expected output for successful rollout:
+ - `mean_return` ≈ 0.1 (if only compile+simulate)
+ - `mean_return` ≈ 1.0+ (if full submit success)
+
+
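
The trajectories in the file above reduce to a per-step sum of the step penalty plus any event reward. A minimal sketch of that arithmetic, assuming the component values from the table; it is illustrative, not the `VerilogEngine` reward stack itself:

```python
# Illustrative sketch of the normalized Verilog reward arithmetic.
# Values mirror VERILOG_REWARDS.md (step -0.001, compile +0.01, simulate +0.1, submit +1.0);
# the function is an assumption for clarity, not the shipped reward components.

STEP_PENALTY = -0.001
EVENT_REWARDS = {"compile_success": 0.01, "simulate_pass": 0.1, "submit_success": 1.0}

def episode_return(events: list[str]) -> float:
    """Sum the step penalty plus any event reward for each action taken."""
    return sum(STEP_PENALTY + EVENT_REWARDS.get(event, 0.0) for event in events)

# "Good path": write, compile, simulate, submit -> ~1.106
print(episode_return(["write_file", "compile_success", "simulate_pass", "submit_success"]))
# "Failure path": a failed compile forces an extra write/compile cycle -> ~0.105
print(episode_return(["write_file", "compile_fail", "write_file", "compile_success", "simulate_pass"]))
```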
@@ -0,0 +1,183 @@
+ # Verilog Task App - RL Training Readiness Checklist
+
+ ## ✅ Core Requirements
+
+ ### 1. Reward Normalization
+ - ✅ **Max reward = 1.0**: All rewards scaled to `[0, 1]` range
+ - ✅ **Step penalty**: `-0.001` (normalized from `-0.01`)
+ - ✅ **Compile success**: `+0.01` (normalized from `+0.1`)
+ - ✅ **Simulate pass**: `+0.1` (normalized from `+1.0`)
+ - ✅ **Submit success**: `+1.0` (normalized from `+10.0`)
+
+ ### 2. Inference URL Handling (Critical for Trace Correlation)
+ - ✅ **Extracts from policy config**: Uses `policy_config.get("inference_url")` as primary source
+ - ✅ **Includes in trajectory**: Sets `trajectory.inference_url` with `?cid=...` parameter
+ - ✅ **Includes in final.info**: Adds to `final["info"]["inference_url"]`
+ - ✅ **Includes in pipeline_metadata**: Top-level `inference_url` field for trainer extraction
+ - ✅ **Logs cid presence**: Logs `has_cid` flag for debugging
+ - ✅ **Fallback to agent.inference_url**: Uses agent's URL if policy config missing (eval mode)
+
+ **Location**: `grpo_verilog.py` lines 829-867, 887-908
+
+ ### 3. Pipeline Metadata
+ - ✅ **Required fields present**:
+ - `reward_score`: Final episode reward
+ - `policy_id`: Policy identifier
+ - `inference_url`: **CRITICAL** - Contains `?cid=trace_xxxxx` for correlation
+ - `env_name`: Environment identifier
+ - `task_id`: Problem identifier
+ - `task_split`: Dataset split (train/val/test)
+ - ✅ **Inference details**: Provider, model, URL in nested `inference` dict
+
+ **Location**: `grpo_verilog.py` lines 887-908
+
+ ### 4. Trace Correlation (Required for RL Training)
+ - ✅ **Trainer injects cid**: Trainer adds `?cid=trace_xxxxx` to `policy_config["inference_url"]`
+ - ✅ **Task app preserves cid**: Uses `policy_config["inference_url"]` directly
+ - ✅ **Trainer extracts cid**: Extracts from `trajectory.inference_url` using `inference_url_to_trace_correlation_id()`
+ - ✅ **Trace hydration**: Trainer queries trace store with extracted `trace_correlation_id`
+
+ **Flow**:
+ ```
+ Trainer → policy_config["inference_url"] = "http://...?cid=trace_xxxxx"
+
+ Task App → trajectory.inference_url = policy_config["inference_url"]
+
+ Trainer → extract_trace_correlation_id(trajectory.inference_url)
+
+ Trainer → trace_store.resolve_correlation(trace_correlation_id)
+
+ Trainer → Hydrate v3 trace with event_history
+
+ Judge → Score using full trace
+ ```
+
+ ### 5. Response Contract Compliance
+ - ✅ **RolloutResponse fields**:
+ - `run_id`: Unique identifier
+ - `trajectories`: List of trajectories (with `inference_url`)
+ - `metrics`: Episode metrics
+ - `pipeline_metadata`: **CRITICAL** - Contains `inference_url` and `reward_score`
+ - `trace_correlation_id`: Optional (trainer infers from `inference_url`)
+ - ✅ **Optional trace_correlation_id**: Made optional in `contracts.py` (trainer infers from URL)
+
+ **Location**: `synth_ai/task/contracts.py` line 156
+
+ ### 6. Environment Implementation
+ - ✅ **Stateful engine**: `VerilogEngine` extends `StatefulEngine`
+ - ✅ **Reward stack**: Properly configured with normalized components
+ - ✅ **State management**: `VerilogPublicState` and `VerilogPrivateState`
+ - ✅ **Tool implementation**: All 4 tools (write_file, compile, simulate, submit)
+
+ **Location**: `synth_ai/environments/examples/verilog/engine.py`
+
+ ### 7. LLM Agent Integration
+ - ✅ **Multi-turn support**: Agent maintains conversation history
+ - ✅ **Tool parsing**: Extracts tool calls from LLM responses
+ - ✅ **Guidance system**: Provides context-aware hints
+ - ✅ **Error handling**: Graceful fallback for malformed responses
+
+ **Location**: `grpo_verilog.py` lines 200-530
+
+ ## 🔍 Verification Tests
+
+ ### Test 1: Eval Mode (No Trace Correlation)
+ ```bash
+ uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
+ ```
+ **Expected**:
+ - ✅ `mean_return` ≈ 0.1 (normalized rewards)
+ - ✅ `inference_url` = Groq API URL (no `?cid=...`)
+ - ✅ `task_completed` = True for correct solutions
+
+ ### Test 2: RL Training Mode (With Trace Correlation)
+ ```bash
+ uvx synth-ai train \
+ --type rl \
+ --config examples/multi_step/configs/verilog_rl_lora.toml \
+ --task-url https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run \
+ --backend https://synth-backend-dev-docker.onrender.com/api \
+ --env-file /path/to/verilog/.env
+ ```
+ **Expected**:
+ - ✅ Trainer logs show `inference_url` with `?cid=trace_xxxxx`
+ - ✅ Task app logs show `has_cid=True`
+ - ✅ Trace hydration succeeds (no `404 Not Found` errors)
+ - ✅ Judge receives full `event_history`
+ - ✅ Training updates show non-zero rewards
+
+ ### Test 3: Trace Correlation ID Extraction
+ ```python
+ from synth_envs_hosted.utils import inference_url_to_trace_correlation_id
+
+ # Should extract trace_xxxxx from URL
+ url = "http://localhost:8000/v1/chat/completions?cid=trace_abc123"
+ cid = inference_url_to_trace_correlation_id(url)
+ assert cid == "trace_abc123"
+ ```
+
+ ### Test 4: Pipeline Metadata Structure
+ ```python
+ # Verify response has correct structure for RL
+ response = await task_app.rollout(request)
+ assert "pipeline_metadata" in response
+ assert "inference_url" in response.pipeline_metadata
+ assert "reward_score" in response.pipeline_metadata
+ assert len(response.trajectories) > 0
+ assert response.trajectories[0].inference_url is not None
+ ```
+
+ ## 📋 Deployment Checklist
+
+ ### Modal Deployment
+ 1. ✅ **Environment variables set**:
+ - `GROQ_API_KEY`
+ - `VERILOG_INFERENCE_URL` (optional, uses Groq default)
+ 2. ✅ **Secrets configured**: Groq API key in Modal secrets
+ 3. ✅ **Task app URL**: Update in `verilog_rl_lora.toml`
+
+ ### Training Configuration
+ 1. ✅ **2x GPUs minimum**: 1 for vLLM, 1 for training
+ 2. ✅ **Model size**: `Qwen/Qwen3-0.6B` for testing
+ 3. ✅ **Batch size**: 4 (matches Crafter)
+ 4. ✅ **Max turns**: 15 (enough for compile chains)
+ 5. ✅ **Rubric enabled**: `rubric.enabled = true`
+
+ ## 🚨 Common Issues & Fixes
+
+ ### Issue 1: `trace_correlation_id` Missing
+ **Symptom**: Trainer logs `FATAL: Rollout payload missing 'trace_correlation_id'`
+ **Fix**: Verify `trajectory.inference_url` contains `?cid=...` parameter
+
+ ### Issue 2: Trace Hydration Fails (404)
+ **Symptom**: `404 Not Found` when querying `/trace/by-correlation/...`
+ **Fix**:
+ - Check inference server is capturing traces
+ - Verify `cid` parameter is in inference URL
+ - Ensure `vllm_public_url` is set correctly
+
+ ### Issue 3: Rewards Not Normalized
+ **Symptom**: `mean_return` > 1.0 in eval
+ **Fix**: Verify all reward components in `engine.py` are scaled by 10x
+
+ ### Issue 4: Agent Gets Stuck
+ **Symptom**: Agent repeats same action (e.g., compile without fixing)
+ **Fix**: Check guidance system is providing proper hints
+
+ ## 🎯 Final Verification
+
+ Before starting RL training, verify:
+ - [ ] Eval runs successfully with normalized rewards (≈ 0.1)
+ - [ ] Modal deployment returns proper `inference_url` structure
+ - [ ] Trace correlation ID extraction works
+ - [ ] Pipeline metadata includes all required fields
+ - [ ] Response contract matches expected schema
+
+ **If all checks pass**: ✅ **Ready for RL training!**
+
+ ## 📚 Related Documentation
+ - [VERILOG_REWARDS.md](./VERILOG_REWARDS.md) - Reward structure details
+ - [verilog_rl_lora.md](../verilog_rl_lora.md) - RL/LoRA feasibility analysis
+ - [verilog_rl_lora.toml](./verilog_rl_lora.toml) - Training configuration
+
+
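
The correlation flow in the checklist hinges on pulling the `cid` query parameter back out of the inference URL. A hedged sketch of that extraction using only the standard library; the shipped helper (`inference_url_to_trace_correlation_id` in `synth_envs_hosted/utils.py`) may differ in detail:

```python
# Illustrative sketch of cid extraction from an inference URL.
# Not the shipped helper; it only demonstrates the ?cid=trace_xxxxx convention above.
from urllib.parse import urlparse, parse_qs

def extract_cid(inference_url: str) -> str | None:
    """Return the trace correlation id carried in the ?cid=... query parameter, if any."""
    query = parse_qs(urlparse(inference_url).query)
    values = query.get("cid", [])
    return values[0] if values else None

assert extract_cid("http://localhost:8000/v1/chat/completions?cid=trace_abc123") == "trace_abc123"
assert extract_cid("https://api.groq.com/openai/v1/chat/completions") is None  # eval mode, no cid
```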
@@ -0,0 +1,35 @@
+ # Crafter eval using Synth backend with Qwen3-4B
+
+ [eval]
+ app_id = "grpo-crafter-task-app"
+ task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
+ model = "Qwen/Qwen3-4B"
+ seeds = [0, 1, 2]
+ max_turns = 10
+ concurrency = 1
+ env_name = "crafter"
+ policy_name = "crafter-react"
+ trace_format = "full"
+ return_trace = true
+
+ [eval.env_config]
+ env_params = {max_steps_per_episode = 10}
+
+ [eval.policy_config]
+ provider = "openai"
+ model = "Qwen/Qwen3-4B"
+ inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+ temperature = 0.6
+ top_p = 0.95
+ max_tokens = 512
+ use_vision = false
+ image_only_mode = false
+ max_llm_calls = 10
+
+ [eval.judge]
+ path = "examples/multi_step/judges/crafter_backend_judge.py"
+ name = "Backend"
+ backend_url = "http://localhost:8000/api"
+ model = "openai/gpt-oss-120b"
+ timeout_s = 45
+
@@ -0,0 +1,36 @@
+ # Evaluation config for Crafter with text-only input
+ # This config uses Groq Qwen with only text observations (no images)
+
+ [eval]
+ app_id = "grpo-crafter-task-app"
+ task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
+ model = "qwen/qwen3-32b"
+ seeds = [0, 1, 2]
+ max_turns = 10
+ concurrency = 1
+ env_name = "crafter"
+ policy_name = "crafter-react"
+ trace_format = "full"
+ return_trace = true
+
+ [eval.env_config]
+ env_params = {max_steps_per_episode = 10}
+
+ [eval.policy_config]
+ provider = "groq"
+ model = "qwen/qwen3-32b"
+ inference_url = "https://api.groq.com/openai/v1/chat/completions"
+ temperature = 0.6
+ top_p = 0.95
+ max_tokens = 512
+ use_vision = false
+ image_only_mode = false
+ max_llm_calls = 10
+
+ [eval.judge]
+ path = "examples/multi_step/judges/crafter_backend_judge.py"
+ name = "Backend"
+ backend_url = "http://localhost:8000/api"
+ model = "openai/gpt-oss-120b"
+ timeout_s = 45
+
@@ -12,7 +12,7 @@ variety = "gspo"
  # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
  task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
  # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
- judge_url = "https://YOUR-BACKEND-ENDPOINT/api"
+ judge_url = "https://synth-backend-dev-docker.onrender.com/api"

  [compute]
  gpu_type = "H200"
@@ -46,7 +46,7 @@ target_modules = ["all-linear"]
  [rollout]
  env_name = "crafter"
  max_turns = 10
- episodes_per_batch = 4
+ episodes_per_batch = 2
  policy_name = "crafter-react"
  max_concurrent_rollouts = 8
  batches_per_step = 2
@@ -69,12 +69,12 @@ ops = ["agent", "env"]

  [evaluation]
  instances = 16
- every_n_iters = 8
+ every_n_iters = 10
  seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

  [training]
  num_epochs = 1
- iterations_per_epoch = 16
+ iterations_per_epoch = 20
  gradient_accumulation_steps = 1
  max_accumulated_minibatch = 1
  max_turns = 10
@@ -84,6 +84,7 @@ learning_rate = 5e-5
  log_interval = 1
  weight_sync_interval = 1
  event_rewards_kind = "unique"
+ async_semaphore_max = 40 # Max concurrent rollouts in streaming pipeline

  # Enable dense decision rewards in the trainer to mirror env_config step rewards.
  step_rewards_enabled = true
@@ -101,6 +102,9 @@ verify_every_k = 0

  [rubric]
  enabled = true
+ model = "openai/gpt-oss-120b"
+ api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
+ api_key_env = "OPENAI_API_KEY"
  # Blend the hosted judge scores with environment returns inside the trainer.
  [rubric.weights]
  env = 0.2
@@ -110,13 +114,21 @@ outcome = 0.4
  [rubric.event]
  # Hosted judge rubric for per-decision progress scoring.
  rubric_id = "crafter/event@v1"
+ criteria = [
+ { key = "progress.unique_achievements", weight = 0.9, description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0.", aggregation = "weighted_sum" },
+ { key = "process.intent_alignment", weight = 0.1, description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock.", aggregation = "weighted_sum" },
+ ]

  [rubric.outcome]
  # Hosted judge rubric for final trajectory scoring.
  rubric_id = "crafter/outcome@v1"
+ criteria = [
+ { key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
+ { key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
+ ]

  [judge]
- type = "gemini" # or "groq" when routing to Groq-hosted judges
+ type = "groq" # or "groq" when routing to Groq-hosted judges
  timeout_s = 45

  [judge.options]
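
The `[rubric.weights]` block in this diff blends the hosted judge's event and outcome scores with the environment return inside the trainer. A hedged sketch of that blend as a simple weighted sum; the diff shows `env = 0.2` and an `outcome = 0.4` context line, while the event weight and the exact blending formula used by the trainer are assumptions here:

```python
# Hypothetical blend of environment return and judge scores per [rubric.weights].
# env = 0.2 and outcome = 0.4 appear in the diff; the event weight (0.4) and the
# linear form are illustrative assumptions, not the trainer's implementation.

def blended_reward(env_return: float, event_score: float, outcome_score: float,
                   w_env: float = 0.2, w_event: float = 0.4, w_outcome: float = 0.4) -> float:
    return w_env * env_return + w_event * event_score + w_outcome * outcome_score

# Modest env return, strong per-decision judge score, middling outcome score:
print(blended_reward(0.3, 0.9, 0.5))  # 0.2*0.3 + 0.4*0.9 + 0.4*0.5 = 0.62
```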
@@ -0,0 +1,40 @@
+ # Crafter Eval Using Synth Backend with Qwen 4B
+
+ ## What Changed
+
+ Created `crafter_eval_synth_qwen4b.toml` to evaluate Crafter using Qwen3-4B via the Synth backend inference proxy.
+
+ ## Key Difference from Groq Config
+
+ **Before (Groq):**
+ ```toml
+ [eval.policy_config]
+ provider = "groq"
+ model = "qwen/qwen3-32b"
+ inference_url = "https://api.groq.com/openai/v1/chat/completions"
+ ```
+
+ **After (Synth Backend):**
+ ```toml
+ [eval.policy_config]
+ provider = "openai"
+ model = "Qwen/Qwen3-4B"
+ inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+ ```
+
+ ## Usage
+
+ ```bash
+ uvx synth-ai eval --config examples/multi_step/configs/crafter_eval_synth_qwen4b.toml
+ ```
+
+ ## Why This Works
+
+ The Synth backend's `/api/v1/chat/completions` endpoint:
+ 1. Accepts OpenAI-compatible requests
+ 2. Routes to Modal vLLM service
+ 3. Loads the base model (Qwen/Qwen3-4B from HuggingFace)
+ 4. Returns OpenAI-compatible responses
+
+ No code changes needed - the infrastructure already exists.
+
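
Because the proxy described above is OpenAI-compatible, the config can be exercised directly with any OpenAI-style request. A minimal sketch; the endpoint URL and model come from `crafter_eval_synth_qwen4b.toml`, while the Bearer-token header and the `SYNTH_API_KEY` environment variable are assumptions for illustration:

```python
# Minimal sketch: send an OpenAI-compatible chat request to the Synth backend proxy.
# URL and model mirror crafter_eval_synth_qwen4b.toml; the auth scheme and
# SYNTH_API_KEY variable are assumptions, not documented requirements.
import os
import requests

url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
payload = {
    "model": "Qwen/Qwen3-4B",
    "messages": [{"role": "user", "content": "Say hello from Crafter."}],
    "temperature": 0.6,
    "max_tokens": 64,
}
headers = {"Authorization": f"Bearer {os.environ.get('SYNTH_API_KEY', '')}"}

resp = requests.post(url, json=payload, headers=headers, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```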
@@ -0,0 +1,31 @@
+ # Verilog Eval Config for Groq Qwen3-32B
+ # Quick eval to test Verilog task app before RL training
+
+ [eval]
+ app_id = "grpo-verilog"
+ task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
+ model = "qwen/qwen3-32b"
+ seeds = [0, 1, 2]
+ max_turns = 15
+ concurrency = 1
+ env_name = "verilog"
+ policy_name = "verilog-designer"
+ trace_format = "full"
+ return_trace = true
+
+ [eval.env_config]
+ difficulty = "medium" # Can be "easy", "medium", or "hard"
+
+ [eval.policy_config]
+ provider = "groq"
+ model = "qwen/qwen3-32b"
+ inference_url = "https://api.groq.com/openai/v1/chat/completions"
+ temperature = 0.2
+ max_tokens = 8192 # Large buffer for Verilog (long testbenches + module implementation)
+
+ [eval.judge]
+ path = "examples/multi_step/judges/verilog_backend_judge.py"
+ name = "Backend"
+ backend_url = "http://localhost:8000/api"
+ model = "openai/gpt-oss-120b"
+ timeout_s = 45
@@ -0,0 +1,33 @@
+ # Verilog eval using Synth backend with Qwen3-8B
+
+ [eval]
+ app_id = "grpo-verilog"
+ task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
+ model = "Qwen/Qwen3-8B"
+ seeds = [0, 1, 2]
+ max_turns = 6
+ concurrency = 1
+ env_name = "verilog"
+ policy_name = "verilog-designer"
+ trace_format = "full"
+ return_trace = true
+
+ [eval.env_config]
+ difficulty = "medium"
+
+ [eval.policy_config]
+ provider = "openai"
+ model = "Qwen/Qwen3-8B"
+ inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+ temperature = 0.2
+ top_p = 0.95
+ max_tokens = 4096
+ max_llm_calls = 6
+
+ [eval.judge]
+ path = "examples/multi_step/judges/verilog_backend_judge.py"
+ name = "Backend"
+ backend_url = "http://localhost:8000/api"
+ model = "openai/gpt-oss-120b"
+ timeout_s = 45
+