synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic.

Files changed (293)
  1. examples/README.md +1 -0
  2. examples/multi_step/SFT_README.md +147 -0
  3. examples/multi_step/configs/README_verilog_rl.md +77 -0
  4. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  5. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  6. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  7. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  8. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
  9. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  10. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  11. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  12. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  13. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  14. examples/multi_step/convert_traces_to_sft.py +84 -0
  15. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  16. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  17. examples/multi_step/readme.md +48 -0
  18. examples/multi_step/run_sft_qwen30b.sh +45 -0
  19. examples/multi_step/verilog_rl_lora.md +218 -0
  20. examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
  21. examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
  22. examples/qwen_coder/configs/coder_lora_small.toml +2 -1
  23. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  24. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  25. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  26. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  27. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  28. examples/qwen_vl/QUICKSTART.md +327 -0
  29. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  30. examples/qwen_vl/README.md +154 -0
  31. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  32. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  33. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  34. examples/qwen_vl/SETUP_COMPLETE.md +275 -0
  35. examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
  36. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  37. examples/qwen_vl/__init__.py +2 -0
  38. examples/qwen_vl/collect_data_via_cli.md +423 -0
  39. examples/qwen_vl/collect_vision_traces.py +368 -0
  40. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
  41. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
  42. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
  43. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  44. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
  45. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
  46. examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
  47. examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
  48. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  49. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  50. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  51. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  52. examples/qwen_vl/run_vision_comparison.sh +62 -0
  53. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  54. examples/qwen_vl/test_image_validation.py +201 -0
  55. examples/qwen_vl/test_sft_vision_data.py +110 -0
  56. examples/rl/README.md +1 -1
  57. examples/rl/configs/eval_base_qwen.toml +17 -0
  58. examples/rl/configs/eval_rl_qwen.toml +13 -0
  59. examples/rl/configs/rl_from_base_qwen.toml +37 -0
  60. examples/rl/configs/rl_from_base_qwen17.toml +76 -0
  61. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  62. examples/rl/run_eval.py +436 -0
  63. examples/rl/run_rl_and_save.py +111 -0
  64. examples/rl/task_app/README.md +22 -0
  65. examples/rl/task_app/math_single_step.py +990 -0
  66. examples/rl/task_app/math_task_app.py +111 -0
  67. examples/sft/README.md +5 -5
  68. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
  69. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
  70. examples/sft/evaluate.py +4 -4
  71. examples/sft/export_dataset.py +7 -4
  72. examples/sft/generate_traces.py +2 -0
  73. examples/swe/task_app/README.md +1 -1
  74. examples/swe/task_app/grpo_swe_mini.py +1 -1
  75. examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
  76. examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
  77. examples/swe/task_app/hosted/policy_routes.py +0 -2
  78. examples/swe/task_app/hosted/rollout.py +2 -8
  79. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  80. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  81. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  82. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  83. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  84. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  85. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  86. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  87. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  88. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  89. examples/task_apps/crafter/task_app/__init__.py +3 -0
  90. examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
  91. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  92. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
  93. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  94. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
  95. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
  96. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
  97. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  98. examples/task_apps/enron/__init__.py +1 -0
  99. examples/task_apps/enron/filter_sft.toml +5 -0
  100. examples/task_apps/enron/tests/__init__.py +2 -0
  101. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  102. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  103. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  104. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  105. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  106. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  107. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  108. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  109. examples/task_apps/pokemon_red/task_app.py +199 -6
  110. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  111. examples/task_apps/sokoban/filter_sft.toml +5 -0
  112. examples/task_apps/sokoban/tests/__init__.py +2 -0
  113. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  115. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  116. examples/task_apps/verilog/filter_sft.toml +5 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  118. examples/task_apps/verilog/tests/__init__.py +2 -0
  119. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  121. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  122. examples/vlm/README.md +3 -3
  123. examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
  124. examples/vlm/crafter_openai_vlm_agent.py +3 -5
  125. examples/vlm/filter_image_rows.py +1 -1
  126. examples/vlm/run_crafter_vlm_benchmark.py +2 -2
  127. examples/warming_up_to_rl/_utils.py +92 -0
  128. examples/warming_up_to_rl/analyze_trace_db.py +1 -1
  129. examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
  130. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
  131. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
  132. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
  133. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
  134. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
  135. examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
  136. examples/warming_up_to_rl/export_trace_sft.py +174 -60
  137. examples/warming_up_to_rl/groq_test.py +2 -0
  138. examples/warming_up_to_rl/readme.md +63 -132
  139. examples/warming_up_to_rl/run_fft_and_save.py +1 -1
  140. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  141. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  142. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  143. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  144. examples/warming_up_to_rl/run_rl_and_save.py +1 -1
  145. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  146. examples/warming_up_to_rl/task_app/README.md +42 -0
  147. examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
  148. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  149. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  150. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  151. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  152. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  153. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  154. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  155. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  156. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  157. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
  158. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  159. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  160. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  161. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
  162. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  163. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
  164. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  165. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
  166. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  167. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
  168. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  169. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  170. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  171. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  172. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
  173. synth_ai/__init__.py +44 -30
  174. synth_ai/_utils/__init__.py +47 -0
  175. synth_ai/_utils/base_url.py +10 -0
  176. synth_ai/_utils/http.py +10 -0
  177. synth_ai/_utils/prompts.py +10 -0
  178. synth_ai/_utils/task_app_state.py +12 -0
  179. synth_ai/_utils/user_config.py +10 -0
  180. synth_ai/api/models/supported.py +145 -7
  181. synth_ai/api/train/__init__.py +13 -1
  182. synth_ai/api/train/cli.py +30 -7
  183. synth_ai/api/train/config_finder.py +18 -11
  184. synth_ai/api/train/env_resolver.py +13 -10
  185. synth_ai/cli/__init__.py +66 -49
  186. synth_ai/cli/_modal_wrapper.py +9 -6
  187. synth_ai/cli/_typer_patch.py +0 -2
  188. synth_ai/cli/_validate_task_app.py +22 -4
  189. synth_ai/cli/legacy_root_backup.py +3 -1
  190. synth_ai/cli/lib/__init__.py +10 -0
  191. synth_ai/cli/lib/task_app_discovery.py +7 -0
  192. synth_ai/cli/lib/task_app_env.py +518 -0
  193. synth_ai/cli/recent.py +1 -0
  194. synth_ai/cli/setup.py +266 -0
  195. synth_ai/cli/task_app_deploy.py +16 -0
  196. synth_ai/cli/task_app_list.py +25 -0
  197. synth_ai/cli/task_app_modal_serve.py +16 -0
  198. synth_ai/cli/task_app_serve.py +18 -0
  199. synth_ai/cli/task_apps.py +392 -141
  200. synth_ai/cli/train.py +18 -0
  201. synth_ai/cli/tui.py +62 -0
  202. synth_ai/demos/__init__.py +10 -0
  203. synth_ai/demos/core/__init__.py +28 -1
  204. synth_ai/demos/crafter/__init__.py +1 -0
  205. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  206. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  207. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  208. synth_ai/demos/demo_registry.py +176 -0
  209. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  210. synth_ai/demos/math/__init__.py +1 -0
  211. synth_ai/demos/math/_common.py +16 -0
  212. synth_ai/demos/math/app.py +38 -0
  213. synth_ai/demos/math/config.toml +76 -0
  214. synth_ai/demos/math/deploy_modal.py +54 -0
  215. synth_ai/demos/math/modal_task_app.py +702 -0
  216. synth_ai/demos/math/task_app_entry.py +51 -0
  217. synth_ai/environments/environment/core.py +7 -1
  218. synth_ai/environments/examples/bandit/engine.py +0 -1
  219. synth_ai/environments/examples/bandit/environment.py +0 -1
  220. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  221. synth_ai/environments/examples/verilog/engine.py +76 -10
  222. synth_ai/environments/examples/wordle/environment.py +0 -1
  223. synth_ai/evals/base.py +16 -5
  224. synth_ai/evals/client.py +1 -1
  225. synth_ai/inference/client.py +1 -1
  226. synth_ai/learning/client.py +1 -1
  227. synth_ai/learning/health.py +1 -1
  228. synth_ai/learning/jobs.py +1 -1
  229. synth_ai/learning/rl/client.py +1 -1
  230. synth_ai/learning/rl/env_keys.py +1 -1
  231. synth_ai/learning/rl/secrets.py +1 -1
  232. synth_ai/learning/sft/client.py +1 -1
  233. synth_ai/learning/sft/data.py +407 -4
  234. synth_ai/learning/validators.py +4 -1
  235. synth_ai/task/__init__.py +11 -1
  236. synth_ai/task/apps/__init__.py +5 -2
  237. synth_ai/task/config.py +259 -0
  238. synth_ai/task/contracts.py +15 -2
  239. synth_ai/task/rubrics/__init__.py +4 -2
  240. synth_ai/task/rubrics/loaders.py +27 -4
  241. synth_ai/task/rubrics/scoring.py +3 -0
  242. synth_ai/task/rubrics.py +219 -0
  243. synth_ai/task/trace_correlation_helpers.py +328 -0
  244. synth_ai/task/tracing_utils.py +14 -3
  245. synth_ai/task/validators.py +145 -2
  246. synth_ai/tracing_v3/config.py +15 -13
  247. synth_ai/tracing_v3/constants.py +21 -0
  248. synth_ai/tracing_v3/db_config.py +3 -1
  249. synth_ai/tracing_v3/decorators.py +10 -7
  250. synth_ai/tracing_v3/session_tracer.py +10 -0
  251. synth_ai/tracing_v3/turso/daemon.py +2 -2
  252. synth_ai/tracing_v3/turso/native_manager.py +108 -77
  253. synth_ai/tracing_v3/utils.py +1 -1
  254. synth_ai/tui/__init__.py +5 -0
  255. synth_ai/tui/__main__.py +13 -0
  256. synth_ai/tui/cli/__init__.py +1 -0
  257. synth_ai/tui/cli/query_experiments.py +164 -0
  258. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  259. synth_ai/tui/dashboard.py +911 -0
  260. synth_ai/utils/__init__.py +101 -0
  261. synth_ai/utils/base_url.py +94 -0
  262. synth_ai/utils/cli.py +131 -0
  263. synth_ai/utils/env.py +287 -0
  264. synth_ai/utils/http.py +169 -0
  265. synth_ai/utils/modal.py +308 -0
  266. synth_ai/utils/process.py +212 -0
  267. synth_ai/utils/prompts.py +39 -0
  268. synth_ai/utils/sqld.py +122 -0
  269. synth_ai/utils/task_app_discovery.py +882 -0
  270. synth_ai/utils/task_app_env.py +186 -0
  271. synth_ai/utils/task_app_state.py +318 -0
  272. synth_ai/utils/user_config.py +137 -0
  273. synth_ai/v0/config/__init__.py +1 -5
  274. synth_ai/v0/config/base_url.py +1 -7
  275. synth_ai/v0/tracing/config.py +1 -1
  276. synth_ai/v0/tracing/decorators.py +1 -1
  277. synth_ai/v0/tracing/upload.py +1 -1
  278. synth_ai/v0/tracing_v1/config.py +1 -1
  279. synth_ai/v0/tracing_v1/decorators.py +1 -1
  280. synth_ai/v0/tracing_v1/upload.py +1 -1
  281. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
  282. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
  283. synth_ai/cli/man.py +0 -106
  284. synth_ai/compound/cais.py +0 -0
  285. synth_ai/core/experiment.py +0 -13
  286. synth_ai/core/system.py +0 -15
  287. synth_ai/demo_registry.py +0 -295
  288. synth_ai/handshake.py +0 -109
  289. synth_ai/http.py +0 -26
  290. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
  291. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
  292. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
  293. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
examples/README.md ADDED
@@ -0,0 +1 @@
1
+ ### Instructions for creating and configuring a task app are documented at https://docs.usesynth.ai/sdk/task-apps
@@ -0,0 +1,147 @@
1
+ # SFT Training for Qwen3-Coder-30B with LoRA
2
+
3
+ Supervised Fine-Tuning configuration for the same 30B MoE model used in RL training.
4
+
5
+ ## Configuration Overview
6
+
7
+ **Model:** `Qwen/Qwen3-Coder-30B-A3B-Instruct` (Mixture of Experts)
8
+
9
+ **Hardware:** 4x H200 GPUs (564GB total VRAM)
10
+
11
+ **Parallelism Strategy:**
12
+ - **Tensor Parallel (TP)**: 2 GPUs - Splits the model across 2 GPUs for inference/forward pass
13
+ - **Data Parallel (DP)**: 2 GPUs - Splits batches across 2 GPUs for training throughput
14
+
15
+ **LoRA Configuration:**
16
+ - Rank (r): 16
17
+ - Alpha: 32
18
+ - Dropout: 0.05
19
+ - Target modules: `["all-linear"]` - Applies LoRA to all linear layers
20
+
21
+ ## Memory Breakdown per GPU
22
+
23
+ With 4x H200 (141GB each):
24
+
25
+ **Model Split (TP=2):**
26
+ - 2 GPUs hold the base model (70GB each)
27
+ - ~70GB free per GPU for activations and gradients
28
+
29
+ **Training (DP=2):**
30
+ - 2 GPUs process different batches
31
+ - LoRA adapters: ~5-10GB per GPU
32
+ - Gradients/optimizer states: ~20-30GB per GPU
33
+ - **Total per training GPU: ~50-60GB** ✅
34
+
35
+ ## Quick Start
36
+
37
+ ### 1. Prepare Your Dataset
38
+
39
+ Your dataset should be in JSONL format with conversation turns:
40
+
41
+ ```jsonl
42
+ {"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
43
+ {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
44
+ ```
45
+
46
+ ### 2. Run Training
47
+
48
+ ```bash
49
+ # Using the helper script
50
+ ./examples/multi_step/run_sft_qwen30b.sh path/to/your/dataset.jsonl
51
+
52
+ # Or directly with synth-ai CLI
53
+ uvx synth-ai train \
54
+ --type sft \
55
+ --config examples/multi_step/configs/crafter_sft_qwen30b_lora.toml \
56
+ --dataset path/to/your/dataset.jsonl \
57
+ --env-file backend/.env.dev
58
+ ```
59
+
60
+ ### 3. Monitor Training
61
+
62
+ Check the Synth dashboard for:
63
+ - Training loss curve
64
+ - Validation metrics (if validation set provided)
65
+ - GPU utilization
66
+ - Training throughput (tokens/sec)
67
+
68
+ ## Hyperparameters
69
+
70
+ **Batch Configuration:**
71
+ - Per-device batch size: 1
72
+ - Gradient accumulation: 64 steps
73
+ - **Effective global batch size: 128** (1 × 64 × 2 GPUs)
74
+
75
+ **Learning Rate:**
76
+ - Initial LR: 5e-6
77
+ - Warmup ratio: 3%
78
+ - Schedule: Linear decay
79
+
80
+ **Sequence Length:** 4096 tokens
81
+
82
+ **Training:**
83
+ - Epochs: 1
84
+ - Mixed precision: BF16
85
+ - DeepSpeed: Stage 2 (optimizer state sharding)
86
+ - Activation checkpointing: Enabled
87
+
88
+ ## Configuration File Structure
89
+
90
+ ```toml
91
+ [algorithm]
92
+ type = "offline" # Supervised (not RL)
93
+ method = "sft" # Supervised fine-tuning
94
+ variety = "lora" # Using LoRA adapters
95
+
96
+ [compute]
97
+ gpu_type = "H200"
98
+ gpu_count = 4
99
+
100
+ [data.topology]
101
+ tensor_parallel = 2 # Split model across 2 GPUs
102
+ data_parallel = 2 # Split batches across 2 GPUs
103
+
104
+ [training]
105
+ mode = "lora"
106
+ use_qlora = true # Quantized LoRA (4-bit base model)
107
+
108
+ [lora]
109
+ r = 16 # LoRA rank
110
+ alpha = 32 # LoRA scaling
111
+ dropout = 0.05
112
+ target_modules = ["all-linear"] # Apply to all linear layers
113
+ ```
114
+
115
+ ## Comparison with RL Config
116
+
117
+ | Aspect | SFT | RL |
118
+ |--------|-----|-----|
119
+ | Purpose | Supervised learning | Reinforcement learning |
120
+ | Data | Labeled examples | Environment interactions |
121
+ | Topology | TP=2, DP=2 | Split: 2 inference + 2 training |
122
+ | Batch size | 128 (effective) | Variable (episode-based) |
123
+ | Training | Standard backprop | Policy gradient (GSPO) |
124
+
125
+ ## Tips
126
+
127
+ 1. **Start Small:** Test with a small dataset first to verify the pipeline
128
+ 2. **Validation:** Add a validation set to monitor overfitting
129
+ 3. **Checkpointing:** Training saves checkpoints every 100 steps
130
+ 4. **Resume:** Can resume from checkpoint if training is interrupted
131
+ 5. **Inference:** After training, use the LoRA adapter with the base model (see the sketch below)
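A minimal inference sketch for tip 5, assuming the adapter was exported in standard PEFT format (the adapter path below is a placeholder):

```python
# Minimal sketch: run the trained LoRA adapter on top of the base model.
# Assumes a standard PEFT-format adapter directory; the adapter path is a placeholder.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(BASE)
model = PeftModel.from_pretrained(base_model, "path/to/lora_adapter")  # placeholder path

messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```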
132
+
133
+ ## Output
134
+
135
+ After training completes, you'll get:
136
+ - LoRA adapter weights (saved to volume)
137
+ - Training metrics and logs
138
+ - Best checkpoint (based on validation loss)
139
+ - Model ready for inference or RL initialization
140
+
141
+ ## Next Steps
142
+
143
+ 1. **Evaluate:** Test your fine-tuned model on held-out data
144
+ 2. **RL Training:** Use this as initialization for RL (`init_from_sft = true`)
145
+ 3. **Deploy:** Load LoRA adapter for inference
146
+ 4. **Iterate:** Adjust hyperparameters based on performance
147
+
@@ -0,0 +1,77 @@
1
+ # Verilog RL with LoRA (Qwen3-0.6B)
2
+
3
+ ## Quick Start
4
+
5
+ 1. **Deploy Verilog Task App**:
6
+ ```bash
7
+ cd synth-ai
8
+ uvx synth-ai modal-serve grpo-verilog
9
+ ```
10
+ Note the Modal URL and update `task_url` in `verilog_rl_lora.toml`.
11
+
12
+ 2. **Run Training**:
13
+ ```bash
14
+ uvx synth-ai rl run --config examples/multi_step/configs/verilog_rl_lora.toml
15
+ ```
16
+
17
+ ## Configuration Overview
18
+
19
+ ### **Key Adaptations from Crafter**:
20
+
21
+ - **Model**: `Qwen/Qwen3-0.6B` (✅ proven in SFT configs)
22
+ - **Environment**: `verilog` instead of `crafter`
23
+ - **Steps**: 15 turns (vs Crafter's 10) for compilation workflows
24
+ - **Rewards**: Adjusted for sparser Verilog rewards (0.5 vs 1.0 indicator_lambda)
25
+ - **Rubrics**: Verilog-specific judging criteria
26
+
27
+ ### **Hardware Requirements** (Standard RL setup):
28
+ - ✅ **2x H100 GPUs** (vLLM inference + LoRA training split)
29
+ - ✅ **No tensor parallelism** needed for 0.6B model
30
+ - ✅ **4x faster inference** than 32B model
31
+ - ✅ **Same compute pattern** as Crafter (just smaller model)
32
+
33
+ ### **Expected Workflow**:
34
+ 1. Agent writes Verilog code (`write_file`)
35
+ 2. Compiles to check syntax (`compile`)
36
+ 3. Simulates to verify behavior (`simulate`)
37
+ 4. Submits if tests pass (`submit`)
38
+ 5. **Rewards**: +1.0 for compilation success, +10.0 for passing tests
39
+
40
+ ## Rubric Design
41
+
42
+ ### **Event Rewards** (per decision):
43
+ - **Compilation Success**: 70% weight (1.0 for success, 0.0 for errors)
44
+ - **Process Efficiency**: 30% weight (penalizes redundant operations)
45
+
46
+ ### **Outcome Rewards** (final score):
47
+ - **Tests Passed**: 80% weight (full credit when all tests pass)
48
+ - **Design Quality**: 20% weight (code clarity, documentation)
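As an illustration only, the weights above amount to a simple linear blend; the backend judge's actual aggregation may differ:

```python
# Illustrative linear blend of the rubric weights listed above; scores assumed in [0, 1].
# This mirrors the documented percentages, not the backend judge's actual implementation.
def event_reward(compilation_success: float, process_efficiency: float) -> float:
    return 0.7 * compilation_success + 0.3 * process_efficiency

def outcome_reward(tests_passed: float, design_quality: float) -> float:
    return 0.8 * tests_passed + 0.2 * design_quality

# All tests pass but the code is only moderately clean:
assert abs(outcome_reward(1.0, 0.5) - 0.9) < 1e-9
```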
49
+
50
+ ## Troubleshooting
51
+
52
+ ### **If training fails**:
53
+ 1. Check Modal URL in `task_url` field
54
+ 2. Verify `GROQ_API_KEY` for inference
55
+ 3. Ensure `OPENAI_API_KEY` for judging
56
+
57
+ ### **Memory issues** (unlikely with 0.6B):
58
+ - Reduce `batch_size` to 2
59
+ - Set `gradient_accumulation_steps = 2`
60
+ - Verify 2x GPU split is working (vLLM on GPU 0, training on GPU 1)
61
+
62
+ ### **Slow training**:
63
+ - Increase `episodes_per_batch` to 6-8
64
+ - Check network latency to Modal task app
65
+
66
+ ## Expected Results
67
+
68
+ - **Convergence**: Should learn basic compilation workflow in 1-2 hours
69
+ - **Success Rate**: 20-40% initial test pass rate (improves with training)
70
+ - **Learning**: Agent learns to debug compilation errors and write correct Verilog
71
+
72
+ ## Next Steps
73
+
74
+ 1. **Monitor reward progression** in training logs
75
+ 2. **Adjust rubrics** if agent struggles with compilation errors
76
+ 3. **Scale to 8B model** once 0.6B baseline works
77
+ 4. **Add domain-specific fine-tuning** for Verilog syntax
@@ -0,0 +1,90 @@
1
+ # Verilog Reward Structure (Normalized to 1.0)
2
+
3
+ ## Overview
4
+ All rewards in the Verilog task app are normalized so that the largest single reward component is **1.0**.
5
+
6
+ ## Reward Components
7
+
8
+ ### 1. Step Penalty: **-0.001** per step
9
+ - Applied to every action taken
10
+ - Encourages efficient solutions
11
+ - Normalized from `-0.01` (original)
12
+
13
+ ### 2. Compile Success: **+0.01**
14
+ - Awarded when `iverilog` compilation succeeds (returncode 0)
15
+ - Validates syntax correctness
16
+ - Normalized from `+0.1` (original)
17
+
18
+ ### 3. Simulation Pass: **+0.1**
19
+ - Awarded when `vvp` simulation passes all tests
20
+ - Validates behavioral correctness
21
+ - Normalized from `+1.0` (original)
22
+
23
+ ### 4. Submit Success: **+1.0** (maximum reward)
24
+ - Awarded when final submission passes all verification tests
25
+ - This is the goal state
26
+ - Normalized from `+10.0` (original)
27
+
28
+ ## Typical Reward Trajectories
29
+
30
+ ### ✅ Optimal Path (3 steps)
31
+ ```
32
+ Step 1: write_file → -0.001
33
+ Step 2: compile (success) → +0.01 - 0.001 = +0.009
34
+ Step 3: simulate (pass) → +0.1 - 0.001 = +0.099
35
+ Total: ~0.107
36
+ ```
37
+
38
+ ### ✅ Good Path (4 steps with submit)
39
+ ```
40
+ Step 1: write_file → -0.001
41
+ Step 2: compile (success) → +0.009
42
+ Step 3: simulate (pass) → +0.099
43
+ Step 4: submit (success) → +1.0 - 0.001 = +0.999
44
+ Total: ~1.106
45
+ ```
46
+ *Note: Can exceed 1.0 if intermediate rewards stack with final submit*
47
+
48
+ ### ❌ Failure Path (compilation errors)
49
+ ```
50
+ Step 1: write_file → -0.001
51
+ Step 2: compile (fail) → -0.001
52
+ Step 3: write_file (fix) → -0.001
53
+ Step 4: compile (success) → +0.009
54
+ Step 5: simulate (pass) → +0.099
55
+ Total: ~0.105
56
+ ```
57
+
58
+ ## Implementation Details
59
+
60
+ ### Location
61
+ - **Reward components**: `synth_ai/environments/examples/verilog/engine.py`
62
+ - `VerilogCompileSuccessComponent`: +0.01
63
+ - `VerilogSimulationPassComponent`: +0.1
64
+ - `VerilogSubmitSuccessComponent`: +1.0
65
+ - `VerilogStepPenaltyComponent`: -0.001
66
+
67
+ ### Normalization Ratio
68
+ All rewards were divided by **10.0** to normalize:
69
+ - Original max: ~10.0
70
+ - Normalized max: ~1.0
71
+ - Ratio: 10.0
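A quick sketch of the mapping, using the values from this document:

```python
# The normalization described above: every original reward component divided by 10.
ORIGINAL = {
    "step_penalty": -0.01,
    "compile_success": 0.10,
    "simulation_pass": 1.00,
    "submit_success": 10.00,
}
RATIO = 10.0
NORMALIZED = {name: value / RATIO for name, value in ORIGINAL.items()}
assert NORMALIZED["submit_success"] == 1.0  # largest single reward component
# {'step_penalty': -0.001, 'compile_success': 0.01, 'simulation_pass': 0.1, 'submit_success': 1.0}
```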
72
+
73
+ ## Why Normalize?
74
+
75
+ 1. **Consistency**: Makes it easier to compare rewards across different task types
76
+ 2. **RL Training**: Standard reward scales improve learning stability
77
+ 3. **Interpretability**: Rewards as percentages (0.0 to 1.0) are intuitive
78
+ 4. **Judge Compatibility**: Rubric scores typically range 0-1, making blending easier
79
+
80
+ ## Testing
81
+ ```bash
82
+ # Run eval to verify normalized rewards
83
+ uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
84
+ ```
85
+
86
+ Expected output for successful rollout:
87
+ - `mean_return` ≈ 0.1 (if only compile+simulate)
88
+ - `mean_return` ≈ 1.0+ (if full submit success)
89
+
90
+
@@ -0,0 +1,183 @@
1
+ # Verilog Task App - RL Training Readiness Checklist
2
+
3
+ ## ✅ Core Requirements
4
+
5
+ ### 1. Reward Normalization
6
+ - ✅ **Max reward = 1.0**: All reward components scaled into the `[0, 1]` range
7
+ - ✅ **Step penalty**: `-0.001` (normalized from `-0.01`)
8
+ - ✅ **Compile success**: `+0.01` (normalized from `+0.1`)
9
+ - ✅ **Simulate pass**: `+0.1` (normalized from `+1.0`)
10
+ - ✅ **Submit success**: `+1.0` (normalized from `+10.0`)
11
+
12
+ ### 2. Inference URL Handling (Critical for Trace Correlation)
13
+ - ✅ **Extracts from policy config**: Uses `policy_config.get("inference_url")` as primary source
14
+ - ✅ **Includes in trajectory**: Sets `trajectory.inference_url` with `?cid=...` parameter
15
+ - ✅ **Includes in final.info**: Adds to `final["info"]["inference_url"]`
16
+ - ✅ **Includes in pipeline_metadata**: Top-level `inference_url` field for trainer extraction
17
+ - ✅ **Logs cid presence**: Logs `has_cid` flag for debugging
18
+ - ✅ **Fallback to agent.inference_url**: Uses agent's URL if policy config missing (eval mode)
19
+
20
+ **Location**: `grpo_verilog.py` lines 829-867, 887-908
21
+
22
+ ### 3. Pipeline Metadata
23
+ - ✅ **Required fields present**:
24
+ - `reward_score`: Final episode reward
25
+ - `policy_id`: Policy identifier
26
+ - `inference_url`: **CRITICAL** - Contains `?cid=trace_xxxxx` for correlation
27
+ - `env_name`: Environment identifier
28
+ - `task_id`: Problem identifier
29
+ - `task_split`: Dataset split (train/val/test)
30
+ - ✅ **Inference details**: Provider, model, URL in nested `inference` dict
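For reference, a hypothetical payload carrying the fields above (all values are illustrative, not taken from a real run):

```python
# Hypothetical pipeline_metadata payload; field names follow the checklist above,
# all values are illustrative placeholders.
pipeline_metadata = {
    "reward_score": 0.107,                     # final episode reward
    "policy_id": "verilog-policy",             # placeholder policy identifier
    "inference_url": "http://vllm.internal:8000/v1/chat/completions?cid=trace_abc123",  # CRITICAL: carries ?cid=...
    "env_name": "verilog",
    "task_id": "example_problem_001",          # placeholder problem identifier
    "task_split": "train",
    "inference": {
        "provider": "groq",
        "model": "qwen/qwen3-32b",
        "url": "https://api.groq.com/openai/v1/chat/completions",
    },
}
```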
31
+
32
+ **Location**: `grpo_verilog.py` lines 887-908
33
+
34
+ ### 4. Trace Correlation (Required for RL Training)
35
+ - ✅ **Trainer injects cid**: Trainer adds `?cid=trace_xxxxx` to `policy_config["inference_url"]`
36
+ - ✅ **Task app preserves cid**: Uses `policy_config["inference_url"]` directly
37
+ - ✅ **Trainer extracts cid**: Extracts from `trajectory.inference_url` using `inference_url_to_trace_correlation_id()`
38
+ - ✅ **Trace hydration**: Trainer queries trace store with extracted `trace_correlation_id`
39
+
40
+ **Flow**:
41
+ ```
42
+ Trainer → policy_config["inference_url"] = "http://...?cid=trace_xxxxx"
43
+
44
+ Task App → trajectory.inference_url = policy_config["inference_url"]
45
+
46
+ Trainer → extract_trace_correlation_id(trajectory.inference_url)
47
+
48
+ Trainer → trace_store.resolve_correlation(trace_correlation_id)
49
+
50
+ Trainer → Hydrate v3 trace with event_history
51
+
52
+ Judge → Score using full trace
53
+ ```
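A standard-library sketch of the two URL-handling steps in this flow; the SDK's own helper, `inference_url_to_trace_correlation_id`, is exercised in Test 3 below:

```python
# Sketch of the cid round-trip using only the standard library; the trainer's real
# helper (inference_url_to_trace_correlation_id) is shown in Test 3 below.
from urllib.parse import parse_qs, urlencode, urlparse

def attach_cid(inference_url: str, trace_correlation_id: str) -> str:
    """Trainer side: append ?cid=... to the policy's inference URL."""
    sep = "&" if "?" in inference_url else "?"
    return f"{inference_url}{sep}{urlencode({'cid': trace_correlation_id})}"

def extract_cid(inference_url: str) -> str | None:
    """Trainer side, after rollout: recover cid from trajectory.inference_url."""
    values = parse_qs(urlparse(inference_url).query).get("cid")
    return values[0] if values else None

url = attach_cid("http://localhost:8000/v1/chat/completions", "trace_abc123")
assert extract_cid(url) == "trace_abc123"
```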
54
+
55
+ ### 5. Response Contract Compliance
56
+ - ✅ **RolloutResponse fields**:
57
+ - `run_id`: Unique identifier
58
+ - `trajectories`: List of trajectories (with `inference_url`)
59
+ - `metrics`: Episode metrics
60
+ - `pipeline_metadata`: **CRITICAL** - Contains `inference_url` and `reward_score`
61
+ - `trace_correlation_id`: Optional (trainer infers from `inference_url`)
62
+ - ✅ **Optional trace_correlation_id**: Made optional in `contracts.py` (trainer infers from URL)
63
+
64
+ **Location**: `synth_ai/task/contracts.py` line 156
65
+
66
+ ### 6. Environment Implementation
67
+ - ✅ **Stateful engine**: `VerilogEngine` extends `StatefulEngine`
68
+ - ✅ **Reward stack**: Properly configured with normalized components
69
+ - ✅ **State management**: `VerilogPublicState` and `VerilogPrivateState`
70
+ - ✅ **Tool implementation**: All 4 tools (write_file, compile, simulate, submit)
71
+
72
+ **Location**: `synth_ai/environments/examples/verilog/engine.py`
73
+
74
+ ### 7. LLM Agent Integration
75
+ - ✅ **Multi-turn support**: Agent maintains conversation history
76
+ - ✅ **Tool parsing**: Extracts tool calls from LLM responses
77
+ - ✅ **Guidance system**: Provides context-aware hints
78
+ - ✅ **Error handling**: Graceful fallback for malformed responses
79
+
80
+ **Location**: `grpo_verilog.py` lines 200-530
81
+
82
+ ## 🔍 Verification Tests
83
+
84
+ ### Test 1: Eval Mode (No Trace Correlation)
85
+ ```bash
86
+ uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
87
+ ```
88
+ **Expected**:
89
+ - ✅ `mean_return` ≈ 0.1 (normalized rewards)
90
+ - ✅ `inference_url` = Groq API URL (no `?cid=...`)
91
+ - ✅ `task_completed` = True for correct solutions
92
+
93
+ ### Test 2: RL Training Mode (With Trace Correlation)
94
+ ```bash
95
+ uvx synth-ai train \
96
+ --type rl \
97
+ --config examples/multi_step/configs/verilog_rl_lora.toml \
98
+ --task-url https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run \
99
+ --backend https://synth-backend-dev-docker.onrender.com/api \
100
+ --env-file /path/to/verilog/.env
101
+ ```
102
+ **Expected**:
103
+ - ✅ Trainer logs show `inference_url` with `?cid=trace_xxxxx`
104
+ - ✅ Task app logs show `has_cid=True`
105
+ - ✅ Trace hydration succeeds (no `404 Not Found` errors)
106
+ - ✅ Judge receives full `event_history`
107
+ - ✅ Training updates show non-zero rewards
108
+
109
+ ### Test 3: Trace Correlation ID Extraction
110
+ ```python
111
+ from synth_envs_hosted.utils import inference_url_to_trace_correlation_id
112
+
113
+ # Should extract trace_xxxxx from URL
114
+ url = "http://localhost:8000/v1/chat/completions?cid=trace_abc123"
115
+ cid = inference_url_to_trace_correlation_id(url)
116
+ assert cid == "trace_abc123"
117
+ ```
118
+
119
+ ### Test 4: Pipeline Metadata Structure
120
+ ```python
121
+ # Verify response has correct structure for RL
122
+ response = await task_app.rollout(request)
123
+ assert "pipeline_metadata" in response
124
+ assert "inference_url" in response.pipeline_metadata
125
+ assert "reward_score" in response.pipeline_metadata
126
+ assert len(response.trajectories) > 0
127
+ assert response.trajectories[0].inference_url is not None
128
+ ```
129
+
130
+ ## 📋 Deployment Checklist
131
+
132
+ ### Modal Deployment
133
+ 1. ✅ **Environment variables set**:
134
+ - `GROQ_API_KEY`
135
+ - `VERILOG_INFERENCE_URL` (optional, uses Groq default)
136
+ 2. ✅ **Secrets configured**: Groq API key in Modal secrets
137
+ 3. ✅ **Task app URL**: Update in `verilog_rl_lora.toml`
138
+
139
+ ### Training Configuration
140
+ 1. ✅ **2x GPUs minimum**: 1 for vLLM, 1 for training
141
+ 2. ✅ **Model size**: `Qwen/Qwen3-0.6B` for testing
142
+ 3. ✅ **Batch size**: 4 (matches Crafter)
143
+ 4. ✅ **Max turns**: 15 (enough for compile chains)
144
+ 5. ✅ **Rubric enabled**: `rubric.enabled = true`
145
+
146
+ ## 🚨 Common Issues & Fixes
147
+
148
+ ### Issue 1: `trace_correlation_id` Missing
149
+ **Symptom**: Trainer logs `FATAL: Rollout payload missing 'trace_correlation_id'`
150
+ **Fix**: Verify `trajectory.inference_url` contains `?cid=...` parameter
151
+
152
+ ### Issue 2: Trace Hydration Fails (404)
153
+ **Symptom**: `404 Not Found` when querying `/trace/by-correlation/...`
154
+ **Fix**:
155
+ - Check inference server is capturing traces
156
+ - Verify `cid` parameter is in inference URL
157
+ - Ensure `vllm_public_url` is set correctly
158
+
159
+ ### Issue 3: Rewards Not Normalized
160
+ **Symptom**: `mean_return` > 1.0 in eval
161
+ **Fix**: Verify all reward components in `engine.py` have been scaled down by 10x (see VERILOG_REWARDS.md)
162
+
163
+ ### Issue 4: Agent Gets Stuck
164
+ **Symptom**: Agent repeats same action (e.g., compile without fixing)
165
+ **Fix**: Check guidance system is providing proper hints
166
+
167
+ ## 🎯 Final Verification
168
+
169
+ Before starting RL training, verify:
170
+ - [ ] Eval runs successfully with normalized rewards (≈ 0.1)
171
+ - [ ] Modal deployment returns proper `inference_url` structure
172
+ - [ ] Trace correlation ID extraction works
173
+ - [ ] Pipeline metadata includes all required fields
174
+ - [ ] Response contract matches expected schema
175
+
176
+ **If all checks pass**: ✅ **Ready for RL training!**
177
+
178
+ ## 📚 Related Documentation
179
+ - [VERILOG_REWARDS.md](./VERILOG_REWARDS.md) - Reward structure details
180
+ - [verilog_rl_lora.md](../verilog_rl_lora.md) - RL/LoRA feasibility analysis
181
+ - [verilog_rl_lora.toml](./verilog_rl_lora.toml) - Training configuration
182
+
183
+
@@ -0,0 +1,35 @@
1
+ # Crafter eval using Synth backend with Qwen3-4B
2
+
3
+ [eval]
4
+ app_id = "grpo-crafter-task-app"
5
+ task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
6
+ model = "Qwen/Qwen3-4B"
7
+ seeds = [0, 1, 2]
8
+ max_turns = 10
9
+ concurrency = 1
10
+ env_name = "crafter"
11
+ policy_name = "crafter-react"
12
+ trace_format = "full"
13
+ return_trace = true
14
+
15
+ [eval.env_config]
16
+ env_params = {max_steps_per_episode = 10}
17
+
18
+ [eval.policy_config]
19
+ provider = "openai"
20
+ model = "Qwen/Qwen3-4B"
21
+ inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
22
+ temperature = 0.6
23
+ top_p = 0.95
24
+ max_tokens = 512
25
+ use_vision = false
26
+ image_only_mode = false
27
+ max_llm_calls = 10
28
+
29
+ [eval.judge]
30
+ path = "examples/multi_step/judges/crafter_backend_judge.py"
31
+ name = "Backend"
32
+ backend_url = "http://localhost:8000/api"
33
+ model = "openai/gpt-oss-120b"
34
+ timeout_s = 45
35
+
@@ -0,0 +1,36 @@
1
+ # Evaluation config for Crafter with text-only input
2
+ # This config uses Groq Qwen with only text observations (no images)
3
+
4
+ [eval]
5
+ app_id = "grpo-crafter-task-app"
6
+ task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
7
+ model = "qwen/qwen3-32b"
8
+ seeds = [0, 1, 2]
9
+ max_turns = 10
10
+ concurrency = 1
11
+ env_name = "crafter"
12
+ policy_name = "crafter-react"
13
+ trace_format = "full"
14
+ return_trace = true
15
+
16
+ [eval.env_config]
17
+ env_params = {max_steps_per_episode = 10}
18
+
19
+ [eval.policy_config]
20
+ provider = "groq"
21
+ model = "qwen/qwen3-32b"
22
+ inference_url = "https://api.groq.com/openai/v1/chat/completions"
23
+ temperature = 0.6
24
+ top_p = 0.95
25
+ max_tokens = 512
26
+ use_vision = false
27
+ image_only_mode = false
28
+ max_llm_calls = 10
29
+
30
+ [eval.judge]
31
+ path = "examples/multi_step/judges/crafter_backend_judge.py"
32
+ name = "Backend"
33
+ backend_url = "http://localhost:8000/api"
34
+ model = "openai/gpt-oss-120b"
35
+ timeout_s = 45
36
+
@@ -16,24 +16,24 @@ judge_url = "https://synth-backend-dev-docker.onrender.com/api"
16
16
 
17
17
  [compute]
18
18
  gpu_type = "H200"
19
- gpu_count = 2
19
+ gpu_count = 4
20
20
 
21
21
  [topology]
22
22
  type = "single_node_split"
23
- gpus_for_vllm = 1
24
- gpus_for_training = 1
23
+ gpus_for_vllm = 2
24
+ gpus_for_training = 2
25
25
  gpus_for_ref = 0
26
- tensor_parallel = 1
26
+ tensor_parallel = 2
27
27
 
28
28
  [vllm]
29
- tensor_parallel_size = 1
30
- max_model_len = 8192
29
+ tensor_parallel_size = 2
30
+ max_model_len = 4096
31
31
 
32
32
  [reference]
33
33
  placement = "none"
34
34
 
35
35
  [model]
36
- base = "Qwen/Qwen3-4B"
36
+ base = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
37
37
  trainer_mode = "lora"
38
38
  label = "crafter-rl-stepwise-hosted-judge"
39
39
 
@@ -46,7 +46,7 @@ target_modules = ["all-linear"]
46
46
  [rollout]
47
47
  env_name = "crafter"
48
48
  max_turns = 10
49
- episodes_per_batch = 4
49
+ episodes_per_batch = 2
50
50
  policy_name = "crafter-react"
51
51
  max_concurrent_rollouts = 8
52
52
  batches_per_step = 2
@@ -69,12 +69,12 @@ ops = ["agent", "env"]
69
69
 
70
70
  [evaluation]
71
71
  instances = 16
72
- every_n_iters = 8
72
+ every_n_iters = 10
73
73
  seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
74
74
 
75
75
  [training]
76
76
  num_epochs = 1
77
- iterations_per_epoch = 16
77
+ iterations_per_epoch = 5
78
78
  gradient_accumulation_steps = 1
79
79
  max_accumulated_minibatch = 1
80
80
  max_turns = 10
@@ -84,6 +84,7 @@ learning_rate = 5e-5
84
84
  log_interval = 1
85
85
  weight_sync_interval = 1
86
86
  event_rewards_kind = "unique"
87
+ async_semaphore_max = 4 # Max concurrent rollouts in streaming pipeline
87
88
 
88
89
  # Enable dense decision rewards in the trainer to mirror env_config step rewards.
89
90
  step_rewards_enabled = true
@@ -127,7 +128,7 @@ criteria = [
127
128
  ]
128
129
 
129
130
  [judge]
130
- type = "gemini" # or "groq" when routing to Groq-hosted judges
131
+ type = "groq" # or "groq" when routing to Groq-hosted judges
131
132
  timeout_s = 45
132
133
 
133
134
  [judge.options]