synth-ai 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of synth-ai might be problematic.

Files changed (192)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/blog_posts/pokemon_vl/README.md +98 -0
  3. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +25 -0
  4. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  5. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  6. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +42 -0
  7. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  8. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  9. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  10. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  11. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  12. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  13. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +41 -0
  14. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  15. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  16. examples/multi_step/configs/crafter_rl_outcome.toml +1 -1
  17. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  18. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -1
  19. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -1
  20. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  21. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  22. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  23. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  24. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  25. examples/qwen_vl/README.md +10 -12
  26. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  27. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  28. examples/qwen_vl/collect_data_via_cli.md +76 -84
  29. examples/qwen_vl/collect_vision_traces.py +4 -4
  30. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  31. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  32. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  33. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  34. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  35. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  36. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  37. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  38. examples/qwen_vl/run_vision_comparison.sh +6 -7
  39. examples/rl/README.md +5 -5
  40. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  41. examples/rl/configs/rl_from_base_qwen17.toml +5 -2
  42. examples/rl/task_app/README.md +1 -2
  43. examples/rl/task_app/math_single_step.py +2 -2
  44. examples/run_crafter_demo.sh +2 -2
  45. examples/sft/README.md +1 -1
  46. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  47. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  48. examples/swe/task_app/README.md +32 -2
  49. examples/swe/task_app/grpo_swe_mini.py +4 -0
  50. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  51. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  52. examples/swe/task_app/hosted/inference/openai_client.py +4 -4
  53. examples/swe/task_app/morph_backend.py +178 -0
  54. examples/task_apps/crafter/task_app/README.md +1 -1
  55. examples/task_apps/crafter/task_app/grpo_crafter.py +66 -3
  56. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  57. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  58. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  59. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +17 -49
  60. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +13 -5
  61. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +15 -1
  62. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  63. examples/task_apps/math/README.md +1 -2
  64. examples/task_apps/pokemon_red/README.md +3 -4
  65. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  66. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  67. examples/task_apps/pokemon_red/task_app.py +36 -5
  68. examples/task_apps/sokoban/README.md +2 -3
  69. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  70. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  71. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  72. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  73. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  74. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -2
  75. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  76. examples/warming_up_to_rl/task_app/README.md +1 -1
  77. examples/warming_up_to_rl/task_app/grpo_crafter.py +134 -3
  78. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +4 -4
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +6 -3
  83. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +5 -0
  85. synth_ai/api/train/builders.py +9 -3
  86. synth_ai/api/train/cli.py +125 -10
  87. synth_ai/api/train/configs/__init__.py +8 -1
  88. synth_ai/api/train/configs/rl.py +32 -7
  89. synth_ai/api/train/configs/sft.py +6 -2
  90. synth_ai/api/train/configs/shared.py +59 -2
  91. synth_ai/auth/credentials.py +119 -0
  92. synth_ai/cli/__init__.py +12 -4
  93. synth_ai/cli/commands/__init__.py +17 -0
  94. synth_ai/cli/commands/demo/__init__.py +6 -0
  95. synth_ai/cli/commands/demo/core.py +163 -0
  96. synth_ai/cli/commands/deploy/__init__.py +23 -0
  97. synth_ai/cli/commands/deploy/core.py +614 -0
  98. synth_ai/cli/commands/deploy/errors.py +72 -0
  99. synth_ai/cli/commands/deploy/validation.py +11 -0
  100. synth_ai/cli/commands/eval/__init__.py +19 -0
  101. synth_ai/cli/commands/eval/core.py +1109 -0
  102. synth_ai/cli/commands/eval/errors.py +81 -0
  103. synth_ai/cli/commands/eval/validation.py +133 -0
  104. synth_ai/cli/commands/filter/__init__.py +12 -0
  105. synth_ai/cli/commands/filter/core.py +388 -0
  106. synth_ai/cli/commands/filter/errors.py +55 -0
  107. synth_ai/cli/commands/filter/validation.py +77 -0
  108. synth_ai/cli/commands/help/__init__.py +177 -0
  109. synth_ai/cli/commands/help/core.py +73 -0
  110. synth_ai/cli/commands/status/__init__.py +64 -0
  111. synth_ai/cli/commands/status/client.py +192 -0
  112. synth_ai/cli/commands/status/config.py +92 -0
  113. synth_ai/cli/commands/status/errors.py +20 -0
  114. synth_ai/cli/commands/status/formatters.py +164 -0
  115. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  116. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  117. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  118. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  119. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  120. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  121. synth_ai/cli/commands/status/utils.py +114 -0
  122. synth_ai/cli/commands/train/__init__.py +53 -0
  123. synth_ai/cli/commands/train/core.py +21 -0
  124. synth_ai/cli/commands/train/errors.py +117 -0
  125. synth_ai/cli/commands/train/judge_schemas.py +199 -0
  126. synth_ai/cli/commands/train/judge_validation.py +304 -0
  127. synth_ai/cli/commands/train/validation.py +443 -0
  128. synth_ai/cli/demo.py +2 -162
  129. synth_ai/cli/deploy/__init__.py +28 -0
  130. synth_ai/cli/deploy/core.py +5 -0
  131. synth_ai/cli/deploy/errors.py +23 -0
  132. synth_ai/cli/deploy/validation.py +5 -0
  133. synth_ai/cli/eval/__init__.py +36 -0
  134. synth_ai/cli/eval/core.py +5 -0
  135. synth_ai/cli/eval/errors.py +31 -0
  136. synth_ai/cli/eval/validation.py +5 -0
  137. synth_ai/cli/filter/__init__.py +28 -0
  138. synth_ai/cli/filter/core.py +5 -0
  139. synth_ai/cli/filter/errors.py +23 -0
  140. synth_ai/cli/filter/validation.py +5 -0
  141. synth_ai/cli/modal_serve/__init__.py +12 -0
  142. synth_ai/cli/modal_serve/core.py +14 -0
  143. synth_ai/cli/modal_serve/errors.py +8 -0
  144. synth_ai/cli/modal_serve/validation.py +11 -0
  145. synth_ai/cli/serve/__init__.py +12 -0
  146. synth_ai/cli/serve/core.py +14 -0
  147. synth_ai/cli/serve/errors.py +8 -0
  148. synth_ai/cli/serve/validation.py +11 -0
  149. synth_ai/cli/setup.py +20 -265
  150. synth_ai/cli/status.py +7 -126
  151. synth_ai/cli/task_app_deploy.py +1 -10
  152. synth_ai/cli/task_app_modal_serve.py +4 -9
  153. synth_ai/cli/task_app_serve.py +4 -11
  154. synth_ai/cli/task_apps.py +58 -1487
  155. synth_ai/cli/train/__init__.py +12 -0
  156. synth_ai/cli/train/core.py +21 -0
  157. synth_ai/cli/train/errors.py +8 -0
  158. synth_ai/cli/train/validation.py +24 -0
  159. synth_ai/cli/train.py +1 -14
  160. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  161. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  162. synth_ai/environments/examples/red/engine.py +33 -12
  163. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  164. synth_ai/environments/examples/red/environment.py +26 -0
  165. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  166. synth_ai/http.py +12 -0
  167. synth_ai/judge_schemas.py +10 -11
  168. synth_ai/learning/rl/client.py +3 -1
  169. synth_ai/streaming/__init__.py +29 -0
  170. synth_ai/streaming/config.py +94 -0
  171. synth_ai/streaming/handlers.py +469 -0
  172. synth_ai/streaming/streamer.py +301 -0
  173. synth_ai/streaming/types.py +95 -0
  174. synth_ai/task/validators.py +2 -2
  175. synth_ai/tracing_v3/migration_helper.py +1 -2
  176. synth_ai/utils/env.py +25 -18
  177. synth_ai/utils/http.py +4 -1
  178. synth_ai/utils/modal.py +2 -2
  179. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/METADATA +8 -3
  180. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/RECORD +184 -109
  181. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  182. synth_ai/cli/tui.py +0 -62
  183. synth_ai/tui/__init__.py +0 -5
  184. synth_ai/tui/__main__.py +0 -13
  185. synth_ai/tui/cli/__init__.py +0 -1
  186. synth_ai/tui/cli/query_experiments.py +0 -164
  187. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  188. synth_ai/tui/dashboard.py +0 -911
  189. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/WHEEL +0 -0
  190. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/entry_points.txt +0 -0
  191. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/licenses/LICENSE +0 -0
  192. {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/top_level.txt +0 -0
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml

@@ -1,17 +1,10 @@
- # Crafter RL experiment – stepwise shaping with hosted judge rubrics
- #
- # This configuration extends the stepwise LoRA baseline by wiring the Synth judge
- # service so evaluation rolls combine dense step rewards with hosted rubric scoring.
-
  [algorithm]
  type = "online"
  method = "policy_gradient"
  variety = "gspo"

  [services]
- # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
  task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
- # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
  judge_url = "https://synth-backend-dev-docker.onrender.com/api"

  [compute]
@@ -41,7 +34,7 @@ label = "crafter-rl-stepwise-hosted-judge"
  r = 16
  alpha = 32
  dropout = 0.05
- target_modules = ["all-linear"]
+ target_modules = [ "all-linear",]

  [rollout]
  env_name = "crafter"
@@ -50,27 +43,12 @@ episodes_per_batch = 2
  policy_name = "crafter-react"
  max_concurrent_rollouts = 8
  batches_per_step = 2
- ops = ["agent", "env"]
-
- [rollout.env_config]
- difficulty = "easy"
-
- [rollout.env_config.step_rewards]
- enabled = true
- mode = "decision_stepwise"
- strategy = "consistent" # +1 for each decision that unlocks a new achievement
- indicator_lambda = 1.0
- step_beta = 0.0
-
- [rollout.policy_config]
- temperature = 0.2
- top_p = 0.95
- max_tokens = 512
+ ops = [ "agent", "env",]

  [evaluation]
  instances = 16
  every_n_iters = 10
- seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ seeds = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,]

  [training]
  num_epochs = 1
@@ -84,104 +62,84 @@ learning_rate = 5e-5
  log_interval = 1
  weight_sync_interval = 1
  event_rewards_kind = "unique"
- async_semaphore_max = 4 # Max concurrent rollouts in streaming pipeline
-
- # Enable dense decision rewards in the trainer to mirror env_config step rewards.
+ async_semaphore_max = 4
  step_rewards_enabled = true
  step_rewards_mode = "decision_stepwise"
  step_rewards_indicator_lambda = 1.0
  step_rewards_beta = 0.0
  step_rewards_strategy = "consistent"

+ [rubric]
+ enabled = true
+
+ [rollout.env_config]
+ difficulty = "easy"
+
+ [rollout.policy_config]
+ temperature = 0.2
+ top_p = 0.95
+ max_tokens = 512
+
  [training.weight_sync]
  enable = true
- targets = ["policy"]
+ targets = [ "policy",]
  mode = "direct"
  direct = true
  verify_every_k = 0

- [rubric]
- enabled = true
- model = "openai/gpt-oss-120b"
- api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
- api_key_env = "OPENAI_API_KEY"
- # Blend the hosted judge scores with environment returns inside the trainer.
  [rubric.weights]
  env = 0.2
  event = 0.4
  outcome = 0.4

- [rubric.event]
- # Hosted judge rubric for per-decision progress scoring.
- rubric_id = "crafter/event@v1"
- criteria = [
- { key = "progress.unique_achievements", weight = 0.9, description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0.", aggregation = "weighted_sum" },
- { key = "process.intent_alignment", weight = 0.1, description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock.", aggregation = "weighted_sum" },
- ]
-
- [rubric.outcome]
- # Hosted judge rubric for final trajectory scoring.
- rubric_id = "crafter/outcome@v1"
- criteria = [
- { key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
- { key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
- ]
-
- [judge]
- type = "groq" # or "groq" when routing to Groq-hosted judges
+ [judge.options]
+ event = true
+ outcome = true
+ provider = "openai"
+ model = "openai/gpt-oss-120b"
+ rubric_id = "crafter/bundle@v1"
  timeout_s = 45

- [judge.options]
- event = true
- outcome = true
- provider = "openai"
- model = "openai/gpt-oss-120b"
- rubric_id = "crafter/bundle@v1"
- max_concurrency = 6
- tracks = ["process", "reasoning", "progress", "outcome"]
-
- [judge.options.rubric_overrides]
-
- [judge.options.rubric_overrides.event]
- goal_text = """
- Treat each decision as a check for new Crafter achievements.
- Award the top score only when the log shows a fresh achievement unlock or an immediately verifiable deterministic completion.
- Keep otherwise useful setup actions in a narrow low band so non-achievement turns stay near zero."""
- aggregation = "weighted_sum"
-
- [[judge.options.rubric_overrides.event.criteria]]
- id = "progress.unique_achievements"
- weight = 0.9
- scale = "binary"
- description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0."
-
- [[judge.options.rubric_overrides.event.criteria]]
- id = "process.intent_alignment"
- weight = 0.1
- scale = "bounded"
- description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock."
-
- [judge.options.rubric_overrides.outcome]
- goal_text = """
- Summarise the episode outcome in relation to Crafter’s win condition:
- survive, accumulate resources, and craft advanced tools or structures.
- Highlight notable achievements, safety failures, and preparedness for future exploration."""
- aggregation = "weighted_sum"
-
- [[judge.options.rubric_overrides.outcome.criteria]]
- id = "outcome.goal_completion"
- weight = 0.6
- scale = "binary"
- description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace)."
-
- [[judge.options.rubric_overrides.outcome.criteria]]
- id = "outcome.achievement_depth"
- weight = 0.4
- scale = "bounded"
- description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
-
- [judge.options.weights]
- process = 0.05
- reasoning = 0.15
- progress = 0.30
- outcome = 0.50
+ [rollout.env_config.step_rewards]
+ enabled = true
+ mode = "decision_stepwise"
+ strategy = "consistent"
+ indicator_lambda = 1.0
+ step_beta = 0.0
+
+ [judge.options.weights]
+ process = 0.05
+ reasoning = 0.15
+ progress = 0.3
+ outcome = 0.5
+
+ [judge.options.rubric_overrides.event]
+ goal_text = "Treat each decision as a check for new Crafter achievements.\nAward the top score only when the log shows a fresh achievement unlock or an immediately verifiable deterministic completion.\nKeep otherwise useful setup actions in a narrow low band so non-achievement turns stay near zero."
+ aggregation = "weighted_sum"
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "progress.unique_achievements"
+ weight = 0.9
+ scale = "binary"
+ description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0."
+
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "process.intent_alignment"
+ weight = 0.1
+ scale = "bounded"
+ description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock."
+
+ [judge.options.rubric_overrides.outcome]
+ goal_text = "Summarise the episode outcome in relation to Crafter’s win condition:\nsurvive, accumulate resources, and craft advanced tools or structures.\nHighlight notable achievements, safety failures, and preparedness for future exploration."
+ aggregation = "weighted_sum"
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.goal_completion"
+ weight = 0.6
+ scale = "binary"
+ description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace)."
+
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.achievement_depth"
+ weight = 0.4
+ scale = "bounded"
+ description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
+
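The hunk above moves the hosted-judge configuration from top-level [rubric.*] tables into nested [judge.options.*] tables, with per-track weights under [judge.options.weights]. As a minimal sketch of how those track weights could be combined with per-track judge scores, the snippet below parses the new layout with Python's standard-library tomllib; the blend_track_scores helper and the example scores are hypothetical illustrations, not synth-ai APIs.

import tomllib  # Python 3.11+ standard library

def blend_track_scores(config_path: str, scores: dict[str, float]) -> float:
    """Hypothetical helper: weight per-track judge scores by [judge.options.weights]."""
    with open(config_path, "rb") as f:
        cfg = tomllib.load(f)
    weights = cfg["judge"]["options"]["weights"]  # process / reasoning / progress / outcome
    # Weighted sum over the tracks present in both the config and the judge output.
    return sum(weights[track] * scores[track] for track in weights if track in scores)

# Example usage with made-up judge scores for one trajectory.
blended = blend_track_scores(
    "crafter_rl_stepwise_hosted_judge.toml",
    {"process": 0.2, "reasoning": 0.5, "progress": 1.0, "outcome": 0.75},
)
print(f"blended judge score: {blended:.3f}")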
examples/multi_step/configs/crafter_rl_stepwise_shaped.toml

@@ -6,7 +6,7 @@ method = "policy_gradient"
  variety = "gspo"

  [services]
- # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+ # Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
  task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

  [compute]
examples/multi_step/configs/crafter_rl_stepwise_simple.toml

@@ -6,7 +6,7 @@ method = "policy_gradient"
  variety = "gspo"

  [services]
- # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+ # Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
  task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

  [compute]
examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml

@@ -0,0 +1,105 @@
+ # Crafter RL experiment – simple stepwise rewards (1 point per new achievement)
+ # This config uses the NEW unified [policy] section format
+
+ [algorithm]
+ type = "online"
+ method = "policy_gradient"
+ variety = "gspo"
+
+ [services]
+ # Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
+ task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
+
+ [compute]
+ gpu_type = "H200"
+ gpu_count = 2
+
+ [compute.topology] # Nested: topology is part of compute
+ type = "single_node_split"
+ gpus_for_vllm = 1
+ gpus_for_training = 1
+ gpus_for_ref = 0
+ tensor_parallel = 1
+ reference_placement = "none" # Reference model placement
+
+ [vllm]
+ tensor_parallel_size = 1
+ max_model_len = 8192
+
+ [judge]
+ enabled = false # Set to true to enable judge/rubric scoring
+
+ # Uncomment to enable judge-based reward blending:
+ # enabled = true
+ # timeout_s = 45
+ #
+ # [judge.reward_blend] # How to blend env/event/outcome reward sources
+ # env = 0.2
+ # event = 0.4
+ # outcome = 0.4
+ #
+ # [judge.options]
+ # provider = "openai"
+ # model = "openai/gpt-oss-120b"
+ # event = true
+ # outcome = true
+ # max_concurrency = 6
+
+ # NEW: Unified [policy] section - single source of truth for model and sampling
+ [policy]
+ model_name = "Qwen/Qwen3-4B"
+ trainer_mode = "lora"
+ label = "crafter-rl-stepwise-simple"
+
+ # Sampling parameters for rollouts
+ max_tokens = 512
+ temperature = 0.6
+ top_p = 0.95
+
+ [rollout]
+ env_name = "crafter"
+ max_turns = 10
+ episodes_per_batch = 4
+ policy_name = "crafter-react"
+ max_concurrent_rollouts = 8
+ batches_per_step = 2
+ ops = ["agent", "env"]
+
+ [evaluation]
+ instances = 10
+ every_n_iters = 10
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+ [training]
+ num_epochs = 1
+ iterations_per_epoch = 10
+ gradient_accumulation_steps = 1
+ max_accumulated_minibatch = 1
+ max_turns = 10
+ batch_size = 4
+ group_size = 4
+ learning_rate = 5e-5
+ log_interval = 1
+ weight_sync_interval = 1
+
+ [training.rewards] # Nested: Reward config under training
+ step_rewards_enabled = true
+ step_rewards_mode = "decision_stepwise"
+ step_rewards_indicator_lambda = 1.0
+ step_rewards_beta = 0.0
+ step_rewards_strategy = "consistent"
+ event_rewards_kind = "unique"
+
+ [training.lora] # Nested: LoRA config under training
+ r = 16
+ alpha = 32
+ dropout = 0.05
+ target_modules = ["all-linear"]
+
+ [training.weight_sync]
+ enable = true
+ targets = ["policy"]
+ mode = "direct"
+ direct = true
+ verify_every_k = 0
+
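This new file introduces the unified [policy] section as the single source of truth for model and sampling settings. The sketch below shows one way a loader might read it, falling back to the older split [model] / [rollout.policy_config] layout; the read_policy_settings helper and its fallback logic are assumptions for illustration, not the synth-ai loader itself.

import tomllib

def read_policy_settings(path: str) -> dict:
    """Sketch: prefer the unified [policy] table, fall back to the legacy split layout."""
    with open(path, "rb") as f:
        cfg = tomllib.load(f)
    if "policy" in cfg:  # new unified format
        policy = cfg["policy"]
        sampling_keys = ("max_tokens", "temperature", "top_p")
        return {
            "model": policy["model_name"],
            "trainer_mode": policy["trainer_mode"],
            "sampling": {k: policy[k] for k in sampling_keys if k in policy},
        }
    # Legacy layout (assumed): model under [model], sampling under [rollout.policy_config].
    return {
        "model": cfg["model"]["base"],
        "trainer_mode": cfg["model"].get("trainer_mode"),
        "sampling": cfg.get("rollout", {}).get("policy_config", {}),
    }

print(read_policy_settings("crafter_rl_stepwise_simple_NEW_FORMAT.toml"))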
examples/multi_step/configs/verilog_rl_lora.toml

@@ -1,40 +1,33 @@
- # Verilog RL experiment – LoRA training on Qwen3-0.6B
- #
- # This configuration adapts the Crafter RL setup for Verilog spec-to-RTL tasks.
- # Uses the same proven pipeline but optimized for 0.6B model and Verilog domain.
-
  [algorithm]
  type = "online"
  method = "policy_gradient"
  variety = "gspo"

  [services]
- # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-verilog`
  task_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
- # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
  judge_url = "https://synth-backend-dev-docker.onrender.com/api"

  [compute]
- gpu_type = "H200" # ✅ 8B model needs H200 for larger context window
- gpu_count = 2 # ✅ Minimum 2x GPUs (1 for vLLM inference + 1 for training)
+ gpu_type = "H200"
+ gpu_count = 2
  nodes = 1

  [topology]
  type = "single_node_split"
- gpus_for_vllm = 1 # ✅ vLLM for inference
- gpus_for_training = 1 # ✅ Training GPU (8B LoRA fits well)
+ gpus_for_vllm = 1
+ gpus_for_training = 1
  gpus_for_ref = 0
  tensor_parallel = 1

  [vllm]
  tensor_parallel_size = 1
- max_model_len = 24576 # ✅ Increased to 24K to accommodate long Verilog prompts (16K + 8K buffer for testbenches + history)
+ max_model_len = 24576

  [reference]
  placement = "none"

  [model]
- base = "Qwen/Qwen3-8B" # ✅ 8B model for RL training with good balance of speed and capability
+ base = "Qwen/Qwen3-8B"
  trainer_mode = "lora"
  label = "verilog-rl-lora-qwen8b"

@@ -42,38 +35,21 @@ label = "verilog-rl-lora-qwen8b"
  r = 16
  alpha = 32
  dropout = 0.05
- target_modules = ["all-linear"]
+ target_modules = [ "all-linear",]

  [rollout]
- env_name = "verilog" # ✅ Changed from "crafter" to "verilog"
- max_turns = 6 # ✅ More steps for compilation chains vs Crafter's 10
- episodes_per_batch = 4 # ✅ Good batch size for 8B model
+ env_name = "verilog"
+ max_turns = 6
+ episodes_per_batch = 4
  policy_name = "verilog-designer"
  max_concurrent_rollouts = 8
  batches_per_step = 2
- ops = ["agent", "env"]
-
- [rollout.env_config]
- # Verilog-specific environment settings
- difficulty = "medium" # Can be "easy", "medium", or "hard"
-
- [rollout.env_config.step_rewards]
- enabled = true
- mode = "decision_stepwise"
- strategy = "consistent"
- indicator_lambda = 0.5 # ✅ Reduced from Crafter (sparser rewards)
- step_beta = 0.0
-
- [rollout.policy_config]
- provider = "openai"
- model = "Qwen/Qwen3-8B" # ✅ Use the model being trained (8B) for rollouts
- temperature = 0.2
- max_tokens = 4096 # ✅ Balanced for Verilog generation while leaving room for long input prompts (testbenches + history)
+ ops = [ "agent", "env",]

  [evaluation]
  instances = 16
  every_n_iters = 10
- seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+ seeds = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,]

  [training]
  num_epochs = 1
@@ -81,110 +57,91 @@ iterations_per_epoch = 5
  gradient_accumulation_steps = 1
  max_accumulated_minibatch = 1
  max_turns = 15
- batch_size = 4 # ✅ Same as Crafter (works well for 8B LoRA)
+ batch_size = 4
  group_size = 4
- learning_rate = 5e-5 # ✅ Same as Crafter
+ learning_rate = 5e-5
  log_interval = 1
  weight_sync_interval = 1
  event_rewards_kind = "unique"
- async_semaphore_max = 20 # Max concurrent rollouts in streaming pipeline
-
- # Enable dense decision rewards in the trainer
+ async_semaphore_max = 20
  step_rewards_enabled = true
  step_rewards_mode = "decision_stepwise"
- step_rewards_indicator_lambda = 0.5 # ✅ Reduced for Verilog's sparser rewards
+ step_rewards_indicator_lambda = 0.5
  step_rewards_beta = 0.0
  step_rewards_strategy = "consistent"

+ [judge]
+ enabled = true
+
+ [rollout.env_config]
+ difficulty = "medium"
+
+ [rollout.policy_config]
+ provider = "openai"
+ model = "Qwen/Qwen3-8B"
+ temperature = 0.2
+ max_tokens = 4096
+
  [training.weight_sync]
  enable = true
- targets = ["policy"]
+ targets = [ "policy",]
  mode = "direct"
  direct = true
  verify_every_k = 0

- [rubric]
- enabled = true
+ [judge.reward_blend]
+ env = 0.3
+ event = 0.3
+ outcome = 0.4
+
+ [judge.options]
+ event = true
+ outcome = true
+ provider = "openai"
  model = "openai/gpt-oss-120b"
- api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
- api_key_env = "OPENAI_API_KEY"
+ rubric_id = "verilog/bundle@v1"
+ timeout_s = 45

- # Blend the hosted judge scores with environment returns
- [rubric.weights]
- env = 0.3 # ✅ Higher weight on env rewards for Verilog (vs Crafter's 0.2)
- event = 0.3 # ✅ Adjusted for Verilog's different reward structure
+ [rollout.env_config.step_rewards]
+ enabled = true
+ mode = "decision_stepwise"
+ strategy = "consistent"
+ indicator_lambda = 0.5
+ step_beta = 0.0
+
+ [judge.options.weights]
+ process = 0.1
+ reasoning = 0.2
+ progress = 0.3
  outcome = 0.4

- [rubric.event]
- # Verilog-specific event rubric for process efficiency
- rubric_id = "verilog/event@v1"
- criteria = [
- { key = "process.compilation_success", weight = 0.7, description = "Return 1.0 when compilation succeeds, 0.5 for partial success, 0.0 for failure", aggregation = "weighted_sum" },
- { key = "process.design_iterations", weight = 0.3, description = "Reward efficient design iterations without unnecessary recompilation", aggregation = "weighted_sum" },
- ]
-
- [rubric.outcome]
- # Verilog-specific outcome rubric for final results
- rubric_id = "verilog/outcome@v1"
- criteria = [
- { key = "outcome.tests_passed", weight = 0.8, description = "Full credit when all tests pass, partial for some tests", aggregation = "weighted_sum" },
- { key = "outcome.design_quality", weight = 0.2, description = "Code quality, documentation, and design efficiency", aggregation = "weighted_sum" },
- ]
-
- [judge]
- type = "groq"
- timeout_s = 45
+ [judge.options.rubric_overrides.event]
+ goal_text = " Evaluate each Verilog design decision for compilation success and process efficiency.\n High scores for successful compilation and strategic tool usage.\n Penalize unnecessary operations and compilation failures."
+ aggregation = "weighted_sum"
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "process.compilation_success"
+ weight = 0.7
+ scale = "bounded"
+ description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
+
+ [[judge.options.rubric_overrides.event.criteria]]
+ id = "process.design_iterations"
+ weight = 0.3
+ scale = "bounded"
+ description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
+
+ [judge.options.rubric_overrides.outcome]
+ goal_text = " Evaluate the final Verilog implementation for correctness and quality.\n High scores for working designs that pass all tests with good code quality."
+ aggregation = "weighted_sum"
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.tests_passed"
+ weight = 0.8
+ scale = "binary"
+ description = "Full credit when all tests pass, partial credit for some tests passing"
+
+ [[judge.options.rubric_overrides.outcome.criteria]]
+ id = "outcome.design_quality"
+ weight = 0.2
+ scale = "bounded"
+ description = "Code clarity, proper documentation, and efficient design patterns"

- [judge.options]
- event = true
- outcome = true
- provider = "openai"
- model = "openai/gpt-oss-120b"
- rubric_id = "verilog/bundle@v1"
- max_concurrency = 6
- tracks = ["process", "reasoning", "progress", "outcome"]
-
- [judge.options.rubric_overrides]
-
- [judge.options.rubric_overrides.event]
- goal_text = """
- Evaluate each Verilog design decision for compilation success and process efficiency.
- High scores for successful compilation and strategic tool usage.
- Penalize unnecessary operations and compilation failures."""
- aggregation = "weighted_sum"
-
- [[judge.options.rubric_overrides.event.criteria]]
- id = "process.compilation_success"
- weight = 0.7
- scale = "bounded"
- description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
-
- [[judge.options.rubric_overrides.event.criteria]]
- id = "process.design_iterations"
- weight = 0.3
- scale = "bounded"
- description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
-
- [judge.options.rubric_overrides.outcome]
- goal_text = """
- Evaluate the final Verilog implementation for correctness and quality.
- High scores for working designs that pass all tests with good code quality."""
- aggregation = "weighted_sum"
-
- [[judge.options.rubric_overrides.outcome.criteria]]
- id = "outcome.tests_passed"
- weight = 0.8
- scale = "binary"
- description = "Full credit when all tests pass, partial credit for some tests passing"
-
- [[judge.options.rubric_overrides.outcome.criteria]]
- id = "outcome.design_quality"
- weight = 0.2
- scale = "bounded"
- description = "Code clarity, proper documentation, and efficient design patterns"
-
- [judge.options.weights]
- process = 0.1
- reasoning = 0.2
- progress = 0.3
- outcome = 0.4
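The Verilog config above replaces the old [rubric.weights] blend with [judge.reward_blend] weights over env, event, and outcome reward sources (0.3 / 0.3 / 0.4 here). A minimal sketch of that weighted blend is shown below; the helper name and the example episode numbers are made up for illustration, and the actual blending happens inside the synth-ai trainer, not in this snippet.

import tomllib

def blend_episode_reward(path: str, env: float, event: float, outcome: float) -> float:
    """Sketch: combine reward sources using the [judge.reward_blend] weights."""
    with open(path, "rb") as f:
        blend = tomllib.load(f)["judge"]["reward_blend"]  # env = 0.3, event = 0.3, outcome = 0.4
    return blend["env"] * env + blend["event"] * event + blend["outcome"] * outcome

# Made-up per-episode numbers: raw env return, judge event score, judge outcome score.
print(blend_episode_reward("verilog_rl_lora.toml", env=2.0, event=0.6, outcome=0.8))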
examples/qwen_coder/configs/coder_lora_30b.toml

@@ -1,11 +1,9 @@
  # Qwen3 Coder 30B LoRA SFT – all-linear adapters

- type = "sft"
-
  [algorithm]
  type = "offline"
  method = "sft"
- variety = "lora"
+ variety = "qlora"

  [job]
  model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
examples/qwen_coder/configs/coder_lora_4b.toml

@@ -1,6 +1,9 @@
  # Qwen3 Coder 4B LoRA SFT – all-linear adapters

- type = "sft"
+ [algorithm]
+ type = "offline"
+ method = "sft"
+ variety = "qlora"

  [job]
  model = "Qwen/Qwen3-4B"
examples/qwen_coder/configs/coder_lora_small.toml

@@ -1,11 +1,9 @@
  # Qwen3 Coder LoRA SFT – all-linear adapters

- type = "sft"
-
  [algorithm]
  type = "offline"
  method = "sft"
- variety = "fft"
+ variety = "qlora"

  [job]
  # Smallest supported Qwen3 base; replace with the smallest Coder variant when available
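All three coder LoRA SFT configs drop the removed top-level type = "sft" key in favor of an [algorithm] table (type = "offline", method = "sft", variety = "qlora"). The sketch below checks a config for that shape; the validation rules are inferred from this diff rather than taken from synth-ai's own validators, and check_sft_config is a hypothetical helper.

import tomllib

def check_sft_config(path: str) -> None:
    """Sketch: flag configs still using the removed top-level `type = "sft"` key."""
    with open(path, "rb") as f:
        cfg = tomllib.load(f)
    if cfg.get("type") == "sft":
        raise ValueError(f"{path}: top-level 'type' was removed; declare an [algorithm] table instead")
    algo = cfg.get("algorithm", {})
    problems = [k for k, v in {"type": "offline", "method": "sft"}.items() if algo.get(k) != v]
    if problems:
        raise ValueError(f"{path}: [algorithm] is missing or mis-set: {problems}")
    print(f"{path}: ok (variety={algo.get('variety')})")

check_sft_config("coder_lora_4b.toml")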