synth-ai 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic.
- examples/analyze_semantic_words.sh +2 -2
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +25 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +42 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +41 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +5 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -4
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +66 -3
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +17 -49
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +13 -5
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +15 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +36 -5
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +134 -3
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +4 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +6 -3
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +5 -0
- synth_ai/api/train/builders.py +9 -3
- synth_ai/api/train/cli.py +125 -10
- synth_ai/api/train/configs/__init__.py +8 -1
- synth_ai/api/train/configs/rl.py +32 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/auth/credentials.py +119 -0
- synth_ai/cli/__init__.py +12 -4
- synth_ai/cli/commands/__init__.py +17 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/deploy/__init__.py +23 -0
- synth_ai/cli/commands/deploy/core.py +614 -0
- synth_ai/cli/commands/deploy/errors.py +72 -0
- synth_ai/cli/commands/deploy/validation.py +11 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1109 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +388 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +73 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +199 -0
- synth_ai/cli/commands/train/judge_validation.py +304 -0
- synth_ai/cli/commands/train/validation.py +443 -0
- synth_ai/cli/demo.py +2 -162
- synth_ai/cli/deploy/__init__.py +28 -0
- synth_ai/cli/deploy/core.py +5 -0
- synth_ai/cli/deploy/errors.py +23 -0
- synth_ai/cli/deploy/validation.py +5 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +58 -1487
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -11
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +469 -0
- synth_ai/streaming/streamer.py +301 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/validators.py +2 -2
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/utils/env.py +25 -18
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/modal.py +2 -2
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/METADATA +8 -3
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/RECORD +184 -109
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/top_level.txt +0 -0
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml

@@ -1,17 +1,10 @@
-# Crafter RL experiment – stepwise shaping with hosted judge rubrics
-#
-# This configuration extends the stepwise LoRA baseline by wiring the Synth judge
-# service so evaluation rolls combine dense step rewards with hosted rubric scoring.
-
 [algorithm]
 type = "online"
 method = "policy_gradient"
 variety = "gspo"
 
 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
-# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
 judge_url = "https://synth-backend-dev-docker.onrender.com/api"
 
 [compute]
@@ -41,7 +34,7 @@ label = "crafter-rl-stepwise-hosted-judge"
 r = 16
 alpha = 32
 dropout = 0.05
-target_modules = ["all-linear"]
+target_modules = [ "all-linear",]
 
 [rollout]
 env_name = "crafter"
@@ -50,27 +43,12 @@ episodes_per_batch = 2
 policy_name = "crafter-react"
 max_concurrent_rollouts = 8
 batches_per_step = 2
-ops = ["agent", "env"]
-
-[rollout.env_config]
-difficulty = "easy"
-
-[rollout.env_config.step_rewards]
-enabled = true
-mode = "decision_stepwise"
-strategy = "consistent" # +1 for each decision that unlocks a new achievement
-indicator_lambda = 1.0
-step_beta = 0.0
-
-[rollout.policy_config]
-temperature = 0.2
-top_p = 0.95
-max_tokens = 512
+ops = [ "agent", "env",]
 
 [evaluation]
 instances = 16
 every_n_iters = 10
-seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+seeds = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,]
 
 [training]
 num_epochs = 1
@@ -84,104 +62,84 @@ learning_rate = 5e-5
 log_interval = 1
 weight_sync_interval = 1
 event_rewards_kind = "unique"
-async_semaphore_max = 4
-
-# Enable dense decision rewards in the trainer to mirror env_config step rewards.
+async_semaphore_max = 4
 step_rewards_enabled = true
 step_rewards_mode = "decision_stepwise"
 step_rewards_indicator_lambda = 1.0
 step_rewards_beta = 0.0
 step_rewards_strategy = "consistent"
 
+[rubric]
+enabled = true
+
+[rollout.env_config]
+difficulty = "easy"
+
+[rollout.policy_config]
+temperature = 0.2
+top_p = 0.95
+max_tokens = 512
+
 [training.weight_sync]
 enable = true
-targets = ["policy"]
+targets = [ "policy",]
 mode = "direct"
 direct = true
 verify_every_k = 0
 
-[rubric]
-enabled = true
-model = "openai/gpt-oss-120b"
-api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
-api_key_env = "OPENAI_API_KEY"
-# Blend the hosted judge scores with environment returns inside the trainer.
 [rubric.weights]
 env = 0.2
 event = 0.4
 outcome = 0.4
 
-[
-
-
-
-
-
-]
-
-[rubric.outcome]
-# Hosted judge rubric for final trajectory scoring.
-rubric_id = "crafter/outcome@v1"
-criteria = [
-{ key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
-{ key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
-]
-
-[judge]
-type = "groq" # or "groq" when routing to Groq-hosted judges
+[judge.options]
+event = true
+outcome = true
+provider = "openai"
+model = "openai/gpt-oss-120b"
+rubric_id = "crafter/bundle@v1"
 timeout_s = 45
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-[[judge.options.rubric_overrides.outcome.criteria]]
-id = "outcome.achievement_depth"
-weight = 0.4
-scale = "bounded"
-description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
-
-[judge.options.weights]
-process = 0.05
-reasoning = 0.15
-progress = 0.30
-outcome = 0.50
+[rollout.env_config.step_rewards]
+enabled = true
+mode = "decision_stepwise"
+strategy = "consistent"
+indicator_lambda = 1.0
+step_beta = 0.0
+
+[judge.options.weights]
+process = 0.05
+reasoning = 0.15
+progress = 0.3
+outcome = 0.5
+
+[judge.options.rubric_overrides.event]
+goal_text = "Treat each decision as a check for new Crafter achievements.\nAward the top score only when the log shows a fresh achievement unlock or an immediately verifiable deterministic completion.\nKeep otherwise useful setup actions in a narrow low band so non-achievement turns stay near zero."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.event.criteria]]
+id = "progress.unique_achievements"
+weight = 0.9
+scale = "binary"
+description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0."
+
+[[judge.options.rubric_overrides.event.criteria]]
+id = "process.intent_alignment"
+weight = 0.1
+scale = "bounded"
+description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock."
+
+[judge.options.rubric_overrides.outcome]
+goal_text = "Summarise the episode outcome in relation to Crafter’s win condition:\nsurvive, accumulate resources, and craft advanced tools or structures.\nHighlight notable achievements, safety failures, and preparedness for future exploration."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.goal_completion"
+weight = 0.6
+scale = "binary"
+description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace)."
+
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.achievement_depth"
+weight = 0.4
+scale = "bounded"
+description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
+
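The hunk above replaces the old top-level [rubric]/[judge] blocks with a [judge.options] table (provider, model, rubric_id, per-track weights) while [rubric.weights] still blends the environment return with the judge's event and outcome scores. Below is a minimal sketch of how such a blend could be computed, assuming a simple normalised weighted average; the helper name and normalisation step are illustrative assumptions, not synth-ai's actual trainer code.

# Hypothetical sketch (not synth-ai's trainer): blending reward sources
# using weights shaped like the [rubric.weights] table above.
def blend_rewards(env_return: float, event_score: float, outcome_score: float,
                  weights: dict[str, float]) -> float:
    """Weighted blend of environment return and hosted-judge scores."""
    total = sum(weights.get(k, 0.0) for k in ("env", "event", "outcome"))
    if total <= 0:
        return env_return  # no blending configured: fall back to the env return
    return (
        weights.get("env", 0.0) * env_return
        + weights.get("event", 0.0) * event_score
        + weights.get("outcome", 0.0) * outcome_score
    ) / total

# With env=0.2, event=0.4, outcome=0.4 as in the config, an episode with
# env_return=1.0, event_score=0.5, outcome_score=0.75 blends to
# 0.2*1.0 + 0.4*0.5 + 0.4*0.75 = 0.70.
blended = blend_rewards(1.0, 0.5, 0.75, {"env": 0.2, "event": 0.4, "outcome": 0.4})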
The same one-line comment update appears in two of the other multi_step Crafter configs:

@@ -6,7 +6,7 @@ method = "policy_gradient"
 variety = "gspo"
 
 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
 
 [compute]

@@ -6,7 +6,7 @@ method = "policy_gradient"
 variety = "gspo"
 
 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
 
 [compute]
examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml (new file)

@@ -0,0 +1,105 @@
+# Crafter RL experiment – simple stepwise rewards (1 point per new achievement)
+# This config uses the NEW unified [policy] section format
+
+[algorithm]
+type = "online"
+method = "policy_gradient"
+variety = "gspo"
+
+[services]
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
+task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
+
+[compute]
+gpu_type = "H200"
+gpu_count = 2
+
+[compute.topology] # Nested: topology is part of compute
+type = "single_node_split"
+gpus_for_vllm = 1
+gpus_for_training = 1
+gpus_for_ref = 0
+tensor_parallel = 1
+reference_placement = "none" # Reference model placement
+
+[vllm]
+tensor_parallel_size = 1
+max_model_len = 8192
+
+[judge]
+enabled = false # Set to true to enable judge/rubric scoring
+
+# Uncomment to enable judge-based reward blending:
+# enabled = true
+# timeout_s = 45
+#
+# [judge.reward_blend] # How to blend env/event/outcome reward sources
+# env = 0.2
+# event = 0.4
+# outcome = 0.4
+#
+# [judge.options]
+# provider = "openai"
+# model = "openai/gpt-oss-120b"
+# event = true
+# outcome = true
+# max_concurrency = 6
+
+# NEW: Unified [policy] section - single source of truth for model and sampling
+[policy]
+model_name = "Qwen/Qwen3-4B"
+trainer_mode = "lora"
+label = "crafter-rl-stepwise-simple"
+
+# Sampling parameters for rollouts
+max_tokens = 512
+temperature = 0.6
+top_p = 0.95
+
+[rollout]
+env_name = "crafter"
+max_turns = 10
+episodes_per_batch = 4
+policy_name = "crafter-react"
+max_concurrent_rollouts = 8
+batches_per_step = 2
+ops = ["agent", "env"]
+
+[evaluation]
+instances = 10
+every_n_iters = 10
+seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+
+[training]
+num_epochs = 1
+iterations_per_epoch = 10
+gradient_accumulation_steps = 1
+max_accumulated_minibatch = 1
+max_turns = 10
+batch_size = 4
+group_size = 4
+learning_rate = 5e-5
+log_interval = 1
+weight_sync_interval = 1
+
+[training.rewards] # Nested: Reward config under training
+step_rewards_enabled = true
+step_rewards_mode = "decision_stepwise"
+step_rewards_indicator_lambda = 1.0
+step_rewards_beta = 0.0
+step_rewards_strategy = "consistent"
+event_rewards_kind = "unique"
+
+[training.lora] # Nested: LoRA config under training
+r = 16
+alpha = 32
+dropout = 0.05
+target_modules = ["all-linear"]
+
+[training.weight_sync]
+enable = true
+targets = ["policy"]
+mode = "direct"
+direct = true
+verify_every_k = 0
+
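The new-format file collects the model name and sampling parameters in a single [policy] table and nests reward, LoRA, and weight-sync settings under [training]. A minimal sketch of reading those nested sections with Python's standard tomllib follows; the file path is taken from the listing above, and this is illustrative only, not the `synth-ai train` config loader.

# Minimal sketch: load the new-format TOML with the standard library (Python 3.11+).
import tomllib

with open("examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml", "rb") as fh:
    cfg = tomllib.load(fh)

model = cfg["policy"]["model_name"]                    # "Qwen/Qwen3-4B" from the unified [policy] table
sampling = {k: cfg["policy"][k] for k in ("max_tokens", "temperature", "top_p")}
lora_rank = cfg["training"]["lora"]["r"]               # nested [training.lora]
step_rewards_on = cfg["training"]["rewards"]["step_rewards_enabled"]  # nested [training.rewards]
print(model, sampling, lora_rank, step_rewards_on)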
examples/multi_step/configs/verilog_rl_lora.toml

@@ -1,40 +1,33 @@
-# Verilog RL experiment – LoRA training on Qwen3-0.6B
-#
-# This configuration adapts the Crafter RL setup for Verilog spec-to-RTL tasks.
-# Uses the same proven pipeline but optimized for 0.6B model and Verilog domain.
-
 [algorithm]
 type = "online"
 method = "policy_gradient"
 variety = "gspo"
 
 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-verilog`
 task_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
-# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
 judge_url = "https://synth-backend-dev-docker.onrender.com/api"
 
 [compute]
-gpu_type = "H200"
-gpu_count = 2
+gpu_type = "H200"
+gpu_count = 2
 nodes = 1
 
 [topology]
 type = "single_node_split"
-gpus_for_vllm = 1
-gpus_for_training = 1
+gpus_for_vllm = 1
+gpus_for_training = 1
 gpus_for_ref = 0
 tensor_parallel = 1
 
 [vllm]
 tensor_parallel_size = 1
-max_model_len = 24576
+max_model_len = 24576
 
 [reference]
 placement = "none"
 
 [model]
-base = "Qwen/Qwen3-8B"
+base = "Qwen/Qwen3-8B"
 trainer_mode = "lora"
 label = "verilog-rl-lora-qwen8b"
 
@@ -42,38 +35,21 @@ label = "verilog-rl-lora-qwen8b"
 r = 16
 alpha = 32
 dropout = 0.05
-target_modules = ["all-linear"]
+target_modules = [ "all-linear",]
 
 [rollout]
-env_name = "verilog"
-max_turns = 6
-episodes_per_batch = 4
+env_name = "verilog"
+max_turns = 6
+episodes_per_batch = 4
 policy_name = "verilog-designer"
 max_concurrent_rollouts = 8
 batches_per_step = 2
-ops = ["agent", "env"]
-
-[rollout.env_config]
-# Verilog-specific environment settings
-difficulty = "medium" # Can be "easy", "medium", or "hard"
-
-[rollout.env_config.step_rewards]
-enabled = true
-mode = "decision_stepwise"
-strategy = "consistent"
-indicator_lambda = 0.5 # ✅ Reduced from Crafter (sparser rewards)
-step_beta = 0.0
-
-[rollout.policy_config]
-provider = "openai"
-model = "Qwen/Qwen3-8B" # ✅ Use the model being trained (8B) for rollouts
-temperature = 0.2
-max_tokens = 4096 # ✅ Balanced for Verilog generation while leaving room for long input prompts (testbenches + history)
+ops = [ "agent", "env",]
 
 [evaluation]
 instances = 16
 every_n_iters = 10
-seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+seeds = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,]
 
 [training]
 num_epochs = 1
@@ -81,110 +57,91 @@ iterations_per_epoch = 5
 gradient_accumulation_steps = 1
 max_accumulated_minibatch = 1
 max_turns = 15
-batch_size = 4
+batch_size = 4
 group_size = 4
-learning_rate = 5e-5
+learning_rate = 5e-5
 log_interval = 1
 weight_sync_interval = 1
 event_rewards_kind = "unique"
-async_semaphore_max = 20
-
-# Enable dense decision rewards in the trainer
+async_semaphore_max = 20
 step_rewards_enabled = true
 step_rewards_mode = "decision_stepwise"
-step_rewards_indicator_lambda = 0.5
+step_rewards_indicator_lambda = 0.5
 step_rewards_beta = 0.0
 step_rewards_strategy = "consistent"
 
+[judge]
+enabled = true
+
+[rollout.env_config]
+difficulty = "medium"
+
+[rollout.policy_config]
+provider = "openai"
+model = "Qwen/Qwen3-8B"
+temperature = 0.2
+max_tokens = 4096
+
 [training.weight_sync]
 enable = true
-targets = ["policy"]
+targets = [ "policy",]
 mode = "direct"
 direct = true
 verify_every_k = 0
 
-[
-
+[judge.reward_blend]
+env = 0.3
+event = 0.3
+outcome = 0.4
+
+[judge.options]
+event = true
+outcome = true
+provider = "openai"
 model = "openai/gpt-oss-120b"
-
-
+rubric_id = "verilog/bundle@v1"
+timeout_s = 45
 
-
-
-
-
+[rollout.env_config.step_rewards]
+enabled = true
+mode = "decision_stepwise"
+strategy = "consistent"
+indicator_lambda = 0.5
+step_beta = 0.0
+
+[judge.options.weights]
+process = 0.1
+reasoning = 0.2
+progress = 0.3
 outcome = 0.4
 
-[
-
-
-criteria
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+[judge.options.rubric_overrides.event]
+goal_text = " Evaluate each Verilog design decision for compilation success and process efficiency.\n High scores for successful compilation and strategic tool usage.\n Penalize unnecessary operations and compilation failures."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.event.criteria]]
+id = "process.compilation_success"
+weight = 0.7
+scale = "bounded"
+description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
+
+[[judge.options.rubric_overrides.event.criteria]]
+id = "process.design_iterations"
+weight = 0.3
+scale = "bounded"
+description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
+
+[judge.options.rubric_overrides.outcome]
+goal_text = " Evaluate the final Verilog implementation for correctness and quality.\n High scores for working designs that pass all tests with good code quality."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.tests_passed"
+weight = 0.8
+scale = "binary"
+description = "Full credit when all tests pass, partial credit for some tests passing"
+
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.design_quality"
+weight = 0.2
+scale = "bounded"
+description = "Code clarity, proper documentation, and efficient design patterns"
 
-[judge.options]
-event = true
-outcome = true
-provider = "openai"
-model = "openai/gpt-oss-120b"
-rubric_id = "verilog/bundle@v1"
-max_concurrency = 6
-tracks = ["process", "reasoning", "progress", "outcome"]
-
-[judge.options.rubric_overrides]
-
-[judge.options.rubric_overrides.event]
-goal_text = """
-Evaluate each Verilog design decision for compilation success and process efficiency.
-High scores for successful compilation and strategic tool usage.
-Penalize unnecessary operations and compilation failures."""
-aggregation = "weighted_sum"
-
-[[judge.options.rubric_overrides.event.criteria]]
-id = "process.compilation_success"
-weight = 0.7
-scale = "bounded"
-description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
-
-[[judge.options.rubric_overrides.event.criteria]]
-id = "process.design_iterations"
-weight = 0.3
-scale = "bounded"
-description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
-
-[judge.options.rubric_overrides.outcome]
-goal_text = """
-Evaluate the final Verilog implementation for correctness and quality.
-High scores for working designs that pass all tests with good code quality."""
-aggregation = "weighted_sum"
-
-[[judge.options.rubric_overrides.outcome.criteria]]
-id = "outcome.tests_passed"
-weight = 0.8
-scale = "binary"
-description = "Full credit when all tests pass, partial credit for some tests passing"
-
-[[judge.options.rubric_overrides.outcome.criteria]]
-id = "outcome.design_quality"
-weight = 0.2
-scale = "bounded"
-description = "Code clarity, proper documentation, and efficient design patterns"
-
-[judge.options.weights]
-process = 0.1
-reasoning = 0.2
-progress = 0.3
-outcome = 0.4