synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +4 -4
- examples/sft/export_dataset.py +7 -4
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +1 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +2 -8
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +145 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +66 -49
- synth_ai/cli/_modal_wrapper.py +9 -6
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +1 -0
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +392 -141
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +62 -0
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +5 -2
- synth_ai/task/config.py +259 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +4 -2
- synth_ai/task/rubrics/loaders.py +27 -4
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +145 -2
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/session_tracer.py +10 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +108 -77
- synth_ai/tracing_v3/utils.py +1 -1
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +911 -0
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
examples/README.md
ADDED
@@ -0,0 +1 @@
### The instructions for how to create and configure a task app are documented at https://docs.usesynth.ai/sdk/task-apps

examples/multi_step/SFT_README.md
ADDED
@@ -0,0 +1,147 @@
# SFT Training for Qwen3-Coder-30B with LoRA

Supervised Fine-Tuning configuration for the same 30B MoE model used in RL training.

## Configuration Overview

**Model:** `Qwen/Qwen3-Coder-30B-A3B-Instruct` (Mixture of Experts)

**Hardware:** 4x H200 GPUs (561GB total VRAM)

**Parallelism Strategy:**
- **Tensor Parallel (TP)**: 2 GPUs - Splits the model across 2 GPUs for inference/forward pass
- **Data Parallel (DP)**: 2 GPUs - Splits batches across 2 GPUs for training throughput

**LoRA Configuration:**
- Rank (r): 16
- Alpha: 32
- Dropout: 0.05
- Target modules: `["all-linear"]` - Applies LoRA to all linear layers

## Memory Breakdown per GPU

With 4x H200 (141GB each):

**Model Split (TP=2):**
- 2 GPUs hold the base model (70GB each)
- ~70GB free per GPU for activations and gradients

**Training (DP=2):**
- 2 GPUs process different batches
- LoRA adapters: ~5-10GB per GPU
- Gradients/optimizer states: ~20-30GB per GPU
- **Total per training GPU: ~50-60GB** ✅

## Quick Start

### 1. Prepare Your Dataset

Your dataset should be in JSONL format with conversation turns:

```jsonl
{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```
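
The records above elide the real message text. Before launching a job it can be worth a quick local sanity check that every line parses and follows the `messages`/`role`/`content` shape; the following standalone sketch is not part of this diff, and anything beyond those field names (the path argument, the role set, the string-content assumption) is an assumption:

```python
import json
import sys

def check_sft_jsonl(path: str) -> None:
    """Validate that each JSONL line holds a well-formed `messages` list."""
    valid_roles = {"system", "user", "assistant"}
    with open(path, encoding="utf-8") as handle:
        for lineno, line in enumerate(handle, start=1):
            record = json.loads(line)
            messages = record.get("messages")
            assert isinstance(messages, list) and messages, f"line {lineno}: missing messages"
            for msg in messages:
                assert msg.get("role") in valid_roles, f"line {lineno}: bad role {msg.get('role')!r}"
                assert isinstance(msg.get("content"), str), f"line {lineno}: content should be a string"
            # SFT needs at least one assistant turn to supervise on.
            assert any(m["role"] == "assistant" for m in messages), f"line {lineno}: no assistant turn"
    print(f"{path}: dataset looks well-formed")

if __name__ == "__main__":
    check_sft_jsonl(sys.argv[1])
```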

### 2. Run Training

```bash
# Using the helper script
./examples/multi_step/run_sft_qwen30b.sh path/to/your/dataset.jsonl

# Or directly with synth-ai CLI
uvx synth-ai train \
  --type sft \
  --config examples/multi_step/configs/crafter_sft_qwen30b_lora.toml \
  --dataset path/to/your/dataset.jsonl \
  --env-file backend/.env.dev
```

### 3. Monitor Training

Check the Synth dashboard for:
- Training loss curve
- Validation metrics (if validation set provided)
- GPU utilization
- Training throughput (tokens/sec)

## Hyperparameters

**Batch Configuration:**
- Per-device batch size: 1
- Gradient accumulation: 64 steps
- **Effective global batch size: 128** (1 × 64 × 2 GPUs)

**Learning Rate:**
- Initial LR: 5e-6
- Warmup ratio: 3%
- Schedule: Linear decay

**Sequence Length:** 4096 tokens

**Training:**
- Epochs: 1
- Mixed precision: BF16
- DeepSpeed: Stage 2 (optimizer state sharding)
- Activation checkpointing: Enabled

## Configuration File Structure

```toml
[algorithm]
type = "offline"    # Supervised (not RL)
method = "sft"      # Supervised fine-tuning
variety = "lora"    # Using LoRA adapters

[compute]
gpu_type = "H200"
gpu_count = 4

[data.topology]
tensor_parallel = 2  # Split model across 2 GPUs
data_parallel = 2    # Split batches across 2 GPUs

[training]
mode = "lora"
use_qlora = true     # Quantized LoRA (4-bit base model)

[lora]
r = 16               # LoRA rank
alpha = 32           # LoRA scaling
dropout = 0.05
target_modules = ["all-linear"]  # Apply to all linear layers
```
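
The "Effective global batch size: 128" quoted under Hyperparameters follows directly from per-device batch size × gradient accumulation × data-parallel degree. A minimal check, not part of this diff (the `[data.topology]` snippet is quoted from above, while the per-device and accumulation values come from the README prose since the snippet omits those keys):

```python
import tomllib  # Python 3.11+ standard library

# Only the topology section shown in the config snippet above.
config = tomllib.loads("""
[data.topology]
tensor_parallel = 2
data_parallel = 2
""")

per_device_batch = 1        # from the README prose
gradient_accumulation = 64  # from the README prose
data_parallel = config["data"]["topology"]["data_parallel"]

effective_batch = per_device_batch * gradient_accumulation * data_parallel
print(effective_batch)  # 128, matching the effective global batch size above
```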

## Comparison with RL Config

| Aspect | SFT | RL |
|--------|-----|-----|
| Purpose | Supervised learning | Reinforcement learning |
| Data | Labeled examples | Environment interactions |
| Topology | TP=2, DP=2 | Split: 2 inference + 2 training |
| Batch size | 128 (effective) | Variable (episode-based) |
| Training | Standard backprop | Policy gradient (GSPO) |

## Tips

1. **Start Small:** Test with a small dataset first to verify the pipeline
2. **Validation:** Add a validation set to monitor overfitting
3. **Checkpointing:** Training saves checkpoints every 100 steps
4. **Resume:** Can resume from checkpoint if training is interrupted
5. **Inference:** After training, use the LoRA adapter with the base model

## Output

After training completes, you'll get:
- LoRA adapter weights (saved to volume)
- Training metrics and logs
- Best checkpoint (based on validation loss)
- Model ready for inference or RL initialization

## Next Steps

1. **Evaluate:** Test your fine-tuned model on held-out data
2. **RL Training:** Use this as initialization for RL (`init_from_sft = true`)
3. **Deploy:** Load LoRA adapter for inference
4. **Iterate:** Adjust hyperparameters based on performance

examples/multi_step/configs/README_verilog_rl.md
ADDED
@@ -0,0 +1,77 @@
# Verilog RL with LoRA (Qwen3-0.6B)

## Quick Start

1. **Deploy Verilog Task App**:
   ```bash
   cd synth-ai
   uvx synth-ai modal-serve grpo-verilog
   ```
   Note the Modal URL and update `task_url` in `verilog_rl_lora.toml`.

2. **Run Training**:
   ```bash
   uvx synth-ai rl run --config examples/multi_step/configs/verilog_rl_lora.toml
   ```

## Configuration Overview

### **Key Adaptations from Crafter**:

- **Model**: `Qwen/Qwen3-0.6B` (✅ proven in SFT configs)
- **Environment**: `verilog` instead of `crafter`
- **Steps**: 15 turns (vs Crafter's 10) for compilation workflows
- **Rewards**: Adjusted for sparser Verilog rewards (0.5 vs 1.0 indicator_lambda)
- **Rubrics**: Verilog-specific judging criteria

### **Hardware Requirements** (Standard RL setup):
- ✅ **2x H100 GPUs** (vLLM inference + LoRA training split)
- ✅ **No tensor parallelism** needed for 0.6B model
- ✅ **4x faster inference** than 32B model
- ✅ **Same compute pattern** as Crafter (just smaller model)

### **Expected Workflow**:
1. Agent writes Verilog code (`write_file`)
2. Compiles to check syntax (`compile`)
3. Simulates to verify behavior (`simulate`)
4. Submits if tests pass (`submit`)
5. **Rewards**: +1.0 for compilation success, +10.0 for passing tests

## Rubric Design

### **Event Rewards** (per decision):
- **Compilation Success**: 70% weight (1.0 for success, 0.0 for errors)
- **Process Efficiency**: 30% weight (penalizes redundant operations)

### **Outcome Rewards** (final score):
- **Tests Passed**: 80% weight (full credit when all tests pass)
- **Design Quality**: 20% weight (code clarity, documentation)
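
The percentages above suggest each rubric blends its criteria as a weighted sum; the README does not spell out the aggregation, so the sketch below is an assumption rather than the package's implementation, and the criterion scores are placeholders:

```python
def blend(scores: dict[str, float], weights: dict[str, float]) -> float:
    """Weighted sum of criterion scores, each assumed to lie in [0, 1]."""
    return sum(weights[name] * scores[name] for name in weights)

# Placeholder scores for one decision (event) and one episode (outcome).
event_score = blend(
    {"compilation_success": 1.0, "process_efficiency": 0.5},
    {"compilation_success": 0.7, "process_efficiency": 0.3},
)
outcome_score = blend(
    {"tests_passed": 1.0, "design_quality": 0.8},
    {"tests_passed": 0.8, "design_quality": 0.2},
)
print(event_score, outcome_score)  # 0.85 0.96
```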

## Troubleshooting

### **If training fails**:
1. Check Modal URL in `task_url` field
2. Verify `GROQ_API_KEY` for inference
3. Ensure `OPENAI_API_KEY` for judging

### **Memory issues** (unlikely with 0.6B):
- Reduce `batch_size` to 2
- Set `gradient_accumulation_steps = 2`
- Verify 2x GPU split is working (vLLM on GPU 0, training on GPU 1)

### **Slow training**:
- Increase `episodes_per_batch` to 6-8
- Check network latency to Modal task app

## Expected Results

- **Convergence**: Should learn basic compilation workflow in 1-2 hours
- **Success Rate**: 20-40% initial test pass rate (improves with training)
- **Learning**: Agent learns to debug compilation errors and write correct Verilog

## Next Steps

1. **Monitor reward progression** in training logs
2. **Adjust rubrics** if agent struggles with compilation errors
3. **Scale to 8B model** once 0.6B baseline works
4. **Add domain-specific fine-tuning** for Verilog syntax

examples/multi_step/configs/VERILOG_REWARDS.md
ADDED
@@ -0,0 +1,90 @@
# Verilog Reward Structure (Normalized to 1.0)

## Overview
All rewards in the Verilog task app are normalized so the maximum possible reward is **1.0**.

## Reward Components

### 1. Step Penalty: **-0.001** per step
- Applied to every action taken
- Encourages efficient solutions
- Normalized from `-0.01` (original)

### 2. Compile Success: **+0.01**
- Awarded when `iverilog` compilation succeeds (returncode 0)
- Validates syntax correctness
- Normalized from `+0.1` (original)

### 3. Simulation Pass: **+0.1**
- Awarded when `vvp` simulation passes all tests
- Validates behavioral correctness
- Normalized from `+1.0` (original)

### 4. Submit Success: **+1.0** (maximum reward)
- Awarded when final submission passes all verification tests
- This is the goal state
- Normalized from `+10.0` (original)

## Typical Reward Trajectories

### ✅ Optimal Path (3 steps)
```
Step 1: write_file → -0.001
Step 2: compile (success) → +0.01 - 0.001 = +0.009
Step 3: simulate (pass) → +0.1 - 0.001 = +0.099
Total: ~0.107
```

### ✅ Good Path (4 steps with submit)
```
Step 1: write_file → -0.001
Step 2: compile (success) → +0.009
Step 3: simulate (pass) → +0.099
Step 4: submit (success) → +1.0 - 0.001 = +0.999
Total: ~1.106
```
*Note: Can exceed 1.0 if intermediate rewards stack with final submit*

### ❌ Failure Path (compilation errors)
```
Step 1: write_file → -0.001
Step 2: compile (fail) → -0.001
Step 3: write_file (fix) → -0.001
Step 4: compile (success) → +0.009
Step 5: simulate (pass) → +0.099
Total: ~0.105
```
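
The episode totals in the three trajectories above follow mechanically from the four normalized components. A small standalone check, not part of this diff, that reproduces them:

```python
# Normalized reward components listed above.
STEP_PENALTY = -0.001
COMPILE_SUCCESS = 0.01
SIMULATE_PASS = 0.1
SUBMIT_SUCCESS = 1.0

def episode_return(event_rewards: list[float]) -> float:
    """Sum the event rewards plus one step penalty per action taken."""
    return sum(event_rewards) + STEP_PENALTY * len(event_rewards)

optimal = episode_return([0.0, COMPILE_SUCCESS, SIMULATE_PASS])               # write, compile, simulate
good = episode_return([0.0, COMPILE_SUCCESS, SIMULATE_PASS, SUBMIT_SUCCESS])  # ... then submit
failure = episode_return([0.0, 0.0, 0.0, COMPILE_SUCCESS, SIMULATE_PASS])     # failed compile, rewrite, retry

print(round(optimal, 3), round(good, 3), round(failure, 3))  # 0.107 1.106 0.105
```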

## Implementation Details

### Location
- **Reward components**: `synth_ai/environments/examples/verilog/engine.py`
  - `VerilogCompileSuccessComponent`: +0.01
  - `VerilogSimulationPassComponent`: +0.1
  - `VerilogSubmitSuccessComponent`: +1.0
  - `VerilogStepPenaltyComponent`: -0.001

### Normalization Ratio
All rewards were divided by **10.0** to normalize:
- Original max: ~10.0
- Normalized max: ~1.0
- Ratio: 10.0

## Why Normalize?

1. **Consistency**: Makes it easier to compare rewards across different task types
2. **RL Training**: Standard reward scales improve learning stability
3. **Interpretability**: Rewards as percentages (0.0 to 1.0) are intuitive
4. **Judge Compatibility**: Rubric scores typically range 0-1, making blending easier

## Testing
```bash
# Run eval to verify normalized rewards
uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
```

Expected output for successful rollout:
- `mean_return` ≈ 0.1 (if only compile+simulate)
- `mean_return` ≈ 1.0+ (if full submit success)

examples/multi_step/configs/VERILOG_RL_CHECKLIST.md
ADDED
@@ -0,0 +1,183 @@
# Verilog Task App - RL Training Readiness Checklist

## ✅ Core Requirements

### 1. Reward Normalization
- ✅ **Max reward = 1.0**: All rewards scaled to `[0, 1]` range
- ✅ **Step penalty**: `-0.001` (normalized from `-0.01`)
- ✅ **Compile success**: `+0.01` (normalized from `+0.1`)
- ✅ **Simulate pass**: `+0.1` (normalized from `+1.0`)
- ✅ **Submit success**: `+1.0` (normalized from `+10.0`)

### 2. Inference URL Handling (Critical for Trace Correlation)
- ✅ **Extracts from policy config**: Uses `policy_config.get("inference_url")` as primary source
- ✅ **Includes in trajectory**: Sets `trajectory.inference_url` with `?cid=...` parameter
- ✅ **Includes in final.info**: Adds to `final["info"]["inference_url"]`
- ✅ **Includes in pipeline_metadata**: Top-level `inference_url` field for trainer extraction
- ✅ **Logs cid presence**: Logs `has_cid` flag for debugging
- ✅ **Fallback to agent.inference_url**: Uses agent's URL if policy config missing (eval mode)

**Location**: `grpo_verilog.py` lines 829-867, 887-908

### 3. Pipeline Metadata
- ✅ **Required fields present**:
  - `reward_score`: Final episode reward
  - `policy_id`: Policy identifier
  - `inference_url`: **CRITICAL** - Contains `?cid=trace_xxxxx` for correlation
  - `env_name`: Environment identifier
  - `task_id`: Problem identifier
  - `task_split`: Dataset split (train/val/test)
- ✅ **Inference details**: Provider, model, URL in nested `inference` dict

**Location**: `grpo_verilog.py` lines 887-908

### 4. Trace Correlation (Required for RL Training)
- ✅ **Trainer injects cid**: Trainer adds `?cid=trace_xxxxx` to `policy_config["inference_url"]`
- ✅ **Task app preserves cid**: Uses `policy_config["inference_url"]` directly
- ✅ **Trainer extracts cid**: Extracts from `trajectory.inference_url` using `inference_url_to_trace_correlation_id()`
- ✅ **Trace hydration**: Trainer queries trace store with extracted `trace_correlation_id`

**Flow**:
```
Trainer → policy_config["inference_url"] = "http://...?cid=trace_xxxxx"
    ↓
Task App → trajectory.inference_url = policy_config["inference_url"]
    ↓
Trainer → extract_trace_correlation_id(trajectory.inference_url)
    ↓
Trainer → trace_store.resolve_correlation(trace_correlation_id)
    ↓
Trainer → Hydrate v3 trace with event_history
    ↓
Judge → Score using full trace
```
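
Everything downstream of the task app depends on the `?cid=...` query parameter surviving that round trip. For illustration only (not part of this diff), the extraction step can be reproduced with the standard library; the actual helper in the package is `inference_url_to_trace_correlation_id` from `synth_envs_hosted.utils`, shown in Test 3 below:

```python
from urllib.parse import parse_qs, urlparse

def extract_cid(inference_url: str) -> str | None:
    """Return the trace correlation id carried in the ?cid=... query parameter, if any."""
    query = parse_qs(urlparse(inference_url).query)
    values = query.get("cid")
    return values[0] if values else None

assert extract_cid("http://localhost:8000/v1/chat/completions?cid=trace_abc123") == "trace_abc123"
assert extract_cid("https://api.groq.com/openai/v1/chat/completions") is None  # eval mode: no cid
```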

### 5. Response Contract Compliance
- ✅ **RolloutResponse fields**:
  - `run_id`: Unique identifier
  - `trajectories`: List of trajectories (with `inference_url`)
  - `metrics`: Episode metrics
  - `pipeline_metadata`: **CRITICAL** - Contains `inference_url` and `reward_score`
  - `trace_correlation_id`: Optional (trainer infers from `inference_url`)
- ✅ **Optional trace_correlation_id**: Made optional in `contracts.py` (trainer infers from URL)

**Location**: `synth_ai/task/contracts.py` line 156

### 6. Environment Implementation
- ✅ **Stateful engine**: `VerilogEngine` extends `StatefulEngine`
- ✅ **Reward stack**: Properly configured with normalized components
- ✅ **State management**: `VerilogPublicState` and `VerilogPrivateState`
- ✅ **Tool implementation**: All 4 tools (write_file, compile, simulate, submit)

**Location**: `synth_ai/environments/examples/verilog/engine.py`

### 7. LLM Agent Integration
- ✅ **Multi-turn support**: Agent maintains conversation history
- ✅ **Tool parsing**: Extracts tool calls from LLM responses
- ✅ **Guidance system**: Provides context-aware hints
- ✅ **Error handling**: Graceful fallback for malformed responses

**Location**: `grpo_verilog.py` lines 200-530

## 🔍 Verification Tests

### Test 1: Eval Mode (No Trace Correlation)
```bash
uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
```
**Expected**:
- ✅ `mean_return` ≈ 0.1 (normalized rewards)
- ✅ `inference_url` = Groq API URL (no `?cid=...`)
- ✅ `task_completed` = True for correct solutions

### Test 2: RL Training Mode (With Trace Correlation)
```bash
uvx synth-ai train \
  --type rl \
  --config examples/multi_step/configs/verilog_rl_lora.toml \
  --task-url https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run \
  --backend https://synth-backend-dev-docker.onrender.com/api \
  --env-file /path/to/verilog/.env
```
**Expected**:
- ✅ Trainer logs show `inference_url` with `?cid=trace_xxxxx`
- ✅ Task app logs show `has_cid=True`
- ✅ Trace hydration succeeds (no `404 Not Found` errors)
- ✅ Judge receives full `event_history`
- ✅ Training updates show non-zero rewards

### Test 3: Trace Correlation ID Extraction
```python
from synth_envs_hosted.utils import inference_url_to_trace_correlation_id

# Should extract trace_xxxxx from URL
url = "http://localhost:8000/v1/chat/completions?cid=trace_abc123"
cid = inference_url_to_trace_correlation_id(url)
assert cid == "trace_abc123"
```

### Test 4: Pipeline Metadata Structure
```python
# Verify response has correct structure for RL
response = await task_app.rollout(request)
assert "pipeline_metadata" in response
assert "inference_url" in response.pipeline_metadata
assert "reward_score" in response.pipeline_metadata
assert len(response.trajectories) > 0
assert response.trajectories[0].inference_url is not None
```

## 📋 Deployment Checklist

### Modal Deployment
1. ✅ **Environment variables set**:
   - `GROQ_API_KEY`
   - `VERILOG_INFERENCE_URL` (optional, uses Groq default)
2. ✅ **Secrets configured**: Groq API key in Modal secrets
3. ✅ **Task app URL**: Update in `verilog_rl_lora.toml`

### Training Configuration
1. ✅ **2x GPUs minimum**: 1 for vLLM, 1 for training
2. ✅ **Model size**: `Qwen/Qwen3-0.6B` for testing
3. ✅ **Batch size**: 4 (matches Crafter)
4. ✅ **Max turns**: 15 (enough for compile chains)
5. ✅ **Rubric enabled**: `rubric.enabled = true`

## 🚨 Common Issues & Fixes

### Issue 1: `trace_correlation_id` Missing
**Symptom**: Trainer logs `FATAL: Rollout payload missing 'trace_correlation_id'`
**Fix**: Verify `trajectory.inference_url` contains `?cid=...` parameter

### Issue 2: Trace Hydration Fails (404)
**Symptom**: `404 Not Found` when querying `/trace/by-correlation/...`
**Fix**:
- Check inference server is capturing traces
- Verify `cid` parameter is in inference URL
- Ensure `vllm_public_url` is set correctly

### Issue 3: Rewards Not Normalized
**Symptom**: `mean_return` > 1.0 in eval
**Fix**: Verify all reward components in `engine.py` are scaled by 10x

### Issue 4: Agent Gets Stuck
**Symptom**: Agent repeats same action (e.g., compile without fixing)
**Fix**: Check guidance system is providing proper hints

## 🎯 Final Verification

Before starting RL training, verify:
- [ ] Eval runs successfully with normalized rewards (≈ 0.1)
- [ ] Modal deployment returns proper `inference_url` structure
- [ ] Trace correlation ID extraction works
- [ ] Pipeline metadata includes all required fields
- [ ] Response contract matches expected schema

**If all checks pass**: ✅ **Ready for RL training!**

## 📚 Related Documentation
- [VERILOG_REWARDS.md](./VERILOG_REWARDS.md) - Reward structure details
- [verilog_rl_lora.md](../verilog_rl_lora.md) - RL/LoRA feasibility analysis
- [verilog_rl_lora.toml](./verilog_rl_lora.toml) - Training configuration

examples/multi_step/configs/crafter_eval_synth_qwen4b.toml
ADDED
@@ -0,0 +1,35 @@
# Crafter eval using Synth backend with Qwen3-4B

[eval]
app_id = "grpo-crafter-task-app"
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
model = "Qwen/Qwen3-4B"
seeds = [0, 1, 2]
max_turns = 10
concurrency = 1
env_name = "crafter"
policy_name = "crafter-react"
trace_format = "full"
return_trace = true

[eval.env_config]
env_params = {max_steps_per_episode = 10}

[eval.policy_config]
provider = "openai"
model = "Qwen/Qwen3-4B"
inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
temperature = 0.6
top_p = 0.95
max_tokens = 512
use_vision = false
image_only_mode = false
max_llm_calls = 10

[eval.judge]
path = "examples/multi_step/judges/crafter_backend_judge.py"
name = "Backend"
backend_url = "http://localhost:8000/api"
model = "openai/gpt-oss-120b"
timeout_s = 45

examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml
ADDED
@@ -0,0 +1,36 @@
# Evaluation config for Crafter with text-only input
# This config uses Groq Qwen with only text observations (no images)

[eval]
app_id = "grpo-crafter-task-app"
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
model = "qwen/qwen3-32b"
seeds = [0, 1, 2]
max_turns = 10
concurrency = 1
env_name = "crafter"
policy_name = "crafter-react"
trace_format = "full"
return_trace = true

[eval.env_config]
env_params = {max_steps_per_episode = 10}

[eval.policy_config]
provider = "groq"
model = "qwen/qwen3-32b"
inference_url = "https://api.groq.com/openai/v1/chat/completions"
temperature = 0.6
top_p = 0.95
max_tokens = 512
use_vision = false
image_only_mode = false
max_llm_calls = 10

[eval.judge]
path = "examples/multi_step/judges/crafter_backend_judge.py"
name = "Backend"
backend_url = "http://localhost:8000/api"
model = "openai/gpt-oss-120b"
timeout_s = 45

examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml
@@ -16,24 +16,24 @@ judge_url = "https://synth-backend-dev-docker.onrender.com/api"
 
 [compute]
 gpu_type = "H200"
-gpu_count =
+gpu_count = 4
 
 [topology]
 type = "single_node_split"
-gpus_for_vllm =
-gpus_for_training =
+gpus_for_vllm = 2
+gpus_for_training = 2
 gpus_for_ref = 0
-tensor_parallel =
+tensor_parallel = 2
 
 [vllm]
-tensor_parallel_size =
-max_model_len =
+tensor_parallel_size = 2
+max_model_len = 4096
 
 [reference]
 placement = "none"
 
 [model]
-base = "Qwen/Qwen3-
+base = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
 trainer_mode = "lora"
 label = "crafter-rl-stepwise-hosted-judge"
 
@@ -46,7 +46,7 @@ target_modules = ["all-linear"]
 [rollout]
 env_name = "crafter"
 max_turns = 10
-episodes_per_batch =
+episodes_per_batch = 2
 policy_name = "crafter-react"
 max_concurrent_rollouts = 8
 batches_per_step = 2
@@ -69,12 +69,12 @@ ops = ["agent", "env"]
 
 [evaluation]
 instances = 16
-every_n_iters =
+every_n_iters = 10
 seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
 
 [training]
 num_epochs = 1
-iterations_per_epoch =
+iterations_per_epoch = 5
 gradient_accumulation_steps = 1
 max_accumulated_minibatch = 1
 max_turns = 10
@@ -84,6 +84,7 @@ learning_rate = 5e-5
 log_interval = 1
 weight_sync_interval = 1
 event_rewards_kind = "unique"
+async_semaphore_max = 4  # Max concurrent rollouts in streaming pipeline
 
 # Enable dense decision rewards in the trainer to mirror env_config step rewards.
 step_rewards_enabled = true
@@ -127,7 +128,7 @@ criteria = [
 ]
 
 [judge]
-type = "
+type = "groq"  # or "groq" when routing to Groq-hosted judges
 timeout_s = 45
 
 [judge.options]