synth-ai 0.2.14__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic.
- examples/README.md +1 -0
- examples/analyze_semantic_words.sh +2 -2
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +25 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +42 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +41 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +73 -115
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +5 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -2
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +152 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +274 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +489 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +415 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +110 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +59 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +26 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +26 -0
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/filter_qwen3vl_sft.toml +49 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +52 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +61 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +6 -6
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +62 -0
- examples/rl/configs/rl_from_base_qwen17.toml +79 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +21 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +6 -6
- examples/sft/configs/crafter_fft_qwen0p6b.toml +7 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +7 -3
- examples/sft/evaluate.py +2 -4
- examples/sft/export_dataset.py +7 -4
- examples/swe/task_app/README.md +33 -3
- examples/swe/task_app/grpo_swe_mini.py +4 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +50 -23
- examples/swe/task_app/hosted/inference/openai_client.py +4 -4
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +0 -8
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +70 -10
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +63 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +48 -50
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +75 -36
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +31 -15
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +36 -5
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +5 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +5 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +827 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +454 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1084 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +5 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +144 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/builders.py +9 -3
- synth_ai/api/train/cli.py +155 -17
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/configs/__init__.py +8 -1
- synth_ai/api/train/configs/rl.py +32 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/auth/credentials.py +119 -0
- synth_ai/cli/__init__.py +61 -69
- synth_ai/cli/_modal_wrapper.py +7 -5
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/commands/__init__.py +17 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/deploy/__init__.py +23 -0
- synth_ai/cli/commands/deploy/core.py +614 -0
- synth_ai/cli/commands/deploy/errors.py +72 -0
- synth_ai/cli/commands/deploy/validation.py +11 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1109 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +388 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +73 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +199 -0
- synth_ai/cli/commands/train/judge_validation.py +304 -0
- synth_ai/cli/commands/train/validation.py +443 -0
- synth_ai/cli/demo.py +2 -162
- synth_ai/cli/deploy/__init__.py +28 -0
- synth_ai/cli/deploy/core.py +5 -0
- synth_ai/cli/deploy/errors.py +23 -0
- synth_ai/cli/deploy/validation.py +5 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/recent.py +2 -1
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +21 -0
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +7 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +11 -0
- synth_ai/cli/task_app_serve.py +11 -0
- synth_ai/cli/task_apps.py +110 -1499
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +5 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/http.py +8 -22
- synth_ai/inference/client.py +1 -1
- synth_ai/judge_schemas.py +4 -5
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +4 -2
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +469 -0
- synth_ai/streaming/streamer.py +301 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +4 -2
- synth_ai/task/config.py +6 -4
- synth_ai/task/rubrics/__init__.py +1 -2
- synth_ai/task/rubrics/loaders.py +14 -10
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +24 -11
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +0 -1
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/session_tracer.py +7 -7
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +8 -9
- synth_ai/tracing_v3/turso/native_manager.py +80 -72
- synth_ai/tracing_v3/utils.py +2 -2
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +294 -0
- synth_ai/utils/http.py +172 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/METADATA +91 -32
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/RECORD +341 -154
- synth_ai/cli/man.py +0 -106
- synth_ai/cli/tui.py +0 -57
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -906
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.17.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,147 @@

# SFT Training for Qwen3-Coder-30B with LoRA

Supervised Fine-Tuning configuration for the same 30B MoE model used in RL training.

## Configuration Overview

**Model:** `Qwen/Qwen3-Coder-30B-A3B-Instruct` (Mixture of Experts)

**Hardware:** 4x H200 GPUs (564GB total VRAM)

**Parallelism Strategy:**
- **Tensor Parallel (TP)**: 2 GPUs - splits the model across 2 GPUs for the inference/forward pass
- **Data Parallel (DP)**: 2 GPUs - splits batches across 2 GPUs for training throughput

**LoRA Configuration:**
- Rank (r): 16
- Alpha: 32
- Dropout: 0.05
- Target modules: `["all-linear"]` - applies LoRA to all linear layers

## Memory Breakdown per GPU

With 4x H200 (141GB each):

**Model Split (TP=2):**
- 2 GPUs hold the base model (70GB each)
- ~70GB free per GPU for activations and gradients

**Training (DP=2):**
- 2 GPUs process different batches
- LoRA adapters: ~5-10GB per GPU
- Gradients/optimizer states: ~20-30GB per GPU
- **Total per training GPU: ~50-60GB** ✅

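The arithmetic above can be sanity-checked with a few lines of Python. This is illustrative only; the figures are the ballpark estimates quoted in this README, not measured values.

```python
# Illustrative check of the topology and memory budget sketched above.
GPU_COUNT, VRAM_GB = 4, 141          # 4x H200
tp, dp = 2, 2                        # tensor parallel x data parallel
assert tp * dp == GPU_COUNT, "TP * DP should account for every GPU"

base_model_shard_gb = 70             # ~30B MoE weights split across TP=2
train_peak_gb = (50, 60)             # adapters + grads/optimizer + activations
print(f"~{VRAM_GB - base_model_shard_gb} GB headroom on each model-holding GPU")
print(f"training ranks expected to peak around {train_peak_gb[0]}-{train_peak_gb[1]} GB")
```
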
## Quick Start

### 1. Prepare Your Dataset

Your dataset should be in JSONL format with conversation turns:

```jsonl
{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
```

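Before launching a job, a quick local check that every record parses and uses the expected roles can save a failed run. The sketch below is a hypothetical helper, not part of the synth-ai CLI; it assumes plain-text `content` fields and that each example ends with an assistant turn.

```python
import json
from pathlib import Path

ALLOWED_ROLES = {"system", "user", "assistant"}

def check_sft_jsonl(path: str) -> int:
    """Count records and assert each line is a JSON object with well-formed messages."""
    count = 0
    for lineno, line in enumerate(Path(path).read_text().splitlines(), start=1):
        if not line.strip():
            continue
        record = json.loads(line)  # raises on malformed JSON
        messages = record.get("messages") or []
        assert messages, f"line {lineno}: empty messages list"
        for msg in messages:
            assert msg.get("role") in ALLOWED_ROLES, f"line {lineno}: unexpected role {msg.get('role')!r}"
            assert msg.get("content"), f"line {lineno}: missing content"
        assert messages[-1]["role"] == "assistant", f"line {lineno}: last turn should be the assistant"
        count += 1
    return count

print(check_sft_jsonl("path/to/your/dataset.jsonl"), "records look well-formed")
```
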
### 2. Run Training

```bash
# Using the helper script
./examples/multi_step/run_sft_qwen30b.sh path/to/your/dataset.jsonl

# Or directly with the synth-ai CLI
uvx synth-ai train \
  --type sft \
  --config examples/multi_step/configs/crafter_sft_qwen30b_lora.toml \
  --dataset path/to/your/dataset.jsonl \
  --env-file backend/.env.dev
```

### 3. Monitor Training

Check the Synth dashboard for:
- Training loss curve
- Validation metrics (if a validation set is provided)
- GPU utilization
- Training throughput (tokens/sec)

## Hyperparameters

**Batch Configuration:**
- Per-device batch size: 1
- Gradient accumulation: 64 steps
- **Effective global batch size: 128** (1 × 64 × 2 GPUs)

**Learning Rate:**
- Initial LR: 5e-6
- Warmup ratio: 3%
- Schedule: Linear decay

**Sequence Length:** 4096 tokens

**Training:**
- Epochs: 1
- Mixed precision: BF16
- DeepSpeed: Stage 2 (optimizer state sharding)
- Activation checkpointing: Enabled

## Configuration File Structure

```toml
[algorithm]
type = "offline"   # Supervised (not RL)
method = "sft"     # Supervised fine-tuning
variety = "lora"   # Using LoRA adapters

[compute]
gpu_type = "H200"
gpu_count = 4

[data.topology]
tensor_parallel = 2   # Split the model across 2 GPUs
data_parallel = 2     # Split batches across 2 GPUs

[training]
mode = "lora"
use_qlora = true   # Quantized LoRA (4-bit base model)

[lora]
r = 16         # LoRA rank
alpha = 32     # LoRA scaling
dropout = 0.05
target_modules = ["all-linear"]   # Apply to all linear layers
```

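As a rough illustration (plain `tomllib`, not a synth-ai API), the config can be loaded and spot-checked locally before submitting a job; the field names below mirror the sections shown above.

```python
import tomllib  # Python 3.11+ standard library

# Hypothetical local sanity check before launching training.
with open("examples/multi_step/configs/crafter_sft_qwen30b_lora.toml", "rb") as f:
    cfg = tomllib.load(f)

assert cfg["algorithm"]["method"] == "sft"
assert cfg["training"]["mode"] == "lora"
print(cfg["compute"]["gpu_count"], "x", cfg["compute"]["gpu_type"])
```
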
## Comparison with RL Config

| Aspect | SFT | RL |
|--------|-----|-----|
| Purpose | Supervised learning | Reinforcement learning |
| Data | Labeled examples | Environment interactions |
| Topology | TP=2, DP=2 | Split: 2 inference + 2 training |
| Batch size | 128 (effective) | Variable (episode-based) |
| Training | Standard backprop | Policy gradient (GSPO) |

## Tips

1. **Start Small:** Test with a small dataset first to verify the pipeline
2. **Validation:** Add a validation set to monitor overfitting
3. **Checkpointing:** Training saves checkpoints every 100 steps
4. **Resume:** Training can resume from a checkpoint if it is interrupted
5. **Inference:** After training, use the LoRA adapter with the base model

## Output

After training completes, you'll get:
- LoRA adapter weights (saved to the volume)
- Training metrics and logs
- The best checkpoint (based on validation loss)
- A model ready for inference or RL initialization

## Next Steps

1. **Evaluate:** Test your fine-tuned model on held-out data
2. **RL Training:** Use this as the initialization for RL (`init_from_sft = true`)
3. **Deploy:** Load the LoRA adapter for inference
4. **Iterate:** Adjust hyperparameters based on performance

```diff
@@ -6,7 +6,7 @@ method = "policy_gradient"
 variety = "gspo"

 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

 [compute]
```
```diff
@@ -1,39 +1,32 @@
-# Crafter RL experiment – stepwise shaping with hosted judge rubrics
-#
-# This configuration extends the stepwise LoRA baseline by wiring the Synth judge
-# service so evaluation rolls combine dense step rewards with hosted rubric scoring.
-
 [algorithm]
 type = "online"
 method = "policy_gradient"
 variety = "gspo"

 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
-# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
 judge_url = "https://synth-backend-dev-docker.onrender.com/api"

 [compute]
 gpu_type = "H200"
-gpu_count =
+gpu_count = 4

 [topology]
 type = "single_node_split"
-gpus_for_vllm =
-gpus_for_training =
+gpus_for_vllm = 2
+gpus_for_training = 2
 gpus_for_ref = 0
-tensor_parallel =
+tensor_parallel = 2

 [vllm]
-tensor_parallel_size =
-max_model_len =
+tensor_parallel_size = 2
+max_model_len = 4096

 [reference]
 placement = "none"

 [model]
-base = "Qwen/Qwen3-
+base = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
 trainer_mode = "lora"
 label = "crafter-rl-stepwise-hosted-judge"

@@ -41,7 +34,7 @@ label = "crafter-rl-stepwise-hosted-judge"
 r = 16
 alpha = 32
 dropout = 0.05
-target_modules = ["all-linear"]
+target_modules = [ "all-linear",]

 [rollout]
 env_name = "crafter"
@@ -50,31 +43,16 @@ episodes_per_batch = 2
 policy_name = "crafter-react"
 max_concurrent_rollouts = 8
 batches_per_step = 2
-ops = ["agent", "env"]
-
-[rollout.env_config]
-difficulty = "easy"
-
-[rollout.env_config.step_rewards]
-enabled = true
-mode = "decision_stepwise"
-strategy = "consistent" # +1 for each decision that unlocks a new achievement
-indicator_lambda = 1.0
-step_beta = 0.0
-
-[rollout.policy_config]
-temperature = 0.2
-top_p = 0.95
-max_tokens = 512
+ops = [ "agent", "env",]

 [evaluation]
 instances = 16
 every_n_iters = 10
-seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
+seeds = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,]

 [training]
 num_epochs = 1
-iterations_per_epoch =
+iterations_per_epoch = 5
 gradient_accumulation_steps = 1
 max_accumulated_minibatch = 1
 max_turns = 10
@@ -84,104 +62,84 @@ learning_rate = 5e-5
 log_interval = 1
 weight_sync_interval = 1
 event_rewards_kind = "unique"
-async_semaphore_max =
-
-# Enable dense decision rewards in the trainer to mirror env_config step rewards.
+async_semaphore_max = 4
 step_rewards_enabled = true
 step_rewards_mode = "decision_stepwise"
 step_rewards_indicator_lambda = 1.0
 step_rewards_beta = 0.0
 step_rewards_strategy = "consistent"

+[rubric]
+enabled = true
+
+[rollout.env_config]
+difficulty = "easy"
+
+[rollout.policy_config]
+temperature = 0.2
+top_p = 0.95
+max_tokens = 512
+
 [training.weight_sync]
 enable = true
-targets = ["policy"]
+targets = [ "policy",]
 mode = "direct"
 direct = true
 verify_every_k = 0

-[rubric]
-enabled = true
-model = "openai/gpt-oss-120b"
-api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
-api_key_env = "OPENAI_API_KEY"
-# Blend the hosted judge scores with environment returns inside the trainer.
 [rubric.weights]
 env = 0.2
 event = 0.4
 outcome = 0.4

-[
-]
-
-[rubric.outcome]
-# Hosted judge rubric for final trajectory scoring.
-rubric_id = "crafter/outcome@v1"
-criteria = [
-{ key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
-{ key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
-]
-
-[judge]
-type = "groq" # or "groq" when routing to Groq-hosted judges
+[judge.options]
+event = true
+outcome = true
+provider = "openai"
+model = "openai/gpt-oss-120b"
+rubric_id = "crafter/bundle@v1"
 timeout_s = 45

-[[judge.options.rubric_overrides.outcome.criteria]]
-id = "outcome.achievement_depth"
-weight = 0.4
-scale = "bounded"
-description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
-
-[judge.options.weights]
-process = 0.05
-reasoning = 0.15
-progress = 0.30
-outcome = 0.50
+[rollout.env_config.step_rewards]
+enabled = true
+mode = "decision_stepwise"
+strategy = "consistent"
+indicator_lambda = 1.0
+step_beta = 0.0
+
+[judge.options.weights]
+process = 0.05
+reasoning = 0.15
+progress = 0.3
+outcome = 0.5
+
+[judge.options.rubric_overrides.event]
+goal_text = "Treat each decision as a check for new Crafter achievements.\nAward the top score only when the log shows a fresh achievement unlock or an immediately verifiable deterministic completion.\nKeep otherwise useful setup actions in a narrow low band so non-achievement turns stay near zero."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.event.criteria]]
+id = "progress.unique_achievements"
+weight = 0.9
+scale = "binary"
+description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0."
+
+[[judge.options.rubric_overrides.event.criteria]]
+id = "process.intent_alignment"
+weight = 0.1
+scale = "bounded"
+description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock."
+
+[judge.options.rubric_overrides.outcome]
+goal_text = "Summarise the episode outcome in relation to Crafter’s win condition:\nsurvive, accumulate resources, and craft advanced tools or structures.\nHighlight notable achievements, safety failures, and preparedness for future exploration."
+aggregation = "weighted_sum"
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.goal_completion"
+weight = 0.6
+scale = "binary"
+description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace)."
+
+[[judge.options.rubric_overrides.outcome.criteria]]
+id = "outcome.achievement_depth"
+weight = 0.4
+scale = "bounded"
+description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success."
```
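For readers following the reward plumbing in the config above: the `[rubric.weights]` table blends the environment return with the hosted judge's event and outcome scores. A minimal sketch of that weighted blend, assuming a simple linear combination (the real blending happens in the Synth trainer backend):

```python
def blend_rewards(env_return: float, event_score: float, outcome_score: float,
                  weights: dict[str, float]) -> float:
    """Linear blend of environment return and judge scores, mirroring [rubric.weights]."""
    return (weights["env"] * env_return
            + weights["event"] * event_score
            + weights["outcome"] * outcome_score)

# Weights from the config above: env = 0.2, event = 0.4, outcome = 0.4
weights = {"env": 0.2, "event": 0.4, "outcome": 0.4}
print(blend_rewards(env_return=1.0, event_score=0.5, outcome_score=0.75, weights=weights))  # 0.7
```
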
```diff
@@ -6,7 +6,7 @@ method = "policy_gradient"
 variety = "gspo"

 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

 [compute]
```
```diff
@@ -6,7 +6,7 @@ method = "policy_gradient"
 variety = "gspo"

 [services]
-# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
+# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

 [compute]
```
@@ -0,0 +1,105 @@

```toml
# Crafter RL experiment – simple stepwise rewards (1 point per new achievement)
# This config uses the NEW unified [policy] section format

[algorithm]
type = "online"
method = "policy_gradient"
variety = "gspo"

[services]
# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"

[compute]
gpu_type = "H200"
gpu_count = 2

[compute.topology]  # Nested: topology is part of compute
type = "single_node_split"
gpus_for_vllm = 1
gpus_for_training = 1
gpus_for_ref = 0
tensor_parallel = 1
reference_placement = "none"  # Reference model placement

[vllm]
tensor_parallel_size = 1
max_model_len = 8192

[judge]
enabled = false  # Set to true to enable judge/rubric scoring

# Uncomment to enable judge-based reward blending:
# enabled = true
# timeout_s = 45
#
# [judge.reward_blend]  # How to blend env/event/outcome reward sources
# env = 0.2
# event = 0.4
# outcome = 0.4
#
# [judge.options]
# provider = "openai"
# model = "openai/gpt-oss-120b"
# event = true
# outcome = true
# max_concurrency = 6

# NEW: Unified [policy] section - single source of truth for model and sampling
[policy]
model_name = "Qwen/Qwen3-4B"
trainer_mode = "lora"
label = "crafter-rl-stepwise-simple"

# Sampling parameters for rollouts
max_tokens = 512
temperature = 0.6
top_p = 0.95

[rollout]
env_name = "crafter"
max_turns = 10
episodes_per_batch = 4
policy_name = "crafter-react"
max_concurrent_rollouts = 8
batches_per_step = 2
ops = ["agent", "env"]

[evaluation]
instances = 10
every_n_iters = 10
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

[training]
num_epochs = 1
iterations_per_epoch = 10
gradient_accumulation_steps = 1
max_accumulated_minibatch = 1
max_turns = 10
batch_size = 4
group_size = 4
learning_rate = 5e-5
log_interval = 1
weight_sync_interval = 1

[training.rewards]  # Nested: reward config under training
step_rewards_enabled = true
step_rewards_mode = "decision_stepwise"
step_rewards_indicator_lambda = 1.0
step_rewards_beta = 0.0
step_rewards_strategy = "consistent"
event_rewards_kind = "unique"

[training.lora]  # Nested: LoRA config under training
r = 16
alpha = 32
dropout = 0.05
target_modules = ["all-linear"]

[training.weight_sync]
enable = true
targets = ["policy"]
mode = "direct"
direct = true
verify_every_k = 0
```
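The `[training.rewards]` block above encodes the "1 point per new achievement" scheme named in the header comment. A rough sketch of how a `decision_stepwise` reward with the `consistent` strategy could be computed (one reading of these fields, not the backend implementation):

```python
def decision_stepwise_reward(prev_achievements: set[str], new_achievements: set[str],
                             indicator_lambda: float = 1.0, step_beta: float = 0.0) -> float:
    """+indicator_lambda when a decision unlocks at least one new achievement,
    minus an optional per-step cost (step_beta, 0.0 in this config)."""
    unlocked = new_achievements - prev_achievements
    indicator = 1.0 if unlocked else 0.0
    return indicator_lambda * indicator - step_beta

# Example: this decision unlocks "collect_wood" for the first time -> reward 1.0
print(decision_stepwise_reward({"collect_sapling"}, {"collect_sapling", "collect_wood"}))
```
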
@@ -0,0 +1,62 @@

```toml
# Crafter SFT LoRA configuration
# Train Qwen3-Coder-30B on Crafter agent traces

[algorithm]
type = "offline"
method = "sft"
variety = "lora"

[job]
model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
# Default dataset - can override with --dataset flag
data = "traces/crafter_sft_converted.jsonl"

[compute]
gpu_type = "H200"
gpu_count = 2
nodes = 1

[data]
# Forwarded into metadata.effective_config
topology = {}
# Optional validation set if you have one locally
# validation_path = "examples/multi_step/ft_data/crafter_sft.val.jsonl"

[training]
mode = "lora"
use_qlora = true

[training.validation]
enabled = true
evaluation_strategy = "steps"
eval_steps = 100
save_best_model_at_end = true
metric_for_best_model = "val.loss"
greater_is_better = false

[hyperparameters]
n_epochs = 1
train_kind = "peft"
per_device_batch = 1
gradient_accumulation_steps = 64
sequence_length = 4096
learning_rate = 5e-6
warmup_ratio = 0.03
lora_rank = 16
lora_alpha = 32
lora_dropout = 0.05
lora_target_modules = ["all-linear"]

[hyperparameters.parallelism]
use_deepspeed = true
deepspeed_stage = 2
fsdp = false
bf16 = true
fp16 = false
activation_checkpointing = true

[tags]
experiment = "crafter_sft_lora_qwen_coder_30b"
task = "crafter_agent"
model_size = "30b"
```
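As a back-of-the-envelope check on the hyperparameters above (assuming both GPUs act as data-parallel ranks, which the trainer decides in practice), the effective batch and per-step token budget work out as follows:

```python
# Figures taken from the [hyperparameters] section above.
per_device_batch = 1
gradient_accumulation_steps = 64
gpu_count = 2
sequence_length = 4096

effective_batch = per_device_batch * gradient_accumulation_steps * gpu_count
tokens_per_step = effective_batch * sequence_length
print(effective_batch, "sequences per optimizer step")     # 128
print(tokens_per_step, "tokens per optimizer step, max")   # 524288
```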