synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai might be problematic.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
examples/multi_step/configs/README_verilog_rl.md
@@ -0,0 +1,77 @@
+# Verilog RL with LoRA (Qwen3-0.6B)
+
+## Quick Start
+
+1. **Deploy Verilog Task App**:
+   ```bash
+   cd synth-ai
+   uvx synth-ai modal-serve grpo-verilog
+   ```
+   Note the Modal URL and update `task_url` in `verilog_rl_lora.toml`.
+
+2. **Run Training**:
+   ```bash
+   uvx synth-ai rl run --config examples/multi_step/configs/verilog_rl_lora.toml
+   ```
+
+## Configuration Overview
+
+### **Key Adaptations from Crafter**:
+
+- **Model**: `Qwen/Qwen3-0.6B` (✅ proven in SFT configs)
+- **Environment**: `verilog` instead of `crafter`
+- **Steps**: 15 turns (vs Crafter's 10) for compilation workflows
+- **Rewards**: Adjusted for sparser Verilog rewards (0.5 vs 1.0 indicator_lambda)
+- **Rubrics**: Verilog-specific judging criteria
+
+### **Hardware Requirements** (Standard RL setup):
+- ✅ **2x H100 GPUs** (vLLM inference + LoRA training split)
+- ✅ **No tensor parallelism** needed for 0.6B model
+- ✅ **4x faster inference** than 32B model
+- ✅ **Same compute pattern** as Crafter (just smaller model)
+
+### **Expected Workflow**:
+1. Agent writes Verilog code (`write_file`)
+2. Compiles to check syntax (`compile`)
+3. Simulates to verify behavior (`simulate`)
+4. Submits if tests pass (`submit`)
+5. **Rewards**: +1.0 for compilation success, +10.0 for passing tests
+
+## Rubric Design
+
+### **Event Rewards** (per decision):
+- **Compilation Success**: 70% weight (1.0 for success, 0.0 for errors)
+- **Process Efficiency**: 30% weight (penalizes redundant operations)
+
+### **Outcome Rewards** (final score):
+- **Tests Passed**: 80% weight (full credit when all tests pass)
+- **Design Quality**: 20% weight (code clarity, documentation)
+
+## Troubleshooting
+
+### **If training fails**:
+1. Check Modal URL in `task_url` field
+2. Verify `GROQ_API_KEY` for inference
+3. Ensure `OPENAI_API_KEY` for judging
+
+### **Memory issues** (unlikely with 0.6B):
+- Reduce `batch_size` to 2
+- Set `gradient_accumulation_steps = 2`
+- Verify 2x GPU split is working (vLLM on GPU 0, training on GPU 1)
+
+### **Slow training**:
+- Increase `episodes_per_batch` to 6-8
+- Check network latency to Modal task app
+
+## Expected Results
+
+- **Convergence**: Should learn basic compilation workflow in 1-2 hours
+- **Success Rate**: 20-40% initial test pass rate (improves with training)
+- **Learning**: Agent learns to debug compilation errors and write correct Verilog
+
+## Next Steps
+
+1. **Monitor reward progression** in training logs
+2. **Adjust rubrics** if agent struggles with compilation errors
+3. **Scale to 8B model** once 0.6B baseline works
+4. **Add domain-specific fine-tuning** for Verilog syntax
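To make the "Expected Workflow" above concrete, here is a minimal illustrative sketch of the tool-call sequence an optimal episode is expected to produce. Only the tool names (`write_file`, `compile`, `simulate`, `submit`) come from the README; the argument shapes are hypothetical and do not reflect the task app's actual schema.

```python
# Hypothetical sketch of an optimal Verilog episode. Tool names come from the README
# above; the argument shapes are illustrative assumptions, not the task app's schema.
optimal_episode = [
    {"tool": "write_file", "args": {"path": "top_module.v", "content": "module top_module(); endmodule"}},
    {"tool": "compile", "args": {}},   # syntax check via iverilog
    {"tool": "simulate", "args": {}},  # run the testbench
    {"tool": "submit", "args": {}},    # final verification
]

for step in optimal_episode:
    print(f"{step['tool']}({step['args']})")
```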
examples/multi_step/configs/VERILOG_REWARDS.md
@@ -0,0 +1,90 @@
+# Verilog Reward Structure (Normalized to 1.0)
+
+## Overview
+All rewards in the Verilog task app are normalized so the maximum possible reward is **1.0**.
+
+## Reward Components
+
+### 1. Step Penalty: **-0.001** per step
+- Applied to every action taken
+- Encourages efficient solutions
+- Normalized from `-0.01` (original)
+
+### 2. Compile Success: **+0.01**
+- Awarded when `iverilog` compilation succeeds (returncode 0)
+- Validates syntax correctness
+- Normalized from `+0.1` (original)
+
+### 3. Simulation Pass: **+0.1**
+- Awarded when `vvp` simulation passes all tests
+- Validates behavioral correctness
+- Normalized from `+1.0` (original)
+
+### 4. Submit Success: **+1.0** (maximum reward)
+- Awarded when final submission passes all verification tests
+- This is the goal state
+- Normalized from `+10.0` (original)
+
+## Typical Reward Trajectories
+
+### ✅ Optimal Path (3 steps)
+```
+Step 1: write_file → -0.001
+Step 2: compile (success) → +0.01 - 0.001 = +0.009
+Step 3: simulate (pass) → +0.1 - 0.001 = +0.099
+Total: ~0.107
+```
+
+### ✅ Good Path (4 steps with submit)
+```
+Step 1: write_file → -0.001
+Step 2: compile (success) → +0.009
+Step 3: simulate (pass) → +0.099
+Step 4: submit (success) → +1.0 - 0.001 = +0.999
+Total: ~1.106
+```
+*Note: Can exceed 1.0 if intermediate rewards stack with final submit*
+
+### ❌ Failure Path (compilation errors)
+```
+Step 1: write_file → -0.001
+Step 2: compile (fail) → -0.001
+Step 3: write_file (fix) → -0.001
+Step 4: compile (success) → +0.009
+Step 5: simulate (pass) → +0.099
+Total: ~0.105
+```
+
+## Implementation Details
+
+### Location
+- **Reward components**: `synth_ai/environments/examples/verilog/engine.py`
+  - `VerilogCompileSuccessComponent`: +0.01
+  - `VerilogSimulationPassComponent`: +0.1
+  - `VerilogSubmitSuccessComponent`: +1.0
+  - `VerilogStepPenaltyComponent`: -0.001
+
+### Normalization Ratio
+All rewards were divided by **10.0** to normalize:
+- Original max: ~10.0
+- Normalized max: ~1.0
+- Ratio: 10.0
+
+## Why Normalize?
+
+1. **Consistency**: Makes it easier to compare rewards across different task types
+2. **RL Training**: Standard reward scales improve learning stability
+3. **Interpretability**: Rewards as percentages (0.0 to 1.0) are intuitive
+4. **Judge Compatibility**: Rubric scores typically range 0-1, making blending easier
+
+## Testing
+```bash
+# Run eval to verify normalized rewards
+uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
+```
+
+Expected output for successful rollout:
+- `mean_return` ≈ 0.1 (if only compile+simulate)
+- `mean_return` ≈ 1.0+ (if full submit success)
+
+
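As a sanity check on the trajectory arithmetic above, the totals can be reproduced with the normalized constants from this document. The constants are copied here rather than imported from `synth_ai`; this is a standalone sketch, not the engine's reward stack.

```python
# Standalone check of the normalized reward trajectories described above.
# Constants are copied from this document, not imported from the Verilog engine.
STEP_PENALTY = -0.001
COMPILE_SUCCESS = 0.01
SIMULATE_PASS = 0.1
SUBMIT_SUCCESS = 1.0

def episode_return(step_rewards: list[float]) -> float:
    """Each step pays the step penalty on top of whatever component reward it earned."""
    return sum(r + STEP_PENALTY for r in step_rewards)

optimal = episode_return([0.0, COMPILE_SUCCESS, SIMULATE_PASS])                # ~0.107
good = episode_return([0.0, COMPILE_SUCCESS, SIMULATE_PASS, SUBMIT_SUCCESS])   # ~1.106
failure = episode_return([0.0, 0.0, 0.0, COMPILE_SUCCESS, SIMULATE_PASS])      # ~0.105

print(round(optimal, 3), round(good, 3), round(failure, 3))
```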
examples/multi_step/configs/VERILOG_RL_CHECKLIST.md
@@ -0,0 +1,183 @@
+# Verilog Task App - RL Training Readiness Checklist
+
+## ✅ Core Requirements
+
+### 1. Reward Normalization
+- ✅ **Max reward = 1.0**: All rewards scaled to `[0, 1]` range
+- ✅ **Step penalty**: `-0.001` (normalized from `-0.01`)
+- ✅ **Compile success**: `+0.01` (normalized from `+0.1`)
+- ✅ **Simulate pass**: `+0.1` (normalized from `+1.0`)
+- ✅ **Submit success**: `+1.0` (normalized from `+10.0`)
+
+### 2. Inference URL Handling (Critical for Trace Correlation)
+- ✅ **Extracts from policy config**: Uses `policy_config.get("inference_url")` as primary source
+- ✅ **Includes in trajectory**: Sets `trajectory.inference_url` with `?cid=...` parameter
+- ✅ **Includes in final.info**: Adds to `final["info"]["inference_url"]`
+- ✅ **Includes in pipeline_metadata**: Top-level `inference_url` field for trainer extraction
+- ✅ **Logs cid presence**: Logs `has_cid` flag for debugging
+- ✅ **Fallback to agent.inference_url**: Uses agent's URL if policy config missing (eval mode)
+
+**Location**: `grpo_verilog.py` lines 829-867, 887-908
+
+### 3. Pipeline Metadata
+- ✅ **Required fields present**:
+  - `reward_score`: Final episode reward
+  - `policy_id`: Policy identifier
+  - `inference_url`: **CRITICAL** - Contains `?cid=trace_xxxxx` for correlation
+  - `env_name`: Environment identifier
+  - `task_id`: Problem identifier
+  - `task_split`: Dataset split (train/val/test)
+- ✅ **Inference details**: Provider, model, URL in nested `inference` dict
+
+**Location**: `grpo_verilog.py` lines 887-908
+
+### 4. Trace Correlation (Required for RL Training)
+- ✅ **Trainer injects cid**: Trainer adds `?cid=trace_xxxxx` to `policy_config["inference_url"]`
+- ✅ **Task app preserves cid**: Uses `policy_config["inference_url"]` directly
+- ✅ **Trainer extracts cid**: Extracts from `trajectory.inference_url` using `inference_url_to_trace_correlation_id()`
+- ✅ **Trace hydration**: Trainer queries trace store with extracted `trace_correlation_id`
+
+**Flow**:
+```
+Trainer → policy_config["inference_url"] = "http://...?cid=trace_xxxxx"
+  ↓
+Task App → trajectory.inference_url = policy_config["inference_url"]
+  ↓
+Trainer → extract_trace_correlation_id(trajectory.inference_url)
+  ↓
+Trainer → trace_store.resolve_correlation(trace_correlation_id)
+  ↓
+Trainer → Hydrate v3 trace with event_history
+  ↓
+Judge → Score using full trace
+```
+
+### 5. Response Contract Compliance
+- ✅ **RolloutResponse fields**:
+  - `run_id`: Unique identifier
+  - `trajectories`: List of trajectories (with `inference_url`)
+  - `metrics`: Episode metrics
+  - `pipeline_metadata`: **CRITICAL** - Contains `inference_url` and `reward_score`
+  - `trace_correlation_id`: Optional (trainer infers from `inference_url`)
+- ✅ **Optional trace_correlation_id**: Made optional in `contracts.py` (trainer infers from URL)
+
+**Location**: `synth_ai/task/contracts.py` line 156
+
+### 6. Environment Implementation
+- ✅ **Stateful engine**: `VerilogEngine` extends `StatefulEngine`
+- ✅ **Reward stack**: Properly configured with normalized components
+- ✅ **State management**: `VerilogPublicState` and `VerilogPrivateState`
+- ✅ **Tool implementation**: All 4 tools (write_file, compile, simulate, submit)
+
+**Location**: `synth_ai/environments/examples/verilog/engine.py`
+
+### 7. LLM Agent Integration
+- ✅ **Multi-turn support**: Agent maintains conversation history
+- ✅ **Tool parsing**: Extracts tool calls from LLM responses
+- ✅ **Guidance system**: Provides context-aware hints
+- ✅ **Error handling**: Graceful fallback for malformed responses
+
+**Location**: `grpo_verilog.py` lines 200-530
+
+## 🔍 Verification Tests
+
+### Test 1: Eval Mode (No Trace Correlation)
+```bash
+uvx synth-ai eval --config examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
+```
+**Expected**:
+- ✅ `mean_return` ≈ 0.1 (normalized rewards)
+- ✅ `inference_url` = Groq API URL (no `?cid=...`)
+- ✅ `task_completed` = True for correct solutions
+
+### Test 2: RL Training Mode (With Trace Correlation)
+```bash
+uvx synth-ai train \
+  --type rl \
+  --config examples/multi_step/configs/verilog_rl_lora.toml \
+  --task-url https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run \
+  --backend https://synth-backend-dev-docker.onrender.com/api \
+  --env-file /path/to/verilog/.env
+```
+**Expected**:
+- ✅ Trainer logs show `inference_url` with `?cid=trace_xxxxx`
+- ✅ Task app logs show `has_cid=True`
+- ✅ Trace hydration succeeds (no `404 Not Found` errors)
+- ✅ Judge receives full `event_history`
+- ✅ Training updates show non-zero rewards
+
+### Test 3: Trace Correlation ID Extraction
+```python
+from synth_envs_hosted.utils import inference_url_to_trace_correlation_id
+
+# Should extract trace_xxxxx from URL
+url = "http://localhost:8000/v1/chat/completions?cid=trace_abc123"
+cid = inference_url_to_trace_correlation_id(url)
+assert cid == "trace_abc123"
+```
+
+### Test 4: Pipeline Metadata Structure
+```python
+# Verify response has correct structure for RL
+response = await task_app.rollout(request)
+assert "pipeline_metadata" in response
+assert "inference_url" in response.pipeline_metadata
+assert "reward_score" in response.pipeline_metadata
+assert len(response.trajectories) > 0
+assert response.trajectories[0].inference_url is not None
+```
+
+## 📋 Deployment Checklist
+
+### Modal Deployment
+1. ✅ **Environment variables set**:
+   - `GROQ_API_KEY`
+   - `VERILOG_INFERENCE_URL` (optional, uses Groq default)
+2. ✅ **Secrets configured**: Groq API key in Modal secrets
+3. ✅ **Task app URL**: Update in `verilog_rl_lora.toml`
+
+### Training Configuration
+1. ✅ **2x GPUs minimum**: 1 for vLLM, 1 for training
+2. ✅ **Model size**: `Qwen/Qwen3-0.6B` for testing
+3. ✅ **Batch size**: 4 (matches Crafter)
+4. ✅ **Max turns**: 15 (enough for compile chains)
+5. ✅ **Rubric enabled**: `rubric.enabled = true`
+
+## 🚨 Common Issues & Fixes
+
+### Issue 1: `trace_correlation_id` Missing
+**Symptom**: Trainer logs `FATAL: Rollout payload missing 'trace_correlation_id'`
+**Fix**: Verify `trajectory.inference_url` contains `?cid=...` parameter
+
+### Issue 2: Trace Hydration Fails (404)
+**Symptom**: `404 Not Found` when querying `/trace/by-correlation/...`
+**Fix**:
+- Check inference server is capturing traces
+- Verify `cid` parameter is in inference URL
+- Ensure `vllm_public_url` is set correctly
+
+### Issue 3: Rewards Not Normalized
+**Symptom**: `mean_return` > 1.0 in eval
+**Fix**: Verify all reward components in `engine.py` are scaled by 10x
+
+### Issue 4: Agent Gets Stuck
+**Symptom**: Agent repeats same action (e.g., compile without fixing)
+**Fix**: Check guidance system is providing proper hints
+
+## 🎯 Final Verification
+
+Before starting RL training, verify:
+- [ ] Eval runs successfully with normalized rewards (≈ 0.1)
+- [ ] Modal deployment returns proper `inference_url` structure
+- [ ] Trace correlation ID extraction works
+- [ ] Pipeline metadata includes all required fields
+- [ ] Response contract matches expected schema
+
+**If all checks pass**: ✅ **Ready for RL training!**
+
+## 📚 Related Documentation
+- [VERILOG_REWARDS.md](./VERILOG_REWARDS.md) - Reward structure details
+- [verilog_rl_lora.md](../verilog_rl_lora.md) - RL/LoRA feasibility analysis
+- [verilog_rl_lora.toml](./verilog_rl_lora.toml) - Training configuration
+
+
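Test 3 above uses the packaged helper; as an illustration of what that extraction step does (pulling the `cid` query parameter out of the inference URL), a minimal standalone equivalent is sketched below. The behaviour is inferred from the examples in this checklist, not copied from `synth_envs_hosted.utils`.

```python
# Minimal stand-in for the cid extraction described above: read the `cid` query
# parameter from an inference URL. Behaviour is inferred from this checklist,
# not copied from synth_envs_hosted.utils.
from urllib.parse import parse_qs, urlparse

def extract_cid(inference_url: str) -> str | None:
    values = parse_qs(urlparse(inference_url).query).get("cid")
    return values[0] if values else None

assert extract_cid("http://localhost:8000/v1/chat/completions?cid=trace_abc123") == "trace_abc123"
assert extract_cid("https://api.groq.com/openai/v1/chat/completions") is None  # eval mode: no cid
```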
examples/multi_step/configs/crafter_eval_synth_qwen4b.toml
@@ -0,0 +1,35 @@
+# Crafter eval using Synth backend with Qwen3-4B
+
+[eval]
+app_id = "grpo-crafter-task-app"
+task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
+model = "Qwen/Qwen3-4B"
+seeds = [0, 1, 2]
+max_turns = 10
+concurrency = 1
+env_name = "crafter"
+policy_name = "crafter-react"
+trace_format = "full"
+return_trace = true
+
+[eval.env_config]
+env_params = {max_steps_per_episode = 10}
+
+[eval.policy_config]
+provider = "openai"
+model = "Qwen/Qwen3-4B"
+inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+temperature = 0.6
+top_p = 0.95
+max_tokens = 512
+use_vision = false
+image_only_mode = false
+max_llm_calls = 10
+
+[eval.judge]
+path = "examples/multi_step/judges/crafter_backend_judge.py"
+name = "Backend"
+backend_url = "http://localhost:8000/api"
+model = "openai/gpt-oss-120b"
+timeout_s = 45
+
examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml
@@ -0,0 +1,36 @@
+# Evaluation config for Crafter with text-only input
+# This config uses Groq Qwen with only text observations (no images)
+
+[eval]
+app_id = "grpo-crafter-task-app"
+task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
+model = "qwen/qwen3-32b"
+seeds = [0, 1, 2]
+max_turns = 10
+concurrency = 1
+env_name = "crafter"
+policy_name = "crafter-react"
+trace_format = "full"
+return_trace = true
+
+[eval.env_config]
+env_params = {max_steps_per_episode = 10}
+
+[eval.policy_config]
+provider = "groq"
+model = "qwen/qwen3-32b"
+inference_url = "https://api.groq.com/openai/v1/chat/completions"
+temperature = 0.6
+top_p = 0.95
+max_tokens = 512
+use_vision = false
+image_only_mode = false
+max_llm_calls = 10
+
+[eval.judge]
+path = "examples/multi_step/judges/crafter_backend_judge.py"
+name = "Backend"
+backend_url = "http://localhost:8000/api"
+model = "openai/gpt-oss-120b"
+timeout_s = 45
+
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml
@@ -12,7 +12,7 @@ variety = "gspo"
 # Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
 task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
 # Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
-judge_url = "https://
+judge_url = "https://synth-backend-dev-docker.onrender.com/api"
 
 [compute]
 gpu_type = "H200"
@@ -46,7 +46,7 @@ target_modules = ["all-linear"]
 [rollout]
 env_name = "crafter"
 max_turns = 10
-episodes_per_batch =
+episodes_per_batch = 2
 policy_name = "crafter-react"
 max_concurrent_rollouts = 8
 batches_per_step = 2
@@ -69,12 +69,12 @@ ops = ["agent", "env"]
 
 [evaluation]
 instances = 16
-every_n_iters =
+every_n_iters = 10
 seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
 
 [training]
 num_epochs = 1
-iterations_per_epoch =
+iterations_per_epoch = 20
 gradient_accumulation_steps = 1
 max_accumulated_minibatch = 1
 max_turns = 10
@@ -84,6 +84,7 @@ learning_rate = 5e-5
 log_interval = 1
 weight_sync_interval = 1
 event_rewards_kind = "unique"
+async_semaphore_max = 40  # Max concurrent rollouts in streaming pipeline
 
 # Enable dense decision rewards in the trainer to mirror env_config step rewards.
 step_rewards_enabled = true
@@ -101,6 +102,9 @@ verify_every_k = 0
 
 [rubric]
 enabled = true
+model = "openai/gpt-oss-120b"
+api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
+api_key_env = "OPENAI_API_KEY"
 # Blend the hosted judge scores with environment returns inside the trainer.
 [rubric.weights]
 env = 0.2
@@ -110,13 +114,21 @@ outcome = 0.4
 [rubric.event]
 # Hosted judge rubric for per-decision progress scoring.
 rubric_id = "crafter/event@v1"
+criteria = [
+  { key = "progress.unique_achievements", weight = 0.9, description = "Return 1 when this decision explicitly unlocks a brand-new Crafter achievement (inventory or status text confirms it this turn). Otherwise return 0.", aggregation = "weighted_sum" },
+  { key = "process.intent_alignment", weight = 0.1, description = "Use at most 0.3 to acknowledge tightly coupled setup that finishes the last prerequisite; keep ≤0.1 when the agent only repositions or gathers without an imminent unlock.", aggregation = "weighted_sum" },
+]
 
 [rubric.outcome]
 # Hosted judge rubric for final trajectory scoring.
 rubric_id = "crafter/outcome@v1"
+criteria = [
+  { key = "outcome.goal_completion", weight = 0.6, description = "Full credit when the agent ends with strong survival metrics and a clear crafted milestone (e.g., iron tools, furnace).", aggregation = "weighted_sum" },
+  { key = "outcome.achievement_depth", weight = 0.4, description = "Partial credit for intermediate achievements (saplings, wood/stone tools) that set up future success.", aggregation = "weighted_sum" },
+]
 
 [judge]
-type = "
+type = "groq"  # or "groq" when routing to Groq-hosted judges
 timeout_s = 45
 
 [judge.options]
examples/multi_step/configs/crafter_synth_backend.md
@@ -0,0 +1,40 @@
+# Crafter Eval Using Synth Backend with Qwen 4B
+
+## What Changed
+
+Created `crafter_eval_synth_qwen4b.toml` to evaluate Crafter using Qwen3-4B via the Synth backend inference proxy.
+
+## Key Difference from Groq Config
+
+**Before (Groq):**
+```toml
+[eval.policy_config]
+provider = "groq"
+model = "qwen/qwen3-32b"
+inference_url = "https://api.groq.com/openai/v1/chat/completions"
+```
+
+**After (Synth Backend):**
+```toml
+[eval.policy_config]
+provider = "openai"
+model = "Qwen/Qwen3-4B"
+inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+```
+
+## Usage
+
+```bash
+uvx synth-ai eval --config examples/multi_step/configs/crafter_eval_synth_qwen4b.toml
+```
+
+## Why This Works
+
+The Synth backend's `/api/v1/chat/completions` endpoint:
+1. Accepts OpenAI-compatible requests
+2. Routes to Modal vLLM service
+3. Loads the base model (Qwen/Qwen3-4B from HuggingFace)
+4. Returns OpenAI-compatible responses
+
+No code changes needed - the infrastructure already exists.
+
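Because the proxy endpoint above is OpenAI-compatible, it can be exercised with any HTTP client. The sketch below shows the request shape; the bearer-token auth scheme and the `SYNTH_API_KEY` variable name are assumptions, since authentication is not documented in this diff.

```python
# Minimal OpenAI-compatible request to the Synth backend proxy referenced above.
# Only the URL, model name, and payload shape come from this document; the auth
# header and SYNTH_API_KEY environment variable are assumptions.
import os
import requests

url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
payload = {
    "model": "Qwen/Qwen3-4B",
    "messages": [{"role": "user", "content": "Say hello from the Crafter eval."}],
    "max_tokens": 64,
    "temperature": 0.6,
}
headers = {"Authorization": f"Bearer {os.environ.get('SYNTH_API_KEY', '')}"}

resp = requests.post(url, json=payload, headers=headers, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```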
examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
@@ -0,0 +1,31 @@
+# Verilog Eval Config for Groq Qwen3-32B
+# Quick eval to test Verilog task app before RL training
+
+[eval]
+app_id = "grpo-verilog"
+task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
+model = "qwen/qwen3-32b"
+seeds = [0, 1, 2]
+max_turns = 15
+concurrency = 1
+env_name = "verilog"
+policy_name = "verilog-designer"
+trace_format = "full"
+return_trace = true
+
+[eval.env_config]
+difficulty = "medium"  # Can be "easy", "medium", or "hard"
+
+[eval.policy_config]
+provider = "groq"
+model = "qwen/qwen3-32b"
+inference_url = "https://api.groq.com/openai/v1/chat/completions"
+temperature = 0.2
+max_tokens = 8192  # Large buffer for Verilog (long testbenches + module implementation)
+
+[eval.judge]
+path = "examples/multi_step/judges/verilog_backend_judge.py"
+name = "Backend"
+backend_url = "http://localhost:8000/api"
+model = "openai/gpt-oss-120b"
+timeout_s = 45
examples/multi_step/configs/verilog_eval_synth_qwen8b.toml
@@ -0,0 +1,33 @@
+# Verilog eval using Synth backend with Qwen3-8B
+
+[eval]
+app_id = "grpo-verilog"
+task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
+model = "Qwen/Qwen3-8B"
+seeds = [0, 1, 2]
+max_turns = 6
+concurrency = 1
+env_name = "verilog"
+policy_name = "verilog-designer"
+trace_format = "full"
+return_trace = true
+
+[eval.env_config]
+difficulty = "medium"
+
+[eval.policy_config]
+provider = "openai"
+model = "Qwen/Qwen3-8B"
+inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
+temperature = 0.2
+top_p = 0.95
+max_tokens = 4096
+max_llm_calls = 6
+
+[eval.judge]
+path = "examples/multi_step/judges/verilog_backend_judge.py"
+name = "Backend"
+backend_url = "http://localhost:8000/api"
+model = "openai/gpt-oss-120b"
+timeout_s = 45
+