synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +4 -4
- examples/sft/export_dataset.py +7 -4
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +1 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +2 -8
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +145 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +66 -49
- synth_ai/cli/_modal_wrapper.py +9 -6
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +1 -0
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +392 -141
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +62 -0
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +5 -2
- synth_ai/task/config.py +259 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +4 -2
- synth_ai/task/rubrics/loaders.py +27 -4
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +145 -2
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/session_tracer.py +10 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +108 -77
- synth_ai/tracing_v3/utils.py +1 -1
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +911 -0
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Crafter SFT LoRA configuration
|
|
2
|
+
# Train Qwen3-Coder-30B on Crafter agent traces
|
|
3
|
+
|
|
4
|
+
[algorithm]
|
|
5
|
+
type = "offline"
|
|
6
|
+
method = "sft"
|
|
7
|
+
variety = "lora"
|
|
8
|
+
|
|
9
|
+
[job]
|
|
10
|
+
model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
|
|
11
|
+
# Default dataset - can override with --dataset flag
|
|
12
|
+
data = "traces/crafter_sft_converted.jsonl"
|
|
13
|
+
|
|
14
|
+
[compute]
|
|
15
|
+
gpu_type = "H200"
|
|
16
|
+
gpu_count = 2
|
|
17
|
+
nodes = 1
|
|
18
|
+
|
|
19
|
+
[data]
|
|
20
|
+
# Forwarded into metadata.effective_config
|
|
21
|
+
topology = {}
|
|
22
|
+
# Optional validation set if you have one locally
|
|
23
|
+
# validation_path = "examples/multi_step/ft_data/crafter_sft.val.jsonl"
|
|
24
|
+
|
|
25
|
+
[training]
|
|
26
|
+
mode = "lora"
|
|
27
|
+
use_qlora = true
|
|
28
|
+
|
|
29
|
+
[training.validation]
|
|
30
|
+
enabled = true
|
|
31
|
+
evaluation_strategy = "steps"
|
|
32
|
+
eval_steps = 100
|
|
33
|
+
save_best_model_at_end = true
|
|
34
|
+
metric_for_best_model = "val.loss"
|
|
35
|
+
greater_is_better = false
|
|
36
|
+
|
|
37
|
+
[hyperparameters]
|
|
38
|
+
n_epochs = 1
|
|
39
|
+
train_kind = "peft"
|
|
40
|
+
per_device_batch = 1
|
|
41
|
+
gradient_accumulation_steps = 64
|
|
42
|
+
sequence_length = 4096
|
|
43
|
+
learning_rate = 5e-6
|
|
44
|
+
warmup_ratio = 0.03
|
|
45
|
+
lora_rank = 16
|
|
46
|
+
lora_alpha = 32
|
|
47
|
+
lora_dropout = 0.05
|
|
48
|
+
lora_target_modules = ["all-linear"]
|
|
49
|
+
|
|
50
|
+
[hyperparameters.parallelism]
|
|
51
|
+
use_deepspeed = true
|
|
52
|
+
deepspeed_stage = 2
|
|
53
|
+
fsdp = false
|
|
54
|
+
bf16 = true
|
|
55
|
+
fp16 = false
|
|
56
|
+
activation_checkpointing = true
|
|
57
|
+
|
|
58
|
+
[tags]
|
|
59
|
+
experiment = "crafter_sft_lora_qwen_coder_30b"
|
|
60
|
+
task = "crafter_agent"
|
|
61
|
+
model_size = "30b"
|
|
62
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Crafter Eval Using Synth Backend with Qwen 4B
|
|
2
|
+
|
|
3
|
+
## What Changed
|
|
4
|
+
|
|
5
|
+
Created `crafter_eval_synth_qwen4b.toml` to evaluate Crafter using Qwen3-4B via the Synth backend inference proxy.
|
|
6
|
+
|
|
7
|
+
## Key Difference from Groq Config
|
|
8
|
+
|
|
9
|
+
**Before (Groq):**
|
|
10
|
+
```toml
|
|
11
|
+
[eval.policy_config]
|
|
12
|
+
provider = "groq"
|
|
13
|
+
model = "qwen/qwen3-32b"
|
|
14
|
+
inference_url = "https://api.groq.com/openai/v1/chat/completions"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
**After (Synth Backend):**
|
|
18
|
+
```toml
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "openai"
|
|
21
|
+
model = "Qwen/Qwen3-4B"
|
|
22
|
+
inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uvx synth-ai eval --config examples/multi_step/configs/crafter_eval_synth_qwen4b.toml
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Why This Works
|
|
32
|
+
|
|
33
|
+
The Synth backend's `/api/v1/chat/completions` endpoint:
|
|
34
|
+
1. Accepts OpenAI-compatible requests
|
|
35
|
+
2. Routes to Modal vLLM service
|
|
36
|
+
3. Loads the base model (Qwen/Qwen3-4B from HuggingFace)
|
|
37
|
+
4. Returns OpenAI-compatible responses
|
|
38
|
+
|
|
39
|
+
No code changes needed - the infrastructure already exists.
|
|
40
|
+
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Verilog Eval Config for Groq Qwen3-32B
|
|
2
|
+
# Quick eval to test Verilog task app before RL training
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-verilog"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
|
|
7
|
+
model = "qwen/qwen3-32b"
|
|
8
|
+
seeds = [0, 1, 2]
|
|
9
|
+
max_turns = 15
|
|
10
|
+
concurrency = 1
|
|
11
|
+
env_name = "verilog"
|
|
12
|
+
policy_name = "verilog-designer"
|
|
13
|
+
trace_format = "full"
|
|
14
|
+
return_trace = true
|
|
15
|
+
|
|
16
|
+
[eval.env_config]
|
|
17
|
+
difficulty = "medium" # Can be "easy", "medium", or "hard"
|
|
18
|
+
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "groq"
|
|
21
|
+
model = "qwen/qwen3-32b"
|
|
22
|
+
inference_url = "https://api.groq.com/openai/v1/chat/completions"
|
|
23
|
+
temperature = 0.2
|
|
24
|
+
max_tokens = 8192 # Large buffer for Verilog (long testbenches + module implementation)
|
|
25
|
+
|
|
26
|
+
[eval.judge]
|
|
27
|
+
path = "examples/multi_step/judges/verilog_backend_judge.py"
|
|
28
|
+
name = "Backend"
|
|
29
|
+
backend_url = "http://localhost:8000/api"
|
|
30
|
+
model = "openai/gpt-oss-120b"
|
|
31
|
+
timeout_s = 45
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Verilog eval using Synth backend with Qwen3-8B
|
|
2
|
+
|
|
3
|
+
[eval]
|
|
4
|
+
app_id = "grpo-verilog"
|
|
5
|
+
task_app_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
|
|
6
|
+
model = "Qwen/Qwen3-8B"
|
|
7
|
+
seeds = [0, 1, 2]
|
|
8
|
+
max_turns = 6
|
|
9
|
+
concurrency = 1
|
|
10
|
+
env_name = "verilog"
|
|
11
|
+
policy_name = "verilog-designer"
|
|
12
|
+
trace_format = "full"
|
|
13
|
+
return_trace = true
|
|
14
|
+
|
|
15
|
+
[eval.env_config]
|
|
16
|
+
difficulty = "medium"
|
|
17
|
+
|
|
18
|
+
[eval.policy_config]
|
|
19
|
+
provider = "openai"
|
|
20
|
+
model = "Qwen/Qwen3-8B"
|
|
21
|
+
inference_url = "https://synth-backend-dev-docker.onrender.com/api/v1/chat/completions"
|
|
22
|
+
temperature = 0.2
|
|
23
|
+
top_p = 0.95
|
|
24
|
+
max_tokens = 4096
|
|
25
|
+
max_llm_calls = 6
|
|
26
|
+
|
|
27
|
+
[eval.judge]
|
|
28
|
+
path = "examples/multi_step/judges/verilog_backend_judge.py"
|
|
29
|
+
name = "Backend"
|
|
30
|
+
backend_url = "http://localhost:8000/api"
|
|
31
|
+
model = "openai/gpt-oss-120b"
|
|
32
|
+
timeout_s = 45
|
|
33
|
+
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# Verilog RL experiment – LoRA training on Qwen3-0.6B
|
|
2
|
+
#
|
|
3
|
+
# This configuration adapts the Crafter RL setup for Verilog spec-to-RTL tasks.
|
|
4
|
+
# Uses the same proven pipeline but optimized for 0.6B model and Verilog domain.
|
|
5
|
+
|
|
6
|
+
[algorithm]
|
|
7
|
+
type = "online"
|
|
8
|
+
method = "policy_gradient"
|
|
9
|
+
variety = "gspo"
|
|
10
|
+
|
|
11
|
+
[services]
|
|
12
|
+
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-verilog`
|
|
13
|
+
task_url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
|
|
14
|
+
# Point at the Synth backend (or compatible service) that exposes /api/judge/v1/*
|
|
15
|
+
judge_url = "https://synth-backend-dev-docker.onrender.com/api"
|
|
16
|
+
|
|
17
|
+
[compute]
|
|
18
|
+
gpu_type = "H200" # ✅ 8B model needs H200 for larger context window
|
|
19
|
+
gpu_count = 2 # ✅ Minimum 2x GPUs (1 for vLLM inference + 1 for training)
|
|
20
|
+
nodes = 1
|
|
21
|
+
|
|
22
|
+
[topology]
|
|
23
|
+
type = "single_node_split"
|
|
24
|
+
gpus_for_vllm = 1 # ✅ vLLM for inference
|
|
25
|
+
gpus_for_training = 1 # ✅ Training GPU (8B LoRA fits well)
|
|
26
|
+
gpus_for_ref = 0
|
|
27
|
+
tensor_parallel = 1
|
|
28
|
+
|
|
29
|
+
[vllm]
|
|
30
|
+
tensor_parallel_size = 1
|
|
31
|
+
max_model_len = 24576 # ✅ Increased to 24K to accommodate long Verilog prompts (16K + 8K buffer for testbenches + history)
|
|
32
|
+
|
|
33
|
+
[reference]
|
|
34
|
+
placement = "none"
|
|
35
|
+
|
|
36
|
+
[model]
|
|
37
|
+
base = "Qwen/Qwen3-8B" # ✅ 8B model for RL training with good balance of speed and capability
|
|
38
|
+
trainer_mode = "lora"
|
|
39
|
+
label = "verilog-rl-lora-qwen8b"
|
|
40
|
+
|
|
41
|
+
[lora]
|
|
42
|
+
r = 16
|
|
43
|
+
alpha = 32
|
|
44
|
+
dropout = 0.05
|
|
45
|
+
target_modules = ["all-linear"]
|
|
46
|
+
|
|
47
|
+
[rollout]
|
|
48
|
+
env_name = "verilog" # ✅ Changed from "crafter" to "verilog"
|
|
49
|
+
max_turns = 6 # ✅ More steps for compilation chains vs Crafter's 10
|
|
50
|
+
episodes_per_batch = 4 # ✅ Good batch size for 8B model
|
|
51
|
+
policy_name = "verilog-designer"
|
|
52
|
+
max_concurrent_rollouts = 8
|
|
53
|
+
batches_per_step = 2
|
|
54
|
+
ops = ["agent", "env"]
|
|
55
|
+
|
|
56
|
+
[rollout.env_config]
|
|
57
|
+
# Verilog-specific environment settings
|
|
58
|
+
difficulty = "medium" # Can be "easy", "medium", or "hard"
|
|
59
|
+
|
|
60
|
+
[rollout.env_config.step_rewards]
|
|
61
|
+
enabled = true
|
|
62
|
+
mode = "decision_stepwise"
|
|
63
|
+
strategy = "consistent"
|
|
64
|
+
indicator_lambda = 0.5 # ✅ Reduced from Crafter (sparser rewards)
|
|
65
|
+
step_beta = 0.0
|
|
66
|
+
|
|
67
|
+
[rollout.policy_config]
|
|
68
|
+
provider = "openai"
|
|
69
|
+
model = "Qwen/Qwen3-8B" # ✅ Use the model being trained (8B) for rollouts
|
|
70
|
+
temperature = 0.2
|
|
71
|
+
max_tokens = 4096 # ✅ Balanced for Verilog generation while leaving room for long input prompts (testbenches + history)
|
|
72
|
+
|
|
73
|
+
[evaluation]
|
|
74
|
+
instances = 16
|
|
75
|
+
every_n_iters = 10
|
|
76
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
|
|
77
|
+
|
|
78
|
+
[training]
|
|
79
|
+
num_epochs = 1
|
|
80
|
+
iterations_per_epoch = 5
|
|
81
|
+
gradient_accumulation_steps = 1
|
|
82
|
+
max_accumulated_minibatch = 1
|
|
83
|
+
max_turns = 15
|
|
84
|
+
batch_size = 4 # ✅ Same as Crafter (works well for 8B LoRA)
|
|
85
|
+
group_size = 4
|
|
86
|
+
learning_rate = 5e-5 # ✅ Same as Crafter
|
|
87
|
+
log_interval = 1
|
|
88
|
+
weight_sync_interval = 1
|
|
89
|
+
event_rewards_kind = "unique"
|
|
90
|
+
async_semaphore_max = 20 # Max concurrent rollouts in streaming pipeline
|
|
91
|
+
|
|
92
|
+
# Enable dense decision rewards in the trainer
|
|
93
|
+
step_rewards_enabled = true
|
|
94
|
+
step_rewards_mode = "decision_stepwise"
|
|
95
|
+
step_rewards_indicator_lambda = 0.5 # ✅ Reduced for Verilog's sparser rewards
|
|
96
|
+
step_rewards_beta = 0.0
|
|
97
|
+
step_rewards_strategy = "consistent"
|
|
98
|
+
|
|
99
|
+
[training.weight_sync]
|
|
100
|
+
enable = true
|
|
101
|
+
targets = ["policy"]
|
|
102
|
+
mode = "direct"
|
|
103
|
+
direct = true
|
|
104
|
+
verify_every_k = 0
|
|
105
|
+
|
|
106
|
+
[rubric]
|
|
107
|
+
enabled = true
|
|
108
|
+
model = "openai/gpt-oss-120b"
|
|
109
|
+
api_base = "https://synth-backend-dev-docker.onrender.com/api/judge"
|
|
110
|
+
api_key_env = "OPENAI_API_KEY"
|
|
111
|
+
|
|
112
|
+
# Blend the hosted judge scores with environment returns
|
|
113
|
+
[rubric.weights]
|
|
114
|
+
env = 0.3 # ✅ Higher weight on env rewards for Verilog (vs Crafter's 0.2)
|
|
115
|
+
event = 0.3 # ✅ Adjusted for Verilog's different reward structure
|
|
116
|
+
outcome = 0.4
|
|
117
|
+
|
|
118
|
+
[rubric.event]
|
|
119
|
+
# Verilog-specific event rubric for process efficiency
|
|
120
|
+
rubric_id = "verilog/event@v1"
|
|
121
|
+
criteria = [
|
|
122
|
+
{ key = "process.compilation_success", weight = 0.7, description = "Return 1.0 when compilation succeeds, 0.5 for partial success, 0.0 for failure", aggregation = "weighted_sum" },
|
|
123
|
+
{ key = "process.design_iterations", weight = 0.3, description = "Reward efficient design iterations without unnecessary recompilation", aggregation = "weighted_sum" },
|
|
124
|
+
]
|
|
125
|
+
|
|
126
|
+
[rubric.outcome]
|
|
127
|
+
# Verilog-specific outcome rubric for final results
|
|
128
|
+
rubric_id = "verilog/outcome@v1"
|
|
129
|
+
criteria = [
|
|
130
|
+
{ key = "outcome.tests_passed", weight = 0.8, description = "Full credit when all tests pass, partial for some tests", aggregation = "weighted_sum" },
|
|
131
|
+
{ key = "outcome.design_quality", weight = 0.2, description = "Code quality, documentation, and design efficiency", aggregation = "weighted_sum" },
|
|
132
|
+
]
|
|
133
|
+
|
|
134
|
+
[judge]
|
|
135
|
+
type = "groq"
|
|
136
|
+
timeout_s = 45
|
|
137
|
+
|
|
138
|
+
[judge.options]
|
|
139
|
+
event = true
|
|
140
|
+
outcome = true
|
|
141
|
+
provider = "openai"
|
|
142
|
+
model = "openai/gpt-oss-120b"
|
|
143
|
+
rubric_id = "verilog/bundle@v1"
|
|
144
|
+
max_concurrency = 6
|
|
145
|
+
tracks = ["process", "reasoning", "progress", "outcome"]
|
|
146
|
+
|
|
147
|
+
[judge.options.rubric_overrides]
|
|
148
|
+
|
|
149
|
+
[judge.options.rubric_overrides.event]
|
|
150
|
+
goal_text = """
|
|
151
|
+
Evaluate each Verilog design decision for compilation success and process efficiency.
|
|
152
|
+
High scores for successful compilation and strategic tool usage.
|
|
153
|
+
Penalize unnecessary operations and compilation failures."""
|
|
154
|
+
aggregation = "weighted_sum"
|
|
155
|
+
|
|
156
|
+
[[judge.options.rubric_overrides.event.criteria]]
|
|
157
|
+
id = "process.compilation_success"
|
|
158
|
+
weight = 0.7
|
|
159
|
+
scale = "bounded"
|
|
160
|
+
description = "Return 1.0 when compilation succeeds cleanly, 0.5 for warnings, 0.0 for errors"
|
|
161
|
+
|
|
162
|
+
[[judge.options.rubric_overrides.event.criteria]]
|
|
163
|
+
id = "process.design_iterations"
|
|
164
|
+
weight = 0.3
|
|
165
|
+
scale = "bounded"
|
|
166
|
+
description = "Reward efficient write→compile→simulate workflow, penalize redundant operations"
|
|
167
|
+
|
|
168
|
+
[judge.options.rubric_overrides.outcome]
|
|
169
|
+
goal_text = """
|
|
170
|
+
Evaluate the final Verilog implementation for correctness and quality.
|
|
171
|
+
High scores for working designs that pass all tests with good code quality."""
|
|
172
|
+
aggregation = "weighted_sum"
|
|
173
|
+
|
|
174
|
+
[[judge.options.rubric_overrides.outcome.criteria]]
|
|
175
|
+
id = "outcome.tests_passed"
|
|
176
|
+
weight = 0.8
|
|
177
|
+
scale = "binary"
|
|
178
|
+
description = "Full credit when all tests pass, partial credit for some tests passing"
|
|
179
|
+
|
|
180
|
+
[[judge.options.rubric_overrides.outcome.criteria]]
|
|
181
|
+
id = "outcome.design_quality"
|
|
182
|
+
weight = 0.2
|
|
183
|
+
scale = "bounded"
|
|
184
|
+
description = "Code clarity, proper documentation, and efficient design patterns"
|
|
185
|
+
|
|
186
|
+
[judge.options.weights]
|
|
187
|
+
process = 0.1
|
|
188
|
+
reasoning = 0.2
|
|
189
|
+
progress = 0.3
|
|
190
|
+
outcome = 0.4
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Convert Crafter trace format to SFT format with messages[] structure."""
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
def convert_trace_to_sft(trace: dict) -> dict:
|
|
9
|
+
"""Convert a single trace to SFT format."""
|
|
10
|
+
# Extract dialogue from trace
|
|
11
|
+
dialogue = trace.get("dialogue", [])
|
|
12
|
+
assistant = trace.get("assistant", {})
|
|
13
|
+
|
|
14
|
+
# Build messages list
|
|
15
|
+
messages = []
|
|
16
|
+
|
|
17
|
+
# Add dialogue history
|
|
18
|
+
for msg in dialogue:
|
|
19
|
+
messages.append({
|
|
20
|
+
"role": msg["role"],
|
|
21
|
+
"content": msg["content"]
|
|
22
|
+
})
|
|
23
|
+
|
|
24
|
+
# Add assistant response if present
|
|
25
|
+
if assistant:
|
|
26
|
+
content = assistant.get("content", "")
|
|
27
|
+
tool_calls = assistant.get("tool_calls", [])
|
|
28
|
+
|
|
29
|
+
# If there are tool calls, format them
|
|
30
|
+
if tool_calls:
|
|
31
|
+
# Convert tool calls to a simple text format for SFT
|
|
32
|
+
tool_text = "\n".join([
|
|
33
|
+
f"Tool: {tc['name']}\nArguments: {json.dumps(tc.get('arguments', {}))}"
|
|
34
|
+
for tc in tool_calls
|
|
35
|
+
])
|
|
36
|
+
content = f"{content}\n\n{tool_text}".strip()
|
|
37
|
+
|
|
38
|
+
messages.append({
|
|
39
|
+
"role": "assistant",
|
|
40
|
+
"content": content
|
|
41
|
+
})
|
|
42
|
+
|
|
43
|
+
return {"messages": messages}
|
|
44
|
+
|
|
45
|
+
def main() -> None:
    """CLI entry point: convert a trace JSONL file to SFT-format JSONL.

    Usage: python convert_traces_to_sft.py <input.jsonl> [output.jsonl]

    When no output path is given, writes ``<stem>_sft_format.jsonl`` next to
    the input. Exits with status 1 on missing/invalid arguments.
    """
    if len(sys.argv) < 2:
        print("Usage: python convert_traces_to_sft.py <input.jsonl> [output.jsonl]")
        sys.exit(1)

    input_path = Path(sys.argv[1])
    output_path = (
        Path(sys.argv[2])
        if len(sys.argv) > 2
        else input_path.with_name(f"{input_path.stem}_sft_format.jsonl")
    )

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        sys.exit(1)

    print(f"Converting {input_path} → {output_path}")

    converted = 0
    skipped = 0

    # Explicit encoding: JSONL is UTF-8 by convention; default encoding is
    # platform-dependent and can corrupt non-ASCII content on Windows.
    with open(input_path, encoding="utf-8") as f_in, open(output_path, "w", encoding="utf-8") as f_out:
        for line_no, line in enumerate(f_in, 1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not data errors;
                # previously these produced a spurious JSON-decode warning.
                continue
            try:
                trace = json.loads(stripped)
                sft_entry = convert_trace_to_sft(trace)

                # Only write if we have messages
                if sft_entry["messages"]:
                    f_out.write(json.dumps(sft_entry) + "\n")
                    converted += 1
                else:
                    skipped += 1
            except Exception as e:
                print(f"Warning: Skipping line {line_no}: {e}")
                skipped += 1

    print(f"✅ Converted {converted} entries, skipped {skipped}")
    print(f"Output: {output_path}")


if __name__ == "__main__":
    main()
|
|
84
|
+
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Crafter backend judge that calls the Synth judge API with inline rubric."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, List, TypedDict
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TraceMetadata(TypedDict, total=False):
    """Metadata for the trace.

    ``total=False``: every key is optional. Populated by ``judge()`` from the
    trajectory dict, with fallback defaults when a key is absent.
    """
    env_id: str  # environment identifier (judge() defaults to "crafter")
    policy_id: str  # policy identifier (judge() defaults to "crafter-react")
    length: int  # trajectory length (judge() falls back to len(steps))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class JudgeTracePayload(TypedDict):
    """Trace payload sent to backend judge."""
    event_history: List[Dict[str, Any]]  # one event per step: observation, tool_calls, reward, done, truncated, info
    markov_blanket_message_history: List[Dict[str, Any]]  # judge() currently sends this empty
    metadata: TraceMetadata  # env/policy identifiers and trajectory length
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class JudgeOptions(TypedDict, total=False):
    """Options for judge scoring. All keys optional (``total=False``)."""
    model: str  # judge model name, e.g. "openai/gpt-oss-120b"
    timeout_s: int  # request timeout in seconds
    event: bool  # whether to score each event individually
    outcome: bool  # whether to score the final outcome
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class TaskApp(TypedDict):
    """Task application metadata."""
    id: str  # task-app identifier, e.g. "grpo-crafter-task-app"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class JudgeScoreRequest(TypedDict):
    """Request to backend judge API (POSTed to .../judge/v1/score)."""
    policy_name: str  # name of the policy that produced the rollout
    task_app: TaskApp  # identifies the task application being judged
    trace: JudgeTracePayload  # rollout events + metadata to score
    rubric: Dict[str, Any]  # criteria blocks keyed by "event" / "outcome"
    options: JudgeOptions  # model, timeout, and event/outcome scoring flags
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Load rubric from file (cached at module level)
_RUBRIC_PATH = Path(__file__).parent.parent / "rubrics" / "crafter_backend_judge.json"
_RUBRIC: Dict[str, Any] | None = None


def _load_rubric() -> Dict[str, Any]:
    """Load and cache the judge rubric, falling back to an inline default.

    The rubric JSON must be a dict with an "outcome" list of criteria blocks
    (matching the RubricCriteriaBlock format). Validation uses explicit
    raises instead of ``assert``, which would be stripped under ``python -O``
    and silently allow a malformed rubric through.

    Returns:
        The cached rubric dict (file contents if valid, inline fallback
        otherwise). Subsequent calls return the same object.
    """
    global _RUBRIC
    if _RUBRIC is None:
        try:
            with open(_RUBRIC_PATH, encoding="utf-8") as f:
                loaded = json.load(f)
            # Validate into a local first so a half-validated value is never
            # cached in _RUBRIC (the original assigned before asserting).
            if not isinstance(loaded, dict):
                raise ValueError("Rubric must be a dict")
            if "outcome" not in loaded:
                raise ValueError("Rubric must have 'outcome' key")
            if not isinstance(loaded["outcome"], list):
                raise ValueError("Rubric 'outcome' must be a list")
            _RUBRIC = loaded
        except Exception as e:
            print(f"[crafter_backend_judge] Warning: Failed to load rubric from {_RUBRIC_PATH}: {e}")
            # Fallback inline rubric (matching RubricCriteriaBlock format)
            _RUBRIC = {
                "event": [],
                "outcome": [
                    {"id": "achievement_progression", "description": "Achievement progression", "weight": 0.35, "scale": "bounded"},
                    {"id": "resource_stockpile", "description": "Resource stockpile", "weight": 0.2, "scale": "bounded"},
                    {"id": "survival_state", "description": "Survival state", "weight": 0.2, "scale": "bounded"},
                    {"id": "failure_analysis", "description": "Failure analysis", "weight": 0.15, "scale": "bounded"},
                    {"id": "future_readiness", "description": "Future readiness", "weight": 0.1, "scale": "bounded"}
                ]
            }
    return _RUBRIC
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def judge(payload: Dict[str, Any], **kwargs: Any) -> float:
    """
    Call the Synth backend judge API to score a Crafter rollout.

    Args:
        payload: Dict with keys: seed, prompt, completion, metrics, response, trace
        **kwargs: Additional config (backend_url, model, timeout_s, etc.)

    Returns:
        float: Aggregate score from 0.0 to 1.0 (0.0 on any error — this is a
        best-effort scoring boundary and never raises to the caller).
    """
    try:
        # Extract configuration. Validation uses explicit raises rather than
        # `assert`: asserts are stripped under `python -O`, which would have
        # silently disabled every check below (and the AssertionError handler
        # the original relied on).
        backend_url = kwargs.get("backend_url", "http://localhost:8000/api")
        model = kwargs.get("model", "openai/gpt-oss-120b")
        timeout = kwargs.get("timeout_s", 45)

        if not isinstance(backend_url, str):
            raise ValueError("backend_url must be a string")
        if not isinstance(model, str):
            raise ValueError("model must be a string")
        # bool is a subclass of int, so exclude it explicitly.
        if isinstance(timeout, bool) or not isinstance(timeout, (int, float)):
            raise ValueError("timeout_s must be numeric")

        # Extract trajectory from response
        response_data = payload.get("response", {})
        if not isinstance(response_data, dict):
            raise ValueError("response must be a dict")

        trajectories = response_data.get("trajectories", [])
        if not isinstance(trajectories, list):
            raise ValueError("trajectories must be a list")

        if not trajectories:
            print("[crafter_backend_judge] No trajectories in response")
            return 0.0

        trajectory = trajectories[0]  # First trajectory
        if not isinstance(trajectory, dict):
            raise ValueError("trajectory must be a dict")

        # Load rubric
        rubric = _load_rubric()

        # Transform trajectory into JudgeTracePayload format
        steps = trajectory.get("steps", [])
        if not isinstance(steps, list):
            raise ValueError("trajectory steps must be a list")

        event_history: List[Dict[str, Any]] = []
        for idx, step in enumerate(steps):
            if not isinstance(step, dict):
                raise ValueError(f"step {idx} must be a dict")
            # Each step becomes an event
            event_history.append({
                "observation": step.get("obs", {}),
                "tool_calls": step.get("tool_calls", []),
                "reward": step.get("reward", 0.0),
                "done": step.get("done", False),
                "truncated": step.get("truncated", False),
                "info": step.get("info", {}),
            })

        # Add final observation - backend will extract this as outcome context
        final_data = trajectory.get("final", {})
        if final_data:
            if not isinstance(final_data, dict):
                raise ValueError("final data must be a dict")
            final_obs = final_data.get("observation", {})
            if not isinstance(final_obs, dict):
                raise ValueError("final observation must be a dict")

            event_history.append({
                "observation": final_obs,
                "reward": final_data.get("reward", 0.0),
                "done": final_data.get("done", True),
                "truncated": final_data.get("truncated", False),
                "info": final_data.get("info", {}),
            })

        # Build trace metadata
        metadata: TraceMetadata = {
            "env_id": trajectory.get("env_id", "crafter"),
            "policy_id": trajectory.get("policy_id", "crafter-react"),
            "length": trajectory.get("length", len(steps)),
        }

        # Build judge request with rubric included
        judge_request: JudgeScoreRequest = {
            "policy_name": "crafter-react",
            "task_app": {"id": "grpo-crafter-task-app"},
            "trace": {
                "event_history": event_history,
                "markov_blanket_message_history": [],
                "metadata": metadata,
            },
            "rubric": rubric,
            "options": {
                "model": model,
                "timeout_s": timeout,
                "event": False,  # Not scoring per-event
                "outcome": True,  # Score the final outcome
            }
        }

        # Call backend judge API
        with httpx.Client(timeout=timeout) as client:
            # Get API key from env
            api_key = os.environ.get("SYNTH_API_KEY") or os.environ.get("OPENAI_API_KEY")
            headers = {}
            if api_key:
                headers["Authorization"] = f"Bearer {api_key}"

            url = f"{backend_url.rstrip('/')}/judge/v1/score"

            # Debug: print request summary
            print(f"\n[crafter_backend_judge] Scoring trajectory with {len(event_history)} events")
            if event_history:
                last_obs = event_history[-1].get('observation', {})
                print(f" Final observation keys: {list(last_obs.keys())[:5]}...")

            response = client.post(url, json=judge_request, headers=headers)

            response.raise_for_status()
            result = response.json()
            if not isinstance(result, dict):
                raise ValueError("Response must be a dict")

            # Extract aggregate score
            aggregate_score = result.get("aggregate_score", 0.0)

            # Try outcome_review.total if aggregate_score not found
            if aggregate_score == 0.0 and "outcome_review" in result:
                outcome_review = result["outcome_review"]
                if isinstance(outcome_review, dict):
                    aggregate_score = outcome_review.get("total", 0.0)

            print(f" Backend judge score: {aggregate_score:.3f}\n")
            return float(aggregate_score)

    except httpx.HTTPStatusError as e:
        # (fixed: the original used an f-string with no placeholder here)
        print("\n[crafter_backend_judge] HTTP ERROR:")
        print(f" Status: {e.response.status_code}")
        print(f" Response: {e.response.text[:300]}\n")
        return 0.0
    except ValueError as e:
        print(f"[crafter_backend_judge] Validation error: {e}")
        return 0.0
    except Exception as e:
        print(f"[crafter_backend_judge] Unexpected error: {e}")
        return 0.0
|
|
220
|
+
|