synth-ai 0.2.16__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +25 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +42 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +41 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +5 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -4
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +66 -3
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +17 -49
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +13 -5
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +15 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +36 -5
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +134 -3
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +4 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +6 -3
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +5 -0
- synth_ai/api/train/builders.py +9 -3
- synth_ai/api/train/cli.py +125 -10
- synth_ai/api/train/configs/__init__.py +8 -1
- synth_ai/api/train/configs/rl.py +32 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/auth/credentials.py +119 -0
- synth_ai/cli/__init__.py +12 -4
- synth_ai/cli/commands/__init__.py +17 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/deploy/__init__.py +23 -0
- synth_ai/cli/commands/deploy/core.py +614 -0
- synth_ai/cli/commands/deploy/errors.py +72 -0
- synth_ai/cli/commands/deploy/validation.py +11 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1109 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +388 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +73 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +199 -0
- synth_ai/cli/commands/train/judge_validation.py +304 -0
- synth_ai/cli/commands/train/validation.py +443 -0
- synth_ai/cli/demo.py +2 -162
- synth_ai/cli/deploy/__init__.py +28 -0
- synth_ai/cli/deploy/core.py +5 -0
- synth_ai/cli/deploy/errors.py +23 -0
- synth_ai/cli/deploy/validation.py +5 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +58 -1487
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -11
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +469 -0
- synth_ai/streaming/streamer.py +301 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/validators.py +2 -2
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/utils/env.py +25 -18
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/modal.py +2 -2
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/METADATA +8 -3
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/RECORD +184 -109
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.17.dist-info}/top_level.txt +0 -0
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# Output: Markdown tables and JSON data (no plotting dependencies)
|
|
6
6
|
|
|
7
7
|
echo "🔍 Analyzing semantic map words from Crafter agent..."
|
|
8
|
-
echo "Make sure the synth-ai service is running: uvx synth-ai
|
|
8
|
+
echo "Make sure the synth-ai service is running: uvx synth-ai deploy --runtime uvicorn"
|
|
9
9
|
echo ""
|
|
10
10
|
|
|
11
11
|
cd synth_ai/environments/examples/crafter_classic/agent_demos/
|
|
@@ -14,4 +14,4 @@ cd synth_ai/environments/examples/crafter_classic/agent_demos/
|
|
|
14
14
|
python analyze_semantic_words_markdown.py --model gemini-1.5-flash --episodes 3 --max-turns 30
|
|
15
15
|
|
|
16
16
|
echo ""
|
|
17
|
-
echo "✅ Analysis complete! Check the generated markdown report and JSON files."
|
|
17
|
+
echo "✅ Analysis complete! Check the generated markdown report and JSON files."
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Pokémon VL: Vision-Language RL Pipeline
|
|
2
|
+
|
|
3
|
+
This playbook demonstrates end-to-end vision-language reinforcement learning on Pokémon Red using Synth AI's CLI tools. We follow the eval → collect data → SFT → RL → eval pipeline, but with vision models throughout.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
**Model**: Qwen3-VL-4B-Instruct (4B parameter vision-language model via Synth API)
|
|
8
|
+
**Environment**: Pokémon Red (Game Boy emulation with vision support)
|
|
9
|
+
**Benchmark**: Pallet Town progression task (leave bedroom → get starter → win first battle)
|
|
10
|
+
|
|
11
|
+
## Pipeline Steps
|
|
12
|
+
|
|
13
|
+
1. **Deploy Task App** - Host the Pokémon Red environment
|
|
14
|
+
2. **Collect Vision Rollouts** - Generate high-quality demonstrations using Qwen3-VL
|
|
15
|
+
3. **Filter Dataset** - Extract successful trajectories for supervised fine-tuning
|
|
16
|
+
4. **Fine-Tune Qwen3-4B VL** - Train vision-language model on filtered data
|
|
17
|
+
5. **Vision-Language RL** - Bootstrap RL training from SFT checkpoint
|
|
18
|
+
6. **Final Evaluation** - Compare SFT and RL performance
|
|
19
|
+
|
|
20
|
+
## Prerequisites
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
# Install dependencies
|
|
24
|
+
uv pip install -e .
|
|
25
|
+
|
|
26
|
+
# Setup authentication
|
|
27
|
+
uvx synth-ai setup
|
|
28
|
+
|
|
29
|
+
# Copy environment template
|
|
30
|
+
cp examples/blog_posts/pokemon_vl/.env.example .env
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# Export trace database path
|
|
37
|
+
export POKEMON_VL_TRACE_DB=traces/v3/pokemon_vl_blog.db
|
|
38
|
+
|
|
39
|
+
# 1. Deploy task app
|
|
40
|
+
uvx synth-ai deploy pokemon_red --runtime modal --name pokemon-vl-blog --env-file .env
|
|
41
|
+
|
|
42
|
+
# 2. Collect vision rollouts with Qwen3-VL
|
|
43
|
+
uvx synth-ai eval pokemon_red --config examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml --trace-db "${POKEMON_VL_TRACE_DB}"
|
|
44
|
+
|
|
45
|
+
# 3. Filter high-reward trajectories
|
|
46
|
+
uvx synth-ai filter --config examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml
|
|
47
|
+
|
|
48
|
+
# 4. Fine-tune Qwen3-4B VL
|
|
49
|
+
uvx synth-ai train --type sft --config examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml --env-file .env --poll
|
|
50
|
+
|
|
51
|
+
# 5. RL from SFT checkpoint (replace JOB_ID)
|
|
52
|
+
uvx synth-ai train --type rl --config examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml --env-file .env --poll
|
|
53
|
+
|
|
54
|
+
# 6. Evaluate final RL model
|
|
55
|
+
uvx synth-ai eval pokemon_red --config examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml --trace-db "${POKEMON_VL_TRACE_DB}"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Vision Features
|
|
59
|
+
|
|
60
|
+
- **Full Game Boy Frames**: Base64-encoded PNG screenshots (160x144 resolution)
|
|
61
|
+
- **Vision-Only Mode**: Pure image understanding without text state
|
|
62
|
+
- **Vision + Text Mode**: Combined visual and structured state information
|
|
63
|
+
- **Efficient Action Batching**: `execute_sequence` tool for 5-10 actions per inference call
|
|
64
|
+
|
|
65
|
+
## Expected Results
|
|
66
|
+
|
|
67
|
+
| Stage | Model | Mean Reward | Success Rate | Best Achievement |
|
|
68
|
+
|-------|-------|-------------|--------------|------------------|
|
|
69
|
+
| Initial | Qwen3-VL (vision) | ~150 | 60% | Win first battle |
|
|
70
|
+
| SFT | Qwen3-4B VL | ~200 | 75% | Win first battle + explore |
|
|
71
|
+
| RL | Qwen3-4B VL + RL | ~350 | 85% | Complete Pallet Town |
|
|
72
|
+
|
|
73
|
+
## Files
|
|
74
|
+
|
|
75
|
+
- `configs/` - All TOML configuration files
|
|
76
|
+
- `ft_data/` - Filtered datasets for fine-tuning
|
|
77
|
+
- `.env.example` - Environment variables template
|
|
78
|
+
|
|
79
|
+
## Vision Model Configuration
|
|
80
|
+
|
|
81
|
+
The vision models receive:
|
|
82
|
+
- **Input**: Game Boy screenshot + optional structured state (position, HP, party, etc.)
|
|
83
|
+
- **Output**: Sequence of button presses via `execute_sequence` tool
|
|
84
|
+
- **Action Space**: UP, DOWN, LEFT, RIGHT, A, B, START, SELECT with frame counts
|
|
85
|
+
|
|
86
|
+
## Reward Function
|
|
87
|
+
|
|
88
|
+
Dense rewards for Pallet Town progression:
|
|
89
|
+
- Leave bedroom (+20)
|
|
90
|
+
- Exit house (+30)
|
|
91
|
+
- Find Oak's lab (+40)
|
|
92
|
+
- Talk to Oak (+50)
|
|
93
|
+
- Get starter Pokémon (+100)
|
|
94
|
+
- Enter battle (+75)
|
|
95
|
+
- Deal damage (+50 per 10HP)
|
|
96
|
+
- Win battle (+150)
|
|
97
|
+
|
|
98
|
+
Total possible: ~700 points
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "pokemon_red"
|
|
3
|
+
task_app_url = "https://synth-laboratories--pokemon-vl-qwen-xml-fastapi-app.modal.run"
|
|
4
|
+
model = "Qwen/Qwen3-VL-8B-Instruct" # Vision-capable Qwen3-VL model
|
|
5
|
+
seeds = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
|
|
6
|
+
max_turns = 10
|
|
7
|
+
concurrency = 2
|
|
8
|
+
env_name = "pokemon_red"
|
|
9
|
+
policy_name = "pokemon_vl_qwen3_vl"
|
|
10
|
+
trace_format = "full"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "synth" # Use Synth internal API for vision models
|
|
15
|
+
model = "Qwen/Qwen3-VL-8B-Instruct" # Vision-capable Qwen3-VL model
|
|
16
|
+
inference_url = "http://localhost:8000/api/inference/v1/chat/completions"
|
|
17
|
+
temperature = 1.0 # Higher temperature to encourage tool calling
|
|
18
|
+
top_p = 0.95
|
|
19
|
+
max_tokens = 4096
|
|
20
|
+
use_vision = true
|
|
21
|
+
image_only_mode = false
|
|
22
|
+
max_llm_calls = 10
|
|
23
|
+
|
|
24
|
+
[eval.env_config.env_params]
|
|
25
|
+
max_steps_per_episode = 10
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "pokemon_red"
|
|
3
|
+
task_app_url = "http://127.0.0.1:8914"
|
|
4
|
+
model = "fft:REPLACE-WITH-RL-JOB-ID" # Update with final RL job ID
|
|
5
|
+
seeds = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]
|
|
6
|
+
max_turns = 15 # Allow more steps for trained model
|
|
7
|
+
concurrency = 3
|
|
8
|
+
env_name = "pokemon_red"
|
|
9
|
+
policy_name = "pokemon_vl_rl_final"
|
|
10
|
+
trace_format = "full"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "synth"
|
|
15
|
+
model = "fft:REPLACE-WITH-RL-JOB-ID" # Update with final RL job ID
|
|
16
|
+
temperature = 0.1 # Lower temperature for evaluation
|
|
17
|
+
top_p = 0.9
|
|
18
|
+
max_tokens = 4096
|
|
19
|
+
use_vision = true
|
|
20
|
+
image_only_mode = false
|
|
21
|
+
max_llm_calls = 15
|
|
22
|
+
|
|
23
|
+
[eval.env_config.env_params]
|
|
24
|
+
max_steps_per_episode = 15
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Filter high-quality vision-language rollouts for SFT training
|
|
2
|
+
# Assumes traces stored in pokemon_vl_blog.db via eval commands
|
|
3
|
+
|
|
4
|
+
[filter]
|
|
5
|
+
db = "traces/v3/pokemon_vl_blog.db"
|
|
6
|
+
output = "examples/blog_posts/pokemon_vl/ft_data/pokemon_vl_high_reward.jsonl"
|
|
7
|
+
min_official_score = 0.3 # Require at least 30% completion (Pallet Town progression)
|
|
8
|
+
models = ["Qwen/Qwen3-VL-4B-Instruct"] # Vision models used for rollouts
|
|
9
|
+
shuffle = true
|
|
10
|
+
shuffle_seed = 42
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Vision-Language RL: Continue training Qwen3-4B VL from SFT checkpoint
|
|
2
|
+
# Update task_url with deployed Modal task app URL
|
|
3
|
+
# Set model.source to the SFT job id from `uvx synth-ai train --type sft`
|
|
4
|
+
|
|
5
|
+
type = "rl"
|
|
6
|
+
|
|
7
|
+
[services]
|
|
8
|
+
task_url = "http://127.0.0.1:8914"
|
|
9
|
+
|
|
10
|
+
[compute]
|
|
11
|
+
gpu_type = "H100"
|
|
12
|
+
gpu_count = 8
|
|
13
|
+
|
|
14
|
+
[topology]
|
|
15
|
+
gpus_for_vllm = 4
|
|
16
|
+
gpus_for_training = 3
|
|
17
|
+
gpus_for_ref = 1
|
|
18
|
+
|
|
19
|
+
[vllm]
|
|
20
|
+
tensor_parallel_size = 4
|
|
21
|
+
|
|
22
|
+
[model]
|
|
23
|
+
source = "fft:REPLACE-WITH-SFT-JOB-ID" # Update with actual SFT job ID
|
|
24
|
+
label = "pokemon_vl_rl_blog"
|
|
25
|
+
supports_vision = true
|
|
26
|
+
|
|
27
|
+
[rollout]
|
|
28
|
+
max_turns = 10
|
|
29
|
+
episodes_per_batch = 64
|
|
30
|
+
|
|
31
|
+
[evaluation]
|
|
32
|
+
instances = 100
|
|
33
|
+
every_n_iters = 20
|
|
34
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
|
|
35
|
+
|
|
36
|
+
[training]
|
|
37
|
+
log_interval = 1
|
|
38
|
+
|
|
39
|
+
[training.weight_sync]
|
|
40
|
+
enable = true
|
|
41
|
+
targets = ["policy"]
|
|
42
|
+
weight_sync_interval = 1
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Vision-Language Supervised Fine-Tuning: Qwen3-4B VL on filtered Pokémon rollouts
|
|
2
|
+
# Update the `data` path once `uvx synth-ai filter` produces your JSONL
|
|
3
|
+
|
|
4
|
+
[algorithm]
|
|
5
|
+
type = "offline"
|
|
6
|
+
method = "sft"
|
|
7
|
+
variety = "fft"
|
|
8
|
+
|
|
9
|
+
[job]
|
|
10
|
+
model = "Qwen/Qwen3-VL-4B-Instruct" # Vision-enabled Qwen3-VL model
|
|
11
|
+
data = "../ft_data/pokemon_vl_high_reward.jsonl"
|
|
12
|
+
poll_seconds = 1800
|
|
13
|
+
|
|
14
|
+
[compute]
|
|
15
|
+
gpu_type = "H100"
|
|
16
|
+
gpu_count = 4
|
|
17
|
+
nodes = 1
|
|
18
|
+
|
|
19
|
+
[data.topology]
|
|
20
|
+
container_count = 4
|
|
21
|
+
|
|
22
|
+
[training]
|
|
23
|
+
mode = "full_finetune"
|
|
24
|
+
use_qlora = false
|
|
25
|
+
|
|
26
|
+
[hyperparameters]
|
|
27
|
+
n_epochs = 2
|
|
28
|
+
world_size = 4
|
|
29
|
+
sequence_length = 4096 # Longer for vision tokens + text
|
|
30
|
+
per_device_batch = 2
|
|
31
|
+
gradient_accumulation_steps = 64
|
|
32
|
+
learning_rate = 8e-6
|
|
33
|
+
warmup_ratio = 0.03
|
|
34
|
+
|
|
35
|
+
[hyperparameters.parallelism]
|
|
36
|
+
use_deepspeed = true
|
|
37
|
+
deepspeed_stage = 3
|
|
38
|
+
fsdp = false
|
|
39
|
+
bf16 = true
|
|
40
|
+
fp16 = false
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# Crafter: From Rollouts to RL with the Synth AI CLI
|
|
2
|
+
|
|
3
|
+
This playbook mirrors the original “Warming Up to RL” walkthrough, but swaps the bespoke scripts for the first–class `uvx synth-ai` helpers. Every step—from deploying the task app to filtering rollouts, fine-tuning, and bootstrapping RL—now uses the same CLI you’d reach for in production.
|
|
4
|
+
|
|
5
|
+
All commands assume you are inside the repository root and have `uv`/`uvx` available.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## 0. Prerequisites
|
|
10
|
+
|
|
11
|
+
1. Install dependencies and authenticate once:
|
|
12
|
+
```bash
|
|
13
|
+
uv pip install -e .
|
|
14
|
+
uvx synth-ai setup
|
|
15
|
+
```
|
|
16
|
+
The setup wizard writes the required `SYNTH_API_KEY`, `ENVIRONMENT_API_KEY`, and local `.env` helpers.
|
|
17
|
+
|
|
18
|
+
2. Copy the example secrets if you need a starter file:
|
|
19
|
+
```bash
|
|
20
|
+
cp examples/warming_up_to_rl/.env.example .env
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
3. Export the path we use for trace capture (optional but keeps things tidy):
|
|
24
|
+
```bash
|
|
25
|
+
export CRAFTER_TRACE_DB=traces/v3/crafter_blog.db
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## 1. Ship the Crafter Task App
|
|
31
|
+
|
|
32
|
+
Deploy the hosted Crafter environment once. The Modal URL that prints at the end is reused by eval, SFT, and RL.
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
uvx synth-ai deploy grpo-crafter \
|
|
36
|
+
--runtime modal \
|
|
37
|
+
--modal-mode serve \
|
|
38
|
+
--name crafter-blogpost \
|
|
39
|
+
--env-file .env
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
For local testing you can run:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uvx synth-ai deploy grpo-crafter \
|
|
46
|
+
--runtime uvicorn \
|
|
47
|
+
--port 8001 \
|
|
48
|
+
--trace traces/v3 \
|
|
49
|
+
--env-file .env
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Copy the Modal URL (e.g. `https://your-app.modal.run`) and replace the `task_app_url` placeholders inside every config under `examples/blog_posts/warming_up_to_rl/configs/`.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## 2. Collect High-Quality Rollouts
|
|
57
|
+
|
|
58
|
+
We lean on large teacher models to produce demonstrations. The configs in `configs/` already request full traces so we retain chain-of-thought.
|
|
59
|
+
|
|
60
|
+
Groq Qwen3-32B (text-only prompt):
|
|
61
|
+
```bash
|
|
62
|
+
uvx synth-ai eval grpo-crafter \
|
|
63
|
+
--config examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml \
|
|
64
|
+
--trace-db "${CRAFTER_TRACE_DB}"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
GPT-OSS-120B via Groq’s OpenAI-compatible endpoint (also text-only):
|
|
68
|
+
```bash
|
|
69
|
+
uvx synth-ai eval grpo-crafter \
|
|
70
|
+
--config examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml \
|
|
71
|
+
--trace-db "${CRAFTER_TRACE_DB}"
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Both configs disable image attachments and rely on the textual observation renderer (`format_observation`) so Groq stays within its supported modalities. If you want to try other models, keep `use_vision = false` unless the provider explicitly supports image inputs.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## 3. Filter Into an SFT Dataset
|
|
79
|
+
|
|
80
|
+
Once traces are stored in `CRAFTER_TRACE_DB`, trim to the crisp trajectories:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
uvx synth-ai filter \
|
|
84
|
+
--config examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
The output JSONL lands in `ft_data/crafter_blog_high_reward.jsonl`, ready for supervised fine-tuning.
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 4. Fine-Tune Qwen3-4B with `uvx synth-ai train`
|
|
92
|
+
|
|
93
|
+
Update the dataset path (and optionally hyperparameters) in `train_sft_qwen4b.toml`, then launch:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
uvx synth-ai train \
|
|
97
|
+
--type sft \
|
|
98
|
+
--config examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml \
|
|
99
|
+
--env-file .env \
|
|
100
|
+
--poll
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Capture the returned job id (it looks like `fft:Qwen/Qwen3-4B:job_xxxxx`). We reuse that identifier in the evaluation and RL configs.
|
|
104
|
+
At any time you can list recently minted checkpoints with:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uvx synth-ai status models
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The output table shows the canonical model name/ID alongside the source job.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## 5. Evaluate the Fine-Tuned Checkpoint
|
|
115
|
+
|
|
116
|
+
Replace both `REPLACE-WITH-SFT-JOB-ID` strings inside `eval_ft_qwen4b.toml`, then run:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uvx synth-ai eval grpo-crafter \
|
|
120
|
+
--config examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml \
|
|
121
|
+
--trace-db "${CRAFTER_TRACE_DB}"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
This provides a clean, CLI-native comparison between the teacher rollouts and the fine-tuned model.
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## 6. Kick Off RL from the Fine-Tuned Model
|
|
129
|
+
|
|
130
|
+
Point `train_rl_from_sft.toml` at the same Modal task app and set `model.source` to your SFT job id:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
uvx synth-ai train \
|
|
134
|
+
--type rl \
|
|
135
|
+
--config examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml \
|
|
136
|
+
--env-file .env \
|
|
137
|
+
--poll
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
The CLI streams rollout and judge metrics in real time. When the run finishes, you can re-use the Stage 5 config (substituting the RL job id) to quantify the uplift.
|
|
141
|
+
If you lose track of the produced RL label or want to confirm the latest status, run:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
uvx synth-ai status jobs
|
|
145
|
+
uvx synth-ai status models
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
The first command shows job completion state; the second surfaces model IDs you can plug into new eval configs.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 7. Where to Go Next
|
|
153
|
+
|
|
154
|
+
- The original `examples/warming_up_to_rl` folder still contains deeper experiments (auto-curricula, modal renderers, etc.).
|
|
155
|
+
- Add more `eval_*.toml` configs to compare alternative judges or reward shaping strategies.
|
|
156
|
+
- Plug the filtered dataset into `uvx synth-ai files upload` if you want to share it with a teammate without copying JSONL around.
|
|
157
|
+
|
|
158
|
+
This directory now holds everything a blog post needs: configs, output locations, and the CLI entrypoints to reproduce the Crafter SFT → RL pipeline end-to-end.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Evaluate the finetuned Qwen3-4B checkpoint on Crafter.
|
|
2
|
+
# Replace model with the fft: job id returned by the SFT run.
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-crafter"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
7
|
+
model = "fft:REPLACE-WITH-SFT-JOB-ID"
|
|
8
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
9
|
+
max_turns = 10
|
|
10
|
+
concurrency = 4
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "compact"
|
|
14
|
+
return_trace = false
|
|
15
|
+
|
|
16
|
+
[eval.policy_config]
|
|
17
|
+
provider = "synth"
|
|
18
|
+
model = "fft:REPLACE-WITH-SFT-JOB-ID"
|
|
19
|
+
temperature = 0.2
|
|
20
|
+
top_p = 0.8
|
|
21
|
+
max_tokens = 512
|
|
22
|
+
use_vision = true
|
|
23
|
+
image_only_mode = false
|
|
24
|
+
max_llm_calls = 10
|
|
25
|
+
tool_choice = "auto"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "grpo-crafter"
|
|
3
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
4
|
+
model = "qwen/qwen3-32b"
|
|
5
|
+
seeds = [ 0, 1, 2,]
|
|
6
|
+
max_turns = 10
|
|
7
|
+
concurrency = 1
|
|
8
|
+
env_name = "crafter"
|
|
9
|
+
policy_name = "crafter-react"
|
|
10
|
+
trace_format = "full"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "groq"
|
|
15
|
+
model = "qwen/qwen3-32b"
|
|
16
|
+
inference_url = "https://api.groq.com/openai"
|
|
17
|
+
temperature = 0.6
|
|
18
|
+
top_p = 0.95
|
|
19
|
+
max_tokens = 8192
|
|
20
|
+
use_vision = false
|
|
21
|
+
image_only_mode = false
|
|
22
|
+
max_llm_calls = 10
|
|
23
|
+
|
|
24
|
+
[eval.env_config.env_params]
|
|
25
|
+
max_steps_per_episode = 10
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Crafter rollout config for GPT-OSS-120B served from OpenAI-compatible APIs.
|
|
2
|
+
# Replace the task_app_url with your deployed Crafter task app URL.
|
|
3
|
+
# The run stores full traces so we can keep the LLM reasoning for fine-tuning.
|
|
4
|
+
|
|
5
|
+
[eval]
|
|
6
|
+
app_id = "grpo-crafter"
|
|
7
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
8
|
+
model = "openai/gpt-oss-120b"
|
|
9
|
+
seeds = [0, 1, 2]
|
|
10
|
+
max_turns = 10
|
|
11
|
+
concurrency = 1
|
|
12
|
+
env_name = "crafter"
|
|
13
|
+
policy_name = "crafter-react"
|
|
14
|
+
trace_format = "full"
|
|
15
|
+
return_trace = true
|
|
16
|
+
|
|
17
|
+
[eval.env_config]
|
|
18
|
+
env_params = { max_steps_per_episode = 10 }
|
|
19
|
+
|
|
20
|
+
[eval.policy_config]
|
|
21
|
+
provider = "groq"
|
|
22
|
+
model = "openai/gpt-oss-120b"
|
|
23
|
+
inference_url = "https://api.groq.com/openai"
|
|
24
|
+
temperature = 0.6
|
|
25
|
+
top_p = 0.9
|
|
26
|
+
max_tokens = 768
|
|
27
|
+
use_vision = false
|
|
28
|
+
image_only_mode = false
|
|
29
|
+
max_llm_calls = 10
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Filters Crafter traces into an instruction-tuning dataset.
|
|
2
|
+
# Assumes you stored rollouts in traces/v3/crafter_blog.db via `uvx synth-ai eval`.
|
|
3
|
+
|
|
4
|
+
[filter]
|
|
5
|
+
db = "traces/v3/crafter_blog.db"
|
|
6
|
+
output = "examples/blog_posts/warming_up_to_rl/ft_data/crafter_blog_high_reward.jsonl"
|
|
7
|
+
min_official_score = 0.1
|
|
8
|
+
models = ["qwen/qwen3-32b", "openai/gpt-oss-120b"]
|
|
9
|
+
shuffle = true
|
|
10
|
+
shuffle_seed = 42
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# Continue training the finetuned Crafter policy with GRPO-style RL.
|
|
2
|
+
# Fill in task_url with your deployed task app and set model.source to the
|
|
3
|
+
# finetuned model id returned by `uvx synth-ai train --type sft`.
|
|
4
|
+
|
|
5
|
+
type = "rl"
|
|
6
|
+
|
|
7
|
+
[services]
|
|
8
|
+
task_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
9
|
+
|
|
10
|
+
[compute]
|
|
11
|
+
gpu_type = "H100"
|
|
12
|
+
gpu_count = 8
|
|
13
|
+
|
|
14
|
+
[topology]
|
|
15
|
+
gpus_for_vllm = 4
|
|
16
|
+
gpus_for_training = 3
|
|
17
|
+
gpus_for_ref = 1
|
|
18
|
+
|
|
19
|
+
[vllm]
|
|
20
|
+
tensor_parallel_size = 4
|
|
21
|
+
|
|
22
|
+
[model]
|
|
23
|
+
source = "fft:REPLACE-WITH-SFT-JOB-ID"
|
|
24
|
+
label = "crafter-rl-blogpost"
|
|
25
|
+
|
|
26
|
+
[rollout]
|
|
27
|
+
max_turns = 10
|
|
28
|
+
episodes_per_batch = 64
|
|
29
|
+
|
|
30
|
+
[evaluation]
|
|
31
|
+
instances = 100
|
|
32
|
+
every_n_iters = 20
|
|
33
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
|
|
34
|
+
|
|
35
|
+
[training]
|
|
36
|
+
log_interval = 1
|
|
37
|
+
|
|
38
|
+
[training.weight_sync]
|
|
39
|
+
enable = true
|
|
40
|
+
targets = ["policy"]
|
|
41
|
+
weight_sync_interval = 1
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Full-finetune Qwen3-4B on filtered Crafter traces.
|
|
2
|
+
# Update the `data` path once `uvx synth-ai filter` produces your JSONL.
|
|
3
|
+
|
|
4
|
+
[algorithm]
|
|
5
|
+
type = "offline"
|
|
6
|
+
method = "sft"
|
|
7
|
+
variety = "fft"
|
|
8
|
+
|
|
9
|
+
[job]
|
|
10
|
+
model = "Qwen/Qwen3-4B"
|
|
11
|
+
data = "../ft_data/crafter_blog_high_reward.jsonl"
|
|
12
|
+
poll_seconds = 1800
|
|
13
|
+
|
|
14
|
+
[compute]
|
|
15
|
+
gpu_type = "H100"
|
|
16
|
+
gpu_count = 4
|
|
17
|
+
nodes = 1
|
|
18
|
+
|
|
19
|
+
[data.topology]
|
|
20
|
+
container_count = 4
|
|
21
|
+
|
|
22
|
+
[training]
|
|
23
|
+
mode = "full_finetune"
|
|
24
|
+
use_qlora = false
|
|
25
|
+
|
|
26
|
+
[hyperparameters]
|
|
27
|
+
n_epochs = 2
|
|
28
|
+
world_size = 4
|
|
29
|
+
sequence_length = 2048
|
|
30
|
+
per_device_batch = 2
|
|
31
|
+
gradient_accumulation_steps = 64
|
|
32
|
+
learning_rate = 8e-6
|
|
33
|
+
warmup_ratio = 0.03
|
|
34
|
+
|
|
35
|
+
[hyperparameters.parallelism]
|
|
36
|
+
use_deepspeed = true
|
|
37
|
+
deepspeed_stage = 3
|
|
38
|
+
fsdp = false
|
|
39
|
+
bf16 = true
|
|
40
|
+
fp16 = false
|
|
@@ -6,7 +6,7 @@ method = "policy_gradient"
|
|
|
6
6
|
variety = "gspo"
|
|
7
7
|
|
|
8
8
|
[services]
|
|
9
|
-
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
|
|
9
|
+
# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
|
|
10
10
|
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
|
|
11
11
|
|
|
12
12
|
[compute]
|