synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +6 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -38
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +288 -39
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
- synth_ai/api/train/builders.py +99 -4
- synth_ai/api/train/cli.py +516 -26
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +23 -2
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +61 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/auth/credentials.py +119 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +94 -18
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +30 -158
- synth_ai/cli/deploy/__init__.py +43 -0
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +51 -1480
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -10
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +518 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +45 -9
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +40 -33
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +285 -3
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "grpo-crafter"
|
|
3
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
4
|
+
model = "Qwen/Qwen3-4B"
|
|
5
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
6
|
+
max_turns = 20
|
|
7
|
+
concurrency = 1
|
|
8
|
+
env_name = "crafter"
|
|
9
|
+
policy_name = "crafter-react"
|
|
10
|
+
trace_format = "structured"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "synth"
|
|
15
|
+
model = "Qwen/Qwen3-4B"
|
|
16
|
+
inference_url = "https://synth-laboratories-dev--learning-v2-service-fastapi-app.modal.run"
|
|
17
|
+
temperature = 0.6
|
|
18
|
+
top_p = 0.95
|
|
19
|
+
max_tokens = 2048
|
|
20
|
+
use_vision = false
|
|
21
|
+
image_only_mode = false
|
|
22
|
+
max_llm_calls = 10
|
|
23
|
+
|
|
24
|
+
[eval.env_config.env_params]
|
|
25
|
+
max_steps_per_episode = 20
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Evaluate the finetuned Qwen3-4B checkpoint on Crafter.
|
|
2
|
+
# Replace model with the fft: job id returned by the SFT run.
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-crafter"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
7
|
+
model = "fft:REPLACE-WITH-SFT-JOB-ID"
|
|
8
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
9
|
+
max_turns = 10
|
|
10
|
+
concurrency = 4
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "compact"
|
|
14
|
+
return_trace = false
|
|
15
|
+
|
|
16
|
+
[eval.policy_config]
|
|
17
|
+
provider = "synth"
|
|
18
|
+
model = "fft:REPLACE-WITH-SFT-JOB-ID"
|
|
19
|
+
temperature = 0.2
|
|
20
|
+
top_p = 0.8
|
|
21
|
+
max_tokens = 512
|
|
22
|
+
use_vision = true
|
|
23
|
+
image_only_mode = false
|
|
24
|
+
max_llm_calls = 10
|
|
25
|
+
tool_choice = "auto"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "grpo-crafter"
|
|
3
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
4
|
+
model = "peft:Qwen/Qwen3-4B:job_f774218e6c954517"
|
|
5
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
6
|
+
max_turns = 20
|
|
7
|
+
concurrency = 2
|
|
8
|
+
env_name = "crafter"
|
|
9
|
+
policy_name = "crafter-react"
|
|
10
|
+
trace_format = "structured"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "synth"
|
|
15
|
+
model = "peft:Qwen/Qwen3-4B:job_f774218e6c954517"
|
|
16
|
+
inference_url = "https://synth-laboratories-dev--learning-v2-service-fastapi-app.modal.run"
|
|
17
|
+
temperature = 0.2
|
|
18
|
+
top_p = 0.8
|
|
19
|
+
max_tokens = 1024
|
|
20
|
+
use_vision = false
|
|
21
|
+
image_only_mode = false
|
|
22
|
+
max_llm_calls = 10
|
|
23
|
+
tool_choice = "auto"
|
|
24
|
+
|
|
25
|
+
[eval.env_config.env_params]
|
|
26
|
+
max_steps_per_episode = 20
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[eval]
|
|
2
|
+
app_id = "grpo-crafter"
|
|
3
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
4
|
+
model = "qwen/qwen3-32b"
|
|
5
|
+
seeds = [ 0, 1, 2,]
|
|
6
|
+
max_turns = 10
|
|
7
|
+
concurrency = 1
|
|
8
|
+
env_name = "crafter"
|
|
9
|
+
policy_name = "crafter-react"
|
|
10
|
+
trace_format = "full"
|
|
11
|
+
return_trace = true
|
|
12
|
+
|
|
13
|
+
[eval.policy_config]
|
|
14
|
+
provider = "groq"
|
|
15
|
+
model = "qwen/qwen3-32b"
|
|
16
|
+
inference_url = "https://api.groq.com/openai"
|
|
17
|
+
temperature = 0.6
|
|
18
|
+
top_p = 0.95
|
|
19
|
+
max_tokens = 8192
|
|
20
|
+
use_vision = false
|
|
21
|
+
image_only_mode = false
|
|
22
|
+
max_llm_calls = 10
|
|
23
|
+
|
|
24
|
+
[eval.env_config.env_params]
|
|
25
|
+
max_steps_per_episode = 10
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Crafter rollout config for GPT-OSS-120B served from OpenAI-compatible APIs.
|
|
2
|
+
# Replace the task_app_url with your deployed Crafter task app URL.
|
|
3
|
+
# The run stores full traces so we can keep the LLM reasoning for fine-tuning.
|
|
4
|
+
|
|
5
|
+
[eval]
|
|
6
|
+
app_id = "grpo-crafter"
|
|
7
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
8
|
+
model = "openai/gpt-oss-120b"
|
|
9
|
+
seeds = [0, 1, 2]
|
|
10
|
+
max_turns = 10
|
|
11
|
+
concurrency = 1
|
|
12
|
+
env_name = "crafter"
|
|
13
|
+
policy_name = "crafter-react"
|
|
14
|
+
trace_format = "full"
|
|
15
|
+
return_trace = true
|
|
16
|
+
|
|
17
|
+
[eval.env_config]
|
|
18
|
+
env_params = { max_steps_per_episode = 10 }
|
|
19
|
+
|
|
20
|
+
[eval.policy_config]
|
|
21
|
+
provider = "groq"
|
|
22
|
+
model = "openai/gpt-oss-120b"
|
|
23
|
+
inference_url = "https://api.groq.com/openai"
|
|
24
|
+
temperature = 0.6
|
|
25
|
+
top_p = 0.9
|
|
26
|
+
max_tokens = 768
|
|
27
|
+
use_vision = false
|
|
28
|
+
image_only_mode = false
|
|
29
|
+
max_llm_calls = 10
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Filters Crafter traces into an instruction-tuning dataset.
|
|
2
|
+
# Assumes you stored rollouts in traces/v3/crafter_blog.db via `uvx synth-ai eval`.
|
|
3
|
+
|
|
4
|
+
[filter]
|
|
5
|
+
db = "sqlite+libsql://http://127.0.0.1:8080"
|
|
6
|
+
output = "examples/blog_posts/warming_up_to_rl/ft_data/crafter_blog_high_reward.jsonl"
|
|
7
|
+
min_official_score = 0.1
|
|
8
|
+
models = ["qwen/qwen3-32b", "openai/gpt-oss-120b"]
|
|
9
|
+
shuffle = true
|
|
10
|
+
shuffle_seed = 42
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Example RL config with smoke testing enabled
|
|
2
|
+
# This config demonstrates auto-starting task app and sqld for easy smoke testing
|
|
3
|
+
|
|
4
|
+
type = "rl"
|
|
5
|
+
|
|
6
|
+
# Smoke testing configuration - AUTO-STARTS services in background!
|
|
7
|
+
[smoke]
|
|
8
|
+
# Auto-start the task app server
|
|
9
|
+
task_app_name = "grpo-crafter" # Your task app name (use "synth-ai task-app list" to see available apps)
|
|
10
|
+
task_app_port = 8765
|
|
11
|
+
task_app_env_file = ".env" # Required for this task app
|
|
12
|
+
task_app_force = true # Kill any existing process on this port
|
|
13
|
+
|
|
14
|
+
# Auto-start sqld for tracing
|
|
15
|
+
sqld_auto_start = true
|
|
16
|
+
sqld_db_path = "./traces/local.db"
|
|
17
|
+
sqld_hrana_port = 8080
|
|
18
|
+
sqld_http_port = 8081
|
|
19
|
+
|
|
20
|
+
# Test parameters
|
|
21
|
+
env_name = "crafter"
|
|
22
|
+
policy_name = "crafter-react"
|
|
23
|
+
max_steps = 10
|
|
24
|
+
policy = "gpt-5-nano" # Use gpt-5-nano policy with mock backend
|
|
25
|
+
model = "gpt-4o-mini" # Real model to use via OpenAI
|
|
26
|
+
mock_backend = "openai" # Use OpenAI backend for real inference and tool calls
|
|
27
|
+
return_trace = true
|
|
28
|
+
use_mock = true # Use mock proxy that routes to OpenAI
|
|
29
|
+
|
|
30
|
+
# RL Training Configuration (used by actual training, not smoke tests)
|
|
31
|
+
[algorithm]
|
|
32
|
+
type = "online"
|
|
33
|
+
method = "policy_gradient"
|
|
34
|
+
variety = "gspo"
|
|
35
|
+
|
|
36
|
+
[policy]
|
|
37
|
+
model_name = "Qwen/Qwen3-4B"
|
|
38
|
+
trainer_mode = "full"
|
|
39
|
+
label = "crafter-rl-demo"
|
|
40
|
+
|
|
41
|
+
[compute]
|
|
42
|
+
gpu_type = "H100"
|
|
43
|
+
gpu_count = 2
|
|
44
|
+
|
|
45
|
+
[compute.topology]
|
|
46
|
+
type = "single_node_split"
|
|
47
|
+
gpus_for_vllm = 1
|
|
48
|
+
gpus_for_training = 1
|
|
49
|
+
|
|
50
|
+
[services]
|
|
51
|
+
task_url = "http://localhost:8765"
|
|
52
|
+
|
|
53
|
+
[rollout]
|
|
54
|
+
env_name = "crafter"
|
|
55
|
+
policy_name = "crafter-react"
|
|
56
|
+
max_turns = 10
|
|
57
|
+
episodes_per_batch = 16
|
|
58
|
+
max_concurrent_rollouts = 4
|
|
59
|
+
task_app_origin_rewards_only = true
|
|
60
|
+
|
|
61
|
+
[training]
|
|
62
|
+
num_epochs = 1
|
|
63
|
+
iterations_per_epoch = 10
|
|
64
|
+
max_turns = 10
|
|
65
|
+
batch_size = 4
|
|
66
|
+
group_size = 4
|
|
67
|
+
learning_rate = 5e-5
|
|
68
|
+
weight_sync_interval = 1
|
|
69
|
+
log_interval = 1
|
|
70
|
+
|
|
71
|
+
[evaluation]
|
|
72
|
+
instances = 2
|
|
73
|
+
every_n_iters = 1
|
|
74
|
+
seeds = [0, 1]
|
|
75
|
+
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Continue training the finetuned Crafter policy with GRPO-style RL.
|
|
2
|
+
# Fill in task_url with your deployed task app and set model.source to the
|
|
3
|
+
# finetuned model id returned by `uvx synth-ai train --type sft`.
|
|
4
|
+
|
|
5
|
+
type = "rl"
|
|
6
|
+
|
|
7
|
+
# [smoke] section is OPTIONAL and only used by `synth-ai smoke` command for local testing.
|
|
8
|
+
# This section is completely IGNORED by the RL trainer and will not affect training jobs.
|
|
9
|
+
# It allows you to quickly test your task app without passing many CLI arguments:
|
|
10
|
+
# uvx synth-ai smoke --config this-file.toml
|
|
11
|
+
# All values are optional; CLI args override TOML values.
|
|
12
|
+
[smoke]
|
|
13
|
+
task_url = "https://synth-laboratories--crafter-blogpost-fastapi-app-dev.modal.run"
|
|
14
|
+
env_name = "crafter"
|
|
15
|
+
policy_name = "crafter-react"
|
|
16
|
+
max_steps = 10
|
|
17
|
+
policy = "mock" # mock, gpt-5-nano, openai, groq
|
|
18
|
+
model = "gpt-5-nano"
|
|
19
|
+
mock_backend = "openai" # synthetic or openai
|
|
20
|
+
mock_port = 0 # 0 = auto-assign
|
|
21
|
+
return_trace = true
|
|
22
|
+
use_mock = true
|
|
23
|
+
|
|
24
|
+
[algorithm]
|
|
25
|
+
type = "online"
|
|
26
|
+
method = "policy_gradient"
|
|
27
|
+
variety = "gspo"
|
|
28
|
+
|
|
29
|
+
[services]
|
|
30
|
+
task_url = "https://synth-laboratories--crafter-blogpost-fastapi-app-dev.modal.run"
|
|
31
|
+
judge_url = "https://synth-backend-dev-docker.onrender.com/api"
|
|
32
|
+
|
|
33
|
+
[compute]
|
|
34
|
+
gpu_type = "H200"
|
|
35
|
+
gpu_count = 2
|
|
36
|
+
[compute.topology]
|
|
37
|
+
reference_placement = "none"
|
|
38
|
+
|
|
39
|
+
[topology]
|
|
40
|
+
type = "single_node_split"
|
|
41
|
+
reference_placement = "none"
|
|
42
|
+
gpus_for_vllm = 1
|
|
43
|
+
gpus_for_training = 1
|
|
44
|
+
gpus_for_ref = 0
|
|
45
|
+
tensor_parallel = 1
|
|
46
|
+
|
|
47
|
+
[vllm]
|
|
48
|
+
tensor_parallel_size = 1
|
|
49
|
+
max_model_len = 8192
|
|
50
|
+
|
|
51
|
+
[reference]
|
|
52
|
+
placement = "none"
|
|
53
|
+
|
|
54
|
+
[model]
|
|
55
|
+
base = "Qwen/Qwen3-4B"
|
|
56
|
+
trainer_mode = "lora"
|
|
57
|
+
label = "crafter-rl-baseline"
|
|
58
|
+
|
|
59
|
+
[rollout]
|
|
60
|
+
env_name = "crafter"
|
|
61
|
+
policy_name = "crafter-react"
|
|
62
|
+
max_turns = 10
|
|
63
|
+
episodes_per_batch = 20
|
|
64
|
+
max_concurrent_rollouts = 8
|
|
65
|
+
rubric_rewards_only = false
|
|
66
|
+
task_app_origin_rewards_only = true
|
|
67
|
+
|
|
68
|
+
[evaluation]
|
|
69
|
+
instances = 100
|
|
70
|
+
every_n_iters = 20
|
|
71
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]
|
|
72
|
+
|
|
73
|
+
[training]
|
|
74
|
+
num_epochs = 1
|
|
75
|
+
iterations_per_epoch = 1
|
|
76
|
+
max_turns = 10
|
|
77
|
+
batch_size = 2
|
|
78
|
+
group_size = 2
|
|
79
|
+
learning_rate = 5e-6
|
|
80
|
+
weight_sync_interval = 1
|
|
81
|
+
log_interval = 1
|
|
82
|
+
max_completion_tokens = 256
|
|
83
|
+
async_semaphore_max = 4
|
|
84
|
+
|
|
85
|
+
[training.weight_sync]
|
|
86
|
+
enable = true
|
|
87
|
+
targets = ["policy"]
|
|
88
|
+
weight_sync_interval = 1
|
|
89
|
+
|
|
90
|
+
[rubric]
|
|
91
|
+
enabled = false
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Full-finetune Qwen3-4B on filtered Crafter traces.
|
|
2
|
+
# Update the `data` path once `uvx synth-ai filter` produces your JSONL.
|
|
3
|
+
|
|
4
|
+
[algorithm]
|
|
5
|
+
type = "offline"
|
|
6
|
+
method = "sft"
|
|
7
|
+
variety = "fft"
|
|
8
|
+
|
|
9
|
+
[job]
|
|
10
|
+
model = "Qwen/Qwen3-4B"
|
|
11
|
+
data = "examples/blog_posts/warming_up_to_rl/ft_data/crafter_blog_high_reward.jsonl"
|
|
12
|
+
poll_seconds = 1800
|
|
13
|
+
|
|
14
|
+
[compute]
|
|
15
|
+
gpu_type = "H100"
|
|
16
|
+
gpu_count = 4
|
|
17
|
+
nodes = 1
|
|
18
|
+
|
|
19
|
+
[data.topology]
|
|
20
|
+
container_count = 4
|
|
21
|
+
|
|
22
|
+
[training]
|
|
23
|
+
mode = "full_finetune"
|
|
24
|
+
use_qlora = false
|
|
25
|
+
|
|
26
|
+
[hyperparameters]
|
|
27
|
+
n_epochs = 2
|
|
28
|
+
world_size = 4
|
|
29
|
+
sequence_length = 2048
|
|
30
|
+
per_device_batch = 2
|
|
31
|
+
gradient_accumulation_steps = 64
|
|
32
|
+
learning_rate = 8e-6
|
|
33
|
+
warmup_ratio = 0.03
|
|
34
|
+
|
|
35
|
+
[hyperparameters.parallelism]
|
|
36
|
+
use_deepspeed = true
|
|
37
|
+
deepspeed_stage = 3
|
|
38
|
+
fsdp = false
|
|
39
|
+
bf16 = true
|
|
40
|
+
fp16 = false
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""Warming Up to RL baseline for Crafter.
|
|
2
|
+
|
|
3
|
+
This baseline demonstrates how to evaluate an LLM agent on the Crafter survival game
|
|
4
|
+
without requiring a deployed task app. This is the recommended starting point for coding
|
|
5
|
+
agents to get a baseline score before making changes.
|
|
6
|
+
|
|
7
|
+
Quick Start:
|
|
8
|
+
# Run a quick 3-task baseline
|
|
9
|
+
uvx synth-ai baseline warming_up_to_rl --split train --seeds 0,1,2
|
|
10
|
+
|
|
11
|
+
# Full train evaluation
|
|
12
|
+
uvx synth-ai baseline warming_up_to_rl --split train
|
|
13
|
+
|
|
14
|
+
# Compare models
|
|
15
|
+
uvx synth-ai baseline warming_up_to_rl --model groq:openai/gpt-oss-20b
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
import crafter
|
|
25
|
+
CRAFTER_AVAILABLE = True
|
|
26
|
+
except ImportError:
|
|
27
|
+
CRAFTER_AVAILABLE = False
|
|
28
|
+
|
|
29
|
+
from synth_ai.baseline import BaselineConfig, BaselineTaskRunner, DataSplit, TaskResult
|
|
30
|
+
from synth_ai.types import EventReward, OutcomeReward
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class CrafterRunner(BaselineTaskRunner):
    """Task runner that plays one Crafter episode per seed with an LLM policy.

    On every step the model receives a short text observation and is asked to
    choose an action through the ``take_action`` tool. The outcome reward is
    the sum of the achievement values collected during the episode, and the
    task counts as a success when that sum is at least 3.
    """

    # Action names offered to the model. The position of a name in this tuple
    # is the integer handed to ``env.step``; index 0 ("noop") is the fallback
    # used whenever the model's answer cannot be mapped to a known action.
    _ACTIONS = (
        "noop", "move_left", "move_right", "move_up", "move_down",
        "do", "sleep", "place_stone", "place_table", "place_furnace",
        "place_plant", "make_wood_pickaxe", "make_stone_pickaxe",
        "make_iron_pickaxe", "make_wood_sword", "make_stone_sword",
        "make_iron_sword",
    )

    _SYSTEM_PROMPT = (
        "You are an expert at survival games. Use the take_action tool to survive and achieve goals in Crafter."
    )

    def __init__(self, policy_config: dict[str, Any], env_config: dict[str, Any]):
        """Store the configs and read the per-episode step budget.

        Args:
            policy_config: Model settings; ``model`` and ``temperature`` are
                read from it in ``run_task``.
            env_config: Environment settings; ``max_steps`` (default 1000)
                caps the number of environment steps per episode.
        """
        super().__init__(policy_config, env_config)
        self.max_steps = env_config.get("max_steps", 1000)

    @classmethod
    def _action_tool(cls) -> dict[str, Any]:
        """Build the ``take_action`` tool definition sent with every model call."""
        action_names = list(cls._ACTIONS)
        return {
            "type": "function",
            "function": {
                "name": "take_action",
                "description": "Take an action in the Crafter world",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "action": {
                            "type": "string",
                            "enum": action_names,
                            "description": f"Action to take. Available: {', '.join(action_names)}",
                        }
                    },
                    "required": ["action"],
                },
            },
        }

    @staticmethod
    def _extract_action(response: Any) -> Any:
        """Return the action requested by the first tool call, or ``"noop"``.

        A missing tool call, a malformed call structure, or arguments that are
        not a JSON object all fall back to ``"noop"`` so that one bad model
        reply costs a single step instead of ending the episode.
        """
        tool_calls = response.get("tool_calls")
        if not tool_calls:
            return "noop"
        try:
            args = json.loads(tool_calls[0]["function"]["arguments"])
        except (KeyError, IndexError, TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            return "noop"
        if not isinstance(args, dict):
            return "noop"
        return args.get("action", "noop")

    @staticmethod
    def _merge_achievements(achievements: dict[str, Any], reported: Any) -> None:
        """Fold the achievements reported by one step into ``achievements``.

        Mappings are merged as-is. Any other iterable is treated as a
        collection of unlocked achievement names, each recorded with value 1.
        NOTE(review): upstream ``crafter`` appears to report a set of names
        rather than a mapping -- confirm against the installed version.
        """
        if hasattr(reported, "keys"):
            achievements.update(reported)
            return
        for name in reported:
            achievements[name] = 1

    async def run_task(self, seed: int) -> TaskResult:
        """Run a single Crafter episode.

        Args:
            seed: Seed passed to the Crafter environment so that each seed in
                a data split selects its own reproducible world.

        Returns:
            A ``TaskResult`` carrying the achievement total as outcome reward,
            one ``EventReward`` per step with positive reward, and the number
            of completed steps. If the episode was cut short by an exception,
            the result metadata contains an ``"error"`` entry describing it.

        Raises:
            ImportError: If the ``crafter`` package is not installed.
        """
        if not CRAFTER_AVAILABLE:
            raise ImportError(
                "Crafter not installed. Install with: pip install crafter"
            )

        # Create environment; seeding it makes the task depend on ``seed``.
        env = crafter.Env(seed=seed)
        env.reset()

        # Initialize tracking
        event_rewards: list[EventReward] = []
        achievements: dict[str, Any] = {}
        step_count = 0
        error: str | None = None

        # Get model configuration
        from synth_ai.inference.client import InferenceClient

        client = InferenceClient()
        model = self.policy_config.get("model", "gpt-4o-mini")
        temperature = self.policy_config.get("temperature", 0.7)

        actions = self._ACTIONS
        action_tool = self._action_tool()

        # Run episode
        done = False
        while not done and step_count < self.max_steps:
            # Get observation (would include visual state in full implementation)
            obs_str = f"Crafter Step {step_count}\n"
            obs_str += f"Current achievements: {achievements}\n"
            obs_str += "What action should you take to survive and progress?"

            try:
                # Get action from model
                response = await client.generate(
                    model=model,
                    messages=[
                        {"role": "system", "content": self._SYSTEM_PROMPT},
                        {"role": "user", "content": obs_str},
                    ],
                    tools=[action_tool],
                    temperature=temperature,
                    max_tokens=100,
                )

                action_name = self._extract_action(response)
                # Unknown action names map to index 0 ("noop").
                action_idx = actions.index(action_name) if action_name in actions else 0

                # Take step
                _obs, reward, done, info = env.step(action_idx)

                # Update achievements
                if "achievements" in info:
                    self._merge_achievements(achievements, info["achievements"])

                # Track rewards
                if reward > 0:
                    event_rewards.append(
                        EventReward(
                            event_id=f"step_{step_count}",
                            reward=reward,
                            metadata={"action": action_name, "achievements": achievements.copy()},
                        )
                    )

                step_count += 1

            except Exception as exc:
                # Best effort: stop the episode and score what was collected so
                # far, but keep the failure visible instead of discarding it.
                error = f"{type(exc).__name__}: {exc}"
                break

        # Calculate outcome reward based on achievements
        total_achievements = sum(achievements.values())
        success = total_achievements >= 3  # At least 3 achievements

        result_metadata: dict[str, Any] = {"achievements": achievements}
        if error is not None:
            result_metadata["error"] = error

        return TaskResult(
            success=success,
            outcome_reward=OutcomeReward(
                reward=float(total_achievements),
                metadata={
                    "steps": step_count,
                    "achievements": achievements,
                    "seed": seed,
                },
            ),
            event_rewards=event_rewards,
            total_steps=step_count,
            metadata=result_metadata,
        )
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Baseline definition for the Crafter task. It is only created when the
# optional `crafter` import at the top of this module succeeded.
if CRAFTER_AVAILABLE:
    warming_up_to_rl_baseline = BaselineConfig(
        baseline_id="warming_up_to_rl",
        name="Warming Up to RL - Crafter",
        description="Crafter survival game baseline for comparing agent performance on RL tasks",
        task_runner=CrafterRunner,
        # Seeds 0-19 are train, 20-24 are val, 25-29 are test.
        splits={
            split_name: DataSplit(name=split_name, seeds=list(seed_range))
            for split_name, seed_range in (
                ("train", range(20)),
                ("val", range(20, 25)),
                ("test", range(25, 30)),
            )
        },
        default_policy_config=dict(model="gpt-4o-mini", temperature=0.7),
        default_env_config=dict(max_steps=1000),
        tags=["rl", "survival", "achievements", "blog-post"],
    )
|
|
@@ -6,7 +6,7 @@ method = "policy_gradient"
|
|
|
6
6
|
variety = "gspo"
|
|
7
7
|
|
|
8
8
|
[services]
|
|
9
|
-
# Replace with the Modal URL printed by `uvx synth-ai modal-serve grpo-crafter`
|
|
9
|
+
# Replace with the Modal URL printed by `uvx synth-ai deploy --runtime modal --modal-mode serve grpo-crafter`
|
|
10
10
|
task_url = "https://YOUR-MODAL-TASK-APP.modal.run"
|
|
11
11
|
|
|
12
12
|
[compute]
|
|
@@ -46,6 +46,7 @@ policy_name = "crafter-react"
|
|
|
46
46
|
max_concurrent_rollouts = 12
|
|
47
47
|
batches_per_step = 2
|
|
48
48
|
ops = ["agent", "env"]
|
|
49
|
+
task_app_origin_rewards_only = true
|
|
49
50
|
|
|
50
51
|
[evaluation]
|
|
51
52
|
instances = 10
|