synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +6 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -38
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +288 -39
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
- synth_ai/api/train/builders.py +99 -4
- synth_ai/api/train/cli.py +516 -26
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +23 -2
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +61 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/auth/credentials.py +119 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +94 -18
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +30 -158
- synth_ai/cli/deploy/__init__.py +43 -0
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +51 -1480
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -10
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +518 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +45 -9
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +40 -33
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +285 -3
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -1,43 +1,26 @@
|
|
|
1
1
|
# Evaluation config for gpt-4o-mini with vision
|
|
2
|
-
#
|
|
2
|
+
# Higher-quality teacher for Crafter SFT distillation
|
|
3
3
|
|
|
4
4
|
[eval]
|
|
5
|
-
|
|
6
|
-
provider = "openai" # Use OpenAI API
|
|
7
|
-
|
|
8
|
-
# Task app endpoint
|
|
5
|
+
app_id = "grpo-crafter-task-app"
|
|
9
6
|
task_app_url = "https://synth-laboratories--grpo-crafter-task-app.modal.run"
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
7
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
8
|
+
seeds = "200-299"
|
|
9
|
+
max_turns = 50
|
|
10
|
+
concurrency = 5
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "structured"
|
|
14
|
+
return_trace = true
|
|
15
|
+
|
|
16
|
+
[eval.env_config]
|
|
17
|
+
env_params = {max_steps_per_episode = 50}
|
|
18
|
+
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "openai"
|
|
21
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
22
|
+
temperature = 0.6
|
|
22
23
|
max_tokens = 512
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
collect_traces = true
|
|
26
|
-
trace_db = "traces/gpt4o_mini_vision/rollouts.db"
|
|
27
|
-
|
|
28
|
-
# Tools
|
|
24
|
+
use_vision = true
|
|
25
|
+
image_only_mode = false
|
|
29
26
|
use_tools = true
|
|
30
|
-
|
|
31
|
-
# Parallel rollouts
|
|
32
|
-
parallel_episodes = 5
|
|
33
|
-
|
|
34
|
-
[task]
|
|
35
|
-
name = "crafter"
|
|
36
|
-
environment = "crafter-classic"
|
|
37
|
-
|
|
38
|
-
# Task-specific settings
|
|
39
|
-
[task.config]
|
|
40
|
-
seed_start = 200
|
|
41
|
-
max_episode_length = 256
|
|
42
|
-
render_size = [64, 64] # 64x64 PNG images
|
|
43
|
-
|
|
@@ -1,45 +1,26 @@
|
|
|
1
|
-
# Evaluation config for gpt-4o-mini
|
|
2
|
-
# Collects
|
|
3
|
-
# Note: gpt-5-nano doesn't support tool calling yet, use gpt-4o-mini instead
|
|
1
|
+
# Evaluation config for gpt-4o-mini (vision)
|
|
2
|
+
# Collects traces for SFT training; legacy gpt-5-nano naming kept for convenience
|
|
4
3
|
|
|
5
4
|
[eval]
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
5
|
+
app_id = "grpo-crafter-task-app"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app.modal.run"
|
|
7
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
8
|
+
seeds = "0-99"
|
|
9
|
+
max_turns = 50
|
|
10
|
+
concurrency = 5
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "structured"
|
|
14
|
+
return_trace = true
|
|
15
|
+
|
|
16
|
+
[eval.env_config]
|
|
17
|
+
env_params = {max_steps_per_episode = 50}
|
|
18
|
+
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "openai"
|
|
21
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
23
22
|
temperature = 0.7
|
|
24
23
|
max_tokens = 512
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
collect_traces = true
|
|
28
|
-
trace_db = "traces/gpt5nano_vision/rollouts.db"
|
|
29
|
-
|
|
30
|
-
# Tools
|
|
24
|
+
use_vision = true
|
|
25
|
+
image_only_mode = false
|
|
31
26
|
use_tools = true
|
|
32
|
-
|
|
33
|
-
# Parallel rollouts (speeds up collection)
|
|
34
|
-
parallel_episodes = 5 # Run 5 episodes in parallel
|
|
35
|
-
|
|
36
|
-
[task]
|
|
37
|
-
name = "crafter"
|
|
38
|
-
environment = "crafter-classic"
|
|
39
|
-
|
|
40
|
-
# Task-specific settings
|
|
41
|
-
[task.config]
|
|
42
|
-
seed_start = 0
|
|
43
|
-
max_episode_length = 256
|
|
44
|
-
render_size = [64, 64] # 64x64 PNG images
|
|
45
|
-
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Evaluation config for Qwen3-VL vision rollouts
|
|
2
|
+
# Collects traces for SFT training via synth-ai hosted inference
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-crafter-task-app"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app.modal.run"
|
|
7
|
+
model = "Qwen/Qwen3-VL-8B-Instruct"
|
|
8
|
+
seeds = "100-199"
|
|
9
|
+
max_turns = 50
|
|
10
|
+
concurrency = 5
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "structured"
|
|
14
|
+
return_trace = true
|
|
15
|
+
|
|
16
|
+
[eval.env_config]
|
|
17
|
+
env_params = {max_steps_per_episode = 50}
|
|
18
|
+
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "synth"
|
|
21
|
+
model = "Qwen/Qwen3-VL-8B-Instruct"
|
|
22
|
+
temperature = 0.7
|
|
23
|
+
max_tokens = 512
|
|
24
|
+
use_vision = true
|
|
25
|
+
image_only_mode = false
|
|
26
|
+
use_tools = true
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
# Filter
|
|
2
|
-
#
|
|
1
|
+
# Filter Qwen3-VL vision traces for SFT training
|
|
2
|
+
# Mirrors the GPT-4o mini filter configuration for vision data
|
|
3
3
|
|
|
4
4
|
[filter]
|
|
5
|
-
input_db = "traces/
|
|
6
|
-
output_dir = "traces/
|
|
5
|
+
input_db = "traces/qwen3vl_vision/rollouts.db"
|
|
6
|
+
output_dir = "traces/qwen3vl_vision/sft"
|
|
7
7
|
|
|
8
8
|
# Quality filters
|
|
9
9
|
min_steps_per_episode = 5
|
|
@@ -47,4 +47,3 @@ val_file = "val.jsonl"
|
|
|
47
47
|
save_stats = true
|
|
48
48
|
stats_file = "filter_stats.json"
|
|
49
49
|
save_filtered_episode_ids = true
|
|
50
|
-
|
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
# Applies quality filters and exports to SFT JSONL format
|
|
3
3
|
|
|
4
4
|
[filter]
|
|
5
|
-
input_db = "traces/
|
|
6
|
-
output_dir = "traces/
|
|
5
|
+
input_db = "traces/gpt4omini_vision/rollouts.db"
|
|
6
|
+
output_dir = "traces/gpt4omini_vision/sft"
|
|
7
7
|
|
|
8
8
|
# Quality filters
|
|
9
9
|
min_steps_per_episode = 5 # Remove very short episodes
|
|
@@ -50,4 +50,3 @@ val_file = "val.jsonl"
|
|
|
50
50
|
save_stats = true
|
|
51
51
|
stats_file = "filter_stats.json"
|
|
52
52
|
save_filtered_episode_ids = true
|
|
53
|
-
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"""
|
|
3
3
|
Crafter agent using Qwen-VL models via synth-ai's hosted inference.
|
|
4
4
|
|
|
5
|
-
This demonstrates vision-language models (
|
|
5
|
+
This demonstrates vision-language models (Qwen3-VL family) playing Crafter
|
|
6
6
|
with image observations. The CrafterPolicy automatically detects vision capability
|
|
7
7
|
from the model name and includes base64-encoded PNG frames in the prompt.
|
|
8
8
|
|
|
@@ -12,7 +12,7 @@ Requirements:
|
|
|
12
12
|
|
|
13
13
|
Usage:
|
|
14
14
|
uv run python examples/qwen_vl/crafter_qwen_vl_agent.py \
|
|
15
|
-
--model Qwen/
|
|
15
|
+
--model Qwen/Qwen3-VL-8B-Instruct --seeds 10 --steps 20
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
from __future__ import annotations
|
|
@@ -142,7 +142,7 @@ async def _run_episode(
|
|
|
142
142
|
env = CrafterClassicEnvironment(task_instance)
|
|
143
143
|
wrapper = CrafterEnvironmentWrapper(env, seed=seed)
|
|
144
144
|
|
|
145
|
-
# Policy will auto-detect vision from model name (qwen-vl
|
|
145
|
+
# Policy will auto-detect vision from model name (qwen-vl and qwen3-vl tokens)
|
|
146
146
|
policy = CrafterPolicy(inference_url="synth://inference", model=model)
|
|
147
147
|
await policy.initialize({
|
|
148
148
|
"use_tools": True,
|
|
@@ -235,8 +235,8 @@ async def main() -> None:
|
|
|
235
235
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
236
236
|
parser.add_argument(
|
|
237
237
|
"--model",
|
|
238
|
-
default="Qwen/
|
|
239
|
-
help="Qwen-VL model name (e.g., Qwen/
|
|
238
|
+
default="Qwen/Qwen3-VL-8B-Instruct",
|
|
239
|
+
help="Qwen-VL model name (e.g., Qwen/Qwen3-VL-2B-Instruct, Qwen/Qwen3-VL-8B-Instruct)",
|
|
240
240
|
)
|
|
241
241
|
parser.add_argument("--seeds", type=int, default=10, help="Number of random seeds to evaluate")
|
|
242
242
|
parser.add_argument("--steps", type=int, default=20, help="Max steps per seed")
|
|
@@ -37,13 +37,13 @@ uv run python examples/qwen_vl/crafter_gpt5nano_agent.py \
|
|
|
37
37
|
|
|
38
38
|
echo ""
|
|
39
39
|
echo "======================================"
|
|
40
|
-
echo "2. Running
|
|
40
|
+
echo "2. Running Qwen3-VL-8B (synth-ai)"
|
|
41
41
|
echo "======================================"
|
|
42
42
|
uv run python examples/qwen_vl/crafter_qwen_vl_agent.py \
|
|
43
|
-
--model Qwen/
|
|
43
|
+
--model Qwen/Qwen3-VL-8B-Instruct \
|
|
44
44
|
--seeds $SEEDS \
|
|
45
45
|
--steps $STEPS \
|
|
46
|
-
--output-dir "$OUTPUT_DIR/
|
|
46
|
+
--output-dir "$OUTPUT_DIR/qwen3vl"
|
|
47
47
|
|
|
48
48
|
echo ""
|
|
49
49
|
echo "======================================"
|
|
@@ -53,10 +53,9 @@ echo ""
|
|
|
53
53
|
echo "gpt-5-nano (OpenAI):"
|
|
54
54
|
cat "$OUTPUT_DIR/gpt5nano/gpt5nano_summary.json" | python -m json.tool
|
|
55
55
|
echo ""
|
|
56
|
-
echo "
|
|
57
|
-
cat "$OUTPUT_DIR/
|
|
56
|
+
echo "Qwen3-VL-8B (synth-ai):"
|
|
57
|
+
cat "$OUTPUT_DIR/qwen3vl/qwen_vl_summary.json" | python -m json.tool
|
|
58
58
|
echo ""
|
|
59
59
|
echo "Frames saved in:"
|
|
60
60
|
echo " - $OUTPUT_DIR/gpt5nano/gpt5nano_frames/"
|
|
61
|
-
echo " - $OUTPUT_DIR/
|
|
62
|
-
|
|
61
|
+
echo " - $OUTPUT_DIR/qwen3vl/qwen_vl_frames/"
|
examples/rl/README.md
CHANGED
|
@@ -5,8 +5,8 @@ This example trains a reinforcement learning policy on single-step math problems
|
|
|
5
5
|
## Quick Commands
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
|
-
# Serve locally with tracing
|
|
9
|
-
uvx synth-ai
|
|
8
|
+
# Serve locally with tracing (uvicorn runtime)
|
|
9
|
+
uvx synth-ai deploy --runtime uvicorn math-single-step --port 8101 --env-file examples/rl/.env --trace traces/math
|
|
10
10
|
|
|
11
11
|
# Modal deployment
|
|
12
12
|
uvx synth-ai deploy --name synth-math-single-step --env-file examples/rl/.env
|
|
@@ -45,10 +45,10 @@ The task app is defined in `synth_ai/task/apps/math_single_step.py` and register
|
|
|
45
45
|
- `-0.5` if the tool call omits an answer or uses the wrong tool
|
|
46
46
|
- `-1.0` when no tool call is provided
|
|
47
47
|
|
|
48
|
-
|
|
48
|
+
Run locally (uvicorn runtime) with tracing to capture trajectories:
|
|
49
49
|
|
|
50
50
|
```bash
|
|
51
|
-
uvx synth-ai
|
|
51
|
+
uvx synth-ai deploy --runtime uvicorn math-single-step \
|
|
52
52
|
--port 8101 \
|
|
53
53
|
--env-file examples/rl/.env \
|
|
54
54
|
--trace traces/math \
|
|
@@ -162,7 +162,7 @@ For broader background on Synth task apps, CLI commands, and tracing, see the ne
|
|
|
162
162
|
|
|
163
163
|
|
|
164
164
|
uv run python examples/rl/run_eval.py --toml examples/rl/configs/eval_base_qwen.toml
|
|
165
|
-
uvx synth-ai
|
|
165
|
+
uvx synth-ai deploy --runtime uvicorn math-single-step \
|
|
166
166
|
--port 8101 \
|
|
167
167
|
--env-file examples/rl/.env \
|
|
168
168
|
--trace traces/math \
|
|
@@ -1,10 +1,15 @@
|
|
|
1
|
-
|
|
1
|
+
[algorithm]
|
|
2
|
+
type = "online"
|
|
3
|
+
method = "policy_gradient"
|
|
4
|
+
variety = "gspo"
|
|
2
5
|
|
|
3
6
|
[services]
|
|
4
7
|
task_url = "https://your-math-task.modal.run"
|
|
5
8
|
|
|
6
9
|
[model]
|
|
7
10
|
base = "Qwen/Qwen3-4B"
|
|
11
|
+
trainer_mode = "full"
|
|
12
|
+
label = "math-single-step-qwen3-4b"
|
|
8
13
|
|
|
9
14
|
[policy]
|
|
10
15
|
model = "Qwen/Qwen3-4B"
|
|
@@ -20,6 +25,8 @@ evaluation_split = "validation"
|
|
|
20
25
|
evaluation_episodes = 256
|
|
21
26
|
|
|
22
27
|
[training]
|
|
28
|
+
num_epochs = 1
|
|
29
|
+
iterations_per_epoch = 20
|
|
23
30
|
max_turns = 1
|
|
24
31
|
ops = ["agent", "env"]
|
|
25
32
|
batch_size = 128
|
|
@@ -33,5 +40,23 @@ learning_rate = 5e-6
|
|
|
33
40
|
gpu_type = "A10G"
|
|
34
41
|
gpu_count = 4
|
|
35
42
|
|
|
43
|
+
[topology]
|
|
44
|
+
type = "single_node_split"
|
|
45
|
+
gpus_for_vllm = 2
|
|
46
|
+
gpus_for_training = 2
|
|
47
|
+
gpus_for_ref = 0
|
|
48
|
+
tensor_parallel = 1
|
|
49
|
+
|
|
50
|
+
[rollout]
|
|
51
|
+
env_name = "math"
|
|
52
|
+
policy_name = "math-single-step"
|
|
53
|
+
max_turns = 1
|
|
54
|
+
episodes_per_batch = 256
|
|
55
|
+
|
|
56
|
+
[evaluation]
|
|
57
|
+
instances = 256
|
|
58
|
+
every_n_iters = 10
|
|
59
|
+
seeds = [0, 1, 2, 3, 4]
|
|
60
|
+
|
|
36
61
|
[tags]
|
|
37
62
|
experiment = "math_single_step"
|
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
type = "rl"
|
|
2
|
-
|
|
3
1
|
[algorithm]
|
|
4
2
|
type = "online"
|
|
5
3
|
method = "policy_gradient"
|
|
@@ -10,6 +8,8 @@ task_url = "http://localhost:8101"
|
|
|
10
8
|
|
|
11
9
|
[model]
|
|
12
10
|
base = "Qwen/Qwen3-1.7B"
|
|
11
|
+
trainer_mode = "full"
|
|
12
|
+
label = "math-single-step-qwen3-1.7b"
|
|
13
13
|
|
|
14
14
|
[policy]
|
|
15
15
|
model = "Qwen/Qwen3-1.7B"
|
|
@@ -25,6 +25,8 @@ evaluation_split = "validation"
|
|
|
25
25
|
evaluation_episodes = 50
|
|
26
26
|
|
|
27
27
|
[training]
|
|
28
|
+
num_epochs = 1
|
|
29
|
+
iterations_per_epoch = 20
|
|
28
30
|
max_turns = 1
|
|
29
31
|
ops = ["agent", "env"]
|
|
30
32
|
batch_size = 2
|
|
@@ -63,9 +65,11 @@ health_max_wait_s = 180
|
|
|
63
65
|
health_interval_ms = 300
|
|
64
66
|
|
|
65
67
|
[rollout]
|
|
68
|
+
env_name = "math"
|
|
66
69
|
policy_name = "math-single-step"
|
|
67
70
|
max_turns = 1
|
|
68
71
|
episodes_per_batch = 32 # group_size * batch_size
|
|
72
|
+
task_app_origin_rewards_only = true
|
|
69
73
|
|
|
70
74
|
[evaluation]
|
|
71
75
|
instances = 32
|
examples/rl/task_app/README.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
This directory hosts the legacy entrypoint for the math single-step task app. Prefer starting the app via:
|
|
4
4
|
|
|
5
5
|
```bash
|
|
6
|
-
uvx synth-ai
|
|
6
|
+
uvx synth-ai deploy --runtime uvicorn math-single-step --env-file examples/rl/.env --port 8101
|
|
7
7
|
```
|
|
8
8
|
|
|
9
9
|
If you need to run it directly (e.g., for Modal `modal deploy` compatibility), use:
|
|
@@ -19,4 +19,3 @@ Environment variables:
|
|
|
19
19
|
- `MATH_DATASET_DEFAULT_SPLIT`, `MATH_DATASET_VALIDATION_SPLIT`, `MATH_DATASET_TEST_SPLIT`
|
|
20
20
|
|
|
21
21
|
The task app enforces a single `math_submit` tool call per episode, enabling RL to reward correct final answers and penalise missing or malformed submissions.
|
|
22
|
-
|
|
@@ -800,7 +800,7 @@ def build_dataset() -> tuple[TaskDatasetRegistry, MathDataset]:
|
|
|
800
800
|
def _base_task_info() -> TaskInfo:
|
|
801
801
|
return TaskInfo(
|
|
802
802
|
task={"id": "math_single_step", "name": "Math Single Step", "version": "1.0.0"},
|
|
803
|
-
|
|
803
|
+
environment="math",
|
|
804
804
|
action_space={
|
|
805
805
|
"type": "tool_call",
|
|
806
806
|
"tools": [
|
|
@@ -891,7 +891,7 @@ def provide_task_instances(dataset: MathDataset, seeds: Sequence[int]) -> Iterab
|
|
|
891
891
|
sample = dataset.sample(split=DEFAULT_SPLIT, index=seed)
|
|
892
892
|
yield TaskInfo(
|
|
893
893
|
task=info.task,
|
|
894
|
-
|
|
894
|
+
environment=info.environment,
|
|
895
895
|
action_space=info.action_space,
|
|
896
896
|
observation={**info.observation, "sample_index": sample["index"]},
|
|
897
897
|
dataset={
|
examples/run_crafter_demo.sh
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# This script demonstrates a reactive agent in the Crafter environment
|
|
5
5
|
|
|
6
6
|
echo "🚀 Starting Crafter agent demo with Gemini 1.5 Flash..."
|
|
7
|
-
echo "Make sure the synth-ai service is running: uvx synth-ai
|
|
7
|
+
echo "Make sure the synth-ai service is running: uvx synth-ai deploy --runtime uvicorn"
|
|
8
8
|
echo ""
|
|
9
9
|
|
|
10
|
-
uv run python -m synth_ai.environments.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash
|
|
10
|
+
uv run python -m synth_ai.environments.examples.crafter_classic.agent_demos.test_crafter_react_agent --model gemini-1.5-flash
|
examples/sft/README.md
CHANGED
|
@@ -25,7 +25,7 @@ You can generate traces with the Crafter task app and then export them to SFT JS
|
|
|
25
25
|
|
|
26
26
|
```bash
|
|
27
27
|
# Serve the task app locally with tracing enabled (example)
|
|
28
|
-
uvx synth-ai
|
|
28
|
+
uvx synth-ai deploy --runtime uvicorn grpo-crafter \
|
|
29
29
|
--trace traces/v3 \
|
|
30
30
|
--trace-db traces/v3/task_app_traces_<timestamp>.db \
|
|
31
31
|
--port 8001
|
examples/swe/task_app/README.md
CHANGED
|
@@ -28,13 +28,13 @@ endpoints.
|
|
|
28
28
|
## Using the task app
|
|
29
29
|
|
|
30
30
|
```
|
|
31
|
-
uvx synth-ai
|
|
31
|
+
uvx synth-ai deploy --runtime uvicorn swe-mini --port 8020
|
|
32
32
|
```
|
|
33
33
|
|
|
34
34
|
### Recommended: non-interactive serve + .env
|
|
35
35
|
|
|
36
36
|
```bash
|
|
37
|
-
uvx synth-ai
|
|
37
|
+
uvx synth-ai deploy --runtime uvicorn swe-mini \
|
|
38
38
|
--port 8020 \
|
|
39
39
|
--env-file .env \
|
|
40
40
|
--trace traces/v3 \
|
|
@@ -60,6 +60,36 @@ Execution is handled by mini-swe's environment classes. Configure execution via
|
|
|
60
60
|
`SWE_MINI_ENVIRONMENT_CLASS` (`local`, `docker`, `singularity`, …) and pass
|
|
61
61
|
additional keyword arguments with `SWE_MINI_ENVIRONMENT_KWARGS` (JSON).
|
|
62
62
|
|
|
63
|
+
### Morph Cloud backend
|
|
64
|
+
|
|
65
|
+
The task app now ships with a Morph-powered environment class so you can run
|
|
66
|
+
mini-SWE rollouts in managed sandboxes. When `MORPH_API_KEY` is present the app
|
|
67
|
+
defaults to this backend automatically unless you override
|
|
68
|
+
`SWE_MINI_ENVIRONMENT_CLASS`.
|
|
69
|
+
|
|
70
|
+
1. Install the optional dependencies: `pip install "synth-ai[swe]"`.
|
|
71
|
+
2. Export your API key: `export MORPH_API_KEY=...`.
|
|
72
|
+
3. Point the task app at Morph by setting:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
export SWE_MINI_ENVIRONMENT_CLASS=morph
|
|
76
|
+
export SWE_MINI_ENVIRONMENT_KWARGS='{
|
|
77
|
+
"snapshot_id": "snap_your_pre_baked_swebench_image",
|
|
78
|
+
"cwd": "/workspace/swebench",
|
|
79
|
+
"env": {"PIP_PROGRESS_BAR": "off"},
|
|
80
|
+
"metadata": {"project": "synth-ai", "task": "swe-mini"}
|
|
81
|
+
}'
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
If you do not have a pre-built snapshot, provide `"image_id"` (defaults to
|
|
85
|
+
`morphvm-minimal`) along with resource hints (`"vcpus"`, `"memory_mb"`,
|
|
86
|
+
`"disk_mb"`). You can also set `SWE_MINI_MORPH_SNAPSHOT_ID` globally.
|
|
87
|
+
|
|
88
|
+
During cleanup the backend deletes the remote workspace and stops the Morph
|
|
89
|
+
instance automatically. All shell commands (including submissions) now execute
|
|
90
|
+
inside the Morph sandbox, enabling RL workflows that require persistent remote
|
|
91
|
+
compute.
|
|
92
|
+
|
|
63
93
|
### Tracing & SFT
|
|
64
94
|
|
|
65
95
|
Tracing works the same as Crafter; pass `--trace` / `--trace-db` to the CLI or
|
|
@@ -404,6 +404,10 @@ def _ensure_env_has_task(
|
|
|
404
404
|
if not instance_id:
|
|
405
405
|
raise ValueError("mini-swe rollout request requires env.config.instance_id")
|
|
406
406
|
config["task"] = dataset.get(instance_id)
|
|
407
|
+
env_cfg = dict(config.get("environment") or {})
|
|
408
|
+
if "environment_class" not in env_cfg and os.getenv("MORPH_API_KEY"):
|
|
409
|
+
env_cfg["environment_class"] = "morph"
|
|
410
|
+
config["environment"] = env_cfg
|
|
407
411
|
return env_spec.model_copy(update={"config": config})
|
|
408
412
|
|
|
409
413
|
|
|
@@ -46,7 +46,7 @@ class CrafterReActAgent:
|
|
|
46
46
|
"- Always return a single tool call: interact_many({actions: [...]})\n"
|
|
47
47
|
"- Use 2–5 actions per call; prefer long movement sequences to explore.\n"
|
|
48
48
|
"- Mix in 'do' only when it makes sense (tree, stone, animal, enemy nearby).\n"
|
|
49
|
-
"
|
|
49
|
+
"\n"
|
|
50
50
|
"Available actions: noop, move_up, move_down, move_left, move_right, do (interact), sleep, "
|
|
51
51
|
"place_stone, place_table, place_furnace, place_plant, make_wood_pickaxe, make_stone_pickaxe, "
|
|
52
52
|
"make_iron_pickaxe, make_wood_sword, make_stone_sword, make_iron_sword\n"
|
|
@@ -18,6 +18,7 @@ from typing import Any
|
|
|
18
18
|
from minisweagent.environments import get_environment
|
|
19
19
|
from synth_ai.environments.environment.tools import EnvToolCall
|
|
20
20
|
|
|
21
|
+
from examples.swe.task_app.morph_backend import MorphSandboxBackend
|
|
21
22
|
from .shared import summarise_history
|
|
22
23
|
from .tools import TOOLS_SCHEMA
|
|
23
24
|
|
|
@@ -25,8 +26,9 @@ logger = logging.getLogger(__name__)
|
|
|
25
26
|
|
|
26
27
|
|
|
27
28
|
def _environment_type_from_config(config: dict[str, Any]) -> str:
|
|
29
|
+
default = "morph" if os.getenv("MORPH_API_KEY") else "local"
|
|
28
30
|
value = (config or {}).get("environment_class") or os.getenv(
|
|
29
|
-
"SWE_MINI_ENVIRONMENT_CLASS",
|
|
31
|
+
"SWE_MINI_ENVIRONMENT_CLASS", default
|
|
30
32
|
)
|
|
31
33
|
return str(value).strip() or "local"
|
|
32
34
|
|
|
@@ -91,6 +93,7 @@ class MiniSweEnvironmentWrapper:
|
|
|
91
93
|
self._local_workspace_dir: Path | None = None
|
|
92
94
|
self._remote_workspace: str | None = None
|
|
93
95
|
self._cleanup_workspace = False
|
|
96
|
+
self._using_morph_backend = False
|
|
94
97
|
|
|
95
98
|
if self.environment_type == "local":
|
|
96
99
|
workspace = self._prepare_local_workspace(kwargs)
|
|
@@ -117,11 +120,11 @@ class MiniSweEnvironmentWrapper:
|
|
|
117
120
|
timeout = self.env_config.get("timeout")
|
|
118
121
|
if timeout and "timeout" not in kwargs:
|
|
119
122
|
kwargs["timeout"] = int(timeout)
|
|
120
|
-
if self.repo_url and "image" not in kwargs:
|
|
123
|
+
if self.environment_type in {"docker", "bubblewrap"} and self.repo_url and "image" not in kwargs:
|
|
121
124
|
image = self.metadata.get("image_name") or os.getenv("SWE_MINI_DOCKER_IMAGE")
|
|
122
125
|
if image:
|
|
123
126
|
kwargs["image"] = image
|
|
124
|
-
if self.environment_type in {"docker", "bubblewrap"}:
|
|
127
|
+
if self.environment_type in {"docker", "bubblewrap", "morph"}:
|
|
125
128
|
remote_env = dict(kwargs.get("env") or {})
|
|
126
129
|
remote_env.setdefault("GIT_TERMINAL_PROMPT", "0")
|
|
127
130
|
kwargs["env"] = remote_env
|
|
@@ -131,13 +134,34 @@ class MiniSweEnvironmentWrapper:
|
|
|
131
134
|
self.environment_type,
|
|
132
135
|
kwargs,
|
|
133
136
|
)
|
|
134
|
-
self.
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
137
|
+
if self.environment_type == "morph":
|
|
138
|
+
morph_kwargs = dict(kwargs)
|
|
139
|
+
image_value = morph_kwargs.pop("image", None)
|
|
140
|
+
if image_value and "image_id" not in morph_kwargs:
|
|
141
|
+
morph_kwargs["image_id"] = image_value
|
|
142
|
+
timeout_value = morph_kwargs.pop("timeout", None)
|
|
143
|
+
if timeout_value is not None and "startup_timeout" not in morph_kwargs:
|
|
144
|
+
try:
|
|
145
|
+
morph_kwargs["startup_timeout"] = int(timeout_value)
|
|
146
|
+
except Exception:
|
|
147
|
+
logger.warning("Invalid timeout value for morph backend: %r", timeout_value)
|
|
148
|
+
metadata_override = morph_kwargs.pop("metadata", {}) or {}
|
|
149
|
+
metadata_payload = {
|
|
150
|
+
"app": "swe-mini",
|
|
151
|
+
"instance_id": self.instance_id,
|
|
152
|
+
}
|
|
153
|
+
metadata_payload.update({str(k): str(v) for k, v in dict(metadata_override).items()})
|
|
154
|
+
morph_kwargs["metadata"] = metadata_payload
|
|
155
|
+
self.env = MorphSandboxBackend(**morph_kwargs)
|
|
156
|
+
self._using_morph_backend = True
|
|
157
|
+
else:
|
|
158
|
+
self.env = get_environment(
|
|
159
|
+
{
|
|
160
|
+
"environment_class": self.environment_type,
|
|
161
|
+
**kwargs,
|
|
162
|
+
},
|
|
163
|
+
default_type="local",
|
|
164
|
+
)
|
|
141
165
|
|
|
142
166
|
if self.environment_type != "local":
|
|
143
167
|
self._bootstrap_remote_workspace()
|
|
@@ -181,6 +205,9 @@ class MiniSweEnvironmentWrapper:
|
|
|
181
205
|
with contextlib.suppress(Exception):
|
|
182
206
|
self.env.execute(f"rm -rf {shlex.quote(self._remote_workspace)}")
|
|
183
207
|
self._remote_workspace = None
|
|
208
|
+
if self._using_morph_backend and hasattr(self.env, "close"):
|
|
209
|
+
with contextlib.suppress(Exception):
|
|
210
|
+
self.env.close()
|
|
184
211
|
|
|
185
212
|
def _resolve_repo_url(self, metadata: dict[str, Any]) -> str | None:
|
|
186
213
|
candidates = [
|