synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/__init__.py +16 -0
- examples/crafter_debug_render.py +23 -17
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
- examples/qwen_coder/configs/coder_lora_small.toml +58 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +64 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +18 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +38 -0
- examples/qwen_coder/validate_jsonl.py +59 -0
- examples/rl/configs/eval_base_qwen.toml +1 -1
- examples/rl/configs/rl_from_base_qwen17.toml +1 -1
- examples/rl/download_dataset.py +26 -10
- examples/rl/run_eval.py +53 -52
- examples/rl/run_rl_and_save.py +29 -12
- examples/rl/task_app/math_single_step.py +180 -41
- examples/rl/task_app/math_task_app.py +14 -6
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +117 -0
- examples/sft/generate_traces.py +162 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +105 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +571 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +618 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1079 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1869 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +137 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +277 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/analyze_trace_db.py +12 -10
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
- examples/warming_up_to_rl/export_trace_sft.py +218 -36
- examples/warming_up_to_rl/groq_test.py +15 -8
- examples/warming_up_to_rl/manage_secrets.py +29 -25
- examples/warming_up_to_rl/readme.md +9 -2
- examples/warming_up_to_rl/run_eval.py +137 -61
- examples/warming_up_to_rl/run_fft_and_save.py +131 -60
- examples/warming_up_to_rl/run_local_rollout.py +88 -39
- examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
- examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
- examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
- examples/warming_up_to_rl/run_rl_and_save.py +35 -12
- examples/warming_up_to_rl/run_rollout_remote.py +44 -19
- examples/warming_up_to_rl/task_app/README.md +6 -2
- examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
- synth/__init__.py +14 -0
- synth_ai/__init__.py +20 -4
- synth_ai/api/models/supported.py +376 -0
- synth_ai/api/train/builders.py +157 -26
- synth_ai/api/train/cli.py +213 -57
- synth_ai/api/train/config_finder.py +65 -5
- synth_ai/api/train/env_resolver.py +33 -15
- synth_ai/api/train/pollers.py +13 -4
- synth_ai/api/train/supported_algos.py +139 -0
- synth_ai/api/train/task_app.py +5 -3
- synth_ai/api/train/utils.py +33 -48
- synth_ai/cli/__init__.py +19 -4
- synth_ai/cli/_modal_wrapper.py +28 -0
- synth_ai/cli/_typer_patch.py +49 -0
- synth_ai/cli/balance.py +2 -3
- synth_ai/cli/calc.py +1 -1
- synth_ai/cli/demo.py +21 -6
- synth_ai/cli/recent.py +2 -2
- synth_ai/cli/rl_demo.py +77 -17
- synth_ai/cli/root.py +116 -39
- synth_ai/cli/status.py +2 -2
- synth_ai/cli/task_apps.py +1709 -243
- synth_ai/cli/traces.py +7 -4
- synth_ai/cli/turso.py +73 -0
- synth_ai/cli/watch.py +12 -18
- synth_ai/core/experiment.py +0 -2
- synth_ai/demo_registry.py +68 -31
- synth_ai/demos/core/cli.py +516 -194
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +64 -28
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
- synth_ai/environments/examples/bandit/engine.py +12 -4
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/examples/crafter_classic/environment.py +76 -1
- synth_ai/environments/reproducibility/tree.py +5 -6
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +10 -9
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/evals/base.py +0 -2
- synth_ai/handshake.py +11 -9
- synth_ai/http.py +1 -1
- synth_ai/http_client.py +43 -11
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +20 -6
- synth_ai/jobs/client.py +103 -78
- synth_ai/learning/__init__.py +41 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +121 -29
- synth_ai/learning/config.py +2 -40
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +4 -56
- synth_ai/learning/health.py +13 -7
- synth_ai/learning/jobs.py +43 -47
- synth_ai/{rl → learning/rl}/__init__.py +14 -5
- synth_ai/learning/rl/client.py +267 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -10
- synth_ai/{rl → learning/rl}/env_keys.py +45 -16
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -253
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +295 -0
- synth_ai/learning/sse.py +25 -26
- synth_ai/learning/validators.py +25 -24
- synth_ai/lm/__init__.py +21 -47
- synth_ai/task/__init__.py +26 -27
- synth_ai/task/apps/__init__.py +18 -19
- synth_ai/task/auth.py +35 -23
- synth_ai/task/client.py +15 -13
- synth_ai/task/contracts.py +37 -35
- synth_ai/task/datasets.py +9 -6
- synth_ai/task/errors.py +11 -10
- synth_ai/task/health.py +17 -11
- synth_ai/task/json.py +58 -24
- synth_ai/task/proxy.py +15 -14
- synth_ai/task/rubrics.py +22 -15
- synth_ai/task/server.py +43 -17
- synth_ai/task/tracing_utils.py +12 -7
- synth_ai/task/validators.py +0 -1
- synth_ai/task/vendors.py +5 -7
- synth_ai/tracing_v3/__init__.py +2 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/db_config.py +26 -1
- synth_ai/tracing_v3/decorators.py +18 -15
- synth_ai/tracing_v3/examples/basic_usage.py +3 -2
- synth_ai/tracing_v3/hooks.py +6 -4
- synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
- synth_ai/tracing_v3/replica_sync.py +1 -0
- synth_ai/tracing_v3/session_tracer.py +63 -16
- synth_ai/tracing_v3/storage/base.py +89 -1
- synth_ai/tracing_v3/storage/config.py +21 -8
- synth_ai/tracing_v3/storage/factory.py +10 -8
- synth_ai/tracing_v3/storage/utils.py +4 -2
- synth_ai/tracing_v3/turso/daemon.py +7 -2
- synth_ai/tracing_v3/turso/models.py +5 -2
- synth_ai/tracing_v3/turso/native_manager.py +1173 -0
- synth_ai/tracing_v3/utils.py +4 -3
- synth_ai/v0/api/__init__.py +8 -0
- synth_ai/v0/api/models/__init__.py +8 -0
- synth_ai/v0/api/models/supported.py +8 -0
- synth_ai/v0/config/__init__.py +15 -0
- synth_ai/v0/config/base_url.py +12 -0
- synth_ai/v0/lm/__init__.py +51 -0
- synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
- synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
- synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
- synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
- synth_ai/{lm → v0/lm}/config.py +6 -1
- synth_ai/{lm → v0/lm}/core/all.py +9 -9
- synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
- synth_ai/{lm → v0/lm}/core/main.py +19 -7
- synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
- synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
- synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
- synth_ai/{lm → v0/lm}/overrides.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
- synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
- synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
- synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
- synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
- synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
- synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
- synth_ai/v0/tracing/upload.py +32 -135
- synth_ai/v0/tracing_v3/__init__.py +10 -0
- synth_ai/v0/tracing_v3/abstractions.py +3 -0
- synth_ai/v0/tracing_v3/decorators.py +3 -0
- synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
- synth_ai/v0/tracing_v3/session_tracer.py +3 -0
- synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -264
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
- examples/common_old/backend.py +0 -21
- examples/evals_old/README.md +0 -98
- examples/evals_old/__init__.py +0 -6
- examples/evals_old/compare_models.py +0 -1037
- examples/evals_old/example_log.md +0 -145
- examples/evals_old/run_demo.sh +0 -126
- examples/evals_old/trace_analysis.py +0 -270
- examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
- examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
- examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
- examples/finetuning_old/synth_qwen_v1/README.md +0 -68
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
- examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
- examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
- examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
- examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
- examples/finetuning_old/synth_qwen_v1/util.py +0 -147
- examples/rl_old/task_app.py +0 -962
- examples/warming_up_to_rl/old/event_rewards.md +0 -234
- examples/warming_up_to_rl/old/notes.md +0 -73
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/experimental/synth_oss.py +0 -446
- synth_ai/install_sqld.sh +0 -40
- synth_ai/learning/filtering.py +0 -0
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -246
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/__init__.py +0 -25
- synth_ai/tracing_v3/turso/manager.py +0 -774
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev4.dist-info/METADATA +0 -131
- /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
- /synth_ai/{lm → v0/lm}/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
- /synth_ai/{lm → v0/lm}/injection.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
- /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/warmup.py +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[job]
|
|
2
|
+
model = "Qwen/Qwen3-0.6B"
|
|
3
|
+
# Optionally set here, but prefer passing --dataset at runtime
|
|
4
|
+
# data = "examples/sft/ft_data/crafter_traces.jsonl"
|
|
5
|
+
|
|
6
|
+
[compute]
|
|
7
|
+
gpu_type = "H100"
|
|
8
|
+
gpu_count = 1
|
|
9
|
+
nodes = 1
|
|
10
|
+
|
|
11
|
+
[data]
|
|
12
|
+
# Forwarded into metadata.effective_config
|
|
13
|
+
topology = {}
|
|
14
|
+
# Optional validation set if you have one locally
|
|
15
|
+
# validation_path = "examples/sft/ft_data/crafter_traces.val.jsonl"
|
|
16
|
+
|
|
17
|
+
[training]
|
|
18
|
+
mode = "lora"
|
|
19
|
+
use_qlora = true
|
|
20
|
+
|
|
21
|
+
[training.validation]
|
|
22
|
+
enabled = true
|
|
23
|
+
evaluation_strategy = "steps"
|
|
24
|
+
eval_steps = 50
|
|
25
|
+
save_best_model_at_end = true
|
|
26
|
+
metric_for_best_model = "val.loss"
|
|
27
|
+
greater_is_better = false
|
|
28
|
+
|
|
29
|
+
[hyperparameters]
|
|
30
|
+
n_epochs = 1
|
|
31
|
+
train_kind = "peft"
|
|
32
|
+
per_device_batch = 2
|
|
33
|
+
gradient_accumulation_steps = 32
|
|
34
|
+
sequence_length = 4096
|
|
35
|
+
learning_rate = 5e-6
|
|
36
|
+
warmup_ratio = 0.03
|
|
37
|
+
|
|
38
|
+
[hyperparameters.parallelism]
|
|
39
|
+
use_deepspeed = true
|
|
40
|
+
deepspeed_stage = 2
|
|
41
|
+
fsdp = false
|
|
42
|
+
bf16 = true
|
|
43
|
+
fp16 = false
|
|
44
|
+
activation_checkpointing = true
|
|
45
|
+
|
examples/sft/evaluate.py
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Evaluate a base or fine-tuned model on Crafter via the Task App rollout.
|
|
3
|
+
|
|
4
|
+
This mirrors the minimal evaluation loop: call `/rollout` for a set of seeds
|
|
5
|
+
and report outcome/step metrics. If tracing is enabled server-side, you can
|
|
6
|
+
use the exported sqlite DB for further analysis.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import asyncio
|
|
13
|
+
import os
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
from synth_ai.task import (
|
|
18
|
+
RolloutEnvSpec,
|
|
19
|
+
RolloutPolicySpec,
|
|
20
|
+
RolloutRecordConfig,
|
|
21
|
+
RolloutRequest,
|
|
22
|
+
TaskAppClient,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(slots=True)
class EvalArgs:
    """Validated CLI arguments for one Crafter evaluation run."""

    base_url: str            # Task App base URL (e.g. http://localhost:8001)
    api_key: str             # ENVIRONMENT_API_KEY used to authenticate against the Task App
    model: str               # base model name or "ft:<id>" to evaluate
    inference_url: str       # OpenAI-compatible inference endpoint forwarded to the policy
    inference_api_key: str   # API key for the inference endpoint (e.g. GROQ_API_KEY)
    seeds: list[int]         # environment seeds, one rollout per seed
    max_llm_calls: int       # number of (agent, env) op pairs per rollout
    timeout: float           # HTTP timeout in seconds for the TaskAppClient
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _ops(n: int) -> list[str]:
|
|
39
|
+
n = max(1, n)
|
|
40
|
+
ops: list[str] = []
|
|
41
|
+
for _ in range(n):
|
|
42
|
+
ops.extend(["agent", "env"]) # one LLM step followed by one env step
|
|
43
|
+
return ops
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _request(seed: int, a: EvalArgs) -> RolloutRequest:
    """Build the /rollout request for a single evaluation seed."""
    policy_config = {
        "model": a.model,
        "inference_url": a.inference_url,
        "api_key": a.inference_api_key,
    }
    record_cfg = RolloutRecordConfig(trajectories=True, return_trace=False, trace_format="compact")
    return RolloutRequest(
        run_id=f"eval-{seed}",
        env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
        policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
        ops=_ops(a.max_llm_calls),
        record=record_cfg,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
async def _eval_seed(client: TaskAppClient, seed: int, a: EvalArgs) -> dict[str, Any]:
    """Run one rollout for *seed* and return a compact metrics summary."""
    resp = await client.rollout(_request(seed, a))
    metrics = resp.metrics
    return {
        "seed": seed,
        "num_steps": metrics.num_steps,
        "episode_returns": metrics.episode_returns,
        "outcome_score": metrics.outcome_score,
    }
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def main() -> None:
    """Parse CLI args, roll out each seed against the Task App, and print a mean return."""
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--base-url", default=os.getenv("TASK_APP_URL", "http://localhost:8001"))
    p.add_argument("--api-key", default=os.getenv("ENVIRONMENT_API_KEY"))
    p.add_argument("--model", required=True, help="Base or ft:<id> to evaluate")
    p.add_argument("--inference-url", default=os.getenv("INFERENCE_URL", "https://api.groq.com/openai"))
    p.add_argument("--inference-api-key", default=os.getenv("GROQ_API_KEY"))
    p.add_argument("--seeds", default="0,1,2,3,4,5,6,7,8,9")
    p.add_argument("--max-llm-calls", type=int, default=10)
    p.add_argument("--timeout", type=float, default=60.0)
    args = p.parse_args()

    # "--seeds" is a comma-separated string; drop empty fragments (e.g. trailing commas).
    seeds = [int(s) for s in str(args.seeds).split(",") if s.strip()]
    a = EvalArgs(
        base_url=str(args.base_url).strip(),
        api_key=str(args.api_key or "").strip(),
        model=str(args.model).strip(),
        inference_url=str(args.inference_url).strip(),
        inference_api_key=str(args.inference_api_key or "").strip(),
        seeds=seeds,
        max_llm_calls=int(args.max_llm_calls),
        timeout=float(args.timeout),
    )
    # Both keys are mandatory: one for the Task App, one for the inference provider.
    if not a.api_key:
        raise SystemExit("ENVIRONMENT_API_KEY is required")
    if not a.inference_api_key:
        raise SystemExit("Inference API key (e.g., GROQ_API_KEY) is required")

    results: list[dict[str, Any]] = []
    async with TaskAppClient(a.base_url, api_key=a.api_key, timeout=a.timeout) as client:
        # Seeds are evaluated sequentially; each result is printed as it arrives.
        for seed in a.seeds:
            r = await _eval_seed(client, seed, a)
            results.append(r)
            print(f"seed={seed} return={r.get('episode_returns')}")

    # Simple aggregate
    # Only the first episode return per rollout is averaged; non-numeric or
    # missing returns are skipped rather than failing the whole summary.
    flat_returns: list[float] = []
    for r in results:
        ers = r.get("episode_returns") or []
        if isinstance(ers, list) and ers:
            try:
                flat_returns.append(float(ers[0]))
            except Exception:
                pass
    if flat_returns:
        mean_ret = sum(flat_returns) / len(flat_returns)
        print(f"mean_return={mean_ret:.3f} over {len(flat_returns)} episodes")
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
if __name__ == "__main__":
    # Script entry point: drive the async evaluation loop to completion.
    asyncio.run(main())
|
|
116
|
+
|
|
117
|
+
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Export SFT JSONL from tracing_v3 sqlite using the shared exporter utilities.
|
|
3
|
+
|
|
4
|
+
Thin wrapper over `examples/warming_up_to_rl/export_trace_sft.py` to keep the
|
|
5
|
+
SFT workflow self-contained in this folder while reusing tested logic.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from examples.warming_up_to_rl.export_trace_sft import (
|
|
14
|
+
build_sft_dataset,
|
|
15
|
+
connect,
|
|
16
|
+
fetch_achievement_data,
|
|
17
|
+
fetch_event_reward_totals,
|
|
18
|
+
fetch_outcome_rewards,
|
|
19
|
+
fetch_session_models,
|
|
20
|
+
parse_event_filters,
|
|
21
|
+
write_jsonl,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main() -> None:
    """Select eligible trace sessions by CLI filters and export them as SFT JSONL.

    Opens the tracing_v3 sqlite DB, filters sessions by model/provider,
    unique-achievement counts, outcome rewards, required/excluded achievements
    and per-type event-reward minimums, then writes the resulting dataset via
    the shared exporter utilities. Exits with an error if no session or no
    rollout step survives the filters.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--db", type=Path, default=Path("traces/v3/synth_ai.db"))
    p.add_argument("--output", type=Path, default=Path("examples/sft/ft_data/crafter_traces.jsonl"))
    p.add_argument("--model", action="append", dest="models")
    p.add_argument("--provider", action="append", dest="providers")
    p.add_argument("--min-unique", type=int, default=0)
    p.add_argument("--max-unique", type=int, default=None)
    p.add_argument("--exclude-achievement", action="append", dest="exclude_achievements")
    p.add_argument("--require-achievement", action="append", dest="required_achievements")
    p.add_argument("--min-outcome-reward", type=float, default=None)
    p.add_argument("--max-outcome-reward", type=float, default=None)
    p.add_argument("--event-reward", action="append", dest="event_reward_filters")
    p.add_argument("--limit", type=int, default=None)
    args = p.parse_args()

    conn = connect(args.db)
    try:
        # Bulk-load everything needed for filtering up front (one pass per table).
        achievements_map, unique_counts, name_counts, size_counts, session_uniques, session_final = (
            fetch_achievement_data(conn)
        )
        session_models = fetch_session_models(conn)
        outcome_data = fetch_outcome_rewards(conn)
        event_totals = fetch_event_reward_totals(conn)
        event_filters = parse_event_filters(args.event_reward_filters)

        # None means "no filter"; empty sets mean "no required/excluded names".
        allowed_models = set(args.models) if args.models else None
        allowed_providers = set(args.providers) if args.providers else None
        required_achievements = set(args.required_achievements or [])
        excluded_achievements = set(args.exclude_achievements or [])

        eligible: set[str] = set()
        for session_id, (model_name, provider, _calls) in session_models.items():
            # Model/provider allow-lists (a missing provider counts as "unknown").
            if allowed_models and model_name not in allowed_models:
                continue
            if allowed_providers and (provider or "unknown") not in allowed_providers:
                continue

            # Unique-achievement bounds, counted after removing excluded names.
            session_unique = session_uniques.get(session_id, set())
            adjusted_uniques = {a for a in session_unique if a not in excluded_achievements}
            unique_count = len(adjusted_uniques)
            if args.min_unique is not None and unique_count < args.min_unique:
                continue
            if args.max_unique is not None and unique_count > args.max_unique:
                continue

            # Outcome-reward bounds; sessions without an outcome row count as 0.0
            # and fall back to the final-achievement snapshot from the trace.
            outcome = outcome_data.get(session_id)
            total_reward = outcome["total_reward"] if outcome else 0.0
            final_achievements = (
                outcome["achievements"] if outcome else session_final.get(session_id, set())
            )
            if args.min_outcome_reward is not None and total_reward < args.min_outcome_reward:
                continue
            if args.max_outcome_reward is not None and total_reward > args.max_outcome_reward:
                continue
            if required_achievements and not required_achievements.issubset(final_achievements):
                continue

            # Per-type event-reward minimums: every filter must be met.
            totals = event_totals.get(session_id, {})
            meets_filters = True
            for reward_type, min_total in event_filters:
                total = totals.get(reward_type, {}).get("total", 0.0)
                if total < min_total:
                    meets_filters = False
                    break
            if not meets_filters:
                continue
            eligible.add(session_id)

        if not eligible:
            raise SystemExit("No sessions matched the provided filters.")

        dataset = build_sft_dataset(
            conn,
            achievements_map,
            eligible,
            allowed_models=allowed_models,
            limit=args.limit,
        )
        if not dataset:
            raise SystemExit("No rollout steps matched the filters (after session selection).")

        Path(args.output).parent.mkdir(parents=True, exist_ok=True)
        write_jsonl(args.output, dataset)
        print(f"Wrote {len(dataset)} examples -> {args.output}")
    finally:
        # Always release the sqlite connection, even on SystemExit.
        conn.close()
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
if __name__ == "__main__":
    # Script entry point for the export CLI.
    main()
|
|
116
|
+
|
|
117
|
+
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate Crafter rollouts and server-side traces via the Task App.
|
|
3
|
+
|
|
4
|
+
This script is a slim wrapper around the Task App `/rollout` endpoint to
|
|
5
|
+
produce trajectories while the server (if configured with TASKAPP_TRACING_ENABLED)
|
|
6
|
+
persists traces to its sqlite database. Use `export_dataset.py` afterwards
|
|
7
|
+
to build an SFT JSONL.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import asyncio
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from synth_ai.task import (
|
|
21
|
+
RolloutEnvSpec,
|
|
22
|
+
RolloutPolicySpec,
|
|
23
|
+
RolloutRecordConfig,
|
|
24
|
+
RolloutRequest,
|
|
25
|
+
TaskAppClient,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _ensure_str(val: Any, name: str) -> str:
|
|
30
|
+
s = str(val or "").strip()
|
|
31
|
+
if not s:
|
|
32
|
+
raise SystemExit(f"Missing required {name}")
|
|
33
|
+
return s
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _build_ops(max_llm_calls: int) -> list[str]:
|
|
37
|
+
max_llm_calls = max(1, int(max_llm_calls or 1))
|
|
38
|
+
ops: list[str] = []
|
|
39
|
+
for _ in range(max_llm_calls):
|
|
40
|
+
ops.extend(["agent", "env"]) # LLM step then env step
|
|
41
|
+
return ops
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _build_request(seed: int, run_id: str, model: str, inference_url: str, api_key: str, *, max_llm_calls: int, return_trace: bool) -> RolloutRequest:
    """Assemble a Crafter rollout request for a single seed.

    The policy config forwards the model, inference endpoint, and vendor
    API key; recording always captures trajectories, with an optional
    inline compact trace.
    """
    return RolloutRequest(
        run_id=run_id,
        env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
        policy=RolloutPolicySpec(
            policy_name="crafter-react",
            config={
                "model": model,
                "inference_url": inference_url,
                "api_key": api_key,
            },
        ),
        ops=_build_ops(max_llm_calls),
        record=RolloutRecordConfig(
            trajectories=True,
            return_trace=bool(return_trace),
            trace_format="compact",
        ),
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(slots=True)
class Args:
    """Validated CLI arguments for the trace-generation run."""

    base_url: str            # Task App base URL (--base-url / TASK_APP_URL)
    api_key: str             # Task App auth key (--api-key / ENVIRONMENT_API_KEY)
    inference_url: str       # Vendor inference endpoint forwarded to the policy
    inference_api_key: str   # API key for the inference endpoint (e.g. GROQ_API_KEY)
    model: str               # Policy model identifier
    episodes: int            # Number of rollout episodes (one per seed)
    start_seed: int          # First environment seed; seeds are contiguous
    max_llm_calls: int       # LLM calls per episode (each adds an agent+env op pair)
    concurrency: int         # Maximum rollouts in flight at once
    return_trace: bool       # Ask the server to return the compact trace inline
    timeout: float           # Per-request timeout (seconds) for TaskAppClient
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
async def _run_one(client: TaskAppClient, run_id: str, seed: int, a: Args) -> dict[str, Any]:
    """Execute one rollout for *seed* and summarize its metrics.

    The per-seed run id is derived from the batch run id so traces are
    easy to correlate server-side.
    """
    request = _build_request(
        seed=seed,
        run_id=f"{run_id}-seed{seed}",
        model=a.model,
        inference_url=a.inference_url,
        api_key=a.inference_api_key,
        max_llm_calls=a.max_llm_calls,
        return_trace=a.return_trace,
    )
    response = await client.rollout(request)
    metrics = response.metrics.model_dump()
    summary: dict[str, Any] = {"seed": seed}
    for key in ("num_steps", "episode_returns", "outcome_score"):
        summary[key] = metrics.get(key)
    return summary
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
async def _bounded_gather(n: int, coros: list[asyncio.Future]):
|
|
96
|
+
sem = asyncio.Semaphore(n)
|
|
97
|
+
|
|
98
|
+
async def _wrap(coro):
|
|
99
|
+
async with sem:
|
|
100
|
+
return await coro
|
|
101
|
+
|
|
102
|
+
return await asyncio.gather(*[_wrap(c) for c in coros])
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
async def main() -> None:
    """CLI entry: parse arguments, fan out rollouts, and report per-seed results."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--base-url", default=os.getenv("TASK_APP_URL", "http://localhost:8001"))
    parser.add_argument("--api-key", default=os.getenv("ENVIRONMENT_API_KEY"))
    parser.add_argument("--inference-url", default=os.getenv("INFERENCE_URL", "https://api.groq.com/openai"))
    parser.add_argument("--inference-api-key", default=os.getenv("GROQ_API_KEY"))
    parser.add_argument("--model", default=os.getenv("POLICY_MODEL", "llama-3.3-70b-versatile"))
    parser.add_argument("--episodes", type=int, default=50)
    parser.add_argument("--start-seed", type=int, default=0)
    parser.add_argument("--max-llm-calls", type=int, default=10)
    parser.add_argument("--concurrency", type=int, default=5)
    parser.add_argument("--return-trace", action="store_true")
    parser.add_argument("--timeout", type=float, default=60.0)
    ns = parser.parse_args()

    # Required string options abort early (via _ensure_str) when missing/blank.
    a = Args(
        base_url=_ensure_str(ns.base_url, "--base-url"),
        api_key=_ensure_str(ns.api_key, "--api-key"),
        inference_url=_ensure_str(ns.inference_url, "--inference-url"),
        inference_api_key=_ensure_str(ns.inference_api_key, "--inference-api-key"),
        model=_ensure_str(ns.model, "--model"),
        episodes=int(ns.episodes),
        start_seed=int(ns.start_seed),
        max_llm_calls=int(ns.max_llm_calls),
        concurrency=max(1, int(ns.concurrency)),
        return_trace=bool(ns.return_trace),
        timeout=float(ns.timeout),
    )

    print(
        f"[INFO] base={a.base_url} episodes={a.episodes} start_seed={a.start_seed} model={a.model} tp={a.max_llm_calls}"
    )
    run_id = f"traces-{int(time.time())}"

    successes = 0
    failures = 0
    async with TaskAppClient(a.base_url, api_key=a.api_key, timeout=a.timeout) as client:
        seeds = range(a.start_seed, a.start_seed + a.episodes)
        pending = [_run_one(client, run_id, seed, a) for seed in seeds]
        for outcome in await _bounded_gather(a.concurrency, pending):
            if isinstance(outcome, dict):
                successes += 1
                print(f"[OK] seed={outcome['seed']} return={outcome.get('episode_returns')}")
            else:
                failures += 1
                print(f"[ERR] seed result not dict: {outcome}", file=sys.stderr)

    print(f"[DONE] successes={successes} failures={failures}")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Exit quietly on Ctrl-C instead of dumping a traceback.
        print("Interrupted", file=sys.stderr)
|
|
161
|
+
|
|
162
|
+
|
examples/swe/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""SWE task app examples package."""
|
|
2
|
+
|
|
3
|
+
from importlib import resources as _resources
|
|
4
|
+
|
|
5
|
+
__all__ = ["path_for"]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def path_for(package: str, resource: str) -> str:
    """Return a filesystem path for a packaged SWE example resource.

    Bug fix: the original exited the ``_resources.as_file`` context before
    returning, so for packages installed inside a zip/wheel the materialized
    temporary file could be removed as soon as the ``with`` block closed,
    leaving callers with a dangling path. We now keep the context open on a
    shared ExitStack that is closed only at interpreter exit.

    Args:
        package: Sub-package name under ``examples.swe``.
        resource: Resource filename within that sub-package.

    Returns:
        A string path that remains valid for the life of the process.
    """
    import atexit
    from contextlib import ExitStack

    stack = getattr(path_for, "_stack", None)
    if stack is None:
        stack = ExitStack()
        atexit.register(stack.close)
        path_for._stack = stack  # type: ignore[attr-defined]
    path = stack.enter_context(
        _resources.as_file(_resources.files(f"examples.swe.{package}") / resource)
    )
    return str(path)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# mini-SWE Task App
|
|
2
|
+
|
|
3
|
+
This directory contains an example task app that exposes the
|
|
4
|
+
[mini-swe-agent](https://github.com/SWE-agent/mini-swe-agent) workflow through
|
|
5
|
+
the Synth AI task app interface. The goal is to provide a Crafter-like workflow
|
|
6
|
+
for SWE tasks: you can serve the task app locally, point RL training at it,
|
|
7
|
+
collect rollouts with tracing, and run vendor inference via the standard proxy
|
|
8
|
+
endpoints.
|
|
9
|
+
|
|
10
|
+
> **Status:** The implementation focuses on a minimal, hackable integration.
|
|
11
|
+
> It supports local/docker environments, step-wise command execution, tracing
|
|
12
|
+
> hooks, and rollouts. By default it streams SWE-Bench Verified tasks from
|
|
13
|
+
> Hugging Face; you can point the loader at your own dataset (or the bundled
|
|
14
|
+
> sample JSONL) via environment variables (see the docs at the end of this
|
|
15
|
+
> file).
|
|
16
|
+
|
|
17
|
+
## Layout
|
|
18
|
+
|
|
19
|
+
- `grpo_swe_mini.py` – main task-app configuration (dataset, rollout executor,
|
|
20
|
+
tracing, Modal metadata, registration).
|
|
21
|
+
- `grpo_swe_mini_task_app.py` – backwards-compatible FastAPI wrapper that
|
|
22
|
+
allows running the module directly (mirrors `grpo_crafter_task_app.py`).
|
|
23
|
+
- `hosted/envs/mini_swe` – environment/policy adapters that wrap `mini-swe-agent`
|
|
24
|
+
inside a hosted FastAPI service.
|
|
25
|
+
- `data/sample_instances.json` – optional curated subset for quick smoke tests
|
|
26
|
+
(no longer the default dataset).
|
|
27
|
+
|
|
28
|
+
## Using the task app
|
|
29
|
+
|
|
30
|
+
```bash
uvx synth-ai serve swe-mini --port 8020
```
|
|
33
|
+
|
|
34
|
+
### Recommended: non-interactive serve + .env
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
uvx synth-ai serve swe-mini \
|
|
38
|
+
--port 8020 \
|
|
39
|
+
--env-file .env \
|
|
40
|
+
--trace traces/v3 \
|
|
41
|
+
--trace-db traces/v3/synth_ai.db
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
This avoids interactive prompts (useful for CI) and loads `ENVIRONMENT_API_KEY`, `OPENAI_API_KEY`, etc. from `.env`.
|
|
45
|
+
|
|
46
|
+
### Configure dataset and execution
|
|
47
|
+
|
|
48
|
+
Set `SWE_MINI_DATASET` to control what tasks the environment loads (defaults to
|
|
49
|
+
`hf://princeton-nlp/SWE-Bench_Verified:test`):
|
|
50
|
+
|
|
51
|
+
- `file://path/to/tasks.jsonl` – each line should contain an object with
|
|
52
|
+
`instance_id`, `problem_statement`, and optional docker metadata
|
|
53
|
+
(`image_name`, `repo`, …).
|
|
54
|
+
- `hf://namespace/dataset:split` – lazily stream from Hugging Face (requires
|
|
55
|
+
`datasets` and network).
|
|
56
|
+
For quick local smoke tests you can point at
|
|
57
|
+
`file://$REPO/examples/swe/task_app/data/sample_instances.json`.
|
|
58
|
+
|
|
59
|
+
Execution is handled by mini-swe's environment classes. Configure execution via
|
|
60
|
+
`SWE_MINI_ENVIRONMENT_CLASS` (`local`, `docker`, `singularity`, …) and pass
|
|
61
|
+
additional keyword arguments with `SWE_MINI_ENVIRONMENT_KWARGS` (JSON).
|
|
62
|
+
|
|
63
|
+
### Tracing & SFT
|
|
64
|
+
|
|
65
|
+
Tracing works the same as Crafter; pass `--trace` / `--trace-db` to the CLI or
|
|
66
|
+
set `TASKAPP_TRACING_ENABLED=1`. The task app writes JSONL snippets for SFT and
|
|
67
|
+
records decision traces in the configured SQLite/Postgres database.
|
|
68
|
+
|
|
69
|
+
## Next steps
|
|
70
|
+
|
|
71
|
+
- `docs/examples/swe/mini_swe_task_app.md` – end-to-end walkthrough
|
|
72
|
+
- `examples/swe/task_app/grpo_swe_mini.py` – main entrypoint
|
|
73
|
+
- `examples/swe/task_app/hosted` – shared host scaffolding for the Mini-SWE task app
|
|
74
|
+
|
|
75
|
+
Pull requests welcome – especially for better dataset loaders, richer metrics,
|
|
76
|
+
and robust docker support.
|
|
77
|
+
|
|
78
|
+
### Example rollout configs
|
|
79
|
+
|
|
80
|
+
- OpenAI gpt-4o-mini (works out-of-the-box):
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"run_id": "example-$(date +%s)",
|
|
85
|
+
"policy": {
|
|
86
|
+
"policy_name": "swe-mini-react",
|
|
87
|
+
"config": {
|
|
88
|
+
"model": "gpt-4o-mini",
|
|
89
|
+
"inference_url": "https://api.openai.com",
|
|
90
|
+
"temperature": 0.0,
|
|
91
|
+
"max_completion_tokens": 256,
|
|
92
|
+
"use_tools": false,
|
|
93
|
+
"response_format": { "type": "text" },
|
|
94
|
+
"system_template": "You are participating in a software engineering evaluation. Provide exactly one bash command enclosed in a single ```bash``` block. No THOUGHT. No extra text. If unsure, output ```bash\necho NOOP\n```.",
|
|
95
|
+
"instance_template": "{{problem_statement}}\n\n{{instructions}}",
|
|
96
|
+
"action_template": "{{ output.stdout }}"
|
|
97
|
+
}
|
|
98
|
+
},
|
|
99
|
+
"env": { "env_name": "swe-mini" },
|
|
100
|
+
"ops": ["agent","env","agent","env","agent","env"],
|
|
101
|
+
"record": {"trajectories": true, "return_trace": true, "trace_format": "compact"}
|
|
102
|
+
}
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
- OpenAI gpt-5-mini (experimental): remove reasoning flags and constrain output. If responses are empty, retry without `stop` and consider switching to `gpt-4o-mini`.
|