synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/__init__.py +16 -0
- examples/crafter_debug_render.py +23 -17
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
- examples/qwen_coder/configs/coder_lora_small.toml +58 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +64 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +18 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +38 -0
- examples/qwen_coder/validate_jsonl.py +59 -0
- examples/rl/configs/eval_base_qwen.toml +1 -1
- examples/rl/configs/rl_from_base_qwen17.toml +1 -1
- examples/rl/download_dataset.py +26 -10
- examples/rl/run_eval.py +53 -52
- examples/rl/run_rl_and_save.py +29 -12
- examples/rl/task_app/math_single_step.py +180 -41
- examples/rl/task_app/math_task_app.py +14 -6
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +117 -0
- examples/sft/generate_traces.py +162 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +105 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +571 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +618 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1079 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1869 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +137 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +277 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/analyze_trace_db.py +12 -10
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
- examples/warming_up_to_rl/export_trace_sft.py +218 -36
- examples/warming_up_to_rl/groq_test.py +15 -8
- examples/warming_up_to_rl/manage_secrets.py +29 -25
- examples/warming_up_to_rl/readme.md +9 -2
- examples/warming_up_to_rl/run_eval.py +137 -61
- examples/warming_up_to_rl/run_fft_and_save.py +131 -60
- examples/warming_up_to_rl/run_local_rollout.py +88 -39
- examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
- examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
- examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
- examples/warming_up_to_rl/run_rl_and_save.py +35 -12
- examples/warming_up_to_rl/run_rollout_remote.py +44 -19
- examples/warming_up_to_rl/task_app/README.md +6 -2
- examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
- synth/__init__.py +14 -0
- synth_ai/__init__.py +20 -4
- synth_ai/api/models/supported.py +376 -0
- synth_ai/api/train/builders.py +157 -26
- synth_ai/api/train/cli.py +213 -57
- synth_ai/api/train/config_finder.py +65 -5
- synth_ai/api/train/env_resolver.py +33 -15
- synth_ai/api/train/pollers.py +13 -4
- synth_ai/api/train/supported_algos.py +139 -0
- synth_ai/api/train/task_app.py +5 -3
- synth_ai/api/train/utils.py +33 -48
- synth_ai/cli/__init__.py +19 -4
- synth_ai/cli/_modal_wrapper.py +28 -0
- synth_ai/cli/_typer_patch.py +49 -0
- synth_ai/cli/balance.py +2 -3
- synth_ai/cli/calc.py +1 -1
- synth_ai/cli/demo.py +21 -6
- synth_ai/cli/recent.py +2 -2
- synth_ai/cli/rl_demo.py +77 -17
- synth_ai/cli/root.py +116 -39
- synth_ai/cli/status.py +2 -2
- synth_ai/cli/task_apps.py +1709 -243
- synth_ai/cli/traces.py +7 -4
- synth_ai/cli/turso.py +73 -0
- synth_ai/cli/watch.py +12 -18
- synth_ai/core/experiment.py +0 -2
- synth_ai/demo_registry.py +68 -31
- synth_ai/demos/core/cli.py +516 -194
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +64 -28
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
- synth_ai/environments/examples/bandit/engine.py +12 -4
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/examples/crafter_classic/environment.py +76 -1
- synth_ai/environments/reproducibility/tree.py +5 -6
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +10 -9
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/evals/base.py +0 -2
- synth_ai/handshake.py +11 -9
- synth_ai/http.py +1 -1
- synth_ai/http_client.py +43 -11
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +20 -6
- synth_ai/jobs/client.py +103 -78
- synth_ai/learning/__init__.py +41 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +121 -29
- synth_ai/learning/config.py +2 -40
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +4 -56
- synth_ai/learning/health.py +13 -7
- synth_ai/learning/jobs.py +43 -47
- synth_ai/{rl → learning/rl}/__init__.py +14 -5
- synth_ai/learning/rl/client.py +267 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -10
- synth_ai/{rl → learning/rl}/env_keys.py +45 -16
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -253
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +295 -0
- synth_ai/learning/sse.py +25 -26
- synth_ai/learning/validators.py +25 -24
- synth_ai/lm/__init__.py +21 -47
- synth_ai/task/__init__.py +26 -27
- synth_ai/task/apps/__init__.py +18 -19
- synth_ai/task/auth.py +35 -23
- synth_ai/task/client.py +15 -13
- synth_ai/task/contracts.py +37 -35
- synth_ai/task/datasets.py +9 -6
- synth_ai/task/errors.py +11 -10
- synth_ai/task/health.py +17 -11
- synth_ai/task/json.py +58 -24
- synth_ai/task/proxy.py +15 -14
- synth_ai/task/rubrics.py +22 -15
- synth_ai/task/server.py +43 -17
- synth_ai/task/tracing_utils.py +12 -7
- synth_ai/task/validators.py +0 -1
- synth_ai/task/vendors.py +5 -7
- synth_ai/tracing_v3/__init__.py +2 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/db_config.py +26 -1
- synth_ai/tracing_v3/decorators.py +18 -15
- synth_ai/tracing_v3/examples/basic_usage.py +3 -2
- synth_ai/tracing_v3/hooks.py +6 -4
- synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
- synth_ai/tracing_v3/replica_sync.py +1 -0
- synth_ai/tracing_v3/session_tracer.py +63 -16
- synth_ai/tracing_v3/storage/base.py +89 -1
- synth_ai/tracing_v3/storage/config.py +21 -8
- synth_ai/tracing_v3/storage/factory.py +10 -8
- synth_ai/tracing_v3/storage/utils.py +4 -2
- synth_ai/tracing_v3/turso/daemon.py +7 -2
- synth_ai/tracing_v3/turso/models.py +5 -2
- synth_ai/tracing_v3/turso/native_manager.py +1173 -0
- synth_ai/tracing_v3/utils.py +4 -3
- synth_ai/v0/api/__init__.py +8 -0
- synth_ai/v0/api/models/__init__.py +8 -0
- synth_ai/v0/api/models/supported.py +8 -0
- synth_ai/v0/config/__init__.py +15 -0
- synth_ai/v0/config/base_url.py +12 -0
- synth_ai/v0/lm/__init__.py +51 -0
- synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
- synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
- synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
- synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
- synth_ai/{lm → v0/lm}/config.py +6 -1
- synth_ai/{lm → v0/lm}/core/all.py +9 -9
- synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
- synth_ai/{lm → v0/lm}/core/main.py +19 -7
- synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
- synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
- synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
- synth_ai/{lm → v0/lm}/overrides.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
- synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
- synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
- synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
- synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
- synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
- synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
- synth_ai/v0/tracing/upload.py +32 -135
- synth_ai/v0/tracing_v3/__init__.py +10 -0
- synth_ai/v0/tracing_v3/abstractions.py +3 -0
- synth_ai/v0/tracing_v3/decorators.py +3 -0
- synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
- synth_ai/v0/tracing_v3/session_tracer.py +3 -0
- synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -264
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
- examples/common_old/backend.py +0 -21
- examples/evals_old/README.md +0 -98
- examples/evals_old/__init__.py +0 -6
- examples/evals_old/compare_models.py +0 -1037
- examples/evals_old/example_log.md +0 -145
- examples/evals_old/run_demo.sh +0 -126
- examples/evals_old/trace_analysis.py +0 -270
- examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
- examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
- examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
- examples/finetuning_old/synth_qwen_v1/README.md +0 -68
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
- examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
- examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
- examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
- examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
- examples/finetuning_old/synth_qwen_v1/util.py +0 -147
- examples/rl_old/task_app.py +0 -962
- examples/warming_up_to_rl/old/event_rewards.md +0 -234
- examples/warming_up_to_rl/old/notes.md +0 -73
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/experimental/synth_oss.py +0 -446
- synth_ai/install_sqld.sh +0 -40
- synth_ai/learning/filtering.py +0 -0
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -246
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/__init__.py +0 -25
- synth_ai/tracing_v3/turso/manager.py +0 -774
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev4.dist-info/METADATA +0 -131
- /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
- /synth_ai/{lm → v0/lm}/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
- /synth_ai/{lm → v0/lm}/injection.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
- /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/warmup.py +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
|
@@ -5,17 +5,22 @@ Baseline evaluation script (public-friendly skeleton)
|
|
|
5
5
|
- Uses a TaskAppClient interface (to be implemented in synth-ai SDK)
|
|
6
6
|
- Keeps structure aligned with research/testing/crafter eval harness
|
|
7
7
|
"""
|
|
8
|
+
|
|
8
9
|
from __future__ import annotations
|
|
9
|
-
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import asyncio
|
|
13
|
+
import contextlib
|
|
10
14
|
import json
|
|
15
|
+
import os
|
|
11
16
|
import re
|
|
12
|
-
from typing import Any, Dict, List, Optional
|
|
13
|
-
from collections import Counter
|
|
14
|
-
import asyncio
|
|
15
|
-
import httpx
|
|
16
|
-
import argparse
|
|
17
17
|
import tomllib
|
|
18
|
+
from collections import Counter
|
|
18
19
|
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
import httpx
|
|
23
|
+
|
|
19
24
|
|
|
20
25
|
class TaskAppClient:
|
|
21
26
|
"""Minimal async client for the task app initialize/step/terminate routes.
|
|
@@ -23,12 +28,12 @@ class TaskAppClient:
|
|
|
23
28
|
This is a public-friendly shim for examples, pending SDK surface consolidation.
|
|
24
29
|
"""
|
|
25
30
|
|
|
26
|
-
def __init__(self, base_url: str, api_key:
|
|
31
|
+
def __init__(self, base_url: str, api_key: str | None = None) -> None:
|
|
27
32
|
self.base_url = base_url.rstrip("/")
|
|
28
33
|
self.api_key = api_key
|
|
29
|
-
self._client:
|
|
34
|
+
self._client: httpx.AsyncClient | None = None
|
|
30
35
|
|
|
31
|
-
async def __aenter__(self) ->
|
|
36
|
+
async def __aenter__(self) -> TaskAppClient:
|
|
32
37
|
headers = {}
|
|
33
38
|
if self.api_key:
|
|
34
39
|
headers["X-API-Key"] = self.api_key
|
|
@@ -54,9 +59,9 @@ class TaskAppClient:
|
|
|
54
59
|
)
|
|
55
60
|
return self._client
|
|
56
61
|
|
|
57
|
-
async def initialize(self, env_name: str, config:
|
|
62
|
+
async def initialize(self, env_name: str, config: dict[str, Any]) -> dict[str, Any]:
|
|
58
63
|
"""POST /env/{env_name}/initialize (compat route supported in task app)."""
|
|
59
|
-
payload:
|
|
64
|
+
payload: dict[str, Any] = {
|
|
60
65
|
"seed": config.get("seed"),
|
|
61
66
|
}
|
|
62
67
|
# Allow both world_config and config inputs; env routes will normalize difficulty
|
|
@@ -68,29 +73,31 @@ class TaskAppClient:
|
|
|
68
73
|
resp.raise_for_status()
|
|
69
74
|
return resp.json()
|
|
70
75
|
|
|
71
|
-
async def step(
|
|
76
|
+
async def step(
|
|
77
|
+
self, env_name: str, env_id: str, tool_calls: list[dict[str, Any]]
|
|
78
|
+
) -> dict[str, Any]:
|
|
72
79
|
"""POST /env/{env_name}/step with wrapped tool_calls in action."""
|
|
73
80
|
payload = {"env_id": env_id, "action": {"tool_calls": tool_calls}}
|
|
74
81
|
resp = await self.client.post(f"/env/{env_name}/step", json=payload)
|
|
75
82
|
resp.raise_for_status()
|
|
76
83
|
return resp.json()
|
|
77
84
|
|
|
78
|
-
async def terminate(self, env_name: str, env_id: str) ->
|
|
85
|
+
async def terminate(self, env_name: str, env_id: str) -> dict[str, Any]:
|
|
79
86
|
resp = await self.client.post(f"/env/{env_name}/terminate", json={"env_id": env_id})
|
|
80
87
|
resp.raise_for_status()
|
|
81
88
|
return resp.json()
|
|
82
89
|
|
|
83
|
-
async def get_info(self) ->
|
|
90
|
+
async def get_info(self) -> dict[str, Any]:
|
|
84
91
|
resp = await self.client.get("/info")
|
|
85
92
|
resp.raise_for_status()
|
|
86
93
|
return resp.json()
|
|
87
94
|
|
|
88
|
-
async def proxy_groq_chat(self, payload:
|
|
95
|
+
async def proxy_groq_chat(self, payload: dict[str, Any]) -> dict[str, Any]:
|
|
89
96
|
resp = await self.client.post("/proxy/groq/v1/chat/completions", json=payload)
|
|
90
97
|
resp.raise_for_status()
|
|
91
98
|
return resp.json()
|
|
92
99
|
|
|
93
|
-
async def vllm_chat(self, vllm_base_url: str, payload:
|
|
100
|
+
async def vllm_chat(self, vllm_base_url: str, payload: dict[str, Any]) -> dict[str, Any]:
|
|
94
101
|
async with httpx.AsyncClient(base_url=vllm_base_url.rstrip("/"), timeout=60.0) as c:
|
|
95
102
|
resp = await c.post("/v1/chat/completions", json=payload)
|
|
96
103
|
# Do not raise for status to surface body in errors
|
|
@@ -102,11 +109,21 @@ class TaskAppClient:
|
|
|
102
109
|
return {"error": data}
|
|
103
110
|
return data
|
|
104
111
|
|
|
105
|
-
async def rollout(
|
|
106
|
-
|
|
112
|
+
async def rollout(
|
|
113
|
+
self,
|
|
114
|
+
*,
|
|
115
|
+
run_id: str,
|
|
116
|
+
env_name: str,
|
|
117
|
+
seed: int,
|
|
118
|
+
difficulty: str,
|
|
119
|
+
policy_name: str,
|
|
120
|
+
policy_config: dict[str, Any],
|
|
121
|
+
max_turns: int,
|
|
122
|
+
) -> dict[str, Any]:
|
|
123
|
+
ops: list[str] = []
|
|
107
124
|
for _ in range(max_turns):
|
|
108
125
|
ops.extend(["agent", "env"])
|
|
109
|
-
payload:
|
|
126
|
+
payload: dict[str, Any] = {
|
|
110
127
|
"run_id": run_id,
|
|
111
128
|
"env": {
|
|
112
129
|
"env_name": env_name,
|
|
@@ -128,35 +145,41 @@ class TaskAppClient:
|
|
|
128
145
|
resp.raise_for_status()
|
|
129
146
|
return resp.json()
|
|
130
147
|
|
|
148
|
+
|
|
131
149
|
TASK_APP_URL = os.getenv("TASK_APP_URL", "https://YOUR-TASK-APP.modal.run").rstrip("/")
|
|
132
150
|
MODEL = os.getenv("EVAL_MODEL", "qwen/qwen3-32b")
|
|
133
151
|
NUM_EPISODES = int(os.getenv("NUM_EPISODES", "3"))
|
|
134
152
|
MAX_TURNS = int(os.getenv("MAX_TURNS", "10"))
|
|
135
153
|
CONCURRENCY = int(os.getenv("CONCURRENCY", "1"))
|
|
136
154
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
"
|
|
142
|
-
"
|
|
143
|
-
|
|
144
|
-
"
|
|
145
|
-
"
|
|
146
|
-
"
|
|
147
|
-
"
|
|
155
|
+
|
|
156
|
+
def _interact_tool_schema() -> list[dict[str, Any]]:
|
|
157
|
+
return [
|
|
158
|
+
{
|
|
159
|
+
"type": "function",
|
|
160
|
+
"function": {
|
|
161
|
+
"name": "interact",
|
|
162
|
+
"description": "Perform actions in the Crafter environment.",
|
|
163
|
+
"parameters": {
|
|
164
|
+
"type": "object",
|
|
165
|
+
"properties": {
|
|
166
|
+
"actions": {"type": "array", "items": {"type": "string"}},
|
|
167
|
+
"reasoning": {"type": "string"},
|
|
168
|
+
},
|
|
169
|
+
"required": ["actions", "reasoning"],
|
|
148
170
|
},
|
|
149
|
-
"required": ["actions", "reasoning"],
|
|
150
171
|
},
|
|
151
|
-
}
|
|
152
|
-
|
|
172
|
+
}
|
|
173
|
+
]
|
|
174
|
+
|
|
153
175
|
|
|
154
|
-
def _build_messages_from_observation(
|
|
176
|
+
def _build_messages_from_observation(
|
|
177
|
+
observation: dict[str, Any], history: list[dict[str, Any]]
|
|
178
|
+
) -> list[dict[str, Any]]:
|
|
155
179
|
inv = observation.get("inventory") or {}
|
|
156
180
|
pos = observation.get("player_position") or []
|
|
157
181
|
ach = observation.get("achievements_status") or {}
|
|
158
|
-
|
|
159
|
-
user_lines: List[str] = []
|
|
182
|
+
user_lines: list[str] = []
|
|
160
183
|
user_lines.append("Environment: CrafterClassic")
|
|
161
184
|
user_lines.append(f"Player position: {pos}")
|
|
162
185
|
user_lines.append(f"Inventory: {json.dumps(inv, ensure_ascii=False)}")
|
|
@@ -171,7 +194,8 @@ def _build_messages_from_observation(observation: Dict[str, Any], history: List[
|
|
|
171
194
|
content = "\n".join(user_lines)
|
|
172
195
|
return [{"role": "user", "content": content}]
|
|
173
196
|
|
|
174
|
-
|
|
197
|
+
|
|
198
|
+
def _parse_tool_calls_from_openai_response(data: dict[str, Any]) -> list[str]:
|
|
175
199
|
try:
|
|
176
200
|
choices = data.get("choices")
|
|
177
201
|
if isinstance(choices, list) and choices:
|
|
@@ -203,7 +227,11 @@ def _parse_tool_calls_from_openai_response(data: Dict[str, Any]) -> List[str]:
|
|
|
203
227
|
if isinstance(content, str):
|
|
204
228
|
text = content
|
|
205
229
|
elif isinstance(content, list):
|
|
206
|
-
text = "\n".join(
|
|
230
|
+
text = "\n".join(
|
|
231
|
+
str(part.get("text"))
|
|
232
|
+
for part in content
|
|
233
|
+
if isinstance(part, dict) and part.get("text")
|
|
234
|
+
)
|
|
207
235
|
for raw in re.findall(r"\{[\s\S]*\}", text or ""):
|
|
208
236
|
try:
|
|
209
237
|
obj = json.loads(raw)
|
|
@@ -217,9 +245,16 @@ def _parse_tool_calls_from_openai_response(data: Dict[str, Any]) -> List[str]:
|
|
|
217
245
|
pass
|
|
218
246
|
return []
|
|
219
247
|
|
|
220
|
-
|
|
248
|
+
|
|
249
|
+
async def _choose_actions_via_llm(
|
|
250
|
+
client: TaskAppClient,
|
|
251
|
+
provider: str,
|
|
252
|
+
model: str,
|
|
253
|
+
observation: dict[str, Any],
|
|
254
|
+
history: list[dict[str, Any]],
|
|
255
|
+
) -> list[str]:
|
|
221
256
|
messages = _build_messages_from_observation(observation, history)
|
|
222
|
-
payload:
|
|
257
|
+
payload: dict[str, Any] = {
|
|
223
258
|
"model": model,
|
|
224
259
|
"messages": messages,
|
|
225
260
|
"tools": _interact_tool_schema(),
|
|
@@ -245,33 +280,40 @@ async def _choose_actions_via_llm(client: TaskAppClient, provider: str, model: s
|
|
|
245
280
|
actions = _parse_tool_calls_from_openai_response(data)
|
|
246
281
|
return actions or []
|
|
247
282
|
|
|
248
|
-
|
|
249
|
-
|
|
283
|
+
|
|
284
|
+
def _expand_actions_to_tool_calls(actions: list[str]) -> list[dict[str, Any]]:
|
|
285
|
+
out: list[dict[str, Any]] = []
|
|
250
286
|
for a in actions[:5]:
|
|
251
287
|
out.append({"tool": "interact", "args": {"action": a}})
|
|
252
288
|
return out
|
|
253
289
|
|
|
290
|
+
|
|
254
291
|
def _detect_provider(model: str) -> str:
|
|
255
292
|
m = (model or "").lower()
|
|
256
293
|
if "qwen/qwen3-32b" in m or "qwen-2.5-" in m or m.startswith("groq:"):
|
|
257
294
|
return "groq"
|
|
258
295
|
return "vllm"
|
|
259
296
|
|
|
260
|
-
|
|
297
|
+
|
|
298
|
+
def _rollout_inference_url_from_cfg(cfg: dict[str, Any], default_vllm: str | None) -> str | None:
|
|
261
299
|
# Prefer explicit inference_url in TOML; else fall back to discovered vLLM base
|
|
262
300
|
url = cfg.get("inference_url")
|
|
263
301
|
if isinstance(url, str) and url:
|
|
264
302
|
return url
|
|
265
303
|
return default_vllm
|
|
266
304
|
|
|
267
|
-
|
|
305
|
+
|
|
306
|
+
async def eval_episode(client: TaskAppClient, seed: int) -> dict[str, Any]:
|
|
268
307
|
env_name = "CrafterClassic"
|
|
269
|
-
history:
|
|
308
|
+
history: list[dict[str, Any]] = []
|
|
270
309
|
achievements: set[str] = set()
|
|
271
310
|
turns = 0
|
|
272
311
|
|
|
273
312
|
# Initialize environment
|
|
274
|
-
init_cfg:
|
|
313
|
+
init_cfg: dict[str, Any] = {
|
|
314
|
+
"seed": seed,
|
|
315
|
+
"world_config": {"difficulty": os.getenv("DIFFICULTY", "easy")},
|
|
316
|
+
}
|
|
275
317
|
created = await client.initialize(env_name, init_cfg)
|
|
276
318
|
env_id = created.get("env_id")
|
|
277
319
|
if not isinstance(env_id, str) or not env_id:
|
|
@@ -285,7 +327,9 @@ async def eval_episode(client: TaskAppClient, seed: int) -> Dict[str, Any]:
|
|
|
285
327
|
try:
|
|
286
328
|
while turns < MAX_TURNS and not done:
|
|
287
329
|
# Ask LLM for actions; fallback to a simple exploratory pair
|
|
288
|
-
chosen_actions = await _choose_actions_via_llm(
|
|
330
|
+
chosen_actions = await _choose_actions_via_llm(
|
|
331
|
+
client, provider, MODEL, observation, history
|
|
332
|
+
)
|
|
289
333
|
if not chosen_actions:
|
|
290
334
|
chosen_actions = ["move_up", "do"]
|
|
291
335
|
tool_calls = _expand_actions_to_tool_calls(chosen_actions)
|
|
@@ -299,13 +343,12 @@ async def eval_episode(client: TaskAppClient, seed: int) -> Dict[str, Any]:
|
|
|
299
343
|
if isinstance(nxt, dict):
|
|
300
344
|
observation = nxt
|
|
301
345
|
finally:
|
|
302
|
-
|
|
346
|
+
with contextlib.suppress(Exception):
|
|
303
347
|
await client.terminate(env_name, env_id)
|
|
304
|
-
except Exception:
|
|
305
|
-
pass
|
|
306
348
|
|
|
307
349
|
return {"seed": seed, "turns": turns, "achievements": sorted(achievements)}
|
|
308
350
|
|
|
351
|
+
|
|
309
352
|
async def main() -> None:
|
|
310
353
|
# Best-effort load local .env if present (ensures ENVIRONMENT_API_KEY for rollout)
|
|
311
354
|
try:
|
|
@@ -322,13 +365,17 @@ async def main() -> None:
|
|
|
322
365
|
except Exception:
|
|
323
366
|
pass
|
|
324
367
|
|
|
325
|
-
parser = argparse.ArgumentParser(
|
|
368
|
+
parser = argparse.ArgumentParser(
|
|
369
|
+
description="Baseline eval against task app with optional TOML config"
|
|
370
|
+
)
|
|
326
371
|
parser.add_argument("--toml", help="Path to TOML config file", default=None)
|
|
327
|
-
parser.add_argument(
|
|
372
|
+
parser.add_argument(
|
|
373
|
+
"--use-rollout", action="store_true", help="Use server-side rollout endpoint for eval"
|
|
374
|
+
)
|
|
328
375
|
args = parser.parse_args()
|
|
329
376
|
|
|
330
377
|
global TASK_APP_URL, MODEL, NUM_EPISODES, MAX_TURNS, CONCURRENCY
|
|
331
|
-
cfg:
|
|
378
|
+
cfg: dict[str, Any] = {}
|
|
332
379
|
if args.toml:
|
|
333
380
|
with open(args.toml, "rb") as f:
|
|
334
381
|
cfg = tomllib.load(f)
|
|
@@ -346,10 +393,14 @@ async def main() -> None:
|
|
|
346
393
|
if env_url:
|
|
347
394
|
TASK_APP_URL = env_url.rstrip("/")
|
|
348
395
|
else:
|
|
349
|
-
raise RuntimeError(
|
|
396
|
+
raise RuntimeError(
|
|
397
|
+
"TASK_APP_URL is a placeholder. Set task_app_url in TOML or export TASK_APP_URL."
|
|
398
|
+
)
|
|
350
399
|
|
|
351
400
|
print(f"Task App: {TASK_APP_URL}")
|
|
352
|
-
print(
|
|
401
|
+
print(
|
|
402
|
+
f"Model: {MODEL} Episodes: {NUM_EPISODES} Max turns: {MAX_TURNS} Concurrency: {CONCURRENCY}"
|
|
403
|
+
)
|
|
353
404
|
sem = asyncio.Semaphore(max(CONCURRENCY, 1))
|
|
354
405
|
async with TaskAppClient(TASK_APP_URL, api_key=os.getenv("ENVIRONMENT_API_KEY")) as client:
|
|
355
406
|
if args.use_rollout:
|
|
@@ -359,16 +410,24 @@ async def main() -> None:
|
|
|
359
410
|
inf_url = _rollout_inference_url_from_cfg(cfg, default_vllm)
|
|
360
411
|
if not inf_url:
|
|
361
412
|
raise RuntimeError("Could not resolve inference URL for rollout")
|
|
413
|
+
|
|
362
414
|
async def _run(seed: int):
|
|
363
415
|
async with sem:
|
|
364
416
|
try:
|
|
365
417
|
run_id = f"eval-{seed}"
|
|
366
418
|
# Build policy config from TOML (explicit control; no server-side guessing)
|
|
367
|
-
policy_cfg:
|
|
419
|
+
policy_cfg: dict[str, Any] = {
|
|
368
420
|
"model": cfg.get("model", MODEL),
|
|
369
421
|
"inference_url": inf_url,
|
|
370
422
|
}
|
|
371
|
-
for k in (
|
|
423
|
+
for k in (
|
|
424
|
+
"max_tokens",
|
|
425
|
+
"temperature",
|
|
426
|
+
"top_p",
|
|
427
|
+
"thinking_mode",
|
|
428
|
+
"thinking_budget",
|
|
429
|
+
"use_tools",
|
|
430
|
+
):
|
|
372
431
|
if k in cfg and cfg.get(k) is not None:
|
|
373
432
|
policy_cfg[k] = cfg.get(k)
|
|
374
433
|
|
|
@@ -385,8 +444,16 @@ async def main() -> None:
|
|
|
385
444
|
ach = []
|
|
386
445
|
try:
|
|
387
446
|
trajs = r.get("trajectories") or []
|
|
388
|
-
final_obs = (
|
|
389
|
-
|
|
447
|
+
final_obs = (
|
|
448
|
+
(trajs[0].get("final") or {}).get("observation")
|
|
449
|
+
if trajs and isinstance(trajs[0], dict)
|
|
450
|
+
else None
|
|
451
|
+
)
|
|
452
|
+
ach_map = (
|
|
453
|
+
(final_obs or {}).get("achievements_status")
|
|
454
|
+
if isinstance(final_obs, dict)
|
|
455
|
+
else None
|
|
456
|
+
)
|
|
390
457
|
if isinstance(ach_map, dict):
|
|
391
458
|
ach = sorted([k for k, v in ach_map.items() if v])
|
|
392
459
|
except Exception:
|
|
@@ -401,7 +468,11 @@ async def main() -> None:
|
|
|
401
468
|
return {"seed": seed, "turns": length, "achievements": ach}
|
|
402
469
|
except Exception as e:
|
|
403
470
|
return {"seed": seed, "turns": 0, "achievements": [], "error": str(e)}
|
|
404
|
-
|
|
471
|
+
|
|
472
|
+
results = await asyncio.gather(
|
|
473
|
+
*[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)],
|
|
474
|
+
return_exceptions=False,
|
|
475
|
+
)
|
|
405
476
|
# Aggregate summary
|
|
406
477
|
counts = [len(r.get("achievements") or []) for r in results if isinstance(r, dict)]
|
|
407
478
|
turns = [int(r.get("turns") or 0) for r in results if isinstance(r, dict)]
|
|
@@ -424,11 +495,16 @@ async def main() -> None:
|
|
|
424
495
|
}
|
|
425
496
|
print(json.dumps(summary, indent=2))
|
|
426
497
|
else:
|
|
498
|
+
|
|
427
499
|
async def _run(seed: int):
|
|
428
500
|
async with sem:
|
|
429
501
|
return await eval_episode(client, seed)
|
|
430
|
-
|
|
502
|
+
|
|
503
|
+
results = await asyncio.gather(
|
|
504
|
+
*[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)]
|
|
505
|
+
)
|
|
431
506
|
print(json.dumps({"episodes": results}, indent=2))
|
|
432
507
|
|
|
508
|
+
|
|
433
509
|
if __name__ == "__main__":
|
|
434
510
|
asyncio.run(main())
|