synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/__init__.py +16 -0
- examples/crafter_debug_render.py +23 -17
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
- examples/qwen_coder/configs/coder_lora_small.toml +58 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +64 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +18 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +38 -0
- examples/qwen_coder/validate_jsonl.py +59 -0
- examples/rl/configs/eval_base_qwen.toml +1 -1
- examples/rl/configs/rl_from_base_qwen17.toml +1 -1
- examples/rl/download_dataset.py +26 -10
- examples/rl/run_eval.py +53 -52
- examples/rl/run_rl_and_save.py +29 -12
- examples/rl/task_app/math_single_step.py +180 -41
- examples/rl/task_app/math_task_app.py +14 -6
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +117 -0
- examples/sft/generate_traces.py +162 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +105 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +571 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +618 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1079 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1869 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +137 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +277 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/analyze_trace_db.py +12 -10
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
- examples/warming_up_to_rl/export_trace_sft.py +218 -36
- examples/warming_up_to_rl/groq_test.py +15 -8
- examples/warming_up_to_rl/manage_secrets.py +29 -25
- examples/warming_up_to_rl/readme.md +9 -2
- examples/warming_up_to_rl/run_eval.py +137 -61
- examples/warming_up_to_rl/run_fft_and_save.py +131 -60
- examples/warming_up_to_rl/run_local_rollout.py +88 -39
- examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
- examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
- examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
- examples/warming_up_to_rl/run_rl_and_save.py +35 -12
- examples/warming_up_to_rl/run_rollout_remote.py +44 -19
- examples/warming_up_to_rl/task_app/README.md +6 -2
- examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
- synth/__init__.py +14 -0
- synth_ai/__init__.py +20 -4
- synth_ai/api/models/supported.py +376 -0
- synth_ai/api/train/builders.py +157 -26
- synth_ai/api/train/cli.py +213 -57
- synth_ai/api/train/config_finder.py +65 -5
- synth_ai/api/train/env_resolver.py +33 -15
- synth_ai/api/train/pollers.py +13 -4
- synth_ai/api/train/supported_algos.py +139 -0
- synth_ai/api/train/task_app.py +5 -3
- synth_ai/api/train/utils.py +33 -48
- synth_ai/cli/__init__.py +19 -4
- synth_ai/cli/_modal_wrapper.py +28 -0
- synth_ai/cli/_typer_patch.py +49 -0
- synth_ai/cli/balance.py +2 -3
- synth_ai/cli/calc.py +1 -1
- synth_ai/cli/demo.py +21 -6
- synth_ai/cli/recent.py +2 -2
- synth_ai/cli/rl_demo.py +77 -17
- synth_ai/cli/root.py +116 -39
- synth_ai/cli/status.py +2 -2
- synth_ai/cli/task_apps.py +1709 -243
- synth_ai/cli/traces.py +7 -4
- synth_ai/cli/turso.py +73 -0
- synth_ai/cli/watch.py +12 -18
- synth_ai/core/experiment.py +0 -2
- synth_ai/demo_registry.py +68 -31
- synth_ai/demos/core/cli.py +516 -194
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +64 -28
- synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
- synth_ai/demos/demo_task_apps/math/_common.py +1 -2
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
- synth_ai/environments/examples/bandit/engine.py +12 -4
- synth_ai/environments/examples/bandit/taskset.py +4 -4
- synth_ai/environments/examples/crafter_classic/environment.py +76 -1
- synth_ai/environments/reproducibility/tree.py +5 -6
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +10 -9
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/evals/base.py +0 -2
- synth_ai/handshake.py +11 -9
- synth_ai/http.py +1 -1
- synth_ai/http_client.py +43 -11
- synth_ai/inference/__init__.py +0 -2
- synth_ai/inference/client.py +20 -6
- synth_ai/jobs/client.py +103 -78
- synth_ai/learning/__init__.py +41 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +121 -29
- synth_ai/learning/config.py +2 -40
- synth_ai/learning/constants.py +0 -2
- synth_ai/learning/ft_client.py +4 -56
- synth_ai/learning/health.py +13 -7
- synth_ai/learning/jobs.py +43 -47
- synth_ai/{rl → learning/rl}/__init__.py +14 -5
- synth_ai/learning/rl/client.py +267 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -10
- synth_ai/{rl → learning/rl}/env_keys.py +45 -16
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -253
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +295 -0
- synth_ai/learning/sse.py +25 -26
- synth_ai/learning/validators.py +25 -24
- synth_ai/lm/__init__.py +21 -47
- synth_ai/task/__init__.py +26 -27
- synth_ai/task/apps/__init__.py +18 -19
- synth_ai/task/auth.py +35 -23
- synth_ai/task/client.py +15 -13
- synth_ai/task/contracts.py +37 -35
- synth_ai/task/datasets.py +9 -6
- synth_ai/task/errors.py +11 -10
- synth_ai/task/health.py +17 -11
- synth_ai/task/json.py +58 -24
- synth_ai/task/proxy.py +15 -14
- synth_ai/task/rubrics.py +22 -15
- synth_ai/task/server.py +43 -17
- synth_ai/task/tracing_utils.py +12 -7
- synth_ai/task/validators.py +0 -1
- synth_ai/task/vendors.py +5 -7
- synth_ai/tracing_v3/__init__.py +2 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/db_config.py +26 -1
- synth_ai/tracing_v3/decorators.py +18 -15
- synth_ai/tracing_v3/examples/basic_usage.py +3 -2
- synth_ai/tracing_v3/hooks.py +6 -4
- synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
- synth_ai/tracing_v3/replica_sync.py +1 -0
- synth_ai/tracing_v3/session_tracer.py +63 -16
- synth_ai/tracing_v3/storage/base.py +89 -1
- synth_ai/tracing_v3/storage/config.py +21 -8
- synth_ai/tracing_v3/storage/factory.py +10 -8
- synth_ai/tracing_v3/storage/utils.py +4 -2
- synth_ai/tracing_v3/turso/daemon.py +7 -2
- synth_ai/tracing_v3/turso/models.py +5 -2
- synth_ai/tracing_v3/turso/native_manager.py +1173 -0
- synth_ai/tracing_v3/utils.py +4 -3
- synth_ai/v0/api/__init__.py +8 -0
- synth_ai/v0/api/models/__init__.py +8 -0
- synth_ai/v0/api/models/supported.py +8 -0
- synth_ai/v0/config/__init__.py +15 -0
- synth_ai/v0/config/base_url.py +12 -0
- synth_ai/v0/lm/__init__.py +51 -0
- synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
- synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
- synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
- synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
- synth_ai/{lm → v0/lm}/config.py +6 -1
- synth_ai/{lm → v0/lm}/core/all.py +9 -9
- synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
- synth_ai/{lm → v0/lm}/core/main.py +19 -7
- synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
- synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
- synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
- synth_ai/{lm → v0/lm}/overrides.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
- synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
- synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
- synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
- synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
- synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
- synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
- synth_ai/v0/tracing/upload.py +32 -135
- synth_ai/v0/tracing_v3/__init__.py +10 -0
- synth_ai/v0/tracing_v3/abstractions.py +3 -0
- synth_ai/v0/tracing_v3/decorators.py +3 -0
- synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
- synth_ai/v0/tracing_v3/session_tracer.py +3 -0
- synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -264
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
- examples/common_old/backend.py +0 -21
- examples/evals_old/README.md +0 -98
- examples/evals_old/__init__.py +0 -6
- examples/evals_old/compare_models.py +0 -1037
- examples/evals_old/example_log.md +0 -145
- examples/evals_old/run_demo.sh +0 -126
- examples/evals_old/trace_analysis.py +0 -270
- examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
- examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
- examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
- examples/finetuning_old/synth_qwen_v1/README.md +0 -68
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
- examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
- examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
- examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
- examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
- examples/finetuning_old/synth_qwen_v1/util.py +0 -147
- examples/rl_old/task_app.py +0 -962
- examples/warming_up_to_rl/old/event_rewards.md +0 -234
- examples/warming_up_to_rl/old/notes.md +0 -73
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
- synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
- synth_ai/experimental/synth_oss.py +0 -446
- synth_ai/install_sqld.sh +0 -40
- synth_ai/learning/filtering.py +0 -0
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -246
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/__init__.py +0 -25
- synth_ai/tracing_v3/turso/manager.py +0 -774
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev4.dist-info/METADATA +0 -131
- /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
- /synth_ai/{lm → v0/lm}/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
- /synth_ai/{lm → v0/lm}/injection.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
- /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/warmup.py +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Benchmark Crafter performance across prompt modalities (text-only, image-only, both).
|
|
4
|
+
|
|
5
|
+
For each mode we:
|
|
6
|
+
* Run 20 seeded episodes (configurable) with GPT-4o mini via OpenAI Chat Completions.
|
|
7
|
+
* Execute the returned tool calls in the local Crafter environment.
|
|
8
|
+
* Record achievements/steps and save every rendered frame under `examples/vlm/temp/`.
|
|
9
|
+
|
|
10
|
+
Concurrency is capped by an asyncio semaphore (default parallelism = 10).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import asyncio
|
|
17
|
+
import base64
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
from collections import Counter, defaultdict
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any
|
|
25
|
+
from uuid import uuid4
|
|
26
|
+
|
|
27
|
+
from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.environment import (
|
|
28
|
+
CrafterEnvironmentWrapper,
|
|
29
|
+
)
|
|
30
|
+
from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
|
|
31
|
+
from openai import AsyncOpenAI
|
|
32
|
+
from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
|
|
33
|
+
from synth_ai.environments.examples.crafter_classic.taskset import (
|
|
34
|
+
CrafterTaskInstance,
|
|
35
|
+
CrafterTaskInstanceMetadata,
|
|
36
|
+
)
|
|
37
|
+
from synth_ai.environments.tasks.core import Impetus, Intent
|
|
38
|
+
|
|
39
|
+
OUTPUT_ROOT = Path("examples/vlm/temp")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Mode(str, Enum):
    """Prompt modality being benchmarked.

    Member values are lowercase labels; they are used to name the per-mode
    frame directories and as the keys of the benchmark summary.
    """

    TEXT = "text"  # image fields are stripped from the observation before prompting
    IMAGE = "image"  # user messages are reduced to their image parts when any exist
    BOTH = "both"  # neither stripping nor image-only filtering is applied
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class EpisodeResult:
    """Outcome of a single benchmark episode for one mode/seed pair."""

    mode: Mode  # prompt modality the episode ran with
    seed: int  # seed used for the task instance and environment wrapper
    steps_taken: int  # number of environment steps actually executed
    achievements: set[str]  # achievement names whose status was reported truthy
    total_reward: float  # sum of the numeric `reward_last_step` values observed
    tool_calls: int  # total tool calls parsed from the model responses
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _ensure_openai_client(api_key: str | None) -> AsyncOpenAI:
|
|
59
|
+
if not api_key:
|
|
60
|
+
raise RuntimeError(
|
|
61
|
+
"OPENAI_API_KEY must be set to run the VLM benchmark (export the key or add to your .env)."
|
|
62
|
+
)
|
|
63
|
+
return AsyncOpenAI(api_key=api_key)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _build_task_instance(seed: int) -> CrafterTaskInstance:
    """Create a reproducible Crafter task instance for ``seed``.

    The instance gets a fresh UUID, a generic exploration instruction, and a
    ``config`` dict carrying the seed, episode length, and world area.
    """
    task_impetus = Impetus(instructions="Explore, survive, and unlock achievements.")
    task_intent = Intent(
        rubric={"goal": "Unlock achievements"},
        gold_trajectories=None,
        gold_state_diff={},
    )
    task_metadata = CrafterTaskInstanceMetadata(
        difficulty="custom",
        seed=seed,
        num_trees_radius=0,
        num_cows_radius=0,
        num_hostiles_radius=0,
    )
    task = CrafterTaskInstance(
        id=uuid4(),
        impetus=task_impetus,
        intent=task_intent,
        metadata=task_metadata,
        is_reproducible=True,
        initial_engine_snapshot=None,
    )
    # Engine expects these config keys
    task.config = {"seed": seed, "length": 256, "area": [64, 64]}
    return task
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _save_observation_frame(observation_packet: dict[str, Any], dest_path: Path) -> None:
|
|
90
|
+
obs = observation_packet.get("observation")
|
|
91
|
+
if not isinstance(obs, dict):
|
|
92
|
+
return
|
|
93
|
+
image_b64 = obs.get("observation_image_base64")
|
|
94
|
+
if not isinstance(image_b64, str) or not image_b64:
|
|
95
|
+
return
|
|
96
|
+
try:
|
|
97
|
+
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
|
98
|
+
dest_path.write_bytes(base64.b64decode(image_b64))
|
|
99
|
+
except Exception:
|
|
100
|
+
pass # best effort
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _strip_image_fields(observation_packet: dict[str, Any]) -> dict[str, Any]:
|
|
104
|
+
stripped = json.loads(json.dumps(observation_packet))
|
|
105
|
+
obs = stripped.get("observation")
|
|
106
|
+
if isinstance(obs, dict):
|
|
107
|
+
for key in list(obs.keys()):
|
|
108
|
+
if key.startswith("observation_image"):
|
|
109
|
+
obs.pop(key, None)
|
|
110
|
+
return stripped
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _make_image_only_request(request: dict[str, Any]) -> dict[str, Any]:
|
|
114
|
+
cloned = json.loads(json.dumps(request))
|
|
115
|
+
for message in cloned.get("messages", []):
|
|
116
|
+
if message.get("role") != "user":
|
|
117
|
+
continue
|
|
118
|
+
content = message.get("content")
|
|
119
|
+
if isinstance(content, list):
|
|
120
|
+
image_parts = [
|
|
121
|
+
item
|
|
122
|
+
for item in content
|
|
123
|
+
if isinstance(item, dict) and item.get("type") in {"image_url", "image"}
|
|
124
|
+
]
|
|
125
|
+
message["content"] = image_parts or content
|
|
126
|
+
elif isinstance(content, str):
|
|
127
|
+
# No structured parts available; leave as empty string
|
|
128
|
+
message["content"] = ""
|
|
129
|
+
return cloned
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
async def _run_episode(
    *,
    mode: Mode,
    seed: int,
    client: AsyncOpenAI,
    model: str,
    max_steps: int,
    temperature: float,
    semaphore: asyncio.Semaphore,
) -> EpisodeResult:
    """Run one seeded Crafter episode in the given prompt mode.

    The whole episode runs while holding ``semaphore``. Each step formats the
    current observation through the policy, sends the resulting request to
    OpenAI Chat Completions, executes the returned tool calls in the
    environment, and saves the rendered frame under ``OUTPUT_ROOT``.

    The episode ends after ``max_steps`` steps, when the observation is not a
    dict, when the model returns no tool calls, or when the environment
    reports ``done``.

    Raises:
        RuntimeError: If the environment step does not return a dict.
    """
    async with semaphore:
        task_instance = _build_task_instance(seed)
        env = CrafterClassicEnvironment(task_instance)
        wrapper = CrafterEnvironmentWrapper(env, seed=seed)

        policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
        await policy.initialize({"use_tools": True, "model": model})

        observation_packet = await wrapper.initialize()
        achievements: set[str] = set()
        total_reward = 0.0
        steps_taken = 0
        tool_calls_total = 0

        # The environment is released in ``finally`` so a failed request or an
        # unexpected response cannot leak it.
        try:
            frames_dir = OUTPUT_ROOT / f"{mode.value}_frames" / f"seed_{seed:04d}"
            _save_observation_frame(observation_packet, frames_dir / "step_000.png")

            for step_idx in range(max_steps):
                obs_dict = observation_packet.get("observation")
                if not isinstance(obs_dict, dict):
                    break

                observation_for_policy: dict[str, Any]
                metadata_payload: dict[str, Any] = {}

                if mode == Mode.TEXT:
                    observation_for_policy = _strip_image_fields(observation_packet)
                else:
                    observation_for_policy = json.loads(json.dumps(observation_packet))
                    metadata_payload["raw_observation"] = observation_packet

                obs_text = policy._format_observation_for_llm(observation_for_policy)  # noqa: SLF001
                _, meta = await policy.step(
                    observation_text=obs_text,
                    metadata=metadata_payload,
                )
                # Work on a copy so the policy's own request record stays untouched.
                inference_request = json.loads(json.dumps(meta["inference_request"]))

                if mode == Mode.IMAGE:
                    inference_request = _make_image_only_request(inference_request)

                inference_request.update(
                    {
                        "model": model,
                        "temperature": temperature,
                        "max_tokens": inference_request.get("max_tokens", 512),
                    }
                )
                # These keys are removed before the request is sent to OpenAI.
                inference_request.pop("stop_after_tool_calls", None)
                inference_request.pop("thinking_mode", None)
                inference_request.pop("thinking_budget", None)

                response = await client.chat.completions.create(**inference_request)
                response_dict = response.model_dump()

                assistant_tool_calls = CrafterPolicy.parse_response_to_tool_calls(
                    response_dict,
                    use_tools=policy.use_tools,
                )
                if not assistant_tool_calls:
                    break

                tool_calls_total += len(assistant_tool_calls)
                assistant_message = response_dict["choices"][0].get("message") or {}
                assistant_text = assistant_message.get("content")

                env_response = await wrapper.step(assistant_tool_calls)
                if not isinstance(env_response, dict):
                    raise RuntimeError(
                        f"Unexpected environment response type: {type(env_response)!r}"
                    )

                policy._append_assistant_turn(  # noqa: SLF001
                    assistant_text,
                    assistant_tool_calls,
                    env_response,
                )

                steps_taken += 1
                obs = env_response.get("observation")
                if isinstance(obs, dict):
                    ach = obs.get("achievements_status")
                    if isinstance(ach, dict):
                        achievements.update(str(name) for name, unlocked in ach.items() if unlocked)
                    reward = obs.get("reward_last_step")
                    if isinstance(reward, (int, float)):
                        total_reward += float(reward)

                _save_observation_frame(env_response, frames_dir / f"step_{step_idx + 1:03d}.png")

                if env_response.get("done"):
                    break
                observation_packet = env_response
        finally:
            await wrapper.terminate()

        return EpisodeResult(
            mode=mode,
            seed=seed,
            steps_taken=steps_taken,
            achievements=achievements,
            total_reward=total_reward,
            tool_calls=tool_calls_total,
        )
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _summarise(results: list[EpisodeResult]) -> dict[str, Any]:
|
|
248
|
+
grouped: dict[Mode, list[EpisodeResult]] = defaultdict(list)
|
|
249
|
+
for result in results:
|
|
250
|
+
grouped[result.mode].append(result)
|
|
251
|
+
|
|
252
|
+
summary: dict[str, Any] = {}
|
|
253
|
+
for mode, mode_results in grouped.items():
|
|
254
|
+
if not mode_results:
|
|
255
|
+
continue
|
|
256
|
+
mean_steps = sum(r.steps_taken for r in mode_results) / len(mode_results)
|
|
257
|
+
mean_achievements = sum(len(r.achievements) for r in mode_results) / len(mode_results)
|
|
258
|
+
achievement_counts = Counter()
|
|
259
|
+
for res in mode_results:
|
|
260
|
+
achievement_counts.update(res.achievements)
|
|
261
|
+
summary[mode.value] = {
|
|
262
|
+
"episodes": len(mode_results),
|
|
263
|
+
"mean_steps": round(mean_steps, 2),
|
|
264
|
+
"mean_achievements": round(mean_achievements, 2),
|
|
265
|
+
"total_tool_calls": sum(r.tool_calls for r in mode_results),
|
|
266
|
+
"achievements": {name: count for name, count in sorted(achievement_counts.items())},
|
|
267
|
+
}
|
|
268
|
+
return summary
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
async def main() -> None:
    """Parse CLI options, run every mode/seed episode, and report a summary.

    Episodes for all three modes are scheduled up front; concurrency is capped
    by a semaphore sized from ``--concurrency`` (at least 1). The summary is
    written to ``OUTPUT_ROOT / "vlm_benchmark_summary.json"`` and printed.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--model", default="gpt-4o-mini-2024-07-18", help="OpenAI model id to benchmark"
    )
    parser.add_argument("--seeds", type=int, default=20, help="Number of seeds per mode")
    parser.add_argument("--steps", type=int, default=10, help="Max steps per episode")
    parser.add_argument("--temperature", type=float, default=0.6, help="Sampling temperature")
    parser.add_argument("--concurrency", type=int, default=10, help="Max concurrent OpenAI calls")
    args = parser.parse_args()

    api_key = os.getenv("OPENAI_API_KEY")
    client = _ensure_openai_client(api_key)
    semaphore = asyncio.Semaphore(max(1, args.concurrency))

    OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

    tasks: list[asyncio.Task[EpisodeResult]] = [
        asyncio.create_task(
            _run_episode(
                mode=mode,
                seed=seed,
                client=client,
                model=args.model,
                max_steps=args.steps,
                temperature=args.temperature,
                semaphore=semaphore,
            )
        )
        for mode in (Mode.TEXT, Mode.IMAGE, Mode.BOTH)
        for seed in range(args.seeds)
    ]

    try:
        results = await asyncio.gather(*tasks)
    except BaseException:
        # Stop outstanding episodes before the shared client is closed under them.
        for task in tasks:
            task.cancel()
        raise
    finally:
        # Release the client's underlying HTTP connections.
        await client.close()

    summary = _summarise(results)

    summary_path = OUTPUT_ROOT / "vlm_benchmark_summary.json"
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")

    print("\nBenchmark Summary")
    print("-----------------")
    print(json.dumps(summary, indent=2))
    print(f"\nFrames stored under: {OUTPUT_ROOT}/<mode>_frames/seed_xxxx/")
    print(f"Summary saved to: {summary_path}")


if __name__ == "__main__":
    asyncio.run(main())
|
|
@@ -9,7 +9,7 @@ import sqlite3
|
|
|
9
9
|
import sys
|
|
10
10
|
from collections import Counter, defaultdict
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import Any
|
|
12
|
+
from typing import Any
|
|
13
13
|
|
|
14
14
|
Row = sqlite3.Row
|
|
15
15
|
|
|
@@ -56,7 +56,7 @@ def fetch_model_usage(conn: sqlite3.Connection) -> list[dict[str, Any]]:
|
|
|
56
56
|
def _parse_json(value: Any) -> Any:
|
|
57
57
|
if value is None:
|
|
58
58
|
return None
|
|
59
|
-
if isinstance(value,
|
|
59
|
+
if isinstance(value, dict | list):
|
|
60
60
|
return value
|
|
61
61
|
try:
|
|
62
62
|
return json.loads(value)
|
|
@@ -64,7 +64,7 @@ def _parse_json(value: Any) -> Any:
|
|
|
64
64
|
return None
|
|
65
65
|
|
|
66
66
|
|
|
67
|
-
AchievementMap = dict[
|
|
67
|
+
AchievementMap = dict[tuple[str, int], dict[str, list[str]]]
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
def fetch_achievement_data(
|
|
@@ -162,7 +162,7 @@ def fetch_achievement_data(
|
|
|
162
162
|
achievement_name_counts.update(achievement_set)
|
|
163
163
|
|
|
164
164
|
achievement_size_counts: Counter = Counter()
|
|
165
|
-
for
|
|
165
|
+
for _session_id, count in unique_counts_per_session.items():
|
|
166
166
|
achievement_size_counts[count] += 1
|
|
167
167
|
|
|
168
168
|
return (
|
|
@@ -229,7 +229,9 @@ def format_model_stats(stats: list[dict[str, Any]]) -> str:
|
|
|
229
229
|
if not stats:
|
|
230
230
|
return "No model usage recorded."
|
|
231
231
|
lines = ["Model usage (by LLM calls):"]
|
|
232
|
-
header =
|
|
232
|
+
header = (
|
|
233
|
+
f"{'Model':30} {'Provider':10} {'Calls':>7} {'Tokens (in/out)':>20} {'Avg latency ms':>15}"
|
|
234
|
+
)
|
|
233
235
|
lines.append(header)
|
|
234
236
|
lines.append("-" * len(header))
|
|
235
237
|
for item in stats:
|
|
@@ -243,9 +245,7 @@ def format_model_stats(stats: list[dict[str, Any]]) -> str:
|
|
|
243
245
|
return "\n".join(lines)
|
|
244
246
|
|
|
245
247
|
|
|
246
|
-
def format_achievement_summary(
|
|
247
|
-
name_counts: Counter, size_counts: Counter
|
|
248
|
-
) -> str:
|
|
248
|
+
def format_achievement_summary(name_counts: Counter, size_counts: Counter) -> str:
|
|
249
249
|
lines = ["Unique achievements unlocked:"]
|
|
250
250
|
if name_counts:
|
|
251
251
|
top = name_counts.most_common()
|
|
@@ -295,7 +295,7 @@ def format_reward_summary(outcome: dict[str, Any], breakdown: list[dict[str, Any
|
|
|
295
295
|
|
|
296
296
|
|
|
297
297
|
def compute_model_achievement_stats(
|
|
298
|
-
conn: sqlite3.Connection, session_unique_sets: dict[str,
|
|
298
|
+
conn: sqlite3.Connection, session_unique_sets: dict[str, set[str]]
|
|
299
299
|
) -> dict[str, dict[str, Any]]:
|
|
300
300
|
"""Aggregate unique-achievement stats per model."""
|
|
301
301
|
|
|
@@ -349,7 +349,9 @@ def format_model_achievement_stats(model_stats: dict[str, dict[str, Any]]) -> st
|
|
|
349
349
|
return "Achievement stats by model:\n (no model sessions recorded)"
|
|
350
350
|
|
|
351
351
|
lines = ["Achievement stats by model:"]
|
|
352
|
-
for model_name in sorted(
|
|
352
|
+
for model_name in sorted(
|
|
353
|
+
model_stats.keys(), key=lambda m: model_stats[m]["sessions"], reverse=True
|
|
354
|
+
):
|
|
353
355
|
stats = model_stats[model_name]
|
|
354
356
|
providers = ", ".join(sorted(stats["providers"])) if stats["providers"] else "-"
|
|
355
357
|
sessions = stats["sessions"]
|
|
@@ -42,9 +42,13 @@ base = "Qwen/Qwen3-4B"
|
|
|
42
42
|
label = "crafter-rl-from-base"
|
|
43
43
|
|
|
44
44
|
[rollout]
|
|
45
|
+
env_name = "crafter"
|
|
45
46
|
max_turns = 10
|
|
46
47
|
episodes_per_batch = 64
|
|
47
|
-
policy_name = "crafter"
|
|
48
|
+
policy_name = "crafter-react"
|
|
49
|
+
max_concurrent_rollouts = 8
|
|
50
|
+
batches_per_step = 2
|
|
51
|
+
ops = ["agent", "env"]
|
|
48
52
|
|
|
49
53
|
[evaluation]
|
|
50
54
|
# Run baseline evaluation over the first 100 seeds every 20 training iterations
|
|
@@ -55,6 +59,12 @@ seeds = [
|
|
|
55
59
|
]
|
|
56
60
|
|
|
57
61
|
[training]
|
|
62
|
+
num_epochs = 1
|
|
63
|
+
iterations_per_epoch = 10
|
|
64
|
+
batch_size = 16
|
|
65
|
+
group_size = 4
|
|
66
|
+
gradient_accumulation_steps = 1
|
|
67
|
+
learning_rate = 5e-5
|
|
58
68
|
log_interval = 1
|
|
59
69
|
weight_sync_interval = 1
|
|
60
70
|
# Additional RL hyperparameters can go here
|