synth-ai 0.2.9.dev7__py3-none-any.whl → 0.2.9.dev8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/__init__.py +16 -0
- examples/crafter_debug_render.py +8 -11
- examples/qwen_coder/README.md +102 -0
- examples/qwen_coder/_shared.py +113 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
- examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
- examples/qwen_coder/configs/coder_lora_small.toml +58 -0
- examples/qwen_coder/generate_dataset.py +98 -0
- examples/qwen_coder/infer_ft_smoke.py +64 -0
- examples/qwen_coder/infer_prod_proxy.py +73 -0
- examples/qwen_coder/infer_via_synth.py +87 -0
- examples/qwen_coder/scripts/infer_coder.sh +18 -0
- examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
- examples/qwen_coder/sft_full_17b.py +103 -0
- examples/qwen_coder/sft_lora_30b.py +110 -0
- examples/qwen_coder/subset_jsonl.py +38 -0
- examples/qwen_coder/validate_jsonl.py +59 -0
- examples/rl/run_eval.py +36 -37
- examples/rl/run_rl_and_save.py +5 -5
- examples/rl/task_app/math_single_step.py +65 -43
- examples/rl/task_app/math_task_app.py +3 -3
- examples/sft/README.md +139 -0
- examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
- examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
- examples/sft/evaluate.py +117 -0
- examples/sft/export_dataset.py +117 -0
- examples/sft/generate_traces.py +162 -0
- examples/swe/__init__.py +12 -0
- examples/swe/task_app/README.md +105 -0
- examples/swe/task_app/__init__.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +571 -0
- examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
- examples/swe/task_app/hosted/README.md +173 -0
- examples/swe/task_app/hosted/__init__.py +5 -0
- examples/swe/task_app/hosted/branching.py +143 -0
- examples/swe/task_app/hosted/environment_routes.py +1289 -0
- examples/swe/task_app/hosted/envs/__init__.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
- examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
- examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
- examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
- examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
- examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
- examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
- examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
- examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
- examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
- examples/swe/task_app/hosted/hosted_app.py +204 -0
- examples/swe/task_app/hosted/inference/__init__.py +5 -0
- examples/swe/task_app/hosted/inference/openai_client.py +618 -0
- examples/swe/task_app/hosted/main.py +100 -0
- examples/swe/task_app/hosted/policy_routes.py +1079 -0
- examples/swe/task_app/hosted/registry.py +195 -0
- examples/swe/task_app/hosted/rollout.py +1869 -0
- examples/swe/task_app/hosted/storage/__init__.py +5 -0
- examples/swe/task_app/hosted/storage/volume.py +211 -0
- examples/swe/task_app/hosted/test_agents.py +161 -0
- examples/swe/task_app/hosted/test_service.py +137 -0
- examples/swe/task_app/hosted/utils.py +62 -0
- examples/vlm/README.md +68 -0
- examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
- examples/vlm/crafter_image_only_agent.py +207 -0
- examples/vlm/crafter_openai_vlm_agent.py +277 -0
- examples/vlm/filter_image_rows.py +63 -0
- examples/vlm/run_crafter_vlm_benchmark.py +316 -0
- examples/warming_up_to_rl/analyze_trace_db.py +5 -5
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
- examples/warming_up_to_rl/export_trace_sft.py +78 -21
- examples/warming_up_to_rl/groq_test.py +4 -4
- examples/warming_up_to_rl/manage_secrets.py +13 -18
- examples/warming_up_to_rl/run_eval.py +42 -44
- examples/warming_up_to_rl/run_fft_and_save.py +11 -16
- examples/warming_up_to_rl/run_local_rollout.py +1 -3
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -4
- examples/warming_up_to_rl/run_local_rollout_parallel.py +1 -4
- examples/warming_up_to_rl/run_local_rollout_traced.py +3 -5
- examples/warming_up_to_rl/run_rl_and_save.py +5 -6
- examples/warming_up_to_rl/run_rollout_remote.py +8 -10
- examples/warming_up_to_rl/task_app/README.md +6 -2
- examples/warming_up_to_rl/task_app/grpo_crafter.py +234 -35
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +2 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +131 -114
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +101 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +73 -51
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +14 -6
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +16 -16
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +32 -34
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +94 -31
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +303 -203
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +328 -225
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +13 -13
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
- synth/__init__.py +14 -0
- synth_ai/__init__.py +26 -4
- synth_ai/api/models/supported.py +376 -0
- synth_ai/api/train/builders.py +128 -21
- synth_ai/api/train/cli.py +80 -64
- synth_ai/api/train/config_finder.py +7 -2
- synth_ai/api/train/env_resolver.py +1 -1
- synth_ai/api/train/pollers.py +2 -1
- synth_ai/api/train/supported_algos.py +139 -0
- synth_ai/api/train/task_app.py +1 -2
- synth_ai/api/train/utils.py +13 -44
- synth_ai/cli/__init__.py +8 -0
- synth_ai/cli/_modal_wrapper.py +28 -0
- synth_ai/cli/_typer_patch.py +49 -0
- synth_ai/cli/balance.py +1 -2
- synth_ai/cli/calc.py +1 -1
- synth_ai/cli/demo.py +2 -1
- synth_ai/cli/recent.py +2 -2
- synth_ai/cli/rl_demo.py +2 -1
- synth_ai/cli/root.py +11 -13
- synth_ai/cli/status.py +2 -2
- synth_ai/cli/task_apps.py +529 -179
- synth_ai/cli/traces.py +6 -4
- synth_ai/cli/watch.py +12 -18
- synth_ai/demo_registry.py +1 -1
- synth_ai/demos/core/cli.py +36 -43
- synth_ai/demos/demo_task_apps/__init__.py +3 -3
- synth_ai/demos/demo_task_apps/core.py +17 -25
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +3 -4
- synth_ai/demos/demo_task_apps/math/app.py +2 -1
- synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -4
- synth_ai/demos/demo_task_apps/math/modal_task_app.py +16 -18
- synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +76 -1
- synth_ai/environments/reproducibility/tree.py +2 -5
- synth_ai/environments/service/app.py +11 -12
- synth_ai/environments/service/core_routes.py +4 -7
- synth_ai/environments/stateful/engine.py +1 -1
- synth_ai/environments/tasks/core.py +1 -0
- synth_ai/environments/tasks/filters.py +5 -6
- synth_ai/environments/tasks/utils.py +4 -5
- synth_ai/handshake.py +9 -9
- synth_ai/http.py +1 -1
- synth_ai/http_client.py +18 -10
- synth_ai/inference/client.py +15 -5
- synth_ai/jobs/client.py +78 -83
- synth_ai/learning/__init__.py +41 -6
- synth_ai/learning/algorithms.py +14 -0
- synth_ai/learning/client.py +91 -24
- synth_ai/learning/config.py +2 -38
- synth_ai/learning/ft_client.py +4 -59
- synth_ai/learning/health.py +5 -6
- synth_ai/learning/jobs.py +31 -47
- synth_ai/{rl → learning/rl}/__init__.py +14 -4
- synth_ai/learning/rl/client.py +267 -0
- synth_ai/learning/rl/config.py +31 -0
- synth_ai/{rl → learning/rl}/contracts.py +5 -8
- synth_ai/{rl → learning/rl}/env_keys.py +39 -15
- synth_ai/learning/rl/secrets.py +13 -0
- synth_ai/learning/rl_client.py +2 -281
- synth_ai/learning/sft/__init__.py +29 -0
- synth_ai/learning/sft/client.py +68 -0
- synth_ai/learning/sft/config.py +270 -0
- synth_ai/learning/sft/data.py +295 -0
- synth_ai/learning/sse.py +25 -24
- synth_ai/learning/validators.py +25 -28
- synth_ai/lm/__init__.py +21 -47
- synth_ai/main.py +4 -0
- synth_ai/task/__init__.py +25 -27
- synth_ai/task/apps/__init__.py +7 -8
- synth_ai/task/auth.py +8 -8
- synth_ai/task/client.py +14 -14
- synth_ai/task/contracts.py +36 -35
- synth_ai/task/datasets.py +6 -5
- synth_ai/task/errors.py +10 -10
- synth_ai/task/health.py +17 -9
- synth_ai/task/json.py +58 -23
- synth_ai/task/proxy.py +13 -9
- synth_ai/task/rubrics.py +16 -15
- synth_ai/task/server.py +12 -12
- synth_ai/task/tracing_utils.py +4 -4
- synth_ai/task/vendors.py +5 -6
- synth_ai/tracing_v3/__init__.py +2 -0
- synth_ai/tracing_v3/abstractions.py +21 -4
- synth_ai/tracing_v3/decorators.py +18 -16
- synth_ai/tracing_v3/hooks.py +5 -5
- synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
- synth_ai/tracing_v3/session_tracer.py +40 -14
- synth_ai/tracing_v3/storage/base.py +85 -0
- synth_ai/tracing_v3/storage/config.py +21 -8
- synth_ai/tracing_v3/storage/factory.py +10 -7
- synth_ai/tracing_v3/storage/utils.py +4 -2
- synth_ai/tracing_v3/turso/daemon.py +7 -2
- synth_ai/tracing_v3/turso/models.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +1173 -0
- synth_ai/tracing_v3/utils.py +4 -4
- synth_ai/v0/api/__init__.py +8 -0
- synth_ai/v0/api/models/__init__.py +8 -0
- synth_ai/v0/api/models/supported.py +8 -0
- synth_ai/v0/config/__init__.py +15 -0
- synth_ai/v0/config/base_url.py +12 -0
- synth_ai/v0/lm/__init__.py +51 -0
- synth_ai/{lm → v0/lm}/caching/ephemeral.py +2 -2
- synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
- synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
- synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
- synth_ai/{lm → v0/lm}/config.py +6 -1
- synth_ai/{lm → v0/lm}/core/all.py +9 -9
- synth_ai/{lm → v0/lm}/core/main.py +6 -6
- synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
- synth_ai/{lm → v0/lm}/core/synth_models.py +2 -14
- synth_ai/{lm → v0/lm}/core/vendor_clients.py +2 -2
- synth_ai/{lm → v0/lm}/overrides.py +2 -2
- synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
- synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
- synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +9 -9
- synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
- synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +10 -10
- synth_ai/{lm → v0/lm}/vendors/openai_standard.py +8 -8
- synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +3 -3
- synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
- synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
- synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
- synth_ai/{lm → v0/lm}/vendors/synth_client.py +1 -1
- synth_ai/v0/tracing_v3/__init__.py +10 -0
- synth_ai/v0/tracing_v3/abstractions.py +3 -0
- synth_ai/v0/tracing_v3/decorators.py +3 -0
- synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
- synth_ai/v0/tracing_v3/session_tracer.py +3 -0
- synth_ai-0.2.9.dev8.dist-info/METADATA +191 -0
- {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/RECORD +268 -238
- {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/top_level.txt +1 -0
- examples/common_old/backend.py +0 -20
- examples/evals_old/README.md +0 -98
- examples/evals_old/__init__.py +0 -6
- examples/evals_old/compare_models.py +0 -1038
- examples/evals_old/example_log.md +0 -145
- examples/evals_old/run_demo.sh +0 -126
- examples/evals_old/trace_analysis.py +0 -270
- examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
- examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
- examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
- examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -243
- examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
- examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
- examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
- examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
- examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
- examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -119
- examples/finetuning_old/synth_qwen_v1/README.md +0 -68
- examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
- examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -243
- examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
- examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
- examples/finetuning_old/synth_qwen_v1/infer.py +0 -36
- examples/finetuning_old/synth_qwen_v1/poll.py +0 -46
- examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
- examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
- examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1933
- examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -210
- examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -237
- examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
- examples/finetuning_old/synth_qwen_v1/util.py +0 -152
- examples/rl_old/task_app.py +0 -1131
- examples/warming_up_to_rl/old/event_rewards.md +0 -234
- examples/warming_up_to_rl/old/notes.md +0 -73
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
- synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
- synth_ai/experimental/synth_oss.py +0 -445
- synth_ai/learning/filtering.py +0 -0
- synth_ai/learning/offline/dpo.py +0 -0
- synth_ai/learning/offline/providers.py +0 -7
- synth_ai/learning/offline/sft.py +0 -0
- synth_ai/learning/offline/shared.py +0 -0
- synth_ai/learning/online/grpo.py +0 -0
- synth_ai/learning/online/irft.py +0 -0
- synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
- synth_ai/learning/prompts/gepa.py +0 -0
- synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -211
- synth_ai/learning/prompts/mipro.py +0 -289
- synth_ai/learning/prompts/random_search.py +0 -249
- synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
- synth_ai/learning/prompts/run_random_search_banking77.py +0 -329
- synth_ai/rl/secrets.py +0 -19
- synth_ai/scripts/verify_rewards.py +0 -100
- synth_ai/tracing/__init__.py +0 -30
- synth_ai/tracing_v1/__init__.py +0 -33
- synth_ai/tracing_v3/turso/__init__.py +0 -25
- synth_ai/tracing_v3/turso/manager.py +0 -838
- synth_ai/zyk/__init__.py +0 -30
- synth_ai-0.2.9.dev7.dist-info/METADATA +0 -131
- /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
- /synth_ai/{lm → v0/lm}/constants.py +0 -0
- /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/core/exceptions.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
- /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
- /synth_ai/{lm → v0/lm}/injection.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
- /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
- /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
- /synth_ai/{lm → v0/lm}/warmup.py +0 -0
- {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,145 +0,0 @@
|
|
|
1
|
-
joshuapurtell@Mac synth-ai % bash examples/evals/run_demo.sh
|
|
2
|
-
Models to compare (space-separated) [gpt-5-nano gpt-4.1-nano]:
|
|
3
|
-
Models: gpt-5-nano gpt-4.1-nano
|
|
4
|
-
Episodes per model [3]: 5
|
|
5
|
-
Max turns per episode [5]: 5
|
|
6
|
-
Parallelism per model (concurrency) [5]: 5
|
|
7
|
-
Difficulty [easy]:
|
|
8
|
-
Running comparison: episodes=5, max_turns=5, difficulty=easy, concurrency=5
|
|
9
|
-
Detected SYNTH_API_KEY (sk_liv...ac95). Use this key? [Y/n]: n
|
|
10
|
-
Use SYNTH_API_KEY_PROD (sk_liv...a2a4)? [y/N]: Y
|
|
11
|
-
[PATCH] Attempting to apply Crafter deterministic patch...
|
|
12
|
-
[PATCH] Patching crafter.Env._balance_object...
|
|
13
|
-
[PATCH] crafter.Env._balance_object patched.
|
|
14
|
-
[PATCH] Attempting to apply Crafter serialization patch v3...
|
|
15
|
-
[PATCH] Adding enhanced save/load methods to crafter.Env...
|
|
16
|
-
[PATCH] crafter.Env.save() and load() methods added (v3).
|
|
17
|
-
[PATCH] Crafter serialization patch v3 complete.
|
|
18
|
-
[PATCH] Attempting to apply simplified Crafter world configuration patch...
|
|
19
|
-
[PATCH] Simplified Crafter world configuration patch complete.
|
|
20
|
-
[PATCH] Available configs: easy, normal, hard, peaceful
|
|
21
|
-
✅ Loaded 8 Crafter achievement hooks (Easy, Medium, Hard)
|
|
22
|
-
🎮 Crafter Multi-Model Experiment
|
|
23
|
-
==================================================
|
|
24
|
-
Experiment ID: crafter_multi_model_20250808_170152
|
|
25
|
-
Models: gpt-5-nano, gpt-4.1-nano
|
|
26
|
-
Episodes per model: 5
|
|
27
|
-
Max turns per episode: 5
|
|
28
|
-
Difficulty: easy
|
|
29
|
-
Seeds: 1000 to 1004
|
|
30
|
-
Turn timeout: 20.0s
|
|
31
|
-
Episode timeout: 180.0s
|
|
32
|
-
Save traces: True
|
|
33
|
-
Database URL: sqlite+aiosqlite:////Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data
|
|
34
|
-
==================================================
|
|
35
|
-
✅ Crafter service is running
|
|
36
|
-
|
|
37
|
-
Running 5 episodes for gpt-5-nano in parallel...
|
|
38
|
-
|
|
39
|
-
gpt-5-nano | ep1: 0%| | 0/5 [00:00<?, ?turn/s]
|
|
40
|
-
Running 5 episodes for gpt-4.1-nano in parallel... | 0/5 [00:00<?, ?turn/s]
|
|
41
|
-
gpt-5-nano | ep3: 0%| | 0/5 [00:00<?, ?turn/s]
|
|
42
|
-
gpt-4.1-nano | ep3: 100%|██████████████████████████████████████████████| 5/5 [00:09<00:00, 1.95s/turn, ach=1]
|
|
43
|
-
gpt-4.1-nano | ep2: 80%|████████████████████████████████████▊ | 4/5 [00:10<00:02, 2.64s/turn, ach=2]
|
|
44
|
-
gpt-4.1-nano | ep4: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 2.32s/turn, ach=0]
|
|
45
|
-
gpt-4.1-nano | ep5: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 2.37s/turn, ach=2]
|
|
46
|
-
gpt-5-nano | ep1: 20%|█████████▌ | 1/5 [00:21<01:24, 21.13s/turn, ach=0 ⏰ Turn 3 timed out for episode 0 after 20.0s | 2/5 [00:25<00:38, 12.83s/turn, ach=0]
|
|
47
|
-
gpt-4.1-nano | ep1: 60%|███████████████████████████▌ | 3/5 [00:28<00:19, 9.62s/turn, ach=1]
|
|
48
|
-
gpt-5-nano | ep3: 100%|████████████████████████████████████████████████| 5/5 [01:00<00:00, 12.05s/turn, ach=1]
|
|
49
|
-
gpt-5-nano | ep2: 100%|████████████████████████████████████████████████| 5/5 [01:07<00:00, 13.56s/turn, ach=2]
|
|
50
|
-
⏰ Turn 4 timed out for episode 3 after 20.0s██████████████████████| 5/5 [01:07<00:00, 14.04s/turn, ach=2]
|
|
51
|
-
gpt-5-nano | ep4: 80%|██████████████████████████████████████▍ | 4/5 [01:08<00:17, 17.02s/turn, ach=0]
|
|
52
|
-
gpt-5-nano | ep5: 100%|████████████████████████████████████████████████| 5/5 [01:13<00:00, 14.71s/turn, ach=1]
|
|
53
|
-
gpt-5-nano | ep1: 100%|████████████████████████████████████████████████| 5/5 [01:19<00:00, 15.83s/turn, ach=1]
|
|
54
|
-
gpt-4.1-nano | ep5: 100%|██████████████████████████████████████████████| 5/5 [00:11<00:00, 1.68s/turn, ach=2]
|
|
55
|
-
📊 Analysis Results:
|
|
56
|
-
================================================================================:13<00:00, 14.26s/turn, ach=1]
|
|
57
|
-
|
|
58
|
-
📈 Model Performance Summary:
|
|
59
|
-
Model Avg Achievements Max Achievements Invalid Rate Success Rate
|
|
60
|
-
--------------------------------------------------------------------------------------
|
|
61
|
-
gpt-4.1-nano 1.20 ± 0.75 2 0.00% 100.00%
|
|
62
|
-
gpt-5-nano 1.00 ± 0.63 2 0.00% 100.00%
|
|
63
|
-
|
|
64
|
-
🏆 Achievement Frequencies:
|
|
65
|
-
|
|
66
|
-
Achievement gpt-4.1-nano gpt-5-nano
|
|
67
|
-
-----------------------------------------------
|
|
68
|
-
collect_drink 2/5 ( 40%) 0/5 ( 0%)
|
|
69
|
-
collect_sapling 1/5 ( 20%) 2/5 ( 40%)
|
|
70
|
-
collect_wood 3/5 ( 60%) 2/5 ( 40%)
|
|
71
|
-
place_plant 0/5 ( 0%) 1/5 ( 20%)
|
|
72
|
-
|
|
73
|
-
💰 Model Usage Statistics from Current Experiment:
|
|
74
|
-
Model Provider Usage Count Avg Latency (ms) Total Cost
|
|
75
|
-
------------------------------------------------------------------------
|
|
76
|
-
gpt-5-nano openai 221 13006.57 $0.0000
|
|
77
|
-
gpt-4.1-nano openai 161 950.12 $0.0000
|
|
78
|
-
|
|
79
|
-
💾 Detailed results saved to: /Users/joshuapurtell/Documents/GitHub/synth-ai/temp/crafter_experiment_results_20250808_170312.json
|
|
80
|
-
|
|
81
|
-
✅ Experiment complete!
|
|
82
|
-
Using v3 traces DB: /Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data
|
|
83
|
-
\nAvailable achievements (session counts):
|
|
84
|
-
[PATCH] Attempting to apply Crafter deterministic patch...
|
|
85
|
-
[PATCH] Patching crafter.Env._balance_object...
|
|
86
|
-
[PATCH] crafter.Env._balance_object patched.
|
|
87
|
-
[PATCH] Attempting to apply Crafter serialization patch v3...
|
|
88
|
-
[PATCH] Adding enhanced save/load methods to crafter.Env...
|
|
89
|
-
[PATCH] crafter.Env.save() and load() methods added (v3).
|
|
90
|
-
[PATCH] Crafter serialization patch v3 complete.
|
|
91
|
-
[PATCH] Attempting to apply simplified Crafter world configuration patch...
|
|
92
|
-
[PATCH] Simplified Crafter world configuration patch complete.
|
|
93
|
-
[PATCH] Available configs: easy, normal, hard, peaceful
|
|
94
|
-
Achievements present (session counts):
|
|
95
|
-
- collect_drink: 44
|
|
96
|
-
- collect_sapling: 62
|
|
97
|
-
- collect_wood: 74
|
|
98
|
-
- defeat_skeleton: 4
|
|
99
|
-
- defeat_zombie: 2
|
|
100
|
-
- eat_cow: 2
|
|
101
|
-
- place_plant: 8
|
|
102
|
-
- place_table: 3
|
|
103
|
-
\nEnter achievements to filter by (space-separated), or press Enter for 'collect_wood':
|
|
104
|
-
|
|
105
|
-
Optionally restrict to models (space-separated), or press Enter to include all:
|
|
106
|
-
|
|
107
|
-
\nRunning: uv run python -m examples.evals.trace_analysis filter --db "/Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data" --achievements collect_wood --output ft_data/evals_filtered.jsonl
|
|
108
|
-
[PATCH] Attempting to apply Crafter deterministic patch...
|
|
109
|
-
[PATCH] Patching crafter.Env._balance_object...
|
|
110
|
-
[PATCH] crafter.Env._balance_object patched.
|
|
111
|
-
[PATCH] Attempting to apply Crafter serialization patch v3...
|
|
112
|
-
[PATCH] Adding enhanced save/load methods to crafter.Env...
|
|
113
|
-
[PATCH] crafter.Env.save() and load() methods added (v3).
|
|
114
|
-
[PATCH] Crafter serialization patch v3 complete.
|
|
115
|
-
[PATCH] Attempting to apply simplified Crafter world configuration patch...
|
|
116
|
-
[PATCH] Simplified Crafter world configuration patch complete.
|
|
117
|
-
[PATCH] Available configs: easy, normal, hard, peaceful
|
|
118
|
-
✅ Wrote 74 examples from 74 sessions → ft_data/evals_filtered.jsonl
|
|
119
|
-
\nRunning: uv run python -m examples.evals.trace_analysis stats --db "/Users/joshuapurtell/Documents/GitHub/synth-ai/traces/v3/synth_ai.db/dbs/default/data" --achievements collect_wood
|
|
120
|
-
[PATCH] Attempting to apply Crafter deterministic patch...
|
|
121
|
-
[PATCH] Patching crafter.Env._balance_object...
|
|
122
|
-
[PATCH] crafter.Env._balance_object patched.
|
|
123
|
-
[PATCH] Attempting to apply Crafter serialization patch v3...
|
|
124
|
-
[PATCH] Adding enhanced save/load methods to crafter.Env...
|
|
125
|
-
[PATCH] crafter.Env.save() and load() methods added (v3).
|
|
126
|
-
[PATCH] Crafter serialization patch v3 complete.
|
|
127
|
-
[PATCH] Attempting to apply simplified Crafter world configuration patch...
|
|
128
|
-
[PATCH] Simplified Crafter world configuration patch complete.
|
|
129
|
-
[PATCH] Available configs: easy, normal, hard, peaceful
|
|
130
|
-
Matched sessions (any of: collect_wood )
|
|
131
|
-
n=74 avg_reward=0.76 stddev=1.00
|
|
132
|
-
avg_first_unlock_step=4.7 stddev=4.6
|
|
133
|
-
Others
|
|
134
|
-
n=224 avg_reward=0.21 stddev=0.51
|
|
135
|
-
|
|
136
|
-
Achievement frequency by session (matched vs others):
|
|
137
|
-
- collect_drink: matched 25/74 (33.8%), others 19/224 (8.5%)
|
|
138
|
-
- collect_sapling: matched 21/74 (28.4%), others 41/224 (18.3%)
|
|
139
|
-
- place_table: matched 3/74 (4.1%), others 0/224 (0.0%)
|
|
140
|
-
- eat_cow: matched 2/74 (2.7%), others 0/224 (0.0%)
|
|
141
|
-
- place_plant: matched 3/74 (4.1%), others 5/224 (2.2%)
|
|
142
|
-
- defeat_skeleton: matched 2/74 (2.7%), others 2/224 (0.9%)
|
|
143
|
-
- defeat_zombie: matched 0/74 (0.0%), others 2/224 (0.9%)
|
|
144
|
-
\nDone. See ft_data/evals_filtered.jsonl and v3 DB for deeper analysis.
|
|
145
|
-
joshuapurtell@Mac synth-ai %
|
examples/evals_old/run_demo.sh
DELETED
|
@@ -1,126 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
|
|
3
|
-
# Run Crafter experiments comparing gpt-5-nano and Qwen/Qwen3-32B-Instruct
|
|
4
|
-
|
|
5
|
-
# Get the directory where this script is located
|
|
6
|
-
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
|
7
|
-
|
|
8
|
-
# Change to the synth-ai root directory
|
|
9
|
-
cd "$SCRIPT_DIR/../.."
|
|
10
|
-
|
|
11
|
-
# Interactive mini-demo: run small comparison, then analyze v3 traces
|
|
12
|
-
set -euo pipefail
|
|
13
|
-
|
|
14
|
-
# Load env (prefer local .env at repo root)
|
|
15
|
-
set +u
|
|
16
|
-
set -a
|
|
17
|
-
if [ -f ".env" ]; then source ".env"; fi
|
|
18
|
-
set +a
|
|
19
|
-
set -u
|
|
20
|
-
|
|
21
|
-
# Ensure API key present (SYNTH_API_KEY, optionally mirror to OPENAI_API_KEY)
|
|
22
|
-
ensure_api_key() {
|
|
23
|
-
local current_key="${SYNTH_API_KEY:-}"
|
|
24
|
-
if [ -n "$current_key" ]; then
|
|
25
|
-
local preview="${current_key:0:6}...${current_key: -4}"
|
|
26
|
-
read -r -p "Detected SYNTH_API_KEY ($preview). Use this key? [Y/n]: " USE_CUR || true
|
|
27
|
-
USE_CUR=${USE_CUR:-Y}
|
|
28
|
-
if [[ ! "$USE_CUR" =~ ^[Yy]$ ]]; then
|
|
29
|
-
current_key=""
|
|
30
|
-
fi
|
|
31
|
-
fi
|
|
32
|
-
|
|
33
|
-
if [ -z "$current_key" ] && [ -n "${SYNTH_API_KEY_PROD:-}" ]; then
|
|
34
|
-
local prod_prev="${SYNTH_API_KEY_PROD:0:6}...${SYNTH_API_KEY_PROD: -4}"
|
|
35
|
-
read -r -p "Use SYNTH_API_KEY_PROD ($prod_prev)? [y/N]: " USE_PROD || true
|
|
36
|
-
if [[ "$USE_PROD" =~ ^[Yy]$ ]]; then
|
|
37
|
-
current_key="$SYNTH_API_KEY_PROD"
|
|
38
|
-
fi
|
|
39
|
-
fi
|
|
40
|
-
|
|
41
|
-
while [ -z "$current_key" ]; do
|
|
42
|
-
echo
|
|
43
|
-
read -s -p "Enter your SYNTH_API_KEY: " KEY_IN || true
|
|
44
|
-
echo
|
|
45
|
-
if [ -n "$KEY_IN" ]; then
|
|
46
|
-
current_key="$KEY_IN"
|
|
47
|
-
else
|
|
48
|
-
echo "A valid SYNTH_API_KEY is required to continue."
|
|
49
|
-
fi
|
|
50
|
-
done
|
|
51
|
-
|
|
52
|
-
export SYNTH_API_KEY="$current_key"
|
|
53
|
-
if [ -z "${OPENAI_API_KEY:-}" ]; then
|
|
54
|
-
export OPENAI_API_KEY="$SYNTH_API_KEY"
|
|
55
|
-
echo "OPENAI_API_KEY set from SYNTH_API_KEY."
|
|
56
|
-
fi
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
# Interactive prompts (with sensible defaults)
|
|
60
|
-
MODELS_DEFAULT="gpt-5-nano gpt-4.1-nano"
|
|
61
|
-
read -r -p "Models to compare (space-separated) [${MODELS_DEFAULT}]: " MODELS_INPUT || true
|
|
62
|
-
MODELS=${MODELS_INPUT:-$MODELS_DEFAULT}
|
|
63
|
-
echo "Models: ${MODELS}"
|
|
64
|
-
|
|
65
|
-
read -r -p "Episodes per model [3]: " EPISODES_INPUT || true
|
|
66
|
-
EPISODES=${EPISODES_INPUT:-3}
|
|
67
|
-
|
|
68
|
-
read -r -p "Max turns per episode [5]: " MAX_TURNS_INPUT || true
|
|
69
|
-
MAX_TURNS=${MAX_TURNS_INPUT:-5}
|
|
70
|
-
|
|
71
|
-
read -r -p "Parallelism per model (concurrency) [5]: " CONCURRENCY_INPUT || true
|
|
72
|
-
CONCURRENCY=${CONCURRENCY_INPUT:-5}
|
|
73
|
-
|
|
74
|
-
read -r -p "Difficulty [easy]: " DIFFICULTY_INPUT || true
|
|
75
|
-
DIFFICULTY=${DIFFICULTY_INPUT:-easy}
|
|
76
|
-
|
|
77
|
-
echo "Running comparison: episodes=${EPISODES}, max_turns=${MAX_TURNS}, difficulty=${DIFFICULTY}, concurrency=${CONCURRENCY}"
|
|
78
|
-
|
|
79
|
-
# Ensure key before running rollouts
|
|
80
|
-
ensure_api_key
|
|
81
|
-
|
|
82
|
-
uv run python examples/evals/compare_models.py \
|
|
83
|
-
--episodes "${EPISODES}" \
|
|
84
|
-
--max-turns "${MAX_TURNS}" \
|
|
85
|
-
--difficulty "${DIFFICULTY}" \
|
|
86
|
-
--models ${MODELS} \
|
|
87
|
-
--base-seed 1000 \
|
|
88
|
-
--turn-timeout 20.0 \
|
|
89
|
-
--episode-timeout 180.0 \
|
|
90
|
-
--concurrency "${CONCURRENCY}" \
|
|
91
|
-
--quiet
|
|
92
|
-
|
|
93
|
-
# Derive v3 sqld internal DB path for quick analysis
|
|
94
|
-
DB_PATH="$PWD/traces/v3/synth_ai.db/dbs/default/data"
|
|
95
|
-
export DB_PATH
|
|
96
|
-
echo "Using v3 traces DB: $DB_PATH"
|
|
97
|
-
|
|
98
|
-
echo "\nAvailable achievements (session counts):"
|
|
99
|
-
uv run python -m examples.evals.trace_analysis list --db "$DB_PATH"
|
|
100
|
-
|
|
101
|
-
echo "\nEnter achievements to filter by (space-separated), or press Enter for 'collect_wood':"
|
|
102
|
-
read -r ACH
|
|
103
|
-
ACH=${ACH:-collect_wood}
|
|
104
|
-
|
|
105
|
-
echo "Optionally restrict to models (space-separated), or press Enter to include all:"
|
|
106
|
-
read -r MODELS_FILTER
|
|
107
|
-
|
|
108
|
-
mkdir -p ft_data
|
|
109
|
-
if [ -n "$MODELS_FILTER" ]; then
|
|
110
|
-
echo "\nRunning: uv run python -m examples.evals.trace_analysis filter --db \"$DB_PATH\" --achievements $ACH --output ft_data/evals_filtered.jsonl --models $MODELS_FILTER"
|
|
111
|
-
uv run python -m examples.evals.trace_analysis filter --db "$DB_PATH" --achievements $ACH --output ft_data/evals_filtered.jsonl --models $MODELS_FILTER
|
|
112
|
-
else
|
|
113
|
-
echo "\nRunning: uv run python -m examples.evals.trace_analysis filter --db \"$DB_PATH\" --achievements $ACH --output ft_data/evals_filtered.jsonl"
|
|
114
|
-
uv run python -m examples.evals.trace_analysis filter --db "$DB_PATH" --achievements $ACH --output ft_data/evals_filtered.jsonl
|
|
115
|
-
fi
|
|
116
|
-
|
|
117
|
-
# Show stats comparing filtered vs others (including achievement frequencies)
|
|
118
|
-
if [ -n "$MODELS_FILTER" ]; then
|
|
119
|
-
echo "\nRunning: uv run python -m examples.evals.trace_analysis stats --db \"$DB_PATH\" --achievements $ACH --models $MODELS_FILTER"
|
|
120
|
-
uv run python -m examples.evals.trace_analysis stats --db "$DB_PATH" --achievements $ACH --models $MODELS_FILTER
|
|
121
|
-
else
|
|
122
|
-
echo "\nRunning: uv run python -m examples.evals.trace_analysis stats --db \"$DB_PATH\" --achievements $ACH"
|
|
123
|
-
uv run python -m examples.evals.trace_analysis stats --db "$DB_PATH" --achievements $ACH
|
|
124
|
-
fi
|
|
125
|
-
|
|
126
|
-
echo "\nDone. See ft_data/evals_filtered.jsonl and v3 DB for deeper analysis."
|
|
@@ -1,270 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Trace analysis utilities for Crafter v3 traces (sqld/Turso).
|
|
4
|
-
|
|
5
|
-
Subcommands:
|
|
6
|
-
- list: List achievements present in the database and counts
|
|
7
|
-
- filter: Filter sessions by required achievements and export OpenAI-format JSONL
|
|
8
|
-
- stats: Compare rewards and achievement frequencies for filtered vs. others
|
|
9
|
-
|
|
10
|
-
Usage examples:
|
|
11
|
-
uvpm examples.evals.trace_analysis list --db traces/v3/synth_ai.db/dbs/default/data
|
|
12
|
-
uvpm examples.evals.trace_analysis filter --db traces/v3/synth_ai.db/dbs/default/data \
|
|
13
|
-
--achievements collect_wood --output ft_data/evals_collect_wood.jsonl
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
import argparse
|
|
17
|
-
import asyncio
|
|
18
|
-
import json
|
|
19
|
-
import json as pyjson
|
|
20
|
-
import math
|
|
21
|
-
from pathlib import Path
|
|
22
|
-
|
|
23
|
-
from synth_ai.environments.examples.crafter_classic.agent_demos.crafter_modal_ft.filter_traces_sft_turso import (
|
|
24
|
-
FinetuningDataExtractorV3,
|
|
25
|
-
)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def build_db_url(path: str) -> str:
|
|
29
|
-
if path.startswith("sqlite+"):
|
|
30
|
-
return path
|
|
31
|
-
return f"sqlite+aiosqlite:///{path}"
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
async def cmd_list(db_path: str) -> None:
|
|
35
|
-
db_url = build_db_url(db_path)
|
|
36
|
-
async with FinetuningDataExtractorV3(db_url) as ex:
|
|
37
|
-
sessions = await ex.get_all_sessions()
|
|
38
|
-
achievement_counts: dict[str, int] = {}
|
|
39
|
-
for _, row in sessions.iterrows():
|
|
40
|
-
ach_list = await ex.get_session_achievements(row["session_id"]) or []
|
|
41
|
-
for name in ach_list:
|
|
42
|
-
achievement_counts[name] = achievement_counts.get(name, 0) + 1
|
|
43
|
-
|
|
44
|
-
print("Achievements present (session counts):")
|
|
45
|
-
for name in sorted(achievement_counts.keys()):
|
|
46
|
-
print(f" - {name}: {achievement_counts[name]}")
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
async def cmd_filter(
|
|
50
|
-
db_path: str, achievements: list[str], output: str, models: list[str] | None = None
|
|
51
|
-
) -> None:
|
|
52
|
-
db_url = build_db_url(db_path)
|
|
53
|
-
required: set[str] = set(achievements)
|
|
54
|
-
async with FinetuningDataExtractorV3(db_url) as ex:
|
|
55
|
-
sessions = await ex.get_all_sessions()
|
|
56
|
-
kept: list[str] = []
|
|
57
|
-
for _, row in sessions.iterrows():
|
|
58
|
-
if models:
|
|
59
|
-
# Restrict to sessions containing any of the requested models
|
|
60
|
-
model_df = await ex.db_manager.query_traces(
|
|
61
|
-
"""
|
|
62
|
-
SELECT DISTINCT model_name
|
|
63
|
-
FROM events
|
|
64
|
-
WHERE session_id = :session_id
|
|
65
|
-
AND event_type = 'cais'
|
|
66
|
-
AND model_name IS NOT NULL
|
|
67
|
-
""",
|
|
68
|
-
{"session_id": row["session_id"]},
|
|
69
|
-
)
|
|
70
|
-
session_models = (
|
|
71
|
-
model_df["model_name"].tolist()
|
|
72
|
-
if model_df is not None and not model_df.empty
|
|
73
|
-
else []
|
|
74
|
-
)
|
|
75
|
-
if not any(m in session_models for m in models):
|
|
76
|
-
continue
|
|
77
|
-
ach_list = await ex.get_session_achievements(row["session_id"]) or []
|
|
78
|
-
if required & set(ach_list):
|
|
79
|
-
kept.append(row["session_id"])
|
|
80
|
-
|
|
81
|
-
data = await ex.extract_openai_format(kept)
|
|
82
|
-
Path(output).parent.mkdir(parents=True, exist_ok=True)
|
|
83
|
-
with open(output, "w") as f:
|
|
84
|
-
for exm in data:
|
|
85
|
-
f.write(json.dumps(exm) + "\n")
|
|
86
|
-
print(f"✅ Wrote {len(data)} examples from {len(kept)} sessions → {output}")
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
async def _first_achievement_step(
|
|
90
|
-
ex: FinetuningDataExtractorV3, session_id: str, required: set[str]
|
|
91
|
-
) -> int | None:
|
|
92
|
-
q = """
|
|
93
|
-
SELECT message_time, system_state_after
|
|
94
|
-
FROM events
|
|
95
|
-
WHERE session_id = :session_id
|
|
96
|
-
AND event_type = 'environment'
|
|
97
|
-
ORDER BY message_time ASC
|
|
98
|
-
"""
|
|
99
|
-
df = await ex.db_manager.query_traces(q, {"session_id": session_id})
|
|
100
|
-
if df is None or df.empty:
|
|
101
|
-
return None
|
|
102
|
-
seen: set[str] = set()
|
|
103
|
-
for _, row in df.iterrows():
|
|
104
|
-
st = row.get("system_state_after")
|
|
105
|
-
if isinstance(st, str):
|
|
106
|
-
try:
|
|
107
|
-
st = pyjson.loads(st)
|
|
108
|
-
except Exception:
|
|
109
|
-
st = None
|
|
110
|
-
ach = None
|
|
111
|
-
if isinstance(st, dict):
|
|
112
|
-
ps = st.get("public_state") or {}
|
|
113
|
-
ach = ps.get("achievements_status") or {}
|
|
114
|
-
if isinstance(ach, dict):
|
|
115
|
-
for name, unlocked in ach.items():
|
|
116
|
-
if unlocked and name in required and name not in seen:
|
|
117
|
-
return int(row.get("message_time") or 0)
|
|
118
|
-
return None
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def _mean(values: list[float]) -> float:
|
|
122
|
-
return (sum(values) / len(values)) if values else 0.0
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def _stddev(values: list[float]) -> float:
|
|
126
|
-
if not values:
|
|
127
|
-
return 0.0
|
|
128
|
-
m = _mean(values)
|
|
129
|
-
var = sum((v - m) * (v - m) for v in values) / len(values)
|
|
130
|
-
return math.sqrt(var)
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
async def cmd_stats(db_path: str, achievements: list[str], models: list[str] | None = None) -> None:
|
|
134
|
-
db_url = build_db_url(db_path)
|
|
135
|
-
required: set[str] = set(achievements)
|
|
136
|
-
async with FinetuningDataExtractorV3(db_url) as ex:
|
|
137
|
-
sessions = await ex.get_all_sessions()
|
|
138
|
-
matched_rewards: list[float] = []
|
|
139
|
-
other_rewards: list[float] = []
|
|
140
|
-
first_steps: list[int] = []
|
|
141
|
-
matched_count: int = 0
|
|
142
|
-
other_count: int = 0
|
|
143
|
-
matched_ach_counts: dict[str, int] = {}
|
|
144
|
-
other_ach_counts: dict[str, int] = {}
|
|
145
|
-
|
|
146
|
-
for _, row in sessions.iterrows():
|
|
147
|
-
sid = row["session_id"]
|
|
148
|
-
if models:
|
|
149
|
-
model_df = await ex.db_manager.query_traces(
|
|
150
|
-
"""
|
|
151
|
-
SELECT DISTINCT model_name
|
|
152
|
-
FROM events
|
|
153
|
-
WHERE session_id = :session_id
|
|
154
|
-
AND event_type = 'cais'
|
|
155
|
-
AND model_name IS NOT NULL
|
|
156
|
-
""",
|
|
157
|
-
{"session_id": sid},
|
|
158
|
-
)
|
|
159
|
-
session_models = (
|
|
160
|
-
model_df["model_name"].tolist()
|
|
161
|
-
if model_df is not None and not model_df.empty
|
|
162
|
-
else []
|
|
163
|
-
)
|
|
164
|
-
if not any(m in session_models for m in models):
|
|
165
|
-
continue
|
|
166
|
-
|
|
167
|
-
ach_list = await ex.get_session_achievements(sid) or []
|
|
168
|
-
metrics = await ex.get_session_metrics(sid)
|
|
169
|
-
reward = float(metrics.get("total_reward", 0.0))
|
|
170
|
-
|
|
171
|
-
if required & set(ach_list):
|
|
172
|
-
matched_rewards.append(reward)
|
|
173
|
-
step = await _first_achievement_step(ex, sid, required)
|
|
174
|
-
if step is not None:
|
|
175
|
-
first_steps.append(step)
|
|
176
|
-
matched_count += 1
|
|
177
|
-
for name in ach_list:
|
|
178
|
-
matched_ach_counts[name] = matched_ach_counts.get(name, 0) + 1
|
|
179
|
-
else:
|
|
180
|
-
other_rewards.append(reward)
|
|
181
|
-
other_count += 1
|
|
182
|
-
for name in ach_list:
|
|
183
|
-
other_ach_counts[name] = other_ach_counts.get(name, 0) + 1
|
|
184
|
-
|
|
185
|
-
print("Matched sessions (any of:", ", ".join(sorted(required)), ")")
|
|
186
|
-
print(
|
|
187
|
-
f" n={len(matched_rewards)} avg_reward={_mean(matched_rewards):.2f} stddev={_stddev(matched_rewards):.2f}"
|
|
188
|
-
)
|
|
189
|
-
if first_steps:
|
|
190
|
-
print(
|
|
191
|
-
f" avg_first_unlock_step={_mean([float(s) for s in first_steps]):.1f} stddev={_stddev([float(s) for s in first_steps]):.1f}"
|
|
192
|
-
)
|
|
193
|
-
else:
|
|
194
|
-
print(" avg_first_unlock_step=n/a (no unlocks recorded)")
|
|
195
|
-
print("Others")
|
|
196
|
-
print(
|
|
197
|
-
f" n={len(other_rewards)} avg_reward={_mean(other_rewards):.2f} stddev={_stddev(other_rewards):.2f}"
|
|
198
|
-
)
|
|
199
|
-
|
|
200
|
-
# Achievement frequency comparison (by session presence), excluding required achievements
|
|
201
|
-
all_achievements: set[str] = set(matched_ach_counts.keys()) | set(other_ach_counts.keys())
|
|
202
|
-
compare_achievements = [a for a in sorted(all_achievements) if a not in required]
|
|
203
|
-
if compare_achievements and (matched_count > 0 or other_count > 0):
|
|
204
|
-
print("\nAchievement frequency by session (matched vs others):")
|
|
205
|
-
# Build rows with absolute percentage difference for sorting
|
|
206
|
-
rows: list[tuple[float, str, int, float, int, float]] = []
|
|
207
|
-
for name in compare_achievements:
|
|
208
|
-
m_n = matched_ach_counts.get(name, 0)
|
|
209
|
-
o_n = other_ach_counts.get(name, 0)
|
|
210
|
-
m_pct = (m_n / matched_count * 100.0) if matched_count else 0.0
|
|
211
|
-
o_pct = (o_n / other_count * 100.0) if other_count else 0.0
|
|
212
|
-
diff = abs(m_pct - o_pct)
|
|
213
|
-
rows.append((diff, name, m_n, m_pct, o_n, o_pct))
|
|
214
|
-
|
|
215
|
-
# Show top 10 differences
|
|
216
|
-
rows.sort(reverse=True)
|
|
217
|
-
limit = min(10, len(rows))
|
|
218
|
-
for i in range(limit):
|
|
219
|
-
_, name, m_n, m_pct, o_n, o_pct = rows[i]
|
|
220
|
-
print(
|
|
221
|
-
f" - {name}: matched {m_n}/{matched_count} ({m_pct:.1f}%), others {o_n}/{other_count} ({o_pct:.1f}%)"
|
|
222
|
-
)
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def main() -> None:
|
|
226
|
-
parser = argparse.ArgumentParser(description="Crafter v3 trace analysis")
|
|
227
|
-
sub = parser.add_subparsers(dest="command", required=True)
|
|
228
|
-
|
|
229
|
-
p_list = sub.add_parser("list", help="List achievements present in DB")
|
|
230
|
-
p_list.add_argument(
|
|
231
|
-
"--db", required=True, help="Path to sqld internal data file or full sqlite+aiosqlite URL"
|
|
232
|
-
)
|
|
233
|
-
|
|
234
|
-
p_filter = sub.add_parser("filter", help="Filter sessions by achievements and export JSONL")
|
|
235
|
-
p_filter.add_argument(
|
|
236
|
-
"--db", required=True, help="Path to sqld internal data file or full sqlite+aiosqlite URL"
|
|
237
|
-
)
|
|
238
|
-
p_filter.add_argument(
|
|
239
|
-
"--achievements",
|
|
240
|
-
nargs="+",
|
|
241
|
-
required=True,
|
|
242
|
-
help="Required achievements (any match keeps session)",
|
|
243
|
-
)
|
|
244
|
-
p_filter.add_argument("--output", required=True, help="Output JSONL path")
|
|
245
|
-
p_filter.add_argument("--models", nargs="*", help="Optional model names to include (any match)")
|
|
246
|
-
|
|
247
|
-
p_stats = sub.add_parser("stats", help="Show summary stats for filtered vs others")
|
|
248
|
-
p_stats.add_argument(
|
|
249
|
-
"--db", required=True, help="Path to sqld internal data file or full sqlite+aiosqlite URL"
|
|
250
|
-
)
|
|
251
|
-
p_stats.add_argument(
|
|
252
|
-
"--achievements", nargs="+", required=True, help="Achievements to match (any match)"
|
|
253
|
-
)
|
|
254
|
-
p_stats.add_argument("--models", nargs="*", help="Optional model names to include (any match)")
|
|
255
|
-
|
|
256
|
-
args = parser.parse_args()
|
|
257
|
-
|
|
258
|
-
if args.command == "list":
|
|
259
|
-
asyncio.run(cmd_list(args.db))
|
|
260
|
-
return
|
|
261
|
-
if args.command == "filter":
|
|
262
|
-
asyncio.run(cmd_filter(args.db, args.achievements, args.output, args.models or None))
|
|
263
|
-
return
|
|
264
|
-
if args.command == "stats":
|
|
265
|
-
asyncio.run(cmd_stats(args.db, args.achievements, args.models or None))
|
|
266
|
-
return
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
if __name__ == "__main__":
|
|
270
|
-
main()
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
# Centralized configuration for Synth Qwen Crafter workflows
|
|
2
|
-
|
|
3
|
-
[rollouts]
|
|
4
|
-
model = "Qwen/Qwen3-4B-Instruct-2507"
|
|
5
|
-
episodes = 5
|
|
6
|
-
max_steps = 30
|
|
7
|
-
difficulty = "easy"
|
|
8
|
-
temperature = 0.4
|
|
9
|
-
max_tokens = 2048
|
|
10
|
-
tool_choice = "required"
|
|
11
|
-
|
|
12
|
-
[traces]
|
|
13
|
-
sqld_db_path = "traces/v3/synth_ai.db"
|
|
14
|
-
|
|
15
|
-
[filter]
|
|
16
|
-
# For v3 sqld traces, use the internal data file under the sqld directory
|
|
17
|
-
db_path = "traces/v3/synth_ai.db/dbs/default/data"
|
|
18
|
-
required_achievements = ["collect_wood"]
|
|
19
|
-
min_total_reward = 1.0
|
|
20
|
-
max_cost = 10.0
|
|
21
|
-
max_tokens = 100000
|
|
22
|
-
output_jsonl = "ft_data/qwen4b_crafter_sft_collect_wood.jsonl"
|
|
23
|
-
|
|
24
|
-
[sft]
|
|
25
|
-
base_model = "Qwen/Qwen3-4B-Instruct-2507"
|
|
26
|
-
training_jsonl = "ft_data/qwen4b_crafter_sft_collect_wood.jsonl"
|
|
27
|
-
n_epochs = 1
|
|
28
|
-
batch_size = 4
|
|
29
|
-
upload_to_wasabi = true
|