synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Filter Command Success - SFT Dataset Creation Working!
|
|
2
|
+
|
|
3
|
+
## ✅ Complete Success!
|
|
4
|
+
|
|
5
|
+
The `uvx synth-ai eval` → `uvx synth-ai filter` loop is now working end-to-end for Crafter!
|
|
6
|
+
|
|
7
|
+
## What Was Fixed
|
|
8
|
+
|
|
9
|
+
### Issue 1: Early Return in `insert_session_trace`
|
|
10
|
+
**Problem**: Sessions created by `start_session` already existed in the database, so `insert_session_trace` returned early without saving messages.
|
|
11
|
+
|
|
12
|
+
**Fix**: Modified `synth_ai/tracing_v3/turso/native_manager.py` so that `insert_session_trace` continues processing messages even when the session already exists:
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
if session_exists:
|
|
16
|
+
# Update metadata but don't return early
|
|
17
|
+
# Continue to save messages
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Issue 2: Invalid Message Types
|
|
21
|
+
**Problem**: Crafter was using custom message types (`policy_system_prompt`, `policy_user_prompt`, `policy_tool_call`) that violated the database CHECK constraint.
|
|
22
|
+
|
|
23
|
+
**Fix**: Modified `examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py` to use standard message types:
|
|
24
|
+
- `policy_system_prompt` → `system`
|
|
25
|
+
- `policy_user_prompt` → `user`
|
|
26
|
+
- `policy_tool_call` → `assistant` (with `is_tool_call: true` metadata)
|
|
27
|
+
|
|
28
|
+
## Full Working Pipeline
|
|
29
|
+
|
|
30
|
+
### 1. Run Evaluation with Tracing
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd /path/to/synth-ai  # root of your synth-ai checkout
|
|
34
|
+
|
|
35
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
36
|
+
export TURSO_NATIVE=0
|
|
37
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
38
|
+
|
|
39
|
+
uv run synth-ai eval grpo-crafter-task-app \
|
|
40
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Result**:
|
|
44
|
+
- ✅ 2 rollouts completed
|
|
45
|
+
- ✅ 120 messages saved to database (40 system + 40 user + 40 assistant)
|
|
46
|
+
- ✅ 2 outcome_rewards saved with achievements
|
|
47
|
+
- ✅ Traces returned successfully
|
|
48
|
+
|
|
49
|
+
### 2. Filter to Create SFT Dataset
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv run synth-ai filter \
|
|
53
|
+
--config examples/task_apps/crafter/filter_sft_dataset.toml
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Result**:
|
|
57
|
+
```
|
|
58
|
+
Wrote 40 examples -> ft_data/crafter_image_only_sft.jsonl
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 3. Verify SFT Data
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Check first example
|
|
65
|
+
head -1 ft_data/crafter_image_only_sft.jsonl | jq .
|
|
66
|
+
|
|
67
|
+
# Count examples
|
|
68
|
+
wc -l ft_data/crafter_image_only_sft.jsonl
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## SFT Dataset Format
|
|
72
|
+
|
|
73
|
+
Each line in the JSONL contains:
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"messages": [
|
|
78
|
+
{
|
|
79
|
+
"role": "user",
|
|
80
|
+
"content": "=== CRAFTER GAME STATE ===\nStep: 0/10000\n..."
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"role": "assistant",
|
|
84
|
+
"content": "[{'tool_name': 'interact_many', 'arguments': {...}}]"
|
|
85
|
+
}
|
|
86
|
+
],
|
|
87
|
+
"metadata": {
|
|
88
|
+
"session_id": "...",
|
|
89
|
+
"env_name": "crafter",
|
|
90
|
+
"policy_name": "crafter-react",
|
|
91
|
+
"seed": 0,
|
|
92
|
+
"total_reward": 1,
|
|
93
|
+
"achievements_count": 1,
|
|
94
|
+
"created_at": "2025-10-22T23:55:25.533188+00:00"
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Database Schema
|
|
100
|
+
|
|
101
|
+
The filter command queries these tables:
|
|
102
|
+
|
|
103
|
+
### messages table
|
|
104
|
+
```sql
|
|
105
|
+
SELECT message_type, content, timestamp
|
|
106
|
+
FROM messages
|
|
107
|
+
WHERE session_id = :session_id
|
|
108
|
+
ORDER BY timestamp ASC
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
- ✅ 120 messages total
|
|
112
|
+
- System (40) + User (40) + Assistant (40) messages
|
|
113
|
+
- Pairs extracted: user → assistant
|
|
114
|
+
|
|
115
|
+
### outcome_rewards table
|
|
116
|
+
```sql
|
|
117
|
+
SELECT total_reward, achievements_count
|
|
118
|
+
FROM outcome_rewards
|
|
119
|
+
WHERE session_id = :session_id
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
- Used to filter for successful rollouts
|
|
123
|
+
- `min_official_score = 0.01` filters for rewards > 0
|
|
124
|
+
- Both rollouts had `total_reward = 1` (1 achievement each)
|
|
125
|
+
|
|
126
|
+
## Filter Configuration
|
|
127
|
+
|
|
128
|
+
**File**: `examples/task_apps/crafter/filter_sft_dataset.toml`
|
|
129
|
+
|
|
130
|
+
```toml
|
|
131
|
+
[filter]
|
|
132
|
+
db = "traces/v3/crafter_eval.db"
|
|
133
|
+
output = "ft_data/crafter_image_only_sft.jsonl"
|
|
134
|
+
min_official_score = 0.01 # Only traces with rewards > 0
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Available Filter Options
|
|
138
|
+
|
|
139
|
+
```toml
|
|
140
|
+
[filter]
|
|
141
|
+
db = "path/to/traces.db" # Required
|
|
142
|
+
output = "path/to/output.jsonl" # Required
|
|
143
|
+
|
|
144
|
+
# Optional filters
|
|
145
|
+
min_official_score = 0.01 # Filter by reward
|
|
146
|
+
splits = ["train", "test"] # Filter by split
|
|
147
|
+
task_ids = ["task_1"] # Filter by task
|
|
148
|
+
models = ["gpt-4o"] # Filter by model
|
|
149
|
+
limit = 100 # Limit number of examples
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Statistics
|
|
153
|
+
|
|
154
|
+
From 2 rollouts with 10 turns each (40 examples ÷ 2 rollouts = 20 per rollout, i.e. about 2 user → assistant pairs per turn):
|
|
155
|
+
|
|
156
|
+
| Metric | Count |
|
|
157
|
+
|--------|-------|
|
|
158
|
+
| Total rollouts | 2 |
|
|
159
|
+
| Rollouts with rewards | 2 (100%) |
|
|
160
|
+
| Total messages saved | 120 |
|
|
161
|
+
| System messages | 40 |
|
|
162
|
+
| User messages | 40 |
|
|
163
|
+
| Assistant messages | 40 |
|
|
164
|
+
| **SFT examples** | **40** |
|
|
165
|
+
| Average turns per rollout | 10 |
|
|
166
|
+
| Examples per rollout | 20 |
|
|
167
|
+
|
|
168
|
+
## Next Steps
|
|
169
|
+
|
|
170
|
+
### Scale Up
|
|
171
|
+
|
|
172
|
+
Run with more seeds for a larger dataset:
|
|
173
|
+
|
|
174
|
+
```toml
|
|
175
|
+
# In eval_image_only_gpt4o.toml
|
|
176
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # 10 rollouts
|
|
177
|
+
max_turns = 50 # More examples per rollout
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Expected output: ~1000 SFT examples from 10 rollouts @ 50 turns each
|
|
181
|
+
|
|
182
|
+
### Use the SFT Data
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
# For OpenAI fine-tuning
|
|
186
|
+
# The JSONL format is compatible with OpenAI's fine-tuning API
|
|
187
|
+
|
|
188
|
+
# For local fine-tuning
|
|
189
|
+
# Convert to your preferred format (HuggingFace, etc.)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Filter Variations
|
|
193
|
+
|
|
194
|
+
```toml
|
|
195
|
+
# Only high-reward traces
|
|
196
|
+
min_official_score = 2.0
|
|
197
|
+
|
|
198
|
+
# Only specific achievements
|
|
199
|
+
# Query manually then filter by session_id
|
|
200
|
+
|
|
201
|
+
# Time-based filtering
|
|
202
|
+
min_created_at = "2025-10-22T00:00:00"
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Files Modified
|
|
206
|
+
|
|
207
|
+
1. **`synth_ai/tracing_v3/turso/native_manager.py`**
|
|
208
|
+
- Fixed early return when session exists
|
|
209
|
+
- Added logging for debugging
|
|
210
|
+
|
|
211
|
+
2. **`examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py`**
|
|
212
|
+
- Changed message types to standard values
|
|
213
|
+
- Added debug logging
|
|
214
|
+
|
|
215
|
+
3. **`synth_ai/cli/task_apps.py`**
|
|
216
|
+
- Updated filter command to query messages table
|
|
217
|
+
- Added support for outcome_rewards filtering
|
|
218
|
+
- Fixed SQL parameter format
|
|
219
|
+
|
|
220
|
+
4. **`examples/task_apps/crafter/filter_sft_dataset.toml`**
|
|
221
|
+
- Created filter configuration
|
|
222
|
+
|
|
223
|
+
## Troubleshooting
|
|
224
|
+
|
|
225
|
+
### No messages in database
|
|
226
|
+
|
|
227
|
+
**Check**:
|
|
228
|
+
```bash
|
|
229
|
+
sqlite3 traces/v3/crafter_eval.db "SELECT COUNT(*) FROM messages;"
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**Fix**: Ensure `TASKAPP_TRACING_ENABLED=1` and `TURSO_NATIVE=0`
|
|
233
|
+
|
|
234
|
+
### Filter returns no examples
|
|
235
|
+
|
|
236
|
+
**Check**:
|
|
237
|
+
```bash
|
|
238
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
239
|
+
"SELECT COUNT(*) FROM outcome_rewards WHERE total_reward > 0;"
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
**Fix**: Lower `min_official_score` or remove it to include all traces
|
|
243
|
+
|
|
244
|
+
### Invalid message types
|
|
245
|
+
|
|
246
|
+
**Error**: `CHECK constraint failed: message_type IN (...)`
|
|
247
|
+
|
|
248
|
+
**Fix**: Already fixed in `examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py`; update to the latest code.
|
|
249
|
+
|
|
250
|
+
## Related Documentation
|
|
251
|
+
|
|
252
|
+
- `README_IMAGE_ONLY_EVAL.md` - How to run evaluations
|
|
253
|
+
- `EVAL_IMAGE_ONLY_RESULTS.md` - Example evaluation results
|
|
254
|
+
- `QUERY_EXAMPLES.md` - SQL query examples
|
|
255
|
+
- `CREATE_SFT_DATASET.md` - Original approach (now superseded)
|
|
256
|
+
|
|
257
|
+
## Success Metrics
|
|
258
|
+
|
|
259
|
+
✅ Eval completes without errors
|
|
260
|
+
✅ Messages saved to database (system, user, assistant)
|
|
261
|
+
✅ Outcome rewards saved with foreign keys
|
|
262
|
+
✅ Filter command extracts user/assistant pairs
|
|
263
|
+
✅ SFT JSONL created with proper format
|
|
264
|
+
✅ Metadata includes rewards and achievements
|
|
265
|
+
|
|
266
|
+
**Status**: 🎉 **WORKING END-TO-END!**
|
|
267
|
+
|
|
268
|
+
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# Crafter Eval Database Query Examples
|
|
2
|
+
|
|
3
|
+
## Database Location
|
|
4
|
+
```bash
|
|
5
|
+
traces/v3/crafter_eval.db  # relative to the synth-ai repository root
|
|
6
|
+
```
|
|
7
|
+
|
|
8
|
+
## Quick Stats
|
|
9
|
+
|
|
10
|
+
Run this query to get an overview:
|
|
11
|
+
```sql
|
|
12
|
+
SELECT
|
|
13
|
+
'Total rollouts' as metric,
|
|
14
|
+
CAST(COUNT(*) as TEXT) as value
|
|
15
|
+
FROM outcome_rewards
|
|
16
|
+
UNION ALL
|
|
17
|
+
SELECT
|
|
18
|
+
'Rollouts with reward > 0',
|
|
19
|
+
CAST(COUNT(*) as TEXT)
|
|
20
|
+
FROM outcome_rewards
|
|
21
|
+
WHERE total_reward > 0
|
|
22
|
+
UNION ALL
|
|
23
|
+
SELECT
|
|
24
|
+
'Average reward',
|
|
25
|
+
CAST(ROUND(AVG(total_reward), 2) as TEXT)
|
|
26
|
+
FROM outcome_rewards
|
|
27
|
+
UNION ALL
|
|
28
|
+
SELECT
|
|
29
|
+
'Max reward',
|
|
30
|
+
CAST(MAX(total_reward) as TEXT)
|
|
31
|
+
FROM outcome_rewards;
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Current Results:**
|
|
35
|
+
- Total rollouts: 10
|
|
36
|
+
- Rollouts with reward > 0: 7
|
|
37
|
+
- Average reward: 1.3
|
|
38
|
+
- Max reward: 3
|
|
39
|
+
|
|
40
|
+
## Filter for Non-Zero Rewards
|
|
41
|
+
|
|
42
|
+
### Simple Query
|
|
43
|
+
```sql
|
|
44
|
+
SELECT
|
|
45
|
+
session_id,
|
|
46
|
+
total_reward,
|
|
47
|
+
achievements_count,
|
|
48
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
49
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
50
|
+
FROM outcome_rewards
|
|
51
|
+
WHERE total_reward > 0
|
|
52
|
+
ORDER BY total_reward DESC, achievements_count DESC;
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### With Full Session Context
|
|
56
|
+
```sql
|
|
57
|
+
SELECT
|
|
58
|
+
st.session_id,
|
|
59
|
+
st.created_at,
|
|
60
|
+
st.num_timesteps,
|
|
61
|
+
st.num_events,
|
|
62
|
+
orw.total_reward,
|
|
63
|
+
orw.achievements_count,
|
|
64
|
+
json_extract(orw.reward_metadata, '$.final_achievements') as achievements,
|
|
65
|
+
json_extract(orw.reward_metadata, '$.env_seed') as seed
|
|
66
|
+
FROM session_traces st
|
|
67
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
68
|
+
WHERE orw.total_reward > 0
|
|
69
|
+
ORDER BY orw.total_reward DESC;
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Filter by Achievement Count
|
|
73
|
+
|
|
74
|
+
### Get rollouts with 2+ achievements
|
|
75
|
+
```sql
|
|
76
|
+
SELECT
|
|
77
|
+
session_id,
|
|
78
|
+
total_reward,
|
|
79
|
+
achievements_count,
|
|
80
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
81
|
+
FROM outcome_rewards
|
|
82
|
+
WHERE achievements_count >= 2
|
|
83
|
+
ORDER BY achievements_count DESC, total_reward DESC;
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Get rollouts with specific achievement
|
|
87
|
+
```sql
|
|
88
|
+
SELECT
|
|
89
|
+
session_id,
|
|
90
|
+
total_reward,
|
|
91
|
+
achievements_count,
|
|
92
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
93
|
+
FROM outcome_rewards
|
|
94
|
+
WHERE reward_metadata LIKE '%collect_drink%'
|
|
95
|
+
ORDER BY total_reward DESC;
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Group by Achievement Count
|
|
99
|
+
```sql
|
|
100
|
+
SELECT
|
|
101
|
+
achievements_count,
|
|
102
|
+
COUNT(*) as num_rollouts,
|
|
103
|
+
ROUND(AVG(total_reward), 2) as avg_reward,
|
|
104
|
+
SUM(total_reward) as total_reward_sum,
|
|
105
|
+
GROUP_CONCAT(DISTINCT json_extract(reward_metadata, '$.env_seed')) as seeds
|
|
106
|
+
FROM outcome_rewards
|
|
107
|
+
GROUP BY achievements_count
|
|
108
|
+
ORDER BY achievements_count DESC;
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Top Performers
|
|
112
|
+
```sql
|
|
113
|
+
SELECT
|
|
114
|
+
json_extract(orw.reward_metadata, '$.env_seed') as seed,
|
|
115
|
+
orw.total_reward,
|
|
116
|
+
orw.achievements_count,
|
|
117
|
+
orw.total_steps,
|
|
118
|
+
st.num_events,
|
|
119
|
+
json_extract(orw.reward_metadata, '$.final_achievements') as achievements
|
|
120
|
+
FROM session_traces st
|
|
121
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
122
|
+
ORDER BY orw.total_reward DESC, orw.achievements_count DESC
|
|
123
|
+
LIMIT 5;
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Get Event Details for High-Reward Rollouts
|
|
127
|
+
```sql
|
|
128
|
+
SELECT
|
|
129
|
+
e.event_type,
|
|
130
|
+
e.model_name,
|
|
131
|
+
e.input_tokens,
|
|
132
|
+
e.output_tokens,
|
|
133
|
+
e.latency_ms,
|
|
134
|
+
e.reward as step_reward
|
|
135
|
+
FROM events e
|
|
136
|
+
INNER JOIN outcome_rewards orw ON e.session_id = orw.session_id
|
|
137
|
+
WHERE orw.total_reward >= 2
|
|
138
|
+
ORDER BY e.session_id, e.id
|
|
139
|
+
LIMIT 20;
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Running Queries
|
|
143
|
+
|
|
144
|
+
### From Command Line
|
|
145
|
+
```bash
|
|
146
|
+
cd /path/to/synth-ai  # root of your local synth-ai checkout
|
|
147
|
+
sqlite3 traces/v3/crafter_eval.db "YOUR_QUERY_HERE"
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### With Formatted Output
|
|
151
|
+
```bash
|
|
152
|
+
sqlite3 -header -column traces/v3/crafter_eval.db "YOUR_QUERY_HERE"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### With JSON Output
|
|
156
|
+
```bash
|
|
157
|
+
sqlite3 -json traces/v3/crafter_eval.db "YOUR_QUERY_HERE" | jq .
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Example: Get CSV Export of Non-Zero Rewards
|
|
161
|
+
```bash
|
|
162
|
+
sqlite3 -header -csv traces/v3/crafter_eval.db \
|
|
163
|
+
"SELECT
|
|
164
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
165
|
+
total_reward,
|
|
166
|
+
achievements_count,
|
|
167
|
+
total_steps,
|
|
168
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
169
|
+
FROM outcome_rewards
|
|
170
|
+
WHERE total_reward > 0
|
|
171
|
+
ORDER BY total_reward DESC" \
|
|
172
|
+
> crafter_rewards_nonzero.csv
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Current Data Summary
|
|
176
|
+
|
|
177
|
+
| Reward | Count | Seeds | Achievements |
|
|
178
|
+
|--------|-------|-------|--------------|
|
|
179
|
+
| 3 | 1 | 0 | collect_drink, collect_sapling, collect_wood |
|
|
180
|
+
| 2 | 4 | 1,3,6,9 | collect_sapling, collect_wood |
|
|
181
|
+
| 1 | 2 | 4,7 | collect_wood |
|
|
182
|
+
| 0 | 3 | 2,5,8 | none |
|
|
183
|
+
|
|
184
|
+
## Verifying Foreign Keys Work
|
|
185
|
+
|
|
186
|
+
```sql
|
|
187
|
+
-- This should return a single row with the count 7 (all rollouts with total_reward > 0)
|
|
188
|
+
SELECT COUNT(*)
|
|
189
|
+
FROM session_traces st
|
|
190
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
191
|
+
WHERE orw.total_reward > 0;
|
|
192
|
+
|
|
193
|
+
-- This should return the same 7 session_ids
|
|
194
|
+
SELECT st.session_id
|
|
195
|
+
FROM session_traces st
|
|
196
|
+
WHERE st.session_id IN (
|
|
197
|
+
SELECT session_id FROM outcome_rewards WHERE total_reward > 0
|
|
198
|
+
);
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
✅ **Confirmed: Foreign keys are working correctly and can be used to join tables!**
|
|
202
|
+
|
|
203
|
+
|