synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# Crafter Image-Only Evaluation Guide
|
|
2
|
+
|
|
3
|
+
This guide shows you how to run Crafter evaluations with **image-only input** (no text observations) and save the traces and rewards to a **Turso database**.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
1. **OpenAI API Key**: Set `OPENAI_API_KEY` in your `.env` file
|
|
8
|
+
2. **UV Package Manager**: Already installed if you can run `uv run`
|
|
9
|
+
3. **Synth AI Repository**: Clone and set up per main README
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
### 1. Run Image-Only Evaluation (10 Rollouts)
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
cd /path/to/synth-ai  # root of your local synth-ai clone
|
|
17
|
+
|
|
18
|
+
# Set up environment for Turso tracing
|
|
19
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
20
|
+
export TURSO_NATIVE=1
|
|
21
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
22
|
+
|
|
23
|
+
# Run evaluation with image-only input
|
|
24
|
+
uv run synth-ai eval grpo-crafter \
|
|
25
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Expected output**:
|
|
29
|
+
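- 10 rollouts complete (one per seed; the shipped `eval_image_only_gpt4o.toml` lists only `seeds = [0, 1]` for a quick test, so list ten seeds to get 10 rollouts)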
|
|
30
|
+
- ~70% will earn achievements (collect_wood, collect_sapling, etc.)
|
|
31
|
+
- All traces and rewards saved to `traces/v3/crafter_eval.db`
|
|
32
|
+
|
|
33
|
+
### 2. Check Results
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
# View database
|
|
37
|
+
ls -lh traces/v3/crafter_eval.db # Should be ~1.7MB
|
|
38
|
+
|
|
39
|
+
# Count sessions
|
|
40
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
41
|
+
"SELECT COUNT(*) FROM session_traces;"
|
|
42
|
+
|
|
43
|
+
# View all rollouts with rewards
|
|
44
|
+
sqlite3 -header -column traces/v3/crafter_eval.db \
|
|
45
|
+
"SELECT
|
|
46
|
+
json_extract(reward_metadata, '\$.env_seed') as seed,
|
|
47
|
+
total_reward,
|
|
48
|
+
achievements_count,
|
|
49
|
+
json_extract(reward_metadata, '\$.final_achievements') as achievements
|
|
50
|
+
FROM outcome_rewards
|
|
51
|
+
ORDER BY total_reward DESC;"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### 3. Query Non-Zero Rewards
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# Get rollouts that earned achievements
|
|
58
|
+
sqlite3 -header -column traces/v3/crafter_eval.db \
|
|
59
|
+
"SELECT
|
|
60
|
+
session_id,
|
|
61
|
+
total_reward,
|
|
62
|
+
achievements_count,
|
|
63
|
+
json_extract(reward_metadata, '\$.final_achievements') as achievements
|
|
64
|
+
FROM outcome_rewards
|
|
65
|
+
WHERE total_reward > 0
|
|
66
|
+
ORDER BY total_reward DESC;"
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Configuration File
|
|
70
|
+
|
|
71
|
+
**Location**: `examples/task_apps/crafter/eval_image_only_gpt4o.toml`
|
|
72
|
+
|
|
73
|
+
```toml
|
|
74
|
+
[eval]
|
|
75
|
+
app_id = "grpo-crafter"
|
|
76
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
77
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # 10 rollouts
|
|
78
|
+
max_turns = 10
|
|
79
|
+
concurrency = 1
|
|
80
|
+
env_name = "crafter"
|
|
81
|
+
policy_name = "crafter-react"
|
|
82
|
+
trace_format = "full"
|
|
83
|
+
return_trace = true
|
|
84
|
+
|
|
85
|
+
[eval.env_config]
|
|
86
|
+
env_params = {max_steps_per_episode = 10}
|
|
87
|
+
|
|
88
|
+
[eval.policy_config]
|
|
89
|
+
provider = "openai"
|
|
90
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
91
|
+
inference_url = "https://api.openai.com"
|
|
92
|
+
temperature = 0.6
|
|
93
|
+
top_p = 0.95
|
|
94
|
+
max_tokens = 512
|
|
95
|
+
use_vision = true # Enable vision mode
|
|
96
|
+
image_only_mode = true # Send ONLY images (no text)
|
|
97
|
+
max_llm_calls = 10
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Key Configuration Options
|
|
101
|
+
|
|
102
|
+
| Option | Description | Values |
|
|
103
|
+
|--------|-------------|--------|
|
|
104
|
+
| `use_vision` | Enable vision/image input | `true` / `false` |
|
|
105
|
+
| `image_only_mode` | Send only images (no text) | `true` / `false` |
|
|
106
|
+
| `seeds` | Which seeds to run | Array of integers |
|
|
107
|
+
| `max_turns` | Max policy calls per rollout | Integer (10-100) |
|
|
108
|
+
| `concurrency` | Parallel rollouts | 1-5 recommended |
|
|
109
|
+
|
|
110
|
+
## Customization
|
|
111
|
+
|
|
112
|
+
### Run More Rollouts
|
|
113
|
+
|
|
114
|
+
```toml
|
|
115
|
+
# Change seeds to run more episodes
|
|
116
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] # 20 rollouts
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Increase Steps Per Episode
|
|
120
|
+
|
|
121
|
+
```toml
|
|
122
|
+
[eval.env_config]
|
|
123
|
+
env_params = {max_steps_per_episode = 100} # Longer episodes
|
|
124
|
+
|
|
125
|
+
[eval.policy_config]
|
|
126
|
+
max_llm_calls = 100
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Use Different Model
|
|
130
|
+
|
|
131
|
+
```toml
|
|
132
|
+
[eval]
|
|
133
|
+
model = "gpt-4o-2024-08-06" # Full GPT-4o
|
|
134
|
+
|
|
135
|
+
[eval.policy_config]
|
|
136
|
+
model = "gpt-4o-2024-08-06"
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Enable Text + Images (Multimodal)
|
|
140
|
+
|
|
141
|
+
```toml
|
|
142
|
+
[eval.policy_config]
|
|
143
|
+
use_vision = true
|
|
144
|
+
image_only_mode = false # Send both text AND images
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Database Schema
|
|
148
|
+
|
|
149
|
+
### outcome_rewards Table
|
|
150
|
+
|
|
151
|
+
```sql
|
|
152
|
+
CREATE TABLE outcome_rewards (
|
|
153
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
154
|
+
session_id VARCHAR NOT NULL,
|
|
155
|
+
total_reward INTEGER NOT NULL,
|
|
156
|
+
achievements_count INTEGER NOT NULL,
|
|
157
|
+
total_steps INTEGER NOT NULL,
|
|
158
|
+
created_at DATETIME NOT NULL,
|
|
159
|
+
reward_metadata TEXT, -- JSON with achievements, seed, etc.
|
|
160
|
+
FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
|
|
161
|
+
);
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Example Queries
|
|
165
|
+
|
|
166
|
+
```sql
|
|
167
|
+
-- Get statistics
|
|
168
|
+
SELECT
|
|
169
|
+
COUNT(*) as total,
|
|
170
|
+
SUM(CASE WHEN total_reward > 0 THEN 1 ELSE 0 END) as with_rewards,
|
|
171
|
+
AVG(total_reward) as avg_reward,
|
|
172
|
+
MAX(total_reward) as max_reward
|
|
173
|
+
FROM outcome_rewards;
|
|
174
|
+
|
|
175
|
+
-- Find best performers
|
|
176
|
+
SELECT
|
|
177
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
178
|
+
total_reward,
|
|
179
|
+
achievements_count,
|
|
180
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
181
|
+
FROM outcome_rewards
|
|
182
|
+
WHERE achievements_count >= 2
|
|
183
|
+
ORDER BY total_reward DESC;
|
|
184
|
+
|
|
185
|
+
-- Join with session traces
|
|
186
|
+
SELECT
|
|
187
|
+
st.session_id,
|
|
188
|
+
st.created_at,
|
|
189
|
+
st.num_timesteps,
|
|
190
|
+
orw.total_reward,
|
|
191
|
+
orw.achievements_count
|
|
192
|
+
FROM session_traces st
|
|
193
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
194
|
+
WHERE orw.total_reward > 0
|
|
195
|
+
ORDER BY orw.total_reward DESC;
|
|
196
|
+
|
|
197
|
+
-- Count by achievement level
|
|
198
|
+
SELECT
|
|
199
|
+
achievements_count,
|
|
200
|
+
COUNT(*) as num_rollouts,
|
|
201
|
+
AVG(total_reward) as avg_reward
|
|
202
|
+
FROM outcome_rewards
|
|
203
|
+
GROUP BY achievements_count
|
|
204
|
+
ORDER BY achievements_count DESC;
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
## Typical Results
|
|
208
|
+
|
|
209
|
+
**Expected Performance** (10 rollouts, 10 steps each, image-only):
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
Total rollouts: 10
|
|
213
|
+
Rollouts with rewards: ~7 (70%)
|
|
214
|
+
Average reward: ~1.3
|
|
215
|
+
Max reward: ~3
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
**Common Achievements**:
|
|
219
|
+
- `collect_wood` (most common)
|
|
220
|
+
- `collect_sapling` (common)
|
|
221
|
+
- `collect_drink` (rare in 10 steps)
|
|
222
|
+
|
|
223
|
+
## Troubleshooting
|
|
224
|
+
|
|
225
|
+
### No Database Created
|
|
226
|
+
|
|
227
|
+
**Issue**: `traces/v3/crafter_eval.db` doesn't exist or is 0 bytes
|
|
228
|
+
|
|
229
|
+
**Fix**: Ensure environment variables are set:
|
|
230
|
+
```bash
|
|
231
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
232
|
+
export TURSO_NATIVE=1
|
|
233
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
### 401 Unauthorized Error
|
|
237
|
+
|
|
238
|
+
**Issue**: OpenAI API returns 401
|
|
239
|
+
|
|
240
|
+
**Fix**: Check that your `.env` file has a valid `OPENAI_API_KEY`:
|
|
241
|
+
```bash
|
|
242
|
+
# .env file
|
|
243
|
+
OPENAI_API_KEY=sk-proj-...your-key-here...
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
### All Rewards are Zero
|
|
247
|
+
|
|
248
|
+
**Issue**: Agents aren't earning any achievements
|
|
249
|
+
|
|
250
|
+
**Possible causes**:
|
|
251
|
+
1. **Too few steps**: Increase `max_steps_per_episode` (and `max_llm_calls` to match) to 50-100
|
|
252
|
+
2. **Image-only too hard**: Try `image_only_mode = false` for multimodal
|
|
253
|
+
3. **Wrong model**: Try full GPT-4o instead of mini
|
|
254
|
+
|
|
255
|
+
### Database Lock Errors
|
|
256
|
+
|
|
257
|
+
**Issue**: `SQLITE_BUSY` or `database is locked`
|
|
258
|
+
|
|
259
|
+
**Fix**: Reduce concurrency in config:
|
|
260
|
+
```toml
|
|
261
|
+
[eval]
|
|
262
|
+
concurrency = 1 # Run sequentially
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
Alternatively, rely on Turso's MVCC mode, which is enabled by `TURSO_NATIVE=1` (already exported in the Quick Start).
|
|
266
|
+
|
|
267
|
+
## Advanced: Export to CSV
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
# Export all rewards to CSV
|
|
271
|
+
sqlite3 -header -csv traces/v3/crafter_eval.db \
|
|
272
|
+
"SELECT
|
|
273
|
+
json_extract(reward_metadata, '\$.env_seed') as seed,
|
|
274
|
+
total_reward,
|
|
275
|
+
achievements_count,
|
|
276
|
+
json_extract(reward_metadata, '\$.final_achievements') as achievements
|
|
277
|
+
FROM outcome_rewards
|
|
278
|
+
ORDER BY total_reward DESC" \
|
|
279
|
+
> crafter_rewards.csv
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## Files Overview
|
|
283
|
+
|
|
284
|
+
```
|
|
285
|
+
examples/task_apps/crafter/
|
|
286
|
+
├── eval_image_only_gpt4o.toml # Config file
|
|
287
|
+
├── EVAL_IMAGE_ONLY_RESULTS.md # Example results
|
|
288
|
+
├── QUERY_EXAMPLES.md # More SQL queries
|
|
289
|
+
├── README_IMAGE_ONLY_EVAL.md # This file
|
|
290
|
+
└── task_app/
|
|
291
|
+
└── synth_envs_hosted/
|
|
292
|
+
├── envs/crafter/
|
|
293
|
+
│ ├── policy.py # Image-only mode logic
|
|
294
|
+
│ └── react_agent.py # Message construction
|
|
295
|
+
├── rollout.py # SessionTracer integration
|
|
296
|
+
└── inference/
|
|
297
|
+
└── openai_client.py # API authentication
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## See Also
|
|
301
|
+
|
|
302
|
+
- `EVAL_IMAGE_ONLY_RESULTS.md` - Example run with detailed results
|
|
303
|
+
- `QUERY_EXAMPLES.md` - More SQL query examples
|
|
304
|
+
- `../../pokemon_red/README_IMAGE_ONLY_EVAL.md` - Pokemon Red version
|
|
305
|
+
|
|
306
|
+
## Summary
|
|
307
|
+
|
|
308
|
+
1. ✅ Set environment variables for Turso tracing
|
|
309
|
+
2. ✅ Run `uv run synth-ai eval grpo-crafter --config ...`
|
|
310
|
+
3. ✅ Check database: `traces/v3/crafter_eval.db`
|
|
311
|
+
4. ✅ Query rewards: `SELECT * FROM outcome_rewards WHERE total_reward > 0`
|
|
312
|
+
5. ✅ Customize config for different models/steps
|
|
313
|
+
|
|
314
|
+
Enjoy running Crafter with vision-only input! 🎮
|
|
315
|
+
|
|
316
|
+
|
|
File without changes
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Evaluation config for Crafter with image-only input
|
|
2
|
+
# This config uses GPT-4o mini with only image data (no text observations)
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-crafter-task-app"
|
|
6
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
7
|
+
seeds = [0, 1] # Quick test run
|
|
8
|
+
max_turns = 10
|
|
9
|
+
concurrency = 1 # Reduced to 1 to avoid database locks during testing
|
|
10
|
+
env_name = "crafter"
|
|
11
|
+
policy_name = "crafter-react"
|
|
12
|
+
trace_format = "full"
|
|
13
|
+
return_trace = true
|
|
14
|
+
|
|
15
|
+
[eval.env_config]
|
|
16
|
+
env_params = {max_steps_per_episode = 10}
|
|
17
|
+
|
|
18
|
+
[eval.policy_config]
|
|
19
|
+
provider = "openai"
|
|
20
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
21
|
+
inference_url = "https://api.openai.com" # Base URL (client will append /v1/chat/completions)
|
|
22
|
+
temperature = 0.6
|
|
23
|
+
top_p = 0.95
|
|
24
|
+
max_tokens = 512
|
|
25
|
+
use_vision = true
|
|
26
|
+
image_only_mode = true
|
|
27
|
+
max_llm_calls = 10
|
|
28
|
+
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Evaluation config for Crafter with text-only input
|
|
2
|
+
# This config uses Groq Qwen with only text observations (no images)
|
|
3
|
+
|
|
4
|
+
[eval]
|
|
5
|
+
app_id = "grpo-crafter-task-app"
|
|
6
|
+
task_app_url = "https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run"
|
|
7
|
+
model = "qwen/qwen3-32b"
|
|
8
|
+
seeds = [0, 1, 2]
|
|
9
|
+
max_turns = 10
|
|
10
|
+
concurrency = 1
|
|
11
|
+
env_name = "crafter"
|
|
12
|
+
policy_name = "crafter-react"
|
|
13
|
+
trace_format = "full"
|
|
14
|
+
return_trace = true
|
|
15
|
+
|
|
16
|
+
[eval.env_config]
|
|
17
|
+
env_params = {max_steps_per_episode = 10}
|
|
18
|
+
|
|
19
|
+
[eval.policy_config]
|
|
20
|
+
provider = "groq"
|
|
21
|
+
model = "qwen/qwen3-32b"
|
|
22
|
+
inference_url = "https://api.groq.com/openai/v1/chat/completions"
|
|
23
|
+
temperature = 0.6
|
|
24
|
+
top_p = 0.95
|
|
25
|
+
max_tokens = 512
|
|
26
|
+
use_vision = false
|
|
27
|
+
image_only_mode = false
|
|
28
|
+
max_llm_calls = 10
|
|
29
|
+
|
|
30
|
+
[eval.judge]
|
|
31
|
+
path = "examples/multi_step/judges/crafter_backend_judge.py"
|
|
32
|
+
name = "Backend"
|
|
33
|
+
backend_url = "http://localhost:8000/api"
|
|
34
|
+
model = "openai/gpt-oss-120b"
|
|
35
|
+
timeout_s = 45
|
|
36
|
+
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Filter config for Crafter image-only traces
|
|
2
|
+
# Creates SFT dataset from rollouts with non-zero rewards
|
|
3
|
+
|
|
4
|
+
[filter]
|
|
5
|
+
# Input: Consolidated traces database
|
|
6
|
+
db = "traces/v3/synth_ai.db"
|
|
7
|
+
|
|
8
|
+
# Output: SFT-ready JSONL file
|
|
9
|
+
output = "ft_data/crafter_image_only_sft.jsonl"
|
|
10
|
+
|
|
11
|
+
# Filter for successful rollouts (those with achievements)
|
|
12
|
+
min_official_score = 0.01 # Only include traces with rewards > 0
|
|
13
|
+
|
|
14
|
+
# Optional: Limit number of examples (remove to get all)
|
|
15
|
+
# limit = 10
|
|
16
|
+
|