synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
synth_ai/task/config.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""Configuration dataclasses for task app CLI commands (eval, filter)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(slots=True)
|
|
11
|
+
class EvalConfig:
|
|
12
|
+
"""Configuration for 'synth-ai eval' command.
|
|
13
|
+
|
|
14
|
+
Validates and provides defaults for evaluation runs against task apps.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
# Required: Task app identifier
|
|
18
|
+
app_id: str
|
|
19
|
+
|
|
20
|
+
# Required: Model to evaluate
|
|
21
|
+
model: str
|
|
22
|
+
|
|
23
|
+
# Required: Seeds to run
|
|
24
|
+
seeds: list[int]
|
|
25
|
+
|
|
26
|
+
# Optional: Task app URL (None = spawn in-process)
|
|
27
|
+
task_app_url: str | None = None
|
|
28
|
+
|
|
29
|
+
# Optional: Data split to use
|
|
30
|
+
split: str = "train"
|
|
31
|
+
|
|
32
|
+
# Optional: Maximum turns/steps per episode
|
|
33
|
+
max_turns: int | None = None
|
|
34
|
+
|
|
35
|
+
# Optional: Maximum LLM calls per episode
|
|
36
|
+
max_llm_calls: int = 10
|
|
37
|
+
|
|
38
|
+
# Optional: Concurrency for parallel rollouts
|
|
39
|
+
concurrency: int = 1
|
|
40
|
+
|
|
41
|
+
# Optional: Environment name
|
|
42
|
+
env_name: str | None = None
|
|
43
|
+
|
|
44
|
+
# Optional: Policy name
|
|
45
|
+
policy_name: str | None = None
|
|
46
|
+
|
|
47
|
+
# Optional: Trace format ("compact", "full", "structured")
|
|
48
|
+
trace_format: Literal["compact", "full", "structured"] = "compact"
|
|
49
|
+
|
|
50
|
+
# Optional: Whether to return traces in response
|
|
51
|
+
return_trace: bool = False
|
|
52
|
+
|
|
53
|
+
# Optional: Operations sequence (if not provided, generates default)
|
|
54
|
+
ops: list[str] | None = None
|
|
55
|
+
|
|
56
|
+
# Optional: Environment config overrides
|
|
57
|
+
env_config: dict[str, Any] = field(default_factory=dict)
|
|
58
|
+
|
|
59
|
+
# Optional: Policy config overrides
|
|
60
|
+
policy_config: dict[str, Any] = field(default_factory=dict)
|
|
61
|
+
|
|
62
|
+
# Optional: Metadata for traces
|
|
63
|
+
metadata: dict[str, str] = field(default_factory=dict)
|
|
64
|
+
|
|
65
|
+
# Optional: SQL query for metadata filtering
|
|
66
|
+
metadata_sql: str | None = None
|
|
67
|
+
|
|
68
|
+
def __post_init__(self):
|
|
69
|
+
"""Validate configuration after initialization."""
|
|
70
|
+
if not self.app_id:
|
|
71
|
+
raise ValueError("app_id is required")
|
|
72
|
+
|
|
73
|
+
if not self.model:
|
|
74
|
+
raise ValueError("model is required")
|
|
75
|
+
|
|
76
|
+
if not self.seeds:
|
|
77
|
+
raise ValueError("seeds list cannot be empty")
|
|
78
|
+
|
|
79
|
+
if not isinstance(self.seeds, list):
|
|
80
|
+
raise ValueError("seeds must be a list of integers")
|
|
81
|
+
|
|
82
|
+
if self.concurrency < 1:
|
|
83
|
+
raise ValueError("concurrency must be >= 1")
|
|
84
|
+
|
|
85
|
+
if self.max_llm_calls < 1:
|
|
86
|
+
raise ValueError("max_llm_calls must be >= 1")
|
|
87
|
+
|
|
88
|
+
if self.max_turns is not None and self.max_turns < 1:
|
|
89
|
+
raise ValueError("max_turns must be >= 1")
|
|
90
|
+
|
|
91
|
+
if self.trace_format not in ("compact", "full", "structured"):
|
|
92
|
+
raise ValueError(f"trace_format must be 'compact', 'full', or 'structured', got: {self.trace_format}")
|
|
93
|
+
|
|
94
|
+
@classmethod
|
|
95
|
+
def from_dict(cls, data: dict[str, Any]) -> EvalConfig:
|
|
96
|
+
"""Create EvalConfig from a dictionary (e.g. from TOML).
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
data: Dictionary with eval configuration
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
Validated EvalConfig instance
|
|
103
|
+
"""
|
|
104
|
+
# Extract known fields
|
|
105
|
+
config_dict = {
|
|
106
|
+
"app_id": data.get("app_id"),
|
|
107
|
+
"model": data.get("model"),
|
|
108
|
+
"seeds": data.get("seeds", []),
|
|
109
|
+
"task_app_url": data.get("task_app_url"),
|
|
110
|
+
"split": data.get("split", "train"),
|
|
111
|
+
"max_turns": data.get("max_turns"),
|
|
112
|
+
"max_llm_calls": data.get("max_llm_calls", 10),
|
|
113
|
+
"concurrency": data.get("concurrency", 1),
|
|
114
|
+
"env_name": data.get("env_name"),
|
|
115
|
+
"policy_name": data.get("policy_name"),
|
|
116
|
+
"trace_format": data.get("trace_format", "compact"),
|
|
117
|
+
"return_trace": data.get("return_trace", False),
|
|
118
|
+
"ops": data.get("ops"),
|
|
119
|
+
"env_config": data.get("env_config", {}),
|
|
120
|
+
"policy_config": data.get("policy_config", {}),
|
|
121
|
+
"metadata": data.get("metadata", {}),
|
|
122
|
+
"metadata_sql": data.get("metadata_sql"),
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
return cls(**config_dict)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass(slots=True)
|
|
129
|
+
class FilterConfig:
|
|
130
|
+
"""Configuration for 'synth-ai filter' command.
|
|
131
|
+
|
|
132
|
+
Validates and provides defaults for filtering traces into SFT datasets.
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
# Required: Database path or URL
|
|
136
|
+
db: str
|
|
137
|
+
|
|
138
|
+
# Required: Output JSONL path
|
|
139
|
+
output: str
|
|
140
|
+
|
|
141
|
+
# Optional: Filter by data splits
|
|
142
|
+
splits: list[str] = field(default_factory=list)
|
|
143
|
+
|
|
144
|
+
# Optional: Filter by task IDs
|
|
145
|
+
task_ids: list[str] = field(default_factory=list)
|
|
146
|
+
|
|
147
|
+
# Optional: Filter by models
|
|
148
|
+
models: list[str] = field(default_factory=list)
|
|
149
|
+
|
|
150
|
+
# Optional: Minimum official score threshold
|
|
151
|
+
min_official_score: float | None = None
|
|
152
|
+
|
|
153
|
+
# Optional: Maximum official score threshold
|
|
154
|
+
max_official_score: float | None = None
|
|
155
|
+
|
|
156
|
+
# Optional: Minimum judge scores (judge_name -> min_score)
|
|
157
|
+
min_judge_scores: dict[str, float] = field(default_factory=dict)
|
|
158
|
+
|
|
159
|
+
# Optional: Maximum judge scores (judge_name -> max_score)
|
|
160
|
+
max_judge_scores: dict[str, float] = field(default_factory=dict)
|
|
161
|
+
|
|
162
|
+
# Optional: Limit number of examples
|
|
163
|
+
limit: int | None = None
|
|
164
|
+
|
|
165
|
+
# Optional: Offset for pagination
|
|
166
|
+
offset: int | None = None
|
|
167
|
+
|
|
168
|
+
# Optional: Whether to shuffle results
|
|
169
|
+
shuffle: bool = False
|
|
170
|
+
|
|
171
|
+
# Optional: Random seed for shuffling
|
|
172
|
+
shuffle_seed: int | None = None
|
|
173
|
+
|
|
174
|
+
def __post_init__(self):
|
|
175
|
+
"""Validate configuration after initialization."""
|
|
176
|
+
if not self.db:
|
|
177
|
+
raise ValueError("db (database path or URL) is required")
|
|
178
|
+
|
|
179
|
+
if not self.output:
|
|
180
|
+
raise ValueError("output (JSONL file path) is required")
|
|
181
|
+
|
|
182
|
+
# Validate output has .jsonl extension
|
|
183
|
+
output_path = Path(self.output)
|
|
184
|
+
if output_path.suffix.lower() not in (".jsonl", ".json"):
|
|
185
|
+
raise ValueError(f"output must be a .jsonl or .json file, got: {self.output}")
|
|
186
|
+
|
|
187
|
+
# Validate score thresholds
|
|
188
|
+
if self.min_official_score is not None and self.max_official_score is not None:
|
|
189
|
+
if self.min_official_score > self.max_official_score:
|
|
190
|
+
raise ValueError("min_official_score cannot be greater than max_official_score")
|
|
191
|
+
|
|
192
|
+
# Validate limit/offset
|
|
193
|
+
if self.limit is not None and self.limit < 1:
|
|
194
|
+
raise ValueError("limit must be >= 1")
|
|
195
|
+
|
|
196
|
+
if self.offset is not None and self.offset < 0:
|
|
197
|
+
raise ValueError("offset must be >= 0")
|
|
198
|
+
|
|
199
|
+
# Validate shuffle seed requires shuffle
|
|
200
|
+
if self.shuffle_seed is not None and not self.shuffle:
|
|
201
|
+
raise ValueError("shuffle_seed requires shuffle=true")
|
|
202
|
+
|
|
203
|
+
@classmethod
|
|
204
|
+
def from_dict(cls, data: dict[str, Any]) -> FilterConfig:
|
|
205
|
+
"""Create FilterConfig from a dictionary (e.g. from TOML).
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
data: Dictionary with filter configuration
|
|
209
|
+
|
|
210
|
+
Returns:
|
|
211
|
+
Validated FilterConfig instance
|
|
212
|
+
"""
|
|
213
|
+
# Extract known fields
|
|
214
|
+
config_dict = {
|
|
215
|
+
"db": data.get("db"),
|
|
216
|
+
"output": data.get("output"),
|
|
217
|
+
"splits": data.get("splits", []),
|
|
218
|
+
"task_ids": data.get("task_ids", []),
|
|
219
|
+
"models": data.get("models", []),
|
|
220
|
+
"min_official_score": data.get("min_official_score"),
|
|
221
|
+
"max_official_score": data.get("max_official_score"),
|
|
222
|
+
"min_judge_scores": data.get("min_judge_scores", {}),
|
|
223
|
+
"max_judge_scores": data.get("max_judge_scores", {}),
|
|
224
|
+
"limit": data.get("limit"),
|
|
225
|
+
"offset": data.get("offset"),
|
|
226
|
+
"shuffle": data.get("shuffle", False),
|
|
227
|
+
"shuffle_seed": data.get("shuffle_seed"),
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
return cls(**config_dict)
|
|
231
|
+
|
|
232
|
+
def get_db_url(self) -> str:
|
|
233
|
+
"""Convert db path to proper SQLite URL if needed.
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
Database URL suitable for SQLAlchemy/aiosqlite
|
|
237
|
+
"""
|
|
238
|
+
db_value = self.db.strip()
|
|
239
|
+
if "://" in db_value:
|
|
240
|
+
return db_value
|
|
241
|
+
else:
|
|
242
|
+
db_path = Path(db_value).expanduser().resolve()
|
|
243
|
+
# Ensure parent directory exists
|
|
244
|
+
db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
245
|
+
return f"sqlite+aiosqlite:///{db_path}"
|
|
246
|
+
|
|
247
|
+
def get_output_path(self) -> Path:
|
|
248
|
+
"""Get resolved output path with parent directory created.
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
Resolved Path object with parent directory created
|
|
252
|
+
"""
|
|
253
|
+
output_path = Path(self.output).expanduser().resolve()
|
|
254
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
255
|
+
return output_path
|
|
256
|
+
|
|
257
|
+
|
synth_ai/task/contracts.py
CHANGED
|
@@ -1,19 +1,25 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
4
5
|
from typing import Any, Literal
|
|
5
6
|
|
|
6
|
-
from pydantic import BaseModel, Field
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class RolloutMode(str, Enum):
|
|
11
|
+
"""Mode controls how rollout infrastructure processes inference URLs."""
|
|
12
|
+
RL = "rl"
|
|
13
|
+
EVAL = "eval"
|
|
7
14
|
|
|
8
15
|
|
|
9
16
|
@dataclass(frozen=True)
|
|
10
17
|
class TaskAppEndpoints:
|
|
11
|
-
"""
|
|
18
|
+
"""Required Task App endpoints used by RL trainers and clients.
|
|
12
19
|
|
|
13
|
-
Task Apps run as lightweight HTTP services (often on Modal) that expose
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
defined here act as defaults and documentation for clients.
|
|
20
|
+
Task Apps run as lightweight HTTP services (often on Modal) that expose these
|
|
21
|
+
standard endpoints. Additional endpoints (proxies, debug routes) may be added
|
|
22
|
+
by individual task apps as needed.
|
|
17
23
|
"""
|
|
18
24
|
|
|
19
25
|
root: str = "/"
|
|
@@ -21,28 +27,6 @@ class TaskAppEndpoints:
|
|
|
21
27
|
info: str = "/info"
|
|
22
28
|
task_info: str = "/task_info"
|
|
23
29
|
rollout: str = "/rollout"
|
|
24
|
-
proxy_chat_completions: str = "/proxy/v1/chat/completions"
|
|
25
|
-
proxy_groq_chat_completions: str = "/proxy/groq/v1/chat/completions"
|
|
26
|
-
env_initialize: str = "/env/{env_name}/initialize"
|
|
27
|
-
env_step: str = "/env/{env_name}/step"
|
|
28
|
-
env_terminate: str = "/env/{env_name}/terminate"
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
@dataclass(frozen=True)
|
|
32
|
-
class TaskAppContract:
|
|
33
|
-
"""Requirements and expectations for a Task App used by RL trainers.
|
|
34
|
-
|
|
35
|
-
- Auth: ENVIRONMENT_API_KEY must be set in the Task App environment; requests include X-API-Key.
|
|
36
|
-
- Health: /health returns 200 and JSON; may verify X-API-Key header.
|
|
37
|
-
- Env API: initialize/step/terminate are present for the target env (e.g., CrafterClassic).
|
|
38
|
-
- Rollout API: optional; provides a single-call rollout for convenience/testing.
|
|
39
|
-
- Inference routing: policy config passes an inference_url (Synth backend or OpenAI proxy).
|
|
40
|
-
- URL: base must be reachable via HTTPS and should be under .modal.run in production.
|
|
41
|
-
"""
|
|
42
|
-
|
|
43
|
-
base_url: str
|
|
44
|
-
env_name: str | None = None
|
|
45
|
-
requires_api_key_header: bool = True
|
|
46
30
|
|
|
47
31
|
|
|
48
32
|
# --- Unified rollout schema used by Task App services and SDK utilities ---
|
|
@@ -66,7 +50,7 @@ class RolloutRecordConfig(BaseModel):
|
|
|
66
50
|
logprobs: bool = False
|
|
67
51
|
value: bool = False
|
|
68
52
|
return_trace: bool = False
|
|
69
|
-
trace_format: Literal["compact", "full"] = "compact"
|
|
53
|
+
trace_format: Literal["compact", "full", "structured"] = "compact"
|
|
70
54
|
|
|
71
55
|
|
|
72
56
|
class RolloutSafetyConfig(BaseModel):
|
|
@@ -84,9 +68,16 @@ class RolloutRequest(BaseModel):
|
|
|
84
68
|
safety: RolloutSafetyConfig = RolloutSafetyConfig()
|
|
85
69
|
training_session_id: str | None = None
|
|
86
70
|
synth_base_url: str | None = None
|
|
71
|
+
mode: RolloutMode # Required: explicit RL vs EVAL mode
|
|
87
72
|
|
|
88
73
|
|
|
89
74
|
class RolloutStep(BaseModel):
|
|
75
|
+
"""Single step in a rollout trajectory.
|
|
76
|
+
|
|
77
|
+
DEPRECATED: This is part of the legacy trajectory format. New code should
|
|
78
|
+
consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
|
|
79
|
+
for the migration plan.
|
|
80
|
+
"""
|
|
90
81
|
obs: dict[str, Any]
|
|
91
82
|
tool_calls: list[dict[str, Any]]
|
|
92
83
|
reward: float | None = None
|
|
@@ -96,11 +87,40 @@ class RolloutStep(BaseModel):
|
|
|
96
87
|
|
|
97
88
|
|
|
98
89
|
class RolloutTrajectory(BaseModel):
|
|
90
|
+
"""Legacy trajectory format for rollout results.
|
|
91
|
+
|
|
92
|
+
DEPRECATED: This format duplicates data already present in v3 traces and will
|
|
93
|
+
be removed once training code migrates to consuming RolloutResponse.trace.
|
|
94
|
+
|
|
95
|
+
Current state:
|
|
96
|
+
- Task apps emit BOTH this format AND v3 traces (dual serialization)
|
|
97
|
+
- Training code (GSPO) reads from this format
|
|
98
|
+
- Eval/filter tools read from v3 traces
|
|
99
|
+
|
|
100
|
+
Migration plan:
|
|
101
|
+
- Phase 1: Training code learns to read from v3 traces (with fallback to this)
|
|
102
|
+
- Phase 2: Make this field optional once training is migrated
|
|
103
|
+
- Phase 3: Remove this field entirely and delete this class
|
|
104
|
+
|
|
105
|
+
See: monorepo/trace_single_source.txt for full migration plan and timeline.
|
|
106
|
+
|
|
107
|
+
Why v3 traces are better:
|
|
108
|
+
- Single source of truth (no duplication/drift)
|
|
109
|
+
- Richer data: token IDs, logprobs, reasoning, timing, images
|
|
110
|
+
- Built-in audit trail and replay capability
|
|
111
|
+
- Standard schema across all Synth AI tooling
|
|
112
|
+
"""
|
|
99
113
|
env_id: str
|
|
100
114
|
policy_id: str
|
|
101
115
|
steps: list[RolloutStep]
|
|
102
116
|
final: dict[str, Any] | None = None
|
|
103
117
|
length: int
|
|
118
|
+
|
|
119
|
+
# Required for trace correlation with the inference mesh (declared without a default, so it must always be supplied)
|
|
120
|
+
# See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
|
|
121
|
+
inference_url: str
|
|
122
|
+
|
|
123
|
+
decision_samples: list[dict[str, Any]] | None = None
|
|
104
124
|
|
|
105
125
|
|
|
106
126
|
class RolloutMetrics(BaseModel):
|
|
@@ -114,24 +134,103 @@ class RolloutMetrics(BaseModel):
|
|
|
114
134
|
|
|
115
135
|
|
|
116
136
|
class RolloutResponse(BaseModel):
|
|
137
|
+
"""Response from a rollout execution.
|
|
138
|
+
|
|
139
|
+
Contains both legacy trajectory format (for backward compatibility) and
|
|
140
|
+
modern v3 trace format (preferred going forward).
|
|
141
|
+
"""
|
|
117
142
|
run_id: str
|
|
143
|
+
|
|
144
|
+
# DEPRECATED: Legacy format maintained for training code compatibility.
|
|
145
|
+
# Will be removed once training migrates to reading from `trace` field.
|
|
146
|
+
# See: monorepo/trace_single_source.txt for migration plan.
|
|
118
147
|
trajectories: list[RolloutTrajectory]
|
|
148
|
+
|
|
119
149
|
branches: dict[str, list[str]] = Field(default_factory=dict)
|
|
120
150
|
metrics: RolloutMetrics
|
|
121
151
|
aborted: bool = False
|
|
122
152
|
ops_executed: int = 0
|
|
153
|
+
|
|
154
|
+
# OPTIONAL: correlation ID for linking rollout to inference traces
|
|
155
|
+
# If not provided, the trainer infers it from the ?cid=... query parameter of trajectory.inference_url
|
|
156
|
+
trace_correlation_id: str | None = None
|
|
157
|
+
|
|
158
|
+
# PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
|
|
159
|
+
# for rollout data and should be used by all new code. Contains richer data than
|
|
160
|
+
# trajectories including token IDs, logprobs, timing, and multimodal content.
|
|
123
161
|
trace: dict[str, Any] | None = None
|
|
162
|
+
pipeline_metadata: dict[str, Any] = Field(default_factory=dict)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class _ExtraAllowModel(BaseModel):
|
|
166
|
+
"""Base helper that preserves unknown keys while still exposing typed attributes."""
|
|
167
|
+
|
|
168
|
+
model_config = ConfigDict(extra="allow")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class TaskDescriptor(_ExtraAllowModel):
|
|
172
|
+
"""Human-readable task identifiers shown in UIs and logs."""
|
|
173
|
+
|
|
174
|
+
id: str
|
|
175
|
+
name: str
|
|
176
|
+
description: str | None = None
|
|
177
|
+
version: str | None = None
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class DatasetInfo(_ExtraAllowModel):
|
|
181
|
+
"""Metadata about the prompt/task dataset powering the environment."""
|
|
182
|
+
|
|
183
|
+
id: str | None = None
|
|
184
|
+
name: str | None = None
|
|
185
|
+
version: str | None = None
|
|
186
|
+
splits: list[str] | None = None
|
|
187
|
+
default_split: str | None = None
|
|
188
|
+
description: str | None = None
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class RubricCriterion(_ExtraAllowModel):
|
|
192
|
+
id: str
|
|
193
|
+
description: str
|
|
194
|
+
weight: float | None = None
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class RubricSection(_ExtraAllowModel):
|
|
198
|
+
name: str
|
|
199
|
+
criteria: list[RubricCriterion] = Field(default_factory=list)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class RubricInfo(_ExtraAllowModel):
|
|
203
|
+
"""Outcome and event scoring definitions used by judges."""
|
|
204
|
+
|
|
205
|
+
outcome: RubricSection | None = None
|
|
206
|
+
events: RubricSection | None = None
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class InferenceInfo(_ExtraAllowModel):
|
|
210
|
+
"""Recommended defaults for policy model routing."""
|
|
211
|
+
|
|
212
|
+
model: str | None = None
|
|
213
|
+
inference_url: str | None = None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class LimitsInfo(_ExtraAllowModel):
|
|
217
|
+
"""Operational limits the environment enforces."""
|
|
218
|
+
|
|
219
|
+
max_turns: int | None = None
|
|
220
|
+
max_response_tokens: int | None = None
|
|
221
|
+
timeout_seconds: int | None = None
|
|
124
222
|
|
|
125
223
|
|
|
126
|
-
class TaskInfo(
|
|
224
|
+
class TaskInfo(_ExtraAllowModel):
|
|
127
225
|
"""Static metadata describing the capabilities of a Task App task."""
|
|
128
226
|
|
|
129
|
-
task:
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
227
|
+
task: TaskDescriptor
|
|
228
|
+
environment: str
|
|
229
|
+
dataset: DatasetInfo
|
|
230
|
+
rubric: RubricInfo
|
|
231
|
+
inference: InferenceInfo
|
|
232
|
+
limits: LimitsInfo
|
|
233
|
+
task_metadata: dict[str, Any] = Field(
|
|
234
|
+
default_factory=dict,
|
|
235
|
+
description="Task-specific extras (e.g. prompt version info, documentation links).",
|
|
236
|
+
)
|
synth_ai/task/proxy.py
CHANGED
|
@@ -1,39 +1,15 @@
|
|
|
1
|
-
"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
|
|
1
|
+
"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
|
|
2
|
+
|
|
3
|
+
The proxy is tool-agnostic - each task app provides its own tools schema.
|
|
4
|
+
"""
|
|
2
5
|
|
|
3
6
|
from __future__ import annotations
|
|
4
7
|
|
|
5
8
|
import copy
|
|
6
9
|
import json
|
|
7
10
|
import re
|
|
8
|
-
from collections.abc import Iterable
|
|
9
11
|
from typing import Any
|
|
10
12
|
|
|
11
|
-
INTERACT_TOOL_SCHEMA: list[dict[str, Any]] = [
|
|
12
|
-
{
|
|
13
|
-
"type": "function",
|
|
14
|
-
"function": {
|
|
15
|
-
"name": "interact",
|
|
16
|
-
"description": "Perform one or more environment actions.",
|
|
17
|
-
"parameters": {
|
|
18
|
-
"type": "object",
|
|
19
|
-
"properties": {
|
|
20
|
-
"actions": {
|
|
21
|
-
"type": "array",
|
|
22
|
-
"items": {"type": "string"},
|
|
23
|
-
"description": "List of environment actions to execute in order.",
|
|
24
|
-
},
|
|
25
|
-
"reasoning": {
|
|
26
|
-
"type": "string",
|
|
27
|
-
"description": "Optional reasoning for the chosen actions.",
|
|
28
|
-
},
|
|
29
|
-
},
|
|
30
|
-
"required": ["actions"],
|
|
31
|
-
"additionalProperties": False,
|
|
32
|
-
},
|
|
33
|
-
},
|
|
34
|
-
}
|
|
35
|
-
]
|
|
36
|
-
|
|
37
13
|
_REMOVE_FIELDS = {
|
|
38
14
|
"stop_after_tool_calls",
|
|
39
15
|
"thinking_mode",
|
|
@@ -44,14 +20,12 @@ _REMOVE_SAMPLING_FIELDS = {"temperature", "top_p"}
|
|
|
44
20
|
_GPT5_MIN_COMPLETION_TOKENS = 16000
|
|
45
21
|
|
|
46
22
|
|
|
47
|
-
def _ensure_tools(payload: dict[str, Any]) -> None:
|
|
48
|
-
tools = payload.get("tools")
|
|
49
|
-
if not isinstance(tools, list) or not tools:
|
|
50
|
-
payload["tools"] = copy.deepcopy(INTERACT_TOOL_SCHEMA)
|
|
51
|
-
|
|
52
|
-
|
|
53
23
|
def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str, Any]:
|
|
54
|
-
"""Sanitise an OpenAI chat completions payload for Task App usage.
|
|
24
|
+
"""Sanitise an OpenAI chat completions payload for Task App usage.
|
|
25
|
+
|
|
26
|
+
The task app is responsible for providing tools in the payload.
|
|
27
|
+
This function only handles model-specific parameter normalization.
|
|
28
|
+
"""
|
|
55
29
|
|
|
56
30
|
sanitized = copy.deepcopy(payload)
|
|
57
31
|
for field in _REMOVE_FIELDS:
|
|
@@ -68,10 +42,18 @@ def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str,
|
|
|
68
42
|
mct = sanitized.get("max_completion_tokens")
|
|
69
43
|
if not isinstance(mct, int) or mct < _GPT5_MIN_COMPLETION_TOKENS:
|
|
70
44
|
sanitized["max_completion_tokens"] = _GPT5_MIN_COMPLETION_TOKENS
|
|
71
|
-
|
|
45
|
+
|
|
46
|
+
# Set tool_choice to first provided tool (task app must provide tools)
|
|
47
|
+
# If tool_choice is not already set and tools are provided, use the first one
|
|
48
|
+
if "tool_choice" not in sanitized:
|
|
49
|
+
tools = sanitized.get("tools", [])
|
|
50
|
+
if isinstance(tools, list) and tools:
|
|
51
|
+
first_func = tools[0].get("function", {})
|
|
52
|
+
if isinstance(first_func, dict) and "name" in first_func:
|
|
53
|
+
sanitized["tool_choice"] = {"type": "function", "function": {"name": first_func["name"]}}
|
|
54
|
+
|
|
72
55
|
sanitized["parallel_tool_calls"] = False
|
|
73
56
|
|
|
74
|
-
_ensure_tools(sanitized)
|
|
75
57
|
return sanitized
|
|
76
58
|
|
|
77
59
|
|
|
@@ -206,24 +188,18 @@ def parse_tool_call_from_text(text: str) -> tuple[list[str], str]:
|
|
|
206
188
|
return [], text
|
|
207
189
|
|
|
208
190
|
|
|
209
|
-
def
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
if
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
},
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str, Any]:
|
|
226
|
-
"""Ensure the first choice carries a tool_call derived from text if absent."""
|
|
191
|
+
def synthesize_tool_call_if_missing(
|
|
192
|
+
openai_response: dict[str, Any],
|
|
193
|
+
fallback_tool_name: str = "interact"
|
|
194
|
+
) -> dict[str, Any]:
|
|
195
|
+
"""Ensure the first choice carries a tool_call derived from text if absent.
|
|
196
|
+
|
|
197
|
+
This is a fallback for models that don't properly support tool calling.
|
|
198
|
+
Task apps can specify their preferred fallback tool name (e.g., "interact", "execute_sequence").
|
|
199
|
+
|
|
200
|
+
DEPRECATED: Task apps should prefer models with native tool calling support.
|
|
201
|
+
This function will be removed in a future version.
|
|
202
|
+
"""
|
|
227
203
|
|
|
228
204
|
if not isinstance(openai_response, dict):
|
|
229
205
|
return openai_response
|
|
@@ -245,8 +221,24 @@ def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str
|
|
|
245
221
|
if not actions:
|
|
246
222
|
return openai_response
|
|
247
223
|
|
|
224
|
+
# Build a fallback tool call using the provided tool name
|
|
225
|
+
payload = {
|
|
226
|
+
"actions": [str(a).strip() for a in actions if str(a).strip()],
|
|
227
|
+
}
|
|
228
|
+
if reasoning.strip():
|
|
229
|
+
payload["reasoning"] = reasoning.strip()
|
|
230
|
+
|
|
231
|
+
tool_call = {
|
|
232
|
+
"id": f"tool_{fallback_tool_name}_fallback",
|
|
233
|
+
"type": "function",
|
|
234
|
+
"function": {
|
|
235
|
+
"name": fallback_tool_name,
|
|
236
|
+
"arguments": json.dumps(payload, ensure_ascii=False),
|
|
237
|
+
},
|
|
238
|
+
}
|
|
239
|
+
|
|
248
240
|
new_message = copy.deepcopy(message)
|
|
249
|
-
new_message["tool_calls"] = [
|
|
241
|
+
new_message["tool_calls"] = [tool_call]
|
|
250
242
|
if "content" not in new_message:
|
|
251
243
|
new_message["content"] = None
|
|
252
244
|
|