synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai has been flagged as potentially problematic.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +17 -5
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +56 -26
- examples/swe/task_app/hosted/rollout.py +42 -0
- examples/swe/task_app/hosted/test_service.py +5 -6
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/TESTING.md +275 -0
- examples/task_apps/__init__.py +0 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/__init__.py +0 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +5 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +324 -21
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +76 -7
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +77 -4
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +117 -9
- examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +218 -0
- examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
- examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
- examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
- examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
- examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
- examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/task_app/README.md +14 -0
- examples/task_apps/enron/task_app/__init__.py +1 -0
- examples/task_apps/enron/task_app/grpo_enron.py +906 -0
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
- examples/task_apps/enron/tests/__init__.py +4 -0
- examples/task_apps/enron/tests/conftest.py +115 -0
- examples/task_apps/enron/tests/integration/__init__.py +4 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +179 -0
- examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
- examples/task_apps/enron/tests/unit/__init__.py +4 -0
- examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
- examples/task_apps/math/__init__.py +0 -0
- examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
- examples/task_apps/pokemon_battle/__init__.py +2 -0
- examples/task_apps/pokemon_battle/modal_app.py +104 -0
- examples/task_apps/pokemon_battle/task_app/README.md +68 -0
- examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
- examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README.md +357 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/__init__.py +3 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +75 -0
- examples/task_apps/pokemon_red/task_app.py +799 -0
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +193 -0
- examples/task_apps/sokoban/README.md +307 -0
- examples/task_apps/sokoban/__init__.py +3 -0
- examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
- examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/task_app.py +1058 -0
- examples/task_apps/sokoban/tests/__init__.py +4 -0
- examples/task_apps/sokoban/tests/conftest.py +113 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +4 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
- examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +4 -0
- examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
- examples/task_apps/verilog/__init__.py +1 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +24 -0
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/README.md +12 -0
- examples/task_apps/verilog/task_app/__init__.py +1 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +1166 -0
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
- examples/task_apps/verilog/tests/__init__.py +4 -0
- examples/task_apps/verilog/tests/conftest.py +115 -0
- examples/task_apps/verilog/tests/integration/__init__.py +4 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +181 -0
- examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
- examples/task_apps/verilog/tests/unit/__init__.py +4 -0
- examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
- examples/vlm/crafter_openai_vlm_agent.py +4 -4
- examples/vlm/run_crafter_vlm_benchmark.py +4 -4
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/workflows/__init__.py +0 -0
- examples/workflows/math_rl/__init__.py +0 -0
- examples/workflows/math_rl/download_dataset.py +80 -0
- synth_ai/__init__.py +2 -2
- synth_ai/api/models/supported.py +1 -0
- synth_ai/api/train/builders.py +25 -11
- synth_ai/api/train/cli.py +12 -6
- synth_ai/api/train/configs/__init__.py +10 -10
- synth_ai/api/train/configs/rl.py +5 -4
- synth_ai/api/train/configs/sft.py +4 -3
- synth_ai/api/train/env_resolver.py +5 -2
- synth_ai/api/train/supported_algos.py +10 -5
- synth_ai/api/train/utils.py +7 -4
- synth_ai/cli/__init__.py +48 -59
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/_storage.py +4 -3
- synth_ai/cli/_validate_task_app.py +11 -0
- synth_ai/cli/balance.py +4 -3
- synth_ai/cli/calc.py +2 -2
- synth_ai/cli/demo.py +14 -7
- synth_ai/cli/legacy_root_backup.py +1 -1
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/rl_demo.py +8 -7
- synth_ai/cli/root.py +0 -97
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +1922 -190
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +29 -17
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/enron/engine.py +7 -2
- synth_ai/environments/examples/enron/environment.py +68 -0
- synth_ai/environments/examples/red/engine.py +27 -0
- synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
- synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
- synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
- synth_ai/environments/examples/red/environment.py +60 -0
- synth_ai/environments/examples/sokoban/taskset.py +116 -0
- synth_ai/environments/examples/verilog/engine.py +104 -12
- synth_ai/evals/client.py +58 -61
- synth_ai/jobs/client.py +16 -4
- synth_ai/judge_schemas.py +9 -9
- synth_ai/py.typed +0 -0
- synth_ai/task/__init__.py +24 -5
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +138 -39
- synth_ai/task/proxy.py +48 -56
- synth_ai/task/rubrics/__init__.py +56 -0
- synth_ai/task/rubrics/loaders.py +152 -0
- synth_ai/task/rubrics/models.py +57 -0
- synth_ai/task/rubrics/scoring.py +116 -0
- synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
- synth_ai/task/server.py +8 -7
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +413 -6
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/decorators.py +7 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/replica_sync.py +4 -4
- synth_ai/tracing_v3/serialization.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/trace_utils.py +317 -0
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +66 -43
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/METADATA +4 -1
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/RECORD +278 -126
- examples/agora_ex/README_MoE.md +0 -224
- examples/agora_ex/__init__.py +0 -7
- examples/agora_ex/agora_ex.py +0 -65
- examples/agora_ex/agora_ex_task_app.py +0 -590
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
- examples/agora_ex/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/system_prompt_CURRENT.md +0 -63
- examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
- examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
- examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
- synth_ai/rubrics/__init__.py +0 -22
- synth_ai/task/rubrics.py +0 -219
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
- /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
- /examples/{rl/task_app → task_apps/math}/README.md +0 -0
- /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
- /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
- /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
- /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
synth_ai/task/rubrics/__init__.py ADDED

@@ -0,0 +1,56 @@
+"""Rubric schema, loading, and scoring helpers for Task Apps.
+
+This module provides:
+- Flexible rubric models (Criterion, Rubric) for general task app use
+- Strict validators (StrictCriterion, StrictRubric) for step-wise judges
+- Loading utilities supporting JSON, YAML, and HTTP sources
+- Blending utilities for composing rubrics
+- Scoring utilities for events and outcomes
+"""
+
+# Core models (flexible validation)
+from .models import Criterion, Rubric
+
+# Loading and blending
+from .loaders import blend_rubrics, load_rubric
+
+# Scoring
+from .scoring import score_events_against_rubric, score_outcome_against_rubric
+
+# Strict validators (for judge configs)
+from .strict import (
+    StrictCriterion,
+    StrictRubric,
+    ValidationError,
+    validate_rubric_dict,
+    validate_rubric_file,
+    validate_rubric_files,
+)
+
+__all__ = [
+    # Flexible models
+    "Criterion",
+    "Rubric",
+    # Loaders
+    "load_rubric",
+    "blend_rubrics",
+    # Scoring
+    "score_events_against_rubric",
+    "score_outcome_against_rubric",
+    # Strict validators
+    "StrictCriterion",
+    "StrictRubric",
+    "ValidationError",
+    "validate_rubric_dict",
+    "validate_rubric_file",
+    "validate_rubric_files",
+]
+
+# Maintain backwards compatibility
+# Old code may import these names expecting the flexible variants
+RubricCriterion = StrictCriterion
+RubricSpec = StrictRubric
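
As a quick orientation, here is a minimal usage sketch of the re-exported package surface (not part of the diff; it assumes synth-ai 0.2.14 is installed and that the flexible models and scorers behave as reconstructed above; criterion ids and scores are made up for the example):

from synth_ai.task.rubrics import Criterion, Rubric, score_events_against_rubric

# Illustrative rubric: weights above 1.0 are allowed by the flexible variant.
rubric = Rubric(
    version="1",
    goal_text="Collect wood and craft a table",
    criteria=[
        Criterion(id="collect_wood", description="Gathered wood", weight=2.0),
        Criterion(id="craft_table", description="Crafted a table", weight=1.0),
    ],
)
events = [
    {"criterion_id": "collect_wood", "score": 1.0},
    {"criterion_id": "craft_table", "score": 0.5},
]
result = score_events_against_rubric(events, rubric)
# weighted_sum divides by total weight: (1.0*2.0 + 0.5*1.0) / 3.0 ≈ 0.83
print(result["score"], result["per_criterion"])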
synth_ai/task/rubrics/loaders.py ADDED

@@ -0,0 +1,152 @@
+"""Rubric loading and blending utilities."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+from .models import Criterion, Rubric
+
+
+def _load_text(source: str) -> tuple[str, str | None]:
+    """Load text from file path or return as-is."""
+    path = Path(source)
+    if path.exists():
+        return path.read_text(encoding="utf-8"), path.suffix.lower()
+    return source, None
+
+
+def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
+    """Parse JSON or YAML text into a dictionary."""
+    text = text.strip()
+    if not text:
+        raise ValueError("Rubric source is empty")
+    if suffix in (".yaml", ".yml"):
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric YAML must produce a mapping") from None
+        return data
+    if text.startswith("{"):
+        return json.loads(text)
+    if text.startswith("http://") or text.startswith("https://"):
+        import requests  # type: ignore
+
+        response = requests.get(text, timeout=15)
+        response.raise_for_status()
+        return _parse_structured(response.text, suffix)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load rubric text") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric text must decode to a mapping") from None
+        return data
+
+
+def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
+    """Load rubric from file path, dict, or return existing Rubric.
+
+    Args:
+        source: File path (JSON/YAML), dict, existing Rubric, or None
+
+    Returns:
+        Parsed Rubric instance or None if source is None
+
+    Raises:
+        ValueError: If the rubric format is incorrect (e.g., backend judge format)
+        ValidationError: If the rubric fails schema validation
+    """
+    if source is None:
+        return None
+    if isinstance(source, Rubric):
+        return source
+
+    # Load and parse the data
+    if isinstance(source, dict):
+        data = source
+    else:
+        text, suffix = _load_text(str(source))
+        data = _parse_structured(text, suffix)
+
+    # Check if this looks like a backend judge rubric (wrong format)
+    if isinstance(data, dict) and "event" in data and "outcome" in data:
+        # Missing required task app rubric fields
+        if "version" not in data and "goal_text" not in data and "criteria" not in data:
+            source_hint = f" ({source})" if isinstance(source, str) else ""
+            raise ValueError(
+                f"Rubric appears to be in backend judge format (has 'event'/'outcome' keys){source_hint}. "
+                f"Task apps require rubrics with 'version', 'goal_text', and 'criteria' fields. "
+                f"Backend judge rubrics should be named '*_backend_judge.json' and loaded by judge functions."
+            )
+
+    return Rubric.model_validate(data)
+
+
+def _merge_weights(base: Criterion, override: Criterion) -> float:
+    """Merge criterion weights from base and override rubrics."""
+    if override.weight != 1.0 and base.weight != 1.0:
+        return base.weight * override.weight
+    if override.weight != 1.0:
+        return override.weight
+    return base.weight
+
+
+def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
+    """Blend two rubrics by merging criteria and inheriting properties.
+
+    Override rubric takes precedence for descriptions and settings.
+    Weights are merged multiplicatively when both are non-default.
+
+    Args:
+        base: Base rubric providing defaults
+        override: Override rubric with specific customizations
+
+    Returns:
+        Blended rubric or None if both inputs are None
+    """
+    if override is None and base is None:
+        return None
+    if base is None:
+        return override
+    if override is None:
+        return base
+
+    base_map = {criterion.id: criterion for criterion in base.criteria}
+    merged: list[Criterion] = []
+
+    for ov in override.criteria:
+        if ov.id in base_map:
+            existing = base_map.pop(ov.id)
+            merged.append(
+                Criterion(
+                    id=ov.id,
+                    description=ov.description or existing.description,
+                    weight=_merge_weights(existing, ov),
+                    required=ov.required if ov.required is not None else existing.required,
+                )
+            )
+        else:
+            merged.append(ov)
+
+    merged.extend(base_map.values())
+
+    aggregation = override.aggregation
+    if aggregation == "inherit":
+        aggregation = base.aggregation
+
+    return Rubric(
+        version=override.version or base.version,
+        goal_text=override.goal_text or base.goal_text,
+        criteria=merged,
+        aggregation=aggregation,
+    )
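
To make the loader and blending semantics concrete, a small sketch follows (not part of the diff; dictionaries stand in for JSON/YAML files, and the merge behaviour is exactly what blend_rubrics above implements):

from synth_ai.task.rubrics.loaders import blend_rubrics, load_rubric

base = load_rubric({
    "version": "1",
    "goal_text": "Base goal",
    "criteria": [
        {"id": "a", "description": "base A", "weight": 1.0},
        {"id": "b", "description": "base B", "weight": 2.0},
    ],
})
override = load_rubric({
    "version": "2",
    "criteria": [{"id": "a", "description": "override A", "weight": 0.5}],
})
blended = blend_rubrics(base, override)
# Criterion "a" takes the override description and weight 0.5; "b" is carried over
# from the base rubric; version comes from the override, goal_text from the base.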
synth_ai/task/rubrics/models.py ADDED

@@ -0,0 +1,57 @@
+"""Rubric and Criterion data models."""
+
+from __future__ import annotations
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class Criterion(BaseModel):
+    """Single scoring criterion within a rubric.
+
+    Flexible variant allowing weights > 1.0 and no normalization requirement.
+    Used by task apps for general rubric scoring.
+    """
+
+    id: str
+    description: str
+    weight: float = 1.0
+    required: bool = False
+
+    @field_validator("weight")
+    @classmethod
+    def _validate_weight(cls, value: float) -> float:
+        if value <= 0:
+            raise ValueError("criterion weight must be positive")
+        return value
+
+
+class Rubric(BaseModel):
+    """Rubric definition for scoring task app outcomes.
+
+    Supports flexible aggregation and blending. Criteria weights do not need
+    to sum to 1.0, making this suitable for general task app usage.
+    """
+
+    version: str
+    goal_text: str | None = None
+    criteria: list[Criterion] = Field(default_factory=list)
+    aggregation: str = "weighted_sum"
+
+    @field_validator("aggregation")
+    @classmethod
+    def _validate_aggregation(cls, value: str) -> str:
+        allowed = {"sum", "weighted_sum", "custom", "inherit"}
+        if value not in allowed:
+            raise ValueError(f"aggregation must be one of {sorted(allowed)}")
+        return value
+
+    @field_validator("criteria")
+    @classmethod
+    def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
+        seen = set()
+        for criterion in criteria:
+            if criterion.id in seen:
+                raise ValueError(f"duplicate criterion id: {criterion.id}")
+            seen.add(criterion.id)
+        return criteria
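
A short sketch of the flexible models' validation behaviour (not part of the diff; it relies only on the validators shown above):

from pydantic import ValidationError
from synth_ai.task.rubrics.models import Criterion, Rubric

# Weights above 1.0 are accepted by the flexible variant.
ok = Rubric(version="1", criteria=[Criterion(id="x", description="X", weight=3.0)])

# Duplicate criterion ids are rejected by the criteria validator.
try:
    Rubric(
        version="1",
        criteria=[
            Criterion(id="x", description="first"),
            Criterion(id="x", description="second"),
        ],
    )
except ValidationError as exc:
    print(exc)  # mentions "duplicate criterion id: x"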
synth_ai/task/rubrics/scoring.py ADDED

@@ -0,0 +1,116 @@
+"""Rubric scoring utilities for events and outcomes."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from typing import Any
+
+from .models import Criterion, Rubric
+
+
+def _as_float(value: Any) -> float | None:
+    """Safely convert value to float, returning None on failure."""
+    try:
+        return float(value)
+    except Exception:
+        return None
+
+
+def _score(
+    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
+) -> dict[str, Any]:
+    """Compute aggregate score from criterion values.
+
+    Args:
+        criteria: List of criteria defining scoring dimensions
+        values: Map of criterion IDs to scores
+        aggregation: How to aggregate ("sum", "weighted_sum", "custom")
+
+    Returns:
+        Dict with aggregation method, total score, and per-criterion breakdown
+    """
+    if aggregation == "inherit":
+        aggregation = "weighted_sum"
+    per_criterion: dict[str, dict[str, Any]] = {}
+    total = 0.0
+    total_weight = 0.0
+    for criterion in criteria:
+        score = values.get(criterion.id, 0.0)
+        per_criterion[criterion.id] = {
+            "score": score,
+            "weight": criterion.weight,
+            "required": criterion.required,
+        }
+        if aggregation == "sum":
+            total += score
+        elif aggregation == "weighted_sum":
+            total += score * criterion.weight
+            total_weight += criterion.weight
+    if aggregation == "weighted_sum" and total_weight > 0:
+        total = total / total_weight
+    if aggregation == "custom":
+        total = None  # type: ignore[assignment]
+    return {
+        "aggregation": aggregation,
+        "score": total,
+        "per_criterion": per_criterion,
+    }
+
+
+def score_events_against_rubric(
+    events: list[dict[str, Any]], rubric: Rubric | None
+) -> dict[str, Any]:
+    """Score a list of evaluation events against a rubric.
+
+    Events should contain criterion_id/id/criterion and score fields.
+
+    Args:
+        events: List of event dicts with scoring info
+        rubric: Rubric defining criteria and aggregation
+
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    for event in events or []:
+        if not isinstance(event, dict):
+            continue
+        cid = event.get("criterion_id") or event.get("id") or event.get("criterion")
+        score = _as_float(event.get("score"))
+        if cid and score is not None:
+            values[str(cid)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)
+
+
+def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
+    """Score a rollout outcome against a rubric.
+
+    Outcome should be a dict mapping criterion IDs to scores, optionally
+    nested under a "criteria" key.
+
+    Args:
+        outcome: Outcome dict with criterion scores
+        rubric: Rubric defining criteria and aggregation
+
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    if isinstance(outcome, dict):
+        candidates = (
+            outcome.get("criteria") if isinstance(outcome.get("criteria"), dict) else outcome
+        )
+        if isinstance(candidates, dict):
+            for key, value in candidates.items():
+                score = _as_float(value)
+                if score is not None:
+                    values[str(key)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)
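
The scorers return a dict with the aggregation mode, a total, and a per-criterion breakdown; a sketch with the "sum" mode follows (not part of the diff; criterion ids and scores are illustrative):

from synth_ai.task.rubrics.models import Criterion, Rubric
from synth_ai.task.rubrics.scoring import score_outcome_against_rubric

rubric = Rubric(
    version="1",
    aggregation="sum",
    criteria=[
        Criterion(id="compiles", description="Design compiles"),
        Criterion(id="tests_pass", description="Testbench passes"),
    ],
)
outcome = {"criteria": {"compiles": 1.0, "tests_pass": 0.0}}
result = score_outcome_against_rubric(outcome, rubric)
# result == {"aggregation": "sum", "score": 1.0, "per_criterion": {...}} where each
# per_criterion entry carries the score, weight, and required flag.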
synth_ai/{rubrics/validators.py → task/rubrics/strict.py} RENAMED

@@ -1,15 +1,32 @@
+"""Strict rubric validators for step-wise judges.
+
+These validators enforce stricter constraints than the general-purpose rubrics:
+- Weights must be ≤ 1.0 and sum to exactly 1.0
+- Only weighted_sum aggregation is allowed
+- All required fields must be non-empty
+
+Used primarily for validation in judge configurations.
+"""
+
 from __future__ import annotations

 import json
 import math
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any,
+from typing import Any, Literal

 import pydantic


-class
-    """Single scoring criterion
+class StrictCriterion(pydantic.BaseModel):
+    """Single scoring criterion with strict validation.
+
+    Enforces:
+    - Weight ≤ 1.0 (for proper normalization)
+    - Weight > 0.0 (positive)
+    - Non-empty strings
+    """

     id: str
     description: str

@@ -35,16 +52,23 @@ class RubricCriterion(pydantic.BaseModel):
         return value


-class
-    """
+class StrictRubric(pydantic.BaseModel):
+    """Strict rubric definition for step-wise judges.
+
+    Enforces:
+    - Weights must sum to 1.0
+    - Only weighted_sum aggregation
+    - Non-empty version and goal_text
+    - At least one criterion
+    """

     version: str
     goal_text: str
     aggregation: Literal["weighted_sum"]
-    criteria: list[
+    criteria: list[StrictCriterion]

     @pydantic.model_validator(mode="after")
-    def _validate_weights(self) ->
+    def _validate_weights(self) -> StrictRubric:
         if not self.criteria:
             raise ValueError("rubric must declare at least one criterion")
         total_weight = sum(criterion.weight for criterion in self.criteria)

@@ -71,56 +95,55 @@ class RubricSpec(pydantic.BaseModel):
         return value


+# Re-export pydantic's ValidationError for convenience
 ValidationError = pydantic.ValidationError


-def validate_rubric_dict(payload: dict[str, Any]) ->
-    """
-
-
+def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
+    """Validate an in-memory rubric payload with strict rules.
+
     Args:
-        payload: Dictionary representing the rubric JSON
+        payload: Dictionary representing the rubric JSON
+
     Returns:
-        Validated
+        Validated StrictRubric instance
+
     Raises:
-        ValidationError: If
-            invalid weights.
+        ValidationError: If payload is invalid or doesn't meet strict constraints
     """
-
     if not isinstance(payload, dict):
         raise TypeError("rubric payload must be a dictionary")
-    return
+    return StrictRubric.model_validate(payload)


 def _load_payload_from_file(path: Path) -> dict[str, Any]:
+    """Load JSON rubric from file."""
     if path.suffix.lower() != ".json":
         raise ValueError(f"Unsupported rubric file type: {path}")
     text = path.read_text(encoding="utf-8")
     return json.loads(text)


-def validate_rubric_file(path: Path) ->
-    """
-
-
+def validate_rubric_file(path: Path) -> StrictRubric:
+    """Load and validate a rubric file with strict rules.
+
     Args:
-        path: Path to a JSON rubric document
+        path: Path to a JSON rubric document
+
     Returns:
-        Validated
+        Validated StrictRubric instance
     """
-
     payload = _load_payload_from_file(path)
     return validate_rubric_dict(payload)


-def validate_rubric_files(paths: Iterable[Path]) -> list[
-    """
-
-
+def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
+    """Validate multiple rubric files with strict rules.
+
     Useful for bulk validation inside tests or CI checks.
     """
-
-    validated: list[RubricSpec] = []
+    validated: list[StrictRubric] = []
     for path in paths:
         validated.append(validate_rubric_file(path))
     return validated
+
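
For the strict variant, a hedged sketch of validation follows (not part of the diff; it assumes StrictCriterion exposes only the id/description/weight fields visible in this hunk and that the weight-sum check accepts weights summing exactly to 1.0):

from synth_ai.task.rubrics.strict import ValidationError, validate_rubric_dict

payload = {
    "version": "1",
    "goal_text": "Judge each step of the rollout",
    "aggregation": "weighted_sum",
    "criteria": [
        {"id": "progress", "description": "Moves toward the goal", "weight": 0.5},
        {"id": "safety", "description": "Avoids invalid actions", "weight": 0.5},
    ],
}
rubric = validate_rubric_dict(payload)  # returns a StrictRubric

# Dropping a criterion breaks the weights-sum-to-1.0 invariant.
bad = dict(payload, criteria=payload["criteria"][:1])
try:
    validate_rubric_dict(bad)
except ValidationError:
    pass  # raised by the _validate_weights model validator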
synth_ai/task/server.py CHANGED

@@ -70,7 +70,7 @@ class TaskAppConfig:
     provide_task_instances: InstanceProvider
     rollout: RolloutExecutor
     dataset_registry: TaskDatasetRegistry | None = None
-    rubrics: RubricBundle = field(default_factory=RubricBundle)
+    rubrics: RubricBundle | None = field(default_factory=RubricBundle)
     proxy: ProxyConfig | None = None
     routers: Sequence[APIRouter] = field(default_factory=tuple)
     middleware: Sequence[Middleware] = field(default_factory=tuple)

@@ -93,7 +93,7 @@ class TaskAppConfig:
             provide_task_instances=self.provide_task_instances,
             rollout=self.rollout,
             dataset_registry=self.dataset_registry,
-            rubrics=self.rubrics,
+            rubrics=self.rubrics or RubricBundle(),
             proxy=self.proxy,
             routers=tuple(self.routers),
             middleware=tuple(self.middleware),

@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]

 def create_task_app(config: TaskAppConfig) -> FastAPI:
     cfg = config.clone()
+    cfg.rubrics = cfg.rubrics or RubricBundle()
     app = FastAPI(title=cfg.name, description=cfg.description)

     for key, value in cfg.app_state.items():

@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
     async def info() -> Mapping[str, Any]:
         dataset_meta = cfg.base_task_info.dataset
         rubrics: dict[str, Any] | None = None
-
+        rubric_bundle = cfg.rubrics
+        if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
             rubrics = {
-                "outcome":
-                "events":
+                "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
+                "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
             }
         payload = {
             "service": {
                 "task": cfg.base_task_info.task,
-                "version": cfg.base_task_info.task.
+                "version": cfg.base_task_info.task.version,
             },
             "dataset": dataset_meta,
             "rubrics": rubrics,
             "inference": cfg.base_task_info.inference,
-            "capabilities": cfg.base_task_info.capabilities,
             "limits": cfg.base_task_info.limits,
         }
         return to_jsonable(payload)
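
The net effect of these server.py changes is that cfg.rubrics may now be None and is normalized to an empty RubricBundle before the app is built, and the /info endpoint dumps the bundle's rubrics when present. A small sketch of the resulting "rubrics" block (not part of the diff; it assumes RubricBundle.outcome holds the flexible Rubric model added in this release and that RubricBundle.events is unset; field values are illustrative):

from synth_ai.task.rubrics import Criterion, Rubric

outcome = Rubric(
    version="1",
    goal_text="Finish the episode",
    criteria=[Criterion(id="done", description="Episode completed")],
)
rubrics_block = {
    "outcome": outcome.model_dump(),  # mirrors rubric_bundle.outcome.model_dump()
    "events": None,                   # mirrors the `else None` branch in info()
}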