synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/analyze_semantic_words.sh +2 -2
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/README.md +98 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
- examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
- examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
- examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/README.md +158 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
- examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
- examples/multi_step/configs/verilog_rl_lora.toml +80 -123
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
- examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
- examples/qwen_coder/configs/coder_lora_small.toml +1 -3
- examples/qwen_vl/README.md +10 -12
- examples/qwen_vl/SETUP_COMPLETE.md +7 -8
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
- examples/qwen_vl/collect_data_via_cli.md +76 -84
- examples/qwen_vl/collect_vision_traces.py +4 -4
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
- examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
- examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
- examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
- examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
- examples/qwen_vl/run_vision_comparison.sh +6 -7
- examples/rl/README.md +5 -5
- examples/rl/configs/rl_from_base_qwen.toml +26 -1
- examples/rl/configs/rl_from_base_qwen17.toml +6 -2
- examples/rl/task_app/README.md +1 -2
- examples/rl/task_app/math_single_step.py +2 -2
- examples/run_crafter_demo.sh +2 -2
- examples/sft/README.md +1 -1
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
- examples/swe/task_app/README.md +32 -2
- examples/swe/task_app/grpo_swe_mini.py +4 -0
- examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
- examples/swe/task_app/hosted/inference/openai_client.py +4 -38
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/swe/task_app/morph_backend.py +178 -0
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/README.md +1 -1
- examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
- examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/math/README.md +1 -2
- examples/task_apps/pokemon_red/README.md +3 -4
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
- examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
- examples/task_apps/pokemon_red/task_app.py +288 -39
- examples/task_apps/sokoban/README.md +2 -3
- examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
- examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
- examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
- examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +1 -1
- examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
- synth_ai/api/train/builders.py +99 -4
- synth_ai/api/train/cli.py +516 -26
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +23 -2
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +61 -7
- synth_ai/api/train/configs/sft.py +6 -2
- synth_ai/api/train/configs/shared.py +59 -2
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/auth/credentials.py +119 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +94 -18
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +18 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/demo/__init__.py +6 -0
- synth_ai/cli/commands/demo/core.py +163 -0
- synth_ai/cli/commands/eval/__init__.py +19 -0
- synth_ai/cli/commands/eval/core.py +1112 -0
- synth_ai/cli/commands/eval/errors.py +81 -0
- synth_ai/cli/commands/eval/validation.py +133 -0
- synth_ai/cli/commands/filter/__init__.py +12 -0
- synth_ai/cli/commands/filter/core.py +424 -0
- synth_ai/cli/commands/filter/errors.py +55 -0
- synth_ai/cli/commands/filter/validation.py +77 -0
- synth_ai/cli/commands/help/__init__.py +177 -0
- synth_ai/cli/commands/help/core.py +72 -0
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/__init__.py +64 -0
- synth_ai/cli/commands/status/client.py +192 -0
- synth_ai/cli/commands/status/config.py +92 -0
- synth_ai/cli/commands/status/errors.py +20 -0
- synth_ai/cli/commands/status/formatters.py +164 -0
- synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
- synth_ai/cli/commands/status/subcommands/files.py +79 -0
- synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
- synth_ai/cli/commands/status/subcommands/models.py +79 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/runs.py +81 -0
- synth_ai/cli/commands/status/subcommands/summary.py +47 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/status/utils.py +114 -0
- synth_ai/cli/commands/train/__init__.py +53 -0
- synth_ai/cli/commands/train/core.py +21 -0
- synth_ai/cli/commands/train/errors.py +117 -0
- synth_ai/cli/commands/train/judge_schemas.py +200 -0
- synth_ai/cli/commands/train/judge_validation.py +305 -0
- synth_ai/cli/commands/train/validation.py +386 -0
- synth_ai/cli/demo.py +30 -158
- synth_ai/cli/deploy/__init__.py +43 -0
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/eval/__init__.py +36 -0
- synth_ai/cli/eval/core.py +5 -0
- synth_ai/cli/eval/errors.py +31 -0
- synth_ai/cli/eval/validation.py +5 -0
- synth_ai/cli/filter/__init__.py +28 -0
- synth_ai/cli/filter/core.py +5 -0
- synth_ai/cli/filter/errors.py +23 -0
- synth_ai/cli/filter/validation.py +5 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/modal_serve/__init__.py +12 -0
- synth_ai/cli/modal_serve/core.py +14 -0
- synth_ai/cli/modal_serve/errors.py +8 -0
- synth_ai/cli/modal_serve/validation.py +11 -0
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/serve/__init__.py +12 -0
- synth_ai/cli/serve/core.py +14 -0
- synth_ai/cli/serve/errors.py +8 -0
- synth_ai/cli/serve/validation.py +11 -0
- synth_ai/cli/setup.py +20 -265
- synth_ai/cli/status.py +7 -126
- synth_ai/cli/task_app_deploy.py +1 -10
- synth_ai/cli/task_app_modal_serve.py +4 -9
- synth_ai/cli/task_app_serve.py +4 -11
- synth_ai/cli/task_apps.py +51 -1480
- synth_ai/cli/train/__init__.py +12 -0
- synth_ai/cli/train/core.py +21 -0
- synth_ai/cli/train/errors.py +8 -0
- synth_ai/cli/train/validation.py +24 -0
- synth_ai/cli/train.py +1 -14
- synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/environments/examples/red/engine.py +33 -12
- synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
- synth_ai/environments/examples/red/environment.py +26 -0
- synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
- synth_ai/http.py +12 -0
- synth_ai/judge_schemas.py +10 -10
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/learning/rl/client.py +3 -1
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/__init__.py +29 -0
- synth_ai/streaming/config.py +94 -0
- synth_ai/streaming/handlers.py +518 -0
- synth_ai/streaming/streamer.py +320 -0
- synth_ai/streaming/types.py +95 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +45 -9
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/migration_helper.py +1 -2
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +40 -33
- synth_ai/utils/http.py +4 -1
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +285 -3
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
- synth_ai/cli/tui.py +0 -62
- synth_ai/tui/__init__.py +0 -5
- synth_ai/tui/__main__.py +0 -13
- synth_ai/tui/cli/__init__.py +0 -1
- synth_ai/tui/cli/query_experiments.py +0 -164
- synth_ai/tui/cli/query_experiments_v3.py +0 -164
- synth_ai/tui/dashboard.py +0 -911
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pydantic schemas for judge/rubric configuration.
|
|
3
|
+
|
|
4
|
+
These models define the ACTUAL fields used by the backend judge service,
|
|
5
|
+
with all dead code removed. This is the single source of truth for what
|
|
6
|
+
gets sent in HTTP requests.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
from pydantic import Field, model_validator
|
|
14
|
+
from synth_ai.api.train.configs.shared import ExtraModel
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"RubricWeightsConfig",
|
|
18
|
+
"RubricConfig",
|
|
19
|
+
"JudgeOptionsConfig",
|
|
20
|
+
"JudgeConfig",
|
|
21
|
+
"JudgeRequestPayload",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class RubricWeightsConfig(ExtraModel):
    """
    Client-side reward blending weights (these are not sent to the backend).

    Each weight scales one reward source before the sources are added together
    into the final reward signal used for policy gradients:

        total_reward = (env * env_return) + (event * sum(event_scores)) + (outcome * outcome_score)

    Every weight is constrained to be >= 0 and at least one must be non-zero.
    """

    # Scales the environment's own (task app native) rewards.
    env: float = Field(
        ge=0.0,
        default=1.0,
        description="Weight for environment rewards (task app native rewards)",
    )
    # Scales the summed per-event (step-level) judge scores.
    event: float = Field(
        ge=0.0,
        default=0.0,
        description="Weight for per-event judge scores (step-level judging)",
    )
    # Scales the single outcome (episode-level) judge score.
    outcome: float = Field(
        ge=0.0,
        default=0.0,
        description="Weight for outcome judge score (episode-level judging)",
    )

    @model_validator(mode="after")
    def _validate_weights_sum(self) -> RubricWeightsConfig:
        """Reject a configuration in which every weight is exactly zero."""
        every_weight_zero = all(
            weight == 0.0 for weight in (self.env, self.event, self.outcome)
        )
        if every_weight_zero:
            raise ValueError("At least one reward weight must be non-zero")
        return self
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class RubricConfig(ExtraModel):
    """
    Top-level rubric settings.

    Holds the on/off switch for rubric-based judging together with the
    weights used to blend env, event and outcome rewards.
    """

    # Master switch; rubric-based judging is off unless explicitly enabled.
    enabled: bool = Field(
        description="Master switch for rubric-based judging",
        default=False,
    )
    # Defaults to a RubricWeightsConfig built from its own field defaults.
    weights: RubricWeightsConfig = Field(
        description="Reward blending weights (env/event/outcome)",
        default_factory=RubricWeightsConfig,
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class JudgeOptionsConfig(ExtraModel):
    """
    Judge provider options, as carried in the "options" object of the judge
    score HTTP request.

    The fields declared here are intended to map one-to-one onto the backend
    JudgeOptions schema. At least one of ``event`` / ``outcome`` must be on.
    """

    # Required; restricted by regex to the three supported provider names.
    provider: str = Field(
        ...,
        pattern=r"^(openai|groq|gemini)$",
        description="Judge provider type ('openai', 'groq', 'gemini')",
    )
    # Required, non-empty model identifier.
    model: str = Field(
        ...,
        min_length=1,
        description="Model identifier (e.g., 'openai/gpt-oss-120b', 'gpt-5')",
    )
    rubric_id: Optional[str] = Field(
        description="Base rubric identifier (e.g., 'crafter/bundle@v1')",
        default=None,
    )
    # Both judging granularities are enabled by default.
    event: bool = Field(
        description="Enable per-event (step-level) judging",
        default=True,
    )
    outcome: bool = Field(
        description="Enable outcome (episode-level) judging",
        default=True,
    )
    # Optional; when provided it must be strictly positive.
    timeout_s: Optional[float] = Field(
        gt=0,
        default=None,
        description="Request timeout in seconds",
    )
    metadata: dict[str, Any] = Field(
        description="Optional metadata (e.g., {'async': true, 'custom_field': 'value'})",
        default_factory=dict,
    )
    rubric_overrides: dict[str, Any] = Field(
        description=(
            "Static rubric criteria overrides (rarely used - TaskInfo overrides take priority). "
            "Format: {'event': {'criteria': [...]}, 'outcome': {'criteria': [...]}}"
        ),
        default_factory=dict,
    )

    @model_validator(mode="after")
    def _validate_at_least_one_enabled(self) -> JudgeOptionsConfig:
        """Reject a configuration that turns off both event and outcome judging."""
        if not (self.event or self.outcome):
            raise ValueError("At least one of 'event' or 'outcome' must be enabled")
        return self
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class JudgeConfig(ExtraModel):
    """
    Top-level judge settings, parsed from the TOML ``[judge]`` section.

    The only declared field is the required ``options`` sub-model.
    """

    # Required: there is no default, so a [judge] section must carry options.
    options: JudgeOptionsConfig = Field(
        ...,
        description="Judge provider options (sent to backend)",
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# HTTP Request Payload Structures (for documentation/type safety)
|
|
142
|
+
|
|
143
|
+
class JudgeRequestPayload(ExtraModel):
    """
    HTTP request payload structure for POST /api/judge/v1/score.

    This is the ACTUAL payload sent to the backend judge service.
    Used for type safety and documentation only.

    All four fields are required. Unknown keys are accepted rather than
    rejected (``extra="allow"``).
    """

    # Pydantic v2 configuration. The class-based ``Config`` is deprecated in
    # v2 and triggers a deprecation warning when the class is created; a plain
    # dict is accepted for ``model_config`` and avoids a new import.
    model_config = {"extra": "allow"}  # Backend might add extra fields

    policy_name: str = Field(..., description="Name of the policy being evaluated")
    task_app: dict[str, Any] = Field(..., description="Task app metadata (id, base_url)")
    trace: dict[str, Any] = Field(..., description="Tracing v3 payload (event_history, metadata)")
    options: dict[str, Any] = Field(..., description="Judge options (provider, model, etc.)")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Helper to convert to backend request format
|
|
160
|
+
|
|
161
|
+
def build_judge_http_options(
    options_config: JudgeOptionsConfig,
    *,
    rubric_overrides_from_task_info: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """
    Build the 'options' dict for HTTP request to backend judge.

    ``provider``, ``model``, ``event`` and ``outcome`` are always included.
    ``rubric_id``, ``timeout_s``, ``metadata`` and ``rubric_overrides`` are
    only included when they carry a value.

    Args:
        options_config: Validated judge options from TOML
        rubric_overrides_from_task_info: Dynamic overrides fetched from TaskInfo
            (takes priority over ``options_config.rubric_overrides``)

    Returns:
        Dict ready to send in HTTP request payload. Dict-valued entries are
        shallow copies, so mutating the payload does not modify
        ``options_config`` or the caller's override dict.
    """
    payload: dict[str, Any] = {
        "provider": options_config.provider,
        "model": options_config.model,
        "event": options_config.event,
        "outcome": options_config.outcome,
    }

    # Optional fields
    if options_config.rubric_id:
        payload["rubric_id"] = options_config.rubric_id

    if options_config.timeout_s is not None:
        payload["timeout_s"] = options_config.timeout_s

    if options_config.metadata:
        # Copy so per-request edits to the payload cannot leak into the config.
        payload["metadata"] = dict(options_config.metadata)

    # Rubric overrides: TaskInfo takes priority over static config
    overrides = rubric_overrides_from_task_info or options_config.rubric_overrides
    if overrides:
        payload["rubric_overrides"] = dict(overrides)

    return payload
|
|
200
|
+
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Validation logic for judge/rubric configuration from TOML.
|
|
3
|
+
|
|
4
|
+
This module validates and normalizes judge/rubric config, removing all dead fields
|
|
5
|
+
and ensuring only the fields actually used by the backend are present.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import warnings
|
|
11
|
+
from collections.abc import MutableMapping
|
|
12
|
+
from typing import Any, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
from pydantic import ValidationError
|
|
15
|
+
|
|
16
|
+
from .errors import InvalidJudgeConfigError, InvalidRubricConfigError
|
|
17
|
+
from .judge_schemas import JudgeConfig, JudgeOptionsConfig, RubricConfig, RubricWeightsConfig
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"validate_judge_config",
|
|
21
|
+
"validate_rubric_config",
|
|
22
|
+
"extract_and_validate_judge_rubric",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
# Dead fields that should trigger deprecation warnings
#
# Each set names keys that may still appear in a user's TOML but are no longer
# used. They are consumed by _warn_deprecated_fields() (via the validate_*
# functions) and by check_for_deprecated_fields().

# Keys looked up at the top level of [rubric]. "event" and "outcome" also get
# their own section-specific warning in validate_rubric_config().
DEPRECATED_RUBRIC_FIELDS = {
    "model",
    "api_base",
    "api_key_env",
    "event",
    "outcome",
}

# Keys looked up at the top level of [judge]. validate_judge_config() copies a
# top-level timeout_s into the options when the options do not set one.
DEPRECATED_JUDGE_FIELDS = {
    "type",
    "timeout_s", # Moved to judge.options.timeout_s
}

# Keys looked up in [judge.options]; validate_judge_config() strips them from
# the options before validating.
DEPRECATED_JUDGE_OPTIONS_FIELDS = {
    "max_concurrency",
    "tracks",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _warn_deprecated_fields(section: str, fields: set[str], present_fields: set[str]) -> None:
    """
    Emit one DeprecationWarning naming every deprecated key found in a section.

    Args:
        section: Section label used in the message, e.g. "rubric"
        fields: Keys considered deprecated for that section
        present_fields: Keys actually present in the user's config

    Nothing is emitted when the two sets do not intersect.
    """
    stale = sorted(fields & present_fields)
    if not stale:
        return

    names = ", ".join(stale)
    message = (
        f"[{section}] contains deprecated fields that are no longer used: {names}. "
        "These fields will be ignored and should be removed from your config. "
        "See judge/rubric cleanup guide for details."
    )
    # stacklevel=3 attributes the warning to the caller of the function that
    # called this helper, rather than to the helper or its direct caller.
    warnings.warn(message, DeprecationWarning, stacklevel=3)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def validate_rubric_config(config: MutableMapping[str, Any]) -> RubricConfig:
    """
    Validate and normalize rubric configuration from TOML.

    Only ``enabled`` and ``weights`` are read from the section; every other
    key is dropped. Deprecated keys produce DeprecationWarnings.

    Args:
        config: Raw [rubric] section from TOML

    Returns:
        Validated RubricConfig instance (a disabled default when ``config``
        is empty)

    Raises:
        InvalidRubricConfigError: If validation fails. Each reported problem
            is prefixed with its full TOML path (``rubric.enabled``,
            ``rubric.weights.env``, ...).
    """
    if not config:
        # Default: rubric disabled
        return RubricConfig(enabled=False)

    config_dict = dict(config)

    # Warn about deprecated fields
    _warn_deprecated_fields("rubric", DEPRECATED_RUBRIC_FIELDS, set(config_dict.keys()))

    # Warn about deprecated subsections
    if "event" in config_dict:
        warnings.warn(
            "[rubric.event] section is deprecated and no longer used. "
            "Criteria are now fetched dynamically from TaskInfo or specified in "
            "[judge.options.rubric_overrides]. This section will be ignored.",
            DeprecationWarning,
            stacklevel=2,
        )

    if "outcome" in config_dict:
        warnings.warn(
            "[rubric.outcome] section is deprecated and no longer used. "
            "Criteria are now fetched dynamically from TaskInfo or specified in "
            "[judge.options.rubric_overrides]. This section will be ignored.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Extract only valid fields
    enabled = config_dict.get("enabled", False)
    weights_dict = config_dict.get("weights", {})

    # Pydantic error locations are relative to the model being built, so
    # remember which model that is in order to report a full TOML path.
    error_prefix = "rubric.weights"

    # Validate using Pydantic
    try:
        if not isinstance(weights_dict, dict):
            raise ValueError("[rubric.weights] must be a dictionary")

        weights = RubricWeightsConfig(**weights_dict)
        error_prefix = "rubric"
        return RubricConfig(enabled=enabled, weights=weights)

    except ValidationError as exc:
        errors = []
        for error in exc.errors():
            # Model-level validators report an empty location; joining through
            # the prefix avoids a dangling "rubric.weights.: ..." path.
            path = ".".join([error_prefix, *(str(part) for part in error["loc"])])
            msg = error["msg"]
            errors.append(f" • {path}: {msg}")
        raise InvalidRubricConfigError(
            detail="Rubric validation failed:\n" + "\n".join(errors)
        ) from exc
    except Exception as exc:
        # Covers the non-dict [rubric.weights] case above and any other
        # failure while constructing the models (e.g. non-string keys).
        raise InvalidRubricConfigError(
            detail=f"Rubric validation failed: {exc}"
        ) from exc
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def validate_judge_config(config: MutableMapping[str, Any]) -> Optional[JudgeConfig]:
    """
    Validate and normalize judge configuration from TOML.

    Deprecated keys produce DeprecationWarnings. Deprecated option keys are
    removed before validation, and a top-level ``timeout_s`` is copied into
    the options when the options do not already define one.

    Args:
        config: Raw [judge] section from TOML

    Returns:
        Validated JudgeConfig instance, or None if not present

    Raises:
        InvalidJudgeConfigError: If ``[judge.options]`` is missing, is not a
            dictionary, or fails validation
    """
    if not config:
        return None

    config_dict = dict(config)

    # Warn about deprecated top-level fields
    _warn_deprecated_fields("judge", DEPRECATED_JUDGE_FIELDS, set(config_dict.keys()))

    # Extract judge.options (required)
    options_dict = config_dict.get("options")
    if not options_dict:
        raise InvalidJudgeConfigError(
            detail="[judge.options] section is required when [judge] is present"
        )

    if not isinstance(options_dict, dict):
        raise InvalidJudgeConfigError(
            detail="[judge.options] must be a dictionary"
        )

    # Warn about deprecated options fields
    _warn_deprecated_fields(
        "judge.options",
        DEPRECATED_JUDGE_OPTIONS_FIELDS,
        set(options_dict.keys()),
    )

    # Remove deprecated fields from options (builds a new dict, so the
    # caller's mapping is left untouched)
    options_dict = {
        k: v for k, v in options_dict.items()
        if k not in DEPRECATED_JUDGE_OPTIONS_FIELDS
    }

    # Migrate judge.timeout_s to judge.options.timeout_s if present
    if "timeout_s" in config_dict and "timeout_s" not in options_dict:
        warnings.warn(
            "[judge].timeout_s is deprecated. Use [judge.options].timeout_s instead. "
            "Auto-migrating for now.",
            DeprecationWarning,
            stacklevel=2,
        )
        options_dict["timeout_s"] = config_dict["timeout_s"]

    # Validate using Pydantic
    try:
        options = JudgeOptionsConfig(**options_dict)
        return JudgeConfig(options=options)

    except ValidationError as exc:
        errors = []
        for error in exc.errors():
            # Model-level validators (e.g. the event/outcome check) report an
            # empty location; joining through the prefix avoids emitting
            # "judge.options.: ..." with a dangling dot.
            path = ".".join(["judge.options", *(str(part) for part in error["loc"])])
            msg = error["msg"]
            errors.append(f" • {path}: {msg}")
        raise InvalidJudgeConfigError(
            detail="Judge validation failed:\n" + "\n".join(errors)
        ) from exc
    except Exception as exc:
        raise InvalidJudgeConfigError(
            detail=f"Judge validation failed: {exc}"
        ) from exc
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def extract_and_validate_judge_rubric(
    toml_config: MutableMapping[str, Any]
) -> Tuple[RubricConfig, Optional[JudgeConfig]]:
    """
    Pull the ``rubric`` and ``judge`` sections out of a full TOML config,
    validate each, and cross-check them against one another.

    Args:
        toml_config: Full TOML configuration dict

    Returns:
        Tuple of (validated_rubric, validated_judge_or_none)

    Raises:
        InvalidRubricConfigError: If rubric validation fails
        InvalidJudgeConfigError: If judge validation fails
    """
    rubric_section = toml_config.get("rubric", {})
    judge_section = toml_config.get("judge", {})

    rubric_config = validate_rubric_config(rubric_section)
    judge_config = validate_judge_config(judge_section) if judge_section else None

    # With the rubric switched off there is nothing to cross-check.
    if not rubric_config.enabled:
        return rubric_config, judge_config

    # An enabled rubric without judge settings is downgraded to disabled.
    if not judge_config:
        warnings.warn(
            "[rubric].enabled=true but [judge] section is missing. "
            "Rubric-based judging requires judge configuration. "
            "Rubric scoring will be disabled.",
            UserWarning,
            stacklevel=2,
        )
        rubric_config.enabled = False
        return rubric_config, judge_config

    # Flag positive weights whose matching judging type is turned off.
    weights = rubric_config.weights
    options = judge_config.options
    mismatches = (
        (
            weights.event > 0 and not options.event,
            "[rubric.weights].event > 0 but [judge.options].event=false. "
            "Event-level judge scores will be 0 (no event judging enabled).",
        ),
        (
            weights.outcome > 0 and not options.outcome,
            "[rubric.weights].outcome > 0 but [judge.options].outcome=false. "
            "Outcome judge score will be 0 (no outcome judging enabled).",
        ),
    )
    for is_mismatched, message in mismatches:
        if is_mismatched:
            warnings.warn(message, UserWarning, stacklevel=2)

    return rubric_config, judge_config
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# Helper to check if config has any deprecated fields (for testing/migration)
|
|
264
|
+
|
|
265
|
+
def check_for_deprecated_fields(toml_config: MutableMapping[str, Any]) -> dict[str, list[str]]:
    """
    Check TOML config for deprecated fields without validation.

    Args:
        toml_config: Full TOML configuration dict; the "rubric" and "judge"
            sections (and "judge" -> "options") are inspected

    Returns:
        Dict of {section: [deprecated_field_names]} for reporting. Sections
        with no deprecated fields are omitted. Field names are listed in
        sorted order so the report is deterministic across runs; for
        "rubric", the "(entire section)" markers follow the sorted names.
    """
    deprecated: dict[str, list[str]] = {}

    rubric_dict = toml_config.get("rubric", {})
    if rubric_dict:
        # The constants are sets, so sort to avoid hash-order-dependent output.
        found = sorted(
            field for field in DEPRECATED_RUBRIC_FIELDS
            if field in rubric_dict
        )
        found.extend(
            f"{section} (entire section)"
            for section in ("event", "outcome")
            if section in rubric_dict
        )
        if found:
            deprecated["rubric"] = found

    judge_dict = toml_config.get("judge", {})
    if judge_dict:
        found = sorted(
            field for field in DEPRECATED_JUDGE_FIELDS
            if field in judge_dict
        )
        if found:
            deprecated["judge"] = found

        options_dict = judge_dict.get("options", {})
        if options_dict:
            options_found = sorted(
                field for field in DEPRECATED_JUDGE_OPTIONS_FIELDS
                if field in options_dict
            )
            if options_found:
                deprecated["judge.options"] = options_found

    return deprecated
|
|
305
|
+
|