synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (299) hide show
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,200 @@
1
+ """
2
+ Pydantic schemas for judge/rubric configuration.
3
+
4
+ These models define the ACTUAL fields used by the backend judge service,
5
+ with all dead code removed. This is the single source of truth for what
6
+ gets sent in HTTP requests.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Optional
12
+
13
+ from pydantic import Field, model_validator
14
+ from synth_ai.api.train.configs.shared import ExtraModel
15
+
16
+ __all__ = [
17
+ "RubricWeightsConfig",
18
+ "RubricConfig",
19
+ "JudgeOptionsConfig",
20
+ "JudgeConfig",
21
+ "JudgeRequestPayload",
22
+ ]
23
+
24
+
25
class RubricWeightsConfig(ExtraModel):
    """
    Client-side reward blending weights (these are not sent to the backend).

    The three weights decide how environment rewards, per-event judge scores
    and the outcome judge score are mixed into the final reward signal used
    for policy gradients.

    Formula:
        total_reward = (env * env_return) + (event * sum(event_scores)) + (outcome * outcome_score)
    """

    # Every weight is constrained to be >= 0 by its ``ge`` bound.
    env: float = Field(
        1.0,
        ge=0.0,
        description="Weight for environment rewards (task app native rewards)",
    )
    event: float = Field(
        0.0,
        ge=0.0,
        description="Weight for per-event judge scores (step-level judging)",
    )
    outcome: float = Field(
        0.0,
        ge=0.0,
        description="Weight for outcome judge score (episode-level judging)",
    )

    @model_validator(mode="after")
    def _validate_weights_sum(self) -> RubricWeightsConfig:
        """Reject a configuration in which all three weights are zero."""
        if not any((self.env, self.event, self.outcome)):
            raise ValueError("At least one reward weight must be non-zero")
        return self
57
+
58
+
59
class RubricConfig(ExtraModel):
    """
    Top-level rubric configuration.

    Holds the on/off switch for rubric-based judging together with the
    weights used to blend env/event/outcome rewards.
    """

    # Off by default.
    enabled: bool = Field(False, description="Master switch for rubric-based judging")
    # A fresh RubricWeightsConfig (with its own defaults) is built per instance.
    weights: RubricWeightsConfig = Field(
        description="Reward blending weights (env/event/outcome)",
        default_factory=RubricWeightsConfig,
    )
73
+
74
+
75
class JudgeOptionsConfig(ExtraModel):
    """
    Judge provider options (sent to the backend in the HTTP request).

    These fields populate the "options" object of the judge score request
    and are described as mapping directly to the backend JudgeOptions schema.
    """

    # --- required fields -------------------------------------------------
    provider: str = Field(
        ...,
        pattern=r"^(openai|groq|gemini)$",
        description="Judge provider type ('openai', 'groq', 'gemini')",
    )
    model: str = Field(
        ...,
        min_length=1,
        description="Model identifier (e.g., 'openai/gpt-oss-120b', 'gpt-5')",
    )

    # --- optional fields -------------------------------------------------
    rubric_id: Optional[str] = Field(
        None,
        description="Base rubric identifier (e.g., 'crafter/bundle@v1')",
    )
    event: bool = Field(True, description="Enable per-event (step-level) judging")
    outcome: bool = Field(True, description="Enable outcome (episode-level) judging")
    timeout_s: Optional[float] = Field(
        None,
        gt=0,
        description="Request timeout in seconds",
    )
    metadata: dict[str, Any] = Field(
        description="Optional metadata (e.g., {'async': true, 'custom_field': 'value'})",
        default_factory=dict,
    )
    rubric_overrides: dict[str, Any] = Field(
        description=(
            "Static rubric criteria overrides (rarely used - TaskInfo overrides take priority). "
            "Format: {'event': {'criteria': [...]}, 'outcome': {'criteria': [...]}}"
        ),
        default_factory=dict,
    )

    @model_validator(mode="after")
    def _validate_at_least_one_enabled(self) -> JudgeOptionsConfig:
        """Reject options in which both event and outcome judging are off."""
        if not (self.event or self.outcome):
            raise ValueError("At least one of 'event' or 'outcome' must be enabled")
        return self
127
+
128
+
129
class JudgeConfig(ExtraModel):
    """
    Top-level judge configuration.

    Parsed from the TOML [judge] section; the only declared field is the
    required ``options`` sub-model.
    """

    options: JudgeOptionsConfig = Field(..., description="Judge provider options (sent to backend)")
139
+
140
+
141
+ # HTTP Request Payload Structures (for documentation/type safety)
142
+
143
class JudgeRequestPayload(ExtraModel):
    """
    HTTP request payload structure for POST /api/judge/v1/score.

    This is the ACTUAL payload sent to the backend judge service.
    Used for type safety and documentation only.
    """

    # Pydantic v2 configuration. This replaces the nested ``class Config``,
    # which is the deprecated v1 spelling and emits a deprecation warning
    # under Pydantic v2. Extra keys stay allowed because the backend might
    # add extra fields.
    model_config = {"extra": "allow"}

    policy_name: str = Field(..., description="Name of the policy being evaluated")
    task_app: dict[str, Any] = Field(..., description="Task app metadata (id, base_url)")
    trace: dict[str, Any] = Field(..., description="Tracing v3 payload (event_history, metadata)")
    options: dict[str, Any] = Field(..., description="Judge options (provider, model, etc.)")
157
+
158
+
159
+ # Helper to convert to backend request format
160
+
161
def build_judge_http_options(
    options_config: JudgeOptionsConfig,
    *,
    rubric_overrides_from_task_info: Optional[dict[str, Any]] = None,
) -> dict[str, Any]:
    """
    Assemble the 'options' dict for the HTTP request to the backend judge.

    Args:
        options_config: Validated judge options from TOML
        rubric_overrides_from_task_info: Dynamic overrides fetched from TaskInfo;
            when non-empty they are used instead of the static overrides

    Returns:
        Dict ready to send in the HTTP request payload. The provider, model,
        event and outcome keys are always present; rubric_id, timeout_s,
        metadata and rubric_overrides are added only when they carry a value.
    """
    http_options: dict[str, Any] = dict(
        provider=options_config.provider,
        model=options_config.model,
        event=options_config.event,
        outcome=options_config.outcome,
    )

    # Optional fields: falsy values are skipped, except timeout_s, which is
    # only skipped when it is None.
    rubric_id = options_config.rubric_id
    if rubric_id:
        http_options["rubric_id"] = rubric_id

    timeout = options_config.timeout_s
    if timeout is not None:
        http_options["timeout_s"] = timeout

    meta = options_config.metadata
    if meta:
        http_options["metadata"] = meta

    # TaskInfo overrides win; the static config is only the fallback.
    chosen_overrides = rubric_overrides_from_task_info or options_config.rubric_overrides
    if chosen_overrides:
        http_options["rubric_overrides"] = chosen_overrides

    return http_options
200
+
@@ -0,0 +1,305 @@
1
+ """
2
+ Validation logic for judge/rubric configuration from TOML.
3
+
4
+ This module validates and normalizes judge/rubric config, removing all dead fields
5
+ and ensuring only the fields actually used by the backend are present.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import warnings
11
+ from collections.abc import MutableMapping
12
+ from typing import Any, Optional, Tuple
13
+
14
+ from pydantic import ValidationError
15
+
16
+ from .errors import InvalidJudgeConfigError, InvalidRubricConfigError
17
+ from .judge_schemas import JudgeConfig, JudgeOptionsConfig, RubricConfig, RubricWeightsConfig
18
+
19
+ __all__ = [
20
+ "validate_judge_config",
21
+ "validate_rubric_config",
22
+ "extract_and_validate_judge_rubric",
23
+ ]
24
+
25
# Dead fields: their presence in a config triggers a deprecation warning.

# Keys that are no longer read from the [rubric] section.
DEPRECATED_RUBRIC_FIELDS = {"model", "api_base", "api_key_env", "event", "outcome"}

# Keys that are no longer read from the top level of the [judge] section.
# "timeout_s" moved to judge.options.timeout_s.
DEPRECATED_JUDGE_FIELDS = {"type", "timeout_s"}

# Keys that are no longer read from the [judge.options] section.
DEPRECATED_JUDGE_OPTIONS_FIELDS = {"max_concurrency", "tracks"}
43
+
44
+
45
def _warn_deprecated_fields(section: str, fields: set[str], present_fields: set[str]) -> None:
    """
    Emit one DeprecationWarning naming the dead fields found in a config section.

    Args:
        section: Section name shown in square brackets at the start of the message
        fields: Field names considered deprecated for that section
        present_fields: Field names actually present in the user's config

    Nothing is emitted when the two sets do not overlap. The offending names
    are listed in sorted order.
    """
    stale = sorted(fields.intersection(present_fields))
    if not stale:
        return

    joined = ", ".join(stale)
    message = (
        f"[{section}] contains deprecated fields that are no longer used: {joined}. "
        "These fields will be ignored and should be removed from your config. "
        "See judge/rubric cleanup guide for details."
    )
    # stacklevel=3 attributes the warning two frames above this helper.
    warnings.warn(message, DeprecationWarning, stacklevel=3)
57
+
58
+
59
def validate_rubric_config(config: MutableMapping[str, Any]) -> RubricConfig:
    """
    Validate and normalize rubric configuration from TOML.

    Only ``enabled`` and ``weights`` are read from the section; deprecated
    keys trigger DeprecationWarnings and are otherwise ignored.

    Args:
        config: Raw [rubric] section from TOML

    Returns:
        Validated RubricConfig instance (a disabled one when ``config`` is empty)

    Raises:
        InvalidRubricConfigError: If validation fails
    """
    if not config:
        # Default: rubric disabled
        return RubricConfig(enabled=False)

    # A non-table value (e.g. ``rubric = true``) cannot be turned into a dict;
    # report it as a config error instead of leaking a raw TypeError/ValueError.
    try:
        config_dict = dict(config)
    except (TypeError, ValueError) as exc:
        raise InvalidRubricConfigError(
            detail="Rubric validation failed: [rubric] must be a dictionary"
        ) from exc

    # Warn about deprecated fields
    _warn_deprecated_fields("rubric", DEPRECATED_RUBRIC_FIELDS, set(config_dict.keys()))

    # Warn about deprecated subsections
    if "event" in config_dict:
        warnings.warn(
            "[rubric.event] section is deprecated and no longer used. "
            "Criteria are now fetched dynamically from TaskInfo or specified in "
            "[judge.options.rubric_overrides]. This section will be ignored.",
            DeprecationWarning,
            stacklevel=2,
        )

    if "outcome" in config_dict:
        warnings.warn(
            "[rubric.outcome] section is deprecated and no longer used. "
            "Criteria are now fetched dynamically from TaskInfo or specified in "
            "[judge.options.rubric_overrides]. This section will be ignored.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Extract only valid fields
    enabled = config_dict.get("enabled", False)
    weights_dict = config_dict.get("weights", {})

    # Tracks which model is being validated, so error locations get the right
    # TOML path: weight errors belong under rubric.weights, not rubric.
    error_prefix = "rubric.weights"

    # Validate using Pydantic
    try:
        if not isinstance(weights_dict, dict):
            raise ValueError("[rubric.weights] must be a dictionary")

        weights = RubricWeightsConfig(**weights_dict)
        error_prefix = "rubric"
        return RubricConfig(enabled=enabled, weights=weights)

    except ValidationError as exc:
        errors = []
        for error in exc.errors():
            # Model-level validator errors have an empty loc; joining the
            # prefix with the loc parts avoids a dangling "." in that case.
            loc = ".".join([error_prefix, *(str(x) for x in error["loc"])])
            errors.append(f" • {loc}: {error['msg']}")
        raise InvalidRubricConfigError(
            detail="Rubric validation failed:\n" + "\n".join(errors)
        ) from exc
    except Exception as exc:
        raise InvalidRubricConfigError(
            detail=f"Rubric validation failed: {exc}"
        ) from exc
125
+
126
+
127
def validate_judge_config(config: MutableMapping[str, Any]) -> Optional[JudgeConfig]:
    """
    Validate and normalize judge configuration from TOML.

    Deprecated keys trigger DeprecationWarnings; deprecated option keys are
    dropped before validation, and a top-level ``timeout_s`` is migrated into
    the options when the options do not already set one.

    Args:
        config: Raw [judge] section from TOML

    Returns:
        Validated JudgeConfig instance, or None if not present

    Raises:
        InvalidJudgeConfigError: If validation fails
    """
    if not config:
        return None

    # A non-table value (e.g. ``judge = true``) cannot be turned into a dict;
    # report it as a config error instead of leaking a raw TypeError/ValueError.
    try:
        config_dict = dict(config)
    except (TypeError, ValueError) as exc:
        raise InvalidJudgeConfigError(
            detail="Judge validation failed: [judge] must be a dictionary"
        ) from exc

    # Warn about deprecated top-level fields
    _warn_deprecated_fields("judge", DEPRECATED_JUDGE_FIELDS, set(config_dict.keys()))

    # Extract judge.options (required)
    options_dict = config_dict.get("options")
    if not options_dict:
        raise InvalidJudgeConfigError(
            detail="[judge.options] section is required when [judge] is present"
        )

    if not isinstance(options_dict, dict):
        raise InvalidJudgeConfigError(
            detail="[judge.options] must be a dictionary"
        )

    # Warn about deprecated options fields
    _warn_deprecated_fields(
        "judge.options",
        DEPRECATED_JUDGE_OPTIONS_FIELDS,
        set(options_dict.keys()),
    )

    # Remove deprecated fields from options (builds a new dict; the caller's
    # mapping is left untouched)
    options_dict = {
        k: v for k, v in options_dict.items()
        if k not in DEPRECATED_JUDGE_OPTIONS_FIELDS
    }

    # Migrate judge.timeout_s to judge.options.timeout_s if present
    if "timeout_s" in config_dict and "timeout_s" not in options_dict:
        warnings.warn(
            "[judge].timeout_s is deprecated. Use [judge.options].timeout_s instead. "
            "Auto-migrating for now.",
            DeprecationWarning,
            stacklevel=2,
        )
        options_dict["timeout_s"] = config_dict["timeout_s"]

    # Validate using Pydantic
    try:
        options = JudgeOptionsConfig(**options_dict)
        return JudgeConfig(options=options)

    except ValidationError as exc:
        errors = []
        for error in exc.errors():
            # Model-level validator errors have an empty loc; joining the
            # prefix with the loc parts avoids rendering "judge.options.:".
            loc = ".".join(["judge.options", *(str(x) for x in error["loc"])])
            errors.append(f" • {loc}: {error['msg']}")
        raise InvalidJudgeConfigError(
            detail="Judge validation failed:\n" + "\n".join(errors)
        ) from exc
    except Exception as exc:
        raise InvalidJudgeConfigError(
            detail=f"Judge validation failed: {exc}"
        ) from exc
201
+
202
+
203
def extract_and_validate_judge_rubric(
    toml_config: MutableMapping[str, Any]
) -> Tuple[RubricConfig, Optional[JudgeConfig]]:
    """
    Pull the [rubric] and [judge] sections out of a full TOML config and validate both.

    Besides per-section validation, two cross-checks are applied:
    an enabled rubric without a judge config is switched off (with a
    UserWarning), and a positive event/outcome weight whose matching judging
    type is disabled produces a UserWarning.

    Args:
        toml_config: Full TOML configuration dict

    Returns:
        Tuple of (validated_rubric, validated_judge_or_none)

    Raises:
        InvalidRubricConfigError: If rubric validation fails
        InvalidJudgeConfigError: If judge validation fails
    """
    raw_rubric = toml_config.get("rubric", {})
    raw_judge = toml_config.get("judge", {})

    rubric_config = validate_rubric_config(raw_rubric)

    # The judge section is optional; it is only validated when non-empty.
    judge_config = None
    if raw_judge:
        judge_config = validate_judge_config(raw_judge)

    if rubric_config.enabled:
        if not judge_config:
            # Rubric judging cannot run without judge settings: warn and disable.
            warnings.warn(
                "[rubric].enabled=true but [judge] section is missing. "
                "Rubric-based judging requires judge configuration. "
                "Rubric scoring will be disabled.",
                UserWarning,
                stacklevel=2,
            )
            rubric_config.enabled = False
        else:
            blend = rubric_config.weights
            judge_opts = judge_config.options

            # (mismatch?, warning text) pairs, checked in event-then-outcome order.
            mismatches = (
                (
                    blend.event > 0 and not judge_opts.event,
                    "[rubric.weights].event > 0 but [judge.options].event=false. "
                    "Event-level judge scores will be 0 (no event judging enabled).",
                ),
                (
                    blend.outcome > 0 and not judge_opts.outcome,
                    "[rubric.weights].outcome > 0 but [judge.options].outcome=false. "
                    "Outcome judge score will be 0 (no outcome judging enabled).",
                ),
            )
            for is_mismatch, message in mismatches:
                if is_mismatch:
                    warnings.warn(message, UserWarning, stacklevel=2)

    return rubric_config, judge_config
261
+
262
+
263
+ # Helper to check if config has any deprecated fields (for testing/migration)
264
+
265
def check_for_deprecated_fields(toml_config: MutableMapping[str, Any]) -> dict[str, list[str]]:
    """
    Check TOML config for deprecated fields without validation.

    Sections whose value is not a mapping (e.g. ``judge = true``) are skipped
    rather than raising, since this helper only reports and does not validate.

    Args:
        toml_config: Full TOML configuration dict

    Returns:
        Dict of {section: [deprecated_field_names]} for reporting. Only
        sections with at least one hit appear. Names within a section are
        sorted so the report is stable across runs (set iteration order is
        not); the "(entire section)" markers for rubric follow the plain names.
    """
    # Function-scope import keeps this block self-contained.
    from collections.abc import Mapping

    deprecated: dict[str, list[str]] = {}

    rubric_dict = toml_config.get("rubric", {})
    if rubric_dict and isinstance(rubric_dict, Mapping):
        found = sorted(
            field for field in DEPRECATED_RUBRIC_FIELDS
            if field in rubric_dict
        )
        if "event" in rubric_dict:
            found.append("event (entire section)")
        if "outcome" in rubric_dict:
            found.append("outcome (entire section)")
        if found:
            deprecated["rubric"] = found

    judge_dict = toml_config.get("judge", {})
    if judge_dict and isinstance(judge_dict, Mapping):
        found = sorted(
            field for field in DEPRECATED_JUDGE_FIELDS
            if field in judge_dict
        )
        if found:
            deprecated["judge"] = found

        options_dict = judge_dict.get("options", {})
        if options_dict and isinstance(options_dict, Mapping):
            options_found = sorted(
                field for field in DEPRECATED_JUDGE_OPTIONS_FIELDS
                if field in options_dict
            )
            if options_found:
                deprecated["judge.options"] = options_found

    return deprecated
305
+