synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,113 @@
1
+ """Rubric scoring utilities for events and outcomes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+ from typing import Any
7
+
8
+ from .models import Criterion, Rubric
9
+
10
+
11
def _as_float(value: Any) -> float | None:
    """Coerce ``value`` to ``float`` on a best-effort basis.

    Args:
        value: Any object; it is handed to ``float()`` unchanged.

    Returns:
        The converted number, or ``None`` when ``float(value)`` raises any
        ``Exception`` (for example for ``None`` or a non-numeric string).
    """
    try:
        converted = float(value)
    except Exception:
        # Deliberately broad: callers treat every conversion failure as
        # "no score available" rather than as an error.
        return None
    else:
        return converted
17
+
18
+
19
def _score(
    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
) -> dict[str, Any]:
    """Aggregate per-criterion scores into a single rubric result.

    Args:
        criteria: Criteria to score; each one is read for its ``id``,
            ``weight`` and ``required`` attributes.
        values: Scores keyed by criterion ID. A criterion without an entry
            is scored as 0.0.
        aggregation: Aggregation mode. "sum" adds the raw scores,
            "weighted_sum" computes a weight-normalised sum, "custom" yields
            no total, and "inherit" is handled as "weighted_sum". Any other
            value leaves the total at 0.0.

    Returns:
        Dict with three keys: "aggregation" (the mode actually applied),
        "score" (the total, or ``None`` for "custom") and "per_criterion"
        (score, weight and required flag for each criterion ID).
    """
    mode = "weighted_sum" if aggregation == "inherit" else aggregation

    breakdown: dict[str, dict[str, Any]] = {}
    running = 0.0
    weight_sum = 0.0
    for item in criteria:
        item_score = values.get(item.id, 0.0)
        breakdown[item.id] = {
            "score": item_score,
            "weight": item.weight,
            "required": item.required,
        }
        if mode == "weighted_sum":
            running += item_score * item.weight
            weight_sum += item.weight
        elif mode == "sum":
            running += item_score

    final: float | None
    if mode == "custom":
        # Custom aggregation is computed elsewhere; no total is reported here.
        final = None
    elif mode == "weighted_sum" and weight_sum > 0:
        # Normalise by the total weight; skipped when the weights sum to 0.
        final = running / weight_sum
    else:
        final = running

    return {"aggregation": mode, "score": final, "per_criterion": breakdown}
58
+
59
+
60
def score_events_against_rubric(
    events: list[dict[str, Any]], rubric: Rubric | None
) -> dict[str, Any]:
    """Score evaluation events against a rubric.

    Each event names its criterion under the first truthy key among
    "criterion_id", "id" and "criterion", and carries its value under
    "score". Non-dict events, events without a criterion, and events whose
    score cannot be converted to a float are ignored. When several events
    name the same criterion, the last one wins.

    Args:
        events: Event dicts carrying scoring info; ``None`` or an empty list
            is treated as "no events".
        rubric: Rubric supplying ``criteria`` and ``aggregation``.

    Returns:
        The aggregated scoring result, or a placeholder with aggregation
        "none", a ``None`` score and an empty breakdown when ``rubric`` is
        ``None``.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    collected: dict[str, float] = {}
    for entry in events or []:
        if not isinstance(entry, dict):
            continue
        criterion_key = None
        for field_name in ("criterion_id", "id", "criterion"):
            criterion_key = entry.get(field_name)
            if criterion_key:
                break
        parsed = _as_float(entry.get("score"))
        if parsed is None or not criterion_key:
            continue
        collected[str(criterion_key)] = parsed

    return _score(rubric.criteria, collected, rubric.aggregation)
85
+
86
+
87
def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
    """Score a rollout outcome against a rubric.

    The criterion scores are read from ``outcome["criteria"]`` when that
    entry is a dict, otherwise from ``outcome`` itself. Entries whose value
    cannot be converted to a float are ignored; a non-dict ``outcome``
    contributes no scores at all.

    Args:
        outcome: Mapping of criterion IDs to scores, optionally nested under
            a "criteria" key.
        rubric: Rubric supplying ``criteria`` and ``aggregation``.

    Returns:
        The aggregated scoring result, or a placeholder with aggregation
        "none", a ``None`` score and an empty breakdown when ``rubric`` is
        ``None``.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    collected: dict[str, float] = {}
    if isinstance(outcome, dict):
        nested = outcome.get("criteria")
        score_source = nested if isinstance(nested, dict) else outcome
        collected = {
            str(name): parsed
            for name, raw in score_source.items()
            if (parsed := _as_float(raw)) is not None
        }

    return _score(rubric.criteria, collected, rubric.aggregation)
113
+
@@ -1,15 +1,32 @@
1
+ """Strict rubric validators for step-wise judges.
2
+
3
+ These validators enforce stricter constraints than the general-purpose rubrics:
4
+ - Weights must be ≤ 1.0 and sum to exactly 1.0
5
+ - Only weighted_sum aggregation is allowed
6
+ - All required fields must be non-empty
7
+
8
+ Used primarily for validation in judge configurations.
9
+ """
10
+
1
11
  from __future__ import annotations
2
12
 
3
13
  import json
4
14
  import math
15
+ from collections.abc import Iterable
5
16
  from pathlib import Path
6
- from typing import Any, Iterable, Literal
17
+ from typing import Any, Literal
7
18
 
8
19
  import pydantic
9
20
 
10
21
 
11
- class RubricCriterion(pydantic.BaseModel):
12
- """Single scoring criterion within a rubric."""
22
+ class StrictCriterion(pydantic.BaseModel):
23
+ """Single scoring criterion with strict validation.
24
+
25
+ Enforces:
26
+ - Weight ≤ 1.0 (for proper normalization)
27
+ - Weight > 0.0 (positive)
28
+ - Non-empty strings
29
+ """
13
30
 
14
31
  id: str
15
32
  description: str
@@ -35,16 +52,23 @@ class RubricCriterion(pydantic.BaseModel):
35
52
  return value
36
53
 
37
54
 
38
- class RubricSpec(pydantic.BaseModel):
39
- """High-level rubric definition used by step-wise judges."""
55
+ class StrictRubric(pydantic.BaseModel):
56
+ """Strict rubric definition for step-wise judges.
57
+
58
+ Enforces:
59
+ - Weights must sum to 1.0
60
+ - Only weighted_sum aggregation
61
+ - Non-empty version and goal_text
62
+ - At least one criterion
63
+ """
40
64
 
41
65
  version: str
42
66
  goal_text: str
43
67
  aggregation: Literal["weighted_sum"]
44
- criteria: list[RubricCriterion]
68
+ criteria: list[StrictCriterion]
45
69
 
46
70
  @pydantic.model_validator(mode="after")
47
- def _validate_weights(self) -> "RubricSpec":
71
+ def _validate_weights(self) -> StrictRubric:
48
72
  if not self.criteria:
49
73
  raise ValueError("rubric must declare at least one criterion")
50
74
  total_weight = sum(criterion.weight for criterion in self.criteria)
@@ -71,56 +95,55 @@ class RubricSpec(pydantic.BaseModel):
71
95
  return value
72
96
 
73
97
 
98
+ # Re-export pydantic's ValidationError for convenience
74
99
  ValidationError = pydantic.ValidationError
75
100
 
76
101
 
77
- def validate_rubric_dict(payload: dict[str, Any]) -> RubricSpec:
78
- """
79
- Validate an in-memory rubric payload and return the parsed model.
80
-
102
+ def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
103
+ """Validate an in-memory rubric payload with strict rules.
104
+
81
105
  Args:
82
- payload: Dictionary representing the rubric JSON.
106
+ payload: Dictionary representing the rubric JSON
107
+
83
108
  Returns:
84
- Validated RubricSpec instance.
109
+ Validated StrictRubric instance
110
+
85
111
  Raises:
86
- ValidationError: If the payload is missing required fields or contains
87
- invalid weights.
112
+ ValidationError: If payload is invalid or doesn't meet strict constraints
88
113
  """
89
-
90
114
  if not isinstance(payload, dict):
91
115
  raise TypeError("rubric payload must be a dictionary")
92
- return RubricSpec.model_validate(payload)
116
+ return StrictRubric.model_validate(payload)
93
117
 
94
118
 
95
119
  def _load_payload_from_file(path: Path) -> dict[str, Any]:
120
+ """Load JSON rubric from file."""
96
121
  if path.suffix.lower() != ".json":
97
122
  raise ValueError(f"Unsupported rubric file type: {path}")
98
123
  text = path.read_text(encoding="utf-8")
99
124
  return json.loads(text)
100
125
 
101
126
 
102
- def validate_rubric_file(path: Path) -> RubricSpec:
103
- """
104
- Load and validate a rubric file.
105
-
127
+ def validate_rubric_file(path: Path) -> StrictRubric:
128
+ """Load and validate a rubric file with strict rules.
129
+
106
130
  Args:
107
- path: Path to a JSON rubric document.
131
+ path: Path to a JSON rubric document
132
+
108
133
  Returns:
109
- Validated RubricSpec instance.
134
+ Validated StrictRubric instance
110
135
  """
111
-
112
136
  payload = _load_payload_from_file(path)
113
137
  return validate_rubric_dict(payload)
114
138
 
115
139
 
116
- def validate_rubric_files(paths: Iterable[Path]) -> list[RubricSpec]:
117
- """
118
- Validate multiple rubric files and return their parsed models.
119
-
140
+ def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
141
+ """Validate multiple rubric files with strict rules.
142
+
120
143
  Useful for bulk validation inside tests or CI checks.
121
144
  """
122
-
123
- validated: list[RubricSpec] = []
145
+ validated: list[StrictRubric] = []
124
146
  for path in paths:
125
147
  validated.append(validate_rubric_file(path))
126
148
  return validated
149
+
synth_ai/task/server.py CHANGED
@@ -70,7 +70,7 @@ class TaskAppConfig:
70
70
  provide_task_instances: InstanceProvider
71
71
  rollout: RolloutExecutor
72
72
  dataset_registry: TaskDatasetRegistry | None = None
73
- rubrics: RubricBundle = field(default_factory=RubricBundle)
73
+ rubrics: RubricBundle | None = field(default_factory=RubricBundle)
74
74
  proxy: ProxyConfig | None = None
75
75
  routers: Sequence[APIRouter] = field(default_factory=tuple)
76
76
  middleware: Sequence[Middleware] = field(default_factory=tuple)
@@ -93,7 +93,7 @@ class TaskAppConfig:
93
93
  provide_task_instances=self.provide_task_instances,
94
94
  rollout=self.rollout,
95
95
  dataset_registry=self.dataset_registry,
96
- rubrics=self.rubrics,
96
+ rubrics=self.rubrics or RubricBundle(),
97
97
  proxy=self.proxy,
98
98
  routers=tuple(self.routers),
99
99
  middleware=tuple(self.middleware),
@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]
221
221
 
222
222
  def create_task_app(config: TaskAppConfig) -> FastAPI:
223
223
  cfg = config.clone()
224
+ cfg.rubrics = cfg.rubrics or RubricBundle()
224
225
  app = FastAPI(title=cfg.name, description=cfg.description)
225
226
 
226
227
  for key, value in cfg.app_state.items():
@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
310
311
  async def info() -> Mapping[str, Any]:
311
312
  dataset_meta = cfg.base_task_info.dataset
312
313
  rubrics: dict[str, Any] | None = None
313
- if cfg.rubrics.outcome or cfg.rubrics.events:
314
+ rubric_bundle = cfg.rubrics
315
+ if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
314
316
  rubrics = {
315
- "outcome": cfg.rubrics.outcome.model_dump() if cfg.rubrics.outcome else None,
316
- "events": cfg.rubrics.events.model_dump() if cfg.rubrics.events else None,
317
+ "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
318
+ "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
317
319
  }
318
320
  payload = {
319
321
  "service": {
320
322
  "task": cfg.base_task_info.task,
321
- "version": cfg.base_task_info.task.get("version"),
323
+ "version": cfg.base_task_info.task.version,
322
324
  },
323
325
  "dataset": dataset_meta,
324
326
  "rubrics": rubrics,
325
327
  "inference": cfg.base_task_info.inference,
326
- "capabilities": cfg.base_task_info.capabilities,
327
328
  "limits": cfg.base_task_info.limits,
328
329
  }
329
330
  return to_jsonable(payload)
@@ -1,11 +1,274 @@
1
+ """Task app validation utilities."""
2
+
1
3
  from __future__ import annotations
2
4
 
3
- from urllib.parse import urlparse
5
+ import re
6
+ from typing import Any
7
+
8
+ import click
9
+ import httpx
10
+
11
+ from synth_ai.task.contracts import TaskAppEndpoints # type: ignore[attr-defined]
12
+
13
+
14
+ def validate_task_app_url(url: str | None) -> str:
15
+ """Validate and normalize a task app URL.
16
+
17
+ Args:
18
+ url: URL to validate
19
+
20
+ Returns:
21
+ Normalized URL
22
+
23
+ Raises:
24
+ ValueError: If URL is invalid
25
+ """
26
+ if not url:
27
+ raise ValueError("Task app URL is required")
28
+
29
+ url = url.strip().rstrip("/")
30
+
31
+ # Basic URL validation
32
+ url_pattern = re.compile(
33
+ r"^https?://" # http:// or https://
34
+ r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain...
35
+ r"localhost|" # localhost...
36
+ r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
37
+ r"(?::\d+)?" # optional port
38
+ r"(?:/?|[/?]\S+)$",
39
+ re.IGNORECASE,
40
+ )
41
+
42
+ if not url_pattern.match(url):
43
+ raise ValueError(f"Invalid task app URL: {url}")
44
+
45
+ return url
46
+
47
+
48
+ def _print_success(msg: str) -> None:
49
+ """Print success message in green."""
50
+ click.echo(click.style(f"✓ {msg}", fg="green"))
51
+
52
+
53
+ def _print_error(msg: str) -> None:
54
+ """Print error message in red."""
55
+ click.echo(click.style(f"✗ {msg}", fg="red"), err=True)
56
+
57
+
58
+ def _print_warning(msg: str) -> None:
59
+ """Print warning message in yellow."""
60
+ click.echo(click.style(f"⚠ {msg}", fg="yellow"))
61
+
4
62
 
63
+ def _print_info(msg: str) -> None:
64
+ """Print info message."""
65
+ click.echo(f" {msg}")
5
66
 
6
- def validate_task_app_url(url: str, *, name: str = "TASK_APP_BASE_URL") -> None:
7
- """Validate a Task App base URL (scheme + host present)."""
8
67
 
9
- p = urlparse(url)
10
- if p.scheme not in ("http", "https") or not p.netloc:
11
- raise ValueError(f"Invalid {name}: malformed: {url}")
68
+ async def validate_task_app_endpoint(
69
+ url: str,
70
+ api_key: str | None = None,
71
+ min_instances: int = 10,
72
+ verbose: bool = False,
73
+ ) -> tuple[bool, dict[str, Any]]:
74
+ """Validate a task app deployment.
75
+
76
+ Returns:
77
+ (success: bool, results: dict)
78
+ """
79
+ results: dict[str, Any] = {
80
+ "url": url,
81
+ "endpoints": {},
82
+ "auth": {},
83
+ "task_instances": {},
84
+ "overall": False,
85
+ }
86
+
87
+ all_passed = True
88
+ endpoints = TaskAppEndpoints()
89
+
90
+ # Set up headers
91
+ headers = {}
92
+ if api_key:
93
+ headers["X-API-Key"] = api_key
94
+
95
+ click.echo(f"\n{'='*60}")
96
+ click.echo(f"Validating Task App: {url}")
97
+ click.echo(f"{'='*60}\n")
98
+
99
+ async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
100
+ # 1. Check root endpoint
101
+ click.echo("1. Checking root endpoint...")
102
+ try:
103
+ resp = await client.get(f"{url}{endpoints.root}")
104
+ if resp.status_code == 200:
105
+ data = resp.json()
106
+ _print_success(f"Root endpoint responds (status: {data.get('status')})")
107
+ results["endpoints"]["root"] = {"passed": True, "data": data}
108
+ if verbose:
109
+ _print_info(f"Service: {data.get('service', 'N/A')}")
110
+ else:
111
+ _print_error(f"Root endpoint returned {resp.status_code}")
112
+ results["endpoints"]["root"] = {"passed": False, "status": resp.status_code}
113
+ all_passed = False
114
+ except Exception as e:
115
+ _print_error(f"Root endpoint failed: {e}")
116
+ results["endpoints"]["root"] = {"passed": False, "error": str(e)}
117
+ all_passed = False
118
+
119
+ # 2. Check health endpoint
120
+ click.echo("\n2. Checking health endpoint...")
121
+ try:
122
+ resp = await client.get(f"{url}{endpoints.health}", headers=headers)
123
+ if resp.status_code == 200:
124
+ data = resp.json()
125
+ _print_success(f"Health endpoint responds (healthy: {data.get('healthy')})")
126
+ results["endpoints"]["health"] = {"passed": True, "data": data}
127
+
128
+ # Check auth configuration
129
+ auth_info = data.get("auth", {})
130
+ if auth_info.get("required"):
131
+ _print_info(f"Auth required: {auth_info.get('required')}")
132
+ _print_info(f"Expected key prefix: {auth_info.get('expected_prefix', 'N/A')}")
133
+
134
+ if api_key:
135
+ _print_success("API key provided and accepted")
136
+ results["auth"]["provided"] = True
137
+ results["auth"]["accepted"] = True
138
+ else:
139
+ _print_warning("No API key provided but may be required")
140
+ results["auth"]["provided"] = False
141
+ results["auth"]["required"] = True
142
+ else:
143
+ _print_error(f"Health endpoint returned {resp.status_code}")
144
+ results["endpoints"]["health"] = {"passed": False, "status": resp.status_code}
145
+ all_passed = False
146
+
147
+ if resp.status_code == 403:
148
+ _print_error("Authentication failed - provide API key with --api-key")
149
+ results["auth"]["error"] = "Authentication failed"
150
+
151
+ except Exception as e:
152
+ _print_error(f"Health endpoint failed: {e}")
153
+ results["endpoints"]["health"] = {"passed": False, "error": str(e)}
154
+ all_passed = False
155
+
156
+ # 3. Check info endpoint
157
+ click.echo("\n3. Checking info endpoint...")
158
+ try:
159
+ resp = await client.get(f"{url}{endpoints.info}", headers=headers)
160
+ if resp.status_code == 200:
161
+ data = resp.json()
162
+ _print_success("Info endpoint responds")
163
+ results["endpoints"]["info"] = {"passed": True, "data": data}
164
+
165
+ if verbose:
166
+ service = data.get("service", {})
167
+ task_info = service.get("task", {})
168
+ if isinstance(task_info, dict):
169
+ _print_info(f"Task: {task_info.get('name', 'N/A')}")
170
+ _print_info(f"Version: {service.get('version', 'N/A')}")
171
+
172
+ dataset = data.get("dataset", {})
173
+ if isinstance(dataset, dict):
174
+ _print_info(f"Dataset: {dataset.get('id', 'N/A')}")
175
+ else:
176
+ _print_error(f"Info endpoint returned {resp.status_code}")
177
+ results["endpoints"]["info"] = {"passed": False, "status": resp.status_code}
178
+ all_passed = False
179
+ except Exception as e:
180
+ _print_error(f"Info endpoint failed: {e}")
181
+ results["endpoints"]["info"] = {"passed": False, "error": str(e)}
182
+ all_passed = False
183
+
184
+ # 4. Check task_info endpoint and instance count
185
+ click.echo("\n4. Checking task_info endpoint and instance availability...")
186
+ try:
187
+ # Get taskset descriptor first
188
+ resp = await client.get(f"{url}{endpoints.task_info}", headers=headers)
189
+ if resp.status_code == 200:
190
+ data = resp.json()
191
+ _print_success("Task info endpoint responds")
192
+ results["endpoints"]["task_info"] = {"passed": True}
193
+
194
+ taskset = data.get("taskset", {})
195
+ if verbose and taskset:
196
+ if isinstance(taskset, dict):
197
+ _print_info(f"Taskset: {taskset.get('id', 'N/A')}")
198
+ else:
199
+ _print_info(f"Taskset: {taskset}")
200
+
201
+ # Try to get specific task instances (seeds 0-19)
202
+ # Fetch instances one by one to verify we can get at least min_instances
203
+ instances = []
204
+ for seed in range(min_instances + 5): # Try a few extra
205
+ try:
206
+ resp_seed = await client.get(
207
+ f"{url}{endpoints.task_info}",
208
+ params={"seed": seed},
209
+ headers=headers,
210
+ )
211
+ if resp_seed.status_code == 200:
212
+ instance = resp_seed.json()
213
+ instances.append(instance)
214
+ else:
215
+ break # Stop if we hit an invalid seed
216
+ except Exception:
217
+ break
218
+
219
+ instance_count = len(instances)
220
+ results["task_instances"]["count"] = instance_count
221
+ results["task_instances"]["requested"] = min_instances
222
+
223
+ if instance_count >= min_instances:
224
+ _print_success(f"Found {instance_count} task instances (≥ {min_instances} required)")
225
+ results["task_instances"]["passed"] = True
226
+
227
+ if verbose and instances:
228
+ sample = instances[0]
229
+ task_info_sample = sample.get('task', {})
230
+ if isinstance(task_info_sample, dict):
231
+ _print_info(f"Sample task: {task_info_sample.get('name', 'N/A')}")
232
+ _print_info(f"Environment: {sample.get('environment', 'N/A')}")
233
+ else:
234
+ _print_error(f"Only {instance_count} task instances available (need ≥ {min_instances})")
235
+ results["task_instances"]["passed"] = False
236
+ all_passed = False
237
+ else:
238
+ _print_error(f"Task info endpoint returned {resp.status_code}")
239
+ results["endpoints"]["task_info"] = {"passed": False, "status": resp.status_code}
240
+ all_passed = False
241
+ except Exception as e:
242
+ _print_error(f"Task info endpoint failed: {e}")
243
+ results["endpoints"]["task_info"] = {"passed": False, "error": str(e)}
244
+ results["task_instances"]["passed"] = False
245
+ all_passed = False
246
+
247
+ # 5. Check rollout endpoint structure (don't actually run a rollout)
248
+ click.echo("\n5. Checking rollout endpoint availability...")
249
+ try:
250
+ # Just check if it's registered (OPTIONS or a lightweight probe)
251
+ resp = await client.options(f"{url}{endpoints.rollout}", headers=headers)
252
+ # Many servers return 200 for OPTIONS, some return 405
253
+ if resp.status_code in (200, 204, 405):
254
+ _print_success("Rollout endpoint is registered")
255
+ results["endpoints"]["rollout"] = {"passed": True}
256
+ else:
257
+ _print_warning(f"Rollout endpoint returned unexpected status: {resp.status_code}")
258
+ results["endpoints"]["rollout"] = {"passed": True, "note": "endpoint exists"}
259
+ except Exception as e:
260
+ # OPTIONS might not be supported, that's okay
261
+ _print_info(f"Rollout endpoint check skipped (OPTIONS not supported): {e}")
262
+ results["endpoints"]["rollout"] = {"passed": True, "note": "assumed present"}
263
+
264
+ # Summary
265
+ click.echo(f"\n{'='*60}")
266
+ if all_passed:
267
+ _print_success("All validations passed!")
268
+ click.echo(f"{'='*60}\n")
269
+ else:
270
+ _print_error("Some validations failed. See errors above.")
271
+ click.echo(f"{'='*60}\n")
272
+
273
+ results["overall"] = all_passed
274
+ return all_passed, results
@@ -37,10 +37,14 @@ from .utils import calculate_cost, detect_provider
37
37
  # Context variables for session and turn tracking
38
38
  # These variables automatically propagate across async call boundaries,
39
39
  # allowing deeply nested code to access tracing context without explicit passing
40
- _session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar("session_id")
41
- _turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar("turn_number")
40
+ _session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar(
41
+ "session_id"
42
+ )
43
+ _turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar(
44
+ "turn_number"
45
+ )
42
46
  _session_tracer_ctx: contextvars.ContextVar[Any | None] = contextvars.ContextVar(
43
- "session_tracer", default=None
47
+ "session_tracer"
44
48
  )
45
49
 
46
50
 
@@ -25,15 +25,15 @@ application to continue without blocking on sync operations.
25
25
  """
26
26
 
27
27
  import asyncio
28
+ import importlib
28
29
  import logging
29
- from typing import Any
30
-
31
- import libsql
30
+ from typing import Any, cast
32
31
 
33
32
  from .config import CONFIG
34
33
 
35
34
  logger = logging.getLogger(__name__)
36
35
 
36
+ libsql = cast(Any, importlib.import_module("libsql"))
37
37
 
38
38
  class ReplicaSync:
39
39
  """Manages synchronization of embedded SQLite replica with remote Turso database.
@@ -53,7 +53,7 @@ class ReplicaSync:
53
53
  db_path: str = "embedded.db",
54
54
  sync_url: str | None = None,
55
55
  auth_token: str | None = None,
56
- sync_interval: int | None = None,
56
+ sync_interval: float | None = None,
57
57
  ):
58
58
  """Initialize replica sync manager.
59
59
 
@@ -55,11 +55,11 @@ def normalize_for_json(value: Any) -> Any:
55
55
  return {str(k): normalize_for_json(v) for k, v in value.items()}
56
56
 
57
57
  # Sequences
58
- if isinstance(value, (list, tuple, set)):
58
+ if isinstance(value, list | tuple | set):
59
59
  return [normalize_for_json(v) for v in value]
60
60
 
61
61
  # Datetime / Date
62
- if isinstance(value, (datetime, date)):
62
+ if isinstance(value, datetime | date):
63
63
  return value.isoformat()
64
64
 
65
65
  # Decimal
@@ -73,7 +73,7 @@ def normalize_for_json(value: Any) -> Any:
73
73
  return str(value)
74
74
 
75
75
  # Bytes-like
76
- if isinstance(value, (bytes, bytearray)):
76
+ if isinstance(value, bytes | bytearray):
77
77
  return base64.b64encode(bytes(value)).decode("ascii")
78
78
 
79
79
  # Enum
@@ -82,9 +82,9 @@ def normalize_for_json(value: Any) -> Any:
82
82
 
83
83
  # Numpy scalars / arrays
84
84
  if _np is not None:
85
- if isinstance(value, (_np.generic,)): # type: ignore[attr-defined]
85
+ if isinstance(value, _np.generic): # type: ignore[attr-defined]
86
86
  return normalize_for_json(value.item())
87
- if isinstance(value, (_np.ndarray,)):
87
+ if isinstance(value, _np.ndarray):
88
88
  return normalize_for_json(value.tolist())
89
89
 
90
90
  # Floats: sanitize NaN / Infinity to None