synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (229) hide show
  1. examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
  2. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +186 -0
  3. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
  4. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
  5. examples/multi_step/crafter_rl_lora.md +51 -10
  6. examples/multi_step/sse_metrics_streaming_notes.md +357 -0
  7. examples/multi_step/task_app_config_notes.md +7 -1
  8. examples/swe/task_app/grpo_swe_mini.py +55 -26
  9. examples/swe/task_app/hosted/rollout.py +40 -0
  10. examples/swe/task_app/hosted/test_service.py +5 -6
  11. examples/task_apps/TESTING.md +275 -0
  12. examples/task_apps/__init__.py +0 -0
  13. examples/task_apps/crafter/__init__.py +0 -0
  14. examples/task_apps/crafter/task_app/__init__.py +2 -0
  15. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +21 -46
  16. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  17. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  18. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
  19. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +67 -49
  20. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +242 -193
  21. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  22. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  71. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  72. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  73. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  74. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  75. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  76. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  77. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  78. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  79. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  80. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  81. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  82. examples/task_apps/enron/__init__.py +1 -0
  83. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  84. examples/task_apps/enron/task_app/README.md +14 -0
  85. examples/task_apps/enron/task_app/__init__.py +1 -0
  86. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  87. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  88. examples/task_apps/enron/tests/__init__.py +2 -0
  89. examples/task_apps/enron/tests/conftest.py +115 -0
  90. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  91. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  92. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  93. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  94. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  95. examples/task_apps/math/__init__.py +0 -0
  96. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  97. examples/task_apps/pokemon_battle/__init__.py +2 -0
  98. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  99. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  100. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  101. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  102. examples/task_apps/pokemon_red/README.md +357 -0
  103. examples/task_apps/pokemon_red/__init__.py +3 -0
  104. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  105. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  106. examples/task_apps/pokemon_red/task_app.py +606 -0
  107. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  108. examples/task_apps/sokoban/README.md +307 -0
  109. examples/task_apps/sokoban/__init__.py +3 -0
  110. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  111. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  112. examples/task_apps/sokoban/task_app.py +1058 -0
  113. examples/task_apps/sokoban/tests/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/conftest.py +113 -0
  115. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  116. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  117. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  118. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  119. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  120. examples/task_apps/verilog/__init__.py +1 -0
  121. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  122. examples/task_apps/verilog/task_app/README.md +12 -0
  123. examples/task_apps/verilog/task_app/__init__.py +1 -0
  124. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  125. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  126. examples/task_apps/verilog/tests/__init__.py +2 -0
  127. examples/task_apps/verilog/tests/conftest.py +115 -0
  128. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  129. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  130. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  131. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  132. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  133. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  134. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  135. examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +4 -2
  136. examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +4 -2
  137. examples/warming_up_to_rl/run_eval.py +127 -18
  138. examples/workflows/__init__.py +0 -0
  139. examples/workflows/math_rl/__init__.py +0 -0
  140. examples/workflows/math_rl/download_dataset.py +80 -0
  141. synth_ai/__init__.py +41 -1
  142. synth_ai/api/train/builders.py +73 -29
  143. synth_ai/api/train/cli.py +12 -6
  144. synth_ai/api/train/configs/__init__.py +44 -0
  145. synth_ai/api/train/configs/rl.py +134 -0
  146. synth_ai/api/train/configs/sft.py +95 -0
  147. synth_ai/api/train/configs/shared.py +24 -0
  148. synth_ai/api/train/env_resolver.py +5 -2
  149. synth_ai/api/train/supported_algos.py +10 -5
  150. synth_ai/api/train/utils.py +7 -4
  151. synth_ai/cli/__init__.py +7 -51
  152. synth_ai/cli/_storage.py +4 -3
  153. synth_ai/cli/_validate_task_app.py +11 -0
  154. synth_ai/cli/balance.py +4 -3
  155. synth_ai/cli/calc.py +2 -2
  156. synth_ai/cli/demo.py +49 -43
  157. synth_ai/cli/legacy_root_backup.py +1 -1
  158. synth_ai/cli/rl_demo.py +86 -106
  159. synth_ai/cli/root.py +0 -97
  160. synth_ai/cli/task_apps.py +1710 -186
  161. synth_ai/demos/core/cli.py +121 -159
  162. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  163. synth_ai/environments/examples/crafter_classic/environment.py +16 -0
  164. synth_ai/environments/examples/enron/engine.py +7 -2
  165. synth_ai/environments/examples/enron/environment.py +68 -0
  166. synth_ai/environments/examples/red/engine.py +27 -0
  167. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  168. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  169. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  170. synth_ai/environments/examples/red/environment.py +60 -0
  171. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  172. synth_ai/environments/examples/verilog/engine.py +30 -4
  173. synth_ai/evals/__init__.py +15 -0
  174. synth_ai/evals/client.py +82 -0
  175. synth_ai/evals/types.py +42 -0
  176. synth_ai/jobs/client.py +16 -4
  177. synth_ai/judge_schemas.py +127 -0
  178. synth_ai/py.typed +0 -0
  179. synth_ai/task/__init__.py +14 -5
  180. synth_ai/task/contracts.py +124 -38
  181. synth_ai/task/proxy.py +48 -56
  182. synth_ai/task/rubrics/__init__.py +53 -0
  183. synth_ai/task/rubrics/loaders.py +133 -0
  184. synth_ai/task/rubrics/models.py +57 -0
  185. synth_ai/task/rubrics/scoring.py +113 -0
  186. synth_ai/task/rubrics/strict.py +149 -0
  187. synth_ai/task/server.py +8 -7
  188. synth_ai/task/validators.py +269 -6
  189. synth_ai/tracing_v3/decorators.py +7 -3
  190. synth_ai/tracing_v3/replica_sync.py +4 -4
  191. synth_ai/tracing_v3/serialization.py +130 -0
  192. synth_ai/tracing_v3/trace_utils.py +317 -0
  193. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  194. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  195. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +228 -89
  196. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -1
  197. synth_ai/task/rubrics.py +0 -219
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  214. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  215. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  216. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  217. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  218. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  219. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  222. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  223. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  224. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  225. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  226. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  227. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  228. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  229. {synth_ai-0.2.12.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,113 @@
1
+ """Rubric scoring utilities for events and outcomes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable
6
+ from typing import Any
7
+
8
+ from .models import Criterion, Rubric
9
+
10
+
11
def _as_float(value: Any) -> float | None:
    """Coerce ``value`` to a float, or return ``None`` when that is impossible.

    Any exception raised by ``float()`` is treated as "not a number", so
    callers can hand over loosely typed payload values without pre-checking.
    """
    try:
        converted = float(value)
    except Exception:
        return None
    return converted
17
+
18
+
19
def _score(
    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
) -> dict[str, Any]:
    """Aggregate per-criterion scores into a single result.

    Args:
        criteria: Criteria to score; each must expose ``id``, ``weight`` and
            ``required``.
        values: Map of criterion IDs to scores. Criteria without an entry are
            scored as 0.0.
        aggregation: One of "sum", "weighted_sum", "custom" or "inherit".
            "inherit" is handled as "weighted_sum".

    Returns:
        Dict with the effective aggregation method ("aggregation"), the total
        ("score") and the per-criterion breakdown ("per_criterion"). The total
        is ``None`` for "custom", and stays 0.0 for any other aggregation name
        this function does not recognise.
    """
    mode = "weighted_sum" if aggregation == "inherit" else aggregation
    breakdown: dict[str, dict[str, Any]] = {}
    running = 0.0
    weight_sum = 0.0
    for item in criteria:
        item_score = values.get(item.id, 0.0)
        breakdown[item.id] = {
            "score": item_score,
            "weight": item.weight,
            "required": item.required,
        }
        if mode == "weighted_sum":
            running += item_score * item.weight
            weight_sum += item.weight
        elif mode == "sum":
            running += item_score

    result: float | None
    if mode == "custom":
        # Custom aggregation is computed elsewhere; no total is reported here.
        result = None
    elif mode == "weighted_sum" and weight_sum > 0:
        # Normalise by the total weight so the result is a weighted average.
        result = running / weight_sum
    else:
        result = running
    return {"aggregation": mode, "score": result, "per_criterion": breakdown}
58
+
59
+
60
def score_events_against_rubric(
    events: list[dict[str, Any]], rubric: Rubric | None
) -> dict[str, Any]:
    """Score evaluation events against a rubric.

    Each event is expected to carry a criterion identifier under
    ``criterion_id``, ``id`` or ``criterion`` (checked in that order) and a
    numeric ``score``. Non-dict events, events without an identifier and
    events whose score cannot be converted to float are ignored. When several
    events name the same criterion, the last one wins.

    Args:
        events: Event dicts with scoring info (``None`` is treated as empty).
        rubric: Rubric defining criteria and aggregation.

    Returns:
        Scoring result with total and per-criterion scores; a placeholder
        result with aggregation "none" when ``rubric`` is ``None``.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    collected: dict[str, float] = {}
    for entry in events or []:
        if not isinstance(entry, dict):
            continue
        identifier = entry.get("criterion_id") or entry.get("id") or entry.get("criterion")
        numeric = _as_float(entry.get("score"))
        if not identifier or numeric is None:
            continue
        collected[str(identifier)] = numeric
    return _score(rubric.criteria, collected, rubric.aggregation)
85
+
86
+
87
def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
    """Score a rollout outcome against a rubric.

    The outcome maps criterion IDs to scores. If it has a ``"criteria"`` key
    holding a dict, that nested dict is used instead of the outcome itself.
    Values that cannot be converted to float are skipped, and a non-dict
    outcome contributes no scores.

    Args:
        outcome: Outcome dict with criterion scores.
        rubric: Rubric defining criteria and aggregation.

    Returns:
        Scoring result with total and per-criterion scores; a placeholder
        result with aggregation "none" when ``rubric`` is ``None``.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}

    collected: dict[str, float] = {}
    if isinstance(outcome, dict):
        nested = outcome.get("criteria")
        source = nested if isinstance(nested, dict) else outcome
        for name, raw in source.items():
            numeric = _as_float(raw)
            if numeric is not None:
                collected[str(name)] = numeric
    return _score(rubric.criteria, collected, rubric.aggregation)
113
+
@@ -0,0 +1,149 @@
1
+ """Strict rubric validators for step-wise judges.
2
+
3
+ These validators enforce stricter constraints than the general-purpose rubrics:
4
+ - Weights must be in (0, 1] and sum to 1.0 (within a 1e-6 tolerance)
5
+ - Only weighted_sum aggregation is allowed
6
+ - All required fields must be non-empty
7
+
8
+ Used primarily for validation in judge configurations.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import math
15
+ from collections.abc import Iterable
16
+ from pathlib import Path
17
+ from typing import Any, Literal
18
+
19
+ import pydantic
20
+
21
+
22
class StrictCriterion(pydantic.BaseModel):
    """Single scoring criterion with strict validation.

    Enforces:
    - Weight is finite, > 0.0 and <= 1.0 (for proper normalization)
    - ``id`` and ``description`` are non-empty after stripping whitespace
    """

    id: str
    description: str
    weight: float
    scale: str | None = None

    @pydantic.field_validator("weight")
    @classmethod
    def _validate_weight(cls, value: float) -> float:
        """Reject NaN/inf, non-positive weights and weights above 1.0."""
        if not math.isfinite(value):
            raise ValueError("weight must be a finite number")
        if value <= 0.0:
            raise ValueError("weight must be positive")
        if value > 1.0:
            raise ValueError("weight must be <= 1.0")
        return value

    @pydantic.field_validator("id", "description", mode="before")
    @classmethod
    def _strip_string(cls, value: Any) -> Any:
        """Strip surrounding whitespace and reject strings that end up empty.

        Runs before pydantic's own type validation, so non-string input is
        passed through untouched for the normal ``str`` check to report.
        """
        if not isinstance(value, str):
            return value
        stripped = value.strip()
        if not stripped:
            # Previously an empty/whitespace-only value was silently accepted,
            # contradicting the documented "non-empty strings" guarantee.
            raise ValueError("value must not be empty")
        return stripped
53
+
54
+
55
class StrictRubric(pydantic.BaseModel):
    """Strict rubric definition for step-wise judges.

    Enforces:
    - ``version`` and ``goal_text`` are non-empty after stripping whitespace
    - ``aggregation`` is the literal "weighted_sum"
    - At least one criterion is declared
    - Criterion weights sum to 1.0 (within a 1e-6 tolerance)
    """

    version: str
    goal_text: str
    aggregation: Literal["weighted_sum"]
    criteria: list[StrictCriterion]

    @pydantic.field_validator("version")
    @classmethod
    def _non_empty_version(cls, value: str) -> str:
        """Return the stripped version string, rejecting an empty result."""
        cleaned = value.strip()
        if cleaned:
            return cleaned
        raise ValueError("version string must not be empty")

    @pydantic.field_validator("goal_text")
    @classmethod
    def _non_empty_goal_text(cls, value: str) -> str:
        """Return the stripped goal text, rejecting an empty result."""
        cleaned = value.strip()
        if cleaned:
            return cleaned
        raise ValueError("goal_text must not be empty")

    @pydantic.model_validator(mode="after")
    def _validate_weights(self) -> StrictRubric:
        """Require at least one criterion and a total weight of 1.0."""
        if not self.criteria:
            raise ValueError("rubric must declare at least one criterion")
        weight_sum = sum(item.weight for item in self.criteria)
        # Tolerant comparison: float weights such as 0.1 + 0.2 + 0.7 rarely
        # add up to exactly 1.0.
        weights_ok = math.isclose(weight_sum, 1.0, abs_tol=1e-6, rel_tol=1e-6)
        if not weights_ok:
            raise ValueError(
                f"criterion weights must sum to 1 (got {weight_sum:.6f})"
            )
        return self
96
+
97
+
98
+ # Re-export pydantic's ValidationError for convenience
99
+ ValidationError = pydantic.ValidationError
100
+
101
+
102
def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
    """Validate an in-memory rubric payload with strict rules.

    Args:
        payload: Dictionary representing the rubric JSON.

    Returns:
        Validated StrictRubric instance.

    Raises:
        TypeError: If ``payload`` is not a dictionary.
        ValidationError: If the payload doesn't meet the strict constraints.
    """
    if isinstance(payload, dict):
        return StrictRubric.model_validate(payload)
    raise TypeError("rubric payload must be a dictionary")
117
+
118
+
119
def _load_payload_from_file(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 and parse it as JSON.

    Only files with a ``.json`` suffix (case-insensitive) are accepted; any
    other suffix raises ``ValueError``. The parsed value is returned as-is.
    """
    is_json = path.suffix.lower() == ".json"
    if not is_json:
        raise ValueError(f"Unsupported rubric file type: {path}")
    return json.loads(path.read_text(encoding="utf-8"))
125
+
126
+
127
def validate_rubric_file(path: Path) -> StrictRubric:
    """Load a JSON rubric document and validate it with strict rules.

    Args:
        path: Path to a JSON rubric document.

    Returns:
        Validated StrictRubric instance.
    """
    return validate_rubric_dict(_load_payload_from_file(path))
138
+
139
+
140
def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
    """Validate several rubric files with strict rules, preserving order.

    Handy for bulk validation inside tests or CI checks. The first invalid
    file aborts the run by propagating its exception.
    """
    return [validate_rubric_file(rubric_path) for rubric_path in paths]
149
+
synth_ai/task/server.py CHANGED
@@ -70,7 +70,7 @@ class TaskAppConfig:
70
70
  provide_task_instances: InstanceProvider
71
71
  rollout: RolloutExecutor
72
72
  dataset_registry: TaskDatasetRegistry | None = None
73
- rubrics: RubricBundle = field(default_factory=RubricBundle)
73
+ rubrics: RubricBundle | None = field(default_factory=RubricBundle)
74
74
  proxy: ProxyConfig | None = None
75
75
  routers: Sequence[APIRouter] = field(default_factory=tuple)
76
76
  middleware: Sequence[Middleware] = field(default_factory=tuple)
@@ -93,7 +93,7 @@ class TaskAppConfig:
93
93
  provide_task_instances=self.provide_task_instances,
94
94
  rollout=self.rollout,
95
95
  dataset_registry=self.dataset_registry,
96
- rubrics=self.rubrics,
96
+ rubrics=self.rubrics or RubricBundle(),
97
97
  proxy=self.proxy,
98
98
  routers=tuple(self.routers),
99
99
  middleware=tuple(self.middleware),
@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]
221
221
 
222
222
  def create_task_app(config: TaskAppConfig) -> FastAPI:
223
223
  cfg = config.clone()
224
+ cfg.rubrics = cfg.rubrics or RubricBundle()
224
225
  app = FastAPI(title=cfg.name, description=cfg.description)
225
226
 
226
227
  for key, value in cfg.app_state.items():
@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
310
311
  async def info() -> Mapping[str, Any]:
311
312
  dataset_meta = cfg.base_task_info.dataset
312
313
  rubrics: dict[str, Any] | None = None
313
- if cfg.rubrics.outcome or cfg.rubrics.events:
314
+ rubric_bundle = cfg.rubrics
315
+ if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
314
316
  rubrics = {
315
- "outcome": cfg.rubrics.outcome.model_dump() if cfg.rubrics.outcome else None,
316
- "events": cfg.rubrics.events.model_dump() if cfg.rubrics.events else None,
317
+ "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
318
+ "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
317
319
  }
318
320
  payload = {
319
321
  "service": {
320
322
  "task": cfg.base_task_info.task,
321
- "version": cfg.base_task_info.task.get("version"),
323
+ "version": cfg.base_task_info.task.version,
322
324
  },
323
325
  "dataset": dataset_meta,
324
326
  "rubrics": rubrics,
325
327
  "inference": cfg.base_task_info.inference,
326
- "capabilities": cfg.base_task_info.capabilities,
327
328
  "limits": cfg.base_task_info.limits,
328
329
  }
329
330
  return to_jsonable(payload)
@@ -1,11 +1,274 @@
1
+ """Task app validation utilities."""
2
+
1
3
  from __future__ import annotations
2
4
 
3
- from urllib.parse import urlparse
5
+ import re
6
+ from typing import Any
7
+
8
+ import click
9
+ import httpx
10
+
11
+ from synth_ai.task.contracts import TaskAppEndpoints # type: ignore[attr-defined]
12
+
13
+
14
# Compiled once at import time instead of on every call.
# Accepts http(s) URLs whose host is a dotted domain name, ``localhost`` or a
# dotted-quad IPv4 address, with an optional port and optional path/query.
_TASK_APP_URL_RE = re.compile(
    r"^https?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+"  # one or more labels
    r"(?:[A-Z]{2,63}|XN--[A-Z0-9-]{1,59})\.?|"  # TLD (alphabetic or punycode)
    r"localhost|"  # localhost...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)


def validate_task_app_url(url: str | None) -> str:
    """Validate and normalize a task app URL.

    Surrounding whitespace and trailing slashes are removed before the URL is
    checked, and the cleaned-up value is what gets returned.

    Args:
        url: URL to validate. ``None``, empty and whitespace-only values are
            treated as missing.

    Returns:
        The normalized URL (stripped, without trailing ``/``).

    Raises:
        ValueError: If the URL is missing or does not look like an
            ``http``/``https`` URL with a domain, ``localhost`` or IPv4 host.
    """
    # Normalize first so that whitespace-only input is reported as "required"
    # rather than as an (empty-looking) invalid URL.
    normalized = url.strip().rstrip("/") if url else ""
    if not normalized:
        raise ValueError("Task app URL is required")

    if not _TASK_APP_URL_RE.match(normalized):
        raise ValueError(f"Invalid task app URL: {normalized}")

    return normalized
46
+
47
+
48
def _print_success(msg: str) -> None:
    """Echo ``msg`` styled green and prefixed with a check mark."""
    styled = click.style(f"✓ {msg}", fg="green")
    click.echo(styled)
51
+
52
+
53
def _print_error(msg: str) -> None:
    """Echo ``msg`` styled red and prefixed with a cross, using ``err=True``."""
    styled = click.style(f"✗ {msg}", fg="red")
    click.echo(styled, err=True)
56
+
57
+
58
def _print_warning(msg: str) -> None:
    """Echo ``msg`` styled yellow and prefixed with a warning sign."""
    styled = click.style(f"⚠ {msg}", fg="yellow")
    click.echo(styled)
61
+
4
62
 
63
def _print_info(msg: str) -> None:
    """Echo ``msg`` unstyled, with a leading space as indentation."""
    line = f" {msg}"
    click.echo(line)
5
66
 
6
- def validate_task_app_url(url: str, *, name: str = "TASK_APP_BASE_URL") -> None:
7
- """Validate a Task App base URL (scheme + host present)."""
8
67
 
9
- p = urlparse(url)
10
- if p.scheme not in ("http", "https") or not p.netloc:
11
- raise ValueError(f"Invalid {name}: malformed: {url}")
68
async def validate_task_app_endpoint(
    url: str,
    api_key: str | None = None,
    min_instances: int = 10,
    verbose: bool = False,
) -> tuple[bool, dict[str, Any]]:
    """Validate a task app deployment by probing its HTTP endpoints.

    Runs five checks in order (root, health, info, task_info plus per-seed
    instance availability, rollout registration), echoing progress through
    ``click`` and collecting a machine-readable report.

    Args:
        url: Base URL of the task app; endpoint paths from
            ``TaskAppEndpoints`` are appended to it verbatim.
        api_key: Optional key sent as the ``X-API-Key`` header on every
            request except the root check.
        min_instances: Minimum number of seeds that must return a 200 from
            the task_info endpoint for the instance check to pass.
        verbose: When true, print extra details extracted from responses.

    Returns:
        (success: bool, results: dict). ``results`` has the keys ``url``,
        ``endpoints``, ``auth``, ``task_instances`` and ``overall``;
        ``overall`` mirrors ``success``.
    """
    results: dict[str, Any] = {
        "url": url,
        "endpoints": {},
        "auth": {},
        "task_instances": {},
        "overall": False,
    }

    all_passed = True
    endpoints = TaskAppEndpoints()

    # Set up headers
    headers: dict[str, str] = {}
    if api_key:
        headers["X-API-Key"] = api_key

    click.echo(f"\n{'='*60}")
    click.echo(f"Validating Task App: {url}")
    click.echo(f"{'='*60}\n")

    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
        # 1. Check root endpoint (sent without the API key header)
        click.echo("1. Checking root endpoint...")
        try:
            resp = await client.get(f"{url}{endpoints.root}")
            if resp.status_code == 200:
                data = resp.json()
                _print_success(f"Root endpoint responds (status: {data.get('status')})")
                results["endpoints"]["root"] = {"passed": True, "data": data}
                if verbose:
                    _print_info(f"Service: {data.get('service', 'N/A')}")
            else:
                _print_error(f"Root endpoint returned {resp.status_code}")
                results["endpoints"]["root"] = {"passed": False, "status": resp.status_code}
                all_passed = False
        except Exception as e:
            # Best-effort validator: any failure (network, non-JSON body,
            # unexpected payload shape) is reported instead of propagated.
            _print_error(f"Root endpoint failed: {e}")
            results["endpoints"]["root"] = {"passed": False, "error": str(e)}
            all_passed = False

        # 2. Check health endpoint
        click.echo("\n2. Checking health endpoint...")
        try:
            resp = await client.get(f"{url}{endpoints.health}", headers=headers)
            if resp.status_code == 200:
                data = resp.json()
                _print_success(f"Health endpoint responds (healthy: {data.get('healthy')})")
                results["endpoints"]["health"] = {"passed": True, "data": data}

                # Check auth configuration. ``or {}`` tolerates an explicit
                # ``"auth": null`` so a healthy endpoint is not failed by it.
                auth_info = data.get("auth") or {}
                if auth_info.get("required"):
                    _print_info(f"Auth required: {auth_info.get('required')}")
                    _print_info(f"Expected key prefix: {auth_info.get('expected_prefix', 'N/A')}")

                    if api_key:
                        _print_success("API key provided and accepted")
                        results["auth"]["provided"] = True
                        results["auth"]["accepted"] = True
                    else:
                        _print_warning("No API key provided but may be required")
                        results["auth"]["provided"] = False
                        results["auth"]["required"] = True
            else:
                _print_error(f"Health endpoint returned {resp.status_code}")
                results["endpoints"]["health"] = {"passed": False, "status": resp.status_code}
                all_passed = False

                # Both 401 and 403 indicate rejected/missing credentials.
                if resp.status_code in (401, 403):
                    _print_error("Authentication failed - provide API key with --api-key")
                    results["auth"]["error"] = "Authentication failed"

        except Exception as e:
            _print_error(f"Health endpoint failed: {e}")
            results["endpoints"]["health"] = {"passed": False, "error": str(e)}
            all_passed = False

        # 3. Check info endpoint
        click.echo("\n3. Checking info endpoint...")
        try:
            resp = await client.get(f"{url}{endpoints.info}", headers=headers)
            if resp.status_code == 200:
                data = resp.json()
                _print_success("Info endpoint responds")
                results["endpoints"]["info"] = {"passed": True, "data": data}

                if verbose:
                    service = data.get("service", {})
                    if isinstance(service, dict):
                        task_info = service.get("task", {})
                        if isinstance(task_info, dict):
                            _print_info(f"Task: {task_info.get('name', 'N/A')}")
                        _print_info(f"Version: {service.get('version', 'N/A')}")

                    dataset = data.get("dataset", {})
                    if isinstance(dataset, dict):
                        _print_info(f"Dataset: {dataset.get('id', 'N/A')}")
            else:
                _print_error(f"Info endpoint returned {resp.status_code}")
                results["endpoints"]["info"] = {"passed": False, "status": resp.status_code}
                all_passed = False
        except Exception as e:
            _print_error(f"Info endpoint failed: {e}")
            results["endpoints"]["info"] = {"passed": False, "error": str(e)}
            all_passed = False

        # 4. Check task_info endpoint and instance count
        click.echo("\n4. Checking task_info endpoint and instance availability...")
        try:
            # Get taskset descriptor first
            resp = await client.get(f"{url}{endpoints.task_info}", headers=headers)
            if resp.status_code == 200:
                data = resp.json()
                _print_success("Task info endpoint responds")
                results["endpoints"]["task_info"] = {"passed": True}

                taskset = data.get("taskset", {})
                if verbose and taskset:
                    if isinstance(taskset, dict):
                        _print_info(f"Taskset: {taskset.get('id', 'N/A')}")
                    else:
                        _print_info(f"Taskset: {taskset}")

                # Probe seeds 0 .. min_instances + 4 one request at a time to
                # verify at least min_instances can be fetched; stop at the
                # first seed that errors or returns a non-200 status.
                instances = []
                for seed in range(min_instances + 5):  # Try a few extra
                    try:
                        resp_seed = await client.get(
                            f"{url}{endpoints.task_info}",
                            params={"seed": seed},
                            headers=headers,
                        )
                        if resp_seed.status_code == 200:
                            instances.append(resp_seed.json())
                        else:
                            break  # Stop if we hit an invalid seed
                    except Exception:
                        break

                instance_count = len(instances)
                results["task_instances"]["count"] = instance_count
                results["task_instances"]["requested"] = min_instances

                if instance_count >= min_instances:
                    _print_success(f"Found {instance_count} task instances (≥ {min_instances} required)")
                    results["task_instances"]["passed"] = True

                    if verbose and instances:
                        sample = instances[0]
                        # Sample details are informational only; skip them
                        # when the instance payload is not a JSON object.
                        if isinstance(sample, dict):
                            task_info_sample = sample.get('task', {})
                            if isinstance(task_info_sample, dict):
                                _print_info(f"Sample task: {task_info_sample.get('name', 'N/A')}")
                            _print_info(f"Environment: {sample.get('environment', 'N/A')}")
                else:
                    _print_error(f"Only {instance_count} task instances available (need ≥ {min_instances})")
                    results["task_instances"]["passed"] = False
                    all_passed = False
            else:
                _print_error(f"Task info endpoint returned {resp.status_code}")
                results["endpoints"]["task_info"] = {"passed": False, "status": resp.status_code}
                # Keep the report consistent with the exception path below.
                results["task_instances"]["passed"] = False
                all_passed = False
        except Exception as e:
            _print_error(f"Task info endpoint failed: {e}")
            results["endpoints"]["task_info"] = {"passed": False, "error": str(e)}
            results["task_instances"]["passed"] = False
            all_passed = False

        # 5. Check rollout endpoint structure (don't actually run a rollout)
        click.echo("\n5. Checking rollout endpoint availability...")
        try:
            # Just check if it's registered (OPTIONS as a lightweight probe)
            resp = await client.options(f"{url}{endpoints.rollout}", headers=headers)
            # Many servers return 200 for OPTIONS, some return 405
            if resp.status_code in (200, 204, 405):
                _print_success("Rollout endpoint is registered")
                results["endpoints"]["rollout"] = {"passed": True}
            elif resp.status_code == 404:
                # 404 means no route matched the path at all, so the endpoint
                # cannot be reported as present.
                _print_error(f"Rollout endpoint not found (status: {resp.status_code})")
                results["endpoints"]["rollout"] = {"passed": False, "status": resp.status_code}
                all_passed = False
            else:
                _print_warning(f"Rollout endpoint returned unexpected status: {resp.status_code}")
                results["endpoints"]["rollout"] = {"passed": True, "note": "endpoint exists"}
        except Exception as e:
            # OPTIONS might not be supported, that's okay
            _print_info(f"Rollout endpoint check skipped (OPTIONS not supported): {e}")
            results["endpoints"]["rollout"] = {"passed": True, "note": "assumed present"}

    # Summary
    click.echo(f"\n{'='*60}")
    if all_passed:
        _print_success("All validations passed!")
    else:
        _print_error("Some validations failed. See errors above.")
    click.echo(f"{'='*60}\n")

    results["overall"] = all_passed
    return all_passed, results
@@ -37,10 +37,14 @@ from .utils import calculate_cost, detect_provider
37
37
  # Context variables for session and turn tracking
38
38
  # These variables automatically propagate across async call boundaries,
39
39
  # allowing deeply nested code to access tracing context without explicit passing
40
- _session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar("session_id")
41
- _turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar("turn_number")
40
+ _session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar(
41
+ "session_id"
42
+ )
43
+ _turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar(
44
+ "turn_number"
45
+ )
42
46
  _session_tracer_ctx: contextvars.ContextVar[Any | None] = contextvars.ContextVar(
43
- "session_tracer", default=None
47
+ "session_tracer"
44
48
  )
45
49
 
46
50
 
@@ -25,15 +25,15 @@ application to continue without blocking on sync operations.
25
25
  """
26
26
 
27
27
  import asyncio
28
+ import importlib
28
29
  import logging
29
- from typing import Any
30
-
31
- import libsql
30
+ from typing import Any, cast
32
31
 
33
32
  from .config import CONFIG
34
33
 
35
34
  logger = logging.getLogger(__name__)
36
35
 
36
+ libsql = cast(Any, importlib.import_module("libsql"))
37
37
 
38
38
  class ReplicaSync:
39
39
  """Manages synchronization of embedded SQLite replica with remote Turso database.
@@ -53,7 +53,7 @@ class ReplicaSync:
53
53
  db_path: str = "embedded.db",
54
54
  sync_url: str | None = None,
55
55
  auth_token: str | None = None,
56
- sync_interval: int | None = None,
56
+ sync_interval: float | None = None,
57
57
  ):
58
58
  """Initialize replica sync manager.
59
59