synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -3,17 +3,16 @@ from __future__ import annotations
3
3
  from dataclasses import dataclass
4
4
  from typing import Any, Literal
5
5
 
6
- from pydantic import BaseModel, Field
6
+ from pydantic import BaseModel, ConfigDict, Field
7
7
 
8
8
 
9
9
  @dataclass(frozen=True)
10
10
  class TaskAppEndpoints:
11
- """Canonical Task App endpoint shapes used by RL trainers.
11
+ """Required Task App endpoints used by RL trainers and clients.
12
12
 
13
- Task Apps run as lightweight HTTP services (often on Modal) that expose a
14
- consistent set of endpoints for health, metadata, environment lifecycle,
15
- rollouts, and optional proxy access to vendor models. The endpoint strings
16
- defined here act as defaults and documentation for clients.
13
+ Task Apps run as lightweight HTTP services (often on Modal) that expose these
14
+ standard endpoints. Additional endpoints (proxies, debug routes) may be added
15
+ by individual task apps as needed.
17
16
  """
18
17
 
19
18
  root: str = "/"
@@ -21,28 +20,6 @@ class TaskAppEndpoints:
21
20
  info: str = "/info"
22
21
  task_info: str = "/task_info"
23
22
  rollout: str = "/rollout"
24
- proxy_chat_completions: str = "/proxy/v1/chat/completions"
25
- proxy_groq_chat_completions: str = "/proxy/groq/v1/chat/completions"
26
- env_initialize: str = "/env/{env_name}/initialize"
27
- env_step: str = "/env/{env_name}/step"
28
- env_terminate: str = "/env/{env_name}/terminate"
29
-
30
-
31
- @dataclass(frozen=True)
32
- class TaskAppContract:
33
- """Requirements and expectations for a Task App used by RL trainers.
34
-
35
- - Auth: ENVIRONMENT_API_KEY must be set in the Task App environment; requests include X-API-Key.
36
- - Health: /health returns 200 and JSON; may verify X-API-Key header.
37
- - Env API: initialize/step/terminate are present for the target env (e.g., CrafterClassic).
38
- - Rollout API: optional; provides a single-call rollout for convenience/testing.
39
- - Inference routing: policy config passes an inference_url (Synth backend or OpenAI proxy).
40
- - URL: base must be reachable via HTTPS and should be under .modal.run in production.
41
- """
42
-
43
- base_url: str
44
- env_name: str | None = None
45
- requires_api_key_header: bool = True
46
23
 
47
24
 
48
25
  # --- Unified rollout schema used by Task App services and SDK utilities ---
@@ -87,6 +64,12 @@ class RolloutRequest(BaseModel):
87
64
 
88
65
 
89
66
  class RolloutStep(BaseModel):
67
+ """Single step in a rollout trajectory.
68
+
69
+ DEPRECATED: This is part of the legacy trajectory format. New code should
70
+ consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
71
+ for migration plan.
72
+ """
90
73
  obs: dict[str, Any]
91
74
  tool_calls: list[dict[str, Any]]
92
75
  reward: float | None = None
@@ -96,11 +79,40 @@ class RolloutStep(BaseModel):
96
79
 
97
80
 
98
81
  class RolloutTrajectory(BaseModel):
82
+ """Legacy trajectory format for rollout results.
83
+
84
+ DEPRECATED: This format duplicates data already present in v3 traces and will
85
+ be removed once training code migrates to consuming RolloutResponse.trace.
86
+
87
+ Current state:
88
+ - Task apps emit BOTH this format AND v3 traces (dual serialization)
89
+ - Training code (GSPO) reads from this format
90
+ - Eval/filter tools read from v3 traces
91
+
92
+ Migration plan:
93
+ - Phase 1: Training code learns to read from v3 traces (with fallback to this)
94
+ - Phase 2: Make this field optional once training is migrated
95
+ - Phase 3: Remove this field entirely and delete this class
96
+
97
+ See: monorepo/trace_single_source.txt for full migration plan and timeline.
98
+
99
+ Why v3 traces are better:
100
+ - Single source of truth (no duplication/drift)
101
+ - Richer data: token IDs, logprobs, reasoning, timing, images
102
+ - Built-in audit trail and replay capability
103
+ - Standard schema across all Synth AI tooling
104
+ """
99
105
  env_id: str
100
106
  policy_id: str
101
107
  steps: list[RolloutStep]
102
108
  final: dict[str, Any] | None = None
103
109
  length: int
110
+
111
+ # Required for trace correlation with inference mesh (optional initially for backward compat)
112
+ # See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
113
+ inference_url: str | None = None
114
+
115
+ decision_samples: list[dict[str, Any]] | None = None
104
116
 
105
117
 
106
118
  class RolloutMetrics(BaseModel):
@@ -114,24 +126,98 @@ class RolloutMetrics(BaseModel):
114
126
 
115
127
 
116
128
  class RolloutResponse(BaseModel):
129
+ """Response from a rollout execution.
130
+
131
+ Contains both legacy trajectory format (for backward compatibility) and
132
+ modern v3 trace format (preferred going forward).
133
+ """
117
134
  run_id: str
135
+
136
+ # DEPRECATED: Legacy format maintained for training code compatibility.
137
+ # Will be removed once training migrates to reading from `trace` field.
138
+ # See: monorepo/trace_single_source.txt for migration plan.
118
139
  trajectories: list[RolloutTrajectory]
140
+
119
141
  branches: dict[str, list[str]] = Field(default_factory=dict)
120
142
  metrics: RolloutMetrics
121
143
  aborted: bool = False
122
144
  ops_executed: int = 0
145
+
146
+ # PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
147
+ # for rollout data and should be used by all new code. Contains richer data than
148
+ # trajectories including token IDs, logprobs, timing, and multimodal content.
123
149
  trace: dict[str, Any] | None = None
124
150
 
125
151
 
126
- class TaskInfo(BaseModel):
152
+ class _ExtraAllowModel(BaseModel):
153
+ """Base helper that preserves unknown keys while still exposing typed attributes."""
154
+
155
+ model_config = ConfigDict(extra="allow")
156
+
157
+
158
+ class TaskDescriptor(_ExtraAllowModel):
159
+ """Human-readable task identifiers shown in UIs and logs."""
160
+
161
+ id: str
162
+ name: str
163
+ description: str | None = None
164
+ version: str | None = None
165
+
166
+
167
+ class DatasetInfo(_ExtraAllowModel):
168
+ """Metadata about the prompt/task dataset powering the environment."""
169
+
170
+ id: str | None = None
171
+ name: str | None = None
172
+ version: str | None = None
173
+ splits: list[str] | None = None
174
+ default_split: str | None = None
175
+ description: str | None = None
176
+
177
+
178
+ class RubricCriterion(_ExtraAllowModel):
179
+ id: str
180
+ description: str
181
+ weight: float | None = None
182
+
183
+
184
+ class RubricSection(_ExtraAllowModel):
185
+ name: str
186
+ criteria: list[RubricCriterion] = Field(default_factory=list)
187
+
188
+
189
+ class RubricInfo(_ExtraAllowModel):
190
+ """Outcome and event scoring definitions used by judges."""
191
+
192
+ outcome: RubricSection | None = None
193
+ events: RubricSection | None = None
194
+
195
+
196
+ class InferenceInfo(_ExtraAllowModel):
197
+ """Recommended defaults for policy model routing."""
198
+
199
+ model: str | None = None
200
+ inference_url: str | None = None
201
+
202
+
203
+ class LimitsInfo(_ExtraAllowModel):
204
+ """Operational limits the environment enforces."""
205
+
206
+ max_turns: int | None = None
207
+ max_response_tokens: int | None = None
208
+ timeout_seconds: int | None = None
209
+
210
+
211
+ class TaskInfo(_ExtraAllowModel):
127
212
  """Static metadata describing the capabilities of a Task App task."""
128
213
 
129
- task: dict[str, Any]
130
- environments: list[str]
131
- action_space: dict[str, Any]
132
- observation: dict[str, Any]
133
- dataset: dict[str, Any]
134
- rubric: dict[str, Any]
135
- inference: dict[str, Any]
136
- capabilities: dict[str, Any]
137
- limits: dict[str, Any]
214
+ task: TaskDescriptor
215
+ environment: str
216
+ dataset: DatasetInfo
217
+ rubric: RubricInfo
218
+ inference: InferenceInfo
219
+ limits: LimitsInfo
220
+ task_metadata: dict[str, Any] = Field(
221
+ default_factory=dict,
222
+ description="Task-specific extras (e.g. prompt version info, documentation links).",
223
+ )
synth_ai/task/proxy.py CHANGED
@@ -1,39 +1,15 @@
1
- """Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.)."""
1
+ """Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
2
+
3
+ The proxy is tool-agnostic - each task app provides its own tools schema.
4
+ """
2
5
 
3
6
  from __future__ import annotations
4
7
 
5
8
  import copy
6
9
  import json
7
10
  import re
8
- from collections.abc import Iterable
9
11
  from typing import Any
10
12
 
11
- INTERACT_TOOL_SCHEMA: list[dict[str, Any]] = [
12
- {
13
- "type": "function",
14
- "function": {
15
- "name": "interact",
16
- "description": "Perform one or more environment actions.",
17
- "parameters": {
18
- "type": "object",
19
- "properties": {
20
- "actions": {
21
- "type": "array",
22
- "items": {"type": "string"},
23
- "description": "List of environment actions to execute in order.",
24
- },
25
- "reasoning": {
26
- "type": "string",
27
- "description": "Optional reasoning for the chosen actions.",
28
- },
29
- },
30
- "required": ["actions"],
31
- "additionalProperties": False,
32
- },
33
- },
34
- }
35
- ]
36
-
37
13
  _REMOVE_FIELDS = {
38
14
  "stop_after_tool_calls",
39
15
  "thinking_mode",
@@ -44,14 +20,12 @@ _REMOVE_SAMPLING_FIELDS = {"temperature", "top_p"}
44
20
  _GPT5_MIN_COMPLETION_TOKENS = 16000
45
21
 
46
22
 
47
- def _ensure_tools(payload: dict[str, Any]) -> None:
48
- tools = payload.get("tools")
49
- if not isinstance(tools, list) or not tools:
50
- payload["tools"] = copy.deepcopy(INTERACT_TOOL_SCHEMA)
51
-
52
-
53
23
  def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str, Any]:
54
- """Sanitise an OpenAI chat completions payload for Task App usage."""
24
+ """Sanitise an OpenAI chat completions payload for Task App usage.
25
+
26
+ The task app is responsible for providing tools in the payload.
27
+ This function only handles model-specific parameter normalization.
28
+ """
55
29
 
56
30
  sanitized = copy.deepcopy(payload)
57
31
  for field in _REMOVE_FIELDS:
@@ -68,10 +42,18 @@ def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str,
68
42
  mct = sanitized.get("max_completion_tokens")
69
43
  if not isinstance(mct, int) or mct < _GPT5_MIN_COMPLETION_TOKENS:
70
44
  sanitized["max_completion_tokens"] = _GPT5_MIN_COMPLETION_TOKENS
71
- sanitized["tool_choice"] = {"type": "function", "function": {"name": "interact"}}
45
+
46
+ # Set tool_choice to first provided tool (task app must provide tools)
47
+ # If tool_choice not already set and tools are provided, use the first one
48
+ if "tool_choice" not in sanitized:
49
+ tools = sanitized.get("tools", [])
50
+ if isinstance(tools, list) and tools:
51
+ first_func = tools[0].get("function", {})
52
+ if isinstance(first_func, dict) and "name" in first_func:
53
+ sanitized["tool_choice"] = {"type": "function", "function": {"name": first_func["name"]}}
54
+
72
55
  sanitized["parallel_tool_calls"] = False
73
56
 
74
- _ensure_tools(sanitized)
75
57
  return sanitized
76
58
 
77
59
 
@@ -206,24 +188,18 @@ def parse_tool_call_from_text(text: str) -> tuple[list[str], str]:
206
188
  return [], text
207
189
 
208
190
 
209
- def _build_tool_call(actions: Iterable[str], reasoning: str) -> dict[str, Any]:
210
- payload = {
211
- "actions": [str(a).strip() for a in actions if str(a).strip()],
212
- }
213
- if reasoning.strip():
214
- payload["reasoning"] = reasoning.strip()
215
- return {
216
- "id": "tool_interact_fallback",
217
- "type": "function",
218
- "function": {
219
- "name": INTERACT_TOOL_SCHEMA[0]["function"]["name"],
220
- "arguments": json.dumps(payload, ensure_ascii=False),
221
- },
222
- }
223
-
224
-
225
- def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str, Any]:
226
- """Ensure the first choice carries a tool_call derived from text if absent."""
191
+ def synthesize_tool_call_if_missing(
192
+ openai_response: dict[str, Any],
193
+ fallback_tool_name: str = "interact"
194
+ ) -> dict[str, Any]:
195
+ """Ensure the first choice carries a tool_call derived from text if absent.
196
+
197
+ This is a fallback for models that don't properly support tool calling.
198
+ Task apps can specify their preferred fallback tool name (e.g., "interact", "execute_sequence").
199
+
200
+ DEPRECATED: Task apps should prefer models with native tool calling support.
201
+ This function will be removed in a future version.
202
+ """
227
203
 
228
204
  if not isinstance(openai_response, dict):
229
205
  return openai_response
@@ -245,8 +221,24 @@ def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str
245
221
  if not actions:
246
222
  return openai_response
247
223
 
224
+ # Build a fallback tool call using the provided tool name
225
+ payload = {
226
+ "actions": [str(a).strip() for a in actions if str(a).strip()],
227
+ }
228
+ if reasoning.strip():
229
+ payload["reasoning"] = reasoning.strip()
230
+
231
+ tool_call = {
232
+ "id": f"tool_{fallback_tool_name}_fallback",
233
+ "type": "function",
234
+ "function": {
235
+ "name": fallback_tool_name,
236
+ "arguments": json.dumps(payload, ensure_ascii=False),
237
+ },
238
+ }
239
+
248
240
  new_message = copy.deepcopy(message)
249
- new_message["tool_calls"] = [_build_tool_call(actions, reasoning)]
241
+ new_message["tool_calls"] = [tool_call]
250
242
  if "content" not in new_message:
251
243
  new_message["content"] = None
252
244
 
@@ -0,0 +1,53 @@
1
+ """Rubric schema, loading, and scoring helpers for Task Apps.
2
+
3
+ This module provides:
4
+ - Flexible rubric models (Criterion, Rubric) for general task app use
5
+ - Strict validators (StrictCriterion, StrictRubric) for step-wise judges
6
+ - Loading utilities supporting JSON, YAML, and HTTP sources
7
+ - Blending utilities for composing rubrics
8
+ - Scoring utilities for events and outcomes
9
+ """
10
+
11
+ # Core models (flexible validation)
12
+ from .models import Criterion, Rubric
13
+
14
+ # Loading and blending
15
+ from .loaders import blend_rubrics, load_rubric
16
+
17
+ # Scoring
18
+ from .scoring import score_events_against_rubric, score_outcome_against_rubric
19
+
20
+ # Strict validators (for judge configs)
21
+ from .strict import (
22
+ StrictCriterion,
23
+ StrictRubric,
24
+ ValidationError,
25
+ validate_rubric_dict,
26
+ validate_rubric_file,
27
+ validate_rubric_files,
28
+ )
29
+
30
+ __all__ = [
31
+ # Flexible models
32
+ "Criterion",
33
+ "Rubric",
34
+ # Loaders
35
+ "load_rubric",
36
+ "blend_rubrics",
37
+ # Scoring
38
+ "score_events_against_rubric",
39
+ "score_outcome_against_rubric",
40
+ # Strict validators
41
+ "StrictCriterion",
42
+ "StrictRubric",
43
+ "ValidationError",
44
+ "validate_rubric_dict",
45
+ "validate_rubric_file",
46
+ "validate_rubric_files",
47
+ ]
48
+
49
+ # Maintain backwards compatibility
50
+ # Old code may import these names; they are bound to the strict variants below
51
+ RubricCriterion = StrictCriterion
52
+ RubricSpec = StrictRubric
53
+
@@ -0,0 +1,133 @@
1
+ """Rubric loading and blending utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from .models import Criterion, Rubric
10
+
11
+
12
def _load_text(source: str) -> tuple[str, str | None]:
    """Return rubric text plus the file suffix when ``source`` names a file.

    Args:
        source: Either a path to a rubric file or the rubric text itself
            (inline JSON/YAML, or a URL handled later by the parser).

    Returns:
        ``(file_contents, lowercase_suffix)`` when ``source`` is an existing
        regular file, otherwise ``(source, None)`` with the text untouched.
    """
    path = Path(source)
    try:
        # is_file() rather than exists(): Path("") resolves to "." which
        # exists but cannot be read as text, so an empty source must fall
        # through to the parser's own "empty" error instead.
        is_file = path.is_file()
    except (OSError, ValueError):
        # Inline rubric text can exceed OS path-length limits or contain
        # characters that are invalid in a path; that just means "not a file".
        is_file = False
    if is_file:
        return path.read_text(encoding="utf-8"), path.suffix.lower()
    return source, None
18
+
19
+
20
def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
    """Parse JSON or YAML rubric text into a dictionary.

    Args:
        text: Rubric text, or an ``http(s)://`` URL whose body is the rubric.
        suffix: Lowercase file suffix when the text came from a file
            (``".yaml"``/``".yml"`` forces YAML parsing), otherwise ``None``.

    Returns:
        The decoded mapping.

    Raises:
        ValueError: If the text is empty, is invalid JSON when it starts with
            ``{``, or decodes to something other than a mapping.
        RuntimeError: If YAML parsing is needed but PyYAML is not installed.
    """
    text = text.strip()
    if not text:
        raise ValueError("Rubric source is empty")

    if suffix in (".yaml", ".yml"):
        try:
            import yaml  # type: ignore
        except Exception as exc:  # pragma: no cover - optional dependency
            raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
        data = yaml.safe_load(text)
        if not isinstance(data, dict):
            raise ValueError("Rubric YAML must produce a mapping")
        return data

    if text.startswith("{"):
        # Valid JSON that starts with "{" is always an object.
        return json.loads(text)

    if text.startswith(("http://", "https://")):
        import requests  # type: ignore

        response = requests.get(text, timeout=15)
        response.raise_for_status()
        # The fetched body goes through the same parsing rules as local text.
        return _parse_structured(response.text, suffix)

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        try:
            import yaml  # type: ignore
        except Exception as exc:  # pragma: no cover - optional dependency
            raise RuntimeError("PyYAML is required to load rubric text") from exc
        data = yaml.safe_load(text)

    # Valid JSON may still be a list/number/string; enforce the same mapping
    # contract the YAML branches already apply.
    if not isinstance(data, dict):
        raise ValueError("Rubric text must decode to a mapping")
    return data
53
+
54
+
55
def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
    """Coerce ``source`` into a ``Rubric``.

    Args:
        source: ``None``, an existing ``Rubric``, a dict of rubric fields, or
            a string handed to ``_load_text`` and ``_parse_structured``.

    Returns:
        ``None`` for ``None``, the same object for an existing ``Rubric``,
        otherwise a ``Rubric`` validated from the dict or the parsed text.
    """
    # Nothing to do for "no rubric" or an already-built rubric.
    if source is None or isinstance(source, Rubric):
        return source

    if isinstance(source, dict):
        payload = source
    else:
        raw_text, file_suffix = _load_text(str(source))
        payload = _parse_structured(raw_text, file_suffix)
    return Rubric.model_validate(payload)
73
+
74
+
75
def _merge_weights(base: Criterion, override: Criterion) -> float:
    """Combine the weights of a base criterion and its override.

    A weight of exactly 1.0 is treated as "left at the default": the other
    side's weight wins. When neither side is 1.0 the weights are multiplied.
    """
    if override.weight == 1.0:
        return base.weight
    if base.weight == 1.0:
        return override.weight
    return base.weight * override.weight
82
+
83
+
84
def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
    """Combine a base rubric with an override rubric.

    Criteria are matched by ``id``. For a match, the override's description
    and ``required`` flag are preferred and the weights are combined via
    ``_merge_weights``. Override-only criteria come next in override order,
    followed by base criteria that were not overridden.

    Args:
        base: Rubric supplying defaults, or ``None``.
        override: Rubric supplying customizations, or ``None``.

    Returns:
        The other rubric unchanged when one side is ``None`` (``None`` when
        both are), otherwise a new blended ``Rubric``.
    """
    if base is None or override is None:
        return override if base is None else base

    # Base criteria still waiting to be matched, keyed by id.
    remaining = {item.id: item for item in base.criteria}
    combined: list[Criterion] = []

    for candidate in override.criteria:
        inherited = remaining.pop(candidate.id, None)
        if inherited is None:
            combined.append(candidate)
            continue
        blended = Criterion(
            id=candidate.id,
            description=candidate.description or inherited.description,
            weight=_merge_weights(inherited, candidate),
            required=candidate.required if candidate.required is not None else inherited.required,
        )
        combined.append(blended)

    # Whatever the override never mentioned is carried over unchanged.
    combined.extend(remaining.values())

    # "inherit" defers the aggregation mode to the base rubric.
    if override.aggregation == "inherit":
        aggregation = base.aggregation
    else:
        aggregation = override.aggregation

    return Rubric(
        version=override.version or base.version,
        goal_text=override.goal_text or base.goal_text,
        criteria=combined,
        aggregation=aggregation,
    )
133
+
@@ -0,0 +1,57 @@
1
+ """Rubric and Criterion data models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel, Field, field_validator
6
+
7
+
8
class Criterion(BaseModel):
    """One weighted scoring criterion belonging to a rubric.

    This is the flexible variant: a weight may exceed 1.0 and weights are not
    required to be normalized. Task apps use it for general rubric scoring.
    """

    id: str
    description: str
    weight: float = 1.0
    required: bool = False

    @field_validator("weight")
    @classmethod
    def _validate_weight(cls, value: float) -> float:
        """Reject zero and negative weights; return any other value as-is."""
        non_positive = value <= 0
        if non_positive:
            raise ValueError("criterion weight must be positive")
        return value
26
+
27
+
28
class Rubric(BaseModel):
    """Scoring rubric for task app outcomes.

    Holds a version label, optional goal text, a list of criteria with unique
    ids, and an aggregation mode. Criteria weights are not required to sum to
    1.0, which keeps this model usable for general task apps and for blending.
    """

    version: str
    goal_text: str | None = None
    criteria: list[Criterion] = Field(default_factory=list)
    aggregation: str = "weighted_sum"

    @field_validator("aggregation")
    @classmethod
    def _validate_aggregation(cls, value: str) -> str:
        """Accept only the known aggregation mode names."""
        allowed = {"sum", "weighted_sum", "custom", "inherit"}
        if value in allowed:
            return value
        raise ValueError(f"aggregation must be one of {sorted(allowed)}")

    @field_validator("criteria")
    @classmethod
    def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
        """Fail on the first criterion whose id was already seen."""
        known_ids = set()
        for entry in criteria:
            entry_id = entry.id
            if entry_id in known_ids:
                raise ValueError(f"duplicate criterion id: {entry_id}")
            known_ids.add(entry_id)
        return criteria
57
+