synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (226)
  1. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -1
  2. examples/swe/task_app/grpo_swe_mini.py +55 -26
  3. examples/swe/task_app/hosted/rollout.py +40 -0
  4. examples/swe/task_app/hosted/test_service.py +5 -6
  5. examples/task_apps/TESTING.md +275 -0
  6. examples/task_apps/__init__.py +0 -0
  7. examples/task_apps/crafter/__init__.py +0 -0
  8. examples/task_apps/crafter/task_app/__init__.py +2 -0
  9. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter.py +18 -13
  10. examples/{warming_up_to_rl → task_apps/crafter}/task_app/grpo_crafter_task_app.py +1 -1
  11. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/policy.py +60 -4
  12. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/policy_routes.py +25 -3
  13. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/rollout.py +10 -0
  14. examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_service.py +5 -6
  15. examples/task_apps/dev/pokemon_emerald/__init__.py +2 -0
  16. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/README.md +811 -0
  17. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/__init__.py +120 -0
  18. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/action.py +160 -0
  19. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/memory.py +155 -0
  20. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/perception.py +69 -0
  21. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/planning.py +96 -0
  22. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/simple.py +1502 -0
  23. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/agent/system_prompt.py +4 -0
  24. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/grab_map.py +68 -0
  25. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/manual.py +216 -0
  26. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/__init__.py +35 -0
  27. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emerald_utils.py +631 -0
  28. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/emulator.py +1544 -0
  29. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/enums.py +1428 -0
  30. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/memory_reader.py +4848 -0
  31. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/types.py +41 -0
  32. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pokemon_env/utils.py +298 -0
  33. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/pyproject.toml +95 -0
  34. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/run.py +204 -0
  35. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/__init__.py +0 -0
  36. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/app.py +2152 -0
  37. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/client.py +429 -0
  38. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/server/frame_server.py +155 -0
  39. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/README.md +78 -0
  40. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/__init__.py +0 -0
  41. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/run_tests.py +122 -0
  42. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_direct.py +76 -0
  43. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_agent_prompts.py +413 -0
  44. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_battle_state_formatting.py +204 -0
  45. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection.py +133 -0
  46. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_dialogue_detection_comprehensive.py +229 -0
  47. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_direct_agent_emulator.py +300 -0
  48. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_fps_adjustment_pytest.py +205 -0
  49. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_direct.py +200 -0
  50. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_house_to_outside_transition.py +284 -0
  51. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_map_ground_truth_comparison.py +468 -0
  52. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_memory_map.py +575 -0
  53. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_server_map_validation.py +311 -0
  54. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/tests/test_torchic_state.py +259 -0
  55. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/__init__.py +0 -0
  56. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/anticheat.py +372 -0
  57. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/checkpoint.py +296 -0
  58. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/error_handler.py +275 -0
  59. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/get_local_ip.py +22 -0
  60. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/helpers.py +44 -0
  61. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/llm_logger.py +514 -0
  62. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_formatter.py +415 -0
  63. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher.py +1763 -0
  64. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_stitcher_singleton.py +33 -0
  65. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_trimmer.py +106 -0
  66. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/map_visualizer.py +334 -0
  67. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/ocr_dialogue.py +1020 -0
  68. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/recording.py +188 -0
  69. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/state_formatter.py +1481 -0
  70. examples/task_apps/dev/pokemon_emerald/external/pokeagent-speedrun/utils/vlm.py +862 -0
  71. examples/task_apps/dev/pokemon_emerald/modal_app.py +114 -0
  72. examples/task_apps/dev/pokemon_emerald/task_app/README.md +81 -0
  73. examples/task_apps/dev/pokemon_emerald/task_app/__init__.py +6 -0
  74. examples/task_apps/dev/pokemon_emerald/task_app/pokemon_emerald.py +685 -0
  75. examples/task_apps/enron/__init__.py +1 -0
  76. examples/task_apps/enron/eval_groq_qwen32.toml +16 -0
  77. examples/task_apps/enron/task_app/README.md +14 -0
  78. examples/task_apps/enron/task_app/__init__.py +1 -0
  79. examples/task_apps/enron/task_app/grpo_enron.py +906 -0
  80. examples/task_apps/enron/task_app/grpo_enron_task_app.py +146 -0
  81. examples/task_apps/enron/tests/__init__.py +2 -0
  82. examples/task_apps/enron/tests/conftest.py +115 -0
  83. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  84. examples/task_apps/enron/tests/integration/test_enron_eval.py +177 -0
  85. examples/task_apps/enron/tests/integration/test_enron_rollout.py +135 -0
  86. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  87. examples/task_apps/enron/tests/unit/test_enron_environment.py +126 -0
  88. examples/task_apps/math/__init__.py +0 -0
  89. examples/{rl/task_app → task_apps/math}/math_single_step.py +19 -10
  90. examples/task_apps/pokemon_battle/__init__.py +2 -0
  91. examples/task_apps/pokemon_battle/modal_app.py +104 -0
  92. examples/task_apps/pokemon_battle/task_app/README.md +68 -0
  93. examples/task_apps/pokemon_battle/task_app/__init__.py +6 -0
  94. examples/task_apps/pokemon_battle/task_app/pokemon_showdown.py +932 -0
  95. examples/task_apps/pokemon_red/README.md +357 -0
  96. examples/task_apps/pokemon_red/__init__.py +3 -0
  97. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +225 -0
  98. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +73 -0
  99. examples/task_apps/pokemon_red/task_app.py +606 -0
  100. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +191 -0
  101. examples/task_apps/sokoban/README.md +307 -0
  102. examples/task_apps/sokoban/__init__.py +3 -0
  103. examples/task_apps/sokoban/eval_groq_qwen32.toml +16 -0
  104. examples/task_apps/sokoban/eval_openai_gpt5.toml +16 -0
  105. examples/task_apps/sokoban/task_app.py +1058 -0
  106. examples/task_apps/sokoban/tests/__init__.py +2 -0
  107. examples/task_apps/sokoban/tests/conftest.py +113 -0
  108. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  109. examples/task_apps/sokoban/tests/integration/test_sokoban_eval.py +57 -0
  110. examples/task_apps/sokoban/tests/integration/test_sokoban_rollout.py +198 -0
  111. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  112. examples/task_apps/sokoban/tests/unit/test_sokoban_environment.py +114 -0
  113. examples/task_apps/verilog/__init__.py +1 -0
  114. examples/task_apps/verilog/eval_groq_qwen32b.toml +20 -0
  115. examples/task_apps/verilog/task_app/README.md +12 -0
  116. examples/task_apps/verilog/task_app/__init__.py +1 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +931 -0
  118. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +145 -0
  119. examples/task_apps/verilog/tests/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/conftest.py +115 -0
  121. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  122. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +179 -0
  123. examples/task_apps/verilog/tests/integration/test_verilog_rollout.py +55 -0
  124. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  125. examples/task_apps/verilog/tests/unit/test_verilog_scoring.py +118 -0
  126. examples/vlm/crafter_openai_vlm_agent.py +4 -4
  127. examples/vlm/run_crafter_vlm_benchmark.py +4 -4
  128. examples/workflows/__init__.py +0 -0
  129. examples/workflows/math_rl/__init__.py +0 -0
  130. examples/workflows/math_rl/download_dataset.py +80 -0
  131. synth_ai/__init__.py +2 -2
  132. synth_ai/api/train/builders.py +25 -11
  133. synth_ai/api/train/cli.py +12 -6
  134. synth_ai/api/train/configs/__init__.py +10 -10
  135. synth_ai/api/train/configs/rl.py +5 -4
  136. synth_ai/api/train/configs/sft.py +4 -3
  137. synth_ai/api/train/env_resolver.py +5 -2
  138. synth_ai/api/train/supported_algos.py +10 -5
  139. synth_ai/api/train/utils.py +7 -4
  140. synth_ai/cli/__init__.py +7 -51
  141. synth_ai/cli/_storage.py +4 -3
  142. synth_ai/cli/_validate_task_app.py +11 -0
  143. synth_ai/cli/balance.py +4 -3
  144. synth_ai/cli/calc.py +2 -2
  145. synth_ai/cli/demo.py +14 -7
  146. synth_ai/cli/legacy_root_backup.py +1 -1
  147. synth_ai/cli/rl_demo.py +8 -7
  148. synth_ai/cli/root.py +0 -97
  149. synth_ai/cli/task_apps.py +1707 -186
  150. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +28 -16
  151. synth_ai/environments/examples/enron/engine.py +7 -2
  152. synth_ai/environments/examples/enron/environment.py +68 -0
  153. synth_ai/environments/examples/red/engine.py +27 -0
  154. synth_ai/environments/examples/red/engine_helpers/memory_map.py +7 -0
  155. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_progression.py +477 -0
  156. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +32 -0
  157. synth_ai/environments/examples/red/environment.py +60 -0
  158. synth_ai/environments/examples/sokoban/taskset.py +116 -0
  159. synth_ai/environments/examples/verilog/engine.py +30 -4
  160. synth_ai/evals/client.py +58 -61
  161. synth_ai/jobs/client.py +16 -4
  162. synth_ai/judge_schemas.py +16 -16
  163. synth_ai/py.typed +0 -0
  164. synth_ai/task/__init__.py +14 -5
  165. synth_ai/task/contracts.py +124 -38
  166. synth_ai/task/proxy.py +48 -56
  167. synth_ai/task/rubrics/__init__.py +53 -0
  168. synth_ai/task/rubrics/loaders.py +133 -0
  169. synth_ai/task/rubrics/models.py +57 -0
  170. synth_ai/task/rubrics/scoring.py +113 -0
  171. synth_ai/{rubrics/validators.py → task/rubrics/strict.py} +53 -30
  172. synth_ai/task/server.py +8 -7
  173. synth_ai/task/validators.py +269 -6
  174. synth_ai/tracing_v3/decorators.py +7 -3
  175. synth_ai/tracing_v3/replica_sync.py +4 -4
  176. synth_ai/tracing_v3/serialization.py +5 -5
  177. synth_ai/tracing_v3/trace_utils.py +317 -0
  178. synth_ai/tracing_v3/turso/native_manager.py +3 -3
  179. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/METADATA +4 -1
  180. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/RECORD +214 -101
  181. examples/agora_ex/README_MoE.md +0 -224
  182. examples/agora_ex/__init__.py +0 -7
  183. examples/agora_ex/agora_ex.py +0 -65
  184. examples/agora_ex/agora_ex_task_app.py +0 -590
  185. examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +0 -121
  186. examples/agora_ex/reward_fn_grpo-human.py +0 -129
  187. examples/agora_ex/system_prompt_CURRENT.md +0 -63
  188. examples/agora_ex/task_app/agora_ex_task_app.py +0 -590
  189. examples/agora_ex/task_app/reward_fn_grpo-human.py +0 -129
  190. examples/agora_ex/task_app/system_prompt_CURRENT.md +0 -63
  191. synth_ai/rubrics/__init__.py +0 -22
  192. synth_ai/task/rubrics.py +0 -219
  193. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/README.md +0 -0
  194. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/README.md +0 -0
  195. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/__init__.py +0 -0
  196. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/branching.py +0 -0
  197. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/environment_routes.py +0 -0
  198. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/__init__.py +0 -0
  199. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -0
  200. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/app.py +0 -0
  201. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -0
  202. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -0
  203. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -0
  204. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -0
  205. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/hosted_app.py +0 -0
  206. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/__init__.py +0 -0
  207. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/inference/openai_client.py +0 -0
  208. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/main.py +0 -0
  209. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/registry.py +0 -0
  210. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/__init__.py +0 -0
  211. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/storage/volume.py +0 -0
  212. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/test_agents.py +0 -0
  213. /examples/{warming_up_to_rl → task_apps/crafter}/task_app/synth_envs_hosted/utils.py +0 -0
  214. /examples/{rl/task_app → task_apps/math}/README.md +0 -0
  215. /examples/{rl/task_app → task_apps/math}/math_task_app.py +0 -0
  216. /examples/{rl → workflows/math_rl}/configs/eval_base_qwen.toml +0 -0
  217. /examples/{rl → workflows/math_rl}/configs/eval_rl_qwen.toml +0 -0
  218. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen.toml +0 -0
  219. /examples/{rl → workflows/math_rl}/configs/rl_from_base_qwen17.toml +0 -0
  220. /examples/{rl → workflows/math_rl}/configs/rl_from_ft_qwen.toml +0 -0
  221. /examples/{rl → workflows/math_rl}/run_eval.py +0 -0
  222. /examples/{rl → workflows/math_rl}/run_rl_and_save.py +0 -0
  223. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/WHEEL +0 -0
  224. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/entry_points.txt +0 -0
  225. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/licenses/LICENSE +0 -0
  226. {synth_ai-0.2.13.dev1.dist-info → synth_ai-0.2.13.dev2.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,8 @@
1
+ from __future__ import annotations
2
+
1
3
  import logging
2
4
  import os
5
+ from collections.abc import Iterable, Sequence
3
6
  from dataclasses import asdict, dataclass, fields
4
7
  from typing import List, Tuple
5
8
  from uuid import UUID, uuid4
@@ -18,6 +21,7 @@ from synth_ai.environments.tasks.core import (
18
21
  TaskInstanceMetadataFilter,
19
22
  TaskInstanceSet,
20
23
  )
24
+ from synth_ai.task.contracts import TaskInfo
21
25
 
22
26
  logger = logging.getLogger(__name__)
23
27
 
@@ -96,6 +100,118 @@ class SokobanTaskInstance(TaskInstance):
96
100
  return cls(**filtered_data)
97
101
 
98
102
 
103
def _base_task_info_template() -> TaskInfo:
    """Build the static ``TaskInfo`` describing the Sokoban task.

    The returned object carries only seed-independent fields; callers copy it
    and layer per-instance observation/dataset details on top.
    """
    # The task and dataset sections share the same identity triple.
    identity = {"id": "sokoban", "name": "Sokoban", "version": "1.0.0"}
    interact_tool = {"name": "interact", "schema": {"action": "int"}}

    action_space = {
        "type": "tool_call",
        "tools": [interact_tool],
        "max_calls": 1,
    }
    observation = {
        "summary": "Sokoban grid observation",
        "keys": ["grid", "player"],
    }
    rubric = {"version": "1", "criteria_count": 1, "source": "inline"}
    capabilities = {"supports_rollout": True, "supports_env_lifecycle": True}

    return TaskInfo(
        task=dict(identity),
        environment="sokoban",
        action_space=action_space,
        observation=observation,
        dataset=dict(identity),
        rubric=rubric,
        inference={"supports_proxy": False},
        capabilities=capabilities,
        limits={"max_turns": 200},
    )
119
+
120
+
121
class SokobanTaskSet:
    """Minimal helper compatible with Task App expectations.

    The underlying task set is loaded lazily on first use and its instances
    are indexed by seed. Requested seeds are then turned into per-instance
    ``TaskInfo`` payloads derived from a shared base template.
    """

    # Metadata attributes mirrored onto both the observation and task metadata.
    _METADATA_KEYS = (
        "difficulty",
        "num_boxes",
        "dim_room",
        "max_steps",
        "shortest_path_length",
    )

    def __init__(self) -> None:
        self._taskset: TaskInstanceSet | None = None
        self._seed_index: dict[int, SokobanTaskInstance] = {}
        self._base_info = _base_task_info_template()

    async def _ensure_loaded(self) -> TaskInstanceSet:
        """Load the task set once and build the seed index.

        Returns:
            The cached ``TaskInstanceSet``.
        """
        if self._taskset is None:
            dataset = await create_sokoban_taskset()
            self._taskset = dataset
            self._seed_index.clear()
            for inst in dataset.instances:
                try:
                    seed_value = int(inst.metadata.seed)
                except (AttributeError, TypeError, ValueError, OverflowError):
                    # No usable integer seed: the instance cannot be indexed.
                    continue
                # Keep the first instance encountered for a seed
                self._seed_index.setdefault(seed_value, inst)
        return self._taskset

    def describe(self) -> dict[str, object]:
        """Return a short description of the task set.

        Before the task set has been loaded only the static id/name pair is
        returned; afterwards the name, description and instance count of the
        loaded set are included.
        """
        taskset = self._taskset
        # Compare against None (as _ensure_loaded does) so that "loaded" never
        # depends on the truthiness of the task-set object itself.
        if taskset is None:
            return {"id": "sokoban", "name": "Sokoban"}
        return {
            "id": "sokoban",
            "name": taskset.name,
            "description": taskset.description,
            "instance_count": len(taskset.instances),
        }

    async def _resolve_instance(self, seed_value: int) -> SokobanTaskInstance | None:
        """Return the instance for ``seed_value``, constructing it if needed.

        Seeds that are not in the index are built on the fly, trying the
        configured difficulties in order; a successfully built instance is
        cached in the seed index. Returns ``None`` when nothing could be built.
        """
        instance = self._seed_index.get(seed_value)
        if instance is not None:
            return instance

        for difficulty in DIFFICULTY_CONFIGS:
            try:
                instance = await create_task_instance_from_seed(difficulty, seed_value)
            except Exception:
                # Best effort: a failure at one difficulty must not prevent
                # trying the next, but leave a trace for debugging.
                logger.debug(
                    "Sokoban seed %s could not be built at difficulty %r",
                    seed_value,
                    difficulty,
                    exc_info=True,
                )
                continue
            break

        if instance is None:
            logger.warning(
                "Skipping Sokoban seed %s: no task instance could be constructed",
                seed_value,
            )
            return None

        self._seed_index[seed_value] = instance
        return instance

    def _build_info(self, instance: SokobanTaskInstance, seed_value: int) -> TaskInfo:
        """Derive a per-instance ``TaskInfo`` from the shared base template."""
        metadata = getattr(instance, "metadata", None)
        base_info = self._base_info.model_copy(deep=True)

        observation = dict(base_info.observation)
        dataset_info = dict(base_info.dataset)
        task_metadata: dict[str, object] = {"seed": seed_value}

        if metadata is not None:
            for key in self._METADATA_KEYS:
                value = getattr(metadata, key, None)
                if value is not None:
                    observation[key] = value
                    task_metadata[key] = value
            dataset_info.update(
                {
                    "seed": getattr(metadata, "seed", seed_value),
                    "difficulty": getattr(metadata, "difficulty", None),
                    "num_boxes": getattr(metadata, "num_boxes", None),
                    "dim_room": getattr(metadata, "dim_room", None),
                }
            )
            generation_params = getattr(metadata, "generation_params", None)
            if generation_params is not None:
                task_metadata["generation_params"] = generation_params

        return base_info.model_copy(
            update={
                "observation": observation,
                "dataset": dataset_info,
                "task_metadata": task_metadata,
            }
        )

    async def provide_task_instances(self, seeds: Sequence[int]) -> Iterable[TaskInfo]:
        """Return one ``TaskInfo`` per resolvable seed, in request order.

        Args:
            seeds: Seeds to look up. Values that cannot be converted to ``int``
                and seeds for which no instance can be found or constructed
                are skipped.

        Returns:
            A list of ``TaskInfo`` objects (empty when ``seeds`` is empty).
        """
        await self._ensure_loaded()
        if not seeds:
            return []

        infos: list[TaskInfo] = []
        for raw_seed in seeds:
            try:
                seed_value = int(raw_seed)
            except (TypeError, ValueError, OverflowError):
                continue

            instance = await self._resolve_instance(seed_value)
            if instance is None:
                continue
            infos.append(self._build_info(instance, seed_value))
        return infos
213
+
214
+
99
215
  async def create_sokoban_taskset() -> TaskInstanceSet:
100
216
  """Generates Sokoban task instances from pre-generated verified puzzles."""
101
217
  instances = []
@@ -67,6 +67,16 @@ class VerilogStepPenaltyComponent(RewardComponent):
67
67
  return self.penalty
68
68
 
69
69
 
70
class VerilogSubmitSuccessComponent(RewardComponent):
    """Reward component that pays out when a submit action reports success."""

    async def score(self, state: VerilogPublicState, action: Any) -> float:
        """Return 10.0 for a passing ``submit`` action and 0.0 otherwise."""
        # Only mapping-like actions (anything exposing ``get``) can be submissions.
        if not hasattr(action, "get"):
            return 0.0
        is_submit = action.get("type") == "submit"
        # A missing "passed" flag is treated as a failed submission.
        if is_submit and action.get("passed", False):
            return 10.0  # Large reward for completing the task correctly
        return 0.0
78
+
79
+
70
80
  class VerilogEngine(StatefulEngine):
71
81
  """
72
82
  Stateful Verilog evaluation engine with persistent artifact snapshots.
@@ -81,6 +91,7 @@ class VerilogEngine(StatefulEngine):
81
91
  components=[
82
92
  VerilogCompileSuccessComponent(),
83
93
  VerilogSimulationPassComponent(),
94
+ VerilogSubmitSuccessComponent(),
84
95
  VerilogStepPenaltyComponent(penalty=-0.01),
85
96
  ]
86
97
  )
@@ -284,13 +295,28 @@ class VerilogEngine(StatefulEngine):
284
295
 
285
296
  async def submit(self) -> Dict[str, Any]:
286
297
  """Submit solution for grading."""
287
- # For now, simple check based on last simulation
288
- # In a full implementation, this would call the task's verify method
298
+ # Check if the last simulation passed
299
+ # Parse the last simulation output to determine if tests passed
300
+ passed = False
301
+ detail = "No simulation run yet"
302
+
303
+ if self._last_simulate_output:
304
+ stdout = self._last_simulate_output
305
+ passed = (
306
+ "ALL_TESTS_PASSED" in stdout
307
+ or ("Mismatches: 0 " in stdout and "samples" in stdout)
308
+ or ("no mismatches" in stdout.lower() and "errors" not in stdout.lower())
309
+ )
310
+ if passed:
311
+ detail = "All tests passed"
312
+ else:
313
+ detail = "Tests failed - please review simulation output"
314
+
289
315
  return {
290
316
  "ok": True,
291
317
  "type": "submit",
292
- "passed": True, # Placeholder
293
- "detail": "Submission processed",
318
+ "passed": passed,
319
+ "detail": detail,
294
320
  "submitted": True,
295
321
  }
296
322
 
synth_ai/evals/client.py CHANGED
@@ -1,11 +1,11 @@
1
- from __future__ import annotations
2
-
3
1
  """Experimental Judge API client.
4
2
 
5
3
  This surface is experimental and subject to change without notice.
6
4
  Set environment variable `SYNTH_SILENCE_EXPERIMENTAL=1` to silence warnings.
7
5
  """
8
6
 
7
+ from __future__ import annotations
8
+
9
9
  import os
10
10
  import warnings
11
11
  from typing import Any, Literal, TypedDict
@@ -13,73 +13,70 @@ from typing import Any, Literal, TypedDict
13
13
  from synth_ai.http import AsyncHttpClient, HTTPError
14
14
  from synth_ai.tracing_v3.serialization import normalize_for_json
15
15
 
16
-
17
16
  Provider = Literal["groq", "gemini"]
18
17
 
19
18
 
20
19
  class JudgeOptions(TypedDict, total=False):
21
- event: bool
22
- outcome: bool
23
- rubric_id: str
24
- rubric_overrides: dict[str, Any]
25
- provider: Provider
26
- model: str
27
- max_concurrency: int
20
+ event: bool
21
+ outcome: bool
22
+ rubric_id: str
23
+ rubric_overrides: dict[str, Any]
24
+ provider: Provider
25
+ model: str
26
+ max_concurrency: int
28
27
 
29
28
 
30
29
  class JudgeScoreResponse(TypedDict, total=False):
31
- status: str
32
- event_rewards: list[dict[str, Any]]
33
- outcome_reward: dict[str, Any]
34
- details: dict[str, Any]
30
+ status: str
31
+ event_rewards: list[dict[str, Any]]
32
+ outcome_reward: dict[str, Any]
33
+ details: dict[str, Any]
35
34
 
36
35
 
37
36
  class JudgeClient:
38
- def __init__(self, base_url: str, api_key: str, *, timeout: float = 60.0) -> None:
39
- _silence = (os.getenv("SYNTH_SILENCE_EXPERIMENTAL") or "").strip().lower()
40
- if _silence not in {"1", "true", "t", "yes", "y", "on"}:
41
- warnings.warn(
42
- "Experimental API: synth_ai.evals.JudgeClient is experimental and may change without notice.",
43
- UserWarning,
44
- stacklevel=2,
45
- )
46
- self._base = base_url.rstrip("/")
47
- self._key = api_key
48
- self._timeout = timeout
49
-
50
- async def score(
51
- self,
52
- *,
53
- trace: dict[str, Any] | Any,
54
- policy_name: str,
55
- task_app_id: str,
56
- options: JudgeOptions,
57
- task_app_base_url: str | None = None,
58
- ) -> JudgeScoreResponse:
59
- body = {
60
- "policy_name": policy_name,
61
- "task_app": {"id": task_app_id, **({"base_url": task_app_base_url} if task_app_base_url else {})},
62
- "trace": normalize_for_json(trace),
63
- "options": options or {},
64
- }
65
- try:
66
- async with AsyncHttpClient(self._base, self._key, timeout=self._timeout) as http:
67
- js = await http.post_json("/api/judge/v1/score", json=body)
68
- if not isinstance(js, dict):
69
- raise ValueError("invalid_judge_response_shape")
70
- return js # type: ignore[return-value]
71
- except HTTPError as e: # map to friendlier exceptions
72
- status = int(getattr(e, "status", 0) or 0)
73
- if status in (400, 422):
74
- raise ValueError(f"judge_validation_error: {e.detail}") from e
75
- if status in (401, 403):
76
- raise PermissionError(f"judge_auth_error: {e.detail}") from e
77
- if status == 404:
78
- raise FileNotFoundError(f"judge_route_not_found: {e.detail}") from e
79
- if status == 429:
80
- raise Exception("judge_rate_limited") from e # replace with RetryLater in future
81
- if status >= 500:
82
- raise Exception("judge_transient_error") from e # replace with TransientError in future
83
- raise
84
-
37
+ def __init__(self, base_url: str, api_key: str, *, timeout: float = 60.0) -> None:
38
+ _silence = (os.getenv("SYNTH_SILENCE_EXPERIMENTAL") or "").strip().lower()
39
+ if _silence not in {"1", "true", "t", "yes", "y", "on"}:
40
+ warnings.warn(
41
+ "Experimental API: synth_ai.evals.JudgeClient is experimental and may change without notice.",
42
+ UserWarning,
43
+ stacklevel=2,
44
+ )
45
+ self._base = base_url.rstrip("/")
46
+ self._key = api_key
47
+ self._timeout = timeout
85
48
 
49
+ async def score(
50
+ self,
51
+ *,
52
+ trace: dict[str, Any] | Any,
53
+ policy_name: str,
54
+ task_app_id: str,
55
+ options: JudgeOptions,
56
+ task_app_base_url: str | None = None,
57
+ ) -> JudgeScoreResponse:
58
+ body = {
59
+ "policy_name": policy_name,
60
+ "task_app": {"id": task_app_id, **({"base_url": task_app_base_url} if task_app_base_url else {})},
61
+ "trace": normalize_for_json(trace),
62
+ "options": options or {},
63
+ }
64
+ try:
65
+ async with AsyncHttpClient(self._base, self._key, timeout=self._timeout) as http:
66
+ js = await http.post_json("/api/judge/v1/score", json=body)
67
+ if not isinstance(js, dict):
68
+ raise ValueError("invalid_judge_response_shape")
69
+ return js # type: ignore[return-value]
70
+ except HTTPError as err: # map to friendlier exceptions
71
+ status = int(getattr(err, "status", 0) or 0)
72
+ if status in (400, 422):
73
+ raise ValueError(f"judge_validation_error: {err.detail}") from err
74
+ if status in (401, 403):
75
+ raise PermissionError(f"judge_auth_error: {err.detail}") from err
76
+ if status == 404:
77
+ raise FileNotFoundError(f"judge_route_not_found: {err.detail}") from err
78
+ if status == 429:
79
+ raise Exception("judge_rate_limited") from err # replace with RetryLater in future
80
+ if status >= 500:
81
+ raise Exception("judge_transient_error") from err # replace with TransientError in future
82
+ raise
synth_ai/jobs/client.py CHANGED
@@ -1,20 +1,32 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import importlib
4
- from typing import Any
4
+ from collections.abc import Callable
5
+ from typing import Any, cast
5
6
 
6
7
  try:
7
- normalize_model_identifier = importlib.import_module("synth_ai.api.models.supported").normalize_model_identifier
8
+ _supported_module = cast(
9
+ Any, importlib.import_module("synth_ai.api.models.supported")
10
+ )
11
+ normalize_model_identifier = cast(
12
+ Callable[[str], str], _supported_module.normalize_model_identifier
13
+ )
8
14
  except Exception as exc: # pragma: no cover - critical dependency
9
15
  raise RuntimeError("Unable to load supported model utilities") from exc
10
16
 
11
17
  try:
12
- AsyncHttpClient = importlib.import_module("synth_ai.http").AsyncHttpClient
18
+ _http_module = cast(Any, importlib.import_module("synth_ai.http"))
19
+ AsyncHttpClient = _http_module.AsyncHttpClient
13
20
  except Exception as exc: # pragma: no cover - critical dependency
14
21
  raise RuntimeError("Unable to load HTTP client") from exc
15
22
 
16
23
  try:
17
- prepare_sft_job_payload = importlib.import_module("synth_ai.learning.sft.config").prepare_sft_job_payload
24
+ _sft_config_module = cast(
25
+ Any, importlib.import_module("synth_ai.learning.sft.config")
26
+ )
27
+ prepare_sft_job_payload = cast(
28
+ Callable[..., dict[str, Any]], _sft_config_module.prepare_sft_job_payload
29
+ )
18
30
  except Exception as exc: # pragma: no cover - critical dependency
19
31
  raise RuntimeError("Unable to load SFT configuration helpers") from exc
20
32
 
synth_ai/judge_schemas.py CHANGED
@@ -9,7 +9,7 @@ This is the canonical contract that the backend MUST conform to.
9
9
 
10
10
  from __future__ import annotations
11
11
 
12
- from typing import Any, Dict, List, Literal, Optional
12
+ from typing import Any, Literal
13
13
 
14
14
  from pydantic import BaseModel, Field
15
15
 
@@ -26,12 +26,12 @@ class CriterionScorePayload(BaseModel):
26
26
  class ReviewPayload(BaseModel):
27
27
  """Rubric review (event-level or outcome-level)."""
28
28
 
29
- criteria: Dict[str, CriterionScorePayload] = Field(
29
+ criteria: dict[str, CriterionScorePayload] = Field(
30
30
  default_factory=dict,
31
31
  description="Map of criterion keys to their scores"
32
32
  )
33
33
  total: float = Field(default=0.0, description="Aggregated total score")
34
- summary: Optional[str] = Field(None, description="Optional text summary")
34
+ summary: str | None = Field(None, description="Optional text summary")
35
35
 
36
36
 
37
37
  class JudgeScoreResponse(BaseModel):
@@ -42,23 +42,23 @@ class JudgeScoreResponse(BaseModel):
42
42
  """
43
43
 
44
44
  status: Literal["ok", "failed"] = Field(default="ok", description="Request status")
45
- event_reviews: List[ReviewPayload] = Field(
45
+ event_reviews: list[ReviewPayload] = Field(
46
46
  default_factory=list,
47
47
  description="List of per-event rubric reviews (one per step)"
48
48
  )
49
- outcome_review: Optional[ReviewPayload] = Field(
49
+ outcome_review: ReviewPayload | None = Field(
50
50
  None,
51
51
  description="Optional outcome-level rubric review"
52
52
  )
53
- event_totals: List[float] = Field(
53
+ event_totals: list[float] = Field(
54
54
  default_factory=list,
55
55
  description="List of aggregated scores per event (matches event_reviews length)"
56
56
  )
57
- details: Dict[str, Any] = Field(
57
+ details: dict[str, Any] = Field(
58
58
  default_factory=dict,
59
59
  description="Additional details (provider, latency, etc.)"
60
60
  )
61
- metadata: Dict[str, Any] = Field(
61
+ metadata: dict[str, Any] = Field(
62
62
  default_factory=dict,
63
63
  description="Request metadata (provider, options, etc.)"
64
64
  )
@@ -92,15 +92,15 @@ class JudgeTaskApp(BaseModel):
92
92
  """Task application metadata."""
93
93
 
94
94
  id: str = Field(..., description="Task app identifier")
95
- base_url: Optional[str] = Field(None, description="Optional base URL for task app")
95
+ base_url: str | None = Field(None, description="Optional base URL for task app")
96
96
 
97
97
 
98
98
  class JudgeOptions(BaseModel):
99
99
  """Judge provider and configuration options."""
100
100
 
101
- provider: Optional[str] = Field(None, description="Judge provider (e.g., 'openai', 'groq')")
102
- model: Optional[str] = Field(None, description="Model identifier")
103
- rubric_id: Optional[str] = Field(None, description="Rubric identifier")
101
+ provider: str | None = Field(None, description="Judge provider (e.g., 'openai', 'groq')")
102
+ model: str | None = Field(None, description="Model identifier")
103
+ rubric_id: str | None = Field(None, description="Rubric identifier")
104
104
  event: bool = Field(True, description="Enable event-level judging")
105
105
  outcome: bool = Field(True, description="Enable outcome-level judging")
106
106
 
@@ -108,12 +108,12 @@ class JudgeOptions(BaseModel):
108
108
  class JudgeTracePayload(BaseModel):
109
109
  """Trace payload containing trajectory context."""
110
110
 
111
- event_history: List[Dict[str, Any]] = Field(..., description="List of events/steps")
112
- markov_blanket_message_history: List[Dict[str, Any]] = Field(
111
+ event_history: list[dict[str, Any]] = Field(..., description="List of events/steps")
112
+ markov_blanket_message_history: list[dict[str, Any]] = Field(
113
113
  default_factory=list,
114
114
  description="Optional message history for context"
115
115
  )
116
- metadata: Dict[str, Any] = Field(default_factory=dict, description="Trace metadata")
116
+ metadata: dict[str, Any] = Field(default_factory=dict, description="Trace metadata")
117
117
 
118
118
 
119
119
  class JudgeScoreRequest(BaseModel):
@@ -123,5 +123,5 @@ class JudgeScoreRequest(BaseModel):
123
123
  task_app: JudgeTaskApp = Field(..., description="Task application metadata")
124
124
  trace: JudgeTracePayload = Field(..., description="Trajectory trace to evaluate")
125
125
  options: JudgeOptions = Field(default_factory=lambda: JudgeOptions(), description="Judge options")
126
- rubric: Optional[Dict[str, Any]] = Field(None, description="Optional explicit rubric criteria")
126
+ rubric: dict[str, Any] | None = Field(None, description="Optional explicit rubric criteria")
127
127
 
synth_ai/py.typed ADDED
File without changes
synth_ai/task/__init__.py CHANGED
@@ -5,6 +5,9 @@ from .auth import (
5
5
  )
6
6
  from .client import TaskAppClient
7
7
  from .contracts import (
8
+ DatasetInfo,
9
+ InferenceInfo,
10
+ LimitsInfo,
8
11
  RolloutEnvSpec,
9
12
  RolloutMetrics,
10
13
  RolloutPolicySpec,
@@ -14,8 +17,10 @@ from .contracts import (
14
17
  RolloutSafetyConfig,
15
18
  RolloutStep,
16
19
  RolloutTrajectory,
17
- TaskAppContract,
20
+ RubricInfo,
21
+ RubricSection,
18
22
  TaskAppEndpoints,
23
+ TaskDescriptor,
19
24
  TaskInfo,
20
25
  )
21
26
  from .datasets import TaskDatasetRegistry, TaskDatasetSpec
@@ -23,7 +28,6 @@ from .errors import error_payload, http_exception, json_error_response
23
28
  from .health import task_app_health
24
29
  from .json import to_jsonable
25
30
  from .proxy import (
26
- INTERACT_TOOL_SCHEMA,
27
31
  extract_message_text,
28
32
  inject_system_hint,
29
33
  parse_tool_call_from_text,
@@ -46,7 +50,7 @@ from .server import (
46
50
  create_task_app,
47
51
  run_task_app,
48
52
  )
49
- from .validators import validate_task_app_url
53
+ from .validators import validate_task_app_endpoint, validate_task_app_url
50
54
  from .vendors import (
51
55
  get_groq_key_or_503,
52
56
  get_openai_key_or_503,
@@ -55,8 +59,8 @@ from .vendors import (
55
59
 
56
60
  __all__ = [
57
61
  "validate_task_app_url",
62
+ "validate_task_app_endpoint",
58
63
  "task_app_health",
59
- "TaskAppContract",
60
64
  "TaskAppEndpoints",
61
65
  "RolloutEnvSpec",
62
66
  "RolloutPolicySpec",
@@ -67,6 +71,12 @@ __all__ = [
67
71
  "RolloutTrajectory",
68
72
  "RolloutStep",
69
73
  "RolloutMetrics",
74
+ "TaskDescriptor",
75
+ "DatasetInfo",
76
+ "RubricInfo",
77
+ "RubricSection",
78
+ "InferenceInfo",
79
+ "LimitsInfo",
70
80
  "TaskInfo",
71
81
  "to_jsonable",
72
82
  "normalize_environment_api_key",
@@ -75,7 +85,6 @@ __all__ = [
75
85
  "normalize_vendor_keys",
76
86
  "get_openai_key_or_503",
77
87
  "get_groq_key_or_503",
78
- "INTERACT_TOOL_SCHEMA",
79
88
  "prepare_for_openai",
80
89
  "prepare_for_groq",
81
90
  "inject_system_hint",