synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (169):
  1. examples/baseline/banking77_baseline.py +204 -0
  2. examples/baseline/crafter_baseline.py +407 -0
  3. examples/baseline/pokemon_red_baseline.py +326 -0
  4. examples/baseline/simple_baseline.py +56 -0
  5. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  6. examples/blog_posts/gepa/README.md +355 -0
  7. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  9. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  10. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  13. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  15. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  16. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  18. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  19. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  20. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  21. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  22. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  23. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  24. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  25. examples/blog_posts/gepa/task_apps.py +105 -0
  26. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  27. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  28. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  29. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
  30. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
  31. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  32. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  33. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  34. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  35. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  36. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  37. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  38. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  39. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  40. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  41. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  42. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  43. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
  44. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  45. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
  46. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
  47. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  48. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  49. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  50. examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
  51. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
  52. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
  53. examples/rl/configs/rl_from_base_qwen17.toml +1 -0
  54. examples/swe/task_app/hosted/inference/openai_client.py +0 -34
  55. examples/swe/task_app/hosted/policy_routes.py +17 -0
  56. examples/swe/task_app/hosted/rollout.py +4 -2
  57. examples/task_apps/banking77/__init__.py +6 -0
  58. examples/task_apps/banking77/banking77_task_app.py +841 -0
  59. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  60. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  61. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  62. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  63. examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
  64. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  65. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
  66. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
  67. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
  68. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  69. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  70. examples/task_apps/gepa_benchmarks/common.py +260 -0
  71. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  72. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  73. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  74. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  75. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  76. examples/task_apps/pokemon_red/task_app.py +254 -36
  77. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
  78. examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
  79. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  80. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
  81. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
  82. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  83. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  84. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
  85. synth_ai/api/train/builders.py +90 -1
  86. synth_ai/api/train/cli.py +396 -21
  87. synth_ai/api/train/config_finder.py +13 -2
  88. synth_ai/api/train/configs/__init__.py +15 -1
  89. synth_ai/api/train/configs/prompt_learning.py +442 -0
  90. synth_ai/api/train/configs/rl.py +29 -0
  91. synth_ai/api/train/task_app.py +1 -1
  92. synth_ai/api/train/validators.py +277 -0
  93. synth_ai/baseline/__init__.py +25 -0
  94. synth_ai/baseline/config.py +209 -0
  95. synth_ai/baseline/discovery.py +214 -0
  96. synth_ai/baseline/execution.py +146 -0
  97. synth_ai/cli/__init__.py +85 -17
  98. synth_ai/cli/__main__.py +0 -0
  99. synth_ai/cli/claude.py +70 -0
  100. synth_ai/cli/codex.py +84 -0
  101. synth_ai/cli/commands/__init__.py +1 -0
  102. synth_ai/cli/commands/baseline/__init__.py +12 -0
  103. synth_ai/cli/commands/baseline/core.py +637 -0
  104. synth_ai/cli/commands/baseline/list.py +93 -0
  105. synth_ai/cli/commands/eval/core.py +13 -10
  106. synth_ai/cli/commands/filter/core.py +53 -17
  107. synth_ai/cli/commands/help/core.py +0 -1
  108. synth_ai/cli/commands/smoke/__init__.py +7 -0
  109. synth_ai/cli/commands/smoke/core.py +1436 -0
  110. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  111. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  112. synth_ai/cli/commands/train/judge_schemas.py +1 -0
  113. synth_ai/cli/commands/train/judge_validation.py +1 -0
  114. synth_ai/cli/commands/train/validation.py +0 -57
  115. synth_ai/cli/demo.py +35 -3
  116. synth_ai/cli/deploy/__init__.py +40 -25
  117. synth_ai/cli/deploy.py +162 -0
  118. synth_ai/cli/legacy_root_backup.py +14 -8
  119. synth_ai/cli/opencode.py +107 -0
  120. synth_ai/cli/root.py +9 -5
  121. synth_ai/cli/task_app_deploy.py +1 -1
  122. synth_ai/cli/task_apps.py +53 -53
  123. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  124. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  125. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  126. synth_ai/judge_schemas.py +1 -0
  127. synth_ai/learning/__init__.py +10 -0
  128. synth_ai/learning/prompt_learning_client.py +276 -0
  129. synth_ai/learning/prompt_learning_types.py +184 -0
  130. synth_ai/pricing/__init__.py +2 -0
  131. synth_ai/pricing/model_pricing.py +57 -0
  132. synth_ai/streaming/handlers.py +53 -4
  133. synth_ai/streaming/streamer.py +19 -0
  134. synth_ai/task/apps/__init__.py +1 -0
  135. synth_ai/task/config.py +2 -0
  136. synth_ai/task/tracing_utils.py +25 -25
  137. synth_ai/task/validators.py +44 -8
  138. synth_ai/task_app_cfgs.py +21 -0
  139. synth_ai/tracing_v3/config.py +162 -19
  140. synth_ai/tracing_v3/constants.py +1 -1
  141. synth_ai/tracing_v3/db_config.py +24 -38
  142. synth_ai/tracing_v3/storage/config.py +47 -13
  143. synth_ai/tracing_v3/storage/factory.py +3 -3
  144. synth_ai/tracing_v3/turso/daemon.py +113 -11
  145. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  146. synth_ai/types.py +8 -0
  147. synth_ai/urls.py +11 -0
  148. synth_ai/utils/__init__.py +30 -1
  149. synth_ai/utils/agents.py +74 -0
  150. synth_ai/utils/bin.py +39 -0
  151. synth_ai/utils/cli.py +149 -5
  152. synth_ai/utils/env.py +17 -17
  153. synth_ai/utils/json.py +72 -0
  154. synth_ai/utils/modal.py +283 -1
  155. synth_ai/utils/paths.py +48 -0
  156. synth_ai/utils/uvicorn.py +113 -0
  157. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
  158. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
  159. synth_ai/cli/commands/deploy/__init__.py +0 -23
  160. synth_ai/cli/commands/deploy/core.py +0 -614
  161. synth_ai/cli/commands/deploy/errors.py +0 -72
  162. synth_ai/cli/commands/deploy/validation.py +0 -11
  163. synth_ai/cli/deploy/core.py +0 -5
  164. synth_ai/cli/deploy/errors.py +0 -23
  165. synth_ai/cli/deploy/validation.py +0 -5
  166. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  167. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  168. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  169. {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,436 @@
1
+ """HoVer claim verification task app for Synth prompt optimisation benchmarks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import os
7
+ import uuid
8
+ from collections.abc import Iterable, Sequence
9
+ from pathlib import Path
10
+ from typing import Any, Mapping, cast
11
+
12
+ from datasets import load_dataset
13
+ from fastapi import APIRouter, HTTPException, Request
14
+
15
+ from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
16
+ from synth_ai.task.contracts import (
17
+ RolloutMetrics,
18
+ RolloutRequest,
19
+ RolloutResponse,
20
+ RolloutStep,
21
+ RolloutTrajectory,
22
+ TaskInfo,
23
+ )
24
+ from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
25
+ from synth_ai.task.rubrics import Rubric, load_rubric
26
+ from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
27
+ from synth_ai.task.vendors import normalize_vendor_keys
28
+
29
+ from .common import call_chat_completion, normalise_answer
30
+
31
# Fourth ancestor directory of this file (``parents[3]``), used as the repository root.
REPO_ROOT = Path(__file__).resolve().parents[3]

# Hugging Face dataset identifier and the splits this task app exposes.
DATASET_ID = "Dzeniks/hover"
DEFAULT_SPLIT = "test"
AVAILABLE_SPLITS: tuple[str, ...] = ("train", "test")


# Router passed to the task app config; no routes are attached in the visible code.
hover_router = APIRouter()


# Dataset descriptor that is registered with the task dataset registry.
HOVER_DATASET_SPEC = TaskDatasetSpec(
    id="hover",
    name="HoVer Claim Verification",
    version="1.0.0",
    splits=[*AVAILABLE_SPLITS],
    default_split=DEFAULT_SPLIT,
    description="Claim verification with supporting evidence passages.",
)

# Integer label id from the dataset row -> textual label used for scoring.
LABEL_MAP = {0: "SUPPORTED", 1: "REFUTED"}
54
+
55
+
56
class HoVerDataset:
    """Per-split cache over the HoVer dataset with index-based sampling."""

    def __init__(self) -> None:
        # Loaded splits keyed by split name; populated on first access.
        self._cache: dict[str, Any] = {}

    def _load_split(self, split: str):
        """Return the dataset for ``split``, loading and caching it on first use.

        Raises:
            ValueError: if ``split`` is not one of ``AVAILABLE_SPLITS``.
            RuntimeError: if ``load_dataset`` fails for any reason.
        """
        if split not in AVAILABLE_SPLITS:
            raise ValueError(f"Unknown split '{split}'. Available: {AVAILABLE_SPLITS}")
        if split in self._cache:
            return self._cache[split]
        try:
            loaded = load_dataset(DATASET_ID, split=split)
        except Exception as exc:  # pragma: no cover
            raise RuntimeError(
                f"Failed to download HoVer split '{split}'. "
                "Ensure network access to Hugging Face."
            ) from exc
        self._cache[split] = loaded
        return loaded

    def ensure_ready(self, splits: Sequence[str]) -> None:
        """Eagerly load every split named in ``splits``."""
        for name in splits:
            self._load_split(name)

    def size(self, split: str) -> int:
        """Return the number of rows in ``split``."""
        return len(self._load_split(split))

    def sample(self, *, split: str, index: int) -> dict[str, Any]:
        """Return the row at ``index`` (wrapped modulo the split size) as a plain dict.

        The result has the keys ``index``, ``split``, ``claim``, ``evidence``
        and ``label``. A missing/falsy label is treated as ``0``, and label ids
        absent from ``LABEL_MAP`` fall back to ``"SUPPORTED"``.

        Raises:
            RuntimeError: if the split contains no rows.
        """
        rows = self._load_split(split)
        total = len(rows)
        if total == 0:
            raise RuntimeError(f"HoVer split '{split}' is empty")

        # Wrap the requested index so any integer seed maps onto a valid row.
        position = int(index) % total
        record = rows[int(position)]

        raw_label = record.get("label") or 0
        label_text = LABEL_MAP.get(int(raw_label), "SUPPORTED")
        evidence_text = str(record.get("evidence") or "").strip()
        claim_text = str(record.get("claim") or "")

        return {
            "index": position,
            "split": split,
            "claim": claim_text,
            "evidence": evidence_text,
            "label": label_text,
        }
102
+
103
+
104
+ def _parse_label(response_text: str) -> tuple[str, str]:
105
+ if not response_text:
106
+ return "", ""
107
+ lower = response_text.lower()
108
+ label = ""
109
+ rationale = ""
110
+ if "label:" in lower:
111
+ fragment = lower.split("label:", 1)[1]
112
+ label_line = fragment.splitlines()[0]
113
+ label = label_line.strip().upper()
114
+ else:
115
+ # fallback to first word
116
+ label = response_text.strip().split()[0].upper()
117
+ if "rationale:" in lower:
118
+ rationale_fragment = lower.split("rationale:", 1)[1]
119
+ rationale = rationale_fragment.strip()
120
+ return label, rationale
121
+
122
+
123
async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
    """Run one single-step HoVer rollout and score the predicted label.

    The env seed is used as the dataset index and the env config may select a
    split. The policy is queried once via ``call_chat_completion``; errors from
    that call are recorded in the step info instead of being raised.

    Args:
        request: Rollout request; ``env.config``, ``env.seed``, ``policy.config``,
            ``policy.policy_id``/``policy_name``, ``record`` and ``run_id`` are read.
        fastapi_request: Incoming request; the dataset is taken from
            ``app.state.hover_dataset``.

    Returns:
        A ``RolloutResponse`` holding one trajectory with one step, plus an
        optional trace payload.
    """
    hover_data: HoVerDataset = fastapi_request.app.state.hover_dataset

    env_config = request.env.config or {}
    split = str(env_config.get("split") or DEFAULT_SPLIT)
    seed = request.env.seed or 0

    sample = hover_data.sample(split=split, index=seed)
    claim = sample["claim"]
    evidence = sample["evidence"]

    observation = {
        "claim": claim,
        "evidence": evidence,
        "index": sample["index"],
        "split": sample["split"],
    }
    placeholders = {"claim": claim, "evidence": evidence}

    # Prompt templates handed to ``call_chat_completion`` together with the placeholders.
    system_message = {
        "role": "system",
        "pattern": (
            "You verify Wikipedia claims. Decide whether each claim is SUPPORTED or REFUTED "
            "by the evidence provided. Respond with the format:\n"
            "Label: <SUPPORTED|REFUTED>\nRationale: <short explanation>."
        ),
    }
    user_message = {
        "role": "user",
        "pattern": "Claim: {claim}\n\nEvidence:\n{evidence}\n\nReturn the label and rationale.",
    }
    default_messages = [system_message, user_message]

    response_json: dict[str, Any] | None = None
    response_text = ""
    error_info: dict[str, Any] = {}

    # Best effort: a failed completion is reported through ``error_info`` and
    # scoring proceeds on the empty response text.
    try:
        response_text, response_json, _ = await call_chat_completion(
            request.policy.config or {},
            placeholders,
            default_messages,
        )
    except HTTPException as http_err:  # pragma: no cover
        error_info = {"error": str(http_err.detail), "code": http_err.status_code}
    except Exception as exc:  # pragma: no cover
        error_info = {"error": str(exc)}

    predicted_label, rationale = _parse_label(response_text)
    expected_label = sample["label"]

    # Compare normalised labels on the first five characters of the expected one.
    # NOTE(review): prefix matching presumably tolerates trailing text/punctuation — confirm.
    prediction_norm = normalise_answer(predicted_label)
    expected_norm = normalise_answer(expected_label)
    is_correct = prediction_norm.startswith(expected_norm[:5])
    reward = 1.0 if is_correct else 0.0

    info_payload = {
        "expected_label": expected_label,
        "predicted_label": predicted_label,
        "rationale": rationale,
        "response_json": response_json,
        "correct": is_correct,
        **error_info,
    }

    # Logging must never break the rollout.
    with contextlib.suppress(Exception):
        print(
            f"[HOVER_ROLLOUT] run_id={request.run_id} split={sample['split']} "
            f"index={sample['index']} expected={expected_label} predicted={predicted_label} "
            f"reward={reward}",
            flush=True,
        )

    step = RolloutStep(
        obs=observation,
        tool_calls=[],
        reward=reward,
        done=True,
        info=info_payload,
    )

    inference_url = (request.policy.config or {}).get("inference_url")
    policy_id = request.policy.policy_id or request.policy.policy_name or "policy"
    trajectory = RolloutTrajectory(
        env_id=f"hover::{sample['split']}::{sample['index']}",
        policy_id=policy_id,
        steps=[step],
        final={"observation": observation, "reward": reward},
        length=1,
        inference_url=str(inference_url or ""),
    )

    metrics = RolloutMetrics(
        episode_returns=[reward],
        mean_return=reward,
        num_steps=1,
        num_episodes=1,
        outcome_score=reward,
        events_score=reward,
        details={"correct": is_correct},
    )

    # A trace is attached when the request asks for it or the env var is set.
    # NOTE(review): any non-empty TASKAPP_TRACING_ENABLED value (even "0") enables it.
    include_trace = bool(
        (request.record and getattr(request.record, "return_trace", False))
        or os.getenv("TASKAPP_TRACING_ENABLED")
    )
    trace_payload = None
    if include_trace:
        trace_metadata = {
            "env": "hover",
            "split": sample["split"],
            "index": sample["index"],
            "correct": is_correct,
        }
        trace_payload = {
            "session_id": str(uuid.uuid4()),
            "events_count": 1,
            "decision_rewards": [reward],
            "metadata": trace_metadata,
        }

    return RolloutResponse(
        run_id=request.run_id,
        trajectories=[trajectory],
        branches={},
        metrics=metrics,
        aborted=False,
        ops_executed=2,
        trace=trace_payload,
    )
253
+
254
+
255
def build_dataset() -> tuple[TaskDatasetRegistry, HoVerDataset]:
    """Create the dataset registry and a HoVer dataset with the default split loaded.

    Returns:
        ``(registry, dataset)`` where the registry resolves ``HOVER_DATASET_SPEC``
        to the same ``dataset`` instance that is returned.
    """
    registry = TaskDatasetRegistry()
    hover_data = HoVerDataset()
    # Load the default split up front so load failures surface here.
    hover_data.ensure_ready([DEFAULT_SPLIT])

    def _factory(_spec):
        return hover_data

    registry.register(HOVER_DATASET_SPEC, _factory, cache=True)
    return registry, hover_data
261
+
262
+
263
def _base_task_info() -> TaskInfo:
    """Build the static ``TaskInfo`` shared by every HoVer task instance."""
    action_space = {
        "type": "free_text",
        "description": "Return a label (SUPPORTED/REFUTED) and short rationale.",
    }
    task_section = {
        "id": "hover",
        "name": "HoVer Claim Verification",
        "version": "1.0.0",
        "action_space": action_space,
    }
    # Dataset spec fields plus the Hugging Face identifier.
    dataset_section = {
        **HOVER_DATASET_SPEC.model_dump(),
        "hf_dataset": DATASET_ID,
    }
    rubric_section = {
        "version": "1",
        "criteria_count": 1,
        "source": "inline",
    }
    inference_section = {
        "supports_proxy": True,
        "tool": None,
    }
    return TaskInfo(
        task=task_section,
        environment="hover",
        dataset=dataset_section,
        rubric=rubric_section,
        inference=inference_section,
        limits={"max_turns": 1},
        task_metadata={"format": "Label: ... / Rationale: ..."},
    )
291
+
292
+
293
def describe_taskset(dataset: HoVerDataset) -> Mapping[str, Any]:
    """Describe the HoVer task set: spec fields, HF id, label map and split sizes.

    Computing the sizes calls ``dataset.size`` for every entry in
    ``AVAILABLE_SPLITS``.
    """
    summary: dict[str, Any] = {**HOVER_DATASET_SPEC.model_dump()}
    summary["hf_dataset"] = DATASET_ID
    summary["label_map"] = LABEL_MAP

    split_sizes: dict[str, int] = {}
    for split_name in AVAILABLE_SPLITS:
        split_sizes[split_name] = dataset.size(split_name)
    summary["sizes"] = split_sizes

    return summary
300
+
301
+
302
def provide_task_instances(dataset: HoVerDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
    """Lazily yield one ``TaskInfo`` per seed, drawn from the default split.

    Each yielded item reuses the base task info and adds the sample's split and
    index to ``dataset`` and its claim to ``task_metadata``.
    """
    template = _base_task_info()
    for seed in seeds:
        picked = dataset.sample(split=DEFAULT_SPLIT, index=seed)

        instance_dataset = {
            **template.dataset,
            "split": picked["split"],
            "index": picked["index"],
        }
        instance_metadata = {
            **template.task_metadata,
            "claim": picked["claim"],
        }

        yield TaskInfo(
            task=template.task,
            environment=template.environment,
            dataset=instance_dataset,
            rubric=template.rubric,
            inference=template.inference,
            limits=template.limits,
            task_metadata=instance_metadata,
        )
322
+
323
+
324
# Outcome rubric: a single criterion on whether the claim was labelled correctly.
OUTCOME_RUBRIC: Rubric = cast(
    Rubric,
    load_rubric(
        {
            "version": "1",
            "goal_text": "Assign the correct label (SUPPORTED or REFUTED) to each claim.",
            "aggregation": "weighted_sum",
            "criteria": [
                {
                    "id": "label_accuracy",
                    "description": "Correctly classify the claim.",
                    "weight": 1.0,
                },
            ],
        },
    ),
)

# Events rubric: a single criterion on the rationale accompanying the label.
EVENTS_RUBRIC: Rubric = cast(
    Rubric,
    load_rubric(
        {
            "version": "1",
            "goal_text": "Include a concise rationale referencing the evidence.",
            "aggregation": "weighted_sum",
            "criteria": [
                {
                    "id": "rationale_quality",
                    "description": "Provide a short rationale referencing the provided evidence.",
                    "weight": 1.0,
                },
            ],
        },
    ),
)
359
+
360
+
361
def build_config() -> TaskAppConfig:
    """Assemble the ``TaskAppConfig`` for the HoVer task app.

    Builds the dataset (loading the default split), enables the OpenAI/Groq
    proxies according to which vendor keys ``normalize_vendor_keys`` reports,
    and wires the rollout executor, rubrics, router and app state together.
    """
    registry, hover_data = build_dataset()
    base_info = _base_task_info()

    vendor_keys = normalize_vendor_keys()
    has_openai = vendor_keys.get("OPENAI_API_KEY") is not None
    has_groq = vendor_keys.get("GROQ_API_KEY") is not None
    proxy_config = ProxyConfig(
        enable_openai=has_openai,
        enable_groq=has_groq,
        system_hint="Return 'Label: ...' followed by 'Rationale: ...'.",
    )

    def _describe():
        return describe_taskset(hover_data)

    def _instances(seeds):
        return provide_task_instances(hover_data, seeds)

    return TaskAppConfig(
        app_id="hover",
        name="HoVer Claim Verification Task",
        description="HoVer dataset task app for verifying claims with supporting passages.",
        base_task_info=base_info,
        describe_taskset=_describe,
        provide_task_instances=_instances,
        rollout=rollout_executor,
        dataset_registry=registry,
        rubrics=RubricBundle(outcome=OUTCOME_RUBRIC, events=EVENTS_RUBRIC),
        proxy=proxy_config,
        routers=(hover_router,),
        # The rollout executor reads the dataset back from ``app.state.hover_dataset``.
        app_state={"hover_dataset": hover_data},
        cors_origins=["*"],
    )
388
+
389
+
390
# Register the app at import time under the id "hover" and the alias "hover-claims".
register_task_app(
    entry=TaskAppEntry(
        app_id="hover",
        description="HoVer claim verification task app using the Dzeniks/hover dataset.",
        config_factory=build_config,
        aliases=("hover-claims",),
        modal=ModalDeploymentConfig(
            app_name="synth-hover",
            pip_packages=(
                "datasets>=2.14.0",
                "fastapi>=0.115.0",
                "pydantic>=2.0.0",
                "httpx>=0.26.0",
            ),
            # (local path, remote path) pair for the local ``synth_ai`` package directory.
            extra_local_dirs=(
                (str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
            ),
        ),
    )
)
408
+
409
+
410
if __name__ == "__main__":  # pragma: no cover - manual helper
    # Manual entry point: parse CLI flags and serve the HoVer task app locally.
    import argparse
    from synth_ai.task.server import run_task_app

    cli = argparse.ArgumentParser(description="Run the HoVer task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8112)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # Load the ``.env`` two directories above this file's folder first, when it
    # exists, followed by any files given with --env-file.
    default_env = Path(__file__).resolve().parents[2] / ".env"
    env_files = []
    if default_env.exists():
        env_files.append(str(default_env))
    env_files.extend(options.env_file or [])

    run_task_app(
        build_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=env_files,
    )