synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff shows the changes between the publicly released package versions as they appear in their public registries and is provided for informational purposes only.
This release of synth-ai has been flagged as potentially problematic.
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
- examples/rl/configs/rl_from_base_qwen17.toml +1 -0
- examples/swe/task_app/hosted/inference/openai_client.py +0 -34
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/task_app.py +254 -36
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
- synth_ai/api/train/builders.py +90 -1
- synth_ai/api/train/cli.py +396 -21
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +15 -1
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +29 -0
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +85 -17
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +1 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/eval/core.py +13 -10
- synth_ai/cli/commands/filter/core.py +53 -17
- synth_ai/cli/commands/help/core.py +0 -1
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/train/judge_schemas.py +1 -0
- synth_ai/cli/commands/train/judge_validation.py +1 -0
- synth_ai/cli/commands/train/validation.py +0 -57
- synth_ai/cli/demo.py +35 -3
- synth_ai/cli/deploy/__init__.py +40 -25
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/task_app_deploy.py +1 -1
- synth_ai/cli/task_apps.py +53 -53
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/judge_schemas.py +1 -0
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/handlers.py +53 -4
- synth_ai/streaming/streamer.py +19 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +44 -8
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +17 -17
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +283 -1
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
- synth_ai/cli/commands/deploy/__init__.py +0 -23
- synth_ai/cli/commands/deploy/core.py +0 -614
- synth_ai/cli/commands/deploy/errors.py +0 -72
- synth_ai/cli/commands/deploy/validation.py +0 -11
- synth_ai/cli/deploy/core.py +0 -5
- synth_ai/cli/deploy/errors.py +0 -23
- synth_ai/cli/deploy/validation.py +0 -5
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
examples/task_apps/gepa_benchmarks/hover_task_app.py (new file)

@@ -0,0 +1,436 @@
"""HoVer claim verification task app for Synth prompt optimisation benchmarks."""

from __future__ import annotations

import contextlib
import os
import uuid
from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any, Mapping, cast

from datasets import load_dataset
from fastapi import APIRouter, HTTPException, Request

from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
from synth_ai.task.contracts import (
    RolloutMetrics,
    RolloutRequest,
    RolloutResponse,
    RolloutStep,
    RolloutTrajectory,
    TaskInfo,
)
from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
from synth_ai.task.rubrics import Rubric, load_rubric
from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
from synth_ai.task.vendors import normalize_vendor_keys

from .common import call_chat_completion, normalise_answer

REPO_ROOT = Path(__file__).resolve().parents[3]

DATASET_ID = "Dzeniks/hover"
DEFAULT_SPLIT = "test"
AVAILABLE_SPLITS: tuple[str, ...] = ("train", "test")


hover_router = APIRouter()


HOVER_DATASET_SPEC = TaskDatasetSpec(
    id="hover",
    name="HoVer Claim Verification",
    version="1.0.0",
    splits=list(AVAILABLE_SPLITS),
    default_split=DEFAULT_SPLIT,
    description="Claim verification with supporting evidence passages.",
)

LABEL_MAP = {
    0: "SUPPORTED",
    1: "REFUTED",
}


class HoVerDataset:
    """Thin wrapper around the HoVer dataset for sampling."""

    def __init__(self) -> None:
        self._cache: dict[str, Any] = {}

    def _load_split(self, split: str):
        if split not in AVAILABLE_SPLITS:
            raise ValueError(f"Unknown split '{split}'. Available: {AVAILABLE_SPLITS}")
        if split not in self._cache:
            try:
                self._cache[split] = load_dataset(DATASET_ID, split=split)
            except Exception as exc:  # pragma: no cover
                raise RuntimeError(
                    f"Failed to download HoVer split '{split}'. "
                    "Ensure network access to Hugging Face."
                ) from exc
        return self._cache[split]

    def ensure_ready(self, splits: Sequence[str]) -> None:
        for split in splits:
            self._load_split(split)

    def size(self, split: str) -> int:
        dataset = self._load_split(split)
        return len(dataset)

    def sample(self, *, split: str, index: int) -> dict[str, Any]:
        dataset = self._load_split(split)
        size = len(dataset)
        if size == 0:
            raise RuntimeError(f"HoVer split '{split}' is empty")
        idx = int(index) % size
        row = dataset[int(idx)]

        label_idx = int(row.get("label") or 0)
        label_text = LABEL_MAP.get(label_idx, "SUPPORTED")
        evidence = str(row.get("evidence") or "").strip()

        return {
            "index": idx,
            "split": split,
            "claim": str(row.get("claim") or ""),
            "evidence": evidence,
            "label": label_text,
        }


def _parse_label(response_text: str) -> tuple[str, str]:
    if not response_text:
        return "", ""
    lower = response_text.lower()
    label = ""
    rationale = ""
    if "label:" in lower:
        fragment = lower.split("label:", 1)[1]
        label_line = fragment.splitlines()[0]
        label = label_line.strip().upper()
    else:
        # fallback to first word
        label = response_text.strip().split()[0].upper()
    if "rationale:" in lower:
        rationale_fragment = lower.split("rationale:", 1)[1]
        rationale = rationale_fragment.strip()
    return label, rationale


async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
    dataset: HoVerDataset = fastapi_request.app.state.hover_dataset

    split = str(((request.env.config or {}).get("split")) or DEFAULT_SPLIT)
    seed = request.env.seed or 0

    sample = dataset.sample(split=split, index=seed)
    observation = {
        "claim": sample["claim"],
        "evidence": sample["evidence"],
        "index": sample["index"],
        "split": sample["split"],
    }

    placeholders = {
        "claim": sample["claim"],
        "evidence": sample["evidence"],
    }

    default_messages = [
        {
            "role": "system",
            "pattern": (
                "You verify Wikipedia claims. Decide whether each claim is SUPPORTED or REFUTED "
                "by the evidence provided. Respond with the format:\n"
                "Label: <SUPPORTED|REFUTED>\nRationale: <short explanation>."
            ),
        },
        {
            "role": "user",
            "pattern": "Claim: {claim}\n\nEvidence:\n{evidence}\n\nReturn the label and rationale.",
        },
    ]

    response_json: dict[str, Any] | None = None
    response_text = ""
    error_info: dict[str, Any] = {}

    try:
        response_text, response_json, _ = await call_chat_completion(
            request.policy.config or {},
            placeholders,
            default_messages,
        )
    except HTTPException as http_err:  # pragma: no cover
        error_info = {"error": str(http_err.detail), "code": http_err.status_code}
    except Exception as exc:  # pragma: no cover
        error_info = {"error": str(exc)}

    predicted_label, rationale = _parse_label(response_text)
    expected_label = sample["label"]

    # Normalise label (strip punctuation, match synonyms)
    normalised_prediction = normalise_answer(predicted_label)
    normalised_expected = normalise_answer(expected_label)
    is_correct = normalised_prediction.startswith(normalised_expected[:5])
    reward = 1.0 if is_correct else 0.0

    info_payload = {
        "expected_label": expected_label,
        "predicted_label": predicted_label,
        "rationale": rationale,
        "response_json": response_json,
        "correct": is_correct,
        **error_info,
    }

    with contextlib.suppress(Exception):
        print(
            f"[HOVER_ROLLOUT] run_id={request.run_id} split={sample['split']} "
            f"index={sample['index']} expected={expected_label} predicted={predicted_label} "
            f"reward={reward}",
            flush=True,
        )

    step = RolloutStep(
        obs=observation,
        tool_calls=[],
        reward=reward,
        done=True,
        info=info_payload,
    )

    inference_url = (request.policy.config or {}).get("inference_url")
    trajectory = RolloutTrajectory(
        env_id=f"hover::{sample['split']}::{sample['index']}",
        policy_id=request.policy.policy_id or request.policy.policy_name or "policy",
        steps=[step],
        final={"observation": observation, "reward": reward},
        length=1,
        inference_url=str(inference_url or ""),
    )

    metrics = RolloutMetrics(
        episode_returns=[reward],
        mean_return=reward,
        num_steps=1,
        num_episodes=1,
        outcome_score=reward,
        events_score=reward,
        details={"correct": is_correct},
    )

    trace_payload = None
    include_trace = bool(
        (request.record and getattr(request.record, "return_trace", False))
        or os.getenv("TASKAPP_TRACING_ENABLED")
    )
    if include_trace:
        trace_payload = {
            "session_id": str(uuid.uuid4()),
            "events_count": 1,
            "decision_rewards": [reward],
            "metadata": {
                "env": "hover",
                "split": sample["split"],
                "index": sample["index"],
                "correct": is_correct,
            },
        }

    return RolloutResponse(
        run_id=request.run_id,
        trajectories=[trajectory],
        branches={},
        metrics=metrics,
        aborted=False,
        ops_executed=2,
        trace=trace_payload,
    )


def build_dataset() -> tuple[TaskDatasetRegistry, HoVerDataset]:
    registry = TaskDatasetRegistry()
    dataset = HoVerDataset()
    dataset.ensure_ready([DEFAULT_SPLIT])
    registry.register(HOVER_DATASET_SPEC, lambda _spec: dataset, cache=True)
    return registry, dataset


def _base_task_info() -> TaskInfo:
    return TaskInfo(
        task={
            "id": "hover",
            "name": "HoVer Claim Verification",
            "version": "1.0.0",
            "action_space": {
                "type": "free_text",
                "description": "Return a label (SUPPORTED/REFUTED) and short rationale.",
            },
        },
        environment="hover",
        dataset={
            **HOVER_DATASET_SPEC.model_dump(),
            "hf_dataset": DATASET_ID,
        },
        rubric={
            "version": "1",
            "criteria_count": 1,
            "source": "inline",
        },
        inference={
            "supports_proxy": True,
            "tool": None,
        },
        limits={"max_turns": 1},
        task_metadata={"format": "Label: ... / Rationale: ..."},
    )


def describe_taskset(dataset: HoVerDataset) -> Mapping[str, Any]:
    return {
        **HOVER_DATASET_SPEC.model_dump(),
        "hf_dataset": DATASET_ID,
        "label_map": LABEL_MAP,
        "sizes": {split: dataset.size(split) for split in AVAILABLE_SPLITS},
    }


def provide_task_instances(dataset: HoVerDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
    base_info = _base_task_info()
    for seed in seeds:
        sample = dataset.sample(split=DEFAULT_SPLIT, index=seed)
        yield TaskInfo(
            task=base_info.task,
            environment=base_info.environment,
            dataset={
                **base_info.dataset,
                "split": sample["split"],
                "index": sample["index"],
            },
            rubric=base_info.rubric,
            inference=base_info.inference,
            limits=base_info.limits,
            task_metadata={
                **base_info.task_metadata,
                "claim": sample["claim"],
            },
        )


OUTCOME_RUBRIC: Rubric = cast(
    Rubric,
    load_rubric(
        {
            "version": "1",
            "goal_text": "Assign the correct label (SUPPORTED or REFUTED) to each claim.",
            "aggregation": "weighted_sum",
            "criteria": [
                {
                    "id": "label_accuracy",
                    "description": "Correctly classify the claim.",
                    "weight": 1.0,
                }
            ],
        }
    ),
)

EVENTS_RUBRIC: Rubric = cast(
    Rubric,
    load_rubric(
        {
            "version": "1",
            "goal_text": "Include a concise rationale referencing the evidence.",
            "aggregation": "weighted_sum",
            "criteria": [
                {
                    "id": "rationale_quality",
                    "description": "Provide a short rationale referencing the provided evidence.",
                    "weight": 1.0,
                }
            ],
        }
    ),
)


def build_config() -> TaskAppConfig:
    registry, dataset = build_dataset()
    base_info = _base_task_info()

    proxy_keys = normalize_vendor_keys()
    proxy_config = ProxyConfig(
        enable_openai=proxy_keys.get("OPENAI_API_KEY") is not None,
        enable_groq=proxy_keys.get("GROQ_API_KEY") is not None,
        system_hint="Return 'Label: ...' followed by 'Rationale: ...'.",
    )

    config = TaskAppConfig(
        app_id="hover",
        name="HoVer Claim Verification Task",
        description="HoVer dataset task app for verifying claims with supporting passages.",
        base_task_info=base_info,
        describe_taskset=lambda: describe_taskset(dataset),
        provide_task_instances=lambda seeds: provide_task_instances(dataset, seeds),
        rollout=rollout_executor,
        dataset_registry=registry,
        rubrics=RubricBundle(outcome=OUTCOME_RUBRIC, events=EVENTS_RUBRIC),
        proxy=proxy_config,
        routers=(hover_router,),
        app_state={"hover_dataset": dataset},
        cors_origins=["*"],
    )
    return config


register_task_app(
    entry=TaskAppEntry(
        app_id="hover",
        description="HoVer claim verification task app using the Dzeniks/hover dataset.",
        config_factory=build_config,
        aliases=("hover-claims",),
        modal=ModalDeploymentConfig(
            app_name="synth-hover",
            pip_packages=(
                "datasets>=2.14.0",
                "fastapi>=0.115.0",
                "pydantic>=2.0.0",
                "httpx>=0.26.0",
            ),
            extra_local_dirs=((str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),),
        ),
    )
)


if __name__ == "__main__":  # pragma: no cover - manual helper
    import argparse
    from synth_ai.task.server import run_task_app

    parser = argparse.ArgumentParser(description="Run the HoVer task app locally")
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8112)
    parser.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    parser.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    args = parser.parse_args()

    default_env = Path(__file__).resolve().parents[2] / ".env"
    env_files = [str(default_env)] if default_env.exists() else []
    env_files.extend(args.env_file or [])

    run_task_app(
        build_config,
        host=args.host,
        port=args.port,
        reload=args.reload,
        env_files=env_files,
    )