synth-ai 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Potentially problematic release.
This version of synth-ai might be problematic.
- examples/baseline/banking77_baseline.py +204 -0
- examples/baseline/crafter_baseline.py +407 -0
- examples/baseline/pokemon_red_baseline.py +326 -0
- examples/baseline/simple_baseline.py +56 -0
- examples/baseline/warming_up_to_rl_baseline.py +239 -0
- examples/blog_posts/gepa/README.md +355 -0
- examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
- examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
- examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
- examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
- examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
- examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
- examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
- examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
- examples/blog_posts/gepa/gepa_baseline.py +204 -0
- examples/blog_posts/gepa/query_prompts_example.py +97 -0
- examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
- examples/blog_posts/gepa/task_apps.py +105 -0
- examples/blog_posts/gepa/test_gepa_local.sh +67 -0
- examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
- examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
- examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +12 -10
- examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +1 -0
- examples/blog_posts/pokemon_vl/extract_images.py +239 -0
- examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
- examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
- examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
- examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
- examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
- examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
- examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
- examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
- examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
- examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +1 -1
- examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
- examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +60 -10
- examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +1 -1
- examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +1 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +1 -0
- examples/rl/configs/rl_from_base_qwen17.toml +1 -0
- examples/swe/task_app/hosted/inference/openai_client.py +0 -34
- examples/swe/task_app/hosted/policy_routes.py +17 -0
- examples/swe/task_app/hosted/rollout.py +4 -2
- examples/task_apps/banking77/__init__.py +6 -0
- examples/task_apps/banking77/banking77_task_app.py +841 -0
- examples/task_apps/banking77/deploy_wrapper.py +46 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +24 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +355 -58
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +68 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +78 -21
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
- examples/task_apps/gepa_benchmarks/__init__.py +7 -0
- examples/task_apps/gepa_benchmarks/common.py +260 -0
- examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
- examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
- examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
- examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
- examples/task_apps/pokemon_red/task_app.py +254 -36
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +1 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +53 -4
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +152 -41
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +31 -1
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
- examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +1 -0
- synth_ai/api/train/builders.py +90 -1
- synth_ai/api/train/cli.py +396 -21
- synth_ai/api/train/config_finder.py +13 -2
- synth_ai/api/train/configs/__init__.py +15 -1
- synth_ai/api/train/configs/prompt_learning.py +442 -0
- synth_ai/api/train/configs/rl.py +29 -0
- synth_ai/api/train/task_app.py +1 -1
- synth_ai/api/train/validators.py +277 -0
- synth_ai/baseline/__init__.py +25 -0
- synth_ai/baseline/config.py +209 -0
- synth_ai/baseline/discovery.py +214 -0
- synth_ai/baseline/execution.py +146 -0
- synth_ai/cli/__init__.py +85 -17
- synth_ai/cli/__main__.py +0 -0
- synth_ai/cli/claude.py +70 -0
- synth_ai/cli/codex.py +84 -0
- synth_ai/cli/commands/__init__.py +1 -0
- synth_ai/cli/commands/baseline/__init__.py +12 -0
- synth_ai/cli/commands/baseline/core.py +637 -0
- synth_ai/cli/commands/baseline/list.py +93 -0
- synth_ai/cli/commands/eval/core.py +13 -10
- synth_ai/cli/commands/filter/core.py +53 -17
- synth_ai/cli/commands/help/core.py +0 -1
- synth_ai/cli/commands/smoke/__init__.py +7 -0
- synth_ai/cli/commands/smoke/core.py +1436 -0
- synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
- synth_ai/cli/commands/status/subcommands/usage.py +203 -0
- synth_ai/cli/commands/train/judge_schemas.py +1 -0
- synth_ai/cli/commands/train/judge_validation.py +1 -0
- synth_ai/cli/commands/train/validation.py +0 -57
- synth_ai/cli/demo.py +35 -3
- synth_ai/cli/deploy/__init__.py +40 -25
- synth_ai/cli/deploy.py +162 -0
- synth_ai/cli/legacy_root_backup.py +14 -8
- synth_ai/cli/opencode.py +107 -0
- synth_ai/cli/root.py +9 -5
- synth_ai/cli/task_app_deploy.py +1 -1
- synth_ai/cli/task_apps.py +53 -53
- synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
- synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
- synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
- synth_ai/judge_schemas.py +1 -0
- synth_ai/learning/__init__.py +10 -0
- synth_ai/learning/prompt_learning_client.py +276 -0
- synth_ai/learning/prompt_learning_types.py +184 -0
- synth_ai/pricing/__init__.py +2 -0
- synth_ai/pricing/model_pricing.py +57 -0
- synth_ai/streaming/handlers.py +53 -4
- synth_ai/streaming/streamer.py +19 -0
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +2 -0
- synth_ai/task/tracing_utils.py +25 -25
- synth_ai/task/validators.py +44 -8
- synth_ai/task_app_cfgs.py +21 -0
- synth_ai/tracing_v3/config.py +162 -19
- synth_ai/tracing_v3/constants.py +1 -1
- synth_ai/tracing_v3/db_config.py +24 -38
- synth_ai/tracing_v3/storage/config.py +47 -13
- synth_ai/tracing_v3/storage/factory.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +113 -11
- synth_ai/tracing_v3/turso/native_manager.py +92 -16
- synth_ai/types.py +8 -0
- synth_ai/urls.py +11 -0
- synth_ai/utils/__init__.py +30 -1
- synth_ai/utils/agents.py +74 -0
- synth_ai/utils/bin.py +39 -0
- synth_ai/utils/cli.py +149 -5
- synth_ai/utils/env.py +17 -17
- synth_ai/utils/json.py +72 -0
- synth_ai/utils/modal.py +283 -1
- synth_ai/utils/paths.py +48 -0
- synth_ai/utils/uvicorn.py +113 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/METADATA +102 -4
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/RECORD +162 -88
- synth_ai/cli/commands/deploy/__init__.py +0 -23
- synth_ai/cli/commands/deploy/core.py +0 -614
- synth_ai/cli/commands/deploy/errors.py +0 -72
- synth_ai/cli/commands/deploy/validation.py +0 -11
- synth_ai/cli/deploy/core.py +0 -5
- synth_ai/cli/deploy/errors.py +0 -23
- synth_ai/cli/deploy/validation.py +0 -5
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.17.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
+++ b/examples/task_apps/gepa_benchmarks/ifbench_task_app.py
@@ -0,0 +1,563 @@
+"""IFBench instruction-following task app."""
+
+from __future__ import annotations
+
+import contextlib
+import os
+import re
+import uuid
+from collections.abc import Iterable, Sequence
+from pathlib import Path
+from typing import Any, Mapping, cast
+
+from datasets import load_dataset
+from fastapi import APIRouter, HTTPException, Request
+
+from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
+from synth_ai.task.contracts import (
+    RolloutMetrics,
+    RolloutRequest,
+    RolloutResponse,
+    RolloutStep,
+    RolloutTrajectory,
+    TaskInfo,
+)
+from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
+from synth_ai.task.rubrics import Rubric, load_rubric
+from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
+from synth_ai.task.vendors import normalize_vendor_keys
+
+from .common import (
+    call_chat_completion,
+    count_emojis,
+    count_numbers,
+    count_pronouns,
+    sentence_split,
+    tokenize,
+    unique_word_count,
+)
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+
+DATASET_ID = "allenai/IFBench_test"
+AVAILABLE_SPLITS: tuple[str, ...] = ("train",)
+DEFAULT_SPLIT = "train"
+
+
+ifbench_router = APIRouter()
+
+
+IFBENCH_DATASET_SPEC = TaskDatasetSpec(
+    id="ifbench",
+    name="IFBench Instruction Following",
+    version="1.0.0",
+    splits=list(AVAILABLE_SPLITS),
+    default_split=DEFAULT_SPLIT,
+    description="Instruction following benchmark with programmatically-checked constraints.",
+)
+
+SUPPORTED_INSTRUCTIONS = {
+    "count:keywords_multiple",
+    "sentence:keyword",
+    "count:numbers",
+    "count:word_count_range",
+    "count:unique_word_count",
+    "count:pronouns",
+    "format:list",
+    "format:emoji",
+}
+
+
+class IFBenchDataset:
+    """Load IFBench and filter to instructions we can automatically score."""
+
+    def __init__(self) -> None:
+        self._cache: dict[str, list[dict[str, Any]]] = {}
+
+    def _load_split(self, split: str) -> list[dict[str, Any]]:
+        if split not in AVAILABLE_SPLITS:
+            raise ValueError(f"Unknown split '{split}'. Available: {AVAILABLE_SPLITS}")
+        if split not in self._cache:
+            try:
+                raw = load_dataset(DATASET_ID, split=split)
+            except Exception as exc:  # pragma: no cover
+                raise RuntimeError(
+                    f"Failed to download IFBench split '{split}'. Ensure network access."
+                ) from exc
+            filtered = [
+                row
+                for row in raw
+                if set(row.get("instruction_id_list") or ()).issubset(SUPPORTED_INSTRUCTIONS)
+            ]
+            if not filtered:
+                raise RuntimeError(
+                    f"No IFBench samples remain after filtering for supported instructions ({SUPPORTED_INSTRUCTIONS})."
+                )
+            self._cache[split] = filtered
+        return self._cache[split]
+
+    def ensure_ready(self, splits: Sequence[str]) -> None:
+        for split in splits:
+            self._load_split(split)
+
+    def size(self, split: str) -> int:
+        return len(self._load_split(split))
+
+    def sample(self, *, split: str, index: int) -> dict[str, Any]:
+        dataset = self._load_split(split)
+        size = len(dataset)
+        if size == 0:
+            raise RuntimeError(f"IFBench split '{split}' is empty")
+        idx = int(index) % size
+        row = dataset[int(idx)]
+
+        instructions = []
+        ids = row.get("instruction_id_list") or []
+        kwargs_list = row.get("kwargs") or []
+        for instr_id, kwargs in zip(ids, kwargs_list):
+            instructions.append(
+                {
+                    "id": str(instr_id),
+                    "kwargs": kwargs or {},
+                }
+            )
+
+        return {
+            "index": idx,
+            "split": split,
+            "prompt": str(row.get("prompt") or ""),
+            "instructions": instructions,
+        }
+
+
+def _summarise_kwargs(kwargs: Mapping[str, Any]) -> str:
+    items = []
+    for key, value in kwargs.items():
+        if value in (None, "", [], {}):
+            continue
+        items.append(f"{key}={value}")
+    return ", ".join(items) if items else "default"
+
+
+_KEYWORD_PATTERN = re.compile(
+    r"keyword\s+([a-z0-9_-]+)\s+(once|twice|\d+\s+times?)",
+    flags=re.IGNORECASE,
+)
+
+
+def _extract_keyword_targets(prompt: str, keywords: Sequence[str]) -> dict[str, int]:
+    targets: dict[str, int] = {}
+    for match in _KEYWORD_PATTERN.finditer(prompt):
+        word = match.group(1)
+        if word not in keywords:
+            continue
+        count_str = match.group(2).lower()
+        if count_str == "once":
+            targets[word] = 1
+        elif count_str == "twice":
+            targets[word] = 2
+        else:
+            digit_match = re.search(r"\d+", count_str)
+            targets[word] = int(digit_match.group()) if digit_match else 1
+    return targets
+
+
+def _evaluate_instruction(
+    instr_id: str,
+    kwargs: Mapping[str, Any],
+    prompt: str,
+    response: str,
+) -> tuple[bool, dict[str, Any]]:
+    tokens = tokenize(response)
+    details: dict[str, Any] = {}
+
+    if instr_id == "count:keywords_multiple":
+        keywords = [
+            kwargs.get("keyword1"),
+            kwargs.get("keyword2"),
+            kwargs.get("keyword3"),
+            kwargs.get("keyword4"),
+            kwargs.get("keyword5"),
+        ]
+        keywords = [str(word) for word in keywords if word]
+        targets = _extract_keyword_targets(prompt, keywords)
+        passes = True
+        occurrences: dict[str, int] = {}
+        for word in keywords:
+            expected = targets.get(word, 1)
+            actual = len(re.findall(rf"\b{re.escape(word)}\b", response, flags=re.IGNORECASE))
+            occurrences[word] = actual
+            if actual < expected:
+                passes = False
+        details.update({"keywords": keywords, "counts": occurrences, "targets": targets})
+        return passes, details
+
+    if instr_id == "sentence:keyword":
+        target_word = str(kwargs.get("word") or "").strip()
+        expected = int(kwargs.get("N") or 1)
+        sentences = sentence_split(response)
+        satisfied = sum(
+            1 for sentence in sentences if re.search(rf"\b{re.escape(target_word)}\b", sentence, re.IGNORECASE)
+        )
+        details.update({"word": target_word, "required": expected, "actual": satisfied})
+        return satisfied >= expected, details
+
+    if instr_id == "count:numbers":
+        expected = int(kwargs.get("N") or 0)
+        actual = count_numbers(response)
+        details.update({"required": expected, "actual": actual})
+        return actual >= expected, details
+
+    if instr_id == "count:word_count_range":
+        min_words = int(kwargs.get("min_words") or 0)
+        max_words = int(kwargs.get("max_words") or 10_000)
+        word_count = len(tokens)
+        details.update({"min": min_words, "max": max_words, "actual": word_count})
+        return min_words <= word_count <= max_words, details
+
+    if instr_id == "count:unique_word_count":
+        expected = int(kwargs.get("N") or 0)
+        actual = unique_word_count(tokens)
+        details.update({"required": expected, "actual": actual})
+        return actual >= expected, details
+
+    if instr_id == "count:pronouns":
+        expected = int(kwargs.get("N") or 0)
+        actual = count_pronouns(tokens)
+        details.update({"required": expected, "actual": actual})
+        return actual >= expected, details
+
+    if instr_id == "format:list":
+        separator = str(kwargs.get("sep") or "-").strip()
+        lines = [line.strip() for line in response.splitlines() if line.strip()]
+        bullet_lines = [line for line in lines if line.startswith(separator)]
+        details.update({"separator": separator, "bullet_count": len(bullet_lines)})
+        return len(bullet_lines) >= 2, details
+
+    if instr_id == "format:emoji":
+        expected = int(kwargs.get("N") or 1)
+        emoji_count = count_emojis(response)
+        details.update({"required": expected, "actual": emoji_count})
+        return emoji_count >= expected, details
+
+    return False, {"unsupported": True}
+
+
+def evaluate_ifbench(prompt: str, instructions: Sequence[Mapping[str, Any]], response: str) -> tuple[float, dict[str, Any]]:
+    results: dict[str, Any] = {}
+    passed = 0
+    total = 0
+    for instruction in instructions:
+        instr_id = str(instruction.get("id") or "")
+        kwargs = instruction.get("kwargs") or {}
+        ok, details = _evaluate_instruction(instr_id, kwargs, prompt, response)
+        results[instr_id] = {"pass": ok, **details}
+        if instr_id in SUPPORTED_INSTRUCTIONS:
+            total += 1
+            if ok:
+                passed += 1
+    reward = (passed / total) if total else 0.0
+    return reward, {"passed": passed, "total": total, "details": results}
+
+
+async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
+    dataset: IFBenchDataset = fastapi_request.app.state.ifbench_dataset
+
+    split = str(((request.env.config or {}).get("split")) or DEFAULT_SPLIT)
+    seed = request.env.seed or 0
+
+    sample = dataset.sample(split=split, index=seed)
+
+    instruction_lines = [
+        f"- {instr['id']} ({_summarise_kwargs(instr['kwargs'])})" for instr in sample["instructions"]
+    ]
+    constraints_text = "\n".join(instruction_lines)
+
+    observation = {
+        "prompt": sample["prompt"],
+        "instructions": sample["instructions"],
+        "index": sample["index"],
+        "split": sample["split"],
+    }
+
+    placeholders = {
+        "prompt": sample["prompt"],
+        "instructions": constraints_text,
+    }
+
+    default_messages = [
+        {
+            "role": "system",
+            "pattern": (
+                "You must follow every instruction exactly. Produce a single response that satisfies all constraints."
+            ),
+        },
+        {
+            "role": "user",
+            "pattern": "Instructions:\n{instructions}\n\nTask:\n{prompt}",
+        },
+    ]
+
+    response_json: dict[str, Any] | None = None
+    response_text = ""
+    error_info: dict[str, Any] = {}
+
+    try:
+        response_text, response_json, _ = await call_chat_completion(
+            request.policy.config or {},
+            placeholders,
+            default_messages,
+        )
+    except HTTPException as http_err:  # pragma: no cover
+        error_info = {"error": str(http_err.detail), "code": http_err.status_code}
+    except Exception as exc:  # pragma: no cover
+        error_info = {"error": str(exc)}
+
+    reward, eval_details = evaluate_ifbench(sample["prompt"], sample["instructions"], response_text)
+    eval_details["response_json"] = response_json
+    eval_details.update(error_info)
+
+    with contextlib.suppress(Exception):
+        print(
+            f"[IFBENCH_ROLLOUT] run_id={request.run_id} index={sample['index']} "
+            f"passed={eval_details['passed']}/{eval_details['total']} reward={reward:.3f}",
+            flush=True,
+        )
+
+    step = RolloutStep(
+        obs=observation,
+        tool_calls=[],
+        reward=reward,
+        done=True,
+        info=eval_details,
+    )
+
+    inference_url = (request.policy.config or {}).get("inference_url")
+    trajectory = RolloutTrajectory(
+        env_id=f"ifbench::{sample['split']}::{sample['index']}",
+        policy_id=request.policy.policy_id or request.policy.policy_name or "policy",
+        steps=[step],
+        final={"observation": observation, "reward": reward},
+        length=1,
+        inference_url=str(inference_url or ""),
+    )
+
+    metrics = RolloutMetrics(
+        episode_returns=[reward],
+        mean_return=reward,
+        num_steps=1,
+        num_episodes=1,
+        outcome_score=reward,
+        events_score=reward,
+        details={"constraints_passed": eval_details.get("passed"), "constraints_total": eval_details.get("total")},
+    )
+
+    trace_payload = None
+    include_trace = bool(
+        (request.record and getattr(request.record, "return_trace", False))
+        or os.getenv("TASKAPP_TRACING_ENABLED")
+    )
+    if include_trace:
+        trace_payload = {
+            "session_id": str(uuid.uuid4()),
+            "events_count": 1,
+            "decision_rewards": [reward],
+            "metadata": {
+                "env": "ifbench",
+                "split": sample["split"],
+                "index": sample["index"],
+                "constraints_passed": eval_details.get("passed"),
+                "constraints_total": eval_details.get("total"),
+            },
+        }
+
+    return RolloutResponse(
+        run_id=request.run_id,
+        trajectories=[trajectory],
+        branches={},
+        metrics=metrics,
+        aborted=False,
+        ops_executed=2,
+        trace=trace_payload,
+    )
+
+
+def build_dataset() -> tuple[TaskDatasetRegistry, IFBenchDataset]:
+    registry = TaskDatasetRegistry()
+    dataset = IFBenchDataset()
+    dataset.ensure_ready([DEFAULT_SPLIT])
+    registry.register(IFBENCH_DATASET_SPEC, lambda _spec: dataset, cache=True)
+    return registry, dataset
+
+
+def _base_task_info() -> TaskInfo:
+    return TaskInfo(
+        task={
+            "id": "ifbench",
+            "name": "IFBench Instruction Following",
+            "version": "1.0.0",
+            "action_space": {
+                "type": "free_text",
+                "description": "Generate a completion that satisfies all constraints.",
+            },
+        },
+        environment="ifbench",
+        dataset={
+            **IFBENCH_DATASET_SPEC.model_dump(),
+            "hf_dataset": DATASET_ID,
+        },
+        rubric={
+            "version": "1",
+            "criteria_count": 1,
+            "source": "inline",
+        },
+        inference={
+            "supports_proxy": True,
+            "tool": None,
+        },
+        limits={"max_turns": 1},
+        task_metadata={"supported_instructions": sorted(SUPPORTED_INSTRUCTIONS)},
+    )
+
+
+def describe_taskset(dataset: IFBenchDataset) -> Mapping[str, Any]:
+    return {
+        **IFBENCH_DATASET_SPEC.model_dump(),
+        "hf_dataset": DATASET_ID,
+        "supported_instructions": sorted(SUPPORTED_INSTRUCTIONS),
+        "sizes": {split: dataset.size(split) for split in AVAILABLE_SPLITS},
+    }
+
+
+def provide_task_instances(dataset: IFBenchDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
+    base_info = _base_task_info()
+    for seed in seeds:
+        sample = dataset.sample(split=DEFAULT_SPLIT, index=seed)
+        yield TaskInfo(
+            task=base_info.task,
+            environment=base_info.environment,
+            dataset={
+                **base_info.dataset,
+                "split": sample["split"],
+                "index": sample["index"],
+            },
+            rubric=base_info.rubric,
+            inference=base_info.inference,
+            limits=base_info.limits,
+            task_metadata={**base_info.task_metadata, "prompt": sample["prompt"][:80]},
+        )
+
+
+OUTCOME_RUBRIC: Rubric = cast(
+    Rubric,
+    load_rubric(
+        {
+            "version": "1",
+            "goal_text": "Satisfy the IFBench constraints.",
+            "aggregation": "weighted_sum",
+            "criteria": [
+                {
+                    "id": "constraint_satisfaction",
+                    "description": "Meets all programmatically-checked constraints.",
+                    "weight": 1.0,
+                }
+            ],
+        }
+    ),
+)
+
+EVENTS_RUBRIC: Rubric = cast(
+    Rubric,
+    load_rubric(
+        {
+            "version": "1",
+            "goal_text": "Keep responses concise while following instructions.",
+            "aggregation": "weighted_sum",
+            "criteria": [
+                {
+                    "id": "concise_answer",
+                    "description": "Avoid unnecessary content while satisfying constraints.",
+                    "weight": 1.0,
+                }
+            ],
+        }
+    ),
+)
+
+
+def build_config() -> TaskAppConfig:
+    registry, dataset = build_dataset()
+    base_info = _base_task_info()
+
+    proxy_keys = normalize_vendor_keys()
+    proxy_config = ProxyConfig(
+        enable_openai=proxy_keys.get("OPENAI_API_KEY") is not None,
+        enable_groq=proxy_keys.get("GROQ_API_KEY") is not None,
+        system_hint="Follow every instruction exactly. Violations are failures.",
+    )
+
+    config = TaskAppConfig(
+        app_id="ifbench",
+        name="IFBench Instruction Following Task",
+        description="IFBench task app with automatic constraint checking for prompt optimisation.",
+        base_task_info=base_info,
+        describe_taskset=lambda: describe_taskset(dataset),
+        provide_task_instances=lambda seeds: provide_task_instances(dataset, seeds),
+        rollout=rollout_executor,
+        dataset_registry=registry,
+        rubrics=RubricBundle(outcome=OUTCOME_RUBRIC, events=EVENTS_RUBRIC),
+        proxy=proxy_config,
+        routers=(ifbench_router,),
+        app_state={"ifbench_dataset": dataset},
+        cors_origins=["*"],
+    )
+    return config
+
+
+register_task_app(
+    entry=TaskAppEntry(
+        app_id="ifbench",
+        description="IFBench task app using automatically scored constraint subsets.",
+        config_factory=build_config,
+        aliases=("ifbench-instructions",),
+        modal=ModalDeploymentConfig(
+            app_name="synth-ifbench",
+            pip_packages=(
+                "datasets>=2.14.0",
+                "fastapi>=0.115.0",
+                "pydantic>=2.0.0",
+                "httpx>=0.26.0",
+            ),
+            extra_local_dirs=((str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),),
+        ),
+    )
+)
+
+
+if __name__ == "__main__":  # pragma: no cover - manual helper
+    import argparse
+    from synth_ai.task.server import run_task_app
+
+    parser = argparse.ArgumentParser(description="Run the IFBench task app locally")
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=8111)
+    parser.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
+    parser.add_argument(
+        "--env-file",
+        action="append",
+        default=[],
+        help="Additional .env files to load before startup",
+    )
+    args = parser.parse_args()
+
+    default_env = Path(__file__).resolve().parents[2] / ".env"
+    env_files = [str(default_env)] if default_env.exists() else []
+    env_files.extend(args.env_file or [])
+
+    run_task_app(
+        build_config,
+        host=args.host,
+        port=args.port,
+        reload=args.reload,
+        env_files=env_files,
+    )