mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +38 -4
- mcpbr/latency_metrics.py +317 -0
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sampling.py +193 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/benchmarks/custom.py (new file)
@@ -0,0 +1,607 @@
"""Custom benchmark implementation loaded from YAML definition files."""

import base64
import re
from pathlib import Path
from typing import Any

import yaml
from datasets import load_dataset

from ..docker_env import DockerEnvironmentManager, TaskEnvironment
from .base import BenchmarkTask

# Required fields in a custom benchmark YAML definition
REQUIRED_YAML_FIELDS = ("name", "dataset", "evaluation_type")

# Supported evaluation types
VALID_EVALUATION_TYPES = ("exact_match", "numeric", "regex", "script")


class CustomBenchmark:
    """Custom benchmark loaded from a YAML definition file.

    Allows users to define benchmarks via YAML without writing Python code.
    Supports HuggingFace datasets or local data, with configurable field mapping
    and multiple evaluation strategies (exact_match, numeric, regex, script).

    Example YAML definition:

        name: my-benchmark
        dataset: my-org/my-dataset
        split: test
        task_id_field: id
        problem_statement_field: question
        answer_field: answer
        evaluation_type: exact_match
    """

    def __init__(self, definition_path: str | Path | None = None, **kwargs: Any):
        """Initialize custom benchmark from a YAML definition file or kwargs.

        Args:
            definition_path: Path to the YAML benchmark definition file.
            **kwargs: Override or provide definition fields directly.
                Useful for programmatic construction or testing.

        Raises:
            FileNotFoundError: If the definition file does not exist.
            ValueError: If required fields are missing or evaluation_type is invalid.
        """
        definition: dict[str, Any] = {}

        if definition_path is not None:
            path = Path(definition_path)
            if not path.exists():
                msg = f"Custom benchmark definition not found: {path}"
                raise FileNotFoundError(msg)

            with open(path) as f:
                definition = yaml.safe_load(f) or {}

        # Merge kwargs on top of file-loaded definition
        definition.update(kwargs)

        self._validate_definition(definition)

        # Core identity
        self.name: str = definition["name"]
        self.dataset: str = definition["dataset"]
        self.subset: str | None = definition.get("subset")
        self.split: str = definition.get("split", "test")

        # Field mapping
        self.task_id_field: str = definition.get("task_id_field", "id")
        self.problem_statement_field: str = definition.get("problem_statement_field", "question")
        self.answer_field: str = definition.get("answer_field", "answer")

        # Evaluation configuration
        self.evaluation_type: str = definition["evaluation_type"]
        self.evaluation_script: str | None = definition.get("evaluation_script")
        self.regex_pattern: str | None = definition.get("regex_pattern")

        # Prompt template
        self.prompt_template: str | None = definition.get("prompt_template")

        # Docker / environment configuration
        self.docker_image: str | None = definition.get("docker_image")
        self.setup_commands: list[str] = definition.get("setup_commands", [])

        # Numeric comparison tolerances
        self.numeric_rtol: float = definition.get("numeric_rtol", 1e-3)
        self.numeric_atol: float = definition.get("numeric_atol", 1e-3)
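
    # --- Editor's illustration (not part of custom.py): as the __init__ docstring
    # notes, a definition can also be supplied entirely through kwargs for
    # programmatic construction or testing. The dataset id below is a hypothetical
    # placeholder; the field names come from __init__ above.
    #
    #   bench = CustomBenchmark(
    #       name="my-benchmark",
    #       dataset="my-org/my-dataset",
    #       evaluation_type="exact_match",
    #   )
    #   bench.split          # "test" (default)
    #   bench.answer_field   # "answer" (default)
    #   bench.numeric_rtol   # 0.001 (default)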

    @staticmethod
    def _validate_definition(definition: dict[str, Any]) -> None:
        """Validate a custom benchmark definition dictionary.

        Args:
            definition: The parsed YAML definition.

        Raises:
            ValueError: If required fields are missing or values are invalid.
        """
        missing = [f for f in REQUIRED_YAML_FIELDS if f not in definition]
        if missing:
            msg = f"Custom benchmark definition missing required fields: {', '.join(missing)}"
            raise ValueError(msg)

        eval_type = definition.get("evaluation_type")
        if eval_type not in VALID_EVALUATION_TYPES:
            msg = (
                f"Invalid evaluation_type: {eval_type}. "
                f"Must be one of: {', '.join(VALID_EVALUATION_TYPES)}"
            )
            raise ValueError(msg)

        if eval_type == "script" and not definition.get("evaluation_script"):
            msg = "evaluation_script is required when evaluation_type is 'script'"
            raise ValueError(msg)

        if eval_type == "regex" and not definition.get("regex_pattern"):
            msg = "regex_pattern is required when evaluation_type is 'regex'"
            raise ValueError(msg)
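
    # --- Editor's illustration (not part of custom.py): how the checks above react
    # to a few definitions.
    #
    #   CustomBenchmark._validate_definition({"name": "x", "dataset": "d"})
    #       -> ValueError (missing required field: evaluation_type)
    #   CustomBenchmark._validate_definition(
    #       {"name": "x", "dataset": "d", "evaluation_type": "regex"})
    #       -> ValueError (regex_pattern is required when evaluation_type is 'regex')
    #   CustomBenchmark._validate_definition(
    #       {"name": "x", "dataset": "d", "evaluation_type": "exact_match"})
    #       -> returns None (valid)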

    def load_tasks(
        self,
        sample_size: int | None = None,
        task_ids: list[str] | None = None,
        level: int | None = None,
        filter_difficulty: list[str] | None = None,
        filter_category: list[str] | None = None,
        filter_tags: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """Load tasks from the configured dataset.

        Args:
            sample_size: Maximum number of tasks to load (None for all).
            task_ids: Specific task IDs to load (None for all).
            level: Unused for custom benchmarks.
            filter_difficulty: Unused for custom benchmarks.
            filter_category: Unused for custom benchmarks.
            filter_tags: Unused for custom benchmarks.

        Returns:
            List of task dictionaries augmented with instance_id and problem_statement.
        """
        if self.subset:
            dataset = load_dataset(self.dataset, self.subset, split=self.split)
        else:
            dataset = load_dataset(self.dataset, split=self.split)

        if task_ids:
            task_id_set = set(task_ids)
            tasks = []
            original_indices = []
            for idx, item in enumerate(dataset):
                item_id = str(item.get(self.task_id_field, idx))
                if item_id in task_id_set:
                    tasks.append(item)
                    original_indices.append(idx)
        else:
            tasks = list(dataset)
            original_indices = list(range(len(tasks)))

        if sample_size is not None and len(tasks) > sample_size:
            tasks = tasks[:sample_size]
            original_indices = original_indices[:sample_size]

        augmented_tasks = []
        for idx, task in enumerate(tasks):
            augmented = dict(task)
            # Build instance_id from the task_id_field or index
            raw_id = task.get(self.task_id_field, original_indices[idx])
            augmented["instance_id"] = f"{self.name}_{raw_id}"
            # Build problem_statement from the configured field
            augmented["problem_statement"] = self._generate_problem_statement(augmented)
            # Store ground truth answer
            augmented["ground_truth_answer"] = task.get(self.answer_field, "")
            augmented_tasks.append(augmented)

        return augmented_tasks
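
    # --- Editor's illustration (not part of custom.py): each task returned by
    # load_tasks() is the original dataset row plus three added keys. For a
    # benchmark named "my-benchmark" whose row is {"id": "7", "question": "...",
    # "answer": "42"}, the augmented task would look like:
    #
    #   {
    #       "id": "7", "question": "...", "answer": "42",   # original row
    #       "instance_id": "my-benchmark_7",
    #       "problem_statement": "...",                     # from the question field (or template)
    #       "ground_truth_answer": "42",
    #   }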

    def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
        """Convert task to normalized BenchmarkTask.

        Args:
            task: Task dictionary (augmented from load_tasks).

        Returns:
            Normalized BenchmarkTask.

        Raises:
            ValueError: If required fields are missing.
        """
        instance_id = task.get("instance_id")
        if not instance_id:
            msg = f"Task missing required 'instance_id' field: {task.keys()}"
            raise ValueError(msg)

        problem_field_value = task.get(self.problem_statement_field, "")
        if not problem_field_value and not task.get("problem_statement"):
            msg = (
                f"Task missing required '{self.problem_statement_field}' field: {list(task.keys())}"
            )
            raise ValueError(msg)

        problem_statement = task.get("problem_statement") or self._generate_problem_statement(task)

        return BenchmarkTask(
            task_id=instance_id,
            problem_statement=problem_statement,
            repo=f"{self.name}/custom",
            commit="HEAD",
            metadata={
                "benchmark_name": self.name,
                "evaluation_type": self.evaluation_type,
                "ground_truth_answer": task.get("ground_truth_answer", ""),
                "raw_answer": task.get(self.answer_field, ""),
            },
        )

    def _generate_problem_statement(self, task: dict[str, Any]) -> str:
        """Generate problem statement from task fields.

        If a prompt_template is configured, uses it with field substitution.
        Otherwise, returns the raw problem_statement_field value.

        Args:
            task: Task dictionary.

        Returns:
            Problem statement string.
        """
        raw_statement = task.get(self.problem_statement_field, "No problem statement provided")

        if self.prompt_template:
            try:
                return self.prompt_template.format(
                    problem_statement=raw_statement,
                    **{k: v for k, v in task.items() if isinstance(v, (str, int, float, bool))},
                )
            except KeyError:
                # Fall back to raw statement if template substitution fails
                return str(raw_statement)

        return str(raw_statement)
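
    # --- Editor's illustration (not part of custom.py): with a prompt_template such
    # as "Question: {problem_statement}\nCategory: {category}", any string, numeric,
    # or boolean column of the task row (e.g. a hypothetical "category" column) can be
    # substituted alongside {problem_statement}. If the template references a field
    # the row does not have, the KeyError is swallowed and the raw statement is
    # returned instead.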

    async def create_environment(
        self,
        task: dict[str, Any],
        docker_manager: DockerEnvironmentManager,
    ) -> TaskEnvironment:
        """Create an isolated environment for the task.

        Args:
            task: Task dictionary.
            docker_manager: Docker environment manager.

        Returns:
            TaskEnvironment for the task.
        """
        instance_id = task.get("instance_id", f"{self.name}_unknown")

        temp_task = {
            "instance_id": instance_id,
            "repo": f"{self.name}/custom",
            "base_commit": "HEAD",
        }

        env = await docker_manager.create_environment(temp_task)

        # Run setup commands if configured
        if self.setup_commands:
            await self._run_setup_commands(env)

        return env

    async def _run_setup_commands(self, env: TaskEnvironment) -> None:
        """Run configured setup commands in the environment.

        Args:
            env: Task environment.
        """
        for cmd in self.setup_commands:
            _exit_code, _stdout, _stderr = await env.exec_command(
                cmd,
                timeout=300,
            )
            # Continue even if individual setup commands fail

    async def evaluate(
        self,
        env: TaskEnvironment,
        task: dict[str, Any],
        solution: str,
    ) -> dict[str, Any]:
        """Evaluate a solution for the task.

        Dispatches to the appropriate evaluation method based on evaluation_type.

        Args:
            env: Task environment.
            task: Task dictionary.
            solution: Solution to evaluate.

        Returns:
            Dictionary with evaluation results including 'resolved' boolean.
        """
        ground_truth = task.get("ground_truth_answer", task.get(self.answer_field, ""))

        if not ground_truth:
            return {
                "resolved": False,
                "error": "No ground truth answer available for evaluation",
            }

        if self.evaluation_type == "exact_match":
            return self._evaluate_exact_match(solution, str(ground_truth))
        elif self.evaluation_type == "numeric":
            return self._evaluate_numeric(solution, str(ground_truth))
        elif self.evaluation_type == "regex":
            return self._evaluate_regex(solution, str(ground_truth))
        elif self.evaluation_type == "script":
            return await self._evaluate_script(env, task, solution, str(ground_truth))
        else:
            return {
                "resolved": False,
                "error": f"Unknown evaluation_type: {self.evaluation_type}",
            }

    def _evaluate_exact_match(self, solution: str, ground_truth: str) -> dict[str, Any]:
        """Evaluate by checking whether the normalized ground truth appears in the solution.

        Args:
            solution: Agent's solution.
            ground_truth: Expected answer.

        Returns:
            Evaluation result dictionary.
        """
        normalized_solution = self._normalize_text(solution)
        normalized_truth = self._normalize_text(ground_truth)

        # Check if the ground truth appears in the solution
        resolved = normalized_truth in normalized_solution

        return {
            "resolved": resolved,
            "agent_answer": solution[:500],
            "ground_truth_answer": ground_truth,
            "match_type": "exact_match",
        }
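
    # --- Editor's illustration (not part of custom.py): despite the name, this is a
    # normalized containment check, so a verbose agent answer still passes as long as
    # the ground truth appears somewhere in it. For a CustomBenchmark instance `bench`:
    #
    #   bench._evaluate_exact_match("The capital is  PARIS.", "Paris")["resolved"]   # True
    #   bench._evaluate_exact_match("I think it is Lyon.", "Paris")["resolved"]      # False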

    def _evaluate_numeric(self, solution: str, ground_truth: str) -> dict[str, Any]:
        """Evaluate by comparing numeric values with tolerance.

        Args:
            solution: Agent's solution.
            ground_truth: Expected numeric answer.

        Returns:
            Evaluation result dictionary.
        """
        agent_num = self._extract_number(solution)
        truth_num = self._extract_number(ground_truth)

        if truth_num is None:
            return {
                "resolved": False,
                "error": f"Could not parse ground truth as number: {ground_truth}",
            }

        if agent_num is None:
            return {
                "resolved": False,
                "error": "Could not extract numeric answer from solution",
                "agent_solution": solution[:500],
            }

        resolved = self._compare_numbers(agent_num, truth_num)

        return {
            "resolved": resolved,
            "agent_answer": agent_num,
            "ground_truth_answer": truth_num,
            "match_type": "numeric",
        }
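
    # --- Editor's illustration (not part of custom.py): numbers are pulled out of
    # free-form text by _extract_number below, then compared with the configured
    # numeric_rtol / numeric_atol (both default 1e-3). For an instance `bench` with
    # the default tolerances:
    #
    #   bench._evaluate_numeric("The total comes to $1,234.50.", "1234.5")["resolved"]  # True
    #   bench._evaluate_numeric("Roughly 101 units.", "100")["resolved"]                # False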

    def _evaluate_regex(self, solution: str, ground_truth: str) -> dict[str, Any]:
        """Evaluate using regex pattern matching.

        The regex_pattern is applied to the solution. If it has a capture group,
        the captured text is compared to the ground truth. Otherwise, the match
        itself is used.

        Args:
            solution: Agent's solution.
            ground_truth: Expected answer.

        Returns:
            Evaluation result dictionary.
        """
        if not self.regex_pattern:
            return {
                "resolved": False,
                "error": "No regex_pattern configured for regex evaluation",
            }

        try:
            match = re.search(self.regex_pattern, solution, re.IGNORECASE | re.DOTALL)
        except re.error as e:
            return {
                "resolved": False,
                "error": f"Invalid regex pattern: {e}",
            }

        if not match:
            return {
                "resolved": False,
                "error": "Regex pattern did not match solution",
                "agent_solution": solution[:500],
            }

        # Use first capture group if available, otherwise use full match
        extracted = match.group(1) if match.lastindex else match.group(0)
        normalized_extracted = self._normalize_text(extracted)
        normalized_truth = self._normalize_text(ground_truth)

        resolved = normalized_extracted == normalized_truth

        return {
            "resolved": resolved,
            "agent_answer": extracted,
            "ground_truth_answer": ground_truth,
            "match_type": "regex",
        }
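
    # --- Editor's illustration (not part of custom.py): for an instance `bench`
    # whose definition sets a hypothetical pattern regex_pattern: "final answer:\s*(\w+)",
    # the first capture group is compared to the ground truth after normalization.
    #
    #   solution = "Let me reason step by step...\nFinal answer: Blue"
    #   bench._evaluate_regex(solution, "blue")["resolved"]   # True (captures "Blue")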

    async def _evaluate_script(
        self,
        env: TaskEnvironment,
        task: dict[str, Any],
        solution: str,
        ground_truth: str,
    ) -> dict[str, Any]:
        """Evaluate by running a custom script in the Docker environment.

        The solution and ground truth are written to /tmp/solution.txt and
        /tmp/ground_truth.txt before the script runs; exit code 0 means correct.

        Args:
            env: Task environment.
            task: Task dictionary.
            solution: Agent's solution.
            ground_truth: Expected answer.

        Returns:
            Evaluation result dictionary.
        """
        if not self.evaluation_script:
            return {
                "resolved": False,
                "error": "No evaluation_script configured for script evaluation",
            }

        # Write solution and ground truth using base64 to avoid shell injection
        solution_b64 = base64.b64encode(solution.encode()).decode()
        truth_b64 = base64.b64encode(ground_truth.encode()).decode()

        setup_cmd = (
            f"echo '{solution_b64}' | base64 -d > /tmp/solution.txt && "
            f"echo '{truth_b64}' | base64 -d > /tmp/ground_truth.txt"
        )
        await env.exec_command(setup_cmd, timeout=30)

        # Run the evaluation script
        exit_code, stdout, stderr = await env.exec_command(
            self.evaluation_script,
            timeout=300,
        )

        resolved = exit_code == 0

        return {
            "resolved": resolved,
            "agent_answer": solution[:500],
            "ground_truth_answer": ground_truth,
            "match_type": "script",
            "script_exit_code": exit_code,
            "script_stdout": stdout[:1000] if stdout else "",
            "script_stderr": stderr[:1000] if stderr else "",
        }
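
    # --- Editor's illustration (not part of custom.py): a minimal evaluation_script
    # for a YAML definition, assuming a POSIX shell is available in the task container.
    # It reads the two files written above and exits 0 when the ground truth appears
    # (case-insensitively) in the solution.
    #
    #   evaluation_script: |
    #     grep -qiF "$(cat /tmp/ground_truth.txt)" /tmp/solution.txt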

    def get_prebuilt_image(self, task: dict[str, Any]) -> str | None:
        """Get pre-built Docker image name for the task.

        Returns the configured docker_image if set, otherwise None.

        Args:
            task: Task dictionary.

        Returns:
            Docker image name or None.
        """
        return self.docker_image

    def get_prompt_template(self) -> str:
        """Get the benchmark prompt template.

        Returns the configured prompt_template or a sensible default.

        Returns:
            Prompt template string with {problem_statement} placeholder.
        """
        if self.prompt_template:
            return self.prompt_template

        return (
            f"Solve the following {self.name} benchmark task:\n\n"
            "{problem_statement}\n\n"
            "Provide your answer clearly."
        )

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Normalize text for comparison by lowercasing, stripping, collapsing whitespace.

        Args:
            text: Input text.

        Returns:
            Normalized text.
        """
        return " ".join(text.lower().strip().split())
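
    # --- Editor's illustration (not part of custom.py):
    #
    #   CustomBenchmark._normalize_text("  The   Answer\nis 42 ")   # -> "the answer is 42"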

    @staticmethod
    def _extract_number(text: str) -> float | None:
        """Extract the last numeric value from text.

        Handles commas, dollar signs, percentages, and negative numbers.

        Args:
            text: Text potentially containing a number.

        Returns:
            Extracted float or None.
        """
        if not text:
            return None

        # Try #### format first (GSM8K-style)
        match = re.search(r"####\s*([+-]?[\d,]+(?:\.\d+)?)", text)
        if match:
            return CustomBenchmark._parse_number(match.group(1))

        # Try "the answer is X" pattern
        match = re.search(
            r"(?:the\s+)?(?:final\s+)?answer\s*(?:is|:)\s*\$?([+-]?[\d,]+(?:\.\d+)?)",
            text,
            re.IGNORECASE,
        )
        if match:
            return CustomBenchmark._parse_number(match.group(1))

        # Fallback: last number in the text
        numbers = re.findall(r"([+-]?[\d,]+(?:\.\d+)?)", text)
        if numbers:
            return CustomBenchmark._parse_number(numbers[-1])

        return None
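
    # --- Editor's illustration (not part of custom.py): the three extraction paths.
    #
    #   CustomBenchmark._extract_number("reasoning... #### 1,234")      # 1234.0 (GSM8K marker)
    #   CustomBenchmark._extract_number("The final answer is $12.50")   # 12.5   ("answer is" pattern)
    #   CustomBenchmark._extract_number("between 3 and 4 items")        # 4.0    (last-number fallback)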

    @staticmethod
    def _parse_number(num_str: str) -> float | None:
        """Parse a number string to float.

        Args:
            num_str: String representation of number.

        Returns:
            Float value or None if parsing fails.
        """
        try:
            cleaned = num_str.replace(",", "").replace("$", "").replace("%", "").strip()
            return float(cleaned)
        except (ValueError, AttributeError):
            return None

    def _compare_numbers(
        self,
        answer1: float,
        answer2: float,
    ) -> bool:
        """Compare two numeric answers with configurable tolerance.

        Args:
            answer1: First answer.
            answer2: Second answer.

        Returns:
            True if answers are equal within tolerance.
        """
        if answer1 == answer2:
            return True

        abs_diff = abs(answer1 - answer2)
        rel_diff = abs_diff / max(abs(answer1), abs(answer2), 1e-10)

        return abs_diff <= self.numeric_atol or rel_diff <= self.numeric_rtol