mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +38 -4
- mcpbr/latency_metrics.py +317 -0
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sampling.py +193 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/benchmarks/longbench.py
@@ -0,0 +1,623 @@
"""LongBench benchmark implementation for long-context understanding."""

import re
import string
from collections import Counter
from typing import Any

from datasets import load_dataset

from ..docker_env import DockerEnvironmentManager, TaskEnvironment
from .base import BenchmarkTask

# Mapping of LongBench subsets to their high-level categories
SUBSET_TO_CATEGORY: dict[str, str] = {
    # Single-Document QA
    "narrativeqa": "single_doc_qa",
    "qasper": "single_doc_qa",
    "multifieldqa_en": "single_doc_qa",
    "multifieldqa_zh": "single_doc_qa",
    # Multi-Document QA
    "hotpotqa": "multi_doc_qa",
    "2wikimqa": "multi_doc_qa",
    "musique": "multi_doc_qa",
    "dureader": "multi_doc_qa",
    # Summarization
    "gov_report": "summarization",
    "qmsum": "summarization",
    "multi_news": "summarization",
    "vcsum": "summarization",
    "samsum": "summarization",
    # Few-Shot Learning
    "triviaqa": "few_shot",
    "trec": "few_shot",
    "lsht": "few_shot",
    # Synthetic Tasks
    "passage_count": "synthetic",
    "passage_retrieval_en": "synthetic",
    "passage_retrieval_zh": "synthetic",
    # Code Completion
    "lcc": "code",
    "repobench-p": "code",
}

# Subsets that use F1 scoring
F1_SUBSETS = {
    "narrativeqa",
    "qasper",
    "multifieldqa_en",
    "multifieldqa_zh",
    "hotpotqa",
    "2wikimqa",
    "musique",
    "triviaqa",
}

# Subsets that use ROUGE-L scoring
ROUGE_SUBSETS = {
    "dureader",
    "gov_report",
    "qmsum",
    "multi_news",
    "vcsum",
    "samsum",
}

# Subsets that use classification accuracy
ACCURACY_SUBSETS = {
    "trec",
    "lsht",
    "passage_count",
    "passage_retrieval_en",
    "passage_retrieval_zh",
}

# Subsets that use edit similarity
EDIT_SIM_SUBSETS = {
    "lcc",
    "repobench-p",
}

# All available subsets
ALL_SUBSETS = list(SUBSET_TO_CATEGORY.keys())


class LongBenchBenchmark:
    """LongBench benchmark implementation.

    LongBench is a bilingual, multitask benchmark for long-context understanding.
    It covers 6 categories across 21 tasks with contexts ranging from thousands
    to tens of thousands of tokens:

    - Single-Document QA: narrativeqa, qasper, multifieldqa_en, multifieldqa_zh
    - Multi-Document QA: hotpotqa, 2wikimqa, musique, dureader
    - Summarization: gov_report, qmsum, multi_news, vcsum, samsum
    - Few-Shot Learning: triviaqa, trec, lsht
    - Synthetic Tasks: passage_count, passage_retrieval_en, passage_retrieval_zh
    - Code Completion: lcc, repobench-p

    Evaluation uses task-appropriate metrics: F1 for QA, ROUGE-L for summarization,
    accuracy for classification, and edit similarity for code tasks.
    """

    name = "longbench"

    def __init__(
        self,
        dataset: str = "THUDM/LongBench",
        subset: str = "hotpotqa",
    ):
        """Initialize LongBench benchmark.

        Args:
            dataset: HuggingFace dataset identifier.
            subset: Dataset subset/config name (e.g., 'hotpotqa', 'narrativeqa').
        """
        self.dataset = dataset
        self.subset = subset

    def load_tasks(
        self,
        sample_size: int | None = None,
        task_ids: list[str] | None = None,
        level: int | None = None,
        filter_difficulty: list[str] | None = None,
        filter_category: list[str] | None = None,
        filter_tags: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """Load tasks from LongBench dataset.

        Args:
            sample_size: Maximum number of tasks to load (None for all).
            task_ids: Specific task IDs to load (None for all).
            level: Unused for LongBench.
            filter_difficulty: Unused for LongBench.
            filter_category: Filter by task category. Valid categories:
                single_doc_qa, multi_doc_qa, summarization, few_shot,
                synthetic, code. When specified, loads tasks from all
                subsets matching the given categories.
            filter_tags: Unused for LongBench.

        Returns:
            List of LongBench task dictionaries.
        """
        _ = level
        _ = filter_difficulty
        _ = filter_tags

        # Determine which subsets to load based on filter_category
        subsets_to_load = self._resolve_subsets(filter_category)

        all_tasks: list[dict[str, Any]] = []
        for subset_name in subsets_to_load:
            try:
                ds = load_dataset(self.dataset, subset_name, split="test")
                for idx, item in enumerate(ds):
                    task = dict(item)
                    task["_subset"] = subset_name
                    task["_original_index"] = idx
                    all_tasks.append(task)
            except Exception:
                # Skip subsets that fail to load (e.g., unavailable configs)
                continue

        # Filter by task_ids if specified
        if task_ids:
            task_id_set = set(task_ids)
            all_tasks = [
                t
                for t in all_tasks
                if f"longbench_{t['_subset']}_{t['_original_index']}" in task_id_set
            ]

        # Apply sample size limit
        if sample_size is not None and len(all_tasks) > sample_size:
            all_tasks = all_tasks[:sample_size]

        # Augment tasks with instance_id and problem_statement
        augmented_tasks = []
        for task in all_tasks:
            augmented = dict(task)
            subset_name = task["_subset"]
            orig_idx = task["_original_index"]
            augmented["instance_id"] = f"longbench_{subset_name}_{orig_idx}"
            augmented["problem_statement"] = self._generate_problem_statement(augmented)
            augmented["ground_truth_answers"] = task.get("answers", [])
            augmented["task_category"] = SUBSET_TO_CATEGORY.get(subset_name, "unknown")
            augmented_tasks.append(augmented)

        return augmented_tasks

    def _resolve_subsets(self, filter_category: list[str] | None) -> list[str]:
        """Resolve which subsets to load based on category filter.

        Args:
            filter_category: List of category names to include.

        Returns:
            List of subset names to load.
        """
        if not filter_category:
            # If no category filter, use the configured subset
            return [self.subset]

        category_set = {c.lower() for c in filter_category}
        resolved = []
        for subset_name, category in SUBSET_TO_CATEGORY.items():
            if category in category_set:
                resolved.append(subset_name)

        return resolved if resolved else [self.subset]

    def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
        """Convert LongBench task to normalized format.

        Args:
            task: LongBench task dictionary.

        Returns:
            Normalized BenchmarkTask.

        Raises:
            ValueError: If required fields are missing.
        """
        instance_id = task.get("instance_id")
        if not instance_id:
            msg = f"Task missing required 'instance_id' field: {list(task.keys())}"
            raise ValueError(msg)

        input_text = task.get("input", "")
        if not input_text and not task.get("context", ""):
            msg = f"Task missing required 'input' or 'context' field: {list(task.keys())}"
            raise ValueError(msg)

        problem_statement = self._generate_problem_statement(task)
        subset_name = task.get("_subset", self.subset)

        return BenchmarkTask(
            task_id=instance_id,
            problem_statement=problem_statement,
            repo="longbench/context",
            commit="HEAD",
            metadata={
                "input": input_text,
                "context_length": task.get("length", 0),
                "dataset": task.get("dataset", subset_name),
                "language": task.get("language", "en"),
                "answers": task.get("answers", []),
                "all_classes": task.get("all_classes"),
                "subset": subset_name,
                "category": SUBSET_TO_CATEGORY.get(subset_name, "unknown"),
            },
        )

    def _generate_problem_statement(self, task: dict[str, Any]) -> str:
        """Generate problem statement from task.

        Includes the full long context in the problem statement so agents
        must process and understand it.

        Args:
            task: LongBench task dictionary.

        Returns:
            Problem statement for the agent.
        """
        input_text = task.get("input", "No input provided")
        context = task.get("context", "")
        subset_name = task.get("_subset", task.get("dataset", self.subset))
        category = SUBSET_TO_CATEGORY.get(subset_name, "unknown")

        # Build category-specific instructions
        if category in ("single_doc_qa", "multi_doc_qa"):
            task_instruction = (
                "Read the following document(s) carefully and answer the question "
                "based on the information provided."
            )
        elif category == "summarization":
            task_instruction = (
                "Read the following document(s) carefully and provide a concise summary "
                "capturing the key points."
            )
        elif category == "few_shot":
            task_instruction = (
                "Study the provided examples carefully and answer the question "
                "following the same pattern."
            )
        elif category == "synthetic":
            task_instruction = (
                "Analyze the provided passages carefully and answer the question precisely."
            )
        elif category == "code":
            task_instruction = "Read the provided code context and complete the code as requested."
        else:
            task_instruction = "Read the following context carefully and respond to the request."

        parts = [task_instruction, ""]

        if context:
            parts.append("--- CONTEXT ---")
            parts.append(context)
            parts.append("--- END CONTEXT ---")
            parts.append("")

        parts.append(f"Question/Task: {input_text}")

        return "\n".join(parts)

    async def create_environment(
        self,
        task: dict[str, Any],
        docker_manager: DockerEnvironmentManager,
    ) -> TaskEnvironment:
        """Create environment for LongBench task.

        LongBench doesn't require repository setup, so a minimal environment
        is created.

        Args:
            task: LongBench task dictionary.
            docker_manager: Docker environment manager.

        Returns:
            TaskEnvironment for the task.
        """
        instance_id = task.get("instance_id", "longbench_unknown")
        temp_task = {
            "instance_id": instance_id,
            "repo": "longbench/context",
            "base_commit": "HEAD",
        }
        return await docker_manager.create_environment(temp_task)

    async def evaluate(
        self,
        _env: TaskEnvironment,
        task: dict[str, Any],
        solution: str,
    ) -> dict[str, Any]:
        """Evaluate a solution for a LongBench task.

        Uses the appropriate metric based on the task subset:
        - F1 score for QA tasks
        - ROUGE-L for summarization tasks
        - Accuracy for classification tasks
        - Edit similarity for code tasks

        Args:
            _env: Task environment (unused; evaluation is offline).
            task: LongBench task dictionary.
            solution: Solution to evaluate (agent's response).

        Returns:
            Dictionary with evaluation results including a 'resolved' boolean.
        """
        answers = task.get("ground_truth_answers", task.get("answers", []))
        if not answers:
            return {"resolved": False, "error": "No ground truth answers available"}

        subset_name = task.get("_subset", task.get("dataset", self.subset))
        all_classes = task.get("all_classes")

        if subset_name in F1_SUBSETS:
            score = self._compute_f1_max(solution, answers)
            metric = "f1"
        elif subset_name in ROUGE_SUBSETS:
            score = self._compute_rouge_l_max(solution, answers)
            metric = "rouge_l"
        elif subset_name in ACCURACY_SUBSETS:
            score = self._compute_classification_accuracy(solution, answers, all_classes)
            metric = "accuracy"
        elif subset_name in EDIT_SIM_SUBSETS:
            score = self._compute_edit_similarity_max(solution, answers)
            metric = "edit_similarity"
        else:
            # Default to F1 for unknown subsets
            score = self._compute_f1_max(solution, answers)
            metric = "f1"

        # Threshold: score >= 0.5 counts as resolved for F1, ROUGE-L, and edit
        # similarity; accuracy requires an exact match (1.0)
        if metric == "accuracy":
            resolved = score >= 1.0
        else:
            resolved = score >= 0.5

        return {
            "resolved": resolved,
            "score": score,
            "metric": metric,
            "subset": subset_name,
            "category": SUBSET_TO_CATEGORY.get(subset_name, "unknown"),
        }

    def _normalize_text(self, text: str) -> str:
        """Normalize text for scoring by lowercasing, removing punctuation and articles.

        Args:
            text: Raw text.

        Returns:
            Normalized text.
        """
        text = text.lower().strip()
        # Remove articles
        text = re.sub(r"\b(a|an|the)\b", " ", text)
        # Remove punctuation
        text = text.translate(str.maketrans("", "", string.punctuation))
        # Collapse whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _compute_f1(self, prediction: str, reference: str) -> float:
        """Compute token-level F1 score between prediction and reference.

        Args:
            prediction: Predicted answer text.
            reference: Reference answer text.

        Returns:
            F1 score between 0.0 and 1.0.
        """
        pred_tokens = self._normalize_text(prediction).split()
        ref_tokens = self._normalize_text(reference).split()

        if not pred_tokens or not ref_tokens:
            return 1.0 if pred_tokens == ref_tokens else 0.0

        common = Counter(pred_tokens) & Counter(ref_tokens)
        num_common = sum(common.values())

        if num_common == 0:
            return 0.0

        precision = num_common / len(pred_tokens)
        recall = num_common / len(ref_tokens)

        return 2 * precision * recall / (precision + recall)
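
    # Worked example for the F1 metric above (illustrative only):
    # _compute_f1("The answer is Paris.", "Paris") normalizes the two strings to
    # ["answer", "is", "paris"] and ["paris"], giving precision = 1/3, recall = 1.0,
    # and F1 = 2 * (1/3) * 1.0 / (1/3 + 1.0) = 0.5.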

    def _compute_f1_max(self, prediction: str, references: list[str]) -> float:
        """Compute maximum F1 score across all reference answers.

        Args:
            prediction: Predicted answer text.
            references: List of reference answer texts.

        Returns:
            Maximum F1 score.
        """
        if not references:
            return 0.0
        return max(self._compute_f1(prediction, ref) for ref in references)

    def _compute_rouge_l(self, prediction: str, reference: str) -> float:
        """Compute ROUGE-L (Longest Common Subsequence) F-measure.

        Args:
            prediction: Predicted text.
            reference: Reference text.

        Returns:
            ROUGE-L F-measure between 0.0 and 1.0.
        """
        pred_tokens = self._normalize_text(prediction).split()
        ref_tokens = self._normalize_text(reference).split()

        if not pred_tokens or not ref_tokens:
            return 1.0 if pred_tokens == ref_tokens else 0.0

        lcs_length = self._lcs_length(pred_tokens, ref_tokens)

        if lcs_length == 0:
            return 0.0

        precision = lcs_length / len(pred_tokens)
        recall = lcs_length / len(ref_tokens)

        return 2 * precision * recall / (precision + recall)

    def _lcs_length(self, seq1: list[str], seq2: list[str]) -> int:
        """Compute length of longest common subsequence.

        Args:
            seq1: First sequence.
            seq2: Second sequence.

        Returns:
            Length of the LCS.
        """
        m, n = len(seq1), len(seq2)
        # Use space-optimized DP with two rows
        prev = [0] * (n + 1)
        curr = [0] * (n + 1)

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    curr[j] = prev[j - 1] + 1
                else:
                    curr[j] = max(prev[j], curr[j - 1])
            prev, curr = curr, [0] * (n + 1)

        return prev[n]

    def _compute_rouge_l_max(self, prediction: str, references: list[str]) -> float:
        """Compute maximum ROUGE-L score across all reference answers.

        Args:
            prediction: Predicted text.
            references: List of reference texts.

        Returns:
            Maximum ROUGE-L score.
        """
        if not references:
            return 0.0
        return max(self._compute_rouge_l(prediction, ref) for ref in references)

    def _compute_classification_accuracy(
        self,
        prediction: str,
        references: list[str],
        all_classes: list[str] | None = None,
    ) -> float:
        """Compute classification accuracy.

        For classification tasks, checks if the predicted class matches
        any of the reference answers.

        Args:
            prediction: Predicted class/answer.
            references: List of correct answers.
            all_classes: All possible class labels (unused but available).

        Returns:
            1.0 if correct, 0.0 otherwise.
        """
        _ = all_classes
        pred_normalized = self._normalize_text(prediction)
        for ref in references:
            ref_normalized = self._normalize_text(ref)
            if ref_normalized == pred_normalized or ref_normalized in pred_normalized:
                return 1.0
        return 0.0

    def _compute_edit_similarity(self, prediction: str, reference: str) -> float:
        """Compute edit similarity between prediction and reference.

        Edit similarity = 1 - (edit_distance / max_length).

        Args:
            prediction: Predicted code.
            reference: Reference code.

        Returns:
            Edit similarity between 0.0 and 1.0.
        """
        if not prediction and not reference:
            return 1.0
        if not prediction or not reference:
            return 0.0

        m, n = len(prediction), len(reference)

        # Levenshtein distance using space-optimized DP
        prev = list(range(n + 1))
        curr = [0] * (n + 1)

        for i in range(1, m + 1):
            curr[0] = i
            for j in range(1, n + 1):
                if prediction[i - 1] == reference[j - 1]:
                    curr[j] = prev[j - 1]
                else:
                    curr[j] = 1 + min(prev[j], curr[j - 1], prev[j - 1])
            prev, curr = curr, [0] * (n + 1)

        edit_distance = prev[n]
        max_length = max(m, n)
        return 1.0 - (edit_distance / max_length)
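
    # Worked example for the edit-similarity metric above (illustrative only):
    # _compute_edit_similarity("abcd", "abed") has a Levenshtein distance of 1
    # (one substitution) over a maximum length of 4, so it returns 1 - 1/4 = 0.75.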

    def _compute_edit_similarity_max(self, prediction: str, references: list[str]) -> float:
        """Compute maximum edit similarity across all reference answers.

        Args:
            prediction: Predicted code.
            references: List of reference code snippets.

        Returns:
            Maximum edit similarity score.
        """
        if not references:
            return 0.0
        return max(self._compute_edit_similarity(prediction, ref) for ref in references)

    def get_prebuilt_image(self, _task: dict[str, Any]) -> str | None:
        """Get pre-built Docker image name for LongBench task.

        LongBench doesn't use pre-built images.

        Args:
            _task: LongBench task dictionary (unused).

        Returns:
            None (no pre-built images available).
        """
        return None

    def get_prompt_template(self) -> str:
        """Get LongBench prompt template.

        Returns:
            Prompt template for long-context understanding tasks.
        """
        return (
            "You are given a long-context understanding task.\n\n"
            "{problem_statement}\n\n"
            "IMPORTANT INSTRUCTIONS:\n"
            "- Read the entire context carefully before answering\n"
            "- Base your answer solely on the information provided in the context\n"
            "- Be concise and precise in your answer\n"
            "- For QA tasks, provide a direct answer to the question\n"
            "- For summarization tasks, capture the key points concisely\n"
            "- For code tasks, complete the code following the existing patterns"
        )
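
A minimal usage sketch of the new evaluator (illustrative only, assuming mcpbr 0.5.0 and its dependencies are installed; the environment argument is unused by LongBench evaluation, so None is passed here purely for demonstration):

import asyncio

from mcpbr.benchmarks.longbench import LongBenchBenchmark

bench = LongBenchBenchmark(subset="hotpotqa")
task = {
    "_subset": "hotpotqa",
    "instance_id": "longbench_hotpotqa_0",  # hypothetical task for illustration
    "ground_truth_answers": ["Paris"],
}
# hotpotqa is an F1-scored subset; "The answer is Paris." yields F1 = 0.5,
# which meets the 0.5 threshold, so the task should be reported as resolved.
result = asyncio.run(bench.evaluate(None, task, "The answer is Paris."))
print(result["metric"], result["score"], result["resolved"])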