mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/benchmarks/longbench.py
@@ -0,0 +1,623 @@
+ """LongBench benchmark implementation for long-context understanding."""
+
+ import re
+ import string
+ from collections import Counter
+ from typing import Any
+
+ from datasets import load_dataset
+
+ from ..docker_env import DockerEnvironmentManager, TaskEnvironment
+ from .base import BenchmarkTask
+
+ # Mapping of LongBench subsets to their high-level categories
+ SUBSET_TO_CATEGORY: dict[str, str] = {
+     # Single-Document QA
+     "narrativeqa": "single_doc_qa",
+     "qasper": "single_doc_qa",
+     "multifieldqa_en": "single_doc_qa",
+     "multifieldqa_zh": "single_doc_qa",
+     # Multi-Document QA
+     "hotpotqa": "multi_doc_qa",
+     "2wikimqa": "multi_doc_qa",
+     "musique": "multi_doc_qa",
+     "dureader": "multi_doc_qa",
+     # Summarization
+     "gov_report": "summarization",
+     "qmsum": "summarization",
+     "multi_news": "summarization",
+     "vcsum": "summarization",
+     "samsum": "summarization",
+     # Few-Shot Learning
+     "triviaqa": "few_shot",
+     "trec": "few_shot",
+     "lsht": "few_shot",
+     # Synthetic Tasks
+     "passage_count": "synthetic",
+     "passage_retrieval_en": "synthetic",
+     "passage_retrieval_zh": "synthetic",
+     # Code Completion
+     "lcc": "code",
+     "repobench-p": "code",
+ }
+
+ # Subsets that use F1 scoring
+ F1_SUBSETS = {
+     "narrativeqa",
+     "qasper",
+     "multifieldqa_en",
+     "multifieldqa_zh",
+     "hotpotqa",
+     "2wikimqa",
+     "musique",
+     "triviaqa",
+ }
+
+ # Subsets that use ROUGE-L scoring
+ ROUGE_SUBSETS = {
+     "dureader",
+     "gov_report",
+     "qmsum",
+     "multi_news",
+     "vcsum",
+     "samsum",
+ }
+
+ # Subsets that use classification accuracy
+ ACCURACY_SUBSETS = {
+     "trec",
+     "lsht",
+     "passage_count",
+     "passage_retrieval_en",
+     "passage_retrieval_zh",
+ }
+
+ # Subsets that use edit similarity
+ EDIT_SIM_SUBSETS = {
+     "lcc",
+     "repobench-p",
+ }
+
+ # All available subsets
+ ALL_SUBSETS = list(SUBSET_TO_CATEGORY.keys())
+
+
+ class LongBenchBenchmark:
+     """LongBench benchmark implementation.
+
+     LongBench is a bilingual, multitask benchmark for long context understanding.
+     It covers 6 categories across 21 tasks with contexts ranging from thousands
+     to tens of thousands of tokens:
+
+     - Single-Document QA: narrativeqa, qasper, multifieldqa_en, multifieldqa_zh
+     - Multi-Document QA: hotpotqa, 2wikimqa, musique, dureader
+     - Summarization: gov_report, qmsum, multi_news, vcsum, samsum
+     - Few-Shot Learning: triviaqa, trec, lsht
+     - Synthetic Tasks: passage_count, passage_retrieval_en, passage_retrieval_zh
+     - Code Completion: lcc, repobench-p
+
+     Evaluation uses task-appropriate metrics: F1 for QA, ROUGE-L for summarization,
+     accuracy for classification, and edit similarity for code tasks.
+     """
+
+     name = "longbench"
+
+     def __init__(
+         self,
+         dataset: str = "THUDM/LongBench",
+         subset: str = "hotpotqa",
+     ):
+         """Initialize LongBench benchmark.
+
+         Args:
+             dataset: HuggingFace dataset identifier.
+             subset: Dataset subset/config name (e.g., 'hotpotqa', 'narrativeqa').
+         """
+         self.dataset = dataset
+         self.subset = subset
+
+     def load_tasks(
+         self,
+         sample_size: int | None = None,
+         task_ids: list[str] | None = None,
+         level: int | None = None,
+         filter_difficulty: list[str] | None = None,
+         filter_category: list[str] | None = None,
+         filter_tags: list[str] | None = None,
+     ) -> list[dict[str, Any]]:
+         """Load tasks from LongBench dataset.
+
+         Args:
+             sample_size: Maximum number of tasks to load (None for all).
+             task_ids: Specific task IDs to load (None for all).
+             level: Unused for LongBench.
+             filter_difficulty: Unused for LongBench.
+             filter_category: Filter by task category. Valid categories:
+                 single_doc_qa, multi_doc_qa, summarization, few_shot,
+                 synthetic, code. When specified, loads tasks from all
+                 subsets matching the given categories.
+             filter_tags: Unused for LongBench.
+
+         Returns:
+             List of LongBench task dictionaries.
+         """
+         _ = level
+         _ = filter_difficulty
+         _ = filter_tags
+
+         # Determine which subsets to load based on filter_category
+         subsets_to_load = self._resolve_subsets(filter_category)
+
+         all_tasks: list[dict[str, Any]] = []
+         for subset_name in subsets_to_load:
+             try:
+                 ds = load_dataset(self.dataset, subset_name, split="test")
+                 for idx, item in enumerate(ds):
+                     task = dict(item)
+                     task["_subset"] = subset_name
+                     task["_original_index"] = idx
+                     all_tasks.append(task)
+             except Exception:
+                 # Skip subsets that fail to load (e.g., unavailable configs)
+                 continue
+
+         # Filter by task_ids if specified
+         if task_ids:
+             task_id_set = set(task_ids)
+             all_tasks = [
+                 t
+                 for t in all_tasks
+                 if f"longbench_{t['_subset']}_{t['_original_index']}" in task_id_set
+             ]
+
+         # Apply sample size limit
+         if sample_size is not None and len(all_tasks) > sample_size:
+             all_tasks = all_tasks[:sample_size]
+
+         # Augment tasks with instance_id and problem_statement
+         augmented_tasks = []
+         for task in all_tasks:
+             augmented = dict(task)
+             subset_name = task["_subset"]
+             orig_idx = task["_original_index"]
+             augmented["instance_id"] = f"longbench_{subset_name}_{orig_idx}"
+             augmented["problem_statement"] = self._generate_problem_statement(augmented)
+             augmented["ground_truth_answers"] = task.get("answers", [])
+             augmented["task_category"] = SUBSET_TO_CATEGORY.get(subset_name, "unknown")
+             augmented_tasks.append(augmented)
+
+         return augmented_tasks
+
+     def _resolve_subsets(self, filter_category: list[str] | None) -> list[str]:
+         """Resolve which subsets to load based on category filter.
+
+         Args:
+             filter_category: List of category names to include.
+
+         Returns:
+             List of subset names to load.
+         """
+         if not filter_category:
+             # If no category filter, use the configured subset
+             return [self.subset]
+
+         category_set = {c.lower() for c in filter_category}
+         resolved = []
+         for subset_name, category in SUBSET_TO_CATEGORY.items():
+             if category in category_set:
+                 resolved.append(subset_name)
+
+         return resolved if resolved else [self.subset]
+
+     def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
+         """Convert LongBench task to normalized format.
+
+         Args:
+             task: LongBench task dictionary.
+
+         Returns:
+             Normalized BenchmarkTask.
+
+         Raises:
+             ValueError: If required fields are missing.
+         """
+         instance_id = task.get("instance_id")
+         if not instance_id:
+             msg = f"Task missing required 'instance_id' field: {list(task.keys())}"
+             raise ValueError(msg)
+
+         input_text = task.get("input", "")
+         if not input_text and not task.get("context", ""):
+             msg = f"Task missing required 'input' or 'context' field: {list(task.keys())}"
+             raise ValueError(msg)
+
+         problem_statement = self._generate_problem_statement(task)
+         subset_name = task.get("_subset", self.subset)
+
+         return BenchmarkTask(
+             task_id=instance_id,
+             problem_statement=problem_statement,
+             repo="longbench/context",
+             commit="HEAD",
+             metadata={
+                 "input": input_text,
+                 "context_length": task.get("length", 0),
+                 "dataset": task.get("dataset", subset_name),
+                 "language": task.get("language", "en"),
+                 "answers": task.get("answers", []),
+                 "all_classes": task.get("all_classes"),
+                 "subset": subset_name,
+                 "category": SUBSET_TO_CATEGORY.get(subset_name, "unknown"),
+             },
+         )
+
+     def _generate_problem_statement(self, task: dict[str, Any]) -> str:
+         """Generate problem statement from task.
+
+         Includes the full long context in the problem statement so agents
+         must process and understand it.
+
+         Args:
+             task: LongBench task dictionary.
+
+         Returns:
+             Problem statement for the agent.
+         """
+         input_text = task.get("input", "No input provided")
+         context = task.get("context", "")
+         subset_name = task.get("_subset", task.get("dataset", self.subset))
+         category = SUBSET_TO_CATEGORY.get(subset_name, "unknown")
+
+         # Build category-specific instructions
+         if category in ("single_doc_qa", "multi_doc_qa"):
+             task_instruction = (
+                 "Read the following document(s) carefully and answer the question "
+                 "based on the information provided."
+             )
+         elif category == "summarization":
+             task_instruction = (
+                 "Read the following document(s) carefully and provide a concise summary "
+                 "capturing the key points."
+             )
+         elif category == "few_shot":
+             task_instruction = (
+                 "Study the provided examples carefully and answer the question "
+                 "following the same pattern."
+             )
+         elif category == "synthetic":
+             task_instruction = (
+                 "Analyze the provided passages carefully and answer the question precisely."
+             )
+         elif category == "code":
+             task_instruction = "Read the provided code context and complete the code as requested."
+         else:
+             task_instruction = "Read the following context carefully and respond to the request."
+
+         parts = [task_instruction, ""]
+
+         if context:
+             parts.append("--- CONTEXT ---")
+             parts.append(context)
+             parts.append("--- END CONTEXT ---")
+             parts.append("")
+
+         parts.append(f"Question/Task: {input_text}")
+
+         return "\n".join(parts)
+
+     async def create_environment(
+         self,
+         task: dict[str, Any],
+         docker_manager: DockerEnvironmentManager,
+     ) -> TaskEnvironment:
+         """Create environment for LongBench task.
+
+         LongBench doesn't require repository setup - creates minimal environment.
+
+         Args:
+             task: LongBench task dictionary.
+             docker_manager: Docker environment manager.
+
+         Returns:
+             TaskEnvironment for the task.
+         """
+         instance_id = task.get("instance_id", "longbench_unknown")
+         temp_task = {
+             "instance_id": instance_id,
+             "repo": "longbench/context",
+             "base_commit": "HEAD",
+         }
+         return await docker_manager.create_environment(temp_task)
+
+     async def evaluate(
+         self,
+         _env: TaskEnvironment,
+         task: dict[str, Any],
+         solution: str,
+     ) -> dict[str, Any]:
+         """Evaluate a solution for LongBench task.
+
+         Uses the appropriate metric based on the task subset:
+         - F1 score for QA tasks
+         - ROUGE-L for summarization tasks
+         - Accuracy for classification tasks
+         - Edit similarity for code tasks
+
+         Args:
+             _env: Task environment (unused; evaluation is offline).
+             task: LongBench task dictionary.
+             solution: Solution to evaluate (agent's response).
+
+         Returns:
+             Dictionary with evaluation results including 'resolved' boolean.
+         """
+         answers = task.get("ground_truth_answers", task.get("answers", []))
+         if not answers:
+             return {"resolved": False, "error": "No ground truth answers available"}
+
+         subset_name = task.get("_subset", task.get("dataset", self.subset))
+         all_classes = task.get("all_classes")
+
+         if subset_name in F1_SUBSETS:
+             score = self._compute_f1_max(solution, answers)
+             metric = "f1"
+         elif subset_name in ROUGE_SUBSETS:
+             score = self._compute_rouge_l_max(solution, answers)
+             metric = "rouge_l"
+         elif subset_name in ACCURACY_SUBSETS:
+             score = self._compute_classification_accuracy(solution, answers, all_classes)
+             metric = "accuracy"
+         elif subset_name in EDIT_SIM_SUBSETS:
+             score = self._compute_edit_similarity_max(solution, answers)
+             metric = "edit_similarity"
+         else:
+             # Default to F1 for unknown subsets
+             score = self._compute_f1_max(solution, answers)
+             metric = "f1"
+
+         # Threshold: score >= 0.5 is considered resolved for F1/ROUGE
+         # For accuracy, exact match (1.0) is required
+         if metric == "accuracy":
+             resolved = score >= 1.0
+         else:
+             resolved = score >= 0.5
+
+         return {
+             "resolved": resolved,
+             "score": score,
+             "metric": metric,
+             "subset": subset_name,
+             "category": SUBSET_TO_CATEGORY.get(subset_name, "unknown"),
+         }
+
+     def _normalize_text(self, text: str) -> str:
+         """Normalize text for scoring by lowercasing, removing punctuation and articles.
+
+         Args:
+             text: Raw text.
+
+         Returns:
+             Normalized text.
+         """
+         text = text.lower().strip()
+         # Remove articles
+         text = re.sub(r"\b(a|an|the)\b", " ", text)
+         # Remove punctuation
+         text = text.translate(str.maketrans("", "", string.punctuation))
+         # Collapse whitespace
+         text = re.sub(r"\s+", " ", text).strip()
+         return text
+
+     def _compute_f1(self, prediction: str, reference: str) -> float:
+         """Compute token-level F1 score between prediction and reference.
+
+         Args:
+             prediction: Predicted answer text.
+             reference: Reference answer text.
+
+         Returns:
+             F1 score between 0.0 and 1.0.
+         """
+         pred_tokens = self._normalize_text(prediction).split()
+         ref_tokens = self._normalize_text(reference).split()
+
+         if not pred_tokens or not ref_tokens:
+             return 1.0 if pred_tokens == ref_tokens else 0.0
+
+         common = Counter(pred_tokens) & Counter(ref_tokens)
+         num_common = sum(common.values())
+
+         if num_common == 0:
+             return 0.0
+
+         precision = num_common / len(pred_tokens)
+         recall = num_common / len(ref_tokens)
+
+         return 2 * precision * recall / (precision + recall)
+
+     def _compute_f1_max(self, prediction: str, references: list[str]) -> float:
+         """Compute maximum F1 score across all reference answers.
+
+         Args:
+             prediction: Predicted answer text.
+             references: List of reference answer texts.
+
+         Returns:
+             Maximum F1 score.
+         """
+         if not references:
+             return 0.0
+         return max(self._compute_f1(prediction, ref) for ref in references)
+
+     def _compute_rouge_l(self, prediction: str, reference: str) -> float:
+         """Compute ROUGE-L (Longest Common Subsequence) F-measure.
+
+         Args:
+             prediction: Predicted text.
+             reference: Reference text.
+
+         Returns:
+             ROUGE-L F-measure between 0.0 and 1.0.
+         """
+         pred_tokens = self._normalize_text(prediction).split()
+         ref_tokens = self._normalize_text(reference).split()
+
+         if not pred_tokens or not ref_tokens:
+             return 1.0 if pred_tokens == ref_tokens else 0.0
+
+         lcs_length = self._lcs_length(pred_tokens, ref_tokens)
+
+         if lcs_length == 0:
+             return 0.0
+
+         precision = lcs_length / len(pred_tokens)
+         recall = lcs_length / len(ref_tokens)
+
+         return 2 * precision * recall / (precision + recall)
+
+     def _lcs_length(self, seq1: list[str], seq2: list[str]) -> int:
+         """Compute length of longest common subsequence.
+
+         Args:
+             seq1: First sequence.
+             seq2: Second sequence.
+
+         Returns:
+             Length of the LCS.
+         """
+         m, n = len(seq1), len(seq2)
+         # Use space-optimized DP with two rows
+         prev = [0] * (n + 1)
+         curr = [0] * (n + 1)
+
+         for i in range(1, m + 1):
+             for j in range(1, n + 1):
+                 if seq1[i - 1] == seq2[j - 1]:
+                     curr[j] = prev[j - 1] + 1
+                 else:
+                     curr[j] = max(prev[j], curr[j - 1])
+             prev, curr = curr, [0] * (n + 1)
+
+         return prev[n]
+
+     def _compute_rouge_l_max(self, prediction: str, references: list[str]) -> float:
+         """Compute maximum ROUGE-L score across all reference answers.
+
+         Args:
+             prediction: Predicted text.
+             references: List of reference texts.
+
+         Returns:
+             Maximum ROUGE-L score.
+         """
+         if not references:
+             return 0.0
+         return max(self._compute_rouge_l(prediction, ref) for ref in references)
+
+     def _compute_classification_accuracy(
+         self,
+         prediction: str,
+         references: list[str],
+         all_classes: list[str] | None = None,
+     ) -> float:
+         """Compute classification accuracy.
+
+         For classification tasks, checks if the predicted class matches
+         any of the reference answers.
+
+         Args:
+             prediction: Predicted class/answer.
+             references: List of correct answers.
+             all_classes: All possible class labels (unused but available).
+
+         Returns:
+             1.0 if correct, 0.0 otherwise.
+         """
+         _ = all_classes
+         pred_normalized = self._normalize_text(prediction)
+         for ref in references:
+             ref_normalized = self._normalize_text(ref)
+             if ref_normalized == pred_normalized or ref_normalized in pred_normalized:
+                 return 1.0
+         return 0.0
+
+     def _compute_edit_similarity(self, prediction: str, reference: str) -> float:
+         """Compute edit similarity between prediction and reference.
+
+         Edit similarity = 1 - (edit_distance / max_length).
+
+         Args:
+             prediction: Predicted code.
+             reference: Reference code.
+
+         Returns:
+             Edit similarity between 0.0 and 1.0.
+         """
+         if not prediction and not reference:
+             return 1.0
+         if not prediction or not reference:
+             return 0.0
+
+         m, n = len(prediction), len(reference)
+
+         # Levenshtein distance using space-optimized DP
+         prev = list(range(n + 1))
+         curr = [0] * (n + 1)
+
+         for i in range(1, m + 1):
+             curr[0] = i
+             for j in range(1, n + 1):
+                 if prediction[i - 1] == reference[j - 1]:
+                     curr[j] = prev[j - 1]
+                 else:
+                     curr[j] = 1 + min(prev[j], curr[j - 1], prev[j - 1])
+             prev, curr = curr, [0] * (n + 1)
+
+         edit_distance = prev[n]
+         max_length = max(m, n)
+         return 1.0 - (edit_distance / max_length)
+
+     def _compute_edit_similarity_max(self, prediction: str, references: list[str]) -> float:
+         """Compute maximum edit similarity across all reference answers.
+
+         Args:
+             prediction: Predicted code.
+             references: List of reference code snippets.
+
+         Returns:
+             Maximum edit similarity score.
+         """
+         if not references:
+             return 0.0
+         return max(self._compute_edit_similarity(prediction, ref) for ref in references)
+
+     def get_prebuilt_image(self, _task: dict[str, Any]) -> str | None:
+         """Get pre-built Docker image name for LongBench task.
+
+         LongBench doesn't use pre-built images.
+
+         Args:
+             _task: LongBench task dictionary (unused).
+
+         Returns:
+             None (no pre-built images available).
+         """
+         return None
+
+     def get_prompt_template(self) -> str:
+         """Get LongBench prompt template.
+
+         Returns:
+             Prompt template for long-context understanding tasks.
+         """
+         return (
+             "You are given a long-context understanding task.\n\n"
+             "{problem_statement}\n\n"
+             "IMPORTANT INSTRUCTIONS:\n"
+             "- Read the entire context carefully before answering\n"
+             "- Base your answer solely on the information provided in the context\n"
+             "- Be concise and precise in your answer\n"
+             "- For QA tasks, provide a direct answer to the question\n"
+             "- For summarization tasks, capture the key points concisely\n"
+             "- For code tasks, complete the code following the existing patterns"
+         )