mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcpbr/benchmarks/__init__.py +12 -0
- mcpbr/benchmarks/adversarial.py +341 -0
- mcpbr/benchmarks/custom.py +607 -0
- mcpbr/benchmarks/longbench.py +623 -0
- mcpbr/benchmarks/mmmu.py +353 -0
- mcpbr/config.py +4 -0
- mcpbr/config_migration.py +470 -0
- mcpbr/config_wizard.py +647 -0
- mcpbr/custom_metrics.py +405 -0
- mcpbr/dashboard.py +619 -0
- mcpbr/dataset_streaming.py +491 -0
- mcpbr/dataset_versioning.py +222 -0
- mcpbr/docker_cache.py +539 -0
- mcpbr/docker_prewarm.py +369 -0
- mcpbr/dry_run.py +532 -0
- mcpbr/failure_analysis.py +558 -0
- mcpbr/few_shot.py +367 -0
- mcpbr/formatting.py +444 -0
- mcpbr/gpu_support.py +157 -0
- mcpbr/harness.py +38 -4
- mcpbr/latency_metrics.py +317 -0
- mcpbr/resource_limits.py +487 -0
- mcpbr/result_streaming.py +519 -0
- mcpbr/sampling.py +193 -0
- mcpbr/task_batching.py +403 -0
- mcpbr/task_scheduler.py +468 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
- {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
- {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/benchmarks/mmmu.py
ADDED
@@ -0,0 +1,353 @@
"""MMMU (Massive Multi-discipline Multimodal Understanding) benchmark implementation."""

import base64
import io
import re
from typing import Any

from datasets import load_dataset

from ..docker_env import DockerEnvironmentManager, TaskEnvironment
from .base import BenchmarkTask


class MMMUBenchmark:
    """MMMU benchmark implementation.

    The Massive Multi-discipline Multimodal Understanding (MMMU) benchmark
    evaluates multimodal models on college-level subject knowledge across
    30 subjects and 183 subfields. Tasks require understanding images
    (diagrams, charts, figures, etc.) alongside text to answer
    multiple-choice questions (A, B, C, D).

    Evaluation compares the selected answer letter against the ground truth.
    """

    name = "mmmu"

    def __init__(self, dataset: str = "MMMU/MMMU", subset: str = "Accounting"):
        """Initialize MMMU benchmark.

        Args:
            dataset: HuggingFace dataset identifier.
            subset: Dataset subset/config name (default: 'Accounting').
                MMMU is organized by subject (e.g., 'Accounting', 'Art',
                'Biology', 'Chemistry', 'Computer_Science', etc.).
        """
        self.dataset = dataset
        self.subset = subset

    def load_tasks(
        self,
        sample_size: int | None = None,
        task_ids: list[str] | None = None,
        level: int | None = None,
        filter_difficulty: list[str] | None = None,
        filter_category: list[str] | None = None,
        filter_tags: list[str] | None = None,
    ) -> list[dict[str, Any]]:
        """Load tasks from MMMU dataset.

        Args:
            sample_size: Maximum number of tasks to load (None for all).
            task_ids: Specific task IDs to load (None for all).
            level: Unused for MMMU (no difficulty levels).
            filter_difficulty: Unused for MMMU.
            filter_category: Filter by subject category (overrides subset).
            filter_tags: Unused for MMMU.

        Returns:
            List of MMMU task dictionaries.
        """
        _ = level
        _ = filter_difficulty
        _ = filter_tags

        subset = self.subset
        if filter_category and len(filter_category) > 0:
            subset = filter_category[0]

        # MMMU uses 'validation' split for evaluation (test labels are hidden)
        dataset = load_dataset(self.dataset, subset, split="validation")

        tasks = list(dataset)

        if task_ids:
            task_id_set = set(task_ids)
            tasks = [t for t in tasks if t.get("id", "") in task_id_set]

        if sample_size is not None and len(tasks) > sample_size:
            tasks = tasks[:sample_size]

        augmented_tasks = []
        for idx, task in enumerate(tasks):
            augmented = dict(task)
            augmented["instance_id"] = f"mmmu_{task.get('id', idx)}"
            augmented["problem_statement"] = self._generate_problem_statement(augmented)
            augmented["ground_truth_answer"] = task.get("answer", "")
            augmented_tasks.append(augmented)

        return augmented_tasks

    def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
        """Convert MMMU task to normalized format.

        Args:
            task: MMMU task dictionary.

        Returns:
            Normalized BenchmarkTask.

        Raises:
            ValueError: If required fields are missing.
        """
        instance_id = task.get("instance_id")
        if not instance_id:
            msg = f"Task missing required 'instance_id' field: {task.keys()}"
            raise ValueError(msg)

        question = task.get("question", "")
        if not question:
            msg = f"Task missing required 'question' field: {task.keys()}"
            raise ValueError(msg)

        problem_statement = self._generate_problem_statement(task)

        metadata: dict[str, Any] = {
            "question": question,
            "answer": task.get("answer", ""),
            "subject": task.get("subfield", task.get("subject", "")),
            "options": task.get("options", []),
        }

        # Encode images as base64 in metadata
        image_data = self._extract_images(task)
        if image_data:
            metadata["images"] = image_data

        return BenchmarkTask(
            task_id=instance_id,
            problem_statement=problem_statement,
            repo="mmmu/multimodal",
            commit="HEAD",
            metadata=metadata,
        )

    def _extract_images(self, task: dict[str, Any]) -> list[str]:
        """Extract and encode images from the task as base64 strings.

        MMMU tasks can contain up to 7 images in fields image_1 through image_7.

        Args:
            task: MMMU task dictionary.

        Returns:
            List of base64-encoded image strings.
        """
        images = []
        for i in range(1, 8):
            image_key = f"image_{i}"
            image = task.get(image_key)
            if image is not None:
                try:
                    # Handle PIL Image objects from HuggingFace datasets
                    if hasattr(image, "save"):
                        buffer = io.BytesIO()
                        image.save(buffer, format="PNG")
                        img_bytes = buffer.getvalue()
                        images.append(base64.b64encode(img_bytes).decode("utf-8"))
                    elif isinstance(image, bytes):
                        images.append(base64.b64encode(image).decode("utf-8"))
                    elif isinstance(image, str):
                        # Already base64 or a path - store as-is
                        images.append(image)
                except Exception:
                    # Skip images that cannot be encoded
                    continue
        return images

    def _generate_problem_statement(self, task: dict[str, Any]) -> str:
        """Generate problem statement from task.

        Args:
            task: MMMU task dictionary.

        Returns:
            Problem statement for the agent.
        """
        question = task.get("question", "No question provided")
        options = task.get("options", [])
        subject = task.get("subfield", task.get("subject", ""))

        statement = ""
        if subject:
            statement += f"Subject: {subject}\n\n"

        statement += f"Question:\n{question}\n\n"

        # Include image references in the statement
        image_data = self._extract_images(task)
        if image_data:
            statement += f"[This question includes {len(image_data)} image(s). "
            statement += "The image data is provided as base64-encoded PNG in the metadata.]\n\n"

        if options:
            statement += "Options:\n"
            labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
            for i, option in enumerate(options):
                if i < len(labels):
                    statement += f" ({labels[i]}) {option}\n"

        statement += (
            "\nSelect the correct answer by providing ONLY the letter "
            "(A, B, C, or D) of the correct option."
        )

        return statement

    async def create_environment(
        self,
        task: dict[str, Any],
        docker_manager: DockerEnvironmentManager,
    ) -> TaskEnvironment:
        """Create environment for MMMU task.

        MMMU doesn't require complex environment setup - creates minimal
        environment since evaluation is based on answer comparison.

        Args:
            task: MMMU task dictionary.
            docker_manager: Docker environment manager.

        Returns:
            TaskEnvironment for the task.
        """
        instance_id = task.get("instance_id", "mmmu_unknown")
        temp_task = {
            "instance_id": instance_id,
            "repo": "mmmu/multimodal",
            "base_commit": "HEAD",
        }
        return await docker_manager.create_environment(temp_task)

    async def evaluate(
        self,
        _env: TaskEnvironment,
        task: dict[str, Any],
        solution: str,
    ) -> dict[str, Any]:
        """Evaluate a solution for MMMU task.

        Extracts the answer letter from the solution and compares it
        to the ground truth answer.

        Args:
            _env: Task environment (unused; evaluation is offline).
            task: MMMU task dictionary.
            solution: Solution to evaluate (agent's response with answer letter).

        Returns:
            Dictionary with evaluation results including 'resolved' boolean.
        """
        ground_truth = task.get("answer", task.get("ground_truth_answer", ""))
        if not ground_truth:
            return {
                "resolved": False,
                "error": "No ground truth answer available for evaluation",
            }

        # Normalize ground truth to uppercase letter
        ground_truth = ground_truth.strip().upper()

        # Extract answer from agent's solution
        agent_answer = self._extract_answer(solution)
        if agent_answer is None:
            return {
                "resolved": False,
                "error": "Could not extract answer letter from agent's solution",
                "agent_solution": solution[:500],
            }

        resolved = agent_answer == ground_truth

        return {
            "resolved": resolved,
            "agent_answer": agent_answer,
            "correct_answer": ground_truth,
        }

    def _extract_answer(self, text: str) -> str | None:
        """Extract the answer letter from the agent's response.

        Handles various formats:
        - Plain letter: "A", "B", "C", "D"
        - Parenthesized: "(A)", "(B)"
        - Sentence: "The answer is A", "The correct option is B"
        - Boxed: "\\boxed{A}"

        Args:
            text: Text containing the answer.

        Returns:
            Uppercase answer letter, or None if no answer found.
        """
        if not text:
            return None

        text_upper = text.upper().strip()

        # Pattern 1: LaTeX boxed answer
        match = re.search(r"\\boxed\{([A-D])\}", text_upper)
        if match:
            return match.group(1)

        # Pattern 2: "The answer is X" or "The correct answer is X"
        match = re.search(
            r"(?:the\s+)?(?:correct\s+)?(?:final\s+)?answer\s*(?:is|:)\s*\(?([A-D])\)?",
            text_upper,
        )
        if match:
            return match.group(1)

        # Pattern 3: "Option X" or "Choice X"
        match = re.search(r"(?:option|choice)\s*(?:is\s*)?:?\s*\(?([A-D])\)?", text_upper)
        if match:
            return match.group(1)

        # Pattern 4: Standalone letter (last single A-D found as a word boundary)
        matches = re.findall(r"\b([A-D])\b", text_upper)
        if matches:
            return matches[-1]

        return None

    def get_prebuilt_image(self, _task: dict[str, Any]) -> str | None:
        """Get pre-built Docker image name for MMMU task.

        MMMU doesn't use pre-built images.

        Args:
            _task: MMMU task dictionary (unused).

        Returns:
            None (no pre-built images available).
        """
        return None

    def get_prompt_template(self) -> str:
        """Get MMMU prompt template.

        Returns:
            Prompt template for multimodal understanding questions.
        """
        return (
            "Answer the following multi-modal question that may include images, "
            "diagrams, charts, or figures:\n\n"
            "{problem_statement}\n\n"
            "IMPORTANT INSTRUCTIONS:\n"
            "- Carefully examine any images, diagrams, or figures provided\n"
            "- Apply your knowledge of the relevant subject area\n"
            "- Consider each answer option carefully\n"
            "- Show your reasoning step-by-step\n"
            "- Respond with ONLY the letter of the correct answer (A, B, C, or D)"
        )
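
For reviewers who want to sanity-check the new module's answer matching without Docker or a dataset download, here is a minimal editorial sketch (not part of the diff). It assumes mcpbr 0.5.0 and its datasets dependency are installed; the task dict is a hypothetical stand-in for a real MMMU record, it pokes at underscore-prefixed internals for illustration, and None stands in for the environment argument, which evaluate() never touches.

# Editorial sketch, not part of the package diff: a quick offline check of
# MMMUBenchmark's answer matching. The task dict below is hypothetical.
import asyncio

from mcpbr.benchmarks.mmmu import MMMUBenchmark

bench = MMMUBenchmark(subset="Accounting")

task = {
    "instance_id": "mmmu_demo",
    "question": "Which account type increases with a debit?",
    "options": ["Revenue", "Assets", "Liabilities", "Equity"],
    "answer": "B",
    "subfield": "Accounting",
}

# _generate_problem_statement renders subject, question, options, and the
# "ONLY the letter" instruction into a single prompt string.
print(bench._generate_problem_statement(task))

# _extract_answer tolerates several response shapes, in the worst case
# falling back to the last standalone A-D letter in the text.
assert bench._extract_answer("The correct answer is (B).") == "B"
assert bench._extract_answer("Option C") == "C"

# evaluate() is a pure string comparison against the ground truth, so an
# offline check with a None environment runs in milliseconds.
result = asyncio.run(bench.evaluate(None, task, "I think the answer is B"))
assert result == {"resolved": True, "agent_answer": "B", "correct_answer": "B"}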