mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (38)
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/benchmarks/mmmu.py ADDED
@@ -0,0 +1,353 @@
+ """MMMU (Massive Multi-discipline Multimodal Understanding) benchmark implementation."""
+
+ import base64
+ import io
+ import re
+ from typing import Any
+
+ from datasets import load_dataset
+
+ from ..docker_env import DockerEnvironmentManager, TaskEnvironment
+ from .base import BenchmarkTask
+
+
+ class MMMUBenchmark:
+     """MMMU benchmark implementation.
+
+     The Massive Multi-discipline Multimodal Understanding (MMMU) benchmark
+     evaluates multimodal models on college-level subject knowledge across
+     30 subjects and 183 subfields. Tasks require understanding images
+     (diagrams, charts, figures, etc.) alongside text to answer
+     multiple-choice questions (A, B, C, D).
+
+     Evaluation compares the selected answer letter against the ground truth.
+     """
+
+     name = "mmmu"
+
+     def __init__(self, dataset: str = "MMMU/MMMU", subset: str = "Accounting"):
+         """Initialize MMMU benchmark.
+
+         Args:
+             dataset: HuggingFace dataset identifier.
+             subset: Dataset subset/config name (default: 'Accounting').
+                 MMMU is organized by subject (e.g., 'Accounting', 'Art',
+                 'Biology', 'Chemistry', 'Computer_Science', etc.).
+         """
+         self.dataset = dataset
+         self.subset = subset
+
+     def load_tasks(
+         self,
+         sample_size: int | None = None,
+         task_ids: list[str] | None = None,
+         level: int | None = None,
+         filter_difficulty: list[str] | None = None,
+         filter_category: list[str] | None = None,
+         filter_tags: list[str] | None = None,
+     ) -> list[dict[str, Any]]:
+         """Load tasks from MMMU dataset.
+
+         Args:
+             sample_size: Maximum number of tasks to load (None for all).
+             task_ids: Specific task IDs to load (None for all).
+             level: Unused for MMMU (no difficulty levels).
+             filter_difficulty: Unused for MMMU.
+             filter_category: Filter by subject category (overrides subset).
+             filter_tags: Unused for MMMU.
+
+         Returns:
+             List of MMMU task dictionaries.
+         """
+         _ = level
+         _ = filter_difficulty
+         _ = filter_tags
+
+         subset = self.subset
+         if filter_category and len(filter_category) > 0:
+             subset = filter_category[0]
+
+         # MMMU uses 'validation' split for evaluation (test labels are hidden)
+         dataset = load_dataset(self.dataset, subset, split="validation")
+
+         tasks = list(dataset)
+
+         if task_ids:
+             task_id_set = set(task_ids)
+             tasks = [t for t in tasks if t.get("id", "") in task_id_set]
+
+         if sample_size is not None and len(tasks) > sample_size:
+             tasks = tasks[:sample_size]
+
+         augmented_tasks = []
+         for idx, task in enumerate(tasks):
+             augmented = dict(task)
+             augmented["instance_id"] = f"mmmu_{task.get('id', idx)}"
+             augmented["problem_statement"] = self._generate_problem_statement(augmented)
+             augmented["ground_truth_answer"] = task.get("answer", "")
+             augmented_tasks.append(augmented)
+
+         return augmented_tasks
+
+     def normalize_task(self, task: dict[str, Any]) -> BenchmarkTask:
+         """Convert MMMU task to normalized format.
+
+         Args:
+             task: MMMU task dictionary.
+
+         Returns:
+             Normalized BenchmarkTask.
+
+         Raises:
+             ValueError: If required fields are missing.
+         """
+         instance_id = task.get("instance_id")
+         if not instance_id:
+             msg = f"Task missing required 'instance_id' field: {task.keys()}"
+             raise ValueError(msg)
+
+         question = task.get("question", "")
+         if not question:
+             msg = f"Task missing required 'question' field: {task.keys()}"
+             raise ValueError(msg)
+
+         problem_statement = self._generate_problem_statement(task)
+
+         metadata: dict[str, Any] = {
+             "question": question,
+             "answer": task.get("answer", ""),
+             "subject": task.get("subfield", task.get("subject", "")),
+             "options": task.get("options", []),
+         }
+
+         # Encode images as base64 in metadata
+         image_data = self._extract_images(task)
+         if image_data:
+             metadata["images"] = image_data
+
+         return BenchmarkTask(
+             task_id=instance_id,
+             problem_statement=problem_statement,
+             repo="mmmu/multimodal",
+             commit="HEAD",
+             metadata=metadata,
+         )
+
+     def _extract_images(self, task: dict[str, Any]) -> list[str]:
+         """Extract and encode images from the task as base64 strings.
+
+         MMMU tasks can contain up to 7 images in fields image_1 through image_7.
+
+         Args:
+             task: MMMU task dictionary.
+
+         Returns:
+             List of base64-encoded image strings.
+         """
+         images = []
+         for i in range(1, 8):
+             image_key = f"image_{i}"
+             image = task.get(image_key)
+             if image is not None:
+                 try:
+                     # Handle PIL Image objects from HuggingFace datasets
+                     if hasattr(image, "save"):
+                         buffer = io.BytesIO()
+                         image.save(buffer, format="PNG")
+                         img_bytes = buffer.getvalue()
+                         images.append(base64.b64encode(img_bytes).decode("utf-8"))
+                     elif isinstance(image, bytes):
+                         images.append(base64.b64encode(image).decode("utf-8"))
+                     elif isinstance(image, str):
+                         # Already base64 or a path - store as-is
+                         images.append(image)
+                 except Exception:
+                     # Skip images that cannot be encoded
+                     continue
+         return images
+
+     def _generate_problem_statement(self, task: dict[str, Any]) -> str:
+         """Generate problem statement from task.
+
+         Args:
+             task: MMMU task dictionary.
+
+         Returns:
+             Problem statement for the agent.
+         """
+         question = task.get("question", "No question provided")
+         options = task.get("options", [])
+         subject = task.get("subfield", task.get("subject", ""))
+
+         statement = ""
+         if subject:
+             statement += f"Subject: {subject}\n\n"
+
+         statement += f"Question:\n{question}\n\n"
+
+         # Include image references in the statement
+         image_data = self._extract_images(task)
+         if image_data:
+             statement += f"[This question includes {len(image_data)} image(s). "
+             statement += "The image data is provided as base64-encoded PNG in the metadata.]\n\n"
+
+         if options:
+             statement += "Options:\n"
+             labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
+             for i, option in enumerate(options):
+                 if i < len(labels):
+                     statement += f" ({labels[i]}) {option}\n"
+
+         statement += (
+             "\nSelect the correct answer by providing ONLY the letter "
+             "(A, B, C, or D) of the correct option."
+         )
+
+         return statement
+
+     async def create_environment(
+         self,
+         task: dict[str, Any],
+         docker_manager: DockerEnvironmentManager,
+     ) -> TaskEnvironment:
+         """Create environment for MMMU task.
+
+         MMMU doesn't require complex environment setup - creates minimal
+         environment since evaluation is based on answer comparison.
+
+         Args:
+             task: MMMU task dictionary.
+             docker_manager: Docker environment manager.
+
+         Returns:
+             TaskEnvironment for the task.
+         """
+         instance_id = task.get("instance_id", "mmmu_unknown")
+         temp_task = {
+             "instance_id": instance_id,
+             "repo": "mmmu/multimodal",
+             "base_commit": "HEAD",
+         }
+         return await docker_manager.create_environment(temp_task)
+
+     async def evaluate(
+         self,
+         _env: TaskEnvironment,
+         task: dict[str, Any],
+         solution: str,
+     ) -> dict[str, Any]:
+         """Evaluate a solution for MMMU task.
+
+         Extracts the answer letter from the solution and compares it
+         to the ground truth answer.
+
+         Args:
+             _env: Task environment (unused; evaluation is offline).
+             task: MMMU task dictionary.
+             solution: Solution to evaluate (agent's response with answer letter).
+
+         Returns:
+             Dictionary with evaluation results including 'resolved' boolean.
+         """
+         ground_truth = task.get("answer", task.get("ground_truth_answer", ""))
+         if not ground_truth:
+             return {
+                 "resolved": False,
+                 "error": "No ground truth answer available for evaluation",
+             }
+
+         # Normalize ground truth to uppercase letter
+         ground_truth = ground_truth.strip().upper()
+
+         # Extract answer from agent's solution
+         agent_answer = self._extract_answer(solution)
+         if agent_answer is None:
+             return {
+                 "resolved": False,
+                 "error": "Could not extract answer letter from agent's solution",
+                 "agent_solution": solution[:500],
+             }
+
+         resolved = agent_answer == ground_truth
+
+         return {
+             "resolved": resolved,
+             "agent_answer": agent_answer,
+             "correct_answer": ground_truth,
+         }
+
+     def _extract_answer(self, text: str) -> str | None:
+         """Extract the answer letter from the agent's response.
+
+         Handles various formats:
+         - Plain letter: "A", "B", "C", "D"
+         - Parenthesized: "(A)", "(B)"
+         - Sentence: "The answer is A", "The correct option is B"
+         - Boxed: "\\boxed{A}"
+
+         Args:
+             text: Text containing the answer.
+
+         Returns:
+             Uppercase answer letter, or None if no answer found.
+         """
+         if not text:
+             return None
+
+         text_upper = text.upper().strip()
+
+         # Pattern 1: LaTeX boxed answer
+         match = re.search(r"\\boxed\{([A-D])\}", text_upper)
+         if match:
+             return match.group(1)
+
+         # Pattern 2: "The answer is X" or "The correct answer is X"
+         match = re.search(
+             r"(?:the\s+)?(?:correct\s+)?(?:final\s+)?answer\s*(?:is|:)\s*\(?([A-D])\)?",
+             text_upper,
+         )
+         if match:
+             return match.group(1)
+
+         # Pattern 3: "Option X" or "Choice X"
+         match = re.search(r"(?:option|choice)\s*(?:is\s*)?:?\s*\(?([A-D])\)?", text_upper)
+         if match:
+             return match.group(1)
+
+         # Pattern 4: Standalone letter (last single A-D found as a word boundary)
+         matches = re.findall(r"\b([A-D])\b", text_upper)
+         if matches:
+             return matches[-1]
+
+         return None
+
+     def get_prebuilt_image(self, _task: dict[str, Any]) -> str | None:
+         """Get pre-built Docker image name for MMMU task.
+
+         MMMU doesn't use pre-built images.
+
+         Args:
+             _task: MMMU task dictionary (unused).
+
+         Returns:
+             None (no pre-built images available).
+         """
+         return None
+
+     def get_prompt_template(self) -> str:
+         """Get MMMU prompt template.
+
+         Returns:
+             Prompt template for multimodal understanding questions.
+         """
+         return (
+             "Answer the following multi-modal question that may include images, "
+             "diagrams, charts, or figures:\n\n"
+             "{problem_statement}\n\n"
+             "IMPORTANT INSTRUCTIONS:\n"
+             "- Carefully examine any images, diagrams, or figures provided\n"
+             "- Apply your knowledge of the relevant subject area\n"
+             "- Consider each answer option carefully\n"
+             "- Show your reasoning step-by-step\n"
+             "- Respond with ONLY the letter of the correct answer (A, B, C, or D)"
+         )
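
The answer-matching step is the heart of this benchmark: evaluate() normalizes the ground-truth letter and compares it with whatever _extract_answer() pulls out of the agent's reply. The standalone sketch below illustrates the same kind of letter-extraction heuristics (boxed answers, "the answer is X" phrasing, and a last-standalone-letter fallback). It is an illustration only, not the module's code; the helper name extract_choice is hypothetical, and it matches patterns case-insensitively rather than against a pre-uppercased string.

    import re

    def extract_choice(text: str) -> str | None:
        """Illustrative sketch: pull an A-D choice out of free-form model output."""
        if not text:
            return None
        # LaTeX-boxed answers such as \boxed{B}
        match = re.search(r"\\boxed\{([A-D])\}", text, re.IGNORECASE)
        if match:
            return match.group(1).upper()
        # Phrases such as "the answer is C" or "final answer: D"
        match = re.search(r"answer\s*(?:is|:)\s*\(?([A-D])\)?", text, re.IGNORECASE)
        if match:
            return match.group(1).upper()
        # Fallback: last standalone A-D letter anywhere in the text
        matches = re.findall(r"\b([A-D])\b", text.upper())
        return matches[-1] if matches else None

    if __name__ == "__main__":
        for reply in ("The answer is (B).", r"\boxed{C}", "Between these, B looks correct."):
            print(reply, "->", extract_choice(reply))
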
mcpbr/config.py CHANGED
@@ -42,6 +42,10 @@ VALID_BENCHMARKS = (
      "webarena",
      "mlagentbench",
      "intercode",
+     "custom",
+     "mmmu",
+     "longbench",
+     "adversarial",
  )
  VALID_INFRASTRUCTURE_MODES = ("local", "azure")
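
The config.py hunk above registers the four new benchmarks. As a minimal sketch (not mcpbr's actual validation logic), this is the effect on any membership check against VALID_BENCHMARKS; the check_benchmark helper and the truncated tuple below are illustrative only.

    # Hypothetical sketch: only the entries visible in the hunk above are listed here.
    VALID_BENCHMARKS = (
        "webarena",
        "mlagentbench",
        "intercode",
        "custom",
        "mmmu",
        "longbench",
        "adversarial",
    )

    def check_benchmark(name: str) -> str:
        """Reject benchmark names not present in the tuple (illustrative only)."""
        if name not in VALID_BENCHMARKS:
            raise ValueError(f"unknown benchmark: {name!r}")
        return name

    print(check_benchmark("mmmu"))  # accepted, since 0.5.0 adds "mmmu" to the tuple
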