mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,370 @@
1
+ """Docker image pre-warming for mcpbr benchmark evaluations.
2
+
3
+ Pre-pulls Docker images needed for a benchmark run before evaluation begins,
4
+ so that image pull time does not inflate task-level timing measurements.
5
+ Supports parallel pulling, progress reporting, and local cache detection.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ import time
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Callable
13
+
14
+ import docker.errors
15
+ from rich.console import Console
16
+ from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
17
+ from rich.table import Table
18
+
19
+ import docker
20
+
21
+ from .docker_env import SWEBENCH_IMAGE_REGISTRY, get_swebench_image_name
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
# Default base images used by non-SWE-bench benchmarks.
# Every supported benchmark currently runs inside the same slim Python image,
# so the table is generated from the benchmark-name tuple below.  The
# comprehension builds a fresh one-element list per key, so entries stay
# independently mutable, exactly as with the literal form.
DEFAULT_BASE_IMAGES: dict[str, list[str]] = {
    benchmark: ["python:3.11-slim"]
    for benchmark in (
        "humaneval",
        "mbpp",
        "apps",
        "codecontests",
        "bigcodebench",
        "leetcode",
        "codereval",
        "gsm8k",
        "math",
        "truthfulqa",
        "bigbench-hard",
        "hellaswag",
        "arc",
        "repoqa",
        "toolbench",
        "aider-polyglot",
        "terminalbench",
        "gaia",
        "agentbench",
        "webarena",
        "mlagentbench",
        "intercode",
        "mmmu",
        "longbench",
        "adversarial",
        "mcptoolbench",
        "custom",
        "cybergym",
    )
}
56
+
57
+
58
@dataclass
class PrewarmResult:
    """Result of a Docker image pre-warming operation.

    Attributes:
        total_images: Total number of images that were requested.
        already_cached: Number of images already available locally.
        newly_pulled: Number of images successfully pulled from registry.
        failed: List of image names that failed to pull.
        pull_time_seconds: Wall-clock time for the entire pre-warm operation.
    """

    # NOTE(review): in practice total_images == already_cached +
    # newly_pulled + len(failed) for every return path of prewarm_images,
    # but nothing enforces that invariant here — confirm before relying on it.
    total_images: int = 0
    already_cached: int = 0
    newly_pulled: int = 0
    # default_factory avoids a shared mutable default across instances
    failed: list[str] = field(default_factory=list)
    pull_time_seconds: float = 0.0
75
+
76
+
77
def get_required_images(benchmark_name: str, tasks: list[dict[str, Any]]) -> list[str]:
    """Determine which Docker images are needed for a benchmark run.

    SWE-bench variants use one per-instance image each (resolved through
    :func:`get_swebench_image_name`); every other benchmark uses its entry
    in ``DEFAULT_BASE_IMAGES``, falling back to ``python:3.11-slim``.

    Args:
        benchmark_name: Name of the benchmark (e.g., ``"swe-bench-lite"``).
        tasks: List of task dictionaries loaded from the benchmark.

    Returns:
        Deduplicated list of Docker image names required for the run.
    """
    if not benchmark_name.startswith("swe-bench"):
        # Non-SWE-bench runs share a common base image per benchmark.
        return list(DEFAULT_BASE_IMAGES.get(benchmark_name, ["python:3.11-slim"]))

    # Deduplicate instance ids while preserving first-seen order; dict keys
    # are ordered and unique, so no explicit "seen" set is needed.  Tasks
    # with a missing or empty instance_id are skipped.
    unique_ids = dict.fromkeys(
        instance_id
        for task in tasks
        if (instance_id := task.get("instance_id", ""))
    )
    return [get_swebench_image_name(instance_id) for instance_id in unique_ids]
108
+
109
+
110
def check_cached_images(images: list[str]) -> dict[str, bool]:
    """Check which Docker images are already available in the local cache.

    Inspects the local image store via ``docker.from_env()``.  An image is
    reported as ``True`` only when the daemon can resolve it locally; lookup
    failures (missing image, API error) and an unreachable daemon all map
    to ``False``.

    Args:
        images: List of Docker image names to check.

    Returns:
        Dictionary mapping each image name to whether it is cached locally.
    """
    try:
        client = docker.from_env()
    except docker.errors.DockerException:
        # No daemon available: report every image as uncached so callers
        # fall back to pulling at task time.
        logger.warning("Could not connect to Docker daemon for cache check")
        return dict.fromkeys(images, False)

    def _is_local(name: str) -> bool:
        # Both "image not found" and transient API errors count as a miss.
        try:
            client.images.get(name)
        except (docker.errors.ImageNotFound, docker.errors.APIError):
            return False
        return True

    return {name: _is_local(name) for name in images}
141
+
142
+
143
async def _pull_single_image(
    client: docker.DockerClient,
    image: str,
    semaphore: asyncio.Semaphore,
    on_progress: Callable[[str, str], None] | None = None,
) -> tuple[str, bool]:
    """Pull a single Docker image, respecting the concurrency semaphore.

    The blocking docker-py pull runs in a worker thread via
    ``run_in_executor`` so the event loop stays responsive.

    Args:
        client: Docker client instance.
        image: Full image name to pull.
        semaphore: Asyncio semaphore to limit parallel pulls.
        on_progress: Optional callback ``(image, status)`` for progress
            updates; status is ``"pulling"``, then ``"done"`` or ``"failed"``.

    Returns:
        Tuple of ``(image_name, success)``.
    """

    async with semaphore:
        if on_progress:
            on_progress(image, "pulling")

        def _do_pull() -> bool:
            try:
                # SWE-bench instance images are published for linux/amd64;
                # other images use the daemon's default platform.
                platform = "linux/amd64" if SWEBENCH_IMAGE_REGISTRY in image else None
                client.images.pull(image, platform=platform)
                return True
            except docker.errors.ImageNotFound:
                logger.warning("Image not found in registry: %s", image)
                return False
            except docker.errors.APIError as exc:
                logger.warning("Failed to pull image %s: %s", image, exc)
                return False

        # FIX: asyncio.get_event_loop() is deprecated inside a running
        # coroutine (DeprecationWarning since Python 3.10); the correct call
        # here is get_running_loop(), which is guaranteed to return the loop
        # executing this coroutine.
        loop = asyncio.get_running_loop()
        success = await loop.run_in_executor(None, _do_pull)

        if on_progress:
            on_progress(image, "done" if success else "failed")

        return image, success
185
+
186
+
187
async def prewarm_images(
    benchmark_name: str,
    tasks: list[dict[str, Any]],
    max_parallel: int = 3,
    on_progress: Callable[[str, str], None] | None = None,
) -> PrewarmResult:
    """Pre-pull all Docker images needed for a benchmark run.

    Checks the local cache first, then pulls missing images in parallel
    (limited by ``max_parallel``). Returns a summary of the operation.

    Args:
        benchmark_name: Name of the benchmark (e.g., ``"swe-bench-verified"``).
        tasks: List of task dictionaries from the benchmark loader.
        max_parallel: Maximum number of concurrent image pulls. Defaults to 3.
        on_progress: Optional callback ``(image_name, status_string)`` invoked
            when an image starts pulling or completes.

    Returns:
        PrewarmResult summarising cached, pulled, and failed images.
    """
    start_time = time.monotonic()

    images = get_required_images(benchmark_name, tasks)
    total = len(images)

    if total == 0:
        return PrewarmResult(pull_time_seconds=time.monotonic() - start_time)

    # Check local cache
    cache_status = check_cached_images(images)
    already_cached = sum(1 for cached in cache_status.values() if cached)
    to_pull = [img for img, cached in cache_status.items() if not cached]

    if not to_pull:
        return PrewarmResult(
            total_images=total,
            already_cached=already_cached,
            newly_pulled=0,
            failed=[],
            pull_time_seconds=time.monotonic() - start_time,
        )

    # Pull missing images in parallel
    try:
        client = docker.from_env()
    except docker.errors.DockerException as exc:
        # No daemon: every uncached image counts as failed.
        logger.error("Cannot connect to Docker daemon: %s", exc)
        return PrewarmResult(
            total_images=total,
            already_cached=already_cached,
            newly_pulled=0,
            failed=to_pull,
            pull_time_seconds=time.monotonic() - start_time,
        )

    semaphore = asyncio.Semaphore(max_parallel)

    pull_tasks = [
        _pull_single_image(client, image, semaphore, on_progress=on_progress) for image in to_pull
    ]

    results = await asyncio.gather(*pull_tasks, return_exceptions=True)

    newly_pulled = 0
    failed: list[str] = []
    # gather() preserves the order of its awaitables, so each result can be
    # paired with the image it was pulled for.  FIX: on an unexpected
    # exception, record the image NAME (not str(exception)) so that
    # PrewarmResult.failed remains a list of image names as documented.
    for image_name, result in zip(to_pull, results):
        if isinstance(result, Exception):
            logger.error("Unexpected error pulling image %s: %s", image_name, result)
            failed.append(image_name)
        else:
            _, success = result
            if success:
                newly_pulled += 1
            else:
                failed.append(image_name)

    return PrewarmResult(
        total_images=total,
        already_cached=already_cached,
        newly_pulled=newly_pulled,
        failed=failed,
        pull_time_seconds=time.monotonic() - start_time,
    )
271
+
272
+
273
def format_prewarm_report(result: PrewarmResult) -> None:
    """Print a rich-formatted summary of the pre-warm operation.

    Displays a table with counts of cached, pulled, and failed images,
    plus total elapsed time, followed by a status message.

    Args:
        result: PrewarmResult from a completed pre-warm operation.
    """
    console = Console()

    table = Table(title="Docker Image Pre-warm Summary", show_header=True, header_style="bold")
    table.add_column("Metric", style="cyan", no_wrap=True)
    table.add_column("Value", justify="right")

    table.add_row("Total images", str(result.total_images))
    table.add_row("Already cached", str(result.already_cached))
    table.add_row("Newly pulled", str(result.newly_pulled))
    table.add_row("Failed", str(len(result.failed)))
    table.add_row("Pull time", f"{result.pull_time_seconds:.1f}s")

    console.print()
    console.print(table)

    # FIX: the original had two consecutive, identical `if result.failed:`
    # checks; merged into one block (output order is unchanged).
    if result.failed:
        console.print()
        console.print("[red bold]Failed to pull the following images:[/red bold]")
        for image in result.failed:
            console.print(f"[red] - {image}[/red]")
        console.print()
        console.print(
            "[yellow]Some images could not be pre-warmed. "
            "Evaluation will attempt to pull them at task time.[/yellow]"
        )
    elif result.newly_pulled > 0:
        console.print()
        console.print("[green bold]All images pre-warmed successfully.[/green bold]")
    elif result.total_images > 0:
        console.print()
        console.print("[green]All images already cached locally.[/green]")
    # NOTE(review): the diff rendering loses indentation, so it is ambiguous
    # whether this trailing blank line was branch-local; emitted here as an
    # unconditional final spacer.
    console.print()
317
+
318
async def prewarm_images_with_progress(
    benchmark_name: str,
    tasks: list[dict[str, Any]],
    max_parallel: int = 3,
) -> PrewarmResult:
    """Pre-pull images with a rich progress bar displayed in the terminal.

    Convenience wrapper around :func:`prewarm_images` that creates and
    manages a ``rich.progress.Progress`` bar automatically.  When every
    required image is already cached, no bar is shown at all.

    Args:
        benchmark_name: Name of the benchmark.
        tasks: List of task dictionaries from the benchmark loader.
        max_parallel: Maximum number of concurrent image pulls. Defaults to 3.

    Returns:
        PrewarmResult summarising cached, pulled, and failed images.
    """
    required = get_required_images(benchmark_name, tasks)
    cached = check_cached_images(required)
    missing = [name for name, present in cached.items() if not present]

    if not missing:
        # Everything is already local — delegate directly, skipping the bar.
        return await prewarm_images(benchmark_name, tasks, max_parallel)

    console = Console()
    bar = Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
        TimeElapsedColumn(),
        console=console,
    )

    bar_task = bar.add_task("Pre-warming Docker images", total=len(missing))
    finished = 0

    def _report(image: str, status: str) -> None:
        nonlocal finished
        if status not in ("done", "failed"):
            # An image just started pulling: show a trimmed name that fits.
            label = image.split("/")[-1][:40]
            bar.update(bar_task, description=f"Pulling {label}")
            return
        finished += 1
        bar.update(bar_task, completed=finished)

    with bar:
        return await prewarm_images(
            benchmark_name, tasks, max_parallel, on_progress=_report
        )