mcpbr 0.4.16__py3-none-any.whl → 0.6.0__py3-none-any.whl

Files changed (38)
  1. mcpbr/__init__.py +20 -1
  2. mcpbr/config.py +37 -1
  3. mcpbr/config_migration.py +470 -0
  4. mcpbr/config_wizard.py +647 -0
  5. mcpbr/dashboard.py +619 -0
  6. mcpbr/dataset_streaming.py +491 -0
  7. mcpbr/docker_cache.py +539 -0
  8. mcpbr/docker_env.py +2 -1
  9. mcpbr/docker_prewarm.py +370 -0
  10. mcpbr/dry_run.py +533 -0
  11. mcpbr/formatting.py +444 -0
  12. mcpbr/gpu_support.py +2 -1
  13. mcpbr/graceful_degradation.py +277 -0
  14. mcpbr/harness.py +38 -4
  15. mcpbr/languages.py +228 -0
  16. mcpbr/logging_config.py +207 -0
  17. mcpbr/models.py +66 -0
  18. mcpbr/preflight.py +2 -1
  19. mcpbr/pricing.py +72 -0
  20. mcpbr/providers.py +316 -3
  21. mcpbr/resource_limits.py +487 -0
  22. mcpbr/result_streaming.py +519 -0
  23. mcpbr/sdk.py +264 -0
  24. mcpbr/smoke_test.py +2 -1
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/METADATA +8 -1
  28. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/RECORD +38 -22
  29. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.16.data → mcpbr-0.6.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.16.dist-info → mcpbr-0.6.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/dataset_streaming.py (added)
@@ -0,0 +1,491 @@
+ """Memory-efficient large dataset handling for benchmark evaluations.
+
+ This module provides streaming and chunked loading of large HuggingFace datasets,
+ enabling benchmark runs on datasets that would otherwise exceed available memory.
+ It includes memory monitoring, automatic chunking under memory pressure, and
+ iterator-based APIs compatible with existing benchmark ``load_tasks`` patterns.
+
+ Key components:
+ - ``MemoryMonitor``: Tracks RSS and available memory, detects memory pressure.
+ - ``ChunkedLoader``: Iterates over a HuggingFace dataset in configurable chunks.
+ - ``StreamingDataset``: High-level API that yields tasks lazily with memory awareness.
+ - ``DatasetStats``: Summary statistics for a streaming load session.
+ """
+
+ import logging
+ import os
+ import time
+ from collections.abc import Iterator
+ from dataclasses import dataclass
+ from typing import Any
+
+ from datasets import load_dataset, load_dataset_builder
+
+ logger = logging.getLogger(__name__)
+
+
+ # ---------------------------------------------------------------------------
+ # DatasetStats
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass
+ class DatasetStats:
+     """Summary statistics for a streaming dataset load session.
+
+     Attributes:
+         total_loaded: Number of individual task items yielded so far.
+         peak_memory_mb: Peak RSS observed during loading (in megabytes).
+         chunks_processed: Number of chunks fetched from the underlying loader.
+         load_time_seconds: Wall-clock seconds elapsed during loading.
+     """
+
+     total_loaded: int = 0
+     peak_memory_mb: float = 0.0
+     chunks_processed: int = 0
+     load_time_seconds: float = 0.0
+
+
+ # ---------------------------------------------------------------------------
+ # MemoryMonitor
+ # ---------------------------------------------------------------------------
+
+
+ class MemoryMonitor:
+     """Lightweight monitor for process and system memory usage.
+
+     Uses ``psutil`` when available, falling back to reading ``/proc/self/status``
+     and ``/proc/meminfo`` on Linux. On platforms where neither is available the
+     methods return ``0.0`` and memory-pressure detection is disabled.
+     """
+
+     def __init__(self) -> None:
+         """Initialize the memory monitor and detect available backends."""
+         self._has_psutil = False
+         try:
+             import psutil  # noqa: F401
+
+             self._has_psutil = True
+         except ImportError:
+             pass
+
+     # -- public API ---------------------------------------------------------
+
+     def get_memory_usage_mb(self) -> float:
+         """Return the current Resident Set Size (RSS) in megabytes.
+
+         Returns:
+             RSS in MB, or ``0.0`` if measurement is unavailable.
+         """
+         if self._has_psutil:
+             return self._rss_via_psutil()
+         return self._rss_via_proc()
+
+     def get_available_memory_mb(self) -> float:
+         """Return available system memory in megabytes.
+
+         Returns:
+             Available memory in MB, or ``0.0`` if measurement is unavailable.
+         """
+         if self._has_psutil:
+             return self._available_via_psutil()
+         return self._available_via_proc()
+
+     def is_memory_pressure(self, threshold_pct: float = 80.0) -> bool:
+         """Check whether system memory usage exceeds a threshold.
+
+         Args:
+             threshold_pct: Percentage (0--100) of total memory above which
+                 the system is considered under pressure.
+
+         Returns:
+             ``True`` if memory usage exceeds *threshold_pct*, ``False``
+             otherwise or if measurement is unavailable.
+         """
+         if self._has_psutil:
+             return self._pressure_via_psutil(threshold_pct)
+         return self._pressure_via_proc(threshold_pct)
+
+     # -- psutil backend -----------------------------------------------------
+
+     def _rss_via_psutil(self) -> float:
+         """Get RSS using psutil."""
+         try:
+             import psutil
+
+             process = psutil.Process(os.getpid())
+             return process.memory_info().rss / (1024 * 1024)
+         except Exception:
+             return 0.0
+
+     def _available_via_psutil(self) -> float:
+         """Get available system memory using psutil."""
+         try:
+             import psutil
+
+             return psutil.virtual_memory().available / (1024 * 1024)
+         except Exception:
+             return 0.0
+
+     def _pressure_via_psutil(self, threshold_pct: float) -> bool:
+         """Check memory pressure using psutil."""
+         try:
+             import psutil
+
+             return psutil.virtual_memory().percent >= threshold_pct
+         except Exception:
+             return False
+
+     # -- /proc fallback -----------------------------------------------------
+
+     @staticmethod
+     def _rss_via_proc() -> float:
+         """Get RSS by parsing ``/proc/self/status``."""
+         try:
+             with open("/proc/self/status") as fh:
+                 for line in fh:
+                     if line.startswith("VmRSS:"):
+                         # Value is in kB
+                         return int(line.split()[1]) / 1024
+         except (OSError, ValueError, IndexError):
+             pass
+         return 0.0
+
+     @staticmethod
+     def _available_via_proc() -> float:
+         """Get available memory by parsing ``/proc/meminfo``."""
+         try:
+             with open("/proc/meminfo") as fh:
+                 for line in fh:
+                     if line.startswith("MemAvailable:"):
+                         return int(line.split()[1]) / 1024
+         except (OSError, ValueError, IndexError):
+             pass
+         return 0.0
+
+     @staticmethod
+     def _pressure_via_proc(threshold_pct: float) -> bool:
+         """Check memory pressure using ``/proc/meminfo``."""
+         try:
+             mem_total = 0.0
+             mem_available = 0.0
+             with open("/proc/meminfo") as fh:
+                 for line in fh:
+                     if line.startswith("MemTotal:"):
+                         mem_total = int(line.split()[1]) / 1024
+                     elif line.startswith("MemAvailable:"):
+                         mem_available = int(line.split()[1]) / 1024
+             if mem_total > 0:
+                 used_pct = ((mem_total - mem_available) / mem_total) * 100
+                 return used_pct >= threshold_pct
+         except (OSError, ValueError, IndexError):
+             pass
+         return False
+
+
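A minimal usage sketch for ``MemoryMonitor`` (illustrative, not from the package; the 90% threshold is an arbitrary example):

    monitor = MemoryMonitor()
    print(f"RSS: {monitor.get_memory_usage_mb():.1f} MB")
    print(f"Available: {monitor.get_available_memory_mb():.1f} MB")
    # is_memory_pressure() compares system-wide usage against the threshold
    if monitor.is_memory_pressure(threshold_pct=90.0):
        print("System memory usage is above 90%")
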
+ # ---------------------------------------------------------------------------
+ # ChunkedLoader
+ # ---------------------------------------------------------------------------
+
+
+ class ChunkedLoader:
+     """Iterate over a HuggingFace dataset in fixed-size chunks.
+
+     Each iteration yields a ``list[dict]`` containing up to *chunk_size*
+     records. When the HuggingFace ``datasets`` library supports it, the
+     dataset is loaded with ``streaming=True`` to avoid downloading the
+     entire dataset at once.
+
+     Args:
+         dataset_name: HuggingFace dataset identifier (e.g. ``"openai_humaneval"``).
+         split: Dataset split to load (default ``"test"``).
+         chunk_size: Maximum number of records per chunk.
+         subset: Optional dataset subset / configuration name.
+     """
+
+     def __init__(
+         self,
+         dataset_name: str,
+         split: str = "test",
+         chunk_size: int = 1000,
+         subset: str | None = None,
+     ) -> None:
+         self.dataset_name = dataset_name
+         self.split = split
+         self.chunk_size = chunk_size
+         self.subset = subset
+         self._total_items: int | None = None
+
+     # -- public API ---------------------------------------------------------
+
+     def __len__(self) -> int:
+         """Return the total number of items in the dataset.
+
+         This may trigger a metadata fetch the first time it is called.
+
+         Returns:
+             Total number of items, or ``0`` if the count cannot be determined.
+         """
+         if self._total_items is None:
+             self._total_items = self._fetch_total_items()
+         return self._total_items
+
+     def __iter__(self) -> Iterator[list[dict[str, Any]]]:
+         """Yield successive chunks of dataset records.
+
+         Yields:
+             Lists of up to *chunk_size* task dictionaries.
+         """
+         dataset_iter = self._load_dataset_streaming()
+
+         chunk: list[dict[str, Any]] = []
+         for item in dataset_iter:
+             chunk.append(dict(item))
+             if len(chunk) >= self.chunk_size:
+                 yield chunk
+                 chunk = []
+
+         # Yield any remaining items
+         if chunk:
+             yield chunk
+
+     # -- internal helpers ---------------------------------------------------
+
+     def _load_dataset_streaming(self) -> Any:
+         """Load the dataset, preferring streaming mode.
+
+         Returns:
+             An iterable of dataset records (either a streaming
+             ``IterableDataset`` or a regular ``Dataset``).
+         """
+         load_kwargs: dict[str, Any] = {}
+         if self.subset is not None:
+             load_kwargs["name"] = self.subset
+
+         # Try streaming first for memory efficiency
+         try:
+             ds = load_dataset(
+                 self.dataset_name,
+                 split=self.split,
+                 streaming=True,
+                 **load_kwargs,
+             )
+             logger.info(
+                 "Loaded dataset %s (split=%s) in streaming mode",
+                 self.dataset_name,
+                 self.split,
+             )
+             return ds
+         except Exception:
+             logger.debug(
+                 "Streaming not supported for %s; falling back to full load",
+                 self.dataset_name,
+             )
+
+         # Fallback: load the full dataset into memory
+         ds = load_dataset(
+             self.dataset_name,
+             split=self.split,
+             **load_kwargs,
+         )
+         logger.info(
+             "Loaded dataset %s (split=%s) fully into memory (%d items)",
+             self.dataset_name,
+             self.split,
+             len(ds),
+         )
+         return ds
+
+     def _fetch_total_items(self) -> int:
+         """Fetch the total item count from dataset metadata.
+
+         Returns:
+             The number of items, or ``0`` if it cannot be determined.
+         """
+         try:
+             load_kwargs: dict[str, Any] = {}
+             if self.subset is not None:
+                 load_kwargs["name"] = self.subset
+
+             builder = load_dataset_builder(self.dataset_name, **load_kwargs)
+             info = builder.info
+             if info.splits and self.split in info.splits:
+                 return info.splits[self.split].num_examples
+         except Exception:
+             logger.debug(
+                 "Could not determine total items for %s/%s",
+                 self.dataset_name,
+                 self.split,
+             )
+         return 0
+
+
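A minimal sketch of driving ``ChunkedLoader`` directly (illustrative, not from the package; ``handle_chunk`` is a hypothetical consumer):

    loader = ChunkedLoader("openai_humaneval", split="test", chunk_size=200)
    print(f"total items: {len(loader)}")  # 0 if metadata is unavailable
    for chunk in loader:
        # Each chunk is a list of up to 200 plain dicts
        handle_chunk(chunk)  # hypothetical downstream processing
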
+ # ---------------------------------------------------------------------------
+ # StreamingDataset
+ # ---------------------------------------------------------------------------
+
+
+ class StreamingDataset:
+     """High-level memory-aware dataset loader.
+
+     Wraps :class:`ChunkedLoader` with automatic memory monitoring and
+     adaptive chunk sizing. Provides an iterator-based ``load_tasks`` method
+     compatible with the existing :class:`~mcpbr.benchmarks.base.Benchmark`
+     protocol (callers can materialise with ``list(...)`` when needed).
+
+     Args:
+         dataset_name: HuggingFace dataset identifier.
+         split: Dataset split (default ``"test"``).
+         max_memory_mb: Optional soft memory cap. When the process RSS exceeds
+             this value the chunk size is halved to reduce pressure.
+     """
+
+     # Default and minimum chunk sizes
+     _DEFAULT_CHUNK_SIZE = 1000
+     _MIN_CHUNK_SIZE = 50
+
+     def __init__(
+         self,
+         dataset_name: str,
+         split: str = "test",
+         max_memory_mb: float | None = None,
+     ) -> None:
+         self.dataset_name = dataset_name
+         self.split = split
+         self.max_memory_mb = max_memory_mb
+
+         self._monitor = MemoryMonitor()
+         self._stats = DatasetStats()
+         self._chunk_size = self._DEFAULT_CHUNK_SIZE
+         self._start_time: float | None = None
+
+     # -- public API ---------------------------------------------------------
+
+     def load_tasks(
+         self,
+         sample_size: int | None = None,
+         task_ids: list[str] | None = None,
+     ) -> Iterator[dict[str, Any]]:
+         """Lazily yield task dictionaries from the dataset.
+
+         Args:
+             sample_size: Maximum number of tasks to yield (``None`` for all).
+             task_ids: If provided, only yield tasks whose ``instance_id`` or
+                 ``task_id`` is in this set.
+
+         Yields:
+             Individual task dictionaries.
+         """
+         self._start_time = time.monotonic()
+         self._stats = DatasetStats()
+
+         if sample_size is not None and sample_size <= 0:
+             self._stats.load_time_seconds = time.monotonic() - self._start_time
+             return
+
+         task_id_set: set[str] | None = set(task_ids) if task_ids else None
+
+         loader = ChunkedLoader(
+             dataset_name=self.dataset_name,
+             split=self.split,
+             chunk_size=self._chunk_size,
+         )
+
+         yielded = 0
+
+         for chunk in loader:
+             self._stats.chunks_processed += 1
+
+             # Adapt chunk size under memory pressure
+             self._maybe_adapt_chunk_size(loader)
+
+             for item in chunk:
+                 # Apply task_id filter
+                 if task_id_set is not None:
+                     item_id = item.get("instance_id") or item.get("task_id")
+                     if item_id not in task_id_set:
+                         continue
+
+                 self._stats.total_loaded += 1
+                 yielded += 1
+
+                 # Track peak memory
+                 current_mb = self._monitor.get_memory_usage_mb()
+                 if current_mb > self._stats.peak_memory_mb:
+                     self._stats.peak_memory_mb = current_mb
+
+                 yield item
+
+                 if sample_size is not None and yielded >= sample_size:
+                     self._stats.load_time_seconds = time.monotonic() - self._start_time
+                     return
+
+         self._stats.load_time_seconds = time.monotonic() - self._start_time
+
+     def get_stats(self) -> DatasetStats:
+         """Return statistics collected during the most recent ``load_tasks`` call.
+
+         Returns:
+             A :class:`DatasetStats` instance with current metrics.
+         """
+         # Update load_time if still in progress
+         if self._start_time is not None and self._stats.load_time_seconds == 0.0:
+             self._stats.load_time_seconds = time.monotonic() - self._start_time
+         return self._stats
+
+     # -- internal helpers ---------------------------------------------------
+
+     def _maybe_adapt_chunk_size(self, loader: ChunkedLoader) -> None:
+         """Reduce the chunk size if memory pressure is detected.
+
+         Args:
+             loader: The active :class:`ChunkedLoader` whose chunk size will be
+                 updated in place.
+         """
+         under_pressure = False
+
+         if self.max_memory_mb is not None:
+             current_mb = self._monitor.get_memory_usage_mb()
+             if current_mb > self.max_memory_mb:
+                 under_pressure = True
+                 logger.warning(
+                     "RSS %.1f MB exceeds max_memory_mb %.1f MB; reducing chunk size",
+                     current_mb,
+                     self.max_memory_mb,
+                 )
+
+         if not under_pressure and self._monitor.is_memory_pressure():
+             under_pressure = True
+             logger.warning("System memory pressure detected; reducing chunk size")
+
+         if under_pressure and loader.chunk_size > self._MIN_CHUNK_SIZE:
+             new_size = max(loader.chunk_size // 2, self._MIN_CHUNK_SIZE)
+             logger.info("Chunk size reduced from %d to %d", loader.chunk_size, new_size)
+             loader.chunk_size = new_size
+
+
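A hedged usage sketch for ``StreamingDataset`` (``run_task`` is a hypothetical consumer, and the memory cap is an arbitrary example). Under sustained pressure the active loader's chunk size halves on each check, 1000 → 500 → 250 → 125 → 62, then clamps to the 50-item floor via ``max(size // 2, _MIN_CHUNK_SIZE)``:

    ds = StreamingDataset("openai_humaneval", split="test", max_memory_mb=2048.0)
    for task in ds.load_tasks(sample_size=100):
        run_task(task)  # hypothetical benchmark runner

    stats = ds.get_stats()
    print(f"loaded={stats.total_loaded} chunks={stats.chunks_processed} "
          f"peak={stats.peak_memory_mb:.1f} MB in {stats.load_time_seconds:.2f}s")
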
+ # ---------------------------------------------------------------------------
+ # Convenience helpers
+ # ---------------------------------------------------------------------------
+
+
+ def get_memory_usage_mb() -> float:
+     """Return current process RSS in megabytes.
+
+     Convenience wrapper around :meth:`MemoryMonitor.get_memory_usage_mb`.
+
+     Returns:
+         RSS in MB, or ``0.0`` if measurement is unavailable.
+     """
+     return MemoryMonitor().get_memory_usage_mb()
+
+
+ def get_available_memory_mb() -> float:
+     """Return available system memory in megabytes.
+
+     Convenience wrapper around :meth:`MemoryMonitor.get_available_memory_mb`.
+
+     Returns:
+         Available memory in MB, or ``0.0`` if measurement is unavailable.
+     """
+     return MemoryMonitor().get_available_memory_mb()
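
The module-level helpers construct a fresh ``MemoryMonitor`` per call, so callers need no shared state. A small sketch of a pre-run check (the 1024 MB threshold is an arbitrary example):

    from mcpbr.dataset_streaming import get_available_memory_mb, get_memory_usage_mb

    if get_available_memory_mb() < 1024:
        print("Less than 1 GiB of system memory free; consider setting max_memory_mb")
    print(f"current RSS: {get_memory_usage_mb():.1f} MB")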