mcpbr 0.4.15__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. mcpbr/benchmarks/__init__.py +12 -0
  2. mcpbr/benchmarks/adversarial.py +341 -0
  3. mcpbr/benchmarks/custom.py +607 -0
  4. mcpbr/benchmarks/longbench.py +623 -0
  5. mcpbr/benchmarks/mmmu.py +353 -0
  6. mcpbr/config.py +4 -0
  7. mcpbr/config_migration.py +470 -0
  8. mcpbr/config_wizard.py +647 -0
  9. mcpbr/custom_metrics.py +405 -0
  10. mcpbr/dashboard.py +619 -0
  11. mcpbr/dataset_streaming.py +491 -0
  12. mcpbr/dataset_versioning.py +222 -0
  13. mcpbr/docker_cache.py +539 -0
  14. mcpbr/docker_prewarm.py +369 -0
  15. mcpbr/dry_run.py +532 -0
  16. mcpbr/failure_analysis.py +558 -0
  17. mcpbr/few_shot.py +367 -0
  18. mcpbr/formatting.py +444 -0
  19. mcpbr/gpu_support.py +157 -0
  20. mcpbr/harness.py +38 -4
  21. mcpbr/latency_metrics.py +317 -0
  22. mcpbr/resource_limits.py +487 -0
  23. mcpbr/result_streaming.py +519 -0
  24. mcpbr/sampling.py +193 -0
  25. mcpbr/task_batching.py +403 -0
  26. mcpbr/task_scheduler.py +468 -0
  27. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/METADATA +10 -6
  28. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/RECORD +38 -15
  29. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/brave-search.yaml +0 -0
  30. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/filesystem.yaml +0 -0
  31. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/github.yaml +0 -0
  32. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/google-maps.yaml +0 -0
  33. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/postgres.yaml +0 -0
  34. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/slack.yaml +0 -0
  35. {mcpbr-0.4.15.data → mcpbr-0.5.0.data}/data/mcpbr/data/templates/sqlite.yaml +0 -0
  36. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/WHEEL +0 -0
  37. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {mcpbr-0.4.15.dist-info → mcpbr-0.5.0.dist-info}/licenses/LICENSE +0 -0
mcpbr/dataset_streaming.py
@@ -0,0 +1,491 @@
+"""Memory-efficient large dataset handling for benchmark evaluations.
+
+This module provides streaming and chunked loading of large HuggingFace datasets,
+enabling benchmark runs on datasets that would otherwise exceed available memory.
+It includes memory monitoring, automatic chunking under memory pressure, and
+iterator-based APIs compatible with existing benchmark ``load_tasks`` patterns.
+
+Key components:
+- ``MemoryMonitor``: Tracks RSS and available memory, detects memory pressure.
+- ``ChunkedLoader``: Iterates over a HuggingFace dataset in configurable chunks.
+- ``StreamingDataset``: High-level API that yields tasks lazily with memory awareness.
+- ``DatasetStats``: Summary statistics for a streaming load session.
+"""
+
+import logging
+import os
+import time
+from collections.abc import Iterator
+from dataclasses import dataclass
+from typing import Any
+
+from datasets import load_dataset, load_dataset_builder
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# DatasetStats
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DatasetStats:
+    """Summary statistics for a streaming dataset load session.
+
+    Attributes:
+        total_loaded: Number of individual task items yielded so far.
+        peak_memory_mb: Peak RSS observed during loading (in megabytes).
+        chunks_processed: Number of chunks fetched from the underlying loader.
+        load_time_seconds: Wall-clock seconds elapsed during loading.
+    """
+
+    total_loaded: int = 0
+    peak_memory_mb: float = 0.0
+    chunks_processed: int = 0
+    load_time_seconds: float = 0.0
+
+
+# ---------------------------------------------------------------------------
+# MemoryMonitor
+# ---------------------------------------------------------------------------
+
+
+class MemoryMonitor:
+    """Lightweight monitor for process and system memory usage.
+
+    Uses ``psutil`` when available, falling back to reading ``/proc/self/status``
+    and ``/proc/meminfo`` on Linux. On platforms where neither is available the
+    methods return ``0.0`` and memory-pressure detection is disabled.
+    """
+
+    def __init__(self) -> None:
+        """Initialize the memory monitor and detect available backends."""
+        self._has_psutil = False
+        try:
+            import psutil  # noqa: F401
+
+            self._has_psutil = True
+        except ImportError:
+            pass
+
+    # -- public API ---------------------------------------------------------
+
+    def get_memory_usage_mb(self) -> float:
+        """Return the current Resident Set Size (RSS) in megabytes.
+
+        Returns:
+            RSS in MB, or ``0.0`` if measurement is unavailable.
+        """
+        if self._has_psutil:
+            return self._rss_via_psutil()
+        return self._rss_via_proc()
+
+    def get_available_memory_mb(self) -> float:
+        """Return available system memory in megabytes.
+
+        Returns:
+            Available memory in MB, or ``0.0`` if measurement is unavailable.
+        """
+        if self._has_psutil:
+            return self._available_via_psutil()
+        return self._available_via_proc()
+
+    def is_memory_pressure(self, threshold_pct: float = 80.0) -> bool:
+        """Check whether system memory usage exceeds a threshold.
+
+        Args:
+            threshold_pct: Percentage (0--100) of total memory above which
+                the system is considered under pressure.
+
+        Returns:
+            ``True`` if memory usage exceeds *threshold_pct*, ``False``
+            otherwise or if measurement is unavailable.
+        """
+        if self._has_psutil:
+            return self._pressure_via_psutil(threshold_pct)
+        return self._pressure_via_proc(threshold_pct)
+
+    # -- psutil backend -----------------------------------------------------
+
+    def _rss_via_psutil(self) -> float:
+        """Get RSS using psutil."""
+        try:
+            import psutil
+
+            process = psutil.Process(os.getpid())
+            return process.memory_info().rss / (1024 * 1024)
+        except Exception:
+            return 0.0
+
+    def _available_via_psutil(self) -> float:
+        """Get available system memory using psutil."""
+        try:
+            import psutil
+
+            return psutil.virtual_memory().available / (1024 * 1024)
+        except Exception:
+            return 0.0
+
+    def _pressure_via_psutil(self, threshold_pct: float) -> bool:
+        """Check memory pressure using psutil."""
+        try:
+            import psutil
+
+            return psutil.virtual_memory().percent >= threshold_pct
+        except Exception:
+            return False
+
+    # -- /proc fallback -----------------------------------------------------
+
+    @staticmethod
+    def _rss_via_proc() -> float:
+        """Get RSS by parsing ``/proc/self/status``."""
+        try:
+            with open("/proc/self/status") as fh:
+                for line in fh:
+                    if line.startswith("VmRSS:"):
+                        # Value is in kB
+                        return int(line.split()[1]) / 1024
+        except (OSError, ValueError, IndexError):
+            pass
+        return 0.0
+
+    @staticmethod
+    def _available_via_proc() -> float:
+        """Get available memory by parsing ``/proc/meminfo``."""
+        try:
+            with open("/proc/meminfo") as fh:
+                for line in fh:
+                    if line.startswith("MemAvailable:"):
+                        return int(line.split()[1]) / 1024
+        except (OSError, ValueError, IndexError):
+            pass
+        return 0.0
+
+    @staticmethod
+    def _pressure_via_proc(threshold_pct: float) -> bool:
+        """Check memory pressure using ``/proc/meminfo``."""
+        try:
+            mem_total = 0.0
+            mem_available = 0.0
+            with open("/proc/meminfo") as fh:
+                for line in fh:
+                    if line.startswith("MemTotal:"):
+                        mem_total = int(line.split()[1]) / 1024
+                    elif line.startswith("MemAvailable:"):
+                        mem_available = int(line.split()[1]) / 1024
+            if mem_total > 0:
+                used_pct = ((mem_total - mem_available) / mem_total) * 100
+                return used_pct >= threshold_pct
+        except (OSError, ValueError, IndexError):
+            pass
+        return False
+
+
+# ---------------------------------------------------------------------------
+# ChunkedLoader
+# ---------------------------------------------------------------------------
+
+
+class ChunkedLoader:
+    """Iterate over a HuggingFace dataset in fixed-size chunks.
+
+    Each iteration yields a ``list[dict]`` containing up to *chunk_size*
+    records. When the HuggingFace ``datasets`` library supports it, the
+    dataset is loaded with ``streaming=True`` to avoid downloading the
+    entire dataset at once.
+
+    Args:
+        dataset_name: HuggingFace dataset identifier (e.g. ``"openai_humaneval"``).
+        split: Dataset split to load (default ``"test"``).
+        chunk_size: Maximum number of records per chunk.
+        subset: Optional dataset subset / configuration name.
+    """
+
+    def __init__(
+        self,
+        dataset_name: str,
+        split: str = "test",
+        chunk_size: int = 1000,
+        subset: str | None = None,
+    ) -> None:
+        self.dataset_name = dataset_name
+        self.split = split
+        self.chunk_size = chunk_size
+        self.subset = subset
+        self._total_items: int | None = None
+
+    # -- public API ---------------------------------------------------------
+
+    def __len__(self) -> int:
+        """Return the total number of items in the dataset.
+
+        This may trigger a metadata fetch the first time it is called.
+
+        Returns:
+            Total number of items, or ``0`` if the count cannot be determined.
+        """
+        if self._total_items is None:
+            self._total_items = self._fetch_total_items()
+        return self._total_items
+
+    def __iter__(self) -> Iterator[list[dict[str, Any]]]:
+        """Yield successive chunks of dataset records.
+
+        Yields:
+            Lists of up to *chunk_size* task dictionaries.
+        """
+        dataset_iter = self._load_dataset_streaming()
+
+        chunk: list[dict[str, Any]] = []
+        for item in dataset_iter:
+            chunk.append(dict(item))
+            if len(chunk) >= self.chunk_size:
+                yield chunk
+                chunk = []
+
+        # Yield any remaining items
+        if chunk:
+            yield chunk
+
+    # -- internal helpers ---------------------------------------------------
+
+    def _load_dataset_streaming(self) -> Any:
+        """Load the dataset, preferring streaming mode.
+
+        Returns:
+            An iterable of dataset records (either a streaming
+            ``IterableDataset`` or a regular ``Dataset``).
+        """
+        load_kwargs: dict[str, Any] = {}
+        if self.subset is not None:
+            load_kwargs["name"] = self.subset
+
+        # Try streaming first for memory efficiency
+        try:
+            ds = load_dataset(
+                self.dataset_name,
+                split=self.split,
+                streaming=True,
+                **load_kwargs,
+            )
+            logger.info(
+                "Loaded dataset %s (split=%s) in streaming mode",
+                self.dataset_name,
+                self.split,
+            )
+            return ds
+        except Exception:
+            logger.debug(
+                "Streaming not supported for %s; falling back to full load",
+                self.dataset_name,
+            )
+
+        # Fallback: load the full dataset into memory
+        ds = load_dataset(
+            self.dataset_name,
+            split=self.split,
+            **load_kwargs,
+        )
+        logger.info(
+            "Loaded dataset %s (split=%s) fully into memory (%d items)",
+            self.dataset_name,
+            self.split,
+            len(ds),
+        )
+        return ds
+
+    def _fetch_total_items(self) -> int:
+        """Fetch the total item count from dataset metadata.
+
+        Returns:
+            The number of items, or ``0`` if it cannot be determined.
+        """
+        try:
+            load_kwargs: dict[str, Any] = {}
+            if self.subset is not None:
+                load_kwargs["name"] = self.subset
+
+            builder = load_dataset_builder(self.dataset_name, **load_kwargs)
+            info = builder.info
+            if info.splits and self.split in info.splits:
+                return info.splits[self.split].num_examples
+        except Exception:
+            logger.debug(
+                "Could not determine total items for %s/%s",
+                self.dataset_name,
+                self.split,
+            )
+        return 0
+
+
+# ---------------------------------------------------------------------------
+# StreamingDataset
+# ---------------------------------------------------------------------------
+
+
+class StreamingDataset:
+    """High-level memory-aware dataset loader.
+
+    Wraps :class:`ChunkedLoader` with automatic memory monitoring and
+    adaptive chunk sizing. Provides an iterator-based ``load_tasks`` method
+    compatible with the existing :class:`~mcpbr.benchmarks.base.Benchmark`
+    protocol (callers can materialise with ``list(...)`` when needed).
+
+    Args:
+        dataset_name: HuggingFace dataset identifier.
+        split: Dataset split (default ``"test"``).
+        max_memory_mb: Optional soft memory cap. When the process RSS exceeds
+            this value the chunk size is halved to reduce pressure.
+    """
+
+    # Default and minimum chunk sizes
+    _DEFAULT_CHUNK_SIZE = 1000
+    _MIN_CHUNK_SIZE = 50
+
+    def __init__(
+        self,
+        dataset_name: str,
+        split: str = "test",
+        max_memory_mb: float | None = None,
+    ) -> None:
+        self.dataset_name = dataset_name
+        self.split = split
+        self.max_memory_mb = max_memory_mb
+
+        self._monitor = MemoryMonitor()
+        self._stats = DatasetStats()
+        self._chunk_size = self._DEFAULT_CHUNK_SIZE
+        self._start_time: float | None = None
+
+    # -- public API ---------------------------------------------------------
+
+    def load_tasks(
+        self,
+        sample_size: int | None = None,
+        task_ids: list[str] | None = None,
+    ) -> Iterator[dict[str, Any]]:
+        """Lazily yield task dictionaries from the dataset.
+
+        Args:
+            sample_size: Maximum number of tasks to yield (``None`` for all).
+            task_ids: If provided, only yield tasks whose ``instance_id`` or
+                ``task_id`` is in this set.
+
+        Yields:
+            Individual task dictionaries.
+        """
+        self._start_time = time.monotonic()
+        self._stats = DatasetStats()
+
+        if sample_size is not None and sample_size <= 0:
+            self._stats.load_time_seconds = time.monotonic() - self._start_time
+            return
+
+        task_id_set: set[str] | None = set(task_ids) if task_ids else None
+
+        loader = ChunkedLoader(
+            dataset_name=self.dataset_name,
+            split=self.split,
+            chunk_size=self._chunk_size,
+        )
+
+        yielded = 0
+
+        for chunk in loader:
+            self._stats.chunks_processed += 1
+
+            # Adapt chunk size under memory pressure
+            self._maybe_adapt_chunk_size(loader)
+
+            for item in chunk:
+                # Apply task_id filter
+                if task_id_set is not None:
+                    item_id = item.get("instance_id") or item.get("task_id")
+                    if item_id not in task_id_set:
+                        continue
+
+                self._stats.total_loaded += 1
+                yielded += 1
+
+                # Track peak memory
+                current_mb = self._monitor.get_memory_usage_mb()
+                if current_mb > self._stats.peak_memory_mb:
+                    self._stats.peak_memory_mb = current_mb
+
+                yield item
+
+                if sample_size is not None and yielded >= sample_size:
+                    self._stats.load_time_seconds = time.monotonic() - self._start_time
+                    return
+
+        self._stats.load_time_seconds = time.monotonic() - self._start_time
+
+    def get_stats(self) -> DatasetStats:
+        """Return statistics collected during the most recent ``load_tasks`` call.
+
+        Returns:
+            A :class:`DatasetStats` instance with current metrics.
+        """
+        # Update load_time if still in progress
+        if self._start_time is not None and self._stats.load_time_seconds == 0.0:
+            self._stats.load_time_seconds = time.monotonic() - self._start_time
+        return self._stats
+
+    # -- internal helpers ---------------------------------------------------
+
+    def _maybe_adapt_chunk_size(self, loader: ChunkedLoader) -> None:
+        """Reduce the chunk size if memory pressure is detected.
+
+        Args:
+            loader: The active :class:`ChunkedLoader` whose chunk size will be
+                updated in place.
+        """
+        under_pressure = False
+
+        if self.max_memory_mb is not None:
+            current_mb = self._monitor.get_memory_usage_mb()
+            if current_mb > self.max_memory_mb:
+                under_pressure = True
+                logger.warning(
+                    "RSS %.1f MB exceeds max_memory_mb %.1f MB; reducing chunk size",
+                    current_mb,
+                    self.max_memory_mb,
+                )
+
+        if not under_pressure and self._monitor.is_memory_pressure():
+            under_pressure = True
+            logger.warning("System memory pressure detected; reducing chunk size")
+
+        if under_pressure and loader.chunk_size > self._MIN_CHUNK_SIZE:
+            new_size = max(loader.chunk_size // 2, self._MIN_CHUNK_SIZE)
+            logger.info("Chunk size reduced from %d to %d", loader.chunk_size, new_size)
+            loader.chunk_size = new_size
+
+
+# ---------------------------------------------------------------------------
+# Convenience helpers
+# ---------------------------------------------------------------------------
+
+
+def get_memory_usage_mb() -> float:
+    """Return current process RSS in megabytes.
+
+    Convenience wrapper around :meth:`MemoryMonitor.get_memory_usage_mb`.
+
+    Returns:
+        RSS in MB, or ``0.0`` if measurement is unavailable.
+    """
+    return MemoryMonitor().get_memory_usage_mb()
+
+
+def get_available_memory_mb() -> float:
+    """Return available system memory in megabytes.
+
+    Convenience wrapper around :meth:`MemoryMonitor.get_available_memory_mb`.
+
+    Returns:
+        Available memory in MB, or ``0.0`` if measurement is unavailable.
+    """
+    return MemoryMonitor().get_available_memory_mb()
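The docstrings above describe an iterator-based loading path: StreamingDataset.load_tasks() yields task dicts lazily while MemoryMonitor watches RSS and halves the chunk size under pressure. The sketch below shows one way a caller might consume the new module; the dataset identifier, soft memory cap, and run_benchmark_task helper are illustrative assumptions, not code from this release.

    from mcpbr.dataset_streaming import StreamingDataset, get_memory_usage_mb

    # Hypothetical usage: stream a large benchmark dataset with a ~2 GB soft cap.
    ds = StreamingDataset(
        "SWE-bench/SWE-bench_Lite",  # example identifier, not taken from this diff
        split="test",
        max_memory_mb=2048,
    )

    for task in ds.load_tasks(sample_size=100):
        run_benchmark_task(task)  # placeholder for the harness's per-task entry point

    stats = ds.get_stats()
    print(
        f"loaded={stats.total_loaded} chunks={stats.chunks_processed} "
        f"peak_rss_mb={stats.peak_memory_mb:.1f} rss_now_mb={get_memory_usage_mb():.1f}"
    )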
mcpbr/dataset_versioning.py
@@ -0,0 +1,222 @@
+"""Dataset versioning for reproducible benchmark evaluations.
+
+This module provides utilities to pin and track HuggingFace dataset versions,
+ensuring that benchmark runs can be reproduced with the exact same data.
+Version information includes dataset revision hashes, download timestamps,
+and optional checksums for data integrity verification.
+"""
+
+import hashlib
+import json
+import logging
+from dataclasses import asdict, dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+from datasets import Dataset, load_dataset
+from huggingface_hub import dataset_info
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DatasetVersion:
+    """Pinned version information for a HuggingFace dataset.
+
+    Attributes:
+        dataset_id: HuggingFace dataset identifier (e.g., 'SWE-bench/SWE-bench_Lite').
+        revision: Git revision hash of the dataset (None for latest).
+        download_date: ISO 8601 timestamp of when the version was pinned.
+        checksum: Optional SHA256 checksum of the dataset content for integrity verification.
+    """
+
+    dataset_id: str
+    revision: str | None
+    download_date: str
+    checksum: str | None
+
+
+def pin_dataset_version(
+    dataset_id: str,
+    revision: str | None = None,
+) -> DatasetVersion:
+    """Record the current version of a HuggingFace dataset.
+
+    Fetches dataset metadata from the HuggingFace Hub to determine the
+    current revision. If a specific revision is provided, it is used directly.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier (e.g., 'SWE-bench/SWE-bench_Lite').
+        revision: Specific git revision to pin. If None, the latest revision is fetched.
+
+    Returns:
+        DatasetVersion with the pinned revision and metadata.
+
+    Raises:
+        Exception: If the dataset cannot be found or accessed on the HuggingFace Hub.
+    """
+    info = dataset_info(dataset_id, revision=revision)
+    resolved_revision = info.sha
+
+    # Compute a checksum from the dataset card and file metadata for integrity
+    checksum_data = f"{dataset_id}:{resolved_revision}"
+    if info.siblings:
+        file_names = sorted(s.rfilename for s in info.siblings)
+        checksum_data += ":" + ",".join(file_names)
+    checksum = hashlib.sha256(checksum_data.encode()).hexdigest()
+
+    download_date = datetime.now(timezone.utc).isoformat()
+
+    version = DatasetVersion(
+        dataset_id=dataset_id,
+        revision=resolved_revision,
+        download_date=download_date,
+        checksum=checksum,
+    )
+
+    logger.info(
+        "Pinned dataset %s at revision %s",
+        dataset_id,
+        resolved_revision,
+    )
+
+    return version
+
+
+def load_dataset_pinned(
+    dataset_id: str,
+    version: DatasetVersion | None = None,
+    **kwargs: Any,
+) -> Dataset:
+    """Load a HuggingFace dataset using a pinned version for reproducibility.
+
+    Wraps the standard ``datasets.load_dataset`` call, injecting the pinned
+    revision so that the exact same data snapshot is used across runs.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier.
+        version: Pinned version to use. If None, loads the latest version.
+        **kwargs: Additional keyword arguments passed to ``datasets.load_dataset``
+            (e.g., split, name, streaming).
+
+    Returns:
+        The loaded HuggingFace Dataset.
+    """
+    revision = None
+    if version is not None:
+        revision = version.revision
+        logger.info(
+            "Loading dataset %s at pinned revision %s (pinned on %s)",
+            dataset_id,
+            revision,
+            version.download_date,
+        )
+    else:
+        logger.info("Loading dataset %s at latest revision", dataset_id)
+
+    return load_dataset(dataset_id, revision=revision, **kwargs)
+
+
+def save_version_manifest(
+    versions: dict[str, DatasetVersion],
+    path: Path,
+) -> None:
+    """Save dataset version pins to a JSON manifest file.
+
+    The manifest file records all pinned dataset versions so they can be
+    shared across team members or CI environments for reproducible runs.
+
+    Args:
+        versions: Mapping of dataset identifiers to their pinned versions.
+        path: File path to write the JSON manifest.
+    """
+    manifest: dict[str, Any] = {
+        "format_version": "1.0",
+        "created_at": datetime.now(timezone.utc).isoformat(),
+        "datasets": {},
+    }
+
+    for dataset_id, version in versions.items():
+        manifest["datasets"][dataset_id] = asdict(version)
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(path, "w") as f:
+        json.dump(manifest, f, indent=2)
+
+    logger.info("Saved version manifest with %d datasets to %s", len(versions), path)
+
+
+def load_version_manifest(path: Path) -> dict[str, DatasetVersion]:
+    """Load pinned dataset versions from a JSON manifest file.
+
+    Args:
+        path: File path to the JSON manifest.
+
+    Returns:
+        Mapping of dataset identifiers to their pinned versions.
+
+    Raises:
+        FileNotFoundError: If the manifest file does not exist.
+        json.JSONDecodeError: If the manifest file contains invalid JSON.
+        KeyError: If the manifest is missing required fields.
+    """
+    with open(path) as f:
+        manifest = json.load(f)
+
+    versions: dict[str, DatasetVersion] = {}
+    datasets_data = manifest.get("datasets", {})
+
+    for dataset_id, version_data in datasets_data.items():
+        versions[dataset_id] = DatasetVersion(
+            dataset_id=version_data["dataset_id"],
+            revision=version_data.get("revision"),
+            download_date=version_data["download_date"],
+            checksum=version_data.get("checksum"),
+        )
+
+    logger.info("Loaded version manifest with %d datasets from %s", len(versions), path)
+
+    return versions
+
+
+def get_dataset_info(dataset_id: str) -> dict[str, Any]:
+    """Get metadata about a HuggingFace dataset.
+
+    Retrieves information such as the latest revision, description,
+    file listing, and other Hub metadata.
+
+    Args:
+        dataset_id: HuggingFace dataset identifier.
+
+    Returns:
+        Dictionary containing dataset metadata with keys:
+        - dataset_id: The dataset identifier.
+        - latest_revision: The current HEAD revision hash.
+        - description: Dataset description text.
+        - tags: List of dataset tags.
+        - downloads: Number of downloads.
+        - last_modified: Last modification timestamp.
+        - files: List of files in the dataset repository.
+
+    Raises:
+        Exception: If the dataset cannot be found or accessed on the HuggingFace Hub.
+    """
+    info = dataset_info(dataset_id)
+
+    files: list[str] = []
+    if info.siblings:
+        files = [s.rfilename for s in info.siblings]
+
+    result: dict[str, Any] = {
+        "dataset_id": dataset_id,
+        "latest_revision": info.sha,
+        "description": info.description or "",
+        "tags": list(info.tags) if info.tags else [],
+        "downloads": info.downloads if info.downloads is not None else 0,
+        "last_modified": info.last_modified.isoformat() if info.last_modified else None,
+        "files": files,
+    }
+
+    return result
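As a companion to the versioning API above, a minimal pin-and-reload workflow might look like the sketch below. The manifest filename is an assumption chosen for illustration; the dataset identifier mirrors the example in the docstrings, and only the function names come from the diff itself.

    from pathlib import Path

    from mcpbr.dataset_versioning import (
        load_dataset_pinned,
        load_version_manifest,
        pin_dataset_version,
        save_version_manifest,
    )

    # Pin the current Hub revision and persist it so CI can reproduce the run.
    version = pin_dataset_version("SWE-bench/SWE-bench_Lite")
    save_version_manifest(
        {"SWE-bench/SWE-bench_Lite": version},
        Path("datasets.lock.json"),  # hypothetical manifest path
    )

    # Later, or on another machine: reload the manifest and fetch the same snapshot.
    pinned = load_version_manifest(Path("datasets.lock.json"))
    ds = load_dataset_pinned(
        "SWE-bench/SWE-bench_Lite",
        version=pinned["SWE-bench/SWE-bench_Lite"],
        split="test",
    )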