eval-toolkit 0.27.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,424 @@
1
+ """Dataset loading: pluggable :class:`DatasetLoader` Protocol + reference impls.
2
+
3
+ A :class:`DatasetLoader` produces a ``dict[str, EvalSlice]`` keyed by split
4
+ name (HuggingFace ``DatasetDict`` shape). For un-split data, return
5
+ ``{"all": slice}`` and pipe through a :class:`~eval_toolkit.splits.Splitter`
6
+ for cross-validation. For pre-split data (``{"train", "validation", "test"}``),
7
+ the harness consumes the dict directly.
8
+
9
+ Reference impls cover the four prompt-injection-* projects' shapes:
10
+ :class:`DataFrameLoader` (in-memory dataframe + a split column),
11
+ :class:`SingleSliceLoader` (already-built EvalSlice as ``{"all": ...}``),
12
+ :class:`ParquetGlobLoader` (load + concat parquet files; hashes for the
13
+ manifest), and :class:`HFDatasetsLoader` (optional, soft-imports
14
+ ``datasets`` only if installed).
15
+
16
+ The :meth:`describe` method on every loader returns a Croissant-compatible
17
+ metadata subset (``name``, ``description``, ``citeAs``, ``license``, ``url``,
18
+ ``distribution: list[{name, contentUrl, sha256, contentSize}]``) so manifests
19
+ interoperate with the MLCommons standard without forcing full Croissant
20
+ production.
21
+
22
+ References
23
+ ----------
24
+ .. [1] Croissant: A Metadata Format for ML-Ready Datasets. arXiv:2403.19546.
25
+ .. [2] HuggingFace ``datasets.DatasetDict`` — the convergent named-splits
26
+ shape adopted by ``lm-evaluation-harness``, ``HELM``, and Inspect AI.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import glob as _glob
32
+ from collections.abc import Mapping, Sequence
33
+ from dataclasses import dataclass
34
+ from pathlib import Path
35
+ from typing import Any, Protocol, cast, runtime_checkable
36
+
37
+ import pandas as pd
38
+
39
+ from eval_toolkit.harness import EvalSlice
40
+ from eval_toolkit.provenance import file_sha256
41
+
42
+ __all__ = [
43
+ "DataFrameLoader",
44
+ "DatasetLoader",
45
+ "HFDatasetsLoader",
46
+ "ParquetGlobLoader",
47
+ "SingleSliceLoader",
48
+ ]
49
+
50
+
51
@runtime_checkable
class DatasetLoader(Protocol):
    """Structural interface for objects that yield :class:`EvalSlice` splits.

    Mirrors HuggingFace ``DatasetDict``: the result is keyed by split name.
    For un-split data, return ``{"all": slice}`` and let a
    :class:`~eval_toolkit.splits.Splitter` produce folds. Tensor-agnostic: the
    :class:`~eval_toolkit.harness.Scorer` handles tokenization / tensor
    conversion / device placement.

    Notes
    -----
    The protocol is ``runtime_checkable``, so ``isinstance(obj, DatasetLoader)``
    works, but that check only verifies that ``load_splits`` and ``describe``
    exist as attributes; signatures and return types are not inspected.
    """

    def load_splits(self) -> dict[str, EvalSlice]:  # pragma: no cover
        """Return a ``{split_name: EvalSlice}`` dict."""
        ...

    def describe(self) -> dict[str, object]:  # pragma: no cover
        """Return Croissant-subset metadata for the manifest.

        Recommended keys (spelled exactly as the reference loaders in this
        module emit them; note the camelCase ``citeAs``):

        - ``name``: dataset name
        - ``description``: short description
        - ``citeAs``: BibTeX or arXiv id
        - ``license``: SPDX identifier
        - ``url``: canonical URL
        - ``distribution``: list of ``{name, contentUrl, sha256, contentSize}``
        """
        ...
79
+
80
+
81
+ @dataclass(frozen=True, slots=True)
82
+ class DataFrameLoader:
83
+ """Wraps a single in-memory dataframe + a split column.
84
+
85
+ Splits the dataframe by the unique values in ``split_col``. Covers the
86
+ four prompt-injection-* projects' shape (a single parquet with a
87
+ ``split`` column). For pre-split data without a single column,
88
+ concatenate first and synthesize a ``split`` column.
89
+
90
+ Parameters
91
+ ----------
92
+ df : pandas.DataFrame
93
+ The full dataset.
94
+ split_col : str
95
+ Column whose unique values name the splits (e.g. ``"split"`` with
96
+ values ``"train" / "val" / "test"``).
97
+ feature_col : str, optional
98
+ Column carrying the feature passed to ``Scorer.predict_proba``.
99
+ Default ``"text"``.
100
+ label_col : str, optional
101
+ Column with binary labels in ``{0, 1}``. Default ``"label"``.
102
+ strata_col : str or None, optional
103
+ Optional stratifier column for per-stratum recall reporting.
104
+ name : str, optional
105
+ Dataset name used in :meth:`describe`. Default ``""``.
106
+ description : str, optional
107
+ Free-text description used in :meth:`describe`. Default ``""``.
108
+ cite_as : str, optional
109
+ Citation string (BibTeX, arXiv id) used in :meth:`describe`.
110
+ license : str, optional
111
+ SPDX license identifier used in :meth:`describe`.
112
+ url : str, optional
113
+ Canonical URL used in :meth:`describe`.
114
+ """
115
+
116
+ df: pd.DataFrame
117
+ split_col: str
118
+ feature_col: str = "text"
119
+ label_col: str = "label"
120
+ strata_col: str | None = None
121
+ name: str = ""
122
+ description: str = ""
123
+ cite_as: str = ""
124
+ license: str = ""
125
+ url: str = ""
126
+
127
+ def __post_init__(self) -> None:
128
+ """Validate the dataframe has the columns we'll read."""
129
+ for col in (self.split_col, self.feature_col, self.label_col):
130
+ if col not in self.df.columns:
131
+ raise KeyError(
132
+ f"DataFrameLoader: missing column {col!r}; available: "
133
+ f"{list(self.df.columns)}"
134
+ )
135
+ if self.strata_col is not None and self.strata_col not in self.df.columns:
136
+ raise KeyError(
137
+ f"DataFrameLoader: missing strata column {self.strata_col!r}; "
138
+ f"available: {list(self.df.columns)}"
139
+ )
140
+
141
+ def load_splits(self) -> dict[str, EvalSlice]:
142
+ """Group rows by ``split_col`` and build one EvalSlice per group."""
143
+ out: dict[str, EvalSlice] = {}
144
+ for split_name, sub_df in self.df.groupby(self.split_col, sort=False):
145
+ sub_df_reset = sub_df.reset_index(drop=True)
146
+ out[str(split_name)] = EvalSlice(
147
+ name=str(split_name),
148
+ df=sub_df_reset,
149
+ description=self.description,
150
+ feature_col=self.feature_col,
151
+ label_col=self.label_col,
152
+ strata_col=self.strata_col,
153
+ )
154
+ return out
155
+
156
+ def describe(self) -> dict[str, object]:
157
+ """Croissant-subset metadata. ``distribution`` empty (no file artifacts)."""
158
+ return {
159
+ "name": self.name or "DataFrameLoader",
160
+ "description": self.description,
161
+ "citeAs": self.cite_as,
162
+ "license": self.license,
163
+ "url": self.url,
164
+ "distribution": [],
165
+ "n_total_rows": int(len(self.df)),
166
+ "split_col": self.split_col,
167
+ }
168
+
169
+
170
+ @dataclass(frozen=True, slots=True)
171
+ class SingleSliceLoader:
172
+ """Wraps a single pre-built :class:`EvalSlice` as ``{"all": slice}``.
173
+
174
+ The trivial entry point into the :class:`~eval_toolkit.splits.Splitter`
175
+ pipeline when you already have an EvalSlice and want to run K-fold or
176
+ holdout on it.
177
+
178
+ Parameters
179
+ ----------
180
+ slice_ : EvalSlice
181
+ The parent slice. Will be re-keyed as ``"all"``.
182
+ name : str, optional
183
+ Dataset name for :meth:`describe`. Default ``""``.
184
+ description : str, optional
185
+ Free-text description. Default ``""``.
186
+ """
187
+
188
+ slice_: EvalSlice
189
+ name: str = ""
190
+ description: str = ""
191
+
192
+ def load_splits(self) -> dict[str, EvalSlice]:
193
+ """Return ``{"all": <renamed-slice>}``."""
194
+ renamed = EvalSlice(
195
+ name="all",
196
+ df=self.slice_.df,
197
+ description=self.description or self.slice_.description,
198
+ feature_col=self.slice_.feature_col,
199
+ label_col=self.slice_.label_col,
200
+ strata_col=self.slice_.strata_col,
201
+ )
202
+ return {"all": renamed}
203
+
204
+ def describe(self) -> dict[str, object]:
205
+ """Croissant-subset metadata."""
206
+ return {
207
+ "name": self.name or "SingleSliceLoader",
208
+ "description": self.description,
209
+ "citeAs": "",
210
+ "license": "",
211
+ "url": "",
212
+ "distribution": [],
213
+ "n_total_rows": int(len(self.slice_.df)),
214
+ }
215
+
216
+
217
+ @dataclass(frozen=True, slots=True)
218
+ class ParquetGlobLoader:
219
+ """Load + concat parquet files matched by a glob; hash each for the manifest.
220
+
221
+ For each glob, all matching files are loaded with
222
+ :func:`pandas.read_parquet` and concatenated. Each file's SHA-256 is
223
+ captured in :meth:`describe` so manifests are reproducible.
224
+
225
+ Parameters
226
+ ----------
227
+ splits : dict[str, str]
228
+ Mapping ``{split_name: glob_pattern}``. Each pattern's matching
229
+ files become one split. Patterns are evaluated relative to the
230
+ process CWD; pass absolute paths for determinism.
231
+ feature_col : str, optional
232
+ Column name. Default ``"text"``.
233
+ label_col : str, optional
234
+ Column name. Default ``"label"``.
235
+ strata_col : str or None, optional
236
+ name, description, cite_as, license, url : str, optional
237
+ Croissant metadata fields.
238
+ """
239
+
240
+ splits: dict[str, str]
241
+ feature_col: str = "text"
242
+ label_col: str = "label"
243
+ strata_col: str | None = None
244
+ name: str = ""
245
+ description: str = ""
246
+ cite_as: str = ""
247
+ license: str = ""
248
+ url: str = ""
249
+
250
+ def _resolve_files(self) -> dict[str, list[Path]]:
251
+ """Expand each glob pattern to a sorted list of Path objects."""
252
+ out: dict[str, list[Path]] = {}
253
+ for split_name, pattern in self.splits.items():
254
+ files = sorted(Path(p) for p in _glob.glob(pattern))
255
+ if not files:
256
+ raise FileNotFoundError(
257
+ f"ParquetGlobLoader: glob {pattern!r} matched no files for split "
258
+ f"{split_name!r}"
259
+ )
260
+ out[split_name] = files
261
+ return out
262
+
263
+ def load_splits(self) -> dict[str, EvalSlice]:
264
+ """Read + concat each split's parquet files into an EvalSlice.
265
+
266
+ Raises
267
+ ------
268
+ KeyError
269
+ If any split's loaded DataFrame is missing ``feature_col`` or
270
+ ``label_col``.
271
+ """
272
+ files_by_split = self._resolve_files()
273
+ out: dict[str, EvalSlice] = {}
274
+ for split_name, files in files_by_split.items():
275
+ parts = [pd.read_parquet(p) for p in files]
276
+ df = pd.concat(parts, axis=0, ignore_index=True)
277
+ for col in (self.feature_col, self.label_col):
278
+ if col not in df.columns:
279
+ raise KeyError(
280
+ f"ParquetGlobLoader: split {split_name!r}: missing column "
281
+ f"{col!r}; available: {list(df.columns)}"
282
+ )
283
+ out[split_name] = EvalSlice(
284
+ name=split_name,
285
+ df=df,
286
+ description=self.description,
287
+ feature_col=self.feature_col,
288
+ label_col=self.label_col,
289
+ strata_col=self.strata_col,
290
+ )
291
+ return out
292
+
293
+ def describe(self) -> dict[str, object]:
294
+ """Croissant-subset metadata with per-file SHA-256 in ``distribution``."""
295
+ files_by_split = self._resolve_files()
296
+ distribution: list[dict[str, object]] = []
297
+ for split_name, files in files_by_split.items():
298
+ for f in files:
299
+ size = f.stat().st_size if f.exists() else 0
300
+ sha = file_sha256(f, strict=False)
301
+ distribution.append(
302
+ {
303
+ "name": f"{split_name}/{f.name}",
304
+ "contentUrl": str(f),
305
+ "sha256": sha,
306
+ "contentSize": int(size),
307
+ }
308
+ )
309
+ return {
310
+ "name": self.name or "ParquetGlobLoader",
311
+ "description": self.description,
312
+ "citeAs": self.cite_as,
313
+ "license": self.license,
314
+ "url": self.url,
315
+ "distribution": distribution,
316
+ }
317
+
318
+
319
+ @dataclass(frozen=True, slots=True)
320
+ class HFDatasetsLoader:
321
+ """Load a HuggingFace ``datasets`` repo as ``{split: EvalSlice}``.
322
+
323
+ Soft dependency on the ``datasets`` package: an :class:`ImportError` is
324
+ raised at :meth:`load_splits` time with a clear install hint. This is
325
+ intentional — eval-toolkit's core deps are numpy / scipy / sklearn only.
326
+
327
+ Parameters
328
+ ----------
329
+ repo_id : str
330
+ HuggingFace dataset repo, e.g. ``"deepset/prompt-injections"``.
331
+ splits : sequence of str or None, optional
332
+ Subset of HF splits to load. ``None`` = every split the repo defines.
333
+ feature_col : str, optional
334
+ Column name in the HF dataset. Default ``"text"``.
335
+ label_col : str, optional
336
+ Column name. Default ``"label"``.
337
+ strata_col : str or None, optional
338
+ config_name : str or None, optional
339
+ HF dataset config name (some datasets have multiple configs).
340
+ name, description, cite_as, license, url : str, optional
341
+ Croissant metadata fields.
342
+ """
343
+
344
+ repo_id: str
345
+ splits: Sequence[str] | None = None
346
+ feature_col: str = "text"
347
+ label_col: str = "label"
348
+ strata_col: str | None = None
349
+ config_name: str | None = None
350
+ name: str = ""
351
+ description: str = ""
352
+ cite_as: str = ""
353
+ license: str = ""
354
+ url: str = ""
355
+
356
+ def _load_dataset(self) -> Mapping[str, Any]:
357
+ """Soft-import ``datasets`` and return the loaded DatasetDict.
358
+
359
+ Returns a ``Mapping[str, Any]`` (HF ``DatasetDict`` is dict-like —
360
+ keys are split names, values are HF ``Dataset`` objects exposing
361
+ ``.to_pandas()``). Annotated as ``Mapping`` rather than concrete
362
+ ``DatasetDict`` so consumers don't need to install ``datasets`` to
363
+ type-check downstream code.
364
+ """
365
+ try:
366
+ from datasets import load_dataset # type: ignore[import-not-found]
367
+ except ImportError as exc:
368
+ raise ImportError(
369
+ "HFDatasetsLoader requires the optional 'datasets' package. "
370
+ "Install with: pip install datasets"
371
+ ) from exc
372
+ if self.config_name is not None:
373
+ return cast(Mapping[str, Any], load_dataset(self.repo_id, name=self.config_name))
374
+ return cast(Mapping[str, Any], load_dataset(self.repo_id))
375
+
376
+ def load_splits(self) -> dict[str, EvalSlice]:
377
+ """Convert each requested HF split to an :class:`EvalSlice`.
378
+
379
+ Raises
380
+ ------
381
+ KeyError
382
+ If any split's pandas DataFrame is missing ``feature_col`` or
383
+ ``label_col``.
384
+ """
385
+ ds = self._load_dataset()
386
+ ds_splits = list(ds.keys()) if self.splits is None else list(self.splits)
387
+ out: dict[str, EvalSlice] = {}
388
+ for split_name in ds_splits:
389
+ sub = ds[split_name]
390
+ df = sub.to_pandas()
391
+ for col in (self.feature_col, self.label_col):
392
+ if col not in df.columns:
393
+ raise KeyError(
394
+ f"HFDatasetsLoader: split {split_name!r}: missing column "
395
+ f"{col!r}; available: {list(df.columns)}"
396
+ )
397
+ out[split_name] = EvalSlice(
398
+ name=split_name,
399
+ df=df,
400
+ description=self.description,
401
+ feature_col=self.feature_col,
402
+ label_col=self.label_col,
403
+ strata_col=self.strata_col,
404
+ )
405
+ return out
406
+
407
+ def describe(self) -> dict[str, object]:
408
+ """Croissant-subset metadata pointing at the HF repo (no file hashes — HF caches)."""
409
+ return {
410
+ "name": self.name or self.repo_id,
411
+ "description": self.description,
412
+ "citeAs": self.cite_as,
413
+ "license": self.license,
414
+ "url": self.url or f"https://huggingface.co/datasets/{self.repo_id}",
415
+ "distribution": [
416
+ {
417
+ "name": f"hf:{self.repo_id}",
418
+ "contentUrl": f"https://huggingface.co/datasets/{self.repo_id}",
419
+ "sha256": "", # HF cache hash not exposed via the public API
420
+ "contentSize": 0,
421
+ }
422
+ ],
423
+ "config_name": self.config_name,
424
+ }