data-foundry 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_foundry-0.0.2 → data_foundry-0.0.4}/PKG-INFO +2 -3
- {data_foundry-0.0.2 → data_foundry-0.0.4}/README.md +1 -2
- {data_foundry-0.0.2 → data_foundry-0.0.4}/pyproject.toml +2 -2
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/__init__.py +2 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_sources.py +45 -3
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/curation_container.py +113 -31
- data_foundry-0.0.4/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/toy_extra.parquet +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/__init__.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_core.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_registry.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/curation_recommendations.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/dataset_checks.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/__init__.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/container_metadata.json +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset.parquet +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset_metadata.dataset-mold-v1.json +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dtypes.json +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/experiment_metadata.predictive-ml-splits-mold-v1.json +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/task_metadata.predictive-ml-task-mold-v1.json +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/schema.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/utils/__init__.py +0 -0
- {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/utils/checksum.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: data-foundry
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
|
|
5
5
|
Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
|
|
6
6
|
Author: TabArena Maintainers
|
|
@@ -64,7 +64,7 @@ Description-Content-Type: text/markdown
|
|
|
64
64
|
|
|
65
65
|
- A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
|
|
66
66
|
- A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
|
|
67
|
-
- A **collections API** that pins ``(unique_name, uuid)``
|
|
67
|
+
- A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
|
|
68
68
|
|
|
69
69
|
## ⚡ Quickstart
|
|
70
70
|
|
|
@@ -347,7 +347,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
|
|
|
347
347
|
## 📄 Citation
|
|
348
348
|
|
|
349
349
|
**PLACEHOLDER**
|
|
350
|
-
📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
|
|
351
350
|
|
|
352
351
|
```bibtex
|
|
353
352
|
PLACEHOLDER
|
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
|
|
12
12
|
- A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
|
|
13
13
|
- A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
|
|
14
|
-
- A **collections API** that pins ``(unique_name, uuid)``
|
|
14
|
+
- A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
|
|
15
15
|
|
|
16
16
|
## ⚡ Quickstart
|
|
17
17
|
|
|
@@ -294,7 +294,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
|
|
|
294
294
|
## 📄 Citation
|
|
295
295
|
|
|
296
296
|
**PLACEHOLDER**
|
|
297
|
-
📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
|
|
298
297
|
|
|
299
298
|
```bibtex
|
|
300
299
|
PLACEHOLDER
|
|
@@ -4,7 +4,7 @@ build-backend = "uv_build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "data-foundry"
|
|
7
|
-
version = "0.0.2"
|
|
7
|
+
version = "0.0.4"
|
|
8
8
|
description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -195,4 +195,4 @@ force-wrap-aliases = true
|
|
|
195
195
|
convention = "google"
|
|
196
196
|
|
|
197
197
|
[tool.ruff.lint.pylint]
|
|
198
|
-
max-args = 10
|
|
198
|
+
max-args = 10
|
|
@@ -14,6 +14,7 @@ from data_foundry.collections._sources import (
|
|
|
14
14
|
DEFAULT_CACHE_DIR,
|
|
15
15
|
DataSource,
|
|
16
16
|
HuggingFaceSource,
|
|
17
|
+
LocalWarehouseSource,
|
|
17
18
|
clear_cache,
|
|
18
19
|
resolve_cache_dir,
|
|
19
20
|
)
|
|
@@ -26,6 +27,7 @@ __all__ = [
|
|
|
26
27
|
"DataSource",
|
|
27
28
|
"DatasetCollection",
|
|
28
29
|
"HuggingFaceSource",
|
|
30
|
+
"LocalWarehouseSource",
|
|
29
31
|
"clear_cache",
|
|
30
32
|
"get_collection",
|
|
31
33
|
"list_collections",
|
|
@@ -5,9 +5,11 @@ e.g. a Hugging Face dataset repo, an S3 bucket, or a directory you already
|
|
|
5
5
|
have on disk. The collection asks the source to ``fetch`` an entry; the source
|
|
6
6
|
returns a local path that :meth:`CuratedContainer.load` can read.
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
Two sources ship with the package: :class:`HuggingFaceSource` (download from
|
|
9
|
+
a Hub dataset repo) and :class:`LocalWarehouseSource` (point at a directory
|
|
10
|
+
that already mirrors the warehouse layout). The abstraction is designed so
|
|
11
|
+
additional sources (URL/S3/etc.) can slot in without touching the rest of
|
|
12
|
+
the package.
|
|
11
13
|
"""
|
|
12
14
|
|
|
13
15
|
from __future__ import annotations
|
|
@@ -117,6 +119,46 @@ class DataSource:
|
|
|
117
119
|
return [self.fetch(e, cache_dir, force_download=force_download) for e in entries]
|
|
118
120
|
|
|
119
121
|
|
|
122
|
+
@dataclass(frozen=True)
|
|
123
|
+
class LocalWarehouseSource(DataSource):
|
|
124
|
+
"""Source backed by a pre-populated local warehouse directory.
|
|
125
|
+
|
|
126
|
+
Use this when the curated containers already live on disk in the standard
|
|
127
|
+
warehouse layout (``<base_dir>/<unique_name>/[versions/]<uuid>/``) and no
|
|
128
|
+
download is required — e.g. a shared filesystem on a compute cluster, or a
|
|
129
|
+
locally-curated set of containers that has not been published to the Hub.
|
|
130
|
+
|
|
131
|
+
``fetch`` is a no-op pointer lookup: it returns the entry's path under
|
|
132
|
+
``base_dir`` (and validates that the directory exists). ``cache_dir`` and
|
|
133
|
+
``force_download`` are ignored because nothing is cached or downloaded.
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
base_dir: Path
|
|
137
|
+
"""Root of the local warehouse — the directory that contains
|
|
138
|
+
``<unique_name>/`` subfolders."""
|
|
139
|
+
|
|
140
|
+
def fetch(
|
|
141
|
+
self,
|
|
142
|
+
entry: CollectionEntry,
|
|
143
|
+
cache_dir: Path, # noqa: ARG002 — kept to match DataSource interface
|
|
144
|
+
*,
|
|
145
|
+
force_download: bool = False, # noqa: ARG002 — same reason
|
|
146
|
+
) -> Path:
|
|
147
|
+
"""Return the on-disk path for ``entry`` under :attr:`base_dir`.
|
|
148
|
+
|
|
149
|
+
Raises ``FileNotFoundError`` if the container directory is missing.
|
|
150
|
+
``cache_dir`` and ``force_download`` are ignored.
|
|
151
|
+
"""
|
|
152
|
+
path = entry.local_path(self.base_dir)
|
|
153
|
+
if not path.is_dir():
|
|
154
|
+
raise FileNotFoundError(
|
|
155
|
+
f"No curated container at {path}. Expected a directory "
|
|
156
|
+
f"matching entry {entry.relative_path.as_posix()!r} under the "
|
|
157
|
+
f"local warehouse {self.base_dir!s}.",
|
|
158
|
+
)
|
|
159
|
+
return path
|
|
160
|
+
|
|
161
|
+
|
|
120
162
|
@dataclass(frozen=True)
|
|
121
163
|
class HuggingFaceSource(DataSource):
|
|
122
164
|
"""Source backed by a Hugging Face Hub dataset repository.
|
|
@@ -5,6 +5,7 @@ import json
|
|
|
5
5
|
import logging
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
|
+
from typing import ClassVar
|
|
8
9
|
|
|
9
10
|
import pandas as pd
|
|
10
11
|
from pydantic import TypeAdapter
|
|
@@ -35,6 +36,16 @@ NoIndentMetadata = [
|
|
|
35
36
|
class CuratedContainer:
|
|
36
37
|
"""Schema for a collection of curated items, ready to be used by others."""
|
|
37
38
|
|
|
39
|
+
_RESERVED_EXTRA_FILENAMES: ClassVar[frozenset[str]] = frozenset(
|
|
40
|
+
{
|
|
41
|
+
"dataset.parquet",
|
|
42
|
+
"dtypes.json",
|
|
43
|
+
"test_dataset.parquet",
|
|
44
|
+
"test_dtypes.json",
|
|
45
|
+
"container_metadata.json",
|
|
46
|
+
}
|
|
47
|
+
)
|
|
48
|
+
|
|
38
49
|
dataset: pd.DataFrame
|
|
39
50
|
"""The curated dataset as a pandas DataFrame."""
|
|
40
51
|
dataset_metadata: DatasetMetadata
|
|
@@ -134,10 +145,7 @@ class CuratedContainer:
|
|
|
134
145
|
numeric_cols = feature_df.select_dtypes(include=["number"], exclude=["bool"]).columns
|
|
135
146
|
categorical_cols = feature_df.select_dtypes(include=["category", "bool"]).columns
|
|
136
147
|
datetime_cols = list(feature_df.select_dtypes(include=["datetime", "datetimetz"]).columns)
|
|
137
|
-
datetime_cols += [
|
|
138
|
-
c for c in feature_df.columns
|
|
139
|
-
if isinstance(feature_df[c].dtype, pd.PeriodDtype)
|
|
140
|
-
]
|
|
148
|
+
datetime_cols += [c for c in feature_df.columns if isinstance(feature_df[c].dtype, pd.PeriodDtype)]
|
|
141
149
|
text_cols = feature_df.select_dtypes(include=["string"]).columns
|
|
142
150
|
|
|
143
151
|
return {
|
|
@@ -160,12 +168,14 @@ class CuratedContainer:
|
|
|
160
168
|
checksum_short = (
|
|
161
169
|
(self.checksum or "")[:16] + "…" if self.checksum and len(self.checksum) > 16 else self.checksum
|
|
162
170
|
)
|
|
163
|
-
return "\n".join(
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
171
|
+
return "\n".join(
|
|
172
|
+
[
|
|
173
|
+
"CuratedContainer:",
|
|
174
|
+
f" unique_name: {self.dataset_metadata.unique_name}",
|
|
175
|
+
f" uuid: {uuid_short}",
|
|
176
|
+
f" checksum: {checksum_short}",
|
|
177
|
+
]
|
|
178
|
+
)
|
|
169
179
|
|
|
170
180
|
def describe_dataset(self) -> str:
|
|
171
181
|
"""Return the DataFrame summary: shape and feature-dtype counts.
|
|
@@ -179,16 +189,18 @@ class CuratedContainer:
|
|
|
179
189
|
|
|
180
190
|
counts = self._feature_dtype_counts()
|
|
181
191
|
target = self.task_metadata.target_column_name
|
|
182
|
-
return "\n".join(
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
+
return "\n".join(
|
|
193
|
+
[
|
|
194
|
+
"Dataset:",
|
|
195
|
+
f" shape: {self.dataset.shape}",
|
|
196
|
+
f" feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
|
|
197
|
+
f" numeric: {counts['numeric']}",
|
|
198
|
+
f" categorical: {counts['categorical']}",
|
|
199
|
+
f" datetime: {counts['datetime']}",
|
|
200
|
+
f" text: {counts['text']}",
|
|
201
|
+
f" binary: {counts['binary']}",
|
|
202
|
+
]
|
|
203
|
+
)
|
|
192
204
|
|
|
193
205
|
def describe(self) -> str:
|
|
194
206
|
"""Return a high-level summary of the container.
|
|
@@ -197,17 +209,19 @@ class CuratedContainer:
|
|
|
197
209
|
(shape + dtype counts), and the per-section :meth:`describe` outputs
|
|
198
210
|
of the dataset, task, and experiment metadata objects.
|
|
199
211
|
"""
|
|
200
|
-
return "\n".join(
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
212
|
+
return "\n".join(
|
|
213
|
+
[
|
|
214
|
+
self.describe_container(),
|
|
215
|
+
"",
|
|
216
|
+
self.describe_dataset(),
|
|
217
|
+
"",
|
|
218
|
+
self.dataset_metadata.describe(),
|
|
219
|
+
"",
|
|
220
|
+
self.task_metadata.describe(),
|
|
221
|
+
"",
|
|
222
|
+
self.experiment_metadata.describe(),
|
|
223
|
+
]
|
|
224
|
+
)
|
|
211
225
|
|
|
212
226
|
@staticmethod
|
|
213
227
|
def _save_dtypes(df: pd.DataFrame, path: Path) -> None:
|
|
@@ -366,3 +380,71 @@ class CuratedContainer:
|
|
|
366
380
|
loaded_from_path=path,
|
|
367
381
|
**container_metadata,
|
|
368
382
|
)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
# --- Extra (non-core) artifacts ---------------------------------------------------
|
|
386
|
+
def _resolve_extras_dir(self, path: Path | str | None) -> Path:
|
|
387
|
+
"""Return the directory to look for extra artifacts in.
|
|
388
|
+
|
|
389
|
+
Falls back to ``loaded_from_path`` if ``path`` is omitted.
|
|
390
|
+
"""
|
|
391
|
+
if path is not None:
|
|
392
|
+
return Path(path)
|
|
393
|
+
if self.loaded_from_path is not None:
|
|
394
|
+
return self.loaded_from_path
|
|
395
|
+
raise ValueError(
|
|
396
|
+
"Container has no `loaded_from_path` — pass `path=...` to locate extra files.",
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
def extra_file_path(self, filename: str, *, path: Path | str | None = None) -> Path:
|
|
400
|
+
"""Resolve the path of an extra artifact alongside the container.
|
|
401
|
+
|
|
402
|
+
Extra artifacts are any files a producer ships in the container directory beyond
|
|
403
|
+
the six core files (``dataset.parquet``, ``dtypes.json``, ``container_metadata.json``
|
|
404
|
+
and the three ``*.{type_id}.json`` metadata files) and the optional test-dataset pair.
|
|
405
|
+
Data Foundry does not interpret their contents — it only resolves the path so callers
|
|
406
|
+
can load them however they need (e.g. ``pd.read_parquet``, ``json.load``, ``np.load``).
|
|
407
|
+
|
|
408
|
+
The returned path is not guaranteed to exist — use :meth:`has_extra_file` first.
|
|
409
|
+
``filename`` must be a bare file name (no directory separators).
|
|
410
|
+
"""
|
|
411
|
+
if not filename or "/" in filename or "\\" in filename or filename in {".", ".."}:
|
|
412
|
+
raise ValueError(f"Extra filename must be a bare file name, got {filename!r}.")
|
|
413
|
+
if filename in self._RESERVED_EXTRA_FILENAMES:
|
|
414
|
+
raise ValueError(
|
|
415
|
+
f"{filename!r} is a core container file; use the dedicated load API instead "
|
|
416
|
+
"(e.g. `CuratedContainer.load(...)` or `load_test_dataset()`).",
|
|
417
|
+
)
|
|
418
|
+
return self._resolve_extras_dir(path) / filename
|
|
419
|
+
|
|
420
|
+
def has_extra_file(self, filename: str, *, path: Path | str | None = None) -> bool:
|
|
421
|
+
"""Return whether an extra artifact named ``filename`` exists next to the container."""
|
|
422
|
+
try:
|
|
423
|
+
return self.extra_file_path(filename, path=path).is_file()
|
|
424
|
+
except ValueError:
|
|
425
|
+
return False
|
|
426
|
+
|
|
427
|
+
def list_extra_files(self, *, path: Path | str | None = None) -> list[str]:
|
|
428
|
+
"""Return the sorted list of extra-artifact file names present next to the container.
|
|
429
|
+
|
|
430
|
+
Excludes the core six files and the optional test-dataset pair. Metadata JSON files
|
|
431
|
+
(``<name>.<type_id>.json`` — two dots before ``.json``) are also excluded.
|
|
432
|
+
Returns ``[]`` if the directory does not exist or holds no extras.
|
|
433
|
+
"""
|
|
434
|
+
try:
|
|
435
|
+
base = self._resolve_extras_dir(path)
|
|
436
|
+
except ValueError:
|
|
437
|
+
return []
|
|
438
|
+
if not base.is_dir():
|
|
439
|
+
return []
|
|
440
|
+
extras: list[str] = []
|
|
441
|
+
for entry in sorted(base.iterdir()):
|
|
442
|
+
if not entry.is_file():
|
|
443
|
+
continue
|
|
444
|
+
name = entry.name
|
|
445
|
+
if name in self._RESERVED_EXTRA_FILENAMES:
|
|
446
|
+
continue
|
|
447
|
+
if name.endswith(".json") and name.count(".") >= 2:
|
|
448
|
+
continue
|
|
449
|
+
extras.append(name)
|
|
450
|
+
return extras
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|