data-foundry 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {data_foundry-0.0.2 → data_foundry-0.0.4}/PKG-INFO +2 -3
  2. {data_foundry-0.0.2 → data_foundry-0.0.4}/README.md +1 -2
  3. {data_foundry-0.0.2 → data_foundry-0.0.4}/pyproject.toml +2 -2
  4. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/__init__.py +2 -0
  5. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_sources.py +45 -3
  6. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/curation_container.py +113 -31
  7. data_foundry-0.0.4/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/toy_extra.parquet +0 -0
  8. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/__init__.py +0 -0
  9. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_core.py +0 -0
  10. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_registry.py +0 -0
  11. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/curation_recommendations.py +0 -0
  12. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/dataset_checks.py +0 -0
  13. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/__init__.py +0 -0
  14. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/container_metadata.json +0 -0
  15. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset.parquet +0 -0
  16. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset_metadata.dataset-mold-v1.json +0 -0
  17. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dtypes.json +0 -0
  18. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/experiment_metadata.predictive-ml-splits-mold-v1.json +0 -0
  19. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/task_metadata.predictive-ml-task-mold-v1.json +0 -0
  20. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/schema.py +0 -0
  21. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/utils/__init__.py +0 -0
  22. {data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/utils/checksum.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: data-foundry
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
5
5
  Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
6
6
  Author: TabArena Maintainers
@@ -64,7 +64,7 @@ Description-Content-Type: text/markdown
64
64
 
65
65
  - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
66
66
  - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
67
- - A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
67
+ - A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
68
68
 
69
69
  ## ⚡ Quickstart
70
70
 
@@ -347,7 +347,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
347
347
  ## 📄 Citation
348
348
 
349
349
  **PLACEHOLDER**
350
- 📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
351
350
 
352
351
  ```bibtex
353
352
  PLACEHOLDER
@@ -11,7 +11,7 @@
11
11
 
12
12
  - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
13
13
  - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
14
- - A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
14
+ - A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
15
15
 
16
16
  ## ⚡ Quickstart
17
17
 
@@ -294,7 +294,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
294
294
  ## 📄 Citation
295
295
 
296
296
  **PLACEHOLDER**
297
- 📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
298
297
 
299
298
  ```bibtex
300
299
  PLACEHOLDER
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "data-foundry"
7
- version = "0.0.2"
7
+ version = "0.0.4"
8
8
  description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -195,4 +195,4 @@ force-wrap-aliases = true
195
195
  convention = "google"
196
196
 
197
197
  [tool.ruff.lint.pylint]
198
- max-args = 10
198
+ max-args = 10
@@ -14,6 +14,7 @@ from data_foundry.collections._sources import (
14
14
  DEFAULT_CACHE_DIR,
15
15
  DataSource,
16
16
  HuggingFaceSource,
17
+ LocalWarehouseSource,
17
18
  clear_cache,
18
19
  resolve_cache_dir,
19
20
  )
@@ -26,6 +27,7 @@ __all__ = [
26
27
  "DataSource",
27
28
  "DatasetCollection",
28
29
  "HuggingFaceSource",
30
+ "LocalWarehouseSource",
29
31
  "clear_cache",
30
32
  "get_collection",
31
33
  "list_collections",
@@ -5,9 +5,11 @@ e.g. a Hugging Face dataset repo, an S3 bucket, or a directory you already
5
5
  have on disk. The collection asks the source to ``fetch`` an entry; the source
6
6
  returns a local path that :meth:`CuratedContainer.load` can read.
7
7
 
8
- Currently only :class:`HuggingFaceSource` is implemented, but the abstraction
9
- is designed so additional sources (URL/S3/local) can slot in without touching
10
- the rest of the package.
8
+ Two sources ship with the package: :class:`HuggingFaceSource` (download from
9
+ a Hub dataset repo) and :class:`LocalWarehouseSource` (point at a directory
10
+ that already mirrors the warehouse layout). The abstraction is designed so
11
+ additional sources (URL/S3/etc.) can slot in without touching the rest of
12
+ the package.
11
13
  """
12
14
 
13
15
  from __future__ import annotations
@@ -117,6 +119,46 @@ class DataSource:
117
119
  return [self.fetch(e, cache_dir, force_download=force_download) for e in entries]
118
120
 
119
121
 
122
+ @dataclass(frozen=True)
123
+ class LocalWarehouseSource(DataSource):
124
+ """Source backed by a pre-populated local warehouse directory.
125
+
126
+ Use this when the curated containers already live on disk in the standard
127
+ warehouse layout (``<base_dir>/<unique_name>/[versions/]<uuid>/``) and no
128
+ download is required — e.g. a shared filesystem on a compute cluster, or a
129
+ locally-curated set of containers that has not been published to the Hub.
130
+
131
+ ``fetch`` is a no-op pointer lookup: it returns the entry's path under
132
+ ``base_dir`` (and validates that the directory exists). ``cache_dir`` and
133
+ ``force_download`` are ignored because nothing is cached or downloaded.
134
+ """
135
+
136
+ base_dir: Path
137
+ """Root of the local warehouse — the directory that contains
138
+ ``<unique_name>/`` subfolders."""
139
+
140
+ def fetch(
141
+ self,
142
+ entry: CollectionEntry,
143
+ cache_dir: Path, # noqa: ARG002 — kept to match DataSource interface
144
+ *,
145
+ force_download: bool = False, # noqa: ARG002 — same reason
146
+ ) -> Path:
147
+ """Return the on-disk path for ``entry`` under :attr:`base_dir`.
148
+
149
+ Raises ``FileNotFoundError`` if the container directory is missing.
150
+ ``cache_dir`` and ``force_download`` are ignored.
151
+ """
152
+ path = entry.local_path(self.base_dir)
153
+ if not path.is_dir():
154
+ raise FileNotFoundError(
155
+ f"No curated container at {path}. Expected a directory "
156
+ f"matching entry {entry.relative_path.as_posix()!r} under the "
157
+ f"local warehouse {self.base_dir!s}.",
158
+ )
159
+ return path
160
+
161
+
120
162
  @dataclass(frozen=True)
121
163
  class HuggingFaceSource(DataSource):
122
164
  """Source backed by a Hugging Face Hub dataset repository.
@@ -5,6 +5,7 @@ import json
5
5
  import logging
6
6
  from dataclasses import dataclass
7
7
  from pathlib import Path
8
+ from typing import ClassVar
8
9
 
9
10
  import pandas as pd
10
11
  from pydantic import TypeAdapter
@@ -35,6 +36,16 @@ NoIndentMetadata = [
35
36
  class CuratedContainer:
36
37
  """Schema for a collection of curated items, ready to be used by others."""
37
38
 
39
+ _RESERVED_EXTRA_FILENAMES: ClassVar[frozenset[str]] = frozenset(
40
+ {
41
+ "dataset.parquet",
42
+ "dtypes.json",
43
+ "test_dataset.parquet",
44
+ "test_dtypes.json",
45
+ "container_metadata.json",
46
+ }
47
+ )
48
+
38
49
  dataset: pd.DataFrame
39
50
  """The curated dataset as a pandas DataFrame."""
40
51
  dataset_metadata: DatasetMetadata
@@ -134,10 +145,7 @@ class CuratedContainer:
134
145
  numeric_cols = feature_df.select_dtypes(include=["number"], exclude=["bool"]).columns
135
146
  categorical_cols = feature_df.select_dtypes(include=["category", "bool"]).columns
136
147
  datetime_cols = list(feature_df.select_dtypes(include=["datetime", "datetimetz"]).columns)
137
- datetime_cols += [
138
- c for c in feature_df.columns
139
- if isinstance(feature_df[c].dtype, pd.PeriodDtype)
140
- ]
148
+ datetime_cols += [c for c in feature_df.columns if isinstance(feature_df[c].dtype, pd.PeriodDtype)]
141
149
  text_cols = feature_df.select_dtypes(include=["string"]).columns
142
150
 
143
151
  return {
@@ -160,12 +168,14 @@ class CuratedContainer:
160
168
  checksum_short = (
161
169
  (self.checksum or "")[:16] + "…" if self.checksum and len(self.checksum) > 16 else self.checksum
162
170
  )
163
- return "\n".join([
164
- "CuratedContainer:",
165
- f" unique_name: {self.dataset_metadata.unique_name}",
166
- f" uuid: {uuid_short}",
167
- f" checksum: {checksum_short}",
168
- ])
171
+ return "\n".join(
172
+ [
173
+ "CuratedContainer:",
174
+ f" unique_name: {self.dataset_metadata.unique_name}",
175
+ f" uuid: {uuid_short}",
176
+ f" checksum: {checksum_short}",
177
+ ]
178
+ )
169
179
 
170
180
  def describe_dataset(self) -> str:
171
181
  """Return the DataFrame summary: shape and feature-dtype counts.
@@ -179,16 +189,18 @@ class CuratedContainer:
179
189
 
180
190
  counts = self._feature_dtype_counts()
181
191
  target = self.task_metadata.target_column_name
182
- return "\n".join([
183
- "Dataset:",
184
- f" shape: {self.dataset.shape}",
185
- f" feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
186
- f" numeric: {counts['numeric']}",
187
- f" categorical: {counts['categorical']}",
188
- f" datetime: {counts['datetime']}",
189
- f" text: {counts['text']}",
190
- f" binary: {counts['binary']}",
191
- ])
192
+ return "\n".join(
193
+ [
194
+ "Dataset:",
195
+ f" shape: {self.dataset.shape}",
196
+ f" feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
197
+ f" numeric: {counts['numeric']}",
198
+ f" categorical: {counts['categorical']}",
199
+ f" datetime: {counts['datetime']}",
200
+ f" text: {counts['text']}",
201
+ f" binary: {counts['binary']}",
202
+ ]
203
+ )
192
204
 
193
205
  def describe(self) -> str:
194
206
  """Return a high-level summary of the container.
@@ -197,17 +209,19 @@ class CuratedContainer:
197
209
  (shape + dtype counts), and the per-section :meth:`describe` outputs
198
210
  of the dataset, task, and experiment metadata objects.
199
211
  """
200
- return "\n".join([
201
- self.describe_container(),
202
- "",
203
- self.describe_dataset(),
204
- "",
205
- self.dataset_metadata.describe(),
206
- "",
207
- self.task_metadata.describe(),
208
- "",
209
- self.experiment_metadata.describe(),
210
- ])
212
+ return "\n".join(
213
+ [
214
+ self.describe_container(),
215
+ "",
216
+ self.describe_dataset(),
217
+ "",
218
+ self.dataset_metadata.describe(),
219
+ "",
220
+ self.task_metadata.describe(),
221
+ "",
222
+ self.experiment_metadata.describe(),
223
+ ]
224
+ )
211
225
 
212
226
  @staticmethod
213
227
  def _save_dtypes(df: pd.DataFrame, path: Path) -> None:
@@ -366,3 +380,71 @@ class CuratedContainer:
366
380
  loaded_from_path=path,
367
381
  **container_metadata,
368
382
  )
383
+
384
+
385
+ # --- Extra (non-core) artifacts ---------------------------------------------------
386
+ def _resolve_extras_dir(self, path: Path | str | None) -> Path:
387
+ """Return the directory to look for extra artifacts in.
388
+
389
+ Falls back to ``loaded_from_path`` if ``path`` is omitted.
390
+ """
391
+ if path is not None:
392
+ return Path(path)
393
+ if self.loaded_from_path is not None:
394
+ return self.loaded_from_path
395
+ raise ValueError(
396
+ "Container has no `loaded_from_path` — pass `path=...` to locate extra files.",
397
+ )
398
+
399
+ def extra_file_path(self, filename: str, *, path: Path | str | None = None) -> Path:
400
+ """Resolve the path of an extra artifact alongside the container.
401
+
402
+ Extra artifacts are any files a producer ships in the container directory beyond
403
+ the six core files (``dataset.parquet``, ``dtypes.json``, ``container_metadata.json``
404
+ and the three ``*.{type_id}.json`` metadata files) and the optional test-dataset pair.
405
+ Data Foundry does not interpret their contents — it only resolves the path so callers
406
+ can load them however they need (e.g. ``pd.read_parquet``, ``json.load``, ``np.load``).
407
+
408
+ The returned path is not guaranteed to exist — use :meth:`has_extra_file` first.
409
+ ``filename`` must be a bare file name (no directory separators).
410
+ """
411
+ if not filename or "/" in filename or "\\" in filename or filename in {".", ".."}:
412
+ raise ValueError(f"Extra filename must be a bare file name, got {filename!r}.")
413
+ if filename in self._RESERVED_EXTRA_FILENAMES:
414
+ raise ValueError(
415
+ f"{filename!r} is a core container file; use the dedicated load API instead "
416
+ "(e.g. `CuratedContainer.load(...)` or `load_test_dataset()`).",
417
+ )
418
+ return self._resolve_extras_dir(path) / filename
419
+
420
+ def has_extra_file(self, filename: str, *, path: Path | str | None = None) -> bool:
421
+ """Return whether an extra artifact named ``filename`` exists next to the container."""
422
+ try:
423
+ return self.extra_file_path(filename, path=path).is_file()
424
+ except ValueError:
425
+ return False
426
+
427
+ def list_extra_files(self, *, path: Path | str | None = None) -> list[str]:
428
+ """Return the sorted list of extra-artifact file names present next to the container.
429
+
430
+ Excludes the core six files and the optional test-dataset pair. Metadata JSON files
431
+ (``<name>.<type_id>.json`` — any ``.json`` name containing at least two dots, counting the one in ``.json``) are also excluded.
432
+ Returns ``[]`` if the directory does not exist or holds no extras.
433
+ """
434
+ try:
435
+ base = self._resolve_extras_dir(path)
436
+ except ValueError:
437
+ return []
438
+ if not base.is_dir():
439
+ return []
440
+ extras: list[str] = []
441
+ for entry in sorted(base.iterdir()):
442
+ if not entry.is_file():
443
+ continue
444
+ name = entry.name
445
+ if name in self._RESERVED_EXTRA_FILENAMES:
446
+ continue
447
+ if name.endswith(".json") and name.count(".") >= 2:
448
+ continue
449
+ extras.append(name)
450
+ return extras