data-foundry 0.0.2__tar.gz → 0.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22):
  1. {data_foundry-0.0.2 → data_foundry-0.0.3}/PKG-INFO +2 -3
  2. {data_foundry-0.0.2 → data_foundry-0.0.3}/README.md +1 -2
  3. {data_foundry-0.0.2 → data_foundry-0.0.3}/pyproject.toml +2 -2
  4. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/curation_container.py +113 -31
  5. data_foundry-0.0.3/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/toy_extra.parquet +0 -0
  6. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/__init__.py +0 -0
  7. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/collections/__init__.py +0 -0
  8. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/collections/_core.py +0 -0
  9. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/collections/_registry.py +0 -0
  10. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/collections/_sources.py +0 -0
  11. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/curation_recommendations.py +0 -0
  12. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/dataset_checks.py +0 -0
  13. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/__init__.py +0 -0
  14. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/container_metadata.json +0 -0
  15. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset.parquet +0 -0
  16. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset_metadata.dataset-mold-v1.json +0 -0
  17. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dtypes.json +0 -0
  18. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/experiment_metadata.predictive-ml-splits-mold-v1.json +0 -0
  19. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/task_metadata.predictive-ml-task-mold-v1.json +0 -0
  20. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/schema.py +0 -0
  21. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/utils/__init__.py +0 -0
  22. {data_foundry-0.0.2 → data_foundry-0.0.3}/src/data_foundry/utils/checksum.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: data-foundry
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
5
5
  Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
6
6
  Author: TabArena Maintainers
@@ -64,7 +64,7 @@ Description-Content-Type: text/markdown
64
64
 
65
65
  - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
66
66
  - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
67
- - A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
67
+ - A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
68
68
 
69
69
  ## ⚡ Quickstart
70
70
 
@@ -347,7 +347,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
347
347
  ## 📄 Citation
348
348
 
349
349
  **PLACEHOLDER**
350
- 📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
351
350
 
352
351
  ```bibtex
353
352
  PLACEHOLDER
@@ -11,7 +11,7 @@
11
11
 
12
12
  - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
13
13
  - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
14
- - A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
14
+ - A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
15
15
 
16
16
  ## ⚡ Quickstart
17
17
 
@@ -294,7 +294,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
294
294
  ## 📄 Citation
295
295
 
296
296
  **PLACEHOLDER**
297
- 📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
298
297
 
299
298
  ```bibtex
300
299
  PLACEHOLDER
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "data-foundry"
7
- version = "0.0.2"
7
+ version = "0.0.3"
8
8
  description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -195,4 +195,4 @@ force-wrap-aliases = true
195
195
  convention = "google"
196
196
 
197
197
  [tool.ruff.lint.pylint]
198
- max-args = 10
198
+ max-args = 10
@@ -5,6 +5,7 @@ import json
5
5
  import logging
6
6
  from dataclasses import dataclass
7
7
  from pathlib import Path
8
+ from typing import ClassVar
8
9
 
9
10
  import pandas as pd
10
11
  from pydantic import TypeAdapter
@@ -35,6 +36,16 @@ NoIndentMetadata = [
35
36
  class CuratedContainer:
36
37
  """Schema for a collection of curated items, ready to be used by others."""
37
38
 
39
+ _RESERVED_EXTRA_FILENAMES: ClassVar[frozenset[str]] = frozenset(
40
+ {
41
+ "dataset.parquet",
42
+ "dtypes.json",
43
+ "test_dataset.parquet",
44
+ "test_dtypes.json",
45
+ "container_metadata.json",
46
+ }
47
+ )
48
+
38
49
  dataset: pd.DataFrame
39
50
  """The curated dataset as a pandas DataFrame."""
40
51
  dataset_metadata: DatasetMetadata
@@ -134,10 +145,7 @@ class CuratedContainer:
134
145
  numeric_cols = feature_df.select_dtypes(include=["number"], exclude=["bool"]).columns
135
146
  categorical_cols = feature_df.select_dtypes(include=["category", "bool"]).columns
136
147
  datetime_cols = list(feature_df.select_dtypes(include=["datetime", "datetimetz"]).columns)
137
- datetime_cols += [
138
- c for c in feature_df.columns
139
- if isinstance(feature_df[c].dtype, pd.PeriodDtype)
140
- ]
148
+ datetime_cols += [c for c in feature_df.columns if isinstance(feature_df[c].dtype, pd.PeriodDtype)]
141
149
  text_cols = feature_df.select_dtypes(include=["string"]).columns
142
150
 
143
151
  return {
@@ -160,12 +168,14 @@ class CuratedContainer:
160
168
  checksum_short = (
161
169
  (self.checksum or "")[:16] + "…" if self.checksum and len(self.checksum) > 16 else self.checksum
162
170
  )
163
- return "\n".join([
164
- "CuratedContainer:",
165
- f" unique_name: {self.dataset_metadata.unique_name}",
166
- f" uuid: {uuid_short}",
167
- f" checksum: {checksum_short}",
168
- ])
171
+ return "\n".join(
172
+ [
173
+ "CuratedContainer:",
174
+ f" unique_name: {self.dataset_metadata.unique_name}",
175
+ f" uuid: {uuid_short}",
176
+ f" checksum: {checksum_short}",
177
+ ]
178
+ )
169
179
 
170
180
  def describe_dataset(self) -> str:
171
181
  """Return the DataFrame summary: shape and feature-dtype counts.
@@ -179,16 +189,18 @@ class CuratedContainer:
179
189
 
180
190
  counts = self._feature_dtype_counts()
181
191
  target = self.task_metadata.target_column_name
182
- return "\n".join([
183
- "Dataset:",
184
- f" shape: {self.dataset.shape}",
185
- f" feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
186
- f" numeric: {counts['numeric']}",
187
- f" categorical: {counts['categorical']}",
188
- f" datetime: {counts['datetime']}",
189
- f" text: {counts['text']}",
190
- f" binary: {counts['binary']}",
191
- ])
192
+ return "\n".join(
193
+ [
194
+ "Dataset:",
195
+ f" shape: {self.dataset.shape}",
196
+ f" feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
197
+ f" numeric: {counts['numeric']}",
198
+ f" categorical: {counts['categorical']}",
199
+ f" datetime: {counts['datetime']}",
200
+ f" text: {counts['text']}",
201
+ f" binary: {counts['binary']}",
202
+ ]
203
+ )
192
204
 
193
205
  def describe(self) -> str:
194
206
  """Return a high-level summary of the container.
@@ -197,17 +209,19 @@ class CuratedContainer:
197
209
  (shape + dtype counts), and the per-section :meth:`describe` outputs
198
210
  of the dataset, task, and experiment metadata objects.
199
211
  """
200
- return "\n".join([
201
- self.describe_container(),
202
- "",
203
- self.describe_dataset(),
204
- "",
205
- self.dataset_metadata.describe(),
206
- "",
207
- self.task_metadata.describe(),
208
- "",
209
- self.experiment_metadata.describe(),
210
- ])
212
+ return "\n".join(
213
+ [
214
+ self.describe_container(),
215
+ "",
216
+ self.describe_dataset(),
217
+ "",
218
+ self.dataset_metadata.describe(),
219
+ "",
220
+ self.task_metadata.describe(),
221
+ "",
222
+ self.experiment_metadata.describe(),
223
+ ]
224
+ )
211
225
 
212
226
  @staticmethod
213
227
  def _save_dtypes(df: pd.DataFrame, path: Path) -> None:
@@ -366,3 +380,71 @@ class CuratedContainer:
366
380
  loaded_from_path=path,
367
381
  **container_metadata,
368
382
  )
383
+
384
+
385
+ # --- Extra (non-core) artifacts ---------------------------------------------------
386
+ def _resolve_extras_dir(self, path: Path | str | None) -> Path:
387
+ """Return the directory to look for extra artifacts in.
388
+
389
+ Falls back to ``loaded_from_path`` if ``path`` is omitted.
390
+ """
391
+ if path is not None:
392
+ return Path(path)
393
+ if self.loaded_from_path is not None:
394
+ return self.loaded_from_path
395
+ raise ValueError(
396
+ "Container has no `loaded_from_path` — pass `path=...` to locate extra files.",
397
+ )
398
+
399
+ def extra_file_path(self, filename: str, *, path: Path | str | None = None) -> Path:
400
+ """Resolve the path of an extra artifact alongside the container.
401
+
402
+ Extra artifacts are any files a producer ships in the container directory beyond
403
+ the six core files (``dataset.parquet``, ``dtypes.json``, ``container_metadata.json``
404
+ and the three ``*.{type_id}.json`` metadata files) and the optional test-dataset pair.
405
+ Data Foundry does not interpret their contents — it only resolves the path so callers
406
+ can load them however they need (e.g. ``pd.read_parquet``, ``json.load``, ``np.load``).
407
+
408
+ The returned path is not guaranteed to exist — use :meth:`has_extra_file` first.
409
+ ``filename`` must be a bare file name (no directory separators).
410
+ """
411
+ if not filename or "/" in filename or "\\" in filename or filename in {".", ".."}:
412
+ raise ValueError(f"Extra filename must be a bare file name, got {filename!r}.")
413
+ if filename in self._RESERVED_EXTRA_FILENAMES:
414
+ raise ValueError(
415
+ f"{filename!r} is a core container file; use the dedicated load API instead "
416
+ "(e.g. `CuratedContainer.load(...)` or `load_test_dataset()`).",
417
+ )
418
+ return self._resolve_extras_dir(path) / filename
419
+
420
+ def has_extra_file(self, filename: str, *, path: Path | str | None = None) -> bool:
421
+ """Return whether an extra artifact named ``filename`` exists next to the container."""
422
+ try:
423
+ return self.extra_file_path(filename, path=path).is_file()
424
+ except ValueError:
425
+ return False
426
+
427
+ def list_extra_files(self, *, path: Path | str | None = None) -> list[str]:
428
+ """Return the sorted list of extra-artifact file names present next to the container.
429
+
430
+ Excludes the core six files and the optional test-dataset pair. Metadata JSON files
431
+ (``<name>.<type_id>.json`` — any ``.json`` file name containing at least two dots) are also excluded.
432
+ Returns ``[]`` if the directory does not exist or holds no extras.
433
+ """
434
+ try:
435
+ base = self._resolve_extras_dir(path)
436
+ except ValueError:
437
+ return []
438
+ if not base.is_dir():
439
+ return []
440
+ extras: list[str] = []
441
+ for entry in sorted(base.iterdir()):
442
+ if not entry.is_file():
443
+ continue
444
+ name = entry.name
445
+ if name in self._RESERVED_EXTRA_FILENAMES:
446
+ continue
447
+ if name.endswith(".json") and name.count(".") >= 2:
448
+ continue
449
+ extras.append(name)
450
+ return extras