PyPI - data-foundry - Versions diffs - 0.0.2__tar.gz → 0.0.4__tar.gz - Mend

data-foundry 0.0.2.tar.gz → 0.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

{data_foundry-0.0.2 → data_foundry-0.0.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: data-foundry
-Version: 0.0.2
+Version: 0.0.4
 Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
 Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
 Author: TabArena Maintainers
@@ -64,7 +64,7 @@ Description-Content-Type: text/markdown
 - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
 - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
-- A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
+- A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
 ## ⚡ Quickstart
@@ -347,7 +347,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
 ## 📄 Citation
 **PLACEHOLDER**
-📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
 ```bibtex
 PLACEHOLDER

{data_foundry-0.0.2 → data_foundry-0.0.4}/README.md RENAMED Viewed

@@ -11,7 +11,7 @@
 - A small, opinionated **schema** for tabular datasets, tasks (IID / temporal non-IID / grouped non-IID), and outer CV splits — aligned with OpenML where possible, extended where it had to be.
 - A **curation toolkit** (sanity checks, recommended-split helpers, dtype-preserving save/load) so a curator turns a raw download into a reproducible artifact in one notebook.
-- A **collections API** that pins ``(unique_name, uuid)`` pointers to immutable curated containers and resolves them against a local warehouse or directly against the BeyondArena Hugging Face mirror.
+- A **collections API** that pins datasets (defined by ``(unique_name, uuid)``) to immutable curated containers and resolves them against a local warehouse or directly against the [BeyondArena Datasets](https://huggingface.co/datasets/TabArena/BeyondArena).
 ## ⚡ Quickstart
@@ -294,7 +294,6 @@ gotchas, the `/new-dataset` Claude Code scaffolding skill): see
 ## 📄 Citation
 **PLACEHOLDER**
-📄 [arXiv:XXXX](https://arxiv.org/abs/XXX)
 ```bibtex
 PLACEHOLDER

{data_foundry-0.0.2 → data_foundry-0.0.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "uv_build"
 [project]
 name = "data-foundry"
-version = "0.0.2"
+version = "0.0.4"
 description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -195,4 +195,4 @@ force-wrap-aliases = true
 convention = "google"
 [tool.ruff.lint.pylint]
-max-args = 10
+max-args = 10

{data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/__init__.py RENAMED Viewed

@@ -14,6 +14,7 @@ from data_foundry.collections._sources import (
     DEFAULT_CACHE_DIR,
     DataSource,
     HuggingFaceSource,
+    LocalWarehouseSource,
     clear_cache,
     resolve_cache_dir,
 )
@@ -26,6 +27,7 @@ __all__ = [
     "DataSource",
     "DatasetCollection",
     "HuggingFaceSource",
+    "LocalWarehouseSource",
     "clear_cache",
     "get_collection",
     "list_collections",

{data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/collections/_sources.py RENAMED Viewed

@@ -5,9 +5,11 @@ e.g. a Hugging Face dataset repo, an S3 bucket, or a directory you already
 have on disk. The collection asks the source to ``fetch`` an entry; the source
 returns a local path that :meth:`CuratedContainer.load` can read.
-Currently only :class:`HuggingFaceSource` is implemented, but the abstraction
-is designed so additional sources (URL/S3/local) can slot in without touching
-the rest of the package.
+Two sources ship with the package: :class:`HuggingFaceSource` (download from
+a Hub dataset repo) and :class:`LocalWarehouseSource` (point at a directory
+that already mirrors the warehouse layout). The abstraction is designed so
+additional sources (URL/S3/etc.) can slot in without touching the rest of
+the package.
 """
 from __future__ import annotations
@@ -117,6 +119,46 @@ class DataSource:
         return [self.fetch(e, cache_dir, force_download=force_download) for e in entries]
+@dataclass(frozen=True)
+class LocalWarehouseSource(DataSource):
+    """Source backed by a pre-populated local warehouse directory.
+    Use this when the curated containers already live on disk in the standard
+    warehouse layout (``<base_dir>/<unique_name>/[versions/]<uuid>/``) and no
+    download is required — e.g. a shared filesystem on a compute cluster, or a
+    locally-curated set of containers that has not been published to the Hub.
+    ``fetch`` is a no-op pointer lookup: it returns the entry's path under
+    ``base_dir`` (and validates that the directory exists). ``cache_dir`` and
+    ``force_download`` are ignored because nothing is cached or downloaded.
+    """
+    base_dir: Path
+    """Root of the local warehouse — the directory that contains
+    ``<unique_name>/`` subfolders."""
+    def fetch(
+        self,
+        entry: CollectionEntry,
+        cache_dir: Path,  # noqa: ARG002 — kept to match DataSource interface
+        *,
+        force_download: bool = False,  # noqa: ARG002 — same reason
+    ) -> Path:
+        """Return the on-disk path for ``entry`` under :attr:`base_dir`.
+        Raises ``FileNotFoundError`` if the container directory is missing.
+        ``cache_dir`` and ``force_download`` are ignored.
+        """
+        path = entry.local_path(self.base_dir)
+        if not path.is_dir():
+            raise FileNotFoundError(
+                f"No curated container at {path}. Expected a directory "
+                f"matching entry {entry.relative_path.as_posix()!r} under the "
+                f"local warehouse {self.base_dir!s}.",
+            )
+        return path
 @dataclass(frozen=True)
 class HuggingFaceSource(DataSource):
     """Source backed by a Hugging Face Hub dataset repository.

{data_foundry-0.0.2 → data_foundry-0.0.4}/src/data_foundry/curation_container.py RENAMED Viewed

@@ -5,6 +5,7 @@ import json
 import logging
 from dataclasses import dataclass
 from pathlib import Path
+from typing import ClassVar
 import pandas as pd
 from pydantic import TypeAdapter
@@ -35,6 +36,16 @@ NoIndentMetadata = [
 class CuratedContainer:
     """Schema for a collection of curated items, ready to be used by others."""
+    _RESERVED_EXTRA_FILENAMES: ClassVar[frozenset[str]] = frozenset(
+        {
+            "dataset.parquet",
+            "dtypes.json",
+            "test_dataset.parquet",
+            "test_dtypes.json",
+            "container_metadata.json",
+        }
+    )
     dataset: pd.DataFrame
     """The curated dataset as a pandas DataFrame."""
     dataset_metadata: DatasetMetadata
@@ -134,10 +145,7 @@ class CuratedContainer:
         numeric_cols = feature_df.select_dtypes(include=["number"], exclude=["bool"]).columns
         categorical_cols = feature_df.select_dtypes(include=["category", "bool"]).columns
         datetime_cols = list(feature_df.select_dtypes(include=["datetime", "datetimetz"]).columns)
-        datetime_cols += [
-            c for c in feature_df.columns
-            if isinstance(feature_df[c].dtype, pd.PeriodDtype)
-        ]
+        datetime_cols += [c for c in feature_df.columns if isinstance(feature_df[c].dtype, pd.PeriodDtype)]
         text_cols = feature_df.select_dtypes(include=["string"]).columns
         return {
@@ -160,12 +168,14 @@ class CuratedContainer:
         checksum_short = (
             (self.checksum or "")[:16] + "…" if self.checksum and len(self.checksum) > 16 else self.checksum
         )
-        return "\n".join([
-            "CuratedContainer:",
-            f"  unique_name:   {self.dataset_metadata.unique_name}",
-            f"  uuid:          {uuid_short}",
-            f"  checksum:      {checksum_short}",
-        ])
+        return "\n".join(
+            [
+                "CuratedContainer:",
+                f"  unique_name:   {self.dataset_metadata.unique_name}",
+                f"  uuid:          {uuid_short}",
+                f"  checksum:      {checksum_short}",
+            ]
+        )
     def describe_dataset(self) -> str:
         """Return the DataFrame summary: shape and feature-dtype counts.
@@ -179,16 +189,18 @@ class CuratedContainer:
         counts = self._feature_dtype_counts()
         target = self.task_metadata.target_column_name
-        return "\n".join([
-            "Dataset:",
-            f"  shape:         {self.dataset.shape}",
-            f"  feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
-            f"    numeric:     {counts['numeric']}",
-            f"    categorical: {counts['categorical']}",
-            f"    datetime:    {counts['datetime']}",
-            f"    text:        {counts['text']}",
-            f"    binary:      {counts['binary']}",
-        ])
+        return "\n".join(
+            [
+                "Dataset:",
+                f"  shape:         {self.dataset.shape}",
+                f"  feature dtypes ({counts['n_features']} features, excluding target `{target}`):",
+                f"    numeric:     {counts['numeric']}",
+                f"    categorical: {counts['categorical']}",
+                f"    datetime:    {counts['datetime']}",
+                f"    text:        {counts['text']}",
+                f"    binary:      {counts['binary']}",
+            ]
+        )
     def describe(self) -> str:
         """Return a high-level summary of the container.
@@ -197,17 +209,19 @@ class CuratedContainer:
         (shape + dtype counts), and the per-section :meth:`describe` outputs
         of the dataset, task, and experiment metadata objects.
         """
-        return "\n".join([
-            self.describe_container(),
-            "",
-            self.describe_dataset(),
-            "",
-            self.dataset_metadata.describe(),
-            "",
-            self.task_metadata.describe(),
-            "",
-            self.experiment_metadata.describe(),
-        ])
+        return "\n".join(
+            [
+                self.describe_container(),
+                "",
+                self.describe_dataset(),
+                "",
+                self.dataset_metadata.describe(),
+                "",
+                self.task_metadata.describe(),
+                "",
+                self.experiment_metadata.describe(),
+            ]
+        )
     @staticmethod
     def _save_dtypes(df: pd.DataFrame, path: Path) -> None:
@@ -366,3 +380,71 @@ class CuratedContainer:
             loaded_from_path=path,
             **container_metadata,
         )
+    # --- Extra (non-core) artifacts ---------------------------------------------------
+    def _resolve_extras_dir(self, path: Path | str | None) -> Path:
+        """Return the directory to look for extra artifacts in.
+        Falls back to ``loaded_from_path`` if ``path`` is omitted.
+        """
+        if path is not None:
+            return Path(path)
+        if self.loaded_from_path is not None:
+            return self.loaded_from_path
+        raise ValueError(
+            "Container has no `loaded_from_path` — pass `path=...` to locate extra files.",
+        )
+    def extra_file_path(self, filename: str, *, path: Path | str | None = None) -> Path:
+        """Resolve the path of an extra artifact alongside the container.
+        Extra artifacts are any files a producer ships in the container directory beyond
+        the six core files (``dataset.parquet``, ``dtypes.json``, ``container_metadata.json``
+        and the three ``*.{type_id}.json`` metadata files) and the optional test-dataset pair.
+        Data Foundry does not interpret their contents — it only resolves the path so callers
+        can load them however they need (e.g. ``pd.read_parquet``, ``json.load``, ``np.load``).
+        The returned path is not guaranteed to exist — use :meth:`has_extra_file` first.
+        ``filename`` must be a bare file name (no directory separators).
+        """
+        if not filename or "/" in filename or "\\" in filename or filename in {".", ".."}:
+            raise ValueError(f"Extra filename must be a bare file name, got {filename!r}.")
+        if filename in self._RESERVED_EXTRA_FILENAMES:
+            raise ValueError(
+                f"{filename!r} is a core container file; use the dedicated load API instead "
+                "(e.g. `CuratedContainer.load(...)` or `load_test_dataset()`).",
+            )
+        return self._resolve_extras_dir(path) / filename
+    def has_extra_file(self, filename: str, *, path: Path | str | None = None) -> bool:
+        """Return whether an extra artifact named ``filename`` exists next to the container."""
+        try:
+            return self.extra_file_path(filename, path=path).is_file()
+        except ValueError:
+            return False
+    def list_extra_files(self, *, path: Path | str | None = None) -> list[str]:
+        """Return the sorted list of extra-artifact file names present next to the container.
+        Excludes the core six files and the optional test-dataset pair. Metadata JSON files
+        (``<name>.<type_id>.json`` — two dots before ``.json``) are also excluded.
+        Returns ``[]`` if the directory does not exist or holds no extras.
+        """
+        try:
+            base = self._resolve_extras_dir(path)
+        except ValueError:
+            return []
+        if not base.is_dir():
+            return []
+        extras: list[str] = []
+        for entry in sorted(base.iterdir()):
+            if not entry.is_file():
+                continue
+            name = entry.name
+            if name in self._RESERVED_EXTRA_FILENAMES:
+                continue
+            if name.endswith(".json") and name.count(".") >= 2:
+                continue
+            extras.append(name)
+        return extras