data-foundry 0.0.3__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {data_foundry-0.0.3 → data_foundry-0.0.4}/PKG-INFO +1 -1
  2. {data_foundry-0.0.3 → data_foundry-0.0.4}/pyproject.toml +1 -1
  3. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/__init__.py +2 -0
  4. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_sources.py +45 -3
  5. {data_foundry-0.0.3 → data_foundry-0.0.4}/README.md +0 -0
  6. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/__init__.py +0 -0
  7. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_core.py +0 -0
  8. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_registry.py +0 -0
  9. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/curation_container.py +0 -0
  10. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/curation_recommendations.py +0 -0
  11. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/dataset_checks.py +0 -0
  12. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/__init__.py +0 -0
  13. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/container_metadata.json +0 -0
  14. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset.parquet +0 -0
  15. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset_metadata.dataset-mold-v1.json +0 -0
  16. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dtypes.json +0 -0
  17. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/experiment_metadata.predictive-ml-splits-mold-v1.json +0 -0
  18. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/task_metadata.predictive-ml-task-mold-v1.json +0 -0
  19. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/toy_extra.parquet +0 -0
  20. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/schema.py +0 -0
  21. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/utils/__init__.py +0 -0
  22. {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/utils/checksum.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: data-foundry
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
5
5
  Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
6
6
  Author: TabArena Maintainers
@@ -4,7 +4,7 @@ build-backend = "uv_build"
4
4
 
5
5
  [project]
6
6
  name = "data-foundry"
7
- version = "0.0.3"
7
+ version = "0.0.4"
8
8
  description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -14,6 +14,7 @@ from data_foundry.collections._sources import (
14
14
  DEFAULT_CACHE_DIR,
15
15
  DataSource,
16
16
  HuggingFaceSource,
17
+ LocalWarehouseSource,
17
18
  clear_cache,
18
19
  resolve_cache_dir,
19
20
  )
@@ -26,6 +27,7 @@ __all__ = [
26
27
  "DataSource",
27
28
  "DatasetCollection",
28
29
  "HuggingFaceSource",
30
+ "LocalWarehouseSource",
29
31
  "clear_cache",
30
32
  "get_collection",
31
33
  "list_collections",
@@ -5,9 +5,11 @@ e.g. a Hugging Face dataset repo, an S3 bucket, or a directory you already
5
5
  have on disk. The collection asks the source to ``fetch`` an entry; the source
6
6
  returns a local path that :meth:`CuratedContainer.load` can read.
7
7
 
8
- Currently only :class:`HuggingFaceSource` is implemented, but the abstraction
9
- is designed so additional sources (URL/S3/local) can slot in without touching
10
- the rest of the package.
8
+ Two sources ship with the package: :class:`HuggingFaceSource` (download from
9
+ a Hub dataset repo) and :class:`LocalWarehouseSource` (point at a directory
10
+ that already mirrors the warehouse layout). The abstraction is designed so
11
+ additional sources (URL/S3/etc.) can slot in without touching the rest of
12
+ the package.
11
13
  """
12
14
 
13
15
  from __future__ import annotations
@@ -117,6 +119,46 @@ class DataSource:
117
119
  return [self.fetch(e, cache_dir, force_download=force_download) for e in entries]
118
120
 
119
121
 
122
+ @dataclass(frozen=True)
123
+ class LocalWarehouseSource(DataSource):
124
+ """Source backed by a pre-populated local warehouse directory.
125
+
126
+ Use this when the curated containers already live on disk in the standard
127
+ warehouse layout (``<base_dir>/<unique_name>/[versions/]<uuid>/``) and no
128
+ download is required — e.g. a shared filesystem on a compute cluster, or a
129
+ locally-curated set of containers that has not been published to the Hub.
130
+
131
+ ``fetch`` is a no-op pointer lookup: it returns the entry's path under
132
+ ``base_dir`` (and validates that the directory exists). ``cache_dir`` and
133
+ ``force_download`` are ignored because nothing is cached or downloaded.
134
+ """
135
+
136
+ base_dir: Path
137
+ """Root of the local warehouse — the directory that contains
138
+ ``<unique_name>/`` subfolders."""
139
+
140
+ def fetch(
141
+ self,
142
+ entry: CollectionEntry,
143
+ cache_dir: Path, # noqa: ARG002 — kept to match DataSource interface
144
+ *,
145
+ force_download: bool = False, # noqa: ARG002 — same reason
146
+ ) -> Path:
147
+ """Return the on-disk path for ``entry`` under :attr:`base_dir`.
148
+
149
+ Raises ``FileNotFoundError`` if the container directory is missing.
150
+ ``cache_dir`` and ``force_download`` are ignored.
151
+ """
152
+ path = entry.local_path(self.base_dir)
153
+ if not path.is_dir():
154
+ raise FileNotFoundError(
155
+ f"No curated container at {path}. Expected a directory "
156
+ f"matching entry {entry.relative_path.as_posix()!r} under the "
157
+ f"local warehouse {self.base_dir!s}.",
158
+ )
159
+ return path
160
+
161
+
120
162
  @dataclass(frozen=True)
121
163
  class HuggingFaceSource(DataSource):
122
164
  """Source backed by a Hugging Face Hub dataset repository.
File without changes