data-foundry 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_foundry-0.0.3 → data_foundry-0.0.4}/PKG-INFO +1 -1
- {data_foundry-0.0.3 → data_foundry-0.0.4}/pyproject.toml +1 -1
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/__init__.py +2 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_sources.py +45 -3
- {data_foundry-0.0.3 → data_foundry-0.0.4}/README.md +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/__init__.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_core.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/collections/_registry.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/curation_container.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/curation_recommendations.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/dataset_checks.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/__init__.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/container_metadata.json +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset.parquet +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dataset_metadata.dataset-mold-v1.json +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/dtypes.json +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/experiment_metadata.predictive-ml-splits-mold-v1.json +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/task_metadata.predictive-ml-task-mold-v1.json +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/examples/toy_container/toy_iid_dataset/00000000-0000-7000-8000-000000000001/toy_extra.parquet +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/schema.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/utils/__init__.py +0 -0
- {data_foundry-0.0.3 → data_foundry-0.0.4}/src/data_foundry/utils/checksum.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: data-foundry
|
|
3
|
-
Version: 0.0.3
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena).
|
|
5
5
|
Keywords: tabular,machine-learning,benchmark,datasets,data-curation,tabarena
|
|
6
6
|
Author: TabArena Maintainers
|
|
@@ -4,7 +4,7 @@ build-backend = "uv_build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "data-foundry"
|
|
7
|
-
version = "0.0.3"
|
|
7
|
+
version = "0.0.4"
|
|
8
8
|
description = "A schema and toolkit for curating tabular datasets and benchmarking tasks (the data layer behind TabArena)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -14,6 +14,7 @@ from data_foundry.collections._sources import (
|
|
|
14
14
|
DEFAULT_CACHE_DIR,
|
|
15
15
|
DataSource,
|
|
16
16
|
HuggingFaceSource,
|
|
17
|
+
LocalWarehouseSource,
|
|
17
18
|
clear_cache,
|
|
18
19
|
resolve_cache_dir,
|
|
19
20
|
)
|
|
@@ -26,6 +27,7 @@ __all__ = [
|
|
|
26
27
|
"DataSource",
|
|
27
28
|
"DatasetCollection",
|
|
28
29
|
"HuggingFaceSource",
|
|
30
|
+
"LocalWarehouseSource",
|
|
29
31
|
"clear_cache",
|
|
30
32
|
"get_collection",
|
|
31
33
|
"list_collections",
|
|
@@ -5,9 +5,11 @@ e.g. a Hugging Face dataset repo, an S3 bucket, or a directory you already
|
|
|
5
5
|
have on disk. The collection asks the source to ``fetch`` an entry; the source
|
|
6
6
|
returns a local path that :meth:`CuratedContainer.load` can read.
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
Two sources ship with the package: :class:`HuggingFaceSource` (download from
|
|
9
|
+
a Hub dataset repo) and :class:`LocalWarehouseSource` (point at a directory
|
|
10
|
+
that already mirrors the warehouse layout). The abstraction is designed so
|
|
11
|
+
additional sources (URL/S3/etc.) can slot in without touching the rest of
|
|
12
|
+
the package.
|
|
11
13
|
"""
|
|
12
14
|
|
|
13
15
|
from __future__ import annotations
|
|
@@ -117,6 +119,46 @@ class DataSource:
|
|
|
117
119
|
return [self.fetch(e, cache_dir, force_download=force_download) for e in entries]
|
|
118
120
|
|
|
119
121
|
|
|
122
|
+
@dataclass(frozen=True)
|
|
123
|
+
class LocalWarehouseSource(DataSource):
|
|
124
|
+
"""Source backed by a pre-populated local warehouse directory.
|
|
125
|
+
|
|
126
|
+
Use this when the curated containers already live on disk in the standard
|
|
127
|
+
warehouse layout (``<base_dir>/<unique_name>/[versions/]<uuid>/``) and no
|
|
128
|
+
download is required — e.g. a shared filesystem on a compute cluster, or a
|
|
129
|
+
locally-curated set of containers that has not been published to the Hub.
|
|
130
|
+
|
|
131
|
+
``fetch`` is a no-op pointer lookup: it returns the entry's path under
|
|
132
|
+
``base_dir`` (and validates that the directory exists). ``cache_dir`` and
|
|
133
|
+
``force_download`` are ignored because nothing is cached or downloaded.
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
base_dir: Path
|
|
137
|
+
"""Root of the local warehouse — the directory that contains
|
|
138
|
+
``<unique_name>/`` subfolders."""
|
|
139
|
+
|
|
140
|
+
def fetch(
|
|
141
|
+
self,
|
|
142
|
+
entry: CollectionEntry,
|
|
143
|
+
cache_dir: Path, # noqa: ARG002 — kept to match DataSource interface
|
|
144
|
+
*,
|
|
145
|
+
force_download: bool = False, # noqa: ARG002 — same reason
|
|
146
|
+
) -> Path:
|
|
147
|
+
"""Return the on-disk path for ``entry`` under :attr:`base_dir`.
|
|
148
|
+
|
|
149
|
+
Raises ``FileNotFoundError`` if the container directory is missing.
|
|
150
|
+
``cache_dir`` and ``force_download`` are ignored.
|
|
151
|
+
"""
|
|
152
|
+
path = entry.local_path(self.base_dir)
|
|
153
|
+
if not path.is_dir():
|
|
154
|
+
raise FileNotFoundError(
|
|
155
|
+
f"No curated container at {path}. Expected a directory "
|
|
156
|
+
f"matching entry {entry.relative_path.as_posix()!r} under the "
|
|
157
|
+
f"local warehouse {self.base_dir!s}.",
|
|
158
|
+
)
|
|
159
|
+
return path
|
|
160
|
+
|
|
161
|
+
|
|
120
162
|
@dataclass(frozen=True)
|
|
121
163
|
class HuggingFaceSource(DataSource):
|
|
122
164
|
"""Source backed by a Hugging Face Hub dataset repository.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|