datamaestro 1.7.0__tar.gz → 1.7.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-1.7.0 → datamaestro-1.7.2}/PKG-INFO +1 -1
- datamaestro-1.7.2/release-notes.md +5 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/definitions.py +29 -4
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/__init__.py +23 -5
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/links.py +10 -2
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/test_resource.py +269 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/version.py +1 -1
- datamaestro-1.7.0/release-notes.md +0 -5
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.coverage +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.flake8 +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.github/workflows/pytest.yml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.gitignore +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.pre-commit-config.yaml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.python-version +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/.readthedocs.yml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/CHANGELOG.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/LICENSE +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/MANIFEST.in +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/README.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/TODO.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/cliff.toml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/Makefile +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/make.bat +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/requirements.txt +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/api/data.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/api/download.rst +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/api/index.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/api/records.rst +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/cli.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/conf.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/configuration.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/datasets.rst +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/developping.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/getting-started.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/index.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/docs/source/style.css +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/pyproject.toml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/pytest.ini +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/requirements-dev.txt +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/requirements.txt +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/schema.yaml +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/__main__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/annotations/agreement.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/commands/site.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/context.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/data/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/data/csv.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/data/ml.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/data/tensor.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/archive.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/custom.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/huggingface.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/multiple.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/single.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/sync.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/todo.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/download/wayback.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/record.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/search.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/settings.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/sphinx.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/stream/lines.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/__init__.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/conftest.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/test_annotations.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/test_download_handlers.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/test/test_record.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/utils.py +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/src/datamaestro/v2.md +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/tox.ini +0 -0
- {datamaestro-1.7.0 → datamaestro-1.7.2}/uv.lock +0 -0
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import logging
|
|
8
8
|
import inspect
|
|
9
|
+
import re as _re
|
|
9
10
|
import shutil
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
from itertools import chain
|
|
@@ -33,7 +34,7 @@ from typing import Type as TypingType # noqa: F401 (re-exports)
|
|
|
33
34
|
from experimaestro.core.types import Type # noqa: F401 (re-exports)
|
|
34
35
|
|
|
35
36
|
if TYPE_CHECKING:
|
|
36
|
-
from .data import Base
|
|
37
|
+
from .data import Base
|
|
37
38
|
from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
|
|
38
39
|
from datamaestro.download import Download, Resource
|
|
39
40
|
|
|
@@ -130,6 +131,21 @@ def _move_path(src: Path, dst: Path) -> None:
|
|
|
130
131
|
shutil.move(str(src), str(dst))
|
|
131
132
|
|
|
132
133
|
|
|
134
|
+
_CAMEL_RE1 = _re.compile(r"([A-Z]+)([A-Z][a-z])")
|
|
135
|
+
_CAMEL_RE2 = _re.compile(r"([a-z0-9])([A-Z])")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _camel_to_snake(name: str) -> str:
|
|
139
|
+
"""Convert CamelCase to snake_case, then lowercase.
|
|
140
|
+
|
|
141
|
+
Examples: ProcessedMNIST -> processed_mnist, MyData -> my_data,
|
|
142
|
+
MNIST -> mnist, simple -> simple
|
|
143
|
+
"""
|
|
144
|
+
s = _CAMEL_RE1.sub(r"\1_\2", name)
|
|
145
|
+
s = _CAMEL_RE2.sub(r"\1_\2", s)
|
|
146
|
+
return s.lower()
|
|
147
|
+
|
|
148
|
+
|
|
133
149
|
# --- Objects holding information into classes/function
|
|
134
150
|
|
|
135
151
|
|
|
@@ -196,7 +212,12 @@ class DataDefinition(AbstractData):
|
|
|
196
212
|
if components[0] == "datamaestro":
|
|
197
213
|
longest_ix = 0
|
|
198
214
|
|
|
199
|
-
|
|
215
|
+
parts = components[(longest_ix + 1) :]
|
|
216
|
+
# Module components: just lowercase
|
|
217
|
+
# Last component (class/function name): CamelCase → snake_case
|
|
218
|
+
if parts:
|
|
219
|
+
parts = [s.lower() for s in parts[:-1]] + [_camel_to_snake(parts[-1])]
|
|
220
|
+
return repository, parts
|
|
200
221
|
|
|
201
222
|
def ancestors(self):
|
|
202
223
|
ancestors = []
|
|
@@ -594,6 +615,10 @@ class DatasetWrapper(AbstractDataset):
|
|
|
594
615
|
if self.base is self.t:
|
|
595
616
|
self.config = self.base.__create_dataset__(self)
|
|
596
617
|
|
|
618
|
+
elif hasattr(self.t, "__create_dataset__"):
|
|
619
|
+
# Class-based dataset with metadataset or different base
|
|
620
|
+
self.config = self.t.__create_dataset__(self)
|
|
621
|
+
|
|
597
622
|
else:
|
|
598
623
|
# Construct the object
|
|
599
624
|
if self.as_prepare:
|
|
@@ -715,8 +740,8 @@ class DatasetAnnotation:
|
|
|
715
740
|
def __call__(self, dataset: AbstractDataset):
|
|
716
741
|
if isinstance(dataset, AbstractDataset):
|
|
717
742
|
self.annotate(dataset)
|
|
718
|
-
elif
|
|
719
|
-
self.annotate(dataset.
|
|
743
|
+
elif hasattr(dataset, "__dataset__"):
|
|
744
|
+
self.annotate(dataset.__dataset__)
|
|
720
745
|
else:
|
|
721
746
|
raise RuntimeError(
|
|
722
747
|
f"Only datasets can be annotated with {self}, "
|
|
@@ -638,14 +638,32 @@ class reference(Resource):
|
|
|
638
638
|
assert reference is not None, "Reference cannot be null"
|
|
639
639
|
self.reference = reference
|
|
640
640
|
|
|
641
|
+
def _resolve_reference(self):
|
|
642
|
+
"""Resolve the reference to a DatasetWrapper.
|
|
643
|
+
|
|
644
|
+
For class-based datasets, the reference is the class itself with
|
|
645
|
+
a __dataset__ attribute pointing to the DatasetWrapper.
|
|
646
|
+
For function-based datasets, the reference is already a DatasetWrapper.
|
|
647
|
+
"""
|
|
648
|
+
ref = self.reference
|
|
649
|
+
if isinstance(ref, type) and hasattr(ref, "__dataset__"):
|
|
650
|
+
return ref.__dataset__
|
|
651
|
+
return ref
|
|
652
|
+
|
|
641
653
|
def prepare(self):
|
|
642
|
-
|
|
643
|
-
if isinstance(
|
|
644
|
-
return
|
|
645
|
-
return
|
|
654
|
+
resolved = self._resolve_reference()
|
|
655
|
+
if isinstance(resolved, AbstractDataset):
|
|
656
|
+
return resolved._prepare()
|
|
657
|
+
return resolved.prepare()
|
|
646
658
|
|
|
647
659
|
def download(self, force=False):
|
|
648
|
-
self.
|
|
660
|
+
resolved = self._resolve_reference()
|
|
661
|
+
if isinstance(resolved, AbstractDataset):
|
|
662
|
+
resolved.download(force)
|
|
663
|
+
elif hasattr(resolved, "__datamaestro__"):
|
|
664
|
+
resolved.__datamaestro__.download(force)
|
|
665
|
+
else:
|
|
666
|
+
resolved.download(force)
|
|
649
667
|
|
|
650
668
|
def has_files(self):
|
|
651
669
|
# We don't really have files
|
|
@@ -55,9 +55,17 @@ class links(Resource):
|
|
|
55
55
|
def download(self, force=False):
|
|
56
56
|
self.path.mkdir(exist_ok=True, parents=True)
|
|
57
57
|
for key, value in self.links.items():
|
|
58
|
-
|
|
58
|
+
# Resolve class-based datasets
|
|
59
|
+
if hasattr(value, "__dataset__"):
|
|
60
|
+
wrapper = value.__dataset__
|
|
61
|
+
wrapper.download(force)
|
|
62
|
+
path = wrapper()
|
|
63
|
+
elif hasattr(value, "download"):
|
|
64
|
+
value.download(force)
|
|
65
|
+
path = value()
|
|
66
|
+
else:
|
|
67
|
+
path = value # Already a path
|
|
59
68
|
|
|
60
|
-
path = value()
|
|
61
69
|
dest = self.path / key
|
|
62
70
|
|
|
63
71
|
if not dest.exists():
|
|
@@ -22,6 +22,8 @@ import pytest
|
|
|
22
22
|
|
|
23
23
|
from datamaestro.definitions import (
|
|
24
24
|
AbstractDataset,
|
|
25
|
+
DataDefinition,
|
|
26
|
+
DatasetWrapper,
|
|
25
27
|
topological_sort,
|
|
26
28
|
_compute_dependents,
|
|
27
29
|
_bind_class_resources,
|
|
@@ -1109,6 +1111,134 @@ class TestReferenceResource:
|
|
|
1109
1111
|
reference(varname="ref", reference=None)
|
|
1110
1112
|
|
|
1111
1113
|
|
|
1114
|
+
# ==== Reference with Class-Based Datasets ====
|
|
1115
|
+
|
|
1116
|
+
|
|
1117
|
+
class TestReferenceClassBased:
|
|
1118
|
+
"""Tests for `reference` used with class-based datasets.
|
|
1119
|
+
|
|
1120
|
+
When a class-based dataset is decorated with @dataset, the class
|
|
1121
|
+
gets a __dataset__ attribute pointing to the DatasetWrapper.
|
|
1122
|
+
The reference resource must resolve through that attribute.
|
|
1123
|
+
"""
|
|
1124
|
+
|
|
1125
|
+
def _make_base_dataset(self, context):
|
|
1126
|
+
"""Create a minimal class-based dataset to use as a reference target."""
|
|
1127
|
+
from datamaestro.data import Base
|
|
1128
|
+
from datamaestro.definitions import dataset as dataset_dec
|
|
1129
|
+
|
|
1130
|
+
class BaseData(Base):
|
|
1131
|
+
"""Base test dataset."""
|
|
1132
|
+
|
|
1133
|
+
DATA = DummyFileResource("base.txt")
|
|
1134
|
+
|
|
1135
|
+
@classmethod
|
|
1136
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
1137
|
+
return cls.C(id="test.base")
|
|
1138
|
+
|
|
1139
|
+
BaseData.__module__ = "datamaestro.config.test"
|
|
1140
|
+
|
|
1141
|
+
# Apply the @dataset decorator (sets __dataset__ on the class)
|
|
1142
|
+
dataset_dec(base=BaseData, url="http://test.com")(BaseData)
|
|
1143
|
+
return BaseData
|
|
1144
|
+
|
|
1145
|
+
def test_resolve_via_dataset_attr(self, context):
|
|
1146
|
+
"""_resolve_reference follows __dataset__ for class-based targets."""
|
|
1147
|
+
BaseData = self._make_base_dataset(context)
|
|
1148
|
+
|
|
1149
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1150
|
+
resolved = ref._resolve_reference()
|
|
1151
|
+
|
|
1152
|
+
assert resolved is BaseData.__dataset__
|
|
1153
|
+
|
|
1154
|
+
def test_prepare_delegates_to_class_dataset(self, context):
|
|
1155
|
+
"""prepare() calls _prepare() on the referenced DatasetWrapper."""
|
|
1156
|
+
BaseData = self._make_base_dataset(context)
|
|
1157
|
+
|
|
1158
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1159
|
+
|
|
1160
|
+
# Mock the DatasetWrapper._prepare to avoid full experimaestro
|
|
1161
|
+
# Config construction (which rejects classes defined in functions)
|
|
1162
|
+
sentinel = object()
|
|
1163
|
+
BaseData.__dataset__._prepare = MagicMock(return_value=sentinel)
|
|
1164
|
+
|
|
1165
|
+
result = ref.prepare()
|
|
1166
|
+
BaseData.__dataset__._prepare.assert_called_once()
|
|
1167
|
+
assert result is sentinel
|
|
1168
|
+
|
|
1169
|
+
def test_download_delegates_to_class_dataset(self, context):
|
|
1170
|
+
"""download() calls download() on the referenced DatasetWrapper."""
|
|
1171
|
+
BaseData = self._make_base_dataset(context)
|
|
1172
|
+
|
|
1173
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1174
|
+
|
|
1175
|
+
# Mock the DatasetWrapper.download to verify delegation
|
|
1176
|
+
BaseData.__dataset__.download = MagicMock()
|
|
1177
|
+
|
|
1178
|
+
ref.download(force=True)
|
|
1179
|
+
BaseData.__dataset__.download.assert_called_once_with(True)
|
|
1180
|
+
|
|
1181
|
+
def test_download_no_force(self, context):
|
|
1182
|
+
"""download(force=False) passes force=False to the target."""
|
|
1183
|
+
BaseData = self._make_base_dataset(context)
|
|
1184
|
+
|
|
1185
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1186
|
+
BaseData.__dataset__.download = MagicMock()
|
|
1187
|
+
|
|
1188
|
+
ref.download(force=False)
|
|
1189
|
+
BaseData.__dataset__.download.assert_called_once_with(False)
|
|
1190
|
+
|
|
1191
|
+
def test_has_files_false(self, context):
|
|
1192
|
+
"""reference has_files() is always False."""
|
|
1193
|
+
BaseData = self._make_base_dataset(context)
|
|
1194
|
+
|
|
1195
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1196
|
+
assert ref.has_files() is False
|
|
1197
|
+
|
|
1198
|
+
def test_bound_in_class_based_dataset(self, context):
|
|
1199
|
+
"""reference works as a class attribute bound via
|
|
1200
|
+
_bind_class_resources."""
|
|
1201
|
+
BaseData = self._make_base_dataset(context)
|
|
1202
|
+
|
|
1203
|
+
repository = MyRepository(context)
|
|
1204
|
+
ds = SimpleDataset(repository, context.datapath / "derived_test")
|
|
1205
|
+
|
|
1206
|
+
ref = reference(varname="base", reference=BaseData)
|
|
1207
|
+
ref.bind("BASE", ds)
|
|
1208
|
+
|
|
1209
|
+
assert "base" in ds.resources
|
|
1210
|
+
assert ds.resources["base"] is ref
|
|
1211
|
+
assert ref.has_files() is False
|
|
1212
|
+
|
|
1213
|
+
def test_full_class_attribute_integration(self, context):
|
|
1214
|
+
"""reference as a class attribute in a full class-based dataset."""
|
|
1215
|
+
from datamaestro.data import Base
|
|
1216
|
+
|
|
1217
|
+
BaseData = self._make_base_dataset(context)
|
|
1218
|
+
|
|
1219
|
+
class DerivedData(Base):
|
|
1220
|
+
"""Derived dataset referencing the base."""
|
|
1221
|
+
|
|
1222
|
+
BASE = reference(varname="base", reference=BaseData)
|
|
1223
|
+
|
|
1224
|
+
@classmethod
|
|
1225
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
1226
|
+
cls.BASE.prepare()
|
|
1227
|
+
return cls.C(id="test.derived")
|
|
1228
|
+
|
|
1229
|
+
repository = MyRepository(context)
|
|
1230
|
+
ds = SimpleDataset(repository, context.datapath / "derived_full")
|
|
1231
|
+
|
|
1232
|
+
_bind_class_resources(DerivedData, ds)
|
|
1233
|
+
|
|
1234
|
+
assert "base" in ds.resources
|
|
1235
|
+
assert isinstance(ds.resources["base"], reference)
|
|
1236
|
+
|
|
1237
|
+
# The reference should resolve to the base dataset
|
|
1238
|
+
resolved = ds.resources["base"]._resolve_reference()
|
|
1239
|
+
assert resolved is BaseData.__dataset__
|
|
1240
|
+
|
|
1241
|
+
|
|
1112
1242
|
# ==== Links Resource Tests ====
|
|
1113
1243
|
|
|
1114
1244
|
|
|
@@ -1386,3 +1516,142 @@ class TestMultiple:
|
|
|
1386
1516
|
from datamaestro.download.multiple import Datasets
|
|
1387
1517
|
|
|
1388
1518
|
assert issubclass(Datasets, Download)
|
|
1519
|
+
|
|
1520
|
+
|
|
1521
|
+
# ==== Dataset ID Inference Tests ====
|
|
1522
|
+
|
|
1523
|
+
|
|
1524
|
+
class TestDatasetIDInference:
|
|
1525
|
+
"""Integration tests for dataset ID inference stability.
|
|
1526
|
+
|
|
1527
|
+
Verifies that DataDefinition.repository_relpath correctly derives
|
|
1528
|
+
path components from type modules and names, including the
|
|
1529
|
+
CamelCase → snake_case conversion for the final component
|
|
1530
|
+
(class/function name).
|
|
1531
|
+
"""
|
|
1532
|
+
|
|
1533
|
+
@staticmethod
|
|
1534
|
+
def _make_type(module, name):
|
|
1535
|
+
"""Create a mock type with given __module__ and __name__."""
|
|
1536
|
+
t = type(name, (), {})
|
|
1537
|
+
t.__module__ = module
|
|
1538
|
+
return t
|
|
1539
|
+
|
|
1540
|
+
def test_all_caps_class(self, context):
|
|
1541
|
+
"""All-caps class name (e.g. MNIST) becomes lowercase."""
|
|
1542
|
+
t = self._make_type("datamaestro.config.lecun", "MNIST")
|
|
1543
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1544
|
+
assert parts == ["config", "lecun", "mnist"]
|
|
1545
|
+
|
|
1546
|
+
def test_camel_case_class(self, context):
|
|
1547
|
+
"""CamelCase class name becomes snake_case."""
|
|
1548
|
+
t = self._make_type("datamaestro.config.lecun", "ProcessedMNIST")
|
|
1549
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1550
|
+
assert parts == ["config", "lecun", "processed_mnist"]
|
|
1551
|
+
|
|
1552
|
+
def test_multi_word_camel_case(self, context):
|
|
1553
|
+
"""Multi-word CamelCase is split with underscores."""
|
|
1554
|
+
t = self._make_type("datamaestro.config.data", "ImageClassification")
|
|
1555
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1556
|
+
assert parts == ["config", "data", "image_classification"]
|
|
1557
|
+
|
|
1558
|
+
def test_lowercase_function_name(self, context):
|
|
1559
|
+
"""Lowercase function names stay as-is."""
|
|
1560
|
+
t = self._make_type("datamaestro.config.lecun", "mnist")
|
|
1561
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1562
|
+
assert parts == ["config", "lecun", "mnist"]
|
|
1563
|
+
|
|
1564
|
+
def test_name_with_digits(self, context):
|
|
1565
|
+
"""Names with trailing digits are handled correctly."""
|
|
1566
|
+
t = self._make_type("datamaestro.config.trec", "Robust2005")
|
|
1567
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1568
|
+
assert parts == ["config", "trec", "robust2005"]
|
|
1569
|
+
|
|
1570
|
+
def test_acronym_then_word(self, context):
|
|
1571
|
+
"""Acronym followed by word splits correctly."""
|
|
1572
|
+
t = self._make_type("datamaestro.config.web", "HTTPSConnection")
|
|
1573
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1574
|
+
assert parts == ["config", "web", "https_connection"]
|
|
1575
|
+
|
|
1576
|
+
def test_digit_to_upper_boundary(self, context):
|
|
1577
|
+
"""Digit-to-uppercase boundary inserts underscore."""
|
|
1578
|
+
t = self._make_type("datamaestro.config.data", "V2Data")
|
|
1579
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1580
|
+
assert parts == ["config", "data", "v2_data"]
|
|
1581
|
+
|
|
1582
|
+
def test_snake_case_passthrough(self, context):
|
|
1583
|
+
"""Already snake_case names are unchanged."""
|
|
1584
|
+
t = self._make_type("datamaestro.config.lecun", "my_data")
|
|
1585
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1586
|
+
assert parts == ["config", "lecun", "my_data"]
|
|
1587
|
+
|
|
1588
|
+
def test_module_components_lowercased(self, context):
|
|
1589
|
+
"""Module path components are lowercased, not snake_cased."""
|
|
1590
|
+
t = self._make_type("datamaestro.config.LeCun.SubDir", "MNIST")
|
|
1591
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1592
|
+
assert parts == ["config", "lecun", "subdir", "mnist"]
|
|
1593
|
+
|
|
1594
|
+
def test_only_last_component_snake_cased(self, context):
|
|
1595
|
+
"""Only the last component gets CamelCase→snake_case;
|
|
1596
|
+
module components are simply lowercased."""
|
|
1597
|
+
t = self._make_type("datamaestro.config.MyModule.SubPkg", "ProcessedData")
|
|
1598
|
+
_, parts = DataDefinition.repository_relpath(t)
|
|
1599
|
+
# MyModule/SubPkg → lowercased; ProcessedData → snake_cased
|
|
1600
|
+
assert parts == [
|
|
1601
|
+
"config",
|
|
1602
|
+
"mymodule",
|
|
1603
|
+
"subpkg",
|
|
1604
|
+
"processed_data",
|
|
1605
|
+
]
|
|
1606
|
+
|
|
1607
|
+
def test_full_id_class_based(self, context):
|
|
1608
|
+
"""Full dataset ID for a class-based dataset."""
|
|
1609
|
+
from datamaestro.data import Base
|
|
1610
|
+
from datamaestro.definitions import dataset as dataset_dec
|
|
1611
|
+
|
|
1612
|
+
class ProcessedMNIST(Base):
|
|
1613
|
+
"""Test dataset."""
|
|
1614
|
+
|
|
1615
|
+
pass
|
|
1616
|
+
|
|
1617
|
+
ProcessedMNIST.__module__ = "datamaestro.config.lecun"
|
|
1618
|
+
|
|
1619
|
+
ann = dataset_dec(base=ProcessedMNIST, url="http://test.com")
|
|
1620
|
+
dw = DatasetWrapper(ann, ProcessedMNIST)
|
|
1621
|
+
assert dw.id == "lecun.processed_mnist"
|
|
1622
|
+
|
|
1623
|
+
def test_full_id_function_based(self, context):
|
|
1624
|
+
"""Full dataset ID for a function-based (lowercase) dataset."""
|
|
1625
|
+
from datamaestro.data import Base
|
|
1626
|
+
|
|
1627
|
+
class MyData(Base):
|
|
1628
|
+
pass
|
|
1629
|
+
|
|
1630
|
+
from datamaestro.definitions import dataset as dataset_dec
|
|
1631
|
+
|
|
1632
|
+
def mnist() -> MyData:
|
|
1633
|
+
pass
|
|
1634
|
+
|
|
1635
|
+
mnist.__module__ = "datamaestro.config.lecun"
|
|
1636
|
+
|
|
1637
|
+
ann = dataset_dec(url="http://test.com")
|
|
1638
|
+
# Infer base from return annotation
|
|
1639
|
+
ann.base = MyData
|
|
1640
|
+
dw = DatasetWrapper(ann, mnist)
|
|
1641
|
+
assert dw.id == "lecun.mnist"
|
|
1642
|
+
|
|
1643
|
+
def test_full_id_nested_module(self, context):
|
|
1644
|
+
"""Full dataset ID with nested module path."""
|
|
1645
|
+
from datamaestro.data import Base
|
|
1646
|
+
from datamaestro.definitions import dataset as dataset_dec
|
|
1647
|
+
|
|
1648
|
+
class Squad(Base):
|
|
1649
|
+
"""Test dataset."""
|
|
1650
|
+
|
|
1651
|
+
pass
|
|
1652
|
+
|
|
1653
|
+
Squad.__module__ = "datamaestro.config.stanford.qa"
|
|
1654
|
+
|
|
1655
|
+
ann = dataset_dec(base=Squad, url="http://test.com")
|
|
1656
|
+
dw = DatasetWrapper(ann, Squad)
|
|
1657
|
+
assert dw.id == "stanford.qa.squad"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|