datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/utils.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
import os.path as op
|
|
3
2
|
from experimaestro import Config
|
|
4
3
|
import json
|
|
5
4
|
from pathlib import PosixPath, Path
|
|
@@ -43,9 +42,7 @@ def copyfileobjs(fsrc, fdsts, length=0):
|
|
|
43
42
|
|
|
44
43
|
|
|
45
44
|
class FileChecker:
|
|
46
|
-
|
|
47
|
-
"""Check if the file is correct and throws an exception if not"""
|
|
48
|
-
raise NotImplementedError()
|
|
45
|
+
"""Checks a file"""
|
|
49
46
|
|
|
50
47
|
def check(self, path: Path):
|
|
51
48
|
"""Check the given file
|
|
@@ -69,9 +66,12 @@ class FileChecker:
|
|
|
69
66
|
|
|
70
67
|
|
|
71
68
|
class HashCheck(FileChecker):
|
|
72
|
-
"""Check a file against a hash"""
|
|
73
|
-
|
|
74
69
|
def __init__(self, hashstr: str, hasherfn=hashlib.md5):
|
|
70
|
+
"""Check a file against a hash
|
|
71
|
+
|
|
72
|
+
:param hashstr: The HASH value
|
|
73
|
+
:param hasherfn: The hash computer, defaults to hashlib.md5
|
|
74
|
+
"""
|
|
75
75
|
self.hashstr = hashstr
|
|
76
76
|
self.hasherfn = hasherfn
|
|
77
77
|
self.hasher = None
|
|
@@ -158,6 +158,11 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
|
|
|
158
158
|
if response is None:
|
|
159
159
|
response = requests.get(url, stream=True)
|
|
160
160
|
|
|
161
|
+
# Valid response
|
|
162
|
+
assert response.status_code >= 200 and response.status_code < 300, (
|
|
163
|
+
f"Status code is not 2XX ({response.status_code})"
|
|
164
|
+
)
|
|
165
|
+
|
|
161
166
|
# Get the total size (or use the provided one)
|
|
162
167
|
total_size = int(response.headers.get("content-length", size or 0))
|
|
163
168
|
|
|
@@ -166,9 +171,10 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
|
|
|
166
171
|
total_size += pos
|
|
167
172
|
|
|
168
173
|
CHUNK_SIZE = 1024
|
|
169
|
-
with
|
|
170
|
-
|
|
171
|
-
|
|
174
|
+
with (
|
|
175
|
+
path.open("ab") as f,
|
|
176
|
+
tqdm(initial=pos, total=total_size, unit_scale=True, unit="B") as t,
|
|
177
|
+
):
|
|
172
178
|
for data in response.iter_content(chunk_size=CHUNK_SIZE):
|
|
173
179
|
f.write(data)
|
|
174
180
|
t.update(len(data))
|
datamaestro/v2.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
# Resource Interface (v2)
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Resources represent steps in a dataset preparation pipeline. They form a
|
|
6
|
+
directed acyclic graph (DAG) where each resource can depend on other resources.
|
|
7
|
+
|
|
8
|
+
Key concepts:
|
|
9
|
+
|
|
10
|
+
- **Two-path system**: resources write to `transient_path` during download,
|
|
11
|
+
then the framework moves data to `path` and marks the resource as COMPLETE.
|
|
12
|
+
- **Three states**: NONE, PARTIAL, COMPLETE (persisted in `.state.json`)
|
|
13
|
+
- **Transient resources**: intermediate resources that can be deleted after all
|
|
14
|
+
dependents are COMPLETE (eager cleanup)
|
|
15
|
+
- **`can_recover` property**: subclasses override to preserve PARTIAL data on error
|
|
16
|
+
|
|
17
|
+
## Modern API: Class-based datasets (preferred)
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from datamaestro.definitions import dataset
|
|
21
|
+
from datamaestro.download.single import FileDownloader
|
|
22
|
+
|
|
23
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
24
|
+
class ProcessedMNIST(ImageClassification):
|
|
25
|
+
"""The MNIST database of handwritten digits."""
|
|
26
|
+
|
|
27
|
+
# Resources are class attributes — no decorators needed
|
|
28
|
+
TRAIN_IMAGES = FileDownloader(
|
|
29
|
+
"train_images.idx",
|
|
30
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
31
|
+
transient=True,
|
|
32
|
+
)
|
|
33
|
+
TRAIN_IMAGES_NP = NumpyTensorFile.from_idx(TRAIN_IMAGES)
|
|
34
|
+
|
|
35
|
+
TRAIN_LABELS = FileDownloader(
|
|
36
|
+
"train_labels.idx",
|
|
37
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
38
|
+
transient=True,
|
|
39
|
+
)
|
|
40
|
+
TRAIN_LABELS_NP = NumpyTensorFile.from_idx(TRAIN_LABELS)
|
|
41
|
+
|
|
42
|
+
TEST_IMAGES = FileDownloader(
|
|
43
|
+
"test_images.idx",
|
|
44
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
45
|
+
transient=True,
|
|
46
|
+
)
|
|
47
|
+
TEST_IMAGES_NP = NumpyTensorFile.from_idx(TEST_IMAGES)
|
|
48
|
+
|
|
49
|
+
TEST_LABELS = FileDownloader(
|
|
50
|
+
"test_labels.idx",
|
|
51
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
52
|
+
transient=True,
|
|
53
|
+
)
|
|
54
|
+
TEST_LABELS_NP = NumpyTensorFile.from_idx(TEST_LABELS)
|
|
55
|
+
|
|
56
|
+
@classmethod
|
|
57
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
58
|
+
return cls.C(
|
|
59
|
+
train=LabelledImages(
|
|
60
|
+
images=NumpyTensorFile(path=cls.TRAIN_IMAGES_NP.path),
|
|
61
|
+
labels=NumpyTensorFile(path=cls.TRAIN_LABELS_NP.path),
|
|
62
|
+
),
|
|
63
|
+
test=LabelledImages(
|
|
64
|
+
images=NumpyTensorFile(path=cls.TEST_IMAGES_NP.path),
|
|
65
|
+
labels=NumpyTensorFile(path=cls.TEST_LABELS_NP.path),
|
|
66
|
+
),
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Advantages:
|
|
71
|
+
|
|
72
|
+
1. **Explicit pipeline** — dependencies between resources are visible
|
|
73
|
+
2. **Transient intermediaries** — intermediate files can be deleted after processing
|
|
74
|
+
3. **No varname** — resource names are auto-detected from class attribute names
|
|
75
|
+
4. **Two-path safety** — incomplete downloads never appear at the final path
|
|
76
|
+
|
|
77
|
+
## Resource hierarchy
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Resource (ABC)
|
|
81
|
+
├── FileResource — produces a single file
|
|
82
|
+
├── FolderResource — produces a directory
|
|
83
|
+
├── ValueResource — produces an in-memory value (no files)
|
|
84
|
+
├── reference — references another dataset
|
|
85
|
+
└── Download — (deprecated alias for Resource)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### `ResourceState`
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
class ResourceState(str, Enum):
|
|
92
|
+
NONE = "none" # Not started
|
|
93
|
+
PARTIAL = "partial" # Started but incomplete
|
|
94
|
+
COMPLETE = "complete" # Fully available
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### `Resource` base class
|
|
98
|
+
|
|
99
|
+
| Property / Method | Description |
|
|
100
|
+
|---|---|
|
|
101
|
+
| `name: str` | Resource name (auto-set from class attribute name) |
|
|
102
|
+
| `dataset` | Back-reference to the owning `AbstractDataset` |
|
|
103
|
+
| `transient: bool` | Whether data can be deleted after dependents complete |
|
|
104
|
+
| `can_recover: bool` | Property. If True, PARTIAL data is preserved on error |
|
|
105
|
+
| `dependencies` | List of resources that must be COMPLETE first |
|
|
106
|
+
| `dependents` | Computed inverse of dependencies |
|
|
107
|
+
| `path: Path` | Final storage path (after COMPLETE) |
|
|
108
|
+
| `transient_path: Path` | Temp path where `download()` writes |
|
|
109
|
+
| `state: ResourceState` | Current state (from `.state.json` metadata file) |
|
|
110
|
+
| `download(force)` | Abstract. Execute download/processing step |
|
|
111
|
+
| `prepare()` | Abstract. Return value for dataset construction |
|
|
112
|
+
| `cleanup()` | Remove data from disk, set state to NONE |
|
|
113
|
+
| `has_files() -> bool` | Whether this resource produces files on disk |
|
|
114
|
+
| `bind(name, dataset)` | Bind to a dataset (called by framework) |
|
|
115
|
+
| `stream() -> IO \| None` | (FileResource only) Return byte stream or None |
|
|
116
|
+
|
|
117
|
+
### `FileResource`
|
|
118
|
+
|
|
119
|
+
Base for resources that produce a single file. Subclasses implement
|
|
120
|
+
`_download(destination: Path)`.
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
class MyFileResource(FileResource):
|
|
124
|
+
def __init__(self, filename, url, **kw):
|
|
125
|
+
super().__init__(filename, **kw)
|
|
126
|
+
self.url = url
|
|
127
|
+
|
|
128
|
+
def _download(self, destination: Path):
|
|
129
|
+
# Write to destination (which is self.transient_path)
|
|
130
|
+
...
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### `FolderResource`
|
|
134
|
+
|
|
135
|
+
Base for resources that produce a directory. Subclasses implement
|
|
136
|
+
`_download(destination: Path)`.
|
|
137
|
+
|
|
138
|
+
### `ValueResource`
|
|
139
|
+
|
|
140
|
+
Base for resources that produce in-memory values (no files on disk).
|
|
141
|
+
`has_files()` returns False.
|
|
142
|
+
|
|
143
|
+
## Custom resource handlers (modern)
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from datamaestro.download import FileResource
|
|
147
|
+
|
|
148
|
+
class MyProcessor(FileResource):
|
|
149
|
+
"""Process a source file into a numpy array."""
|
|
150
|
+
|
|
151
|
+
@property
|
|
152
|
+
def can_recover(self) -> bool:
|
|
153
|
+
return False # or True for resumable downloads
|
|
154
|
+
|
|
155
|
+
def __init__(self, filename, source, **kw):
|
|
156
|
+
super().__init__(filename, **kw)
|
|
157
|
+
self._dependencies = [source]
|
|
158
|
+
|
|
159
|
+
def _download(self, destination):
|
|
160
|
+
# Read from dependency, write to destination
|
|
161
|
+
source_path = self.dependencies[0].path
|
|
162
|
+
data = load(source_path)
|
|
163
|
+
save(process(data), destination)
|
|
164
|
+
|
|
165
|
+
@classmethod
|
|
166
|
+
def from_source(cls, source):
|
|
167
|
+
return cls("processed.npy", source)
|
|
168
|
+
|
|
169
|
+
# Factory alias
|
|
170
|
+
my_processor = MyProcessor.from_source
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Built-in resource types
|
|
174
|
+
|
|
175
|
+
| Class | Module | Factory alias | Base |
|
|
176
|
+
|---|---|---|---|
|
|
177
|
+
| `FileDownloader` | `download.single` | `filedownloader` | `FileResource` |
|
|
178
|
+
| `ConcatDownloader` | `download.single` | `concatdownload` | `FileResource` |
|
|
179
|
+
| `ZipDownloader` | `download.archive` | `zipdownloader` | `FolderResource` |
|
|
180
|
+
| `TarDownloader` | `download.archive` | `tardownloader` | `FolderResource` |
|
|
181
|
+
| `HFDownloader` | `download.huggingface` | `hf_download` | `ValueResource` |
|
|
182
|
+
| `custom_download` | `download.custom` | — | `Resource` |
|
|
183
|
+
| `links` | `download.links` | — | `Resource` |
|
|
184
|
+
| `linkfolder` | `download.links` | — | `Resource` |
|
|
185
|
+
| `linkfile` | `download.links` | — | `Resource` |
|
|
186
|
+
| `reference` | `download` | — | `Resource` |
|
|
187
|
+
|
|
188
|
+
## Two-path download flow
|
|
189
|
+
|
|
190
|
+
The framework (in `AbstractDataset.download()`) orchestrates:
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
1. Topological sort resources by dependencies
|
|
194
|
+
2. For each resource:
|
|
195
|
+
a. COMPLETE and not force → skip
|
|
196
|
+
b. PARTIAL and not can_recover → delete transient_path, set NONE
|
|
197
|
+
c. Call resource.download(force)
|
|
198
|
+
→ Resource writes to transient_path
|
|
199
|
+
d. On success: move transient_path → path, set COMPLETE
|
|
200
|
+
e. On failure: if can_recover → set PARTIAL, else delete → NONE
|
|
201
|
+
f. Eager cleanup: for each transient dependency with all
|
|
202
|
+
dependents COMPLETE → cleanup
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## State metadata file
|
|
206
|
+
|
|
207
|
+
Location: `<dataset.datapath>/.downloads/.state.json`
|
|
208
|
+
|
|
209
|
+
```json
|
|
210
|
+
{
|
|
211
|
+
"version": 1,
|
|
212
|
+
"resources": {
|
|
213
|
+
"TRAIN_IMAGES": {"state": "complete"},
|
|
214
|
+
"TRAIN_LABELS": {"state": "partial"}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Deprecated: decorator-based datasets
|
|
222
|
+
|
|
223
|
+
> **Deprecated.** The decorator-based API still works but emits deprecation
|
|
224
|
+
> warnings. Migrate to the class-based approach above.
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
# DEPRECATED — use class-based approach instead
|
|
228
|
+
@filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
|
|
229
|
+
@filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
|
|
230
|
+
@filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
|
|
231
|
+
@filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
|
|
232
|
+
@dataset(
|
|
233
|
+
ImageClassification,
|
|
234
|
+
url="http://yann.lecun.com/exdb/mnist/",
|
|
235
|
+
)
|
|
236
|
+
def MNIST(train_images, train_labels, test_images, test_labels):
|
|
237
|
+
"""The MNIST database"""
|
|
238
|
+
return {
|
|
239
|
+
"train": LabelledImages(
|
|
240
|
+
images=IDX(path=train_images),
|
|
241
|
+
labels=IDX(path=train_labels)
|
|
242
|
+
),
|
|
243
|
+
"test": LabelledImages(
|
|
244
|
+
images=IDX(path=test_images),
|
|
245
|
+
labels=IDX(path=test_labels)
|
|
246
|
+
),
|
|
247
|
+
}
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Deprecated names
|
|
251
|
+
|
|
252
|
+
| Deprecated | Replacement |
|
|
253
|
+
|---|---|
|
|
254
|
+
| `Download` (base class) | `Resource` |
|
|
255
|
+
| `hasfiles()` | `has_files()` |
|
|
256
|
+
| `Resource.definition` | `Resource.dataset` |
|
|
257
|
+
| `Resource.varname` | `Resource.name` |
|
|
258
|
+
| `@filedownloader(...)` (decorator) | `FileDownloader(...)` (class attr) |
|
|
259
|
+
| `SingleDownload` | `FileDownloader` |
|
|
260
|
+
|
|
261
|
+
### Deprecated custom handler pattern
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# DEPRECATED
|
|
265
|
+
class MyDownload(Download):
|
|
266
|
+
def __init__(self, varname, custom_param):
|
|
267
|
+
super().__init__(varname)
|
|
268
|
+
self.custom_param = custom_param
|
|
269
|
+
|
|
270
|
+
def prepare(self):
|
|
271
|
+
return self._download_and_process()
|
|
272
|
+
|
|
273
|
+
def download(self, force=False):
|
|
274
|
+
if force or not self._is_cached():
|
|
275
|
+
self._do_download()
|
|
276
|
+
|
|
277
|
+
def hasfiles(self) -> bool:
|
|
278
|
+
return True
|
|
279
|
+
|
|
280
|
+
def mydownloader(varname, custom_param):
|
|
281
|
+
def decorator(dataset):
|
|
282
|
+
download = MyDownload(varname, custom_param)
|
|
283
|
+
download.register(dataset)
|
|
284
|
+
return dataset
|
|
285
|
+
return decorator
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
Modern equivalent:
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
class MyDownload(FileResource):
|
|
292
|
+
def __init__(self, filename, custom_param, **kw):
|
|
293
|
+
super().__init__(filename, **kw)
|
|
294
|
+
self.custom_param = custom_param
|
|
295
|
+
|
|
296
|
+
def _download(self, destination):
|
|
297
|
+
# Write output to destination (self.transient_path)
|
|
298
|
+
self._do_download(destination)
|
|
299
|
+
|
|
300
|
+
mydownloader = MyDownload.apply
|
|
301
|
+
```
|
datamaestro/version.py
ADDED
|
@@ -1,39 +1,37 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version:
|
|
4
|
-
Summary:
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
Author-email: benjamin@piwowarski.fr
|
|
8
|
-
License: GPL-3
|
|
9
|
-
Keywords: dataset manager
|
|
10
|
-
Platform: any
|
|
3
|
+
Version: 1.7.0
|
|
4
|
+
Summary: Add your description here
|
|
5
|
+
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
|
+
License-File: LICENSE
|
|
11
7
|
Classifier: Development Status :: 4 - Beta
|
|
12
8
|
Classifier: Intended Audience :: Science/Research
|
|
13
9
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
14
10
|
Classifier: Operating System :: OS Independent
|
|
15
11
|
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
13
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
-
Requires-Python: >=3.
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
Requires-Dist: click
|
|
23
|
-
Requires-Dist:
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist: marshmallow
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
28
|
-
Requires-Dist: bitmath
|
|
29
|
-
Requires-Dist: experimaestro (>=0.9.5)
|
|
30
|
-
Requires-Dist: mkdocs
|
|
31
|
-
Requires-Dist: pymdown-extensions
|
|
32
|
-
Requires-Dist: mkdocs-material
|
|
33
|
-
Requires-Dist: docstring-parser
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: bitmath>=1.3.3.1
|
|
16
|
+
Requires-Dist: cached-property>=2.0.1
|
|
17
|
+
Requires-Dist: click>=8.2.1
|
|
18
|
+
Requires-Dist: docstring-parser>=0.16
|
|
19
|
+
Requires-Dist: experimaestro>=1.8.9
|
|
20
|
+
Requires-Dist: marshmallow>=3.26.1
|
|
21
|
+
Requires-Dist: mkdocs-material>=9.6.15
|
|
22
|
+
Requires-Dist: mkdocs>=1.6.1
|
|
34
23
|
Requires-Dist: numpy
|
|
35
|
-
|
|
36
|
-
Requires-Dist:
|
|
24
|
+
Requires-Dist: pymdown-extensions>=10.16
|
|
25
|
+
Requires-Dist: requests>=2.32.4
|
|
26
|
+
Requires-Dist: tqdm>=4.67.1
|
|
27
|
+
Requires-Dist: urllib3>=2.5.0
|
|
28
|
+
Provides-Extra: docs
|
|
29
|
+
Requires-Dist: myst-parser>0.18; extra == 'docs'
|
|
30
|
+
Requires-Dist: sphinx-codeautolink>=0.15; extra == 'docs'
|
|
31
|
+
Requires-Dist: sphinx-rtd-theme==1.2.2; extra == 'docs'
|
|
32
|
+
Requires-Dist: sphinx-toolbox>=4.1.2; extra == 'docs'
|
|
33
|
+
Requires-Dist: sphinx>=4.2; extra == 'docs'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
37
35
|
|
|
38
36
|
[PyPI version](https://badge.fury.io/py/datamaestro) [pre-commit](https://github.com/pre-commit/pre-commit) [DOI](https://zenodo.org/badge/latestdoi/4573876)
|
|
39
37
|
|
|
@@ -94,22 +92,10 @@ $ datamaestro search tag:image
|
|
|
94
92
|
[image] com.lecun.mnist
|
|
95
93
|
|
|
96
94
|
$ datamaestro prepare com.lecun.mnist
|
|
97
|
-
INFO:root:
|
|
98
|
-
INFO:root:
|
|
99
|
-
INFO:root:
|
|
100
|
-
INFO:root:Downloading
|
|
101
|
-
INFO:root:Transforming file
|
|
102
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
|
|
103
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
|
|
104
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
|
|
105
|
-
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
|
|
106
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
|
|
107
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
|
|
108
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
|
|
109
|
-
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
|
|
110
|
-
INFO:root:Transforming file
|
|
111
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
|
|
112
|
-
...JSON...
|
|
95
|
+
INFO:root:Materializing 4 resources
|
|
96
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
|
|
97
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
|
|
98
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
|
|
113
99
|
```
|
|
114
100
|
|
|
115
101
|
The previous command also writes a JSON document to standard output
|
|
@@ -147,68 +133,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
|
|
|
147
133
|
|
|
148
134
|
## Python definition of datasets
|
|
149
135
|
|
|
150
|
-
|
|
151
|
-
and
|
|
152
|
-
|
|
153
|
-
and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
|
|
154
|
-
|
|
155
|
-
Its syntax is described in the [documentation](http://experimaestro.github.io/datamaestro/).
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
For MNIST, this corresponds to.
|
|
136
|
+
Datasets are defined as Python classes with resource attributes that describe how
|
|
137
|
+
to download and process data. The framework automatically builds a dependency graph
|
|
138
|
+
and handles downloads with two-path safety and state tracking.
|
|
159
139
|
|
|
160
140
|
```python
|
|
161
|
-
from datamaestro_image.data import ImageClassification, LabelledImages
|
|
162
|
-
from datamaestro.download.single import filedownloader
|
|
163
|
-
from datamaestro.definitions import argument, datatasks, datatags, dataset
|
|
141
|
+
from datamaestro_image.data import ImageClassification, LabelledImages
|
|
164
142
|
from datamaestro.data.tensor import IDX
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
@
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
)
|
|
192
|
-
|
|
143
|
+
from datamaestro.download.single import FileDownloader
|
|
144
|
+
from datamaestro.definitions import AbstractDataset, dataset
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
148
|
+
class MNIST(ImageClassification):
|
|
149
|
+
"""The MNIST database of handwritten digits."""
|
|
150
|
+
|
|
151
|
+
TRAIN_IMAGES = FileDownloader(
|
|
152
|
+
"train_images.idx",
|
|
153
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
154
|
+
)
|
|
155
|
+
TRAIN_LABELS = FileDownloader(
|
|
156
|
+
"train_labels.idx",
|
|
157
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
158
|
+
)
|
|
159
|
+
TEST_IMAGES = FileDownloader(
|
|
160
|
+
"test_images.idx",
|
|
161
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
162
|
+
)
|
|
163
|
+
TEST_LABELS = FileDownloader(
|
|
164
|
+
"test_labels.idx",
|
|
165
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
@classmethod
|
|
169
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
170
|
+
return cls.C(
|
|
171
|
+
train=LabelledImages(
|
|
172
|
+
images=IDX(path=cls.TRAIN_IMAGES.path),
|
|
173
|
+
labels=IDX(path=cls.TRAIN_LABELS.path),
|
|
174
|
+
),
|
|
175
|
+
test=LabelledImages(
|
|
176
|
+
images=IDX(path=cls.TEST_IMAGES.path),
|
|
177
|
+
labels=IDX(path=cls.TEST_LABELS.path),
|
|
178
|
+
),
|
|
179
|
+
)
|
|
193
180
|
```
|
|
194
181
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
- Integration with other repositories: abstracting away the notion of dataset
|
|
198
|
-
- Repository prefix
|
|
199
|
-
- Set sub-datasets IDs automatically
|
|
200
|
-
|
|
201
|
-
# 0.7.3
|
|
202
|
-
|
|
203
|
-
- Updates for new experimaestro (0.8.5)
|
|
204
|
-
- Search types with "type:..."
|
|
205
|
-
|
|
206
|
-
# 0.6.17
|
|
207
|
-
|
|
208
|
-
- Allow remote access through rpyc
|
|
209
|
-
|
|
210
|
-
# 0.6.9
|
|
211
|
-
|
|
212
|
-
`version` command
|
|
213
|
-
|
|
214
|
-
|
|
182
|
+
Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
datamaestro/__init__.py,sha256=oh9M4VODuvTc9EFHKirtDxpCJkLUANzpzBOIwzHc_mw,246
|
|
2
|
+
datamaestro/__main__.py,sha256=22v54rQoO2umL1frFO2FOQuuRljr-Jw-ER-OATTpVxw,9218
|
|
3
|
+
datamaestro/context.py,sha256=AL2BTi6dLA8rDGBE0PFyfV9ua29JHvBgx6_w6hDj9Dg,13977
|
|
4
|
+
datamaestro/definitions.py,sha256=kIwyrXZWg1tZw3G1PuUyGJ13ZPunocmu0wuxydVesbQ,27167
|
|
5
|
+
datamaestro/record.py,sha256=e5fjRV3ni7ZxXwYH45bVDB_jpD-n9quvh4ie4uI-MM4,7140
|
|
6
|
+
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
|
+
datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
|
|
8
|
+
datamaestro/settings.py,sha256=NuUbe_C31GDlzdio2ryz7tPzuo4hsmmdCM5Cyuhqbzs,1294
|
|
9
|
+
datamaestro/sphinx.py,sha256=WWXB63gd0ZgEwFr_YwO2Hmuly5OoiFlu9mDvJSHFYuY,6966
|
|
10
|
+
datamaestro/utils.py,sha256=JUrvtVYnjNKRo0_ZypmXSQ9R4uOyImDjW1GZ14MYzKM,6547
|
|
11
|
+
datamaestro/v2.md,sha256=pLCxQUdfVkd4CM9Ie0ZxCnxUntqoA7k_0m7x1etcr7Y,9801
|
|
12
|
+
datamaestro/version.py,sha256=aCGW8aYYQ-ZQNfHZo9TrCX1MKqWbHUjj3X57h-DmRAs,171
|
|
13
|
+
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
14
|
+
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
15
|
+
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
|
|
17
|
+
datamaestro/commands/site.py,sha256=mVU5JKhwx9LTMf5FddcWgGh41qXtANJqB8qWKSKa-70,14432
|
|
18
|
+
datamaestro/data/__init__.py,sha256=s81ZxT8MQrBGkcu45xr4NaInIsMeunHOLnkLrJE47So,1496
|
|
19
|
+
datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
|
|
20
|
+
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
21
|
+
datamaestro/data/ml.py,sha256=4PlH6FJFZwtfTEStkOjOucV8t8yY8LFaPsnDBvEqAPs,710
|
|
22
|
+
datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
|
|
23
|
+
datamaestro/download/__init__.py,sha256=qbmSLtzo4zTLuc1cAVSAKDdbIJROsJa6BMP6ksVJWvU,19375
|
|
24
|
+
datamaestro/download/archive.py,sha256=fz1ElRggB9gYb6F7fek0Tkw9eAj6Glotc_Mit9OcCZU,6986
|
|
25
|
+
datamaestro/download/custom.py,sha256=dxyvwbweVuz0xveExtvta8xycoqTjpDZz_P98ucintA,1287
|
|
26
|
+
datamaestro/download/huggingface.py,sha256=inZbB5EdVvczW9CfM59SqL1Nl-H4y3bWxv1SWjrYeOs,1996
|
|
27
|
+
datamaestro/download/links.py,sha256=NCCpBFAIYznaskJV5NSFX3NoorqHDKyAiRCWSnEnb9E,4364
|
|
28
|
+
datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
|
|
29
|
+
datamaestro/download/multiple.py,sha256=iX3gtgQT1eskHok0pAecU_mgd56of1Sadz7_o95ItaA,2736
|
|
30
|
+
datamaestro/download/single.py,sha256=nFWBH1LeGO-WmMUBbdV6bzkd6Lfe74uhfcWeLZkCC3M,5737
|
|
31
|
+
datamaestro/download/sync.py,sha256=QlpoOkamiX9yE-4P8-ppCZ_wgA2P4oBSOQCX98gWnCc,784
|
|
32
|
+
datamaestro/download/todo.py,sha256=d-mfi_gJlrOvAoa7dXN2ecXYY-cgB-NHzU1J-dzkEkI,444
|
|
33
|
+
datamaestro/download/wayback.py,sha256=wpbrTtE321AwsO8Poj1a4qwEKy1kE0wEbxWgMEf5nLo,5489
|
|
34
|
+
datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
|
|
35
|
+
datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
|
|
36
|
+
datamaestro/stream/lines.py,sha256=DhptjIqhhAJ1tu3e-uoOepHHNALSXS8qz8ASUAyaSkM,2074
|
|
37
|
+
datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8yGA,346
|
|
38
|
+
datamaestro/test/__init__.py,sha256=9xXqLvUgiIn74AY6k8qyYX7rq6hWz7dOJFBrUgwuX88,61
|
|
39
|
+
datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
|
|
40
|
+
datamaestro/test/conftest.py,sha256=z8rF0OIKVuCgIYJ-4fQxQL8KhgIfg_4kfkIZNETfNJ0,793
|
|
41
|
+
datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
|
|
42
|
+
datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
|
|
43
|
+
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
44
|
+
datamaestro/test/test_resource.py,sha256=meUCDaoPg5XT3gWIToqXvaofE1vrZq_qG7gZtOHIOfQ,41044
|
|
45
|
+
datamaestro-1.7.0.dist-info/METADATA,sha256=tutvO9o9gHY7DLbF7zliiwcz2ajn7jnufkmotlA-cDQ,7433
|
|
46
|
+
datamaestro-1.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
47
|
+
datamaestro-1.7.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
48
|
+
datamaestro-1.7.0.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
49
|
+
datamaestro-1.7.0.dist-info/RECORD,,
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|