datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__main__.py +9 -5
- datamaestro/commands/site.py +16 -5
- datamaestro/data/ml.py +1 -0
- datamaestro/definitions.py +263 -19
- datamaestro/download/__init__.py +606 -45
- datamaestro/download/archive.py +120 -76
- datamaestro/download/custom.py +38 -6
- datamaestro/download/huggingface.py +46 -14
- datamaestro/download/links.py +116 -51
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +111 -54
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +3 -3
- datamaestro/record.py +48 -2
- datamaestro/settings.py +2 -1
- datamaestro/sphinx.py +1 -3
- datamaestro/stream/lines.py +8 -6
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/conftest.py +1 -2
- datamaestro/test/test_resource.py +1657 -0
- datamaestro/utils.py +7 -6
- datamaestro/v2.md +301 -0
- datamaestro/version.py +1 -1
- {datamaestro-1.6.2.dist-info → datamaestro-1.7.1.dist-info}/METADATA +46 -47
- datamaestro-1.7.1.dist-info/RECORD +49 -0
- datamaestro-1.6.2.dist-info/RECORD +0 -47
- {datamaestro-1.6.2.dist-info → datamaestro-1.7.1.dist-info}/WHEEL +0 -0
- {datamaestro-1.6.2.dist-info → datamaestro-1.7.1.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.6.2.dist-info → datamaestro-1.7.1.dist-info}/licenses/LICENSE +0 -0
datamaestro/utils.py
CHANGED
|
@@ -159,9 +159,9 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
|
|
|
159
159
|
response = requests.get(url, stream=True)
|
|
160
160
|
|
|
161
161
|
# Valid response
|
|
162
|
-
assert (
|
|
163
|
-
|
|
164
|
-
)
|
|
162
|
+
assert response.status_code >= 200 and response.status_code < 300, (
|
|
163
|
+
f"Status code is not 2XX ({response.status_code})"
|
|
164
|
+
)
|
|
165
165
|
|
|
166
166
|
# Get the total size (or use the provided one)
|
|
167
167
|
total_size = int(response.headers.get("content-length", size or 0))
|
|
@@ -171,9 +171,10 @@ def downloadURL(url: str, path: Path, resume: bool = False, size: int = None):
|
|
|
171
171
|
total_size += pos
|
|
172
172
|
|
|
173
173
|
CHUNK_SIZE = 1024
|
|
174
|
-
with
|
|
175
|
-
|
|
176
|
-
|
|
174
|
+
with (
|
|
175
|
+
path.open("ab") as f,
|
|
176
|
+
tqdm(initial=pos, total=total_size, unit_scale=True, unit="B") as t,
|
|
177
|
+
):
|
|
177
178
|
for data in response.iter_content(chunk_size=CHUNK_SIZE):
|
|
178
179
|
f.write(data)
|
|
179
180
|
t.update(len(data))
|
datamaestro/v2.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
# Resource Interface (v2)
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Resources represent steps in a dataset preparation pipeline. They form a
|
|
6
|
+
directed acyclic graph (DAG) where each resource can depend on other resources.
|
|
7
|
+
|
|
8
|
+
Key concepts:
|
|
9
|
+
|
|
10
|
+
- **Two-path system**: resources write to `transient_path` during download,
|
|
11
|
+
then the framework moves data to `path` and marks the resource as COMPLETE.
|
|
12
|
+
- **Three states**: NONE, PARTIAL, COMPLETE (persisted in `.state.json`)
|
|
13
|
+
- **Transient resources**: intermediate resources that can be deleted after all
|
|
14
|
+
dependents are COMPLETE (eager cleanup)
|
|
15
|
+
- **`can_recover` property**: subclasses override to preserve PARTIAL data on error
|
|
16
|
+
|
|
17
|
+
## Modern API: Class-based datasets (preferred)
|
|
18
|
+
|
|
19
|
+
```python
|
|
20
|
+
from datamaestro.definitions import dataset
|
|
21
|
+
from datamaestro.download.single import FileDownloader
|
|
22
|
+
|
|
23
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
24
|
+
class ProcessedMNIST(ImageClassification):
|
|
25
|
+
"""The MNIST database of handwritten digits."""
|
|
26
|
+
|
|
27
|
+
# Resources are class attributes — no decorators needed
|
|
28
|
+
TRAIN_IMAGES = FileDownloader(
|
|
29
|
+
"train_images.idx",
|
|
30
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
31
|
+
transient=True,
|
|
32
|
+
)
|
|
33
|
+
TRAIN_IMAGES_NP = NumpyTensorFile.from_idx(TRAIN_IMAGES)
|
|
34
|
+
|
|
35
|
+
TRAIN_LABELS = FileDownloader(
|
|
36
|
+
"train_labels.idx",
|
|
37
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
38
|
+
transient=True,
|
|
39
|
+
)
|
|
40
|
+
TRAIN_LABELS_NP = NumpyTensorFile.from_idx(TRAIN_LABELS)
|
|
41
|
+
|
|
42
|
+
TEST_IMAGES = FileDownloader(
|
|
43
|
+
"test_images.idx",
|
|
44
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
45
|
+
transient=True,
|
|
46
|
+
)
|
|
47
|
+
TEST_IMAGES_NP = NumpyTensorFile.from_idx(TEST_IMAGES)
|
|
48
|
+
|
|
49
|
+
TEST_LABELS = FileDownloader(
|
|
50
|
+
"test_labels.idx",
|
|
51
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
52
|
+
transient=True,
|
|
53
|
+
)
|
|
54
|
+
TEST_LABELS_NP = NumpyTensorFile.from_idx(TEST_LABELS)
|
|
55
|
+
|
|
56
|
+
@classmethod
|
|
57
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
58
|
+
return cls.C(
|
|
59
|
+
train=LabelledImages(
|
|
60
|
+
images=NumpyTensorFile(path=cls.TRAIN_IMAGES_NP.path),
|
|
61
|
+
labels=NumpyTensorFile(path=cls.TRAIN_LABELS_NP.path),
|
|
62
|
+
),
|
|
63
|
+
test=LabelledImages(
|
|
64
|
+
images=NumpyTensorFile(path=cls.TEST_IMAGES_NP.path),
|
|
65
|
+
labels=NumpyTensorFile(path=cls.TEST_LABELS_NP.path),
|
|
66
|
+
),
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Advantages:
|
|
71
|
+
|
|
72
|
+
1. **Explicit pipeline** — dependencies between resources are visible
|
|
73
|
+
2. **Transient intermediaries** — intermediate files can be deleted after processing
|
|
74
|
+
3. **No varname** — resource names are auto-detected from class attribute names
|
|
75
|
+
4. **Two-path safety** — incomplete downloads never appear at the final path
|
|
76
|
+
|
|
77
|
+
## Resource hierarchy
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Resource (ABC)
|
|
81
|
+
├── FileResource — produces a single file
|
|
82
|
+
├── FolderResource — produces a directory
|
|
83
|
+
├── ValueResource — produces an in-memory value (no files)
|
|
84
|
+
├── reference — references another dataset
|
|
85
|
+
└── Download — (deprecated alias for Resource)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### `ResourceState`
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
class ResourceState(str, Enum):
|
|
92
|
+
NONE = "none" # Not started
|
|
93
|
+
PARTIAL = "partial" # Started but incomplete
|
|
94
|
+
COMPLETE = "complete" # Fully available
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### `Resource` base class
|
|
98
|
+
|
|
99
|
+
| Property / Method | Description |
|
|
100
|
+
|---|---|
|
|
101
|
+
| `name: str` | Resource name (auto-set from class attribute name) |
|
|
102
|
+
| `dataset` | Back-reference to the owning `AbstractDataset` |
|
|
103
|
+
| `transient: bool` | Whether data can be deleted after dependents complete |
|
|
104
|
+
| `can_recover: bool` | Property. If True, PARTIAL data is preserved on error |
|
|
105
|
+
| `dependencies` | List of resources that must be COMPLETE first |
|
|
106
|
+
| `dependents` | Computed inverse of dependencies |
|
|
107
|
+
| `path: Path` | Final storage path (after COMPLETE) |
|
|
108
|
+
| `transient_path: Path` | Temp path where `download()` writes |
|
|
109
|
+
| `state: ResourceState` | Current state (from `.state.json` metadata file) |
|
|
110
|
+
| `download(force)` | Abstract. Execute download/processing step |
|
|
111
|
+
| `prepare()` | Abstract. Return value for dataset construction |
|
|
112
|
+
| `cleanup()` | Remove data from disk, set state to NONE |
|
|
113
|
+
| `has_files() -> bool` | Whether this resource produces files on disk |
|
|
114
|
+
| `bind(name, dataset)` | Bind to a dataset (called by framework) |
|
|
115
|
+
| `stream() -> IO \| None` | (FileResource only) Return byte stream or None |
|
|
116
|
+
|
|
117
|
+
### `FileResource`
|
|
118
|
+
|
|
119
|
+
Base for resources that produce a single file. Subclasses implement
|
|
120
|
+
`_download(destination: Path)`.
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
class MyFileResource(FileResource):
|
|
124
|
+
def __init__(self, filename, url, **kw):
|
|
125
|
+
super().__init__(filename, **kw)
|
|
126
|
+
self.url = url
|
|
127
|
+
|
|
128
|
+
def _download(self, destination: Path):
|
|
129
|
+
# Write to destination (which is self.transient_path)
|
|
130
|
+
...
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### `FolderResource`
|
|
134
|
+
|
|
135
|
+
Base for resources that produce a directory. Subclasses implement
|
|
136
|
+
`_download(destination: Path)`.
|
|
137
|
+
|
|
138
|
+
### `ValueResource`
|
|
139
|
+
|
|
140
|
+
Base for resources that produce in-memory values (no files on disk).
|
|
141
|
+
`has_files()` returns False.
|
|
142
|
+
|
|
143
|
+
## Custom resource handlers (modern)
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from datamaestro.download import FileResource
|
|
147
|
+
|
|
148
|
+
class MyProcessor(FileResource):
|
|
149
|
+
"""Process a source file into a numpy array."""
|
|
150
|
+
|
|
151
|
+
@property
|
|
152
|
+
def can_recover(self) -> bool:
|
|
153
|
+
return False # or True for resumable downloads
|
|
154
|
+
|
|
155
|
+
def __init__(self, filename, source, **kw):
|
|
156
|
+
super().__init__(filename, **kw)
|
|
157
|
+
self._dependencies = [source]
|
|
158
|
+
|
|
159
|
+
def _download(self, destination):
|
|
160
|
+
# Read from dependency, write to destination
|
|
161
|
+
source_path = self.dependencies[0].path
|
|
162
|
+
data = load(source_path)
|
|
163
|
+
save(process(data), destination)
|
|
164
|
+
|
|
165
|
+
@classmethod
|
|
166
|
+
def from_source(cls, source):
|
|
167
|
+
return cls("processed.npy", source)
|
|
168
|
+
|
|
169
|
+
# Factory alias
|
|
170
|
+
my_processor = MyProcessor.from_source
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Built-in resource types
|
|
174
|
+
|
|
175
|
+
| Class | Module | Factory alias | Base |
|
|
176
|
+
|---|---|---|---|
|
|
177
|
+
| `FileDownloader` | `download.single` | `filedownloader` | `FileResource` |
|
|
178
|
+
| `ConcatDownloader` | `download.single` | `concatdownload` | `FileResource` |
|
|
179
|
+
| `ZipDownloader` | `download.archive` | `zipdownloader` | `FolderResource` |
|
|
180
|
+
| `TarDownloader` | `download.archive` | `tardownloader` | `FolderResource` |
|
|
181
|
+
| `HFDownloader` | `download.huggingface` | `hf_download` | `ValueResource` |
|
|
182
|
+
| `custom_download` | `download.custom` | — | `Resource` |
|
|
183
|
+
| `links` | `download.links` | — | `Resource` |
|
|
184
|
+
| `linkfolder` | `download.links` | — | `Resource` |
|
|
185
|
+
| `linkfile` | `download.links` | — | `Resource` |
|
|
186
|
+
| `reference` | `download` | — | `Resource` |
|
|
187
|
+
|
|
188
|
+
## Two-path download flow
|
|
189
|
+
|
|
190
|
+
The framework (in `AbstractDataset.download()`) orchestrates:
|
|
191
|
+
|
|
192
|
+
```
|
|
193
|
+
1. Topological sort resources by dependencies
|
|
194
|
+
2. For each resource:
|
|
195
|
+
a. COMPLETE and not force → skip
|
|
196
|
+
b. PARTIAL and not can_recover → delete transient_path, set NONE
|
|
197
|
+
c. Call resource.download(force)
|
|
198
|
+
→ Resource writes to transient_path
|
|
199
|
+
d. On success: move transient_path → path, set COMPLETE
|
|
200
|
+
e. On failure: if can_recover → set PARTIAL, else delete → NONE
|
|
201
|
+
f. Eager cleanup: for each transient dependency with all
|
|
202
|
+
dependents COMPLETE → cleanup
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## State metadata file
|
|
206
|
+
|
|
207
|
+
Location: `<dataset.datapath>/.downloads/.state.json`
|
|
208
|
+
|
|
209
|
+
```json
|
|
210
|
+
{
|
|
211
|
+
"version": 1,
|
|
212
|
+
"resources": {
|
|
213
|
+
"TRAIN_IMAGES": {"state": "complete"},
|
|
214
|
+
"TRAIN_LABELS": {"state": "partial"}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
## Deprecated: decorator-based datasets
|
|
222
|
+
|
|
223
|
+
> **Deprecated.** The decorator-based API still works but emits deprecation
|
|
224
|
+
> warnings. Migrate to the class-based approach above.
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
# DEPRECATED — use class-based approach instead
|
|
228
|
+
@filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
|
|
229
|
+
@filedownloader("train_labels.idx", "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
|
|
230
|
+
@filedownloader("test_images.idx", "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
|
|
231
|
+
@filedownloader("test_labels.idx", "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
|
|
232
|
+
@dataset(
|
|
233
|
+
ImageClassification,
|
|
234
|
+
url="http://yann.lecun.com/exdb/mnist/",
|
|
235
|
+
)
|
|
236
|
+
def MNIST(train_images, train_labels, test_images, test_labels):
|
|
237
|
+
"""The MNIST database"""
|
|
238
|
+
return {
|
|
239
|
+
"train": LabelledImages(
|
|
240
|
+
images=IDX(path=train_images),
|
|
241
|
+
labels=IDX(path=train_labels)
|
|
242
|
+
),
|
|
243
|
+
"test": LabelledImages(
|
|
244
|
+
images=IDX(path=test_images),
|
|
245
|
+
labels=IDX(path=test_labels)
|
|
246
|
+
),
|
|
247
|
+
}
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Deprecated names
|
|
251
|
+
|
|
252
|
+
| Deprecated | Replacement |
|
|
253
|
+
|---|---|
|
|
254
|
+
| `Download` (base class) | `Resource` |
|
|
255
|
+
| `hasfiles()` | `has_files()` |
|
|
256
|
+
| `Resource.definition` | `Resource.dataset` |
|
|
257
|
+
| `Resource.varname` | `Resource.name` |
|
|
258
|
+
| `@filedownloader(...)` (decorator) | `FileDownloader(...)` (class attr) |
|
|
259
|
+
| `SingleDownload` | `FileDownloader` |
|
|
260
|
+
|
|
261
|
+
### Deprecated custom handler pattern
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
# DEPRECATED
|
|
265
|
+
class MyDownload(Download):
|
|
266
|
+
def __init__(self, varname, custom_param):
|
|
267
|
+
super().__init__(varname)
|
|
268
|
+
self.custom_param = custom_param
|
|
269
|
+
|
|
270
|
+
def prepare(self):
|
|
271
|
+
return self._download_and_process()
|
|
272
|
+
|
|
273
|
+
def download(self, force=False):
|
|
274
|
+
if force or not self._is_cached():
|
|
275
|
+
self._do_download()
|
|
276
|
+
|
|
277
|
+
def hasfiles(self) -> bool:
|
|
278
|
+
return True
|
|
279
|
+
|
|
280
|
+
def mydownloader(varname, custom_param):
|
|
281
|
+
def decorator(dataset):
|
|
282
|
+
download = MyDownload(varname, custom_param)
|
|
283
|
+
download.register(dataset)
|
|
284
|
+
return dataset
|
|
285
|
+
return decorator
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
Modern equivalent:
|
|
289
|
+
|
|
290
|
+
```python
|
|
291
|
+
class MyDownload(FileResource):
|
|
292
|
+
def __init__(self, filename, custom_param, **kw):
|
|
293
|
+
super().__init__(filename, **kw)
|
|
294
|
+
self.custom_param = custom_param
|
|
295
|
+
|
|
296
|
+
def _download(self, destination):
|
|
297
|
+
# Write output to destination (self.transient_path)
|
|
298
|
+
self._do_download(destination)
|
|
299
|
+
|
|
300
|
+
mydownloader = MyDownload.apply
|
|
301
|
+
```
|
datamaestro/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.6.2
|
|
3
|
+
Version: 1.7.1
|
|
4
4
|
Summary: Add your description here
|
|
5
5
|
Author-email: Benjamin Piwowarski <benjamin@piwowarski.fr>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -25,6 +25,12 @@ Requires-Dist: pymdown-extensions>=10.16
|
|
|
25
25
|
Requires-Dist: requests>=2.32.4
|
|
26
26
|
Requires-Dist: tqdm>=4.67.1
|
|
27
27
|
Requires-Dist: urllib3>=2.5.0
|
|
28
|
+
Provides-Extra: docs
|
|
29
|
+
Requires-Dist: myst-parser>0.18; extra == 'docs'
|
|
30
|
+
Requires-Dist: sphinx-codeautolink>=0.15; extra == 'docs'
|
|
31
|
+
Requires-Dist: sphinx-rtd-theme==1.2.2; extra == 'docs'
|
|
32
|
+
Requires-Dist: sphinx-toolbox>=4.1.2; extra == 'docs'
|
|
33
|
+
Requires-Dist: sphinx>=4.2; extra == 'docs'
|
|
28
34
|
Description-Content-Type: text/markdown
|
|
29
35
|
|
|
30
36
|
[PyPI version](https://badge.fury.io/py/datamaestro) [pre-commit](https://github.com/pre-commit/pre-commit) [DOI](https://zenodo.org/badge/latestdoi/4573876)
|
|
@@ -127,57 +133,50 @@ Out[3]: (dtype('uint8'), (60000, 28, 28))
|
|
|
127
133
|
|
|
128
134
|
## Python definition of datasets
|
|
129
135
|
|
|
130
|
-
|
|
131
|
-
and
|
|
132
|
-
|
|
133
|
-
and is integrated with [experimaestro](http://experimaestro.github.io/experimaestro-python).
|
|
136
|
+
Datasets are defined as Python classes with resource attributes that describe how
|
|
137
|
+
to download and process data. The framework automatically builds a dependency graph
|
|
138
|
+
and handles downloads with two-path safety and state tracking.
|
|
134
139
|
|
|
135
|
-
|
|
140
|
+
```python
|
|
141
|
+
from datamaestro_image.data import ImageClassification, LabelledImages
|
|
142
|
+
from datamaestro.data.tensor import IDX
|
|
143
|
+
from datamaestro.download.single import FileDownloader
|
|
144
|
+
from datamaestro.definitions import AbstractDataset, dataset
|
|
136
145
|
|
|
137
146
|
|
|
138
|
-
|
|
147
|
+
@dataset(url="http://yann.lecun.com/exdb/mnist/")
|
|
148
|
+
class MNIST(ImageClassification):
|
|
149
|
+
"""The MNIST database of handwritten digits."""
|
|
139
150
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
return ImageClassification(
|
|
156
|
-
train=LabelledImages(
|
|
157
|
-
images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
|
|
158
|
-
),
|
|
159
|
-
test=LabelledImages(
|
|
160
|
-
images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
|
|
161
|
-
),
|
|
151
|
+
TRAIN_IMAGES = FileDownloader(
|
|
152
|
+
"train_images.idx",
|
|
153
|
+
"http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
|
|
154
|
+
)
|
|
155
|
+
TRAIN_LABELS = FileDownloader(
|
|
156
|
+
"train_labels.idx",
|
|
157
|
+
"http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz",
|
|
158
|
+
)
|
|
159
|
+
TEST_IMAGES = FileDownloader(
|
|
160
|
+
"test_images.idx",
|
|
161
|
+
"http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
|
|
162
|
+
)
|
|
163
|
+
TEST_LABELS = FileDownloader(
|
|
164
|
+
"test_labels.idx",
|
|
165
|
+
"http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz",
|
|
162
166
|
)
|
|
163
|
-
```
|
|
164
167
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
digits have been size-normalized and centered in a fixed-size image.
|
|
168
|
+
@classmethod
|
|
169
|
+
def __create_dataset__(cls, dataset: AbstractDataset):
|
|
170
|
+
return cls.C(
|
|
171
|
+
train=LabelledImages(
|
|
172
|
+
images=IDX(path=cls.TRAIN_IMAGES.path),
|
|
173
|
+
labels=IDX(path=cls.TRAIN_LABELS.path),
|
|
174
|
+
),
|
|
175
|
+
test=LabelledImages(
|
|
176
|
+
images=IDX(path=cls.TEST_IMAGES.path),
|
|
177
|
+
labels=IDX(path=cls.TEST_LABELS.path),
|
|
178
|
+
),
|
|
179
|
+
)
|
|
178
180
|
```
|
|
179
181
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
1. Document the dataset
|
|
183
|
-
2. Allow to use the command line interface to manipulate it (download resources, etc.)
|
|
182
|
+
The dataset definition syntax is described in the [documentation](https://datamaestro.readthedocs.io).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
datamaestro/__init__.py,sha256=oh9M4VODuvTc9EFHKirtDxpCJkLUANzpzBOIwzHc_mw,246
|
|
2
|
+
datamaestro/__main__.py,sha256=22v54rQoO2umL1frFO2FOQuuRljr-Jw-ER-OATTpVxw,9218
|
|
3
|
+
datamaestro/context.py,sha256=AL2BTi6dLA8rDGBE0PFyfV9ua29JHvBgx6_w6hDj9Dg,13977
|
|
4
|
+
datamaestro/definitions.py,sha256=xo-MhpQHcUPNFJtkdWOEp1jC-7pbv0TREJKVS0iDVh8,27979
|
|
5
|
+
datamaestro/record.py,sha256=e5fjRV3ni7ZxXwYH45bVDB_jpD-n9quvh4ie4uI-MM4,7140
|
|
6
|
+
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
|
+
datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
|
|
8
|
+
datamaestro/settings.py,sha256=NuUbe_C31GDlzdio2ryz7tPzuo4hsmmdCM5Cyuhqbzs,1294
|
|
9
|
+
datamaestro/sphinx.py,sha256=WWXB63gd0ZgEwFr_YwO2Hmuly5OoiFlu9mDvJSHFYuY,6966
|
|
10
|
+
datamaestro/utils.py,sha256=JUrvtVYnjNKRo0_ZypmXSQ9R4uOyImDjW1GZ14MYzKM,6547
|
|
11
|
+
datamaestro/v2.md,sha256=pLCxQUdfVkd4CM9Ie0ZxCnxUntqoA7k_0m7x1etcr7Y,9801
|
|
12
|
+
datamaestro/version.py,sha256=Hy65VR_YBBs2cTGjk4KdU_bIDh0FtKY39zpOkoQNGIE,171
|
|
13
|
+
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
14
|
+
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
15
|
+
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
|
|
17
|
+
datamaestro/commands/site.py,sha256=mVU5JKhwx9LTMf5FddcWgGh41qXtANJqB8qWKSKa-70,14432
|
|
18
|
+
datamaestro/data/__init__.py,sha256=s81ZxT8MQrBGkcu45xr4NaInIsMeunHOLnkLrJE47So,1496
|
|
19
|
+
datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
|
|
20
|
+
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
21
|
+
datamaestro/data/ml.py,sha256=4PlH6FJFZwtfTEStkOjOucV8t8yY8LFaPsnDBvEqAPs,710
|
|
22
|
+
datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
|
|
23
|
+
datamaestro/download/__init__.py,sha256=az_H9i5ynY_tsnjYiBiFMzTzeaOoEMaxh-knj57tSLc,20087
|
|
24
|
+
datamaestro/download/archive.py,sha256=fz1ElRggB9gYb6F7fek0Tkw9eAj6Glotc_Mit9OcCZU,6986
|
|
25
|
+
datamaestro/download/custom.py,sha256=dxyvwbweVuz0xveExtvta8xycoqTjpDZz_P98ucintA,1287
|
|
26
|
+
datamaestro/download/huggingface.py,sha256=inZbB5EdVvczW9CfM59SqL1Nl-H4y3bWxv1SWjrYeOs,1996
|
|
27
|
+
datamaestro/download/links.py,sha256=m5KX93Xp7WDFEgELvAG1PbBGCIrs401u7KMZwVHrlp0,4688
|
|
28
|
+
datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
|
|
29
|
+
datamaestro/download/multiple.py,sha256=iX3gtgQT1eskHok0pAecU_mgd56of1Sadz7_o95ItaA,2736
|
|
30
|
+
datamaestro/download/single.py,sha256=nFWBH1LeGO-WmMUBbdV6bzkd6Lfe74uhfcWeLZkCC3M,5737
|
|
31
|
+
datamaestro/download/sync.py,sha256=QlpoOkamiX9yE-4P8-ppCZ_wgA2P4oBSOQCX98gWnCc,784
|
|
32
|
+
datamaestro/download/todo.py,sha256=d-mfi_gJlrOvAoa7dXN2ecXYY-cgB-NHzU1J-dzkEkI,444
|
|
33
|
+
datamaestro/download/wayback.py,sha256=wpbrTtE321AwsO8Poj1a4qwEKy1kE0wEbxWgMEf5nLo,5489
|
|
34
|
+
datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
|
|
35
|
+
datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
|
|
36
|
+
datamaestro/stream/lines.py,sha256=DhptjIqhhAJ1tu3e-uoOepHHNALSXS8qz8ASUAyaSkM,2074
|
|
37
|
+
datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8yGA,346
|
|
38
|
+
datamaestro/test/__init__.py,sha256=9xXqLvUgiIn74AY6k8qyYX7rq6hWz7dOJFBrUgwuX88,61
|
|
39
|
+
datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
|
|
40
|
+
datamaestro/test/conftest.py,sha256=z8rF0OIKVuCgIYJ-4fQxQL8KhgIfg_4kfkIZNETfNJ0,793
|
|
41
|
+
datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
|
|
42
|
+
datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
|
|
43
|
+
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
44
|
+
datamaestro/test/test_resource.py,sha256=QbwmZkGv_8O_jI0CKcatJSUs3IKbMfBrk0T_aTC1KcE,51124
|
|
45
|
+
datamaestro-1.7.1.dist-info/METADATA,sha256=7voV0DURyp-8ShRiMFK1wGE5SoMrTHL4U6if3dYvB9I,7433
|
|
46
|
+
datamaestro-1.7.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
47
|
+
datamaestro-1.7.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
48
|
+
datamaestro-1.7.1.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
49
|
+
datamaestro-1.7.1.dist-info/RECORD,,
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
datamaestro/__init__.py,sha256=oh9M4VODuvTc9EFHKirtDxpCJkLUANzpzBOIwzHc_mw,246
|
|
2
|
-
datamaestro/__main__.py,sha256=jbwzt-8Yhu4KjCkbBqsGX0yUx67IOE3Nmrc6qlGdGjs,9206
|
|
3
|
-
datamaestro/context.py,sha256=AL2BTi6dLA8rDGBE0PFyfV9ua29JHvBgx6_w6hDj9Dg,13977
|
|
4
|
-
datamaestro/definitions.py,sha256=byJyuh1AJ03zcaeEYOcbJQwjVI8cYJK2rrA_vAE8O1s,19776
|
|
5
|
-
datamaestro/record.py,sha256=IxxcrSIf99iluohtpnuMBTFkqeHRe5S-T_hWEqBgeME,5812
|
|
6
|
-
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
|
-
datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
|
|
8
|
-
datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
|
|
9
|
-
datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
|
|
10
|
-
datamaestro/utils.py,sha256=9m-AVVww6InAZfGFiGy6XJzfExpYNqH1fhWQEezjafA,6536
|
|
11
|
-
datamaestro/version.py,sha256=_c7uZJ1tNg6l2QN9t8gbOsmSW56keOdPFS_09TZT714,171
|
|
12
|
-
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
13
|
-
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
14
|
-
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
|
|
16
|
-
datamaestro/commands/site.py,sha256=7H9c-ZlXt7bUlldHn8fMebzDKS7B7ijPNKrxHXMG-Lk,14233
|
|
17
|
-
datamaestro/data/__init__.py,sha256=s81ZxT8MQrBGkcu45xr4NaInIsMeunHOLnkLrJE47So,1496
|
|
18
|
-
datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
|
|
19
|
-
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
20
|
-
datamaestro/data/ml.py,sha256=7Rv4Tb9g17HDj8mOBJpIDjgolGQAd5Wrb0mHlnm-bPE,709
|
|
21
|
-
datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
|
|
22
|
-
datamaestro/download/__init__.py,sha256=EBoAcw2wErS8ymEYs7LJKez4UO-Gwhe4YgqRAysOxRY,2865
|
|
23
|
-
datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
|
|
24
|
-
datamaestro/download/custom.py,sha256=DUjDVAWuHC6sV_apMQb44Yjd6HUXkHY6Ob52FQY3t-M,587
|
|
25
|
-
datamaestro/download/huggingface.py,sha256=b4Y437ATYrugdkvqZrPQmqiXXSrmYyqEKDVI0wnIGDE,1125
|
|
26
|
-
datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
|
|
27
|
-
datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
|
|
28
|
-
datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0LqM,2159
|
|
29
|
-
datamaestro/download/single.py,sha256=fCIfZdR14YN09MQTgcxL21PWu5CjELfIClgWjFpR5mg,4148
|
|
30
|
-
datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
|
|
31
|
-
datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
|
|
32
|
-
datamaestro/download/wayback.py,sha256=7XuWoLkmHR65wVDv3YnL3fiMtSrjKelk3UDI9ua_t8c,5504
|
|
33
|
-
datamaestro/stream/__init__.py,sha256=Angu_Yg9rNKXb8s4at-DXYcnE-OTgSMLfUEfrL6APD8,896
|
|
34
|
-
datamaestro/stream/compress.py,sha256=0ViFGpJc6pdvZGUNERE-3XV8jAOTSvhJurb2t0NW2eU,260
|
|
35
|
-
datamaestro/stream/lines.py,sha256=UNGcyZlZxN0Q7kw717jbhZFdDVmtfJfkJZCgK7xzF9A,1996
|
|
36
|
-
datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8yGA,346
|
|
37
|
-
datamaestro/test/__init__.py,sha256=8-oxS68ufD45pv_HldE4S4rSWFF6L-UB_Cms-72DD2M,22
|
|
38
|
-
datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
|
|
39
|
-
datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,767
|
|
40
|
-
datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
|
|
41
|
-
datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
|
|
42
|
-
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
43
|
-
datamaestro-1.6.2.dist-info/METADATA,sha256=A1NXq-dTeuS-JIBCe9-kgoydkfuxIDeO9G-Imf0t-5w,7635
|
|
44
|
-
datamaestro-1.6.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
45
|
-
datamaestro-1.6.2.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
46
|
-
datamaestro-1.6.2.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
47
|
-
datamaestro-1.6.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|