datamaestro 1.3.2__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +0 -2
- datamaestro/__main__.py +13 -9
- datamaestro/context.py +0 -5
- datamaestro/definitions.py +29 -29
- datamaestro/download/__init__.py +3 -3
- datamaestro/download/custom.py +5 -13
- datamaestro/download/huggingface.py +1 -1
- datamaestro/download/single.py +2 -16
- datamaestro/version.py +2 -2
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/METADATA +38 -40
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/RECORD +15 -17
- datamaestro/datasets/__init__.py +0 -0
- datamaestro/datasets/yaml_repository.py +0 -103
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/WHEEL +0 -0
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/licenses/LICENSE +0 -0
- {datamaestro-1.3.2.dist-info → datamaestro-1.4.1.dist-info}/top_level.txt +0 -0
datamaestro/__init__.py
CHANGED
datamaestro/__main__.py
CHANGED
|
@@ -319,13 +319,17 @@ def search(config: Config, searchterms):
|
|
|
319
319
|
|
|
320
320
|
logging.debug("Search: %s", condition)
|
|
321
321
|
for dataset in config.context.datasets():
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
322
|
+
try:
|
|
323
|
+
if condition.match(dataset):
|
|
324
|
+
cfg = dataset.configtype
|
|
325
|
+
print(
|
|
326
|
+
"[%s] %s (%s)"
|
|
327
|
+
% (
|
|
328
|
+
dataset.repository.id,
|
|
329
|
+
dataset.id,
|
|
330
|
+
cfg.__name__ if cfg is not None else "?",
|
|
331
|
+
)
|
|
330
332
|
)
|
|
331
|
-
|
|
333
|
+
except Exception:
|
|
334
|
+
logging.error("Error while matching with dataset %s", dataset)
|
|
335
|
+
raise
|
datamaestro/context.py
CHANGED
datamaestro/definitions.py
CHANGED
|
@@ -7,7 +7,6 @@ import inspect
|
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from itertools import chain
|
|
9
9
|
from abc import ABC, abstractmethod
|
|
10
|
-
from contextlib import contextmanager
|
|
11
10
|
import traceback
|
|
12
11
|
from typing import (
|
|
13
12
|
Dict,
|
|
@@ -19,7 +18,6 @@ from typing import (
|
|
|
19
18
|
Callable,
|
|
20
19
|
TYPE_CHECKING,
|
|
21
20
|
Union,
|
|
22
|
-
ClassVar,
|
|
23
21
|
_GenericAlias,
|
|
24
22
|
)
|
|
25
23
|
from experimaestro import ( # noqa: F401 (re-exports)
|
|
@@ -104,7 +102,7 @@ class DataDefinition(AbstractData):
|
|
|
104
102
|
if components[0] == "datamaestro":
|
|
105
103
|
longest_ix = 0
|
|
106
104
|
|
|
107
|
-
return repository, components[(longest_ix + 1) :]
|
|
105
|
+
return repository, [s.lower() for s in components[(longest_ix + 1) :]]
|
|
108
106
|
|
|
109
107
|
def ancestors(self):
|
|
110
108
|
ancestors = []
|
|
@@ -217,8 +215,8 @@ class AbstractDataset(AbstractData):
|
|
|
217
215
|
def download(self, force=False):
|
|
218
216
|
"""Download all the necessary resources"""
|
|
219
217
|
success = True
|
|
220
|
-
logging.info("Materializing %d resources", len(self.ordered_resources))
|
|
221
218
|
self.prepare()
|
|
219
|
+
logging.info("Materializing %d resources", len(self.ordered_resources))
|
|
222
220
|
for resource in self.ordered_resources:
|
|
223
221
|
try:
|
|
224
222
|
resource.download(force)
|
|
@@ -274,9 +272,6 @@ class DatasetWrapper(AbstractDataset):
|
|
|
274
272
|
annotations (otherwise, derive from `AbstractDataset`).
|
|
275
273
|
"""
|
|
276
274
|
|
|
277
|
-
BUILDING: ClassVar[list["DatasetWrapper"]] = []
|
|
278
|
-
"""Currently built dataset"""
|
|
279
|
-
|
|
280
275
|
def __init__(self, annotation, t: type):
|
|
281
276
|
self.config = None
|
|
282
277
|
self.repository: Optional[Repository] = None
|
|
@@ -287,13 +282,22 @@ class DatasetWrapper(AbstractDataset):
|
|
|
287
282
|
repository, components = DataDefinition.repository_relpath(t)
|
|
288
283
|
super().__init__(repository)
|
|
289
284
|
|
|
285
|
+
self.module_name = None
|
|
286
|
+
if repository is None:
|
|
287
|
+
# Try to find the module name
|
|
288
|
+
self.module_name, _ = t.__module__.split(".", 1)
|
|
289
|
+
|
|
290
290
|
# Set some variables
|
|
291
291
|
self.url = annotation.url
|
|
292
292
|
self.doi = annotation.doi
|
|
293
293
|
|
|
294
294
|
# Builds the ID:
|
|
295
295
|
# Removes module_name.config prefix
|
|
296
|
-
if
|
|
296
|
+
if (
|
|
297
|
+
(annotation.id is None)
|
|
298
|
+
or (annotation.id == "")
|
|
299
|
+
or ("." not in annotation.id)
|
|
300
|
+
):
|
|
297
301
|
# Computes an ID
|
|
298
302
|
assert (
|
|
299
303
|
# id is empty string = use the module id
|
|
@@ -303,7 +307,15 @@ class DatasetWrapper(AbstractDataset):
|
|
|
303
307
|
"A @dataset without `id` should be in the "
|
|
304
308
|
f".config module (not {t.__module__})"
|
|
305
309
|
)
|
|
306
|
-
|
|
310
|
+
|
|
311
|
+
if annotation.id is None:
|
|
312
|
+
# There is nothing, use the full path
|
|
313
|
+
path = ".".join(components[1:])
|
|
314
|
+
else:
|
|
315
|
+
# Replace
|
|
316
|
+
path = ".".join(components[1:-1])
|
|
317
|
+
if annotation.id != "":
|
|
318
|
+
path = f"{path}.{annotation.id}"
|
|
307
319
|
|
|
308
320
|
self.id = path
|
|
309
321
|
else:
|
|
@@ -361,12 +373,6 @@ class DatasetWrapper(AbstractDataset):
|
|
|
361
373
|
self._prepare()
|
|
362
374
|
return super().download(force=force)
|
|
363
375
|
|
|
364
|
-
@contextmanager
|
|
365
|
-
def building(self):
|
|
366
|
-
DatasetWrapper.BUILDING.append(self)
|
|
367
|
-
yield self
|
|
368
|
-
DatasetWrapper.BUILDING.pop()
|
|
369
|
-
|
|
370
376
|
def _prepare(self) -> "Base":
|
|
371
377
|
if self.config is not None:
|
|
372
378
|
return self.config
|
|
@@ -378,8 +384,7 @@ class DatasetWrapper(AbstractDataset):
|
|
|
378
384
|
# Construct the object
|
|
379
385
|
resources = {key: value.prepare() for key, value in self.resources.items()}
|
|
380
386
|
|
|
381
|
-
|
|
382
|
-
result = self.t(**resources)
|
|
387
|
+
result = self.t(**resources)
|
|
383
388
|
|
|
384
389
|
# Download resources
|
|
385
390
|
logging.debug("Building with data type %s and dataset %s", self.base, self.t)
|
|
@@ -425,18 +430,11 @@ class DatasetWrapper(AbstractDataset):
|
|
|
425
430
|
@property
|
|
426
431
|
def datapath(self):
|
|
427
432
|
"""Returns the destination path for downloads"""
|
|
428
|
-
|
|
433
|
+
if self.repository is not None:
|
|
434
|
+
return self.repository.datapath / self._path
|
|
429
435
|
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
if (self.repository is not None) and (not path.exists()):
|
|
433
|
-
old_path: Path = self.repository.datapath / self._path
|
|
434
|
-
if old_path.exists():
|
|
435
|
-
logging.info(
|
|
436
|
-
"Moving from old path [%s] to new path [%s]", old_path, path
|
|
437
|
-
)
|
|
438
|
-
path.parent.mkdir(exist_ok=True, parents=True)
|
|
439
|
-
old_path.rename(path)
|
|
436
|
+
# No repository, use __custom__/[MODULE NAME]
|
|
437
|
+
path = self.context.datapath / "__custom__" / self.module_name / self._path
|
|
440
438
|
|
|
441
439
|
return path
|
|
442
440
|
|
|
@@ -571,13 +569,15 @@ class dataset:
|
|
|
571
569
|
timestamp {bool} -- If the dataset evolves, specify its timestamp
|
|
572
570
|
(default: None)
|
|
573
571
|
|
|
574
|
-
id {[type]} -- [description] (default: {None})
|
|
572
|
+
id {[type]} -- [description] (default: {None}) Gives the full ID of
|
|
573
|
+
the dataset if it contains a ., or just the last component otherwise
|
|
575
574
|
|
|
576
575
|
url {[type]} -- [description] (default: {None})
|
|
577
576
|
|
|
578
577
|
size {str} -- The size (should be a parsable format)
|
|
579
578
|
|
|
580
579
|
doi {str} -- The DOI of the corresponding paper
|
|
580
|
+
|
|
581
581
|
"""
|
|
582
582
|
if hasattr(base, "__datamaestro__") and isinstance(
|
|
583
583
|
base.__datamaestro__, metadataset
|
datamaestro/download/__init__.py
CHANGED
|
@@ -31,7 +31,7 @@ class Resource(DatasetAnnotation, ABC):
|
|
|
31
31
|
self.varname = varname
|
|
32
32
|
# Ensures that the object is initialized
|
|
33
33
|
self._post = False
|
|
34
|
-
self.definition = None
|
|
34
|
+
self.definition: AbstractDataset = None
|
|
35
35
|
|
|
36
36
|
def annotate(self, dataset: AbstractDataset):
|
|
37
37
|
assert self.definition is None
|
|
@@ -45,9 +45,9 @@ class Resource(DatasetAnnotation, ABC):
|
|
|
45
45
|
|
|
46
46
|
def contextualize(self):
|
|
47
47
|
"""When using an annotation inline, uses the current dataset wrapper object"""
|
|
48
|
-
from datamaestro.definitions import
|
|
48
|
+
from datamaestro.definitions import AbstractDataset
|
|
49
49
|
|
|
50
|
-
wrapper =
|
|
50
|
+
wrapper = AbstractDataset.processing()
|
|
51
51
|
self.annotate(wrapper)
|
|
52
52
|
|
|
53
53
|
@property
|
datamaestro/download/custom.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
from typing import Protocol
|
|
2
2
|
from pathlib import Path
|
|
3
3
|
from datamaestro import Context
|
|
4
|
-
from datamaestro.definitions import DatasetWrapper
|
|
5
4
|
from datamaestro.download import Resource
|
|
6
5
|
|
|
7
6
|
|
|
@@ -10,20 +9,13 @@ class Downloader(Protocol):
|
|
|
10
9
|
pass
|
|
11
10
|
|
|
12
11
|
|
|
13
|
-
class
|
|
14
|
-
def __init__(self,
|
|
15
|
-
|
|
12
|
+
class custom_download(Resource):
|
|
13
|
+
def __init__(self, varname: str, downloader: Downloader):
|
|
14
|
+
super().__init__(varname)
|
|
16
15
|
self.downloader = downloader
|
|
17
16
|
|
|
18
17
|
def prepare(self):
|
|
19
|
-
|
|
18
|
+
return self.definition.datapath
|
|
20
19
|
|
|
21
20
|
def download(self, force=False):
|
|
22
|
-
self.downloader(self.context, self.
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def custom_download(downloader: Downloader) -> Path:
|
|
26
|
-
ds_wrapper = DatasetWrapper.BUILDING[-1]
|
|
27
|
-
ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
|
|
28
|
-
|
|
29
|
-
return ds_wrapper.datapath
|
|
21
|
+
self.downloader(self.context, self.definition.datapath, force=force)
|
datamaestro/download/single.py
CHANGED
|
@@ -9,7 +9,7 @@ import os
|
|
|
9
9
|
import urllib3
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
import re
|
|
12
|
-
from datamaestro.utils import copyfileobjs
|
|
12
|
+
from datamaestro.utils import copyfileobjs
|
|
13
13
|
from datamaestro.stream import Transform
|
|
14
14
|
from datamaestro.download import Download
|
|
15
15
|
|
|
@@ -35,7 +35,7 @@ class SingleDownload(Download):
|
|
|
35
35
|
return self.path
|
|
36
36
|
|
|
37
37
|
def download(self, force=False):
|
|
38
|
-
if not self.path.is_file():
|
|
38
|
+
if not self.path.is_file() and not force:
|
|
39
39
|
self._download(self.path)
|
|
40
40
|
|
|
41
41
|
|
|
@@ -96,20 +96,6 @@ class filedownloader(SingleDownload):
|
|
|
96
96
|
logging.info("Created file %s" % destination)
|
|
97
97
|
|
|
98
98
|
|
|
99
|
-
def file_from_url(
|
|
100
|
-
filename: str,
|
|
101
|
-
url: str,
|
|
102
|
-
*,
|
|
103
|
-
size: Optional[int] = None,
|
|
104
|
-
transforms: Optional[Transform] = None,
|
|
105
|
-
checker: Optional[FileChecker] = None,
|
|
106
|
-
) -> Path:
|
|
107
|
-
"""Defines a file that should be downloaded from"""
|
|
108
|
-
downloader = filedownloader(filename, url, size, transforms, checker)
|
|
109
|
-
downloader.contextualize()
|
|
110
|
-
return downloader.path
|
|
111
|
-
|
|
112
|
-
|
|
113
99
|
class concatdownload(SingleDownload):
|
|
114
100
|
"""Concatenate all files in an archive"""
|
|
115
101
|
|
datamaestro/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.1
|
|
4
4
|
Summary: "Dataset management command line and API"
|
|
5
5
|
Home-page: https://github.com/experimaestro/datamaestro
|
|
6
6
|
Author: Benjamin Piwowarski
|
|
@@ -97,22 +97,10 @@ $ datamaestro search tag:image
|
|
|
97
97
|
[image] com.lecun.mnist
|
|
98
98
|
|
|
99
99
|
$ datamaestro prepare com.lecun.mnist
|
|
100
|
-
INFO:root:
|
|
101
|
-
INFO:root:
|
|
102
|
-
INFO:root:
|
|
103
|
-
INFO:root:Downloading
|
|
104
|
-
INFO:root:Transforming file
|
|
105
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
|
|
106
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
|
|
107
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
|
|
108
|
-
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s] INFO:root:Transforming file
|
|
109
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
|
|
110
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
|
|
111
|
-
INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
|
|
112
|
-
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
|
|
113
|
-
INFO:root:Transforming file
|
|
114
|
-
INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
|
|
115
|
-
...JSON...
|
|
100
|
+
INFO:root:Materializing 4 resources
|
|
101
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
|
|
102
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
|
|
103
|
+
INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
|
|
116
104
|
```
|
|
117
105
|
|
|
118
106
|
The previous command also returns a JSON on standard output
|
|
@@ -158,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
|
|
|
158
146
|
Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
|
|
159
147
|
|
|
160
148
|
|
|
161
|
-
For
|
|
149
|
+
For instance, the MNIST dataset can be described by the following
|
|
162
150
|
|
|
163
151
|
```python
|
|
164
|
-
from
|
|
165
|
-
from datamaestro.download.single import
|
|
166
|
-
from
|
|
167
|
-
from datamaestro.data.tensor import IDX
|
|
152
|
+
from datamaestro import dataset
|
|
153
|
+
from datamaestro.download.single import download_file
|
|
154
|
+
from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
|
|
168
155
|
|
|
169
156
|
|
|
170
157
|
@filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
|
|
@@ -175,26 +162,37 @@ from datamaestro.data.tensor import IDX
|
|
|
175
162
|
ImageClassification,
|
|
176
163
|
url="http://yann.lecun.com/exdb/mnist/",
|
|
177
164
|
)
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
165
|
+
|
|
166
|
+
return ImageClassification(
|
|
167
|
+
train=LabelledImages(
|
|
168
|
+
images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
|
|
169
|
+
),
|
|
170
|
+
test=LabelledImages(
|
|
171
|
+
images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
|
|
172
|
+
),
|
|
173
|
+
)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
When building dataset modules, some extra documentation can be provided:
|
|
177
|
+
|
|
178
|
+
```yaml
|
|
179
|
+
ids: [com.lecun.mnist]
|
|
180
|
+
entry_point: "datamaestro_image.config.com.lecun:mnist"
|
|
181
|
+
title: The MNIST database
|
|
182
|
+
url: http://yann.lecun.com/exdb/mnist/
|
|
183
|
+
groups: [image-classification]
|
|
184
|
+
description: |
|
|
185
|
+
The MNIST database of handwritten digits, available from this page,
|
|
186
|
+
has a training set of 60,000 examples, and a test set of 10,000
|
|
187
|
+
examples. It is a subset of a larger set available from NIST. The
|
|
188
|
+
digits have been size-normalized and centered in a fixed-size image.
|
|
196
189
|
```
|
|
197
190
|
|
|
191
|
+
This will allow to
|
|
192
|
+
|
|
193
|
+
1. Document the dataset
|
|
194
|
+
2. Allow to use the command line interface to manipulate it (download resources, etc.)
|
|
195
|
+
|
|
198
196
|
# 0.8.0
|
|
199
197
|
|
|
200
198
|
- Integration with other repositories: abstracting away the notion of dataset
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
datamaestro/__init__.py,sha256=
|
|
2
|
-
datamaestro/__main__.py,sha256=
|
|
3
|
-
datamaestro/context.py,sha256=
|
|
4
|
-
datamaestro/definitions.py,sha256=
|
|
1
|
+
datamaestro/__init__.py,sha256=LR8nx7H3Fo97O0gJXV2PxQezsmSTDLAg_nQEXB5QAjc,322
|
|
2
|
+
datamaestro/__main__.py,sha256=2p36ZcJcZAL9NZBUkMaYRUhKyqhheVPXMGw6K1KNwhk,9196
|
|
3
|
+
datamaestro/context.py,sha256=KsXYNTt4xX4zEVrnd2hciP7PVCh1StRzjU1Ih6VeCtU,13532
|
|
4
|
+
datamaestro/definitions.py,sha256=EyrN24HcQmW_pS2K5hGRF07eJ36mQDFduIGvHmMSzsk,18825
|
|
5
5
|
datamaestro/record.py,sha256=m3WGsPcZ1LouQXNJOBUK3QusAIRiuy6T_oqhq09-Ckg,5504
|
|
6
6
|
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
7
|
datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
|
|
8
8
|
datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
|
|
9
9
|
datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
|
|
10
10
|
datamaestro/utils.py,sha256=9m-AVVww6InAZfGFiGy6XJzfExpYNqH1fhWQEezjafA,6536
|
|
11
|
-
datamaestro/version.py,sha256=
|
|
11
|
+
datamaestro/version.py,sha256=2wP77AlenYjrtKg1nXf5noV1SfpanFafZAGSe7wvBys,511
|
|
12
12
|
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
13
13
|
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
14
14
|
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -19,16 +19,14 @@ datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
|
|
|
19
19
|
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
20
20
|
datamaestro/data/ml.py,sha256=7Rv4Tb9g17HDj8mOBJpIDjgolGQAd5Wrb0mHlnm-bPE,709
|
|
21
21
|
datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
|
|
22
|
-
datamaestro/
|
|
23
|
-
datamaestro/datasets/yaml_repository.py,sha256=X5JjA2dQ5xfdYSUgL2EbZhrOYn-FPiBOAK97kw4kwqo,2533
|
|
24
|
-
datamaestro/download/__init__.py,sha256=XcRw9acAq1IwhLQZpj2HpMNEaMesA5BbllJpbRCkOwA,2846
|
|
22
|
+
datamaestro/download/__init__.py,sha256=EBoAcw2wErS8ymEYs7LJKez4UO-Gwhe4YgqRAysOxRY,2865
|
|
25
23
|
datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
|
|
26
|
-
datamaestro/download/custom.py,sha256=
|
|
27
|
-
datamaestro/download/huggingface.py,sha256=
|
|
24
|
+
datamaestro/download/custom.py,sha256=DUjDVAWuHC6sV_apMQb44Yjd6HUXkHY6Ob52FQY3t-M,587
|
|
25
|
+
datamaestro/download/huggingface.py,sha256=b4Y437ATYrugdkvqZrPQmqiXXSrmYyqEKDVI0wnIGDE,1125
|
|
28
26
|
datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
|
|
29
27
|
datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
|
|
30
28
|
datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0LqM,2159
|
|
31
|
-
datamaestro/download/single.py,sha256=
|
|
29
|
+
datamaestro/download/single.py,sha256=fCIfZdR14YN09MQTgcxL21PWu5CjELfIClgWjFpR5mg,4148
|
|
32
30
|
datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
|
|
33
31
|
datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
|
|
34
32
|
datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
|
|
@@ -42,9 +40,9 @@ datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,
|
|
|
42
40
|
datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
|
|
43
41
|
datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
|
|
44
42
|
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
45
|
-
datamaestro-1.
|
|
46
|
-
datamaestro-1.
|
|
47
|
-
datamaestro-1.
|
|
48
|
-
datamaestro-1.
|
|
49
|
-
datamaestro-1.
|
|
50
|
-
datamaestro-1.
|
|
43
|
+
datamaestro-1.4.1.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
44
|
+
datamaestro-1.4.1.dist-info/METADATA,sha256=jGy6z11AvalmLQuwby5XSEViOS55DtMfq21fhs_rW14,8189
|
|
45
|
+
datamaestro-1.4.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
46
|
+
datamaestro-1.4.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
47
|
+
datamaestro-1.4.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
|
|
48
|
+
datamaestro-1.4.1.dist-info/RECORD,,
|
datamaestro/datasets/__init__.py
DELETED
|
File without changes
|
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
from typing import Iterator, Optional
|
|
3
|
-
from functools import cached_property
|
|
4
|
-
from attrs import field
|
|
5
|
-
import importlib
|
|
6
|
-
from omegaconf import OmegaConf
|
|
7
|
-
from functools import partial
|
|
8
|
-
from attrs import define
|
|
9
|
-
from datamaestro import BaseRepository
|
|
10
|
-
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
11
|
-
from datamaestro.data import Base
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
re_spec = re.compile(r"""^(\w\.)+:(\w+)""")
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@define
|
|
18
|
-
class RepositoryDataset:
|
|
19
|
-
ids: list[str]
|
|
20
|
-
"""ID(s) of this dataset"""
|
|
21
|
-
|
|
22
|
-
entry_point: str = field(validator=re_spec.match)
|
|
23
|
-
"""The entry point"""
|
|
24
|
-
|
|
25
|
-
title: str
|
|
26
|
-
"""The full name of the dataset"""
|
|
27
|
-
|
|
28
|
-
description: str
|
|
29
|
-
"""Description of the dataset"""
|
|
30
|
-
|
|
31
|
-
url: Optional[str]
|
|
32
|
-
"""The URL"""
|
|
33
|
-
|
|
34
|
-
groups: Optional[list[str]]
|
|
35
|
-
"""Groups to which this repository belongs"""
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@define
|
|
39
|
-
class RepositoryAuthors:
|
|
40
|
-
name: str
|
|
41
|
-
email: str
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
@define
|
|
45
|
-
class RepositoryGroup:
|
|
46
|
-
name: str
|
|
47
|
-
tasks: list[str]
|
|
48
|
-
tags: list[str]
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@define
|
|
52
|
-
class RepositoryConfiguration:
|
|
53
|
-
namespace: str
|
|
54
|
-
authors: list[RepositoryAuthors]
|
|
55
|
-
description: str
|
|
56
|
-
groups: dict[str, RepositoryGroup]
|
|
57
|
-
datasets: list[RepositoryDataset]
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
class YAMLDataset(AbstractDataset):
|
|
61
|
-
def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
|
|
62
|
-
super().__init__(repository)
|
|
63
|
-
self.information = information
|
|
64
|
-
self.id = self.information.ids[0]
|
|
65
|
-
self.aliases = set(self.information.ids)
|
|
66
|
-
|
|
67
|
-
@cached_property
|
|
68
|
-
def wrapper(self) -> DatasetWrapper:
|
|
69
|
-
module, func_name = self.information.entry_point.split(":")
|
|
70
|
-
wrapper = getattr(importlib.import_module(module), func_name)
|
|
71
|
-
return wrapper
|
|
72
|
-
|
|
73
|
-
def _prepare(self) -> "Base":
|
|
74
|
-
return self.wrapper()
|
|
75
|
-
|
|
76
|
-
def download(self, **kwargs):
|
|
77
|
-
return self.wrapper.download(**kwargs)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
class YAMLRepository(BaseRepository):
|
|
81
|
-
"""YAML-based repository"""
|
|
82
|
-
|
|
83
|
-
@property
|
|
84
|
-
def id(self):
|
|
85
|
-
return self.configuration.namespace
|
|
86
|
-
|
|
87
|
-
@property
|
|
88
|
-
def name(self):
|
|
89
|
-
return self.configuration.namespace
|
|
90
|
-
|
|
91
|
-
@cached_property
|
|
92
|
-
def configuration(self):
|
|
93
|
-
schema = OmegaConf.structured(RepositoryConfiguration)
|
|
94
|
-
with importlib.resources.path(
|
|
95
|
-
self.__class__.__module__, "datamaestro.yaml"
|
|
96
|
-
) as fp:
|
|
97
|
-
conf = OmegaConf.load(fp)
|
|
98
|
-
|
|
99
|
-
conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
|
|
100
|
-
return conf
|
|
101
|
-
|
|
102
|
-
def __iter__(self) -> Iterator["AbstractDataset"]:
|
|
103
|
-
return map(partial(YAMLDataset, self), self.configuration.datasets)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|