datamaestro 1.2.1__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +5 -1
- datamaestro/context.py +71 -67
- datamaestro/data/__init__.py +18 -11
- datamaestro/data/csv.py +8 -6
- datamaestro/data/ml.py +3 -3
- datamaestro/data/tensor.py +30 -23
- datamaestro/datasets/__init__.py +0 -0
- datamaestro/datasets/yaml_repository.py +103 -0
- datamaestro/definitions.py +135 -53
- datamaestro/download/__init__.py +8 -1
- datamaestro/download/custom.py +29 -0
- datamaestro/download/single.py +15 -1
- datamaestro/search.py +1 -1
- datamaestro/test/test_annotations.py +2 -1
- datamaestro/test/test_download_handlers.py +3 -0
- datamaestro/utils.py +2 -0
- datamaestro/version.py +9 -4
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info}/METADATA +8 -6
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info}/RECORD +23 -20
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info}/WHEEL +1 -1
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info/licenses}/LICENSE +0 -0
- {datamaestro-1.2.1.dist-info → datamaestro-1.3.1.dist-info}/top_level.txt +0 -0
datamaestro/__init__.py
CHANGED
|
@@ -2,10 +2,14 @@
|
|
|
2
2
|
from .context import (
|
|
3
3
|
Context,
|
|
4
4
|
Repository,
|
|
5
|
+
BaseRepository,
|
|
5
6
|
get_dataset,
|
|
6
7
|
prepare_dataset,
|
|
7
8
|
)
|
|
8
9
|
|
|
9
|
-
from
|
|
10
|
+
from .datasets.yaml_repository import YAMLRepository
|
|
10
11
|
|
|
12
|
+
from pkg_resources import get_distribution, DistributionNotFound
|
|
13
|
+
from .definitions import dataset, metadata
|
|
14
|
+
from .data import Base
|
|
11
15
|
from .version import version, version_tuple
|
datamaestro/context.py
CHANGED
|
@@ -1,21 +1,22 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from
|
|
2
|
+
from typing import Iterable, Iterator, Dict, Union
|
|
3
3
|
import importlib
|
|
4
4
|
import os
|
|
5
5
|
import hashlib
|
|
6
6
|
import logging
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
|
-
from
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from experimaestro import Config
|
|
10
11
|
import pkg_resources
|
|
11
|
-
from
|
|
12
|
+
from experimaestro.compat import cached_property
|
|
13
|
+
from experimaestro.mkdocs.metaloader import Module
|
|
12
14
|
from .utils import CachedFile, downloadURL
|
|
13
15
|
from .settings import UserSettings, Settings
|
|
14
|
-
|
|
15
16
|
from typing import TYPE_CHECKING
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
|
-
from datamaestro.definitions import AbstractDataset
|
|
19
|
+
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class Compression:
|
|
@@ -87,6 +88,11 @@ class Context:
|
|
|
87
88
|
|
|
88
89
|
return ContextManager()
|
|
89
90
|
|
|
91
|
+
@property
|
|
92
|
+
def storepath(self):
|
|
93
|
+
"""Replaces the data path"""
|
|
94
|
+
return self._path.joinpath("store")
|
|
95
|
+
|
|
90
96
|
@property
|
|
91
97
|
def datapath(self):
|
|
92
98
|
return self._path.joinpath("data")
|
|
@@ -98,7 +104,9 @@ class Context:
|
|
|
98
104
|
@cached_property
|
|
99
105
|
def repositorymap(self) -> Dict[str, "Repository"]:
|
|
100
106
|
return {
|
|
101
|
-
repository.basemodule(): repository
|
|
107
|
+
repository.basemodule(): repository
|
|
108
|
+
for repository in self.repositories()
|
|
109
|
+
if repository.basemodule() is not None
|
|
102
110
|
}
|
|
103
111
|
|
|
104
112
|
def repositories(self) -> Iterable["Repository"]:
|
|
@@ -286,10 +294,53 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
286
294
|
yield value.__dataset__
|
|
287
295
|
|
|
288
296
|
|
|
289
|
-
class
|
|
290
|
-
"""A repository
|
|
297
|
+
class BaseRepository(ABC):
|
|
298
|
+
"""A repository groups a set of datasets and their corresponding specific
|
|
291
299
|
handlers (downloading, filtering, etc.)"""
|
|
292
300
|
|
|
301
|
+
def __init__(self, context: Context):
|
|
302
|
+
self.context = context
|
|
303
|
+
p = inspect.getabsfile(self.__class__)
|
|
304
|
+
self.basedir = Path(p).parent
|
|
305
|
+
|
|
306
|
+
@abstractmethod
|
|
307
|
+
def __iter__(self) -> Iterator["AbstractDataset"]:
|
|
308
|
+
...
|
|
309
|
+
|
|
310
|
+
def search(self, name: str):
|
|
311
|
+
"""Search for a dataset in the definitions"""
|
|
312
|
+
for dataset in self:
|
|
313
|
+
if name in dataset.aliases:
|
|
314
|
+
return dataset
|
|
315
|
+
|
|
316
|
+
@classmethod
|
|
317
|
+
def instance(cls, context=None):
|
|
318
|
+
try:
|
|
319
|
+
return cls.__getattribute__(cls, "INSTANCE")
|
|
320
|
+
except AttributeError:
|
|
321
|
+
return cls(context if context else Context.instance())
|
|
322
|
+
|
|
323
|
+
@classmethod
|
|
324
|
+
def basemodule(cls):
|
|
325
|
+
return cls.__module__
|
|
326
|
+
|
|
327
|
+
@property
|
|
328
|
+
def generatedpath(self):
|
|
329
|
+
return self.basedir / "generated"
|
|
330
|
+
|
|
331
|
+
@property
|
|
332
|
+
def datapath(self):
|
|
333
|
+
return self.context.datapath.joinpath(self.id)
|
|
334
|
+
|
|
335
|
+
@property
|
|
336
|
+
def extrapath(self):
|
|
337
|
+
"""Path to the directory containing extra configuration files"""
|
|
338
|
+
return self.basedir / "data"
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class Repository(BaseRepository):
|
|
342
|
+
"""(deprecated) Repository where datasets are located in __module__.config"""
|
|
343
|
+
|
|
293
344
|
def __init__(self, context: Context):
|
|
294
345
|
"""Initialize a new repository
|
|
295
346
|
|
|
@@ -297,26 +348,14 @@ class Repository:
|
|
|
297
348
|
:param basedir: The base directory of the repository
|
|
298
349
|
(by default, the same as the repository class)
|
|
299
350
|
"""
|
|
351
|
+
super().__init__(context)
|
|
300
352
|
self.context = context
|
|
301
|
-
p = inspect.getabsfile(self.__class__)
|
|
302
|
-
self.basedir = Path(p).parent
|
|
303
353
|
self.configdir = self.basedir.joinpath("config")
|
|
304
354
|
self.id = self.__class__.NAMESPACE
|
|
305
355
|
self.name = self.id
|
|
306
356
|
self.module = self.__class__.__module__
|
|
307
357
|
self.__class__.INSTANCE = self
|
|
308
358
|
|
|
309
|
-
@classmethod
|
|
310
|
-
def basemodule(cls):
|
|
311
|
-
return cls.__module__
|
|
312
|
-
|
|
313
|
-
@classmethod
|
|
314
|
-
def instance(cls, context=None):
|
|
315
|
-
try:
|
|
316
|
-
return cls.__getattribute__(cls, "INSTANCE")
|
|
317
|
-
except AttributeError:
|
|
318
|
-
return cls(context if context else Context.instance())
|
|
319
|
-
|
|
320
359
|
@classmethod
|
|
321
360
|
def version(cls):
|
|
322
361
|
from pkg_resources import get_distribution, DistributionNotFound
|
|
@@ -336,36 +375,8 @@ class Repository:
|
|
|
336
375
|
assert isinstance(other, Repository)
|
|
337
376
|
return self.basedir == other.basedir
|
|
338
377
|
|
|
339
|
-
def
|
|
340
|
-
"""
|
|
341
|
-
logging.debug("Searching for %s in %s", name, self.configdir)
|
|
342
|
-
|
|
343
|
-
candidates: List[str] = []
|
|
344
|
-
components = name.split(".")
|
|
345
|
-
path = self.configdir
|
|
346
|
-
for i, c in enumerate(components):
|
|
347
|
-
path = path / c
|
|
348
|
-
|
|
349
|
-
if (path / "__init__.py").is_file():
|
|
350
|
-
candidates.append(".".join(components[: i + 1]))
|
|
351
|
-
|
|
352
|
-
if path.with_suffix(".py").is_file():
|
|
353
|
-
candidates.append(".".join(components[: i + 1]))
|
|
354
|
-
|
|
355
|
-
if not path.is_dir():
|
|
356
|
-
break
|
|
357
|
-
|
|
358
|
-
# Get the dataset
|
|
359
|
-
for candidate in candidates[::-1]:
|
|
360
|
-
logging.debug("Searching in module %s.config.%s", self.module, candidate)
|
|
361
|
-
module = importlib.import_module("%s.config.%s" % (self.module, candidate))
|
|
362
|
-
for value in Datasets(module):
|
|
363
|
-
if name in value.aliases:
|
|
364
|
-
return value
|
|
365
|
-
|
|
366
|
-
return None
|
|
367
|
-
|
|
368
|
-
def datasets(self, candidate):
|
|
378
|
+
def datasets(self, candidate: str):
|
|
379
|
+
"""Returns the dataset candidates from a module"""
|
|
369
380
|
try:
|
|
370
381
|
module = importlib.import_module("%s.config.%s" % (self.module, candidate))
|
|
371
382
|
except ModuleNotFoundError:
|
|
@@ -409,19 +420,6 @@ class Repository:
|
|
|
409
420
|
for dataset in datasets:
|
|
410
421
|
yield dataset
|
|
411
422
|
|
|
412
|
-
@property
|
|
413
|
-
def generatedpath(self):
|
|
414
|
-
return self.basedir.joinpath("generated")
|
|
415
|
-
|
|
416
|
-
@property
|
|
417
|
-
def datapath(self):
|
|
418
|
-
return self.context.datapath.joinpath(self.id)
|
|
419
|
-
|
|
420
|
-
@property
|
|
421
|
-
def extrapath(self):
|
|
422
|
-
"""Path to the directory containing extra configuration files"""
|
|
423
|
-
return self.basedir.joinpath("data")
|
|
424
|
-
|
|
425
423
|
|
|
426
424
|
def find_dataset(dataset_id: str):
|
|
427
425
|
"""Find a dataset given its id"""
|
|
@@ -430,11 +428,17 @@ def find_dataset(dataset_id: str):
|
|
|
430
428
|
return AbstractDataset.find(dataset_id)
|
|
431
429
|
|
|
432
430
|
|
|
433
|
-
def prepare_dataset(dataset_id: str):
|
|
431
|
+
def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
|
|
434
432
|
"""Find a dataset given its id and download the resources"""
|
|
435
|
-
from .definitions import AbstractDataset
|
|
433
|
+
from .definitions import AbstractDataset, DatasetWrapper
|
|
434
|
+
|
|
435
|
+
if isinstance(dataset_id, DatasetWrapper):
|
|
436
|
+
ds = dataset_id
|
|
437
|
+
elif isinstance(dataset_id, Config):
|
|
438
|
+
ds = dataset_id.__datamaestro_dataset__
|
|
439
|
+
else:
|
|
440
|
+
ds = AbstractDataset.find(dataset_id)
|
|
436
441
|
|
|
437
|
-
ds = AbstractDataset.find(dataset_id)
|
|
438
442
|
return ds.prepare(download=True)
|
|
439
443
|
|
|
440
444
|
|
datamaestro/data/__init__.py
CHANGED
|
@@ -1,22 +1,18 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
1
2
|
import logging
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from typing import Any, Dict
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
from experimaestro import documentation # noqa: F401
|
|
5
|
+
from experimaestro import Config, Param, Meta
|
|
6
|
+
from datamaestro.definitions import AbstractDataset
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class Base(Config):
|
|
10
10
|
"""Base object for all data types"""
|
|
11
11
|
|
|
12
12
|
id: Param[str]
|
|
13
|
-
"""The unique dataset ID"""
|
|
13
|
+
"""The unique (sub-)dataset ID"""
|
|
14
14
|
|
|
15
|
-
__datamaestro_dataset__: AbstractDataset
|
|
16
|
-
|
|
17
|
-
def download(self):
|
|
18
|
-
"""Download the dataset"""
|
|
19
|
-
self.__datamaestro_dataset__.download()
|
|
15
|
+
__datamaestro_dataset__: "AbstractDataset"
|
|
20
16
|
|
|
21
17
|
def dataset_information(self) -> Dict[str, Any]:
|
|
22
18
|
"""Returns document meta-informations"""
|
|
@@ -26,6 +22,16 @@ class Base(Config):
|
|
|
26
22
|
"description": self.__datamaestro_dataset__.description,
|
|
27
23
|
}
|
|
28
24
|
|
|
25
|
+
def download(self):
|
|
26
|
+
"""Download the dataset"""
|
|
27
|
+
self.__datamaestro_dataset__.download()
|
|
28
|
+
|
|
29
|
+
@abstractmethod
|
|
30
|
+
def prepare(self, *args, **kwargs):
|
|
31
|
+
"""Prepare the dataset"""
|
|
32
|
+
self.__datamaestro_dataset__.prepare()
|
|
33
|
+
return self
|
|
34
|
+
|
|
29
35
|
|
|
30
36
|
class Generic(Base):
|
|
31
37
|
"""Generic dataset
|
|
@@ -44,16 +50,17 @@ class Generic(Base):
|
|
|
44
50
|
class File(Base):
|
|
45
51
|
"""A data file"""
|
|
46
52
|
|
|
47
|
-
path:
|
|
53
|
+
path: Meta[Path]
|
|
48
54
|
"""The path of the file"""
|
|
49
55
|
|
|
50
56
|
def open(self, mode):
|
|
51
57
|
return self.path.open(mode)
|
|
52
58
|
|
|
53
59
|
|
|
54
|
-
@argument("path", type=Path)
|
|
55
60
|
class Folder(Base):
|
|
56
61
|
"""A data folder"""
|
|
57
62
|
|
|
63
|
+
path: Meta[Path]
|
|
64
|
+
|
|
58
65
|
def open(self, mode):
|
|
59
66
|
return self.path.open(mode)
|
datamaestro/data/csv.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from typing import Optional, Tuple, List, Any
|
|
1
2
|
from csv import reader as csv_reader
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from
|
|
3
|
+
from experimaestro import Param, Meta
|
|
4
|
+
from experimaestro import documentation
|
|
5
|
+
from . import File
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class Generic(File):
|
|
@@ -26,12 +27,13 @@ class Generic(File):
|
|
|
26
27
|
return row
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@argument("names_row", type=int, default=-1)
|
|
30
|
-
@argument("size_row", type=int, default=-1)
|
|
31
|
-
@argument("target", type=str, default=None)
|
|
32
30
|
class Matrix(Generic):
|
|
33
31
|
"""A numerical dataset"""
|
|
34
32
|
|
|
33
|
+
names_row: Param[int] = -1
|
|
34
|
+
size_row: Param[int] = -1
|
|
35
|
+
target: Param[Optional[str]] = None
|
|
36
|
+
|
|
35
37
|
@documentation
|
|
36
38
|
def data(self) -> Tuple[List[str], Any]:
|
|
37
39
|
"""Returns the list of fields and the numeric data
|
datamaestro/data/ml.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Machine learning generic data formats"""
|
|
2
|
-
from typing import Generic, TypeVar, Optional
|
|
3
2
|
from pathlib import Path
|
|
4
|
-
from
|
|
3
|
+
from typing import Generic, TypeVar, Optional
|
|
4
|
+
from experimaestro import Param, Meta
|
|
5
5
|
from . import Base
|
|
6
6
|
|
|
7
7
|
Train = TypeVar("Train", bound=Base)
|
|
@@ -20,8 +20,8 @@ class Supervised(Base, Generic[Train, Validation, Test]):
|
|
|
20
20
|
"""The training optional"""
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
@argument("classes")
|
|
24
23
|
class FolderBased(Base):
|
|
25
24
|
"""Classification dataset where folders give the basis"""
|
|
26
25
|
|
|
26
|
+
classes: Param[list[str]]
|
|
27
27
|
path: Meta[Path]
|
datamaestro/data/tensor.py
CHANGED
|
@@ -1,44 +1,50 @@
|
|
|
1
|
-
from
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
2
|
from struct import Struct
|
|
3
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
from . import File, Base
|
|
4
5
|
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
import numpy as np
|
|
5
8
|
|
|
6
|
-
|
|
9
|
+
|
|
10
|
+
class Tensor(Base, ABC):
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def data(self) -> "np.ndarray":
|
|
13
|
+
"""Returns the tensor in numpy format"""
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IDX(Tensor, File):
|
|
7
18
|
"""IDX File format
|
|
8
19
|
|
|
9
|
-
The IDX file format is a simple format for vectors and multidimensional
|
|
20
|
+
The IDX file format is a simple format for vectors and multidimensional
|
|
21
|
+
matrices of various numerical types.
|
|
10
22
|
|
|
11
23
|
The basic format is:
|
|
12
24
|
|
|
13
|
-
magic number
|
|
14
|
-
size in dimension
|
|
15
|
-
size in dimension 1
|
|
16
|
-
size in dimension 2
|
|
17
|
-
.....
|
|
18
|
-
size in dimension N
|
|
19
|
-
data
|
|
25
|
+
magic number size in dimension 0 size in dimension 1 size in dimension 2
|
|
26
|
+
..... size in dimension N data
|
|
20
27
|
|
|
21
28
|
The magic number is an integer (MSB first). The first 2 bytes are always 0.
|
|
22
29
|
|
|
23
|
-
The third byte codes the type of the data:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
0x0B: short (2 bytes)
|
|
27
|
-
0x0C: int (4 bytes)
|
|
28
|
-
0x0D: float (4 bytes)
|
|
29
|
-
0x0E: double (8 bytes)
|
|
30
|
+
The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
|
|
31
|
+
byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
|
|
32
|
+
double (8 bytes)
|
|
30
33
|
|
|
31
|
-
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
34
|
+
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
35
|
+
vectors, 2 for matrices....
|
|
32
36
|
|
|
33
|
-
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
37
|
+
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
38
|
+
like in most non-Intel processors).
|
|
34
39
|
|
|
35
|
-
The data is stored like in a C array, i.e. the index in the last dimension
|
|
40
|
+
The data is stored like in a C array, i.e. the index in the last dimension
|
|
41
|
+
changes the fastest.
|
|
36
42
|
"""
|
|
37
43
|
|
|
38
44
|
MAGIC_NUMBER = Struct(">HBB")
|
|
39
45
|
DIM = Struct(">I")
|
|
40
46
|
|
|
41
|
-
def data(self):
|
|
47
|
+
def data(self) -> "np.ndarray":
|
|
42
48
|
"""Returns the tensor"""
|
|
43
49
|
import numpy as np
|
|
44
50
|
|
|
@@ -59,7 +65,8 @@ class IDX(File):
|
|
|
59
65
|
shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]
|
|
60
66
|
|
|
61
67
|
size = np.prod(shape)
|
|
62
|
-
# Could use np.fromfile... if it were not broken
|
|
68
|
+
# Could use np.fromfile... if it were not broken
|
|
69
|
+
# see https://github.com/numpy/numpy/issues/7989
|
|
63
70
|
data = np.frombuffer(fp.read(), dtype=dtype, count=size)
|
|
64
71
|
data = data.reshape(shape, order="C")
|
|
65
72
|
return data
|
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Iterator, Optional
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from attrs import field
|
|
5
|
+
import importlib
|
|
6
|
+
from omegaconf import OmegaConf
|
|
7
|
+
from functools import partial
|
|
8
|
+
from attrs import define
|
|
9
|
+
from datamaestro import BaseRepository
|
|
10
|
+
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
11
|
+
from datamaestro.data import Base
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
re_spec = re.compile(r"""^(\w\.)+:(\w+)""")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@define
|
|
18
|
+
class RepositoryDataset:
|
|
19
|
+
ids: list[str]
|
|
20
|
+
"""ID(s) of this dataset"""
|
|
21
|
+
|
|
22
|
+
entry_point: str = field(validator=re_spec.match)
|
|
23
|
+
"""The entry point"""
|
|
24
|
+
|
|
25
|
+
title: str
|
|
26
|
+
"""The full name of the dataset"""
|
|
27
|
+
|
|
28
|
+
description: str
|
|
29
|
+
"""Description of the dataset"""
|
|
30
|
+
|
|
31
|
+
url: Optional[str]
|
|
32
|
+
"""The URL"""
|
|
33
|
+
|
|
34
|
+
groups: Optional[list[str]]
|
|
35
|
+
"""Groups to which this repository belongs"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@define
|
|
39
|
+
class RepositoryAuthors:
|
|
40
|
+
name: str
|
|
41
|
+
email: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@define
|
|
45
|
+
class RepositoryGroup:
|
|
46
|
+
name: str
|
|
47
|
+
tasks: list[str]
|
|
48
|
+
tags: list[str]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@define
|
|
52
|
+
class RepositoryConfiguration:
|
|
53
|
+
namespace: str
|
|
54
|
+
authors: list[RepositoryAuthors]
|
|
55
|
+
description: str
|
|
56
|
+
groups: dict[str, RepositoryGroup]
|
|
57
|
+
datasets: list[RepositoryDataset]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class YAMLDataset(AbstractDataset):
|
|
61
|
+
def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
|
|
62
|
+
super().__init__(repository)
|
|
63
|
+
self.information = information
|
|
64
|
+
self.id = self.information.ids[0]
|
|
65
|
+
self.aliases = set(self.information.ids)
|
|
66
|
+
|
|
67
|
+
@cached_property
|
|
68
|
+
def wrapper(self) -> DatasetWrapper:
|
|
69
|
+
module, func_name = self.information.entry_point.split(":")
|
|
70
|
+
wrapper = getattr(importlib.import_module(module), func_name)
|
|
71
|
+
return wrapper
|
|
72
|
+
|
|
73
|
+
def _prepare(self) -> "Base":
|
|
74
|
+
return self.wrapper()
|
|
75
|
+
|
|
76
|
+
def download(self, **kwargs):
|
|
77
|
+
return self.wrapper.download(**kwargs)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class YAMLRepository(BaseRepository):
|
|
81
|
+
"""YAML-based repository"""
|
|
82
|
+
|
|
83
|
+
@property
|
|
84
|
+
def id(self):
|
|
85
|
+
return self.configuration.namespace
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def name(self):
|
|
89
|
+
return self.configuration.namespace
|
|
90
|
+
|
|
91
|
+
@cached_property
|
|
92
|
+
def configuration(self):
|
|
93
|
+
schema = OmegaConf.structured(RepositoryConfiguration)
|
|
94
|
+
with importlib.resources.path(
|
|
95
|
+
self.__class__.__module__, "datamaestro.yaml"
|
|
96
|
+
) as fp:
|
|
97
|
+
conf = OmegaConf.load(fp)
|
|
98
|
+
|
|
99
|
+
conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
|
|
100
|
+
return conf
|
|
101
|
+
|
|
102
|
+
def __iter__(self) -> Iterator["AbstractDataset"]:
|
|
103
|
+
return map(partial(YAMLDataset, self), self.configuration.datasets)
|
datamaestro/definitions.py
CHANGED
|
@@ -6,6 +6,8 @@ import logging
|
|
|
6
6
|
import inspect
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
from itertools import chain
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from contextlib import contextmanager
|
|
9
11
|
import traceback
|
|
10
12
|
from typing import (
|
|
11
13
|
Dict,
|
|
@@ -16,6 +18,9 @@ from typing import (
|
|
|
16
18
|
TypeVar,
|
|
17
19
|
Callable,
|
|
18
20
|
TYPE_CHECKING,
|
|
21
|
+
Union,
|
|
22
|
+
ClassVar,
|
|
23
|
+
_GenericAlias,
|
|
19
24
|
)
|
|
20
25
|
from experimaestro import ( # noqa: F401 (re-exports)
|
|
21
26
|
argument,
|
|
@@ -27,16 +32,16 @@ from experimaestro import ( # noqa: F401 (re-exports)
|
|
|
27
32
|
)
|
|
28
33
|
from typing import Type as TypingType # noqa: F401 (re-exports)
|
|
29
34
|
from experimaestro.core.types import Type # noqa: F401 (re-exports)
|
|
30
|
-
from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
|
|
31
35
|
|
|
32
36
|
if TYPE_CHECKING:
|
|
37
|
+
from .data import Base, Dataset
|
|
38
|
+
from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
|
|
33
39
|
from datamaestro.download import Download
|
|
34
|
-
from .data import Base
|
|
35
40
|
|
|
36
41
|
# --- Objects holding information into classes/function
|
|
37
42
|
|
|
38
43
|
|
|
39
|
-
class AbstractData:
|
|
44
|
+
class AbstractData(ABC):
|
|
40
45
|
"""Data definition groups common fields between a dataset and a data piece,
|
|
41
46
|
such as tags and tasks"""
|
|
42
47
|
|
|
@@ -77,8 +82,10 @@ class DataDefinition(AbstractData):
|
|
|
77
82
|
return self._description
|
|
78
83
|
|
|
79
84
|
@staticmethod
|
|
80
|
-
def repository_relpath(t: type) -> Tuple[Repository, List[str]]:
|
|
85
|
+
def repository_relpath(t: type) -> Tuple["Repository", List[str]]:
|
|
81
86
|
"""Find the repository of the current data or dataset definition"""
|
|
87
|
+
from .context import Context # noqa: F811
|
|
88
|
+
|
|
82
89
|
repositorymap = Context.instance().repositorymap
|
|
83
90
|
|
|
84
91
|
fullname = f"{t.__module__}.{t.__name__}"
|
|
@@ -97,9 +104,6 @@ class DataDefinition(AbstractData):
|
|
|
97
104
|
if components[0] == "datamaestro":
|
|
98
105
|
longest_ix = 0
|
|
99
106
|
|
|
100
|
-
if repository is None:
|
|
101
|
-
raise Exception(f"Could not find the repository for {fullname}")
|
|
102
|
-
|
|
103
107
|
return repository, components[(longest_ix + 1) :]
|
|
104
108
|
|
|
105
109
|
def ancestors(self):
|
|
@@ -163,18 +167,26 @@ class AbstractDataset(AbstractData):
|
|
|
163
167
|
|
|
164
168
|
@property
|
|
165
169
|
def context(self):
|
|
170
|
+
if self.repository is None:
|
|
171
|
+
from datamaestro.context import Context # noqa: F811
|
|
172
|
+
|
|
173
|
+
return Context.instance()
|
|
166
174
|
return self.repository.context
|
|
167
175
|
|
|
168
176
|
def prepare(self, download=False) -> "Base":
|
|
169
|
-
ds = self._prepare(
|
|
177
|
+
ds = self._prepare()
|
|
170
178
|
ds.__datamaestro_dataset__ = self
|
|
179
|
+
|
|
180
|
+
if download:
|
|
181
|
+
ds.download()
|
|
171
182
|
return ds
|
|
172
183
|
|
|
173
184
|
def register_hook(self, hookname: str, hook: Callable):
|
|
174
185
|
self.hooks[hookname].append(hook)
|
|
175
186
|
|
|
176
|
-
|
|
177
|
-
|
|
187
|
+
@abstractmethod
|
|
188
|
+
def _prepare(self) -> "Base":
|
|
189
|
+
...
|
|
178
190
|
|
|
179
191
|
def format(self, encoder: str) -> str:
|
|
180
192
|
s = self.prepare()
|
|
@@ -194,7 +206,10 @@ class AbstractDataset(AbstractData):
|
|
|
194
206
|
from datamaestro.data import Base
|
|
195
207
|
|
|
196
208
|
if isinstance(data, Base):
|
|
197
|
-
|
|
209
|
+
if self.repository is None:
|
|
210
|
+
data.id = id
|
|
211
|
+
else:
|
|
212
|
+
data.id = f"{id}@{self.repository.name}"
|
|
198
213
|
for key, value in data.__xpm__.values.items():
|
|
199
214
|
if isinstance(value, Config):
|
|
200
215
|
self.setDataIDs(value, f"{id}.{key}")
|
|
@@ -203,6 +218,7 @@ class AbstractDataset(AbstractData):
|
|
|
203
218
|
"""Download all the necessary resources"""
|
|
204
219
|
success = True
|
|
205
220
|
logging.info("Materializing %d resources", len(self.ordered_resources))
|
|
221
|
+
self.prepare()
|
|
206
222
|
for resource in self.ordered_resources:
|
|
207
223
|
try:
|
|
208
224
|
resource.download(force)
|
|
@@ -216,6 +232,8 @@ class AbstractDataset(AbstractData):
|
|
|
216
232
|
@staticmethod
|
|
217
233
|
def find(name: str) -> "DataDefinition":
|
|
218
234
|
"""Find a dataset given its name"""
|
|
235
|
+
from datamaestro.context import Context # noqa: F811
|
|
236
|
+
|
|
219
237
|
logging.debug("Searching dataset %s", name)
|
|
220
238
|
for repository in Context.instance().repositories():
|
|
221
239
|
logging.debug("Searching dataset %s in %s", name, repository)
|
|
@@ -226,7 +244,7 @@ class AbstractDataset(AbstractData):
|
|
|
226
244
|
|
|
227
245
|
|
|
228
246
|
class FutureAttr:
|
|
229
|
-
"""Allows to access a dataset
|
|
247
|
+
"""Allows to access a dataset sub-property"""
|
|
230
248
|
|
|
231
249
|
def __init__(self, dataset, keys):
|
|
232
250
|
self.dataset = dataset
|
|
@@ -256,10 +274,14 @@ class DatasetWrapper(AbstractDataset):
|
|
|
256
274
|
annotations (otherwise, derive from `AbstractDataset`).
|
|
257
275
|
"""
|
|
258
276
|
|
|
277
|
+
BUILDING: ClassVar[list["DatasetWrapper"]] = []
|
|
278
|
+
"""Currently built dataset"""
|
|
279
|
+
|
|
259
280
|
def __init__(self, annotation, t: type):
|
|
281
|
+
self.config = None
|
|
282
|
+
self.repository: Optional[Repository] = None
|
|
260
283
|
self.t = t
|
|
261
284
|
self.base = annotation.base
|
|
262
|
-
self.config = None
|
|
263
285
|
assert self.base is not None, f"Could not set the Config type for {t}"
|
|
264
286
|
|
|
265
287
|
repository, components = DataDefinition.repository_relpath(t)
|
|
@@ -271,19 +293,22 @@ class DatasetWrapper(AbstractDataset):
|
|
|
271
293
|
|
|
272
294
|
# Builds the ID:
|
|
273
295
|
# Removes module_name.config prefix
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
296
|
+
if annotation.id is None or annotation.id == "":
|
|
297
|
+
# Computes an ID
|
|
298
|
+
assert (
|
|
299
|
+
# id is empty string = use the module id
|
|
300
|
+
components[0]
|
|
301
|
+
== "config"
|
|
302
|
+
), (
|
|
303
|
+
"A @dataset without `id` should be in the "
|
|
304
|
+
f".config module (not {t.__module__})"
|
|
305
|
+
)
|
|
306
|
+
path = ".".join(components[1:-1])
|
|
277
307
|
|
|
278
|
-
path = ".".join(components[1:-1])
|
|
279
|
-
if annotation.id == "":
|
|
280
|
-
# id is empty string = use the module id
|
|
281
308
|
self.id = path
|
|
282
309
|
else:
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
annotation.id or t.__name__.lower().replace("_", "."),
|
|
286
|
-
)
|
|
310
|
+
# Use the provided ID
|
|
311
|
+
self.id = annotation.id
|
|
287
312
|
|
|
288
313
|
self.aliases.add(self.id)
|
|
289
314
|
|
|
@@ -327,9 +352,6 @@ class DatasetWrapper(AbstractDataset):
|
|
|
327
352
|
def configtype(self):
|
|
328
353
|
return self.base
|
|
329
354
|
|
|
330
|
-
def __call__(self, *args, **kwargs):
|
|
331
|
-
self.t(*args, **kwargs)
|
|
332
|
-
|
|
333
355
|
def __getattr__(self, key):
|
|
334
356
|
"""Returns a pointer to a potential attribute"""
|
|
335
357
|
return FutureAttr(self, [key])
|
|
@@ -339,40 +361,59 @@ class DatasetWrapper(AbstractDataset):
|
|
|
339
361
|
self._prepare()
|
|
340
362
|
return super().download(force=force)
|
|
341
363
|
|
|
342
|
-
|
|
364
|
+
@contextmanager
|
|
365
|
+
def building(self):
|
|
366
|
+
DatasetWrapper.BUILDING.append(self)
|
|
367
|
+
yield self
|
|
368
|
+
DatasetWrapper.BUILDING.pop()
|
|
369
|
+
|
|
370
|
+
def _prepare(self) -> "Base":
|
|
343
371
|
if self.config is not None:
|
|
344
372
|
return self.config
|
|
345
373
|
|
|
374
|
+
# Direct creation of the dataset
|
|
346
375
|
if self.base is self.t:
|
|
347
376
|
self.config = self.base.__create_dataset__(self)
|
|
348
377
|
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
378
|
+
# Construct the object
|
|
379
|
+
resources = {key: value.prepare() for key, value in self.resources.items()}
|
|
380
|
+
|
|
381
|
+
with self.building():
|
|
382
|
+
result = self.t(**resources)
|
|
383
|
+
|
|
384
|
+
# Download resources
|
|
354
385
|
logging.debug("Building with data type %s and dataset %s", self.base, self.t)
|
|
355
386
|
for hook in self.hooks["pre-use"]:
|
|
356
387
|
hook(self)
|
|
357
388
|
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
self.config =
|
|
389
|
+
if result is None:
|
|
390
|
+
name = self.t.__name__
|
|
391
|
+
filename = inspect.getfile(self.t)
|
|
392
|
+
raise Exception(
|
|
393
|
+
f"The dataset method {name} defined in "
|
|
394
|
+
f"{filename} returned a null object"
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
if isinstance(result, dict):
|
|
398
|
+
self.config = self.base(**result)
|
|
399
|
+
elif isinstance(result, self.base):
|
|
400
|
+
self.config = result
|
|
401
|
+
else:
|
|
402
|
+
raise RuntimeError(
|
|
403
|
+
f"The dataset method {name} defined in "
|
|
404
|
+
f"{filename} returned an object of type {type(dict)}"
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
# Setup ourself
|
|
408
|
+
self.config.__datamaestro_dataset__ = self
|
|
370
409
|
|
|
371
410
|
# Set the ids
|
|
372
411
|
self.setDataIDs(self.config, self.id)
|
|
373
412
|
|
|
374
413
|
return self.config
|
|
375
414
|
|
|
415
|
+
__call__ = _prepare
|
|
416
|
+
|
|
376
417
|
@property
|
|
377
418
|
def _path(self) -> Path:
|
|
378
419
|
"""Returns a unique relative path for this dataset"""
|
|
@@ -384,7 +425,20 @@ class DatasetWrapper(AbstractDataset):
|
|
|
384
425
|
@property
|
|
385
426
|
def datapath(self):
|
|
386
427
|
"""Returns the destination path for downloads"""
|
|
387
|
-
|
|
428
|
+
from datamaestro import Context # noqa: F811
|
|
429
|
+
|
|
430
|
+
path = Context.instance().storepath / self._path
|
|
431
|
+
|
|
432
|
+
if (self.repository is not None) and (not path.exists()):
|
|
433
|
+
old_path: Path = self.repository.datapath / self._path
|
|
434
|
+
if old_path.exists():
|
|
435
|
+
logging.info(
|
|
436
|
+
"Moving from old path [%s] to new path [%s]", old_path, path
|
|
437
|
+
)
|
|
438
|
+
path.parent.mkdir(exist_ok=True, parents=True)
|
|
439
|
+
old_path.rename(path)
|
|
440
|
+
|
|
441
|
+
return path
|
|
388
442
|
|
|
389
443
|
def hasfiles(self) -> bool:
|
|
390
444
|
"""Returns whether this dataset has files or only includes references"""
|
|
@@ -426,10 +480,16 @@ class DatasetAnnotation:
|
|
|
426
480
|
"""Base class for all annotations"""
|
|
427
481
|
|
|
428
482
|
def __call__(self, dataset: AbstractDataset):
|
|
429
|
-
|
|
430
|
-
dataset
|
|
431
|
-
|
|
432
|
-
|
|
483
|
+
if isinstance(dataset, AbstractDataset):
|
|
484
|
+
self.annotate(dataset)
|
|
485
|
+
elif issubclass(dataset, Dataset):
|
|
486
|
+
self.annotate(dataset.__datamaestro__)
|
|
487
|
+
else:
|
|
488
|
+
raise RuntimeError(
|
|
489
|
+
f"Only datasets can be annotated with {self}, "
|
|
490
|
+
f"but {dataset} is not a dataset"
|
|
491
|
+
)
|
|
492
|
+
|
|
433
493
|
return dataset
|
|
434
494
|
|
|
435
495
|
def annotate(self, dataset: AbstractDataset):
|
|
@@ -477,9 +537,27 @@ datatags = DataTagging(lambda d: d.tags)
|
|
|
477
537
|
datatasks = DataTagging(lambda d: d.tasks)
|
|
478
538
|
|
|
479
539
|
|
|
540
|
+
class metadata:
|
|
541
|
+
def __init__(
|
|
542
|
+
self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
|
|
543
|
+
):
|
|
544
|
+
pass
|
|
545
|
+
|
|
546
|
+
def __call__(self, object: type):
|
|
547
|
+
# FIXME: todo
|
|
548
|
+
return object
|
|
549
|
+
|
|
550
|
+
|
|
480
551
|
class dataset:
|
|
481
552
|
def __init__(
|
|
482
|
-
self,
|
|
553
|
+
self,
|
|
554
|
+
base=None,
|
|
555
|
+
*,
|
|
556
|
+
timestamp=None,
|
|
557
|
+
id=None,
|
|
558
|
+
url=None,
|
|
559
|
+
size=None,
|
|
560
|
+
doi=None,
|
|
483
561
|
):
|
|
484
562
|
"""Creates a new (meta)dataset
|
|
485
563
|
|
|
@@ -523,9 +601,12 @@ class dataset:
|
|
|
523
601
|
if inspect.isclass(t) and issubclass(t, Base):
|
|
524
602
|
self.base = t
|
|
525
603
|
else:
|
|
526
|
-
# Get type from return annotation
|
|
527
604
|
try:
|
|
528
|
-
|
|
605
|
+
# Get type from return annotation
|
|
606
|
+
return_type = t.__annotations__["return"]
|
|
607
|
+
if isinstance(return_type, _GenericAlias):
|
|
608
|
+
return_type = return_type.__origin__
|
|
609
|
+
self.base = return_type
|
|
529
610
|
except KeyError:
|
|
530
611
|
logging.warning("No return annotation in %s", t)
|
|
531
612
|
raise
|
|
@@ -533,7 +614,6 @@ class dataset:
|
|
|
533
614
|
raise AssertionError("@data should only be called once")
|
|
534
615
|
except AttributeError:
|
|
535
616
|
pass
|
|
536
|
-
|
|
537
617
|
dw = DatasetWrapper(self, t)
|
|
538
618
|
t.__dataset__ = dw
|
|
539
619
|
if inspect.isclass(t) and issubclass(t, Base):
|
|
@@ -560,3 +640,5 @@ class metadataset(AbstractDataset):
|
|
|
560
640
|
pass
|
|
561
641
|
t.__datamaestro__ = self
|
|
562
642
|
return t
|
|
643
|
+
|
|
644
|
+
_prepare = None
|
datamaestro/download/__init__.py
CHANGED
|
@@ -43,6 +43,13 @@ class Resource(DatasetAnnotation, ABC):
|
|
|
43
43
|
dataset.ordered_resources.append(self)
|
|
44
44
|
self.definition = dataset
|
|
45
45
|
|
|
46
|
+
def contextualize(self):
|
|
47
|
+
"""When using an annotation inline, uses the current dataset wrapper object"""
|
|
48
|
+
from datamaestro.definitions import DatasetWrapper
|
|
49
|
+
|
|
50
|
+
wrapper = DatasetWrapper.BUILDING[-1]
|
|
51
|
+
self.annotate(wrapper)
|
|
52
|
+
|
|
46
53
|
@property
|
|
47
54
|
def context(self):
|
|
48
55
|
return self.definition.context
|
|
@@ -77,7 +84,7 @@ class Resource(DatasetAnnotation, ABC):
|
|
|
77
84
|
Download = Resource
|
|
78
85
|
|
|
79
86
|
|
|
80
|
-
class reference(
|
|
87
|
+
class reference(Resource):
|
|
81
88
|
def __init__(self, varname=None, reference=None):
|
|
82
89
|
"""References another dataset
|
|
83
90
|
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Protocol
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datamaestro import Context
|
|
4
|
+
from datamaestro.definitions import DatasetWrapper
|
|
5
|
+
from datamaestro.download import Resource
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Downloader(Protocol):
|
|
9
|
+
def __call__(self, context: Context, root: Path, *, force=False):
|
|
10
|
+
pass
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CustomResource(Resource):
|
|
14
|
+
def __init__(self, ds_wrapper: DatasetWrapper, downloader: Downloader):
|
|
15
|
+
self.ds_wrapper = ds_wrapper
|
|
16
|
+
self.downloader = downloader
|
|
17
|
+
|
|
18
|
+
def prepare(self):
|
|
19
|
+
pass
|
|
20
|
+
|
|
21
|
+
def download(self, force=False):
|
|
22
|
+
self.downloader(self.context, self.ds_wrapper.datapath, force=force)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def custom_download(downloader: Downloader) -> Path:
|
|
26
|
+
ds_wrapper = DatasetWrapper.BUILDING[-1]
|
|
27
|
+
ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
|
|
28
|
+
|
|
29
|
+
return ds_wrapper.datapath
|
datamaestro/download/single.py
CHANGED
|
@@ -9,7 +9,7 @@ import os
|
|
|
9
9
|
import urllib3
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
import re
|
|
12
|
-
from datamaestro.utils import copyfileobjs
|
|
12
|
+
from datamaestro.utils import copyfileobjs, FileChecker
|
|
13
13
|
from datamaestro.stream import Transform
|
|
14
14
|
from datamaestro.download import Download
|
|
15
15
|
|
|
@@ -96,6 +96,20 @@ class filedownloader(SingleDownload):
|
|
|
96
96
|
logging.info("Created file %s" % destination)
|
|
97
97
|
|
|
98
98
|
|
|
99
|
+
def file_from_url(
|
|
100
|
+
filename: str,
|
|
101
|
+
url: str,
|
|
102
|
+
*,
|
|
103
|
+
size: Optional[int] = None,
|
|
104
|
+
transforms: Optional[Transform] = None,
|
|
105
|
+
checker: Optional[FileChecker] = None,
|
|
106
|
+
) -> Path:
|
|
107
|
+
"""Defines a file that should be downloaded from"""
|
|
108
|
+
downloader = filedownloader(filename, url, size, transforms, checker)
|
|
109
|
+
downloader.contextualize()
|
|
110
|
+
return downloader.path
|
|
111
|
+
|
|
112
|
+
|
|
99
113
|
class concatdownload(SingleDownload):
|
|
100
114
|
"""Concatenate all files in an archive"""
|
|
101
115
|
|
datamaestro/search.py
CHANGED
datamaestro/utils.py
CHANGED
datamaestro/version.py
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
|
-
# file generated by
|
|
1
|
+
# file generated by setuptools-scm
|
|
2
2
|
# don't change, don't track in version control
|
|
3
|
+
|
|
4
|
+
__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
|
|
5
|
+
|
|
3
6
|
TYPE_CHECKING = False
|
|
4
7
|
if TYPE_CHECKING:
|
|
5
|
-
from typing import Tuple
|
|
8
|
+
from typing import Tuple
|
|
9
|
+
from typing import Union
|
|
10
|
+
|
|
6
11
|
VERSION_TUPLE = Tuple[Union[int, str], ...]
|
|
7
12
|
else:
|
|
8
13
|
VERSION_TUPLE = object
|
|
@@ -12,5 +17,5 @@ __version__: str
|
|
|
12
17
|
__version_tuple__: VERSION_TUPLE
|
|
13
18
|
version_tuple: VERSION_TUPLE
|
|
14
19
|
|
|
15
|
-
__version__ = version = '1.
|
|
16
|
-
__version_tuple__ = version_tuple = (1,
|
|
20
|
+
__version__ = version = '1.3.1'
|
|
21
|
+
__version_tuple__ = version_tuple = (1, 3, 1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.3.1
|
|
4
4
|
Summary: "Dataset management command line and API"
|
|
5
5
|
Home-page: https://github.com/experimaestro/datamaestro
|
|
6
6
|
Author: Benjamin Piwowarski
|
|
@@ -25,17 +25,19 @@ Requires-Dist: click
|
|
|
25
25
|
Requires-Dist: tqdm
|
|
26
26
|
Requires-Dist: urllib3
|
|
27
27
|
Requires-Dist: marshmallow
|
|
28
|
-
Requires-Dist:
|
|
28
|
+
Requires-Dist: cached_property
|
|
29
29
|
Requires-Dist: requests
|
|
30
30
|
Requires-Dist: bitmath
|
|
31
|
-
Requires-Dist: experimaestro
|
|
31
|
+
Requires-Dist: experimaestro>=1.5.0
|
|
32
32
|
Requires-Dist: mkdocs
|
|
33
33
|
Requires-Dist: pymdown-extensions
|
|
34
34
|
Requires-Dist: mkdocs-material
|
|
35
|
-
Requires-Dist:
|
|
35
|
+
Requires-Dist: docstring_parser
|
|
36
36
|
Requires-Dist: numpy
|
|
37
37
|
Provides-Extra: test
|
|
38
|
-
Requires-Dist: tox
|
|
38
|
+
Requires-Dist: tox; extra == "test"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
Dynamic: requires-dist
|
|
39
41
|
|
|
40
42
|
[](https://badge.fury.io/py/datamaestro) [](https://github.com/pre-commit/pre-commit) [](https://zenodo.org/badge/latestdoi/4573876)
|
|
41
43
|
|
|
@@ -1,31 +1,34 @@
|
|
|
1
|
-
datamaestro/__init__.py,sha256=
|
|
1
|
+
datamaestro/__init__.py,sha256=gnbxrPFzIuG4oR2Qrw9UYS0SNVsf4yCtqNvzSjstdak,376
|
|
2
2
|
datamaestro/__main__.py,sha256=tJTf1sTWKRIatvBcHlWDIZRZodAZ2B2zkD01pD89MYk,9024
|
|
3
|
-
datamaestro/context.py,sha256=
|
|
4
|
-
datamaestro/definitions.py,sha256=
|
|
3
|
+
datamaestro/context.py,sha256=S7sQ6RQVLjtoY5iyAikfyvfbqoaoDzcHt4-js8t6mMg,13653
|
|
4
|
+
datamaestro/definitions.py,sha256=HEnwB32Reb4ouLOjboEOe_j88keBZPQ0SU6OrO_ohLU,18764
|
|
5
5
|
datamaestro/record.py,sha256=m3WGsPcZ1LouQXNJOBUK3QusAIRiuy6T_oqhq09-Ckg,5504
|
|
6
6
|
datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
|
|
7
|
-
datamaestro/search.py,sha256=
|
|
7
|
+
datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
|
|
8
8
|
datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
|
|
9
9
|
datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
|
|
10
|
-
datamaestro/utils.py,sha256=
|
|
11
|
-
datamaestro/version.py,sha256=
|
|
10
|
+
datamaestro/utils.py,sha256=9m-AVVww6InAZfGFiGy6XJzfExpYNqH1fhWQEezjafA,6536
|
|
11
|
+
datamaestro/version.py,sha256=SRqFNM-332JnoeCoBUmNOKj9orV-sUMTJlWFg9t-9tI,511
|
|
12
12
|
datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
|
|
13
13
|
datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
|
|
14
14
|
datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
|
|
16
16
|
datamaestro/commands/site.py,sha256=nnz4tOwKcgUmsLfPcQVo2SgFIC3OShYfJ8S2N6vuzAw,14173
|
|
17
|
-
datamaestro/data/__init__.py,sha256=
|
|
18
|
-
datamaestro/data/csv.py,sha256
|
|
17
|
+
datamaestro/data/__init__.py,sha256=Z1qZnliJwS5sRaLznK5YBVJCjvAlPbmJjbRvvLv_UVI,1547
|
|
18
|
+
datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
|
|
19
19
|
datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
|
|
20
|
-
datamaestro/data/ml.py,sha256=
|
|
21
|
-
datamaestro/data/tensor.py,sha256=
|
|
22
|
-
datamaestro/
|
|
20
|
+
datamaestro/data/ml.py,sha256=7Rv4Tb9g17HDj8mOBJpIDjgolGQAd5Wrb0mHlnm-bPE,709
|
|
21
|
+
datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
|
|
22
|
+
datamaestro/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
datamaestro/datasets/yaml_repository.py,sha256=X5JjA2dQ5xfdYSUgL2EbZhrOYn-FPiBOAK97kw4kwqo,2533
|
|
24
|
+
datamaestro/download/__init__.py,sha256=XcRw9acAq1IwhLQZpj2HpMNEaMesA5BbllJpbRCkOwA,2846
|
|
23
25
|
datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
|
|
26
|
+
datamaestro/download/custom.py,sha256=2-gFoOgQ8J93HjH9sc7u6wjVYm7DmSytP1ty2O6-d8k,839
|
|
24
27
|
datamaestro/download/huggingface.py,sha256=LkzmZo2Z0yccqAfj7di7jDNGFrMKN9m8IM8SfexOomY,1125
|
|
25
28
|
datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
|
|
26
29
|
datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
|
|
27
30
|
datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0LqM,2159
|
|
28
|
-
datamaestro/download/single.py,sha256=
|
|
31
|
+
datamaestro/download/single.py,sha256=bMDLldvODp2ZXyxXeKLT4qbL-v4igA6A7HVjIt2Cf8c,4526
|
|
29
32
|
datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
|
|
30
33
|
datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
|
|
31
34
|
datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
|
|
@@ -36,12 +39,12 @@ datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8
|
|
|
36
39
|
datamaestro/test/__init__.py,sha256=8-oxS68ufD45pv_HldE4S4rSWFF6L-UB_Cms-72DD2M,22
|
|
37
40
|
datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
|
|
38
41
|
datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,767
|
|
39
|
-
datamaestro/test/test_annotations.py,sha256=
|
|
40
|
-
datamaestro/test/test_download_handlers.py,sha256
|
|
42
|
+
datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
|
|
43
|
+
datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
|
|
41
44
|
datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
|
|
42
|
-
datamaestro-1.
|
|
43
|
-
datamaestro-1.
|
|
44
|
-
datamaestro-1.
|
|
45
|
-
datamaestro-1.
|
|
46
|
-
datamaestro-1.
|
|
47
|
-
datamaestro-1.
|
|
45
|
+
datamaestro-1.3.1.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
|
|
46
|
+
datamaestro-1.3.1.dist-info/METADATA,sha256=nPhwqTfFtgy8viBLpgHzTJX_7msSqQdn7ab7Ph3kcFA,9042
|
|
47
|
+
datamaestro-1.3.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
48
|
+
datamaestro-1.3.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
|
|
49
|
+
datamaestro-1.3.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
|
|
50
|
+
datamaestro-1.3.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|