datamaestro 1.2.0__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-1.2.0 → datamaestro-1.3.0}/PKG-INFO +4 -2
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/data.md +0 -5
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/download.rst +11 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/__init__.py +5 -1
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/context.py +71 -67
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/__init__.py +18 -11
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/csv.py +8 -6
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/ml.py +3 -3
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/tensor.py +30 -23
- datamaestro-1.3.0/src/datamaestro/datasets/__init__.py +0 -0
- datamaestro-1.3.0/src/datamaestro/datasets/yaml_repository.py +103 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/definitions.py +135 -53
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/__init__.py +8 -1
- datamaestro-1.3.0/src/datamaestro/download/custom.py +29 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/single.py +15 -1
- datamaestro-1.3.0/src/datamaestro/download/wayback.py +163 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/search.py +1 -1
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_annotations.py +2 -1
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_download_handlers.py +3 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/utils.py +2 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/version.py +9 -4
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/PKG-INFO +4 -2
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/SOURCES.txt +4 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.coverage +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.github/workflows/pytest.yml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.gitignore +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.pre-commit-config.yaml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/.readthedocs.yml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/CHANGELOG.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/LICENSE +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/MANIFEST.in +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/README.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/TODO.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/Makefile +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/make.bat +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/requirements.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/index.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/records.rst +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/conf.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/datasets.rst +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/developping.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/index.md +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/style.css +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/mkdocs.yml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/pyproject.toml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/pytest.ini +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/requirements-dev.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/requirements.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/schema.yaml +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/setup.cfg +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/setup.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/__main__.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/annotations/agreement.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/commands/site.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/archive.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/huggingface.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/links.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/multiple.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/sync.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/download/todo.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/record.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/settings.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/sphinx.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/stream/lines.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/__init__.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/conftest.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/test/test_record.py +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/requires.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro.egg-info/top_level.txt +0 -0
- {datamaestro-1.2.0 → datamaestro-1.3.0}/tox.ini +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datamaestro
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: "Dataset management command line and API"
|
|
5
5
|
Home-page: https://github.com/experimaestro/datamaestro
|
|
6
6
|
Author: Benjamin Piwowarski
|
|
@@ -36,6 +36,8 @@ Requires-Dist: docstring_parser
|
|
|
36
36
|
Requires-Dist: numpy
|
|
37
37
|
Provides-Extra: test
|
|
38
38
|
Requires-Dist: tox; extra == "test"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
Dynamic: requires-dist
|
|
39
41
|
|
|
40
42
|
[](https://badge.fury.io/py/datamaestro) [](https://github.com/pre-commit/pre-commit) [](https://zenodo.org/badge/latestdoi/4573876)
|
|
41
43
|
|
|
@@ -40,6 +40,10 @@ Package `datamaestro.download.links`
|
|
|
40
40
|
.. autofunction:: datamaestro.download.links.linkfile
|
|
41
41
|
|
|
42
42
|
|
|
43
|
+
Other
|
|
44
|
+
=====
|
|
45
|
+
|
|
46
|
+
.. autofunction:: datamaestro.download.wayback.wayback_documents
|
|
43
47
|
|
|
44
48
|
|
|
45
49
|
|
|
@@ -58,3 +62,10 @@ File hashes can be checked with the following checker
|
|
|
58
62
|
.. autoclass:: datamaestro.utils.FileChecker
|
|
59
63
|
.. autoclass:: datamaestro.utils.HashCheck
|
|
60
64
|
:members: __init__
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
Custom
|
|
68
|
+
======
|
|
69
|
+
|
|
70
|
+
.. autofunction:: datamaestro.download.custom.Downloader
|
|
71
|
+
.. autoclass:: datamaestro.download.custom.custom_download
|
|
@@ -2,10 +2,14 @@
|
|
|
2
2
|
from .context import (
|
|
3
3
|
Context,
|
|
4
4
|
Repository,
|
|
5
|
+
BaseRepository,
|
|
5
6
|
get_dataset,
|
|
6
7
|
prepare_dataset,
|
|
7
8
|
)
|
|
8
9
|
|
|
9
|
-
from
|
|
10
|
+
from .datasets.yaml_repository import YAMLRepository
|
|
10
11
|
|
|
12
|
+
from pkg_resources import get_distribution, DistributionNotFound
|
|
13
|
+
from .definitions import dataset, metadata
|
|
14
|
+
from .data import Base
|
|
11
15
|
from .version import version, version_tuple
|
|
@@ -1,21 +1,22 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from
|
|
2
|
+
from typing import Iterable, Iterator, Dict, Union
|
|
3
3
|
import importlib
|
|
4
4
|
import os
|
|
5
5
|
import hashlib
|
|
6
6
|
import logging
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
|
-
from
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from experimaestro import Config
|
|
10
11
|
import pkg_resources
|
|
11
|
-
from
|
|
12
|
+
from experimaestro.compat import cached_property
|
|
13
|
+
from experimaestro.mkdocs.metaloader import Module
|
|
12
14
|
from .utils import CachedFile, downloadURL
|
|
13
15
|
from .settings import UserSettings, Settings
|
|
14
|
-
|
|
15
16
|
from typing import TYPE_CHECKING
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
|
-
from datamaestro.definitions import AbstractDataset
|
|
19
|
+
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
class Compression:
|
|
@@ -87,6 +88,11 @@ class Context:
|
|
|
87
88
|
|
|
88
89
|
return ContextManager()
|
|
89
90
|
|
|
91
|
+
@property
|
|
92
|
+
def storepath(self):
|
|
93
|
+
"""Replaces the data path"""
|
|
94
|
+
return self._path.joinpath("store")
|
|
95
|
+
|
|
90
96
|
@property
|
|
91
97
|
def datapath(self):
|
|
92
98
|
return self._path.joinpath("data")
|
|
@@ -98,7 +104,9 @@ class Context:
|
|
|
98
104
|
@cached_property
|
|
99
105
|
def repositorymap(self) -> Dict[str, "Repository"]:
|
|
100
106
|
return {
|
|
101
|
-
repository.basemodule(): repository
|
|
107
|
+
repository.basemodule(): repository
|
|
108
|
+
for repository in self.repositories()
|
|
109
|
+
if repository.basemodule() is not None
|
|
102
110
|
}
|
|
103
111
|
|
|
104
112
|
def repositories(self) -> Iterable["Repository"]:
|
|
@@ -286,10 +294,53 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
286
294
|
yield value.__dataset__
|
|
287
295
|
|
|
288
296
|
|
|
289
|
-
class Repository:
|
|
290
|
-
"""A repository
|
|
297
|
+
class BaseRepository(ABC):
|
|
298
|
+
"""A repository groups a set of datasets and their corresponding specific
|
|
291
299
|
handlers (downloading, filtering, etc.)"""
|
|
292
300
|
|
|
301
|
+
def __init__(self, context: Context):
|
|
302
|
+
self.context = context
|
|
303
|
+
p = inspect.getabsfile(self.__class__)
|
|
304
|
+
self.basedir = Path(p).parent
|
|
305
|
+
|
|
306
|
+
@abstractmethod
|
|
307
|
+
def __iter__(self) -> Iterator["AbstractDataset"]:
|
|
308
|
+
...
|
|
309
|
+
|
|
310
|
+
def search(self, name: str):
|
|
311
|
+
"""Search for a dataset in the definitions"""
|
|
312
|
+
for dataset in self:
|
|
313
|
+
if name in dataset.aliases:
|
|
314
|
+
return dataset
|
|
315
|
+
|
|
316
|
+
@classmethod
|
|
317
|
+
def instance(cls, context=None):
|
|
318
|
+
try:
|
|
319
|
+
return cls.__getattribute__(cls, "INSTANCE")
|
|
320
|
+
except AttributeError:
|
|
321
|
+
return cls(context if context else Context.instance())
|
|
322
|
+
|
|
323
|
+
@classmethod
|
|
324
|
+
def basemodule(cls):
|
|
325
|
+
return cls.__module__
|
|
326
|
+
|
|
327
|
+
@property
|
|
328
|
+
def generatedpath(self):
|
|
329
|
+
return self.basedir / "generated"
|
|
330
|
+
|
|
331
|
+
@property
|
|
332
|
+
def datapath(self):
|
|
333
|
+
return self.context.datapath.joinpath(self.id)
|
|
334
|
+
|
|
335
|
+
@property
|
|
336
|
+
def extrapath(self):
|
|
337
|
+
"""Path to the directory containing extra configuration files"""
|
|
338
|
+
return self.basedir / "data"
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class Repository(BaseRepository):
|
|
342
|
+
"""(deprecated) Repository where datasets are located in __module__.config"""
|
|
343
|
+
|
|
293
344
|
def __init__(self, context: Context):
|
|
294
345
|
"""Initialize a new repository
|
|
295
346
|
|
|
@@ -297,26 +348,14 @@ class Repository:
|
|
|
297
348
|
:param basedir: The base directory of the repository
|
|
298
349
|
(by default, the same as the repository class)
|
|
299
350
|
"""
|
|
351
|
+
super().__init__(context)
|
|
300
352
|
self.context = context
|
|
301
|
-
p = inspect.getabsfile(self.__class__)
|
|
302
|
-
self.basedir = Path(p).parent
|
|
303
353
|
self.configdir = self.basedir.joinpath("config")
|
|
304
354
|
self.id = self.__class__.NAMESPACE
|
|
305
355
|
self.name = self.id
|
|
306
356
|
self.module = self.__class__.__module__
|
|
307
357
|
self.__class__.INSTANCE = self
|
|
308
358
|
|
|
309
|
-
@classmethod
|
|
310
|
-
def basemodule(cls):
|
|
311
|
-
return cls.__module__
|
|
312
|
-
|
|
313
|
-
@classmethod
|
|
314
|
-
def instance(cls, context=None):
|
|
315
|
-
try:
|
|
316
|
-
return cls.__getattribute__(cls, "INSTANCE")
|
|
317
|
-
except AttributeError:
|
|
318
|
-
return cls(context if context else Context.instance())
|
|
319
|
-
|
|
320
359
|
@classmethod
|
|
321
360
|
def version(cls):
|
|
322
361
|
from pkg_resources import get_distribution, DistributionNotFound
|
|
@@ -336,36 +375,8 @@ class Repository:
|
|
|
336
375
|
assert isinstance(other, Repository)
|
|
337
376
|
return self.basedir == other.basedir
|
|
338
377
|
|
|
339
|
-
def
|
|
340
|
-
"""
|
|
341
|
-
logging.debug("Searching for %s in %s", name, self.configdir)
|
|
342
|
-
|
|
343
|
-
candidates: List[str] = []
|
|
344
|
-
components = name.split(".")
|
|
345
|
-
path = self.configdir
|
|
346
|
-
for i, c in enumerate(components):
|
|
347
|
-
path = path / c
|
|
348
|
-
|
|
349
|
-
if (path / "__init__.py").is_file():
|
|
350
|
-
candidates.append(".".join(components[: i + 1]))
|
|
351
|
-
|
|
352
|
-
if path.with_suffix(".py").is_file():
|
|
353
|
-
candidates.append(".".join(components[: i + 1]))
|
|
354
|
-
|
|
355
|
-
if not path.is_dir():
|
|
356
|
-
break
|
|
357
|
-
|
|
358
|
-
# Get the dataset
|
|
359
|
-
for candidate in candidates[::-1]:
|
|
360
|
-
logging.debug("Searching in module %s.config.%s", self.module, candidate)
|
|
361
|
-
module = importlib.import_module("%s.config.%s" % (self.module, candidate))
|
|
362
|
-
for value in Datasets(module):
|
|
363
|
-
if name in value.aliases:
|
|
364
|
-
return value
|
|
365
|
-
|
|
366
|
-
return None
|
|
367
|
-
|
|
368
|
-
def datasets(self, candidate):
|
|
378
|
+
def datasets(self, candidate: str):
|
|
379
|
+
"""Returns the dataset candidates from a module"""
|
|
369
380
|
try:
|
|
370
381
|
module = importlib.import_module("%s.config.%s" % (self.module, candidate))
|
|
371
382
|
except ModuleNotFoundError:
|
|
@@ -409,19 +420,6 @@ class Repository:
|
|
|
409
420
|
for dataset in datasets:
|
|
410
421
|
yield dataset
|
|
411
422
|
|
|
412
|
-
@property
|
|
413
|
-
def generatedpath(self):
|
|
414
|
-
return self.basedir.joinpath("generated")
|
|
415
|
-
|
|
416
|
-
@property
|
|
417
|
-
def datapath(self):
|
|
418
|
-
return self.context.datapath.joinpath(self.id)
|
|
419
|
-
|
|
420
|
-
@property
|
|
421
|
-
def extrapath(self):
|
|
422
|
-
"""Path to the directory containing extra configuration files"""
|
|
423
|
-
return self.basedir.joinpath("data")
|
|
424
|
-
|
|
425
423
|
|
|
426
424
|
def find_dataset(dataset_id: str):
|
|
427
425
|
"""Find a dataset given its id"""
|
|
@@ -430,11 +428,17 @@ def find_dataset(dataset_id: str):
|
|
|
430
428
|
return AbstractDataset.find(dataset_id)
|
|
431
429
|
|
|
432
430
|
|
|
433
|
-
def prepare_dataset(dataset_id: str):
|
|
431
|
+
def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
|
|
434
432
|
"""Find a dataset given its id and download the resources"""
|
|
435
|
-
from .definitions import AbstractDataset
|
|
433
|
+
from .definitions import AbstractDataset, DatasetWrapper
|
|
434
|
+
|
|
435
|
+
if isinstance(dataset_id, DatasetWrapper):
|
|
436
|
+
ds = dataset_id
|
|
437
|
+
elif isinstance(dataset_id, Config):
|
|
438
|
+
ds = dataset_id.__datamaestro_dataset__
|
|
439
|
+
else:
|
|
440
|
+
ds = AbstractDataset.find(dataset_id)
|
|
436
441
|
|
|
437
|
-
ds = AbstractDataset.find(dataset_id)
|
|
438
442
|
return ds.prepare(download=True)
|
|
439
443
|
|
|
440
444
|
|
|
@@ -1,22 +1,18 @@
|
|
|
1
|
+
from abc import abstractmethod
|
|
1
2
|
import logging
|
|
2
3
|
from pathlib import Path
|
|
3
4
|
from typing import Any, Dict
|
|
4
|
-
from
|
|
5
|
-
from
|
|
6
|
-
from experimaestro import documentation # noqa: F401
|
|
5
|
+
from experimaestro import Config, Param, Meta
|
|
6
|
+
from datamaestro.definitions import AbstractDataset
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class Base(Config):
|
|
10
10
|
"""Base object for all data types"""
|
|
11
11
|
|
|
12
12
|
id: Param[str]
|
|
13
|
-
"""The unique dataset ID"""
|
|
13
|
+
"""The unique (sub-)dataset ID"""
|
|
14
14
|
|
|
15
|
-
__datamaestro_dataset__: AbstractDataset
|
|
16
|
-
|
|
17
|
-
def download(self):
|
|
18
|
-
"""Download the dataset"""
|
|
19
|
-
self.__datamaestro_dataset__.download()
|
|
15
|
+
__datamaestro_dataset__: "AbstractDataset"
|
|
20
16
|
|
|
21
17
|
def dataset_information(self) -> Dict[str, Any]:
|
|
22
18
|
"""Returns document meta-informations"""
|
|
@@ -26,6 +22,16 @@ class Base(Config):
|
|
|
26
22
|
"description": self.__datamaestro_dataset__.description,
|
|
27
23
|
}
|
|
28
24
|
|
|
25
|
+
def download(self):
|
|
26
|
+
"""Download the dataset"""
|
|
27
|
+
self.__datamaestro_dataset__.download()
|
|
28
|
+
|
|
29
|
+
@abstractmethod
|
|
30
|
+
def prepare(self, *args, **kwargs):
|
|
31
|
+
"""Prepare the dataset"""
|
|
32
|
+
self.__datamaestro_dataset__.prepare()
|
|
33
|
+
return self
|
|
34
|
+
|
|
29
35
|
|
|
30
36
|
class Generic(Base):
|
|
31
37
|
"""Generic dataset
|
|
@@ -44,16 +50,17 @@ class Generic(Base):
|
|
|
44
50
|
class File(Base):
|
|
45
51
|
"""A data file"""
|
|
46
52
|
|
|
47
|
-
path:
|
|
53
|
+
path: Meta[Path]
|
|
48
54
|
"""The path of the file"""
|
|
49
55
|
|
|
50
56
|
def open(self, mode):
|
|
51
57
|
return self.path.open(mode)
|
|
52
58
|
|
|
53
59
|
|
|
54
|
-
@argument("path", type=Path)
|
|
55
60
|
class Folder(Base):
|
|
56
61
|
"""A data folder"""
|
|
57
62
|
|
|
63
|
+
path: Meta[Path]
|
|
64
|
+
|
|
58
65
|
def open(self, mode):
|
|
59
66
|
return self.path.open(mode)
|
|
@@ -1,7 +1,8 @@
|
|
|
1
|
+
from typing import Optional, Tuple, List, Any
|
|
1
2
|
from csv import reader as csv_reader
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from
|
|
3
|
+
from experimaestro import Param, Meta
|
|
4
|
+
from experimaestro import documentation
|
|
5
|
+
from . import File
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class Generic(File):
|
|
@@ -26,12 +27,13 @@ class Generic(File):
|
|
|
26
27
|
return row
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
@argument("names_row", type=int, default=-1)
|
|
30
|
-
@argument("size_row", type=int, default=-1)
|
|
31
|
-
@argument("target", type=str, default=None)
|
|
32
30
|
class Matrix(Generic):
|
|
33
31
|
"""A numerical dataset"""
|
|
34
32
|
|
|
33
|
+
names_row: Param[int] = -1
|
|
34
|
+
size_row: Param[int] = -1
|
|
35
|
+
target: Param[Optional[str]] = None
|
|
36
|
+
|
|
35
37
|
@documentation
|
|
36
38
|
def data(self) -> Tuple[List[str], Any]:
|
|
37
39
|
"""Returns the list of fields and the numeric data
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Machine learning generic data formats"""
|
|
2
|
-
from typing import Generic, TypeVar, Optional
|
|
3
2
|
from pathlib import Path
|
|
4
|
-
from
|
|
3
|
+
from typing import Generic, TypeVar, Optional
|
|
4
|
+
from experimaestro import Param, Meta
|
|
5
5
|
from . import Base
|
|
6
6
|
|
|
7
7
|
Train = TypeVar("Train", bound=Base)
|
|
@@ -20,8 +20,8 @@ class Supervised(Base, Generic[Train, Validation, Test]):
|
|
|
20
20
|
"""The training optional"""
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
@argument("classes")
|
|
24
23
|
class FolderBased(Base):
|
|
25
24
|
"""Classification dataset where folders give the basis"""
|
|
26
25
|
|
|
26
|
+
classes: Param[list[str]]
|
|
27
27
|
path: Meta[Path]
|
|
@@ -1,44 +1,50 @@
|
|
|
1
|
-
from
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
2
|
from struct import Struct
|
|
3
|
-
from
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
from . import File, Base
|
|
4
5
|
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
import numpy as np
|
|
5
8
|
|
|
6
|
-
|
|
9
|
+
|
|
10
|
+
class Tensor(Base, ABC):
|
|
11
|
+
@abstractmethod
|
|
12
|
+
def data(self) -> "np.ndarray":
|
|
13
|
+
"""Returns the tensor in numpy format"""
|
|
14
|
+
pass
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IDX(Tensor, File):
|
|
7
18
|
"""IDX File format
|
|
8
19
|
|
|
9
|
-
The IDX file format is a simple format for vectors and multidimensional
|
|
20
|
+
The IDX file format is a simple format for vectors and multidimensional
|
|
21
|
+
matrices of various numerical types.
|
|
10
22
|
|
|
11
23
|
The basic format is:
|
|
12
24
|
|
|
13
|
-
magic number
|
|
14
|
-
size in dimension
|
|
15
|
-
size in dimension 1
|
|
16
|
-
size in dimension 2
|
|
17
|
-
.....
|
|
18
|
-
size in dimension N
|
|
19
|
-
data
|
|
25
|
+
magic number size in dimension 0 size in dimension 1 size in dimension 2
|
|
26
|
+
..... size in dimension N data
|
|
20
27
|
|
|
21
28
|
The magic number is an integer (MSB first). The first 2 bytes are always 0.
|
|
22
29
|
|
|
23
|
-
The third byte codes the type of the data:
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
0x0B: short (2 bytes)
|
|
27
|
-
0x0C: int (4 bytes)
|
|
28
|
-
0x0D: float (4 bytes)
|
|
29
|
-
0x0E: double (8 bytes)
|
|
30
|
+
The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
|
|
31
|
+
byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
|
|
32
|
+
double (8 bytes)
|
|
30
33
|
|
|
31
|
-
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
34
|
+
The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
|
|
35
|
+
vectors, 2 for matrices....
|
|
32
36
|
|
|
33
|
-
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
37
|
+
The sizes in each dimension are 4-byte integers (MSB first, high endian,
|
|
38
|
+
like in most non-Intel processors).
|
|
34
39
|
|
|
35
|
-
The data is stored like in a C array, i.e. the index in the last dimension
|
|
40
|
+
The data is stored like in a C array, i.e. the index in the last dimension
|
|
41
|
+
changes the fastest.
|
|
36
42
|
"""
|
|
37
43
|
|
|
38
44
|
MAGIC_NUMBER = Struct(">HBB")
|
|
39
45
|
DIM = Struct(">I")
|
|
40
46
|
|
|
41
|
-
def data(self):
|
|
47
|
+
def data(self) -> "np.ndarray":
|
|
42
48
|
"""Returns the tensor"""
|
|
43
49
|
import numpy as np
|
|
44
50
|
|
|
@@ -59,7 +65,8 @@ class IDX(File):
|
|
|
59
65
|
shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]
|
|
60
66
|
|
|
61
67
|
size = np.prod(shape)
|
|
62
|
-
# Could use np.fromfile... if it were not broken
|
|
68
|
+
# Could use np.fromfile... if it were not broken
|
|
69
|
+
# see https://github.com/numpy/numpy/issues/7989
|
|
63
70
|
data = np.frombuffer(fp.read(), dtype=dtype, count=size)
|
|
64
71
|
data = data.reshape(shape, order="C")
|
|
65
72
|
return data
|
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import regex
|
|
2
|
+
from typing import Iterator, Optional
|
|
3
|
+
from functools import cached_property
|
|
4
|
+
from attrs import field
|
|
5
|
+
import importlib
|
|
6
|
+
from omegaconf import OmegaConf
|
|
7
|
+
from functools import partial
|
|
8
|
+
from attrs import define
|
|
9
|
+
from datamaestro import BaseRepository
|
|
10
|
+
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
11
|
+
from datamaestro.data import Base
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
re_spec = regex.compile(r"""^(\w\.)+:(\w+)""")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@define
|
|
18
|
+
class RepositoryDataset:
|
|
19
|
+
ids: list[str]
|
|
20
|
+
"""ID(s) of this dataset"""
|
|
21
|
+
|
|
22
|
+
entry_point: str = field(validator=re_spec.match)
|
|
23
|
+
"""The entry point"""
|
|
24
|
+
|
|
25
|
+
title: str
|
|
26
|
+
"""The full name of the dataset"""
|
|
27
|
+
|
|
28
|
+
description: str
|
|
29
|
+
"""Description of the dataset"""
|
|
30
|
+
|
|
31
|
+
url: Optional[str]
|
|
32
|
+
"""The URL"""
|
|
33
|
+
|
|
34
|
+
groups: Optional[list[str]]
|
|
35
|
+
"""Groups to which this repository belongs"""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@define
|
|
39
|
+
class RepositoryAuthors:
|
|
40
|
+
name: str
|
|
41
|
+
email: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@define
|
|
45
|
+
class RepositoryGroup:
|
|
46
|
+
name: str
|
|
47
|
+
tasks: list[str]
|
|
48
|
+
tags: list[str]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@define
|
|
52
|
+
class RepositoryConfiguration:
|
|
53
|
+
namespace: str
|
|
54
|
+
authors: list[RepositoryAuthors]
|
|
55
|
+
description: str
|
|
56
|
+
groups: dict[str, RepositoryGroup]
|
|
57
|
+
datasets: list[RepositoryDataset]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class YAMLDataset(AbstractDataset):
|
|
61
|
+
def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
|
|
62
|
+
super().__init__(repository)
|
|
63
|
+
self.information = information
|
|
64
|
+
self.id = self.information.ids[0]
|
|
65
|
+
self.aliases = set(self.information.ids)
|
|
66
|
+
|
|
67
|
+
@cached_property
|
|
68
|
+
def wrapper(self) -> DatasetWrapper:
|
|
69
|
+
module, func_name = self.information.entry_point.split(":")
|
|
70
|
+
wrapper = getattr(importlib.import_module(module), func_name)
|
|
71
|
+
return wrapper
|
|
72
|
+
|
|
73
|
+
def _prepare(self) -> "Base":
|
|
74
|
+
return self.wrapper()
|
|
75
|
+
|
|
76
|
+
def download(self, **kwargs):
|
|
77
|
+
return self.wrapper.download(**kwargs)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class YAMLRepository(BaseRepository):
|
|
81
|
+
"""YAML-based repository"""
|
|
82
|
+
|
|
83
|
+
@property
|
|
84
|
+
def id(self):
|
|
85
|
+
return self.configuration.namespace
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def name(self):
|
|
89
|
+
return self.configuration.namespace
|
|
90
|
+
|
|
91
|
+
@cached_property
|
|
92
|
+
def configuration(self):
|
|
93
|
+
schema = OmegaConf.structured(RepositoryConfiguration)
|
|
94
|
+
with importlib.resources.path(
|
|
95
|
+
self.__class__.__module__, "datamaestro.yaml"
|
|
96
|
+
) as fp:
|
|
97
|
+
conf = OmegaConf.load(fp)
|
|
98
|
+
|
|
99
|
+
conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
|
|
100
|
+
return conf
|
|
101
|
+
|
|
102
|
+
def __iter__(self) -> Iterator["AbstractDataset"]:
|
|
103
|
+
return map(partial(YAMLDataset, self), self.configuration.datasets)
|