datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/__init__.py
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
# flake8: noqa: F401 (re-exports)
|
|
2
|
+
from .context import (
|
|
3
|
+
Context,
|
|
4
|
+
Repository,
|
|
5
|
+
BaseRepository,
|
|
6
|
+
get_dataset,
|
|
7
|
+
prepare_dataset,
|
|
8
|
+
)
|
|
2
9
|
|
|
3
|
-
from
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
__version__ = get_distribution(__name__).version
|
|
7
|
-
except DistributionNotFound:
|
|
8
|
-
__version__ = None
|
|
10
|
+
from .definitions import dataset, metadata
|
|
11
|
+
from .data import Base
|
|
12
|
+
from .version import __version__
|
datamaestro/__main__.py
CHANGED
|
@@ -1,19 +1,22 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
+
# ruff: noqa: T201
|
|
2
3
|
|
|
4
|
+
from importlib.metadata import entry_points
|
|
3
5
|
import sys
|
|
4
6
|
import logging
|
|
5
7
|
from functools import update_wrapper
|
|
6
8
|
import traceback as tb
|
|
7
|
-
import pkg_resources
|
|
8
9
|
import re
|
|
9
10
|
from pathlib import Path
|
|
10
11
|
import shutil
|
|
11
|
-
from .context import Context
|
|
12
12
|
from typing import Set
|
|
13
|
-
import
|
|
13
|
+
from urllib.parse import urlparse
|
|
14
14
|
|
|
15
15
|
import click
|
|
16
16
|
|
|
17
|
+
import datamaestro
|
|
18
|
+
from .context import Context
|
|
19
|
+
|
|
17
20
|
logging.basicConfig(level=logging.INFO)
|
|
18
21
|
|
|
19
22
|
|
|
@@ -37,7 +40,7 @@ def pass_cfg(f):
|
|
|
37
40
|
# Get all the available repositories
|
|
38
41
|
|
|
39
42
|
REPOSITORIES = {}
|
|
40
|
-
for entry_point in
|
|
43
|
+
for entry_point in entry_points(group="datamaestro.repositories"):
|
|
41
44
|
REPOSITORIES[entry_point.name] = entry_point
|
|
42
45
|
|
|
43
46
|
|
|
@@ -59,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
|
|
|
59
62
|
"--traceback", is_flag=True, help="Display traceback if an exception occurs"
|
|
60
63
|
)
|
|
61
64
|
@click.option(
|
|
62
|
-
"--data",
|
|
65
|
+
"--data",
|
|
66
|
+
type=Path,
|
|
67
|
+
help="Directory containing datasets",
|
|
68
|
+
default=Context.MAINDIR,
|
|
63
69
|
)
|
|
64
70
|
@click.pass_context
|
|
65
71
|
def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
|
|
@@ -90,6 +96,8 @@ def main():
|
|
|
90
96
|
@click.argument("dataset", type=str)
|
|
91
97
|
@pass_cfg
|
|
92
98
|
def info(config: Config, dataset):
|
|
99
|
+
from datamaestro.definitions import AbstractDataset
|
|
100
|
+
|
|
93
101
|
dataset = AbstractDataset.find(dataset)
|
|
94
102
|
print(dataset.name)
|
|
95
103
|
if dataset.url:
|
|
@@ -204,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
|
|
|
204
212
|
# --- Create a dataset
|
|
205
213
|
|
|
206
214
|
DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
|
|
207
|
-
from urllib.parse import urlparse
|
|
208
215
|
|
|
209
216
|
|
|
210
217
|
def dataset_id_check(ctx, param, value):
|
|
@@ -254,6 +261,8 @@ def create_dataset(config: Config, repository_id: str, dataset_id: str):
|
|
|
254
261
|
@pass_cfg
|
|
255
262
|
def download(config: Config, dataset):
|
|
256
263
|
"""Download a dataset"""
|
|
264
|
+
from datamaestro.definitions import AbstractDataset
|
|
265
|
+
|
|
257
266
|
dataset = AbstractDataset.find(dataset)
|
|
258
267
|
success = dataset.download()
|
|
259
268
|
if not success:
|
|
@@ -314,5 +323,17 @@ def search(config: Config, searchterms):
|
|
|
314
323
|
|
|
315
324
|
logging.debug("Search: %s", condition)
|
|
316
325
|
for dataset in config.context.datasets():
|
|
317
|
-
|
|
318
|
-
|
|
326
|
+
try:
|
|
327
|
+
if condition.match(dataset):
|
|
328
|
+
cfg = dataset.configtype
|
|
329
|
+
print(
|
|
330
|
+
"[%s] %s (%s)"
|
|
331
|
+
% (
|
|
332
|
+
dataset.repository.id,
|
|
333
|
+
dataset.id,
|
|
334
|
+
cfg.__name__ if cfg is not None else "?",
|
|
335
|
+
)
|
|
336
|
+
)
|
|
337
|
+
except Exception:
|
|
338
|
+
logging.error("Error while matching with dataset %s", dataset)
|
|
339
|
+
raise
|
|
@@ -1 +1 @@
|
|
|
1
|
-
"""Generic annotations for datasets"""
|
|
1
|
+
"""Generic annotations for datasets"""
|
|
@@ -1,9 +1,15 @@
|
|
|
1
|
-
import
|
|
2
|
-
from datamaestro.definitions import
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from datamaestro.definitions import AbstractDataset, hook
|
|
3
3
|
|
|
4
4
|
|
|
5
5
|
@hook("pre-use")
|
|
6
|
-
def useragreement(definition: AbstractDataset, message, id=None):
|
|
6
|
+
def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
|
|
7
|
+
"""Asks for a user-agreement
|
|
8
|
+
|
|
9
|
+
:param definition: The dataset for which the agreement is asked
|
|
10
|
+
:param message: The agreement text
|
|
11
|
+
:param id: The ID of the agreement (default to the dataset ID)
|
|
12
|
+
"""
|
|
7
13
|
# Skip agreement when testing
|
|
8
14
|
if definition.context.running_test:
|
|
9
15
|
return
|
datamaestro/commands/site.py
CHANGED
|
@@ -18,6 +18,7 @@ from mkdocs.structure.pages import Page as MkdocPage
|
|
|
18
18
|
from docstring_parser import parse as docstring_parse
|
|
19
19
|
|
|
20
20
|
import experimaestro
|
|
21
|
+
import experimaestro.mkdocs.base
|
|
21
22
|
from experimaestro.core.types import ObjectType
|
|
22
23
|
|
|
23
24
|
from ..context import Context, Repository, Datasets
|
|
@@ -97,7 +98,7 @@ def document_data(datatype: ObjectType):
|
|
|
97
98
|
if doc.long_description:
|
|
98
99
|
s += doc.long_description + "\n"
|
|
99
100
|
s += method_documentation(doc, method.__annotations__)
|
|
100
|
-
except Exception
|
|
101
|
+
except Exception:
|
|
101
102
|
logging.error(
|
|
102
103
|
"Error while parsing documetnation of %s (%s)",
|
|
103
104
|
method,
|
|
@@ -108,8 +109,6 @@ def document_data(datatype: ObjectType):
|
|
|
108
109
|
|
|
109
110
|
|
|
110
111
|
def document_object(object):
|
|
111
|
-
from datamaestro.data import Base
|
|
112
|
-
|
|
113
112
|
try:
|
|
114
113
|
name = object.__name__
|
|
115
114
|
# Get the documentation
|
|
@@ -141,7 +140,7 @@ def document_object(object):
|
|
|
141
140
|
|
|
142
141
|
return s
|
|
143
142
|
|
|
144
|
-
except Exception
|
|
143
|
+
except Exception:
|
|
145
144
|
logging.exception(
|
|
146
145
|
"Exception while generating the documentation for %s" % object.__name__
|
|
147
146
|
)
|
|
@@ -159,8 +158,11 @@ def document(match):
|
|
|
159
158
|
module = importlib.import_module(modulename)
|
|
160
159
|
try:
|
|
161
160
|
object = getattr(module, name)
|
|
162
|
-
except:
|
|
163
|
-
return "<div class='error'>Cannot find %s in %s</div>" % (
|
|
161
|
+
except Exception:
|
|
162
|
+
return "<div class='error'>Cannot find %s in %s</div>" % (
|
|
163
|
+
name,
|
|
164
|
+
modulename,
|
|
165
|
+
)
|
|
164
166
|
|
|
165
167
|
if ismodule(object):
|
|
166
168
|
return "\n\n".join(
|
|
@@ -182,7 +184,7 @@ class Classification:
|
|
|
182
184
|
|
|
183
185
|
def add(self, name, value):
|
|
184
186
|
key = name.lower()
|
|
185
|
-
if not
|
|
187
|
+
if key not in self.map:
|
|
186
188
|
self.map[key] = ClassificationItem(name)
|
|
187
189
|
self.map[key].values.append(value)
|
|
188
190
|
|
|
@@ -201,7 +203,6 @@ class Classification:
|
|
|
201
203
|
)
|
|
202
204
|
|
|
203
205
|
def match(self, path):
|
|
204
|
-
|
|
205
206
|
if path == "datamaestro/%s.md" % self.id:
|
|
206
207
|
r = io.StringIO()
|
|
207
208
|
r.write("# List of %s\n\n" % self.name)
|
|
@@ -222,7 +223,12 @@ class Classification:
|
|
|
222
223
|
module = Datasets(importlib.import_module(meta.t.__module__))
|
|
223
224
|
r.write(
|
|
224
225
|
"- [%s](../df/%s/%s.html#%s)\n"
|
|
225
|
-
% (
|
|
226
|
+
% (
|
|
227
|
+
meta.name or meta.id,
|
|
228
|
+
meta.repository.id,
|
|
229
|
+
module.id,
|
|
230
|
+
meta.id,
|
|
231
|
+
)
|
|
226
232
|
)
|
|
227
233
|
|
|
228
234
|
return r.getvalue()
|
|
@@ -275,7 +281,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
|
|
|
275
281
|
def parse_nav(self, nav):
|
|
276
282
|
for entry in nav:
|
|
277
283
|
assert len(entry) == 1
|
|
278
|
-
|
|
284
|
+
_, value = *entry.keys(), *entry.values()
|
|
279
285
|
if isinstance(value, list):
|
|
280
286
|
for value in self.parse_nav(value):
|
|
281
287
|
yield value
|
|
@@ -328,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
|
|
|
328
334
|
import shutil
|
|
329
335
|
|
|
330
336
|
path = Path(config["site_dir"]) / "mainstyle.css"
|
|
331
|
-
with
|
|
332
|
-
|
|
333
|
-
|
|
337
|
+
with (
|
|
338
|
+
importlib.resources.open_binary(
|
|
339
|
+
"datamaestro.commands", "mainstyle.css"
|
|
340
|
+
) as source,
|
|
341
|
+
path.open("wb") as dest,
|
|
342
|
+
):
|
|
334
343
|
shutil.copyfileobj(source, dest)
|
|
335
344
|
|
|
336
345
|
def on_files(self, files, config):
|
|
@@ -382,7 +391,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
|
|
|
382
391
|
builder()
|
|
383
392
|
|
|
384
393
|
logging.info("Watching %s", path)
|
|
385
|
-
server.watch(path, rebuild)
|
|
394
|
+
# server.watch(path, rebuild)
|
|
386
395
|
|
|
387
396
|
def on_page_markdown(self, markdown, page, config, **kwargs):
|
|
388
397
|
if page.url.startswith("api/"):
|
|
@@ -420,7 +429,10 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
|
|
|
420
429
|
r.write("## List of datasets\n\n")
|
|
421
430
|
for ds in df:
|
|
422
431
|
r.write(
|
|
423
|
-
|
|
432
|
+
(
|
|
433
|
+
"""<div class="dataset-entry"><div class='dataset-id'>"""
|
|
434
|
+
"""%s<a name="%s"></a></div>\n\n"""
|
|
435
|
+
)
|
|
424
436
|
% (ds.id, ds.id)
|
|
425
437
|
)
|
|
426
438
|
if ds.name:
|
datamaestro/context.py
CHANGED
|
@@ -1,21 +1,37 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
-
from
|
|
2
|
+
from typing import Iterable, Iterator, Dict, Optional, Union
|
|
3
3
|
import importlib
|
|
4
4
|
import os
|
|
5
5
|
import hashlib
|
|
6
6
|
import logging
|
|
7
7
|
import inspect
|
|
8
8
|
import json
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from experimaestro import Config
|
|
11
|
+
from functools import cached_property
|
|
9
12
|
from experimaestro.mkdocs.metaloader import Module
|
|
10
|
-
import pkg_resources
|
|
11
|
-
from typing import Iterable, Iterator, List, Dict
|
|
12
13
|
from .utils import CachedFile, downloadURL
|
|
13
14
|
from .settings import UserSettings, Settings
|
|
14
|
-
|
|
15
15
|
from typing import TYPE_CHECKING
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
18
|
-
from datamaestro.definitions import AbstractDataset
|
|
18
|
+
from datamaestro.definitions import AbstractDataset, DatasetWrapper
|
|
19
|
+
|
|
20
|
+
from importlib.metadata import (
|
|
21
|
+
entry_points as _entry_points,
|
|
22
|
+
version as _version,
|
|
23
|
+
PackageNotFoundError as _PackageNotFoundError,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def iter_entry_points(group, name=None):
|
|
28
|
+
"""Yield entry points for a given group (and optional name) using importlib.metadata."""
|
|
29
|
+
eps = _entry_points()
|
|
30
|
+
selected = eps.select(group=group)
|
|
31
|
+
if name:
|
|
32
|
+
selected = [ep for ep in selected if ep.name == name]
|
|
33
|
+
for ep in selected:
|
|
34
|
+
yield ep
|
|
19
35
|
|
|
20
36
|
|
|
21
37
|
class Compression:
|
|
@@ -98,31 +114,31 @@ class Context:
|
|
|
98
114
|
@cached_property
|
|
99
115
|
def repositorymap(self) -> Dict[str, "Repository"]:
|
|
100
116
|
return {
|
|
101
|
-
repository.basemodule(): repository
|
|
117
|
+
repository.basemodule(): repository
|
|
118
|
+
for repository in self.repositories()
|
|
119
|
+
if repository.basemodule() is not None
|
|
102
120
|
}
|
|
103
121
|
|
|
104
122
|
def repositories(self) -> Iterable["Repository"]:
|
|
105
123
|
"""Returns an iterator over repositories"""
|
|
106
|
-
for entry_point in
|
|
124
|
+
for entry_point in iter_entry_points("datamaestro.repositories"):
|
|
107
125
|
yield entry_point.load().instance()
|
|
108
126
|
|
|
109
127
|
def repository(self, repositoryid):
|
|
110
128
|
if repositoryid is None:
|
|
111
129
|
return None
|
|
112
130
|
|
|
113
|
-
|
|
114
|
-
x
|
|
115
|
-
for x in pkg_resources.iter_entry_points(
|
|
116
|
-
"datamaestro.repositories", repositoryid
|
|
117
|
-
)
|
|
131
|
+
entry_points = [
|
|
132
|
+
x for x in iter_entry_points("datamaestro.repositories", repositoryid)
|
|
118
133
|
]
|
|
119
|
-
if not
|
|
134
|
+
if not entry_points:
|
|
120
135
|
raise Exception("No datasets repository named %s", repositoryid)
|
|
121
|
-
if len(
|
|
136
|
+
if len(entry_points) > 1:
|
|
122
137
|
raise Exception(
|
|
123
|
-
"Too many datasets repository named %s (%d)"
|
|
138
|
+
"Too many datasets repository named %s (%d)"
|
|
139
|
+
% (repositoryid, len(entry_points))
|
|
124
140
|
)
|
|
125
|
-
return
|
|
141
|
+
return entry_points[0].load()(self)
|
|
126
142
|
|
|
127
143
|
@property
|
|
128
144
|
def running_test(self):
|
|
@@ -175,7 +191,6 @@ class Context:
|
|
|
175
191
|
if dlpath.is_file():
|
|
176
192
|
logging.debug("Using cached file %s for %s", dlpath, url)
|
|
177
193
|
else:
|
|
178
|
-
|
|
179
194
|
logging.info("Downloading %s", url)
|
|
180
195
|
tmppath = dlpath.with_suffix(".tmp")
|
|
181
196
|
|
|
@@ -188,7 +203,7 @@ class Context:
|
|
|
188
203
|
|
|
189
204
|
def ask(self, question: str, options: Dict[str, str]):
|
|
190
205
|
"""Ask a question to the user"""
|
|
191
|
-
print(question)
|
|
206
|
+
print(question) # noqa: T201
|
|
192
207
|
answer = None
|
|
193
208
|
while answer not in options:
|
|
194
209
|
answer = input().strip().lower()
|
|
@@ -228,17 +243,47 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
228
243
|
def __init__(self, module: Module):
|
|
229
244
|
"""Initialize with a module"""
|
|
230
245
|
self.module = module
|
|
246
|
+
self._title = None
|
|
247
|
+
self._description = None
|
|
231
248
|
|
|
232
249
|
@property
|
|
233
250
|
def id(self):
|
|
234
251
|
return ".".join(self.module.__name__.split(".", 2)[2:])
|
|
235
252
|
|
|
253
|
+
@property
|
|
254
|
+
def title(self):
|
|
255
|
+
self._getdoc()
|
|
256
|
+
return self._title
|
|
257
|
+
|
|
236
258
|
@property
|
|
237
259
|
def description(self):
|
|
238
|
-
|
|
260
|
+
self._getdoc()
|
|
261
|
+
return self._description
|
|
262
|
+
|
|
263
|
+
def _getdoc(self):
|
|
264
|
+
if self._title is not None:
|
|
265
|
+
return
|
|
266
|
+
|
|
267
|
+
if not self.module.__doc__:
|
|
268
|
+
self._title = ""
|
|
269
|
+
self._description = ""
|
|
270
|
+
return
|
|
271
|
+
|
|
272
|
+
intitle = True
|
|
273
|
+
title = []
|
|
274
|
+
description = []
|
|
275
|
+
for line in self.module.__doc__.split("\n"):
|
|
276
|
+
if line.strip() == "" and intitle:
|
|
277
|
+
intitle = False
|
|
278
|
+
else:
|
|
279
|
+
(title if intitle else description).append(line)
|
|
280
|
+
|
|
281
|
+
self._title = " ".join(title)
|
|
282
|
+
self._description = "\n".join(description)
|
|
239
283
|
|
|
240
284
|
def __iter__(self) -> Iterable["AbstractDataset"]:
|
|
241
285
|
from .definitions import DatasetWrapper
|
|
286
|
+
from datamaestro.data import Base
|
|
242
287
|
|
|
243
288
|
# Iterates over defined symbols
|
|
244
289
|
for key, value in self.module.__dict__.items():
|
|
@@ -247,10 +292,60 @@ class Datasets(Iterable["AbstractDataset"]):
|
|
|
247
292
|
# Ensure it comes from the module
|
|
248
293
|
if self.module.__name__ == value.t.__module__:
|
|
249
294
|
yield value
|
|
295
|
+
elif (
|
|
296
|
+
inspect.isclass(value)
|
|
297
|
+
and issubclass(value, Base)
|
|
298
|
+
and hasattr(value, "__dataset__")
|
|
299
|
+
):
|
|
300
|
+
if self.module.__name__ == value.__module__:
|
|
301
|
+
yield value.__dataset__
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
class BaseRepository(ABC):
|
|
305
|
+
"""A repository groups a set of datasets and their corresponding specific
|
|
306
|
+
handlers (downloading, filtering, etc.)"""
|
|
307
|
+
|
|
308
|
+
def __init__(self, context: Context):
|
|
309
|
+
self.context = context
|
|
310
|
+
p = inspect.getabsfile(self.__class__)
|
|
311
|
+
self.basedir = Path(p).parent
|
|
312
|
+
|
|
313
|
+
@abstractmethod
|
|
314
|
+
def __iter__(self) -> Iterator["AbstractDataset"]: ...
|
|
315
|
+
|
|
316
|
+
def search(self, name: str):
|
|
317
|
+
"""Search for a dataset in the definitions"""
|
|
318
|
+
for dataset in self:
|
|
319
|
+
if name in dataset.aliases:
|
|
320
|
+
return dataset
|
|
321
|
+
|
|
322
|
+
@classmethod
|
|
323
|
+
def instance(cls, context=None):
|
|
324
|
+
try:
|
|
325
|
+
return cls.__getattribute__(cls, "INSTANCE")
|
|
326
|
+
except AttributeError:
|
|
327
|
+
return cls(context if context else Context.instance())
|
|
328
|
+
|
|
329
|
+
@classmethod
|
|
330
|
+
def basemodule(cls):
|
|
331
|
+
return cls.__module__
|
|
332
|
+
|
|
333
|
+
@property
|
|
334
|
+
def generatedpath(self):
|
|
335
|
+
return self.basedir / "generated"
|
|
336
|
+
|
|
337
|
+
@property
|
|
338
|
+
def datapath(self):
|
|
339
|
+
return self.context.datapath.joinpath(self.id)
|
|
340
|
+
|
|
341
|
+
@property
|
|
342
|
+
def extrapath(self):
|
|
343
|
+
"""Path to the directory containing extra configuration files"""
|
|
344
|
+
return self.basedir / "data"
|
|
250
345
|
|
|
251
346
|
|
|
252
|
-
class Repository:
|
|
253
|
-
"""
|
|
347
|
+
class Repository(BaseRepository):
|
|
348
|
+
"""(deprecated) Repository where datasets are located in __module__.config"""
|
|
254
349
|
|
|
255
350
|
def __init__(self, context: Context):
|
|
256
351
|
"""Initialize a new repository
|
|
@@ -259,34 +354,20 @@ class Repository:
|
|
|
259
354
|
:param basedir: The base directory of the repository
|
|
260
355
|
(by default, the same as the repository class)
|
|
261
356
|
"""
|
|
357
|
+
super().__init__(context)
|
|
262
358
|
self.context = context
|
|
263
|
-
p = inspect.getabsfile(self.__class__)
|
|
264
|
-
self.basedir = Path(p).parent
|
|
265
359
|
self.configdir = self.basedir.joinpath("config")
|
|
266
360
|
self.id = self.__class__.NAMESPACE
|
|
267
361
|
self.name = self.id
|
|
268
362
|
self.module = self.__class__.__module__
|
|
269
363
|
self.__class__.INSTANCE = self
|
|
270
364
|
|
|
271
|
-
@classmethod
|
|
272
|
-
def basemodule(cls):
|
|
273
|
-
return cls.__module__
|
|
274
|
-
|
|
275
|
-
@classmethod
|
|
276
|
-
def instance(cls, context=None):
|
|
277
|
-
try:
|
|
278
|
-
return cls.__getattribute__(cls, "INSTANCE")
|
|
279
|
-
except AttributeError:
|
|
280
|
-
return cls(context if context else Context.instance())
|
|
281
|
-
|
|
282
365
|
@classmethod
|
|
283
366
|
def version(cls):
|
|
284
|
-
from pkg_resources import get_distribution, DistributionNotFound
|
|
285
|
-
|
|
286
367
|
try:
|
|
287
|
-
return
|
|
288
|
-
except
|
|
289
|
-
|
|
368
|
+
return _version(cls.__module__)
|
|
369
|
+
except _PackageNotFoundError:
|
|
370
|
+
return None
|
|
290
371
|
|
|
291
372
|
def __repr__(self):
|
|
292
373
|
return "Repository(%s)" % self.basedir
|
|
@@ -298,40 +379,15 @@ class Repository:
|
|
|
298
379
|
assert isinstance(other, Repository)
|
|
299
380
|
return self.basedir == other.basedir
|
|
300
381
|
|
|
301
|
-
def
|
|
302
|
-
"""
|
|
303
|
-
|
|
304
|
-
logging.debug("Searching for %s in %s", name, self.configdir)
|
|
305
|
-
|
|
306
|
-
candidates: List[str] = []
|
|
307
|
-
components = name.split(".")
|
|
308
|
-
N = len(components)
|
|
309
|
-
sub = None
|
|
310
|
-
prefix = None
|
|
311
|
-
path = self.configdir
|
|
312
|
-
for i, c in enumerate(components):
|
|
313
|
-
path = path / c
|
|
314
|
-
|
|
315
|
-
if (path / "__init__.py").is_file():
|
|
316
|
-
candidates.append(".".join(components[: i + 1]))
|
|
317
|
-
|
|
318
|
-
if path.with_suffix(".py").is_file():
|
|
319
|
-
candidates.append(".".join(components[: i + 1]))
|
|
320
|
-
|
|
321
|
-
if not path.is_dir():
|
|
322
|
-
break
|
|
323
|
-
|
|
324
|
-
# Get the dataset
|
|
325
|
-
for candidate in candidates[::-1]:
|
|
326
|
-
logging.debug("Searching in module %s.config.%s", self.module, candidate)
|
|
382
|
+
def datasets(self, candidate: str):
|
|
383
|
+
"""Returns the dataset candidates from a module"""
|
|
384
|
+
try:
|
|
327
385
|
module = importlib.import_module("%s.config.%s" % (self.module, candidate))
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
return None
|
|
386
|
+
except ModuleNotFoundError:
|
|
387
|
+
return None
|
|
388
|
+
return Datasets(module)
|
|
333
389
|
|
|
334
|
-
def modules(self) -> "Module":
|
|
390
|
+
def modules(self) -> Iterator["Module"]:
|
|
335
391
|
"""Iterates over all modules in this repository"""
|
|
336
392
|
for _, fid, package in self._modules():
|
|
337
393
|
try:
|
|
@@ -368,19 +424,6 @@ class Repository:
|
|
|
368
424
|
for dataset in datasets:
|
|
369
425
|
yield dataset
|
|
370
426
|
|
|
371
|
-
@property
|
|
372
|
-
def generatedpath(self):
|
|
373
|
-
return self.basedir.joinpath("generated")
|
|
374
|
-
|
|
375
|
-
@property
|
|
376
|
-
def datapath(self):
|
|
377
|
-
return self.context.datapath.joinpath(self.id)
|
|
378
|
-
|
|
379
|
-
@property
|
|
380
|
-
def extrapath(self):
|
|
381
|
-
"""Path to the directory containing extra configuration files"""
|
|
382
|
-
return self.basedir.joinpath("data")
|
|
383
|
-
|
|
384
427
|
|
|
385
428
|
def find_dataset(dataset_id: str):
|
|
386
429
|
"""Find a dataset given its id"""
|
|
@@ -389,11 +432,24 @@ def find_dataset(dataset_id: str):
|
|
|
389
432
|
return AbstractDataset.find(dataset_id)
|
|
390
433
|
|
|
391
434
|
|
|
392
|
-
def prepare_dataset(
|
|
435
|
+
def prepare_dataset(
|
|
436
|
+
dataset_id: Union[str, "DatasetWrapper", Config],
|
|
437
|
+
context: Optional[Union[Context, Path]] = None,
|
|
438
|
+
):
|
|
393
439
|
"""Find a dataset given its id and download the resources"""
|
|
394
|
-
from .definitions import AbstractDataset
|
|
440
|
+
from .definitions import AbstractDataset, DatasetWrapper
|
|
441
|
+
|
|
442
|
+
match context:
|
|
443
|
+
case Path() | str():
|
|
444
|
+
context = Context(Path(context))
|
|
445
|
+
|
|
446
|
+
if isinstance(dataset_id, DatasetWrapper):
|
|
447
|
+
ds = dataset_id
|
|
448
|
+
elif isinstance(dataset_id, Config):
|
|
449
|
+
ds = dataset_id.__datamaestro_dataset__
|
|
450
|
+
else:
|
|
451
|
+
ds = AbstractDataset.find(dataset_id, context=context)
|
|
395
452
|
|
|
396
|
-
ds = AbstractDataset.find(dataset_id)
|
|
397
453
|
return ds.prepare(download=True)
|
|
398
454
|
|
|
399
455
|
|
datamaestro/data/__init__.py
CHANGED
|
@@ -1,25 +1,35 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from pathlib import Path
|
|
3
|
-
from
|
|
4
|
-
from experimaestro import Config
|
|
5
|
-
from
|
|
3
|
+
from typing import Any, Dict
|
|
4
|
+
from experimaestro import Config, Param, Meta
|
|
5
|
+
from datamaestro.definitions import AbstractDataset
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Base(Config):
|
|
9
|
-
"""Base object for all data types
|
|
9
|
+
"""Base object for all data types"""
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
id: Param[str]
|
|
12
|
+
"""The unique (sub-)dataset ID"""
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
"""
|
|
14
|
+
__datamaestro_dataset__: "AbstractDataset"
|
|
15
15
|
|
|
16
|
-
|
|
17
|
-
|
|
16
|
+
def dataset_information(self) -> Dict[str, Any]:
|
|
17
|
+
"""Returns document meta-informations"""
|
|
18
|
+
return {
|
|
19
|
+
"id": self.id,
|
|
20
|
+
"name": self.__datamaestro_dataset__.name,
|
|
21
|
+
"description": self.__datamaestro_dataset__.description,
|
|
22
|
+
}
|
|
18
23
|
|
|
19
24
|
def download(self):
|
|
20
25
|
"""Download the dataset"""
|
|
21
26
|
self.__datamaestro_dataset__.download()
|
|
22
27
|
|
|
28
|
+
def prepare(self, *args, **kwargs):
|
|
29
|
+
"""Prepare the dataset"""
|
|
30
|
+
self.__datamaestro_dataset__.prepare()
|
|
31
|
+
return self
|
|
32
|
+
|
|
23
33
|
|
|
24
34
|
class Generic(Base):
|
|
25
35
|
"""Generic dataset
|
|
@@ -38,15 +48,17 @@ class Generic(Base):
|
|
|
38
48
|
class File(Base):
|
|
39
49
|
"""A data file"""
|
|
40
50
|
|
|
41
|
-
path:
|
|
51
|
+
path: Meta[Path]
|
|
52
|
+
"""The path of the file"""
|
|
42
53
|
|
|
43
54
|
def open(self, mode):
|
|
44
55
|
return self.path.open(mode)
|
|
45
56
|
|
|
46
57
|
|
|
47
|
-
@argument("path", type=Path)
|
|
48
58
|
class Folder(Base):
|
|
49
59
|
"""A data folder"""
|
|
50
60
|
|
|
61
|
+
path: Meta[Path]
|
|
62
|
+
|
|
51
63
|
def open(self, mode):
|
|
52
64
|
return self.path.open(mode)
|