datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/__init__.py CHANGED
@@ -1,8 +1,12 @@
1
- from .context import Context, Repository, get_dataset, prepare_dataset
1
+ # flake8: noqa: F401 (re-exports)
2
+ from .context import (
3
+ Context,
4
+ Repository,
5
+ BaseRepository,
6
+ get_dataset,
7
+ prepare_dataset,
8
+ )
2
9
 
3
- from pkg_resources import get_distribution, DistributionNotFound
4
-
5
- try:
6
- __version__ = get_distribution(__name__).version
7
- except DistributionNotFound:
8
- __version__ = None
10
+ from .definitions import dataset, metadata
11
+ from .data import Base
12
+ from .version import __version__
datamaestro/__main__.py CHANGED
@@ -1,19 +1,22 @@
1
1
  #!/usr/bin/env python3
2
+ # ruff: noqa: T201
2
3
 
4
+ from importlib.metadata import entry_points
3
5
  import sys
4
6
  import logging
5
7
  from functools import update_wrapper
6
8
  import traceback as tb
7
- import pkg_resources
8
9
  import re
9
10
  from pathlib import Path
10
11
  import shutil
11
- from .context import Context
12
12
  from typing import Set
13
- import datamaestro
13
+ from urllib.parse import urlparse
14
14
 
15
15
  import click
16
16
 
17
+ import datamaestro
18
+ from .context import Context
19
+
17
20
  logging.basicConfig(level=logging.INFO)
18
21
 
19
22
 
@@ -37,7 +40,7 @@ def pass_cfg(f):
37
40
  # Get all the available repositories
38
41
 
39
42
  REPOSITORIES = {}
40
- for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
43
+ for entry_point in entry_points(group="datamaestro.repositories"):
41
44
  REPOSITORIES[entry_point.name] = entry_point
42
45
 
43
46
 
@@ -59,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
59
62
  "--traceback", is_flag=True, help="Display traceback if an exception occurs"
60
63
  )
61
64
  @click.option(
62
- "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
65
+ "--data",
66
+ type=Path,
67
+ help="Directory containing datasets",
68
+ default=Context.MAINDIR,
63
69
  )
64
70
  @click.pass_context
65
71
  def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -90,6 +96,8 @@ def main():
90
96
  @click.argument("dataset", type=str)
91
97
  @pass_cfg
92
98
  def info(config: Config, dataset):
99
+ from datamaestro.definitions import AbstractDataset
100
+
93
101
  dataset = AbstractDataset.find(dataset)
94
102
  print(dataset.name)
95
103
  if dataset.url:
@@ -204,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
204
212
  # --- Create a dataset
205
213
 
206
214
  DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
207
- from urllib.parse import urlparse
208
215
 
209
216
 
210
217
  def dataset_id_check(ctx, param, value):
@@ -254,6 +261,8 @@ def create_dataset(config: Config, repository_id: str, dataset_id: str):
254
261
  @pass_cfg
255
262
  def download(config: Config, dataset):
256
263
  """Download a dataset"""
264
+ from datamaestro.definitions import AbstractDataset
265
+
257
266
  dataset = AbstractDataset.find(dataset)
258
267
  success = dataset.download()
259
268
  if not success:
@@ -314,5 +323,17 @@ def search(config: Config, searchterms):
314
323
 
315
324
  logging.debug("Search: %s", condition)
316
325
  for dataset in config.context.datasets():
317
- if condition.match(dataset):
318
- print("[%s] %s" % (dataset.repository.id, dataset.id))
326
+ try:
327
+ if condition.match(dataset):
328
+ cfg = dataset.configtype
329
+ print(
330
+ "[%s] %s (%s)"
331
+ % (
332
+ dataset.repository.id,
333
+ dataset.id,
334
+ cfg.__name__ if cfg is not None else "?",
335
+ )
336
+ )
337
+ except Exception:
338
+ logging.error("Error while matching with dataset %s", dataset)
339
+ raise
@@ -1 +1 @@
1
- """Generic annotations for datasets"""
1
+ """Generic annotations for datasets"""
@@ -1,9 +1,15 @@
1
- import logging
2
- from datamaestro.definitions import DatasetAnnotation, AbstractDataset, hook
1
+ from typing import Optional
2
+ from datamaestro.definitions import AbstractDataset, hook
3
3
 
4
4
 
5
5
  @hook("pre-use")
6
- def useragreement(definition: AbstractDataset, message, id=None):
6
+ def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
7
+ """Asks for a user-agreement
8
+
9
+ :param definition: The dataset for which the agreement is asked
10
+ :param message: The agreement text
11
+ :param id: The ID of the agreement (default to the dataset ID)
12
+ """
7
13
  # Skip agreement when testing
8
14
  if definition.context.running_test:
9
15
  return
@@ -18,6 +18,7 @@ from mkdocs.structure.pages import Page as MkdocPage
18
18
  from docstring_parser import parse as docstring_parse
19
19
 
20
20
  import experimaestro
21
+ import experimaestro.mkdocs.base
21
22
  from experimaestro.core.types import ObjectType
22
23
 
23
24
  from ..context import Context, Repository, Datasets
@@ -97,7 +98,7 @@ def document_data(datatype: ObjectType):
97
98
  if doc.long_description:
98
99
  s += doc.long_description + "\n"
99
100
  s += method_documentation(doc, method.__annotations__)
100
- except Exception as e:
101
+ except Exception:
101
102
  logging.error(
102
103
  "Error while parsing documetnation of %s (%s)",
103
104
  method,
@@ -108,8 +109,6 @@ def document_data(datatype: ObjectType):
108
109
 
109
110
 
110
111
  def document_object(object):
111
- from datamaestro.data import Base
112
-
113
112
  try:
114
113
  name = object.__name__
115
114
  # Get the documentation
@@ -141,7 +140,7 @@ def document_object(object):
141
140
 
142
141
  return s
143
142
 
144
- except Exception as e:
143
+ except Exception:
145
144
  logging.exception(
146
145
  "Exception while generating the documentation for %s" % object.__name__
147
146
  )
@@ -159,8 +158,11 @@ def document(match):
159
158
  module = importlib.import_module(modulename)
160
159
  try:
161
160
  object = getattr(module, name)
162
- except:
163
- return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
161
+ except Exception:
162
+ return "<div class='error'>Cannot find %s in %s</div>" % (
163
+ name,
164
+ modulename,
165
+ )
164
166
 
165
167
  if ismodule(object):
166
168
  return "\n\n".join(
@@ -182,7 +184,7 @@ class Classification:
182
184
 
183
185
  def add(self, name, value):
184
186
  key = name.lower()
185
- if not key in self.map:
187
+ if key not in self.map:
186
188
  self.map[key] = ClassificationItem(name)
187
189
  self.map[key].values.append(value)
188
190
 
@@ -201,7 +203,6 @@ class Classification:
201
203
  )
202
204
 
203
205
  def match(self, path):
204
-
205
206
  if path == "datamaestro/%s.md" % self.id:
206
207
  r = io.StringIO()
207
208
  r.write("# List of %s\n\n" % self.name)
@@ -222,7 +223,12 @@ class Classification:
222
223
  module = Datasets(importlib.import_module(meta.t.__module__))
223
224
  r.write(
224
225
  "- [%s](../df/%s/%s.html#%s)\n"
225
- % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
226
+ % (
227
+ meta.name or meta.id,
228
+ meta.repository.id,
229
+ module.id,
230
+ meta.id,
231
+ )
226
232
  )
227
233
 
228
234
  return r.getvalue()
@@ -275,7 +281,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
275
281
  def parse_nav(self, nav):
276
282
  for entry in nav:
277
283
  assert len(entry) == 1
278
- key, value = *entry.keys(), *entry.values()
284
+ _, value = *entry.keys(), *entry.values()
279
285
  if isinstance(value, list):
280
286
  for value in self.parse_nav(value):
281
287
  yield value
@@ -328,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
328
334
  import shutil
329
335
 
330
336
  path = Path(config["site_dir"]) / "mainstyle.css"
331
- with importlib.resources.open_binary(
332
- "datamaestro.commands", "mainstyle.css"
333
- ) as source, path.open("wb") as dest:
337
+ with (
338
+ importlib.resources.open_binary(
339
+ "datamaestro.commands", "mainstyle.css"
340
+ ) as source,
341
+ path.open("wb") as dest,
342
+ ):
334
343
  shutil.copyfileobj(source, dest)
335
344
 
336
345
  def on_files(self, files, config):
@@ -382,7 +391,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
382
391
  builder()
383
392
 
384
393
  logging.info("Watching %s", path)
385
- server.watch(path, rebuild)
394
+ # server.watch(path, rebuild)
386
395
 
387
396
  def on_page_markdown(self, markdown, page, config, **kwargs):
388
397
  if page.url.startswith("api/"):
@@ -420,7 +429,10 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
420
429
  r.write("## List of datasets\n\n")
421
430
  for ds in df:
422
431
  r.write(
423
- """<div class="dataset-entry"><div class='dataset-id'>%s<a name="%s"></a></div>\n\n"""
432
+ (
433
+ """<div class="dataset-entry"><div class='dataset-id'>"""
434
+ """%s<a name="%s"></a></div>\n\n"""
435
+ )
424
436
  % (ds.id, ds.id)
425
437
  )
426
438
  if ds.name:
datamaestro/context.py CHANGED
@@ -1,21 +1,37 @@
1
1
  from pathlib import Path
2
- from cached_property import cached_property
2
+ from typing import Iterable, Iterator, Dict, Optional, Union
3
3
  import importlib
4
4
  import os
5
5
  import hashlib
6
6
  import logging
7
7
  import inspect
8
8
  import json
9
+ from abc import ABC, abstractmethod
10
+ from experimaestro import Config
11
+ from functools import cached_property
9
12
  from experimaestro.mkdocs.metaloader import Module
10
- import pkg_resources
11
- from typing import Iterable, Iterator, List, Dict
12
13
  from .utils import CachedFile, downloadURL
13
14
  from .settings import UserSettings, Settings
14
-
15
15
  from typing import TYPE_CHECKING
16
16
 
17
17
  if TYPE_CHECKING:
18
- from datamaestro.definitions import AbstractDataset
18
+ from datamaestro.definitions import AbstractDataset, DatasetWrapper
19
+
20
+ from importlib.metadata import (
21
+ entry_points as _entry_points,
22
+ version as _version,
23
+ PackageNotFoundError as _PackageNotFoundError,
24
+ )
25
+
26
+
27
+ def iter_entry_points(group, name=None):
28
+ """Yield entry points for a given group (and optional name) using importlib.metadata."""
29
+ eps = _entry_points()
30
+ selected = eps.select(group=group)
31
+ if name:
32
+ selected = [ep for ep in selected if ep.name == name]
33
+ for ep in selected:
34
+ yield ep
19
35
 
20
36
 
21
37
  class Compression:
@@ -98,31 +114,31 @@ class Context:
98
114
  @cached_property
99
115
  def repositorymap(self) -> Dict[str, "Repository"]:
100
116
  return {
101
- repository.basemodule(): repository for repository in self.repositories()
117
+ repository.basemodule(): repository
118
+ for repository in self.repositories()
119
+ if repository.basemodule() is not None
102
120
  }
103
121
 
104
122
  def repositories(self) -> Iterable["Repository"]:
105
123
  """Returns an iterator over repositories"""
106
- for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
124
+ for entry_point in iter_entry_points("datamaestro.repositories"):
107
125
  yield entry_point.load().instance()
108
126
 
109
127
  def repository(self, repositoryid):
110
128
  if repositoryid is None:
111
129
  return None
112
130
 
113
- l = [
114
- x
115
- for x in pkg_resources.iter_entry_points(
116
- "datamaestro.repositories", repositoryid
117
- )
131
+ entry_points = [
132
+ x for x in iter_entry_points("datamaestro.repositories", repositoryid)
118
133
  ]
119
- if not l:
134
+ if not entry_points:
120
135
  raise Exception("No datasets repository named %s", repositoryid)
121
- if len(l) > 1:
136
+ if len(entry_points) > 1:
122
137
  raise Exception(
123
- "Too many datasets repository named %s (%d)" % (repositoryid, len(l))
138
+ "Too many datasets repository named %s (%d)"
139
+ % (repositoryid, len(entry_points))
124
140
  )
125
- return l[0].load()(self)
141
+ return entry_points[0].load()(self)
126
142
 
127
143
  @property
128
144
  def running_test(self):
@@ -175,7 +191,6 @@ class Context:
175
191
  if dlpath.is_file():
176
192
  logging.debug("Using cached file %s for %s", dlpath, url)
177
193
  else:
178
-
179
194
  logging.info("Downloading %s", url)
180
195
  tmppath = dlpath.with_suffix(".tmp")
181
196
 
@@ -188,7 +203,7 @@ class Context:
188
203
 
189
204
  def ask(self, question: str, options: Dict[str, str]):
190
205
  """Ask a question to the user"""
191
- print(question)
206
+ print(question) # noqa: T201
192
207
  answer = None
193
208
  while answer not in options:
194
209
  answer = input().strip().lower()
@@ -228,17 +243,47 @@ class Datasets(Iterable["AbstractDataset"]):
228
243
  def __init__(self, module: Module):
229
244
  """Initialize with a module"""
230
245
  self.module = module
246
+ self._title = None
247
+ self._description = None
231
248
 
232
249
  @property
233
250
  def id(self):
234
251
  return ".".join(self.module.__name__.split(".", 2)[2:])
235
252
 
253
+ @property
254
+ def title(self):
255
+ self._getdoc()
256
+ return self._title
257
+
236
258
  @property
237
259
  def description(self):
238
- return self.module.__doc__ or ""
260
+ self._getdoc()
261
+ return self._description
262
+
263
+ def _getdoc(self):
264
+ if self._title is not None:
265
+ return
266
+
267
+ if not self.module.__doc__:
268
+ self._title = ""
269
+ self._description = ""
270
+ return
271
+
272
+ intitle = True
273
+ title = []
274
+ description = []
275
+ for line in self.module.__doc__.split("\n"):
276
+ if line.strip() == "" and intitle:
277
+ intitle = False
278
+ else:
279
+ (title if intitle else description).append(line)
280
+
281
+ self._title = " ".join(title)
282
+ self._description = "\n".join(description)
239
283
 
240
284
  def __iter__(self) -> Iterable["AbstractDataset"]:
241
285
  from .definitions import DatasetWrapper
286
+ from datamaestro.data import Base
242
287
 
243
288
  # Iterates over defined symbols
244
289
  for key, value in self.module.__dict__.items():
@@ -247,10 +292,60 @@ class Datasets(Iterable["AbstractDataset"]):
247
292
  # Ensure it comes from the module
248
293
  if self.module.__name__ == value.t.__module__:
249
294
  yield value
295
+ elif (
296
+ inspect.isclass(value)
297
+ and issubclass(value, Base)
298
+ and hasattr(value, "__dataset__")
299
+ ):
300
+ if self.module.__name__ == value.__module__:
301
+ yield value.__dataset__
302
+
303
+
304
+ class BaseRepository(ABC):
305
+ """A repository groups a set of datasets and their corresponding specific
306
+ handlers (downloading, filtering, etc.)"""
307
+
308
+ def __init__(self, context: Context):
309
+ self.context = context
310
+ p = inspect.getabsfile(self.__class__)
311
+ self.basedir = Path(p).parent
312
+
313
+ @abstractmethod
314
+ def __iter__(self) -> Iterator["AbstractDataset"]: ...
315
+
316
+ def search(self, name: str):
317
+ """Search for a dataset in the definitions"""
318
+ for dataset in self:
319
+ if name in dataset.aliases:
320
+ return dataset
321
+
322
+ @classmethod
323
+ def instance(cls, context=None):
324
+ try:
325
+ return cls.__getattribute__(cls, "INSTANCE")
326
+ except AttributeError:
327
+ return cls(context if context else Context.instance())
328
+
329
+ @classmethod
330
+ def basemodule(cls):
331
+ return cls.__module__
332
+
333
+ @property
334
+ def generatedpath(self):
335
+ return self.basedir / "generated"
336
+
337
+ @property
338
+ def datapath(self):
339
+ return self.context.datapath.joinpath(self.id)
340
+
341
+ @property
342
+ def extrapath(self):
343
+ """Path to the directory containing extra configuration files"""
344
+ return self.basedir / "data"
250
345
 
251
346
 
252
- class Repository:
253
- """A repository regroup a set of datasets and their corresponding specific handlers (downloading, filtering, etc.)"""
347
+ class Repository(BaseRepository):
348
+ """(deprecated) Repository where datasets are located in __module__.config"""
254
349
 
255
350
  def __init__(self, context: Context):
256
351
  """Initialize a new repository
@@ -259,34 +354,20 @@ class Repository:
259
354
  :param basedir: The base directory of the repository
260
355
  (by default, the same as the repository class)
261
356
  """
357
+ super().__init__(context)
262
358
  self.context = context
263
- p = inspect.getabsfile(self.__class__)
264
- self.basedir = Path(p).parent
265
359
  self.configdir = self.basedir.joinpath("config")
266
360
  self.id = self.__class__.NAMESPACE
267
361
  self.name = self.id
268
362
  self.module = self.__class__.__module__
269
363
  self.__class__.INSTANCE = self
270
364
 
271
- @classmethod
272
- def basemodule(cls):
273
- return cls.__module__
274
-
275
- @classmethod
276
- def instance(cls, context=None):
277
- try:
278
- return cls.__getattribute__(cls, "INSTANCE")
279
- except AttributeError:
280
- return cls(context if context else Context.instance())
281
-
282
365
  @classmethod
283
366
  def version(cls):
284
- from pkg_resources import get_distribution, DistributionNotFound
285
-
286
367
  try:
287
- return get_distribution(cls.__module__).version
288
- except DistributionNotFound:
289
- __version__ = None
368
+ return _version(cls.__module__)
369
+ except _PackageNotFoundError:
370
+ return None
290
371
 
291
372
  def __repr__(self):
292
373
  return "Repository(%s)" % self.basedir
@@ -298,40 +379,15 @@ class Repository:
298
379
  assert isinstance(other, Repository)
299
380
  return self.basedir == other.basedir
300
381
 
301
- def search(self, name: str):
302
- """Search for a dataset in the definitions
303
- """
304
- logging.debug("Searching for %s in %s", name, self.configdir)
305
-
306
- candidates: List[str] = []
307
- components = name.split(".")
308
- N = len(components)
309
- sub = None
310
- prefix = None
311
- path = self.configdir
312
- for i, c in enumerate(components):
313
- path = path / c
314
-
315
- if (path / "__init__.py").is_file():
316
- candidates.append(".".join(components[: i + 1]))
317
-
318
- if path.with_suffix(".py").is_file():
319
- candidates.append(".".join(components[: i + 1]))
320
-
321
- if not path.is_dir():
322
- break
323
-
324
- # Get the dataset
325
- for candidate in candidates[::-1]:
326
- logging.debug("Searching in module %s.config.%s", self.module, candidate)
382
+ def datasets(self, candidate: str):
383
+ """Returns the dataset candidates from a module"""
384
+ try:
327
385
  module = importlib.import_module("%s.config.%s" % (self.module, candidate))
328
- for value in Datasets(module):
329
- if name in value.aliases:
330
- return value
331
-
332
- return None
386
+ except ModuleNotFoundError:
387
+ return None
388
+ return Datasets(module)
333
389
 
334
- def modules(self) -> "Module":
390
+ def modules(self) -> Iterator["Module"]:
335
391
  """Iterates over all modules in this repository"""
336
392
  for _, fid, package in self._modules():
337
393
  try:
@@ -368,19 +424,6 @@ class Repository:
368
424
  for dataset in datasets:
369
425
  yield dataset
370
426
 
371
- @property
372
- def generatedpath(self):
373
- return self.basedir.joinpath("generated")
374
-
375
- @property
376
- def datapath(self):
377
- return self.context.datapath.joinpath(self.id)
378
-
379
- @property
380
- def extrapath(self):
381
- """Path to the directory containing extra configuration files"""
382
- return self.basedir.joinpath("data")
383
-
384
427
 
385
428
  def find_dataset(dataset_id: str):
386
429
  """Find a dataset given its id"""
@@ -389,11 +432,24 @@ def find_dataset(dataset_id: str):
389
432
  return AbstractDataset.find(dataset_id)
390
433
 
391
434
 
392
- def prepare_dataset(dataset_id: str):
435
+ def prepare_dataset(
436
+ dataset_id: Union[str, "DatasetWrapper", Config],
437
+ context: Optional[Union[Context, Path]] = None,
438
+ ):
393
439
  """Find a dataset given its id and download the resources"""
394
- from .definitions import AbstractDataset
440
+ from .definitions import AbstractDataset, DatasetWrapper
441
+
442
+ match context:
443
+ case Path() | str():
444
+ context = Context(Path(context))
445
+
446
+ if isinstance(dataset_id, DatasetWrapper):
447
+ ds = dataset_id
448
+ elif isinstance(dataset_id, Config):
449
+ ds = dataset_id.__datamaestro_dataset__
450
+ else:
451
+ ds = AbstractDataset.find(dataset_id, context=context)
395
452
 
396
- ds = AbstractDataset.find(dataset_id)
397
453
  return ds.prepare(download=True)
398
454
 
399
455
 
@@ -1,25 +1,35 @@
1
1
  import logging
2
2
  from pathlib import Path
3
- from datamaestro.definitions import AbstractDataset, argument, Param
4
- from experimaestro import Config
5
- from experimaestro import documentation # noqa: F401
3
+ from typing import Any, Dict
4
+ from experimaestro import Config, Param, Meta
5
+ from datamaestro.definitions import AbstractDataset
6
6
 
7
7
 
8
8
  class Base(Config):
9
- """Base object for all data types
9
+ """Base object for all data types"""
10
10
 
11
- attributes:
11
+ id: Param[str]
12
+ """The unique (sub-)dataset ID"""
12
13
 
13
- id: The unique dataset ID
14
- """
14
+ __datamaestro_dataset__: "AbstractDataset"
15
15
 
16
- id: Param[str]
17
- __datamaestro_dataset__: AbstractDataset
16
+ def dataset_information(self) -> Dict[str, Any]:
17
+ """Returns document meta-informations"""
18
+ return {
19
+ "id": self.id,
20
+ "name": self.__datamaestro_dataset__.name,
21
+ "description": self.__datamaestro_dataset__.description,
22
+ }
18
23
 
19
24
  def download(self):
20
25
  """Download the dataset"""
21
26
  self.__datamaestro_dataset__.download()
22
27
 
28
+ def prepare(self, *args, **kwargs):
29
+ """Prepare the dataset"""
30
+ self.__datamaestro_dataset__.prepare()
31
+ return self
32
+
23
33
 
24
34
  class Generic(Base):
25
35
  """Generic dataset
@@ -38,15 +48,17 @@ class Generic(Base):
38
48
  class File(Base):
39
49
  """A data file"""
40
50
 
41
- path: Param[Path]
51
+ path: Meta[Path]
52
+ """The path of the file"""
42
53
 
43
54
  def open(self, mode):
44
55
  return self.path.open(mode)
45
56
 
46
57
 
47
- @argument("path", type=Path)
48
58
  class Folder(Base):
49
59
  """A data folder"""
50
60
 
61
+ path: Meta[Path]
62
+
51
63
  def open(self, mode):
52
64
  return self.path.open(mode)