datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/definitions.py
@@ -2,13 +2,16 @@
 # Main datamaestro functions and data models
 #
 
+from __future__ import annotations
+
 import logging
 import inspect
+import shutil
 from pathlib import Path
 from itertools import chain
+from abc import ABC, abstractmethod
 import traceback
 from typing import (
-    Any,
     Dict,
     List,
     Optional,
@@ -18,20 +21,119 @@ from typing import (
     Callable,
     TYPE_CHECKING,
     Union,
+    _GenericAlias,
+)
+from experimaestro import (  # noqa: F401 (re-exports)
+    Param,
+    Option,
+    Config,
+    Meta,
 )
-from experimaestro import argument, constant, Param, Option, Config, Meta
-from typing import Type as TypingType
+from typing import Type as TypingType  # noqa: F401 (re-exports)
 from experimaestro.core.types import Type  # noqa: F401 (re-exports)
-from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
 
 if TYPE_CHECKING:
-    from datamaestro.download import Download
-    from .data import Base
+    from .data import Base, Dataset
+    from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
+    from datamaestro.download import Download, Resource
+
+# --- DAG utilities ---
+
+
+def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+    """Topological sort of resources by their dependencies.
+
+    Args:
+        resources: Dict mapping resource names to Resource instances.
+
+    Returns:
+        List of resources in dependency order (dependencies first).
+
+    Raises:
+        ValueError: If a cycle is detected in the dependency graph.
+    """
+    visited: set[str] = set()
+    visiting: set[str] = set()  # For cycle detection
+    result: list["Resource"] = []
+
+    def visit(resource: "Resource"):
+        if resource.name in visited:
+            return
+        if resource.name in visiting:
+            raise ValueError(
+                f"Cycle detected in resource dependencies involving {resource.name}"
+            )
+
+        visiting.add(resource.name)
+        for dep in resource.dependencies:
+            visit(dep)
+        visiting.discard(resource.name)
+        visited.add(resource.name)
+        result.append(resource)
+
+    for resource in resources.values():
+        visit(resource)
+
+    return result
+
+
+def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+    """Compute the dependents (inverse edges) for all resources."""
+    # Clear existing dependents
+    for resource in resources.values():
+        resource._dependents = []
+
+    # Build inverse edges
+    for resource in resources.values():
+        for dep in resource.dependencies:
+            if resource not in dep._dependents:
+                dep._dependents.append(resource)
+
+
+def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+    """Scan class attributes for Resource instances and bind them.
+
+    This is called when a class-based dataset is processed by the
+    @dataset decorator. It detects Resource instances defined as
+    class attributes and binds them to the dataset.
+
+    Args:
+        cls: The dataset class to scan.
+        dataset_wrapper: The AbstractDataset to bind resources to.
+    """
+    from datamaestro.download import Resource
+
+    for attr_name, attr_value in vars(cls).items():
+        if isinstance(attr_value, Resource):
+            attr_value.bind(attr_name, dataset_wrapper)
+
+    # Build the dependency DAG
+    _compute_dependents(dataset_wrapper.resources)
+
+    # Validate: topological sort will raise on cycles
+    dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+def _delete_path(path: Path) -> None:
+    """Delete a file or directory at path."""
+    if path.exists():
+        if path.is_dir():
+            shutil.rmtree(path)
+        else:
+            path.unlink()
+
+
+def _move_path(src: Path, dst: Path) -> None:
+    """Move a file or directory from src to dst."""
+    if src.exists():
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(src), str(dst))
+
 
 # --- Objects holding information into classes/function
 
 
-class AbstractData:
+class AbstractData(ABC):
     """Data definition groups common fields between a dataset and a data piece,
     such as tags and tasks"""
 
@@ -47,8 +149,7 @@ class AbstractData:
 
 
 class DataDefinition(AbstractData):
-    """Object that stores the declarative part of a data(set) description
-    """
+    """Object that stores the declarative part of a data(set) description"""
 
     def __init__(self, t, base=None):
         assert base is None or not inspect.isclass(t)
@@ -73,8 +174,10 @@ class DataDefinition(AbstractData):
         return self._description
 
     @staticmethod
-    def repository_relpath(t: type) -> Tuple[Repository, List[str]]:
+    def repository_relpath(t: type) -> Tuple["Repository", List[str]]:
         """Find the repository of the current data or dataset definition"""
+        from .context import Context  # noqa: F811
+
         repositorymap = Context.instance().repositorymap
 
         fullname = f"{t.__module__}.{t.__name__}"
@@ -93,10 +196,7 @@ class DataDefinition(AbstractData):
         if components[0] == "datamaestro":
             longest_ix = 0
 
-        if repository is None:
-            raise Exception(f"Could not find the repository for {fullname}")
-
-        return repository, components[(longest_ix + 1) :]
+        return repository, [s.lower() for s in components[(longest_ix + 1) :]]
 
     def ancestors(self):
         ancestors = []
@@ -122,6 +222,15 @@ class AbstractDataset(AbstractData):
     - timestamp: whether the dataset version depends on the time of the download
     """
 
+    name: Optional[str] = None
+    """The name of the dataset"""
+
+    url: Optional[str] = None
+    """The URL of the dataset"""
+
+    doi: Optional[str] = None
+    """The DOI of this dataset"""
+
     def __init__(self, repository: Optional["Repository"]):
         super().__init__()
         self.repository = repository
@@ -130,6 +239,7 @@ class AbstractDataset(AbstractData):
 
         # Associated resources
         self.resources: Dict[str, "Download"] = {}
+        self.ordered_resources = []
 
         # Hooks
         # pre-use: before returning the dataset object
@@ -137,7 +247,6 @@ class AbstractDataset(AbstractData):
         self.hooks = {"pre-use": [], "pre-download": []}
 
         self.url = None
-        self.name: Optional[str] = None
         self.version = None
 
     @property
@@ -150,18 +259,25 @@ class AbstractDataset(AbstractData):
 
     @property
     def context(self):
+        if self.repository is None:
+            from datamaestro.context import Context  # noqa: F811
+
+            return Context.instance()
         return self.repository.context
 
     def prepare(self, download=False) -> "Base":
-        ds = self._prepare(download)
+        ds = self._prepare()
         ds.__datamaestro_dataset__ = self
+
+        if download:
+            ds.download()
         return ds
 
     def register_hook(self, hookname: str, hook: Callable):
         self.hooks[hookname].append(hook)
 
-    def _prepare(self, download=False) -> "Base":
-        raise NotImplementedError(f"prepare() in {self.__class__}")
+    @abstractmethod
+    def _prepare(self) -> "Base": ...
 
     def format(self, encoder: str) -> str:
         s = self.prepare()
@@ -181,26 +297,153 @@ class AbstractDataset(AbstractData):
         from datamaestro.data import Base
 
         if isinstance(data, Base):
-            data.id = f"{id}@{self.repository.name}"
+            try:
+                if data.id:
+                    # There is already an ID, skip this
+                    # and the descendants
+                    return
+            except KeyError:
+                pass
+
+            if self.repository is None:
+                data.id = id
+            else:
+                data.id = f"{id}@{self.repository.name}"
             for key, value in data.__xpm__.values.items():
                 if isinstance(value, Config):
                     self.setDataIDs(value, f"{id}.{key}")
 
     def download(self, force=False):
-        """Download all the necessary resources"""
+        """Download all the necessary resources.
+
+        Uses DAG-based topological ordering and the two-path system:
+        1. Acquire exclusive lock (.state.lock)
+        2. Resource writes to transient_path (under .downloads/)
+        3. Framework moves transient_path → path (main folder)
+        4. State marked COMPLETE
+        5. Transient dependencies cleaned up eagerly
+        6. .downloads/ directory removed after all resources complete
+        7. Release lock
+        """
+        import fcntl
+
+        from datamaestro.download import ResourceState
+
+        self.prepare()
+        logging.info(
+            "Materializing %d resources",
+            len(self.ordered_resources),
+        )
+
+        self.datapath.mkdir(parents=True, exist_ok=True)
+        lock_path = self.datapath / ".state.lock"
+        lock_file = lock_path.open("w")
+        try:
+            fcntl.flock(lock_file, fcntl.LOCK_EX)
+            success = self._download_locked(force, ResourceState)
+        finally:
+            fcntl.flock(lock_file, fcntl.LOCK_UN)
+            lock_file.close()
+
+        return success
+
+    def _download_locked(self, force, ResourceState):
+        """Inner download logic, called while holding .state.lock."""
         success = True
-        for key, resource in self.resources.items():
+
+        for resource in self.ordered_resources:
+            # Step 1: Check state
+            current_state = resource.state
+
+            if current_state == ResourceState.COMPLETE and not force:
+                # Verify files are actually present on disk
+                if resource.has_files() and not resource.path.exists():
+                    logging.warning(
+                        "Resource %s marked COMPLETE but files "
+                        "missing at %s — re-downloading",
+                        resource.name,
+                        resource.path,
+                    )
+                    resource.state = ResourceState.NONE
+                    current_state = ResourceState.NONE
+                else:
+                    continue
+
+            # Adopt pre-existing files (old downloads without state file)
+            if (
+                current_state == ResourceState.NONE
+                and not force
+                and resource.has_files()
+                and resource.path.exists()
+            ):
+                logging.info(
+                    "Resource %s already exists at %s — marking COMPLETE",
+                    resource.name,
+                    resource.path,
+                )
+                resource.state = ResourceState.COMPLETE
+                continue
+
+            if current_state == ResourceState.PARTIAL:
+                if not resource.can_recover:
+                    _delete_path(resource.transient_path)
+                    resource.state = ResourceState.NONE
+
+            # Verify all dependencies are COMPLETE
+            for dep in resource.dependencies:
+                if dep.state != ResourceState.COMPLETE:
+                    logging.error(
+                        "Dependency %s of %s is not COMPLETE",
+                        dep.name,
+                        resource.name,
+                    )
+                    return False
+
+            # Step 2-4: Download with framework-managed state
             try:
-                resource.download(force)
+                resource.download(force=force)
+
+                # Move transient -> final, mark COMPLETE
+                if resource.has_files():
+                    _move_path(resource.transient_path, resource.path)
+                resource.state = ResourceState.COMPLETE
+
             except Exception:
-                logging.error("Could not download resource %s", key)
+                logging.error("Could not download resource %s", resource)
                 traceback.print_exc()
+
+                # Handle PARTIAL state
+                if resource.has_files() and resource.transient_path.exists():
+                    if resource.can_recover:
+                        resource.state = ResourceState.PARTIAL
+                    else:
+                        _delete_path(resource.transient_path)
+                        resource.state = ResourceState.NONE
                 success = False
+                break
+
+            # Step 5: Eager transient cleanup
+            for dep in resource.dependencies:
+                if dep.transient and all(
+                    d.state == ResourceState.COMPLETE for d in dep.dependents
+                ):
+                    dep.cleanup()
+
+        # Step 6: Remove .downloads/ directory after success
+        if success:
+            downloads_dir = self.datapath / ".downloads"
+            if downloads_dir.is_dir():
+                shutil.rmtree(downloads_dir)
+
         return success
 
     @staticmethod
-    def find(name: str) -> "DataDefinition":
+    def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
         """Find a dataset given its name"""
+        from datamaestro.context import Context  # noqa: F811
+
+        context = Context.instance() if context is None else context
+
         logging.debug("Searching dataset %s", name)
         for repository in Context.instance().repositories():
             logging.debug("Searching dataset %s in %s", name, repository)
@@ -211,7 +454,7 @@ class AbstractDataset(AbstractData):
 
 
 class FutureAttr:
-    """Allows to access a dataset subproperty"""
+    """Allows to access a dataset sub-property"""
 
     def __init__(self, dataset, keys):
         self.dataset = dataset
@@ -237,11 +480,13 @@ class FutureAttr:
 class DatasetWrapper(AbstractDataset):
     """Wraps an annotated method into a dataset
 
-    This is the standard way to define a dataset in datamaestro
+    This is the standard way to define a dataset in datamaestro through
+    annotations (otherwise, derive from `AbstractDataset`).
     """
 
-    def __init__(self, annotation, t: type):
-
+    def __init__(self, annotation: "dataset", t: type):
+        self.config = None
+        self.repository: Optional[Repository] = None
         self.t = t
         self.base = annotation.base
         assert self.base is not None, f"Could not set the Config type for {t}"
@@ -249,82 +494,159 @@ class DatasetWrapper(AbstractDataset):
         repository, components = DataDefinition.repository_relpath(t)
         super().__init__(repository)
 
+        self.module_name = None
+        if repository is None:
+            # Try to find the module name
+            self.module_name, _ = t.__module__.split(".", 1)
+
         # Set some variables
         self.url = annotation.url
+        self.doi = annotation.doi
+        self.as_prepare = annotation.as_prepare
 
         # Builds the ID:
         # Removes module_name.config prefix
-        assert (
-            components[0] == "config"
-        ), f"A @dataset object should be in the .config module (not {t.__module__})"
+        if (
+            (annotation.id is None)
+            or (annotation.id == "")
+            or ("." not in annotation.id)
+            or (annotation.id[0] == ".")
+        ):
+            # Computes an ID
+            assert (
+                # id is empty string = use the module id
+                components[0] == "config"
+            ), (
+                "A @dataset without `id` should be in the "
+                f".config module (not {t.__module__})"
+            )
+
+            if annotation.id is None:
+                # There is nothing, use the full path
+                path = ".".join(components[1:])
+            else:
+                # Replace
+                path = ".".join(components[1:-1])
+                if annotation.id != "":
+                    path = f"{path}.{annotation.id}"
 
-        path = ".".join(components[1:-1])
-        if annotation.id == "":
-            # id is empty string = use the module id
             self.id = path
         else:
-            self.id = "%s.%s" % (
-                path,
-                annotation.id or t.__name__.lower().replace("_", "."),
-            )
+            # Use the provided ID
+            self.id = annotation.id
 
         self.aliases.add(self.id)
 
         # Get the documentation
-        self._description = ""
-        if t.__doc__:
-            lines = t.__doc__.split("\n", 2)
-            self.name = lines[0]
-            if len(lines) > 1:
-                assert lines[1].strip() == "", "Second line should be blank"
-            if len(lines) > 2:
-                self._description = lines[2]
+        self._name = None
+        self._description = None
+
+    @property
+    def name(self):
+        self._process_doc()
+        return self._name
 
     @property
     def description(self):
+        self._process_doc()
         return self._description
 
+    def _process_doc(self):
+        if self._description is None:
+            if self.t.__doc__:
+                lines = self.t.__doc__.split("\n")
+                self._name = lines[0]
+                if len(lines) > 1:
+                    assert lines[1].strip() == "", "Second line should be blank"
+                if len(lines) > 2:
+                    # Remove the common indent
+                    lines = [line.rstrip() for line in lines[2:]]
+                    minindent = max(
+                        next(idx for idx, chr in enumerate(s) if not chr.isspace())
+                        for s in lines
+                        if len(s) > 0
+                    )
+                    self._description = "\n".join(
+                        s[minindent:] if len(s) > 0 else "" for s in lines
+                    )
            else:
+                self._name = ""
+                self._description = ""
+
     @property
     def configtype(self):
         return self.base
 
-    def __call__(self, *args, **kwargs):
-        self.t(*args, **kwargs)
-
     def __getattr__(self, key):
         """Returns a pointer to a potential attribute"""
         return FutureAttr(self, [key])
 
-    def _prepare(self, download=False) -> "Base":
-        if download:
-            for hook in self.hooks["pre-download"]:
-                hook(self)
-            if not self.download(False):
-                raise Exception("Could not load necessary resources")
-        logging.debug("Building with data type %s and dataset %s", self.base, self.t)
-        for hook in self.hooks["pre-use"]:
-            hook(self)
-
-        resources = {key: value.prepare() for key, value in self.resources.items()}
-        dict = self.t(**resources)
-        if dict is None:
-            name = self.t.__name__
-            filename = inspect.getfile(self.t)
-            raise Exception(
-                f"The dataset method {name} defined in {filename} returned a null object"
+    def download(self, force=False):
+        if self.base is self.t:
+            self._prepare()
+        return super().download(force=force)
+
+    def _prepare(self) -> "Base":
+        if self.config is not None:
+            return self.config
+
+        # Direct creation of the dataset
+        if self.base is self.t:
+            self.config = self.base.__create_dataset__(self)
+
+        else:
+            # Construct the object
+            if self.as_prepare:
+                result = self.t(self, None)
+            else:
+                resources = {
+                    key: value.prepare() for key, value in self.resources.items()
+                }
+                result = self.t(**resources)
+
+            if result is None:
+                raise RuntimeError(f"{self.base} did not return any resource")
+
+            # Download resources
+            logging.debug(
+                "Building with data type %s and dataset %s", self.base, self.t
             )
+            for hook in self.hooks["pre-use"]:
+                hook(self)
+
+            if result is None:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise Exception(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned a null object"
+                )
+
+            if isinstance(result, dict):
+                self.config = self.base.C(**result)
+            elif isinstance(result, self.base):
+                self.config = result
+            else:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise RuntimeError(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned an object of type {type(dict)}"
                )
 
-        # Constrcut the object
-        data = self.base(**dict)
+        # Setup ourself
+        self.config.__datamaestro_dataset__ = self
 
         # Set the ids
-        self.setDataIDs(data, self.id)
+        self.setDataIDs(self.config, self.id)
 
-        return data
+        return self.config
+
+    __call__ = _prepare
 
     @property
-    def path(self) -> Path:
-        """Returns the path"""
+    def _path(self) -> Path:
+        """Returns a unique relative path for this dataset"""
         path = Path(*self.id.split("."))
         if self.version:
             path = path.with_suffix(".v%s" % self.version)
@@ -333,16 +655,32 @@ class DatasetWrapper(AbstractDataset):
     @property
     def datapath(self):
         """Returns the destination path for downloads"""
-        return self.repository.datapath / self.path
+        if self.repository is not None:
+            return self.repository.datapath / self._path
 
-    def hasfiles(self) -> bool:
-        """Returns whether this dataset has files or only includes references"""
+        # No repository, use __custom__/[MODULE NAME]
+        path = self.context.datapath / "__custom__" / self.module_name / self._path
+
+        return path
+
+    def has_files(self) -> bool:
+        """Returns whether this dataset has files or only includes references."""
         for resource in self.resources.values():
-            if resource.hasfiles():
+            if resource.has_files():
                 return True
-
         return False
 
+    def hasfiles(self) -> bool:
+        """Deprecated: use has_files() instead."""
+        import warnings
+
+        warnings.warn(
+            "hasfiles() is deprecated, use has_files()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.has_files()
+
 
 # --- Annotations
 
@@ -358,9 +696,9 @@ class DataAnnotation:
             self.annotate(object.__datamaestro__)
         else:
             # With configuration objects, add a __datamaestro__ member to the class
-            assert issubclass(
-                object, Config
-            ), f"{object} cannot be annotated (only dataset or data definitions)"
+            assert issubclass(object, Config), (
+                f"{object} cannot be annotated (only dataset or data definitions)"
+            )
             if "__datamaestro__" not in object.__dict__:
                 object.__datamaestro__ = AbstractData()
             self.annotate(object.__datamaestro__)
@@ -375,10 +713,16 @@ class DatasetAnnotation:
     """Base class for all annotations"""
 
     def __call__(self, dataset: AbstractDataset):
-        assert isinstance(
-            dataset, AbstractDataset
-        ), f"Only datasets can be annotated with {self}, but {dataset} is not a dataset"
-        self.annotate(dataset)
+        if isinstance(dataset, AbstractDataset):
+            self.annotate(dataset)
+        elif issubclass(dataset, Dataset):
+            self.annotate(dataset.__datamaestro__)
+        else:
+            raise RuntimeError(
+                f"Only datasets can be annotated with {self}, "
+                f"but {dataset} is not a dataset"
+            )
+
         return dataset
 
     def annotate(self, dataset: AbstractDataset):
@@ -425,54 +769,47 @@ def DataTagging(f):
 datatags = DataTagging(lambda d: d.tags)
 datatasks = DataTagging(lambda d: d.tasks)
 
-# T = TypeVar("T")
-# def data(description=None):
-#     """Deprecated: simply deriving from Base data is enough"""
-#     if description is not None and not isinstance(description, str):
-#         raise RuntimeError("@data annotation should be written @data()")
-
-#     def annotate(t: T):
-#         try:
-#             object.__getattribute__(t, "__datamaestro__")
-#             logging.warning("@data should only be called once")
-#         except AttributeError:
-#             pass
-
-#         # Determine the data type
-#         from experimaestro import config
 
-#         repository, components = DataDefinition.repository_relpath(t)
-#         assert (
-#             components[0] == "data"
-#         ), f"A @data object should be in the .data module (not {t.__module__})"
+class metadata:
+    def __init__(
+        self,
+        tags: Union[str, List[str]] = None,
+        tasks: Union[str, List[str]] = None,
+    ):
+        pass
 
-#         identifier = (
-#             f"{repository.NAMESPACE if repository else 'datamaestro'}."
-#             + ".".join(components[1:]).lower()
-#         )
-#         t = config(identifier)(t)
-#         t.__datamaestro__ = DataDefinition(repository, t)
-
-#         return t
-
-#     return annotate
+    def __call__(self, object: type):
+        # FIXME: todo
+        return object
 
 
 class dataset:
-    def __init__(self, base=None, *, timestamp=None, id=None, url=None, size=None):
-        """Creates a new (meta)dataset
-
-        Meta-datasets are not associated with any
-
-        Arguments:
-            base {[type]} -- The base type (or None if infered from type annotation)
+    """Dataset decorator
+
+    Meta-datasets are not associated with any base type.
+
+    :param base: The base type (or None if inferred from type annotation).
+    :param timestamp: If the dataset evolves, specify its timestamp.
+    :param id: Gives the full ID of the dataset if it contains a '.',
+        the last component if not containing a '.', or the last components
+        if starting with '.'
+    :param url: The URL associated with the dataset.
+    :param size: The size of the dataset (should be a parsable format).
+    :param doi: The DOI of the corresponding paper.
+    :param as_prepare: Resources are setup within the method itself
+    """
 
-        Keyword Arguments:
-            timestamp {bool} -- If the dataset evolves, specify its timestamp (default: None)
-            id {[type]} -- [description] (default: {None})
-            url {[type]} -- [description] (default: {None})
-            size {str} -- The size (should be a parsable format)
-        """
+    def __init__(
+        self,
+        base=None,
+        *,
+        timestamp: str | None = None,
+        id: None | str = None,
+        url: None | str = None,
+        size: None | int | str = None,
+        doi: None | str = None,
+        as_prepare: bool = False,
+    ):
         if hasattr(base, "__datamaestro__") and isinstance(
             base.__datamaestro__, metadataset
         ):
@@ -485,24 +822,46 @@ class dataset:
         self.meta = False
         self.timestamp = timestamp
         self.size = size
+        self.doi = doi
+        self.as_prepare = as_prepare
 
     def __call__(self, t):
+        from datamaestro.data import Base
+
        try:
             if self.base is None:
-                # Get type from return annotation
-                self.base = t.__annotations__["return"]
+                if inspect.isclass(t) and issubclass(t, Base):
+                    self.base = t
+                else:
+                    try:
+                        # Get type from return annotation
+                        return_type = t.__annotations__["return"]
+                        if isinstance(return_type, _GenericAlias):
+                            return_type = return_type.__origin__
+                        self.base = return_type
+                    except KeyError:
+                        logging.warning("No return annotation in %s", t)
+                        raise
             object.__getattribute__(t, "__datamaestro__")
             raise AssertionError("@data should only be called once")
        except AttributeError:
             pass
-
         dw = DatasetWrapper(self, t)
+        t.__dataset__ = dw
+
+        # For class-based datasets, scan for Resource class attributes
+        if inspect.isclass(t) and issubclass(t, Base):
+            _bind_class_resources(t, dw)
+            return t
         return dw
 
 
 class metadataset(AbstractDataset):
-    """Annotation for object/functions which are abstract dataset definitions -- i.e. shared
-    by more than one real dataset. This is useful to share tags, urls, etc."""
+    """Annotation for object/functions which are abstract dataset definitions
+
+    i.e. shared by more than one real dataset. This is useful to share tags,
+    urls, etc.
+    """
 
     def __init__(self, base):
         super().__init__(None)
@@ -516,3 +875,5 @@ class metadataset(AbstractDataset):
             pass
         t.__datamaestro__ = self
         return t
+
+    _prepare = None
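
The `download()` docstring in this diff describes the new materialization order: resources form a DAG and are processed dependencies-first via a depth-first topological sort with cycle detection. A standalone sketch of that ordering (plain Python mirroring `topological_sort` above; the `Toy` class and the archive/extracted/index names are invented for illustration, not datamaestro API):

# Standalone illustration of the depth-first topological sort used above.
# "Toy" and its fields are stand-ins; only the algorithm mirrors the diff.
class Toy:
    def __init__(self, name, dependencies=()):
        self.name = name
        self.dependencies = list(dependencies)


def topo(resources):
    visited, visiting, order = set(), set(), []

    def visit(r):
        if r.name in visited:
            return
        if r.name in visiting:  # back-edge: dependency cycle
            raise ValueError(f"Cycle involving {r.name}")
        visiting.add(r.name)
        for dep in r.dependencies:
            visit(dep)
        visiting.discard(r.name)
        visited.add(r.name)
        order.append(r)

    for r in resources:
        visit(r)
    return order


archive = Toy("archive")                 # e.g. a downloaded tarball
extracted = Toy("extracted", [archive])  # depends on the tarball
index = Toy("index", [extracted])
print([r.name for r in topo([index, archive, extracted])])
# ['archive', 'extracted', 'index'] (dependencies always come first)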
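
The "two-path system" from that same docstring (steps 1-4 and 6-7) can likewise be sketched with stdlib primitives only. The `.state.lock` name, the `.downloads/` staging directory, and the move into the main folder follow the diff; the payload file below is a stand-in, and real resources persist their state through `ResourceState`:

# Minimal sketch of the two-path protocol: write under .downloads/, then
# move into the main folder while holding an exclusive lock (Unix-only,
# as the diff itself relies on fcntl).
import fcntl
import shutil
from pathlib import Path

datapath = Path("/tmp/demo-dataset")
datapath.mkdir(parents=True, exist_ok=True)

with (datapath / ".state.lock").open("w") as lock_file:
    fcntl.flock(lock_file, fcntl.LOCK_EX)  # step 1: exclusive lock
    try:
        transient = datapath / ".downloads" / "hello.txt"
        transient.parent.mkdir(parents=True, exist_ok=True)
        transient.write_text("payload\n")  # step 2: write to transient path
        final = datapath / "hello.txt"
        final.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(transient), str(final))  # step 3: move into place
        # step 4: the framework would mark the resource COMPLETE here
        shutil.rmtree(datapath / ".downloads", ignore_errors=True)  # step 6
    finally:
        fcntl.flock(lock_file, fcntl.LOCK_UN)  # step 7: release lock

Interrupted downloads leave the main folder untouched: only fully materialized resources are ever moved out of `.downloads/`, which is what lets the framework resume from PARTIAL state or discard the transient path when a resource cannot recover.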