datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/definitions.py
CHANGED
|
@@ -2,13 +2,16 @@
|
|
|
2
2
|
# Main datamaestro functions and data models
|
|
3
3
|
#
|
|
4
4
|
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
5
7
|
import logging
|
|
6
8
|
import inspect
|
|
9
|
+
import shutil
|
|
7
10
|
from pathlib import Path
|
|
8
11
|
from itertools import chain
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
9
13
|
import traceback
|
|
10
14
|
from typing import (
|
|
11
|
-
Any,
|
|
12
15
|
Dict,
|
|
13
16
|
List,
|
|
14
17
|
Optional,
|
|
@@ -18,20 +21,119 @@ from typing import (
|
|
|
18
21
|
Callable,
|
|
19
22
|
TYPE_CHECKING,
|
|
20
23
|
Union,
|
|
24
|
+
_GenericAlias,
|
|
25
|
+
)
|
|
26
|
+
from experimaestro import ( # noqa: F401 (re-exports)
|
|
27
|
+
Param,
|
|
28
|
+
Option,
|
|
29
|
+
Config,
|
|
30
|
+
Meta,
|
|
21
31
|
)
|
|
22
|
-
from
|
|
23
|
-
from typing import Type as TypingType
|
|
32
|
+
from typing import Type as TypingType # noqa: F401 (re-exports)
|
|
24
33
|
from experimaestro.core.types import Type # noqa: F401 (re-exports)
|
|
25
|
-
from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
|
|
26
34
|
|
|
27
35
|
if TYPE_CHECKING:
|
|
28
|
-
from
|
|
29
|
-
from .
|
|
36
|
+
from .data import Base, Dataset
|
|
37
|
+
from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
|
|
38
|
+
from datamaestro.download import Download, Resource
|
|
39
|
+
|
|
40
|
+
# --- DAG utilities ---
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
    """Topological sort of resources by their dependencies.

    The traversal is a depth-first search driven by an explicit stack, so a
    long dependency chain cannot exhaust the interpreter recursion limit.

    Args:
        resources: Dict mapping resource names to Resource instances. Each
            resource must expose ``name`` and an iterable ``dependencies``.

    Returns:
        List of resources in dependency order (dependencies first). Roots
        are explored in the iteration order of ``resources``.

    Raises:
        ValueError: If a cycle is detected in the dependency graph.
    """
    visited: set[str] = set()
    visiting: set[str] = set()  # Names on the current DFS path (cycle detection)
    result: list["Resource"] = []

    for root in resources.values():
        if root.name in visited:
            continue

        visiting.add(root.name)
        # Each frame pairs a resource with an iterator over the dependencies
        # that still have to be explored for it.
        stack = [(root, iter(root.dependencies))]

        while stack:
            node, pending = stack[-1]
            for dep in pending:
                if dep.name in visited:
                    continue
                if dep.name in visiting:
                    raise ValueError(
                        f"Cycle detected in resource dependencies involving {dep.name}"
                    )
                # Descend into this dependency before finishing `node`
                visiting.add(dep.name)
                stack.append((dep, iter(dep.dependencies)))
                break
            else:
                # All dependencies handled: emit the resource (post-order)
                stack.pop()
                visiting.discard(node.name)
                visited.add(node.name)
                result.append(node)

    return result
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _compute_dependents(resources: dict[str, "Resource"]) -> None:
    """Rebuild the ``_dependents`` lists (inverse dependency edges).

    Every resource in ``resources`` first gets an empty ``_dependents`` list;
    then each resource is appended (once) to the ``_dependents`` list of each
    of its dependencies.
    """
    nodes = list(resources.values())

    # Reset any previously computed inverse edges
    for node in nodes:
        node._dependents = []

    # A resource is a dependent of everything it depends on
    for node in nodes:
        for upstream in node.dependencies:
            known = upstream._dependents
            if node not in known:
                known.append(node)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
    """Bind the Resource instances declared as class attributes of ``cls``.

    Only attributes defined directly on ``cls`` are scanned. Each Resource
    found is bound under its attribute name through ``Resource.bind``; the
    dependents are then recomputed and the topologically sorted resources
    are stored in ``dataset_wrapper.ordered_resources``.

    Args:
        cls: The dataset class to scan.
        dataset_wrapper: The AbstractDataset to bind resources to.

    Raises:
        ValueError: Propagated from ``topological_sort`` when the resource
            dependencies contain a cycle.
    """
    from datamaestro.download import Resource

    for attr_name, attr_value in cls.__dict__.items():
        if not isinstance(attr_value, Resource):
            continue
        attr_value.bind(attr_name, dataset_wrapper)

    bound = dataset_wrapper.resources

    # Build the dependency DAG
    _compute_dependents(bound)

    # Validate: topological sort will raise on cycles
    dataset_wrapper.ordered_resources = topological_sort(bound)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _delete_path(path: Path) -> None:
    """Delete a file, symbolic link or directory at path.

    Symbolic links are removed themselves (their target is left untouched),
    including dangling links, for which ``Path.exists()`` is false. Nothing
    happens when ``path`` does not exist.
    """
    if path.is_symlink():
        # Must be tested first: is_dir()/exists() follow the link, and
        # shutil.rmtree refuses to operate on a symbolic link.
        path.unlink()
    elif path.is_dir():
        shutil.rmtree(path)
    elif path.exists():
        path.unlink()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _move_path(src: Path, dst: Path) -> None:
    """Move a file or directory from src to dst, replacing dst if present.

    Nothing happens when ``src`` does not exist, or when ``src`` and ``dst``
    are the same file. The parent directory of ``dst`` is created when
    needed. An existing ``dst`` is removed first: ``shutil.move`` would
    otherwise place ``src`` *inside* an existing destination directory
    instead of replacing it.
    """
    if not src.exists():
        return
    if dst.exists() and src.samefile(dst):
        return

    dst.parent.mkdir(parents=True, exist_ok=True)

    # Clear the destination so that the move is a replacement
    if dst.is_symlink() or dst.is_file():
        dst.unlink()
    elif dst.is_dir():
        shutil.rmtree(dst)

    shutil.move(str(src), str(dst))
|
|
131
|
+
|
|
30
132
|
|
|
31
133
|
# --- Objects holding information into classes/function
|
|
32
134
|
|
|
33
135
|
|
|
34
|
-
class AbstractData:
|
|
136
|
+
class AbstractData(ABC):
|
|
35
137
|
"""Data definition groups common fields between a dataset and a data piece,
|
|
36
138
|
such as tags and tasks"""
|
|
37
139
|
|
|
@@ -47,8 +149,7 @@ class AbstractData:
|
|
|
47
149
|
|
|
48
150
|
|
|
49
151
|
class DataDefinition(AbstractData):
|
|
50
|
-
"""Object that stores the declarative part of a data(set) description
|
|
51
|
-
"""
|
|
152
|
+
"""Object that stores the declarative part of a data(set) description"""
|
|
52
153
|
|
|
53
154
|
def __init__(self, t, base=None):
|
|
54
155
|
assert base is None or not inspect.isclass(t)
|
|
@@ -73,8 +174,10 @@ class DataDefinition(AbstractData):
|
|
|
73
174
|
return self._description
|
|
74
175
|
|
|
75
176
|
@staticmethod
|
|
76
|
-
def repository_relpath(t: type) -> Tuple[Repository, List[str]]:
|
|
177
|
+
def repository_relpath(t: type) -> Tuple["Repository", List[str]]:
|
|
77
178
|
"""Find the repository of the current data or dataset definition"""
|
|
179
|
+
from .context import Context # noqa: F811
|
|
180
|
+
|
|
78
181
|
repositorymap = Context.instance().repositorymap
|
|
79
182
|
|
|
80
183
|
fullname = f"{t.__module__}.{t.__name__}"
|
|
@@ -93,10 +196,7 @@ class DataDefinition(AbstractData):
|
|
|
93
196
|
if components[0] == "datamaestro":
|
|
94
197
|
longest_ix = 0
|
|
95
198
|
|
|
96
|
-
|
|
97
|
-
raise Exception(f"Could not find the repository for {fullname}")
|
|
98
|
-
|
|
99
|
-
return repository, components[(longest_ix + 1) :]
|
|
199
|
+
return repository, [s.lower() for s in components[(longest_ix + 1) :]]
|
|
100
200
|
|
|
101
201
|
def ancestors(self):
|
|
102
202
|
ancestors = []
|
|
@@ -122,6 +222,15 @@ class AbstractDataset(AbstractData):
|
|
|
122
222
|
- timestamp: whether the dataset version depends on the time of the download
|
|
123
223
|
"""
|
|
124
224
|
|
|
225
|
+
name: Optional[str] = None
|
|
226
|
+
"""The name of the dataset"""
|
|
227
|
+
|
|
228
|
+
url: Optional[str] = None
|
|
229
|
+
"""The URL of the dataset"""
|
|
230
|
+
|
|
231
|
+
doi: Optional[str] = None
|
|
232
|
+
"""The DOI of this dataset"""
|
|
233
|
+
|
|
125
234
|
def __init__(self, repository: Optional["Repository"]):
|
|
126
235
|
super().__init__()
|
|
127
236
|
self.repository = repository
|
|
@@ -130,6 +239,7 @@ class AbstractDataset(AbstractData):
|
|
|
130
239
|
|
|
131
240
|
# Associated resources
|
|
132
241
|
self.resources: Dict[str, "Download"] = {}
|
|
242
|
+
self.ordered_resources = []
|
|
133
243
|
|
|
134
244
|
# Hooks
|
|
135
245
|
# pre-use: before returning the dataset object
|
|
@@ -137,7 +247,6 @@ class AbstractDataset(AbstractData):
|
|
|
137
247
|
self.hooks = {"pre-use": [], "pre-download": []}
|
|
138
248
|
|
|
139
249
|
self.url = None
|
|
140
|
-
self.name: Optional[str] = None
|
|
141
250
|
self.version = None
|
|
142
251
|
|
|
143
252
|
@property
|
|
@@ -150,18 +259,25 @@ class AbstractDataset(AbstractData):
|
|
|
150
259
|
|
|
151
260
|
@property
def context(self):
    """Context of this dataset.

    This is the context of the repository when there is one, and the
    instance returned by ``Context.instance()`` otherwise.
    """
    if self.repository is not None:
        return self.repository.context

    # Imported lazily, only when no repository is attached
    from datamaestro.context import Context  # noqa: F811

    return Context.instance()
|
|
154
267
|
|
|
155
268
|
def prepare(self, download=False) -> "Base":
    """Return the data object built by ``_prepare``, tagged with this dataset.

    Args:
        download: When true, ``download()`` is called on the data object
            before it is returned.

    Returns:
        The object returned by ``_prepare``, with its
        ``__datamaestro_dataset__`` attribute set to this dataset.
    """
    prepared = self._prepare()
    prepared.__datamaestro_dataset__ = self

    if not download:
        return prepared

    prepared.download()
    return prepared
|
|
159
275
|
|
|
160
276
|
def register_hook(self, hookname: str, hook: Callable):
    """Append ``hook`` to the list of hooks registered under ``hookname``.

    Raises:
        KeyError: If ``hookname`` is not a key of ``self.hooks``.
    """
    registered = self.hooks[hookname]
    registered.append(hook)
|
|
162
278
|
|
|
163
|
-
|
|
164
|
-
|
|
279
|
+
@abstractmethod
def _prepare(self) -> "Base":
    """Build and return the data object of this dataset.

    Abstract: concrete datasets must override it; ``prepare()`` calls it and
    tags the returned object with the dataset.
    """
|
|
165
281
|
|
|
166
282
|
def format(self, encoder: str) -> str:
|
|
167
283
|
s = self.prepare()
|
|
@@ -181,26 +297,153 @@ class AbstractDataset(AbstractData):
|
|
|
181
297
|
from datamaestro.data import Base
|
|
182
298
|
|
|
183
299
|
if isinstance(data, Base):
|
|
184
|
-
|
|
300
|
+
try:
|
|
301
|
+
if data.id:
|
|
302
|
+
# There is already an ID, skip this
|
|
303
|
+
# and the descendants
|
|
304
|
+
return
|
|
305
|
+
except KeyError:
|
|
306
|
+
pass
|
|
307
|
+
|
|
308
|
+
if self.repository is None:
|
|
309
|
+
data.id = id
|
|
310
|
+
else:
|
|
311
|
+
data.id = f"{id}@{self.repository.name}"
|
|
185
312
|
for key, value in data.__xpm__.values.items():
|
|
186
313
|
if isinstance(value, Config):
|
|
187
314
|
self.setDataIDs(value, f"{id}.{key}")
|
|
188
315
|
|
|
189
316
|
def download(self, force=False):
    """Download all the necessary resources.

    Uses DAG-based topological ordering and the two-path system:
    1. Acquire exclusive lock (.state.lock)
    2. Resource writes to transient_path (under .downloads/)
    3. Framework moves transient_path → path (main folder)
    4. State marked COMPLETE
    5. Transient dependencies cleaned up eagerly
    6. .downloads/ directory removed after all resources complete
    7. Release lock

    Args:
        force: Forwarded to ``_download_locked``.

    Returns:
        The value returned by ``_download_locked``.
    """
    # NOTE: fcntl is only available on Unix platforms
    import fcntl

    from datamaestro.download import ResourceState

    self.prepare()
    logging.info(
        "Materializing %d resources",
        len(self.ordered_resources),
    )

    self.datapath.mkdir(parents=True, exist_ok=True)
    lock_path = self.datapath / ".state.lock"

    # The context manager closes the lock file on every exit path, even if
    # acquiring or releasing the lock raises.
    with lock_path.open("w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            return self._download_locked(force, ResourceState)
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)
|
|
349
|
+
|
|
350
|
+
def _download_locked(self, force, ResourceState):
    """Inner download logic, called while holding .state.lock.

    Resources are processed in the order of ``self.ordered_resources``.

    Args:
        force: When true, resources already marked COMPLETE are downloaded
            again and pre-existing files are not adopted.
        ResourceState: The state enumeration (NONE / PARTIAL / COMPLETE),
            passed in by ``download``.

    Returns:
        True when every resource was processed; False as soon as a
        dependency is not COMPLETE or a download raises (the remaining
        resources are then left untouched).
    """
    for resource in self.ordered_resources:
        # Step 1: Check state
        state = resource.state

        if state == ResourceState.COMPLETE and not force:
            # Verify files are actually present on disk
            files_missing = resource.has_files() and not resource.path.exists()
            if not files_missing:
                continue
            logging.warning(
                "Resource %s marked COMPLETE but files "
                "missing at %s — re-downloading",
                resource.name,
                resource.path,
            )
            resource.state = ResourceState.NONE
            state = ResourceState.NONE

        # Adopt pre-existing files (old downloads without state file)
        if (
            state == ResourceState.NONE
            and not force
            and resource.has_files()
            and resource.path.exists()
        ):
            logging.info(
                "Resource %s already exists at %s — marking COMPLETE",
                resource.name,
                resource.path,
            )
            resource.state = ResourceState.COMPLETE
            continue

        # A partial download that cannot be resumed is restarted from scratch
        if state == ResourceState.PARTIAL and not resource.can_recover:
            _delete_path(resource.transient_path)
            resource.state = ResourceState.NONE

        # Verify all dependencies are COMPLETE
        for dep in resource.dependencies:
            if dep.state != ResourceState.COMPLETE:
                logging.error(
                    "Dependency %s of %s is not COMPLETE",
                    dep.name,
                    resource.name,
                )
                return False

        # Step 2-4: Download with framework-managed state
        try:
            resource.download(force=force)

            # Move transient -> final, mark COMPLETE
            if resource.has_files():
                _move_path(resource.transient_path, resource.path)
            resource.state = ResourceState.COMPLETE

        except Exception:
            logging.error("Could not download resource %s", resource)
            traceback.print_exc()

            # Handle PARTIAL state
            if resource.has_files() and resource.transient_path.exists():
                if resource.can_recover:
                    resource.state = ResourceState.PARTIAL
                else:
                    _delete_path(resource.transient_path)
                    resource.state = ResourceState.NONE
            # Stop at the first failure; .downloads/ is kept
            return False

        # Step 5: Eager transient cleanup
        for dep in resource.dependencies:
            if dep.transient and all(
                d.state == ResourceState.COMPLETE for d in dep.dependents
            ):
                dep.cleanup()

    # Step 6: Remove .downloads/ directory after success
    downloads_dir = self.datapath / ".downloads"
    if downloads_dir.is_dir():
        shutil.rmtree(downloads_dir)

    return True
|
|
200
439
|
|
|
201
440
|
@staticmethod
|
|
202
|
-
def find(name: str) -> "DataDefinition":
|
|
441
|
+
def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
|
|
203
442
|
"""Find a dataset given its name"""
|
|
443
|
+
from datamaestro.context import Context # noqa: F811
|
|
444
|
+
|
|
445
|
+
context = Context.instance() if context is None else context
|
|
446
|
+
|
|
204
447
|
logging.debug("Searching dataset %s", name)
|
|
205
448
|
for repository in Context.instance().repositories():
|
|
206
449
|
logging.debug("Searching dataset %s in %s", name, repository)
|
|
@@ -211,7 +454,7 @@ class AbstractDataset(AbstractData):
|
|
|
211
454
|
|
|
212
455
|
|
|
213
456
|
class FutureAttr:
|
|
214
|
-
"""Allows to access a dataset
|
|
457
|
+
"""Allows to access a dataset sub-property"""
|
|
215
458
|
|
|
216
459
|
def __init__(self, dataset, keys):
|
|
217
460
|
self.dataset = dataset
|
|
@@ -237,11 +480,13 @@ class FutureAttr:
|
|
|
237
480
|
class DatasetWrapper(AbstractDataset):
|
|
238
481
|
"""Wraps an annotated method into a dataset
|
|
239
482
|
|
|
240
|
-
This is the standard way to define a dataset in datamaestro
|
|
483
|
+
This is the standard way to define a dataset in datamaestro through
|
|
484
|
+
annotations (otherwise, derive from `AbstractDataset`).
|
|
241
485
|
"""
|
|
242
486
|
|
|
243
|
-
def __init__(self, annotation, t: type):
|
|
244
|
-
|
|
487
|
+
def __init__(self, annotation: "dataset", t: type):
|
|
488
|
+
self.config = None
|
|
489
|
+
self.repository: Optional[Repository] = None
|
|
245
490
|
self.t = t
|
|
246
491
|
self.base = annotation.base
|
|
247
492
|
assert self.base is not None, f"Could not set the Config type for {t}"
|
|
@@ -249,82 +494,159 @@ class DatasetWrapper(AbstractDataset):
|
|
|
249
494
|
repository, components = DataDefinition.repository_relpath(t)
|
|
250
495
|
super().__init__(repository)
|
|
251
496
|
|
|
497
|
+
self.module_name = None
|
|
498
|
+
if repository is None:
|
|
499
|
+
# Try to find the module name
|
|
500
|
+
self.module_name, _ = t.__module__.split(".", 1)
|
|
501
|
+
|
|
252
502
|
# Set some variables
|
|
253
503
|
self.url = annotation.url
|
|
504
|
+
self.doi = annotation.doi
|
|
505
|
+
self.as_prepare = annotation.as_prepare
|
|
254
506
|
|
|
255
507
|
# Builds the ID:
|
|
256
508
|
# Removes module_name.config prefix
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
509
|
+
if (
|
|
510
|
+
(annotation.id is None)
|
|
511
|
+
or (annotation.id == "")
|
|
512
|
+
or ("." not in annotation.id)
|
|
513
|
+
or (annotation.id[0] == ".")
|
|
514
|
+
):
|
|
515
|
+
# Computes an ID
|
|
516
|
+
assert (
|
|
517
|
+
# id is empty string = use the module id
|
|
518
|
+
components[0] == "config"
|
|
519
|
+
), (
|
|
520
|
+
"A @dataset without `id` should be in the "
|
|
521
|
+
f".config module (not {t.__module__})"
|
|
522
|
+
)
|
|
523
|
+
|
|
524
|
+
if annotation.id is None:
|
|
525
|
+
# There is nothing, use the full path
|
|
526
|
+
path = ".".join(components[1:])
|
|
527
|
+
else:
|
|
528
|
+
# Replace
|
|
529
|
+
path = ".".join(components[1:-1])
|
|
530
|
+
if annotation.id != "":
|
|
531
|
+
path = f"{path}.{annotation.id}"
|
|
260
532
|
|
|
261
|
-
path = ".".join(components[1:-1])
|
|
262
|
-
if annotation.id == "":
|
|
263
|
-
# id is empty string = use the module id
|
|
264
533
|
self.id = path
|
|
265
534
|
else:
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
annotation.id or t.__name__.lower().replace("_", "."),
|
|
269
|
-
)
|
|
535
|
+
# Use the provided ID
|
|
536
|
+
self.id = annotation.id
|
|
270
537
|
|
|
271
538
|
self.aliases.add(self.id)
|
|
272
539
|
|
|
273
540
|
# Get the documentation
|
|
274
|
-
self.
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
self._description = lines[2]
|
|
541
|
+
self._name = None
|
|
542
|
+
self._description = None
|
|
543
|
+
|
|
544
|
+
@property
def name(self):
    """Dataset name, as extracted by ``_process_doc``."""
    # Make sure the docstring has been parsed before reading the cache
    self._process_doc()
    return self._name
|
|
282
548
|
|
|
283
549
|
@property
def description(self):
    """Dataset description, as extracted by ``_process_doc``."""
    # Make sure the docstring has been parsed before reading the cache
    self._process_doc()
    return self._description
|
|
286
553
|
|
|
554
|
+
def _process_doc(self):
    """Extract the name and description from the docstring of ``self.t``.

    The first docstring line becomes ``_name``; the second line must be
    blank; the remaining lines, stripped of trailing whitespace and of their
    common leading indentation, become ``_description``. Without a docstring
    both are set to the empty string. Nothing is done once ``_description``
    has been set.
    """
    if self._description is not None:
        return

    doc = self.t.__doc__
    if not doc:
        self._name = ""
        self._description = ""
        return

    lines = doc.split("\n")
    self._name = lines[0]

    if len(lines) > 1:
        assert lines[1].strip() == "", "Second line should be blank"

    if len(lines) > 2:
        body = [line.rstrip() for line in lines[2:]]
        # Remove the common indent: the *smallest* indentation among the
        # non-empty lines (0 when the body only contains blank lines)
        common_indent = min(
            (len(line) - len(line.lstrip()) for line in body if line),
            default=0,
        )
        self._description = "\n".join(line[common_indent:] for line in body)
|
|
575
|
+
|
|
287
576
|
@property
def configtype(self):
    """The base type of this dataset (``self.base``)."""
    base = self.base
    return base
|
|
290
579
|
|
|
291
|
-
def __call__(self, *args, **kwargs):
|
|
292
|
-
self.t(*args, **kwargs)
|
|
293
|
-
|
|
294
580
|
def __getattr__(self, key):
    """Return a ``FutureAttr`` pointing to the attribute ``key``.

    Python only calls ``__getattr__`` when normal attribute lookup fails, so
    this applies to names that are not regular attributes of the wrapper.
    """
    keys = [key]
    return FutureAttr(self, keys)
|
|
297
583
|
|
|
298
|
-
def
|
|
299
|
-
if
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
584
|
+
def download(self, force=False):
    """Download the resources of this dataset.

    When the base type is the wrapped object itself, ``_prepare`` is called
    first; the download is then delegated to the parent class.
    """
    direct_creation = self.base is self.t
    if direct_creation:
        self._prepare()

    return super().download(force=force)
|
|
588
|
+
|
|
589
|
+
def _prepare(self) -> "Base":
    """Build the configuration object of this dataset (cached).

    The object is created once and stored in ``self.config``; later calls
    return it directly. When the base type is the wrapped object itself, it
    is created with ``base.__create_dataset__``. Otherwise the wrapped
    callable is invoked (with ``(self, None)`` when ``as_prepare`` is set,
    with the prepared resources as keyword arguments otherwise) and its
    result is either used as is (instance of the base type) or passed as
    keyword arguments to ``base.C`` (dict).

    Returns:
        The configuration object, with ``__datamaestro_dataset__`` set to
        this dataset and data IDs assigned through ``setDataIDs``.

    Raises:
        RuntimeError: If the wrapped callable returns ``None`` or an object
            that is neither a dict nor an instance of the base type.
    """
    if self.config is not None:
        return self.config

    # Direct creation of the dataset
    if self.base is self.t:
        self.config = self.base.__create_dataset__(self)

    else:
        # Construct the object
        if self.as_prepare:
            result = self.t(self, None)
        else:
            resources = {
                key: value.prepare() for key, value in self.resources.items()
            }
            result = self.t(**resources)

        if result is None:
            raise RuntimeError(f"{self.base} did not return any resource")

        logging.debug(
            "Building with data type %s and dataset %s", self.base, self.t
        )
        for hook in self.hooks["pre-use"]:
            hook(self)

        if isinstance(result, dict):
            self.config = self.base.C(**result)
        elif isinstance(result, self.base):
            self.config = result
        else:
            name = self.t.__name__
            filename = inspect.getfile(self.t)
            # Report the type of the offending object
            raise RuntimeError(
                f"The dataset method {name} defined in "
                f"{filename} returned an object of type {type(result)}"
            )

    # Setup ourself
    self.config.__datamaestro_dataset__ = self

    # Set the ids
    self.setDataIDs(self.config, self.id)

    return self.config

__call__ = _prepare
|
|
324
646
|
|
|
325
647
|
@property
|
|
326
|
-
def
|
|
327
|
-
"""Returns
|
|
648
|
+
def _path(self) -> Path:
|
|
649
|
+
"""Returns a unique relative path for this dataset"""
|
|
328
650
|
path = Path(*self.id.split("."))
|
|
329
651
|
if self.version:
|
|
330
652
|
path = path.with_suffix(".v%s" % self.version)
|
|
@@ -333,16 +655,32 @@ class DatasetWrapper(AbstractDataset):
|
|
|
333
655
|
@property
def datapath(self):
    """Returns the destination path for downloads"""
    if self.repository is None:
        # No repository, use __custom__/[MODULE NAME]
        custom_root = self.context.datapath / "__custom__"
        return custom_root / self.module_name / self._path

    return self.repository.datapath / self._path
|
|
665
|
+
|
|
666
|
+
def has_files(self) -> bool:
    """Returns whether this dataset has files or only includes references.

    True as soon as one of the resources reports files through its own
    ``has_files()``.
    """
    return any(resource.has_files() for resource in self.resources.values())
|
|
345
672
|
|
|
673
|
+
def hasfiles(self) -> bool:
    """Deprecated: use has_files() instead."""
    import warnings

    message = "hasfiles() is deprecated, use has_files()"
    # stacklevel=2 attributes the warning to the caller of hasfiles()
    warnings.warn(message, DeprecationWarning, stacklevel=2)
    return self.has_files()
|
|
683
|
+
|
|
346
684
|
|
|
347
685
|
# --- Annotations
|
|
348
686
|
|
|
@@ -358,9 +696,9 @@ class DataAnnotation:
|
|
|
358
696
|
self.annotate(object.__datamaestro__)
|
|
359
697
|
else:
|
|
360
698
|
# With configuration objects, add a __datamaestro__ member to the class
|
|
361
|
-
assert issubclass(
|
|
362
|
-
object
|
|
363
|
-
)
|
|
699
|
+
assert issubclass(object, Config), (
|
|
700
|
+
f"{object} cannot be annotated (only dataset or data definitions)"
|
|
701
|
+
)
|
|
364
702
|
if "__datamaestro__" not in object.__dict__:
|
|
365
703
|
object.__datamaestro__ = AbstractData()
|
|
366
704
|
self.annotate(object.__datamaestro__)
|
|
@@ -375,10 +713,16 @@ class DatasetAnnotation:
|
|
|
375
713
|
"""Base class for all annotations"""
|
|
376
714
|
|
|
377
715
|
def __call__(self, dataset: AbstractDataset):
    """Apply this annotation to a dataset and return the dataset unchanged.

    Args:
        dataset: Either an ``AbstractDataset`` instance (annotated
            directly) or a ``Dataset`` subclass (its ``__datamaestro__``
            member is annotated).

    Returns:
        The ``dataset`` argument, so that the annotation can be used as a
        decorator.

    Raises:
        RuntimeError: If ``dataset`` is neither of the above.
    """
    if isinstance(dataset, AbstractDataset):
        self.annotate(dataset)
    # issubclass() raises TypeError for non-classes: check first so that
    # invalid objects reach the explicit error below.
    # NOTE(review): in the visible part of this module `Dataset` is only
    # imported under TYPE_CHECKING — confirm it is bound at runtime.
    elif inspect.isclass(dataset) and issubclass(dataset, Dataset):
        self.annotate(dataset.__datamaestro__)
    else:
        raise RuntimeError(
            f"Only datasets can be annotated with {self}, "
            f"but {dataset} is not a dataset"
        )

    return dataset
|
|
383
727
|
|
|
384
728
|
def annotate(self, dataset: AbstractDataset):
|
|
@@ -425,54 +769,47 @@ def DataTagging(f):
|
|
|
425
769
|
datatags = DataTagging(lambda d: d.tags)
|
|
426
770
|
datatasks = DataTagging(lambda d: d.tasks)
|
|
427
771
|
|
|
428
|
-
# T = TypeVar("T")
|
|
429
|
-
# def data(description=None):
|
|
430
|
-
# """Deprecated: simply deriving from Base data is enough"""
|
|
431
|
-
# if description is not None and not isinstance(description, str):
|
|
432
|
-
# raise RuntimeError("@data annotation should be written @data()")
|
|
433
|
-
|
|
434
|
-
# def annotate(t: T):
|
|
435
|
-
# try:
|
|
436
|
-
# object.__getattribute__(t, "__datamaestro__")
|
|
437
|
-
# logging.warning("@data should only be called once")
|
|
438
|
-
# except AttributeError:
|
|
439
|
-
# pass
|
|
440
|
-
|
|
441
|
-
# # Determine the data type
|
|
442
|
-
# from experimaestro import config
|
|
443
772
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
773
|
+
class metadata:
|
|
774
|
+
def __init__(
|
|
775
|
+
self,
|
|
776
|
+
tags: Union[str, List[str]] = None,
|
|
777
|
+
tasks: Union[str, List[str]] = None,
|
|
778
|
+
):
|
|
779
|
+
pass
|
|
448
780
|
|
|
449
|
-
|
|
450
|
-
#
|
|
451
|
-
|
|
452
|
-
# )
|
|
453
|
-
# t = config(identifier)(t)
|
|
454
|
-
# t.__datamaestro__ = DataDefinition(repository, t)
|
|
455
|
-
|
|
456
|
-
# return t
|
|
457
|
-
|
|
458
|
-
# return annotate
|
|
781
|
+
def __call__(self, object: type):
|
|
782
|
+
# FIXME: todo
|
|
783
|
+
return object
|
|
459
784
|
|
|
460
785
|
|
|
461
786
|
class dataset:
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
787
|
+
"""Dataset decorator
|
|
788
|
+
|
|
789
|
+
Meta-datasets are not associated with any base type.
|
|
790
|
+
|
|
791
|
+
:param base: The base type (or None if inferred from type annotation).
|
|
792
|
+
:param timestamp: If the dataset evolves, specify its timestamp.
|
|
793
|
+
:param id: Gives the full ID of the dataset if it contains a '.',
|
|
794
|
+
the last component if not containing a '.', or the last components
|
|
795
|
+
if starting with '.'
|
|
796
|
+
:param url: The URL associated with the dataset.
|
|
797
|
+
:param size: The size of the dataset (should be a parsable format).
|
|
798
|
+
:param doi: The DOI of the corresponding paper.
|
|
799
|
+
:param as_prepare: Resources are setup within the method itself
|
|
800
|
+
"""
|
|
469
801
|
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
802
|
+
def __init__(
|
|
803
|
+
self,
|
|
804
|
+
base=None,
|
|
805
|
+
*,
|
|
806
|
+
timestamp: str | None = None,
|
|
807
|
+
id: None | str = None,
|
|
808
|
+
url: None | str = None,
|
|
809
|
+
size: None | int | str = None,
|
|
810
|
+
doi: None | str = None,
|
|
811
|
+
as_prepare: bool = False,
|
|
812
|
+
):
|
|
476
813
|
if hasattr(base, "__datamaestro__") and isinstance(
|
|
477
814
|
base.__datamaestro__, metadataset
|
|
478
815
|
):
|
|
@@ -485,24 +822,46 @@ class dataset:
|
|
|
485
822
|
self.meta = False
|
|
486
823
|
self.timestamp = timestamp
|
|
487
824
|
self.size = size
|
|
825
|
+
self.doi = doi
|
|
826
|
+
self.as_prepare = as_prepare
|
|
488
827
|
|
|
489
828
|
def __call__(self, t):
|
|
829
|
+
from datamaestro.data import Base
|
|
830
|
+
|
|
490
831
|
try:
|
|
491
832
|
if self.base is None:
|
|
492
|
-
|
|
493
|
-
|
|
833
|
+
if inspect.isclass(t) and issubclass(t, Base):
|
|
834
|
+
self.base = t
|
|
835
|
+
else:
|
|
836
|
+
try:
|
|
837
|
+
# Get type from return annotation
|
|
838
|
+
return_type = t.__annotations__["return"]
|
|
839
|
+
if isinstance(return_type, _GenericAlias):
|
|
840
|
+
return_type = return_type.__origin__
|
|
841
|
+
self.base = return_type
|
|
842
|
+
except KeyError:
|
|
843
|
+
logging.warning("No return annotation in %s", t)
|
|
844
|
+
raise
|
|
494
845
|
object.__getattribute__(t, "__datamaestro__")
|
|
495
846
|
raise AssertionError("@data should only be called once")
|
|
496
847
|
except AttributeError:
|
|
497
848
|
pass
|
|
498
|
-
|
|
499
849
|
dw = DatasetWrapper(self, t)
|
|
850
|
+
t.__dataset__ = dw
|
|
851
|
+
|
|
852
|
+
# For class-based datasets, scan for Resource class attributes
|
|
853
|
+
if inspect.isclass(t) and issubclass(t, Base):
|
|
854
|
+
_bind_class_resources(t, dw)
|
|
855
|
+
return t
|
|
500
856
|
return dw
|
|
501
857
|
|
|
502
858
|
|
|
503
859
|
class metadataset(AbstractDataset):
|
|
504
|
-
"""Annotation for object/functions which are abstract dataset definitions
|
|
505
|
-
|
|
860
|
+
"""Annotation for object/functions which are abstract dataset definitions
|
|
861
|
+
|
|
862
|
+
i.e. shared by more than one real dataset. This is useful to share tags,
|
|
863
|
+
urls, etc.
|
|
864
|
+
"""
|
|
506
865
|
|
|
507
866
|
def __init__(self, base):
|
|
508
867
|
super().__init__(None)
|
|
@@ -516,3 +875,5 @@ class metadataset(AbstractDataset):
|
|
|
516
875
|
pass
|
|
517
876
|
t.__datamaestro__ = self
|
|
518
877
|
return t
|
|
878
|
+
|
|
879
|
+
_prepare = None
|