datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as published to a supported registry. The information is provided for informational purposes only.
- datamaestro/__init__.py +1 -2
- datamaestro/__main__.py +11 -7
- datamaestro/commands/site.py +16 -5
- datamaestro/context.py +32 -16
- datamaestro/data/ml.py +1 -0
- datamaestro/definitions.py +246 -20
- datamaestro/download/__init__.py +583 -40
- datamaestro/download/archive.py +120 -76
- datamaestro/download/custom.py +38 -6
- datamaestro/download/huggingface.py +46 -14
- datamaestro/download/links.py +106 -49
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +111 -54
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +3 -3
- datamaestro/record.py +48 -2
- datamaestro/settings.py +2 -1
- datamaestro/sphinx.py +1 -3
- datamaestro/stream/lines.py +8 -6
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/conftest.py +1 -2
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +7 -6
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -21
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- datamaestro-1.5.0.dist-info/RECORD +0 -48
- datamaestro-1.5.0.dist-info/top_level.txt +0 -1
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
- {datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0
datamaestro/__init__.py
CHANGED
```diff
@@ -7,7 +7,6 @@ from .context import (
     prepare_dataset,
 )
 
-from pkg_resources import get_distribution, DistributionNotFound
 from .definitions import dataset, metadata
 from .data import Base
-from .version import
+from .version import __version__
```
datamaestro/__main__.py
CHANGED
```diff
@@ -1,20 +1,22 @@
 #!/usr/bin/env python3
-#
+# ruff: noqa: T201
 
+from importlib.metadata import entry_points
 import sys
 import logging
 from functools import update_wrapper
 import traceback as tb
-import pkg_resources
 import re
 from pathlib import Path
 import shutil
-from .context import Context
 from typing import Set
-import
+from urllib.parse import urlparse
 
 import click
 
+import datamaestro
+from .context import Context
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -38,7 +40,7 @@ def pass_cfg(f):
 # Get all the available repositories
 
 REPOSITORIES = {}
-for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+for entry_point in entry_points(group="datamaestro.repositories"):
     REPOSITORIES[entry_point.name] = entry_point
 
 
@@ -60,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
     "--traceback", is_flag=True, help="Display traceback if an exception occurs"
 )
 @click.option(
-    "--data",
+    "--data",
+    type=Path,
+    help="Directory containing datasets",
+    default=Context.MAINDIR,
 )
 @click.pass_context
 def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -207,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
 # --- Create a dataset
 
 DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
-from urllib.parse import urlparse
 
 
 def dataset_id_check(ctx, param, value):
```
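The `--data` option now declares `type=Path` with a default taken from `Context.MAINDIR`. Click accepts any callable as a `type` and uses it to convert the argument string, so passing `pathlib.Path` directly works. A self-contained sketch of the same pattern (the default directory below is illustrative, not datamaestro's actual value):

```python
from pathlib import Path
import click

@click.command()
@click.option(
    "--data",
    type=Path,  # click calls Path() on the argument string
    help="Directory containing datasets",
    default=Path.home() / "datamaestro",  # illustrative default
)
def cli(data: Path):
    click.echo(f"Using data directory: {data}")

if __name__ == "__main__":
    cli()
```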
datamaestro/commands/site.py
CHANGED
```diff
@@ -159,7 +159,10 @@ def document(match):
     try:
         object = getattr(module, name)
     except Exception:
-        return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+        return "<div class='error'>Cannot find %s in %s</div>" % (
+            name,
+            modulename,
+        )
 
     if ismodule(object):
         return "\n\n".join(
@@ -220,7 +223,12 @@ class Classification:
             module = Datasets(importlib.import_module(meta.t.__module__))
             r.write(
                 "- [%s](../df/%s/%s.html#%s)\n"
-                % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+                % (
+                    meta.name or meta.id,
+                    meta.repository.id,
+                    module.id,
+                    meta.id,
+                )
             )
 
         return r.getvalue()
@@ -326,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
         import shutil
 
         path = Path(config["site_dir"]) / "mainstyle.css"
-        with importlib.resources.open_binary(
-            "datamaestro.commands", "mainstyle.css"
-        ) as source, path.open("wb") as dest:
+        with (
+            importlib.resources.open_binary(
+                "datamaestro.commands", "mainstyle.css"
+            ) as source,
+            path.open("wb") as dest,
+        ):
             shutil.copyfileobj(source, dest)
 
     def on_files(self, files, config):
```
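The stylesheet copy now uses a parenthesized multi-item `with` statement, which is only valid syntax on Python 3.10+ (older interpreters need the managers on one logical line or `contextlib.ExitStack`). A minimal sketch of the same pattern:

```python
from pathlib import Path

src = Path("source.bin")
src.write_bytes(b"hello")

# Parenthesized context managers (Python 3.10+)
with (
    src.open("rb") as source,
    Path("dest.bin").open("wb") as dest,
):
    dest.write(source.read())
```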
datamaestro/context.py
CHANGED
```diff
@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Iterable, Iterator, Dict, Union
+from typing import Iterable, Iterator, Dict, Optional, Union
 import importlib
 import os
 import hashlib
@@ -8,8 +8,7 @@ import inspect
 import json
 from abc import ABC, abstractmethod
 from experimaestro import Config
-import pkg_resources
-from experimaestro.compat import cached_property
+from functools import cached_property
 from experimaestro.mkdocs.metaloader import Module
 from .utils import CachedFile, downloadURL
 from .settings import UserSettings, Settings
@@ -18,6 +17,22 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from datamaestro.definitions import AbstractDataset, DatasetWrapper
 
+from importlib.metadata import (
+    entry_points as _entry_points,
+    version as _version,
+    PackageNotFoundError as _PackageNotFoundError,
+)
+
+
+def iter_entry_points(group, name=None):
+    """Yield entry points for a given group (and optional name) using importlib.metadata."""
+    eps = _entry_points()
+    selected = eps.select(group=group)
+    if name:
+        selected = [ep for ep in selected if ep.name == name]
+    for ep in selected:
+        yield ep
+
 
 class Compression:
     @staticmethod
```
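The `iter_entry_points` shim keeps the old `pkg_resources.iter_entry_points(group, name)` calling convention on top of `importlib.metadata`; note that `EntryPoints.select` requires Python 3.10+ (or the `importlib_metadata` backport). A sketch of how the two APIs line up:

```python
from importlib.metadata import entry_points

# Old (pkg_resources, removed in this release):
#   for ep in pkg_resources.iter_entry_points("datamaestro.repositories"):
#       repository = ep.load()

# New (importlib.metadata, Python 3.10+):
for ep in entry_points(group="datamaestro.repositories"):
    repository = ep.load()  # imports and returns the referenced object
    print(ep.name, repository)
```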
```diff
@@ -106,7 +121,7 @@ class Context:
 
     def repositories(self) -> Iterable["Repository"]:
         """Returns an iterator over repositories"""
-        for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+        for entry_point in iter_entry_points("datamaestro.repositories"):
             yield entry_point.load().instance()
 
     def repository(self, repositoryid):
@@ -114,10 +129,7 @@
             return None
 
         entry_points = [
-            x
-            for x in pkg_resources.iter_entry_points(
-                "datamaestro.repositories", repositoryid
-            )
+            x for x in iter_entry_points("datamaestro.repositories", repositoryid)
         ]
         if not entry_points:
             raise Exception("No datasets repository named %s", repositoryid)
```
```diff
@@ -299,8 +311,7 @@ class BaseRepository(ABC):
         self.basedir = Path(p).parent
 
     @abstractmethod
-    def __iter__(self) -> Iterator["AbstractDataset"]:
-        ...
+    def __iter__(self) -> Iterator["AbstractDataset"]: ...
 
     def search(self, name: str):
         """Search for a dataset in the definitions"""
@@ -353,11 +364,9 @@ class Repository(BaseRepository):
 
     @classmethod
     def version(cls):
-        from pkg_resources import get_distribution, DistributionNotFound
-
         try:
-            return get_distribution(cls.__module__).version
-        except DistributionNotFound:
+            return _version(cls.__module__)
+        except _PackageNotFoundError:
             return None
 
     def __repr__(self):
```
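`Repository.version` now resolves the installed distribution's version with `importlib.metadata.version`, returning `None` when the distribution is not installed. The same lookup in isolation (the package name is illustrative):

```python
from importlib.metadata import version, PackageNotFoundError

def package_version(name: str):
    """Installed version of a distribution, or None if absent."""
    try:
        return version(name)
    except PackageNotFoundError:
        return None

print(package_version("datamaestro"))  # e.g. "1.7.0", or None
```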
```diff
@@ -423,16 +432,23 @@ def find_dataset(dataset_id: str):
     return AbstractDataset.find(dataset_id)
 
 
-def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
+def prepare_dataset(
+    dataset_id: Union[str, "DatasetWrapper", Config],
+    context: Optional[Union[Context, Path]] = None,
+):
     """Find a dataset given its id and download the resources"""
     from .definitions import AbstractDataset, DatasetWrapper
 
+    match context:
+        case Path() | str():
+            context = Context(Path(context))
+
     if isinstance(dataset_id, DatasetWrapper):
         ds = dataset_id
     elif isinstance(dataset_id, Config):
         ds = dataset_id.__datamaestro_dataset__
     else:
-        ds = AbstractDataset.find(dataset_id)
+        ds = AbstractDataset.find(dataset_id, context=context)
 
     return ds.prepare(download=True)
 
```
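`prepare_dataset` now accepts an optional `context`, which may be a `Context`, a `Path`, or a string path; the `match` statement normalizing it makes this code path Python 3.10+. A hedged usage sketch (the dataset id is illustrative):

```python
from pathlib import Path
from datamaestro import prepare_dataset

# Default context (data directory from user settings)
ds = prepare_dataset("com.example.mydataset")  # illustrative id

# Explicit data directory: a Path or string is wrapped in a Context
ds = prepare_dataset("com.example.mydataset", context=Path("/data/datamaestro"))
```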
datamaestro/data/ml.py
CHANGED
datamaestro/definitions.py
CHANGED
```diff
@@ -2,8 +2,11 @@
 # Main datamaestro functions and data models
 #
 
+from __future__ import annotations
+
 import logging
 import inspect
+import shutil
 from pathlib import Path
 from itertools import chain
 from abc import ABC, abstractmethod
@@ -21,8 +24,6 @@ from typing import (
     _GenericAlias,
 )
 from experimaestro import (  # noqa: F401 (re-exports)
-    argument,
-    constant,
     Param,
     Option,
     Config,
```
```diff
@@ -34,7 +35,100 @@ from experimaestro.core.types import Type  # noqa: F401 (re-exports)
 if TYPE_CHECKING:
     from .data import Base, Dataset
     from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
-    from datamaestro.download import Download
+    from datamaestro.download import Download, Resource
+
+# --- DAG utilities ---
+
+
+def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+    """Topological sort of resources by their dependencies.
+
+    Args:
+        resources: Dict mapping resource names to Resource instances.
+
+    Returns:
+        List of resources in dependency order (dependencies first).
+
+    Raises:
+        ValueError: If a cycle is detected in the dependency graph.
+    """
+    visited: set[str] = set()
+    visiting: set[str] = set()  # For cycle detection
+    result: list["Resource"] = []
+
+    def visit(resource: "Resource"):
+        if resource.name in visited:
+            return
+        if resource.name in visiting:
+            raise ValueError(
+                f"Cycle detected in resource dependencies involving {resource.name}"
+            )
+
+        visiting.add(resource.name)
+        for dep in resource.dependencies:
+            visit(dep)
+        visiting.discard(resource.name)
+        visited.add(resource.name)
+        result.append(resource)
+
+    for resource in resources.values():
+        visit(resource)
+
+    return result
+
+
+def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+    """Compute the dependents (inverse edges) for all resources."""
+    # Clear existing dependents
+    for resource in resources.values():
+        resource._dependents = []
+
+    # Build inverse edges
+    for resource in resources.values():
+        for dep in resource.dependencies:
+            if resource not in dep._dependents:
+                dep._dependents.append(resource)
+
+
+def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+    """Scan class attributes for Resource instances and bind them.
+
+    This is called when a class-based dataset is processed by the
+    @dataset decorator. It detects Resource instances defined as
+    class attributes and binds them to the dataset.
+
+    Args:
+        cls: The dataset class to scan.
+        dataset_wrapper: The AbstractDataset to bind resources to.
+    """
+    from datamaestro.download import Resource
+
+    for attr_name, attr_value in vars(cls).items():
+        if isinstance(attr_value, Resource):
+            attr_value.bind(attr_name, dataset_wrapper)
+
+    # Build the dependency DAG
+    _compute_dependents(dataset_wrapper.resources)
+
+    # Validate: topological sort will raise on cycles
+    dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+def _delete_path(path: Path) -> None:
+    """Delete a file or directory at path."""
+    if path.exists():
+        if path.is_dir():
+            shutil.rmtree(path)
+        else:
+            path.unlink()
+
+
+def _move_path(src: Path, dst: Path) -> None:
+    """Move a file or directory from src to dst."""
+    if src.exists():
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(src), str(dst))
+
 
 # --- Objects holding information into classes/function
 
```
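`topological_sort` is a standard depth-first ordering with a `visiting` set for cycle detection. A demonstration using a minimal stand-in that exposes only the two attributes the function reads (`name` and `dependencies`); it assumes `topological_sort` from the patch above is in scope:

```python
from dataclasses import dataclass, field

@dataclass
class FakeResource:
    """Stand-in exposing the attributes topological_sort uses."""
    name: str
    dependencies: list = field(default_factory=list)

archive = FakeResource("archive")
extracted = FakeResource("extracted", dependencies=[archive])
index = FakeResource("index", dependencies=[extracted])

# topological_sort comes from the patched datamaestro/definitions.py above
order = topological_sort({"index": index, "archive": archive, "extracted": extracted})
print([r.name for r in order])  # ['archive', 'extracted', 'index']
```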
```diff
@@ -183,8 +277,7 @@ class AbstractDataset(AbstractData):
             self.hooks[hookname].append(hook)
 
     @abstractmethod
-    def _prepare(self) -> "Base":
-        ...
+    def _prepare(self) -> "Base": ...
 
     def format(self, encoder: str) -> str:
         s = self.prepare()
@@ -204,6 +297,14 @@ class AbstractDataset(AbstractData):
         from datamaestro.data import Base
 
         if isinstance(data, Base):
+            try:
+                if data.id:
+                    # There is already an ID, skip this
+                    # and the descendants
+                    return
+            except KeyError:
+                pass
+
             if self.repository is None:
                 data.id = id
             else:
```
```diff
@@ -213,25 +314,136 @@ class AbstractDataset(AbstractData):
                 self.setDataIDs(value, f"{id}.{key}")
 
     def download(self, force=False):
-        """Download all the necessary resources
-
+        """Download all the necessary resources.
+
+        Uses DAG-based topological ordering and the two-path system:
+        1. Acquire exclusive lock (.state.lock)
+        2. Resource writes to transient_path (under .downloads/)
+        3. Framework moves transient_path → path (main folder)
+        4. State marked COMPLETE
+        5. Transient dependencies cleaned up eagerly
+        6. .downloads/ directory removed after all resources complete
+        7. Release lock
+        """
+        import fcntl
+
+        from datamaestro.download import ResourceState
+
         self.prepare()
-        logging.info(
+        logging.info(
+            "Materializing %d resources",
+            len(self.ordered_resources),
+        )
+
+        self.datapath.mkdir(parents=True, exist_ok=True)
+        lock_path = self.datapath / ".state.lock"
+        lock_file = lock_path.open("w")
+        try:
+            fcntl.flock(lock_file, fcntl.LOCK_EX)
+            success = self._download_locked(force, ResourceState)
+        finally:
+            fcntl.flock(lock_file, fcntl.LOCK_UN)
+            lock_file.close()
+
+        return success
+
+    def _download_locked(self, force, ResourceState):
+        """Inner download logic, called while holding .state.lock."""
+        success = True
+
         for resource in self.ordered_resources:
+            # Step 1: Check state
+            current_state = resource.state
+
+            if current_state == ResourceState.COMPLETE and not force:
+                # Verify files are actually present on disk
+                if resource.has_files() and not resource.path.exists():
+                    logging.warning(
+                        "Resource %s marked COMPLETE but files "
+                        "missing at %s — re-downloading",
+                        resource.name,
+                        resource.path,
+                    )
+                    resource.state = ResourceState.NONE
+                    current_state = ResourceState.NONE
+                else:
+                    continue
+
+            # Adopt pre-existing files (old downloads without state file)
+            if (
+                current_state == ResourceState.NONE
+                and not force
+                and resource.has_files()
+                and resource.path.exists()
+            ):
+                logging.info(
+                    "Resource %s already exists at %s — marking COMPLETE",
+                    resource.name,
+                    resource.path,
+                )
+                resource.state = ResourceState.COMPLETE
+                continue
+
+            if current_state == ResourceState.PARTIAL:
+                if not resource.can_recover:
+                    _delete_path(resource.transient_path)
+                    resource.state = ResourceState.NONE
+
+            # Verify all dependencies are COMPLETE
+            for dep in resource.dependencies:
+                if dep.state != ResourceState.COMPLETE:
+                    logging.error(
+                        "Dependency %s of %s is not COMPLETE",
+                        dep.name,
+                        resource.name,
+                    )
+                    return False
+
+            # Step 2-4: Download with framework-managed state
             try:
-                resource.download(force)
+                resource.download(force=force)
+
+                # Move transient -> final, mark COMPLETE
+                if resource.has_files():
+                    _move_path(resource.transient_path, resource.path)
+                resource.state = ResourceState.COMPLETE
+
             except Exception:
                 logging.error("Could not download resource %s", resource)
                 traceback.print_exc()
+
+                # Handle PARTIAL state
+                if resource.has_files() and resource.transient_path.exists():
+                    if resource.can_recover:
+                        resource.state = ResourceState.PARTIAL
+                    else:
+                        _delete_path(resource.transient_path)
+                        resource.state = ResourceState.NONE
                 success = False
                 break
+
+            # Step 5: Eager transient cleanup
+            for dep in resource.dependencies:
+                if dep.transient and all(
+                    d.state == ResourceState.COMPLETE for d in dep.dependents
+                ):
+                    dep.cleanup()
+
+        # Step 6: Remove .downloads/ directory after success
+        if success:
+            downloads_dir = self.datapath / ".downloads"
+            if downloads_dir.is_dir():
+                shutil.rmtree(downloads_dir)
+
         return success
 
     @staticmethod
-    def find(name: str) -> "DataDefinition":
+    def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
         """Find a dataset given its name"""
         from datamaestro.context import Context  # noqa: F811
 
+        context = Context.instance() if context is None else context
+
         logging.debug("Searching dataset %s", name)
         for repository in Context.instance().repositories():
             logging.debug("Searching dataset %s in %s", name, repository)
```
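The new download path serializes concurrent invocations with an exclusive `fcntl.flock` on `.state.lock`, so two processes preparing the same dataset cannot interleave resource moves; `fcntl` makes this POSIX-only. The acquire/release pattern in isolation:

```python
import fcntl
from pathlib import Path

def with_exclusive_lock(lock_path: Path, fn):
    """Run fn() while holding an exclusive flock on lock_path (POSIX only)."""
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    lock_file = lock_path.open("w")
    try:
        fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks until the lock is free
        return fn()
    finally:
        fcntl.flock(lock_file, fcntl.LOCK_UN)
        lock_file.close()

result = with_exclusive_lock(Path("/tmp/demo/.state.lock"), lambda: "critical work")
print(result)
```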
```diff
@@ -303,8 +515,7 @@ class DatasetWrapper(AbstractDataset):
         # Computes an ID
         assert (
             # id is empty string = use the module id
-            components[0]
-            == "config"
+            components[0] == "config"
         ), (
             "A @dataset without `id` should be in the "
             f".config module (not {t.__module__})"
```
```diff
@@ -452,14 +663,24 @@ class DatasetWrapper(AbstractDataset):
 
         return path
 
-    def hasfiles(self) -> bool:
-        """Returns whether this dataset has files or only includes references"""
+    def has_files(self) -> bool:
+        """Returns whether this dataset has files or only includes references."""
         for resource in self.resources.values():
-            if resource.hasfiles():
+            if resource.has_files():
                 return True
-
         return False
 
+    def hasfiles(self) -> bool:
+        """Deprecated: use has_files() instead."""
+        import warnings
+
+        warnings.warn(
+            "hasfiles() is deprecated, use has_files()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.has_files()
+
 
     # --- Annotations
 
```
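The old spelling survives as a thin shim; `stacklevel=2` attributes the `DeprecationWarning` to the caller rather than to the shim itself. The same pattern in isolation:

```python
import warnings

def new_api():
    return 42

def old_api():
    """Deprecated: use new_api() instead."""
    warnings.warn("old_api() is deprecated, use new_api()", DeprecationWarning, stacklevel=2)
    return new_api()

warnings.simplefilter("always")
print(old_api())  # warning points at this call site, not at old_api's body
```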
```diff
@@ -475,9 +696,9 @@ class DataAnnotation:
             self.annotate(object.__datamaestro__)
         else:
             # With configuration objects, add a __datamaestro__ member to the class
-            assert issubclass(
-                object, Config
-            ), f"{object} cannot be annotated (only dataset or data definitions)"
+            assert issubclass(object, Config), (
+                f"{object} cannot be annotated (only dataset or data definitions)"
+            )
             if "__datamaestro__" not in object.__dict__:
                 object.__datamaestro__ = AbstractData()
             self.annotate(object.__datamaestro__)
@@ -551,7 +772,9 @@ datatasks = DataTagging(lambda d: d.tasks)
 
 class metadata:
     def __init__(
-        self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+        self,
+        tags: Union[str, List[str]] = None,
+        tasks: Union[str, List[str]] = None,
     ):
         pass
 
```
```diff
@@ -625,7 +848,10 @@ class dataset:
             pass
         dw = DatasetWrapper(self, t)
         t.__dataset__ = dw
+
+        # For class-based datasets, scan for Resource class attributes
         if inspect.isclass(t) and issubclass(t, Base):
+            _bind_class_resources(t, dw)
             return t
         return dw
 
```
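With `_bind_class_resources`, a class-based dataset can declare its resources as plain class attributes; the `@dataset` decorator scans `vars(cls)`, binds each `Resource` under its attribute name, and orders the resulting DAG. A sketch of just the scan-and-bind mechanism with stand-in classes (these are not datamaestro's real `Resource`/wrapper types):

```python
class StubResource:
    """Stand-in for datamaestro.download.Resource: only name/bind."""
    def __init__(self):
        self.name = None

    def bind(self, name, wrapper):
        self.name = name
        wrapper.resources[name] = self

class StubWrapper:
    """Stand-in for the dataset wrapper holding bound resources."""
    def __init__(self):
        self.resources = {}

class MyDataset:
    archive = StubResource()
    files = StubResource()

wrapper = StubWrapper()
for attr_name, attr_value in vars(MyDataset).items():
    if isinstance(attr_value, StubResource):
        attr_value.bind(attr_name, wrapper)

print(sorted(wrapper.resources))  # ['archive', 'files']
```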