datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/download/__init__.py
CHANGED
|
@@ -1,9 +1,116 @@
|
|
|
1
|
-
|
|
1
|
+
"""Resource system for dataset download and processing pipelines.
|
|
2
|
+
|
|
3
|
+
This module defines the Resource interface and its concrete subclasses
|
|
4
|
+
(FileResource, FolderResource, ValueResource) for managing dataset
|
|
5
|
+
download and preprocessing steps as a directed acyclic graph (DAG).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import shutil
|
|
13
|
+
import warnings
|
|
14
|
+
from abc import ABC, abstractmethod
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import IO, Union
|
|
18
|
+
|
|
19
|
+
from attrs import define
|
|
20
|
+
|
|
21
|
+
from datamaestro.definitions import AbstractDataset, DatasetAnnotation
|
|
2
22
|
from datamaestro.utils import deprecated
|
|
3
23
|
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# Module-level deprecation tracking (emit each category only once)
_deprecation_warned: set[str] = set()


def _warn_once(category: str, message: str, *, stacklevel: int = 3) -> None:
    """Emit a ``DeprecationWarning`` at most once per category.

    Args:
        category: Key identifying the deprecation. Once a warning has been
            emitted for a key, later calls with the same key are no-ops.
        message: Text of the warning.
        stacklevel: Forwarded to ``warnings.warn``. The default of 3
            attributes the warning to the caller of the deprecated function
            that invoked ``_warn_once``; pass a larger value when the call
            goes through additional wrapper frames.
    """
    if category in _deprecation_warned:
        return
    # Record the category before warning so that a filter turning the
    # warning into an exception does not cause it to be re-emitted later.
    _deprecation_warned.add(category)
    warnings.warn(message, DeprecationWarning, stacklevel=stacklevel)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# --- State metadata file helpers ---
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ResourceStateFile:
    """Manages the .state.json metadata file for resource states.

    Location: <dataset.datapath>/.state.json

    Format:
    {
      "version": 1,
      "resources": {
        "RESOURCE_NAME": {"state": "none"|"partial"|"complete"},
        ...
      }
    }
    """

    VERSION = 1

    def __init__(self, datapath: Path):
        """
        Args:
            datapath: Directory holding the dataset data; the state file
                is stored directly inside it.
        """
        self._path = datapath / ".state.json"

    def read(self, resource_name: str) -> "ResourceState":
        """Read the state for a resource. Returns NONE if not found.

        Raises:
            ValueError: If the stored state string is not a valid
                ``ResourceState`` value.
        """
        entry = self._load().get("resources", {}).get(resource_name)
        if entry is None:
            return ResourceState.NONE
        return ResourceState(entry["state"])

    def write(self, resource_name: str, state: "ResourceState"):
        """Write the state for a resource (atomic write).

        The whole file is re-read, updated and rewritten; entries of other
        resources are preserved.
        """
        data = self._load()
        data.setdefault("resources", {})[resource_name] = {"state": state.value}
        self._save(data)

    def _load(self) -> dict:
        """Return the parsed state file, or an empty skeleton if absent."""
        if self._path.is_file():
            with self._path.open("r", encoding="utf-8") as f:
                return json.load(f)
        return {"version": self.VERSION, "resources": {}}

    def _save(self, data: dict):
        """Serialize ``data`` to a sibling temp file, then swap it in."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self._path.with_suffix(".tmp")
        with tmp.open("w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        # Path.replace overwrites an existing target on every platform;
        # Path.rename raises FileExistsError on Windows once the state file
        # exists, which would break every write after the first one.
        tmp.replace(self._path)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# --- ResourceState enum ---
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ResourceState(str, Enum):
    """Lifecycle state of a resource within the preparation pipeline.

    Members also subclass ``str``; their values are the strings stored
    under the ``"state"`` key of the metadata file.
    """

    NONE = "none"
    """Nothing has been started and no data is on disk."""

    PARTIAL = "partial"
    """Work was started but did not finish (error during download)."""

    COMPLETE = "complete"
    """The resource is fully available."""
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# --- Lazy initialization decorator (backward compat) ---
|
|
107
|
+
|
|
4
108
|
|
|
5
109
|
def initialized(method):
|
|
6
|
-
"""Ensure the object is initialized
|
|
110
|
+
"""Ensure the object is initialized (calls postinit on first use).
|
|
111
|
+
|
|
112
|
+
Deprecated: new Resource subclasses should not rely on this pattern.
|
|
113
|
+
"""
|
|
7
114
|
|
|
8
115
|
def wrapper(self, *args, **kwargs):
|
|
9
116
|
if not self._post:
|
|
@@ -14,42 +121,521 @@ def initialized(method):
|
|
|
14
121
|
return wrapper
|
|
15
122
|
|
|
16
123
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
124
|
+
# --- SetupOptions (backward compat) ---
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
@define(kw_only=True)
class SetupOptions:
    """Options object accepted by ``Resource.setup``.

    Declares no fields of its own.
    """
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# --- Resource base class ---
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class Resource(DatasetAnnotation, ABC):
    """Base class for all dataset resources.

    A resource represents a single step in a dataset preparation pipeline.
    Resources form a DAG: each resource declares its dependencies, and
    the orchestrator ensures they are processed in topological order.

    Usage modes:

    1. Class attribute (preferred)::

        @dataset(url="...")
        class MyDataset(Base):
            DATA = filedownloader("data.csv", "http://...", transient=True)
            PROCESSED = SomeProcessor.from_file(DATA)

    2. Decorator on function (deprecated, backward compat)::

        @filedownloader("data.csv", "http://...")
        @dataset(Base)
        def my_dataset(data): ...

    Two-path system:

    - ``transient_path``: where download/processing writes data
    - ``path``: final location after successful completion

    The framework moves data from ``transient_path`` → ``path`` and then
    marks the resource as COMPLETE. Subclass ``download()`` implementations
    should always write to ``transient_path``.

    State is persisted in a metadata file at::

        <dataset.datapath>/.state.json
    """

    def __init__(
        self,
        varname: str | None = None,
        *,
        transient: bool = False,
    ):
        """
        Args:
            varname: Explicit resource name. If None, auto-set from
                class attribute name during binding. Required when
                used as a decorator (backward compat mode).
            transient: If True, this resource's data can be deleted
                after all its dependents reach COMPLETE.
        """
        self.name: str | None = varname
        # Remember whether the name came from the caller, so that bind()
        # knows if it may overwrite it with the class attribute name.
        self._name_explicit: bool = varname is not None
        self.dataset: AbstractDataset | None = None
        self.transient: bool = transient
        self._dependencies: list[Resource] = []
        self._dependents: list[Resource] = []

        # Backward compat: lazy initialization support
        self._post = False

    # ---- Properties ----

    @property
    def can_recover(self) -> bool:
        """Whether partial downloads can be resumed.

        When True and state is PARTIAL, existing data at transient_path
        is preserved on error, allowing the next download() call to
        resume from where it left off.

        When False and state is PARTIAL, data at transient_path is
        deleted and state is reset to NONE.

        Default: False. Subclasses override to enable recovery.
        """
        return False

    @property
    def dependencies(self) -> list[Resource]:
        """Resources that must be COMPLETE before this one can process.

        Populated from constructor arguments. Subclasses with factory
        methods should pass dependency resources to ``__init__`` and
        store them in ``_dependencies``.
        """
        return self._dependencies

    @property
    def dependents(self) -> list[Resource]:
        """Resources that depend on this one (inverse of dependencies).

        Computed by the dataset after all resources are bound.
        Used for eager transient cleanup decisions.
        """
        return self._dependents

    @property
    def path(self) -> Path:
        """Final storage path for this resource's data.

        This is where data lives after successful completion.
        Default: ``dataset.datapath / self.name``

        Subclasses may override to customize (e.g., add file extension).
        """
        return self.dataset.datapath / self.name

    @property
    def transient_path(self) -> Path:
        """Temporary path where download/processing writes data.

        During download(), subclasses write to this path.
        After successful download, the framework moves the data from
        transient_path to path, then marks state as COMPLETE.

        Default: ``dataset.datapath / ".downloads" / self.name``
        """
        return self.dataset.datapath / ".downloads" / self.name

    @property
    def state(self) -> ResourceState:
        """Current state, read from the metadata file.

        Returns NONE when the resource is not bound to a dataset or when
        no metadata entry exists.
        """
        if self.dataset is None:
            return ResourceState.NONE
        state_file = ResourceStateFile(self.dataset.datapath)
        return state_file.read(self.name)

    @state.setter
    def state(self, value: ResourceState) -> None:
        """Update state in the metadata file (atomic write).

        The resource must already be bound to a dataset.
        """
        state_file = ResourceStateFile(self.dataset.datapath)
        state_file.write(self.name, value)

    @property
    def context(self):
        """Application context (from dataset)."""
        return self.dataset.context

    # ---- Abstract methods ----

    @abstractmethod
    def download(self, force: bool = False) -> None:
        """Execute this resource's download/processing step.

        Contract:

        - Called only when all dependencies are COMPLETE.
        - Must write output to ``self.transient_path``.
        - The framework handles moving transient_path → path
          and setting state to COMPLETE after this returns.
        - If force=True, re-execute even if already COMPLETE.

        Note: State management (COMPLETE/PARTIAL/NONE transitions,
        moving transient_path → path) is handled by the framework,
        NOT by the download() implementation.

        Raises:
            Exception: On download/processing failure. The framework
                will handle PARTIAL state based on can_recover.
        """
        ...

    @abstractmethod
    def prepare(self):
        """Return the value for dataset construction.

        Called after download() has completed (state is COMPLETE).
        Return type depends on the resource subclass:

        - FileResource → Path
        - FolderResource → Path
        - ValueResource → resource-specific

        For backward compat with function-based datasets, this value
        is passed as a keyword argument to the dataset function.
        """
        ...

    # ---- Concrete methods ----

    def cleanup(self) -> None:
        """Remove this resource's data from disk.

        Called automatically for transient resources after all
        dependents reach COMPLETE (eager cleanup).

        Default implementation:

        - Deletes self.path (file, directory or symbolic link)
        - Deletes self.transient_path if it exists
        - Sets self.state = NONE

        Symbolic links are removed themselves; their targets are left
        untouched.

        Subclasses may override for custom cleanup.
        """
        for target in (self.path, self.transient_path):
            if target.is_symlink():
                # shutil.rmtree refuses symbolic links, and exists() is
                # False for dangling ones: remove the link itself.
                target.unlink()
            elif target.is_dir():
                shutil.rmtree(target)
            elif target.exists():
                target.unlink()
        self.state = ResourceState.NONE

    def has_files(self) -> bool:
        """Whether this resource produces files on disk.

        Returns False for reference-only resources (e.g., links
        to other datasets, in-memory values).
        Default: True.
        """
        return True

    # Backward compat alias
    def hasfiles(self) -> bool:
        """Deprecated: use has_files() instead."""
        _warn_once("hasfiles", "hasfiles() is deprecated, use has_files()")
        return self.has_files()

    def postinit(self):
        """Legacy lazy initialization hook.

        Deprecated: new Resource subclasses should perform
        initialization in ``__init__`` or ``bind()``.
        """
        pass

    # ---- Binding ----

    def bind(self, name: str, dataset: AbstractDataset) -> None:
        """Bind this resource to a dataset.

        Called by the dataset class machinery during initialization.
        Sets self.name (if not explicitly set via varname) and
        self.dataset. Registers the resource in dataset.resources
        and dataset.ordered_resources.

        For class-based datasets: called by ``@dataset`` when it
        processes class attributes.
        For decorator-based: called by ``annotate()`` (existing protocol).

        Raises:
            AssertionError: If the resource is already bound, or if the
                name is already registered in ``dataset.resources``. The
                resource is left unmodified in both cases.
        """
        effective_name = self.name if self._name_explicit else name

        # Validate before mutating anything, and raise explicitly so the
        # checks still run when Python strips ``assert`` statements (-O).
        if self.dataset is not None:
            raise AssertionError(
                f"Resource {effective_name} is already bound to a dataset"
            )
        if effective_name in dataset.resources:
            raise AssertionError(
                f"Name {effective_name} already declared as a resource"
            )

        self.name = effective_name
        dataset.resources[effective_name] = self
        dataset.ordered_resources.append(self)
        self.dataset = dataset

    def annotate(self, dataset: AbstractDataset) -> None:
        """Register with a dataset (DatasetAnnotation protocol).

        Deprecated for new code. Calls bind() internally.
        """
        _warn_once(
            "annotate",
            "Using resources as decorators is deprecated. "
            "Define them as class attributes instead.",
        )
        self.bind(self.name, dataset)

    def contextualize(self):
        """When using an annotation inline, uses the current
        dataset wrapper object.

        Deprecated: use class-attribute resource definitions instead.
        """
        wrapper = AbstractDataset.processing()
        self.annotate(wrapper)

    def setup(
        self,
        dataset: AbstractDataset,
        options: SetupOptions | None = None,
    ):
        """Direct way to setup the resource (no annotation).

        Deprecated: use class-attribute resource definitions instead.

        Args:
            dataset: Dataset the resource is applied to.
            options: Accepted for backward compatibility; not used here.

        Returns:
            The value returned by ``prepare()``.
        """
        # Calling the instance relies on a ``__call__`` defined outside
        # this class (presumably by DatasetAnnotation — TODO confirm).
        self(dataset)
        return self.prepare()

    # ---- Factory pattern ----

    @classmethod
    def apply(cls, *args, **kwargs) -> "Resource":
        """Factory classmethod for creating resource instances.

        Allows defining shorthand factory functions::

            filedownloader = FileDownloader.apply

        Default implementation: ``return cls(*args, **kwargs)``
        Subclasses may override for custom argument handling.
        """
        return cls(*args, **kwargs)

    # ---- Backward compat: definition property ----

    @property
    def definition(self) -> AbstractDataset | None:
        """Deprecated: use ``dataset`` attribute instead."""
        _warn_once(
            "definition",
            "Resource.definition is deprecated, use Resource.dataset",
        )
        return self.dataset

    # Backward compat: varname property
    @property
    def varname(self) -> str | None:
        """Deprecated: use ``name`` attribute instead."""
        _warn_once(
            "varname",
            "Resource.varname is deprecated, use Resource.name",
        )
        return self.name

    @varname.setter
    def varname(self, value: str | None):
        self.name = value
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
# --- FileResource ---
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class FileResource(Resource):
    """Resource whose output is one file on disk.

    Concrete subclasses provide ``_download()``, which must create the
    file at the destination it receives (``self.transient_path``).
    """

    def __init__(
        self,
        filename: str,
        *,
        varname: str | None = None,
        transient: bool = False,
    ):
        """
        Args:
            filename: Name of the produced file, extension included; it is
                the last component of both storage paths.
            varname: Explicit resource name. When omitted, the name
                defaults to ``filename`` truncated at its first dot and
                may later be replaced by the class attribute name.
            transient: See Resource.
        """
        import re

        if varname:
            resolved_name = varname
        else:
            # Drop everything from the first dot onwards ("a.tar.gz" -> "a")
            resolved_name = re.sub(r"\..*$", "", filename)

        super().__init__(varname=resolved_name, transient=transient)
        self.filename = filename
        # A name derived from the filename is only a fallback: flag the
        # name as explicit solely when the caller supplied ``varname``.
        self._name_explicit = varname is not None

    @property
    def path(self) -> Path:
        """Final location of the file: ``dataset.datapath / self.filename``."""
        root = self.dataset.datapath
        return root / self.filename

    @property
    def transient_path(self) -> Path:
        """Location written to while downloading.

        ``dataset.datapath / ".downloads" / self.filename``
        """
        downloads_dir = self.dataset.datapath / ".downloads"
        return downloads_dir / self.filename

    def prepare(self) -> Path:
        """Return the final file path (``self.path``)."""
        return self.path

    def stream(self) -> IO[bytes] | None:
        """Return a readable byte stream over the file content.

        The base implementation returns None, meaning streaming is not
        supported; subclasses may override so that downstream resources
        can consume the data before the file is fully written to disk.
        """
        return None

    def download(self, force: bool = False) -> None:
        """Produce the file by calling ``_download(self.transient_path)``."""
        target = self.transient_path
        self._download(target)

    @abstractmethod
    def _download(self, destination: Path) -> None:
        """Subclass hook: create the file at ``destination``.

        Args:
            destination: Path the file must be written to
                (``self.transient_path``).
        """
        ...
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
# --- FolderResource ---
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
class FolderResource(Resource):
    """Resource whose output is a directory on disk.

    Concrete subclasses provide ``_download()``, which must fill the
    directory at the destination it receives (``self.transient_path``).
    """

    @property
    def path(self) -> Path:
        """Final location of the directory: ``dataset.datapath / self.name``."""
        root = self.dataset.datapath
        return root / self.name

    @property
    def transient_path(self) -> Path:
        """Location written to while downloading.

        ``dataset.datapath / ".downloads" / self.name``
        """
        downloads_dir = self.dataset.datapath / ".downloads"
        return downloads_dir / self.name

    def prepare(self) -> Path:
        """Return the final directory path (``self.path``)."""
        return self.path

    def download(self, force: bool = False) -> None:
        """Populate the directory by calling ``_download(self.transient_path)``."""
        target = self.transient_path
        self._download(target)

    @abstractmethod
    def _download(self, destination: Path) -> None:
        """Subclass hook: fill the directory at ``destination``.

        Args:
            destination: Path to write to (``self.transient_path``).
        """
        ...
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
# --- ValueResource ---
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
class ValueResource(Resource):
    """Resource that yields an in-memory value rather than files.

    Intended for things such as HuggingFace dataset handles, which have
    no local files. The ``transient_path``/``path`` pair is unused; state
    is still tracked through the metadata file.
    """

    def has_files(self) -> bool:
        """Always False: nothing is written to disk."""
        return False

    @abstractmethod
    def prepare(self):
        """Return the in-memory value."""
        ...
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
# --- Deprecated compatibility classes ---
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
class Download(Resource):
    """Deprecated: use Resource instead."""

    def __init_subclass__(cls, **kwargs):
        """Warn (once per subclass name) that ``Download`` is deprecated.

        Class keyword arguments are forwarded up the MRO so that
        cooperative ``__init_subclass__`` hooks of other base classes
        keep working.
        """
        _warn_once(
            f"Download-{cls.__name__}",
            f"Download is deprecated ({cls}): use `Resource`",
        )
        return super().__init_subclass__(**kwargs)
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
# --- reference resource ---
|
|
44
626
|
|
|
45
|
-
def download(self, force=False):
|
|
46
|
-
"""Downloads the content"""
|
|
47
|
-
raise NotImplementedError()
|
|
48
627
|
|
|
628
|
+
class reference(Resource):
|
|
629
|
+
"""References another dataset instead of downloading."""
|
|
49
630
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
631
|
+
def __init__(self, varname=None, reference=None):
|
|
632
|
+
"""
|
|
633
|
+
Args:
|
|
634
|
+
varname: The name of the variable.
|
|
635
|
+
reference: Another dataset to reference.
|
|
636
|
+
"""
|
|
637
|
+
super().__init__(varname=varname)
|
|
638
|
+
assert reference is not None, "Reference cannot be null"
|
|
53
639
|
self.reference = reference
|
|
54
640
|
|
|
55
641
|
def prepare(self):
|
|
@@ -61,7 +647,7 @@ class reference(Download):
|
|
|
61
647
|
def download(self, force=False):
|
|
62
648
|
self.reference.__datamaestro__.download(force)
|
|
63
649
|
|
|
64
|
-
def
|
|
650
|
+
def has_files(self):
|
|
65
651
|
# We don't really have files
|
|
66
652
|
return False
|
|
67
653
|
|