datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/download/__init__.py
@@ -1,9 +1,116 @@
-from datamaestro.definitions import AbstractDataset, DatasetAnnotation, DatasetWrapper
+"""Resource system for dataset download and processing pipelines.
+
+This module defines the Resource interface and its concrete subclasses
+(FileResource, FolderResource, ValueResource) for managing dataset
+download and preprocessing steps as a directed acyclic graph (DAG).
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import shutil
+import warnings
+from abc import ABC, abstractmethod
+from enum import Enum
+from pathlib import Path
+from typing import IO, Union
+
+from attrs import define
+
+from datamaestro.definitions import AbstractDataset, DatasetAnnotation
 from datamaestro.utils import deprecated
 
+logger = logging.getLogger(__name__)
+
+# Module-level deprecation tracking (emit each category only once)
+_deprecation_warned: set[str] = set()
+
+
+def _warn_once(category: str, message: str):
+    """Emit a deprecation warning only once per category."""
+    if category not in _deprecation_warned:
+        _deprecation_warned.add(category)
+        warnings.warn(message, DeprecationWarning, stacklevel=3)
+
+
+# --- State metadata file helpers ---
+
+
+class ResourceStateFile:
+    """Manages the .state.json metadata file for resource states.
+
+    Location: <dataset.datapath>/.state.json
+
+    Format:
+        {
+            "version": 1,
+            "resources": {
+                "RESOURCE_NAME": {"state": "none"|"partial"|"complete"},
+                ...
+            }
+        }
+    """
+
+    VERSION = 1
+
+    def __init__(self, datapath: Path):
+        self._path = datapath / ".state.json"
+
+    def read(self, resource_name: str) -> "ResourceState":
+        """Read the state for a resource. Returns NONE if not found."""
+        data = self._load()
+        entry = data.get("resources", {}).get(resource_name)
+        if entry is None:
+            return ResourceState.NONE
+        return ResourceState(entry["state"])
+
+    def write(self, resource_name: str, state: "ResourceState"):
+        """Write the state for a resource (atomic write)."""
+        data = self._load()
+        if "resources" not in data:
+            data["resources"] = {}
+        data["resources"][resource_name] = {"state": state.value}
+        self._save(data)
+
+    def _load(self) -> dict:
+        if self._path.is_file():
+            with self._path.open("r") as f:
+                return json.load(f)
+        return {"version": self.VERSION, "resources": {}}
+
+    def _save(self, data: dict):
+        self._path.parent.mkdir(parents=True, exist_ok=True)
+        tmp = self._path.with_suffix(".tmp")
+        with tmp.open("w") as f:
+            json.dump(data, f, indent=2)
+        tmp.rename(self._path)
+
+
+# --- ResourceState enum ---
+
+
+class ResourceState(str, Enum):
+    """State of a resource in the preparation pipeline."""
+
+    NONE = "none"
+    """Not started / no data on disk."""
+
+    PARTIAL = "partial"
+    """Started but incomplete (error during download)."""
+
+    COMPLETE = "complete"
+    """Fully available."""
+
+
+# --- Lazy initialization decorator (backward compat) ---
+
 
 def initialized(method):
-    """Ensure the object is initialized"""
+    """Ensure the object is initialized (calls postinit on first use).
+
+    Deprecated: new Resource subclasses should not rely on this pattern.
+    """
 
     def wrapper(self, *args, **kwargs):
         if not self._post:
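
A minimal sketch of how the state helpers introduced above might be exercised; the data directory and resource name are illustrative, and ResourceState / ResourceStateFile are assumed to be importable from datamaestro.download:

    from pathlib import Path

    from datamaestro.download import ResourceState, ResourceStateFile

    datapath = Path("data/my-dataset")  # stands in for dataset.datapath
    states = ResourceStateFile(datapath)

    # A resource with no metadata entry reports NONE
    assert states.read("archive") is ResourceState.NONE

    # Each write rewrites <datapath>/.state.json through a temporary file
    states.write("archive", ResourceState.PARTIAL)
    states.write("archive", ResourceState.COMPLETE)
    assert states.read("archive") is ResourceState.COMPLETE
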
@@ -14,42 +121,521 @@ def initialized(method):
     return wrapper
 
 
-class Download(DatasetAnnotation):
-    """
-    Base class for all download handlers
+# --- SetupOptions (backward compat) ---
+
+
+@define(kw_only=True)
+class SetupOptions:
+    pass
+
+
+# --- Resource base class ---
+
+
+class Resource(DatasetAnnotation, ABC):
+    """Base class for all dataset resources.
+
+    A resource represents a single step in a dataset preparation pipeline.
+    Resources form a DAG: each resource declares its dependencies, and
+    the orchestrator ensures they are processed in topological order.
+
+    Usage modes:
+
+    1. Class attribute (preferred)::
+
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = filedownloader("data.csv", "http://...", transient=True)
+            PROCESSED = SomeProcessor.from_file(DATA)
+
+    2. Decorator on function (deprecated, backward compat)::
+
+        @filedownloader("data.csv", "http://...")
+        @dataset(Base)
+        def my_dataset(data): ...
+
+    Two-path system:
+
+    - ``transient_path``: where download/processing writes data
+    - ``path``: final location after successful completion
+
+    The framework moves data from ``transient_path`` → ``path`` and then
+    marks the resource as COMPLETE. Subclass ``download()`` implementations
+    should always write to ``transient_path``.
+
+    State is persisted in a metadata file at::
+
+        <dataset.datapath>/.downloads/.state.json
     """
 
-    def __init__(self, varname: str):
-        self.varname = varname
-        # Ensures that the object is initialized
+    def __init__(
+        self,
+        varname: str | None = None,
+        *,
+        transient: bool = False,
+    ):
+        """
+        Args:
+            varname: Explicit resource name. If None, auto-set from
+                class attribute name during binding. Required when
+                used as a decorator (backward compat mode).
+            transient: If True, this resource's data can be deleted
+                after all its dependents reach COMPLETE.
+        """
+        self.name: str | None = varname
+        self._name_explicit: bool = varname is not None
+        self.dataset: AbstractDataset | None = None
+        self.transient: bool = transient
+        self._dependencies: list[Resource] = []
+        self._dependents: list[Resource] = []
+
+        # Backward compat: lazy initialization support
         self._post = False
 
-    def annotate(self, dataset: AbstractDataset):
-        # Register has a resource download
-        if self.varname in dataset.resources:
-            raise AssertionError("Name %s already declared as a resource", self.varname)
+    # ---- Properties ----
+
+    @property
+    def can_recover(self) -> bool:
+        """Whether partial downloads can be resumed.
+
+        When True and state is PARTIAL, existing data at transient_path
+        is preserved on error, allowing the next download() call to
+        resume from where it left off.
+
+        When False and state is PARTIAL, data at transient_path is
+        deleted and state is reset to NONE.
+
+        Default: False. Subclasses override to enable recovery.
+        """
+        return False
+
+    @property
+    def dependencies(self) -> list[Resource]:
+        """Resources that must be COMPLETE before this one can process.
+
+        Populated from constructor arguments. Subclasses with factory
+        methods should pass dependency resources to ``__init__`` and
+        store them in ``_dependencies``.
+        """
+        return self._dependencies
+
+    @property
+    def dependents(self) -> list[Resource]:
+        """Resources that depend on this one (inverse of dependencies).
+
+        Computed by the dataset after all resources are bound.
+        Used for eager transient cleanup decisions.
+        """
+        return self._dependents
+
+    @property
+    def path(self) -> Path:
+        """Final storage path for this resource's data.
+
+        This is where data lives after successful completion.
+        Default: ``dataset.datapath / self.name``
 
-        dataset.resources[self.varname] = self
-        self.definition = dataset
+        Subclasses may override to customize (e.g., add file extension).
+        """
+        return self.dataset.datapath / self.name
+
+    @property
+    def transient_path(self) -> Path:
+        """Temporary path where download/processing writes data.
+
+        During download(), subclasses write to this path.
+        After successful download, the framework moves the data from
+        transient_path to path, then marks state as COMPLETE.
+
+        Default: ``dataset.datapath / ".downloads" / self.name``
+        """
+        return self.dataset.datapath / ".downloads" / self.name
+
+    @property
+    def state(self) -> ResourceState:
+        """Current state, read from the metadata file.
+
+        If no metadata entry exists, returns NONE.
+        """
+        if self.dataset is None:
+            return ResourceState.NONE
+        state_file = ResourceStateFile(self.dataset.datapath)
+        return state_file.read(self.name)
+
+    @state.setter
+    def state(self, value: ResourceState) -> None:
+        """Update state in the metadata file (atomic write)."""
+        state_file = ResourceStateFile(self.dataset.datapath)
+        state_file.write(self.name, value)
 
     @property
     def context(self):
-        return self.definition.context
+        """Application context (from dataset)."""
+        return self.dataset.context
+
+    # ---- Abstract methods ----
+
+    @abstractmethod
+    def download(self, force: bool = False) -> None:
+        """Execute this resource's download/processing step.
+
+        Contract:
+
+        - Called only when all dependencies are COMPLETE.
+        - Must write output to ``self.transient_path``.
+        - The framework handles moving transient_path → path
+          and setting state to COMPLETE after this returns.
+        - If force=True, re-execute even if already COMPLETE.
+
+        Note: State management (COMPLETE/PARTIAL/NONE transitions,
+        moving transient_path → path) is handled by the framework,
+        NOT by the download() implementation.
+
+        Raises:
+            Exception: On download/processing failure. The framework
+                will handle PARTIAL state based on can_recover.
+        """
+        ...
+
+    @abstractmethod
+    def prepare(self):
+        """Return the value for dataset construction.
+
+        Called after download() has completed (state is COMPLETE).
+        Return type depends on the resource subclass:
+
+        - FileResource → Path
+        - FolderResource → Path
+        - ValueResource → resource-specific
+
+        For backward compat with function-based datasets, this value
+        is passed as a keyword argument to the dataset function.
+        """
+        ...
+
+    # ---- Concrete methods ----
+
+    def cleanup(self) -> None:
+        """Remove this resource's data from disk.
+
+        Called automatically for transient resources after all
+        dependents reach COMPLETE (eager cleanup).
+
+        Default implementation:
+
+        - Deletes self.path (file or directory)
+        - Deletes self.transient_path if it exists
+        - Sets self.state = NONE
+
+        Subclasses may override for custom cleanup.
+        """
+        for p in (self.path, self.transient_path):
+            if p.exists():
+                if p.is_dir():
+                    shutil.rmtree(p)
+                else:
+                    p.unlink()
+        self.state = ResourceState.NONE
+
+    def has_files(self) -> bool:
+        """Whether this resource produces files on disk.
+
+        Returns False for reference-only resources (e.g., links
+        to other datasets, in-memory values).
+        Default: True.
+        """
+        return True
+
+    # Backward compat alias
+    def hasfiles(self) -> bool:
+        """Deprecated: use has_files() instead."""
+        _warn_once("hasfiles", "hasfiles() is deprecated, use has_files()")
+        return self.has_files()
 
     def postinit(self):
+        """Legacy lazy initialization hook.
+
+        Deprecated: new Resource subclasses should perform
+        initialization in ``__init__`` or ``bind()``.
+        """
         pass
 
-    def hasfiles(self):
-        return True
+    # ---- Binding ----
+
+    def bind(self, name: str, dataset: AbstractDataset) -> None:
+        """Bind this resource to a dataset.
+
+        Called by the dataset class machinery during initialization.
+        Sets self.name (if not explicitly set via varname) and
+        self.dataset. Registers the resource in dataset.resources
+        and dataset.ordered_resources.
+
+        For class-based datasets: called by ``@dataset`` when it
+        processes class attributes.
+        For decorator-based: called by ``annotate()`` (existing protocol).
+        """
+        if not self._name_explicit:
+            self.name = name
+
+        assert self.dataset is None, (
+            f"Resource {self.name} is already bound to a dataset"
+        )
+
+        if self.name in dataset.resources:
+            raise AssertionError(f"Name {self.name} already declared as a resource")
+
+        dataset.resources[self.name] = self
+        dataset.ordered_resources.append(self)
+        self.dataset = dataset
+
+    def annotate(self, dataset: AbstractDataset) -> None:
+        """Register with a dataset (DatasetAnnotation protocol).
+
+        Deprecated for new code. Calls bind() internally.
+        """
+        _warn_once(
+            "annotate",
+            "Using resources as decorators is deprecated. "
+            "Define them as class attributes instead.",
+        )
+        self.bind(self.name, dataset)
+
+    def contextualize(self):
+        """When using an annotation inline, uses the current
+        dataset wrapper object.
+
+        Deprecated: use class-attribute resource definitions instead.
+        """
+        wrapper = AbstractDataset.processing()
+        self.annotate(wrapper)
+
+    def setup(
+        self,
+        dataset: Union[AbstractDataset],
+        options: SetupOptions = None,
+    ):
+        """Direct way to setup the resource (no annotation).
+
+        Deprecated: use class-attribute resource definitions instead.
+        """
+        self(dataset)
+        return self.prepare()
+
+    # ---- Factory pattern ----
+
+    @classmethod
+    def apply(cls, *args, **kwargs) -> "Resource":
+        """Factory classmethod for creating resource instances.
+
+        Allows defining shorthand factory functions::
+
+            filedownloader = FileDownloader.apply
+
+        Default implementation: ``return cls(*args, **kwargs)``
+        Subclasses may override for custom argument handling.
+        """
+        return cls(*args, **kwargs)
+
+    # ---- Backward compat: definition property ----
+
+    @property
+    def definition(self) -> AbstractDataset | None:
+        """Deprecated: use ``dataset`` attribute instead."""
+        _warn_once(
+            "definition",
+            "Resource.definition is deprecated, use Resource.dataset",
+        )
+        return self.dataset
+
+    # Backward compat: varname property
+    @property
+    def varname(self) -> str | None:
+        """Deprecated: use ``name`` attribute instead."""
+        _warn_once(
+            "varname",
+            "Resource.varname is deprecated, use Resource.name",
+        )
+        return self.name
+
+    @varname.setter
+    def varname(self, value: str | None):
+        self.name = value
+
+
+# --- FileResource ---
+
+
+class FileResource(Resource):
+    """A resource that produces a single file on disk.
+
+    Subclasses implement ``_download()`` to produce the file at the
+    given destination (which is ``self.transient_path``).
+    """
+
+    def __init__(
+        self,
+        filename: str,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
+    ):
+        """
+        Args:
+            filename: The filename (with extension) for the produced file.
+                Used to construct the storage path.
+            varname: Explicit resource name. If None, derived from
+                filename (extension stripped) or class attribute name.
+            transient: See Resource.
+        """
+        import re
+
+        effective_varname = varname or re.sub(r"\..*$", "", filename)
+        super().__init__(varname=effective_varname, transient=transient)
+        # Only mark name as explicit if user actually passed varname
+        self._name_explicit = varname is not None
+        self.filename = filename
+
+    @property
+    def path(self) -> Path:
+        """Final path to the produced file.
+
+        ``dataset.datapath / self.filename``
+        """
+        return self.dataset.datapath / self.filename
+
+    @property
+    def transient_path(self) -> Path:
+        """Temporary path for writing during download.
+
+        ``dataset.datapath / ".downloads" / self.filename``
+        """
+        return self.dataset.datapath / ".downloads" / self.filename
+
+    def prepare(self) -> Path:
+        """Returns self.path."""
+        return self.path
+
+    def stream(self) -> IO[bytes] | None:
+        """Return a readable byte stream of the file content.
+
+        Returns None if streaming is not supported for this resource.
+        Default: returns None. Subclasses may override.
+
+        This allows downstream resources to consume data without
+        needing the file to be fully materialized on disk first.
+        """
+        return None
+
+    def download(self, force: bool = False) -> None:
+        """Downloads the file.
+
+        Delegates to ``_download(self.transient_path)``.
+        """
+        self._download(self.transient_path)
+
+    @abstractmethod
+    def _download(self, destination: Path) -> None:
+        """Subclass hook: download/produce the file at destination.
+
+        Args:
+            destination: The path to write the file to
+                (``self.transient_path``).
+        """
+        ...
+
+
+# --- FolderResource ---
+
+
+class FolderResource(Resource):
+    """A resource that produces a directory on disk.
+
+    Subclasses implement ``_download()`` to populate the directory at
+    the given destination (which is ``self.transient_path``).
+    """
+
+    @property
+    def path(self) -> Path:
+        """Final path to the produced directory.
+
+        ``dataset.datapath / self.name``
+        """
+        return self.dataset.datapath / self.name
+
+    @property
+    def transient_path(self) -> Path:
+        """Temporary path for writing during download.
+
+        ``dataset.datapath / ".downloads" / self.name``
+        """
+        return self.dataset.datapath / ".downloads" / self.name
+
+    def prepare(self) -> Path:
+        """Returns self.path."""
+        return self.path
+
+    def download(self, force: bool = False) -> None:
+        """Downloads/extracts the directory content to transient_path."""
+        self._download(self.transient_path)
+
+    @abstractmethod
+    def _download(self, destination: Path) -> None:
+        """Subclass hook: populate the directory at destination.
+
+        Args:
+            destination: The path to write to (``self.transient_path``).
+        """
+        ...
+
+
+# --- ValueResource ---
+
+
+class ValueResource(Resource):
+    """A resource that produces an in-memory value (no files on disk).
+
+    Used for resources like HuggingFace dataset handles that don't
+    produce local files. The transient_path/path two-path system
+    is not used; state tracking is still via metadata file.
+    """
+
+    def has_files(self) -> bool:
+        return False
+
+    @abstractmethod
+    def prepare(self):
+        """Return the in-memory value."""
+        ...
+
+
+# --- Deprecated compatibility classes ---
+
+
+class Download(Resource):
+    """Deprecated: use Resource instead."""
+
+    def __init_subclass__(cls):
+        _warn_once(
+            f"Download-{cls.__name__}",
+            f"Download is deprecated ({cls}): use `Resource`",
+        )
+        return super().__init_subclass__()
+
+
+# --- reference resource ---
 
-    def download(self, force=False):
-        """Downloads the content"""
-        raise NotImplementedError()
 
+class reference(Resource):
+    """References another dataset instead of downloading."""
 
-class reference(Download):
-    def __init__(self, varname, reference):
-        super().__init__(varname)
+    def __init__(self, varname=None, reference=None):
+        """
+        Args:
+            varname: The name of the variable.
+            reference: Another dataset to reference.
+        """
+        super().__init__(varname=varname)
+        assert reference is not None, "Reference cannot be null"
         self.reference = reference
 
     def prepare(self):
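
The download()/prepare() contract above can be made concrete with a small FileResource subclass; the class name and the use of urllib are illustrative and not part of the package:

    from pathlib import Path
    from urllib.request import urlretrieve

    from datamaestro.download import FileResource

    class urlfile(FileResource):
        """Illustrative handler that fetches a single file over HTTP(S)."""

        def __init__(self, filename: str, url: str, **kwargs):
            super().__init__(filename, **kwargs)
            self.url = url

        def _download(self, destination: Path) -> None:
            # download() passes self.transient_path; moving it to self.path
            # and marking the state COMPLETE is left to the framework
            destination.parent.mkdir(parents=True, exist_ok=True)
            urlretrieve(self.url, destination)
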
@@ -61,7 +647,7 @@ class reference(Download):
     def download(self, force=False):
         self.reference.__datamaestro__.download(force)
 
-    def hasfiles(self):
+    def has_files(self):
         # We don't really have files
         return False
 
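
Putting the pieces together, a dataset definition using the class-attribute style (usage mode 1 in the Resource docstring) might look as follows; the import locations, the Base class and the filedownloader arguments are assumptions taken from that docstring rather than from this diff:

    from datamaestro.data import Base
    from datamaestro.definitions import dataset
    from datamaestro.download.single import filedownloader

    @dataset(url="http://example.com/my-corpus")
    class MyCorpus(Base):
        # Transient raw download: may be cleaned up once every dependent
        # resource has reached the COMPLETE state
        DATA = filedownloader("data.csv", "http://example.com/data.csv", transient=True)
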