datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,116 @@
1
- from typing import Union
1
+ """Resource system for dataset download and processing pipelines.
2
+
3
+ This module defines the Resource interface and its concrete subclasses
4
+ (FileResource, FolderResource, ValueResource) for managing dataset
5
+ download and preprocessing steps as a directed acyclic graph (DAG).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ import shutil
13
+ import warnings
2
14
  from abc import ABC, abstractmethod
15
+ from enum import Enum
16
+ from pathlib import Path
17
+ from typing import IO, Union
18
+
19
+ from attrs import define
20
+
3
21
  from datamaestro.definitions import AbstractDataset, DatasetAnnotation
4
22
  from datamaestro.utils import deprecated
5
- from attrs import define
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Module-level deprecation tracking (emit each category only once)
27
+ _deprecation_warned: set[str] = set()
28
+
29
+
30
def _warn_once(category: str, message: str):
    """Issue ``message`` as a DeprecationWarning the first time ``category`` is seen.

    Categories already recorded in the module-level ``_deprecation_warned``
    set are skipped silently.

    Args:
        category: Key used to de-duplicate warnings.
        message: Text of the warning to emit.
    """
    if category in _deprecation_warned:
        return
    _deprecation_warned.add(category)
    # stacklevel=3 attributes the warning two frames above this helper,
    # i.e. to whoever called the deprecated API that invoked _warn_once.
    warnings.warn(message, DeprecationWarning, stacklevel=3)
35
+
36
+
37
+ # --- State metadata file helpers ---
38
+
39
+
40
class ResourceStateFile:
    """Manages the .state.json metadata file for resource states.

    Location: <dataset.datapath>/.state.json

    Format::

        {
            "version": 1,
            "resources": {
                "RESOURCE_NAME": {"state": "none"|"partial"|"complete"},
                ...
            }
        }
    """

    VERSION = 1

    def __init__(self, datapath: Path):
        """
        Args:
            datapath: Directory that holds (or will hold) ``.state.json``.
        """
        self._path = datapath / ".state.json"

    def read(self, resource_name: str) -> "ResourceState":
        """Read the state for a resource. Returns NONE if not found.

        Args:
            resource_name: Key of the resource under ``"resources"``.

        Raises:
            KeyError: If the stored entry has no ``"state"`` field.
            ValueError: If the stored string is not a valid ResourceState.
        """
        data = self._load()
        entry = data.get("resources", {}).get(resource_name)
        if entry is None:
            return ResourceState.NONE
        return ResourceState(entry["state"])

    def write(self, resource_name: str, state: "ResourceState"):
        """Record ``state`` for ``resource_name`` and persist the file.

        The whole document is re-read, updated and rewritten; entries of
        other resources are kept as they are.

        Args:
            resource_name: Key of the resource under ``"resources"``.
            state: New state; its ``.value`` string is what gets stored.
        """
        data = self._load()
        data.setdefault("resources", {})[resource_name] = {"state": state.value}
        self._save(data)

    def _load(self) -> dict:
        """Return the parsed metadata, or a fresh empty document if the
        file does not exist yet."""
        if self._path.is_file():
            with self._path.open("r", encoding="utf-8") as f:
                return json.load(f)
        return {"version": self.VERSION, "resources": {}}

    def _save(self, data: dict):
        """Write ``data`` to a sibling temp file, then move it over the target.

        Readers therefore see either the previous or the new content,
        not a half-written file.
        """
        self._path.parent.mkdir(parents=True, exist_ok=True)
        tmp = self._path.with_suffix(".tmp")
        try:
            with tmp.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
            # Path.replace overwrites an existing target on every platform;
            # Path.rename raises FileExistsError on Windows once the state
            # file exists, which would break every write after the first.
            tmp.replace(self._path)
        except Exception:
            # Do not leave a stale temp file behind on failure.
            tmp.unlink(missing_ok=True)
            raise
88
+
89
+
90
+ # --- ResourceState enum ---
91
+
92
+
93
class ResourceState(str, Enum):
    """Progress of a resource through the preparation pipeline.

    Members are also ``str`` instances whose value is the lowercase
    string stored in the state metadata file.
    """

    #: Not started / no data on disk.
    NONE = "none"

    #: Started but incomplete (error during download).
    PARTIAL = "partial"

    #: Fully available.
    COMPLETE = "complete"
104
+
105
+
106
+ # --- Lazy initialization decorator (backward compat) ---
6
107
 
7
108
 
8
109
  def initialized(method):
9
- """Ensure the object is initialized"""
110
+ """Ensure the object is initialized (calls postinit on first use).
111
+
112
+ Deprecated: new Resource subclasses should not rely on this pattern.
113
+ """
10
114
 
11
115
  def wrapper(self, *args, **kwargs):
12
116
  if not self._post:
@@ -17,81 +121,520 @@ def initialized(method):
17
121
  return wrapper
18
122
 
19
123
 
124
+ # --- SetupOptions (backward compat) ---
125
+
126
+
20
127
@define(kw_only=True)
class SetupOptions:
    """Options object for ``Resource.setup()`` (backward compat).

    Declared as an attrs class with keyword-only construction; it defines
    no fields here.
    """

    pass
23
130
 
24
131
 
132
+ # --- Resource base class ---
133
+
134
+
25
135
  class Resource(DatasetAnnotation, ABC):
136
+ """Base class for all dataset resources.
137
+
138
+ A resource represents a single step in a dataset preparation pipeline.
139
+ Resources form a DAG: each resource declares its dependencies, and
140
+ the orchestrator ensures they are processed in topological order.
141
+
142
+ Usage modes:
143
+
144
+ 1. Class attribute (preferred)::
145
+
146
+ @dataset(url="...")
147
+ class MyDataset(Base):
148
+ DATA = filedownloader("data.csv", "http://...", transient=True)
149
+ PROCESSED = SomeProcessor.from_file(DATA)
150
+
151
+ 2. Decorator on function (deprecated, backward compat)::
152
+
153
+ @filedownloader("data.csv", "http://...")
154
+ @dataset(Base)
155
+ def my_dataset(data): ...
156
+
157
+ Two-path system:
158
+
159
+ - ``transient_path``: where download/processing writes data
160
+ - ``path``: final location after successful completion
161
+
162
+ The framework moves data from ``transient_path`` → ``path`` and then
163
+ marks the resource as COMPLETE. Subclass ``download()`` implementations
164
+ should always write to ``transient_path``.
165
+
166
+ State is persisted in a metadata file at::
167
+
168
+ <dataset.datapath>/.state.json
26
169
  """
27
- Base class for all download handlers
28
- """
29
170
 
30
- def __init__(self, varname: str):
31
- self.varname = varname
32
- # Ensures that the object is initialized
171
+ def __init__(
172
+ self,
173
+ varname: str | None = None,
174
+ *,
175
+ transient: bool = False,
176
+ ):
177
+ """
178
+ Args:
179
+ varname: Explicit resource name. If None, auto-set from
180
+ class attribute name during binding. Required when
181
+ used as a decorator (backward compat mode).
182
+ transient: If True, this resource's data can be deleted
183
+ after all its dependents reach COMPLETE.
184
+ """
185
+ self.name: str | None = varname
186
+ self._name_explicit: bool = varname is not None
187
+ self.dataset: AbstractDataset | None = None
188
+ self.transient: bool = transient
189
+ self._dependencies: list[Resource] = []
190
+ self._dependents: list[Resource] = []
191
+
192
+ # Backward compat: lazy initialization support
33
193
  self._post = False
34
- self.definition: AbstractDataset = None
35
194
 
36
- def annotate(self, dataset: AbstractDataset):
37
- assert self.definition is None
38
- # Register has a resource download
39
- if self.varname in dataset.resources:
40
- raise AssertionError("Name %s already declared as a resource", self.varname)
195
+ # ---- Properties ----
41
196
 
42
- dataset.resources[self.varname] = self
43
- dataset.ordered_resources.append(self)
44
- self.definition = dataset
197
+ @property
198
+ def can_recover(self) -> bool:
199
+ """Whether partial downloads can be resumed.
45
200
 
46
- def contextualize(self):
47
- """When using an annotation inline, uses the current dataset wrapper object"""
48
- from datamaestro.definitions import AbstractDataset
201
+ When True and state is PARTIAL, existing data at transient_path
202
+ is preserved on error, allowing the next download() call to
203
+ resume from where it left off.
49
204
 
50
- wrapper = AbstractDataset.processing()
51
- self.annotate(wrapper)
205
+ When False and state is PARTIAL, data at transient_path is
206
+ deleted and state is reset to NONE.
207
+
208
+ Default: False. Subclasses override to enable recovery.
209
+ """
210
+ return False
52
211
 
53
212
  @property
54
- def context(self):
55
- return self.definition.context
213
+ def dependencies(self) -> list[Resource]:
214
+ """Resources that must be COMPLETE before this one can process.
56
215
 
57
- def postinit(self):
58
- pass
216
+ Populated from constructor arguments. Subclasses with factory
217
+ methods should pass dependency resources to ``__init__`` and
218
+ store them in ``_dependencies``.
219
+ """
220
+ return self._dependencies
59
221
 
60
- def hasfiles(self):
61
- return True
222
+ @property
223
+ def dependents(self) -> list[Resource]:
224
+ """Resources that depend on this one (inverse of dependencies).
225
+
226
+ Computed by the dataset after all resources are bound.
227
+ Used for eager transient cleanup decisions.
228
+ """
229
+ return self._dependents
230
+
231
+ @property
232
+ def path(self) -> Path:
233
+ """Final storage path for this resource's data.
234
+
235
+ This is where data lives after successful completion.
236
+ Default: ``dataset.datapath / self.name``
237
+
238
+ Subclasses may override to customize (e.g., add file extension).
239
+ """
240
+ return self.dataset.datapath / self.name
241
+
242
+ @property
243
+ def transient_path(self) -> Path:
244
+ """Temporary path where download/processing writes data.
245
+
246
+ During download(), subclasses write to this path.
247
+ After successful download, the framework moves the data from
248
+ transient_path to path, then marks state as COMPLETE.
249
+
250
+ Default: ``dataset.datapath / ".downloads" / self.name``
251
+ """
252
+ return self.dataset.datapath / ".downloads" / self.name
253
+
254
+ @property
255
+ def state(self) -> ResourceState:
256
+ """Current state, read from the metadata file.
257
+
258
+ If no metadata entry exists, returns NONE.
259
+ """
260
+ if self.dataset is None:
261
+ return ResourceState.NONE
262
+ state_file = ResourceStateFile(self.dataset.datapath)
263
+ return state_file.read(self.name)
264
+
265
+ @state.setter
266
+ def state(self, value: ResourceState) -> None:
267
+ """Update state in the metadata file (atomic write)."""
268
+ state_file = ResourceStateFile(self.dataset.datapath)
269
+ state_file.write(self.name, value)
270
+
271
+ @property
272
+ def context(self):
273
+ """Application context (from dataset)."""
274
+ return self.dataset.context
275
+
276
+ # ---- Abstract methods ----
62
277
 
63
278
  @abstractmethod
64
- def download(self, force=False):
65
- """Downloads the content"""
279
+ def download(self, force: bool = False) -> None:
280
+ """Execute this resource's download/processing step.
281
+
282
+ Contract:
283
+
284
+ - Called only when all dependencies are COMPLETE.
285
+ - Must write output to ``self.transient_path``.
286
+ - The framework handles moving transient_path → path
287
+ and setting state to COMPLETE after this returns.
288
+ - If force=True, re-execute even if already COMPLETE.
289
+
290
+ Note: State management (COMPLETE/PARTIAL/NONE transitions,
291
+ moving transient_path → path) is handled by the framework,
292
+ NOT by the download() implementation.
293
+
294
+ Raises:
295
+ Exception: On download/processing failure. The framework
296
+ will handle PARTIAL state based on can_recover.
297
+ """
66
298
  ...
67
299
 
68
300
  @abstractmethod
69
301
  def prepare(self):
70
- """Prepares the dataset"""
302
+ """Return the value for dataset construction.
303
+
304
+ Called after download() has completed (state is COMPLETE).
305
+ Return type depends on the resource subclass:
306
+
307
+ - FileResource → Path
308
+ - FolderResource → Path
309
+ - ValueResource → resource-specific
310
+
311
+ For backward compat with function-based datasets, this value
312
+ is passed as a keyword argument to the dataset function.
313
+ """
71
314
  ...
72
315
 
316
+ # ---- Concrete methods ----
317
+
318
+ def cleanup(self) -> None:
319
+ """Remove this resource's data from disk.
320
+
321
+ Called automatically for transient resources after all
322
+ dependents reach COMPLETE (eager cleanup).
323
+
324
+ Default implementation:
325
+
326
+ - Deletes self.path (file or directory)
327
+ - Deletes self.transient_path if it exists
328
+ - Sets self.state = NONE
329
+
330
+ Subclasses may override for custom cleanup.
331
+ """
332
+ for p in (self.path, self.transient_path):
333
+ if p.exists():
334
+ if p.is_dir():
335
+ shutil.rmtree(p)
336
+ else:
337
+ p.unlink()
338
+ self.state = ResourceState.NONE
339
+
340
+ def has_files(self) -> bool:
341
+ """Whether this resource produces files on disk.
342
+
343
+ Returns False for reference-only resources (e.g., links
344
+ to other datasets, in-memory values).
345
+ Default: True.
346
+ """
347
+ return True
348
+
349
+ # Backward compat alias
350
+ def hasfiles(self) -> bool:
351
+ """Deprecated: use has_files() instead."""
352
+ _warn_once("hasfiles", "hasfiles() is deprecated, use has_files()")
353
+ return self.has_files()
354
+
355
+ def postinit(self):
356
+ """Legacy lazy initialization hook.
357
+
358
+ Deprecated: new Resource subclasses should perform
359
+ initialization in ``__init__`` or ``bind()``.
360
+ """
361
+ pass
362
+
363
+ # ---- Binding ----
364
+
365
+ def bind(self, name: str, dataset: AbstractDataset) -> None:
366
+ """Bind this resource to a dataset.
367
+
368
+ Called by the dataset class machinery during initialization.
369
+ Sets self.name (if not explicitly set via varname) and
370
+ self.dataset. Registers the resource in dataset.resources
371
+ and dataset.ordered_resources.
372
+
373
+ For class-based datasets: called by ``@dataset`` when it
374
+ processes class attributes.
375
+ For decorator-based: called by ``annotate()`` (existing protocol).
376
+ """
377
+ if not self._name_explicit:
378
+ self.name = name
379
+
380
+ assert self.dataset is None, (
381
+ f"Resource {self.name} is already bound to a dataset"
382
+ )
383
+
384
+ if self.name in dataset.resources:
385
+ raise AssertionError(f"Name {self.name} already declared as a resource")
386
+
387
+ dataset.resources[self.name] = self
388
+ dataset.ordered_resources.append(self)
389
+ self.dataset = dataset
390
+
391
+ def annotate(self, dataset: AbstractDataset) -> None:
392
+ """Register with a dataset (DatasetAnnotation protocol).
393
+
394
+ Deprecated for new code. Calls bind() internally.
395
+ """
396
+ _warn_once(
397
+ "annotate",
398
+ "Using resources as decorators is deprecated. "
399
+ "Define them as class attributes instead.",
400
+ )
401
+ self.bind(self.name, dataset)
402
+
403
+ def contextualize(self):
404
+ """When using an annotation inline, uses the current
405
+ dataset wrapper object.
406
+
407
+ Deprecated: use class-attribute resource definitions instead.
408
+ """
409
+ wrapper = AbstractDataset.processing()
410
+ self.annotate(wrapper)
411
+
73
412
  def setup(
74
413
  self,
75
414
  dataset: Union[AbstractDataset],
76
415
  options: SetupOptions = None,
77
416
  ):
78
- """Direct way to setup the resource (no annotation)"""
417
+ """Direct way to setup the resource (no annotation).
418
+
419
+ Deprecated: use class-attribute resource definitions instead.
420
+ """
79
421
  self(dataset)
80
422
  return self.prepare()
81
423
 
424
+ # ---- Factory pattern ----
425
+
426
+ @classmethod
427
+ def apply(cls, *args, **kwargs) -> "Resource":
428
+ """Factory classmethod for creating resource instances.
429
+
430
+ Allows defining shorthand factory functions::
431
+
432
+ filedownloader = FileDownloader.apply
433
+
434
+ Default implementation: ``return cls(*args, **kwargs)``
435
+ Subclasses may override for custom argument handling.
436
+ """
437
+ return cls(*args, **kwargs)
438
+
439
+ # ---- Backward compat: definition property ----
440
+
441
+ @property
442
+ def definition(self) -> AbstractDataset | None:
443
+ """Deprecated: use ``dataset`` attribute instead."""
444
+ _warn_once(
445
+ "definition",
446
+ "Resource.definition is deprecated, use Resource.dataset",
447
+ )
448
+ return self.dataset
449
+
450
+ # Backward compat: varname property
451
+ @property
452
+ def varname(self) -> str | None:
453
+ """Deprecated: use ``name`` attribute instead."""
454
+ _warn_once(
455
+ "varname",
456
+ "Resource.varname is deprecated, use Resource.name",
457
+ )
458
+ return self.name
459
+
460
+ @varname.setter
461
+ def varname(self, value: str | None):
462
+ self.name = value
463
+
464
+
465
+ # --- FileResource ---
466
+
467
+
468
class FileResource(Resource):
    """Resource whose output is exactly one file on disk.

    Concrete subclasses provide ``_download()``, which must create the
    file at the destination it is handed (``self.transient_path``).
    """

    def __init__(
        self,
        filename: str,
        *,
        varname: str | None = None,
        transient: bool = False,
    ):
        """
        Args:
            filename: Name of the produced file, extension included;
                the storage paths are built from it.
            varname: Explicit resource name. When omitted, the name
                falls back to ``filename`` with its extension stripped
                and is not flagged as explicit.
            transient: See Resource.
        """
        import re

        if varname:
            resolved_name = varname
        else:
            resolved_name = re.sub(r"\..*$", "", filename)
        super().__init__(varname=resolved_name, transient=transient)
        # The base constructor flags any non-None name as explicit; reset
        # the flag so that it reflects only a user-supplied varname.
        self._name_explicit = varname is not None
        self.filename = filename

    @property
    def path(self) -> Path:
        """Final location of the file: ``dataset.datapath / self.filename``."""
        root = self.dataset.datapath
        return root / self.filename

    @property
    def transient_path(self) -> Path:
        """Where the file is written while downloading:
        ``dataset.datapath / ".downloads" / self.filename``.
        """
        staging_dir = self.dataset.datapath / ".downloads"
        return staging_dir / self.filename

    def prepare(self) -> Path:
        """Return ``self.path``."""
        return self.path

    def stream(self) -> IO[bytes] | None:
        """Return a readable byte stream of the file content, if supported.

        This base implementation always returns None (streaming not
        supported). Subclasses may override so that downstream resources
        can read the data without the file being fully written to disk.
        """
        return None

    def download(self, force: bool = False) -> None:
        """Produce the file by calling ``_download(self.transient_path)``.

        ``force`` is accepted for interface compatibility and is not
        used by this implementation.
        """
        destination = self.transient_path
        self._download(destination)

    @abstractmethod
    def _download(self, destination: Path) -> None:
        """Subclass hook: create the file at ``destination``.

        Args:
            destination: Path the file must be written to
                (``self.transient_path``).
        """
        ...
545
+
546
+
547
+ # --- FolderResource ---
548
+
549
+
550
+ class FolderResource(Resource):
551
+ """A resource that produces a directory on disk.
552
+
553
+ Subclasses implement ``_download()`` to populate the directory at
554
+ the given destination (which is ``self.transient_path``).
555
+ """
556
+
557
+ @property
558
+ def path(self) -> Path:
559
+ """Final path to the produced directory.
560
+
561
+ ``dataset.datapath / self.name``
562
+ """
563
+ return self.dataset.datapath / self.name
564
+
565
+ @property
566
+ def transient_path(self) -> Path:
567
+ """Temporary path for writing during download.
568
+
569
+ ``dataset.datapath / ".downloads" / self.name``
570
+ """
571
+ return self.dataset.datapath / ".downloads" / self.name
82
572
 
83
- # Keeps downwards compatibility
84
- Download = Resource
573
+ def prepare(self) -> Path:
574
+ """Returns self.path."""
575
+ return self.path
576
+
577
+ def download(self, force: bool = False) -> None:
578
+ """Downloads/extracts the directory content to transient_path."""
579
+ self._download(self.transient_path)
580
+
581
+ @abstractmethod
582
+ def _download(self, destination: Path) -> None:
583
+ """Subclass hook: populate the directory at destination.
584
+
585
+ Args:
586
+ destination: The path to write to (``self.transient_path``).
587
+ """
588
+ ...
589
+
590
+
591
+ # --- ValueResource ---
592
+
593
+
594
class ValueResource(Resource):
    """Resource that yields an in-memory value and keeps no files on disk.

    Intended for things such as HuggingFace dataset handles that do not
    produce local files. The ``transient_path``/``path`` two-path system
    is not used; state is still tracked through the metadata file.
    """

    def has_files(self) -> bool:
        """Always False: this kind of resource stores nothing on disk."""
        return False

    @abstractmethod
    def prepare(self):
        """Subclass hook: return the in-memory value."""
        ...
609
+
610
+
611
+ # --- Deprecated compatibility classes ---
612
+
613
+
614
class Download(Resource):
    """Deprecated: use Resource instead.

    Kept so that existing subclasses keep working. Defining a subclass
    emits a DeprecationWarning, at most once per subclass name.
    """

    def __init_subclass__(cls, **kwargs):
        """Warn that ``Download`` is deprecated when a subclass is created.

        Args:
            **kwargs: Class keyword arguments, passed on unchanged to the
                next ``__init_subclass__`` in the MRO.
        """
        _warn_once(
            f"Download-{cls.__name__}",
            f"Download is deprecated ({cls}): use `Resource`",
        )
        # Forward class keyword arguments so that cooperative base classes
        # further up the MRO still receive them.
        return super().__init_subclass__(**kwargs)
623
+
624
+
625
+ # --- reference resource ---
85
626
 
86
627
 
87
628
  class reference(Resource):
88
- def __init__(self, varname=None, reference=None):
89
- """References another dataset
629
+ """References another dataset instead of downloading."""
90
630
 
91
- :param varname: The name of the variable
92
- :param reference: Another dataset
631
+ def __init__(self, varname=None, reference=None):
632
+ """
633
+ Args:
634
+ varname: The name of the variable.
635
+ reference: Another dataset to reference.
93
636
  """
94
- super().__init__(varname)
637
+ super().__init__(varname=varname)
95
638
  assert reference is not None, "Reference cannot be null"
96
639
  self.reference = reference
97
640
 
@@ -104,7 +647,7 @@ class reference(Resource):
104
647
  def download(self, force=False):
105
648
  self.reference.__datamaestro__.download(force)
106
649
 
107
- def hasfiles(self):
650
+ def has_files(self):
108
651
  # We don't really have files
109
652
  return False
110
653