datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__main__.py CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env python3
- # flake8: noqa: T201
+ # ruff: noqa: T201

  from importlib.metadata import entry_points
  import sys
@@ -9,12 +9,14 @@ import traceback as tb
  import re
  from pathlib import Path
  import shutil
- from .context import Context
  from typing import Set
- import datamaestro
+ from urllib.parse import urlparse

  import click

+ import datamaestro
+ from .context import Context
+
  logging.basicConfig(level=logging.INFO)


@@ -60,7 +62,10 @@ for entry_point in entry_points(group="datamaestro.repositories"):
      "--traceback", is_flag=True, help="Display traceback if an exception occurs"
  )
  @click.option(
-     "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
+     "--data",
+     type=Path,
+     help="Directory containing datasets",
+     default=Context.MAINDIR,
  )
  @click.pass_context
  def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -207,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
  # --- Create a dataset

  DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
- from urllib.parse import urlparse


  def dataset_id_check(ctx, param, value):
@@ -159,7 +159,10 @@ def document(match):
      try:
          object = getattr(module, name)
      except Exception:
-         return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+         return "<div class='error'>Cannot find %s in %s</div>" % (
+             name,
+             modulename,
+         )

      if ismodule(object):
          return "\n\n".join(
@@ -220,7 +223,12 @@ class Classification:
          module = Datasets(importlib.import_module(meta.t.__module__))
          r.write(
              "- [%s](../df/%s/%s.html#%s)\n"
-             % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+             % (
+                 meta.name or meta.id,
+                 meta.repository.id,
+                 module.id,
+                 meta.id,
+             )
          )

      return r.getvalue()
@@ -326,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
          import shutil

          path = Path(config["site_dir"]) / "mainstyle.css"
-         with importlib.resources.open_binary(
-             "datamaestro.commands", "mainstyle.css"
-         ) as source, path.open("wb") as dest:
+         with (
+             importlib.resources.open_binary(
+                 "datamaestro.commands", "mainstyle.css"
+             ) as source,
+             path.open("wb") as dest,
+         ):
              shutil.copyfileobj(source, dest)

      def on_files(self, files, config):
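For reference, the rewritten `with` statement uses parenthesized context managers, a form Python accepts from 3.10 onward. A minimal sketch of the same pattern, with hypothetical file paths:

```python
import shutil
from pathlib import Path

src = Path("source.css")  # hypothetical input
dst = Path("copy.css")    # hypothetical output
src.write_text("body { margin: 0 }")

# Both files are opened together; on exit they are closed in reverse
# order, exactly as with the one-line `with a as x, b as y:` form.
with (
    src.open("rb") as source,
    dst.open("wb") as dest,
):
    shutil.copyfileobj(source, dest)
```

Note that `importlib.resources.open_binary` itself has been deprecated since Python 3.11 in favor of `importlib.resources.files(...)`, so this spot may see another rewrite later.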
datamaestro/data/ml.py CHANGED
@@ -1,4 +1,5 @@
  """Machine learning generic data formats"""
+
  from pathlib import Path
  from typing import Generic, TypeVar, Optional
  from experimaestro import Param, Meta
@@ -2,8 +2,12 @@
  # Main datamaestro functions and data models
  #

+ from __future__ import annotations
+
  import logging
  import inspect
+ import re as _re
+ import shutil
  from pathlib import Path
  from itertools import chain
  from abc import ABC, abstractmethod
@@ -30,9 +34,117 @@ from typing import Type as TypingType  # noqa: F401 (re-exports)
  from experimaestro.core.types import Type  # noqa: F401 (re-exports)

  if TYPE_CHECKING:
-     from .data import Base, Dataset
+     from .data import Base
      from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
-     from datamaestro.download import Download
+     from datamaestro.download import Download, Resource
+
+ # --- DAG utilities ---
+
+
+ def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+     """Topological sort of resources by their dependencies.
+
+     Args:
+         resources: Dict mapping resource names to Resource instances.
+
+     Returns:
+         List of resources in dependency order (dependencies first).
+
+     Raises:
+         ValueError: If a cycle is detected in the dependency graph.
+     """
+     visited: set[str] = set()
+     visiting: set[str] = set()  # For cycle detection
+     result: list["Resource"] = []
+
+     def visit(resource: "Resource"):
+         if resource.name in visited:
+             return
+         if resource.name in visiting:
+             raise ValueError(
+                 f"Cycle detected in resource dependencies involving {resource.name}"
+             )
+
+         visiting.add(resource.name)
+         for dep in resource.dependencies:
+             visit(dep)
+         visiting.discard(resource.name)
+         visited.add(resource.name)
+         result.append(resource)
+
+     for resource in resources.values():
+         visit(resource)
+
+     return result
+
+
+ def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+     """Compute the dependents (inverse edges) for all resources."""
+     # Clear existing dependents
+     for resource in resources.values():
+         resource._dependents = []
+
+     # Build inverse edges
+     for resource in resources.values():
+         for dep in resource.dependencies:
+             if resource not in dep._dependents:
+                 dep._dependents.append(resource)
+
+
+ def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+     """Scan class attributes for Resource instances and bind them.
+
+     This is called when a class-based dataset is processed by the
+     @dataset decorator. It detects Resource instances defined as
+     class attributes and binds them to the dataset.
+
+     Args:
+         cls: The dataset class to scan.
+         dataset_wrapper: The AbstractDataset to bind resources to.
+     """
+     from datamaestro.download import Resource
+
+     for attr_name, attr_value in vars(cls).items():
+         if isinstance(attr_value, Resource):
+             attr_value.bind(attr_name, dataset_wrapper)
+
+     # Build the dependency DAG
+     _compute_dependents(dataset_wrapper.resources)
+
+     # Validate: topological sort will raise on cycles
+     dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+ def _delete_path(path: Path) -> None:
+     """Delete a file or directory at path."""
+     if path.exists():
+         if path.is_dir():
+             shutil.rmtree(path)
+         else:
+             path.unlink()
+
+
+ def _move_path(src: Path, dst: Path) -> None:
+     """Move a file or directory from src to dst."""
+     if src.exists():
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.move(str(src), str(dst))
+
+
+ _CAMEL_RE1 = _re.compile(r"([A-Z]+)([A-Z][a-z])")
+ _CAMEL_RE2 = _re.compile(r"([a-z0-9])([A-Z])")
+
+
+ def _camel_to_snake(name: str) -> str:
+     """Convert CamelCase to snake_case, then lowercase.
+
+     Examples: ProcessedMNIST -> processed_mnist, MyData -> my_data,
+     MNIST -> mnist, simple -> simple
+     """
+     s = _CAMEL_RE1.sub(r"\1_\2", name)
+     s = _CAMEL_RE2.sub(r"\1_\2", s)
+     return s.lower()
+

  # --- Objects holding information into classes/function

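The new `topological_sort` walks the dependency edges depth-first, using a `visiting` set to detect cycles. A minimal usage sketch, with a hypothetical stand-in class carrying just the two attributes the function reads (`name` and `dependencies`):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeResource:
    name: str
    dependencies: List["FakeResource"] = field(default_factory=list)

archive = FakeResource("archive")                 # no dependencies
extracted = FakeResource("extracted", [archive])  # needs the archive
index = FakeResource("index", [extracted])        # needs the extraction

# Assuming topological_sort from the patched module is in scope:
ordered = topological_sort({r.name: r for r in (index, archive, extracted)})
print([r.name for r in ordered])  # ['archive', 'extracted', 'index']

# A self-edge raises ValueError:
loop = FakeResource("loop")
loop.dependencies.append(loop)
# topological_sort({"loop": loop})  # ValueError: Cycle detected ...
```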
@@ -100,7 +212,12 @@ class DataDefinition(AbstractData):
          if components[0] == "datamaestro":
              longest_ix = 0

-         return repository, [s.lower() for s in components[(longest_ix + 1) :]]
+         parts = components[(longest_ix + 1) :]
+         # Module components: just lowercase
+         # Last component (class/function name): CamelCase → snake_case
+         if parts:
+             parts = [s.lower() for s in parts[:-1]] + [_camel_to_snake(parts[-1])]
+         return repository, parts

      def ancestors(self):
          ancestors = []
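With this change, a class-based dataset name contributes a snake_case component to the dataset id (e.g. a class named `ProcessedMNIST` yields `processed_mnist`) instead of a flat lowercase `processedmnist`. The two substitutions can be reproduced standalone:

```python
import re

_re1 = re.compile(r"([A-Z]+)([A-Z][a-z])")  # "HTTPServer" -> "HTTP_Server"
_re2 = re.compile(r"([a-z0-9])([A-Z])")     # "MyData"     -> "My_Data"

def camel_to_snake(name: str) -> str:
    s = _re1.sub(r"\1_\2", name)
    s = _re2.sub(r"\1_\2", s)
    return s.lower()

for name in ("ProcessedMNIST", "MyData", "MNIST", "simple"):
    print(f"{name} -> {camel_to_snake(name)}")
# ProcessedMNIST -> processed_mnist
# MyData -> my_data
# MNIST -> mnist
# simple -> simple
```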
@@ -218,18 +335,127 @@ class AbstractDataset(AbstractData):
              self.setDataIDs(value, f"{id}.{key}")

      def download(self, force=False):
-         """Download all the necessary resources"""
-         success = True
+         """Download all the necessary resources.
+
+         Uses DAG-based topological ordering and the two-path system:
+         1. Acquire exclusive lock (.state.lock)
+         2. Resource writes to transient_path (under .downloads/)
+         3. Framework moves transient_path → path (main folder)
+         4. State marked COMPLETE
+         5. Transient dependencies cleaned up eagerly
+         6. .downloads/ directory removed after all resources complete
+         7. Release lock
+         """
+         import fcntl
+
+         from datamaestro.download import ResourceState
+
          self.prepare()
-         logging.info("Materializing %d resources", len(self.ordered_resources))
+         logging.info(
+             "Materializing %d resources",
+             len(self.ordered_resources),
+         )
+
+         self.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = self.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         try:
+             fcntl.flock(lock_file, fcntl.LOCK_EX)
+             success = self._download_locked(force, ResourceState)
+         finally:
+             fcntl.flock(lock_file, fcntl.LOCK_UN)
+             lock_file.close()
+
+         return success
+
+     def _download_locked(self, force, ResourceState):
+         """Inner download logic, called while holding .state.lock."""
+         success = True
+
          for resource in self.ordered_resources:
+             # Step 1: Check state
+             current_state = resource.state
+
+             if current_state == ResourceState.COMPLETE and not force:
+                 # Verify files are actually present on disk
+                 if resource.has_files() and not resource.path.exists():
+                     logging.warning(
+                         "Resource %s marked COMPLETE but files "
+                         "missing at %s — re-downloading",
+                         resource.name,
+                         resource.path,
+                     )
+                     resource.state = ResourceState.NONE
+                     current_state = ResourceState.NONE
+                 else:
+                     continue
+
+             # Adopt pre-existing files (old downloads without state file)
+             if (
+                 current_state == ResourceState.NONE
+                 and not force
+                 and resource.has_files()
+                 and resource.path.exists()
+             ):
+                 logging.info(
+                     "Resource %s already exists at %s — marking COMPLETE",
+                     resource.name,
+                     resource.path,
+                 )
+                 resource.state = ResourceState.COMPLETE
+                 continue
+
+             if current_state == ResourceState.PARTIAL:
+                 if not resource.can_recover:
+                     _delete_path(resource.transient_path)
+                     resource.state = ResourceState.NONE
+
+             # Verify all dependencies are COMPLETE
+             for dep in resource.dependencies:
+                 if dep.state != ResourceState.COMPLETE:
+                     logging.error(
+                         "Dependency %s of %s is not COMPLETE",
+                         dep.name,
+                         resource.name,
+                     )
+                     return False
+
+             # Step 2-4: Download with framework-managed state
              try:
-                 resource.download(force)
+                 resource.download(force=force)
+
+                 # Move transient -> final, mark COMPLETE
+                 if resource.has_files():
+                     _move_path(resource.transient_path, resource.path)
+                 resource.state = ResourceState.COMPLETE
+
              except Exception:
                  logging.error("Could not download resource %s", resource)
                  traceback.print_exc()
+
+                 # Handle PARTIAL state
+                 if resource.has_files() and resource.transient_path.exists():
+                     if resource.can_recover:
+                         resource.state = ResourceState.PARTIAL
+                     else:
+                         _delete_path(resource.transient_path)
+                         resource.state = ResourceState.NONE
                  success = False
                  break
+
+             # Step 5: Eager transient cleanup
+             for dep in resource.dependencies:
+                 if dep.transient and all(
+                     d.state == ResourceState.COMPLETE for d in dep.dependents
+                 ):
+                     dep.cleanup()
+
+         # Step 6: Remove .downloads/ directory after success
+         if success:
+             downloads_dir = self.datapath / ".downloads"
+             if downloads_dir.is_dir():
+                 shutil.rmtree(downloads_dir)
+
          return success

      @staticmethod
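Outside of datamaestro, the lock-and-publish skeleton of `download()` can be sketched in a few lines: take an exclusive advisory lock, write into a transient location, and move the result into place only after the write succeeds. `fcntl` is Unix-only, and all paths here are hypothetical:

```python
import fcntl
import shutil
from pathlib import Path

datapath = Path("/tmp/demo-dataset")  # hypothetical dataset folder
datapath.mkdir(parents=True, exist_ok=True)
transient = datapath / ".downloads" / "payload.txt"
final = datapath / "payload.txt"

with (datapath / ".state.lock").open("w") as lock_file:
    fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks concurrent downloaders
    try:
        transient.parent.mkdir(parents=True, exist_ok=True)
        transient.write_text("downloaded content")  # the step that may fail
        shutil.move(str(transient), str(final))     # publish on success
        shutil.rmtree(datapath / ".downloads")      # final cleanup (step 6)
    finally:
        fcntl.flock(lock_file, fcntl.LOCK_UN)
```

A crash before the move leaves only transient state behind, which the PARTIAL/`can_recover` logic above either resumes or discards on the next run.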
@@ -310,8 +536,7 @@ class DatasetWrapper(AbstractDataset):
          # Computes an ID
          assert (
              # id is empty string = use the module id
-             components[0]
-             == "config"
+             components[0] == "config"
          ), (
              "A @dataset without `id` should be in the "
              f".config module (not {t.__module__})"
@@ -390,6 +615,10 @@ class DatasetWrapper(AbstractDataset):
          if self.base is self.t:
              self.config = self.base.__create_dataset__(self)

+         elif hasattr(self.t, "__create_dataset__"):
+             # Class-based dataset with metadataset or different base
+             self.config = self.t.__create_dataset__(self)
+
          else:
              # Construct the object
              if self.as_prepare:
@@ -459,14 +688,24 @@ class DatasetWrapper(AbstractDataset):

          return path

-     def hasfiles(self) -> bool:
-         """Returns whether this dataset has files or only includes references"""
+     def has_files(self) -> bool:
+         """Returns whether this dataset has files or only includes references."""
          for resource in self.resources.values():
-             if resource.hasfiles():
+             if resource.has_files():
                  return True
-
          return False

+     def hasfiles(self) -> bool:
+         """Deprecated: use has_files() instead."""
+         import warnings
+
+         warnings.warn(
+             "hasfiles() is deprecated, use has_files()",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.has_files()
+

  # --- Annotations

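The old `hasfiles()` spelling is kept as a shim rather than removed outright. The same pattern in isolation; `stacklevel=2` makes the warning point at the caller rather than at the shim itself:

```python
import warnings

class Example:
    def has_files(self) -> bool:
        return True

    def hasfiles(self) -> bool:
        """Deprecated: use has_files() instead."""
        warnings.warn(
            "hasfiles() is deprecated, use has_files()",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the calling line
        )
        return self.has_files()

warnings.simplefilter("always", DeprecationWarning)
assert Example().hasfiles() is True  # emits a DeprecationWarning
```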
@@ -482,9 +721,9 @@ class DataAnnotation:
              self.annotate(object.__datamaestro__)
          else:
              # With configuration objects, add a __datamaestro__ member to the class
-             assert issubclass(
-                 object, Config
-             ), f"{object} cannot be annotated (only dataset or data definitions)"
+             assert issubclass(object, Config), (
+                 f"{object} cannot be annotated (only dataset or data definitions)"
+             )
              if "__datamaestro__" not in object.__dict__:
                  object.__datamaestro__ = AbstractData()
              self.annotate(object.__datamaestro__)
@@ -501,8 +740,8 @@ class DatasetAnnotation:
      def __call__(self, dataset: AbstractDataset):
          if isinstance(dataset, AbstractDataset):
              self.annotate(dataset)
-         elif issubclass(dataset, Dataset):
-             self.annotate(dataset.__datamaestro__)
+         elif hasattr(dataset, "__dataset__"):
+             self.annotate(dataset.__dataset__)
          else:
              raise RuntimeError(
                  f"Only datasets can be annotated with {self}, "
@@ -558,7 +797,9 @@ datatasks = DataTagging(lambda d: d.tasks)

  class metadata:
      def __init__(
-         self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+         self,
+         tags: Union[str, List[str]] = None,
+         tasks: Union[str, List[str]] = None,
      ):
          pass

@@ -632,7 +873,10 @@ class dataset:
              pass
          dw = DatasetWrapper(self, t)
          t.__dataset__ = dw
+
+         # For class-based datasets, scan for Resource class attributes
          if inspect.isclass(t) and issubclass(t, Base):
+             _bind_class_resources(t, dw)
              return t
          return dw

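The decorator's new branch delegates to `_bind_class_resources`, which walks `vars(cls)` and binds every Resource-typed class attribute under its attribute name. The scanning mechanic in isolation, with a hypothetical stand-in for `datamaestro.download.Resource`:

```python
class FakeResource:  # stand-in for datamaestro.download.Resource
    def bind(self, name, dataset):
        self.name = name
        self.dataset = dataset

class MyDataset:  # hypothetical class-based dataset
    train = FakeResource()
    test = FakeResource()
    notes = "not a resource"  # ignored by the scan

bound = {}
for attr_name, attr_value in vars(MyDataset).items():
    if isinstance(attr_value, FakeResource):
        attr_value.bind(attr_name, dataset=None)
        bound[attr_name] = attr_value

print(sorted(bound))  # ['test', 'train']
```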