datamaestro 1.5.0-py3-none-any.whl → 1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__init__.py CHANGED
@@ -7,7 +7,6 @@ from .context import (
      prepare_dataset,
  )
  
- from pkg_resources import get_distribution, DistributionNotFound
  from .definitions import dataset, metadata
  from .data import Base
- from .version import version, version_tuple
+ from .version import __version__
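Together with this hunk, version lookup moves from `pkg_resources` to a generated `datamaestro.version` module. For code that still needs to resolve an installed package's version at runtime, the standard-library `importlib.metadata` equivalent looks like this (a minimal sketch, independent of how `version.py` itself is produced):

```python
from importlib.metadata import version, PackageNotFoundError

try:
    # Replaces pkg_resources.get_distribution("datamaestro").version
    __version__ = version("datamaestro")
except PackageNotFoundError:
    # Package is not installed (e.g., running from a source checkout)
    __version__ = "unknown"
```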
datamaestro/__main__.py CHANGED
@@ -1,20 +1,22 @@
  #!/usr/bin/env python3
- # flake8: noqa: T201
+ # ruff: noqa: T201
  
+ from importlib.metadata import entry_points
  import sys
  import logging
  from functools import update_wrapper
  import traceback as tb
- import pkg_resources
  import re
  from pathlib import Path
  import shutil
- from .context import Context
  from typing import Set
- import datamaestro
+ from urllib.parse import urlparse
  
  import click
  
+ import datamaestro
+ from .context import Context
+
  logging.basicConfig(level=logging.INFO)
  
  
@@ -38,7 +40,7 @@ def pass_cfg(f):
  # Get all the available repositories
  
  REPOSITORIES = {}
- for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+ for entry_point in entry_points(group="datamaestro.repositories"):
      REPOSITORIES[entry_point.name] = entry_point
  
  
@@ -60,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
      "--traceback", is_flag=True, help="Display traceback if an exception occurs"
  )
  @click.option(
-     "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
+     "--data",
+     type=Path,
+     help="Directory containing datasets",
+     default=Context.MAINDIR,
  )
  @click.pass_context
  def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -207,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
  # --- Create a dataset
  
  DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
- from urllib.parse import urlparse
  
  
  def dataset_id_check(ctx, param, value):
@@ -159,7 +159,10 @@ def document(match):
      try:
          object = getattr(module, name)
      except Exception:
-         return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+         return "<div class='error'>Cannot find %s in %s</div>" % (
+             name,
+             modulename,
+         )
  
      if ismodule(object):
          return "\n\n".join(
@@ -220,7 +223,12 @@ class Classification:
          module = Datasets(importlib.import_module(meta.t.__module__))
          r.write(
              "- [%s](../df/%s/%s.html#%s)\n"
-             % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+             % (
+                 meta.name or meta.id,
+                 meta.repository.id,
+                 module.id,
+                 meta.id,
+             )
          )
  
      return r.getvalue()
@@ -326,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
          import shutil
  
          path = Path(config["site_dir"]) / "mainstyle.css"
-         with importlib.resources.open_binary(
-             "datamaestro.commands", "mainstyle.css"
-         ) as source, path.open("wb") as dest:
+         with (
+             importlib.resources.open_binary(
+                 "datamaestro.commands", "mainstyle.css"
+             ) as source,
+             path.open("wb") as dest,
+         ):
              shutil.copyfileobj(source, dest)
  
      def on_files(self, files, config):
datamaestro/context.py CHANGED
@@ -1,5 +1,5 @@
  from pathlib import Path
- from typing import Iterable, Iterator, Dict, Union
+ from typing import Iterable, Iterator, Dict, Optional, Union
  import importlib
  import os
  import hashlib
@@ -8,8 +8,7 @@ import inspect
  import json
  from abc import ABC, abstractmethod
  from experimaestro import Config
- import pkg_resources
- from experimaestro.compat import cached_property
+ from functools import cached_property
  from experimaestro.mkdocs.metaloader import Module
  from .utils import CachedFile, downloadURL
  from .settings import UserSettings, Settings
@@ -18,6 +17,22 @@ from typing import TYPE_CHECKING
  if TYPE_CHECKING:
      from datamaestro.definitions import AbstractDataset, DatasetWrapper
  
+ from importlib.metadata import (
+     entry_points as _entry_points,
+     version as _version,
+     PackageNotFoundError as _PackageNotFoundError,
+ )
+
+
+ def iter_entry_points(group, name=None):
+     """Yield entry points for a given group (and optional name) using importlib.metadata."""
+     eps = _entry_points()
+     selected = eps.select(group=group)
+     if name:
+         selected = [ep for ep in selected if ep.name == name]
+     for ep in selected:
+         yield ep
+
  
  class Compression:
      @staticmethod
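The `iter_entry_points` shim above keeps the old `pkg_resources.iter_entry_points(group, name)` calling convention while delegating to `importlib.metadata`, so the call sites below need minimal changes. As a design note, `EntryPoints.select` also accepts `name=` directly, so the manual filter is equivalent to `eps.select(group=group, name=name)`. A usage sketch (the entry-point name `"example"` is a placeholder, not a real plugin):

```python
# Iterate over all registered repositories
for ep in iter_entry_points("datamaestro.repositories"):
    print(ep.name)

# Look up a single repository by entry-point name ("example" is hypothetical)
matches = list(iter_entry_points("datamaestro.repositories", "example"))
if matches:
    repository = matches[0].load().instance()
```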
@@ -106,7 +121,7 @@ class Context:
  
      def repositories(self) -> Iterable["Repository"]:
          """Returns an iterator over repositories"""
-         for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+         for entry_point in iter_entry_points("datamaestro.repositories"):
              yield entry_point.load().instance()
  
      def repository(self, repositoryid):
@@ -114,10 +129,7 @@
              return None
  
          entry_points = [
-             x
-             for x in pkg_resources.iter_entry_points(
-                 "datamaestro.repositories", repositoryid
-             )
+             x for x in iter_entry_points("datamaestro.repositories", repositoryid)
          ]
          if not entry_points:
              raise Exception("No datasets repository named %s", repositoryid)
@@ -299,8 +311,7 @@ class BaseRepository(ABC):
          self.basedir = Path(p).parent
  
      @abstractmethod
-     def __iter__(self) -> Iterator["AbstractDataset"]:
-         ...
+     def __iter__(self) -> Iterator["AbstractDataset"]: ...
  
      def search(self, name: str):
          """Search for a dataset in the definitions"""
@@ -353,11 +364,9 @@ class Repository(BaseRepository):
  
      @classmethod
      def version(cls):
-         from pkg_resources import get_distribution, DistributionNotFound
-
          try:
-             return get_distribution(cls.__module__).version
-         except DistributionNotFound:
+             return _version(cls.__module__)
+         except _PackageNotFoundError:
              return None
  
      def __repr__(self):
@@ -423,16 +432,23 @@ def find_dataset(dataset_id: str):
      return AbstractDataset.find(dataset_id)
  
  
- def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
+ def prepare_dataset(
+     dataset_id: Union[str, "DatasetWrapper", Config],
+     context: Optional[Union[Context, Path]] = None,
+ ):
      """Find a dataset given its id and download the resources"""
      from .definitions import AbstractDataset, DatasetWrapper
  
+     match context:
+         case Path() | str():
+             context = Context(Path(context))
+
      if isinstance(dataset_id, DatasetWrapper):
          ds = dataset_id
      elif isinstance(dataset_id, Config):
          ds = dataset_id.__datamaestro_dataset__
      else:
-         ds = AbstractDataset.find(dataset_id)
+         ds = AbstractDataset.find(dataset_id, context=context)
  
      return ds.prepare(download=True)
  
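`prepare_dataset` now accepts an optional `context`: either an existing `Context` or a path-like value, which the `match` statement (Python 3.10+) wraps into a `Context`. Note that the `case Path() | str():` arm also accepts plain strings, even though the annotation only mentions `Context` and `Path`. A hedged usage sketch; the dataset id and directory below are placeholders:

```python
from pathlib import Path
from datamaestro import prepare_dataset

# As before: use the default context
ds = prepare_dataset("com.example.mydataset")  # placeholder id

# New in this version: point the data directory somewhere explicit
ds = prepare_dataset("com.example.mydataset", context=Path("/tmp/datamaestro-data"))
```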
datamaestro/data/ml.py CHANGED
@@ -1,4 +1,5 @@
  """Machine learning generic data formats"""
+
  from pathlib import Path
  from typing import Generic, TypeVar, Optional
  from experimaestro import Param, Meta
@@ -2,8 +2,11 @@
  # Main datamaestro functions and data models
  #
  
+ from __future__ import annotations
+
  import logging
  import inspect
+ import shutil
  from pathlib import Path
  from itertools import chain
  from abc import ABC, abstractmethod
@@ -21,8 +24,6 @@ from typing import (
      _GenericAlias,
  )
  from experimaestro import ( # noqa: F401 (re-exports)
-     argument,
-     constant,
      Param,
      Option,
      Config,
@@ -34,7 +35,100 @@ from experimaestro.core.types import Type # noqa: F401 (re-exports)
  if TYPE_CHECKING:
      from .data import Base, Dataset
      from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
-     from datamaestro.download import Download
+     from datamaestro.download import Download, Resource
+
+ # --- DAG utilities ---
+
+
+ def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+     """Topological sort of resources by their dependencies.
+
+     Args:
+         resources: Dict mapping resource names to Resource instances.
+
+     Returns:
+         List of resources in dependency order (dependencies first).
+
+     Raises:
+         ValueError: If a cycle is detected in the dependency graph.
+     """
+     visited: set[str] = set()
+     visiting: set[str] = set() # For cycle detection
+     result: list["Resource"] = []
+
+     def visit(resource: "Resource"):
+         if resource.name in visited:
+             return
+         if resource.name in visiting:
+             raise ValueError(
+                 f"Cycle detected in resource dependencies involving {resource.name}"
+             )
+
+         visiting.add(resource.name)
+         for dep in resource.dependencies:
+             visit(dep)
+         visiting.discard(resource.name)
+         visited.add(resource.name)
+         result.append(resource)
+
+     for resource in resources.values():
+         visit(resource)
+
+     return result
+
+
+ def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+     """Compute the dependents (inverse edges) for all resources."""
+     # Clear existing dependents
+     for resource in resources.values():
+         resource._dependents = []
+
+     # Build inverse edges
+     for resource in resources.values():
+         for dep in resource.dependencies:
+             if resource not in dep._dependents:
+                 dep._dependents.append(resource)
+
+
+ def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+     """Scan class attributes for Resource instances and bind them.
+
+     This is called when a class-based dataset is processed by the
+     @dataset decorator. It detects Resource instances defined as
+     class attributes and binds them to the dataset.
+
+     Args:
+         cls: The dataset class to scan.
+         dataset_wrapper: The AbstractDataset to bind resources to.
+     """
+     from datamaestro.download import Resource
+
+     for attr_name, attr_value in vars(cls).items():
+         if isinstance(attr_value, Resource):
+             attr_value.bind(attr_name, dataset_wrapper)
+
+     # Build the dependency DAG
+     _compute_dependents(dataset_wrapper.resources)
+
+     # Validate: topological sort will raise on cycles
+     dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+ def _delete_path(path: Path) -> None:
+     """Delete a file or directory at path."""
+     if path.exists():
+         if path.is_dir():
+             shutil.rmtree(path)
+         else:
+             path.unlink()
+
+
+ def _move_path(src: Path, dst: Path) -> None:
+     """Move a file or directory from src to dst."""
+     if src.exists():
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.move(str(src), str(dst))
+
  
  # --- Objects holding information into classes/function
  
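`topological_sort` only assumes each resource exposes `name` and `dependencies`, so its contract is easy to check with a stand-in class (a sketch; `FakeResource` is not part of datamaestro):

```python
from dataclasses import dataclass, field


@dataclass
class FakeResource:
    name: str
    dependencies: list = field(default_factory=list)


archive = FakeResource("archive")
extracted = FakeResource("extracted", [archive])
index = FakeResource("index", [extracted])

# Dependencies always come before their dependents
order = topological_sort({r.name: r for r in (index, archive, extracted)})
assert [r.name for r in order] == ["archive", "extracted", "index"]

# A dependency cycle raises ValueError
a, b = FakeResource("a"), FakeResource("b")
a.dependencies, b.dependencies = [b], [a]
try:
    topological_sort({"a": a, "b": b})
except ValueError as err:
    print(err)  # Cycle detected in resource dependencies involving ...
```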
@@ -183,8 +277,7 @@ class AbstractDataset(AbstractData):
          self.hooks[hookname].append(hook)
  
      @abstractmethod
-     def _prepare(self) -> "Base":
-         ...
+     def _prepare(self) -> "Base": ...
  
      def format(self, encoder: str) -> str:
          s = self.prepare()
@@ -204,6 +297,14 @@ class AbstractDataset(AbstractData):
          from datamaestro.data import Base
  
          if isinstance(data, Base):
+             try:
+                 if data.id:
+                     # There is already an ID, skip this
+                     # and the descendants
+                     return
+             except KeyError:
+                 pass
+
              if self.repository is None:
                  data.id = id
              else:
@@ -213,25 +314,136 @@ class AbstractDataset(AbstractData):
              self.setDataIDs(value, f"{id}.{key}")
  
      def download(self, force=False):
-         """Download all the necessary resources"""
-         success = True
+         """Download all the necessary resources.
+
+         Uses DAG-based topological ordering and the two-path system:
+         1. Acquire exclusive lock (.state.lock)
+         2. Resource writes to transient_path (under .downloads/)
+         3. Framework moves transient_path → path (main folder)
+         4. State marked COMPLETE
+         5. Transient dependencies cleaned up eagerly
+         6. .downloads/ directory removed after all resources complete
+         7. Release lock
+         """
+         import fcntl
+
+         from datamaestro.download import ResourceState
+
          self.prepare()
-         logging.info("Materializing %d resources", len(self.ordered_resources))
+         logging.info(
+             "Materializing %d resources",
+             len(self.ordered_resources),
+         )
+
+         self.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = self.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         try:
+             fcntl.flock(lock_file, fcntl.LOCK_EX)
+             success = self._download_locked(force, ResourceState)
+         finally:
+             fcntl.flock(lock_file, fcntl.LOCK_UN)
+             lock_file.close()
+
+         return success
+
+     def _download_locked(self, force, ResourceState):
+         """Inner download logic, called while holding .state.lock."""
+         success = True
+
          for resource in self.ordered_resources:
+             # Step 1: Check state
+             current_state = resource.state
+
+             if current_state == ResourceState.COMPLETE and not force:
+                 # Verify files are actually present on disk
+                 if resource.has_files() and not resource.path.exists():
+                     logging.warning(
+                         "Resource %s marked COMPLETE but files "
+                         "missing at %s — re-downloading",
+                         resource.name,
+                         resource.path,
+                     )
+                     resource.state = ResourceState.NONE
+                     current_state = ResourceState.NONE
+                 else:
+                     continue
+
+             # Adopt pre-existing files (old downloads without state file)
+             if (
+                 current_state == ResourceState.NONE
+                 and not force
+                 and resource.has_files()
+                 and resource.path.exists()
+             ):
+                 logging.info(
+                     "Resource %s already exists at %s — marking COMPLETE",
+                     resource.name,
+                     resource.path,
+                 )
+                 resource.state = ResourceState.COMPLETE
+                 continue
+
+             if current_state == ResourceState.PARTIAL:
+                 if not resource.can_recover:
+                     _delete_path(resource.transient_path)
+                     resource.state = ResourceState.NONE
+
+             # Verify all dependencies are COMPLETE
+             for dep in resource.dependencies:
+                 if dep.state != ResourceState.COMPLETE:
+                     logging.error(
+                         "Dependency %s of %s is not COMPLETE",
+                         dep.name,
+                         resource.name,
+                     )
+                     return False
+
+             # Step 2-4: Download with framework-managed state
              try:
-                 resource.download(force)
+                 resource.download(force=force)
+
+                 # Move transient -> final, mark COMPLETE
+                 if resource.has_files():
+                     _move_path(resource.transient_path, resource.path)
+                 resource.state = ResourceState.COMPLETE
+
              except Exception:
                  logging.error("Could not download resource %s", resource)
                  traceback.print_exc()
+
+                 # Handle PARTIAL state
+                 if resource.has_files() and resource.transient_path.exists():
+                     if resource.can_recover:
+                         resource.state = ResourceState.PARTIAL
+                     else:
+                         _delete_path(resource.transient_path)
+                         resource.state = ResourceState.NONE
                  success = False
                  break
+
+             # Step 5: Eager transient cleanup
+             for dep in resource.dependencies:
+                 if dep.transient and all(
+                     d.state == ResourceState.COMPLETE for d in dep.dependents
+                 ):
+                     dep.cleanup()
+
+         # Step 6: Remove .downloads/ directory after success
+         if success:
+             downloads_dir = self.datapath / ".downloads"
+             if downloads_dir.is_dir():
+                 shutil.rmtree(downloads_dir)
+
          return success
  
      @staticmethod
-     def find(name: str) -> "DataDefinition":
+     def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
          """Find a dataset given its name"""
          from datamaestro.context import Context # noqa: F811
  
+         context = Context.instance() if context is None else context
+
          logging.debug("Searching dataset %s", name)
          for repository in Context.instance().repositories():
              logging.debug("Searching dataset %s in %s", name, repository)
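The locking added to `download` is the standard `fcntl` advisory-lock recipe: the `.state.lock` file carries no data, `LOCK_EX` blocks until any concurrent downloader finishes, and the unlock happens in a `finally` block. The same pattern in isolation (Unix-only, as `fcntl` does not exist on Windows):

```python
import fcntl
from pathlib import Path


def with_exclusive_lock(lock_path: Path, action):
    """Run `action` while holding an exclusive advisory lock on lock_path."""
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    with lock_path.open("w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks until acquired
        try:
            return action()
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)


# e.g. with_exclusive_lock(Path("/tmp/data/.state.lock"), lambda: print("downloading"))
```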
@@ -303,8 +515,7 @@ class DatasetWrapper(AbstractDataset):
          # Computes an ID
          assert (
              # id is empty string = use the module id
-             components[0]
-             == "config"
+             components[0] == "config"
          ), (
              "A @dataset without `id` should be in the "
              f".config module (not {t.__module__})"
@@ -452,14 +663,24 @@ class DatasetWrapper(AbstractDataset):
  
          return path
  
-     def hasfiles(self) -> bool:
-         """Returns whether this dataset has files or only includes references"""
+     def has_files(self) -> bool:
+         """Returns whether this dataset has files or only includes references."""
          for resource in self.resources.values():
-             if resource.hasfiles():
+             if resource.has_files():
                  return True
-
          return False
  
+     def hasfiles(self) -> bool:
+         """Deprecated: use has_files() instead."""
+         import warnings
+
+         warnings.warn(
+             "hasfiles() is deprecated, use has_files()",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.has_files()
+
  
  # --- Annotations
  
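The `hasfiles` shim follows the usual `warnings` deprecation pattern; `stacklevel=2` attributes the warning to the caller rather than to the shim itself. Because Python filters `DeprecationWarning` by default outside `__main__` and test runners, surfacing it explicitly looks like this (a sketch, where `dataset` stands for any `DatasetWrapper` instance):

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dataset.hasfiles()  # `dataset` is a placeholder DatasetWrapper instance

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```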
@@ -475,9 +696,9 @@ class DataAnnotation:
              self.annotate(object.__datamaestro__)
          else:
              # With configuration objects, add a __datamaestro__ member to the class
-             assert issubclass(
-                 object, Config
-             ), f"{object} cannot be annotated (only dataset or data definitions)"
+             assert issubclass(object, Config), (
+                 f"{object} cannot be annotated (only dataset or data definitions)"
+             )
              if "__datamaestro__" not in object.__dict__:
                  object.__datamaestro__ = AbstractData()
              self.annotate(object.__datamaestro__)
@@ -551,7 +772,9 @@ datatasks = DataTagging(lambda d: d.tasks)
  
  class metadata:
      def __init__(
-         self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+         self,
+         tags: Union[str, List[str]] = None,
+         tasks: Union[str, List[str]] = None,
      ):
          pass
  
@@ -625,7 +848,10 @@ class dataset:
              pass
          dw = DatasetWrapper(self, t)
          t.__dataset__ = dw
+
+         # For class-based datasets, scan for Resource class attributes
          if inspect.isclass(t) and issubclass(t, Base):
+             _bind_class_resources(t, dw)
              return t
          return dw
  