datamaestro 1.5.0-py3-none-any.whl → 1.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__init__.py CHANGED
@@ -7,7 +7,6 @@ from .context import (
      prepare_dataset,
  )
  
- from pkg_resources import get_distribution, DistributionNotFound
  from .definitions import dataset, metadata
  from .data import Base
- from .version import version, version_tuple
+ from .version import __version__
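Together with this hunk, version lookup moves from `pkg_resources` to a generated `datamaestro.version` module. For code that still needs to resolve an installed package's version at runtime, the standard-library `importlib.metadata` equivalent looks like this (a minimal sketch, independent of how `version.py` itself is produced):

```python
from importlib.metadata import version, PackageNotFoundError

try:
    # Replaces pkg_resources.get_distribution("datamaestro").version
    __version__ = version("datamaestro")
except PackageNotFoundError:
    # Package is not installed (e.g., running from a source checkout)
    __version__ = "unknown"
```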
datamaestro/__main__.py CHANGED
@@ -1,20 +1,22 @@
  #!/usr/bin/env python3
- # flake8: noqa: T201
+ # ruff: noqa: T201
  
+ from importlib.metadata import entry_points
  import sys
  import logging
  from functools import update_wrapper
  import traceback as tb
- import pkg_resources
  import re
  from pathlib import Path
  import shutil
- from .context import Context
  from typing import Set
- import datamaestro
+ from urllib.parse import urlparse
  
  import click
  
+ import datamaestro
+ from .context import Context
+
  logging.basicConfig(level=logging.INFO)
  
  
@@ -38,7 +40,7 @@ def pass_cfg(f):
  # Get all the available repositories
  
  REPOSITORIES = {}
- for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+ for entry_point in entry_points(group="datamaestro.repositories"):
      REPOSITORIES[entry_point.name] = entry_point
  
  
@@ -60,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
      "--traceback", is_flag=True, help="Display traceback if an exception occurs"
  )
  @click.option(
-     "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
+     "--data",
+     type=Path,
+     help="Directory containing datasets",
+     default=Context.MAINDIR,
  )
  @click.pass_context
  def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -207,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
  # --- Create a dataset
  
  DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
- from urllib.parse import urlparse
  
  
  def dataset_id_check(ctx, param, value):
@@ -159,7 +159,10 @@ def document(match):
      try:
          object = getattr(module, name)
      except Exception:
-         return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+         return "<div class='error'>Cannot find %s in %s</div>" % (
+             name,
+             modulename,
+         )
  
      if ismodule(object):
          return "\n\n".join(
@@ -220,7 +223,12 @@ class Classification:
          module = Datasets(importlib.import_module(meta.t.__module__))
          r.write(
              "- [%s](../df/%s/%s.html#%s)\n"
-             % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+             % (
+                 meta.name or meta.id,
+                 meta.repository.id,
+                 module.id,
+                 meta.id,
+             )
          )
  
      return r.getvalue()
@@ -326,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
          import shutil
  
          path = Path(config["site_dir"]) / "mainstyle.css"
-         with importlib.resources.open_binary(
-             "datamaestro.commands", "mainstyle.css"
-         ) as source, path.open("wb") as dest:
+         with (
+             importlib.resources.open_binary(
+                 "datamaestro.commands", "mainstyle.css"
+             ) as source,
+             path.open("wb") as dest,
+         ):
              shutil.copyfileobj(source, dest)
  
      def on_files(self, files, config):
datamaestro/context.py CHANGED
@@ -1,5 +1,5 @@
  from pathlib import Path
- from typing import Iterable, Iterator, Dict, Union
+ from typing import Iterable, Iterator, Dict, Optional, Union
  import importlib
  import os
  import hashlib
@@ -8,8 +8,7 @@ import inspect
  import json
  from abc import ABC, abstractmethod
  from experimaestro import Config
- import pkg_resources
- from experimaestro.compat import cached_property
+ from functools import cached_property
  from experimaestro.mkdocs.metaloader import Module
  from .utils import CachedFile, downloadURL
  from .settings import UserSettings, Settings
@@ -18,6 +17,22 @@ from typing import TYPE_CHECKING
  if TYPE_CHECKING:
      from datamaestro.definitions import AbstractDataset, DatasetWrapper
  
+ from importlib.metadata import (
+     entry_points as _entry_points,
+     version as _version,
+     PackageNotFoundError as _PackageNotFoundError,
+ )
+
+
+ def iter_entry_points(group, name=None):
+     """Yield entry points for a given group (and optional name) using importlib.metadata."""
+     eps = _entry_points()
+     selected = eps.select(group=group)
+     if name:
+         selected = [ep for ep in selected if ep.name == name]
+     for ep in selected:
+         yield ep
+
  
  class Compression:
      @staticmethod
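The `iter_entry_points` shim above keeps the old `pkg_resources.iter_entry_points(group, name)` calling convention while delegating to `importlib.metadata`, so the call sites below need minimal changes. As a design note, `EntryPoints.select` also accepts `name=` directly, so the manual filter is equivalent to `eps.select(group=group, name=name)`. A usage sketch (the entry-point name `"example"` is a placeholder, not a real plugin):

```python
# Iterate over all registered repositories
for ep in iter_entry_points("datamaestro.repositories"):
    print(ep.name)

# Look up a single repository by entry-point name ("example" is hypothetical)
matches = list(iter_entry_points("datamaestro.repositories", "example"))
if matches:
    repository = matches[0].load().instance()
```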
@@ -106,7 +121,7 @@ class Context:
  
      def repositories(self) -> Iterable["Repository"]:
          """Returns an iterator over repositories"""
-         for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+         for entry_point in iter_entry_points("datamaestro.repositories"):
              yield entry_point.load().instance()
  
      def repository(self, repositoryid):
@@ -114,10 +129,7 @@
              return None
  
          entry_points = [
-             x
-             for x in pkg_resources.iter_entry_points(
-                 "datamaestro.repositories", repositoryid
-             )
+             x for x in iter_entry_points("datamaestro.repositories", repositoryid)
          ]
          if not entry_points:
              raise Exception("No datasets repository named %s", repositoryid)
@@ -299,8 +311,7 @@ class BaseRepository(ABC):
          self.basedir = Path(p).parent
  
      @abstractmethod
-     def __iter__(self) -> Iterator["AbstractDataset"]:
-         ...
+     def __iter__(self) -> Iterator["AbstractDataset"]: ...
  
      def search(self, name: str):
          """Search for a dataset in the definitions"""
@@ -353,11 +364,9 @@ class Repository(BaseRepository):
  
      @classmethod
      def version(cls):
-         from pkg_resources import get_distribution, DistributionNotFound
-
          try:
-             return get_distribution(cls.__module__).version
-         except DistributionNotFound:
+             return _version(cls.__module__)
+         except _PackageNotFoundError:
              return None
  
      def __repr__(self):
@@ -423,16 +432,23 @@ def find_dataset(dataset_id: str):
      return AbstractDataset.find(dataset_id)
  
  
- def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
+ def prepare_dataset(
+     dataset_id: Union[str, "DatasetWrapper", Config],
+     context: Optional[Union[Context, Path]] = None,
+ ):
      """Find a dataset given its id and download the resources"""
      from .definitions import AbstractDataset, DatasetWrapper
  
+     match context:
+         case Path() | str():
+             context = Context(Path(context))
+
      if isinstance(dataset_id, DatasetWrapper):
          ds = dataset_id
      elif isinstance(dataset_id, Config):
          ds = dataset_id.__datamaestro_dataset__
      else:
-         ds = AbstractDataset.find(dataset_id)
+         ds = AbstractDataset.find(dataset_id, context=context)
  
      return ds.prepare(download=True)
  
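`prepare_dataset` now accepts an optional `context`: either an existing `Context` or a path-like value, which the `match` statement (Python 3.10+) wraps into a `Context`. Note that the `case Path() | str():` arm also accepts plain strings, even though the annotation only mentions `Context` and `Path`. A hedged usage sketch; the dataset id and directory below are placeholders:

```python
from pathlib import Path
from datamaestro import prepare_dataset

# As before: use the default context
ds = prepare_dataset("com.example.mydataset")  # placeholder id

# New in this version: point the data directory somewhere explicit
ds = prepare_dataset("com.example.mydataset", context=Path("/tmp/datamaestro-data"))
```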
datamaestro/data/ml.py CHANGED
@@ -1,4 +1,5 @@
  """Machine learning generic data formats"""
+
  from pathlib import Path
  from typing import Generic, TypeVar, Optional
  from experimaestro import Param, Meta
@@ -2,8 +2,11 @@
  # Main datamaestro functions and data models
  #
  
+ from __future__ import annotations
+
  import logging
  import inspect
+ import shutil
  from pathlib import Path
  from itertools import chain
  from abc import ABC, abstractmethod
@@ -21,8 +24,6 @@ from typing import (
      _GenericAlias,
  )
  from experimaestro import ( # noqa: F401 (re-exports)
-     argument,
-     constant,
      Param,
      Option,
      Config,
@@ -34,7 +35,100 @@ from experimaestro.core.types import Type # noqa: F401 (re-exports)
  if TYPE_CHECKING:
      from .data import Base, Dataset
      from .context import Repository, Context, DatafolderPath # noqa: F401 (re-exports)
-     from datamaestro.download import Download
+     from datamaestro.download import Download, Resource
+
+ # --- DAG utilities ---
+
+
+ def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+     """Topological sort of resources by their dependencies.
+
+     Args:
+         resources: Dict mapping resource names to Resource instances.
+
+     Returns:
+         List of resources in dependency order (dependencies first).
+
+     Raises:
+         ValueError: If a cycle is detected in the dependency graph.
+     """
+     visited: set[str] = set()
+     visiting: set[str] = set() # For cycle detection
+     result: list["Resource"] = []
+
+     def visit(resource: "Resource"):
+         if resource.name in visited:
+             return
+         if resource.name in visiting:
+             raise ValueError(
+                 f"Cycle detected in resource dependencies involving {resource.name}"
+             )
+
+         visiting.add(resource.name)
+         for dep in resource.dependencies:
+             visit(dep)
+         visiting.discard(resource.name)
+         visited.add(resource.name)
+         result.append(resource)
+
+     for resource in resources.values():
+         visit(resource)
+
+     return result
+
+
+ def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+     """Compute the dependents (inverse edges) for all resources."""
+     # Clear existing dependents
+     for resource in resources.values():
+         resource._dependents = []
+
+     # Build inverse edges
+     for resource in resources.values():
+         for dep in resource.dependencies:
+             if resource not in dep._dependents:
+                 dep._dependents.append(resource)
+
+
+ def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+     """Scan class attributes for Resource instances and bind them.
+
+     This is called when a class-based dataset is processed by the
+     @dataset decorator. It detects Resource instances defined as
+     class attributes and binds them to the dataset.
+
+     Args:
+         cls: The dataset class to scan.
+         dataset_wrapper: The AbstractDataset to bind resources to.
+     """
+     from datamaestro.download import Resource
+
+     for attr_name, attr_value in vars(cls).items():
+         if isinstance(attr_value, Resource):
+             attr_value.bind(attr_name, dataset_wrapper)
+
+     # Build the dependency DAG
+     _compute_dependents(dataset_wrapper.resources)
+
+     # Validate: topological sort will raise on cycles
+     dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+ def _delete_path(path: Path) -> None:
+     """Delete a file or directory at path."""
+     if path.exists():
+         if path.is_dir():
+             shutil.rmtree(path)
+         else:
+             path.unlink()
+
+
+ def _move_path(src: Path, dst: Path) -> None:
+     """Move a file or directory from src to dst."""
+     if src.exists():
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.move(str(src), str(dst))
+
  
  # --- Objects holding information into classes/function
  
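`topological_sort` only assumes each resource exposes `name` and `dependencies`, so its contract is easy to check with a stand-in class (a sketch; `FakeResource` is not part of datamaestro):

```python
from dataclasses import dataclass, field


@dataclass
class FakeResource:
    name: str
    dependencies: list = field(default_factory=list)


archive = FakeResource("archive")
extracted = FakeResource("extracted", [archive])
index = FakeResource("index", [extracted])

# Dependencies always come before their dependents
order = topological_sort({r.name: r for r in (index, archive, extracted)})
assert [r.name for r in order] == ["archive", "extracted", "index"]

# A dependency cycle raises ValueError
a, b = FakeResource("a"), FakeResource("b")
a.dependencies, b.dependencies = [b], [a]
try:
    topological_sort({"a": a, "b": b})
except ValueError as err:
    print(err)  # Cycle detected in resource dependencies involving ...
```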
@@ -183,8 +277,7 @@ class AbstractDataset(AbstractData):
          self.hooks[hookname].append(hook)
  
      @abstractmethod
-     def _prepare(self) -> "Base":
-         ...
+     def _prepare(self) -> "Base": ...
  
      def format(self, encoder: str) -> str:
          s = self.prepare()
@@ -204,6 +297,14 @@ class AbstractDataset(AbstractData):
          from datamaestro.data import Base
  
          if isinstance(data, Base):
+             try:
+                 if data.id:
+                     # There is already an ID, skip this
+                     # and the descendants
+                     return
+             except KeyError:
+                 pass
+
              if self.repository is None:
                  data.id = id
              else:
@@ -213,25 +314,136 @@ class AbstractDataset(AbstractData):
              self.setDataIDs(value, f"{id}.{key}")
  
      def download(self, force=False):
-         """Download all the necessary resources"""
-         success = True
+         """Download all the necessary resources.
+
+         Uses DAG-based topological ordering and the two-path system:
+         1. Acquire exclusive lock (.state.lock)
+         2. Resource writes to transient_path (under .downloads/)
+         3. Framework moves transient_path → path (main folder)
+         4. State marked COMPLETE
+         5. Transient dependencies cleaned up eagerly
+         6. .downloads/ directory removed after all resources complete
+         7. Release lock
+         """
+         import fcntl
+
+         from datamaestro.download import ResourceState
+
          self.prepare()
-         logging.info("Materializing %d resources", len(self.ordered_resources))
+         logging.info(
+             "Materializing %d resources",
+             len(self.ordered_resources),
+         )
+
+         self.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = self.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         try:
+             fcntl.flock(lock_file, fcntl.LOCK_EX)
+             success = self._download_locked(force, ResourceState)
+         finally:
+             fcntl.flock(lock_file, fcntl.LOCK_UN)
+             lock_file.close()
+
+         return success
+
+     def _download_locked(self, force, ResourceState):
+         """Inner download logic, called while holding .state.lock."""
+         success = True
+
          for resource in self.ordered_resources:
+             # Step 1: Check state
+             current_state = resource.state
+
+             if current_state == ResourceState.COMPLETE and not force:
+                 # Verify files are actually present on disk
+                 if resource.has_files() and not resource.path.exists():
+                     logging.warning(
+                         "Resource %s marked COMPLETE but files "
+                         "missing at %s — re-downloading",
+                         resource.name,
+                         resource.path,
+                     )
+                     resource.state = ResourceState.NONE
+                     current_state = ResourceState.NONE
+                 else:
+                     continue
+
+             # Adopt pre-existing files (old downloads without state file)
+             if (
+                 current_state == ResourceState.NONE
+                 and not force
+                 and resource.has_files()
+                 and resource.path.exists()
+             ):
+                 logging.info(
+                     "Resource %s already exists at %s — marking COMPLETE",
+                     resource.name,
+                     resource.path,
+                 )
+                 resource.state = ResourceState.COMPLETE
+                 continue
+
+             if current_state == ResourceState.PARTIAL:
+                 if not resource.can_recover:
+                     _delete_path(resource.transient_path)
+                     resource.state = ResourceState.NONE
+
+             # Verify all dependencies are COMPLETE
+             for dep in resource.dependencies:
+                 if dep.state != ResourceState.COMPLETE:
+                     logging.error(
+                         "Dependency %s of %s is not COMPLETE",
+                         dep.name,
+                         resource.name,
+                     )
+                     return False
+
+             # Step 2-4: Download with framework-managed state
              try:
-                 resource.download(force)
+                 resource.download(force=force)
+
+                 # Move transient -> final, mark COMPLETE
+                 if resource.has_files():
+                     _move_path(resource.transient_path, resource.path)
+                 resource.state = ResourceState.COMPLETE
+
              except Exception:
                  logging.error("Could not download resource %s", resource)
                  traceback.print_exc()
+
+                 # Handle PARTIAL state
+                 if resource.has_files() and resource.transient_path.exists():
+                     if resource.can_recover:
+                         resource.state = ResourceState.PARTIAL
+                     else:
+                         _delete_path(resource.transient_path)
+                         resource.state = ResourceState.NONE
                  success = False
                  break
+
+             # Step 5: Eager transient cleanup
+             for dep in resource.dependencies:
+                 if dep.transient and all(
+                     d.state == ResourceState.COMPLETE for d in dep.dependents
+                 ):
+                     dep.cleanup()
+
+         # Step 6: Remove .downloads/ directory after success
+         if success:
+             downloads_dir = self.datapath / ".downloads"
+             if downloads_dir.is_dir():
+                 shutil.rmtree(downloads_dir)
+
          return success
  
      @staticmethod
-     def find(name: str) -> "DataDefinition":
+     def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
          """Find a dataset given its name"""
          from datamaestro.context import Context # noqa: F811
  
+         context = Context.instance() if context is None else context
+
          logging.debug("Searching dataset %s", name)
          for repository in Context.instance().repositories():
              logging.debug("Searching dataset %s in %s", name, repository)
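The locking added to `download` is the standard `fcntl` advisory-lock recipe: the `.state.lock` file carries no data, `LOCK_EX` blocks until any concurrent downloader finishes, and the unlock happens in a `finally` block. The same pattern in isolation (Unix-only, as `fcntl` does not exist on Windows):

```python
import fcntl
from pathlib import Path


def with_exclusive_lock(lock_path: Path, action):
    """Run `action` while holding an exclusive advisory lock on lock_path."""
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    with lock_path.open("w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks until acquired
        try:
            return action()
        finally:
            fcntl.flock(lock_file, fcntl.LOCK_UN)


# e.g. with_exclusive_lock(Path("/tmp/data/.state.lock"), lambda: print("downloading"))
```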
@@ -303,8 +515,7 @@ class DatasetWrapper(AbstractDataset):
          # Computes an ID
          assert (
              # id is empty string = use the module id
-             components[0]
-             == "config"
+             components[0] == "config"
          ), (
              "A @dataset without `id` should be in the "
              f".config module (not {t.__module__})"
@@ -452,14 +663,24 @@ class DatasetWrapper(AbstractDataset):
  
          return path
  
-     def hasfiles(self) -> bool:
-         """Returns whether this dataset has files or only includes references"""
+     def has_files(self) -> bool:
+         """Returns whether this dataset has files or only includes references."""
          for resource in self.resources.values():
-             if resource.hasfiles():
+             if resource.has_files():
                  return True
-
          return False
  
+     def hasfiles(self) -> bool:
+         """Deprecated: use has_files() instead."""
+         import warnings
+
+         warnings.warn(
+             "hasfiles() is deprecated, use has_files()",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.has_files()
+
  
  # --- Annotations
  
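The `hasfiles` shim follows the usual `warnings` deprecation pattern; `stacklevel=2` attributes the warning to the caller rather than to the shim itself. Because Python filters `DeprecationWarning` by default outside `__main__` and test runners, surfacing it explicitly looks like this (a sketch, where `dataset` stands for any `DatasetWrapper` instance):

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dataset.hasfiles()  # `dataset` is a placeholder DatasetWrapper instance

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```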
@@ -475,9 +696,9 @@ class DataAnnotation:
              self.annotate(object.__datamaestro__)
          else:
              # With configuration objects, add a __datamaestro__ member to the class
-             assert issubclass(
-                 object, Config
-             ), f"{object} cannot be annotated (only dataset or data definitions)"
+             assert issubclass(object, Config), (
+                 f"{object} cannot be annotated (only dataset or data definitions)"
+             )
              if "__datamaestro__" not in object.__dict__:
                  object.__datamaestro__ = AbstractData()
              self.annotate(object.__datamaestro__)
@@ -551,7 +772,9 @@ datatasks = DataTagging(lambda d: d.tasks)
  
  class metadata:
      def __init__(
-         self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+         self,
+         tags: Union[str, List[str]] = None,
+         tasks: Union[str, List[str]] = None,
      ):
          pass
  
@@ -625,7 +848,10 @@ class dataset:
              pass
          dw = DatasetWrapper(self, t)
          t.__dataset__ = dw
+
+         # For class-based datasets, scan for Resource class attributes
          if inspect.isclass(t) and issubclass(t, Base):
+             _bind_class_resources(t, dw)
              return t
          return dw
  