datamaestro 1.6.2__py3-none-any.whl → 1.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__main__.py CHANGED
@@ -1,5 +1,5 @@
  #!/usr/bin/env python3
- # flake8: noqa: T201
+ # ruff: noqa: T201

  from importlib.metadata import entry_points
  import sys
@@ -9,12 +9,14 @@ import traceback as tb
  import re
  from pathlib import Path
  import shutil
- from .context import Context
  from typing import Set
- import datamaestro
+ from urllib.parse import urlparse

  import click

+ import datamaestro
+ from .context import Context
+
  logging.basicConfig(level=logging.INFO)


@@ -60,7 +62,10 @@ for entry_point in entry_points(group="datamaestro.repositories"):
      "--traceback", is_flag=True, help="Display traceback if an exception occurs"
  )
  @click.option(
-     "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
+     "--data",
+     type=Path,
+     help="Directory containing datasets",
+     default=Context.MAINDIR,
  )
  @click.pass_context
  def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -207,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
  # --- Create a dataset

  DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
- from urllib.parse import urlparse


  def dataset_id_check(ctx, param, value):
@@ -159,7 +159,10 @@ def document(match):
      try:
          object = getattr(module, name)
      except Exception:
-         return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+         return "<div class='error'>Cannot find %s in %s</div>" % (
+             name,
+             modulename,
+         )

      if ismodule(object):
          return "\n\n".join(
@@ -220,7 +223,12 @@ class Classification:
          module = Datasets(importlib.import_module(meta.t.__module__))
          r.write(
              "- [%s](../df/%s/%s.html#%s)\n"
-             % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+             % (
+                 meta.name or meta.id,
+                 meta.repository.id,
+                 module.id,
+                 meta.id,
+             )
          )

      return r.getvalue()
@@ -326,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
          import shutil

          path = Path(config["site_dir"]) / "mainstyle.css"
-         with importlib.resources.open_binary(
-             "datamaestro.commands", "mainstyle.css"
-         ) as source, path.open("wb") as dest:
+         with (
+             importlib.resources.open_binary(
+                 "datamaestro.commands", "mainstyle.css"
+             ) as source,
+             path.open("wb") as dest,
+         ):
              shutil.copyfileobj(source, dest)

      def on_files(self, files, config):
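For reference, the rewritten `with` statement uses parenthesized context managers, a form Python accepts from 3.10 onward. A minimal sketch of the same pattern, with hypothetical file paths:

```python
import shutil
from pathlib import Path

src = Path("source.css")  # hypothetical input
dst = Path("copy.css")    # hypothetical output
src.write_text("body { margin: 0 }")

# Both files are opened together; on exit they are closed in reverse
# order, exactly as with the one-line `with a as x, b as y:` form.
with (
    src.open("rb") as source,
    dst.open("wb") as dest,
):
    shutil.copyfileobj(source, dest)
```

Note that `importlib.resources.open_binary` itself has been deprecated since Python 3.11 in favor of `importlib.resources.files(...)`, so this spot may see another rewrite later.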
datamaestro/data/ml.py CHANGED
@@ -1,4 +1,5 @@
  """Machine learning generic data formats"""
+
  from pathlib import Path
  from typing import Generic, TypeVar, Optional
  from experimaestro import Param, Meta
@@ -2,8 +2,12 @@
  # Main datamaestro functions and data models
  #

+ from __future__ import annotations
+
  import logging
  import inspect
+ import re as _re
+ import shutil
  from pathlib import Path
  from itertools import chain
  from abc import ABC, abstractmethod
@@ -30,9 +34,117 @@ from typing import Type as TypingType  # noqa: F401 (re-exports)
  from experimaestro.core.types import Type  # noqa: F401 (re-exports)

  if TYPE_CHECKING:
-     from .data import Base, Dataset
+     from .data import Base
      from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
-     from datamaestro.download import Download
+     from datamaestro.download import Download, Resource
+
+ # --- DAG utilities ---
+
+
+ def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+     """Topological sort of resources by their dependencies.
+
+     Args:
+         resources: Dict mapping resource names to Resource instances.
+
+     Returns:
+         List of resources in dependency order (dependencies first).
+
+     Raises:
+         ValueError: If a cycle is detected in the dependency graph.
+     """
+     visited: set[str] = set()
+     visiting: set[str] = set()  # For cycle detection
+     result: list["Resource"] = []
+
+     def visit(resource: "Resource"):
+         if resource.name in visited:
+             return
+         if resource.name in visiting:
+             raise ValueError(
+                 f"Cycle detected in resource dependencies involving {resource.name}"
+             )
+
+         visiting.add(resource.name)
+         for dep in resource.dependencies:
+             visit(dep)
+         visiting.discard(resource.name)
+         visited.add(resource.name)
+         result.append(resource)
+
+     for resource in resources.values():
+         visit(resource)
+
+     return result
+
+
+ def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+     """Compute the dependents (inverse edges) for all resources."""
+     # Clear existing dependents
+     for resource in resources.values():
+         resource._dependents = []
+
+     # Build inverse edges
+     for resource in resources.values():
+         for dep in resource.dependencies:
+             if resource not in dep._dependents:
+                 dep._dependents.append(resource)
+
+
+ def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+     """Scan class attributes for Resource instances and bind them.
+
+     This is called when a class-based dataset is processed by the
+     @dataset decorator. It detects Resource instances defined as
+     class attributes and binds them to the dataset.
+
+     Args:
+         cls: The dataset class to scan.
+         dataset_wrapper: The AbstractDataset to bind resources to.
+     """
+     from datamaestro.download import Resource
+
+     for attr_name, attr_value in vars(cls).items():
+         if isinstance(attr_value, Resource):
+             attr_value.bind(attr_name, dataset_wrapper)
+
+     # Build the dependency DAG
+     _compute_dependents(dataset_wrapper.resources)
+
+     # Validate: topological sort will raise on cycles
+     dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+ def _delete_path(path: Path) -> None:
+     """Delete a file or directory at path."""
+     if path.exists():
+         if path.is_dir():
+             shutil.rmtree(path)
+         else:
+             path.unlink()
+
+
+ def _move_path(src: Path, dst: Path) -> None:
+     """Move a file or directory from src to dst."""
+     if src.exists():
+         dst.parent.mkdir(parents=True, exist_ok=True)
+         shutil.move(str(src), str(dst))
+
+
+ _CAMEL_RE1 = _re.compile(r"([A-Z]+)([A-Z][a-z])")
+ _CAMEL_RE2 = _re.compile(r"([a-z0-9])([A-Z])")
+
+
+ def _camel_to_snake(name: str) -> str:
+     """Convert CamelCase to snake_case, then lowercase.
+
+     Examples: ProcessedMNIST -> processed_mnist, MyData -> my_data,
+     MNIST -> mnist, simple -> simple
+     """
+     s = _CAMEL_RE1.sub(r"\1_\2", name)
+     s = _CAMEL_RE2.sub(r"\1_\2", s)
+     return s.lower()
+

  # --- Objects holding information into classes/function

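The new `topological_sort` walks the dependency edges depth-first, using a `visiting` set to detect cycles. A minimal usage sketch, with a hypothetical stand-in class carrying just the two attributes the function reads (`name` and `dependencies`):

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class FakeResource:
    name: str
    dependencies: List["FakeResource"] = field(default_factory=list)

archive = FakeResource("archive")                 # no dependencies
extracted = FakeResource("extracted", [archive])  # needs the archive
index = FakeResource("index", [extracted])        # needs the extraction

# Assuming topological_sort from the patched module is in scope:
ordered = topological_sort({r.name: r for r in (index, archive, extracted)})
print([r.name for r in ordered])  # ['archive', 'extracted', 'index']

# A self-edge raises ValueError:
loop = FakeResource("loop")
loop.dependencies.append(loop)
# topological_sort({"loop": loop})  # ValueError: Cycle detected ...
```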
@@ -100,7 +212,12 @@ class DataDefinition(AbstractData):
          if components[0] == "datamaestro":
              longest_ix = 0

-         return repository, [s.lower() for s in components[(longest_ix + 1) :]]
+         parts = components[(longest_ix + 1) :]
+         # Module components: just lowercase
+         # Last component (class/function name): CamelCase → snake_case
+         if parts:
+             parts = [s.lower() for s in parts[:-1]] + [_camel_to_snake(parts[-1])]
+         return repository, parts

      def ancestors(self):
          ancestors = []
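With this change, a class-based dataset name contributes a snake_case component to the dataset id (e.g. a class named `ProcessedMNIST` yields `processed_mnist`) instead of a flat lowercase `processedmnist`. The two substitutions can be reproduced standalone:

```python
import re

_re1 = re.compile(r"([A-Z]+)([A-Z][a-z])")  # "HTTPServer" -> "HTTP_Server"
_re2 = re.compile(r"([a-z0-9])([A-Z])")     # "MyData"     -> "My_Data"

def camel_to_snake(name: str) -> str:
    s = _re1.sub(r"\1_\2", name)
    s = _re2.sub(r"\1_\2", s)
    return s.lower()

for name in ("ProcessedMNIST", "MyData", "MNIST", "simple"):
    print(f"{name} -> {camel_to_snake(name)}")
# ProcessedMNIST -> processed_mnist
# MyData -> my_data
# MNIST -> mnist
# simple -> simple
```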
@@ -218,18 +335,127 @@ class AbstractDataset(AbstractData):
              self.setDataIDs(value, f"{id}.{key}")

      def download(self, force=False):
-         """Download all the necessary resources"""
-         success = True
+         """Download all the necessary resources.
+
+         Uses DAG-based topological ordering and the two-path system:
+         1. Acquire exclusive lock (.state.lock)
+         2. Resource writes to transient_path (under .downloads/)
+         3. Framework moves transient_path → path (main folder)
+         4. State marked COMPLETE
+         5. Transient dependencies cleaned up eagerly
+         6. .downloads/ directory removed after all resources complete
+         7. Release lock
+         """
+         import fcntl
+
+         from datamaestro.download import ResourceState
+
          self.prepare()
-         logging.info("Materializing %d resources", len(self.ordered_resources))
+         logging.info(
+             "Materializing %d resources",
+             len(self.ordered_resources),
+         )
+
+         self.datapath.mkdir(parents=True, exist_ok=True)
+         lock_path = self.datapath / ".state.lock"
+         lock_file = lock_path.open("w")
+         try:
+             fcntl.flock(lock_file, fcntl.LOCK_EX)
+             success = self._download_locked(force, ResourceState)
+         finally:
+             fcntl.flock(lock_file, fcntl.LOCK_UN)
+             lock_file.close()
+
+         return success
+
+     def _download_locked(self, force, ResourceState):
+         """Inner download logic, called while holding .state.lock."""
+         success = True
+
          for resource in self.ordered_resources:
+             # Step 1: Check state
+             current_state = resource.state
+
+             if current_state == ResourceState.COMPLETE and not force:
+                 # Verify files are actually present on disk
+                 if resource.has_files() and not resource.path.exists():
+                     logging.warning(
+                         "Resource %s marked COMPLETE but files "
+                         "missing at %s — re-downloading",
+                         resource.name,
+                         resource.path,
+                     )
+                     resource.state = ResourceState.NONE
+                     current_state = ResourceState.NONE
+                 else:
+                     continue
+
+             # Adopt pre-existing files (old downloads without state file)
+             if (
+                 current_state == ResourceState.NONE
+                 and not force
+                 and resource.has_files()
+                 and resource.path.exists()
+             ):
+                 logging.info(
+                     "Resource %s already exists at %s — marking COMPLETE",
+                     resource.name,
+                     resource.path,
+                 )
+                 resource.state = ResourceState.COMPLETE
+                 continue
+
+             if current_state == ResourceState.PARTIAL:
+                 if not resource.can_recover:
+                     _delete_path(resource.transient_path)
+                     resource.state = ResourceState.NONE
+
+             # Verify all dependencies are COMPLETE
+             for dep in resource.dependencies:
+                 if dep.state != ResourceState.COMPLETE:
+                     logging.error(
+                         "Dependency %s of %s is not COMPLETE",
+                         dep.name,
+                         resource.name,
+                     )
+                     return False
+
+             # Step 2-4: Download with framework-managed state
              try:
-                 resource.download(force)
+                 resource.download(force=force)
+
+                 # Move transient -> final, mark COMPLETE
+                 if resource.has_files():
+                     _move_path(resource.transient_path, resource.path)
+                 resource.state = ResourceState.COMPLETE
+
              except Exception:
                  logging.error("Could not download resource %s", resource)
                  traceback.print_exc()
+
+                 # Handle PARTIAL state
+                 if resource.has_files() and resource.transient_path.exists():
+                     if resource.can_recover:
+                         resource.state = ResourceState.PARTIAL
+                     else:
+                         _delete_path(resource.transient_path)
+                         resource.state = ResourceState.NONE
                  success = False
                  break
+
+             # Step 5: Eager transient cleanup
+             for dep in resource.dependencies:
+                 if dep.transient and all(
+                     d.state == ResourceState.COMPLETE for d in dep.dependents
+                 ):
+                     dep.cleanup()
+
+         # Step 6: Remove .downloads/ directory after success
+         if success:
+             downloads_dir = self.datapath / ".downloads"
+             if downloads_dir.is_dir():
+                 shutil.rmtree(downloads_dir)
+
          return success

      @staticmethod
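Outside of datamaestro, the lock-and-publish skeleton of `download()` can be sketched in a few lines: take an exclusive advisory lock, write into a transient location, and move the result into place only after the write succeeds. `fcntl` is Unix-only, and all paths here are hypothetical:

```python
import fcntl
import shutil
from pathlib import Path

datapath = Path("/tmp/demo-dataset")  # hypothetical dataset folder
datapath.mkdir(parents=True, exist_ok=True)
transient = datapath / ".downloads" / "payload.txt"
final = datapath / "payload.txt"

with (datapath / ".state.lock").open("w") as lock_file:
    fcntl.flock(lock_file, fcntl.LOCK_EX)  # blocks concurrent downloaders
    try:
        transient.parent.mkdir(parents=True, exist_ok=True)
        transient.write_text("downloaded content")  # the step that may fail
        shutil.move(str(transient), str(final))     # publish on success
        shutil.rmtree(datapath / ".downloads")      # final cleanup (step 6)
    finally:
        fcntl.flock(lock_file, fcntl.LOCK_UN)
```

A crash before the move leaves only transient state behind, which the PARTIAL/`can_recover` logic above either resumes or discards on the next run.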
@@ -310,8 +536,7 @@ class DatasetWrapper(AbstractDataset):
          # Computes an ID
          assert (
              # id is empty string = use the module id
-             components[0]
-             == "config"
+             components[0] == "config"
          ), (
              "A @dataset without `id` should be in the "
              f".config module (not {t.__module__})"
@@ -390,6 +615,10 @@ class DatasetWrapper(AbstractDataset):
          if self.base is self.t:
              self.config = self.base.__create_dataset__(self)

+         elif hasattr(self.t, "__create_dataset__"):
+             # Class-based dataset with metadataset or different base
+             self.config = self.t.__create_dataset__(self)
+
          else:
              # Construct the object
              if self.as_prepare:
@@ -459,14 +688,24 @@ class DatasetWrapper(AbstractDataset):

          return path

-     def hasfiles(self) -> bool:
-         """Returns whether this dataset has files or only includes references"""
+     def has_files(self) -> bool:
+         """Returns whether this dataset has files or only includes references."""
          for resource in self.resources.values():
-             if resource.hasfiles():
+             if resource.has_files():
                  return True
-
          return False

+     def hasfiles(self) -> bool:
+         """Deprecated: use has_files() instead."""
+         import warnings
+
+         warnings.warn(
+             "hasfiles() is deprecated, use has_files()",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         return self.has_files()
+

  # --- Annotations

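The old `hasfiles()` spelling is kept as a shim rather than removed outright. The same pattern in isolation; `stacklevel=2` makes the warning point at the caller rather than at the shim itself:

```python
import warnings

class Example:
    def has_files(self) -> bool:
        return True

    def hasfiles(self) -> bool:
        """Deprecated: use has_files() instead."""
        warnings.warn(
            "hasfiles() is deprecated, use has_files()",
            DeprecationWarning,
            stacklevel=2,  # attribute the warning to the calling line
        )
        return self.has_files()

warnings.simplefilter("always", DeprecationWarning)
assert Example().hasfiles() is True  # emits a DeprecationWarning
```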
@@ -482,9 +721,9 @@ class DataAnnotation:
              self.annotate(object.__datamaestro__)
          else:
              # With configuration objects, add a __datamaestro__ member to the class
-             assert issubclass(
-                 object, Config
-             ), f"{object} cannot be annotated (only dataset or data definitions)"
+             assert issubclass(object, Config), (
+                 f"{object} cannot be annotated (only dataset or data definitions)"
+             )
              if "__datamaestro__" not in object.__dict__:
                  object.__datamaestro__ = AbstractData()
              self.annotate(object.__datamaestro__)
@@ -501,8 +740,8 @@ class DatasetAnnotation:
      def __call__(self, dataset: AbstractDataset):
          if isinstance(dataset, AbstractDataset):
              self.annotate(dataset)
-         elif issubclass(dataset, Dataset):
-             self.annotate(dataset.__datamaestro__)
+         elif hasattr(dataset, "__dataset__"):
+             self.annotate(dataset.__dataset__)
          else:
              raise RuntimeError(
                  f"Only datasets can be annotated with {self}, "
@@ -558,7 +797,9 @@ datatasks = DataTagging(lambda d: d.tasks)

  class metadata:
      def __init__(
-         self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+         self,
+         tags: Union[str, List[str]] = None,
+         tasks: Union[str, List[str]] = None,
      ):
          pass

@@ -632,7 +873,10 @@ class dataset:
              pass
          dw = DatasetWrapper(self, t)
          t.__dataset__ = dw
+
+         # For class-based datasets, scan for Resource class attributes
          if inspect.isclass(t) and issubclass(t, Base):
+             _bind_class_resources(t, dw)
              return t
          return dw

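The decorator's new branch delegates to `_bind_class_resources`, which walks `vars(cls)` and binds every Resource-typed class attribute under its attribute name. The scanning mechanic in isolation, with a hypothetical stand-in for `datamaestro.download.Resource`:

```python
class FakeResource:  # stand-in for datamaestro.download.Resource
    def bind(self, name, dataset):
        self.name = name
        self.dataset = dataset

class MyDataset:  # hypothetical class-based dataset
    train = FakeResource()
    test = FakeResource()
    notes = "not a resource"  # ignored by the scan

bound = {}
for attr_name, attr_value in vars(MyDataset).items():
    if isinstance(attr_value, FakeResource):
        attr_value.bind(attr_name, dataset=None)
        bound[attr_name] = attr_value

print(sorted(bound))  # ['test', 'train']
```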