datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/definitions.py
@@ -2,13 +2,16 @@
 # Main datamaestro functions and data models
 #
 
+from __future__ import annotations
+
 import logging
 import inspect
+import shutil
 from pathlib import Path
 from itertools import chain
+from abc import ABC, abstractmethod
 import traceback
 from typing import (
-    Any,
     Dict,
     List,
     Optional,
@@ -18,20 +21,119 @@ from typing import (
     Callable,
     TYPE_CHECKING,
     Union,
+    _GenericAlias,
+)
+from experimaestro import (  # noqa: F401 (re-exports)
+    Param,
+    Option,
+    Config,
+    Meta,
 )
-from experimaestro import argument, constant, Param, Option, Config, Meta
-from typing import Type as TypingType
+from typing import Type as TypingType  # noqa: F401 (re-exports)
 from experimaestro.core.types import Type  # noqa: F401 (re-exports)
-from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
 
 if TYPE_CHECKING:
-    from datamaestro.download import Download
-    from .data import Base
+    from .data import Base, Dataset
+    from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
+    from datamaestro.download import Download, Resource
+
+# --- DAG utilities ---
+
+
+def topological_sort(resources: dict[str, "Resource"]) -> list["Resource"]:
+    """Topological sort of resources by their dependencies.
+
+    Args:
+        resources: Dict mapping resource names to Resource instances.
+
+    Returns:
+        List of resources in dependency order (dependencies first).
+
+    Raises:
+        ValueError: If a cycle is detected in the dependency graph.
+    """
+    visited: set[str] = set()
+    visiting: set[str] = set()  # For cycle detection
+    result: list["Resource"] = []
+
+    def visit(resource: "Resource"):
+        if resource.name in visited:
+            return
+        if resource.name in visiting:
+            raise ValueError(
+                f"Cycle detected in resource dependencies involving {resource.name}"
+            )
+
+        visiting.add(resource.name)
+        for dep in resource.dependencies:
+            visit(dep)
+        visiting.discard(resource.name)
+        visited.add(resource.name)
+        result.append(resource)
+
+    for resource in resources.values():
+        visit(resource)
+
+    return result
+
+
+def _compute_dependents(resources: dict[str, "Resource"]) -> None:
+    """Compute the dependents (inverse edges) for all resources."""
+    # Clear existing dependents
+    for resource in resources.values():
+        resource._dependents = []
+
+    # Build inverse edges
+    for resource in resources.values():
+        for dep in resource.dependencies:
+            if resource not in dep._dependents:
+                dep._dependents.append(resource)
+
+
+def _bind_class_resources(cls: type, dataset_wrapper: "AbstractDataset") -> None:
+    """Scan class attributes for Resource instances and bind them.
+
+    This is called when a class-based dataset is processed by the
+    @dataset decorator. It detects Resource instances defined as
+    class attributes and binds them to the dataset.
+
+    Args:
+        cls: The dataset class to scan.
+        dataset_wrapper: The AbstractDataset to bind resources to.
+    """
+    from datamaestro.download import Resource
+
+    for attr_name, attr_value in vars(cls).items():
+        if isinstance(attr_value, Resource):
+            attr_value.bind(attr_name, dataset_wrapper)
+
+    # Build the dependency DAG
+    _compute_dependents(dataset_wrapper.resources)
+
+    # Validate: topological sort will raise on cycles
+    dataset_wrapper.ordered_resources = topological_sort(dataset_wrapper.resources)
+
+
+def _delete_path(path: Path) -> None:
+    """Delete a file or directory at path."""
+    if path.exists():
+        if path.is_dir():
+            shutil.rmtree(path)
+        else:
+            path.unlink()
+
+
+def _move_path(src: Path, dst: Path) -> None:
+    """Move a file or directory from src to dst."""
+    if src.exists():
+        dst.parent.mkdir(parents=True, exist_ok=True)
+        shutil.move(str(src), str(dst))
+
 
 # --- Objects holding information into classes/function
 
 
-class AbstractData:
+class AbstractData(ABC):
     """Data definition groups common fields between a dataset and a data piece,
     such as tags and tasks"""
 
@@ -47,8 +149,7 @@ class AbstractData:
 
 
 class DataDefinition(AbstractData):
-    """Object that stores the declarative part of a data(set) description
-    """
+    """Object that stores the declarative part of a data(set) description"""
 
     def __init__(self, t, base=None):
         assert base is None or not inspect.isclass(t)
@@ -73,8 +174,10 @@ class DataDefinition(AbstractData):
         return self._description
 
     @staticmethod
-    def repository_relpath(t: type) -> Tuple[Repository, List[str]]:
+    def repository_relpath(t: type) -> Tuple["Repository", List[str]]:
         """Find the repository of the current data or dataset definition"""
+        from .context import Context  # noqa: F811
+
         repositorymap = Context.instance().repositorymap
 
         fullname = f"{t.__module__}.{t.__name__}"
@@ -93,10 +196,7 @@ class DataDefinition(AbstractData):
         if components[0] == "datamaestro":
             longest_ix = 0
 
-        if repository is None:
-            raise Exception(f"Could not find the repository for {fullname}")
-
-        return repository, components[(longest_ix + 1) :]
+        return repository, [s.lower() for s in components[(longest_ix + 1) :]]
 
     def ancestors(self):
         ancestors = []
@@ -122,6 +222,15 @@ class AbstractDataset(AbstractData):
     - timestamp: whether the dataset version depends on the time of the download
     """
 
+    name: Optional[str] = None
+    """The name of the dataset"""
+
+    url: Optional[str] = None
+    """The URL of the dataset"""
+
+    doi: Optional[str] = None
+    """The DOI of this dataset"""
+
     def __init__(self, repository: Optional["Repository"]):
         super().__init__()
         self.repository = repository
@@ -130,6 +239,7 @@ class AbstractDataset(AbstractData):
 
         # Associated resources
         self.resources: Dict[str, "Download"] = {}
+        self.ordered_resources = []
 
         # Hooks
         # pre-use: before returning the dataset object
@@ -137,7 +247,6 @@ class AbstractDataset(AbstractData):
         self.hooks = {"pre-use": [], "pre-download": []}
 
         self.url = None
-        self.name: Optional[str] = None
         self.version = None
 
     @property
@@ -150,18 +259,25 @@ class AbstractDataset(AbstractData):
 
     @property
     def context(self):
+        if self.repository is None:
+            from datamaestro.context import Context  # noqa: F811
+
+            return Context.instance()
         return self.repository.context
 
     def prepare(self, download=False) -> "Base":
-        ds = self._prepare(download)
+        ds = self._prepare()
         ds.__datamaestro_dataset__ = self
+
+        if download:
+            ds.download()
         return ds
 
     def register_hook(self, hookname: str, hook: Callable):
         self.hooks[hookname].append(hook)
 
-    def _prepare(self, download=False) -> "Base":
-        raise NotImplementedError(f"prepare() in {self.__class__}")
+    @abstractmethod
+    def _prepare(self) -> "Base": ...
 
     def format(self, encoder: str) -> str:
         s = self.prepare()
@@ -181,26 +297,153 @@ class AbstractDataset(AbstractData):
         from datamaestro.data import Base
 
         if isinstance(data, Base):
-            data.id = f"{id}@{self.repository.name}"
+            try:
+                if data.id:
+                    # There is already an ID, skip this
+                    # and the descendants
+                    return
+            except KeyError:
+                pass
+
+            if self.repository is None:
+                data.id = id
+            else:
+                data.id = f"{id}@{self.repository.name}"
             for key, value in data.__xpm__.values.items():
                 if isinstance(value, Config):
                     self.setDataIDs(value, f"{id}.{key}")
 
     def download(self, force=False):
-        """Download all the necessary resources"""
+        """Download all the necessary resources.
+
+        Uses DAG-based topological ordering and the two-path system:
+        1. Acquire exclusive lock (.state.lock)
+        2. Resource writes to transient_path (under .downloads/)
+        3. Framework moves transient_path → path (main folder)
+        4. State marked COMPLETE
+        5. Transient dependencies cleaned up eagerly
+        6. .downloads/ directory removed after all resources complete
+        7. Release lock
+        """
+        import fcntl
+
+        from datamaestro.download import ResourceState
+
+        self.prepare()
+        logging.info(
+            "Materializing %d resources",
+            len(self.ordered_resources),
+        )
+
+        self.datapath.mkdir(parents=True, exist_ok=True)
+        lock_path = self.datapath / ".state.lock"
+        lock_file = lock_path.open("w")
+        try:
+            fcntl.flock(lock_file, fcntl.LOCK_EX)
+            success = self._download_locked(force, ResourceState)
+        finally:
+            fcntl.flock(lock_file, fcntl.LOCK_UN)
+            lock_file.close()
+
+        return success
+
+    def _download_locked(self, force, ResourceState):
+        """Inner download logic, called while holding .state.lock."""
         success = True
-        for key, resource in self.resources.items():
+
+        for resource in self.ordered_resources:
+            # Step 1: Check state
+            current_state = resource.state
+
+            if current_state == ResourceState.COMPLETE and not force:
+                # Verify files are actually present on disk
+                if resource.has_files() and not resource.path.exists():
+                    logging.warning(
+                        "Resource %s marked COMPLETE but files "
+                        "missing at %s — re-downloading",
+                        resource.name,
+                        resource.path,
+                    )
+                    resource.state = ResourceState.NONE
+                    current_state = ResourceState.NONE
+                else:
+                    continue
+
+            # Adopt pre-existing files (old downloads without state file)
+            if (
+                current_state == ResourceState.NONE
+                and not force
+                and resource.has_files()
+                and resource.path.exists()
+            ):
+                logging.info(
+                    "Resource %s already exists at %s — marking COMPLETE",
+                    resource.name,
+                    resource.path,
+                )
+                resource.state = ResourceState.COMPLETE
+                continue
+
+            if current_state == ResourceState.PARTIAL:
+                if not resource.can_recover:
+                    _delete_path(resource.transient_path)
+                    resource.state = ResourceState.NONE
+
+            # Verify all dependencies are COMPLETE
+            for dep in resource.dependencies:
+                if dep.state != ResourceState.COMPLETE:
+                    logging.error(
+                        "Dependency %s of %s is not COMPLETE",
+                        dep.name,
+                        resource.name,
+                    )
+                    return False
+
+            # Step 2-4: Download with framework-managed state
             try:
-                resource.download(force)
+                resource.download(force=force)
+
+                # Move transient -> final, mark COMPLETE
+                if resource.has_files():
+                    _move_path(resource.transient_path, resource.path)
+                resource.state = ResourceState.COMPLETE
+
             except Exception:
-                logging.error("Could not download resource %s", key)
+                logging.error("Could not download resource %s", resource)
                 traceback.print_exc()
+
+                # Handle PARTIAL state
+                if resource.has_files() and resource.transient_path.exists():
+                    if resource.can_recover:
+                        resource.state = ResourceState.PARTIAL
+                    else:
+                        _delete_path(resource.transient_path)
+                        resource.state = ResourceState.NONE
                 success = False
+                break
+
+            # Step 5: Eager transient cleanup
+            for dep in resource.dependencies:
+                if dep.transient and all(
+                    d.state == ResourceState.COMPLETE for d in dep.dependents
+                ):
+                    dep.cleanup()
+
+        # Step 6: Remove .downloads/ directory after success
+        if success:
+            downloads_dir = self.datapath / ".downloads"
+            if downloads_dir.is_dir():
+                shutil.rmtree(downloads_dir)
+
         return success
 
     @staticmethod
-    def find(name: str) -> "DataDefinition":
+    def find(name: str, context: Optional["Context"] = None) -> "DataDefinition":
         """Find a dataset given its name"""
+        from datamaestro.context import Context  # noqa: F811
+
+        context = Context.instance() if context is None else context
+
         logging.debug("Searching dataset %s", name)
         for repository in Context.instance().repositories():
             logging.debug("Searching dataset %s in %s", name, repository)
@@ -211,7 +454,7 @@ class AbstractDataset(AbstractData):
 
 
 class FutureAttr:
-    """Allows to access a dataset subproperty"""
+    """Allows to access a dataset sub-property"""
 
     def __init__(self, dataset, keys):
         self.dataset = dataset
@@ -237,11 +480,13 @@ class FutureAttr:
 class DatasetWrapper(AbstractDataset):
     """Wraps an annotated method into a dataset
 
-    This is the standard way to define a dataset in datamaestro
+    This is the standard way to define a dataset in datamaestro through
+    annotations (otherwise, derive from `AbstractDataset`).
     """
 
-    def __init__(self, annotation, t: type):
-
+    def __init__(self, annotation: "dataset", t: type):
+        self.config = None
+        self.repository: Optional[Repository] = None
         self.t = t
         self.base = annotation.base
         assert self.base is not None, f"Could not set the Config type for {t}"
@@ -249,82 +494,159 @@ class DatasetWrapper(AbstractDataset):
         repository, components = DataDefinition.repository_relpath(t)
         super().__init__(repository)
 
+        self.module_name = None
+        if repository is None:
+            # Try to find the module name
+            self.module_name, _ = t.__module__.split(".", 1)
+
         # Set some variables
         self.url = annotation.url
+        self.doi = annotation.doi
+        self.as_prepare = annotation.as_prepare
 
         # Builds the ID:
         # Removes module_name.config prefix
-        assert (
-            components[0] == "config"
-        ), f"A @dataset object should be in the .config module (not {t.__module__})"
+        if (
+            (annotation.id is None)
+            or (annotation.id == "")
+            or ("." not in annotation.id)
+            or (annotation.id[0] == ".")
+        ):
+            # Computes an ID
+            assert (
+                # id is empty string = use the module id
+                components[0] == "config"
+            ), (
+                "A @dataset without `id` should be in the "
+                f".config module (not {t.__module__})"
+            )
+
+            if annotation.id is None:
+                # There is nothing, use the full path
+                path = ".".join(components[1:])
+            else:
+                # Replace
+                path = ".".join(components[1:-1])
+                if annotation.id != "":
+                    path = f"{path}.{annotation.id}"
 
-        path = ".".join(components[1:-1])
-        if annotation.id == "":
-            # id is empty string = use the module id
             self.id = path
         else:
-            self.id = "%s.%s" % (
-                path,
-                annotation.id or t.__name__.lower().replace("_", "."),
-            )
+            # Use the provided ID
+            self.id = annotation.id
 
         self.aliases.add(self.id)
 
         # Get the documentation
-        self._description = ""
-        if t.__doc__:
-            lines = t.__doc__.split("\n", 2)
-            self.name = lines[0]
-            if len(lines) > 1:
-                assert lines[1].strip() == "", "Second line should be blank"
-            if len(lines) > 2:
-                self._description = lines[2]
+        self._name = None
+        self._description = None
+
+    @property
+    def name(self):
+        self._process_doc()
+        return self._name
 
     @property
     def description(self):
+        self._process_doc()
         return self._description
 
+    def _process_doc(self):
+        if self._description is None:
+            if self.t.__doc__:
+                lines = self.t.__doc__.split("\n")
+                self._name = lines[0]
+                if len(lines) > 1:
+                    assert lines[1].strip() == "", "Second line should be blank"
+                if len(lines) > 2:
+                    # Remove the common indent
+                    lines = [line.rstrip() for line in lines[2:]]
+                    minindent = max(
+                        next(idx for idx, chr in enumerate(s) if not chr.isspace())
+                        for s in lines
+                        if len(s) > 0
+                    )
+                    self._description = "\n".join(
+                        s[minindent:] if len(s) > 0 else "" for s in lines
+                    )
            else:
+                self._name = ""
+                self._description = ""
+
     @property
     def configtype(self):
         return self.base
 
-    def __call__(self, *args, **kwargs):
-        self.t(*args, **kwargs)
-
     def __getattr__(self, key):
         """Returns a pointer to a potential attribute"""
         return FutureAttr(self, [key])
 
-    def _prepare(self, download=False) -> "Base":
-        if download:
-            for hook in self.hooks["pre-download"]:
-                hook(self)
-            if not self.download(False):
-                raise Exception("Could not load necessary resources")
-        logging.debug("Building with data type %s and dataset %s", self.base, self.t)
-        for hook in self.hooks["pre-use"]:
-            hook(self)
-
-        resources = {key: value.prepare() for key, value in self.resources.items()}
-        dict = self.t(**resources)
-        if dict is None:
-            name = self.t.__name__
-            filename = inspect.getfile(self.t)
-            raise Exception(
-                f"The dataset method {name} defined in {filename} returned a null object"
+    def download(self, force=False):
+        if self.base is self.t:
+            self._prepare()
+        return super().download(force=force)
+
+    def _prepare(self) -> "Base":
+        if self.config is not None:
+            return self.config
+
+        # Direct creation of the dataset
+        if self.base is self.t:
+            self.config = self.base.__create_dataset__(self)
+
+        else:
+            # Construct the object
+            if self.as_prepare:
+                result = self.t(self, None)
+            else:
+                resources = {
+                    key: value.prepare() for key, value in self.resources.items()
+                }
+                result = self.t(**resources)
+
+            if result is None:
+                raise RuntimeError(f"{self.base} did not return any resource")
+
+            # Download resources
+            logging.debug(
+                "Building with data type %s and dataset %s", self.base, self.t
             )
+            for hook in self.hooks["pre-use"]:
+                hook(self)
+
+            if result is None:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise Exception(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned a null object"
+                )
+
+            if isinstance(result, dict):
+                self.config = self.base.C(**result)
+            elif isinstance(result, self.base):
+                self.config = result
+            else:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise RuntimeError(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned an object of type {type(dict)}"
                )
 
-        # Constrcut the object
-        data = self.base(**dict)
+        # Setup ourself
+        self.config.__datamaestro_dataset__ = self
 
         # Set the ids
-        self.setDataIDs(data, self.id)
+        self.setDataIDs(self.config, self.id)
 
-        return data
+        return self.config
+
+    __call__ = _prepare
 
     @property
-    def path(self) -> Path:
-        """Returns the path"""
+    def _path(self) -> Path:
+        """Returns a unique relative path for this dataset"""
         path = Path(*self.id.split("."))
         if self.version:
             path = path.with_suffix(".v%s" % self.version)
@@ -333,16 +655,32 @@ class DatasetWrapper(AbstractDataset):
     @property
     def datapath(self):
         """Returns the destination path for downloads"""
-        return self.repository.datapath / self.path
+        if self.repository is not None:
+            return self.repository.datapath / self._path
 
-    def hasfiles(self) -> bool:
-        """Returns whether this dataset has files or only includes references"""
+        # No repository, use __custom__/[MODULE NAME]
+        path = self.context.datapath / "__custom__" / self.module_name / self._path
+
+        return path
+
+    def has_files(self) -> bool:
+        """Returns whether this dataset has files or only includes references."""
         for resource in self.resources.values():
-            if resource.hasfiles():
+            if resource.has_files():
                 return True
-
         return False
 
+    def hasfiles(self) -> bool:
+        """Deprecated: use has_files() instead."""
+        import warnings
+
+        warnings.warn(
+            "hasfiles() is deprecated, use has_files()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        return self.has_files()
+
 
 # --- Annotations
 
@@ -358,9 +696,9 @@ class DataAnnotation:
             self.annotate(object.__datamaestro__)
         else:
             # With configuration objects, add a __datamaestro__ member to the class
-            assert issubclass(
-                object, Config
-            ), f"{object} cannot be annotated (only dataset or data definitions)"
+            assert issubclass(object, Config), (
+                f"{object} cannot be annotated (only dataset or data definitions)"
+            )
             if "__datamaestro__" not in object.__dict__:
                 object.__datamaestro__ = AbstractData()
             self.annotate(object.__datamaestro__)
@@ -375,10 +713,16 @@ class DatasetAnnotation:
     """Base class for all annotations"""
 
     def __call__(self, dataset: AbstractDataset):
-        assert isinstance(
-            dataset, AbstractDataset
-        ), f"Only datasets can be annotated with {self}, but {dataset} is not a dataset"
-        self.annotate(dataset)
+        if isinstance(dataset, AbstractDataset):
+            self.annotate(dataset)
+        elif issubclass(dataset, Dataset):
+            self.annotate(dataset.__datamaestro__)
+        else:
+            raise RuntimeError(
+                f"Only datasets can be annotated with {self}, "
+                f"but {dataset} is not a dataset"
+            )
+
         return dataset
 
     def annotate(self, dataset: AbstractDataset):
@@ -425,54 +769,47 @@ def DataTagging(f):
 datatags = DataTagging(lambda d: d.tags)
 datatasks = DataTagging(lambda d: d.tasks)
 
-# T = TypeVar("T")
-# def data(description=None):
-#     """Deprecated: simply deriving from Base data is enough"""
-#     if description is not None and not isinstance(description, str):
-#         raise RuntimeError("@data annotation should be written @data()")
-
-#     def annotate(t: T):
-#         try:
-#             object.__getattribute__(t, "__datamaestro__")
-#             logging.warning("@data should only be called once")
-#         except AttributeError:
-#             pass
-
-#         # Determine the data type
-#         from experimaestro import config
 
-#         repository, components = DataDefinition.repository_relpath(t)
-#         assert (
-#             components[0] == "data"
-#         ), f"A @data object should be in the .data module (not {t.__module__})"
+class metadata:
+    def __init__(
+        self,
+        tags: Union[str, List[str]] = None,
+        tasks: Union[str, List[str]] = None,
+    ):
+        pass
 
-#         identifier = (
-#             f"{repository.NAMESPACE if repository else 'datamaestro'}."
-#             + ".".join(components[1:]).lower()
-#         )
-#         t = config(identifier)(t)
-#         t.__datamaestro__ = DataDefinition(repository, t)
-
-#         return t
-
-#     return annotate
+    def __call__(self, object: type):
+        # FIXME: todo
+        return object
 
 
 class dataset:
-    def __init__(self, base=None, *, timestamp=None, id=None, url=None, size=None):
-        """Creates a new (meta)dataset
-
-        Meta-datasets are not associated with any
-
-        Arguments:
-            base {[type]} -- The base type (or None if infered from type annotation)
+    """Dataset decorator
+
+    Meta-datasets are not associated with any base type.
+
+    :param base: The base type (or None if inferred from type annotation).
+    :param timestamp: If the dataset evolves, specify its timestamp.
+    :param id: Gives the full ID of the dataset if it contains a '.',
+        the last component if not containing a '.', or the last components
+        if starting with '.'
+    :param url: The URL associated with the dataset.
+    :param size: The size of the dataset (should be a parsable format).
+    :param doi: The DOI of the corresponding paper.
+    :param as_prepare: Resources are setup within the method itself
+    """
 
-        Keyword Arguments:
-            timestamp {bool} -- If the dataset evolves, specify its timestamp (default: None)
-            id {[type]} -- [description] (default: {None})
-            url {[type]} -- [description] (default: {None})
-            size {str} -- The size (should be a parsable format)
-        """
+    def __init__(
+        self,
+        base=None,
+        *,
+        timestamp: str | None = None,
+        id: None | str = None,
+        url: None | str = None,
+        size: None | int | str = None,
+        doi: None | str = None,
+        as_prepare: bool = False,
+    ):
         if hasattr(base, "__datamaestro__") and isinstance(
             base.__datamaestro__, metadataset
         ):
@@ -485,24 +822,46 @@ class dataset:
         self.meta = False
         self.timestamp = timestamp
         self.size = size
+        self.doi = doi
+        self.as_prepare = as_prepare
 
     def __call__(self, t):
+        from datamaestro.data import Base
+
        try:
             if self.base is None:
-                # Get type from return annotation
-                self.base = t.__annotations__["return"]
+                if inspect.isclass(t) and issubclass(t, Base):
+                    self.base = t
+                else:
+                    try:
+                        # Get type from return annotation
+                        return_type = t.__annotations__["return"]
+                        if isinstance(return_type, _GenericAlias):
+                            return_type = return_type.__origin__
+                        self.base = return_type
+                    except KeyError:
+                        logging.warning("No return annotation in %s", t)
+                        raise
             object.__getattribute__(t, "__datamaestro__")
             raise AssertionError("@data should only be called once")
        except AttributeError:
             pass
-
         dw = DatasetWrapper(self, t)
+        t.__dataset__ = dw
+
+        # For class-based datasets, scan for Resource class attributes
+        if inspect.isclass(t) and issubclass(t, Base):
+            _bind_class_resources(t, dw)
+            return t
         return dw
 
 
 class metadataset(AbstractDataset):
-    """Annotation for object/functions which are abstract dataset definitions -- i.e. shared
-    by more than one real dataset. This is useful to share tags, urls, etc."""
+    """Annotation for object/functions which are abstract dataset definitions
+
+    i.e. shared by more than one real dataset. This is useful to share tags,
+    urls, etc.
+    """
 
     def __init__(self, base):
         super().__init__(None)
@@ -516,3 +875,5 @@ class metadataset(AbstractDataset):
             pass
         t.__datamaestro__ = self
         return t
+
+    _prepare = None
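
The `download()` docstring in this diff describes the new materialization order: resources form a DAG and are processed dependencies-first via a depth-first topological sort with cycle detection. A standalone sketch of that ordering (plain Python mirroring `topological_sort` above; the `Toy` class and the archive/extracted/index names are invented for illustration, not datamaestro API):

# Standalone illustration of the depth-first topological sort used above.
# "Toy" and its fields are stand-ins; only the algorithm mirrors the diff.
class Toy:
    def __init__(self, name, dependencies=()):
        self.name = name
        self.dependencies = list(dependencies)


def topo(resources):
    visited, visiting, order = set(), set(), []

    def visit(r):
        if r.name in visited:
            return
        if r.name in visiting:  # back-edge: dependency cycle
            raise ValueError(f"Cycle involving {r.name}")
        visiting.add(r.name)
        for dep in r.dependencies:
            visit(dep)
        visiting.discard(r.name)
        visited.add(r.name)
        order.append(r)

    for r in resources:
        visit(r)
    return order


archive = Toy("archive")                 # e.g. a downloaded tarball
extracted = Toy("extracted", [archive])  # depends on the tarball
index = Toy("index", [extracted])
print([r.name for r in topo([index, archive, extracted])])
# ['archive', 'extracted', 'index'] (dependencies always come first)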
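
The "two-path system" from that same docstring (steps 1-4 and 6-7) can likewise be sketched with stdlib primitives only. The `.state.lock` name, the `.downloads/` staging directory, and the move into the main folder follow the diff; the payload file below is a stand-in, and real resources persist their state through `ResourceState`:

# Minimal sketch of the two-path protocol: write under .downloads/, then
# move into the main folder while holding an exclusive lock (Unix-only,
# as the diff itself relies on fcntl).
import fcntl
import shutil
from pathlib import Path

datapath = Path("/tmp/demo-dataset")
datapath.mkdir(parents=True, exist_ok=True)

with (datapath / ".state.lock").open("w") as lock_file:
    fcntl.flock(lock_file, fcntl.LOCK_EX)  # step 1: exclusive lock
    try:
        transient = datapath / ".downloads" / "hello.txt"
        transient.parent.mkdir(parents=True, exist_ok=True)
        transient.write_text("payload\n")  # step 2: write to transient path
        final = datapath / "hello.txt"
        final.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(transient), str(final))  # step 3: move into place
        # step 4: the framework would mark the resource COMPLETE here
        shutil.rmtree(datapath / ".downloads", ignore_errors=True)  # step 6
    finally:
        fcntl.flock(lock_file, fcntl.LOCK_UN)  # step 7: release lock

Interrupted downloads leave the main folder untouched: only fully materialized resources are ever moved out of `.downloads/`, which is what lets the framework resume from PARTIAL state or discard the transient path when a resource cannot recover.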