datamaestro 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datamaestro/__init__.py CHANGED
@@ -2,10 +2,14 @@
  from .context import (
      Context,
      Repository,
+     BaseRepository,
      get_dataset,
      prepare_dataset,
  )

- from pkg_resources import get_distribution, DistributionNotFound
+ from .datasets.yaml_repository import YAMLRepository

+ from pkg_resources import get_distribution, DistributionNotFound
+ from .definitions import dataset, metadata
+ from .data import Base

  from .version import version, version_tuple
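Taken together, these imports surface the 1.3.0 additions at the package root. A short sketch of what downstream code can now import directly:

    # re-exported at the top level as of 1.3.0
    from datamaestro import (
        BaseRepository,   # abstract base class for repositories
        YAMLRepository,   # repository described by a datamaestro.yaml file
        dataset,          # the @dataset definition decorator
        metadata,         # tag/task annotation (still a no-op in this release)
        Base,             # root class of all data types
    )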
datamaestro/context.py CHANGED
@@ -1,21 +1,22 @@
  from pathlib import Path
- from experimaestro.compat import cached_property
+ from typing import Iterable, Iterator, Dict, Union
  import importlib
  import os
  import hashlib
  import logging
  import inspect
  import json
- from experimaestro.mkdocs.metaloader import Module
+ from abc import ABC, abstractmethod
+ from experimaestro import Config
  import pkg_resources
- from typing import Iterable, Iterator, List, Dict
+ from experimaestro.compat import cached_property
+ from experimaestro.mkdocs.metaloader import Module
  from .utils import CachedFile, downloadURL
  from .settings import UserSettings, Settings
-
  from typing import TYPE_CHECKING

  if TYPE_CHECKING:
-     from datamaestro.definitions import AbstractDataset
+     from datamaestro.definitions import AbstractDataset, DatasetWrapper


  class Compression:
@@ -87,6 +88,11 @@ class Context:

          return ContextManager()

+     @property
+     def storepath(self):
+         """Replaces the data path"""
+         return self._path.joinpath("store")
+
      @property
      def datapath(self):
          return self._path.joinpath("data")
@@ -98,7 +104,9 @@ class Context:
      @cached_property
      def repositorymap(self) -> Dict[str, "Repository"]:
          return {
-             repository.basemodule(): repository for repository in self.repositories()
+             repository.basemodule(): repository
+             for repository in self.repositories()
+             if repository.basemodule() is not None
          }

      def repositories(self) -> Iterable["Repository"]:
@@ -286,10 +294,53 @@ class Datasets(Iterable["AbstractDataset"]):
          yield value.__dataset__


- class Repository:
-     """A repository regroup a set of datasets and their corresponding specific
+ class BaseRepository(ABC):
+     """A repository groups a set of datasets and their corresponding specific
      handlers (downloading, filtering, etc.)"""

+     def __init__(self, context: Context):
+         self.context = context
+         p = inspect.getabsfile(self.__class__)
+         self.basedir = Path(p).parent
+
+     @abstractmethod
+     def __iter__(self) -> Iterator["AbstractDataset"]:
+         ...
+
+     def search(self, name: str):
+         """Search for a dataset in the definitions"""
+         for dataset in self:
+             if name in dataset.aliases:
+                 return dataset
+
+     @classmethod
+     def instance(cls, context=None):
+         try:
+             return cls.__getattribute__(cls, "INSTANCE")
+         except AttributeError:
+             return cls(context if context else Context.instance())
+
+     @classmethod
+     def basemodule(cls):
+         return cls.__module__
+
+     @property
+     def generatedpath(self):
+         return self.basedir / "generated"
+
+     @property
+     def datapath(self):
+         return self.context.datapath.joinpath(self.id)
+
+     @property
+     def extrapath(self):
+         """Path to the directory containing extra configuration files"""
+         return self.basedir / "data"
+
+
+ class Repository(BaseRepository):
+     """(deprecated) Repository where datasets are located in __module__.config"""
+
      def __init__(self, context: Context):
          """Initialize a new repository

@@ -297,26 +348,14 @@ class Repository:
          :param basedir: The base directory of the repository
              (by default, the same as the repository class)
          """
+         super().__init__(context)
          self.context = context
-         p = inspect.getabsfile(self.__class__)
-         self.basedir = Path(p).parent
          self.configdir = self.basedir.joinpath("config")
          self.id = self.__class__.NAMESPACE
          self.name = self.id
          self.module = self.__class__.__module__
          self.__class__.INSTANCE = self

-     @classmethod
-     def basemodule(cls):
-         return cls.__module__
-
-     @classmethod
-     def instance(cls, context=None):
-         try:
-             return cls.__getattribute__(cls, "INSTANCE")
-         except AttributeError:
-             return cls(context if context else Context.instance())
-
      @classmethod
      def version(cls):
          from pkg_resources import get_distribution, DistributionNotFound
@@ -336,36 +375,8 @@ class Repository:
          assert isinstance(other, Repository)
          return self.basedir == other.basedir

-     def search(self, name: str):
-         """Search for a dataset in the definitions"""
-         logging.debug("Searching for %s in %s", name, self.configdir)
-
-         candidates: List[str] = []
-         components = name.split(".")
-         path = self.configdir
-         for i, c in enumerate(components):
-             path = path / c
-
-             if (path / "__init__.py").is_file():
-                 candidates.append(".".join(components[: i + 1]))
-
-             if path.with_suffix(".py").is_file():
-                 candidates.append(".".join(components[: i + 1]))
-
-             if not path.is_dir():
-                 break
-
-         # Get the dataset
-         for candidate in candidates[::-1]:
-             logging.debug("Searching in module %s.config.%s", self.module, candidate)
-             module = importlib.import_module("%s.config.%s" % (self.module, candidate))
-             for value in Datasets(module):
-                 if name in value.aliases:
-                     return value
-
-         return None
-
-     def datasets(self, candidate):
+     def datasets(self, candidate: str):
+         """Returns the dataset candidates from a module"""
          try:
              module = importlib.import_module("%s.config.%s" % (self.module, candidate))
          except ModuleNotFoundError:
@@ -409,19 +420,6 @@
          for dataset in datasets:
              yield dataset

-     @property
-     def generatedpath(self):
-         return self.basedir.joinpath("generated")
-
-     @property
-     def datapath(self):
-         return self.context.datapath.joinpath(self.id)
-
-     @property
-     def extrapath(self):
-         """Path to the directory containing extra configuration files"""
-         return self.basedir.joinpath("data")
-

  def find_dataset(dataset_id: str):
      """Find a dataset given its id"""
@@ -430,11 +428,17 @@ def find_dataset(dataset_id: str):
      return AbstractDataset.find(dataset_id)


- def prepare_dataset(dataset_id: str):
+ def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
      """Find a dataset given its id and download the resources"""
-     from .definitions import AbstractDataset
+     from .definitions import AbstractDataset, DatasetWrapper
+
+     if isinstance(dataset_id, DatasetWrapper):
+         ds = dataset_id
+     elif isinstance(dataset_id, Config):
+         ds = dataset_id.__datamaestro_dataset__
+     else:
+         ds = AbstractDataset.find(dataset_id)

-     ds = AbstractDataset.find(dataset_id)
      return ds.prepare(download=True)

datamaestro/data/__init__.py CHANGED
@@ -1,22 +1,18 @@
+ from abc import abstractmethod
  import logging
  from pathlib import Path
  from typing import Any, Dict
- from datamaestro.definitions import AbstractDataset, argument, Param
- from experimaestro import Config
- from experimaestro import documentation  # noqa: F401
+ from experimaestro import Config, Param, Meta
+ from datamaestro.definitions import AbstractDataset


  class Base(Config):
      """Base object for all data types"""

      id: Param[str]
-     """The unique dataset ID"""
+     """The unique (sub-)dataset ID"""

-     __datamaestro_dataset__: AbstractDataset
-
-     def download(self):
-         """Download the dataset"""
-         self.__datamaestro_dataset__.download()
+     __datamaestro_dataset__: "AbstractDataset"

      def dataset_information(self) -> Dict[str, Any]:
          """Returns document meta-informations"""
@@ -26,6 +22,16 @@ class Base(Config):
              "description": self.__datamaestro_dataset__.description,
          }

+     def download(self):
+         """Download the dataset"""
+         self.__datamaestro_dataset__.download()
+
+     @abstractmethod
+     def prepare(self, *args, **kwargs):
+         """Prepare the dataset"""
+         self.__datamaestro_dataset__.prepare()
+         return self
+

  class Generic(Base):
      """Generic dataset
@@ -44,16 +50,17 @@ class Generic(Base):
  class File(Base):
      """A data file"""

-     path: Param[Path]
+     path: Meta[Path]
      """The path of the file"""

      def open(self, mode):
          return self.path.open(mode)


- @argument("path", type=Path)
  class Folder(Base):
      """A data folder"""

+     path: Meta[Path]
+
      def open(self, mode):
          return self.path.open(mode)
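A recurring change in this release: filesystem paths move from `Param` to `Meta`. In experimaestro, `Meta` values are not part of a configuration's identity, so relocating data on disk no longer changes a dataset's identifier. A minimal sketch of a data type in the new style (the class and `checksum` field are illustrative):

    from pathlib import Path
    from experimaestro import Meta, Param
    from datamaestro.data import Base

    class Archive(Base):       # hypothetical data type
        checksum: Param[str]   # participates in the configuration identity
        path: Meta[Path]       # metadata only, excluded from the identity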
datamaestro/data/csv.py CHANGED
@@ -1,7 +1,8 @@
+ from typing import Optional, Tuple, List, Any
  from csv import reader as csv_reader
- from . import File, argument, documentation
- from datamaestro.definitions import Meta
- from typing import Tuple, List, Any
+ from experimaestro import Param, Meta
+ from experimaestro import documentation
+ from . import File


  class Generic(File):
@@ -26,12 +27,13 @@ class Generic(File):
              return row


- @argument("names_row", type=int, default=-1)
- @argument("size_row", type=int, default=-1)
- @argument("target", type=str, default=None)
  class Matrix(Generic):
      """A numerical dataset"""

+     names_row: Param[int] = -1
+     size_row: Param[int] = -1
+     target: Param[Optional[str]] = None
+
      @documentation
      def data(self) -> Tuple[List[str], Any]:
          """Returns the list of fields and the numeric data
datamaestro/data/ml.py CHANGED
@@ -1,7 +1,7 @@
  """Machine learning generic data formats"""
- from typing import Generic, TypeVar, Optional
  from pathlib import Path
- from experimaestro import Param, Meta, argument
+ from typing import Generic, TypeVar, Optional
+ from experimaestro import Param, Meta
  from . import Base

  Train = TypeVar("Train", bound=Base)
@@ -20,8 +20,8 @@ class Supervised(Base, Generic[Train, Validation, Test]):
      """The training optional"""


- @argument("classes")
  class FolderBased(Base):
      """Classification dataset where folders give the basis"""

+     classes: Param[list[str]]
      path: Meta[Path]
datamaestro/data/tensor.py CHANGED
@@ -1,44 +1,50 @@
- from pathlib import Path
+ from abc import ABC, abstractmethod
  from struct import Struct
- from . import File
+ from typing import TYPE_CHECKING
+ from . import File, Base

+ if TYPE_CHECKING:
+     import numpy as np

- class IDX(File):
+
+ class Tensor(Base, ABC):
+     @abstractmethod
+     def data(self) -> "np.ndarray":
+         """Returns the tensor in numpy format"""
+         pass
+
+
+ class IDX(Tensor, File):
      """IDX File format

-     The IDX file format is a simple format for vectors and multidimensional matrices of various numerical types.
+     The IDX file format is a simple format for vectors and multidimensional
+     matrices of various numerical types.

      The basic format is:

-         magic number
-         size in dimension 0
-         size in dimension 1
-         size in dimension 2
-         .....
-         size in dimension N
-         data
+     magic number size in dimension 0 size in dimension 1 size in dimension 2
+     ..... size in dimension N data

      The magic number is an integer (MSB first). The first 2 bytes are always 0.

-     The third byte codes the type of the data:
-         0x08: unsigned byte
-         0x09: signed byte
-         0x0B: short (2 bytes)
-         0x0C: int (4 bytes)
-         0x0D: float (4 bytes)
-         0x0E: double (8 bytes)
+     The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
+     byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
+     double (8 bytes)

-     The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices....
+     The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
+     vectors, 2 for matrices....

-     The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors).
+     The sizes in each dimension are 4-byte integers (MSB first, high endian,
+     like in most non-Intel processors).

-     The data is stored like in a C array, i.e. the index in the last dimension changes the fastest.
+     The data is stored like in a C array, i.e. the index in the last dimension
+     changes the fastest.
      """

      MAGIC_NUMBER = Struct(">HBB")
      DIM = Struct(">I")

-     def data(self):
+     def data(self) -> "np.ndarray":
          """Returns the tensor"""
          import numpy as np

@@ -59,7 +65,8 @@ class IDX(File):
          shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]

          size = np.prod(shape)
-         # Could use np.fromfile... if it were not broken - see https://github.com/numpy/numpy/issues/7989
+         # Could use np.fromfile... if it were not broken
+         # see https://github.com/numpy/numpy/issues/7989
          data = np.frombuffer(fp.read(), dtype=dtype, count=size)
          data = data.reshape(shape, order="C")
          return data
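As a quick sanity check of the header layout described in the docstring: `MAGIC_NUMBER = Struct(">HBB")` unpacks the two leading zero bytes, the type code, and the dimension count. With a hand-built header (not a real file):

    from struct import Struct

    MAGIC = Struct(">HBB")  # 2 zero bytes, data-type code, number of dimensions
    zeros, type_code, ndim = MAGIC.unpack(b"\x00\x00\x08\x03")
    assert (zeros, type_code, ndim) == (0, 0x08, 3)  # unsigned bytes, 3-D tensor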
datamaestro/datasets/yaml_repository.py ADDED
@@ -0,0 +1,103 @@
+ import regex
+ from typing import Iterator, Optional
+ from functools import cached_property
+ from attrs import field
+ import importlib
+ from omegaconf import OmegaConf
+ from functools import partial
+ from attrs import define
+ from datamaestro import BaseRepository
+ from datamaestro.definitions import AbstractDataset, DatasetWrapper
+ from datamaestro.data import Base
+
+
+ re_spec = regex.compile(r"""^(\w\.)+:(\w+)""")
+
+
+ @define
+ class RepositoryDataset:
+     ids: list[str]
+     """ID(s) of this dataset"""
+
+     entry_point: str = field(validator=re_spec.match)
+     """The entry point"""
+
+     title: str
+     """The full name of the dataset"""
+
+     description: str
+     """Description of the dataset"""
+
+     url: Optional[str]
+     """The URL"""
+
+     groups: Optional[list[str]]
+     """Groups to which this repository belongs"""
+
+
+ @define
+ class RepositoryAuthors:
+     name: str
+     email: str
+
+
+ @define
+ class RepositoryGroup:
+     name: str
+     tasks: list[str]
+     tags: list[str]
+
+
+ @define
+ class RepositoryConfiguration:
+     namespace: str
+     authors: list[RepositoryAuthors]
+     description: str
+     groups: dict[str, RepositoryGroup]
+     datasets: list[RepositoryDataset]
+
+
+ class YAMLDataset(AbstractDataset):
+     def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
+         super().__init__(repository)
+         self.information = information
+         self.id = self.information.ids[0]
+         self.aliases = set(self.information.ids)
+
+     @cached_property
+     def wrapper(self) -> DatasetWrapper:
+         module, func_name = self.information.entry_point.split(":")
+         wrapper = getattr(importlib.import_module(module), func_name)
+         return wrapper
+
+     def _prepare(self) -> "Base":
+         return self.wrapper()
+
+     def download(self, **kwargs):
+         return self.wrapper.download(**kwargs)
+
+
+ class YAMLRepository(BaseRepository):
+     """YAML-based repository"""
+
+     @property
+     def id(self):
+         return self.configuration.namespace
+
+     @property
+     def name(self):
+         return self.configuration.namespace
+
+     @cached_property
+     def configuration(self):
+         schema = OmegaConf.structured(RepositoryConfiguration)
+         with importlib.resources.path(
+             self.__class__.__module__, "datamaestro.yaml"
+         ) as fp:
+             conf = OmegaConf.load(fp)
+
+         conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
+         return conf
+
+     def __iter__(self) -> Iterator["AbstractDataset"]:
+         return map(partial(YAMLDataset, self), self.configuration.datasets)
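For context, a sketch of the `datamaestro.yaml` file that `YAMLRepository.configuration` loads from the repository module and merges into the `RepositoryConfiguration` schema. All values below are hypothetical:

    # hypothetical datamaestro.yaml, shown as a Python string for reference
    EXAMPLE_DATAMAESTRO_YAML = """
    namespace: example
    description: An example dataset repository
    authors:
      - name: Jane Doe
        email: jane@example.org
    groups: {}
    datasets:
      - ids: [example.digits]
        entry_point: example.datasets:digits  # module:function of a @dataset definition
        title: Digits
        description: A small digits dataset
        url: https://example.org/digits
        groups: null
    """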
datamaestro/definitions.py CHANGED
@@ -6,6 +6,8 @@ import logging
  import inspect
  from pathlib import Path
  from itertools import chain
+ from abc import ABC, abstractmethod
+ from contextlib import contextmanager
  import traceback
  from typing import (
      Dict,
@@ -16,6 +18,9 @@ from typing import (
      TypeVar,
      Callable,
      TYPE_CHECKING,
+     Union,
+     ClassVar,
+     _GenericAlias,
  )
  from experimaestro import (  # noqa: F401 (re-exports)
      argument,
@@ -27,16 +32,16 @@ from experimaestro import (  # noqa: F401 (re-exports)
  )
  from typing import Type as TypingType  # noqa: F401 (re-exports)
  from experimaestro.core.types import Type  # noqa: F401 (re-exports)
- from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)

  if TYPE_CHECKING:
+     from .data import Base, Dataset
+     from .context import Repository, Context, DatafolderPath  # noqa: F401 (re-exports)
      from datamaestro.download import Download
-     from .data import Base

  # --- Objects holding information into classes/function


- class AbstractData:
+ class AbstractData(ABC):
      """Data definition groups common fields between a dataset and a data piece,
      such as tags and tasks"""

@@ -77,8 +82,10 @@ class DataDefinition(AbstractData):
          return self._description

      @staticmethod
-     def repository_relpath(t: type) -> Tuple[Repository, List[str]]:
+     def repository_relpath(t: type) -> Tuple["Repository", List[str]]:
          """Find the repository of the current data or dataset definition"""
+         from .context import Context  # noqa: F811
+
          repositorymap = Context.instance().repositorymap

          fullname = f"{t.__module__}.{t.__name__}"
@@ -97,9 +104,6 @@
          if components[0] == "datamaestro":
              longest_ix = 0

-         if repository is None:
-             raise Exception(f"Could not find the repository for {fullname}")
-
          return repository, components[(longest_ix + 1) :]

      def ancestors(self):
@@ -163,18 +167,26 @@ class AbstractDataset(AbstractData):

      @property
      def context(self):
+         if self.repository is None:
+             from datamaestro.context import Context  # noqa: F811
+
+             return Context.instance()
          return self.repository.context

      def prepare(self, download=False) -> "Base":
-         ds = self._prepare(download)
+         ds = self._prepare()
          ds.__datamaestro_dataset__ = self
+
+         if download:
+             ds.download()
          return ds

      def register_hook(self, hookname: str, hook: Callable):
          self.hooks[hookname].append(hook)

-     def _prepare(self, download=False) -> "Base":
-         raise NotImplementedError(f"prepare() in {self.__class__}")
+     @abstractmethod
+     def _prepare(self) -> "Base":
+         ...

      def format(self, encoder: str) -> str:
          s = self.prepare()
@@ -194,7 +206,10 @@
          from datamaestro.data import Base

          if isinstance(data, Base):
-             data.id = f"{id}@{self.repository.name}"
+             if self.repository is None:
+                 data.id = id
+             else:
+                 data.id = f"{id}@{self.repository.name}"
              for key, value in data.__xpm__.values.items():
                  if isinstance(value, Config):
                      self.setDataIDs(value, f"{id}.{key}")
@@ -203,6 +218,7 @@
          """Download all the necessary resources"""
          success = True
          logging.info("Materializing %d resources", len(self.ordered_resources))
+         self.prepare()
          for resource in self.ordered_resources:
              try:
                  resource.download(force)
@@ -216,6 +232,8 @@
      @staticmethod
      def find(name: str) -> "DataDefinition":
          """Find a dataset given its name"""
+         from datamaestro.context import Context  # noqa: F811
+
          logging.debug("Searching dataset %s", name)
          for repository in Context.instance().repositories():
              logging.debug("Searching dataset %s in %s", name, repository)
@@ -226,7 +244,7 @@


  class FutureAttr:
-     """Allows to access a dataset subproperty"""
+     """Allows to access a dataset sub-property"""

      def __init__(self, dataset, keys):
          self.dataset = dataset
@@ -256,10 +274,14 @@ class DatasetWrapper(AbstractDataset):
      annotations (otherwise, derive from `AbstractDataset`).
      """

+     BUILDING: ClassVar[list["DatasetWrapper"]] = []
+     """Currently built dataset"""
+
      def __init__(self, annotation, t: type):
+         self.config = None
+         self.repository: Optional[Repository] = None
          self.t = t
          self.base = annotation.base
-         self.config = None
          assert self.base is not None, f"Could not set the Config type for {t}"

          repository, components = DataDefinition.repository_relpath(t)
@@ -271,19 +293,22 @@ class DatasetWrapper(AbstractDataset):

          # Builds the ID:
          # Removes module_name.config prefix
-         assert (
-             components[0] == "config"
-         ), f"A @dataset object should be in the .config module (not {t.__module__})"
+         if annotation.id is None or annotation.id == "":
+             # Computes an ID
+             assert (
+                 # id is empty string = use the module id
+                 components[0]
+                 == "config"
+             ), (
+                 "A @dataset without `id` should be in the "
+                 f".config module (not {t.__module__})"
+             )
+             path = ".".join(components[1:-1])

-         path = ".".join(components[1:-1])
-         if annotation.id == "":
-             # id is empty string = use the module id
              self.id = path
          else:
-             self.id = "%s.%s" % (
-                 path,
-                 annotation.id or t.__name__.lower().replace("_", "."),
-             )
+             # Use the provided ID
+             self.id = annotation.id

          self.aliases.add(self.id)

@@ -327,9 +352,6 @@ class DatasetWrapper(AbstractDataset):
      def configtype(self):
          return self.base

-     def __call__(self, *args, **kwargs):
-         self.t(*args, **kwargs)
-
      def __getattr__(self, key):
          """Returns a pointer to a potential attribute"""
          return FutureAttr(self, [key])
@@ -339,40 +361,59 @@ class DatasetWrapper(AbstractDataset):
          self._prepare()
          return super().download(force=force)

-     def _prepare(self, download=False) -> "Base":
+     @contextmanager
+     def building(self):
+         DatasetWrapper.BUILDING.append(self)
+         yield self
+         DatasetWrapper.BUILDING.pop()
+
+     def _prepare(self) -> "Base":
          if self.config is not None:
              return self.config

+         # Direct creation of the dataset
          if self.base is self.t:
              self.config = self.base.__create_dataset__(self)

-         if download:
-             for hook in self.hooks["pre-download"]:
-                 hook(self)
-             if not self.download(False):
-                 raise Exception("Could not load necessary resources")
+         # Construct the object
+         resources = {key: value.prepare() for key, value in self.resources.items()}
+
+         with self.building():
+             result = self.t(**resources)
+
+         # Download resources
          logging.debug("Building with data type %s and dataset %s", self.base, self.t)
          for hook in self.hooks["pre-use"]:
              hook(self)

-         # Construct the object
-         if self.config is None:
-             resources = {key: value.prepare() for key, value in self.resources.items()}
-             dict = self.t(**resources)
-             if dict is None:
-                 name = self.t.__name__
-                 filename = inspect.getfile(self.t)
-                 raise Exception(
-                     f"The dataset method {name} defined in "
-                     f"{filename} returned a null object"
-                 )
-             self.config = self.base(**dict)
+         if result is None:
+             name = self.t.__name__
+             filename = inspect.getfile(self.t)
+             raise Exception(
+                 f"The dataset method {name} defined in "
+                 f"{filename} returned a null object"
+             )
+
+         if isinstance(result, dict):
+             self.config = self.base(**result)
+         elif isinstance(result, self.base):
+             self.config = result
+         else:
+             raise RuntimeError(
+                 f"The dataset method {name} defined in "
+                 f"{filename} returned an object of type {type(dict)}"
+             )
+
+         # Setup ourself
+         self.config.__datamaestro_dataset__ = self

          # Set the ids
          self.setDataIDs(self.config, self.id)

          return self.config

+     __call__ = _prepare
+
      @property
      def _path(self) -> Path:
          """Returns a unique relative path for this dataset"""
@@ -384,7 +425,20 @@ class DatasetWrapper(AbstractDataset):
      @property
      def datapath(self):
          """Returns the destination path for downloads"""
-         return self.repository.datapath / self._path
+         from datamaestro import Context  # noqa: F811
+
+         path = Context.instance().storepath / self._path
+
+         if (self.repository is not None) and (not path.exists()):
+             old_path: Path = self.repository.datapath / self._path
+             if old_path.exists():
+                 logging.info(
+                     "Moving from old path [%s] to new path [%s]", old_path, path
+                 )
+                 path.parent.mkdir(exist_ok=True, parents=True)
+                 old_path.rename(path)
+
+         return path

      def hasfiles(self) -> bool:
          """Returns whether this dataset has files or only includes references"""
@@ -426,10 +480,16 @@ class DatasetAnnotation:
      """Base class for all annotations"""

      def __call__(self, dataset: AbstractDataset):
-         assert isinstance(
-             dataset, AbstractDataset
-         ), f"Only datasets can be annotated with {self}, but {dataset} is not a dataset"
-         self.annotate(dataset)
+         if isinstance(dataset, AbstractDataset):
+             self.annotate(dataset)
+         elif issubclass(dataset, Dataset):
+             self.annotate(dataset.__datamaestro__)
+         else:
+             raise RuntimeError(
+                 f"Only datasets can be annotated with {self}, "
+                 f"but {dataset} is not a dataset"
+             )
+
          return dataset

      def annotate(self, dataset: AbstractDataset):
@@ -477,9 +537,27 @@ datatags = DataTagging(lambda d: d.tags)
  datatasks = DataTagging(lambda d: d.tasks)


+ class metadata:
+     def __init__(
+         self, tags: Union[str, List[str]] = None, tasks: Union[str, List[str]] = None
+     ):
+         pass
+
+     def __call__(self, object: type):
+         # FIXME: todo
+         return object
+
+
  class dataset:
      def __init__(
-         self, base=None, *, timestamp=None, id=None, url=None, size=None, doi=None
+         self,
+         base=None,
+         *,
+         timestamp=None,
+         id=None,
+         url=None,
+         size=None,
+         doi=None,
      ):
          """Creates a new (meta)dataset

@@ -523,9 +601,12 @@ class dataset:
          if inspect.isclass(t) and issubclass(t, Base):
              self.base = t
          else:
-             # Get type from return annotation
              try:
-                 self.base = t.__annotations__["return"]
+                 # Get type from return annotation
+                 return_type = t.__annotations__["return"]
+                 if isinstance(return_type, _GenericAlias):
+                     return_type = return_type.__origin__
+                 self.base = return_type
              except KeyError:
                  logging.warning("No return annotation in %s", t)
                  raise
@@ -533,7 +614,6 @@ class dataset:
              raise AssertionError("@data should only be called once")
          except AttributeError:
              pass
-
          dw = DatasetWrapper(self, t)
          t.__dataset__ = dw
          if inspect.isclass(t) and issubclass(t, Base):
@@ -560,3 +640,5 @@ class metadataset(AbstractDataset):
              pass
          t.__datamaestro__ = self
          return t
+
+ _prepare = None
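Two consequences of the `DatasetWrapper` rework are worth spelling out: a `@dataset` function may now return either a dict of constructor arguments or an instance of its base type, and an explicit `id` lifts the requirement that the definition live in a `.config` module. A hedged sketch (all names hypothetical):

    from pathlib import Path
    from datamaestro import dataset
    from datamaestro.data import File

    @dataset(id="example.corpus", url="https://example.org/corpus")
    def corpus() -> File:  # base type inferred from the return annotation
        # either a dict of arguments for the base type, or a File instance
        return {"path": Path("corpus.txt")}

    # corpus() now builds (and caches) the configuration directly, since
    # DatasetWrapper.__call__ is an alias of _prepare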
datamaestro/download/__init__.py CHANGED
@@ -43,6 +43,13 @@ class Resource(DatasetAnnotation, ABC):
          dataset.ordered_resources.append(self)
          self.definition = dataset

+     def contextualize(self):
+         """When using an annotation inline, uses the current dataset wrapper object"""
+         from datamaestro.definitions import DatasetWrapper
+
+         wrapper = DatasetWrapper.BUILDING[-1]
+         self.annotate(wrapper)
+
      @property
      def context(self):
          return self.definition.context
@@ -77,7 +84,7 @@ class Resource(DatasetAnnotation, ABC):
  Download = Resource


- class reference(Download):
+ class reference(Resource):
      def __init__(self, varname=None, reference=None):
          """References another dataset

datamaestro/download/custom.py ADDED
@@ -0,0 +1,29 @@
+ from typing import Protocol
+ from pathlib import Path
+ from datamaestro import Context
+ from datamaestro.definitions import DatasetWrapper
+ from datamaestro.download import Resource
+
+
+ class Downloader(Protocol):
+     def __call__(self, context: Context, root: Path, *, force=False):
+         pass
+
+
+ class CustomResource(Resource):
+     def __init__(self, ds_wrapper: DatasetWrapper, downloader: Downloader):
+         self.ds_wrapper = ds_wrapper
+         self.downloader = downloader
+
+     def prepare(self):
+         pass
+
+     def download(self, force=False):
+         self.downloader(self.context, self.ds_wrapper.datapath, force=force)
+
+
+ def custom_download(downloader: Downloader) -> Path:
+     ds_wrapper = DatasetWrapper.BUILDING[-1]
+     ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
+
+     return ds_wrapper.datapath
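`custom_download` leans on `DatasetWrapper.BUILDING`, so it only works inline, while a `@dataset` function body is being evaluated. A hedged usage sketch (the downloader logic is hypothetical):

    from pathlib import Path
    from datamaestro import Context, dataset
    from datamaestro.data import Folder
    from datamaestro.download.custom import custom_download

    def fetch(context: Context, root: Path, *, force=False):
        # hypothetical downloader: populate `root` however is appropriate
        root.mkdir(parents=True, exist_ok=True)

    @dataset(id="example.scraped")
    def scraped() -> Folder:
        return {"path": custom_download(fetch)}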
datamaestro/download/single.py CHANGED
@@ -9,7 +9,7 @@ import os
  import urllib3
  from pathlib import Path
  import re
- from datamaestro.utils import copyfileobjs
+ from datamaestro.utils import copyfileobjs, FileChecker
  from datamaestro.stream import Transform
  from datamaestro.download import Download

@@ -96,6 +96,20 @@ class filedownloader(SingleDownload):
          logging.info("Created file %s" % destination)


+ def file_from_url(
+     filename: str,
+     url: str,
+     *,
+     size: Optional[int] = None,
+     transforms: Optional[Transform] = None,
+     checker: Optional[FileChecker] = None,
+ ) -> Path:
+     """Defines a file that should be downloaded from"""
+     downloader = filedownloader(filename, url, size, transforms, checker)
+     downloader.contextualize()
+     return downloader.path
+
+
  class concatdownload(SingleDownload):
      """Concatenate all files in an archive"""

datamaestro/search.py CHANGED
@@ -40,7 +40,7 @@ class AndCondition(Condition):
          return True

      def __repr__(self):
-         return " AND ".join(self.conditions)
+         return " AND ".join([repr(x) for x in self.conditions])


  class OrCondition(Condition):
datamaestro/test/test_annotations.py CHANGED
@@ -5,6 +5,7 @@ from datamaestro.definitions import AbstractDataset
  def test_useragreements(context):
      # Fake dataset
      class t(AbstractDataset):
-         pass
+         def _prepare(self):
+             pass

      useragreement("test")(t(None))
datamaestro/test/test_download_handlers.py CHANGED
@@ -12,6 +12,9 @@ class Dataset(AbstractDataset):
          super().__init__(repository)
          self.datapath = Path(repository.context._path)

+     def _prepare(self):
+         pass
+

  def test_filedownloader(context):
      repository = MyRepository(context)
datamaestro/utils.py CHANGED
@@ -42,6 +42,8 @@ def copyfileobjs(fsrc, fdsts, length=0):


  class FileChecker:
+     """Checks a file"""
+
      def check(self, path: Path):
          """Check the given file

datamaestro/version.py CHANGED
@@ -1,8 +1,13 @@
- # file generated by setuptools_scm
+ # file generated by setuptools-scm
  # don't change, don't track in version control
+
+ __all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
+
  TYPE_CHECKING = False
  if TYPE_CHECKING:
-     from typing import Tuple, Union
+     from typing import Tuple
+     from typing import Union
+
      VERSION_TUPLE = Tuple[Union[int, str], ...]
  else:
      VERSION_TUPLE = object
@@ -12,5 +17,5 @@ __version__: str
  __version_tuple__: VERSION_TUPLE
  version_tuple: VERSION_TUPLE

- __version__ = version = '1.2.1'
- __version_tuple__ = version_tuple = (1, 2, 1)
+ __version__ = version = '1.3.0'
+ __version_tuple__ = version_tuple = (1, 3, 0)
datamaestro-1.3.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: datamaestro
- Version: 1.2.1
+ Version: 1.3.0
  Summary: "Dataset management command line and API"
  Home-page: https://github.com/experimaestro/datamaestro
  Author: Benjamin Piwowarski
@@ -25,17 +25,19 @@ Requires-Dist: click
  Requires-Dist: tqdm
  Requires-Dist: urllib3
  Requires-Dist: marshmallow
- Requires-Dist: cached-property
+ Requires-Dist: cached_property
  Requires-Dist: requests
  Requires-Dist: bitmath
- Requires-Dist: experimaestro >=1.5.0
+ Requires-Dist: experimaestro>=1.5.0
  Requires-Dist: mkdocs
  Requires-Dist: pymdown-extensions
  Requires-Dist: mkdocs-material
- Requires-Dist: docstring-parser
+ Requires-Dist: docstring_parser
  Requires-Dist: numpy
  Provides-Extra: test
- Requires-Dist: tox ; extra == 'test'
+ Requires-Dist: tox; extra == "test"
+ Dynamic: license-file
+ Dynamic: requires-dist

  [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)

datamaestro-1.3.0.dist-info/RECORD CHANGED
@@ -1,31 +1,34 @@
- datamaestro/__init__.py,sha256=9M5hA6FVngduJBcjInvJWQM8n0cqapXAFPzfRLHR74c,237
+ datamaestro/__init__.py,sha256=gnbxrPFzIuG4oR2Qrw9UYS0SNVsf4yCtqNvzSjstdak,376
  datamaestro/__main__.py,sha256=tJTf1sTWKRIatvBcHlWDIZRZodAZ2B2zkD01pD89MYk,9024
- datamaestro/context.py,sha256=8U5EYEdc9xcHnZFFk4PCZttxxGsmlzRVR8rLBy2zVBw,13605
- datamaestro/definitions.py,sha256=mBoLgrbO1eHVcqMPkb4lxadNdgSsy_w355nZofvBoF8,16732
+ datamaestro/context.py,sha256=S7sQ6RQVLjtoY5iyAikfyvfbqoaoDzcHt4-js8t6mMg,13653
+ datamaestro/definitions.py,sha256=HEnwB32Reb4ouLOjboEOe_j88keBZPQ0SU6OrO_ohLU,18764
  datamaestro/record.py,sha256=m3WGsPcZ1LouQXNJOBUK3QusAIRiuy6T_oqhq09-Ckg,5504
  datamaestro/registry.py,sha256=M7QJkcWJP_cxAoqIioLQ01ou2Zg9RqGQvW0XGVspYFE,1421
- datamaestro/search.py,sha256=PMceNp5hcp0dlzs4cLb6LJT7XHrdXo58oO7oTucawbE,2887
+ datamaestro/search.py,sha256=bRT-91-2VJJ2JSfNaS1mzaVfqq_HMVBVs-RBj0w-ypM,2906
  datamaestro/settings.py,sha256=HYSElTUYZ6DZocBb9o3ifm6WW9knRO64XJUwxGIpvwQ,1304
  datamaestro/sphinx.py,sha256=bp7x_2BFoTSwTqcVZDM8R8cWa7G2pz0Zb8GS054lLYM,6996
- datamaestro/utils.py,sha256=Y3_aqeOHW8vuifwggGWJfgONyDG1FLX7ONAnX85jENI,6511
- datamaestro/version.py,sha256=2U0Gn26fYI3Vgj5hgkLM8I3wI6YEVdffJGllaVW-sSc,411
+ datamaestro/utils.py,sha256=9m-AVVww6InAZfGFiGy6XJzfExpYNqH1fhWQEezjafA,6536
+ datamaestro/version.py,sha256=qDtcPZdKzxLpd8vVl6fpIFIMkWt2HK_cO9gLDwaHEdk,511
  datamaestro/annotations/__init__.py,sha256=jLprrxSBa5QIqc--vqycEcxU4CR9WjVNRaqR5lH0EuE,39
  datamaestro/annotations/agreement.py,sha256=xEH0ddZxdJ_oG_150PoOa-WjY_OaeQja3FzMzY5IB6k,955
  datamaestro/commands/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datamaestro/commands/mainstyle.css,sha256=EAWq6hKWjLYZ-gUrGV-z3L8LtkubD7mLoYdSIC7kLOo,465
  datamaestro/commands/site.py,sha256=nnz4tOwKcgUmsLfPcQVo2SgFIC3OShYfJ8S2N6vuzAw,14173
- datamaestro/data/__init__.py,sha256=vOedQsnYtxI2yj-M2nm32eHpIu9S_WRzfA3futlHNs4,1412
- datamaestro/data/csv.py,sha256=-UXjEbKPvhhZ9_MdYnxUsD8Zsz2t4ZFbserFuHak8pw,2515
+ datamaestro/data/__init__.py,sha256=Z1qZnliJwS5sRaLznK5YBVJCjvAlPbmJjbRvvLv_UVI,1547
+ datamaestro/data/csv.py,sha256=jcXFVBOEQoSi3YL60bqtwjCf2YXHboaMpUmiXZpzuPM,2506
  datamaestro/data/huggingface.py,sha256=rCMiMqVgNI9zRAgm9PYnbwb7musYryBoIP3HuJmH4sg,691
- datamaestro/data/ml.py,sha256=guh1bxi7Dl3SajJdtBFrtPh6K8eNKiMkBKmBeKGuW5U,710
- datamaestro/data/tensor.py,sha256=OVzV1krIRslui8REdl7hPFu3AXlUyDxf5yUZlbNYsz8,2001
- datamaestro/download/__init__.py,sha256=Iqz7zEzeTsBWzE_6bpurhZVtzRjyXVUwCY6MEVjJpO0,2592
+ datamaestro/data/ml.py,sha256=7Rv4Tb9g17HDj8mOBJpIDjgolGQAd5Wrb0mHlnm-bPE,709
+ datamaestro/data/tensor.py,sha256=in36UQz4cdUEVmCS62pInu9RNekohRON667Z_JqNdhk,2254
+ datamaestro/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamaestro/datasets/yaml_repository.py,sha256=-UgpRwIALzmfubtb6kXVKjZ9IbiAsnslSth2I1XQ6EU,2539
+ datamaestro/download/__init__.py,sha256=XcRw9acAq1IwhLQZpj2HpMNEaMesA5BbllJpbRCkOwA,2846
  datamaestro/download/archive.py,sha256=G-2gzepknqT7Us3naMGAApGVGJMeHQIxM-tSpaa9ark,5608
+ datamaestro/download/custom.py,sha256=2-gFoOgQ8J93HjH9sc7u6wjVYm7DmSytP1ty2O6-d8k,839
  datamaestro/download/huggingface.py,sha256=LkzmZo2Z0yccqAfj7di7jDNGFrMKN9m8IM8SfexOomY,1125
  datamaestro/download/links.py,sha256=GFnq_AzI_uen7JBuGWD9qveeC9QFBWDrSnj7pOcwWwM,3352
  datamaestro/download/manual.py,sha256=-T2QWxKAiN3ZbSujjQUVeWDEDFonw9VnlzCfBIHcLao,190
  datamaestro/download/multiple.py,sha256=Mrr0ObHM5cE1CPSHE9PKIrox3qZVgxwRyxLzNXp0LqM,2159
- datamaestro/download/single.py,sha256=QSEviTP9lHLh3ZGyo_KoW3ro8UvWCGNPHeZiNj-9rLA,4134
+ datamaestro/download/single.py,sha256=bMDLldvODp2ZXyxXeKLT4qbL-v4igA6A7HVjIt2Cf8c,4526
  datamaestro/download/sync.py,sha256=Z_LsXj4kbZWIYKTVJZEhfdpYiv6wXOOIyw8LahmEcqs,836
  datamaestro/download/todo.py,sha256=y3YnmWC_i-u23ce-vreIwIXZcoO-uA0HXErgJPThnco,256
  datamaestro/download/wayback.py,sha256=B9X1P9jElvd_qnUs9aX0TAO-NrNyvuHLYDAcpNq354w,5430
@@ -36,12 +39,12 @@ datamaestro/templates/dataset.py,sha256=5065rTMAIl4gtzQ96GFiV1_46tY08miIx3WspTP8
  datamaestro/test/__init__.py,sha256=8-oxS68ufD45pv_HldE4S4rSWFF6L-UB_Cms-72DD2M,22
  datamaestro/test/checks.py,sha256=1eTkz4YJhAPOcnQSsz4vPnvzwwfrEnpn6H_s1ADISpo,1704
  datamaestro/test/conftest.py,sha256=it4S5Qq1CA_U8qM0pr4m7v-1dhLj5Y49WjVg5Ee3mpM,767
- datamaestro/test/test_annotations.py,sha256=kRPUmS_UAN6JSSVPUwV4OM_LEuEUHF1OcLSiYXjsKjw,246
- datamaestro/test/test_download_handlers.py,sha256=Qqm-fML1KVp6dPwAUcH6xzi_dpQIshvROzviSYCUzc0,603
+ datamaestro/test/test_annotations.py,sha256=XUjDWb3FJimSD91wcItJ0lLwTBmvN4wVu_EgTKSvV2c,278
+ datamaestro/test/test_download_handlers.py,sha256=-Gofr89zqIyeI8C4rZqfYR3JfiZVImdcSz9s6q361zQ,641
  datamaestro/test/test_record.py,sha256=hNZ3uo2i5FZ0VsOHRwvLO1Z6Zce92PdipAF65UptPB8,1156
- datamaestro-1.2.1.dist-info/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
- datamaestro-1.2.1.dist-info/METADATA,sha256=2_TL_ysMtfV2a84_0Uu3UQloCHCvetGZWo5tcjdhNCA,8999
- datamaestro-1.2.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
- datamaestro-1.2.1.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
- datamaestro-1.2.1.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
- datamaestro-1.2.1.dist-info/RECORD,,
+ datamaestro-1.3.0.dist-info/licenses/LICENSE,sha256=WJ7YI-moTFb-uVrFjnzzhGJrnL9P2iqQe8NuED3hutI,35141
+ datamaestro-1.3.0.dist-info/METADATA,sha256=UT7JBZzAVGEKtEjUm0jjiHMPW7ZtHlgWljs_9O8s_04,9042
+ datamaestro-1.3.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ datamaestro-1.3.0.dist-info/entry_points.txt,sha256=8qMhwSRvFG2iBqtJYVD22Zd4s4c3YkODtcp0Ajw1knw,133
+ datamaestro-1.3.0.dist-info/top_level.txt,sha256=XSznaMNAA8jELV7-TOqaAgDsjLzUf9G9MxL7C4helT0,12
+ datamaestro-1.3.0.dist-info/RECORD,,
datamaestro-1.3.0.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (78.1.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
