PyPI - datamaestro - Versions diffs - 1.2.0__tar.gz → 1.3.0__tar.gz - Mend

datamaestro 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

{datamaestro-1.2.0 → datamaestro-1.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: datamaestro
-Version: 1.2.0
+Version: 1.3.0
 Summary: "Dataset management command line and API"
 Home-page: https://github.com/experimaestro/datamaestro
 Author: Benjamin Piwowarski
@@ -36,6 +36,8 @@ Requires-Dist: docstring_parser
 Requires-Dist: numpy
 Provides-Extra: test
 Requires-Dist: tox; extra == "test"
+Dynamic: license-file
+Dynamic: requires-dist
 [![PyPI version](https://badge.fury.io/py/datamaestro.svg)](https://badge.fury.io/py/datamaestro) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) [![DOI](https://zenodo.org/badge/4573876.svg)](https://zenodo.org/badge/latestdoi/4573876)

{datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/data.md RENAMED Viewed

@@ -39,11 +39,6 @@ Package `datamaestro.data.ml`
 .. autoxpmconfig:: datamaestro.data.ml.Supervised
 ```
-```{eval-rst}
-.. autoxpmconfig:: datamaestro.data.ml.FolderBased
-```
 ## Tensor
 Package `datamaestro.data.tensor`

{datamaestro-1.2.0 → datamaestro-1.3.0}/docs/source/api/download.rst RENAMED Viewed

@@ -40,6 +40,10 @@ Package `datamaestro.download.links`
 .. autofunction:: datamaestro.download.links.linkfile
+Other
+=====
+.. autofunction:: datamaestro.download.wayback.wayback_documents
@@ -58,3 +62,10 @@ File hashes can be checked with the following checker
 .. autoclass:: datamaestro.utils.FileChecker
 .. autoclass:: datamaestro.utils.HashCheck
        :members: __init__
+Custom
+======
+.. autofunction:: datamaestro.download.custom.Downloader
+.. autoclass:: datamaestro.download.custom.custom_download

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/__init__.py RENAMED Viewed

@@ -2,10 +2,14 @@
 from .context import (
     Context,
     Repository,
+    BaseRepository,
     get_dataset,
     prepare_dataset,
 )
-from pkg_resources import get_distribution, DistributionNotFound
+from .datasets.yaml_repository import YAMLRepository
+from pkg_resources import get_distribution, DistributionNotFound
+from .definitions import dataset, metadata
+from .data import Base
 from .version import version, version_tuple

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/context.py RENAMED Viewed

@@ -1,21 +1,22 @@
 from pathlib import Path
-from experimaestro.compat import cached_property
+from typing import Iterable, Iterator, Dict, Union
 import importlib
 import os
 import hashlib
 import logging
 import inspect
 import json
-from experimaestro.mkdocs.metaloader import Module
+from abc import ABC, abstractmethod
+from experimaestro import Config
 import pkg_resources
-from typing import Iterable, Iterator, List, Dict
+from experimaestro.compat import cached_property
+from experimaestro.mkdocs.metaloader import Module
 from .utils import CachedFile, downloadURL
 from .settings import UserSettings, Settings
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
-    from datamaestro.definitions import AbstractDataset
+    from datamaestro.definitions import AbstractDataset, DatasetWrapper
 class Compression:
@@ -87,6 +88,11 @@ class Context:
         return ContextManager()
+    @property
+    def storepath(self):
+        """Replaces the data path"""
+        return self._path.joinpath("store")
     @property
     def datapath(self):
         return self._path.joinpath("data")
@@ -98,7 +104,9 @@ class Context:
     @cached_property
     def repositorymap(self) -> Dict[str, "Repository"]:
         return {
-            repository.basemodule(): repository for repository in self.repositories()
+            repository.basemodule(): repository
+            for repository in self.repositories()
+            if repository.basemodule() is not None
         }
     def repositories(self) -> Iterable["Repository"]:
@@ -286,10 +294,53 @@ class Datasets(Iterable["AbstractDataset"]):
                     yield value.__dataset__
-class Repository:
-    """A repository regroup a set of datasets and their corresponding specific
+class BaseRepository(ABC):
+    """A repository groups a set of datasets and their corresponding specific
     handlers (downloading, filtering, etc.)"""
+    def __init__(self, context: Context):
+        self.context = context
+        p = inspect.getabsfile(self.__class__)
+        self.basedir = Path(p).parent
+    @abstractmethod
+    def __iter__(self) -> Iterator["AbstractDataset"]:
+        ...
+    def search(self, name: str):
+        """Search for a dataset in the definitions"""
+        for dataset in self:
+            if name in dataset.aliases:
+                return dataset
+    @classmethod
+    def instance(cls, context=None):
+        try:
+            return cls.__getattribute__(cls, "INSTANCE")
+        except AttributeError:
+            return cls(context if context else Context.instance())
+    @classmethod
+    def basemodule(cls):
+        return cls.__module__
+    @property
+    def generatedpath(self):
+        return self.basedir / "generated"
+    @property
+    def datapath(self):
+        return self.context.datapath.joinpath(self.id)
+    @property
+    def extrapath(self):
+        """Path to the directory containing extra configuration files"""
+        return self.basedir / "data"
+class Repository(BaseRepository):
+    """(deprecated) Repository where datasets are located in __module__.config"""
     def __init__(self, context: Context):
         """Initialize a new repository
@@ -297,26 +348,14 @@ class Repository:
         :param basedir: The base directory of the repository
             (by default, the same as the repository class)
         """
+        super().__init__(context)
         self.context = context
-        p = inspect.getabsfile(self.__class__)
-        self.basedir = Path(p).parent
         self.configdir = self.basedir.joinpath("config")
         self.id = self.__class__.NAMESPACE
         self.name = self.id
         self.module = self.__class__.__module__
         self.__class__.INSTANCE = self
-    @classmethod
-    def basemodule(cls):
-        return cls.__module__
-    @classmethod
-    def instance(cls, context=None):
-        try:
-            return cls.__getattribute__(cls, "INSTANCE")
-        except AttributeError:
-            return cls(context if context else Context.instance())
     @classmethod
     def version(cls):
         from pkg_resources import get_distribution, DistributionNotFound
@@ -336,36 +375,8 @@ class Repository:
         assert isinstance(other, Repository)
         return self.basedir == other.basedir
-    def search(self, name: str):
-        """Search for a dataset in the definitions"""
-        logging.debug("Searching for %s in %s", name, self.configdir)
-        candidates: List[str] = []
-        components = name.split(".")
-        path = self.configdir
-        for i, c in enumerate(components):
-            path = path / c
-            if (path / "__init__.py").is_file():
-                candidates.append(".".join(components[: i + 1]))
-            if path.with_suffix(".py").is_file():
-                candidates.append(".".join(components[: i + 1]))
-            if not path.is_dir():
-                break
-        # Get the dataset
-        for candidate in candidates[::-1]:
-            logging.debug("Searching in module %s.config.%s", self.module, candidate)
-            module = importlib.import_module("%s.config.%s" % (self.module, candidate))
-            for value in Datasets(module):
-                if name in value.aliases:
-                    return value
-        return None
-    def datasets(self, candidate):
+    def datasets(self, candidate: str):
+        """Returns the dataset candidates from a module"""
         try:
             module = importlib.import_module("%s.config.%s" % (self.module, candidate))
         except ModuleNotFoundError:
@@ -409,19 +420,6 @@ class Repository:
             for dataset in datasets:
                 yield dataset
-    @property
-    def generatedpath(self):
-        return self.basedir.joinpath("generated")
-    @property
-    def datapath(self):
-        return self.context.datapath.joinpath(self.id)
-    @property
-    def extrapath(self):
-        """Path to the directory containing extra configuration files"""
-        return self.basedir.joinpath("data")
 def find_dataset(dataset_id: str):
     """Find a dataset given its id"""
@@ -430,11 +428,17 @@ def find_dataset(dataset_id: str):
     return AbstractDataset.find(dataset_id)
-def prepare_dataset(dataset_id: str):
+def prepare_dataset(dataset_id: Union[str, "DatasetWrapper", Config]):
     """Find a dataset given its id and download the resources"""
-    from .definitions import AbstractDataset
+    from .definitions import AbstractDataset, DatasetWrapper
+    if isinstance(dataset_id, DatasetWrapper):
+        ds = dataset_id
+    elif isinstance(dataset_id, Config):
+        ds = dataset_id.__datamaestro_dataset__
+    else:
+        ds = AbstractDataset.find(dataset_id)
-    ds = AbstractDataset.find(dataset_id)
     return ds.prepare(download=True)

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/__init__.py RENAMED Viewed

@@ -1,22 +1,18 @@
+from abc import abstractmethod
 import logging
 from pathlib import Path
 from typing import Any, Dict
-from datamaestro.definitions import AbstractDataset, argument, Param
-from experimaestro import Config
-from experimaestro import documentation  # noqa: F401
+from experimaestro import Config, Param, Meta
+from datamaestro.definitions import AbstractDataset
 class Base(Config):
     """Base object for all data types"""
     id: Param[str]
-    """The unique dataset ID"""
+    """The unique (sub-)dataset ID"""
-    __datamaestro_dataset__: AbstractDataset
-    def download(self):
-        """Download the dataset"""
-        self.__datamaestro_dataset__.download()
+    __datamaestro_dataset__: "AbstractDataset"
     def dataset_information(self) -> Dict[str, Any]:
         """Returns document meta-informations"""
@@ -26,6 +22,16 @@ class Base(Config):
             "description": self.__datamaestro_dataset__.description,
         }
+    def download(self):
+        """Download the dataset"""
+        self.__datamaestro_dataset__.download()
+    @abstractmethod
+    def prepare(self, *args, **kwargs):
+        """Prepare the dataset"""
+        self.__datamaestro_dataset__.prepare()
+        return self
 class Generic(Base):
     """Generic dataset
@@ -44,16 +50,17 @@ class Generic(Base):
 class File(Base):
     """A data file"""
-    path: Param[Path]
+    path: Meta[Path]
     """The path of the file"""
     def open(self, mode):
         return self.path.open(mode)
-@argument("path", type=Path)
 class Folder(Base):
     """A data folder"""
+    path: Meta[Path]
     def open(self, mode):
         return self.path.open(mode)

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/csv.py RENAMED Viewed

@@ -1,7 +1,8 @@
+from typing import Optional, Tuple, List, Any
 from csv import reader as csv_reader
-from . import File, argument, documentation
-from datamaestro.definitions import Meta
-from typing import Tuple, List, Any
+from experimaestro import Param, Meta
+from experimaestro import documentation
+from . import File
 class Generic(File):
@@ -26,12 +27,13 @@ class Generic(File):
                     return row
-@argument("names_row", type=int, default=-1)
-@argument("size_row", type=int, default=-1)
-@argument("target", type=str, default=None)
 class Matrix(Generic):
     """A numerical dataset"""
+    names_row: Param[int] = -1
+    size_row: Param[int] = -1
+    target: Param[Optional[str]] = None
     @documentation
     def data(self) -> Tuple[List[str], Any]:
         """Returns the list of fields and the numeric data

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/ml.py RENAMED Viewed

@@ -1,7 +1,7 @@
 """Machine learning generic data formats"""
-from typing import Generic, TypeVar, Optional
 from pathlib import Path
-from experimaestro import Param, Meta, argument
+from typing import Generic, TypeVar, Optional
+from experimaestro import Param, Meta
 from . import Base
 Train = TypeVar("Train", bound=Base)
@@ -20,8 +20,8 @@ class Supervised(Base, Generic[Train, Validation, Test]):
     """The training optional"""
-@argument("classes")
 class FolderBased(Base):
     """Classification dataset where folders give the basis"""
+    classes: Param[list[str]]
     path: Meta[Path]

{datamaestro-1.2.0 → datamaestro-1.3.0}/src/datamaestro/data/tensor.py RENAMED Viewed

@@ -1,44 +1,50 @@
-from pathlib import Path
+from abc import ABC, abstractmethod
 from struct import Struct
-from . import File
+from typing import TYPE_CHECKING
+from . import File, Base
+if TYPE_CHECKING:
+    import numpy as np
-class IDX(File):
+class Tensor(Base, ABC):
+    @abstractmethod
+    def data(self) -> "np.ndarray":
+        """Returns the tensor in numpy format"""
+        pass
+class IDX(Tensor, File):
     """IDX File format
-    The IDX file format is a simple format for vectors and multidimensional matrices of various numerical types.
+    The IDX file format is a simple format for vectors and multidimensional
+    matrices of various numerical types.
     The basic format is:
-    magic number
-    size in dimension 0
-    size in dimension 1
-    size in dimension 2
-    .....
-    size in dimension N
-    data
+    magic number size in dimension 0 size in dimension 1 size in dimension 2
+    ..... size in dimension N data
     The magic number is an integer (MSB first). The first 2 bytes are always 0.
-    The third byte codes the type of the data:
-    0x08: unsigned byte
-    0x09: signed byte
-    0x0B: short (2 bytes)
-    0x0C: int (4 bytes)
-    0x0D: float (4 bytes)
-    0x0E: double (8 bytes)
+    The third byte codes the type of the data: 0x08: unsigned byte 0x09: signed
+    byte 0x0B: short (2 bytes) 0x0C: int (4 bytes) 0x0D: float (4 bytes) 0x0E:
+    double (8 bytes)
-    The 4-th byte codes the number of dimensions of the vector/matrix: 1 for vectors, 2 for matrices....
+    The 4-th byte codes the number of dimensions of the vector/matrix: 1 for
+    vectors, 2 for matrices....
-    The sizes in each dimension are 4-byte integers (MSB first, high endian, like in most non-Intel processors).
+    The sizes in each dimension are 4-byte integers (MSB first, high endian,
+    like in most non-Intel processors).
-    The data is stored like in a C array, i.e. the index in the last dimension changes the fastest.
+    The data is stored like in a C array, i.e. the index in the last dimension
+    changes the fastest.
     """
     MAGIC_NUMBER = Struct(">HBB")
     DIM = Struct(">I")
-    def data(self):
+    def data(self) -> "np.ndarray":
         """Returns the tensor"""
         import numpy as np
@@ -59,7 +65,8 @@ class IDX(File):
             shape = [IDX.DIM.unpack_from(fp.read(IDX.DIM.size))[0] for i in range(size)]
             size = np.prod(shape)
-            # Could use np.fromfile... if it were not broken - see https://github.com/numpy/numpy/issues/7989
+            # Could use np.fromfile... if it were not broken
+            # see https://github.com/numpy/numpy/issues/7989
             data = np.frombuffer(fp.read(), dtype=dtype, count=size)
             data = data.reshape(shape, order="C")
         return data

datamaestro-1.3.0/src/datamaestro/datasets/__init__.py ADDED Viewed

File without changes

datamaestro-1.3.0/src/datamaestro/datasets/yaml_repository.py ADDED Viewed

@@ -0,0 +1,103 @@
+import regex
+from typing import Iterator, Optional
+from functools import cached_property
+from attrs import field
+import importlib
+from omegaconf import OmegaConf
+from functools import partial
+from attrs import define
+from datamaestro import BaseRepository
+from datamaestro.definitions import AbstractDataset, DatasetWrapper
+from datamaestro.data import Base
+re_spec = regex.compile(r"""^(\w\.)+:(\w+)""")
+@define
+class RepositoryDataset:
+    ids: list[str]
+    """ID(s) of this dataset"""
+    entry_point: str = field(validator=re_spec.match)
+    """The entry point"""
+    title: str
+    """The full name of the dataset"""
+    description: str
+    """Description of the dataset"""
+    url: Optional[str]
+    """The URL"""
+    groups: Optional[list[str]]
+    """Groups to which this repository belongs"""
+@define
+class RepositoryAuthors:
+    name: str
+    email: str
+@define
+class RepositoryGroup:
+    name: str
+    tasks: list[str]
+    tags: list[str]
+@define
+class RepositoryConfiguration:
+    namespace: str
+    authors: list[RepositoryAuthors]
+    description: str
+    groups: dict[str, RepositoryGroup]
+    datasets: list[RepositoryDataset]
+class YAMLDataset(AbstractDataset):
+    def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
+        super().__init__(repository)
+        self.information = information
+        self.id = self.information.ids[0]
+        self.aliases = set(self.information.ids)
+    @cached_property
+    def wrapper(self) -> DatasetWrapper:
+        module, func_name = self.information.entry_point.split(":")
+        wrapper = getattr(importlib.import_module(module), func_name)
+        return wrapper
+    def _prepare(self) -> "Base":
+        return self.wrapper()
+    def download(self, **kwargs):
+        return self.wrapper.download(**kwargs)
+class YAMLRepository(BaseRepository):
+    """YAML-based repository"""
+    @property
+    def id(self):
+        return self.configuration.namespace
+    @property
+    def name(self):
+        return self.configuration.namespace
+    @cached_property
+    def configuration(self):
+        schema = OmegaConf.structured(RepositoryConfiguration)
+        with importlib.resources.path(
+            self.__class__.__module__, "datamaestro.yaml"
+        ) as fp:
+            conf = OmegaConf.load(fp)
+        conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
+        return conf
+    def __iter__(self) -> Iterator["AbstractDataset"]:
+        return map(partial(YAMLDataset, self), self.configuration.datasets)

datamaestro 1.2.0__tar.gz → 1.3.0__tar.gz

datamaestro 1.2.0__tar.gz → 1.3.0__tar.gz