datamaestro 1.0.6__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datamaestro-1.0.6 → datamaestro-1.2.0}/PKG-INFO +1 -1
- datamaestro-1.2.0/docs/source/api/records.rst +112 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/agreement.py +9 -3
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/context.py +18 -9
- datamaestro-1.2.0/src/datamaestro/data/ml.py +27 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/definitions.py +58 -18
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/__init__.py +31 -2
- datamaestro-1.2.0/src/datamaestro/record.py +177 -0
- datamaestro-1.2.0/src/datamaestro/test/test_record.py +72 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/version.py +2 -2
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/PKG-INFO +1 -1
- datamaestro-1.0.6/docs/source/api/records.rst +0 -59
- datamaestro-1.0.6/src/datamaestro/data/ml.py +0 -19
- datamaestro-1.0.6/src/datamaestro/record.py +0 -312
- datamaestro-1.0.6/src/datamaestro/test/test_record.py +0 -151
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.coverage +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/pytest.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.github/workflows/python-publish.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.gitignore +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.pre-commit-config.yaml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/.readthedocs.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/CHANGELOG.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/LICENSE +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/MANIFEST.in +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/README.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/TODO.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/Makefile +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/make.bat +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/requirements.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/data.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/download.rst +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/api/index.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/conf.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/datasets.rst +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/developping.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/index.md +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/docs/source/style.css +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/mkdocs.yml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/pyproject.toml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/pytest.ini +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements-dev.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/requirements.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/schema.yaml +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.cfg +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/setup.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/__main__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/annotations/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/mainstyle.css +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/commands/site.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/csv.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/huggingface.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/data/tensor.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/archive.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/huggingface.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/links.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/manual.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/multiple.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/single.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/sync.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/download/todo.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/registry.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/search.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/settings.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/sphinx.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/compress.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/stream/lines.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/templates/dataset.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/__init__.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/checks.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/conftest.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_annotations.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/test/test_download_handlers.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro/utils.py +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/SOURCES.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/dependency_links.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/entry_points.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/not-zip-safe +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/requires.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/src/datamaestro.egg-info/top_level.txt +0 -0
- {datamaestro-1.0.6 → datamaestro-1.2.0}/tox.ini +0 -0
docs/source/api/records.rst (new, +112)

```diff
@@ -0,0 +1,112 @@
+Records
+=======
+
+Records are a flexible way to compose information coming from various sources. For instance,
+your processing chain can produce records that only contain an ID. Later, you can retrieve
+the item content and add it to the record. Further in the processing, you might want to add
+some transformation of the item content.
+
+Records allow performing this type of transformation by holding a set of **items**. Record
+types form a lattice of types, so checking that some item types are present in a record is easy.
+
+.. code-block:: python
+
+    @define
+    class AItem(Item):
+        a: int
+
+
+    @define
+    class A1Item(AItem):
+        a1: int
+
+
+    @define
+    class BItem(Item):
+        b: int
+
+
+    @define
+    class CItem(Item):
+        c: int
+
+
+    record = Record(AItem(1), BItem(2))
+    print(record[AItem].a)  # 1
+    print(record[BItem].b)  # 2
+
+    # record types are only defined by their item types
+    other_record = Record(A1Item(1, 2), BItem(2))
+
+    # records can be updated
+    new_record = record.update(BItem(3), CItem(4))
+    print(new_record[BItem].b)  # 3
+    print(new_record[CItem].c)  # 4
+
+    # records only hold one instance of a given item base type
+    new_record_a1 = record.update(A1Item(3, 4))
+    print(new_record_a1[AItem].a)  # 3
+    print(new_record_a1[A1Item].a)  # 3
+    print(new_record_a1[A1Item].a1)  # 4
+
+
+Working with record types
+*************************
+
+Record types form a lattice of types that can be used to check
+record properties beforehand.
+
+.. code-block:: python
+
+    ABRecord = record_type(AItem, BItem)
+    A1BRecord = record_type(A1Item, BItem)
+
+    # Hierarchy-based check
+    assert A1BRecord.contains(ABRecord)
+
+    # Checks for specific types
+    assert ABRecord.has(AItem) and ABRecord.has(BItem)
+
+Validating
+**********
+
+To ensure that a record fulfills the requested properties,
+one can use record types:
+
+.. code-block:: python
+
+    ABRecord = record_type(AItem, BItem)
+
+    # OK
+    ABRecord(AItem(1), BItem(2))
+
+    # Fails: A1Item is not AItem
+    ABRecord(A1Item(1, 2), BItem(2))
+
+    # Fails: AItem is not present
+    ABRecord(BItem(2))
+
+When updating, it is also possible to validate:
+
+.. code-block:: python
+
+    A1BRecord = record_type(A1Item, BItem)
+    record = Record(AItem(1), BItem(2))
+
+    # Update the A/B record into an A1/B one
+    record.update(A1Item(1, 2), target=A1BRecord)
+
+
+API
+***
+
+.. autoclass:: datamaestro.record.Item
+
+.. autoclass:: datamaestro.record.RecordType
+    :members: __call__, validate, sub
+
+.. autoclass:: datamaestro.record.Record
+    :members: update, has, get
+
+.. autofunction:: datamaestro.record.record_type
```
src/datamaestro/annotations/agreement.py (+9 -3)

```diff
@@ -1,9 +1,15 @@
-import
-from datamaestro.definitions import
+from typing import Optional
+from datamaestro.definitions import AbstractDataset, hook


 @hook("pre-use")
-def useragreement(definition: AbstractDataset, message, id=None):
+def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
+    """Asks for a user agreement
+
+    :param definition: The dataset for which the agreement is asked
+    :param message: The agreement text
+    :param id: The ID of the agreement (defaults to the dataset ID)
+    """
     # Skip agreement when testing
     if definition.context.running_test:
         return
```
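For context, `useragreement` is applied as an annotation on a dataset definition. A plausible sketch follows; the dataset name, URL, and agreement text are invented, and the exact decorator stacking is an assumption, not part of this diff:

```python
# Hypothetical usage: all identifiers below are made up for illustration.
from datamaestro.annotations.agreement import useragreement
from datamaestro.data import Base
from datamaestro.definitions import dataset


@useragreement(
    "By using this dataset, you agree to its terms of use.",  # the agreement text
    id="org.example.mydataset",  # optional; defaults to the dataset ID
)
@dataset(url="https://example.org/dataset")
def mydataset() -> Base:
    # Keyword arguments used to build the Base configuration
    return {}
```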
src/datamaestro/context.py (+18 -9)

```diff
@@ -110,19 +110,20 @@ class Context:
         if repositoryid is None:
             return None

-
+        entry_points = [
             x
             for x in pkg_resources.iter_entry_points(
                 "datamaestro.repositories", repositoryid
             )
         ]
-        if not
+        if not entry_points:
             raise Exception("No datasets repository named %s", repositoryid)
-        if len(
+        if len(entry_points) > 1:
             raise Exception(
-                "Too many datasets repository named %s (%d)"
+                "Too many datasets repository named %s (%d)"
+                % (repositoryid, len(entry_points))
             )
-        return
+        return entry_points[0].load()(self)

     @property
     def running_test(self):
@@ -175,7 +176,6 @@ class Context:
         if dlpath.is_file():
             logging.debug("Using cached file %s for %s", dlpath, url)
         else:
-
             logging.info("Downloading %s", url)
             tmppath = dlpath.with_suffix(".tmp")

@@ -188,7 +188,7 @@ class Context:

     def ask(self, question: str, options: Dict[str, str]):
         """Ask a question to the user"""
-        print(question)
+        print(question)  # noqa: T201
         answer = None
         while answer not in options:
             answer = input().strip().lower()
@@ -268,6 +268,7 @@ class Datasets(Iterable["AbstractDataset"]):

     def __iter__(self) -> Iterable["AbstractDataset"]:
         from .definitions import DatasetWrapper
+        from datamaestro.data import Base

         # Iterates over defined symbols
         for key, value in self.module.__dict__.items():
@@ -276,10 +277,18 @@ class Datasets(Iterable["AbstractDataset"]):
             # Ensure it comes from the module
             if self.module.__name__ == value.t.__module__:
                 yield value
+            elif (
+                inspect.isclass(value)
+                and issubclass(value, Base)
+                and hasattr(value, "__dataset__")
+            ):
+                if self.module.__name__ == value.__module__:
+                    yield value.__dataset__


 class Repository:
-    """A repository regroup a set of datasets and their corresponding specific
+    """A repository regroup a set of datasets and their corresponding specific
+    handlers (downloading, filtering, etc.)"""

     def __init__(self, context: Context):
         """Initialize a new repository
@@ -315,7 +324,7 @@ class Repository:
         try:
             return get_distribution(cls.__module__).version
         except DistributionNotFound:
-
+            return None

     def __repr__(self):
         return "Repository(%s)" % self.basedir
```
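The repository lookup above goes through the `datamaestro.repositories` entry point group and instantiates the loaded class with the current `Context` (`entry_points[0].load()(self)`). A minimal sketch of how a third-party package might register itself in that group; the distribution, module, and class names are hypothetical:

```python
# setup.py of a hypothetical repository package
from setuptools import setup

setup(
    name="datamaestro-myrepo",
    packages=["datamaestro_myrepo"],
    entry_points={
        # Context.repository("myrepo") loads MyRepository and calls it
        # with the Context instance
        "datamaestro.repositories": [
            "myrepo = datamaestro_myrepo:MyRepository",
        ],
    },
)
```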
src/datamaestro/data/ml.py (new, +27)

```diff
@@ -0,0 +1,27 @@
+"""Machine learning generic data formats"""
+from typing import Generic, TypeVar, Optional
+from pathlib import Path
+from experimaestro import Param, Meta, argument
+from . import Base
+
+Train = TypeVar("Train", bound=Base)
+Validation = TypeVar("Validation", bound=Base)
+Test = TypeVar("Test", bound=Base)
+
+
+class Supervised(Base, Generic[Train, Validation, Test]):
+    train: Param[Base]
+    """The training dataset"""
+
+    validation: Param[Optional[Base]] = None
+    """The validation dataset (optional)"""
+
+    test: Param[Optional[Base]] = None
+    """The test dataset (optional)"""
+
+
+@argument("classes")
+class FolderBased(Base):
+    """Classification dataset where folders give the classes"""
+
+    path: Meta[Path]
```
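A minimal usage sketch for these new configurations, assuming experimaestro configurations are built with keyword arguments and that `classes` takes the list of class names; the paths and classes are made up:

```python
from pathlib import Path

from datamaestro.data.ml import FolderBased, Supervised

# Hypothetical folder-based classification splits
train = FolderBased(classes=["cat", "dog"], path=Path("/data/pets/train"))
test = FolderBased(classes=["cat", "dog"], path=Path("/data/pets/test"))

# validation is optional and defaults to None
pets = Supervised(train=train, test=test)
```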
src/datamaestro/definitions.py (+58 -18)

```diff
@@ -127,6 +127,13 @@ class AbstractDataset(AbstractData):
     """

     name: Optional[str] = None
+    """The name of the dataset"""
+
+    url: Optional[str] = None
+    """The URL of the dataset"""
+
+    doi: Optional[str] = None
+    """The DOI of this dataset"""

     def __init__(self, repository: Optional["Repository"]):
         super().__init__()
@@ -136,6 +143,7 @@ class AbstractDataset(AbstractData):

         # Associated resources
         self.resources: Dict[str, "Download"] = {}
+        self.ordered_resources = []

         # Hooks
         # pre-use: before returning the dataset object
@@ -194,13 +202,15 @@ class AbstractDataset(AbstractData):
     def download(self, force=False):
         """Download all the necessary resources"""
         success = True
-
+        logging.info("Materializing %d resources", len(self.ordered_resources))
+        for resource in self.ordered_resources:
             try:
                 resource.download(force)
             except Exception:
-                logging.error("Could not download resource %s",
+                logging.error("Could not download resource %s", resource)
                 traceback.print_exc()
                 success = False
+                break
         return success

     @staticmethod
@@ -249,6 +259,7 @@ class DatasetWrapper(AbstractDataset):
     def __init__(self, annotation, t: type):
         self.t = t
         self.base = annotation.base
+        self.config = None
         assert self.base is not None, f"Could not set the Config type for {t}"

         repository, components = DataDefinition.repository_relpath(t)
@@ -256,6 +267,7 @@ class DatasetWrapper(AbstractDataset):

         # Set some variables
         self.url = annotation.url
+        self.doi = annotation.doi

         # Builds the ID:
         # Removes module_name.config prefix
@@ -322,7 +334,18 @@ class DatasetWrapper(AbstractDataset):
         """Returns a pointer to a potential attribute"""
         return FutureAttr(self, [key])

+    def download(self, force=False):
+        if self.base is self.t:
+            self._prepare()
+        return super().download(force=force)
+
     def _prepare(self, download=False) -> "Base":
+        if self.config is not None:
+            return self.config
+
+        if self.base is self.t:
+            self.config = self.base.__create_dataset__(self)
+
         if download:
             for hook in self.hooks["pre-download"]:
                 hook(self)
@@ -332,23 +355,23 @@ class DatasetWrapper(AbstractDataset):
         for hook in self.hooks["pre-use"]:
             hook(self)

-        resources = {key: value.prepare() for key, value in self.resources.items()}
-        dict = self.t(**resources)
-        if dict is None:
-            name = self.t.__name__
-            filename = inspect.getfile(self.t)
-            raise Exception(
-                f"The dataset method {name} defined in "
-                f"{filename} returned a null object"
-            )
-
         # Construct the object
-
+        if self.config is None:
+            resources = {key: value.prepare() for key, value in self.resources.items()}
+            dict = self.t(**resources)
+            if dict is None:
+                name = self.t.__name__
+                filename = inspect.getfile(self.t)
+                raise Exception(
+                    f"The dataset method {name} defined in "
+                    f"{filename} returned a null object"
+                )
+            self.config = self.base(**dict)

         # Set the ids
-        self.setDataIDs(
+        self.setDataIDs(self.config, self.id)

-        return
+        return self.config

     @property
     def _path(self) -> Path:
@@ -455,7 +478,9 @@ datatasks = DataTagging(lambda d: d.tasks)


 class dataset:
-    def __init__(
+    def __init__(
+        self, base=None, *, timestamp=None, id=None, url=None, size=None, doi=None
+    ):
         """Creates a new (meta)dataset

         Meta-datasets are not associated with any base type
@@ -473,6 +498,8 @@ class dataset:
             url {[type]} -- [description] (default: {None})

             size {str} -- The size (should be a parsable format)
+
+            doi {str} -- The DOI of the corresponding paper
         """
         if hasattr(base, "__datamaestro__") and isinstance(
             base.__datamaestro__, metadataset
@@ -486,18 +513,31 @@ class dataset:
         self.meta = False
         self.timestamp = timestamp
         self.size = size
+        self.doi = doi

     def __call__(self, t):
         try:
             if self.base is None:
-
-
+                from datamaestro.data import Base
+
+                if inspect.isclass(t) and issubclass(t, Base):
+                    self.base = t
+                else:
+                    # Get the type from the return annotation
+                    try:
+                        self.base = t.__annotations__["return"]
+                    except KeyError:
+                        logging.warning("No return annotation in %s", t)
+                        raise
             object.__getattribute__(t, "__datamaestro__")
             raise AssertionError("@data should only be called once")
         except AttributeError:
             pass

         dw = DatasetWrapper(self, t)
+        t.__dataset__ = dw
+        if inspect.isclass(t) and issubclass(t, Base):
+            return t
         return dw

```
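The reworked `dataset.__call__` now accepts two definition styles: a function whose return annotation provides the base type, and a `Base` subclass decorated directly, which is returned unchanged with its wrapper stored in `__dataset__`. A sketch of both, with hypothetical names:

```python
from datamaestro.data import Base
from datamaestro.definitions import dataset


# Function style: the base type comes from the return annotation;
# a missing annotation now logs a warning and raises.
@dataset(url="https://example.org/corpus")
def my_corpus() -> Base:
    return {}


# Class style: the decorated class itself is returned, and the
# DatasetWrapper is reachable through MyDataset.__dataset__
@dataset(id="example.mydataset", doi="10.0000/example")
class MyDataset(Base):
    pass
```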
src/datamaestro/download/__init__.py (+31 -2)

```diff
@@ -1,6 +1,8 @@
+from typing import Union
 from abc import ABC, abstractmethod
 from datamaestro.definitions import AbstractDataset, DatasetAnnotation
 from datamaestro.utils import deprecated
+from attrs import define


 def initialized(method):
@@ -15,7 +17,12 @@ def initialized(method):
     return wrapper


-
+@define(kw_only=True)
+class SetupOptions:
+    pass
+
+
+class Resource(DatasetAnnotation, ABC):
     """
     Base class for all download handlers
     """
@@ -24,13 +31,16 @@ class Download(DatasetAnnotation, ABC):
         self.varname = varname
         # Ensures that the object is initialized
         self._post = False
+        self.definition = None

     def annotate(self, dataset: AbstractDataset):
+        assert self.definition is None
         # Register as a resource download
         if self.varname in dataset.resources:
             raise AssertionError("Name %s already declared as a resource", self.varname)

         dataset.resources[self.varname] = self
+        dataset.ordered_resources.append(self)
         self.definition = dataset

     @property
@@ -53,10 +63,29 @@ class Download(DatasetAnnotation, ABC):
         """Prepares the dataset"""
         ...

+    def setup(
+        self,
+        dataset: Union[AbstractDataset],
+        options: SetupOptions = None,
+    ):
+        """Direct way to set up the resource (no annotation)"""
+        self(dataset)
+        return self.prepare()
+
+
+# Keeps downwards compatibility
+Download = Resource
+

 class reference(Download):
-    def __init__(self, varname, reference):
+    def __init__(self, varname=None, reference=None):
+        """References another dataset
+
+        :param varname: The name of the variable
+        :param reference: Another dataset
+        """
         super().__init__(varname)
+        assert reference is not None, "Reference cannot be null"
         self.reference = reference

     def prepare(self):
```
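The new `setup` method gives a direct, annotation-free path: it applies the resource to a dataset definition and immediately prepares it. A sketch using the `reference` handler above; both dataset objects are placeholders for existing `AbstractDataset` definitions:

```python
from datamaestro.download import reference

# `other_dataset` and `target_dataset` stand in for existing definitions
res = reference(varname="base", reference=other_dataset)

# Equivalent to applying the annotation, then calling prepare()
prepared = res.setup(target_dataset)
```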
src/datamaestro/record.py (new, +177)

```diff
@@ -0,0 +1,177 @@
+from typing import Type, TypeVar, Dict, Union, Optional
+
+
+class Item:
+    """Base class for all item types"""
+
+    @classmethod
+    def __get_base__(cls: Type) -> Type:
+        """Get the most generic superclass for this type of item"""
+        if base := getattr(cls, "__base__cache__", None):
+            return base
+
+        base = cls
+        for supercls in cls.__mro__:
+            if issubclass(supercls, Item) and supercls is not Item:
+                base = supercls
+        setattr(cls, "__base__cache__", base)
+        return base
+
+
+T = TypeVar("T", bound=Item)
+Items = Dict[Type[T], T]
+
+
+class RecordType:
+    def __init__(self, *item_types: Type[T]):
+        self.item_types = frozenset(item_types)
+        self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}
+
+    def __repr__(self):
+        return f"""Record({",".join(item_type.__name__ for item_type in
+            self.item_types)})"""
+
+    def contains(self, other: "RecordType"):
+        """Checks that each item type in other has an item type of a compatible
+        type in self"""
+        if len(self.item_types) != len(other.item_types):
+            return False
+
+        for item_type in other.item_types:
+            if matching_type := self.mapping.get(item_type.__get_base__(), None):
+                if not issubclass(matching_type, item_type):
+                    return False
+            else:
+                return False
+
+        return True
+
+    def sub(self, *item_types: Type[T]):
+        """Returns a new record type based on self and new item types"""
+        cls_itemtypes = [x for x in self.item_types]
+        mapping = {
+            itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
+        }
+
+        for itemtype in item_types:
+            if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
+                cls_itemtypes[ix] = itemtype
+            else:
+                cls_itemtypes.append(itemtype)
+
+        return record_type(*cls_itemtypes)
+
+    def __call__(self, *items: T):
+        record = Record(*items)
+        self.validate(record)
+        return record
+
+    def has(self, itemtype: Type[T]):
+        return issubclass(self.mapping[itemtype.__get_base__()], itemtype)
+
+    def validate(self, record: "Record"):
+        """Validates a record against this record type"""
+        if self.item_types:
+            for item_type in self.item_types:
+                try:
+                    record.__getitem__(item_type)
+                except KeyError:
+                    raise KeyError(f"Item of type {item_type} is missing")
+
+            if len(record.items) != len(self.item_types):
+                unregistered = [
+                    item
+                    for item in record.items.values()
+                    if all(
+                        not issubclass(item.__get_base__(), item_type)
+                        for item_type in self.item_types
+                    )
+                ]
+                raise KeyError(
+                    f"The record of type {self} contains unregistered items: {unregistered}"
+                )
+
+        # Returns the validated record
+        return record
+
+
+def record_type(*item_types: Type[T]):
+    """Returns a new record type"""
+    return RecordType(*item_types)
+
+
+class Record:
+    """Associates types with entries
+
+    A record is a composition of items; each item base class is unique.
+    """
+
+    #: Items for this record
+    items: Items
+
+    def __init__(self, *items: Union[Items, T], override=False):
+        self.items = {}
+
+        if len(items) == 1 and isinstance(items[0], dict):
+            # Just copy the dictionary
+            self.items = items[0]
+        else:
+            for entry in items:
+                # Each item base type may only appear once
+                base = entry.__get_base__()
+                if not override and base in self.items:
+                    raise RuntimeError(
+                        f"The item type {base} ({entry.__class__})"
+                        " is already in the record"
+                    )
+                self.items[base] = entry
+
+    def __str__(self):
+        return (
+            "{"
+            + ", ".join(
+                f"{key.__module__}.{key.__qualname__}: {value}"
+                for key, value in self.items.items()
+            )
+            + "}"
+        )
+
+    def __repr__(self):
+        return (
+            "{"
+            + ", ".join(
+                f"{key.__module__}.{key.__qualname__}: {repr(value)}"
+                for key, value in self.items.items()
+            )
+            + "}"
+        )
+
+    def get(self, key: Type[T]) -> Optional[T]:
+        """Get a given item or None if it does not exist"""
+        try:
+            return self[key]
+        except KeyError:
+            return None
+
+    def has(self, key: Type[T]) -> bool:
+        """Returns True if the record has the given item type"""
+        return key.__get_base__() in self.items
+
+    def __getitem__(self, key: Type[T]) -> T:
+        """Get an item given its type"""
+        base = key.__get_base__()
+        entry = self.items[base]
+
+        # Check if this matches the expected class
+        if not isinstance(entry, key):
+            raise KeyError(f"No entry with type {key}")
+        return entry
+
+    def update(self, *items: T, target: RecordType = None) -> "Record":
+        """Update some items"""
+        # Create our new dictionary
+        item_dict = {**self.items}
+        for item in items:
+            item_dict[item.__get_base__()] = item
+
+        return Record(item_dict)
```
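To make the record semantics concrete, here is a small self-contained sketch against the module above; the item classes are invented for illustration:

```python
from datamaestro.record import Item, Record, record_type


class IDItem(Item):
    def __init__(self, id: str):
        self.id = id


class TextItem(Item):
    def __init__(self, text: str):
        self.text = text


class ScoreItem(Item):
    def __init__(self, score: float):
        self.score = score


record = Record(IDItem("doc-1"))
record = record.update(TextItem("hello"))  # update returns a new record

assert record.has(IDItem)
assert record[TextItem].text == "hello"
assert record.get(ScoreItem) is None  # absent items return None

# Validate against an explicit record type
FullRecord = record_type(IDItem, TextItem)
FullRecord.validate(record)
```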