PyPI - datamaestro - Versions diffs - 1.3.1__tar.gz → 1.4.0__tar.gz - Mend

datamaestro 1.3.1tar.gz → 1.4.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

{datamaestro-1.3.1 → datamaestro-1.4.0}/.github/workflows/pytest.yml RENAMED Viewed

@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8, 3.9, "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2

{datamaestro-1.3.1 → datamaestro-1.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro
-Version: 1.3.1
+Version: 1.4.0
 Summary: "Dataset management command line and API"
 Home-page: https://github.com/experimaestro/datamaestro
 Author: Benjamin Piwowarski
@@ -13,7 +13,6 @@ Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -28,7 +27,7 @@ Requires-Dist: marshmallow
 Requires-Dist: cached_property
 Requires-Dist: requests
 Requires-Dist: bitmath
-Requires-Dist: experimaestro>=1.5.0
+Requires-Dist: experimaestro>=1.6
 Requires-Dist: mkdocs
 Requires-Dist: pymdown-extensions
 Requires-Dist: mkdocs-material
@@ -98,22 +97,10 @@ $ datamaestro search tag:image
 [image] com.lecun.mnist
 $ datamaestro prepare com.lecun.mnist
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s]                                                            INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-...JSON...
+INFO:root:Materializing 4 resources
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
 ```
 The previous command also returns a JSON on standard output
@@ -159,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
 Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
-For MNIST, this corresponds to.
+For instance, the MNIST dataset can be described by the following definition:
 ```python
-from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
-from datamaestro.download.single import filedownloader
-from datamaestro.definitions import  argument, datatasks, datatags, dataset
-from datamaestro.data.tensor import IDX
+from datamaestro import dataset
+from datamaestro.download.single import download_file
+from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
 @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -176,26 +162,37 @@ from datamaestro.data.tensor import IDX
   ImageClassification,
   url="http://yann.lecun.com/exdb/mnist/",
 )
-def MNIST(train_images, train_labels, test_images, test_labels):
-  """The MNIST database
-  The MNIST database of handwritten digits, available from this page, has a
-  training set of 60,000 examples, and a test set of 10,000 examples. It is a
-  subset of a larger set available from NIST. The digits have been
-  size-normalized and centered in a fixed-size image.
-  """
-  return {
-    "train": LabelledImages(
-      images=IDXImage(path=train_images),
-      labels=IDX(path=train_labels)
-    ),
-    "test": LabelledImages(
-      images=IDXImage(path=test_images),
-      labels=IDX(path=test_labels)
-    ),
-  }
+    return ImageClassification(
+        train=LabelledImages(
+            images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
+        ),
+        test=LabelledImages(
+            images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
+        ),
+    )
+```
+When building dataset modules, some extra documentation can be provided:
+```yaml
+  ids: [com.lecun.mnist]
+  entry_point: "datamaestro_image.config.com.lecun:mnist"
+  title: The MNIST database
+  url: http://yann.lecun.com/exdb/mnist/
+  groups: [image-classification]
+  description: |
+    The MNIST database of handwritten digits, available from this page,
+    has a training set of 60,000 examples, and a test set of 10,000
+    examples. It is a subset of a larger set available from NIST. The
+    digits have been size-normalized and centered in a fixed-size image.
 ```
+This makes it possible to:
+1. Document the dataset
+2. Use the command line interface to manipulate it (download resources, etc.)
 # 0.8.0
 - Integration with other repositories: abstracting away the notion of dataset

{datamaestro-1.3.1 → datamaestro-1.4.0}/README.md RENAMED Viewed

@@ -57,22 +57,10 @@ $ datamaestro search tag:image
 [image] com.lecun.mnist
 $ datamaestro prepare com.lecun.mnist
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s]                                                            INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-...JSON...
+INFO:root:Materializing 4 resources
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
 ```
 The previous command also returns a JSON on standard output
@@ -118,13 +106,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
 Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
-For MNIST, this corresponds to.
+For instance, the MNIST dataset can be described by the following definition:
 ```python
-from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
-from datamaestro.download.single import filedownloader
-from datamaestro.definitions import  argument, datatasks, datatags, dataset
-from datamaestro.data.tensor import IDX
+from datamaestro import dataset
+from datamaestro.download.single import download_file
+from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
 @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -135,22 +122,33 @@ from datamaestro.data.tensor import IDX
   ImageClassification,
   url="http://yann.lecun.com/exdb/mnist/",
 )
-def MNIST(train_images, train_labels, test_images, test_labels):
-  """The MNIST database
-  The MNIST database of handwritten digits, available from this page, has a
-  training set of 60,000 examples, and a test set of 10,000 examples. It is a
-  subset of a larger set available from NIST. The digits have been
-  size-normalized and centered in a fixed-size image.
-  """
-  return {
-    "train": LabelledImages(
-      images=IDXImage(path=train_images),
-      labels=IDX(path=train_labels)
-    ),
-    "test": LabelledImages(
-      images=IDXImage(path=test_images),
-      labels=IDX(path=test_labels)
-    ),
-  }
+    return ImageClassification(
+        train=LabelledImages(
+            images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
+        ),
+        test=LabelledImages(
+            images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
+        ),
+    )
+```
+When building dataset modules, some extra documentation can be provided:
+```yaml
+  ids: [com.lecun.mnist]
+  entry_point: "datamaestro_image.config.com.lecun:mnist"
+  title: The MNIST database
+  url: http://yann.lecun.com/exdb/mnist/
+  groups: [image-classification]
+  description: |
+    The MNIST database of handwritten digits, available from this page,
+    has a training set of 60,000 examples, and a test set of 10,000
+    examples. It is a subset of a larger set available from NIST. The
+    digits have been size-normalized and centered in a fixed-size image.
 ```
+This makes it possible to:
+1. Document the dataset
+2. Use the command line interface to manipulate it (download resources, etc.)

datamaestro-1.4.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,13 @@
+[tool.setuptools_scm]
+write_to = "src/datamaestro/version.py"
+fallback_version = "0.0.0-dev"
+[build-system]
+requires = ["setuptools", "setuptools-scm", "wheel"]  # PEP 508 specifications.
+[tool.flake8]
+doctests = "True"
+exclude = ".git, .eggs, __pycache__, tests/, docs/, build/, dist/, app/"
+max-line-length = "88"
+# See https://github.com/PyCQA/pycodestyle/issues/373
+extend-ignore = "E203"

{datamaestro-1.3.1 → datamaestro-1.4.0}/requirements.txt RENAMED Viewed

@@ -10,7 +10,7 @@ requests
 bitmath
 # Experimaestro for data definitions
-experimaestro>=1.5.0
+experimaestro>=1.6
 # Mkdocs
 mkdocs

{datamaestro-1.3.1 → datamaestro-1.4.0}/setup.cfg RENAMED Viewed

@@ -16,7 +16,6 @@ classifiers =
 	License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 	Operating System :: OS Independent
 	Programming Language :: Python
-	Programming Language :: Python :: 3.8
 	Programming Language :: Python :: 3.9
 	Programming Language :: Python :: 3.10
 	Programming Language :: Python :: 3.11

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/__init__.py RENAMED Viewed

@@ -7,8 +7,6 @@ from .context import (
     prepare_dataset,
 )
-from .datasets.yaml_repository import YAMLRepository
 from pkg_resources import get_distribution, DistributionNotFound
 from .definitions import dataset, metadata
 from .data import Base

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/__main__.py RENAMED Viewed

@@ -319,13 +319,17 @@ def search(config: Config, searchterms):
     logging.debug("Search: %s", condition)
     for dataset in config.context.datasets():
-        if condition.match(dataset):
-            cfg = dataset.configtype
-            print(
-                "[%s] %s (%s)"
-                % (
-                    dataset.repository.id,
-                    dataset.id,
-                    cfg.__name__ if cfg is not None else "?",
+        try:
+            if condition.match(dataset):
+                cfg = dataset.configtype
+                print(
+                    "[%s] %s (%s)"
+                    % (
+                        dataset.repository.id,
+                        dataset.id,
+                        cfg.__name__ if cfg is not None else "?",
+                    )
                 )
-            )
+        except Exception:
+            logging.error("Error while matching with dataset %s", dataset)
+            raise

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/context.py RENAMED Viewed

@@ -88,11 +88,6 @@ class Context:
         return ContextManager()
-    @property
-    def storepath(self):
-        """Replaces the data path"""
-        return self._path.joinpath("store")
     @property
     def datapath(self):
         return self._path.joinpath("data")

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/definitions.py RENAMED Viewed

@@ -7,7 +7,6 @@ import inspect
 from pathlib import Path
 from itertools import chain
 from abc import ABC, abstractmethod
-from contextlib import contextmanager
 import traceback
 from typing import (
     Dict,
@@ -19,7 +18,6 @@ from typing import (
     Callable,
     TYPE_CHECKING,
     Union,
-    ClassVar,
     _GenericAlias,
 )
 from experimaestro import (  # noqa: F401 (re-exports)
@@ -217,8 +215,8 @@ class AbstractDataset(AbstractData):
     def download(self, force=False):
         """Download all the necessary resources"""
         success = True
-        logging.info("Materializing %d resources", len(self.ordered_resources))
         self.prepare()
+        logging.info("Materializing %d resources", len(self.ordered_resources))
         for resource in self.ordered_resources:
             try:
                 resource.download(force)
@@ -274,9 +272,6 @@ class DatasetWrapper(AbstractDataset):
     annotations (otherwise, derive from `AbstractDataset`).
     """
-    BUILDING: ClassVar[list["DatasetWrapper"]] = []
-    """Currently built dataset"""
     def __init__(self, annotation, t: type):
         self.config = None
         self.repository: Optional[Repository] = None
@@ -287,6 +282,11 @@ class DatasetWrapper(AbstractDataset):
         repository, components = DataDefinition.repository_relpath(t)
         super().__init__(repository)
+        self.module_name = None
+        if repository is None:
+            # Try to find the module name
+            self.module_name, _ = t.__module__.split(".", 1)
         # Set some variables
         self.url = annotation.url
         self.doi = annotation.doi
@@ -361,12 +361,6 @@ class DatasetWrapper(AbstractDataset):
             self._prepare()
         return super().download(force=force)
-    @contextmanager
-    def building(self):
-        DatasetWrapper.BUILDING.append(self)
-        yield self
-        DatasetWrapper.BUILDING.pop()
     def _prepare(self) -> "Base":
         if self.config is not None:
             return self.config
@@ -378,8 +372,7 @@ class DatasetWrapper(AbstractDataset):
         # Construct the object
         resources = {key: value.prepare() for key, value in self.resources.items()}
-        with self.building():
-            result = self.t(**resources)
+        result = self.t(**resources)
         # Download resources
         logging.debug("Building with data type %s and dataset %s", self.base, self.t)
@@ -425,18 +418,11 @@ class DatasetWrapper(AbstractDataset):
     @property
     def datapath(self):
         """Returns the destination path for downloads"""
-        from datamaestro import Context  # noqa: F811
-        path = Context.instance().storepath / self._path
-        if (self.repository is not None) and (not path.exists()):
-            old_path: Path = self.repository.datapath / self._path
-            if old_path.exists():
-                logging.info(
-                    "Moving from old path [%s] to new path [%s]", old_path, path
-                )
-                path.parent.mkdir(exist_ok=True, parents=True)
-                old_path.rename(path)
+        if self.repository is not None:
+            return self.repository.datapath / self._path
+        # No repository, use __custom__/[MODULE NAME]
+        path = self.context.datapath / "__custom__" / self.module_name / self._path
         return path

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/__init__.py RENAMED Viewed

@@ -31,7 +31,7 @@ class Resource(DatasetAnnotation, ABC):
         self.varname = varname
         # Ensures that the object is initialized
         self._post = False
-        self.definition = None
+        self.definition: AbstractDataset = None
     def annotate(self, dataset: AbstractDataset):
         assert self.definition is None
@@ -45,9 +45,9 @@ class Resource(DatasetAnnotation, ABC):
     def contextualize(self):
         """When using an annotation inline, uses the current dataset wrapper object"""
-        from datamaestro.definitions import DatasetWrapper
+        from datamaestro.definitions import AbstractDataset
-        wrapper = DatasetWrapper.BUILDING[-1]
+        wrapper = AbstractDataset.processing()
         self.annotate(wrapper)
     @property

datamaestro-1.4.0/src/datamaestro/download/custom.py ADDED Viewed

@@ -0,0 +1,21 @@
+from typing import Protocol
+from pathlib import Path
+from datamaestro import Context
+from datamaestro.download import Resource
+class Downloader(Protocol):
+    def __call__(self, context: Context, root: Path, *, force=False):
+        pass
+class custom_download(Resource):
+    def __init__(self, varname: str, downloader: Downloader):
+        super().__init__(varname)
+        self.downloader = downloader
+    def prepare(self):
+        return self.definition.datapath
+    def download(self, force=False):
+        self.downloader(self.context, self.definition.datapath, force=force)

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/huggingface.py RENAMED Viewed

@@ -5,7 +5,7 @@ from datamaestro.download import Download
 class hf_download(Download):
-    """Use Hugging Face to donwload a file"""
+    """Use Hugging Face to download a file"""
     def __init__(
         self,

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/download/single.py RENAMED Viewed

@@ -9,7 +9,7 @@ import os
 import urllib3
 from pathlib import Path
 import re
-from datamaestro.utils import copyfileobjs, FileChecker
+from datamaestro.utils import copyfileobjs
 from datamaestro.stream import Transform
 from datamaestro.download import Download
@@ -35,7 +35,7 @@ class SingleDownload(Download):
         return self.path
     def download(self, force=False):
-        if not self.path.is_file():
+        if not self.path.is_file() and not force:
             self._download(self.path)
@@ -96,20 +96,6 @@ class filedownloader(SingleDownload):
         logging.info("Created file %s" % destination)
-def file_from_url(
-    filename: str,
-    url: str,
-    *,
-    size: Optional[int] = None,
-    transforms: Optional[Transform] = None,
-    checker: Optional[FileChecker] = None,
-) -> Path:
-    """Defines a file that should be downloaded from"""
-    downloader = filedownloader(filename, url, size, transforms, checker)
-    downloader.contextualize()
-    return downloader.path
 class concatdownload(SingleDownload):
     """Concatenate all files in an archive"""

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro/version.py RENAMED Viewed

@@ -17,5 +17,5 @@ __version__: str
 __version_tuple__: VERSION_TUPLE
 version_tuple: VERSION_TUPLE
-__version__ = version = '1.3.1'
-__version_tuple__ = version_tuple = (1, 3, 1)
+__version__ = version = '1.4.0'
+__version_tuple__ = version_tuple = (1, 4, 0)

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: datamaestro
-Version: 1.3.1
+Version: 1.4.0
 Summary: "Dataset management command line and API"
 Home-page: https://github.com/experimaestro/datamaestro
 Author: Benjamin Piwowarski
@@ -13,7 +13,6 @@ Classifier: Intended Audience :: Science/Research
 Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -28,7 +27,7 @@ Requires-Dist: marshmallow
 Requires-Dist: cached_property
 Requires-Dist: requests
 Requires-Dist: bitmath
-Requires-Dist: experimaestro>=1.5.0
+Requires-Dist: experimaestro>=1.6
 Requires-Dist: mkdocs
 Requires-Dist: pymdown-extensions
 Requires-Dist: mkdocs-material
@@ -98,22 +97,10 @@ $ datamaestro search tag:image
 [image] com.lecun.mnist
 $ datamaestro prepare com.lecun.mnist
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/t10k-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz: 32.8kB [00:00, 92.1kB/s]                                                            INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-labels-idx1-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz into /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-INFO:root:Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
-Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz: 9.92MB [00:00, 10.6MB/s]
-INFO:root:Transforming file
-INFO:root:Created file /home/bpiwowar/datamaestro/data/image/com/lecun/mnist/train-images-idx3-ubyte
-...JSON...
+INFO:root:Materializing 4 resources
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/train_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz into .../datamaestro/store/com/lecun/test_images.idx
+INFO:root:Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz into .../datamaestro/store/com/lecun/test_labels.idx
 ```
 The previous command also returns a JSON on standard output
@@ -159,13 +146,12 @@ and is integrated with [experimaestro](http://experimaestro.github.io/experimaes
 Its syntax is described in the [documentation](https://datamaestro.readthedocs.io).
-For MNIST, this corresponds to.
+For instance, the MNIST dataset can be described by the following definition:
 ```python
-from datamaestro_image.data import ImageClassification, LabelledImages, Base, IDXImage
-from datamaestro.download.single import filedownloader
-from datamaestro.definitions import  argument, datatasks, datatags, dataset
-from datamaestro.data.tensor import IDX
+from datamaestro import dataset
+from datamaestro.download.single import download_file
+from datamaestro_image.data import ImageClassification, LabelledImages, IDXImage
 @filedownloader("train_images.idx", "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
@@ -176,26 +162,37 @@ from datamaestro.data.tensor import IDX
   ImageClassification,
   url="http://yann.lecun.com/exdb/mnist/",
 )
-def MNIST(train_images, train_labels, test_images, test_labels):
-  """The MNIST database
-  The MNIST database of handwritten digits, available from this page, has a
-  training set of 60,000 examples, and a test set of 10,000 examples. It is a
-  subset of a larger set available from NIST. The digits have been
-  size-normalized and centered in a fixed-size image.
-  """
-  return {
-    "train": LabelledImages(
-      images=IDXImage(path=train_images),
-      labels=IDX(path=train_labels)
-    ),
-    "test": LabelledImages(
-      images=IDXImage(path=test_images),
-      labels=IDX(path=test_labels)
-    ),
-  }
+    return ImageClassification(
+        train=LabelledImages(
+            images=IDXImage(path=train_images), labels=IDXImage(path=train_labels)
+        ),
+        test=LabelledImages(
+            images=IDXImage(path=test_images), labels=IDXImage(path=test_labels)
+        ),
+    )
+```
+When building dataset modules, some extra documentation can be provided:
+```yaml
+  ids: [com.lecun.mnist]
+  entry_point: "datamaestro_image.config.com.lecun:mnist"
+  title: The MNIST database
+  url: http://yann.lecun.com/exdb/mnist/
+  groups: [image-classification]
+  description: |
+    The MNIST database of handwritten digits, available from this page,
+    has a training set of 60,000 examples, and a test set of 10,000
+    examples. It is a subset of a larger set available from NIST. The
+    digits have been size-normalized and centered in a fixed-size image.
 ```
+This makes it possible to:
+1. Document the dataset
+2. Use the command line interface to manipulate it (download resources, etc.)
 # 0.8.0
 - Integration with other repositories: abstracting away the notion of dataset

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/SOURCES.txt RENAMED Viewed

@@ -58,8 +58,6 @@ src/datamaestro/data/csv.py
 src/datamaestro/data/huggingface.py
 src/datamaestro/data/ml.py
 src/datamaestro/data/tensor.py
-src/datamaestro/datasets/__init__.py
-src/datamaestro/datasets/yaml_repository.py
 src/datamaestro/download/__init__.py
 src/datamaestro/download/archive.py
 src/datamaestro/download/custom.py

{datamaestro-1.3.1 → datamaestro-1.4.0}/src/datamaestro.egg-info/requires.txt RENAMED Viewed

@@ -5,7 +5,7 @@ marshmallow
 cached_property
 requests
 bitmath
-experimaestro>=1.5.0
+experimaestro>=1.6
 mkdocs
 pymdown-extensions
 mkdocs-material

datamaestro-1.3.1/pyproject.toml DELETED Viewed

@@ -1,6 +0,0 @@
-[tool.setuptools_scm]
-write_to = "src/datamaestro/version.py"
-fallback_version = "0.0.0-dev"
-[build-system]
-requires = ["setuptools", "setuptools-scm", "wheel"]  # PEP 508 specifications.

datamaestro-1.3.1/src/datamaestro/datasets/__init__.py DELETED Viewed

File without changes

datamaestro-1.3.1/src/datamaestro/datasets/yaml_repository.py DELETED Viewed

@@ -1,103 +0,0 @@
-import re
-from typing import Iterator, Optional
-from functools import cached_property
-from attrs import field
-import importlib
-from omegaconf import OmegaConf
-from functools import partial
-from attrs import define
-from datamaestro import BaseRepository
-from datamaestro.definitions import AbstractDataset, DatasetWrapper
-from datamaestro.data import Base
-re_spec = re.compile(r"""^(\w\.)+:(\w+)""")
-@define
-class RepositoryDataset:
-    ids: list[str]
-    """ID(s) of this dataset"""
-    entry_point: str = field(validator=re_spec.match)
-    """The entry point"""
-    title: str
-    """The full name of the dataset"""
-    description: str
-    """Description of the dataset"""
-    url: Optional[str]
-    """The URL"""
-    groups: Optional[list[str]]
-    """Groups to which this repository belongs"""
-@define
-class RepositoryAuthors:
-    name: str
-    email: str
-@define
-class RepositoryGroup:
-    name: str
-    tasks: list[str]
-    tags: list[str]
-@define
-class RepositoryConfiguration:
-    namespace: str
-    authors: list[RepositoryAuthors]
-    description: str
-    groups: dict[str, RepositoryGroup]
-    datasets: list[RepositoryDataset]
-class YAMLDataset(AbstractDataset):
-    def __init__(self, repository: "YAMLRepository", information: RepositoryDataset):
-        super().__init__(repository)
-        self.information = information
-        self.id = self.information.ids[0]
-        self.aliases = set(self.information.ids)
-    @cached_property
-    def wrapper(self) -> DatasetWrapper:
-        module, func_name = self.information.entry_point.split(":")
-        wrapper = getattr(importlib.import_module(module), func_name)
-        return wrapper
-    def _prepare(self) -> "Base":
-        return self.wrapper()
-    def download(self, **kwargs):
-        return self.wrapper.download(**kwargs)
-class YAMLRepository(BaseRepository):
-    """YAML-based repository"""
-    @property
-    def id(self):
-        return self.configuration.namespace
-    @property
-    def name(self):
-        return self.configuration.namespace
-    @cached_property
-    def configuration(self):
-        schema = OmegaConf.structured(RepositoryConfiguration)
-        with importlib.resources.path(
-            self.__class__.__module__, "datamaestro.yaml"
-        ) as fp:
-            conf = OmegaConf.load(fp)
-        conf: RepositoryConfiguration = OmegaConf.merge(schema, conf)
-        return conf
-    def __iter__(self) -> Iterator["AbstractDataset"]:
-        return map(partial(YAMLDataset, self), self.configuration.datasets)

datamaestro-1.3.1/src/datamaestro/download/custom.py DELETED Viewed

@@ -1,29 +0,0 @@
-from typing import Protocol
-from pathlib import Path
-from datamaestro import Context
-from datamaestro.definitions import DatasetWrapper
-from datamaestro.download import Resource
-class Downloader(Protocol):
-    def __call__(self, context: Context, root: Path, *, force=False):
-        pass
-class CustomResource(Resource):
-    def __init__(self, ds_wrapper: DatasetWrapper, downloader: Downloader):
-        self.ds_wrapper = ds_wrapper
-        self.downloader = downloader
-    def prepare(self):
-        pass
-    def download(self, force=False):
-        self.downloader(self.context, self.ds_wrapper.datapath, force=force)
-def custom_download(downloader: Downloader) -> Path:
-    ds_wrapper = DatasetWrapper.BUILDING[-1]
-    ds_wrapper.ordered_resources.append(CustomResource(ds_wrapper, downloader))
-    return ds_wrapper.datapath