PyPI - datamaestro - Versions diffs - 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

datamaestro/__init__.py +11 -7
datamaestro/__main__.py +29 -8
datamaestro/annotations/__init__.py +1 -1
datamaestro/annotations/agreement.py +9 -3
datamaestro/commands/site.py +27 -15
datamaestro/context.py +143 -87
datamaestro/data/__init__.py +23 -11
datamaestro/data/csv.py +12 -12
datamaestro/data/huggingface.py +25 -0
datamaestro/data/ml.py +19 -10
datamaestro/data/tensor.py +32 -24
datamaestro/definitions.py +492 -131
datamaestro/download/__init__.py +610 -24
datamaestro/download/archive.py +129 -77
datamaestro/download/custom.py +53 -0
datamaestro/download/huggingface.py +77 -0
datamaestro/download/links.py +106 -50
datamaestro/download/multiple.py +27 -5
datamaestro/download/single.py +114 -51
datamaestro/download/sync.py +0 -1
datamaestro/download/todo.py +9 -4
datamaestro/download/wayback.py +164 -0
datamaestro/record.py +232 -0
datamaestro/registry.py +1 -0
datamaestro/search.py +1 -1
datamaestro/settings.py +3 -1
datamaestro/sphinx.py +224 -0
datamaestro/stream/__init__.py +0 -2
datamaestro/stream/lines.py +10 -7
datamaestro/templates/dataset.py +5 -4
datamaestro/test/__init__.py +3 -1
datamaestro/test/checks.py +1 -5
datamaestro/test/conftest.py +1 -6
datamaestro/test/test_annotations.py +2 -2
datamaestro/test/test_download_handlers.py +3 -4
datamaestro/test/test_record.py +72 -0
datamaestro/test/test_resource.py +1388 -0
datamaestro/utils.py +15 -9
datamaestro/v2.md +301 -0
datamaestro/version.py +4 -0
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
datamaestro-1.7.0.dist-info/RECORD +49 -0
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
datamaestro/__pycache__/context.cpython-38.pyc +0 -0
datamaestro/__pycache__/context.cpython-39.pyc +0 -0
datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
datamaestro/__pycache__/search.cpython-38.pyc +0 -0
datamaestro/__pycache__/search.cpython-39.pyc +0 -0
datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
datamaestro-0.8.1.dist-info/RECORD +0 -109
datamaestro-0.8.1.dist-info/top_level.txt +0 -1
{datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0

datamaestro/__init__.py CHANGED Viewed

@@ -1,8 +1,12 @@
-from .context import Context, Repository, get_dataset, prepare_dataset
+# flake8: noqa: F401 (re-exports)
+from .context import (
+    Context,
+    Repository,
+    BaseRepository,
+    get_dataset,
+    prepare_dataset,
+)
-from pkg_resources import get_distribution, DistributionNotFound
-try:
-    __version__ = get_distribution(__name__).version
-except DistributionNotFound:
-    __version__ = None
+from .definitions import dataset, metadata
+from .data import Base
+from .version import __version__

datamaestro/__main__.py CHANGED Viewed

@@ -1,19 +1,22 @@
 #!/usr/bin/env python3
+# ruff: noqa: T201
+from importlib.metadata import entry_points
 import sys
 import logging
 from functools import update_wrapper
 import traceback as tb
-import pkg_resources
 import re
 from pathlib import Path
 import shutil
-from .context import Context
 from typing import Set
-import datamaestro
+from urllib.parse import urlparse
 import click
+import datamaestro
+from .context import Context
 logging.basicConfig(level=logging.INFO)
@@ -37,7 +40,7 @@ def pass_cfg(f):
 # Get all the available repositories
 REPOSITORIES = {}
-for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+for entry_point in entry_points(group="datamaestro.repositories"):
     REPOSITORIES[entry_point.name] = entry_point
@@ -59,7 +62,10 @@ for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
     "--traceback", is_flag=True, help="Display traceback if an exception occurs"
 )
 @click.option(
-    "--data", type=Path, help="Directory containing datasets", default=Context.MAINDIR
+    "--data",
+    type=Path,
+    help="Directory containing datasets",
+    default=Context.MAINDIR,
 )
 @click.pass_context
 def cli(ctx, quiet, debug, traceback, data, keep_downloads, host, pythonpath):
@@ -90,6 +96,8 @@ def main():
 @click.argument("dataset", type=str)
 @pass_cfg
 def info(config: Config, dataset):
+    from datamaestro.definitions import AbstractDataset
     dataset = AbstractDataset.find(dataset)
     print(dataset.name)
     if dataset.url:
@@ -204,7 +212,6 @@ def datafolder_set(config: Config, key: str, path: Path):
 # --- Create a dataset
 DATASET_REGEX = re.compile(r"^\w[\w\.-]+\w$")
-from urllib.parse import urlparse
 def dataset_id_check(ctx, param, value):
@@ -254,6 +261,8 @@ def create_dataset(config: Config, repository_id: str, dataset_id: str):
 @pass_cfg
 def download(config: Config, dataset):
     """Download a dataset"""
+    from datamaestro.definitions import AbstractDataset
     dataset = AbstractDataset.find(dataset)
     success = dataset.download()
     if not success:
@@ -314,5 +323,17 @@ def search(config: Config, searchterms):
     logging.debug("Search: %s", condition)
     for dataset in config.context.datasets():
-        if condition.match(dataset):
-            print("[%s] %s" % (dataset.repository.id, dataset.id))
+        try:
+            if condition.match(dataset):
+                cfg = dataset.configtype
+                print(
+                    "[%s] %s (%s)"
+                    % (
+                        dataset.repository.id,
+                        dataset.id,
+                        cfg.__name__ if cfg is not None else "?",
+                    )
+                )
+        except Exception:
+            logging.error("Error while matching with dataset %s", dataset)
+            raise

datamaestro/annotations/__init__.py CHANGED Viewed

	@@ -1 +1 @@
1	- """Generic annotations for datasets"""
1	+ """Generic annotations for datasets"""

datamaestro/annotations/agreement.py CHANGED Viewed

@@ -1,9 +1,15 @@
-import logging
-from datamaestro.definitions import DatasetAnnotation, AbstractDataset, hook
+from typing import Optional
+from datamaestro.definitions import AbstractDataset, hook
 @hook("pre-use")
-def useragreement(definition: AbstractDataset, message, id=None):
+def useragreement(definition: AbstractDataset, message: str, id: Optional[str] = None):
+    """Asks for a user-agreement
+    :param definition: The dataset for which the agreement is asked
+    :param message: The agreement text
+    :param id: The ID of the agreement (default to the dataset ID)
+    """
     # Skip agreement when testing
     if definition.context.running_test:
         return

datamaestro/commands/site.py CHANGED Viewed

@@ -18,6 +18,7 @@ from mkdocs.structure.pages import Page as MkdocPage
 from docstring_parser import parse as docstring_parse
 import experimaestro
+import experimaestro.mkdocs.base
 from experimaestro.core.types import ObjectType
 from ..context import Context, Repository, Datasets
@@ -97,7 +98,7 @@ def document_data(datatype: ObjectType):
                 if doc.long_description:
                     s += doc.long_description + "\n"
                 s += method_documentation(doc, method.__annotations__)
-            except Exception as e:
+            except Exception:
                 logging.error(
                     "Error while parsing documetnation of %s (%s)",
                     method,
@@ -108,8 +109,6 @@ def document_data(datatype: ObjectType):
 def document_object(object):
-    from datamaestro.data import Base
     try:
         name = object.__name__
         # Get the documentation
@@ -141,7 +140,7 @@ def document_object(object):
         return s
-    except Exception as e:
+    except Exception:
         logging.exception(
             "Exception while generating the documentation for %s" % object.__name__
         )
@@ -159,8 +158,11 @@ def document(match):
     module = importlib.import_module(modulename)
     try:
         object = getattr(module, name)
-    except:
-        return "<div class='error'>Cannot find %s in %s</div>" % (name, modulename)
+    except Exception:
+        return "<div class='error'>Cannot find %s in %s</div>" % (
+            name,
+            modulename,
+        )
     if ismodule(object):
         return "\n\n".join(
@@ -182,7 +184,7 @@ class Classification:
     def add(self, name, value):
         key = name.lower()
-        if not key in self.map:
+        if key not in self.map:
             self.map[key] = ClassificationItem(name)
         self.map[key].values.append(value)
@@ -201,7 +203,6 @@ class Classification:
             )
     def match(self, path):
         if path == "datamaestro/%s.md" % self.id:
             r = io.StringIO()
             r.write("# List of %s\n\n" % self.name)
@@ -222,7 +223,12 @@ class Classification:
                 module = Datasets(importlib.import_module(meta.t.__module__))
                 r.write(
                     "- [%s](../df/%s/%s.html#%s)\n"
-                    % (meta.name or meta.id, meta.repository.id, module.id, meta.id)
+                    % (
+                        meta.name or meta.id,
+                        meta.repository.id,
+                        module.id,
+                        meta.id,
+                    )
                 )
             return r.getvalue()
@@ -275,7 +281,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
     def parse_nav(self, nav):
         for entry in nav:
             assert len(entry) == 1
-            key, value = *entry.keys(), *entry.values()
+            _, value = *entry.keys(), *entry.values()
             if isinstance(value, list):
                 for value in self.parse_nav(value):
                     yield value
@@ -328,9 +334,12 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
         import shutil
         path = Path(config["site_dir"]) / "mainstyle.css"
-        with importlib.resources.open_binary(
-            "datamaestro.commands", "mainstyle.css"
-        ) as source, path.open("wb") as dest:
+        with (
+            importlib.resources.open_binary(
+                "datamaestro.commands", "mainstyle.css"
+            ) as source,
+            path.open("wb") as dest,
+        ):
             shutil.copyfileobj(source, dest)
     def on_files(self, files, config):
@@ -382,7 +391,7 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
             builder()
         logging.info("Watching %s", path)
-        server.watch(path, rebuild)
+        # server.watch(path, rebuild)
     def on_page_markdown(self, markdown, page, config, **kwargs):
         if page.url.startswith("api/"):
@@ -420,7 +429,10 @@ class DatasetGenerator(mkdocs.plugins.BasePlugin):
         r.write("## List of datasets\n\n")
         for ds in df:
             r.write(
-                """<div class="dataset-entry"><div class='dataset-id'>%s<a name="%s"></a></div>\n\n"""
+                (
+                    """<div class="dataset-entry"><div class='dataset-id'>"""
+                    """%s<a name="%s"></a></div>\n\n"""
+                )
                 % (ds.id, ds.id)
             )
             if ds.name:

datamaestro/context.py CHANGED Viewed

@@ -1,21 +1,37 @@
 from pathlib import Path
-from cached_property import cached_property
+from typing import Iterable, Iterator, Dict, Optional, Union
 import importlib
 import os
 import hashlib
 import logging
 import inspect
 import json
+from abc import ABC, abstractmethod
+from experimaestro import Config
+from functools import cached_property
 from experimaestro.mkdocs.metaloader import Module
-import pkg_resources
-from typing import Iterable, Iterator, List, Dict
 from .utils import CachedFile, downloadURL
 from .settings import UserSettings, Settings
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
-    from datamaestro.definitions import AbstractDataset
+    from datamaestro.definitions import AbstractDataset, DatasetWrapper
+from importlib.metadata import (
+    entry_points as _entry_points,
+    version as _version,
+    PackageNotFoundError as _PackageNotFoundError,
+)
+def iter_entry_points(group, name=None):
+    """Yield entry points for a given group (and optional name) using importlib.metadata."""
+    eps = _entry_points()
+    selected = eps.select(group=group)
+    if name:
+        selected = [ep for ep in selected if ep.name == name]
+    for ep in selected:
+        yield ep
 class Compression:
@@ -98,31 +114,31 @@ class Context:
     @cached_property
     def repositorymap(self) -> Dict[str, "Repository"]:
         return {
-            repository.basemodule(): repository for repository in self.repositories()
+            repository.basemodule(): repository
+            for repository in self.repositories()
+            if repository.basemodule() is not None
         }
     def repositories(self) -> Iterable["Repository"]:
         """Returns an iterator over repositories"""
-        for entry_point in pkg_resources.iter_entry_points("datamaestro.repositories"):
+        for entry_point in iter_entry_points("datamaestro.repositories"):
             yield entry_point.load().instance()
     def repository(self, repositoryid):
         if repositoryid is None:
             return None
-        l = [
-            x
-            for x in pkg_resources.iter_entry_points(
-                "datamaestro.repositories", repositoryid
-            )
+        entry_points = [
+            x for x in iter_entry_points("datamaestro.repositories", repositoryid)
         ]
-        if not l:
+        if not entry_points:
             raise Exception("No datasets repository named %s", repositoryid)
-        if len(l) > 1:
+        if len(entry_points) > 1:
             raise Exception(
-                "Too many datasets repository named %s (%d)" % (repositoryid, len(l))
+                "Too many datasets repository named %s (%d)"
+                % (repositoryid, len(entry_points))
             )
-        return l[0].load()(self)
+        return entry_points[0].load()(self)
     @property
     def running_test(self):
@@ -175,7 +191,6 @@ class Context:
         if dlpath.is_file():
             logging.debug("Using cached file %s for %s", dlpath, url)
         else:
             logging.info("Downloading %s", url)
             tmppath = dlpath.with_suffix(".tmp")
@@ -188,7 +203,7 @@ class Context:
     def ask(self, question: str, options: Dict[str, str]):
         """Ask a question to the user"""
-        print(question)
+        print(question)  # noqa: T201
         answer = None
         while answer not in options:
             answer = input().strip().lower()
@@ -228,17 +243,47 @@ class Datasets(Iterable["AbstractDataset"]):
     def __init__(self, module: Module):
         """Initialize with a module"""
         self.module = module
+        self._title = None
+        self._description = None
     @property
     def id(self):
         return ".".join(self.module.__name__.split(".", 2)[2:])
+    @property
+    def title(self):
+        self._getdoc()
+        return self._title
     @property
     def description(self):
-        return self.module.__doc__ or ""
+        self._getdoc()
+        return self._description
+    def _getdoc(self):
+        if self._title is not None:
+            return
+        if not self.module.__doc__:
+            self._title = ""
+            self._description = ""
+            return
+        intitle = True
+        title = []
+        description = []
+        for line in self.module.__doc__.split("\n"):
+            if line.strip() == "" and intitle:
+                intitle = False
+            else:
+                (title if intitle else description).append(line)
+        self._title = " ".join(title)
+        self._description = "\n".join(description)
     def __iter__(self) -> Iterable["AbstractDataset"]:
         from .definitions import DatasetWrapper
+        from datamaestro.data import Base
         # Iterates over defined symbols
         for key, value in self.module.__dict__.items():
@@ -247,10 +292,60 @@ class Datasets(Iterable["AbstractDataset"]):
                 # Ensure it comes from the module
                 if self.module.__name__ == value.t.__module__:
                     yield value
+            elif (
+                inspect.isclass(value)
+                and issubclass(value, Base)
+                and hasattr(value, "__dataset__")
+            ):
+                if self.module.__name__ == value.__module__:
+                    yield value.__dataset__
+class BaseRepository(ABC):
+    """A repository groups a set of datasets and their corresponding specific
+    handlers (downloading, filtering, etc.)"""
+    def __init__(self, context: Context):
+        self.context = context
+        p = inspect.getabsfile(self.__class__)
+        self.basedir = Path(p).parent
+    @abstractmethod
+    def __iter__(self) -> Iterator["AbstractDataset"]: ...
+    def search(self, name: str):
+        """Search for a dataset in the definitions"""
+        for dataset in self:
+            if name in dataset.aliases:
+                return dataset
+    @classmethod
+    def instance(cls, context=None):
+        try:
+            return cls.__getattribute__(cls, "INSTANCE")
+        except AttributeError:
+            return cls(context if context else Context.instance())
+    @classmethod
+    def basemodule(cls):
+        return cls.__module__
+    @property
+    def generatedpath(self):
+        return self.basedir / "generated"
+    @property
+    def datapath(self):
+        return self.context.datapath.joinpath(self.id)
+    @property
+    def extrapath(self):
+        """Path to the directory containing extra configuration files"""
+        return self.basedir / "data"
-class Repository:
-    """A repository regroup a set of datasets and their corresponding specific handlers (downloading, filtering, etc.)"""
+class Repository(BaseRepository):
+    """(deprecated) Repository where datasets are located in __module__.config"""
     def __init__(self, context: Context):
         """Initialize a new repository
@@ -259,34 +354,20 @@ class Repository:
         :param basedir: The base directory of the repository
             (by default, the same as the repository class)
         """
+        super().__init__(context)
         self.context = context
-        p = inspect.getabsfile(self.__class__)
-        self.basedir = Path(p).parent
         self.configdir = self.basedir.joinpath("config")
         self.id = self.__class__.NAMESPACE
         self.name = self.id
         self.module = self.__class__.__module__
         self.__class__.INSTANCE = self
-    @classmethod
-    def basemodule(cls):
-        return cls.__module__
-    @classmethod
-    def instance(cls, context=None):
-        try:
-            return cls.__getattribute__(cls, "INSTANCE")
-        except AttributeError:
-            return cls(context if context else Context.instance())
     @classmethod
     def version(cls):
-        from pkg_resources import get_distribution, DistributionNotFound
         try:
-            return get_distribution(cls.__module__).version
-        except DistributionNotFound:
-            __version__ = None
+            return _version(cls.__module__)
+        except _PackageNotFoundError:
+            return None
     def __repr__(self):
         return "Repository(%s)" % self.basedir
@@ -298,40 +379,15 @@ class Repository:
         assert isinstance(other, Repository)
         return self.basedir == other.basedir
-    def search(self, name: str):
-        """Search for a dataset in the definitions
-        """
-        logging.debug("Searching for %s in %s", name, self.configdir)
-        candidates: List[str] = []
-        components = name.split(".")
-        N = len(components)
-        sub = None
-        prefix = None
-        path = self.configdir
-        for i, c in enumerate(components):
-            path = path / c
-            if (path / "__init__.py").is_file():
-                candidates.append(".".join(components[: i + 1]))
-            if path.with_suffix(".py").is_file():
-                candidates.append(".".join(components[: i + 1]))
-            if not path.is_dir():
-                break
-        # Get the dataset
-        for candidate in candidates[::-1]:
-            logging.debug("Searching in module %s.config.%s", self.module, candidate)
+    def datasets(self, candidate: str):
+        """Returns the dataset candidates from a module"""
+        try:
             module = importlib.import_module("%s.config.%s" % (self.module, candidate))
-            for value in Datasets(module):
-                if name in value.aliases:
-                    return value
-        return None
+        except ModuleNotFoundError:
+            return None
+        return Datasets(module)
-    def modules(self) -> "Module":
+    def modules(self) -> Iterator["Module"]:
         """Iterates over all modules in this repository"""
         for _, fid, package in self._modules():
             try:
@@ -368,19 +424,6 @@ class Repository:
             for dataset in datasets:
                 yield dataset
-    @property
-    def generatedpath(self):
-        return self.basedir.joinpath("generated")
-    @property
-    def datapath(self):
-        return self.context.datapath.joinpath(self.id)
-    @property
-    def extrapath(self):
-        """Path to the directory containing extra configuration files"""
-        return self.basedir.joinpath("data")
 def find_dataset(dataset_id: str):
     """Find a dataset given its id"""
@@ -389,11 +432,24 @@ def find_dataset(dataset_id: str):
     return AbstractDataset.find(dataset_id)
-def prepare_dataset(dataset_id: str):
+def prepare_dataset(
+    dataset_id: Union[str, "DatasetWrapper", Config],
+    context: Optional[Union[Context, Path]] = None,
+):
     """Find a dataset given its id and download the resources"""
-    from .definitions import AbstractDataset
+    from .definitions import AbstractDataset, DatasetWrapper
+    match context:
+        case Path() | str():
+            context = Context(Path(context))
+    if isinstance(dataset_id, DatasetWrapper):
+        ds = dataset_id
+    elif isinstance(dataset_id, Config):
+        ds = dataset_id.__datamaestro_dataset__
+    else:
+        ds = AbstractDataset.find(dataset_id, context=context)
-    ds = AbstractDataset.find(dataset_id)
     return ds.prepare(download=True)

datamaestro/data/__init__.py CHANGED Viewed

@@ -1,25 +1,35 @@
 import logging
 from pathlib import Path
-from datamaestro.definitions import AbstractDataset, argument, Param
-from experimaestro import Config
-from experimaestro import documentation  # noqa: F401
+from typing import Any, Dict
+from experimaestro import Config, Param, Meta
+from datamaestro.definitions import AbstractDataset
 class Base(Config):
-    """Base object for all data types
+    """Base object for all data types"""
-    attributes:
+    id: Param[str]
+    """The unique (sub-)dataset ID"""
-    id: The unique dataset ID
-    """
+    __datamaestro_dataset__: "AbstractDataset"
-    id: Param[str]
-    __datamaestro_dataset__: AbstractDataset
+    def dataset_information(self) -> Dict[str, Any]:
+        """Returns document meta-informations"""
+        return {
+            "id": self.id,
+            "name": self.__datamaestro_dataset__.name,
+            "description": self.__datamaestro_dataset__.description,
+        }
     def download(self):
         """Download the dataset"""
         self.__datamaestro_dataset__.download()
+    def prepare(self, *args, **kwargs):
+        """Prepare the dataset"""
+        self.__datamaestro_dataset__.prepare()
+        return self
 class Generic(Base):
     """Generic dataset
@@ -38,15 +48,17 @@ class Generic(Base):
 class File(Base):
     """A data file"""
-    path: Param[Path]
+    path: Meta[Path]
+    """The path of the file"""
     def open(self, mode):
         return self.path.open(mode)
-@argument("path", type=Path)
 class Folder(Base):
     """A data folder"""
+    path: Meta[Path]
     def open(self, mode):
         return self.path.open(mode)

datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl