datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. datamaestro/__init__.py +11 -7
  2. datamaestro/__main__.py +29 -8
  3. datamaestro/annotations/__init__.py +1 -1
  4. datamaestro/annotations/agreement.py +9 -3
  5. datamaestro/commands/site.py +27 -15
  6. datamaestro/context.py +143 -87
  7. datamaestro/data/__init__.py +23 -11
  8. datamaestro/data/csv.py +12 -12
  9. datamaestro/data/huggingface.py +25 -0
  10. datamaestro/data/ml.py +19 -10
  11. datamaestro/data/tensor.py +32 -24
  12. datamaestro/definitions.py +492 -131
  13. datamaestro/download/__init__.py +610 -24
  14. datamaestro/download/archive.py +129 -77
  15. datamaestro/download/custom.py +53 -0
  16. datamaestro/download/huggingface.py +77 -0
  17. datamaestro/download/links.py +106 -50
  18. datamaestro/download/multiple.py +27 -5
  19. datamaestro/download/single.py +114 -51
  20. datamaestro/download/sync.py +0 -1
  21. datamaestro/download/todo.py +9 -4
  22. datamaestro/download/wayback.py +164 -0
  23. datamaestro/record.py +232 -0
  24. datamaestro/registry.py +1 -0
  25. datamaestro/search.py +1 -1
  26. datamaestro/settings.py +3 -1
  27. datamaestro/sphinx.py +224 -0
  28. datamaestro/stream/__init__.py +0 -2
  29. datamaestro/stream/lines.py +10 -7
  30. datamaestro/templates/dataset.py +5 -4
  31. datamaestro/test/__init__.py +3 -1
  32. datamaestro/test/checks.py +1 -5
  33. datamaestro/test/conftest.py +1 -6
  34. datamaestro/test/test_annotations.py +2 -2
  35. datamaestro/test/test_download_handlers.py +3 -4
  36. datamaestro/test/test_record.py +72 -0
  37. datamaestro/test/test_resource.py +1388 -0
  38. datamaestro/utils.py +15 -9
  39. datamaestro/v2.md +301 -0
  40. datamaestro/version.py +4 -0
  41. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
  42. datamaestro-1.7.0.dist-info/RECORD +49 -0
  43. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
  44. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
  45. datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
  46. datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
  47. datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
  48. datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
  49. datamaestro/__pycache__/context.cpython-38.pyc +0 -0
  50. datamaestro/__pycache__/context.cpython-39.pyc +0 -0
  51. datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
  52. datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
  53. datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
  54. datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
  55. datamaestro/__pycache__/search.cpython-38.pyc +0 -0
  56. datamaestro/__pycache__/search.cpython-39.pyc +0 -0
  57. datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
  58. datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
  59. datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
  60. datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
  61. datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
  62. datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
  63. datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
  64. datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
  65. datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
  66. datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
  67. datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
  68. datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
  69. datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
  70. datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
  71. datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
  72. datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
  73. datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
  74. datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
  75. datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
  76. datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
  77. datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
  78. datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
  79. datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
  80. datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
  81. datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
  82. datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
  83. datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
  84. datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
  85. datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
  86. datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
  87. datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
  88. datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
  89. datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
  90. datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
  91. datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
  92. datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
  93. datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
  94. datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
  95. datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
  96. datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
  97. datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
  98. datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
  99. datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
  100. datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
  101. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
  102. datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
  103. datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
  104. datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
  105. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
  106. datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
  107. datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
  108. datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
  109. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
  110. datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
  111. datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
  112. datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
  113. datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
  114. datamaestro-0.8.1.dist-info/RECORD +0 -109
  115. datamaestro-0.8.1.dist-info/top_level.txt +0 -1
  116. {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/record.py ADDED
@@ -0,0 +1,232 @@
1
+ """Record module for type-safe heterogeneous containers.
2
+
3
+ .. deprecated:: 2.0
4
+ This module will be removed in v2. Use :class:`typing.TypedDict` instead
5
+ for type-safe heterogeneous data structures. TypedDict provides better IDE
6
+ support, type checking, and is part of the standard library.
7
+
8
+ When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
9
+ to avoid typos and enable IDE autocomplete. Prefix keys with package name
10
+ using underscore ``_`` as delimiter to avoid conflicts between different
11
+ data sources.
12
+
13
+ Example migration::
14
+
15
+ # Old way (deprecated)
16
+ @define
17
+ class MyItem(Item):
18
+ value: int
19
+
20
+ record = Record(MyItem(42))
21
+ print(record[MyItem].value)
22
+
23
+ # New way (recommended)
24
+ from typing import TypedDict
25
+
26
+ # Define key constants in classes
27
+ class MyItem:
28
+ ID = "mypackage_value"
29
+
30
+ class MyRecord(TypedDict):
31
+ mypackage_value: int
32
+
33
+ data: MyRecord = {MyItem.ID: 42}
34
+ print(data[MyItem.ID])
35
+ """
36
+
37
+ import warnings
38
+ from typing import Type, TypeVar, Dict, Union, Optional
39
+
40
+ # Emit deprecation warning when module is imported
41
+ warnings.warn(
42
+ "The datamaestro.record module is deprecated and will be removed in v2. "
43
+ "Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
44
+ "prefixed with package name).",
45
+ DeprecationWarning,
46
+ stacklevel=2,
47
+ )
48
+
49
+
50
class Item:
    """Base class for all item types stored in a :class:`Record`"""

    @classmethod
    def __get_base__(cls: Type) -> Type:
        """Get the most generic superclass for this type of item.

        The base is the most generic ancestor below :class:`Item` in the
        MRO. The result is cached per class.

        Fix: the cache is looked up in ``cls.__dict__`` rather than with
        ``getattr`` — attribute lookup inherits ``__base__cache__`` from
        superclasses, so a value cached on ``Item`` itself (``Item``)
        would previously have been returned for every subclass.
        """
        base = cls.__dict__.get("__base__cache__")
        if base is not None:
            return base

        # Walk the MRO (most derived first); the last matching class is
        # the most generic strict subclass of Item
        base = cls
        for supercls in cls.__mro__:
            if issubclass(supercls, Item) and supercls is not Item:
                base = supercls
        setattr(cls, "__base__cache__", base)
        return base
65
+
66
+
67
+ T = TypeVar("T", bound=Item)
68
+ Items = Dict[Type[T], T]
69
+
70
+
71
class RecordType:
    """Describes the set of item types composing a :class:`Record`.

    Each item type is indexed by its base type (see
    ``Item.__get_base__``), so a record type holds at most one item type
    per base.
    """

    def __init__(self, *item_types: "Type[T]"):
        #: The set of item types for this record type
        self.item_types = frozenset(item_types)
        #: Maps each base type to its registered (most specific) item type
        self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}

    def __repr__(self):
        names = ",".join(item_type.__name__ for item_type in self.item_types)
        return f"Record({names})"

    def contains(self, other: "RecordType"):
        """Checks that each item type in other has an item type of a compatible
        type in self (and that both have the same number of item types)"""
        if len(self.item_types) != len(other.item_types):
            return False

        for item_type in other.item_types:
            if matching_type := self.mapping.get(item_type.__get_base__(), None):
                if not issubclass(matching_type, item_type):
                    return False
            else:
                # No item type registered for this base
                return False

        return True

    def sub(self, *item_types: "Type[T]"):
        """Returns a new record type based on self and new item types.

        A new item type sharing a base with an existing one replaces it;
        otherwise it is appended.
        """
        cls_itemtypes = [x for x in self.item_types]
        mapping = {
            itemtype.__get_base__(): ix for ix, itemtype in enumerate(cls_itemtypes)
        }

        for itemtype in item_types:
            if (ix := mapping.get(itemtype.__get_base__(), -1)) >= 0:
                cls_itemtypes[ix] = itemtype
            else:
                cls_itemtypes.append(itemtype)

        return record_type(*cls_itemtypes)

    def __call__(self, *items: "T"):
        """Builds a record from the given items and validates it"""
        record = Record(*items)
        self.validate(record)
        return record

    def has(self, itemtype: "Type[T]"):
        """Returns True if this record type has a compatible item type.

        Fix: an item type whose base is not registered now returns False
        instead of raising ``KeyError``.
        """
        registered = self.mapping.get(itemtype.__get_base__())
        return registered is not None and issubclass(registered, itemtype)

    def validate(self, record: "Record"):
        """Validates that the record matches this record type.

        :raises KeyError: when a required item is missing or the record
            contains items not registered in this type
        :returns: the (unchanged) record
        """
        if self.item_types:
            for item_type in self.item_types:
                try:
                    record.__getitem__(item_type)
                except KeyError:
                    # Suppress the inner KeyError context: the message
                    # below is the meaningful one
                    raise KeyError(f"Item of type {item_type} is missing") from None

            if len(record.items) != len(self.item_types):
                unregistered = [
                    item
                    for item in record.items.values()
                    if all(
                        not issubclass(item.__get_base__(), item_type)
                        for item_type in self.item_types
                    )
                ]
                raise KeyError(
                    f"The record of type {self} contains unregistered items: {unregistered}"
                )

        return record
142
+
143
+
144
def record_type(*item_types: Type[T]):
    """Build and return a new :class:`RecordType` from the given item types"""
    return RecordType(*item_types)
147
+
148
+
149
class Record:
    """Associate types with entries

    A record is a composition of items; each item base class is unique.
    """

    #: Items for this record, indexed by their base type
    items: "Items"

    def __init__(self, *items: "Union[Items, T]", override=False):
        """Build a record from items or from an items dictionary.

        :param items: either a single dictionary mapping base types to
            items, or any number of item instances
        :param override: when True, a later item silently replaces an
            earlier one sharing the same base type
        :raises RuntimeError: when two items share a base type and
            ``override`` is False
        """
        self.items = {}

        if len(items) == 1 and isinstance(items[0], dict):
            # Copy the dictionary (fix: the dict was previously aliased,
            # so later external mutations leaked into the record)
            self.items = dict(items[0])
        else:
            for entry in items:
                base = entry.__get_base__()
                if not override and base in self.items:
                    raise RuntimeError(
                        f"The item type {base} ({entry.__class__})"
                        " is already in the record"
                    )
                self.items[base] = entry

    def __str__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {value}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def __repr__(self):
        return (
            "{"
            + ", ".join(
                f"{key.__module__}.{key.__qualname__}: {repr(value)}"
                for key, value in self.items.items()
            )
            + "}"
        )

    def get(self, key: "Type[T]") -> "Optional[T]":
        """Get a given item or None if it does not exist"""
        try:
            return self[key]
        except KeyError:
            return None

    def has(self, key: "Type[T]") -> bool:
        """Returns True if the record has the given item type"""
        return key.__get_base__() in self.items

    def __getitem__(self, key: "Type[T]") -> "T":
        """Get an item given its type.

        :raises KeyError: when no entry matches the base type, or when the
            stored entry is not an instance of ``key``
        """
        base = key.__get_base__()
        try:
            entry = self.items[base]
        except KeyError:
            # Suppress the raw dict KeyError in favor of a readable message
            raise KeyError(
                f"""No entry with type {key}: """
                f"""{",".join(str(s) for s in self.items.keys())}"""
            ) from None

        # Check if this matches the expected class
        if not isinstance(entry, key):
            raise KeyError(
                f"""No entry with type {key}: """
                f"""{",".join(str(s) for s in self.items.keys())}"""
            )
        return entry

    def update(self, *items: "T", target: "RecordType" = None) -> "Record":
        """Returns a new record with the given items replaced or added.

        :param target: when given, the resulting record is validated
            against this record type (fix: the parameter was previously
            accepted but silently ignored)
        """
        # Create our new dictionary
        item_dict = {**self.items}
        for item in items:
            item_dict[item.__get_base__()] = item

        result = Record(item_dict)
        if target is not None:
            target.validate(result)
        return result
datamaestro/registry.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from pathlib import Path
2
2
  import shutil
3
3
  from tempfile import NamedTemporaryFile
4
+ import yaml
4
5
 
5
6
 
6
7
  class RegistryEntry:
datamaestro/search.py CHANGED
@@ -40,7 +40,7 @@ class AndCondition(Condition):
40
40
  return True
41
41
 
42
42
  def __repr__(self):
43
- return " AND ".join(self.conditions)
43
+ return " AND ".join([repr(x) for x in self.conditions])
44
44
 
45
45
 
46
46
  class OrCondition(Condition):
datamaestro/settings.py CHANGED
@@ -1,6 +1,8 @@
1
1
  """Global and user settings utility classes"""
2
+
2
3
  import marshmallow as mm
3
- from experimaestro.utils.settings import JsonSettings, PathField
4
+ from typing import Dict, Any
5
+ from experimaestro.utils.settings import JsonSettings
4
6
  from pathlib import Path
5
7
 
6
8
  # --- Global settings
datamaestro/sphinx.py ADDED
@@ -0,0 +1,224 @@
1
+ # Sphinx extension for datamaestro datasets
2
+
3
+ from typing import Any, Dict, Tuple
4
+ from sphinx.ext.autodoc.mock import mock
5
+
6
+ from docutils import nodes
7
+
8
+ from sphinx.application import Sphinx
9
+ from sphinx.domains import Domain, ObjType
10
+ from sphinx.roles import XRefRole
11
+ from sphinx.util.docutils import SphinxDirective
12
+ from sphinx.locale import _
13
+ from sphinx import addnodes
14
+ from sphinx.util.nodes import make_refnode
15
+ import datamaestro
16
+ from datamaestro.data import AbstractDataset
17
+ import logging
18
+ from myst_parser.config.main import MdParserConfig
19
+ from myst_parser.mdit_to_docutils.base import DocutilsRenderer
20
+ from myst_parser.parsers.mdit import create_md_parser
21
+
22
+
23
class DatasetNode(nodes.paragraph):
    """Docutils node marking a dataset description paragraph."""
25
+
26
+
27
def to_docutils(source: str):
    """Render a Markdown string into docutils nodes using MyST."""
    md_parser = create_md_parser(MdParserConfig(), DocutilsRenderer)
    return md_parser.render(source)
30
+
31
+
32
class DatasetsDirective(SphinxDirective):
    """Base directive providing the rendering of a single dataset."""

    def dataset_desc(self, ds: AbstractDataset):
        """Build and return the ``desc`` node documenting dataset *ds*.

        Also registers the dataset anchor in the ``dm`` domain so that
        cross-references can resolve to it.
        """
        dm = self.env.get_domain("dm")

        assert isinstance(dm, DatamaestroDomain)
        dm.add_dataset(ds.id)

        # indexnode = addnodes.index(entries=[])
        desc = addnodes.desc()
        desc["domain"] = DatamaestroDomain.name
        desc["objtype"] = desc["desctype"] = "dataset"
        desc["classes"].append(DatamaestroDomain.name)

        # Signature line: "Dataset <id>", anchored as "dataset-<id>"
        signodes = addnodes.desc_signature(ds.id, "", is_multiline=True)
        desc.append(signodes)

        signode = addnodes.desc_signature_line()
        signode += nodes.Text("Dataset ")
        signode += addnodes.desc_name(text=ds.id)
        signode["ids"].append("dataset" + "-" + ds.id)
        signodes.append(signode)

        content = addnodes.desc_content()
        desc.append(content)

        if ds.configtype:
            # Cross-reference to the experimaestro configuration class
            ctype = ds.configtype
            name = f"{ctype.__module__}.{ctype.__qualname__}"

            # NOTE(review): ``te`` is built but never attached to the
            # output tree — looks like dead code; confirm before removing
            te = nodes.paragraph()
            te.append(nodes.Text("Experimaestro type: "))

            p = nodes.paragraph()
            returns = addnodes.desc_returns()
            xref = addnodes.pending_xref(
                "",
                nodes.Text(name),
                refdomain="py",
                reftype="class",
                reftarget=name,
            )
            returns.append(xref)
            p.append(returns)

            content.append(p)

        # node.append(nodes.Text(ds.id))
        if ds.name:
            content.append(
                nodes.paragraph("", "", nodes.strong("", nodes.Text(ds.name)))
            )

        if ds.tags or ds.tasks:
            if ds.tags:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", nodes.Text("Tags: ")),
                        nodes.Text(", ".join(ds.tags)),
                    )
                )
            if ds.tasks:
                content.append(
                    nodes.paragraph(
                        "",
                        "",
                        nodes.strong("", "Tasks: "),
                        nodes.Text(", ".join(ds.tasks)),
                    )
                )

        if ds.url:
            href = nodes.reference(refuri=ds.url)
            href.append(nodes.Text(ds.url))
            p = nodes.paragraph()
            p.append(nodes.Text("External link: "))
            p.append(href)
            content.append(p)

        if ds.description:
            # Dataset descriptions are Markdown; render through MyST
            content.extend(to_docutils(ds.description))

        return desc
116
+
117
+
118
class RepositoryDirective(DatasetsDirective):
    """Generates the document for a whole repository"""

    has_content = True
    required_arguments = 1  # the repository identifier
    optional_arguments = 0

    def run(self):
        (repository_id,) = self.arguments
        # Mock the autodoc-mocked imports while loading repository modules,
        # since importing dataset definitions may pull in heavy dependencies
        with mock(self.config.autodoc_mock_imports):
            repository = datamaestro.Context.instance().repository(repository_id)  # type: Optional[datamaestro.Repository]
            assert repository is not None

            docnodes = []
            # One section per dataset module of the repository
            for module in repository.modules():
                section = nodes.section(
                    ids=[f"dm-datasets-{repository_id}-{module.id}"]
                )
                docnodes.append(section)

                section += nodes.title("", nodes.Text(module.title))
                section += nodes.paragraph()
                if module.description:
                    # Module descriptions are Markdown
                    section += to_docutils(module.description).children

                for ds in iter(module):
                    section += self.dataset_desc(ds)

        return docnodes
147
+
148
+
149
class DatasetDirective(DatasetsDirective):
    """Documents the datasets of one module, optionally from a given repository.

    Arguments: the module name, and optionally a repository id (defaults
    to the ``datamaestro_repository`` configuration value).
    """

    has_content = True
    required_arguments = 1
    optional_arguments = 1

    def run(self):
        # --- Retrieve the datasets
        if len(self.arguments) == 2:
            module_name, repository_name = self.arguments
        else:
            (module_name,) = self.arguments
            repository_name = self.env.config["datamaestro_repository"]

        datasets = None
        # Mock autodoc-mocked imports while scanning repositories
        with mock(self.config.autodoc_mock_imports):
            for repository in datamaestro.Context.instance().repositories():
                # A None repository_name matches the first repository that
                # provides the module
                if repository_name is None or repository.id == repository_name:
                    datasets = repository.datasets(module_name)
                    if datasets is not None:
                        break

        assert datasets is not None

        # --- Start documenting

        docnodes = []
        # node.document = self.state.document
        if datasets.description:
            docnodes.extend(to_docutils(datasets.description))

        for ds in datasets:
            docnodes.append(self.dataset_desc(ds))
        return docnodes
182
+
183
+
184
class DatamaestroDomain(Domain):
    """Sphinx domain (``dm``) for datamaestro datasets."""

    name = "dm"
    object_types = {
        "dataset": ObjType(_("dataset"), "ds"),
    }
    directives = {
        "repository": RepositoryDirective,
        "datasets": DatasetDirective,
    }
    roles = {"ref": XRefRole()}
    indices = {
        # TODO: Add indices for tags and tasks
        # NOTE(review): Sphinx expects ``indices`` to be a list of Index
        # subclasses; switch to a list when indices are added
    }
    initial_data: Dict[str, Dict[str, Tuple[str, str]]] = {
        "datasets": {},  # dataset id -> (docname, anchor)
        "tags": {},  # tag -> list of datasets,
        "tasks": {},  # task name -> list of datasets
    }

    def add_dataset(self, dsid):
        """Register a dataset anchor in the current document"""
        self.data["datasets"][dsid] = (self.env.docname, f"dataset-{dsid}")

    def resolve_xref(self, env, fromdocname, builder, typ, target, node, contnode):
        """Resolve a ``dm:ref`` cross-reference to a registered dataset."""
        # Fix: the message previously had no %s placeholder, so ``target``
        # was passed as a stray lazy-formatting argument and never logged
        logging.debug("[dm/sphinx] Searching for %s", target)

        ref = self.data["datasets"].get(target, None)
        if ref:
            docname, targ = ref
            return make_refnode(builder, fromdocname, docname, targ, contnode, targ)
        return None
214
+
215
+
216
def setup(app: Sphinx) -> Dict[str, Any]:
    """Setup experimaestro for Sphinx documentation

    Registers the ``dm`` domain and the :class:`DatasetNode` node, plus
    the ``datamaestro_repository`` configuration value (default repository
    used by the ``dm:datasets`` directive when none is given).
    """

    app.add_domain(DatamaestroDomain)
    app.add_node(DatasetNode)

    # Default repository id; the third argument marks it as rebuild-triggering
    app.add_config_value("datamaestro_repository", None, True)

    return {"version": datamaestro.version, "parallel_read_safe": True}
@@ -1,5 +1,3 @@
1
- import io
2
- import logging
3
1
  from pathlib import Path
4
2
 
5
3
 
@@ -28,7 +28,8 @@ class LineTransformStream(io.RawIOBase):
28
28
  self.current = self.transform(line).encode("utf-8")
29
29
 
30
30
  def readinto(self, b):
31
- """Read bytes into a pre-allocated, writable bytes-like object b and return the number of bytes read"""
31
+ """Read bytes into a pre-allocated, writable bytes-like object b and
32
+ return the number of bytes read"""
32
33
  if self.current is None:
33
34
  return 0
34
35
 
@@ -41,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
41
42
  return offset
42
43
 
43
44
  # How many bytes to read from current line
44
- l = min(lb, len(self.current) - self.offset)
45
-
46
- b[offset : (offset + l)] = self.current[self.offset : (self.offset + l)]
47
- lb -= l
48
- offset += l
49
- self.offset += l
45
+ chunk_len = min(lb, len(self.current) - self.offset)
46
+
47
+ b[offset : (offset + chunk_len)] = self.current[
48
+ self.offset : (self.offset + chunk_len)
49
+ ]
50
+ lb -= chunk_len
51
+ offset += chunk_len
52
+ self.offset += chunk_len
50
53
 
51
54
  return offset
52
55
 
@@ -1,4 +1,4 @@
1
- # See documentation on http://experimaestro.github.io/datamaestro/
1
+ # See documentation on https://datamaestro.readthedocs.io
2
2
 
3
3
  from datamaestro.definitions import datatasks, datatags, dataset
4
4
  from datamaestro.data import Base
@@ -7,11 +7,12 @@ from datamaestro.data import Base
7
7
  @datatags("tag1", "tag2")
8
8
  @datatasks("task1", "task2")
9
9
  @dataset(
10
- Base, url="__URL__",
10
+ Base,
11
+ url="__URL__",
11
12
  )
12
13
  def __IDENTIFIER__():
13
14
  """Line description
14
15
 
15
- Long description
16
- """
16
+ Long description
17
+ """
17
18
  return {}
@@ -1 +1,3 @@
1
- from .checks import *
1
+ from .checks import DatasetTests
2
+
3
+ __all__ = ["DatasetTests"]
@@ -1,11 +1,7 @@
1
1
  import logging
2
- import traceback
3
2
  import importlib
4
3
  import inspect
5
-
6
- from datamaestro.context import Context, Repository
7
-
8
- import unittest
4
+ from datamaestro.context import Context
9
5
 
10
6
 
11
7
  class DatasetTests:
@@ -1,13 +1,8 @@
1
1
  from pathlib import Path
2
- import contextlib
3
- import unittest
4
- import tempfile
5
2
  from datamaestro import Repository, Context
6
3
  import shutil
7
4
  import logging
8
5
  import pytest
9
- import os
10
- import shutil
11
6
 
12
7
 
13
8
  class MyRepository(Repository):
@@ -27,7 +22,7 @@ def context(tmp_path_factory):
27
22
  context = Context(Path(dir))
28
23
  logging.info("Created datamaestro test directory %s", dir)
29
24
 
30
- repository = MyRepository(context)
25
+ _repository = MyRepository(context) # noqa: F841 - registered on creation
31
26
 
32
27
  yield context
33
28
 
@@ -1,11 +1,11 @@
1
1
  from datamaestro.annotations.agreement import useragreement
2
2
  from datamaestro.definitions import AbstractDataset
3
- from .conftest import repository
4
3
 
5
4
 
6
5
  def test_useragreements(context):
7
6
  # Fake dataset
8
7
  class t(AbstractDataset):
9
- pass
8
+ def _prepare(self):
9
+ pass
10
10
 
11
11
  useragreement("test")(t(None))
@@ -1,9 +1,5 @@
1
- import unittest
2
- import logging
3
1
  from pathlib import Path
4
- import shutil
5
2
  import datamaestro.download.single as single
6
- from datamaestro import Repository, Context
7
3
  from datamaestro.definitions import AbstractDataset
8
4
  from .conftest import MyRepository
9
5
 
@@ -16,6 +12,9 @@ class Dataset(AbstractDataset):
16
12
  super().__init__(repository)
17
13
  self.datapath = Path(repository.context._path)
18
14
 
15
+ def _prepare(self):
16
+ pass
17
+
19
18
 
20
19
  def test_filedownloader(context):
21
20
  repository = MyRepository(context)
@@ -0,0 +1,72 @@
1
import pickle
from datamaestro.record import Item, record_type
from attrs import define
import pytest


@define
class AItem(Item):
    a: int


@define
class A1Item(AItem):
    a1: int


@define
class BItem(Item):
    b: int


@define
class B1Item(BItem):
    b1: int


@define
class CItem(Item):
    c: int


# Record types: ARecord requires an AItem; BaseRecord refines it to A1Item;
# MyRecord additionally requires a BItem
ARecord = record_type(AItem)
BaseRecord = ARecord.sub(A1Item)
MyRecord = BaseRecord.sub(BItem)


def test_record_simple():
    """Items are retrievable both by their own type and their base type"""
    a = A1Item(1, 2)
    b = BItem(4)
    r = MyRecord(a, b)
    assert r[AItem] is a
    assert r[A1Item] is a
    assert r[BItem] is b


def test_record_missing_init():
    """Building a typed record with missing or too-generic items fails"""
    with pytest.raises(KeyError):
        # A1Item is missing
        MyRecord(AItem(1), BItem(2))

    with pytest.raises(KeyError):
        # BItem is missing
        MyRecord(A1Item(1, 2))


def test_record_update():
    """update() returns a new record holding the replacement item"""
    a = A1Item(1, 2)
    b = BItem(4)
    r = MyRecord(a, b)

    r2 = r.update(BItem(3))
    assert r is not r2
    assert r2[BItem] is not b


def test_record_pickled():
    """Records survive a pickle round-trip"""
    # Build the record type dynamically to exercise pickling of sub-typed records
    MyRecord2 = BaseRecord.sub(BItem)
    r = MyRecord2(A1Item(1, 2), BItem(2))
    r = pickle.loads(pickle.dumps(r))

    assert r[A1Item].a == 1
    assert r[BItem].b == 2