PyPI - datamaestro - Versions diffs - 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

datamaestro-1.5.0-py3-none-any.whl → datamaestro-1.7.0-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (33)

datamaestro/__init__.py +1 -2
datamaestro/__main__.py +11 -7
datamaestro/commands/site.py +16 -5
datamaestro/context.py +32 -16
datamaestro/data/ml.py +1 -0
datamaestro/definitions.py +246 -20
datamaestro/download/__init__.py +583 -40
datamaestro/download/archive.py +120 -76
datamaestro/download/custom.py +38 -6
datamaestro/download/huggingface.py +46 -14
datamaestro/download/links.py +106 -49
datamaestro/download/multiple.py +27 -5
datamaestro/download/single.py +111 -54
datamaestro/download/sync.py +0 -1
datamaestro/download/todo.py +9 -4
datamaestro/download/wayback.py +3 -3
datamaestro/record.py +48 -2
datamaestro/settings.py +2 -1
datamaestro/sphinx.py +1 -3
datamaestro/stream/lines.py +8 -6
datamaestro/test/__init__.py +3 -1
datamaestro/test/conftest.py +1 -2
datamaestro/test/test_resource.py +1388 -0
datamaestro/utils.py +7 -6
datamaestro/v2.md +301 -0
datamaestro/version.py +4 -21
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
datamaestro-1.7.0.dist-info/RECORD +49 -0
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
datamaestro-1.5.0.dist-info/RECORD +0 -48
datamaestro-1.5.0.dist-info/top_level.txt +0 -1
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0

datamaestro/download/multiple.py CHANGED Viewed

@@ -1,13 +1,31 @@
+"""Multiple download resources (legacy).
+Note: This module uses a legacy API pattern and needs deeper refactoring.
+The List and Datasets classes use an older constructor signature that
+differs from the modern Resource interface.
+"""
 import logging
-from pathlib import Path
 import os
+import warnings
+from pathlib import Path
-from datamaestro import AbstractDataset
+from datamaestro.definitions import AbstractDataset
 from datamaestro.download import Download
+warnings.warn(
+    "datamaestro.download.multiple uses a legacy API. "
+    "Consider migrating to class-attribute resource definitions.",
+    DeprecationWarning,
+    stacklevel=2,
+)
 class List(Download):
-    """Download multiple files or directories given by a list"""
+    """Download multiple files or directories given by a list.
+    Legacy: uses old-style constructor API.
+    """
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -32,7 +50,10 @@ class List(Download):
 class Datasets(Download):
-    """Use links to dataset files"""
+    """Use links to dataset files.
+    Legacy: uses old-style constructor API.
+    """
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -48,7 +69,8 @@ class Datasets(Download):
                 if isinstance(files, Path):
                     if not files.is_dir():
                         raise AssertionError(
-                            "Dataset path is not a directory: %s", files
+                            "Dataset path is not a directory: %s",
+                            files,
                         )
                     path = destination / key
                     if not path.exists():

datamaestro/download/single.py CHANGED Viewed

@@ -1,64 +1,83 @@
-from typing import Optional
-import logging
-import shutil
-import tarfile
+"""Single file download resources.
+Provides FileResource subclasses for downloading individual files
+from URLs, with optional transforms and integrity checking.
+"""
+from __future__ import annotations
 import io
 import gzip
-import os.path as op
+import logging
 import os
-import urllib3
+import os.path as op
+import shutil
+import tarfile
 from pathlib import Path
-import re
-from datamaestro.utils import copyfileobjs
+import urllib3
+from datamaestro.download import FileResource
 from datamaestro.stream import Transform
-from datamaestro.download import Download
+from datamaestro.utils import copyfileobjs
+logger = logging.getLogger(__name__)
 def open_ext(*args, **kwargs):
-    """Opens a file according to its extension"""
+    """Opens a file according to its extension."""
     name = args[0]
     if name.endswith(".gz"):
         return gzip.open(*args, *kwargs)
     return io.open(*args, **kwargs)
-class SingleDownload(Download):
-    def __init__(self, filename: str):
-        super().__init__(re.sub(r"\..*$", "", filename))
-        self.name = filename
+class FileDownloader(FileResource):
+    """Downloads a single file from a URL.
+    Supports optional transforms (e.g., gzip decompression)
+    and integrity checking.
-    @property
-    def path(self):
-        return self.definition.datapath / self.name
+    Usage as class attribute (preferred)::
-    def prepare(self):
-        return self.path
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = FileDownloader.apply(
+                "data.csv", "http://example.com/data.csv.gz"
+            )
-    def download(self, force=False):
-        if not self.path.is_file() and not force:
-            self._download(self.path)
+    Usage as decorator (deprecated)::
+        @filedownloader("data.csv", "http://example.com/data.csv.gz")
+        @dataset(Base)
+        def my_dataset(data): ...
+    """
-class filedownloader(SingleDownload):
     def __init__(
         self,
         filename: str,
         url: str,
-        size: int = None,
-        transforms: Optional[Transform] = None,
+        size: int | None = None,
+        transforms: Transform | None = None,
         checker=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
     ):
-        """Downloads a file given by a URL
-        :param filename: The filename within the data folder; the variable name
-            corresponds to the filename without the extension.
-        :param url: The URL to download.
-        :param transforms: Transform the file before storing it size: size in
-            bytes (or None)
         """
-        super().__init__(filename)
+        Args:
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            size: Expected size in bytes (or None).
+            transforms: Transform the file before storing it.
+                Auto-detected from URL path if None.
+            checker: File integrity checker.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
+        """
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.checker = checker
         self.size = size
@@ -67,8 +86,8 @@ class filedownloader(SingleDownload):
         path = Path(Path(p.path).name)
         self.transforms = transforms if transforms else Transform.createFromPath(path)
-    def _download(self, destination):
-        logging.info("Downloading %s into %s", self.url, destination)
+    def _download(self, destination: Path) -> None:
+        logger.info("Downloading %s into %s", self.url, destination)
         # Creates directory if needed
         dir = op.dirname(destination)
@@ -78,41 +97,69 @@ class filedownloader(SingleDownload):
         with self.context.downloadURL(self.url, size=self.size) as file:
             # Transform if need be
             if self.transforms:
-                logging.info("Transforming file")
-                with self.transforms(file.path.open("rb")) as stream, destination.open(
-                    "wb"
-                ) as out:
+                logger.info("Transforming file")
+                with (
+                    self.transforms(file.path.open("rb")) as stream,
+                    destination.open("wb") as out,
+                ):
                     if self.checker:
                         copyfileobjs(stream, [out, self.checker])
                         self.checker.close()
                     else:
                         shutil.copyfileobj(stream, out)
             else:
-                logging.info("Keeping original downloaded file %s", file.path)
+                logger.info("Keeping original downloaded file %s", file.path)
                 if self.checker:
                     self.checker.check(file.path)
                 (shutil.copy if file.keep else shutil.move)(file.path, destination)
-        logging.info("Created file %s" % destination)
+        logger.info("Created file %s", destination)
-class concatdownload(SingleDownload):
-    """Concatenate all files in an archive"""
+# Factory alias for backward compat and convenient usage
+filedownloader = FileDownloader.apply
-    def __init__(self, filename: str, url: str, transforms=None):
-        """Concat the files in an archive
+class ConcatDownloader(FileResource):
+    """Concatenate all files from an archive into a single file.
+    Usage as class attribute (preferred)::
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = ConcatDownloader.apply(
+                "data.txt", "http://example.com/data.tar.gz"
+            )
+    """
+    def __init__(
+        self,
+        filename: str,
+        url: str,
+        transforms=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
+    ):
+        """
         Args:
-            filename: The filename within the data folder; the variable name
-            corresponds to the filename without the extension url: The URL to
-            download transforms: Transform the file before storing it
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            transforms: Transform the file before storing it.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(filename)
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.transforms = transforms
-    def _download(self, destination):
-        with self.context.downloadURL(self.url) as dl, tarfile.open(dl.path) as archive:
+    def _download(self, destination: Path) -> None:
+        with (
+            self.context.downloadURL(self.url) as dl,
+            tarfile.open(dl.path) as archive,
+        ):
             destination.parent.mkdir(parents=True, exist_ok=True)
             with open(destination, "wb") as out:
@@ -121,6 +168,16 @@ class concatdownload(SingleDownload):
                         transforms = self.transforms or Transform.createFromPath(
                             Path(tarinfo.name)
                         )
-                        logging.debug("Processing file %s", tarinfo.name)
+                        logger.debug("Processing file %s", tarinfo.name)
                         with transforms(archive.fileobject(archive, tarinfo)) as fp:
                             shutil.copyfileobj(fp, out)
+# Factory alias for backward compat
+concatdownload = ConcatDownloader.apply
+# --- Backward compat aliases ---
+# Keep old class names importable but they now point to new classes
+SingleDownload = FileDownloader

datamaestro/download/sync.py CHANGED Viewed

@@ -2,7 +2,6 @@ import logging
 from pathlib import Path
 from datamaestro.download import Download
-from datamaestro.definitions import AbstractDataset
 from subprocess import run

datamaestro/download/todo.py CHANGED Viewed

@@ -1,10 +1,15 @@
-from pathlib import Path
+from datamaestro.download import Resource
-from datamaestro.download import Download
+class Todo(Resource):
+    """Placeholder resource indicating download is not yet implemented."""
-class Todo(Download):
-    def download(self, destination: Path):
+    def download(self, force=False):
         raise NotImplementedError(
             "Download method not defined - please edit the definition file"
         )
+    def prepare(self):
+        raise NotImplementedError(
+            "Prepare method not defined - please edit the definition file"
+        )

datamaestro/download/wayback.py CHANGED Viewed

@@ -129,12 +129,12 @@ class wayback_documents(Resource):
         self.urls_fn = urls_fn
     def prepare(self):
-        return self.definition.datapath / self.varname
+        return self.dataset.datapath / self.name
     def download(self, force=False):
         # Creates directory if needed
-        destination: Path = self.definition.datapath / self.varname
-        self.definition.datapath.mkdir(exist_ok=True)
+        destination: Path = self.dataset.datapath / self.name
+        self.dataset.datapath.mkdir(exist_ok=True)
         # Early exit
         done_path = destination.with_suffix(".done")

datamaestro/record.py CHANGED Viewed

@@ -1,5 +1,51 @@
+"""Record module for type-safe heterogeneous containers.
+.. deprecated:: 2.0
+    This module will be removed in v2. Use :class:`typing.TypedDict` instead
+    for type-safe heterogeneous data structures. TypedDict provides better IDE
+    support, type checking, and is part of the standard library.
+    When using TypedDict, define key constants in classes (e.g., ``MyItem.ID``)
+    to avoid typos and enable IDE autocomplete. Prefix keys with package name
+    using underscore ``_`` as delimiter to avoid conflicts between different
+    data sources.
+Example migration::
+    # Old way (deprecated)
+    @define
+    class MyItem(Item):
+        value: int
+    record = Record(MyItem(42))
+    print(record[MyItem].value)
+    # New way (recommended)
+    from typing import TypedDict
+    # Define key constants in classes
+    class MyItem:
+        ID = "mypackage_value"
+    class MyRecord(TypedDict):
+        mypackage_value: int
+    data: MyRecord = {MyItem.ID: 42}
+    print(data[MyItem.ID])
+"""
+import warnings
 from typing import Type, TypeVar, Dict, Union, Optional
+# Emit deprecation warning when module is imported
+warnings.warn(
+    "The datamaestro.record module is deprecated and will be removed in v2. "
+    "Use typing.TypedDict instead (use class constants like MyItem.ID for keys, "
+    "prefixed with package name).",
+    DeprecationWarning,
+    stacklevel=2,
+)
 class Item:
     """Base class for all item types"""
@@ -28,8 +74,8 @@ class RecordType:
         self.mapping = {item_type.__get_base__(): item_type for item_type in item_types}
     def __repr__(self):
-        return f"""Record({",".join(item_type.__name__ for item_type in
-                self.item_types)})"""
+        names = ",".join(item_type.__name__ for item_type in self.item_types)
+        return f"Record({names})"
     def contains(self, other: "RecordType"):
         """Checks that each item type in other has an item type of a compatible

datamaestro/settings.py CHANGED Viewed

@@ -1,7 +1,8 @@
 """Global and user settings utility classes"""
 import marshmallow as mm
 from typing import Dict, Any
-from experimaestro.utils.settings import JsonSettings, PathField
+from experimaestro.utils.settings import JsonSettings
 from pathlib import Path
 # --- Global settings

datamaestro/sphinx.py CHANGED Viewed

@@ -125,9 +125,7 @@ class RepositoryDirective(DatasetsDirective):
     def run(self):
         (repository_id,) = self.arguments
         with mock(self.config.autodoc_mock_imports):
-            repository = datamaestro.Context.instance().repository(
-                repository_id
-            )  # type: Optional[datamaestro.Repository]
+            repository = datamaestro.Context.instance().repository(repository_id)  # type: Optional[datamaestro.Repository]
             assert repository is not None
             docnodes = []

datamaestro/stream/lines.py CHANGED Viewed

@@ -42,12 +42,14 @@ class LineTransformStream(io.RawIOBase):
                     return offset
             # How many bytes to read from current line
-            l = min(lb, len(self.current) - self.offset)
-            b[offset : (offset + l)] = self.current[self.offset : (self.offset + l)]
-            lb -= l
-            offset += l
-            self.offset += l
+            chunk_len = min(lb, len(self.current) - self.offset)
+            b[offset : (offset + chunk_len)] = self.current[
+                self.offset : (self.offset + chunk_len)
+            ]
+            lb -= chunk_len
+            offset += chunk_len
+            self.offset += chunk_len
         return offset

datamaestro/test/__init__.py CHANGED Viewed

@@ -1 +1,3 @@
-from .checks import *
+from .checks import DatasetTests
+__all__ = ["DatasetTests"]

datamaestro/test/conftest.py CHANGED Viewed

@@ -3,7 +3,6 @@ from datamaestro import Repository, Context
 import shutil
 import logging
 import pytest
-import shutil
 class MyRepository(Repository):
@@ -23,7 +22,7 @@ def context(tmp_path_factory):
     context = Context(Path(dir))
     logging.info("Created datamaestro test directory %s", dir)
-    repository = MyRepository(context)
+    _repository = MyRepository(context)  # noqa: F841 - registered on creation
     yield context

datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

datamaestro-1.5.0-py3-none-any.whl → datamaestro-1.7.0-py3-none-any.whl