PyPI - datamaestro - Versions diffs - 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl - Mend

datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

datamaestro/__init__.py +1 -2
datamaestro/__main__.py +11 -7
datamaestro/commands/site.py +16 -5
datamaestro/context.py +32 -16
datamaestro/data/ml.py +1 -0
datamaestro/definitions.py +246 -20
datamaestro/download/__init__.py +583 -40
datamaestro/download/archive.py +120 -76
datamaestro/download/custom.py +38 -6
datamaestro/download/huggingface.py +46 -14
datamaestro/download/links.py +106 -49
datamaestro/download/multiple.py +27 -5
datamaestro/download/single.py +111 -54
datamaestro/download/sync.py +0 -1
datamaestro/download/todo.py +9 -4
datamaestro/download/wayback.py +3 -3
datamaestro/record.py +48 -2
datamaestro/settings.py +2 -1
datamaestro/sphinx.py +1 -3
datamaestro/stream/lines.py +8 -6
datamaestro/test/__init__.py +3 -1
datamaestro/test/conftest.py +1 -2
datamaestro/test/test_resource.py +1388 -0
datamaestro/utils.py +7 -6
datamaestro/v2.md +301 -0
datamaestro/version.py +4 -21
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/METADATA +63 -94
datamaestro-1.7.0.dist-info/RECORD +49 -0
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
datamaestro-1.5.0.dist-info/RECORD +0 -48
datamaestro-1.5.0.dist-info/top_level.txt +0 -1
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -0
{datamaestro-1.5.0.dist-info → datamaestro-1.7.0.dist-info}/licenses/LICENSE +0 -0

datamaestro/download/archive.py CHANGED Viewed

@@ -1,41 +1,68 @@
+"""Archive download resources.
+Provides FolderResource subclasses for downloading and extracting
+ZIP and TAR archives.
+"""
+from __future__ import annotations
 import logging
-from pathlib import Path
-import zipfile
+import re
 import shutil
-import urllib3
 import tarfile
-import re
+import zipfile
+from pathlib import Path
 from typing import Set
-from datamaestro.download import Download, initialized
+import urllib3
+from datamaestro.download import FolderResource
 from datamaestro.utils import CachedFile, FileChecker
+logger = logging.getLogger(__name__)
+class ArchiveDownloader(FolderResource):
+    """Abstract base for all archive-related extractors.
+    Usage as class attribute (preferred)::
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = ZipDownloader.apply(
+                "archive", "http://example.com/data.zip"
+            )
-class ArchiveDownloader(Download):
-    """Abstract class for all archive related extractors"""
+    Usage as decorator (deprecated)::
+        @zipdownloader("archive", "http://example.com/data.zip")
+        @dataset(Base)
+        def my_dataset(archive): ...
+    """
     def __init__(
         self,
-        varname,
+        varname: str,
         url: str,
-        subpath: str = None,
-        checker: FileChecker = None,
-        files: Set[str] = None,
+        subpath: str | None = None,
+        checker: FileChecker | None = None,
+        files: Set[str] | None = None,
+        *,
+        transient: bool = False,
     ):
-        """Downloads and extract the content of the archive
+        """Downloads and extract the content of the archive.
         Args:
-            varname: The name of the variable when defining the dataset
-            url: The archive URL
-            checker: the hash check for the downloaded file, composed of two
-            subpath: A subpath in the archive; only files from this subpath will
-            be extracted
-            files: A set of files; if present, only extract those
+            varname: The name of the variable when defining the dataset.
+            url: The archive URL.
+            subpath: A subpath in the archive; only files from this
+                subpath will be extracted.
+            checker: The hash check for the downloaded file.
+            files: A set of files; if present, only extract those.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(varname)
+        super().__init__(varname=varname, transient=transient)
         self.url = url
         self.subpath = subpath
         self.checker = checker
@@ -46,20 +73,33 @@ class ArchiveDownloader(Download):
     def postinit(self):
         # Define the path
         p = urllib3.util.parse_url(self.url)
-        name = self._name(Path(p.path).name)
+        self._archive_name = self._name(Path(p.path).name)
-        if len(self.definition.resources) > 1:
-            self.path = self.definition.datapath / name
-        else:
-            self.path = self.definition.datapath
+    @property
+    def path(self) -> Path:
+        """Final path to the extracted directory."""
+        if not self._post:
+            self._post = True
+            self.postinit()
+        if len(self.dataset.resources) > 1:
+            return self.dataset.datapath / self._archive_name
+        return self.dataset.datapath
+    @property
+    def transient_path(self) -> Path:
+        """Temporary path for extraction."""
+        if not self._post:
+            self._post = True
+            self.postinit()
-    @initialized
-    def prepare(self):
-        return self.path
+        if len(self.dataset.resources) > 1:
+            return self.dataset.datapath / ".downloads" / self._archive_name
+        return self.dataset.datapath / ".downloads" / self.name
     @property
     def extractall(self):
-        """Returns whether everything can be extracted"""
+        """Returns whether everything can be extracted."""
         return self._files is None and self.subpath is None
     def filter(self, iterable, getname):
@@ -67,8 +107,8 @@ class ArchiveDownloader(Download):
         for info in iterable:
             name = getname(info)
-            logging.debug("Looking at %s", name)
-            if self._files and not (name in self._files):
+            logger.debug("Looking at %s", name)
+            if self._files and name not in self._files:
                 continue
             if self.subpath and name.startswith(self.subpath):
@@ -77,82 +117,78 @@ class ArchiveDownloader(Download):
             if not self.subpath:
                 yield info, name
-    @initialized
-    def download(self, force=False):
-        # Already downloaded
-        destination = self.definition.datapath
-        if destination.is_dir():
-            return
-        logging.info("Downloading %s into %s", self.url, destination)
+    def _download(self, destination: Path) -> None:
+        logger.info("Downloading %s into %s", self.url, destination)
         destination.parent.mkdir(parents=True, exist_ok=True)
-        tmpdestination = destination.with_suffix(".tmp")
-        if tmpdestination.exists():
-            logging.warn("Removing temporary directory %s", tmpdestination)
-            shutil.rmtree(tmpdestination)
         with self.context.downloadURL(self.url) as file:
             if self.checker:
                 self.checker.check(file.path)
-            self.unarchive(file, tmpdestination)
-        # Look at the content
-        for ix, path in enumerate(tmpdestination.iterdir()):
-            if ix > 1:
-                break
-        # Just one folder: move
-        if ix == 0 and path.is_dir():
-            logging.info(
-                "Moving single file/directory {} into destination {}".format(
-                    path, destination
-                )
+            self.unarchive(file, destination)
+        # Look at the content - if single directory, unwrap
+        children = list(destination.iterdir())
+        if len(children) == 1 and children[0].is_dir():
+            single_dir = children[0]
+            logger.info(
+                "Moving single directory %s into destination %s",
+                single_dir,
+                destination,
             )
-            shutil.move(str(path), str(destination))
-            shutil.rmtree(tmpdestination)
-        else:
-            shutil.move(tmpdestination, destination)
+            # Move contents up one level
+            tmp = destination.with_suffix(".unwrap")
+            shutil.move(str(single_dir), str(tmp))
+            shutil.rmtree(destination)
+            shutil.move(str(tmp), str(destination))
+    def unarchive(self, file, destination: Path):
+        raise NotImplementedError()
+    def _name(self, name: str) -> str:
+        raise NotImplementedError()
-class zipdownloader(ArchiveDownloader):
-    """ZIP Archive handler"""
+class ZipDownloader(ArchiveDownloader):
+    """ZIP Archive handler."""
     def _name(self, name):
         return re.sub(r"\.zip$", "", name)
     def unarchive(self, file, destination: Path):
-        logging.info("Unzipping file")
+        logger.info("Unzipping file")
         with zipfile.ZipFile(file.path) as zip:
             if self.extractall:
                 zip.extractall(destination)
             else:
                 for zip_info, name in self.filter(
-                    zip.infolist(), lambda zip_info: zip_info.filename
+                    zip.infolist(),
+                    lambda zip_info: zip_info.filename,
                 ):
                     if zip_info.is_dir():
                         (destination / name).mkdir()
                     else:
-                        logging.info(
+                        logger.info(
                             "File %s (%s) to %s",
                             zip_info.filename,
                             name,
                             destination / name,
                         )
-                        with zip.open(zip_info) as fp, (destination / name).open(
-                            "wb"
-                        ) as out:
+                        with (
+                            zip.open(zip_info) as fp,
+                            (destination / name).open("wb") as out,
+                        ):
                             shutil.copyfileobj(fp, out)
-class tardownloader(ArchiveDownloader):
-    """TAR archive handler"""
+class TarDownloader(ArchiveDownloader):
+    """TAR archive handler."""
     def _name(self, name):
         return re.sub(r"\.tar(\.gz|\.bz\|xz)?$", "", name)
     def unarchive(self, file: CachedFile, destination: Path):
-        logging.info("Unarchiving file")
+        logger.info("Unarchiving file")
         if self.subpath:
             raise NotImplementedError()
@@ -164,11 +200,19 @@ class tardownloader(ArchiveDownloader):
                     if info.isdir():
                         (destination / name).mkdir()
                     else:
-                        logging.info(
+                        logger.info(
                             "File %s (%s) to %s",
                             info.name,
                             name,
                             destination / name,
                         )
-                        logging.info("Extracting into %s", destination / name)
+                        logger.info(
+                            "Extracting into %s",
+                            destination / name,
+                        )
                         tar.extract(info, destination / name)
+# Factory aliases for backward compat and convenient usage
+zipdownloader = ZipDownloader.apply
+tardownloader = TarDownloader.apply

datamaestro/download/custom.py CHANGED Viewed

@@ -1,21 +1,53 @@
-from typing import Protocol
+"""Custom download resources.
+Provides a Resource subclass that delegates to a user-defined
+download function.
+"""
+from __future__ import annotations
 from pathlib import Path
+from typing import Protocol
 from datamaestro import Context
 from datamaestro.download import Resource
 class Downloader(Protocol):
-    def __call__(self, context: Context, root: Path, *, force=False):
+    def __call__(self, context: Context, root: Path, *, force: bool = False):
         pass
 class custom_download(Resource):
-    def __init__(self, varname: str, downloader: Downloader):
-        super().__init__(varname)
+    """A resource that delegates to a user-defined download function.
+    Usage as class attribute (preferred)::
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = custom_download(
+                "data", downloader=my_download_fn
+            )
+    Usage as decorator (deprecated)::
+        @custom_download("data", downloader=my_download_fn)
+        @dataset(Base)
+        def my_dataset(data): ...
+    """
+    def __init__(
+        self,
+        varname: str,
+        downloader: Downloader,
+        *,
+        transient: bool = False,
+    ):
+        super().__init__(varname=varname, transient=transient)
         self.downloader = downloader
     def prepare(self):
-        return self.definition.datapath
+        return self.dataset.datapath
     def download(self, force=False):
-        self.downloader(self.context, self.definition.datapath, force=force)
+        self.downloader(self.context, self.dataset.datapath, force=force)

datamaestro/download/huggingface.py CHANGED Viewed

@@ -1,27 +1,55 @@
+"""HuggingFace Hub download resources.
+Provides a ValueResource subclass for loading datasets from
+the HuggingFace Hub.
+"""
+from __future__ import annotations
 import logging
-from typing import Optional
-from datamaestro.download import Download
+from datamaestro.download import ValueResource
+logger = logging.getLogger(__name__)
+class HFDownloader(ValueResource):
+    """Load a dataset from the HuggingFace Hub.
+    Usage as class attribute (preferred)::
-class hf_download(Download):
-    """Use Hugging Face to download a file"""
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = HFDownloader.apply(
+                "hf_data", repo_id="user/dataset"
+            )
+    Usage as decorator (deprecated)::
+        @hf_download("hf_data", repo_id="user/dataset")
+        @dataset(Base)
+        def my_dataset(hf_data): ...
+    """
     def __init__(
         self,
         varname: str,
         repo_id: str,
         *,
-        data_files: Optional[str] = None,
-        split: Optional[str] = None
+        data_files: str | None = None,
+        split: str | None = None,
+        transient: bool = False,
     ):
-        """Use
+        """
         Args:
-            varname: Variable name
-            repo_id: The HuggingFace repository ID
+            varname: Variable name.
+            repo_id: The HuggingFace repository ID.
+            data_files: Specific data files to load.
+            split: Dataset split to load.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(varname)
+        super().__init__(varname=varname, transient=transient)
         self.repo_id = repo_id
         self.data_files = data_files
         self.split = split
@@ -30,11 +58,11 @@ class hf_download(Download):
         try:
             from datasets import load_dataset
         except ModuleNotFoundError:
-            logging.error("the datasets library is not installed:")
-            logging.error("pip install datasets")
+            logger.error("the datasets library is not installed:")
+            logger.error("pip install datasets")
             raise
-        self.dataset = load_dataset(self.repo_id, data_files=self.data_files)
+        self._dataset = load_dataset(self.repo_id, data_files=self.data_files)
         return True
     def prepare(self):
@@ -43,3 +71,7 @@ class hf_download(Download):
             "data_files": self.data_files,
             "split": self.split,
         }
+# Factory alias for backward compat
+hf_download = HFDownloader.apply

datamaestro/download/links.py CHANGED Viewed

@@ -1,29 +1,53 @@
+"""Link-based resources.
+Provides resources that create symlinks to other datasets or
+user-specified paths.
+"""
+from __future__ import annotations
 import logging
 import os
-from datamaestro.utils import deprecated
-from datamaestro.definitions import AbstractDataset
+from pathlib import Path
 from typing import List
-from datamaestro.download import Download
 from datamaestro.context import ResolvablePath
-from pathlib import Path
-import os
-import logging
+from datamaestro.definitions import AbstractDataset
+from datamaestro.download import Resource
+from datamaestro.utils import deprecated
+logger = logging.getLogger(__name__)
+class links(Resource):
+    """Link with another dataset path.
+    Usage as class attribute (preferred)::
-class links(Download):
-    def __init__(self, varname: str, **links: List[AbstractDataset]):
-        """Link with another dataset path
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = links("data", ref1=other_dataset1)
-        Args:
-            varname: The name of the variable when defining the dataset
-            links: A list of
-        """
-        super().__init__(varname)
-        self.links = links
+    Usage as decorator (deprecated)::
+        @links("data", ref1=other_dataset1)
+        @dataset(Base)
+        def my_dataset(data): ...
+    """
+    def __init__(
+        self,
+        varname: str,
+        *,
+        transient: bool = False,
+        **link_targets: List[AbstractDataset],
+    ):
+        super().__init__(varname=varname, transient=transient)
+        self.links = link_targets
     @property
     def path(self):
-        return self.definition.datapath
+        return self.dataset.datapath
     def prepare(self):
         return self.path
@@ -38,24 +62,36 @@ class links(Download):
             if not dest.exists():
                 if dest.is_symlink():
-                    logging.info("Removing dandling symlink %s", dest)
+                    logger.info("Removing dangling symlink %s", dest)
                     dest.unlink()
                 os.symlink(path, dest)
+    def has_files(self):
+        return False
 # Deprecated
 Links = deprecated("Use @links instead of @Links", links)
-class linkpath(Download):
-    def __init__(self, varname: str, proposals):
-        """Link to a folder
+class linkpath(Resource):
+    """Link to a path selected from proposals.
+    Usage as class attribute (preferred)::
-        Args:
-            varname: Name of the variable
-            proposals: List of potential paths
-        """
-        super().__init__(varname)
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = linkpath("data", proposals=[...])
+    """
+    def __init__(
+        self,
+        varname: str,
+        proposals,
+        *,
+        transient: bool = False,
+    ):
+        super().__init__(varname=varname, transient=transient)
         self.proposals = proposals
     def prepare(self):
@@ -63,62 +99,83 @@ class linkpath(Download):
     @property
     def path(self):
-        return self.definition.datapath / self.varname
+        return self.dataset.datapath / self.name
-    def download(self, destination):
+    def download(self, force=False):
         if self.check(self.path):
             return
         if self.path.is_symlink():
-            logging.warning("Removing dandling symlink %s", self.path)
+            logger.warning("Removing dangling symlink %s", self.path)
             self.path.unlink()
         path = None
         for searchpath in self.proposals:
-            logging.info("Trying path %s", searchpath)
+            logger.info("Trying path %s", searchpath)
             try:
                 path = ResolvablePath.resolve(self.context, searchpath)
                 if self.check(path):
                     break
-                logging.info("Path %s not found", path)
+                logger.info("Path %s not found", path)
             except KeyError:
-                logging.info("Could not expand path %s", searchpath)
+                logger.info("Could not expand path %s", searchpath)
         # Ask the user
         while path is None or not self.check(path):
-            path = Path(input("Path to %s: " % self.varname))
+            path = Path(input("Path to %s: " % self.name))
         assert path.name
-        logging.debug("Linking %s to %s", path, self.path)
+        logger.debug("Linking %s to %s", path, self.path)
         self.path.parent.mkdir(exist_ok=True, parents=True)
         os.symlink(path, self.path)
+    def check(self, path):
+        raise NotImplementedError()
 class linkfolder(linkpath):
+    """Link to a folder.
+    Usage as class attribute::
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = linkfolder("data", proposals=[...])
+    """
+    def __init__(
+        self,
+        varname: str,
+        proposals,
+        *,
+        transient: bool = False,
+    ):
+        super().__init__(varname, proposals, transient=transient)
     def check(self, path):
         return path.is_dir()
-    def __init__(self, varname: str, proposals):
-        """Link to a folder
-        Args:
-            varname: Name of the variable
-            proposals: List of potential paths
-        """
-        super().__init__(varname, proposals)
+class linkfile(linkpath):
+    """Link to a file.
+    Usage as class attribute::
-class linkfile(linkpath):
-    def __init__(self, varname: str, proposals):
-        """Link to a file
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = linkfile("data", proposals=[...])
+    """
-        Args:
-            varname: Name of the variable
-            proposals: List of potential paths
-        """
-        super().__init__(varname, proposals)
+    def __init__(
+        self,
+        varname: str,
+        proposals,
+        *,
+        transient: bool = False,
+    ):
+        super().__init__(varname, proposals, transient=transient)
     def check(self, path):
-        print("Checking", path, path.is_file())
+        logger.debug("Checking %s (exists: %s)", path, path.is_file())
         return path.is_file()

datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

datamaestro 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl