datamaestro 0.8.1__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamaestro/__init__.py +11 -7
- datamaestro/__main__.py +29 -8
- datamaestro/annotations/__init__.py +1 -1
- datamaestro/annotations/agreement.py +9 -3
- datamaestro/commands/site.py +27 -15
- datamaestro/context.py +143 -87
- datamaestro/data/__init__.py +23 -11
- datamaestro/data/csv.py +12 -12
- datamaestro/data/huggingface.py +25 -0
- datamaestro/data/ml.py +19 -10
- datamaestro/data/tensor.py +32 -24
- datamaestro/definitions.py +492 -131
- datamaestro/download/__init__.py +610 -24
- datamaestro/download/archive.py +129 -77
- datamaestro/download/custom.py +53 -0
- datamaestro/download/huggingface.py +77 -0
- datamaestro/download/links.py +106 -50
- datamaestro/download/multiple.py +27 -5
- datamaestro/download/single.py +114 -51
- datamaestro/download/sync.py +0 -1
- datamaestro/download/todo.py +9 -4
- datamaestro/download/wayback.py +164 -0
- datamaestro/record.py +232 -0
- datamaestro/registry.py +1 -0
- datamaestro/search.py +1 -1
- datamaestro/settings.py +3 -1
- datamaestro/sphinx.py +224 -0
- datamaestro/stream/__init__.py +0 -2
- datamaestro/stream/lines.py +10 -7
- datamaestro/templates/dataset.py +5 -4
- datamaestro/test/__init__.py +3 -1
- datamaestro/test/checks.py +1 -5
- datamaestro/test/conftest.py +1 -6
- datamaestro/test/test_annotations.py +2 -2
- datamaestro/test/test_download_handlers.py +3 -4
- datamaestro/test/test_record.py +72 -0
- datamaestro/test/test_resource.py +1388 -0
- datamaestro/utils.py +15 -9
- datamaestro/v2.md +301 -0
- datamaestro/version.py +4 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/METADATA +72 -104
- datamaestro-1.7.0.dist-info/RECORD +49 -0
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/WHEEL +1 -2
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info}/entry_points.txt +0 -1
- datamaestro/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-38.pyc +0 -0
- datamaestro/__pycache__/__main__.cpython-39.pyc +0 -0
- datamaestro/__pycache__/context.cpython-38.pyc +0 -0
- datamaestro/__pycache__/context.cpython-39.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-38.pyc +0 -0
- datamaestro/__pycache__/definitions.cpython-39.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-38.pyc +0 -0
- datamaestro/__pycache__/registry.cpython-39.pyc +0 -0
- datamaestro/__pycache__/search.cpython-38.pyc +0 -0
- datamaestro/__pycache__/search.cpython-39.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-38.pyc +0 -0
- datamaestro/__pycache__/settings.cpython-39.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro/__pycache__/utils.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-38.pyc +0 -0
- datamaestro/annotations/__pycache__/agreement.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-38.pyc +0 -0
- datamaestro/commands/__pycache__/site.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/csv.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/ml.cpython-39.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-38.pyc +0 -0
- datamaestro/data/__pycache__/tensor.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/archive.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/links.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/manual.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/multiple.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/single.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-38.pyc +0 -0
- datamaestro/download/__pycache__/sync.cpython-39.pyc +0 -0
- datamaestro/download/__pycache__/todo.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/compress.cpython-39.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-38.pyc +0 -0
- datamaestro/stream/__pycache__/lines.cpython-39.pyc +0 -0
- datamaestro/templates/__pycache__/dataset.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/__init__.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-38.pyc +0 -0
- datamaestro/test/__pycache__/checks.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/conftest.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_annotations.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.0.1.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-38-pytest-6.2.0.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39-pytest-6.2.4.pyc +0 -0
- datamaestro/test/__pycache__/test_download_handlers.cpython-39.pyc +0 -0
- datamaestro/test/__pycache__/utils.cpython-38.pyc +0 -0
- datamaestro-0.8.1.dist-info/RECORD +0 -109
- datamaestro-0.8.1.dist-info/top_level.txt +0 -1
- {datamaestro-0.8.1.dist-info → datamaestro-1.7.0.dist-info/licenses}/LICENSE +0 -0
datamaestro/download/multiple.py
CHANGED
@@ -1,13 +1,31 @@
+"""Multiple download resources (legacy).
+
+Note: This module uses a legacy API pattern and needs deeper refactoring.
+The List and Datasets classes use an older constructor signature that
+differs from the modern Resource interface.
+"""
+
 import logging
-from pathlib import Path
 import os
+import warnings
+from pathlib import Path
 
-from datamaestro import AbstractDataset
+from datamaestro.definitions import AbstractDataset
 from datamaestro.download import Download
 
+warnings.warn(
+    "datamaestro.download.multiple uses a legacy API. "
+    "Consider migrating to class-attribute resource definitions.",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
 
 class List(Download):
-    """Download multiple files or directories given by a list"""
+    """Download multiple files or directories given by a list.
+
+    Legacy: uses old-style constructor API.
+    """
 
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -32,7 +50,10 @@ class List(Download):
 
 
 class Datasets(Download):
-    """Use links to dataset files"""
+    """Use links to dataset files.
+
+    Legacy: uses old-style constructor API.
+    """
 
     def __init__(self, dataset: AbstractDataset, definition: object):
         super().__init__(dataset, definition)
@@ -48,7 +69,8 @@ class Datasets(Download):
         if isinstance(files, Path):
             if not files.is_dir():
                 raise AssertionError(
-                    "Dataset path is not a directory: %s",
+                    "Dataset path is not a directory: %s",
+                    files,
                 )
             path = destination / key
             if not path.exists():
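The DeprecationWarning above fires once, at first import of the module. Downstream code that must keep importing the legacy module can scope the warning away at the import site using the standard library alone (a minimal sketch; nothing here is datamaestro-specific):

    import warnings

    with warnings.catch_warnings():
        # The module-level warnings.warn(...) runs on first import only
        warnings.simplefilter("ignore", DeprecationWarning)
        from datamaestro.download import multiple  # noqa: F401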
datamaestro/download/single.py
CHANGED
@@ -1,58 +1,83 @@
+"""Single file download resources.
+
+Provides FileResource subclasses for downloading individual files
+from URLs, with optional transforms and integrity checking.
+"""
+
+from __future__ import annotations
+
+import io
+import gzip
 import logging
+import os
+import os.path as op
 import shutil
 import tarfile
-import io
-import tempfile
-import gzip
-import os.path as op, os
-import urllib3
 from pathlib import Path
-
-import re
-
-from datamaestro.download import Download
+
+import urllib3
+
+from datamaestro.download import FileResource
 from datamaestro.stream import Transform
-from datamaestro.utils import copyfileobjs
+from datamaestro.utils import copyfileobjs
+
+logger = logging.getLogger(__name__)
 
 
 def open_ext(*args, **kwargs):
-    """Opens a file according to its extension"""
+    """Opens a file according to its extension."""
     name = args[0]
     if name.endswith(".gz"):
         return gzip.open(*args, *kwargs)
     return io.open(*args, **kwargs)
 
 
-class SingleDownload(Download):
-    def __init__(self, filename: str):
-        super().__init__(re.sub(r"\..*$", "", filename))
-        self.name = filename
+class FileDownloader(FileResource):
+    """Downloads a single file from a URL.
 
-    @property
-    def path(self):
-        return self.definition.datapath / self.name
+    Supports optional transforms (e.g., gzip decompression)
+    and integrity checking.
 
-    def prepare(self):
-        return self.path
+    Usage as class attribute (preferred)::
 
-    def download(self, force=False):
-        if not self.path.is_file():
-            self._download(self.path)
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = FileDownloader.apply(
+                "data.csv", "http://example.com/data.csv.gz"
+            )
 
+    Usage as decorator (deprecated)::
+
+        @filedownloader("data.csv", "http://example.com/data.csv.gz")
+        @dataset(Base)
+        def my_dataset(data): ...
+    """
 
-class filedownloader(SingleDownload):
     def __init__(
-        self, filename: str, url: str, size: int = None, transforms: Transform = None, checker=None,
+        self,
+        filename: str,
+        url: str,
+        size: int | None = None,
+        transforms: Transform | None = None,
+        checker=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
     ):
-        """
-
+        """
         Args:
-            filename: The filename within the data folder; the variable
-                name corresponds to the filename without the extension
-            url: The URL to download
-            size: size in bytes (or None)
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            size: Expected size in bytes (or None).
+            transforms: Transform the file before storing it.
+                Auto-detected from URL path if None.
+            checker: File integrity checker.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(filename)
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.checker = checker
         self.size = size
@@ -61,8 +86,8 @@ class filedownloader(SingleDownload):
         path = Path(Path(p.path).name)
         self.transforms = transforms if transforms else Transform.createFromPath(path)
 
-    def _download(self, destination):
-        logging.info("Downloading %s into %s", self.url, destination)
+    def _download(self, destination: Path) -> None:
+        logger.info("Downloading %s into %s", self.url, destination)
 
         # Creates directory if needed
         dir = op.dirname(destination)
@@ -72,41 +97,69 @@ class filedownloader(SingleDownload):
         with self.context.downloadURL(self.url, size=self.size) as file:
             # Transform if need be
             if self.transforms:
-                logging.info("Transforming file")
-                with self.transforms(file.path.open("rb")) as stream, destination.open(
-                    "wb"
-                ) as out:
+                logger.info("Transforming file")
+                with (
+                    self.transforms(file.path.open("rb")) as stream,
+                    destination.open("wb") as out,
+                ):
                     if self.checker:
                         copyfileobjs(stream, [out, self.checker])
                         self.checker.close()
                     else:
                         shutil.copyfileobj(stream, out)
             else:
-                logging.info("Keeping original downloaded file %s", file.path)
+                logger.info("Keeping original downloaded file %s", file.path)
                 if self.checker:
                     self.checker.check(file.path)
                 (shutil.copy if file.keep else shutil.move)(file.path, destination)
 
-        logging.info("Created file %s", destination)
+        logger.info("Created file %s", destination)
+
 
+# Factory alias for backward compat and convenient usage
+filedownloader = FileDownloader.apply
 
-class concatdownload(SingleDownload):
-    """Concatenate all files in an archive"""
+class ConcatDownloader(FileResource):
+    """Concatenate all files from an archive into a single file.
 
-    def __init__(self, filename: str, url: str, transforms=None):
-        """
+    Usage as class attribute (preferred)::
+
+        @dataset(url="...")
+        class MyDataset(Base):
+            DATA = ConcatDownloader.apply(
+                "data.txt", "http://example.com/data.tar.gz"
+            )
+    """
+
+    def __init__(
+        self,
+        filename: str,
+        url: str,
+        transforms=None,
+        *,
+        varname: str | None = None,
+        transient: bool = False,
+    ):
+        """
         Args:
-            filename: The filename within the data folder; the variable
-                name corresponds to the filename without the extension
-            url: The URL to download
+            filename: The filename within the data folder; the variable
+                name corresponds to the filename without the extension.
+            url: The URL to download.
+            transforms: Transform the file before storing it.
+            varname: Explicit resource name.
+            transient: If True, data can be deleted after dependents
+                complete.
         """
-        super().__init__(filename)
+        super().__init__(filename, varname=varname, transient=transient)
         self.url = url
         self.transforms = transforms
 
-    def _download(self, destination):
-        with self.context.downloadURL(self.url) as dl, tarfile.open(dl.path) as archive:
+    def _download(self, destination: Path) -> None:
+        with (
+            self.context.downloadURL(self.url) as dl,
+            tarfile.open(dl.path) as archive,
+        ):
             destination.parent.mkdir(parents=True, exist_ok=True)
 
             with open(destination, "wb") as out:
@@ -115,6 +168,16 @@ class concatdownload(SingleDownload):
                     transforms = self.transforms or Transform.createFromPath(
                         Path(tarinfo.name)
                     )
-                    logging.debug("Processing file %s", tarinfo.name)
+                    logger.debug("Processing file %s", tarinfo.name)
                     with transforms(archive.fileobject(archive, tarinfo)) as fp:
                         shutil.copyfileobj(fp, out)
+
+
+# Factory alias for backward compat
+concatdownload = ConcatDownloader.apply
+
+
+# --- Backward compat aliases ---
+# Keep old class names importable but they now point to new classes
+
+SingleDownload = FileDownloader
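In the integrity-checking branch of FileDownloader._download, copyfileobjs fans a single input stream out to several sinks at once: the destination file plus the checker, which receives the same bytes through its write() method before close() finalizes the check. A minimal sketch of what such a helper plausibly looks like (the real signature in datamaestro.utils may differ):

    def copyfileobjs(src, dsts, length=64 * 1024):
        # Read once, write the same chunk to every destination
        while chunk := src.read(length):
            for dst in dsts:
                dst.write(chunk)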
datamaestro/download/sync.py
CHANGED
datamaestro/download/todo.py
CHANGED
@@ -1,10 +1,15 @@
-from pathlib import Path
+from datamaestro.download import Resource
 
-from datamaestro.download import Download
 
+class Todo(Resource):
+    """Placeholder resource indicating download is not yet implemented."""
 
-class Todo(Download):
-    def download(self, destination: Path):
+    def download(self, force=False):
         raise NotImplementedError(
             "Download method not defined - please edit the definition file"
         )
+
+    def prepare(self):
+        raise NotImplementedError(
+            "Prepare method not defined - please edit the definition file"
+        )
datamaestro/download/wayback.py
ADDED
@@ -0,0 +1,164 @@
+import logging
+import json
+from datamaestro.download import Resource
+from typing import Callable, Iterator
+from pathlib import Path
+import requests
+import random
+import re
+from requests.exceptions import HTTPError
+from tqdm.auto import tqdm
+import time
+import urllib.parse
+import uuid
+
+
+wayback_prefix = re.compile(r"^https:\/\/web\.archive\.org\/web")
+replace_pattern = re.compile(r"(web\.archive\.org\/web\/\d+)")
+
+
+def download_with_retry(url: str, max_retries: int = 10) -> requests.Response:
+    """Download a URL with exponential backoff, until max_retries is reached."""
+    retry_num = 0
+    while True:
+        try:
+            response = requests.get(url)
+            response.raise_for_status()
+            return response
+        except HTTPError as e:
+            status_code = e.response.status_code
+            if not (status_code == 429 or status_code >= 500):
+                # This is not an error we should retry on
+                raise e
+
+            if retry_num > max_retries:
+                logging.error(
+                    f"Failed to perform GET request on {url}"
+                    f"after {max_retries} retries."
+                )
+                raise e
+
+            if status_code == 429:
+                time.sleep(5 + 2**retry_num + random.randint(0, 1000) / 1000)
+            else:
+                time.sleep(2**retry_num + random.randint(0, 1000) / 1000)
+            retry_num += 1
+
+
+def download_link(link: str, timestamp: str):
+    page_id = str(uuid.uuid4())
+    url_no_header = None
+
+    try:
+        # Find the Wayback Machine link
+        if not wayback_prefix.match(link):
+            link_encoded = urllib.parse.quote(link)
+
+            available, availability_attempt = False, 0
+            # Sometimes the API returns HTTP success code 200, but archived
+            # snapshots shows page is unavailable when it actually is. Give it a
+            # total of three tries.
+            while not available and availability_attempt < 3:
+                response = download_with_retry(
+                    "http://archive.org/wayback/available?"
+                    f"url={link_encoded}&timestamp={timestamp}"
+                )
+                json_response = response.json()
+                available = "closest" in json_response["archived_snapshots"]
+                availability_attempt += 1
+
+            if not available:
+                logging.warning(
+                    f"Not available on Wayback Machine: {link}, "
+                    f"HTTP code {response.status_code}, {json_response}"
+                )
+                return {"link": link, "page_id": page_id, "available": False}
+
+            url = json_response["archived_snapshots"]["closest"]["url"]
+        else:
+            url = link
+
+        match = replace_pattern.search(url)
+        assert match
+        url_no_header = replace_pattern.sub(f"{match.group(1)}id_", url)
+
+        response = download_with_retry(url_no_header)
+        html_page = response.text
+
+        return {
+            "link": link,
+            "id": url_no_header,
+            "contents": html_page,
+        }
+
+    except HTTPError as http_err:
+        logging.warning(f"HTTP error occurred: {http_err} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": http_err.response.status_code if http_err.response else None,
+            "wayback_url": url_no_header,
+        }
+    except UnicodeDecodeError as e:
+        logging.warning(f"Unicode decode error occurred: {e} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": response.status_code,
+            "wayback_url": url_no_header,
+        }
+    except Exception as e:
+        logging.warning(f"Exception occurred: {e} for {link}")
+        return {
+            "link": link,
+            "page_id": page_id,
+            "available": False,
+            "status_code": None,
+            "wayback_url": url_no_header,
+        }
+
+
+class wayback_documents(Resource):
+    """Collect documents from wayback"""
+
+    def __init__(self, timestamp: str, urls_fn: Callable[[], Iterator[str]], name=None):
+        super().__init__(name)
+        self.timestamp = timestamp
+        self.urls_fn = urls_fn
+
+    def prepare(self):
+        return self.dataset.datapath / self.name
+
+    def download(self, force=False):
+        # Creates directory if needed
+        destination: Path = self.dataset.datapath / self.name
+        self.dataset.datapath.mkdir(exist_ok=True)
+
+        # Early exit
+        done_path = destination.with_suffix(".done")
+        if done_path.is_file() and not force:
+            return True
+
+        # Reads the URLs
+        logging.info("Retrieving URLs from wayback into %s", destination)
+        pos = 0
+        urls = set()
+        with destination.open("at+") as fp:
+            fp.seek(0)
+            try:
+                while line := fp.readline():
+                    pos = fp.tell()
+                    urls.add(json.loads(line)["url"])
+            except json.JSONDecodeError:
+                logging.warning(f"JSON decoding error: getting back to position {pos}")
+                fp.seek(pos)
+
+            # Get the remaining ones
+            for url in tqdm(self.urls_fn()):
+                if url not in urls:
+                    fp.write(json.dumps(download_link(url, self.timestamp)))
+
+            # Everything is fine
+            done_path.touch()
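download_with_retry sleeps for an exponentially growing, jittered interval between attempts, with a flat 5-second penalty when the server answers HTTP 429. A standalone sketch of that delay schedule (it mirrors the sleep computation above; not part of the package API):

    import random

    def backoff_delay(retry_num: int, throttled: bool) -> float:
        # Up to one second of random jitter avoids synchronized retries
        jitter = random.randint(0, 1000) / 1000
        delay = 2**retry_num + jitter
        # Rate-limited responses (HTTP 429) get an extra flat 5 seconds
        return 5 + delay if throttled else delay

    # First delays without throttling: ~1s, ~2s, ~4s, ~8s (plus jitter)
    for n in range(4):
        print(n, round(backoff_delay(n, throttled=False), 3))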