lazyscribe-arrow 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@ lazyscribe_arrow/__init__.py
+ """Import the custom artifact handlers."""
+
+ from lazyscribe_arrow.csv import CSVArtifact
+ from lazyscribe_arrow.parquet import ParquetArtifact
+
+ __all__: list[str] = ["CSVArtifact", "ParquetArtifact"]
@@ -0,0 +1,3 @@ lazyscribe_arrow/_meta.py
+ """Version."""
+
+ __version__ = "0.3.0"
@@ -0,0 +1,111 @@ lazyscribe_arrow/csv.py
+ """Custom artifact handlers for CSVs."""
+
+ import logging
+ from datetime import datetime
+ from typing import Any, ClassVar
+
+ import pyarrow as pa
+ from attrs import define
+ from lazyscribe._utils import utcnow
+ from lazyscribe.artifacts.base import Artifact
+ from pyarrow import csv
+ from pyarrow.interchange import from_dataframe
+ from slugify import slugify
+
+ from lazyscribe_arrow.protocols import (
+     ArrowArrayExportable,
+     ArrowStreamExportable,
+     SupportsInterchange,
+ )
+
+ LOG = logging.getLogger(__name__)
+
+
+ @define(auto_attribs=True)
+ class CSVArtifact(Artifact):
+     """Arrow-powered CSV handler."""
+
+     alias: ClassVar[str] = "csv"
+     suffix: ClassVar[str] = "csv"
+     binary: ClassVar[bool] = True
+     output_only: ClassVar[bool] = False
+
+     @classmethod
+     def construct(
+         cls,
+         name: str,
+         value: Any | None = None,
+         fname: str | None = None,
+         created_at: datetime | None = None,
+         writer_kwargs: dict | None = None,
+         version: int = 0,
+         dirty: bool = True,
+         **kwargs,
+     ):
+         """Construct the handler class."""
+         created_at = created_at or utcnow()
+
+         return cls(  # type: ignore[call-arg]
+             name=name,
+             value=value,
+             fname=fname
+             or f"{slugify(name)}-{slugify(created_at.strftime('%Y%m%d%H%M%S'))}.{cls.suffix}",
+             writer_kwargs=writer_kwargs or {},
+             version=version,
+             created_at=created_at,
+             dirty=dirty,
+         )
+
+     @classmethod
+     def read(cls, buf, **kwargs) -> pa.Table:
+         """Read in the CSV file.
+
+         Parameters
+         ----------
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for the read method.
+
+         Returns
+         -------
+         pyarrow.lib.Table
+             A ``pyarrow`` table with the data.
+         """
+         return csv.read_csv(buf, **kwargs)
+
+     @classmethod
+     def write(cls, obj, buf, **kwargs):
+         """Write the CSV file using pyarrow.
+
+         Parameters
+         ----------
+         obj : object
+             The object to write.
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for :py:func:`pyarrow.csv.write_csv`.
+
+         Raises
+         ------
+         ValueError
+             Raised if the supplied object does not have ``__arrow_c_array__``
+             or ``__arrow_c_stream__`` attributes and does not implement the
+             dataframe interchange protocol. These attributes allow us to perform
+             a zero-copy transformation from the native object to a PyArrow Table.
+         """
+         if isinstance(obj, pa.Table):
+             LOG.debug("Provided object is already a PyArrow table.")
+         elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
+             obj = pa.table(obj)
+         elif isinstance(obj, SupportsInterchange):
+             obj = from_dataframe(obj)
+         else:
+             raise ValueError(
+                 f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
+                 "Please provide an object that implements the Arrow PyCapsule Interface or the "
+                 "Dataframe Interchange Protocol."
+             )
+
+         csv.write_csv(obj, buf, **kwargs)
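The dispatch in `write` above accepts a `pyarrow.Table` as-is and coerces PyCapsule- or interchange-compatible objects before serializing. A minimal sketch (not part of the package) of a round trip through the `read`/`write` classmethods with an in-memory buffer; in normal use these classmethods are invoked through `lazyscribe` when an artifact is logged:

```python
# Minimal round trip through the handler's classmethods using an in-memory
# buffer instead of an fsspec file handle.
import io

import pyarrow as pa

from lazyscribe_arrow.csv import CSVArtifact

table = pa.Table.from_pylist([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

buf = io.BytesIO()
CSVArtifact.write(table, buf)  # already a pa.Table, so no coercion happens
buf.seek(0)

round_tripped = CSVArtifact.read(buf)
assert round_tripped.equals(table)
```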
@@ -0,0 +1,88 @@ lazyscribe_arrow/interchange.py
+ """Define methods for generating a PyArrow table from a project and/or repository."""
+
+ import copy
+ import itertools
+ from functools import singledispatch
+
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ from lazyscribe import Project, Repository
+
+
+ @singledispatch
+ def to_table(obj, /) -> pa.Table:
+     """Convert a lazyscribe Project or Repository to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Project | lazyscribe.Repository
+         The object to convert.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+
+
+ @to_table.register(Project)
+ def _(obj: Project, /) -> pa.Table:
+     """Convert a lazyscribe Project to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Project
+         A lazyscribe project.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+     raw_ = pa.Table.from_pylist(list(obj))
+     for name in ["created_at", "last_updated"]:
+         col_index_ = raw_.column_names.index(name)
+         new_ = pc.assume_timezone(
+             raw_.column(name).cast(pa.timestamp("s")), timezone="UTC"
+         )
+
+         raw_ = raw_.set_column(
+             col_index_, pa.field(name, pa.timestamp("s", tz="UTC")), new_
+         )
+
+     return raw_
+
+
+ @to_table.register(Repository)
+ def _(obj: Repository, /) -> pa.Table:
+     """Convert a lazyscribe Repository to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Repository
+         A lazyscribe Repository.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+     # Need to create a unified schema -- get the total list of fields across handlers
+     raw_data_ = list(obj)
+     all_fields_ = set(itertools.chain.from_iterable([art.keys() for art in raw_data_]))
+     parsed_data_: list[dict] = []
+     for art in raw_data_:
+         parsed_data_.append(copy.copy(art))
+         for new_field_ in all_fields_.difference(set(art.keys())):
+             parsed_data_[-1][new_field_] = None
+
+     table_ = pa.Table.from_pylist(parsed_data_)
+     # make ``created_at`` a timezone-aware timestamp column
+     col_index_ = table_.column_names.index("created_at")
+     new_ = pc.assume_timezone(
+         table_.column("created_at").cast(pa.timestamp("s")), timezone="UTC"
+     )
+
+     return table_.set_column(
+         col_index_, pa.field("created_at", pa.timestamp("s", tz="UTC")), new_
+     )
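Both registered implementations cast `created_at` (and, for projects, `last_updated`) to timezone-aware, second-resolution timestamps, so the converted table can be filtered with `pyarrow.compute` expressions. A minimal sketch, assuming the standard `lazyscribe` `Experiment.log_metric` API just to populate the project; the metric name and cutoff date are arbitrary examples:

```python
# Filter the converted table on its UTC-aware ``created_at`` column.
from datetime import datetime, timezone

import pyarrow.compute as pc
from lazyscribe import Project

from lazyscribe_arrow.interchange import to_table

project = Project("project.json", mode="w")
with project.log("My experiment") as exp:
    exp.log_metric("auc", 0.9)  # dummy metric so the project has one experiment

experiments = to_table(project)

# The tz-aware cutoff compares cleanly against the timestamp("s", tz="UTC") column.
cutoff = datetime(2024, 1, 1, tzinfo=timezone.utc)
recent = experiments.filter(pc.field("created_at") >= cutoff)
print(recent.num_rows)
```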
@@ -0,0 +1,113 @@ lazyscribe_arrow/parquet.py
+ """Custom artifact handlers for parquets."""
+
+ import logging
+ from datetime import datetime
+ from typing import Any, ClassVar
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from attrs import define
+ from lazyscribe._utils import utcnow
+ from lazyscribe.artifacts.base import Artifact
+ from pyarrow.interchange import from_dataframe
+ from slugify import slugify
+
+ from lazyscribe_arrow.protocols import (
+     ArrowArrayExportable,
+     ArrowStreamExportable,
+     SupportsInterchange,
+ )
+
+ LOG = logging.getLogger(__name__)
+
+
+ @define(auto_attribs=True)
+ class ParquetArtifact(Artifact):
+     """Arrow-powered Parquet handler."""
+
+     alias: ClassVar[str] = "parquet"
+     suffix: ClassVar[str] = "parquet"
+     binary: ClassVar[bool] = True
+     output_only: ClassVar[bool] = False
+
+     @classmethod
+     def construct(
+         cls,
+         name: str,
+         value: Any | None = None,
+         fname: str | None = None,
+         created_at: datetime | None = None,
+         writer_kwargs: dict | None = None,
+         version: int = 0,
+         dirty: bool = True,
+         **kwargs,
+     ):
+         """Construct the handler class."""
+         created_at = created_at or utcnow()
+
+         return cls(  # type: ignore[call-arg]
+             name=name,
+             value=value,
+             fname=fname
+             or f"{slugify(name)}-{slugify(created_at.strftime('%Y%m%d%H%M%S'))}.{cls.suffix}",
+             writer_kwargs=writer_kwargs or {},
+             version=version,
+             created_at=created_at,
+             dirty=dirty,
+         )
+
+     @classmethod
+     def read(cls, buf, **kwargs) -> pa.Table:
+         """Read in the parquet file.
+
+         Parameters
+         ----------
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for the read method.
+
+         Returns
+         -------
+         pyarrow.lib.Table
+             A ``pyarrow`` table with the data.
+         """
+         return pq.read_table(buf, **kwargs)
+
+     @classmethod
+     def write(cls, obj, buf, **kwargs):
+         """Write the parquet file using pyarrow.
+
+         Parameters
+         ----------
+         obj : object
+             The object to write.
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for :py:func:`pyarrow.parquet.write_table`.
+
+         Raises
+         ------
+         ValueError
+             Raised if the supplied object does not have ``__arrow_c_array__``
+             or ``__arrow_c_stream__`` attributes and does not implement the
+             dataframe interchange protocol. These attributes allow us to perform
+             a zero-copy transformation from the native object to a PyArrow Table.
+         """
+         if isinstance(obj, pa.Table):
+             LOG.debug("Provided object is already a PyArrow table.")
+         elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
+             obj = pa.table(obj)
+         elif isinstance(obj, SupportsInterchange):
+             obj = from_dataframe(obj)
+         else:
+             raise ValueError(
+                 f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
+                 "Please provide an object that implements the Arrow PyCapsule Interface or the "
+                 "Dataframe Interchange Protocol."
+             )
+
+         pq.write_table(obj, buf, **kwargs)
@@ -0,0 +1,38 @@ lazyscribe_arrow/protocols.py
+ """Arrow exportable protocols."""
+
+ from __future__ import annotations
+
+ from typing import Protocol, runtime_checkable
+
+
+ @runtime_checkable
+ class ArrowArrayExportable(Protocol):
+     """Type protocol for Arrow C Data Interface via Arrow PyCapsule Interface."""
+
+     def __arrow_c_array__(
+         self, requested_schema: object | None = None
+     ) -> tuple[object, object]:
+         """Export the object as a pair of ArrowSchema and ArrowArray structures."""
+         ...
+
+
+ @runtime_checkable
+ class ArrowStreamExportable(Protocol):
+     """Type protocol for Arrow C Stream Interface via Arrow PyCapsule Interface."""
+
+     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
+         """Export the object as an ArrowArrayStream."""
+         ...
+
+
+ @runtime_checkable
+ class SupportsInterchange(Protocol):
+     """Dataframe that supports conversion into an interchange dataframe object."""
+
+     def __dataframe__(
+         self,
+         nan_as_null: bool = False,
+         allow_copy: bool = True,
+     ) -> SupportsInterchange:
+         """Convert to a dataframe object implementing the dataframe interchange protocol."""
+         ...
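Because these protocols are decorated with `runtime_checkable`, the `isinstance` checks in `csv.py` and `parquet.py` only verify that the relevant dunder method exists on the object. A quick sketch of those checks against a `pyarrow.Table`, which exports both interfaces in the pyarrow versions this package requires (>= 14.0.1):

```python
# The runtime-checkable protocols only test for the presence of the dunder methods.
import pyarrow as pa

from lazyscribe_arrow.protocols import ArrowStreamExportable, SupportsInterchange

table = pa.table({"a": [1, 2, 3]})

# pyarrow Tables export the Arrow C stream (PyCapsule) interface ...
print(isinstance(table, ArrowStreamExportable))  # True
# ... and also implement the dataframe interchange protocol.
print(isinstance(table, SupportsInterchange))  # True
```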
@@ -0,0 +1,122 @@ lazyscribe_arrow-0.3.0.dist-info/METADATA
+ Metadata-Version: 2.3
+ Name: lazyscribe-arrow
+ Version: 0.3.0
+ Summary: Arrow-based artifact handlers for Lazyscribe
+ Author: Akshay Gupta
+ Author-email: Akshay Gupta <akgcodes@gmail.com>
+ License: MIT license
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: attrs>=21.2,<=25.3
+ Requires-Dist: lazyscribe>=1,<=1.2
+ Requires-Dist: pyarrow>=14.0.1,<=22
+ Requires-Dist: python-slugify>=5,<=8.0.4
+ Requires-Dist: commitizen ; extra == 'build'
+ Requires-Dist: uv ; extra == 'build'
+ Requires-Dist: lazyscribe-arrow[build] ; extra == 'dev'
+ Requires-Dist: lazyscribe-arrow[qa] ; extra == 'dev'
+ Requires-Dist: lazyscribe-arrow[tests] ; extra == 'dev'
+ Requires-Dist: edgetest ; extra == 'qa'
+ Requires-Dist: mypy ; extra == 'qa'
+ Requires-Dist: pre-commit ; extra == 'qa'
+ Requires-Dist: pyproject-fmt ; extra == 'qa'
+ Requires-Dist: ruff ; extra == 'qa'
+ Requires-Dist: types-python-slugify ; extra == 'qa'
+ Requires-Dist: uv ; extra == 'qa'
+ Requires-Dist: pandas ; extra == 'tests'
+ Requires-Dist: pytest ; extra == 'tests'
+ Requires-Dist: pytest-cov ; extra == 'tests'
+ Requires-Dist: time-machine ; extra == 'tests'
+ Requires-Python: >=3.10.0
+ Project-URL: Documentation, https://github.com/lazyscribe/lazyscribe-arrow
+ Project-URL: Repository, https://github.com/lazyscribe/lazyscribe-arrow
+ Provides-Extra: build
+ Provides-Extra: dev
+ Provides-Extra: qa
+ Provides-Extra: tests
+ Description-Content-Type: text/markdown
+
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![PyPI](https://img.shields.io/pypi/v/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![codecov](https://codecov.io/gh/lazyscribe/lazyscribe-arrow/graph/badge.svg?token=W5TPK7GX7G)](https://codecov.io/gh/lazyscribe/lazyscribe-arrow)
+
+ # Arrow-based artifact handling for lazyscribe
+
+ `lazyscribe-arrow` is a lightweight package that adds the following artifact handlers for `lazyscribe`:
+
+ * `csv`, and
+ * `parquet`.
+
+ Any data structure that implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
+ will be compatible with the handlers in this library. Popular compatible open source data structures include
+
+ * `pandas.DataFrame`
+ * `polars.DataFrame`
+ * `polars.LazyFrame`
+
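For instance, a `pandas.DataFrame` can be logged directly; the handler converts it through the interchange or PyCapsule protocol before writing. A minimal sketch (not part of the packaged README), assuming a pandas version recent enough to implement the dataframe interchange protocol:

```python
import pandas as pd
from lazyscribe import Project

project = Project("project.json", mode="w")
with project.log("My experiment") as exp:
    # A plain pandas DataFrame; the parquet handler coerces it to a pyarrow Table.
    frame = pd.DataFrame({"a": [0, 1, 2]})
    exp.log_artifact(name="data", value=frame, handler="parquet")

project.save()
```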
+ This library also adds interchange methods to construct a `pyarrow.Table` from `lazyscribe.Project` and `lazyscribe.Repository` objects.
+
+ # Installation
+
+ Python 3.10 or above is required. Use `pip` to install:
+
+ ```console
+ $ python -m pip install lazyscribe-arrow
+ ```
+
+ # Usage
+
+ ## Artifact handlers
+
+ To use this library, simply log an artifact to a `lazyscribe` experiment or repository with
+
+ * `handler="csv"` for a CSV output
+ * `handler="parquet"` for a Parquet output
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Project
+
+ project = Project("project.json", mode="w")
+ with project.log("My experiment") as exp:
+     data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+     exp.log_artifact(name="data", value=data, handler="csv")
+
+ project.save()
+ ```
+
+ ## Interchange
+
+ To convert your `lazyscribe.Project` to a `pyarrow.Table` object, call `lazyscribe_arrow.interchange.to_table`:
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Project
+ from lazyscribe_arrow.interchange import to_table
+
+ project = Project("project.json", mode="w")
+ with project.log("My experiment") as exp:
+     data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+     exp.log_artifact(name="data", value=data, handler="csv")
+
+ table = to_table(project)
+ ```
+
+ The same function works for `lazyscribe.Repository` objects.
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Repository
+ from lazyscribe_arrow.interchange import to_table
+
+ repo = Repository("repository.json", mode="w")
+
+ data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+ repo.log_artifact(name="data", value=data, handler="csv")
+
+ table = to_table(repo)
+ ```
@@ -0,0 +1,10 @@ lazyscribe_arrow-0.3.0.dist-info/RECORD
+ lazyscribe_arrow/__init__.py,sha256=YwnXVqIllCJKZakHtTtDsWh_raw0HGqG8lgsUfCH9FQ,199
+ lazyscribe_arrow/_meta.py,sha256=QHt38lU8oHwr8cMphAXLsOCbzqRA4rNJOqW2akosRDM,38
+ lazyscribe_arrow/csv.py,sha256=55GIciGtui9sd2l0GdhSXpGaLTenMwxtUbJjboqMDi4,3278
+ lazyscribe_arrow/interchange.py,sha256=MeGLkPkyfJyl8ec2PSUyJBygsFmlY9lJll0o9nr-zuo,2361
+ lazyscribe_arrow/parquet.py,sha256=C_MofzAUqG5UlOG8NW4odlK_rTm0a4HjlhOZed94Row,3413
+ lazyscribe_arrow/protocols.py,sha256=VsG6t1em4qsTRwrDvph1aQdTKjFXCjjAcuvwFuay-8Y,1147
+ lazyscribe_arrow-0.3.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ lazyscribe_arrow-0.3.0.dist-info/entry_points.txt,sha256=OZeI9uVR1xkCYghXAwtsGStl5ItRE3UZ-BgabZufeyc,106
+ lazyscribe_arrow-0.3.0.dist-info/METADATA,sha256=mHMpulq3bK8ezgnDWQ7a7SefRIGkeIhfAsfZxCvREt8,4261
+ lazyscribe_arrow-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@ lazyscribe_arrow-0.3.0.dist-info/WHEEL
+ Wheel-Version: 1.0
+ Generator: uv 0.8.24
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,4 @@ lazyscribe_arrow-0.3.0.dist-info/entry_points.txt
+ [lazyscribe.artifact_type]
+ csv = lazyscribe_arrow:CSVArtifact
+ parquet = lazyscribe_arrow:ParquetArtifact
+
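These entry points in the `lazyscribe.artifact_type` group are what map the `handler="csv"` and `handler="parquet"` strings to the artifact classes at runtime. A small sketch showing how the registered handlers can be listed with the standard library once the wheel is installed:

```python
# List the artifact handlers registered under the ``lazyscribe.artifact_type``
# group; requires lazyscribe-arrow to be installed in the environment.
from importlib.metadata import entry_points

for ep in entry_points(group="lazyscribe.artifact_type"):
    print(f"{ep.name} -> {ep.value}")
# Expected to include:
#   csv -> lazyscribe_arrow:CSVArtifact
#   parquet -> lazyscribe_arrow:ParquetArtifact
```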