lazyscribe-arrow 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@ lazyscribe_arrow/__init__.py
+ """Import the custom artifact handlers."""
+
+ from lazyscribe_arrow.csv import CSVArtifact
+ from lazyscribe_arrow.parquet import ParquetArtifact
+
+ __all__: list[str] = ["CSVArtifact", "ParquetArtifact"]
@@ -0,0 +1,3 @@ lazyscribe_arrow/_meta.py
+ """Version."""
+
+ __version__ = "0.3.0"
@@ -0,0 +1,111 @@ lazyscribe_arrow/csv.py
+ """Custom artifact handlers for CSVs."""
+
+ import logging
+ from datetime import datetime
+ from typing import Any, ClassVar
+
+ import pyarrow as pa
+ from attrs import define
+ from lazyscribe._utils import utcnow
+ from lazyscribe.artifacts.base import Artifact
+ from pyarrow import csv
+ from pyarrow.interchange import from_dataframe
+ from slugify import slugify
+
+ from lazyscribe_arrow.protocols import (
+     ArrowArrayExportable,
+     ArrowStreamExportable,
+     SupportsInterchange,
+ )
+
+ LOG = logging.getLogger(__name__)
+
+
+ @define(auto_attribs=True)
+ class CSVArtifact(Artifact):
+     """Arrow-powered CSV handler."""
+
+     alias: ClassVar[str] = "csv"
+     suffix: ClassVar[str] = "csv"
+     binary: ClassVar[bool] = True
+     output_only: ClassVar[bool] = False
+
+     @classmethod
+     def construct(
+         cls,
+         name: str,
+         value: Any | None = None,
+         fname: str | None = None,
+         created_at: datetime | None = None,
+         writer_kwargs: dict | None = None,
+         version: int = 0,
+         dirty: bool = True,
+         **kwargs,
+     ):
+         """Construct the handler class."""
+         created_at = created_at or utcnow()
+
+         return cls(  # type: ignore[call-arg]
+             name=name,
+             value=value,
+             fname=fname
+             or f"{slugify(name)}-{slugify(created_at.strftime('%Y%m%d%H%M%S'))}.{cls.suffix}",
+             writer_kwargs=writer_kwargs or {},
+             version=version,
+             created_at=created_at,
+             dirty=dirty,
+         )
+
+     @classmethod
+     def read(cls, buf, **kwargs) -> pa.Table:
+         """Read in the CSV file.
+
+         Parameters
+         ----------
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for the read method.
+
+         Returns
+         -------
+         pyarrow.lib.Table
+             A ``pyarrow`` table with the data.
+         """
+         return csv.read_csv(buf, **kwargs)
+
+     @classmethod
+     def write(cls, obj, buf, **kwargs):
+         """Write the CSV file using pyarrow.
+
+         Parameters
+         ----------
+         obj : object
+             The object to write.
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for :py:func:`pyarrow.csv.write_csv`.
+
+         Raises
+         ------
+         ValueError
+             Raised if the supplied object does not have ``__arrow_c_array__``
+             or ``__arrow_c_stream__`` attributes and does not implement the
+             dataframe interchange protocol. These attributes allow us to perform
+             a zero-copy transformation from the native object to a PyArrow Table.
+         """
+         if isinstance(obj, pa.Table):
+             LOG.debug("Provided object is already a PyArrow table.")
+         elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
+             obj = pa.table(obj)
+         elif isinstance(obj, SupportsInterchange):
+             obj = from_dataframe(obj)
+         else:
+             raise ValueError(
+                 f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
+                 "Please provide an object that implements the Arrow PyCapsule Interface or the "
+                 "Dataframe Interchange Protocol."
+             )
+
+         csv.write_csv(obj, buf, **kwargs)
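The dispatch in `write` above accepts a `pyarrow.Table` as-is and coerces PyCapsule- or interchange-compatible objects before serializing. A minimal sketch (not part of the package) of a round trip through the `read`/`write` classmethods with an in-memory buffer; in normal use these classmethods are invoked through `lazyscribe` when an artifact is logged:

```python
# Minimal round trip through the handler's classmethods using an in-memory
# buffer instead of an fsspec file handle.
import io

import pyarrow as pa

from lazyscribe_arrow.csv import CSVArtifact

table = pa.Table.from_pylist([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

buf = io.BytesIO()
CSVArtifact.write(table, buf)  # already a pa.Table, so no coercion happens
buf.seek(0)

round_tripped = CSVArtifact.read(buf)
assert round_tripped.equals(table)
```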
@@ -0,0 +1,88 @@ lazyscribe_arrow/interchange.py
+ """Define methods for generating a PyArrow table from a project and/or repository."""
+
+ import copy
+ import itertools
+ from functools import singledispatch
+
+ import pyarrow as pa
+ import pyarrow.compute as pc
+ from lazyscribe import Project, Repository
+
+
+ @singledispatch
+ def to_table(obj, /) -> pa.Table:
+     """Convert a lazyscribe Project or Repository to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Project | lazyscribe.Repository
+         The object to convert.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+
+
+ @to_table.register(Project)
+ def _(obj: Project, /) -> pa.Table:
+     """Convert a lazyscribe Project to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Project
+         A lazyscribe project.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+     raw_ = pa.Table.from_pylist(list(obj))
+     for name in ["created_at", "last_updated"]:
+         col_index_ = raw_.column_names.index(name)
+         new_ = pc.assume_timezone(
+             raw_.column(name).cast(pa.timestamp("s")), timezone="UTC"
+         )
+
+         raw_ = raw_.set_column(
+             col_index_, pa.field(name, pa.timestamp("s", tz="UTC")), new_
+         )
+
+     return raw_
+
+
+ @to_table.register(Repository)
+ def _(obj: Repository, /) -> pa.Table:
+     """Convert a lazyscribe Repository to a PyArrow table.
+
+     Parameters
+     ----------
+     obj : lazyscribe.Repository
+         A lazyscribe Repository.
+
+     Returns
+     -------
+     pyarrow.Table
+         The PyArrow table.
+     """
+     # Need to create a unified schema -- get the total list of fields across handlers
+     raw_data_ = list(obj)
+     all_fields_ = set(itertools.chain.from_iterable([art.keys() for art in raw_data_]))
+     parsed_data_: list[dict] = []
+     for art in raw_data_:
+         parsed_data_.append(copy.copy(art))
+         for new_field_ in all_fields_.difference(set(art.keys())):
+             parsed_data_[-1][new_field_] = None
+
+     table_ = pa.Table.from_pylist(parsed_data_)
+     # make ``created_at`` a timezone-aware timestamp column
+     col_index_ = table_.column_names.index("created_at")
+     new_ = pc.assume_timezone(
+         table_.column("created_at").cast(pa.timestamp("s")), timezone="UTC"
+     )
+
+     return table_.set_column(
+         col_index_, pa.field("created_at", pa.timestamp("s", tz="UTC")), new_
+     )
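Both registered implementations cast `created_at` (and, for projects, `last_updated`) to timezone-aware, second-resolution timestamps, so the converted table can be filtered with `pyarrow.compute` expressions. A minimal sketch, assuming the standard `lazyscribe` `Experiment.log_metric` API just to populate the project; the metric name and cutoff date are arbitrary examples:

```python
# Filter the converted table on its UTC-aware ``created_at`` column.
from datetime import datetime, timezone

import pyarrow.compute as pc
from lazyscribe import Project

from lazyscribe_arrow.interchange import to_table

project = Project("project.json", mode="w")
with project.log("My experiment") as exp:
    exp.log_metric("auc", 0.9)  # dummy metric so the project has one experiment

experiments = to_table(project)

# The tz-aware cutoff compares cleanly against the timestamp("s", tz="UTC") column.
cutoff = datetime(2024, 1, 1, tzinfo=timezone.utc)
recent = experiments.filter(pc.field("created_at") >= cutoff)
print(recent.num_rows)
```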
@@ -0,0 +1,113 @@ lazyscribe_arrow/parquet.py
+ """Custom artifact handlers for parquets."""
+
+ import logging
+ from datetime import datetime
+ from typing import Any, ClassVar
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ from attrs import define
+ from lazyscribe._utils import utcnow
+ from lazyscribe.artifacts.base import Artifact
+ from pyarrow.interchange import from_dataframe
+ from slugify import slugify
+
+ from lazyscribe_arrow.protocols import (
+     ArrowArrayExportable,
+     ArrowStreamExportable,
+     SupportsInterchange,
+ )
+
+ LOG = logging.getLogger(__name__)
+
+
+ @define(auto_attribs=True)
+ class ParquetArtifact(Artifact):
+     """Arrow-powered Parquet handler."""
+
+     alias: ClassVar[str] = "parquet"
+     suffix: ClassVar[str] = "parquet"
+     binary: ClassVar[bool] = True
+     output_only: ClassVar[bool] = False
+
+     @classmethod
+     def construct(
+         cls,
+         name: str,
+         value: Any | None = None,
+         fname: str | None = None,
+         created_at: datetime | None = None,
+         writer_kwargs: dict | None = None,
+         version: int = 0,
+         dirty: bool = True,
+         **kwargs,
+     ):
+         """Construct the handler class."""
+         created_at = created_at or utcnow()
+
+         return cls(  # type: ignore[call-arg]
+             name=name,
+             value=value,
+             fname=fname
+             or f"{slugify(name)}-{slugify(created_at.strftime('%Y%m%d%H%M%S'))}.{cls.suffix}",
+             writer_kwargs=writer_kwargs or {},
+             version=version,
+             created_at=created_at,
+             dirty=dirty,
+         )
+
+     @classmethod
+     def read(cls, buf, **kwargs) -> pa.Table:
+         """Read in the parquet file.
+
+         Parameters
+         ----------
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for the read method.
+
+         Returns
+         -------
+         pyarrow.lib.Table
+             A ``pyarrow`` table with the data.
+         """
+         return pq.read_table(buf, **kwargs)
+
+     @classmethod
+     def write(cls, obj, buf, **kwargs):
+         """Write the parquet file using pyarrow.
+
+         Parameters
+         ----------
+         obj : object
+             The object to write.
+         buf : file-like object
+             The buffer from a ``fsspec`` filesystem.
+         **kwargs
+             Keyword arguments for :py:func:`pyarrow.parquet.write_table`.
+
+         Raises
+         ------
+         ValueError
+             Raised if the supplied object does not have ``__arrow_c_array__``
+             or ``__arrow_c_stream__`` attributes and does not implement the
+             dataframe interchange protocol. These attributes allow us to perform
+             a zero-copy transformation from the native object to a PyArrow Table.
+         """
+         if isinstance(obj, pa.Table):
+             LOG.debug("Provided object is already a PyArrow table.")
+         elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
+             obj = pa.table(obj)
+         elif isinstance(obj, SupportsInterchange):
+             obj = from_dataframe(obj)
+         else:
+             raise ValueError(
+                 f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
+                 "Please provide an object that implements the Arrow PyCapsule Interface or the "
+                 "Dataframe Interchange Protocol."
+             )
+
+         pq.write_table(obj, buf, **kwargs)
@@ -0,0 +1,38 @@ lazyscribe_arrow/protocols.py
+ """Arrow exportable protocols."""
+
+ from __future__ import annotations
+
+ from typing import Protocol, runtime_checkable
+
+
+ @runtime_checkable
+ class ArrowArrayExportable(Protocol):
+     """Type protocol for Arrow C Data Interface via Arrow PyCapsule Interface."""
+
+     def __arrow_c_array__(
+         self, requested_schema: object | None = None
+     ) -> tuple[object, object]:
+         """Export the object as a pair of ArrowSchema and ArrowArray structures."""
+         ...
+
+
+ @runtime_checkable
+ class ArrowStreamExportable(Protocol):
+     """Type protocol for Arrow C Stream Interface via Arrow PyCapsule Interface."""
+
+     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
+         """Export the object as an ArrowArrayStream."""
+         ...
+
+
+ @runtime_checkable
+ class SupportsInterchange(Protocol):
+     """Dataframe that supports conversion into an interchange dataframe object."""
+
+     def __dataframe__(
+         self,
+         nan_as_null: bool = False,
+         allow_copy: bool = True,
+     ) -> SupportsInterchange:
+         """Convert to a dataframe object implementing the dataframe interchange protocol."""
+         ...
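Because these protocols are decorated with `runtime_checkable`, the `isinstance` checks in `csv.py` and `parquet.py` only verify that the relevant dunder method exists on the object. A quick sketch of those checks against a `pyarrow.Table`, which exports both interfaces in the pyarrow versions this package requires (>= 14.0.1):

```python
# The runtime-checkable protocols only test for the presence of the dunder methods.
import pyarrow as pa

from lazyscribe_arrow.protocols import ArrowStreamExportable, SupportsInterchange

table = pa.table({"a": [1, 2, 3]})

# pyarrow Tables export the Arrow C stream (PyCapsule) interface ...
print(isinstance(table, ArrowStreamExportable))  # True
# ... and also implement the dataframe interchange protocol.
print(isinstance(table, SupportsInterchange))  # True
```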
@@ -0,0 +1,122 @@ lazyscribe_arrow-0.3.0.dist-info/METADATA
+ Metadata-Version: 2.3
+ Name: lazyscribe-arrow
+ Version: 0.3.0
+ Summary: Arrow-based artifact handlers for Lazyscribe
+ Author: Akshay Gupta
+ Author-email: Akshay Gupta <akgcodes@gmail.com>
+ License: MIT license
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Natural Language :: English
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Requires-Dist: attrs>=21.2,<=25.3
+ Requires-Dist: lazyscribe>=1,<=1.2
+ Requires-Dist: pyarrow>=14.0.1,<=22
+ Requires-Dist: python-slugify>=5,<=8.0.4
+ Requires-Dist: commitizen ; extra == 'build'
+ Requires-Dist: uv ; extra == 'build'
+ Requires-Dist: lazyscribe-arrow[build] ; extra == 'dev'
+ Requires-Dist: lazyscribe-arrow[qa] ; extra == 'dev'
+ Requires-Dist: lazyscribe-arrow[tests] ; extra == 'dev'
+ Requires-Dist: edgetest ; extra == 'qa'
+ Requires-Dist: mypy ; extra == 'qa'
+ Requires-Dist: pre-commit ; extra == 'qa'
+ Requires-Dist: pyproject-fmt ; extra == 'qa'
+ Requires-Dist: ruff ; extra == 'qa'
+ Requires-Dist: types-python-slugify ; extra == 'qa'
+ Requires-Dist: uv ; extra == 'qa'
+ Requires-Dist: pandas ; extra == 'tests'
+ Requires-Dist: pytest ; extra == 'tests'
+ Requires-Dist: pytest-cov ; extra == 'tests'
+ Requires-Dist: time-machine ; extra == 'tests'
+ Requires-Python: >=3.10.0
+ Project-URL: Documentation, https://github.com/lazyscribe/lazyscribe-arrow
+ Project-URL: Repository, https://github.com/lazyscribe/lazyscribe-arrow
+ Provides-Extra: build
+ Provides-Extra: dev
+ Provides-Extra: qa
+ Provides-Extra: tests
+ Description-Content-Type: text/markdown
+
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![PyPI](https://img.shields.io/pypi/v/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![codecov](https://codecov.io/gh/lazyscribe/lazyscribe-arrow/graph/badge.svg?token=W5TPK7GX7G)](https://codecov.io/gh/lazyscribe/lazyscribe-arrow)
+
+ # Arrow-based artifact handling for lazyscribe
+
+ `lazyscribe-arrow` is a lightweight package that adds the following artifact handlers for `lazyscribe`:
+
+ * `csv`, and
+ * `parquet`.
+
+ Any data structure that implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
+ will be compatible with the handlers in this library. Popular compatible open source data structures include
+
+ * `pandas.DataFrame`
+ * `polars.DataFrame`
+ * `polars.LazyFrame`
+
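For instance, a `pandas.DataFrame` can be logged directly; the handler converts it through the interchange or PyCapsule protocol before writing. A minimal sketch (not part of the packaged README), assuming a pandas version recent enough to implement the dataframe interchange protocol:

```python
import pandas as pd
from lazyscribe import Project

project = Project("project.json", mode="w")
with project.log("My experiment") as exp:
    # A plain pandas DataFrame; the parquet handler coerces it to a pyarrow Table.
    frame = pd.DataFrame({"a": [0, 1, 2]})
    exp.log_artifact(name="data", value=frame, handler="parquet")

project.save()
```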
+ This library also adds interchange methods to construct a `pyarrow.Table` from `lazyscribe.Project` and `lazyscribe.Repository` objects.
+
+ # Installation
+
+ Python 3.10 or above is required. Use `pip` to install:
+
+ ```console
+ $ python -m pip install lazyscribe-arrow
+ ```
+
+ # Usage
+
+ ## Artifact handlers
+
+ To use this library, simply log an artifact to a `lazyscribe` experiment or repository with
+
+ * `handler="csv"` for a CSV output
+ * `handler="parquet"` for a Parquet output
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Project
+
+ project = Project("project.json", mode="w")
+ with project.log("My experiment") as exp:
+     data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+     exp.log_artifact(name="data", value=data, handler="csv")
+
+ project.save()
+ ```
+
+ ## Interchange
+
+ To convert your `lazyscribe.Project` to a `pyarrow.Table` object, call `lazyscribe_arrow.interchange.to_table`:
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Project
+ from lazyscribe_arrow.interchange import to_table
+
+ project = Project("project.json", mode="w")
+ with project.log("My experiment") as exp:
+     data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+     exp.log_artifact(name="data", value=data, handler="csv")
+
+ table = to_table(project)
+ ```
+
+ The same function works for `lazyscribe.Repository` objects.
+
+ ```python
+ import pyarrow as pa
+ from lazyscribe import Repository
+ from lazyscribe_arrow.interchange import to_table
+
+ repo = Repository("repository.json", mode="w")
+
+ data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
+ repo.log_artifact(name="data", value=data, handler="csv")
+
+ table = to_table(repo)
+ ```
@@ -0,0 +1,10 @@ lazyscribe_arrow-0.3.0.dist-info/RECORD
+ lazyscribe_arrow/__init__.py,sha256=YwnXVqIllCJKZakHtTtDsWh_raw0HGqG8lgsUfCH9FQ,199
+ lazyscribe_arrow/_meta.py,sha256=QHt38lU8oHwr8cMphAXLsOCbzqRA4rNJOqW2akosRDM,38
+ lazyscribe_arrow/csv.py,sha256=55GIciGtui9sd2l0GdhSXpGaLTenMwxtUbJjboqMDi4,3278
+ lazyscribe_arrow/interchange.py,sha256=MeGLkPkyfJyl8ec2PSUyJBygsFmlY9lJll0o9nr-zuo,2361
+ lazyscribe_arrow/parquet.py,sha256=C_MofzAUqG5UlOG8NW4odlK_rTm0a4HjlhOZed94Row,3413
+ lazyscribe_arrow/protocols.py,sha256=VsG6t1em4qsTRwrDvph1aQdTKjFXCjjAcuvwFuay-8Y,1147
+ lazyscribe_arrow-0.3.0.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ lazyscribe_arrow-0.3.0.dist-info/entry_points.txt,sha256=OZeI9uVR1xkCYghXAwtsGStl5ItRE3UZ-BgabZufeyc,106
+ lazyscribe_arrow-0.3.0.dist-info/METADATA,sha256=mHMpulq3bK8ezgnDWQ7a7SefRIGkeIhfAsfZxCvREt8,4261
+ lazyscribe_arrow-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@ lazyscribe_arrow-0.3.0.dist-info/WHEEL
+ Wheel-Version: 1.0
+ Generator: uv 0.8.24
+ Root-Is-Purelib: true
+ Tag: py3-none-any
@@ -0,0 +1,4 @@ lazyscribe_arrow-0.3.0.dist-info/entry_points.txt
+ [lazyscribe.artifact_type]
+ csv = lazyscribe_arrow:CSVArtifact
+ parquet = lazyscribe_arrow:ParquetArtifact
+
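These entry points in the `lazyscribe.artifact_type` group are what map the `handler="csv"` and `handler="parquet"` strings to the artifact classes at runtime. A small sketch showing how the registered handlers can be listed with the standard library once the wheel is installed:

```python
# List the artifact handlers registered under the ``lazyscribe.artifact_type``
# group; requires lazyscribe-arrow to be installed in the environment.
from importlib.metadata import entry_points

for ep in entry_points(group="lazyscribe.artifact_type"):
    print(f"{ep.name} -> {ep.value}")
# Expected to include:
#   csv -> lazyscribe_arrow:CSVArtifact
#   parquet -> lazyscribe_arrow:ParquetArtifact
```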