lazyscribe-arrow 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lazyscribe-arrow might be problematic. Click here for more details.
- lazyscribe_arrow/__init__.py +2 -1
- lazyscribe_arrow/_meta.py +1 -1
- lazyscribe_arrow/csv.py +12 -2
- lazyscribe_arrow/parquet.py +113 -0
- lazyscribe_arrow/protocols.py +38 -0
- lazyscribe_arrow-0.2.1.dist-info/METADATA +85 -0
- lazyscribe_arrow-0.2.1.dist-info/RECORD +9 -0
- lazyscribe_arrow-0.2.1.dist-info/WHEEL +4 -0
- {lazyscribe_arrow-0.1.0.dist-info → lazyscribe_arrow-0.2.1.dist-info}/entry_points.txt +2 -0
- lazyscribe_arrow-0.1.0.dist-info/METADATA +0 -44
- lazyscribe_arrow-0.1.0.dist-info/RECORD +0 -8
- lazyscribe_arrow-0.1.0.dist-info/WHEEL +0 -5
- lazyscribe_arrow-0.1.0.dist-info/top_level.txt +0 -1
lazyscribe_arrow/__init__.py
CHANGED
lazyscribe_arrow/_meta.py
CHANGED
lazyscribe_arrow/csv.py
CHANGED
|
@@ -9,8 +9,15 @@ from attrs import define
|
|
|
9
9
|
from lazyscribe._utils import utcnow
|
|
10
10
|
from lazyscribe.artifacts.base import Artifact
|
|
11
11
|
from pyarrow import csv
|
|
12
|
+
from pyarrow.interchange import from_dataframe
|
|
12
13
|
from slugify import slugify
|
|
13
14
|
|
|
15
|
+
from lazyscribe_arrow.protocols import (
|
|
16
|
+
ArrowArrayExportable,
|
|
17
|
+
ArrowStreamExportable,
|
|
18
|
+
SupportsInterchange,
|
|
19
|
+
)
|
|
20
|
+
|
|
14
21
|
LOG = logging.getLogger(__name__)
|
|
15
22
|
|
|
16
23
|
|
|
@@ -90,12 +97,15 @@ class CSVArtifact(Artifact):
|
|
|
90
97
|
"""
|
|
91
98
|
if isinstance(obj, pa.Table):
|
|
92
99
|
LOG.debug("Provided object is already a PyArrow table.")
|
|
93
|
-
elif
|
|
100
|
+
elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
|
|
94
101
|
obj = pa.table(obj)
|
|
102
|
+
elif isinstance(obj, SupportsInterchange):
|
|
103
|
+
obj = from_dataframe(obj)
|
|
95
104
|
else:
|
|
96
105
|
raise ValueError(
|
|
97
106
|
f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
|
|
98
|
-
"Please provide an object that implements the Arrow PyCapsule Interface
|
|
107
|
+
"Please provide an object that implements the Arrow PyCapsule Interface or the "
|
|
108
|
+
"Dataframe Interchange Protocol."
|
|
99
109
|
)
|
|
100
110
|
|
|
101
111
|
csv.write_csv(obj, buf, **kwargs)
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Custom artifact handlers for parquets."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Any, ClassVar
|
|
6
|
+
|
|
7
|
+
import pyarrow as pa
|
|
8
|
+
import pyarrow.parquet as pq
|
|
9
|
+
from attrs import define
|
|
10
|
+
from lazyscribe._utils import utcnow
|
|
11
|
+
from lazyscribe.artifacts.base import Artifact
|
|
12
|
+
from pyarrow.interchange import from_dataframe
|
|
13
|
+
from slugify import slugify
|
|
14
|
+
|
|
15
|
+
from lazyscribe_arrow.protocols import (
|
|
16
|
+
ArrowArrayExportable,
|
|
17
|
+
ArrowStreamExportable,
|
|
18
|
+
SupportsInterchange,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
LOG = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@define(auto_attribs=True)
|
|
25
|
+
class ParquetArtifact(Artifact):
|
|
26
|
+
"""Arrow-powered Parquet handler."""
|
|
27
|
+
|
|
28
|
+
alias: ClassVar[str] = "parquet"
|
|
29
|
+
suffix: ClassVar[str] = "parquet"
|
|
30
|
+
binary: ClassVar[bool] = True
|
|
31
|
+
output_only: ClassVar[bool] = False
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
def construct(
|
|
35
|
+
cls,
|
|
36
|
+
name: str,
|
|
37
|
+
value: Any | None = None,
|
|
38
|
+
fname: str | None = None,
|
|
39
|
+
created_at: datetime | None = None,
|
|
40
|
+
writer_kwargs: dict | None = None,
|
|
41
|
+
version: int = 0,
|
|
42
|
+
dirty: bool = True,
|
|
43
|
+
**kwargs,
|
|
44
|
+
):
|
|
45
|
+
"""Construct the handler class."""
|
|
46
|
+
created_at = created_at or utcnow()
|
|
47
|
+
|
|
48
|
+
return cls( # type: ignore[call-arg]
|
|
49
|
+
name=name,
|
|
50
|
+
value=value,
|
|
51
|
+
fname=fname
|
|
52
|
+
or f"{slugify(name)}-{slugify(created_at.strftime('%Y%m%d%H%M%S'))}.{cls.suffix}",
|
|
53
|
+
writer_kwargs=writer_kwargs or {},
|
|
54
|
+
version=version,
|
|
55
|
+
created_at=created_at,
|
|
56
|
+
dirty=dirty,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
def read(cls, buf, **kwargs) -> pa.Table:
|
|
61
|
+
"""Read in the parquet file.
|
|
62
|
+
|
|
63
|
+
Parameters
|
|
64
|
+
----------
|
|
65
|
+
buf : file-like object
|
|
66
|
+
The buffer from a ``fsspec`` filesystem.
|
|
67
|
+
**kwargs
|
|
68
|
+
Keyword arguments for the read method.
|
|
69
|
+
|
|
70
|
+
Returns
|
|
71
|
+
-------
|
|
72
|
+
pyarrow.lib.Table
|
|
73
|
+
A ``pyarrow`` table with the data.
|
|
74
|
+
"""
|
|
75
|
+
return pq.read_table(buf, **kwargs)
|
|
76
|
+
|
|
77
|
+
@classmethod
|
|
78
|
+
def write(cls, obj, buf, **kwargs):
|
|
79
|
+
"""Write the parquet file using pyarrow.
|
|
80
|
+
|
|
81
|
+
Parameters
|
|
82
|
+
----------
|
|
83
|
+
obj : object
|
|
84
|
+
The object to write.
|
|
85
|
+
buf : file-like object
|
|
86
|
+
The buffer from a ``fsspec`` filesystem.
|
|
87
|
+
**kwargs
|
|
88
|
+
Keyword arguments for :py:meth:`pyarrow.parquet.write_table`.
|
|
89
|
+
|
|
90
|
+
Raises
|
|
91
|
+
------
|
|
92
|
+
ValueError
|
|
93
|
+
Raised if the supplied object does not have ``__arrow_c_array__``
|
|
94
|
+
or ``__arrow_c_stream__`` attribute
|
|
95
|
+
or if the object does not
|
|
96
|
+
implement the dataframe interchange protocol. These attributes allow us to
|
|
97
|
+
perform a zero-copy transformation from the native object to a PyArrow
|
|
98
|
+
Table.
|
|
99
|
+
"""
|
|
100
|
+
if isinstance(obj, pa.Table):
|
|
101
|
+
LOG.debug("Provided object is already a PyArrow table.")
|
|
102
|
+
elif isinstance(obj, (ArrowArrayExportable, ArrowStreamExportable)):
|
|
103
|
+
obj = pa.table(obj)
|
|
104
|
+
elif isinstance(obj, SupportsInterchange):
|
|
105
|
+
obj = from_dataframe(obj)
|
|
106
|
+
else:
|
|
107
|
+
raise ValueError(
|
|
108
|
+
f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
|
|
109
|
+
"Please provide an object that implements the Arrow PyCapsule Interface or the "
|
|
110
|
+
"Dataframe Interchange Protocol."
|
|
111
|
+
)
|
|
112
|
+
|
|
113
|
+
pq.write_table(obj, buf, **kwargs)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Arrow exportable protocols."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@runtime_checkable
|
|
9
|
+
class ArrowArrayExportable(Protocol):
|
|
10
|
+
"""Type protocol for Arrow C Data Interface via Arrow PyCapsule Interface."""
|
|
11
|
+
|
|
12
|
+
def __arrow_c_array__(
|
|
13
|
+
self, requested_schema: object | None = None
|
|
14
|
+
) -> tuple[object, object]:
|
|
15
|
+
"""Export the object as a pair of ArrowSchema and ArrowArray structures."""
|
|
16
|
+
...
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@runtime_checkable
|
|
20
|
+
class ArrowStreamExportable(Protocol):
|
|
21
|
+
"""Type protocol for Arrow C Stream Interface via Arrow PyCapsule Interface."""
|
|
22
|
+
|
|
23
|
+
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
|
|
24
|
+
"""Export the object as an ArrowArrayStream."""
|
|
25
|
+
...
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@runtime_checkable
|
|
29
|
+
class SupportsInterchange(Protocol):
|
|
30
|
+
"""Dataframe that supports conversion into an interchange dataframe object."""
|
|
31
|
+
|
|
32
|
+
def __dataframe__(
|
|
33
|
+
self,
|
|
34
|
+
nan_as_null: bool = False,
|
|
35
|
+
allow_copy: bool = True,
|
|
36
|
+
) -> SupportsInterchange:
|
|
37
|
+
"""Convert to a dataframe object implementing the dataframe interchange protocol."""
|
|
38
|
+
...
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: lazyscribe-arrow
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Arrow-based artifact handlers for Lazyscribe
|
|
5
|
+
Author: Akshay Gupta
|
|
6
|
+
Author-email: Akshay Gupta <akgcodes@gmail.com>
|
|
7
|
+
License: MIT license
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Dist: attrs>=21.2,<=25.3
|
|
17
|
+
Requires-Dist: lazyscribe>=1,<=1.2
|
|
18
|
+
Requires-Dist: pyarrow>=14.0.1,<=21
|
|
19
|
+
Requires-Dist: python-slugify>=5,<=8.0.4
|
|
20
|
+
Requires-Dist: commitizen ; extra == 'build'
|
|
21
|
+
Requires-Dist: uv ; extra == 'build'
|
|
22
|
+
Requires-Dist: lazyscribe-arrow[build] ; extra == 'dev'
|
|
23
|
+
Requires-Dist: lazyscribe-arrow[qa] ; extra == 'dev'
|
|
24
|
+
Requires-Dist: lazyscribe-arrow[tests] ; extra == 'dev'
|
|
25
|
+
Requires-Dist: edgetest ; extra == 'qa'
|
|
26
|
+
Requires-Dist: mypy ; extra == 'qa'
|
|
27
|
+
Requires-Dist: pre-commit ; extra == 'qa'
|
|
28
|
+
Requires-Dist: pyproject-fmt ; extra == 'qa'
|
|
29
|
+
Requires-Dist: ruff ; extra == 'qa'
|
|
30
|
+
Requires-Dist: types-python-slugify ; extra == 'qa'
|
|
31
|
+
Requires-Dist: uv ; extra == 'qa'
|
|
32
|
+
Requires-Dist: pandas ; extra == 'tests'
|
|
33
|
+
Requires-Dist: pytest ; extra == 'tests'
|
|
34
|
+
Requires-Dist: pytest-cov ; extra == 'tests'
|
|
35
|
+
Requires-Dist: time-machine ; extra == 'tests'
|
|
36
|
+
Requires-Python: >=3.10.0
|
|
37
|
+
Project-URL: Documentation, https://github.com/lazyscribe/lazyscribe-arrow
|
|
38
|
+
Project-URL: Repository, https://github.com/lazyscribe/lazyscribe-arrow
|
|
39
|
+
Provides-Extra: build
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Provides-Extra: qa
|
|
42
|
+
Provides-Extra: tests
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
[](LICENSE) [](https://pypi.org/project/lazyscribe-arrow/) [](https://pypi.org/project/lazyscribe-arrow/) [](https://codecov.io/gh/lazyscribe/lazyscribe-arrow)
|
|
46
|
+
|
|
47
|
+
# Arrow-based artifact handling for lazyscribe
|
|
48
|
+
|
|
49
|
+
`lazyscribe-arrow` is a lightweight package that adds the following artifact handlers for `lazyscribe`:
|
|
50
|
+
|
|
51
|
+
* `csv`
|
|
52
|
+
|
|
53
|
+
Any data structure that implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
|
|
54
|
+
will be compatible with the handlers in this library. Popular compatible open source data structures include:
|
|
55
|
+
|
|
56
|
+
* `pandas.DataFrame`
|
|
57
|
+
* `polars.DataFrame`
|
|
58
|
+
* `polars.LazyFrame`
|
|
59
|
+
|
|
60
|
+
# Installation
|
|
61
|
+
|
|
62
|
+
Python 3.10 or above is required. Use `pip` to install:
|
|
63
|
+
|
|
64
|
+
```console
|
|
65
|
+
$ python -m pip install lazyscribe-arrow
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
# Usage
|
|
69
|
+
|
|
70
|
+
To use this library, simply log an artifact to a `lazyscribe` experiment or repository with
|
|
71
|
+
|
|
72
|
+
* `handler="csv"` for a CSV output
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
import pyarrow as pa
|
|
77
|
+
from lazyscribe import Project
|
|
78
|
+
|
|
79
|
+
project = Project("project.json", mode="w")
|
|
80
|
+
with project.log("My experiment") as exp:
|
|
81
|
+
data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
|
|
82
|
+
exp.log_artifact(name="data", value=data, handler="csv")
|
|
83
|
+
|
|
84
|
+
project.save()
|
|
85
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
lazyscribe_arrow/__init__.py,sha256=YwnXVqIllCJKZakHtTtDsWh_raw0HGqG8lgsUfCH9FQ,199
|
|
2
|
+
lazyscribe_arrow/_meta.py,sha256=5ldinxF7m6V5dgpN2M5yw54r8bPkiGNJFgWiMlNj7Ag,38
|
|
3
|
+
lazyscribe_arrow/csv.py,sha256=55GIciGtui9sd2l0GdhSXpGaLTenMwxtUbJjboqMDi4,3278
|
|
4
|
+
lazyscribe_arrow/parquet.py,sha256=C_MofzAUqG5UlOG8NW4odlK_rTm0a4HjlhOZed94Row,3413
|
|
5
|
+
lazyscribe_arrow/protocols.py,sha256=VsG6t1em4qsTRwrDvph1aQdTKjFXCjjAcuvwFuay-8Y,1147
|
|
6
|
+
lazyscribe_arrow-0.2.1.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
|
|
7
|
+
lazyscribe_arrow-0.2.1.dist-info/entry_points.txt,sha256=OZeI9uVR1xkCYghXAwtsGStl5ItRE3UZ-BgabZufeyc,106
|
|
8
|
+
lazyscribe_arrow-0.2.1.dist-info/METADATA,sha256=6-F4cGUJ6-QqlJSRcZiQ13hIWW-IUHoCa8veRMUwTqs,3237
|
|
9
|
+
lazyscribe_arrow-0.2.1.dist-info/RECORD,,
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: lazyscribe-arrow
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Summary: Arrow-based artifact handlers for Lazyscribe
|
|
5
|
-
Author-email: Akshay Gupta <akgcodes@gmail.com>
|
|
6
|
-
License: MIT license
|
|
7
|
-
Project-URL: Documentation, https://github.com/lazyscribe/lazyscribe-arrow
|
|
8
|
-
Project-URL: Repository, https://github.com/lazyscribe/lazyscribe-arrow
|
|
9
|
-
Classifier: Development Status :: 3 - Alpha
|
|
10
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
-
Classifier: Natural Language :: English
|
|
12
|
-
Classifier: Programming Language :: Python :: 3
|
|
13
|
-
Classifier: Programming Language :: Python :: 3 :: Only
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
-
Requires-Python: >=3.10.0
|
|
19
|
-
Description-Content-Type: text/markdown
|
|
20
|
-
Requires-Dist: attrs<=25.1.0,>=21.2.0
|
|
21
|
-
Requires-Dist: lazyscribe<=1.1.0,>=1.0.0
|
|
22
|
-
Requires-Dist: pyarrow<=19.0.1,>=14.0.1
|
|
23
|
-
Requires-Dist: python-slugify<=8.0.4,>=5.0.0
|
|
24
|
-
Provides-Extra: build
|
|
25
|
-
Requires-Dist: build; extra == "build"
|
|
26
|
-
Requires-Dist: commitizen; extra == "build"
|
|
27
|
-
Requires-Dist: twine; extra == "build"
|
|
28
|
-
Requires-Dist: wheel; extra == "build"
|
|
29
|
-
Provides-Extra: qa
|
|
30
|
-
Requires-Dist: edgetest; extra == "qa"
|
|
31
|
-
Requires-Dist: mypy; extra == "qa"
|
|
32
|
-
Requires-Dist: pre-commit; extra == "qa"
|
|
33
|
-
Requires-Dist: ruff; extra == "qa"
|
|
34
|
-
Requires-Dist: types-python-slugify; extra == "qa"
|
|
35
|
-
Requires-Dist: uv; extra == "qa"
|
|
36
|
-
Provides-Extra: tests
|
|
37
|
-
Requires-Dist: pandas; extra == "tests"
|
|
38
|
-
Requires-Dist: pytest; extra == "tests"
|
|
39
|
-
Requires-Dist: pytest-cov; extra == "tests"
|
|
40
|
-
Requires-Dist: time-machine; extra == "tests"
|
|
41
|
-
Provides-Extra: dev
|
|
42
|
-
Requires-Dist: lazyscribe-arrow[build]; extra == "dev"
|
|
43
|
-
Requires-Dist: lazyscribe-arrow[qa]; extra == "dev"
|
|
44
|
-
Requires-Dist: lazyscribe-arrow[tests]; extra == "dev"
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
lazyscribe_arrow/__init__.py,sha256=nZR8SxoxZjsGSVwdMFTI9l582H0M-77iec3NXzCeziU,127
|
|
2
|
-
lazyscribe_arrow/_meta.py,sha256=FT4J4aKqWfsY7SJy-yg7R1ebCnARJzh_OZjkyUpF0NQ,38
|
|
3
|
-
lazyscribe_arrow/csv.py,sha256=mFgYkntabK65KDguS5wuWe3vd7IG0dznKi01EEGf5QY,2972
|
|
4
|
-
lazyscribe_arrow-0.1.0.dist-info/METADATA,sha256=7IcYyuWzrkgeRsxJVH-Erz0fYb-Kaxy7lQZbCf7T6rQ,1813
|
|
5
|
-
lazyscribe_arrow-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
6
|
-
lazyscribe_arrow-0.1.0.dist-info/entry_points.txt,sha256=4ymVC3yTTvMs2iOibuw9pt7goZ0zWWhgdjJ6NzMS_cw,62
|
|
7
|
-
lazyscribe_arrow-0.1.0.dist-info/top_level.txt,sha256=C_ElBcqIKkSjUmMQPDECDhhP54M7muep1KLnEiFJ61I,17
|
|
8
|
-
lazyscribe_arrow-0.1.0.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
lazyscribe_arrow
|