lazyscribe-arrow 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lazyscribe-arrow might be problematic. Click here for more details.

@@ -0,0 +1,85 @@
1
+ Metadata-Version: 2.3
2
+ Name: lazyscribe-arrow
3
+ Version: 0.2.1
4
+ Summary: Arrow-based artifact handlers for Lazyscribe
5
+ Author: Akshay Gupta
6
+ Author-email: Akshay Gupta <akgcodes@gmail.com>
7
+ License: MIT license
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Natural Language :: English
11
+ Classifier: Programming Language :: Python :: 3 :: Only
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Dist: attrs>=21.2,<=25.3
17
+ Requires-Dist: lazyscribe>=1,<=1.2
18
+ Requires-Dist: pyarrow>=14.0.1,<=21
19
+ Requires-Dist: python-slugify>=5,<=8.0.4
20
+ Requires-Dist: commitizen ; extra == 'build'
21
+ Requires-Dist: uv ; extra == 'build'
22
+ Requires-Dist: lazyscribe-arrow[build] ; extra == 'dev'
23
+ Requires-Dist: lazyscribe-arrow[qa] ; extra == 'dev'
24
+ Requires-Dist: lazyscribe-arrow[tests] ; extra == 'dev'
25
+ Requires-Dist: edgetest ; extra == 'qa'
26
+ Requires-Dist: mypy ; extra == 'qa'
27
+ Requires-Dist: pre-commit ; extra == 'qa'
28
+ Requires-Dist: pyproject-fmt ; extra == 'qa'
29
+ Requires-Dist: ruff ; extra == 'qa'
30
+ Requires-Dist: types-python-slugify ; extra == 'qa'
31
+ Requires-Dist: uv ; extra == 'qa'
32
+ Requires-Dist: pandas ; extra == 'tests'
33
+ Requires-Dist: pytest ; extra == 'tests'
34
+ Requires-Dist: pytest-cov ; extra == 'tests'
35
+ Requires-Dist: time-machine ; extra == 'tests'
36
+ Requires-Python: >=3.10.0
37
+ Project-URL: Documentation, https://github.com/lazyscribe/lazyscribe-arrow
38
+ Project-URL: Repository, https://github.com/lazyscribe/lazyscribe-arrow
39
+ Provides-Extra: build
40
+ Provides-Extra: dev
41
+ Provides-Extra: qa
42
+ Provides-Extra: tests
43
+ Description-Content-Type: text/markdown
44
+
45
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![PyPI](https://img.shields.io/pypi/v/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![codecov](https://codecov.io/gh/lazyscribe/lazyscribe-arrow/graph/badge.svg?token=W5TPK7GX7G)](https://codecov.io/gh/lazyscribe/lazyscribe-arrow)
46
+
47
+ # Arrow-based artifact handling for lazyscribe
48
+
49
+ `lazyscribe-arrow` is a lightweight package that adds the following artifact handlers for `lazyscribe`:
50
+
51
+ * `csv`
52
+
53
+ Any data structure that implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
54
+ will be compatible with the handlers in this library. Popular compatible open source data structures include
55
+
56
+ * `pandas.DataFrame`
57
+ * `polars.DataFrame`
58
+ * `polars.LazyFrame`
59
+
60
+ # Installation
61
+
62
+ Python 3.10 and above is required. Use `pip` to install:
63
+
64
+ ```console
65
+ $ python -m pip install lazyscribe-arrow
66
+ ```
67
+
68
+ # Usage
69
+
70
+ To use this library, simply log an artifact to a `lazyscribe` experiment or repository with
71
+
72
+ * `handler="csv"` for a CSV output
73
+
74
+
75
+ ```python
76
+ import pyarrow as pa
77
+ from lazyscribe import Project
78
+
79
+ project = Project("project.json", mode="w")
80
+ with project.log("My experiment") as exp:
81
+ data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
82
+ exp.log_artifact(name="data", value=data, handler="csv")
83
+
84
+ project.save()
85
+ ```
@@ -0,0 +1,41 @@
1
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) [![PyPI](https://img.shields.io/pypi/v/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/lazyscribe-arrow)](https://pypi.org/project/lazyscribe-arrow/) [![codecov](https://codecov.io/gh/lazyscribe/lazyscribe-arrow/graph/badge.svg?token=W5TPK7GX7G)](https://codecov.io/gh/lazyscribe/lazyscribe-arrow)
2
+
3
+ # Arrow-based artifact handling for lazyscribe
4
+
5
+ `lazyscribe-arrow` is a lightweight package that adds the following artifact handlers for `lazyscribe`:
6
+
7
+ * `csv`
8
+
9
+ Any data structure that implements the [Arrow PyCapsule Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
10
+ will be compatible with the handlers in this library. Popular compatible open source data structures include
11
+
12
+ * `pandas.DataFrame`
13
+ * `polars.DataFrame`
14
+ * `polars.LazyFrame`
15
+
16
+ # Installation
17
+
18
+ Python 3.10 and above is required. Use `pip` to install:
19
+
20
+ ```console
21
+ $ python -m pip install lazyscribe-arrow
22
+ ```
23
+
24
+ # Usage
25
+
26
+ To use this library, simply log an artifact to a `lazyscribe` experiment or repository with
27
+
28
+ * `handler="csv"` for a CSV output
29
+
30
+
31
+ ```python
32
+ import pyarrow as pa
33
+ from lazyscribe import Project
34
+
35
+ project = Project("project.json", mode="w")
36
+ with project.log("My experiment") as exp:
37
+ data = pa.Table.from_arrays([[0, 1, 2]], names=["a"])
38
+ exp.log_artifact(name="data", value=data, handler="csv")
39
+
40
+ project.save()
41
+ ```
@@ -0,0 +1,6 @@
1
+ """Import the custom artifact handlers."""
2
+
3
+ from lazyscribe_arrow.csv import CSVArtifact
4
+ from lazyscribe_arrow.parquet import ParquetArtifact
5
+
6
+ __all__: list[str] = ["CSVArtifact", "ParquetArtifact"]
@@ -0,0 +1,3 @@
1
+ """Version."""
2
+
3
+ __version__ = "0.2.1"
@@ -0,0 +1,111 @@
1
+ """Custom artifact handlers for CSVs."""
2
+
3
+ import logging
4
+ from datetime import datetime
5
+ from typing import Any, ClassVar
6
+
7
+ import pyarrow as pa
8
+ from attrs import define
9
+ from lazyscribe._utils import utcnow
10
+ from lazyscribe.artifacts.base import Artifact
11
+ from pyarrow import csv
12
+ from pyarrow.interchange import from_dataframe
13
+ from slugify import slugify
14
+
15
+ from lazyscribe_arrow.protocols import (
16
+ ArrowArrayExportable,
17
+ ArrowStreamExportable,
18
+ SupportsInterchange,
19
+ )
20
+
21
+ LOG = logging.getLogger(__name__)
22
+
23
+
24
@define(auto_attribs=True)
class CSVArtifact(Artifact):
    """Artifact handler that reads and writes CSV data through ``pyarrow.csv``."""

    # Class-level handler configuration (``ClassVar``s, not instance fields).
    alias: ClassVar[str] = "csv"
    suffix: ClassVar[str] = "csv"
    binary: ClassVar[bool] = True
    output_only: ClassVar[bool] = False

    @classmethod
    def construct(
        cls,
        name: str,
        value: Any | None = None,
        fname: str | None = None,
        created_at: datetime | None = None,
        writer_kwargs: dict | None = None,
        version: int = 0,
        dirty: bool = True,
        **kwargs,
    ):
        """Build a handler instance, filling in defaults for omitted fields.

        Parameters
        ----------
        name : str
            Artifact name. Its slug is used in the default filename.
        value : Any, optional
            The object associated with the artifact.
        fname : str, optional
            Filename for the artifact. When not supplied, it is generated as
            ``<slug of name>-<YYYYMMDDHHMMSS of created_at>.<suffix>``.
        created_at : datetime.datetime, optional
            Creation timestamp. When not supplied, ``utcnow()`` is used.
        writer_kwargs : dict, optional
            Stored on the handler; an empty dictionary is used when omitted.
        version : int, optional (default 0)
            Passed through to the class constructor.
        dirty : bool, optional (default True)
            Passed through to the class constructor.
        **kwargs
            Accepted for signature compatibility; not used here.

        Returns
        -------
        CSVArtifact
            The constructed handler.
        """
        if not created_at:
            created_at = utcnow()

        if not fname:
            name_slug = slugify(name)
            stamp_slug = slugify(created_at.strftime("%Y%m%d%H%M%S"))
            fname = f"{name_slug}-{stamp_slug}.{cls.suffix}"

        return cls(  # type: ignore[call-arg]
            name=name,
            value=value,
            fname=fname,
            writer_kwargs=writer_kwargs or {},
            version=version,
            created_at=created_at,
            dirty=dirty,
        )

    @classmethod
    def read(cls, buf, **kwargs) -> pa.Table:
        """Load a CSV file into a PyArrow table.

        Parameters
        ----------
        buf : file-like object
            The buffer from a ``fsspec`` filesystem.
        **kwargs
            Keyword arguments forwarded to :py:meth:`pyarrow.csv.read_csv`.

        Returns
        -------
        pyarrow.lib.Table
            A ``pyarrow`` table with the data.
        """
        table = csv.read_csv(buf, **kwargs)
        return table

    @classmethod
    def write(cls, obj, buf, **kwargs):
        """Coerce ``obj`` to a PyArrow table and write it out as CSV.

        Parameters
        ----------
        obj : object
            The object to write. Accepted inputs, checked in this order:
            a ``pyarrow.Table``; an object exposing ``__arrow_c_array__`` or
            ``__arrow_c_stream__`` (converted with ``pyarrow.table``); an
            object exposing ``__dataframe__`` (converted with
            ``pyarrow.interchange.from_dataframe``).
        buf : file-like object
            The buffer from a ``fsspec`` filesystem.
        **kwargs
            Keyword arguments for :py:meth:`pyarrow.csv.write_csv`.

        Raises
        ------
        ValueError
            Raised if the supplied object is not a PyArrow table, has neither
            an ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` attribute,
            and does not implement the dataframe interchange protocol, so it
            cannot be converted from the native object to a PyArrow Table.
        """
        capsule_protocols = (ArrowArrayExportable, ArrowStreamExportable)

        if isinstance(obj, pa.Table):
            LOG.debug("Provided object is already a PyArrow table.")
            table = obj
        elif isinstance(obj, capsule_protocols):
            table = pa.table(obj)
        elif isinstance(obj, SupportsInterchange):
            table = from_dataframe(obj)
        else:
            raise ValueError(
                f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
                "Please provide an object that implements the Arrow PyCapsule Interface or the "
                "Dataframe Interchange Protocol."
            )

        csv.write_csv(table, buf, **kwargs)
@@ -0,0 +1,113 @@
1
+ """Custom artifact handlers for parquets."""
2
+
3
+ import logging
4
+ from datetime import datetime
5
+ from typing import Any, ClassVar
6
+
7
+ import pyarrow as pa
8
+ import pyarrow.parquet as pq
9
+ from attrs import define
10
+ from lazyscribe._utils import utcnow
11
+ from lazyscribe.artifacts.base import Artifact
12
+ from pyarrow.interchange import from_dataframe
13
+ from slugify import slugify
14
+
15
+ from lazyscribe_arrow.protocols import (
16
+ ArrowArrayExportable,
17
+ ArrowStreamExportable,
18
+ SupportsInterchange,
19
+ )
20
+
21
+ LOG = logging.getLogger(__name__)
22
+
23
+
24
@define(auto_attribs=True)
class ParquetArtifact(Artifact):
    """Artifact handler that reads and writes Parquet data through ``pyarrow.parquet``."""

    # Class-level handler configuration (``ClassVar``s, not instance fields).
    alias: ClassVar[str] = "parquet"
    suffix: ClassVar[str] = "parquet"
    binary: ClassVar[bool] = True
    output_only: ClassVar[bool] = False

    @classmethod
    def construct(
        cls,
        name: str,
        value: Any | None = None,
        fname: str | None = None,
        created_at: datetime | None = None,
        writer_kwargs: dict | None = None,
        version: int = 0,
        dirty: bool = True,
        **kwargs,
    ):
        """Build a handler instance, filling in defaults for omitted fields.

        Parameters
        ----------
        name : str
            Artifact name. Its slug is used in the default filename.
        value : Any, optional
            The object associated with the artifact.
        fname : str, optional
            Filename for the artifact. When not supplied, it is generated as
            ``<slug of name>-<YYYYMMDDHHMMSS of created_at>.<suffix>``.
        created_at : datetime.datetime, optional
            Creation timestamp. When not supplied, ``utcnow()`` is used.
        writer_kwargs : dict, optional
            Stored on the handler; an empty dictionary is used when omitted.
        version : int, optional (default 0)
            Passed through to the class constructor.
        dirty : bool, optional (default True)
            Passed through to the class constructor.
        **kwargs
            Accepted for signature compatibility; not used here.

        Returns
        -------
        ParquetArtifact
            The constructed handler.
        """
        if not created_at:
            created_at = utcnow()

        if not fname:
            name_slug = slugify(name)
            stamp_slug = slugify(created_at.strftime("%Y%m%d%H%M%S"))
            fname = f"{name_slug}-{stamp_slug}.{cls.suffix}"

        return cls(  # type: ignore[call-arg]
            name=name,
            value=value,
            fname=fname,
            writer_kwargs=writer_kwargs or {},
            version=version,
            created_at=created_at,
            dirty=dirty,
        )

    @classmethod
    def read(cls, buf, **kwargs) -> pa.Table:
        """Load a Parquet file into a PyArrow table.

        Parameters
        ----------
        buf : file-like object
            The buffer from a ``fsspec`` filesystem.
        **kwargs
            Keyword arguments forwarded to
            :py:meth:`pyarrow.parquet.read_table`.

        Returns
        -------
        pyarrow.lib.Table
            A ``pyarrow`` table with the data.
        """
        table = pq.read_table(buf, **kwargs)
        return table

    @classmethod
    def write(cls, obj, buf, **kwargs):
        """Coerce ``obj`` to a PyArrow table and write it out as Parquet.

        Parameters
        ----------
        obj : object
            The object to write. Accepted inputs, checked in this order:
            a ``pyarrow.Table``; an object exposing ``__arrow_c_array__`` or
            ``__arrow_c_stream__`` (converted with ``pyarrow.table``); an
            object exposing ``__dataframe__`` (converted with
            ``pyarrow.interchange.from_dataframe``).
        buf : file-like object
            The buffer from a ``fsspec`` filesystem.
        **kwargs
            Keyword arguments for :py:meth:`pyarrow.parquet.write_table`.

        Raises
        ------
        ValueError
            Raised if the supplied object is not a PyArrow table, has neither
            an ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` attribute,
            and does not implement the dataframe interchange protocol, so it
            cannot be converted from the native object to a PyArrow Table.
        """
        capsule_protocols = (ArrowArrayExportable, ArrowStreamExportable)

        if isinstance(obj, pa.Table):
            LOG.debug("Provided object is already a PyArrow table.")
            table = obj
        elif isinstance(obj, capsule_protocols):
            table = pa.table(obj)
        elif isinstance(obj, SupportsInterchange):
            table = from_dataframe(obj)
        else:
            raise ValueError(
                f"Object of type `{type(obj)}` cannot be easily coerced into a PyArrow Table. "
                "Please provide an object that implements the Arrow PyCapsule Interface or the "
                "Dataframe Interchange Protocol."
            )

        pq.write_table(table, buf, **kwargs)
@@ -0,0 +1,38 @@
1
+ """Arrow exportable protocols."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+
8
@runtime_checkable
class ArrowArrayExportable(Protocol):
    """Structural type for objects that expose ``__arrow_c_array__``.

    The protocol is runtime-checkable, so ``isinstance(obj, ArrowArrayExportable)``
    is true for any object that has an ``__arrow_c_array__`` attribute.
    """

    def __arrow_c_array__(self, requested_schema: object | None = None) -> tuple[object, object]:
        """Export the object as a pair of ArrowSchema and ArrowArray structures."""
17
+
18
+
19
@runtime_checkable
class ArrowStreamExportable(Protocol):
    """Structural type for objects that expose ``__arrow_c_stream__``.

    The protocol is runtime-checkable, so ``isinstance(obj, ArrowStreamExportable)``
    is true for any object that has an ``__arrow_c_stream__`` attribute.
    """

    def __arrow_c_stream__(
        self,
        requested_schema: object | None = None,
    ) -> object:
        """Export the object as an ArrowArrayStream."""
26
+
27
+
28
@runtime_checkable
class SupportsInterchange(Protocol):
    """Structural type for dataframes that expose ``__dataframe__``.

    The protocol is runtime-checkable, so ``isinstance(obj, SupportsInterchange)``
    is true for any object that has a ``__dataframe__`` attribute.
    """

    def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> SupportsInterchange:
        """Convert to a dataframe object implementing the dataframe interchange protocol."""
@@ -0,0 +1,90 @@
1
+ [build-system]
2
+ build-backend = "uv_build"
3
+
4
+ requires = [ "uv-build>=0.8,<0.9" ]
5
+
6
+ [project]
7
+ name = "lazyscribe-arrow"
8
+ version = "0.2.1"
9
+
10
+ description = "Arrow-based artifact handlers for Lazyscribe"
11
+ readme = { file = "README.md", content-type = "text/markdown" }
12
+ license = { text = "MIT license" }
13
+ authors = [
14
+ { name = "Akshay Gupta", email = "akgcodes@gmail.com" },
15
+ ]
16
+ requires-python = ">=3.10.0"
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Natural Language :: English",
21
+ "Programming Language :: Python :: 3 :: Only",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Programming Language :: Python :: 3.13",
26
+ ]
27
+ dependencies = [ "attrs>=21.2,<=25.3", "lazyscribe>=1,<=1.2", "pyarrow>=14.0.1,<=21", "python-slugify>=5,<=8.0.4" ]
28
+
29
+ optional-dependencies.build = [ "commitizen", "uv" ]
30
+ optional-dependencies.dev = [ "lazyscribe-arrow[build]", "lazyscribe-arrow[qa]", "lazyscribe-arrow[tests]" ]
31
+ optional-dependencies.qa = [ "edgetest", "mypy", "pre-commit", "pyproject-fmt", "ruff", "types-python-slugify", "uv" ]
32
+ optional-dependencies.tests = [ "pandas", "pytest", "pytest-cov", "time-machine" ]
33
+ urls."Documentation" = "https://github.com/lazyscribe/lazyscribe-arrow"
34
+ urls."Repository" = "https://github.com/lazyscribe/lazyscribe-arrow"
35
+ entry-points."lazyscribe.artifact_type".csv = "lazyscribe_arrow:CSVArtifact"
36
+ entry-points."lazyscribe.artifact_type".parquet = "lazyscribe_arrow:ParquetArtifact"
37
+
38
+ [tool.ruff]
39
+ target-version = "py310"
40
+
41
+ lint.select = [
42
+ "B", # flake8-bugbear
43
+ "C", # flake8-comprehensions
44
+ "D", # pydocstyle
45
+ "E", # pycodestyle errors
46
+ "F", # pyflakes
47
+ "I", # isort
48
+ "LOG", # flake8-logging
49
+ "RUF", # Ruff errors
50
+ "SIM", # flake8-simplify
51
+ "T20", # flake8-print
52
+ "TID252", # flake8-tidy-imports ban relative imports
53
+ "UP", # pyupgrade
54
+ "W", # pycodestyle warnings
55
+ ]
56
+ lint.ignore = [
57
+ "C901", # Function/method is too complex. (Add back in later.)
58
+ "E501", # Line too long. Using formatter instead.
59
+ ]
60
+ lint.per-file-ignores."**/{tests,docs}/*" = [ "ARG", "D", "E402", "F841" ]
61
+ lint.per-file-ignores."__init__.py" = [ "E402" ]
62
+ lint.per-file-ignores."tutorials/*" = [ "D", "T201" ]
63
+ lint.flake8-tidy-imports.ban-relative-imports = "all"
64
+ lint.pydocstyle.convention = "numpy"
65
+ lint.preview = true
66
+
67
+ [tool.mypy]
68
+ python_version = "3.10"
69
+ warn_return_any = true
70
+ warn_unused_configs = true
71
+ ignore_missing_imports = true
72
+ allow_redefinition = true
73
+ check_untyped_defs = true
74
+
75
+ [tool.uv.build-backend]
76
+ module-name = "lazyscribe_arrow"
77
+ module-root = ""
78
+
79
+ [tool.commitizen]
80
+ version_provider = "pep621"
81
+ tag_format = "v$version"
82
+ update_changelog_on_bump = true
83
+ version_files = [
84
+ "lazyscribe_arrow/_meta.py:__version__",
85
+ ]
86
+
87
+ [edgetest.envs.core]
88
+ extras = [ "tests" ]
89
+ upgrade = [ "attrs", "lazyscribe", "pyarrow", "python-slugify" ]
90
+ command = "pytest tests"