pl-row-encode 0.3.1__cp39-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,197 @@
1
+ """Row-level, type-preserving encode/decode for Polars columns.
2
+
3
+ `encode(*cols)` packs a set of columns into a single `Binary` column where each value is
4
+ an opaque, self-describing token (the polars-row encoding of the row plus an embedded
5
+ schema header). `decode(...)` / `decode_series(...)` reverse it back into a `Struct`.
6
+
7
+ The token is self-describing, so the schema does not need to be stored anywhere external
8
+ to round-trip through a vendor that only holds the opaque bytes.
9
+
10
+ Encode paths:
11
+ * :func:`encode` -- lazy expr producing a token column.
12
+ * :func:`encode_series` -- eager, returns the token `Series` directly.
13
+ * :func:`get_header` -- pull the schema header out of a token / token `Series`.
14
+
15
+ Decode paths:
16
+ * :func:`decode` -- fully lazy; you supply the ``schema_header`` bytes (zero peeking).
17
+ * :func:`decode_peek` -- lazy bulk decode; sniffs the header from one materialized token.
18
+ * :func:`decode_series` -- fully eager, schema-free.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import struct
24
+ from pathlib import Path
25
+ from typing import TYPE_CHECKING, cast
26
+
27
+ import polars as pl
28
+ from polars.plugins import register_plugin_function
29
+
30
+ if TYPE_CHECKING:
31
+ from polars._typing import IntoExpr
32
+ from typing import Union
33
+
34
+ Frame = Union[pl.DataFrame, pl.LazyFrame]
35
+
36
+ __all__ = [
37
+ "encode",
38
+ "encode_series",
39
+ "get_header",
40
+ "decode",
41
+ "decode_peek",
42
+ "decode_series",
43
+ ]
44
+
45
+ _LIB = Path(__file__).parent
46
+
47
+
48
def encode(*exprs: IntoExpr) -> pl.Expr:
    """Pack the given columns into one `Binary` column of self-describing tokens.

    All positional arguments are forwarded, in the order given, to the ``row_encode``
    plugin function, which is registered as elementwise. Calling this with no columns
    raises `ValueError`.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2], "y": ["a", "b"]})
    >>> out = df.select(encode("x", "y").alias("tok"))
    >>> out.schema
    Schema({'tok': Binary})
    >>> out.select(decode_peek(out, "tok").alias("row")).to_series().to_list()
    [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]
    """
    if len(exprs) == 0:
        msg = "encode() requires at least one column"
        raise ValueError(msg)
    columns = list(exprs)
    return register_plugin_function(
        function_name="row_encode",
        plugin_path=_LIB,
        is_elementwise=True,
        args=columns,
    )
68
+
69
+
70
def encode_series(*series: pl.Series) -> pl.Series:
    """Encode one or more `Series` right away and return the `Binary` token `Series`.

    This is the eager sibling of :func:`encode`: each input `Series` is wrapped in a
    literal expression, the :func:`encode` expression is evaluated immediately, and the
    resulting token column is handed back. Useful when the caller holds bare `Series`
    rather than a frame; :func:`get_header` can then pull the schema header from the
    result. Calling this with no series raises `ValueError`.

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [1, 2, 3]))
    >>> tok.dtype
    Binary
    >>> decode_series(tok).to_list()
    [{'x': 1}, {'x': 2}, {'x': 3}]
    """
    if len(series) == 0:
        msg = "encode_series() requires at least one series"
        raise ValueError(msg)
    literals = [pl.lit(column) for column in series]
    token_frame = pl.select(encode(*literals))
    return token_frame.to_series()
88
+
89
+
90
def get_header(token: bytes | pl.Series) -> bytes:
    """Lift the `[u32 len][header]` schema prefix out of a token.

    Accepts either a single token (`bytes`) or a whole token `Series`, in which case the
    header is read from its first non-null value. The returned bytes are exactly what
    :func:`decode` expects as its ``schema_header`` argument, enabling a fully lazy decode
    without re-sniffing the data.

    Raises `ValueError` when a `Series` has no non-null value, when the token is shorter
    than the four-byte length prefix, or when the token is shorter than the header length
    it declares (i.e. it has been truncated).

    The first four bytes are the little-endian length of the header that follows:

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [1, 2, 3]))
    >>> header = get_header(tok)
    >>> import struct
    >>> struct.unpack_from("<I", header)[0] == len(header) - 4
    True
    >>> get_header(tok[0]) == header
    True
    >>> decode_series(tok).to_list() == pl.select(
    ...     decode(pl.lit(tok), schema_header=header)
    ... ).to_series().to_list()
    True
    """
    if isinstance(token, pl.Series):
        # The generator stops at the first non-null value, so leading nulls are skipped
        # without scanning the rest of the Series.
        first = next((v for v in token if v is not None), None)
        if first is None:
            msg = "cannot extract header from an all-null / empty Series"
            raise ValueError(msg)
        token = first
    if len(token) < 4:
        msg = "token too short to contain a schema header"
        raise ValueError(msg)
    (header_len,) = struct.unpack_from("<I", token, 0)
    header_end = 4 + header_len
    # Slicing clamps silently, so without this check a truncated token would yield a
    # header shorter than its own length prefix claims.
    if len(token) < header_end:
        msg = "token truncated: declared header length exceeds token size"
        raise ValueError(msg)
    return token[:header_end]
124
+
125
+
126
+ # TODO: Should be able to do this without the schema_header
127
+ # TODO: What about regular dataframes?
128
def decode(expr: IntoExpr, *, schema_header: bytes) -> pl.Expr:
    """Turn a `Binary` token column back into a `Struct` column.

    `schema_header` is the header prefix shared by the tokens in the column; obtain it
    with :func:`get_header`, or use :func:`decode_series` for the eager path that
    extracts it for you. It has to be passed in because the Polars lazy engine needs to
    resolve the output `Struct` dtype before any data is materialized. The header is
    forwarded to the ``row_decode`` plugin function as a keyword argument.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2]}).select(encode("x").alias("tok"))
    >>> header = get_header(df.to_series())
    >>> df.select(decode("tok", schema_header=header).alias("row")).to_series().to_list()
    [{'x': 1}, {'x': 2}]
    """
    plugin_kwargs = {"schema_header": schema_header}
    return register_plugin_function(
        function_name="row_decode",
        plugin_path=_LIB,
        is_elementwise=True,
        args=[expr],
        kwargs=plugin_kwargs,
    )
149
+
150
+
151
def decode_peek(frame: Frame, column: str) -> pl.Expr:
    """Return a lazy `decode` expression for `column` with the header sniffed from `frame`.

    The caller does not pass a ``schema_header`` here. Instead, the first non-null token
    of `column` is collected from `frame` (a `DataFrame` or `LazyFrame`), its embedded
    header is read, and the ordinary :func:`decode` expression is returned with that
    header baked in. Only that one value is materialized up front; the bulk decode is
    still expressed lazily.

    The peek is unavoidable because the output `Struct` dtype must be known before the
    lazy engine sees any data (see :func:`decode`). If the column holds no non-null
    token, `ValueError` is raised.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2], "y": ["a", "b"]}).select(
    ...     encode("x", "y").alias("tok")
    ... )
    >>> df.select(decode_peek(df, "tok").alias("row")).to_series().to_list()
    [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]
    """
    first_token_query = frame.lazy().select(pl.col(column).drop_nulls().first())
    peeked = cast("pl.DataFrame", first_token_query.collect())

    sample = None
    if peeked.height:
        sample = peeked.to_series()[0]

    if sample is None:
        msg = (
            "cannot infer schema: no non-null token in the column; "
            "use decode(schema_header=...)"
        )
        raise ValueError(msg)

    return decode(pl.col(column), schema_header=get_header(sample))
182
+
183
+
184
+ # TODO: Should try and unify this with the regular decoder
185
def decode_series(s: pl.Series) -> pl.Series:
    """Decode a `Binary` token `Series` into a `Struct` `Series` without any schema input.

    The schema header is taken from the first non-null token via :func:`get_header`, so
    the caller neither supplies nor stores a schema. The decode is evaluated immediately
    and the resulting column is returned.

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [10, 20]))
    >>> decode_series(tok).to_list()
    [{'x': 10}, {'x': 20}]
    """
    schema_header = get_header(s)
    token_expr = pl.lit(s)
    struct_frame = pl.select(decode(token_expr, schema_header=schema_header))
    return struct_frame.to_series()
Binary file
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: pl-row-encode
3
+ Version: 0.3.1
4
+ Classifier: Programming Language :: Rust
5
+ Classifier: Programming Language :: Python :: Implementation :: CPython
6
+ Classifier: Programming Language :: Python :: 3.12
7
+ Classifier: Programming Language :: Python :: 3.13
8
+ Classifier: Programming Language :: Python :: 3.14
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Dist: polars>=1.34,<2
11
+ License-File: LICENSE
12
+ Summary: Row-level, type-preserving encode/decode for Polars columns
13
+ Keywords: polars,plugin,encode,decode,serialization,rust
14
+ Author-email: Tyler Riccio <tylerriccio8@gmail.com>
15
+ License-Expression: MIT
16
+ Requires-Python: >=3.12
17
+ Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
18
+ Project-URL: Homepage, https://github.com/tylerriccio33/pl-row-encode
19
+ Project-URL: Repository, https://github.com/tylerriccio33/pl-row-encode
20
+
21
+ # pl-row-encode
22
+
23
+ A Polars plugin for **row-level, type-preserving encode/decode**.
24
+
25
+ `encode(*cols)` packs a set of columns into a single `Binary` column where each value is an
26
+ opaque, **self-describing** token: the [`polars-row`](https://docs.rs/polars-row) encoding of
27
+ the row, prefixed with an embedded schema header. `decode_series(...)` reverses it back into a
28
+ `Struct`, recovering the original dtypes **without needing any external schema**.
29
+
30
+ ```
31
+ DataFrame
32
+ -> encode(*cols)
33
+ -> opaque bytes
34
+ -> decode(...) # (row bytes -> Struct -> original typed columns)
35
+ -> DataFrame
36
+ ```
37
+
38
+ The type information travels with the token, so it can be decoded at any later time without consulting an external schema.
39
+
40
+ ## Token layout
41
+
42
+ Each `Binary` value is:
43
+
44
+ ```
45
+ [ u32 header_len (LE) ][ header bytes ][ row bytes ]
46
+ ```
47
+
48
+ `header` is a bincode-serialized `Vec<Field>` (logical schema); `row bytes` is the
49
+ unordered `polars-row` encoding of that single row. Embedding the header per value makes
50
+ every token independently decodable.
51
+
52
+ ## Usage
53
+
54
+ ```python
55
+ import polars as pl
56
+ from pl_row_encode import encode, decode_series
57
+
58
+ df = pl.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
59
+
60
+ tokens = df.select(tok=encode("id", "name"))["tok"] # dtype: Binary
61
+ # ... hand `tokens` to a vendor, get them back ...
62
+
63
+ decoded = decode_series(tokens).struct.unnest() # back to id / name with dtypes
64
+ ```
65
+
66
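+ For the lazy engine, the output `Struct` dtype must be known up front, so pass a token's
67
+ header explicitly (`get_header` extracts it from a token or token `Series`; `decode_peek` sniffs it from the frame for you):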
68
+
69
+ ```python
70
+ from pl_row_encode import decode
71
+ header = ... # the [u32 len][header] prefix of any token
72
+ lf.select(decode("tok", schema_header=header)).collect()
73
+ ```
74
+
75
+ ## Development
76
+
77
+ ```bash
78
+ make develop # build the Rust extension into the venv (uv run maturin develop)
79
+ make test # build + run pytest
80
+ make lint # ruff + ty
81
+ ```
82
+
83
+ The first `make develop` compiles the full Polars Rust workspace and takes a few minutes;
84
+ subsequent builds are incremental and fast.
85
+
86
+ ## Notes / limitations
87
+
88
+ - Built on `polars-row`, the same machinery Polars uses internally for sort/group-by row
89
+ encoding — lossless for primitive, string, boolean, temporal, and nested types.
90
+ - `decode_series` infers the schema from the first non-null token, so an all-null/empty
91
+ Series needs the explicit `decode(schema_header=...)` form.
92
+
@@ -0,0 +1,7 @@
1
+ pl_row_encode/__init__.py,sha256=yH6TtK3su-kwKgqmmnq5adTNVOAMai68NlsnRO-xGi4,7443
2
+ pl_row_encode/_internal.pyd,sha256=ZFSk8yzOpSOBwcstLPbjF3d4rKDTMqHSg69Srpqls5A,27591168
3
+ pl_row_encode-0.3.1.dist-info/METADATA,sha256=Y9Way1gW8ZgHVGE5zkfa1oQkni_pqjb3NpNs7izyajw,3219
4
+ pl_row_encode-0.3.1.dist-info/WHEEL,sha256=dB50znGoqD95tHNeV5UBZvlfLNwgEh0iQcswp5YahQ4,95
5
+ pl_row_encode-0.3.1.dist-info/licenses/LICENSE,sha256=NbFGVa7pEgkuRWafOKEf0lUJ_XBdwRFAW-e5AzPbk9w,1090
6
+ pl_row_encode-0.3.1.dist-info/sboms/pl_row_encode.cyclonedx.json,sha256=tPTc_REDrsDGlsmY217w9wV6c8O3EeCmTvzDmUggR_c,239574
7
+ pl_row_encode-0.3.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: maturin (1.13.3)
3
+ Root-Is-Purelib: false
4
+ Tag: cp39-abi3-win_amd64
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Tyler Riccio
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.