pl-row-encode 0.3.1__cp39-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pl_row_encode/__init__.py +197 -0
- pl_row_encode/_internal.pyd +0 -0
- pl_row_encode-0.3.1.dist-info/METADATA +92 -0
- pl_row_encode-0.3.1.dist-info/RECORD +7 -0
- pl_row_encode-0.3.1.dist-info/WHEEL +4 -0
- pl_row_encode-0.3.1.dist-info/licenses/LICENSE +21 -0
- pl_row_encode-0.3.1.dist-info/sboms/pl_row_encode.cyclonedx.json +7344 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Row-level, type-preserving encode/decode for Polars columns.
|
|
2
|
+
|
|
3
|
+
`encode(*cols)` packs a set of columns into a single `Binary` column where each value is
|
|
4
|
+
an opaque, self-describing token (the polars-row encoding of the row plus an embedded
|
|
5
|
+
schema header). `decode(...)` / `decode_series(...)` reverse it back into a `Struct`.
|
|
6
|
+
|
|
7
|
+
The token is self-describing, so the schema does not need to be stored anywhere external
|
|
8
|
+
to round-trip through a vendor that only holds the opaque bytes.
|
|
9
|
+
|
|
10
|
+
Encode paths:
|
|
11
|
+
* :func:`encode` -- lazy expr producing a token column.
|
|
12
|
+
* :func:`encode_series` -- eager, returns the token `Series` directly.
|
|
13
|
+
* :func:`get_header` -- pull the schema header out of a token / token `Series`.
|
|
14
|
+
|
|
15
|
+
Decode paths:
|
|
16
|
+
* :func:`decode` -- fully lazy; you supply the ``schema_header`` bytes (zero peeking).
|
|
17
|
+
* :func:`decode_peek` -- lazy bulk decode; sniffs the header from one materialized token.
|
|
18
|
+
* :func:`decode_series` -- fully eager, schema-free.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import struct
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import TYPE_CHECKING, cast
|
|
26
|
+
|
|
27
|
+
import polars as pl
|
|
28
|
+
from polars.plugins import register_plugin_function
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from polars._typing import IntoExpr
|
|
32
|
+
from typing import Union
|
|
33
|
+
|
|
34
|
+
Frame = Union[pl.DataFrame, pl.LazyFrame]
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"encode",
|
|
38
|
+
"encode_series",
|
|
39
|
+
"get_header",
|
|
40
|
+
"decode",
|
|
41
|
+
"decode_peek",
|
|
42
|
+
"decode_series",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
_LIB = Path(__file__).parent
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def encode(*exprs: IntoExpr) -> pl.Expr:
    """Pack one or more columns into a single self-describing `Binary` token column.

    All given expressions are handed, in order, to the ``row_encode`` plugin function,
    which is registered as elementwise. Calling this with no columns raises `ValueError`.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2], "y": ["a", "b"]})
    >>> out = df.select(encode("x", "y").alias("tok"))
    >>> out.schema
    Schema({'tok': Binary})
    >>> out.select(decode_peek(out, "tok").alias("row")).to_series().to_list()
    [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]
    """
    columns = list(exprs)
    if len(columns) == 0:
        msg = "encode() requires at least one column"
        raise ValueError(msg)

    # The plugin receives every column as a positional input.
    token_expr = register_plugin_function(
        args=columns,
        function_name="row_encode",
        plugin_path=_LIB,
        is_elementwise=True,
    )
    return token_expr
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def encode_series(*series: pl.Series) -> pl.Series:
    """Eagerly turn one or more `Series` into a single `Binary` token `Series`.

    This is the eager twin of :func:`encode`: each input `Series` is wrapped in a literal,
    run through :func:`encode`, and the resulting token column is returned materialized.
    Combined with :func:`get_header`, callers that hold `Series` rather than a frame can
    obtain both tokens and the schema header. Calling this with no series raises
    `ValueError`.

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [1, 2, 3]))
    >>> tok.dtype
    Binary
    >>> decode_series(tok).to_list()
    [{'x': 1}, {'x': 2}, {'x': 3}]
    """
    if len(series) == 0:
        msg = "encode_series() requires at least one series"
        raise ValueError(msg)

    literals = [pl.lit(column) for column in series]
    token_frame = pl.select(encode(*literals))
    return token_frame.to_series()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def get_header(token: bytes | pl.Series) -> bytes:
    """Lift the `[u32 len][header]` schema prefix out of a token.

    Accepts either a single token (`bytes`) or a whole token `Series`, in which case the
    header is read from its first non-null value. The returned bytes are exactly what
    :func:`decode` expects as its ``schema_header`` argument, enabling a fully lazy decode
    without re-sniffing the data.

    Raises `ValueError` when a `Series` holds no non-null token, when the token is shorter
    than the four-byte length prefix, or when the prefix declares more header bytes than
    the token actually contains (a truncated token).

    The first four bytes are the little-endian length of the header that follows:

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [1, 2, 3]))
    >>> header = get_header(tok)
    >>> import struct
    >>> struct.unpack_from("<I", header)[0] == len(header) - 4
    True
    >>> get_header(tok[0]) == header
    True
    >>> decode_series(tok).to_list() == pl.select(
    ...     decode(pl.lit(tok), schema_header=header)
    ... ).to_series().to_list()
    True
    """
    # Bytes-like tokens are recognised first so the single-token path goes straight to
    # the length checks; anything else is tested for being a token Series.
    is_raw = isinstance(token, (bytes, bytearray, memoryview))
    if not is_raw and isinstance(token, pl.Series):
        first = next((v for v in token if v is not None), None)
        if first is None:
            msg = "cannot extract header from an all-null / empty Series"
            raise ValueError(msg)
        token = first

    if len(token) < 4:
        msg = "token too short to contain a schema header"
        raise ValueError(msg)

    (header_len,) = struct.unpack_from("<I", token, 0)
    end = 4 + header_len
    # Slicing past the end would silently hand back a shortened header, so a token
    # that cannot hold the declared header is rejected explicitly.
    if len(token) < end:
        msg = "token is truncated: declared schema header length exceeds token size"
        raise ValueError(msg)
    # bytes() is a no-op for bytes input and normalises bytearray / memoryview slices.
    return bytes(token[:end])
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# TODO: Should be able to do this without the schema_header
|
|
127
|
+
# TODO: What about regular dataframes?
|
|
128
|
+
def decode(expr: IntoExpr, *, schema_header: bytes) -> pl.Expr:
    """Turn a `Binary` token column back into a `Struct` column.

    `schema_header` is the header prefix shared by the tokens in the column; it can be
    obtained with :func:`get_header`, while :func:`decode_series` is the eager path that
    extracts it for you. The header has to be supplied up front so the output `Struct`
    dtype can be resolved before the data is materialized, which the Polars lazy engine
    needs.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2]}).select(encode("x").alias("tok"))
    >>> header = get_header(df.to_series())
    >>> df.select(decode("tok", schema_header=header).alias("row")).to_series().to_list()
    [{'x': 1}, {'x': 2}]
    """
    # The header is passed to the ``row_decode`` plugin function as a keyword argument.
    plugin_kwargs = {"schema_header": schema_header}
    struct_expr = register_plugin_function(
        args=[expr],
        kwargs=plugin_kwargs,
        function_name="row_decode",
        plugin_path=_LIB,
        is_elementwise=True,
    )
    return struct_expr
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def decode_peek(frame: Frame, column: str) -> pl.Expr:
    """Return a lazy `decode` expression for `column` with the schema header sniffed.

    In contrast to :func:`decode`, no ``schema_header`` is passed in. The first non-null
    token of `column` is collected from `frame`, its embedded header is read with
    :func:`get_header`, and the ordinary lazy `decode` expression is returned with that
    header baked in -- the bulk decode stays lazy and only one value is materialized up
    front.

    `frame` may be a `DataFrame` or a `LazyFrame`. The peek is needed because the output
    `Struct` dtype must be known before the lazy engine sees any data (see
    :func:`decode`). A `ValueError` is raised when the column has no non-null token.

    >>> import polars as pl
    >>> df = pl.DataFrame({"x": [1, 2], "y": ["a", "b"]}).select(
    ...     encode("x", "y").alias("tok")
    ... )
    >>> df.select(decode_peek(df, "tok").alias("row")).to_series().to_list()
    [{'x': 1, 'y': 'a'}, {'x': 2, 'y': 'b'}]
    """
    first_token = pl.col(column).drop_nulls().first()
    peeked = cast("pl.DataFrame", frame.lazy().select(first_token).collect())

    sample = None
    if peeked.height:
        sample = peeked.to_series()[0]

    if sample is None:
        msg = (
            "cannot infer schema: no non-null token in the column; "
            "use decode(schema_header=...)"
        )
        raise ValueError(msg)

    return decode(pl.col(column), schema_header=get_header(sample))
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# TODO: Try to unify this with the regular decoder
|
|
185
|
+
def decode_series(s: pl.Series) -> pl.Series:
    """Eagerly decode a `Binary` token `Series` into a `Struct` `Series`, schema-free.

    The schema header is taken from the first non-null token via :func:`get_header`, so
    the caller neither supplies nor keeps any schema.

    >>> import polars as pl
    >>> tok = encode_series(pl.Series("x", [10, 20]))
    >>> decode_series(tok).to_list()
    [{'x': 10}, {'x': 20}]
    """
    schema_header = get_header(s)
    struct_expr = decode(pl.lit(s), schema_header=schema_header)
    return pl.select(struct_expr).to_series()
|
|
Binary file
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pl-row-encode
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Classifier: Programming Language :: Rust
|
|
5
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
6
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
7
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Dist: polars>=1.34,<2
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Summary: Row-level, type-preserving encode/decode for Polars columns
|
|
13
|
+
Keywords: polars,plugin,encode,decode,serialization,rust
|
|
14
|
+
Author-email: Tyler Riccio <tylerriccio8@gmail.com>
|
|
15
|
+
License-Expression: MIT
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
18
|
+
Project-URL: Homepage, https://github.com/tylerriccio33/pl-row-encode
|
|
19
|
+
Project-URL: Repository, https://github.com/tylerriccio33/pl-row-encode
|
|
20
|
+
|
|
21
|
+
# pl-row-encode
|
|
22
|
+
|
|
23
|
+
A Polars plugin for **row-level, type-preserving encode/decode**.
|
|
24
|
+
|
|
25
|
+
`encode(*cols)` packs a set of columns into a single `Binary` column where each value is an
|
|
26
|
+
opaque, **self-describing** token: the [`polars-row`](https://docs.rs/polars-row) encoding of
|
|
27
|
+
the row, prefixed with an embedded schema header. `decode_series(...)` reverses it back into a
|
|
28
|
+
`Struct`, recovering the original dtypes **without needing any external schema**.
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
DataFrame
|
|
32
|
+
-> encode(*cols)
|
|
33
|
+
-> opaque bytes
|
|
34
|
+
-> decode(...) # (row bytes -> Struct -> original typed columns)
|
|
35
|
+
-> DataFrame
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The type information travels with the token, so it can be decoded at any later time without consulting an external schema.
|
|
39
|
+
|
|
40
|
+
## Token layout
|
|
41
|
+
|
|
42
|
+
Each `Binary` value is:
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
[ u32 header_len (LE) ][ header bytes ][ row bytes ]
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
`header` is a bincode-serialized `Vec<Field>` (logical schema); `row bytes` is the
|
|
49
|
+
unordered `polars-row` encoding of that single row. Embedding the header per value makes
|
|
50
|
+
every token independently decodable.
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import polars as pl
|
|
56
|
+
from pl_row_encode import encode, decode_series
|
|
57
|
+
|
|
58
|
+
df = pl.DataFrame({"id": [1, 2], "name": ["alice", "bob"]})
|
|
59
|
+
|
|
60
|
+
tokens = df.select(tok=encode("id", "name"))["tok"] # dtype: Binary
|
|
61
|
+
# ... hand `tokens` to a vendor, get them back ...
|
|
62
|
+
|
|
63
|
+
decoded = decode_series(tokens).struct.unnest() # back to id / name with dtypes
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
For the lazy engine, the output `Struct` dtype must be known up front, so pass a token's
|
|
67
|
+
header explicitly:
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from pl_row_encode import decode
|
|
71
|
+
header = ...  # the [u32 len][header] prefix of any token, e.g. pl_row_encode.get_header(tokens)
|
|
72
|
+
lf.select(decode("tok", schema_header=header)).collect()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Development
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
make develop # build the Rust extension into the venv (uv run maturin develop)
|
|
79
|
+
make test # build + run pytest
|
|
80
|
+
make lint # ruff + ty
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
The first `make develop` compiles the full Polars Rust workspace and takes a few minutes;
|
|
84
|
+
subsequent builds are incremental and fast.
|
|
85
|
+
|
|
86
|
+
## Notes / limitations
|
|
87
|
+
|
|
88
|
+
- Built on `polars-row`, the same machinery Polars uses internally for sort/group-by row
|
|
89
|
+
encoding — lossless for primitive, string, boolean, temporal, and nested types.
|
|
90
|
+
- `decode_series` infers the schema from the first non-null token, so an all-null/empty
|
|
91
|
+
Series needs the explicit `decode(schema_header=...)` form.
|
|
92
|
+
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
pl_row_encode/__init__.py,sha256=yH6TtK3su-kwKgqmmnq5adTNVOAMai68NlsnRO-xGi4,7443
|
|
2
|
+
pl_row_encode/_internal.pyd,sha256=ZFSk8yzOpSOBwcstLPbjF3d4rKDTMqHSg69Srpqls5A,27591168
|
|
3
|
+
pl_row_encode-0.3.1.dist-info/METADATA,sha256=Y9Way1gW8ZgHVGE5zkfa1oQkni_pqjb3NpNs7izyajw,3219
|
|
4
|
+
pl_row_encode-0.3.1.dist-info/WHEEL,sha256=dB50znGoqD95tHNeV5UBZvlfLNwgEh0iQcswp5YahQ4,95
|
|
5
|
+
pl_row_encode-0.3.1.dist-info/licenses/LICENSE,sha256=NbFGVa7pEgkuRWafOKEf0lUJ_XBdwRFAW-e5AzPbk9w,1090
|
|
6
|
+
pl_row_encode-0.3.1.dist-info/sboms/pl_row_encode.cyclonedx.json,sha256=tPTc_REDrsDGlsmY217w9wV6c8O3EeCmTvzDmUggR_c,239574
|
|
7
|
+
pl_row_encode-0.3.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Tyler Riccio
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|