planframe-polars 0.1.0 (tar.gz)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- planframe_polars-0.1.0/.gitignore +41 -0
- planframe_polars-0.1.0/LICENSE +22 -0
- planframe_polars-0.1.0/PKG-INFO +63 -0
- planframe_polars-0.1.0/README.md +43 -0
- planframe_polars-0.1.0/planframe_polars/__init__.py +7 -0
- planframe_polars-0.1.0/planframe_polars/adapter.py +442 -0
- planframe_polars-0.1.0/planframe_polars/compile_expr.py +179 -0
- planframe_polars-0.1.0/planframe_polars/frame.py +229 -0
- planframe_polars-0.1.0/planframe_polars/frame.pyi +103 -0
- planframe_polars-0.1.0/pyproject.toml +32 -0
planframe_polars-0.1.0/.gitignore
@@ -0,0 +1,41 @@
# Python bytecode / caches
__pycache__/
*.py[cod]
*$py.class

# Virtual environments
.venv/
venv/
ENV/
env/
.env/

# Packaging / build artifacts
build/
dist/
*.egg-info/
*.egg
pip-wheel-metadata/

# Test / coverage artifacts
.pytest_cache/
.coverage
.coverage.*
coverage.xml
htmlcov/

# Type check / lint caches
.mypy_cache/
.pyright/
.ruff_cache/

# Notebook checkpoints
.ipynb_checkpoints/

# IDE/editor settings
.vscode/
.idea/

# OS files
.DS_Store
Thumbs.db

planframe_polars-0.1.0/LICENSE
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2026 PlanFrame Contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

planframe_polars-0.1.0/PKG-INFO
@@ -0,0 +1,63 @@
Metadata-Version: 2.4
Name: planframe-polars
Version: 0.1.0
Summary: Polars backend adapter for PlanFrame.
Project-URL: Repository, https://github.com/eddiethedean/planframe
Project-URL: Documentation, https://github.com/eddiethedean/planframe/blob/main/README.md
Project-URL: Issues, https://github.com/eddiethedean/planframe/issues
Author: PlanFrame Contributors
License: MIT
License-File: LICENSE
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3 :: Only
Classifier: Programming Language :: Python :: 3.10
Classifier: Typing :: Typed
Requires-Python: >=3.10
Requires-Dist: planframe<0.2.0,>=0.1.0
Requires-Dist: polars>=0.20
Description-Content-Type: text/markdown

## planframe-polars

Polars adapter package for PlanFrame. Import as `planframe_polars`.

### Usage

```python
import polars as pl
from dataclasses import dataclass

from planframe_polars import PolarsFrame


@dataclass(frozen=True)
class UserSchema:
    id: int
    age: int


lf = pl.DataFrame({"id": [1], "age": [2]}).lazy()
class User(PolarsFrame):
    id: int
    age: int

pf = User(lf)
df = pf.select("id").collect()

# Or construct from python data:
pf2 = User({"id": [1], "age": [2]})
```

### Execution model

PlanFrame is always lazy:
- Chaining methods (like `.select(...)`) does **not** run Polars operations.
- `collect()` evaluates the full plan. If the source is a `polars.LazyFrame`, this naturally compiles into a single lazy query before collecting.

### Notes (Polars-specific)

- **Pivot**: `LazyFrame.pivot(...)` requires `on_columns` to be provided up-front (Polars must know the output schema prior to `collect()`). PlanFrame enforces this at execution time.
- **concat_vertical**: implemented via `polars.concat(..., how="vertical")`.
- **Join**: implemented via `LazyFrame.join(...)` / `DataFrame.join(...)` using the provided `on`, `how`, and `suffix`.

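The execution-model description above maps one-to-one onto plain Polars behaviour: the adapter's `select`/`filter`/`sort` methods (see `adapter.py` below) forward to the corresponding `LazyFrame` methods, so nothing runs until `collect()`. A minimal pure-Polars sketch of that model, with illustrative data:

```python
import polars as pl

lf = pl.DataFrame({"id": [1, 2, 3], "age": [10, 20, 30]}).lazy()

# Each chained call only extends the query plan; nothing executes yet.
plan = lf.filter(pl.col("age") > 15).select("id")

print(plan.explain())  # inspect the single optimized plan
print(plan.collect())  # the whole plan runs here, in one pass
```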
planframe_polars-0.1.0/README.md
@@ -0,0 +1,43 @@
## planframe-polars

Polars adapter package for PlanFrame. Import as `planframe_polars`.

### Usage

```python
import polars as pl
from dataclasses import dataclass

from planframe_polars import PolarsFrame


@dataclass(frozen=True)
class UserSchema:
    id: int
    age: int


lf = pl.DataFrame({"id": [1], "age": [2]}).lazy()
class User(PolarsFrame):
    id: int
    age: int

pf = User(lf)
df = pf.select("id").collect()

# Or construct from python data:
pf2 = User({"id": [1], "age": [2]})
```

### Execution model

PlanFrame is always lazy:
- Chaining methods (like `.select(...)`) does **not** run Polars operations.
- `collect()` evaluates the full plan. If the source is a `polars.LazyFrame`, this naturally compiles into a single lazy query before collecting.

### Notes (Polars-specific)

- **Pivot**: `LazyFrame.pivot(...)` requires `on_columns` to be provided up-front (Polars must know the output schema prior to `collect()`). PlanFrame enforces this at execution time.
- **concat_vertical**: implemented via `polars.concat(..., how="vertical")`.
- **Join**: implemented via `LazyFrame.join(...)` / `DataFrame.join(...)` using the provided `on`, `how`, and `suffix`.

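The concat and join notes above are thin wrappers over standard Polars calls. A minimal pure-Polars sketch of what `PolarsAdapter.concat_vertical` and `PolarsAdapter.join` (defined in `adapter.py` below) end up executing, with illustrative data:

```python
import polars as pl

users = pl.DataFrame({"id": [1, 2], "age": [30, 40]}).lazy()
orders = pl.DataFrame({"id": [1, 1, 2], "age": [7, 8, 9]}).lazy()

# join: the adapter coerces eager inputs to LazyFrame, then forwards
# on / how / suffix; the overlapping "age" column gets the "_right" suffix.
joined = users.join(orders, on=["id"], how="inner", suffix="_right")
print(joined.collect())

# concat_vertical: a thin wrapper around polars.concat(..., how="vertical").
stacked = pl.concat([users, users], how="vertical")
print(stacked.collect())
```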
planframe_polars-0.1.0/planframe_polars/adapter.py
@@ -0,0 +1,442 @@
from __future__ import annotations

from typing import Any, Literal, cast

import polars as pl

from planframe.backend.adapter import BaseAdapter
from planframe.expr.api import Expr
from planframe_polars.compile_expr import compile_expr

PolarsBackendFrame = pl.DataFrame | pl.LazyFrame


class PolarsAdapter(BaseAdapter[PolarsBackendFrame, pl.Expr]):
    name = "polars"

    def _collect_df(self, df: PolarsBackendFrame) -> pl.DataFrame:
        out = df.collect() if isinstance(df, pl.LazyFrame) else df
        # Polars' type stubs may model `collect()` as returning an intermediate query.
        if not isinstance(out, pl.DataFrame):
            raise TypeError("Expected Polars collect() to return a DataFrame")
        return out

    def select(self, df: PolarsBackendFrame, columns: tuple[str, ...]) -> PolarsBackendFrame:
        return df.select(list(columns))

    def drop(self, df: PolarsBackendFrame, columns: tuple[str, ...]) -> PolarsBackendFrame:
        return df.drop(list(columns))

    def rename(self, df: PolarsBackendFrame, mapping: dict[str, str]) -> PolarsBackendFrame:
        return df.rename(mapping)

    def with_column(self, df: PolarsBackendFrame, name: str, expr: pl.Expr) -> PolarsBackendFrame:
        return df.with_columns(expr.alias(name))

    def cast(self, df: PolarsBackendFrame, name: str, dtype: Any) -> PolarsBackendFrame:
        return df.with_columns(pl.col(name).cast(dtype))

    def filter(self, df: PolarsBackendFrame, predicate: pl.Expr) -> PolarsBackendFrame:
        return df.filter(predicate)

    def sort(
        self,
        df: PolarsBackendFrame,
        columns: tuple[str, ...],
        *,
        descending: bool = False,
        nulls_last: bool = False,
    ) -> PolarsBackendFrame:
        if not columns:
            return df
        return df.sort(list(columns), descending=descending, nulls_last=nulls_last)

    def unique(
        self,
        df: PolarsBackendFrame,
        subset: tuple[str, ...] | None,
        *,
        keep: str = "first",
        maintain_order: bool = False,
    ) -> PolarsBackendFrame:
        kwargs: dict[str, Any] = {"keep": keep, "maintain_order": maintain_order}
        if subset is not None:
            kwargs["subset"] = list(subset)
        return df.unique(**kwargs)  # type: ignore[arg-type]

    def duplicated(
        self,
        df: PolarsBackendFrame,
        subset: tuple[str, ...] | None,
        *,
        keep: str | bool = "first",
        out_name: str = "duplicated",
    ) -> PolarsBackendFrame:
        if keep is False:
            raise NotImplementedError(
                "duplicated(..., keep=False) is not supported in this adapter yet"
            )
        if keep not in {"first", "last"}:
            raise ValueError("keep must be 'first', 'last', or False")

        cols = list(subset) if subset is not None else None
        expr = pl.struct(cols) if cols is not None else pl.struct(pl.all())
        mask_expr = expr.is_duplicated()

        if isinstance(df, pl.LazyFrame):
            return df.select(mask_expr.alias(out_name))
        mask = df.select(mask_expr.alias(out_name))[out_name]
        return pl.DataFrame({out_name: mask})

    def compile_expr(self, expr: Any) -> pl.Expr:
        if not isinstance(expr, Expr):
            raise TypeError(f"Expected PlanFrame Expr, got {type(expr)!r}")
        return compile_expr(expr)

    def group_by_agg(
        self,
        df: PolarsBackendFrame,
        *,
        keys: tuple[str, ...],
        named_aggs: dict[str, tuple[str, str]],
    ) -> PolarsBackendFrame:
        if not keys:
            raise ValueError("keys must be non-empty")
        agg_exprs: list[pl.Expr] = []
        for out_name, (op, col) in named_aggs.items():
            e = pl.col(col)
            if op == "count":
                ex = e.count()
            elif op == "sum":
                ex = e.sum()
            elif op == "mean":
                ex = e.mean()
            elif op == "min":
                ex = e.min()
            elif op == "max":
                ex = e.max()
            elif op == "n_unique":
                ex = e.n_unique()
            else:
                raise ValueError(f"Unsupported agg op: {op!r}")
            agg_exprs.append(ex.alias(out_name))
        return df.group_by(list(keys)).agg(agg_exprs)

    def drop_nulls(
        self, df: PolarsBackendFrame, subset: tuple[str, ...] | None
    ) -> PolarsBackendFrame:
        if subset is None:
            return df.drop_nulls()
        return df.drop_nulls(list(subset))

    def fill_null(
        self, df: PolarsBackendFrame, value: Any, subset: tuple[str, ...] | None
    ) -> PolarsBackendFrame:
        if subset is None:
            return df.fill_null(value)
        exprs = [pl.col(c).fill_null(value) for c in subset]
        return df.with_columns(exprs)

    def melt(
        self,
        df: PolarsBackendFrame,
        *,
        id_vars: tuple[str, ...],
        value_vars: tuple[str, ...],
        variable_name: str,
        value_name: str,
    ) -> PolarsBackendFrame:
        # Prefer unpivot (polars deprecates melt on LazyFrame).
        return df.unpivot(
            index=list(id_vars),
            on=list(value_vars),
            variable_name=variable_name,
            value_name=value_name,
        )

    def join(
        self,
        left: PolarsBackendFrame,
        right: PolarsBackendFrame,
        *,
        on: tuple[str, ...],
        how: str = "inner",
        suffix: str = "_right",
    ) -> PolarsBackendFrame:
        if not on:
            raise ValueError("on must be non-empty")
        allowed_how = {
            "inner",
            "left",
            "right",
            "full",
            "semi",
            "anti",
            "cross",
        }
        if how not in allowed_how:
            raise ValueError(f"Unsupported join how={how!r}")

        # Keep plans always-lazy: coerce eager frames to LazyFrame.
        left_lf = left.lazy() if isinstance(left, pl.DataFrame) else left
        right_lf = right.lazy() if isinstance(right, pl.DataFrame) else right

        how_lit = cast(
            Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
            how,
        )
        return left_lf.join(right_lf, on=list(on), how=how_lit, suffix=suffix)

    def slice(
        self, df: PolarsBackendFrame, *, offset: int, length: int | None
    ) -> PolarsBackendFrame:
        return df.slice(offset, length)

    def head(self, df: PolarsBackendFrame, n: int) -> PolarsBackendFrame:
        return df.head(n)

    def tail(self, df: PolarsBackendFrame, n: int) -> PolarsBackendFrame:
        return df.tail(n)

    def concat_vertical(
        self, left: PolarsBackendFrame, right: PolarsBackendFrame
    ) -> PolarsBackendFrame:
        return pl.concat([left, right], how="vertical")

    def concat_horizontal(
        self, left: PolarsBackendFrame, right: PolarsBackendFrame
    ) -> PolarsBackendFrame:
        return pl.concat([left, right], how="horizontal")

    def pivot(
        self,
        df: PolarsBackendFrame,
        *,
        index: tuple[str, ...],
        on: str,
        values: str,
        agg: str = "first",
        on_columns: tuple[str, ...] | None = None,
        separator: str = "_",
    ) -> PolarsBackendFrame:
        allowed_agg = {
            "min",
            "max",
            "first",
            "last",
            "sum",
            "mean",
            "median",
            "len",  # polars name for "count" in pivot
            "count",
            "n_unique",
        }
        if agg not in allowed_agg:
            raise ValueError(f"Unsupported pivot agg={agg!r}")
        if agg == "count":
            agg_arg: Literal[
                "min",
                "max",
                "first",
                "last",
                "sum",
                "mean",
                "median",
                "len",
            ] | pl.Expr = "len"
        elif agg == "n_unique":
            agg_arg = pl.col(values).n_unique()
        else:
            agg_arg = cast(
                Literal[
                    "min",
                    "max",
                    "first",
                    "last",
                    "sum",
                    "mean",
                    "median",
                    "len",
                ],
                agg,
            )

        if isinstance(df, pl.LazyFrame):
            if on_columns is None:
                raise ValueError("Lazy pivot requires on_columns to be provided")
            return df.pivot(
                index=list(index),
                on=on,
                values=values,
                aggregate_function=agg_arg,
                on_columns=list(on_columns),
                separator=separator,
            )

        return df.pivot(
            index=list(index),
            on=on,
            values=values,
            aggregate_function=agg_arg,
            on_columns=list(on_columns) if on_columns is not None else None,
            separator=separator,
        )

    def write_parquet(
        self,
        df: PolarsBackendFrame,
        path: str,
        *,
        compression: str = "zstd",
        row_group_size: int | None = None,
        partition_by: tuple[str, ...] | None = None,
        storage_options: dict[str, Any] | None = None,
    ) -> None:
        out = self._collect_df(df)
        if compression not in {"uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4"}:
            raise ValueError(f"Unsupported parquet compression={compression!r}")
        comp_lit = cast(
            Literal["uncompressed", "snappy", "gzip", "brotli", "zstd", "lz4"],
            compression,
        )
        out.write_parquet(
            path,
            compression=comp_lit,
            row_group_size=row_group_size,
            partition_by=list(partition_by) if partition_by is not None else None,
            storage_options=storage_options,
        )

    def write_csv(
        self,
        df: PolarsBackendFrame,
        path: str,
        *,
        separator: str = ",",
        include_header: bool = True,
        storage_options: dict[str, Any] | None = None,
    ) -> None:
        out = self._collect_df(df)
        out.write_csv(
            path,
            separator=separator,
            include_header=include_header,
            storage_options=storage_options,
        )

    def write_ndjson(
        self, df: PolarsBackendFrame, path: str, *, storage_options: dict[str, Any] | None = None
    ) -> None:
        out = self._collect_df(df)
        # Polars write_ndjson does not currently accept storage_options.
        # The path may still be a cloud URI if the Polars build supports it implicitly.
        out.write_ndjson(path)

    def write_ipc(
        self,
        df: PolarsBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        storage_options: dict[str, Any] | None = None,
    ) -> None:
        out = self._collect_df(df)
        if compression not in {"uncompressed", "lz4", "zstd"}:
            raise ValueError(f"Unsupported ipc compression={compression!r}")
        comp_lit = cast(Literal["uncompressed", "lz4", "zstd"], compression)
        # Polars write_ipc does not currently accept storage_options.
        out.write_ipc(path, compression=comp_lit)

    def write_database(
        self,
        df: PolarsBackendFrame,
        *,
        table_name: str,
        connection: Any,
        if_table_exists: str = "fail",
        engine: str | None = None,
    ) -> None:
        out = self._collect_df(df)
        kwargs: dict[str, Any] = {"if_table_exists": if_table_exists}
        if engine is not None:
            kwargs["engine"] = engine
        out.write_database(table_name=table_name, connection=connection, **kwargs)

    def write_excel(self, df: PolarsBackendFrame, path: str, *, worksheet: str = "Sheet1") -> None:
        out = self._collect_df(df)
        out.write_excel(workbook=path, worksheet=worksheet)

    def write_delta(
        self,
        df: PolarsBackendFrame,
        target: str,
        *,
        mode: str = "error",
        storage_options: dict[str, Any] | None = None,
    ) -> None:
        out = self._collect_df(df)
        if mode not in {"error", "append", "overwrite", "ignore"}:
            raise ValueError(f"Unsupported delta mode={mode!r}")
        mode_lit = cast(Literal["error", "append", "overwrite", "ignore"], mode)
        out.write_delta(target, mode=mode_lit, storage_options=storage_options)

    def write_avro(
        self,
        df: PolarsBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        name: str = "",
    ) -> None:
        out = self._collect_df(df)
        if compression not in {"uncompressed", "snappy", "deflate"}:
            raise ValueError(f"Unsupported avro compression={compression!r}")
        comp_lit = cast(Literal["uncompressed", "snappy", "deflate"], compression)
        out.write_avro(path, compression=comp_lit, name=name)

    def explode(self, df: PolarsBackendFrame, column: str) -> PolarsBackendFrame:
        return df.explode(column)

    def unnest(self, df: PolarsBackendFrame, column: str) -> PolarsBackendFrame:
        return df.unnest(column)

    def drop_nulls_all(
        self, df: PolarsBackendFrame, subset: tuple[str, ...] | None
    ) -> PolarsBackendFrame:
        if subset is None:
            # Drop rows where *all* columns are null.
            return df.filter(~pl.all_horizontal(pl.all().is_null()))
        cols = list(subset)
        if not cols:
            return df
        mask = pl.all_horizontal([pl.col(c).is_null() for c in cols])
        return df.filter(~mask)

    def sample(
        self,
        df: PolarsBackendFrame,
        *,
        n: int | None = None,
        frac: float | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> PolarsBackendFrame:
        df2 = self._collect_df(df)
        kwargs: dict[str, Any] = {
            "with_replacement": with_replacement,
            "shuffle": shuffle,
            "seed": seed,
        }
        if n is not None:
            return df2.sample(n=n, **kwargs)
        return df2.sample(fraction=frac, **kwargs)

    def collect(self, df: PolarsBackendFrame) -> PolarsBackendFrame:
        return self._collect_df(df) if isinstance(df, pl.LazyFrame) else df

    def to_dicts(self, df: PolarsBackendFrame) -> list[dict[str, object]]:
        return self._collect_df(df).to_dicts()

    def to_dict(self, df: PolarsBackendFrame) -> dict[str, list[object]]:
        return self._collect_df(df).to_dict(as_series=False)  # type: ignore[return-value]

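In normal use the adapter above is driven by PlanFrame's plan executor, but its methods are stateless wrappers over Polars and can also be exercised directly, which is handy for testing. A minimal sketch with made-up data:

```python
import polars as pl

from planframe_polars.adapter import PolarsAdapter

adapter = PolarsAdapter()
lf = pl.DataFrame({"city": ["NY", "NY", "LA"], "sales": [10, 20, 5]}).lazy()

# group_by_agg: output column name -> (aggregation op, source column).
grouped = adapter.group_by_agg(
    lf,
    keys=("city",),
    named_aggs={"total": ("sum", "sales"), "n": ("count", "sales")},
)
print(adapter.to_dicts(grouped))

# duplicated: a single boolean column flagging repeated key combinations.
mask = adapter.duplicated(lf, subset=("city",), keep="first")
print(adapter.to_dicts(mask))
```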
planframe_polars-0.1.0/planframe_polars/compile_expr.py
@@ -0,0 +1,179 @@
from __future__ import annotations

from typing import Any, Literal, cast

import polars as pl

from planframe.backend.errors import PlanFrameExpressionError
from planframe.expr.api import (
    Add,
    And,
    Abs,
    Between,
    Ceil,
    Col,
    Coalesce,
    Clip,
    DtDay,
    DtMonth,
    DtYear,
    Eq,
    Exp,
    Expr,
    Floor,
    Ge,
    Gt,
    IfElse,
    IsIn,
    IsNotNull,
    IsNull,
    Log,
    Le,
    Lit,
    Lt,
    Mul,
    Ne,
    Not,
    Over,
    Or,
    Pow,
    Round,
    StrContains,
    StrEndsWith,
    StrLen,
    StrLower,
    StrReplace,
    StrSplit,
    StrStrip,
    StrStartsWith,
    StrUpper,
    Sub,
    Sqrt,
    TrueDiv,
    IsFinite,
    Xor,
)


def compile_expr(expr: Expr[Any]) -> pl.Expr:
    if isinstance(expr, Col):
        return pl.col(expr.name)
    if isinstance(expr, Lit):
        return pl.lit(expr.value)
    if isinstance(expr, Add):
        return compile_expr(expr.left) + compile_expr(expr.right)
    if isinstance(expr, Sub):
        return compile_expr(expr.left) - compile_expr(expr.right)
    if isinstance(expr, Mul):
        return compile_expr(expr.left) * compile_expr(expr.right)
    if isinstance(expr, TrueDiv):
        return compile_expr(expr.left) / compile_expr(expr.right)
    if isinstance(expr, Eq):
        return compile_expr(expr.left) == compile_expr(expr.right)
    if isinstance(expr, Ne):
        return compile_expr(expr.left) != compile_expr(expr.right)
    if isinstance(expr, Lt):
        return compile_expr(expr.left) < compile_expr(expr.right)
    if isinstance(expr, Le):
        return compile_expr(expr.left) <= compile_expr(expr.right)
    if isinstance(expr, Gt):
        return compile_expr(expr.left) > compile_expr(expr.right)
    if isinstance(expr, Ge):
        return compile_expr(expr.left) >= compile_expr(expr.right)
    if isinstance(expr, IsNull):
        return compile_expr(expr.value).is_null()
    if isinstance(expr, IsNotNull):
        return compile_expr(expr.value).is_not_null()
    if isinstance(expr, IsIn):
        return compile_expr(expr.value).is_in(list(expr.options))
    if isinstance(expr, And):
        return compile_expr(expr.left) & compile_expr(expr.right)
    if isinstance(expr, Or):
        return compile_expr(expr.left) | compile_expr(expr.right)
    if isinstance(expr, Not):
        return ~compile_expr(expr.value)
    if isinstance(expr, Xor):
        return compile_expr(expr.left) ^ compile_expr(expr.right)
    if isinstance(expr, Abs):
        return compile_expr(expr.value).abs()
    if isinstance(expr, Round):
        e = compile_expr(expr.value)
        return e.round(expr.ndigits) if expr.ndigits is not None else e.round()
    if isinstance(expr, Floor):
        return compile_expr(expr.value).floor()
    if isinstance(expr, Ceil):
        return compile_expr(expr.value).ceil()
    if isinstance(expr, Coalesce):
        return pl.coalesce([compile_expr(v) for v in expr.values])
    if isinstance(expr, IfElse):
        return (
            pl.when(compile_expr(expr.cond))
            .then(compile_expr(expr.then_value))
            .otherwise(compile_expr(expr.else_value))
        )
    if isinstance(expr, Over):
        e = compile_expr(expr.value)
        return e.over(
            partition_by=list(expr.partition_by),
            order_by=(list(expr.order_by) if expr.order_by is not None else None),
        )
    if isinstance(expr, Between):
        allowed_closed = {"left", "right", "both", "none"}
        if expr.closed not in allowed_closed:
            raise ValueError(f"Unsupported closed interval: {expr.closed!r}")
        closed_lit = cast(Literal["left", "right", "both", "none"], expr.closed)
        return compile_expr(expr.value).is_between(
            compile_expr(expr.low),
            compile_expr(expr.high),
            closed=closed_lit,
        )
    if isinstance(expr, Clip):
        e = compile_expr(expr.value)
        lower = compile_expr(expr.lower) if expr.lower is not None else None
        upper = compile_expr(expr.upper) if expr.upper is not None else None
        return e.clip(lower_bound=lower, upper_bound=upper)
    if isinstance(expr, Pow):
        return compile_expr(expr.base) ** compile_expr(expr.exponent)
    if isinstance(expr, Exp):
        return compile_expr(expr.value).exp()
    if isinstance(expr, Log):
        return compile_expr(expr.value).log()
    if isinstance(expr, StrContains):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.contains(expr.pattern, literal=expr.literal)
    if isinstance(expr, StrStartsWith):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.starts_with(expr.prefix)
    if isinstance(expr, StrEndsWith):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.ends_with(expr.suffix)
    if isinstance(expr, StrLower):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.to_lowercase()
    if isinstance(expr, StrUpper):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.to_uppercase()
    if isinstance(expr, StrLen):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.len_chars()
    if isinstance(expr, StrReplace):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.replace_all(expr.pattern, expr.replacement, literal=expr.literal)
    if isinstance(expr, StrStrip):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.strip_chars()
    if isinstance(expr, StrSplit):
        e = compile_expr(expr.value).cast(pl.Utf8)
        return e.str.split(expr.by)
    if isinstance(expr, DtYear):
        return compile_expr(expr.value).dt.year()
    if isinstance(expr, DtMonth):
        return compile_expr(expr.value).dt.month()
    if isinstance(expr, DtDay):
        return compile_expr(expr.value).dt.day()
    if isinstance(expr, Sqrt):
        return compile_expr(expr.value).sqrt()
    if isinstance(expr, IsFinite):
        return compile_expr(expr.value).is_finite()

    raise PlanFrameExpressionError(f"Unsupported expr node: {type(expr)!r}")

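For reference, the following sketch spells out the Polars expressions a few of the branches above produce. It builds them directly with the polars API (the PlanFrame `Expr` node constructors are not shown in this diff, so they are not used here); column names and data are illustrative:

```python
import polars as pl

# Between(value, low, high, closed)   ->  is_between(...) with an explicit bound mode
between = pl.col("age").is_between(pl.lit(0), pl.lit(120), closed="both")

# StrContains(value, pattern, literal) -> cast to Utf8 first, then str.contains(...)
contains = pl.col("name").cast(pl.Utf8).str.contains("an", literal=True)

# IfElse(cond, then_value, else_value) -> when / then / otherwise
label = (
    pl.when(pl.col("age") >= pl.lit(18))
    .then(pl.lit("adult"))
    .otherwise(pl.lit("minor"))
    .alias("label")
)

df = pl.DataFrame({"age": [10, 30], "name": ["Ann", "Bob"]})
print(df.select(between, contains, label))
```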
planframe_polars-0.1.0/planframe_polars/frame.py
@@ -0,0 +1,229 @@
from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import Any, ClassVar, Generic, Literal, TypeVar, cast

import polars as pl

from planframe.frame import Frame
from planframe_polars.adapter import PolarsAdapter, PolarsBackendFrame

SchemaT = TypeVar("SchemaT")

PolarsData = Mapping[str, Sequence[object]] | Sequence[Mapping[str, object]]


def _schema_defaults(schema: type[Any]) -> dict[str, object]:
    ann = dict(getattr(schema, "__dict__", {}).get("__annotations__", {}))
    out: dict[str, object] = {}
    for name in ann.keys():
        if name in getattr(schema, "__dict__", {}):
            out[name] = getattr(schema, name)
    return out


def _fill_missing_from_defaults(data: PolarsData, *, defaults: dict[str, object]) -> PolarsData:
    if not defaults:
        return data

    if isinstance(data, Mapping):
        data_map = cast(Mapping[str, Sequence[object]], data)
        if not data_map:
            return dict(data_map)
        # Infer row count from the first column.
        first = next(iter(data_map.values()))
        n = len(first)
        out: dict[str, list[object]] = {k: list(v) for k, v in data_map.items()}
        for k, dv in defaults.items():
            if k not in out:
                out[k] = [dv] * n
        return out

    # list-of-dicts
    out_rows: list[dict[str, object]] = []
    for row in data:
        r = dict(row)
        for k, dv in defaults.items():
            if k not in r:
                r[k] = dv
        out_rows.append(r)
    return out_rows


def _to_polars_backend_frame(
    data: PolarsData, *, schema: type[Any], lazy: bool
) -> PolarsBackendFrame:
    defaults = _schema_defaults(schema)
    data2 = _fill_missing_from_defaults(data, defaults=defaults)
    df = pl.DataFrame(data2)  # type: ignore[arg-type]
    return df.lazy() if lazy else df


class _PolarsFrameMeta(type):
    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
        # Allow normal dataclass construction when `Frame.source(...)` calls `cls(_data=..., ...)`.
        if "_data" in kwargs and "_adapter" in kwargs and "_plan" in kwargs and "_schema" in kwargs:
            return super().__call__(*args, **kwargs)

        data = args[0] if args else kwargs.pop("data")
        if isinstance(data, (pl.DataFrame, pl.LazyFrame)):
            raise TypeError(
                "PolarsFrame constructors accept only Python data (dict-of-lists or list-of-dicts). "
                "Use `Frame.source(...)` for advanced usage."
            )
        lazy = kwargs.pop("lazy", True)
        if kwargs:
            raise TypeError(f"Unexpected constructor kwargs: {sorted(kwargs)}")
        df = _to_polars_backend_frame(data, schema=cls, lazy=lazy)
        return PolarsFrame.source(
            df,
            adapter=PolarsFrame._adapter_singleton,
            schema=cast(type[SchemaT], cls),
        )


class PolarsFrame(
    Frame[SchemaT, PolarsBackendFrame, pl.Expr], Generic[SchemaT], metaclass=_PolarsFrameMeta
):
    """A PlanFrame `Frame` bound to the Polars backend."""

    _adapter_singleton: ClassVar[PolarsAdapter] = PolarsAdapter()
    __planframe_model__ = True

    @classmethod
    def scan_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = None,
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {"storage_options": storage_options}
        if hive_partitioning is not None:
            kwargs["hive_partitioning"] = hive_partitioning
        lf = pl.scan_parquet(path, **kwargs)
        return cls.source(lf, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_parquet_dataset(
        cls,
        path_or_glob: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        return cls.scan_parquet(
            path_or_glob,
            schema=schema,
            hive_partitioning=True,
            storage_options=storage_options,
        )

    @classmethod
    def scan_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        lf = pl.scan_csv(path, storage_options=storage_options)
        return cls.source(lf, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_ndjson(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        lf = pl.scan_ndjson(path, storage_options=storage_options)
        return cls.source(lf, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_ipc(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = None,
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {"storage_options": storage_options}
        if hive_partitioning is not None:
            kwargs["hive_partitioning"] = hive_partitioning
        lf = pl.scan_ipc(path, **kwargs)
        return cls.source(lf, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_delta(
        cls,
        source: str,
        *,
        schema: type[SchemaT],
        version: int | str | None = None,
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {"storage_options": storage_options}
        if version is not None:
            kwargs["version"] = version
        lf = pl.scan_delta(source, **kwargs)
        return cls.source(lf, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def read_delta(
        cls,
        source: str,
        *,
        schema: type[SchemaT],
        version: int | str | None = None,
        storage_options: dict[str, Any] | None = None,
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {"storage_options": storage_options}
        if version is not None:
            kwargs["version"] = version
        df = pl.read_delta(source, **kwargs)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def read_excel(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        sheet_name: str | None = None,
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {}
        if sheet_name is not None:
            kwargs["sheet_name"] = sheet_name
        df = pl.read_excel(path, **kwargs)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def read_avro(cls, path: str, *, schema: type[SchemaT]) -> PolarsFrame[SchemaT]:
        df = pl.read_avro(path)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def read_database(
        cls, query: str, *, connection: Any, schema: type[SchemaT]
    ) -> PolarsFrame[SchemaT]:
        df = pl.read_database(query=query, connection=connection)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def read_database_uri(
        cls,
        query: str,
        *,
        uri: str,
        engine: Literal["connectorx", "adbc"] | None = None,
        schema: type[SchemaT],
    ) -> PolarsFrame[SchemaT]:
        kwargs: dict[str, Any] = {}
        if engine is not None:
            kwargs["engine"] = engine
        df = pl.read_database_uri(query=query, uri=uri, **kwargs)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

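The two module-level helpers at the top of `frame.py` give subclasses lightweight column defaults: any annotated class attribute that also has a class-level value fills a column missing from the constructor data. A small sketch that exercises those private helpers directly (the `Event` class is illustrative):

```python
import polars as pl

from planframe_polars.frame import _fill_missing_from_defaults, _schema_defaults


class Event:
    # Annotated fields; `source` also carries a class-level default value.
    id: int
    source: str = "unknown"


defaults = _schema_defaults(Event)  # {'source': 'unknown'}
data = _fill_missing_from_defaults({"id": [1, 2]}, defaults=defaults)
print(pl.DataFrame(data))  # the missing "source" column is filled with "unknown"
```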
planframe_polars-0.1.0/planframe_polars/frame.pyi
@@ -0,0 +1,103 @@
from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import Any, Literal, TypeVar

import polars as pl

from planframe.frame import Frame

SchemaT = TypeVar("SchemaT")

PolarsBackendFrame = pl.DataFrame | pl.LazyFrame

class PolarsFrame(Frame[Any, PolarsBackendFrame, pl.Expr]):
    def __new__(
        cls,
        data: Mapping[str, Sequence[object]] | Sequence[Mapping[str, object]],
        *,
        lazy: bool = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = ...,
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_parquet_dataset(
        cls,
        path_or_glob: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_ndjson(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_ipc(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = ...,
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def scan_delta(
        cls,
        source: str,
        *,
        schema: type[SchemaT],
        version: int | str | None = ...,
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def read_delta(
        cls,
        source: str,
        *,
        schema: type[SchemaT],
        version: int | str | None = ...,
        storage_options: dict[str, Any] | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def read_excel(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        sheet_name: str | None = ...,
    ) -> PolarsFrame: ...
    @classmethod
    def read_avro(cls, path: str, *, schema: type[SchemaT]) -> PolarsFrame: ...
    @classmethod
    def read_database(
        cls, query: str, *, connection: Any, schema: type[SchemaT]
    ) -> PolarsFrame: ...
    @classmethod
    def read_database_uri(
        cls,
        query: str,
        *,
        uri: str,
        engine: Literal["connectorx", "adbc"] | None = ...,
        schema: type[SchemaT],
    ) -> PolarsFrame: ...

planframe_polars-0.1.0/pyproject.toml
@@ -0,0 +1,32 @@
[build-system]
requires = ["hatchling>=1.25"]
build-backend = "hatchling.build"

[project]
name = "planframe-polars"
version = "0.1.0"
description = "Polars backend adapter for PlanFrame."
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
authors = [{ name = "PlanFrame Contributors" }]
classifiers = [
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3 :: Only",
    "Programming Language :: Python :: 3.10",
    "License :: OSI Approved :: MIT License",
    "Typing :: Typed",
]
dependencies = [
    "planframe>=0.1.0,<0.2.0",
    "polars>=0.20",
]

[project.urls]
Repository = "https://github.com/eddiethedean/planframe"
Documentation = "https://github.com/eddiethedean/planframe/blob/main/README.md"
Issues = "https://github.com/eddiethedean/planframe/issues"

[tool.hatch.build.targets.wheel]
packages = ["planframe_polars"]