planframe-sparkless 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- planframe_sparkless-0.1.0/.gitignore +53 -0
- planframe_sparkless-0.1.0/PKG-INFO +80 -0
- planframe_sparkless-0.1.0/README.md +61 -0
- planframe_sparkless-0.1.0/planframe_sparkless/__init__.py +5 -0
- planframe_sparkless-0.1.0/planframe_sparkless/_spark.py +11 -0
- planframe_sparkless-0.1.0/planframe_sparkless/adapter.py +595 -0
- planframe_sparkless-0.1.0/planframe_sparkless/compile_expr.py +188 -0
- planframe_sparkless-0.1.0/planframe_sparkless/frame.py +159 -0
- planframe_sparkless-0.1.0/pyproject.toml +32 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Python bytecode / caches
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Virtual environments
|
|
7
|
+
.venv/
|
|
8
|
+
.venv*/
|
|
9
|
+
venv/
|
|
10
|
+
ENV/
|
|
11
|
+
env/
|
|
12
|
+
.env/
|
|
13
|
+
.release-venv/
|
|
14
|
+
.pandas-smoke-venv/
|
|
15
|
+
|
|
16
|
+
# Packaging / build artifacts
|
|
17
|
+
build/
|
|
18
|
+
dist/
|
|
19
|
+
dist-ci/
|
|
20
|
+
dist-release/
|
|
21
|
+
*.egg-info/
|
|
22
|
+
*.egg
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
.python-version
|
|
25
|
+
.installed.cfg
|
|
26
|
+
|
|
27
|
+
# Test / coverage artifacts
|
|
28
|
+
.hypothesis/
|
|
29
|
+
.pytest_cache/
|
|
30
|
+
.coverage
|
|
31
|
+
.coverage.*
|
|
32
|
+
coverage.xml
|
|
33
|
+
htmlcov/
|
|
34
|
+
site/
|
|
35
|
+
.tox/
|
|
36
|
+
.nox/
|
|
37
|
+
|
|
38
|
+
# Type check / lint caches
|
|
39
|
+
.mypy_cache/
|
|
40
|
+
.pyright/
|
|
41
|
+
.ruff_cache/
|
|
42
|
+
.ty_cache/
|
|
43
|
+
|
|
44
|
+
# Notebook checkpoints
|
|
45
|
+
.ipynb_checkpoints/
|
|
46
|
+
|
|
47
|
+
# IDE/editor settings
|
|
48
|
+
.vscode/
|
|
49
|
+
.idea/
|
|
50
|
+
|
|
51
|
+
# OS files
|
|
52
|
+
.DS_Store
|
|
53
|
+
Thumbs.db
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: planframe-sparkless
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: sparkless backend adapter for PlanFrame (SparkFrame UI + sparkless engine).
|
|
5
|
+
Project-URL: Repository, https://github.com/eddiethedean/planframe
|
|
6
|
+
Project-URL: Documentation, https://planframe.readthedocs.io/en/latest/planframe_sparkless/
|
|
7
|
+
Project-URL: Issues, https://github.com/eddiethedean/planframe/issues
|
|
8
|
+
Author: PlanFrame Contributors
|
|
9
|
+
License: MIT
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Typing :: Typed
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Requires-Dist: planframe<2.0.0,>=1.0.0
|
|
17
|
+
Requires-Dist: sparkless>=4.5
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
## planframe-sparkless
|
|
21
|
+
|
|
22
|
+
[](https://planframe.readthedocs.io/en/latest/planframe_sparkless/)
|
|
23
|
+
[](../../LICENSE)
|
|
24
|
+
|
|
25
|
+
Sparkless adapter package for PlanFrame. Import as `planframe_sparkless`.
|
|
26
|
+
|
|
27
|
+
This package:
|
|
28
|
+
|
|
29
|
+
- uses the **PySpark-like UI** from `planframe.spark` (`SparkFrame`)
|
|
30
|
+
- executes plans using the **`sparkless`** engine (no JVM)
|
|
31
|
+
|
|
32
|
+
Documentation (ReadTheDocs):
|
|
33
|
+
|
|
34
|
+
- Sparkless track (end users): `https://planframe.readthedocs.io/en/latest/planframe_sparkless/`
|
|
35
|
+
|
|
36
|
+
### Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install planframe-sparkless
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Quickstart
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from planframe.expr import add, col, lit
|
|
46
|
+
from planframe_sparkless import SparklessFrame
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class User(SparklessFrame):
|
|
50
|
+
id: int
|
|
51
|
+
x: int
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
pf = User([{"id": 1, "x": 2}, {"id": 2, "x": 3}])
|
|
55
|
+
|
|
56
|
+
out = (
|
|
57
|
+
pf.select("id", "x")
|
|
58
|
+
.withColumn("x2", add(col("x"), lit(1)))
|
|
59
|
+
.where(pf["x"] > lit(2))
|
|
60
|
+
.select("id", "x2")
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
print(out.to_dicts())
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Execution model (PlanFrame)
|
|
67
|
+
|
|
68
|
+
- PlanFrame is **always lazy**: chaining does not execute backend work.
|
|
69
|
+
- Materialization boundaries:
|
|
70
|
+
- `collect()` returns `list[pydantic.BaseModel]`
|
|
71
|
+
- `collect_backend()` returns the sparkless backend dataframe object
|
|
72
|
+
- `to_dicts()` / `to_dict()` export rows/columns
|
|
73
|
+
- Async equivalents: `acollect()` / `ato_dicts()` / `ato_dict()`
|
|
74
|
+
|
|
75
|
+
### Notes / limitations
|
|
76
|
+
|
|
77
|
+
- This adapter aims to support a practical subset of Spark-like operations using `sparkless`.
|
|
78
|
+
- Row streaming: `stream_dicts()` currently materializes via `to_dicts()` (sparkless does not expose an efficient local iterator API yet).
|
|
79
|
+
- For backend-agnostic semantics and supported transforms, see the core docs: `https://planframe.readthedocs.io/en/latest/planframe/`
|
|
80
|
+
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
## planframe-sparkless
|
|
2
|
+
|
|
3
|
+
[](https://planframe.readthedocs.io/en/latest/planframe_sparkless/)
|
|
4
|
+
[](../../LICENSE)
|
|
5
|
+
|
|
6
|
+
Sparkless adapter package for PlanFrame. Import as `planframe_sparkless`.
|
|
7
|
+
|
|
8
|
+
This package:
|
|
9
|
+
|
|
10
|
+
- uses the **PySpark-like UI** from `planframe.spark` (`SparkFrame`)
|
|
11
|
+
- executes plans using the **`sparkless`** engine (no JVM)
|
|
12
|
+
|
|
13
|
+
Documentation (ReadTheDocs):
|
|
14
|
+
|
|
15
|
+
- Sparkless track (end users): `https://planframe.readthedocs.io/en/latest/planframe_sparkless/`
|
|
16
|
+
|
|
17
|
+
### Install
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install planframe-sparkless
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
### Quickstart
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from planframe.expr import add, col, lit
|
|
27
|
+
from planframe_sparkless import SparklessFrame
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class User(SparklessFrame):
|
|
31
|
+
id: int
|
|
32
|
+
x: int
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
pf = User([{"id": 1, "x": 2}, {"id": 2, "x": 3}])
|
|
36
|
+
|
|
37
|
+
out = (
|
|
38
|
+
pf.select("id", "x")
|
|
39
|
+
.withColumn("x2", add(col("x"), lit(1)))
|
|
40
|
+
.where(pf["x"] > lit(2))
|
|
41
|
+
.select("id", "x2")
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
print(out.to_dicts())
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Execution model (PlanFrame)
|
|
48
|
+
|
|
49
|
+
- PlanFrame is **always lazy**: chaining does not execute backend work.
|
|
50
|
+
- Materialization boundaries:
|
|
51
|
+
- `collect()` returns `list[pydantic.BaseModel]`
|
|
52
|
+
- `collect_backend()` returns the sparkless backend dataframe object
|
|
53
|
+
- `to_dicts()` / `to_dict()` export rows/columns
|
|
54
|
+
- Async equivalents: `acollect()` / `ato_dicts()` / `ato_dict()`
|
|
55
|
+
|
|
56
|
+
### Notes / limitations
|
|
57
|
+
|
|
58
|
+
- This adapter aims to support a practical subset of Spark-like operations using `sparkless`.
|
|
59
|
+
- Row streaming: `stream_dicts()` currently materializes via `to_dicts()` (sparkless does not expose an efficient local iterator API yet).
|
|
60
|
+
- For backend-agnostic semantics and supported transforms, see the core docs: `https://planframe.readthedocs.io/en/latest/planframe/`
|
|
61
|
+
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from functools import lru_cache
|
|
4
|
+
|
|
5
|
+
from sparkless.sql import SparkSession
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@lru_cache(maxsize=1)
def _spark() -> SparkSession:
    """Return the process-wide sparkless ``SparkSession``, creating it on first use.

    ``lru_cache(maxsize=1)`` on a zero-argument function makes this a lazy
    singleton: every caller shares one session.
    """
    # `SparkSession` is lightweight in sparkless and doesn’t require a JVM.
    return SparkSession("planframe_sparkless")
|
|
@@ -0,0 +1,595 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from typing import Any, Literal, cast
|
|
5
|
+
|
|
6
|
+
from sparkless.sql import functions as F
|
|
7
|
+
from sparkless.sql.window import Window
|
|
8
|
+
|
|
9
|
+
from planframe.backend.adapter import (
|
|
10
|
+
BaseAdapter,
|
|
11
|
+
ColumnName,
|
|
12
|
+
Columns,
|
|
13
|
+
CompiledJoinKey,
|
|
14
|
+
CompiledProjectItem,
|
|
15
|
+
CompiledSortKey,
|
|
16
|
+
)
|
|
17
|
+
from planframe.backend.errors import PlanFrameBackendError, PlanFrameExpressionError
|
|
18
|
+
from planframe.execution_options import ExecutionOptions
|
|
19
|
+
from planframe.plan.join_options import JoinOptions
|
|
20
|
+
from planframe.plan.nodes import UnnestItem
|
|
21
|
+
from planframe.schema.ir import Schema
|
|
22
|
+
from planframe.typing.scalars import Scalar
|
|
23
|
+
from planframe.typing.storage import StorageOptions
|
|
24
|
+
from planframe_sparkless._spark import _spark
|
|
25
|
+
from planframe_sparkless.compile_expr import compile_expr
|
|
26
|
+
|
|
27
|
+
SparklessBackendFrame = Any # runtime type is `builtins.PyDataFrame`
|
|
28
|
+
SparklessBackendExpr = Any # runtime type is `builtins.PyColumn`
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SparklessAdapter(BaseAdapter[SparklessBackendFrame, SparklessBackendExpr]):
    """PlanFrame backend adapter executing plans with the ``sparkless`` engine.

    Backend frames are sparkless DataFrames (lazy plans); backend expressions
    are sparkless Columns. Capabilities sparkless does not support raise
    ``PlanFrameBackendError`` rather than silently degrading.
    """

    name = "sparkless"

    # ---- AdapterReader surface (used by SparklessFrame classmethods) ----
    def scan_parquet(
        self,
        path: str,
        *,
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Read a Parquet file. Partitioning/storage options are accepted but unused."""
        _ = hive_partitioning, storage_options
        return _spark().read.parquet(path)

    def scan_parquet_dataset(
        self, path_or_glob: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Read a Parquet dataset; globs are passed through to the reader."""
        _ = storage_options
        # Spark-style readers generally accept globs as path patterns.
        return _spark().read.parquet(path_or_glob)

    def scan_csv(
        self, path: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Read a CSV file with the reader's default options."""
        _ = storage_options
        return _spark().read.csv(path)

    def scan_ndjson(
        self, path: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Read newline-delimited JSON (Spark's default JSON layout)."""
        _ = storage_options
        return _spark().read.json(path)

    def scan_ipc(
        self,
        path: str,
        *,
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = path, hive_partitioning, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement scan_ipc")

    def scan_delta(
        self,
        source: str,
        *,
        version: int | str | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = source, version, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement scan_delta")

    def read_delta(
        self,
        source: str,
        *,
        version: int | str | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = source, version, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement read_delta")

    def read_excel(
        self,
        path: str,
        *,
        sheet_name: str | None = None,
    ) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = path, sheet_name
        raise PlanFrameBackendError("sparkless adapter does not implement read_excel")

    def read_avro(self, path: str) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = path
        raise PlanFrameBackendError("sparkless adapter does not implement read_avro")

    def read_database(self, query: str, *, connection: object) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = query, connection
        raise PlanFrameBackendError("sparkless adapter does not implement read_database")

    def read_database_uri(
        self,
        query: str,
        *,
        uri: str,
        engine: Literal["connectorx", "adbc"] | None = None,
    ) -> SparklessBackendFrame:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = query, uri, engine
        raise PlanFrameBackendError("sparkless adapter does not implement read_database_uri")

    # ---- Core transforms ----
    def select(self, df: SparklessBackendFrame, columns: Columns) -> SparklessBackendFrame:
        """Project the named columns, in order."""
        return df.select(*columns)

    def project(
        self,
        df: SparklessBackendFrame,
        items: tuple[CompiledProjectItem[SparklessBackendExpr], ...],
    ) -> SparklessBackendFrame:
        """Project a mix of plain column references and aliased compiled expressions."""
        cols: list[Any] = []
        for it in items:
            if it.from_column is not None:
                cols.append(it.from_column)
            elif it.expr is not None:
                cols.append(cast(Any, it.expr).alias(it.name))
            else:
                # Both fields empty would indicate a planner bug upstream.
                raise AssertionError("Invalid CompiledProjectItem")
        return df.select(*cols)

    def drop(
        self, df: SparklessBackendFrame, columns: Columns, *, strict: bool = True
    ) -> SparklessBackendFrame:
        """Drop columns; with ``strict`` missing columns raise instead of being ignored."""
        existing = set(df.columns)
        if strict:
            missing = [c for c in columns if c not in existing]
            if missing:
                raise PlanFrameBackendError(f"Columns not found for drop: {missing}")
            return df.drop(*columns)

        cols2 = tuple(c for c in columns if c in existing)
        if not cols2:
            return df
        return df.drop(*cols2)

    def rename(
        self,
        df: SparklessBackendFrame,
        mapping: dict[ColumnName, ColumnName],
        *,
        strict: bool = True,
    ) -> SparklessBackendFrame:
        """Rename columns per ``mapping``; with ``strict`` missing sources raise."""
        if strict:
            missing = [c for c in mapping if c not in set(df.columns)]
            if missing:
                raise PlanFrameBackendError(f"Columns not found for rename: {missing}")
        out = df
        for old, new in mapping.items():
            if old in set(out.columns):
                out = out.withColumnRenamed(old, new)
        return out

    def with_column(
        self, df: SparklessBackendFrame, name: str, expr: SparklessBackendExpr
    ) -> SparklessBackendFrame:
        """Add or replace column ``name`` with a compiled expression."""
        return df.withColumn(name, expr)

    def cast(self, df: SparklessBackendFrame, name: str, dtype: object) -> SparklessBackendFrame:
        """Cast column ``name`` to a Spark SQL type string (e.g. ``'int'``)."""
        # PlanFrame dtypes are backend-agnostic; sparkless expects Spark SQL type strings.
        # We accept `object` here and rely on adapter users passing strings when needed.
        if not isinstance(dtype, str):
            raise PlanFrameBackendError(
                "sparkless cast expects dtype as Spark SQL string (e.g. 'int')"
            )
        return df.withColumn(name, F.col(name).cast(dtype))

    def with_row_count(
        self, df: SparklessBackendFrame, *, name: str = "row_nr", offset: int = 0
    ) -> SparklessBackendFrame:
        """Add a 0-based (plus ``offset``) row-number column."""
        # Spark requires an ordering for row_number(). Sparkless does not accept ordering
        # by a pure literal expression, so we order by the first column.
        first_col = df.columns[0] if df.columns else None
        if first_col is None:
            raise PlanFrameBackendError("Cannot add row count to empty-column sparkless DataFrame")
        w = Window.orderBy(F.col(first_col))
        # row_number() is 1-based; subtract one to make the count 0-based.
        return df.withColumn(name, F.row_number().over(w) + F.lit(offset) - F.lit(1))

    def filter(
        self, df: SparklessBackendFrame, predicate: SparklessBackendExpr
    ) -> SparklessBackendFrame:
        """Keep rows matching a compiled boolean predicate."""
        return df.filter(predicate)

    def sort(
        self,
        df: SparklessBackendFrame,
        keys: tuple[CompiledSortKey[SparklessBackendExpr], ...],
        *,
        descending: tuple[bool, ...],
        nulls_last: tuple[bool, ...],
    ) -> SparklessBackendFrame:
        """Sort by keys with per-key direction and null placement."""
        # sparkless Columns support Spark-style null ordering (asc_nulls_* / desc_nulls_*).
        cols: list[Any] = []
        for k, desc, nl in zip(keys, descending, nulls_last, strict=True):
            if k.column is not None:
                c = F.col(k.column)
            else:
                if k.expr is None:
                    raise PlanFrameBackendError("Sort key expr cannot be None")
                c = k.expr
            if desc:
                cols.append(c.desc_nulls_last() if nl else c.desc_nulls_first())
            else:
                cols.append(c.asc_nulls_last() if nl else c.asc_nulls_first())
        return df.orderBy(*cols)

    def unique(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        keep: str = "first",
        maintain_order: bool = False,
    ) -> SparklessBackendFrame:
        """Deduplicate rows. ``keep``/``maintain_order`` are accepted but unused."""
        _ = keep, maintain_order
        if subset is None:
            return df.distinct()
        return df.dropDuplicates(list(subset))

    def duplicated(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        keep: str | bool = "first",
        out_name: str = "duplicated",
    ) -> SparklessBackendFrame:
        """Flag rows whose key combination occurs more than once.

        Approximated via a window count > 1; ``keep`` semantics are not honored
        (every row of a duplicate group is flagged).
        """
        _ = keep
        cols = list(subset) if subset is not None else list(df.columns)
        w = Window.partitionBy(*[F.col(c) for c in cols]).orderBy(F.lit(1))
        return df.withColumn(out_name, (F.count(F.lit(1)).over(w) > F.lit(1)))

    def group_by_agg(
        self,
        df: SparklessBackendFrame,
        *,
        keys: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        named_aggs: dict[ColumnName, Any],
    ) -> SparklessBackendFrame:
        """Group by key columns/expressions and apply named aggregations.

        ``named_aggs`` values are either ``(op, column)`` tuples for the builtin
        ops below, or pre-compiled backend expressions (aliased to the out name).
        """
        group_cols: list[Any] = []
        for k in keys:
            if k.column is not None:
                group_cols.append(F.col(k.column))
            else:
                group_cols.append(k.expr)
        g = df.groupBy(*group_cols)

        aggs: list[Any] = []
        for out_name, spec in named_aggs.items():
            if isinstance(spec, tuple):
                op, col_name = spec
                if op == "count":
                    aggs.append(F.count(F.col(col_name)).alias(out_name))
                elif op == "sum":
                    aggs.append(F.sum(F.col(col_name)).alias(out_name))
                elif op == "mean":
                    aggs.append(F.avg(F.col(col_name)).alias(out_name))
                elif op == "min":
                    aggs.append(F.min(F.col(col_name)).alias(out_name))
                elif op == "max":
                    aggs.append(F.max(F.col(col_name)).alias(out_name))
                elif op == "n_unique":
                    aggs.append(F.countDistinct(F.col(col_name)).alias(out_name))
                else:
                    raise PlanFrameBackendError(f"Unsupported aggregation op: {op!r}")
            else:
                aggs.append(cast(Any, spec).alias(out_name))
        return g.agg(*aggs)

    def group_by_dynamic_agg(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        raise PlanFrameBackendError("sparkless adapter does not implement dynamic group_by yet")

    def rolling_agg(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        raise PlanFrameBackendError("sparkless adapter does not implement rolling_agg yet")

    def drop_nulls(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        how: Literal["any", "all"] = "any",
        threshold: int | None = None,
    ) -> SparklessBackendFrame:
        """Drop null rows; ``threshold`` (min non-null count) overrides ``how``."""
        # Spark uses DataFrame.na.drop
        subset_list = None if subset is None else list(subset)
        if threshold is not None:
            return df.na.drop(thresh=threshold, subset=subset_list)
        return df.na.drop(how=how, subset=subset_list)

    def fill_null(
        self,
        df: SparklessBackendFrame,
        value: Scalar | SparklessBackendExpr | None,
        subset: Columns | None,
        *,
        strategy: str | None = None,
    ) -> SparklessBackendFrame:
        """Fill nulls with a scalar value; strategies and expressions are unsupported."""
        _ = strategy
        subset_list = None if subset is None else list(subset)
        if value is None:
            raise PlanFrameBackendError("sparkless fill_null does not support value=None")
        if isinstance(value, (int, float, str, bool)):
            return df.na.fill(value=value, subset=subset_list)
        raise PlanFrameBackendError("sparkless fill_null only supports scalar values currently")

    def melt(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        raise PlanFrameBackendError("sparkless adapter does not implement melt yet")

    def join(
        self,
        left: SparklessBackendFrame,
        right: SparklessBackendFrame,
        *,
        left_on: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        right_on: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        how: str = "inner",
        suffix: str = "_right",
        options: JoinOptions | None = None,
    ) -> SparklessBackendFrame:
        """Join two frames; no keys means cross join. ``suffix``/``options`` unused."""
        _ = suffix, options
        if not left_on and not right_on:
            return left.crossJoin(right)
        if len(left_on) != len(right_on):
            raise ValueError("Join keys must match in length")

        # Prefer Spark-style ``on=`` / ``left_on=``/``right_on=`` so join keys are not
        # ambiguous when column names overlap (unqualified ``F.col`` in a boolean ``on``).
        simple_name_keys = True
        left_names: list[str] = []
        right_names: list[str] = []
        for lk, rk in zip(left_on, right_on, strict=True):
            if lk.column is None or lk.expr is not None or rk.column is None or rk.expr is not None:
                simple_name_keys = False
                break
            left_names.append(lk.column)
            right_names.append(rk.column)

        if simple_name_keys:
            if left_names == right_names:
                on_arg = left_names[0] if len(left_names) == 1 else left_names
                return left.join(right, on=on_arg, how=how)
            return left.join(right, left_on=left_names, right_on=right_names, how=how)

        # Fallback for expression keys: build a boolean join condition.
        conds: list[Any] = []
        for lk, rk in zip(left_on, right_on, strict=True):
            lcol = F.col(lk.column) if lk.column is not None else lk.expr
            rcol = F.col(rk.column) if rk.column is not None else rk.expr
            conds.append(lcol == rcol)
        cond = conds[0]
        for c in conds[1:]:
            cond = cond & c
        return left.join(right, on=cond, how=how)

    def slice(
        self, df: SparklessBackendFrame, *, offset: int, length: int | None
    ) -> SparklessBackendFrame:
        """Take ``length`` rows from the start; non-zero offsets are unsupported."""
        if offset != 0:
            raise PlanFrameBackendError("sparkless adapter does not support offset slicing yet")
        return df.limit(length) if length is not None else df

    def head(self, df: SparklessBackendFrame, n: int) -> SparklessBackendFrame:
        """Return the first ``n`` rows (lazy limit)."""
        return df.limit(n)

    def tail(self, df: SparklessBackendFrame, n: int) -> SparklessBackendFrame:
        """Return the last ``n`` rows.

        No direct tail in Spark; approximated by collecting and re-creating a
        frame, which materializes the whole plan.
        """
        if n <= 0:
            # Guard: ``collect()[-0:]`` would return ALL rows, not zero.
            return df.limit(0)
        rows = df.collect()[-n:]
        dicts: list[dict[str, object]] = []
        for r in rows:
            if hasattr(r, "asDict"):
                dicts.append(cast(dict[str, object], r.asDict()))
            else:
                raise PlanFrameBackendError(
                    f"Unexpected row type from sparkless collect(): {type(r)!r}"
                )
        return _spark().createDataFrame(dicts)

    def concat_vertical(
        self, left: SparklessBackendFrame, right: SparklessBackendFrame
    ) -> SparklessBackendFrame:
        """Stack frames by column name, null-filling columns missing on one side."""
        return left.unionByName(right, allowMissingColumns=True)

    def concat_horizontal(
        self, left: SparklessBackendFrame, right: SparklessBackendFrame
    ) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        raise PlanFrameBackendError("sparkless adapter does not implement concat_horizontal yet")

    def pivot(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        raise PlanFrameBackendError("sparkless adapter does not implement pivot yet")

    # ---- Writes ----
    def write_parquet(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "zstd",
        row_group_size: int | None = None,
        partition_by: tuple[str, ...] | None = None,
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Write Parquet; compression/row-group/partition options are unused."""
        _ = compression, row_group_size, partition_by, storage_options
        df.write.parquet(path)

    def write_csv(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        separator: str = ",",
        include_header: bool = True,
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Write CSV with the given separator and header flag."""
        _ = storage_options
        df.write.csv(path, sep=separator, header=include_header)

    def write_ndjson(
        self, df: SparklessBackendFrame, path: str, *, storage_options: StorageOptions | None = None
    ) -> None:
        """Write newline-delimited JSON."""
        _ = storage_options
        df.write.json(path)

    def write_ipc(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = df, path, compression, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement IPC writing")

    def write_database(
        self,
        df: SparklessBackendFrame,
        *,
        table_name: str,
        connection: object,
        if_table_exists: str = "fail",
        engine: str | None = None,
    ) -> None:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = df, table_name, connection, if_table_exists, engine
        raise PlanFrameBackendError("sparkless adapter does not implement database writing")

    def write_excel(
        self, df: SparklessBackendFrame, path: str, *, worksheet: str = "Sheet1"
    ) -> None:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = df, path, worksheet
        raise PlanFrameBackendError("sparkless adapter does not implement Excel writing")

    def write_delta(
        self,
        df: SparklessBackendFrame,
        target: str,
        *,
        mode: str = "error",
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = df, target, mode, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement Delta writing")

    def write_avro(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        name: str = "",
    ) -> None:
        """Not supported by sparkless; always raises ``PlanFrameBackendError``."""
        _ = df, path, compression, name
        raise PlanFrameBackendError("sparkless adapter does not implement Avro writing")

    # ---- Nested/array ops ----
    def explode(
        self, df: SparklessBackendFrame, columns: Columns, *, outer: bool = False
    ) -> SparklessBackendFrame:
        """Explode list columns row-wise.

        ``outer=True`` keeps rows whose arrays are null/empty when the engine
        exposes ``explode_outer``; otherwise it falls back to plain explode
        (previously ``outer`` was silently ignored).
        """
        fn = getattr(F, "explode_outer", F.explode) if outer else F.explode
        out = df
        for c in columns:
            out = out.withColumn(c, fn(F.col(c)))
        return out

    def unnest(
        self, df: SparklessBackendFrame, items: tuple[UnnestItem, ...]
    ) -> SparklessBackendFrame:
        """Not supported yet; always raises ``PlanFrameBackendError``."""
        _ = df, items
        raise PlanFrameBackendError("sparkless adapter does not implement unnest yet")

    def posexplode(
        self,
        df: SparklessBackendFrame,
        column: str,
        *,
        pos: str = "pos",
        value: str | None = None,
        outer: bool = False,
    ) -> SparklessBackendFrame:
        """Explode ``column`` with element positions; ``outer`` is unused."""
        _ = outer
        value_name = value or column
        return df.select("*", F.posexplode(F.col(column)).alias(pos, value_name))

    def drop_nulls_all(
        self, df: SparklessBackendFrame, subset: tuple[str, ...] | None
    ) -> SparklessBackendFrame:
        """Drop rows where ALL of ``subset`` (or all columns) are null."""
        return self.drop_nulls(df, subset, how="all")

    def sample(
        self,
        df: SparklessBackendFrame,
        *,
        n: int | None = None,
        frac: float | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> SparklessBackendFrame:
        """Sample rows by fraction; exact-count sampling is unsupported."""
        _ = shuffle
        if frac is None and n is None:
            raise ValueError("sample requires n or frac")
        if frac is None:
            # Approximate n via fraction; requires count (expensive). Keep simple.
            raise PlanFrameBackendError("sparkless adapter sample(n=...) is not implemented")
        return df.sample(withReplacement=with_replacement, fraction=frac, seed=seed)

    # ---- Expression compilation + materialization ----
    def compile_expr(self, expr: object, *, schema: Schema | None = None) -> SparklessBackendExpr:
        """Compile a PlanFrame ``Expr`` into a sparkless Column.

        ``schema`` is accepted for interface compatibility but unused.
        """
        _ = schema
        # PlanFrame Expr nodes are dataclasses; compile using our mapping.
        # (The previous ``isinstance(expr, object) and hasattr(expr, "__class__")``
        # guard was a tautology and has been removed.)
        from planframe.expr.api import Expr as PFExpr

        if isinstance(expr, PFExpr):
            return compile_expr(cast(Any, expr))
        raise PlanFrameExpressionError(f"Unsupported expr type for sparkless: {type(expr)!r}")

    def collect(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> SparklessBackendFrame:
        """Return the backend-native (still lazy) frame; row exports execute it."""
        _ = options
        return df

    def to_dicts(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> list[dict[str, object]]:
        """Execute the plan and return rows as a list of dicts."""
        _ = options
        rows = df.collect()
        out: list[dict[str, object]] = []
        for r in rows:
            if hasattr(r, "asDict"):
                out.append(cast(dict[str, object], r.asDict()))
            elif isinstance(r, dict):
                out.append(cast(dict[str, object], r))
            else:
                raise PlanFrameBackendError(
                    f"Unexpected row type from sparkless collect(): {type(r)!r}"
                )
        return out

    def to_dict(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> dict[str, list[object]]:
        """Execute the plan and return columns as name -> list of values.

        Assumes every row shares the first row's key set (true for tabular
        sparkless results).
        """
        rows = self.to_dicts(df, options=options)
        if not rows:
            return {str(c): [] for c in df.columns}
        out: dict[str, list[object]] = {k: [] for k in rows[0]}
        for r in rows:
            for k, v in r.items():
                out[k].append(v)
        return out

    def stream_dicts(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> Iterator[dict[str, object]]:
        """Yield rows as dicts; currently materializes via ``to_dicts``."""
        # sparkless currently doesn't expose toLocalIterator; fall back to materializing rows.
        yield from self.to_dicts(df, options=options)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, cast
|
|
4
|
+
|
|
5
|
+
from sparkless.sql import functions as F
|
|
6
|
+
|
|
7
|
+
from planframe.backend.errors import PlanFrameExpressionError
|
|
8
|
+
from planframe.expr.api import (
|
|
9
|
+
Abs,
|
|
10
|
+
Add,
|
|
11
|
+
AggExpr,
|
|
12
|
+
Alias,
|
|
13
|
+
And,
|
|
14
|
+
Between,
|
|
15
|
+
Ceil,
|
|
16
|
+
Clip,
|
|
17
|
+
Coalesce,
|
|
18
|
+
Col,
|
|
19
|
+
DtDay,
|
|
20
|
+
DtMonth,
|
|
21
|
+
DtYear,
|
|
22
|
+
Eq,
|
|
23
|
+
Exp,
|
|
24
|
+
Expr,
|
|
25
|
+
Floor,
|
|
26
|
+
Ge,
|
|
27
|
+
Gt,
|
|
28
|
+
IfElse,
|
|
29
|
+
IsFinite,
|
|
30
|
+
IsIn,
|
|
31
|
+
IsNotNull,
|
|
32
|
+
IsNull,
|
|
33
|
+
Le,
|
|
34
|
+
Lit,
|
|
35
|
+
Log,
|
|
36
|
+
Lt,
|
|
37
|
+
Mul,
|
|
38
|
+
Ne,
|
|
39
|
+
Not,
|
|
40
|
+
Or,
|
|
41
|
+
Over,
|
|
42
|
+
Pow,
|
|
43
|
+
Round,
|
|
44
|
+
Sqrt,
|
|
45
|
+
StrContains,
|
|
46
|
+
StrEndsWith,
|
|
47
|
+
StrLen,
|
|
48
|
+
StrLower,
|
|
49
|
+
StrReplace,
|
|
50
|
+
StrSplit,
|
|
51
|
+
StrStartsWith,
|
|
52
|
+
StrStrip,
|
|
53
|
+
StrUpper,
|
|
54
|
+
Sub,
|
|
55
|
+
TrueDiv,
|
|
56
|
+
Xor,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def compile_expr(expr: Expr[Any]) -> Any:
    """Recursively compile a PlanFrame expression tree into a sparkless column.

    Each PlanFrame node class is mapped onto the equivalent
    ``sparkless.sql.functions`` call or Column operator.

    Raises:
        PlanFrameExpressionError: For expression nodes (or aggregation ops)
            that have no sparkless mapping.
    """
    # Alias: compile the wrapped expression; the alias name itself is not
    # applied here (presumably the caller attaches it — confirm at call sites).
    if isinstance(expr, Alias):
        return compile_expr(expr.expr)
    # Leaf nodes: column references and literals.
    if isinstance(expr, Col):
        return F.col(expr.name)
    if isinstance(expr, Lit):
        return F.lit(expr.value)
    # Arithmetic operators map straight onto Column operator overloads.
    if isinstance(expr, Add):
        return compile_expr(expr.left) + compile_expr(expr.right)
    if isinstance(expr, Sub):
        return compile_expr(expr.left) - compile_expr(expr.right)
    if isinstance(expr, Mul):
        return compile_expr(expr.left) * compile_expr(expr.right)
    if isinstance(expr, TrueDiv):
        return compile_expr(expr.left) / compile_expr(expr.right)
    # Comparison operators.
    if isinstance(expr, Eq):
        return compile_expr(expr.left) == compile_expr(expr.right)
    if isinstance(expr, Ne):
        return compile_expr(expr.left) != compile_expr(expr.right)
    if isinstance(expr, Lt):
        return compile_expr(expr.left) < compile_expr(expr.right)
    if isinstance(expr, Le):
        return compile_expr(expr.left) <= compile_expr(expr.right)
    if isinstance(expr, Gt):
        return compile_expr(expr.left) > compile_expr(expr.right)
    if isinstance(expr, Ge):
        return compile_expr(expr.left) >= compile_expr(expr.right)
    # Null tests and membership.
    if isinstance(expr, IsNull):
        return compile_expr(expr.value).isNull()
    if isinstance(expr, IsNotNull):
        return compile_expr(expr.value).isNotNull()
    if isinstance(expr, IsIn):
        return compile_expr(expr.value).isin(list(expr.options))
    # Boolean combinators.
    if isinstance(expr, And):
        return compile_expr(expr.left) & compile_expr(expr.right)
    if isinstance(expr, Or):
        return compile_expr(expr.left) | compile_expr(expr.right)
    if isinstance(expr, Not):
        return ~compile_expr(expr.value)
    if isinstance(expr, Xor):
        return compile_expr(expr.left) ^ compile_expr(expr.right)
    # Numeric functions.
    if isinstance(expr, Abs):
        return F.abs(compile_expr(expr.value))
    if isinstance(expr, Round):
        e = compile_expr(expr.value)
        # F.round with no scale argument uses the function's default precision.
        return F.round(e, expr.ndigits) if expr.ndigits is not None else F.round(e)
    if isinstance(expr, Floor):
        return F.floor(compile_expr(expr.value))
    if isinstance(expr, Ceil):
        return F.ceil(compile_expr(expr.value))
    if isinstance(expr, Coalesce):
        return F.coalesce(*[compile_expr(v) for v in expr.values])
    # Conditional expression: when/otherwise.
    if isinstance(expr, IfElse):
        return F.when(compile_expr(expr.cond), compile_expr(expr.then_value)).otherwise(
            compile_expr(expr.else_value)
        )
    if isinstance(expr, Over):
        # PlanFrame Over only carries partition/order column names, so we can map to Window.
        from sparkless.sql.window import Window

        w = Window.partitionBy(*expr.partition_by)
        if expr.order_by is not None:
            w = w.orderBy(*expr.order_by)
        return compile_expr(expr.value).over(w)
    if isinstance(expr, Between):
        return compile_expr(expr.value).between(compile_expr(expr.low), compile_expr(expr.high))
    if isinstance(expr, Clip):
        # Clip is emulated with greatest (lower bound) and least (upper bound);
        # either bound may be absent.
        e = compile_expr(expr.value)
        if expr.lower is not None:
            e = F.greatest(e, compile_expr(expr.lower))
        if expr.upper is not None:
            e = F.least(e, compile_expr(expr.upper))
        return e
    if isinstance(expr, Pow):
        return F.pow(compile_expr(expr.base), compile_expr(expr.exponent))
    if isinstance(expr, Exp):
        return F.exp(compile_expr(expr.value))
    if isinstance(expr, Log):
        # Single-argument F.log — NOTE(review): confirm the base (natural vs 10)
        # matches PlanFrame's Log semantics.
        return F.log(compile_expr(expr.value))
    # String functions.
    if isinstance(expr, StrContains):
        e = compile_expr(expr.value)
        # Literal patterns use substring containment; otherwise treat the
        # pattern as a regular expression via rlike.
        if expr.literal:
            return e.contains(expr.pattern)
        return e.rlike(expr.pattern)
    if isinstance(expr, StrStartsWith):
        return compile_expr(expr.value).startswith(expr.prefix)
    if isinstance(expr, StrEndsWith):
        return compile_expr(expr.value).endswith(expr.suffix)
    if isinstance(expr, StrLower):
        return F.lower(compile_expr(expr.value))
    if isinstance(expr, StrUpper):
        return F.upper(compile_expr(expr.value))
    if isinstance(expr, StrLen):
        return F.length(compile_expr(expr.value))
    if isinstance(expr, StrReplace):
        # regexp_replace treats expr.pattern as a regex even for literal text —
        # NOTE(review): confirm PlanFrame's StrReplace is regex-based.
        return F.regexp_replace(compile_expr(expr.value), expr.pattern, expr.replacement)
    if isinstance(expr, StrStrip):
        return F.trim(compile_expr(expr.value))
    if isinstance(expr, StrSplit):
        return F.split(compile_expr(expr.value), expr.by)
    # Datetime component accessors.
    if isinstance(expr, DtYear):
        return F.year(compile_expr(expr.value))
    if isinstance(expr, DtMonth):
        return F.month(compile_expr(expr.value))
    if isinstance(expr, DtDay):
        return F.dayofmonth(compile_expr(expr.value))
    if isinstance(expr, Sqrt):
        return F.sqrt(compile_expr(expr.value))
    if isinstance(expr, IsFinite):
        # Spark doesn't have a direct isFinite; approximate via isnan/isnull checks.
        e = compile_expr(expr.value)
        return (~cast(Any, F.isnan(e))) & e.isNotNull()
    # Aggregations: dispatch on the op name string.
    if isinstance(expr, AggExpr):
        inner = compile_expr(expr.inner)
        if expr.op == "count":
            return F.count(inner)
        if expr.op == "sum":
            return F.sum(inner)
        if expr.op == "mean":
            return F.avg(inner)
        if expr.op == "min":
            return F.min(inner)
        if expr.op == "max":
            return F.max(inner)
        if expr.op == "n_unique":
            return F.countDistinct(inner)
        raise PlanFrameExpressionError(f"Unsupported aggregation op: {expr.op!r}")

    raise PlanFrameExpressionError(f"Unsupported expr node for sparkless: {type(expr)!r}")
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Mapping, Sequence
|
|
4
|
+
from typing import Any, ClassVar, Generic, TypeVar, cast
|
|
5
|
+
|
|
6
|
+
from planframe.frame import Frame
|
|
7
|
+
from planframe.spark import SparkFrame
|
|
8
|
+
from planframe.typing.storage import StorageOptions
|
|
9
|
+
from planframe_sparkless._spark import _spark
|
|
10
|
+
from planframe_sparkless.adapter import (
|
|
11
|
+
SparklessAdapter,
|
|
12
|
+
SparklessBackendExpr,
|
|
13
|
+
SparklessBackendFrame,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Type variable for the user-declared schema class a frame is bound to.
SchemaT = TypeVar("SchemaT")

# Accepted input payloads for frame construction: column-oriented
# (mapping of column name -> values) or row-oriented (sequence of row dicts).
SparklessData = Mapping[str, Sequence[object]] | Sequence[Mapping[str, object]]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _schema_defaults(schema: type[Any]) -> dict[str, object]:
|
|
22
|
+
ann = dict(getattr(schema, "__dict__", {}).get("__annotations__", {}))
|
|
23
|
+
out: dict[str, object] = {}
|
|
24
|
+
for name in ann:
|
|
25
|
+
if name in getattr(schema, "__dict__", {}):
|
|
26
|
+
out[name] = getattr(schema, name)
|
|
27
|
+
return out
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _fill_missing_from_defaults(
|
|
31
|
+
data: SparklessData, *, defaults: dict[str, object]
|
|
32
|
+
) -> SparklessData:
|
|
33
|
+
if not defaults:
|
|
34
|
+
return data
|
|
35
|
+
|
|
36
|
+
if isinstance(data, Mapping):
|
|
37
|
+
data_map = cast(Mapping[str, Sequence[object]], data)
|
|
38
|
+
if not data_map:
|
|
39
|
+
return dict(data_map)
|
|
40
|
+
first = next(iter(data_map.values()))
|
|
41
|
+
n = len(first)
|
|
42
|
+
out: dict[str, list[object]] = {k: list(v) for k, v in data_map.items()}
|
|
43
|
+
for k, dv in defaults.items():
|
|
44
|
+
if k not in out:
|
|
45
|
+
out[k] = [dv] * n
|
|
46
|
+
return out
|
|
47
|
+
|
|
48
|
+
out_rows: list[dict[str, object]] = []
|
|
49
|
+
for row in data:
|
|
50
|
+
r = dict(row)
|
|
51
|
+
for k, dv in defaults.items():
|
|
52
|
+
if k not in r:
|
|
53
|
+
r[k] = dv
|
|
54
|
+
out_rows.append(r)
|
|
55
|
+
return out_rows
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _to_sparkless_df(data: SparklessData, *, schema: type[Any]) -> SparklessBackendFrame:
    """Build a sparkless DataFrame from *data*, filling in schema defaults."""
    filled = _fill_missing_from_defaults(data, defaults=_schema_defaults(schema))
    return _spark().createDataFrame(filled)  # type: ignore[arg-type]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class _SparklessFrameMeta(type):
    """Metaclass that lets ``SparklessFrame(data)`` build a backend-bound frame.

    Internal constructions coming from ``Frame.source(...)`` pass the private
    ``_data``/``_adapter``/``_plan``/``_schema`` kwargs and are routed to the
    normal ``type.__call__``. User-facing calls take a single ``data`` payload
    (dict-of-lists or list-of-dicts).
    """

    def __call__(cls, *args: Any, **kwargs: Any) -> Any:  # noqa: ANN401
        # Allow normal construction when `Frame.source(...)` calls `cls(_data=..., ...)`.
        if "_data" in kwargs and "_adapter" in kwargs and "_plan" in kwargs and "_schema" in kwargs:
            return super().__call__(*args, **kwargs)

        if args:
            # Previously extra positional arguments were silently dropped;
            # reject them explicitly so caller mistakes surface immediately.
            if len(args) > 1:
                raise TypeError(
                    f"Expected a single positional data argument, got {len(args)}"
                )
            data = args[0]
        elif "data" in kwargs:
            data = kwargs.pop("data")
        else:
            # Previously this raised a bare KeyError('data') from kwargs.pop;
            # a TypeError matches normal constructor-signature errors.
            raise TypeError("SparklessFrame requires a data argument")
        if kwargs:
            raise TypeError(f"Unexpected constructor kwargs: {sorted(kwargs)}")

        if not isinstance(data, (dict, list)):
            raise TypeError("SparklessFrame expects dict-of-lists or list-of-dicts")

        df = _to_sparkless_df(cast(SparklessData, data), schema=cast(type[Any], cls))
        cls_any = cast(Any, cls)
        return cls_any.source(df, adapter=cls_any._adapter_singleton, schema=cast(type[Any], cls))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class SparklessFrame(
    SparkFrame[SchemaT, SparklessBackendFrame, SparklessBackendExpr],
    Frame[SchemaT, SparklessBackendFrame, SparklessBackendExpr],
    Generic[SchemaT],
    metaclass=_SparklessFrameMeta,
):
    """A PlanFrame `Frame` bound to the sparkless backend, using the SparkFrame UI."""

    # Single shared adapter instance used by all readers/constructors below.
    _adapter_singleton: ClassVar[SparklessAdapter] = SparklessAdapter()
    # Marker flag — presumably consumed by PlanFrame's model machinery; confirm upstream.
    __planframe_model__ = True

    # ---- IO (Spark-style engine readers) ----
    @classmethod
    def scan_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Lazily open a Parquet source and bind it to *schema*."""
        df = cls._adapter_singleton.reader.scan_parquet(
            path, hive_partitioning=hive_partitioning, storage_options=storage_options
        )
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Lazily open a CSV source and bind it to *schema*."""
        df = cls._adapter_singleton.reader.scan_csv(path, storage_options=storage_options)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_ndjson(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Lazily open a newline-delimited JSON source and bind it to *schema*."""
        df = cls._adapter_singleton.reader.scan_ndjson(path, storage_options=storage_options)
        return cls.source(df, adapter=cls._adapter_singleton, schema=schema)

    # Eager read aliases (Sparkless is lazy-ish; these are just naming aliases)
    @classmethod
    def read_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_parquet` (no hive-partitioning option exposed)."""
        return cls.scan_parquet(path, schema=schema, storage_options=storage_options)

    @classmethod
    def read_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_csv`."""
        return cls.scan_csv(path, schema=schema, storage_options=storage_options)

    @classmethod
    def read_json(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_ndjson` — expects newline-delimited JSON."""
        return cls.scan_ndjson(path, schema=schema, storage_options=storage_options)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "planframe-sparkless"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "sparkless backend adapter for PlanFrame (SparkFrame UI + sparkless engine)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "PlanFrame Contributors" }]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Typing :: Typed",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"planframe>=1.0.0,<2.0.0",
|
|
22
|
+
"sparkless>=4.5",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Repository = "https://github.com/eddiethedean/planframe"
|
|
27
|
+
Documentation = "https://planframe.readthedocs.io/en/latest/planframe_sparkless/"
|
|
28
|
+
Issues = "https://github.com/eddiethedean/planframe/issues"
|
|
29
|
+
|
|
30
|
+
[tool.hatch.build.targets.wheel]
|
|
31
|
+
packages = ["planframe_sparkless"]
|
|
32
|
+
|