planframe-sparkless 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ # Python bytecode / caches
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environments
7
+ .venv/
8
+ .venv*/
9
+ venv/
10
+ ENV/
11
+ env/
12
+ .env/
13
+ .release-venv/
14
+ .pandas-smoke-venv/
15
+
16
+ # Packaging / build artifacts
17
+ build/
18
+ dist/
19
+ dist-ci/
20
+ dist-release/
21
+ *.egg-info/
22
+ *.egg
23
+ pip-wheel-metadata/
24
+ .python-version
25
+ .installed.cfg
26
+
27
+ # Test / coverage artifacts
28
+ .hypothesis/
29
+ .pytest_cache/
30
+ .coverage
31
+ .coverage.*
32
+ coverage.xml
33
+ htmlcov/
34
+ site/
35
+ .tox/
36
+ .nox/
37
+
38
+ # Type check / lint caches
39
+ .mypy_cache/
40
+ .pyright/
41
+ .ruff_cache/
42
+ .ty_cache/
43
+
44
+ # Notebook checkpoints
45
+ .ipynb_checkpoints/
46
+
47
+ # IDE/editor settings
48
+ .vscode/
49
+ .idea/
50
+
51
+ # OS files
52
+ .DS_Store
53
+ Thumbs.db
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: planframe-sparkless
3
+ Version: 0.1.0
4
+ Summary: sparkless backend adapter for PlanFrame (SparkFrame UI + sparkless engine).
5
+ Project-URL: Repository, https://github.com/eddiethedean/planframe
6
+ Project-URL: Documentation, https://planframe.readthedocs.io/en/latest/planframe_sparkless/
7
+ Project-URL: Issues, https://github.com/eddiethedean/planframe/issues
8
+ Author: PlanFrame Contributors
9
+ License: MIT
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Typing :: Typed
15
+ Requires-Python: >=3.10
16
+ Requires-Dist: planframe<2.0.0,>=1.0.0
17
+ Requires-Dist: sparkless>=4.5
18
+ Description-Content-Type: text/markdown
19
+
20
+ ## planframe-sparkless
21
+
22
+ [![Docs](https://readthedocs.org/projects/planframe/badge/?version=latest)](https://planframe.readthedocs.io/en/latest/planframe_sparkless/)
23
+ [![License: MIT](https://img.shields.io/badge/License-MIT-informational)](../../LICENSE)
24
+
25
+ Sparkless adapter package for PlanFrame. Import as `planframe_sparkless`.
26
+
27
+ This package:
28
+
29
+ - uses the **PySpark-like UI** from `planframe.spark` (`SparkFrame`)
30
+ - executes plans using the **`sparkless`** engine (no JVM)
31
+
32
+ Documentation (ReadTheDocs):
33
+
34
+ - Sparkless track (end users): `https://planframe.readthedocs.io/en/latest/planframe_sparkless/`
35
+
36
+ ### Install
37
+
38
+ ```bash
39
+ pip install planframe-sparkless
40
+ ```
41
+
42
+ ### Quickstart
43
+
44
+ ```python
45
+ from planframe.expr import add, col, lit
46
+ from planframe_sparkless import SparklessFrame
47
+
48
+
49
+ class User(SparklessFrame):
50
+ id: int
51
+ x: int
52
+
53
+
54
+ pf = User([{"id": 1, "x": 2}, {"id": 2, "x": 3}])
55
+
56
+ out = (
57
+ pf.select("id", "x")
58
+ .withColumn("x2", add(col("x"), lit(1)))
59
+ .where(pf["x"] > lit(2))
60
+ .select("id", "x2")
61
+ )
62
+
63
+ print(out.to_dicts())
64
+ ```
65
+
66
+ ### Execution model (PlanFrame)
67
+
68
+ - PlanFrame is **always lazy**: chaining does not execute backend work.
69
+ - Materialization boundaries:
70
+ - `collect()` returns `list[pydantic.BaseModel]`
71
+ - `collect_backend()` returns the sparkless backend dataframe object
72
+ - `to_dicts()` / `to_dict()` export rows/columns
73
+ - Async equivalents: `acollect()` / `ato_dicts()` / `ato_dict()`
74
+
75
+ ### Notes / limitations
76
+
77
+ - This adapter aims to support a practical subset of Spark-like operations using `sparkless`.
78
+ - Row streaming: `stream_dicts()` currently materializes via `to_dicts()` (sparkless does not expose an efficient local iterator API yet).
79
+ - For backend-agnostic semantics and supported transforms, see the core docs: `https://planframe.readthedocs.io/en/latest/planframe/`
80
+
@@ -0,0 +1,61 @@
1
+ ## planframe-sparkless
2
+
3
+ [![Docs](https://readthedocs.org/projects/planframe/badge/?version=latest)](https://planframe.readthedocs.io/en/latest/planframe_sparkless/)
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-informational)](../../LICENSE)
5
+
6
+ Sparkless adapter package for PlanFrame. Import as `planframe_sparkless`.
7
+
8
+ This package:
9
+
10
+ - uses the **PySpark-like UI** from `planframe.spark` (`SparkFrame`)
11
+ - executes plans using the **`sparkless`** engine (no JVM)
12
+
13
+ Documentation (ReadTheDocs):
14
+
15
+ - Sparkless track (end users): `https://planframe.readthedocs.io/en/latest/planframe_sparkless/`
16
+
17
+ ### Install
18
+
19
+ ```bash
20
+ pip install planframe-sparkless
21
+ ```
22
+
23
+ ### Quickstart
24
+
25
+ ```python
26
+ from planframe.expr import add, col, lit
27
+ from planframe_sparkless import SparklessFrame
28
+
29
+
30
+ class User(SparklessFrame):
31
+ id: int
32
+ x: int
33
+
34
+
35
+ pf = User([{"id": 1, "x": 2}, {"id": 2, "x": 3}])
36
+
37
+ out = (
38
+ pf.select("id", "x")
39
+ .withColumn("x2", add(col("x"), lit(1)))
40
+ .where(pf["x"] > lit(2))
41
+ .select("id", "x2")
42
+ )
43
+
44
+ print(out.to_dicts())
45
+ ```
46
+
47
+ ### Execution model (PlanFrame)
48
+
49
+ - PlanFrame is **always lazy**: chaining does not execute backend work.
50
+ - Materialization boundaries:
51
+ - `collect()` returns `list[pydantic.BaseModel]`
52
+ - `collect_backend()` returns the sparkless backend dataframe object
53
+ - `to_dicts()` / `to_dict()` export rows/columns
54
+ - Async equivalents: `acollect()` / `ato_dicts()` / `ato_dict()`
55
+
56
+ ### Notes / limitations
57
+
58
+ - This adapter aims to support a practical subset of Spark-like operations using `sparkless`.
59
+ - Row streaming: `stream_dicts()` currently materializes via `to_dicts()` (sparkless does not expose an efficient local iterator API yet).
60
+ - For backend-agnostic semantics and supported transforms, see the core docs: `https://planframe.readthedocs.io/en/latest/planframe/`
61
+
@@ -0,0 +1,5 @@
1
"""Public API of the planframe-sparkless adapter package.

Re-exports :class:`SparklessFrame`, the sparkless-backed PlanFrame entry point.
"""

from __future__ import annotations

from planframe_sparkless.frame import SparklessFrame

__all__ = ["SparklessFrame"]
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from functools import lru_cache
4
+
5
+ from sparkless.sql import SparkSession
6
+
7
+
8
@lru_cache(maxsize=1)
def _spark() -> SparkSession:
    """Return the process-wide sparkless session, created on first use.

    The ``lru_cache(maxsize=1)`` makes this a lazy singleton: every adapter
    call shares a single session per process.
    """
    # `SparkSession` is lightweight in sparkless and doesn't require a JVM.
    return SparkSession("planframe_sparkless")
@@ -0,0 +1,595 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from typing import Any, Literal, cast
5
+
6
+ from sparkless.sql import functions as F
7
+ from sparkless.sql.window import Window
8
+
9
+ from planframe.backend.adapter import (
10
+ BaseAdapter,
11
+ ColumnName,
12
+ Columns,
13
+ CompiledJoinKey,
14
+ CompiledProjectItem,
15
+ CompiledSortKey,
16
+ )
17
+ from planframe.backend.errors import PlanFrameBackendError, PlanFrameExpressionError
18
+ from planframe.execution_options import ExecutionOptions
19
+ from planframe.plan.join_options import JoinOptions
20
+ from planframe.plan.nodes import UnnestItem
21
+ from planframe.schema.ir import Schema
22
+ from planframe.typing.scalars import Scalar
23
+ from planframe.typing.storage import StorageOptions
24
+ from planframe_sparkless._spark import _spark
25
+ from planframe_sparkless.compile_expr import compile_expr
26
+
27
+ SparklessBackendFrame = Any # runtime type is `builtins.PyDataFrame`
28
+ SparklessBackendExpr = Any # runtime type is `builtins.PyColumn`
29
+
30
+
31
class SparklessAdapter(BaseAdapter[SparklessBackendFrame, SparklessBackendExpr]):
    """PlanFrame adapter that executes plans with the JVM-free ``sparkless`` engine.

    Backend frames are sparkless DataFrames (lazy plans) and backend
    expressions are sparkless Columns. Operations sparkless cannot express
    raise ``PlanFrameBackendError`` explicitly rather than degrading silently.
    """

    # Identifier PlanFrame uses to refer to this backend.
    name = "sparkless"

    # ---- AdapterReader surface (used by SparklessFrame classmethods) ----
    def scan_parquet(
        self,
        path: str,
        *,
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Lazily read a Parquet file; the keyword options are accepted but unused."""
        _ = hive_partitioning, storage_options
        return _spark().read.parquet(path)

    def scan_parquet_dataset(
        self, path_or_glob: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Lazily read a Parquet dataset; globs are passed straight to the reader."""
        _ = storage_options
        # Spark-style readers generally accept globs as path patterns.
        return _spark().read.parquet(path_or_glob)

    def scan_csv(
        self, path: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Lazily read a CSV file with sparkless reader defaults."""
        _ = storage_options
        return _spark().read.csv(path)

    def scan_ndjson(
        self, path: str, *, storage_options: StorageOptions | None = None
    ) -> SparklessBackendFrame:
        """Lazily read newline-delimited JSON."""
        _ = storage_options
        return _spark().read.json(path)

    def scan_ipc(
        self,
        path: str,
        *,
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Unsupported: Arrow IPC reading is not implemented."""
        _ = path, hive_partitioning, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement scan_ipc")

    def scan_delta(
        self,
        source: str,
        *,
        version: int | str | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Unsupported: Delta Lake scanning is not implemented."""
        _ = source, version, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement scan_delta")

    def read_delta(
        self,
        source: str,
        *,
        version: int | str | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessBackendFrame:
        """Unsupported: Delta Lake reading is not implemented."""
        _ = source, version, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement read_delta")

    def read_excel(
        self,
        path: str,
        *,
        sheet_name: str | None = None,
    ) -> SparklessBackendFrame:
        """Unsupported: Excel reading is not implemented."""
        _ = path, sheet_name
        raise PlanFrameBackendError("sparkless adapter does not implement read_excel")

    def read_avro(self, path: str) -> SparklessBackendFrame:
        """Unsupported: Avro reading is not implemented."""
        _ = path
        raise PlanFrameBackendError("sparkless adapter does not implement read_avro")

    def read_database(self, query: str, *, connection: object) -> SparklessBackendFrame:
        """Unsupported: database reading is not implemented."""
        _ = query, connection
        raise PlanFrameBackendError("sparkless adapter does not implement read_database")

    def read_database_uri(
        self,
        query: str,
        *,
        uri: str,
        engine: Literal["connectorx", "adbc"] | None = None,
    ) -> SparklessBackendFrame:
        """Unsupported: URI-based database reading is not implemented."""
        _ = query, uri, engine
        raise PlanFrameBackendError("sparkless adapter does not implement read_database_uri")
120
+
121
    # ---- Core transforms ----
    def select(self, df: SparklessBackendFrame, columns: Columns) -> SparklessBackendFrame:
        """Project to the named columns, in the given order."""
        return df.select(*columns)

    def project(
        self,
        df: SparklessBackendFrame,
        items: tuple[CompiledProjectItem[SparklessBackendExpr], ...],
    ) -> SparklessBackendFrame:
        """Build a projection from compiled items (plain columns or aliased expressions)."""
        cols: list[Any] = []
        for it in items:
            if it.from_column is not None:
                # Pass-through of an existing column by name.
                cols.append(it.from_column)
            elif it.expr is not None:
                # Computed column: alias the compiled expression to its output name.
                cols.append(cast(Any, it.expr).alias(it.name))
            else:
                raise AssertionError("Invalid CompiledProjectItem")
        return df.select(*cols)

    def drop(
        self, df: SparklessBackendFrame, columns: Columns, *, strict: bool = True
    ) -> SparklessBackendFrame:
        """Drop columns; with ``strict`` missing names raise, otherwise they are skipped."""
        existing = set(df.columns)
        if strict:
            missing = [c for c in columns if c not in existing]
            if missing:
                raise PlanFrameBackendError(f"Columns not found for drop: {missing}")
            return df.drop(*columns)

        cols2 = tuple(c for c in columns if c in existing)
        if not cols2:
            # Nothing to drop; return the frame unchanged.
            return df
        return df.drop(*cols2)

    def rename(
        self,
        df: SparklessBackendFrame,
        mapping: dict[ColumnName, ColumnName],
        *,
        strict: bool = True,
    ) -> SparklessBackendFrame:
        """Rename columns one at a time via ``withColumnRenamed``.

        NOTE(review): renames are applied sequentially, so overlapping mappings
        (e.g. swapping two names) would not behave atomically — assumes callers
        never pass such mappings; confirm upstream.
        """
        if strict:
            missing = [c for c in mapping if c not in set(df.columns)]
            if missing:
                raise PlanFrameBackendError(f"Columns not found for rename: {missing}")
        out = df
        for old, new in mapping.items():
            if old in set(out.columns):
                out = out.withColumnRenamed(old, new)
        return out

    def with_column(
        self, df: SparklessBackendFrame, name: str, expr: SparklessBackendExpr
    ) -> SparklessBackendFrame:
        """Add or replace a column computed from a compiled expression."""
        return df.withColumn(name, expr)

    def cast(self, df: SparklessBackendFrame, name: str, dtype: object) -> SparklessBackendFrame:
        """Cast a column in place; ``dtype`` must be a Spark SQL type string."""
        # PlanFrame dtypes are backend-agnostic; sparkless expects Spark SQL type strings.
        # We accept `object` here and rely on adapter users passing strings when needed.
        if not isinstance(dtype, str):
            raise PlanFrameBackendError(
                "sparkless cast expects dtype as Spark SQL string (e.g. 'int')"
            )
        return df.withColumn(name, F.col(name).cast(dtype))

    def with_row_count(
        self, df: SparklessBackendFrame, *, name: str = "row_nr", offset: int = 0
    ) -> SparklessBackendFrame:
        """Add a row-index column starting at ``offset``."""
        # Spark requires an ordering for row_number(). Sparkless does not accept ordering
        # by a pure literal expression, so we order by the first column.
        first_col = df.columns[0] if df.columns else None
        if first_col is None:
            raise PlanFrameBackendError("Cannot add row count to empty-column sparkless DataFrame")
        w = Window.orderBy(F.col(first_col))
        # row_number() is 1-based; subtracting 1 makes offset=0 yield a 0-based index.
        return df.withColumn(name, F.row_number().over(w) + F.lit(offset) - F.lit(1))

    def filter(
        self, df: SparklessBackendFrame, predicate: SparklessBackendExpr
    ) -> SparklessBackendFrame:
        """Keep rows for which the compiled boolean predicate holds."""
        return df.filter(predicate)
201
+
202
    def sort(
        self,
        df: SparklessBackendFrame,
        keys: tuple[CompiledSortKey[SparklessBackendExpr], ...],
        *,
        descending: tuple[bool, ...],
        nulls_last: tuple[bool, ...],
    ) -> SparklessBackendFrame:
        """Order rows by the compiled keys with per-key direction and null placement."""
        # sparkless Columns support Spark-style null ordering (asc_nulls_* / desc_nulls_*).
        cols: list[Any] = []
        for k, desc, nl in zip(keys, descending, nulls_last, strict=True):
            if k.column is not None:
                c = F.col(k.column)
            else:
                if k.expr is None:
                    raise PlanFrameBackendError("Sort key expr cannot be None")
                c = k.expr
            if desc:
                cols.append(c.desc_nulls_last() if nl else c.desc_nulls_first())
            else:
                cols.append(c.asc_nulls_last() if nl else c.asc_nulls_first())
        return df.orderBy(*cols)

    def unique(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        keep: str = "first",
        maintain_order: bool = False,
    ) -> SparklessBackendFrame:
        """Drop duplicate rows (whole-row when ``subset`` is None).

        ``keep`` and ``maintain_order`` are accepted for interface parity but
        ignored; sparkless gives no ordering guarantee here.
        """
        _ = keep, maintain_order
        if subset is None:
            return df.distinct()
        return df.dropDuplicates(list(subset))

    def duplicated(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        keep: str | bool = "first",
        out_name: str = "duplicated",
    ) -> SparklessBackendFrame:
        """Add a boolean ``out_name`` column flagging rows whose key occurs more than once."""
        # Approximate via window count > 1.
        _ = keep
        cols = list(subset) if subset is not None else list(df.columns)
        w = Window.partitionBy(*[F.col(c) for c in cols]).orderBy(F.lit(1))
        return df.withColumn(out_name, (F.count(F.lit(1)).over(w) > F.lit(1)))

    def group_by_agg(
        self,
        df: SparklessBackendFrame,
        *,
        keys: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        named_aggs: dict[ColumnName, Any],
    ) -> SparklessBackendFrame:
        """Group by compiled keys and apply named aggregations.

        Each value in ``named_aggs`` is either an ``(op, column)`` tuple for a
        built-in reduction or an already-compiled expression to alias.
        """
        group_cols: list[Any] = []
        for k in keys:
            if k.column is not None:
                group_cols.append(F.col(k.column))
            else:
                group_cols.append(k.expr)
        g = df.groupBy(*group_cols)

        aggs: list[Any] = []
        for out_name, spec in named_aggs.items():
            if isinstance(spec, tuple):
                op, col = spec
                if op == "count":
                    aggs.append(F.count(F.col(col)).alias(out_name))
                elif op == "sum":
                    aggs.append(F.sum(F.col(col)).alias(out_name))
                elif op == "mean":
                    aggs.append(F.avg(F.col(col)).alias(out_name))
                elif op == "min":
                    aggs.append(F.min(F.col(col)).alias(out_name))
                elif op == "max":
                    aggs.append(F.max(F.col(col)).alias(out_name))
                elif op == "n_unique":
                    aggs.append(F.countDistinct(F.col(col)).alias(out_name))
                else:
                    raise PlanFrameBackendError(f"Unsupported aggregation op: {op!r}")
            else:
                # Pre-compiled aggregation expression; just attach the output name.
                aggs.append(cast(Any, spec).alias(out_name))
        return g.agg(*aggs)

    def group_by_dynamic_agg(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Unsupported: dynamic (time-windowed) group_by is not implemented."""
        raise PlanFrameBackendError("sparkless adapter does not implement dynamic group_by yet")

    def rolling_agg(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Unsupported: rolling aggregations are not implemented."""
        raise PlanFrameBackendError("sparkless adapter does not implement rolling_agg yet")
294
+
295
    def drop_nulls(
        self,
        df: SparklessBackendFrame,
        subset: Columns | None,
        *,
        how: Literal["any", "all"] = "any",
        threshold: int | None = None,
    ) -> SparklessBackendFrame:
        """Drop rows containing nulls; ``threshold`` (min non-null count) overrides ``how``."""
        # Spark uses DataFrame.na.drop
        subset_list = None if subset is None else list(subset)
        if threshold is not None:
            return df.na.drop(thresh=threshold, subset=subset_list)
        return df.na.drop(how=how, subset=subset_list)

    def fill_null(
        self,
        df: SparklessBackendFrame,
        value: Scalar | SparklessBackendExpr | None,
        subset: Columns | None,
        *,
        strategy: str | None = None,
    ) -> SparklessBackendFrame:
        """Replace nulls with a scalar value; fill strategies are not supported."""
        _ = strategy
        subset_list = None if subset is None else list(subset)
        if value is None:
            raise PlanFrameBackendError("sparkless fill_null does not support value=None")
        if isinstance(value, (int, float, str, bool)):
            return df.na.fill(value=value, subset=subset_list)
        raise PlanFrameBackendError("sparkless fill_null only supports scalar values currently")

    def melt(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Unsupported: wide-to-long reshaping is not implemented."""
        raise PlanFrameBackendError("sparkless adapter does not implement melt yet")

    def join(
        self,
        left: SparklessBackendFrame,
        right: SparklessBackendFrame,
        *,
        left_on: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        right_on: tuple[CompiledJoinKey[SparklessBackendExpr], ...],
        how: str = "inner",
        suffix: str = "_right",
        options: JoinOptions | None = None,
    ) -> SparklessBackendFrame:
        """Join two frames on compiled keys.

        With no keys a cross join is produced. Plain-name keys use the
        Spark-style ``on=`` / ``left_on=``/``right_on=`` forms; expression keys
        fall back to an AND-ed boolean equality condition.
        """
        _ = suffix, options
        if not left_on and not right_on:
            return left.crossJoin(right)
        if len(left_on) != len(right_on):
            raise ValueError("Join keys must match in length")

        # Prefer Spark-style ``on=`` / ``left_on=``/``right_on=`` so join keys are not
        # ambiguous when column names overlap (unqualified ``F.col`` in a boolean ``on``).
        simple_name_keys = True
        left_names: list[str] = []
        right_names: list[str] = []
        for lk, rk in zip(left_on, right_on, strict=True):
            if lk.column is None or lk.expr is not None or rk.column is None or rk.expr is not None:
                simple_name_keys = False
                break
            left_names.append(lk.column)
            right_names.append(rk.column)

        if simple_name_keys:
            if left_names == right_names:
                on_arg = left_names[0] if len(left_names) == 1 else left_names
                return left.join(right, on=on_arg, how=how)
            return left.join(right, left_on=left_names, right_on=right_names, how=how)

        # Expression keys: build a single boolean condition joining all pairs.
        conds: list[Any] = []
        for lk, rk in zip(left_on, right_on, strict=True):
            lcol = F.col(lk.column) if lk.column is not None else lk.expr
            rcol = F.col(rk.column) if rk.column is not None else rk.expr
            conds.append(lcol == rcol)
        cond = conds[0]
        for c in conds[1:]:
            cond = cond & c
        return left.join(right, on=cond, how=how)

    def slice(
        self, df: SparklessBackendFrame, *, offset: int, length: int | None
    ) -> SparklessBackendFrame:
        """Take ``length`` rows from the start; non-zero offsets are unsupported."""
        if offset != 0:
            raise PlanFrameBackendError("sparkless adapter does not support offset slicing yet")
        return df.limit(length) if length is not None else df

    def head(self, df: SparklessBackendFrame, n: int) -> SparklessBackendFrame:
        """Return the first ``n`` rows (lazy ``limit``)."""
        return df.limit(n)
382
+
383
+ def tail(self, df: SparklessBackendFrame, n: int) -> SparklessBackendFrame:
384
+ # No direct tail in Spark; approximate by collecting and re-creating.
385
+ rows = df.collect()[-n:]
386
+ dicts: list[dict[str, object]] = []
387
+ for r in rows:
388
+ if hasattr(r, "asDict"):
389
+ dicts.append(cast(dict[str, object], r.asDict()))
390
+ else:
391
+ raise PlanFrameBackendError(
392
+ f"Unexpected row type from sparkless collect(): {type(r)!r}"
393
+ )
394
+ return _spark().createDataFrame(dicts)
395
+
396
    def concat_vertical(
        self, left: SparklessBackendFrame, right: SparklessBackendFrame
    ) -> SparklessBackendFrame:
        """Stack frames by column name; columns absent on one side become nulls."""
        return left.unionByName(right, allowMissingColumns=True)

    def concat_horizontal(
        self, left: SparklessBackendFrame, right: SparklessBackendFrame
    ) -> SparklessBackendFrame:
        """Unsupported: side-by-side concatenation is not implemented."""
        raise PlanFrameBackendError("sparkless adapter does not implement concat_horizontal yet")

    def pivot(self, df: SparklessBackendFrame, **_: Any) -> SparklessBackendFrame:
        """Unsupported: pivoting is not implemented."""
        raise PlanFrameBackendError("sparkless adapter does not implement pivot yet")
408
+
409
    # ---- Writes ----
    def write_parquet(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "zstd",
        row_group_size: int | None = None,
        partition_by: tuple[str, ...] | None = None,
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Write Parquet; tuning keywords are accepted for parity but ignored."""
        _ = compression, row_group_size, partition_by, storage_options
        df.write.parquet(path)

    def write_csv(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        separator: str = ",",
        include_header: bool = True,
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Write CSV with the given separator and header flag."""
        _ = storage_options
        df.write.csv(path, sep=separator, header=include_header)

    def write_ndjson(
        self, df: SparklessBackendFrame, path: str, *, storage_options: StorageOptions | None = None
    ) -> None:
        """Write newline-delimited JSON."""
        _ = storage_options
        df.write.json(path)

    def write_ipc(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Unsupported: Arrow IPC writing is not implemented."""
        _ = df, path, compression, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement IPC writing")

    def write_database(
        self,
        df: SparklessBackendFrame,
        *,
        table_name: str,
        connection: object,
        if_table_exists: str = "fail",
        engine: str | None = None,
    ) -> None:
        """Unsupported: database writing is not implemented."""
        _ = df, table_name, connection, if_table_exists, engine
        raise PlanFrameBackendError("sparkless adapter does not implement database writing")

    def write_excel(
        self, df: SparklessBackendFrame, path: str, *, worksheet: str = "Sheet1"
    ) -> None:
        """Unsupported: Excel writing is not implemented."""
        _ = df, path, worksheet
        raise PlanFrameBackendError("sparkless adapter does not implement Excel writing")

    def write_delta(
        self,
        df: SparklessBackendFrame,
        target: str,
        *,
        mode: str = "error",
        storage_options: StorageOptions | None = None,
    ) -> None:
        """Unsupported: Delta Lake writing is not implemented."""
        _ = df, target, mode, storage_options
        raise PlanFrameBackendError("sparkless adapter does not implement Delta writing")

    def write_avro(
        self,
        df: SparklessBackendFrame,
        path: str,
        *,
        compression: str = "uncompressed",
        name: str = "",
    ) -> None:
        """Unsupported: Avro writing is not implemented."""
        _ = df, path, compression, name
        raise PlanFrameBackendError("sparkless adapter does not implement Avro writing")
491
+
492
    # ---- Nested/array ops ----
    def explode(
        self, df: SparklessBackendFrame, columns: Columns, *, outer: bool = False
    ) -> SparklessBackendFrame:
        """Explode each listed array column in place; ``outer`` is ignored."""
        _ = outer
        out = df
        for c in columns:
            out = out.withColumn(c, F.explode(F.col(c)))
        return out

    def unnest(
        self, df: SparklessBackendFrame, items: tuple[UnnestItem, ...]
    ) -> SparklessBackendFrame:
        """Unsupported: struct unnesting is not implemented."""
        _ = df, items
        raise PlanFrameBackendError("sparkless adapter does not implement unnest yet")

    def posexplode(
        self,
        df: SparklessBackendFrame,
        column: str,
        *,
        pos: str = "pos",
        value: str | None = None,
        outer: bool = False,
    ) -> SparklessBackendFrame:
        """Explode an array column with element positions; ``outer`` is ignored."""
        _ = outer
        # Default the value column name to the source column when not given.
        value_name = value or column
        return df.select("*", F.posexplode(F.col(column)).alias(pos, value_name))

    def drop_nulls_all(
        self, df: SparklessBackendFrame, subset: tuple[str, ...] | None
    ) -> SparklessBackendFrame:
        """Drop rows where *all* subset columns are null (delegates to drop_nulls)."""
        return self.drop_nulls(df, subset, how="all")

    def sample(
        self,
        df: SparklessBackendFrame,
        *,
        n: int | None = None,
        frac: float | None = None,
        with_replacement: bool = False,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> SparklessBackendFrame:
        """Sample rows by fraction; fixed-size (``n``) sampling is unsupported."""
        _ = shuffle
        if frac is None and n is None:
            raise ValueError("sample requires n or frac")
        if frac is None:
            # Approximate n via fraction; requires count (expensive). Keep simple.
            raise PlanFrameBackendError("sparkless adapter sample(n=...) is not implemented")
        return df.sample(withReplacement=with_replacement, fraction=frac, seed=seed)
543
+
544
+ # ---- Expression compilation + materialization ----
545
+ def compile_expr(self, expr: object, *, schema: Schema | None = None) -> SparklessBackendExpr:
546
+ _ = schema
547
+ if isinstance(expr, object) and hasattr(expr, "__class__"):
548
+ # PlanFrame Expr nodes are dataclasses; compile using our mapping.
549
+ from planframe.expr.api import Expr as PFExpr
550
+
551
+ if isinstance(expr, PFExpr):
552
+ return compile_expr(cast(Any, expr))
553
+ raise PlanFrameExpressionError(f"Unsupported expr type for sparkless: {type(expr)!r}")
554
+
555
    def collect(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> SparklessBackendFrame:
        """Return the backend-native frame unchanged (sparkless stays lazy here)."""
        # Return backend-native dataframe (lazy plan). Row export methods execute.
        _ = options
        return df

    def to_dicts(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> list[dict[str, object]]:
        """Execute the plan and export rows as plain dicts."""
        _ = options
        rows = df.collect()
        out: list[dict[str, object]] = []
        for r in rows:
            if hasattr(r, "asDict"):
                # Spark-style Row objects.
                out.append(cast(dict[str, object], r.asDict()))
            elif isinstance(r, dict):
                out.append(cast(dict[str, object], r))
            else:
                raise PlanFrameBackendError(
                    f"Unexpected row type from sparkless collect(): {type(r)!r}"
                )
        return out

    def to_dict(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> dict[str, list[object]]:
        """Execute the plan and export columns as name -> list of values."""
        rows = self.to_dicts(df, options=options)
        if not rows:
            # Preserve column names even for an empty result.
            return {str(c): [] for c in df.columns}
        out: dict[str, list[object]] = {k: [] for k in rows[0]}
        for r in rows:
            for k, v in r.items():
                out[k].append(v)
        return out

    def stream_dicts(
        self, df: SparklessBackendFrame, *, options: ExecutionOptions | None = None
    ) -> Iterator[dict[str, object]]:
        """Yield rows as dicts; currently materializes everything up front."""
        # sparkless currently doesn't expose toLocalIterator; fall back to materializing rows.
        yield from self.to_dicts(df, options=options)
@@ -0,0 +1,188 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, cast
4
+
5
+ from sparkless.sql import functions as F
6
+
7
+ from planframe.backend.errors import PlanFrameExpressionError
8
+ from planframe.expr.api import (
9
+ Abs,
10
+ Add,
11
+ AggExpr,
12
+ Alias,
13
+ And,
14
+ Between,
15
+ Ceil,
16
+ Clip,
17
+ Coalesce,
18
+ Col,
19
+ DtDay,
20
+ DtMonth,
21
+ DtYear,
22
+ Eq,
23
+ Exp,
24
+ Expr,
25
+ Floor,
26
+ Ge,
27
+ Gt,
28
+ IfElse,
29
+ IsFinite,
30
+ IsIn,
31
+ IsNotNull,
32
+ IsNull,
33
+ Le,
34
+ Lit,
35
+ Log,
36
+ Lt,
37
+ Mul,
38
+ Ne,
39
+ Not,
40
+ Or,
41
+ Over,
42
+ Pow,
43
+ Round,
44
+ Sqrt,
45
+ StrContains,
46
+ StrEndsWith,
47
+ StrLen,
48
+ StrLower,
49
+ StrReplace,
50
+ StrSplit,
51
+ StrStartsWith,
52
+ StrStrip,
53
+ StrUpper,
54
+ Sub,
55
+ TrueDiv,
56
+ Xor,
57
+ )
58
+
59
+
60
def compile_expr(expr: Expr[Any]) -> Any:
    """Recursively translate a PlanFrame ``Expr`` node into a sparkless Column.

    Dispatches on the concrete node type; every branch returns a sparkless
    Column expression.

    Raises:
        PlanFrameExpressionError: for unsupported aggregation ops or node types.
    """
    if isinstance(expr, Alias):
        # Output naming is applied by the caller (e.g. project); compile payload only.
        return compile_expr(expr.expr)
    if isinstance(expr, Col):
        return F.col(expr.name)
    if isinstance(expr, Lit):
        return F.lit(expr.value)
    # Arithmetic and comparison nodes map directly onto Column operator overloads.
    if isinstance(expr, Add):
        return compile_expr(expr.left) + compile_expr(expr.right)
    if isinstance(expr, Sub):
        return compile_expr(expr.left) - compile_expr(expr.right)
    if isinstance(expr, Mul):
        return compile_expr(expr.left) * compile_expr(expr.right)
    if isinstance(expr, TrueDiv):
        return compile_expr(expr.left) / compile_expr(expr.right)
    if isinstance(expr, Eq):
        return compile_expr(expr.left) == compile_expr(expr.right)
    if isinstance(expr, Ne):
        return compile_expr(expr.left) != compile_expr(expr.right)
    if isinstance(expr, Lt):
        return compile_expr(expr.left) < compile_expr(expr.right)
    if isinstance(expr, Le):
        return compile_expr(expr.left) <= compile_expr(expr.right)
    if isinstance(expr, Gt):
        return compile_expr(expr.left) > compile_expr(expr.right)
    if isinstance(expr, Ge):
        return compile_expr(expr.left) >= compile_expr(expr.right)
    if isinstance(expr, IsNull):
        return compile_expr(expr.value).isNull()
    if isinstance(expr, IsNotNull):
        return compile_expr(expr.value).isNotNull()
    if isinstance(expr, IsIn):
        return compile_expr(expr.value).isin(list(expr.options))
    if isinstance(expr, And):
        return compile_expr(expr.left) & compile_expr(expr.right)
    if isinstance(expr, Or):
        return compile_expr(expr.left) | compile_expr(expr.right)
    if isinstance(expr, Not):
        return ~compile_expr(expr.value)
    if isinstance(expr, Xor):
        return compile_expr(expr.left) ^ compile_expr(expr.right)
    if isinstance(expr, Abs):
        return F.abs(compile_expr(expr.value))
    if isinstance(expr, Round):
        e = compile_expr(expr.value)
        return F.round(e, expr.ndigits) if expr.ndigits is not None else F.round(e)
    if isinstance(expr, Floor):
        return F.floor(compile_expr(expr.value))
    if isinstance(expr, Ceil):
        return F.ceil(compile_expr(expr.value))
    if isinstance(expr, Coalesce):
        return F.coalesce(*[compile_expr(v) for v in expr.values])
    if isinstance(expr, IfElse):
        return F.when(compile_expr(expr.cond), compile_expr(expr.then_value)).otherwise(
            compile_expr(expr.else_value)
        )
    if isinstance(expr, Over):
        # PlanFrame Over only carries partition/order column names, so we can map to Window.
        from sparkless.sql.window import Window

        w = Window.partitionBy(*expr.partition_by)
        if expr.order_by is not None:
            w = w.orderBy(*expr.order_by)
        return compile_expr(expr.value).over(w)
    if isinstance(expr, Between):
        return compile_expr(expr.value).between(compile_expr(expr.low), compile_expr(expr.high))
    if isinstance(expr, Clip):
        # Clamp via greatest/least so null bounds are simply skipped.
        e = compile_expr(expr.value)
        if expr.lower is not None:
            e = F.greatest(e, compile_expr(expr.lower))
        if expr.upper is not None:
            e = F.least(e, compile_expr(expr.upper))
        return e
    if isinstance(expr, Pow):
        return F.pow(compile_expr(expr.base), compile_expr(expr.exponent))
    if isinstance(expr, Exp):
        return F.exp(compile_expr(expr.value))
    if isinstance(expr, Log):
        return F.log(compile_expr(expr.value))
    if isinstance(expr, StrContains):
        e = compile_expr(expr.value)
        if expr.literal:
            return e.contains(expr.pattern)
        # Non-literal patterns are treated as regular expressions.
        return e.rlike(expr.pattern)
    if isinstance(expr, StrStartsWith):
        return compile_expr(expr.value).startswith(expr.prefix)
    if isinstance(expr, StrEndsWith):
        return compile_expr(expr.value).endswith(expr.suffix)
    if isinstance(expr, StrLower):
        return F.lower(compile_expr(expr.value))
    if isinstance(expr, StrUpper):
        return F.upper(compile_expr(expr.value))
    if isinstance(expr, StrLen):
        return F.length(compile_expr(expr.value))
    if isinstance(expr, StrReplace):
        return F.regexp_replace(compile_expr(expr.value), expr.pattern, expr.replacement)
    if isinstance(expr, StrStrip):
        return F.trim(compile_expr(expr.value))
    if isinstance(expr, StrSplit):
        return F.split(compile_expr(expr.value), expr.by)
    if isinstance(expr, DtYear):
        return F.year(compile_expr(expr.value))
    if isinstance(expr, DtMonth):
        return F.month(compile_expr(expr.value))
    if isinstance(expr, DtDay):
        return F.dayofmonth(compile_expr(expr.value))
    if isinstance(expr, Sqrt):
        return F.sqrt(compile_expr(expr.value))
    if isinstance(expr, IsFinite):
        # Spark doesn't have a direct isFinite; approximate via isnan/isnull checks.
        e = compile_expr(expr.value)
        return (~cast(Any, F.isnan(e))) & e.isNotNull()
    if isinstance(expr, AggExpr):
        inner = compile_expr(expr.inner)
        if expr.op == "count":
            return F.count(inner)
        if expr.op == "sum":
            return F.sum(inner)
        if expr.op == "mean":
            return F.avg(inner)
        if expr.op == "min":
            return F.min(inner)
        if expr.op == "max":
            return F.max(inner)
        if expr.op == "n_unique":
            return F.countDistinct(inner)
        raise PlanFrameExpressionError(f"Unsupported aggregation op: {expr.op!r}")

    raise PlanFrameExpressionError(f"Unsupported expr node for sparkless: {type(expr)!r}")
@@ -0,0 +1,159 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Mapping, Sequence
4
+ from typing import Any, ClassVar, Generic, TypeVar, cast
5
+
6
+ from planframe.frame import Frame
7
+ from planframe.spark import SparkFrame
8
+ from planframe.typing.storage import StorageOptions
9
+ from planframe_sparkless._spark import _spark
10
+ from planframe_sparkless.adapter import (
11
+ SparklessAdapter,
12
+ SparklessBackendExpr,
13
+ SparklessBackendFrame,
14
+ )
15
+
16
+ SchemaT = TypeVar("SchemaT")
17
+
18
+ SparklessData = Mapping[str, Sequence[object]] | Sequence[Mapping[str, object]]
19
+
20
+
21
+ def _schema_defaults(schema: type[Any]) -> dict[str, object]:
22
+ ann = dict(getattr(schema, "__dict__", {}).get("__annotations__", {}))
23
+ out: dict[str, object] = {}
24
+ for name in ann:
25
+ if name in getattr(schema, "__dict__", {}):
26
+ out[name] = getattr(schema, name)
27
+ return out
28
+
29
+
30
+ def _fill_missing_from_defaults(
31
+ data: SparklessData, *, defaults: dict[str, object]
32
+ ) -> SparklessData:
33
+ if not defaults:
34
+ return data
35
+
36
+ if isinstance(data, Mapping):
37
+ data_map = cast(Mapping[str, Sequence[object]], data)
38
+ if not data_map:
39
+ return dict(data_map)
40
+ first = next(iter(data_map.values()))
41
+ n = len(first)
42
+ out: dict[str, list[object]] = {k: list(v) for k, v in data_map.items()}
43
+ for k, dv in defaults.items():
44
+ if k not in out:
45
+ out[k] = [dv] * n
46
+ return out
47
+
48
+ out_rows: list[dict[str, object]] = []
49
+ for row in data:
50
+ r = dict(row)
51
+ for k, dv in defaults.items():
52
+ if k not in r:
53
+ r[k] = dv
54
+ out_rows.append(r)
55
+ return out_rows
56
+
57
+
58
def _to_sparkless_df(data: SparklessData, *, schema: type[Any]) -> SparklessBackendFrame:
    """Build a sparkless DataFrame from *data*, backfilling schema defaults.

    Missing columns/keys are filled from class-level defaults declared on
    *schema* before handing the data to the active sparkless session.
    """
    completed = _fill_missing_from_defaults(data, defaults=_schema_defaults(schema))
    return _spark().createDataFrame(completed)  # type: ignore[arg-type]
62
+
63
+
64
+ class _SparklessFrameMeta(type):
65
+ def __call__(cls, *args: Any, **kwargs: Any) -> Any: # noqa: ANN401
66
+ # Allow normal construction when `Frame.source(...)` calls `cls(_data=..., ...)`.
67
+ if "_data" in kwargs and "_adapter" in kwargs and "_plan" in kwargs and "_schema" in kwargs:
68
+ return super().__call__(*args, **kwargs)
69
+
70
+ data = args[0] if args else kwargs.pop("data")
71
+ if kwargs:
72
+ raise TypeError(f"Unexpected constructor kwargs: {sorted(kwargs)}")
73
+
74
+ if not isinstance(data, (dict, list)):
75
+ raise TypeError("SparklessFrame expects dict-of-lists or list-of-dicts")
76
+
77
+ df = _to_sparkless_df(cast(SparklessData, data), schema=cast(type[Any], cls))
78
+ cls_any = cast(Any, cls)
79
+ return cls_any.source(df, adapter=cls_any._adapter_singleton, schema=cast(type[Any], cls))
80
+
81
+
82
class SparklessFrame(
    SparkFrame[SchemaT, SparklessBackendFrame, SparklessBackendExpr],
    Frame[SchemaT, SparklessBackendFrame, SparklessBackendExpr],
    Generic[SchemaT],
    metaclass=_SparklessFrameMeta,
):
    """A PlanFrame ``Frame`` bound to the sparkless backend, using the SparkFrame UI."""

    _adapter_singleton: ClassVar[SparklessAdapter] = SparklessAdapter()
    __planframe_model__ = True

    # ---- IO (Spark-style engine readers) ----
    @classmethod
    def scan_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        hive_partitioning: bool | None = None,
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Scan a parquet dataset at *path* into a frame typed by *schema*."""
        backend_df = cls._adapter_singleton.reader.scan_parquet(
            path, hive_partitioning=hive_partitioning, storage_options=storage_options
        )
        return cls.source(backend_df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Scan a CSV file at *path* into a frame typed by *schema*."""
        backend_df = cls._adapter_singleton.reader.scan_csv(
            path, storage_options=storage_options
        )
        return cls.source(backend_df, adapter=cls._adapter_singleton, schema=schema)

    @classmethod
    def scan_ndjson(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Scan a newline-delimited JSON file at *path* into a typed frame."""
        backend_df = cls._adapter_singleton.reader.scan_ndjson(
            path, storage_options=storage_options
        )
        return cls.source(backend_df, adapter=cls._adapter_singleton, schema=schema)

    # Eager read aliases (Sparkless is lazy-ish; these are just naming aliases)
    @classmethod
    def read_parquet(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_parquet`; provided for eager-reader naming parity."""
        return cls.scan_parquet(path, schema=schema, storage_options=storage_options)

    @classmethod
    def read_csv(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_csv`; provided for eager-reader naming parity."""
        return cls.scan_csv(path, schema=schema, storage_options=storage_options)

    @classmethod
    def read_json(
        cls,
        path: str,
        *,
        schema: type[SchemaT],
        storage_options: StorageOptions | None = None,
    ) -> SparklessFrame[SchemaT]:
        """Alias for :meth:`scan_ndjson`; provided for eager-reader naming parity."""
        return cls.scan_ndjson(path, schema=schema, storage_options=storage_options)
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "planframe-sparkless"
7
+ version = "0.1.0"
8
+ description = "sparkless backend adapter for PlanFrame (SparkFrame UI + sparkless engine)."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "PlanFrame Contributors" }]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3 :: Only",
16
+ "Programming Language :: Python :: 3.10",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Typing :: Typed",
19
+ ]
20
+ dependencies = [
21
+ "planframe>=1.0.0,<2.0.0",
22
+ "sparkless>=4.5",
23
+ ]
24
+
25
+ [project.urls]
26
+ Repository = "https://github.com/eddiethedean/planframe"
27
+ Documentation = "https://planframe.readthedocs.io/en/latest/planframe_sparkless/"
28
+ Issues = "https://github.com/eddiethedean/planframe/issues"
29
+
30
+ [tool.hatch.build.targets.wheel]
31
+ packages = ["planframe_sparkless"]
32
+