robin-sparkless 0.1.0__cp38-abi3-win_amd64.whl → 0.1.1__cp38-abi3-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- robin_sparkless/__init__.pyi +358 -0
- robin_sparkless/py.typed +0 -0
- robin_sparkless/robin_sparkless.pyd +0 -0
- robin_sparkless-0.1.1.dist-info/METADATA +112 -0
- robin_sparkless-0.1.1.dist-info/RECORD +8 -0
- robin_sparkless.pyi +358 -0
- robin_sparkless-0.1.0.dist-info/METADATA +0 -166
- robin_sparkless-0.1.0.dist-info/RECORD +0 -5
- {robin_sparkless-0.1.0.dist-info → robin_sparkless-0.1.1.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Type stubs for robin_sparkless (PySpark-like DataFrame API on Polars).
|
|
3
|
+
Use for static type checking (mypy, pyright, etc.).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
# --- Classes ---
|
|
11
|
+
|
|
12
|
+
class SparkSession:
    """Typed surface of the session object.

    Declares DataFrame constructors, path-based readers and the
    temp-view / SQL entry points. Signatures only; the behaviour lives in
    the compiled extension module.
    """

    @classmethod
    def builder(cls) -> SparkSessionBuilder: ...
    def is_case_sensitive(self) -> bool: ...

    # Constructors from in-memory data. `data` is typed Any, so the
    # accepted row layout is not expressed in the types.
    def create_dataframe(self, data: Any, column_names: list[str]) -> DataFrame: ...
    # Same signature as create_dataframe -- presumably a camelCase alias; confirm.
    def createDataFrame(self, data: Any, column_names: list[str]) -> DataFrame: ...
    # `schema` is a list of (str, str) pairs -- presumably (name, dtype); confirm.
    def create_dataframe_from_rows(
        self, data: Any, schema: list[tuple[str, str]]
    ) -> DataFrame: ...

    # Readers: each takes a path string and is typed to return a DataFrame.
    def read_csv(self, path: str) -> DataFrame: ...
    def read_parquet(self, path: str) -> DataFrame: ...
    def read_json(self, path: str) -> DataFrame: ...
    # NOTE(review): the packaged README lists delta and sql as optional build
    # features, so the delta/sql members may be absent in some builds -- confirm.
    def read_delta(self, path: str) -> DataFrame: ...
    def read_delta_version(self, path: str, version: int | None = None) -> DataFrame: ...

    # Temp-view / SQL entry points.
    def create_or_replace_temp_view(self, name: str, df: DataFrame) -> None: ...
    def table(self, name: str) -> DataFrame: ...
    def sql(self, query: str) -> DataFrame: ...
|
|
41
|
+
|
|
42
|
+
class SparkSessionBuilder:
    """Typed surface of the session builder.

    The three setters are typed to return SparkSessionBuilder, so calls
    can be chained; get_or_create is typed to return the SparkSession.
    """

    def app_name(self, name: str) -> SparkSessionBuilder: ...
    def master(self, master: str) -> SparkSessionBuilder: ...
    # Both key and value are typed as str.
    def config(self, key: str, value: str) -> SparkSessionBuilder: ...
    def get_or_create(self) -> SparkSession: ...
|
|
47
|
+
|
|
48
|
+
class DataFrame:
    """Typed surface of the DataFrame object.

    Declares materialising calls, column/row transformations, set
    operations, joins and accessor methods. Signatures only; the
    behaviour lives in the compiled extension module.
    """

    # Calls typed with non-DataFrame results.
    def count(self) -> int: ...
    def show(self, n: int | None = None) -> None: ...
    def collect(self) -> list[dict[str, Any]]: ...

    # Row filtering and column projection / derivation.
    def filter(self, condition: Column) -> DataFrame: ...
    def select(self, cols: list[str]) -> DataFrame: ...
    def select_expr(self, exprs: list[str]) -> DataFrame: ...
    def with_column(self, column_name: str, expr: Column) -> DataFrame: ...
    # `mapping` may be a dict or a list of (name, Column) pairs.
    def with_columns(
        self, mapping: dict[str, Column] | list[tuple[str, Column]]
    ) -> DataFrame: ...
    # `mapping` may be a dict or a list of (str, str) pairs --
    # presumably (old, new); confirm.
    def with_columns_renamed(
        self, mapping: dict[str, str] | list[tuple[str, str]]
    ) -> DataFrame: ...
    def with_column_renamed(self, old: str, new: str) -> DataFrame: ...

    # Ordering, grouping, limiting, de-duplication.
    def order_by(
        self, cols: list[str], ascending: list[bool] | None = None
    ) -> DataFrame: ...
    def order_by_exprs(self, sort_orders: list[SortOrder]) -> DataFrame: ...
    def group_by(self, cols: list[str]) -> GroupedData: ...
    def limit(self, n: int) -> DataFrame: ...
    def distinct(self, subset: list[str] | None = None) -> DataFrame: ...

    # Combining frames.
    def union(self, other: DataFrame) -> DataFrame: ...
    def union_by_name(self, other: DataFrame) -> DataFrame: ...
    def join(
        self, other: DataFrame, on: str | list[str], how: str = "inner"
    ) -> DataFrame: ...

    # Column / null removal.
    def drop(self, *cols: str) -> DataFrame: ...
    def dropna(self, subset: list[str] | None = None) -> DataFrame: ...
    def col_regex(self, pattern: str) -> DataFrame: ...

    # Accessors declared as methods (called with parentheses).
    def stat(self) -> DataFrameStat: ...
    def na(self) -> DataFrameNa: ...
    # NOTE(review): typed as a list of dicts, not a pandas object -- confirm intended.
    def to_pandas(self) -> list[dict[str, Any]]: ...
    def corr(self) -> DataFrame: ...
    def write(self) -> DataFrameWriter: ...
|
|
89
|
+
|
|
90
|
+
class DataFrameStat:
    """Typed surface of the object returned by DataFrame.stat().

    The pairwise calls take two column names and are typed to return a
    float; corr_matrix is typed to return a DataFrame.
    """

    def cov(self, col1: str, col2: str) -> float: ...
    def corr(self, col1: str, col2: str) -> float: ...
    def corr_matrix(self) -> DataFrame: ...
|
|
94
|
+
|
|
95
|
+
class DataFrameNa:
    """Typed surface of the object returned by DataFrame.na().

    Both members are typed to return a DataFrame.
    """

    # The fill value is typed as a Column expression, not a bare scalar.
    def fill(self, value: Column) -> DataFrame: ...
    # `subset` is optional and defaults to None.
    def drop(self, subset: list[str] | None = None) -> DataFrame: ...
|
|
98
|
+
|
|
99
|
+
class Column:
    """Typed surface of a column expression.

    Every member is typed to return a Column. Apart from alias (which
    takes a str) and the equality dunders (which take object), operands
    are typed as Column.
    """

    def alias(self, name: str) -> Column: ...

    # Named comparison methods.
    def gt(self, other: Column) -> Column: ...
    def ge(self, other: Column) -> Column: ...
    def lt(self, other: Column) -> Column: ...
    def le(self, other: Column) -> Column: ...
    def eq(self, other: Column) -> Column: ...
    def neq(self, other: Column) -> Column: ...

    # Comparison operators. __eq__/__ne__ return Column rather than bool,
    # hence the override ignores.
    def __eq__(self, other: object) -> Column: ...  # type: ignore[override]
    def __ne__(self, other: object) -> Column: ...  # type: ignore[override]
    def __lt__(self, other: Column) -> Column: ...
    def __le__(self, other: Column) -> Column: ...
    def __gt__(self, other: Column) -> Column: ...
    def __ge__(self, other: Column) -> Column: ...

    # Arithmetic operators.
    def __add__(self, other: Column) -> Column: ...
    def __sub__(self, other: Column) -> Column: ...
    def __mul__(self, other: Column) -> Column: ...
    def __truediv__(self, other: Column) -> Column: ...
    def __mod__(self, other: Column) -> Column: ...

    # `&` and `|` operators.
    def __and__(self, other: Column) -> Column: ...
    def __or__(self, other: Column) -> Column: ...

    def map_concat(self, other: Column) -> Column: ...
|
|
121
|
+
|
|
122
|
+
class SortOrder:
    """Opaque sort-specification type; the stub declares no members."""
|
|
123
|
+
|
|
124
|
+
class WhenBuilder:
    """Typed surface of one stage of the when/then chain.

    `when` is typed to return a ThenBuilder; `otherwise` is typed to
    return the resulting Column.
    """

    def when(self, condition: Column) -> ThenBuilder: ...
    def otherwise(self, value: Column) -> Column: ...
|
|
127
|
+
|
|
128
|
+
class ThenBuilder:
    """Typed surface of one stage of the when/then chain.

    `then` is typed to return a WhenBuilder; `otherwise` is typed to
    return the resulting Column.
    """

    def then(self, value: Column) -> WhenBuilder: ...
    def otherwise(self, value: Column) -> Column: ...
|
|
131
|
+
|
|
132
|
+
class GroupedData:
    """Typed surface of the object returned by DataFrame.group_by().

    count takes no argument, agg takes a list of Column expressions, and
    every other member takes a single column name (str). All members are
    typed to return a DataFrame.
    """

    def count(self) -> DataFrame: ...
    def sum(self, column: str) -> DataFrame: ...
    def avg(self, column: str) -> DataFrame: ...
    def min(self, column: str) -> DataFrame: ...
    def max(self, column: str) -> DataFrame: ...
    # The only member that accepts Column expressions instead of a name.
    def agg(self, exprs: list[Column]) -> DataFrame: ...
    def collect_list(self, column: str) -> DataFrame: ...
    def collect_set(self, column: str) -> DataFrame: ...
    def count_if(self, column: str) -> DataFrame: ...
    def any_value(self, column: str) -> DataFrame: ...
    def bool_and(self, column: str) -> DataFrame: ...
    def bool_or(self, column: str) -> DataFrame: ...
    def product(self, column: str) -> DataFrame: ...
    def kurtosis(self, column: str) -> DataFrame: ...
    def skewness(self, column: str) -> DataFrame: ...
|
|
148
|
+
|
|
149
|
+
class CubeRollupData:
    """Typed surface of a cube/rollup grouping; only `agg` is declared."""

    # Takes a list of Column expressions; typed to return a DataFrame.
    def agg(self, exprs: list[Column]) -> DataFrame: ...
|
|
151
|
+
|
|
152
|
+
class DataFrameWriter:
    """Typed surface of the object returned by DataFrame.write().

    `mode` and `format` are typed to return DataFrameWriter, so calls can
    be chained; `save` takes a path string and is typed to return None.
    """

    # Accepted mode / format strings are not expressed in the types.
    def mode(self, mode: str) -> DataFrameWriter: ...
    def format(self, format: str) -> DataFrameWriter: ...
    def save(self, path: str) -> None: ...
|
|
156
|
+
|
|
157
|
+
# --- Module-level functions (expression builders return Column) ---
|
|
158
|
+
|
|
159
|
+
# Expression entry points.
def col(name: str) -> Column: ...
# Literal values are limited by the annotation to None, int, float, bool or str.
def lit(value: None | int | float | bool | str) -> Column: ...
# Typed to return ThenBuilder, the first stage of the when/then chain.
def when(condition: Column) -> ThenBuilder: ...
def coalesce(*cols: Column) -> Column: ...
# Single-Column functions typed Column -> Column. These names shadow
# Python builtins when star-imported.
def sum(column: Column) -> Column: ...
def avg(column: Column) -> Column: ...
def min(column: Column) -> Column: ...
def max(column: Column) -> Column: ...
def count(column: Column) -> Column: ...
# `schema` is a list of (str, str) pairs -- presumably (name, dtype); confirm.
# `plan_json` is typed as a plain str.
def execute_plan(
    data: Any, schema: list[tuple[str, str]], plan_json: str
) -> DataFrame: ...
|
|
173
|
+
|
|
174
|
+
# Single-column functions (Column in; Column out, or SortOrder for the asc/desc helpers; cast/try_cast also take a dtype string)
|
|
175
|
+
# Typed Column -> Column.
def ascii(column: Column) -> Column: ...
def base64(column: Column) -> Column: ...
def unbase64(column: Column) -> Column: ...
# Casts take the target type as a string in `dtype`.
def cast(column: Column, dtype: str) -> Column: ...
def try_cast(column: Column, dtype: str) -> Column: ...
def isnull(column: Column) -> Column: ...
def isnotnull(column: Column) -> Column: ...
def isnan(column: Column) -> Column: ...
# Sort helpers: typed Column -> SortOrder.
def asc(column: Column) -> SortOrder: ...
def asc_nulls_first(column: Column) -> SortOrder: ...
def asc_nulls_last(column: Column) -> SortOrder: ...
def desc(column: Column) -> SortOrder: ...
def desc_nulls_first(column: Column) -> SortOrder: ...
def desc_nulls_last(column: Column) -> SortOrder: ...
|
|
189
|
+
|
|
190
|
+
# Binary / variadic (common patterns; others follow same style)
|
|
191
|
+
# Variadic over Column arguments.
def greatest(*columns: Column) -> Column: ...
def least(*columns: Column) -> Column: ...
# Column plus plain-int arguments.
def add_months(column: Column, n: int) -> Column: ...
def substring(column: Column, pos: int, len: int) -> Column: ...
# `len` defaults to -1; the meaning of that sentinel is not visible in the stub.
def overlay(src: Column, replace: Column, pos: int, len: int = -1) -> Column: ...
def format_number(column: Column, d: int) -> Column: ...
# A leading str argument followed by any number of Columns.
def format_string(fmt: str, *columns: Column) -> Column: ...
def concat(*columns: Column) -> Column: ...
def concat_ws(sep: str, *columns: Column) -> Column: ...
# Collection constructors.
def array(*columns: Column) -> Column: ...
def struct(*columns: Column) -> Column: ...
# Arguments are typed Any -- presumably alternating names and Columns; confirm.
def named_struct(*name_column_pairs: Any) -> Column: ...
def create_map(*key_value_columns: Column) -> Column: ...
def map_concat(a: Column, b: Column) -> Column: ...
def equal_null(a: Column, b: Column) -> Column: ...
# `key` / `values` are typed Any.
def get(column: Column, key: Any) -> Column: ...
def isin(column: Column, values: Any) -> Column: ...
# Optional integer seed, defaulting to None.
def rand(seed: int | None = None) -> Column: ...
def randn(seed: int | None = None) -> Column: ...
|
|
210
|
+
|
|
211
|
+
# Placeholder for the remaining 200+ expression functions (typical pattern: Column in, Column out; some below take extra scalar arguments or no arguments at all)
|
|
212
|
+
# Trigonometric / hyperbolic functions; all Column -> Column except atan2.
def acos(column: Column) -> Column: ...
def acosh(column: Column) -> Column: ...
def asin(column: Column) -> Column: ...
def asinh(column: Column) -> Column: ...
def atan(column: Column) -> Column: ...
# Two Column operands, declared in (y, x) order.
def atan2(y: Column, x: Column) -> Column: ...
def atanh(column: Column) -> Column: ...
def sin(column: Column) -> Column: ...
def cos(column: Column) -> Column: ...
def tan(column: Column) -> Column: ...
def sinh(column: Column) -> Column: ...
def cosh(column: Column) -> Column: ...
def tanh(column: Column) -> Column: ...
def degrees(column: Column) -> Column: ...
def radians(column: Column) -> Column: ...
# Roots, rounding, exponentials and logarithms.
def cbrt(column: Column) -> Column: ...
def ceiling(column: Column) -> Column: ...
def floor(column: Column) -> Column: ...
def exp(column: Column) -> Column: ...
def expm1(column: Column) -> Column: ...
def ln(column: Column) -> Column: ...
def log10(column: Column) -> Column: ...
def log1p(column: Column) -> Column: ...
def log2(column: Column) -> Column: ...
# Both base and exponent are Columns.
def power(base: Column, exp: Column) -> Column: ...
def rint(column: Column) -> Column: ...
# `scale` is a plain int defaulting to 0.
def round(column: Column, scale: int = 0) -> Column: ...
def signum(column: Column) -> Column: ...
def sqrt(column: Column) -> Column: ...
|
|
241
|
+
# Length / case / trimming helpers.
def left(column: Column, n: int) -> Column: ...
def right(column: Column, n: int) -> Column: ...
def length(column: Column) -> Column: ...
def lower(column: Column) -> Column: ...
def upper(column: Column) -> Column: ...
def ltrim(column: Column) -> Column: ...
def rtrim(column: Column) -> Column: ...
def trim(column: Column) -> Column: ...
def reverse(column: Column) -> Column: ...
def repeat(column: Column, n: int) -> Column: ...
# Search and replacement are plain str, not Column.
def replace(src: Column, search: str, replacement: str) -> Column: ...
# Predicates taking a plain-str literal or pattern.
def contains(column: Column, literal: str) -> Column: ...
def startswith(column: Column, literal: str) -> Column: ...
def endswith(column: Column, literal: str) -> Column: ...
def like(column: Column, pattern: str) -> Column: ...
def ilike(column: Column, pattern: str) -> Column: ...
def rlike(column: Column, pattern: str) -> Column: ...
# `idx` defaults to 0.
def regexp_extract(column: Column, pattern: str, idx: int = 0) -> Column: ...
def regexp_replace(column: Column, pattern: str, replacement: str) -> Column: ...
def split(column: Column, pattern: str) -> Column: ...
def substring_index(column: Column, delim: str, count: int) -> Column: ...
# chr and char have identical signatures -- presumably aliases; confirm.
def chr(column: Column) -> Column: ...
def char(column: Column) -> Column: ...
# Digest functions; sha2 additionally takes `numBits` as an int.
def md5(column: Column) -> Column: ...
def sha1(column: Column) -> Column: ...
def sha2(column: Column, numBits: int) -> Column: ...
|
|
267
|
+
# Field extractors; all Column -> Column.
def dayofmonth(column: Column) -> Column: ...
def dayofweek(column: Column) -> Column: ...
def dayofyear(column: Column) -> Column: ...
def hour(column: Column) -> Column: ...
def minute(column: Column) -> Column: ...
def month(column: Column) -> Column: ...
def quarter(column: Column) -> Column: ...
def second(column: Column) -> Column: ...
def weekofyear(column: Column) -> Column: ...
def year(column: Column) -> Column: ...
# Arithmetic: the day offset is a plain int.
def date_add(column: Column, days: int) -> Column: ...
def date_sub(column: Column, days: int) -> Column: ...
# Two-column differences, declared in (end, start) order.
def datediff(end: Column, start: Column) -> Column: ...
def months_between(end: Column, start: Column) -> Column: ...
# Conversions; the stub declares no format argument for these two.
def to_date(column: Column) -> Column: ...
def to_timestamp(column: Column) -> Column: ...
# Format strings are plain str; the defaults are as declared.
def unix_timestamp(column: Column, fmt: str | None = None) -> Column: ...
def from_unixtime(column: Column, fmt: str = "yyyy-MM-dd HH:mm:ss") -> Column: ...
# Zero-argument expressions.
def current_timestamp() -> Column: ...
def current_date() -> Column: ...
def date_format(column: Column, fmt: str) -> Column: ...
|
|
288
|
+
# Null-replacement helpers; the replacement values are Columns.
def nvl(column: Column, replacement: Column) -> Column: ...
def nvl2(column: Column, if_not_null: Column, if_null: Column) -> Column: ...
def ifnull(column: Column, replacement: Column) -> Column: ...
# Array functions; the `value` arguments are typed Any.
def array_contains(column: Column, value: Any) -> Column: ...
def array_distinct(column: Column) -> Column: ...
def array_except(a: Column, b: Column) -> Column: ...
def array_intersect(a: Column, b: Column) -> Column: ...
# `null_replacement` is optional and defaults to None.
def array_join(
    column: Column, delimiter: str, null_replacement: str | None = None
) -> Column: ...
def array_max(column: Column) -> Column: ...
def array_min(column: Column) -> Column: ...
def array_position(column: Column, value: Any) -> Column: ...
def array_remove(column: Column, value: Any) -> Column: ...
def array_sort(column: Column) -> Column: ...
def array_union(a: Column, b: Column) -> Column: ...
def arrays_zip(*columns: Column) -> Column: ...
def explode(column: Column) -> Column: ...
def explode_outer(column: Column) -> Column: ...
def posexplode(column: Column) -> Column: ...
def size(column: Column) -> Column: ...
# start / length / idx are plain ints; the index base is not visible in the stub.
def slice(column: Column, start: int, length: int) -> Column: ...
# `asc` defaults to True.
def sort_array(column: Column, asc: bool = True) -> Column: ...
def element_at(column: Column, idx: int) -> Column: ...
# Map functions.
def map_keys(column: Column) -> Column: ...
def map_values(column: Column) -> Column: ...
def map_contains_key(column: Column, key: Any) -> Column: ...
|
|
315
|
+
# Ranking expressions; the first three take no arguments, ntile takes an int.
def row_number() -> Column: ...
def rank() -> Column: ...
def dense_rank() -> Column: ...
def ntile(n: int) -> Column: ...
# The only function in this run typed DataFrame -> DataFrame.
def broadcast(df: DataFrame) -> DataFrame: ...
# Zero-argument expressions, each typed to return a Column.
def input_file_name() -> Column: ...
def monotonically_increasing_id() -> Column: ...
def spark_partition_id() -> Column: ...
def version() -> Column: ...
def current_user() -> Column: ...
def user() -> Column: ...
# Hashing; hash and xxhash64 are variadic, crc32 takes one Column.
def hash(*columns: Column) -> Column: ...
def crc32(column: Column) -> Column: ...
def xxhash64(*columns: Column) -> Column: ...
# Assertion-style expressions.
def assert_true(column: Column) -> Column: ...
def raise_error(column: Column) -> Column: ...
|
|
331
|
+
# Bitwise helpers. Several names come in pairs with identical signatures
# (bitwiseNOT/bitwise_not, bit_get/getbit, shiftleft/shift_left,
# shiftright/shift_right) -- presumably aliases; confirm.
def bitwiseNOT(column: Column) -> Column: ...
def bitwise_not(column: Column) -> Column: ...
def bit_count(column: Column) -> Column: ...
def bit_get(column: Column, bit: int) -> Column: ...
def getbit(column: Column, bit: int) -> Column: ...
def shiftleft(column: Column, n: int) -> Column: ...
def shiftright(column: Column, n: int) -> Column: ...
def shift_left(column: Column, n: int) -> Column: ...
def shift_right(column: Column, n: int) -> Column: ...
def typeof(column: Column) -> Column: ...
# `scale` is a plain int defaulting to 0.
def bround(column: Column, scale: int = 0) -> Column: ...
def negate(column: Column) -> Column: ...
def positive(column: Column) -> Column: ...
def cot(column: Column) -> Column: ...
def csc(column: Column) -> Column: ...
def sec(column: Column) -> Column: ...
# Zero-argument expressions, each typed to return a Column.
def e() -> Column: ...
def pi() -> Column: ...
|
|
349
|
+
# All functions in this run take a single Column and are typed to return a Column.
def median(column: Column) -> Column: ...
def mode(column: Column) -> Column: ...
def stddev_pop(column: Column) -> Column: ...
def var_pop(column: Column) -> Column: ...
def count_distinct(column: Column) -> Column: ...
def approx_count_distinct(column: Column) -> Column: ...
def first(column: Column) -> Column: ...
def last(column: Column) -> Column: ...
def collect_list(column: Column) -> Column: ...
def collect_set(column: Column) -> Column: ...
|
robin_sparkless/py.typed
ADDED
|
File without changes
|
|
Binary file
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: robin-sparkless
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Classifier: Development Status :: 3 - Alpha
|
|
5
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Programming Language :: Rust
|
|
8
|
+
Classifier: Topic :: Scientific/Engineering
|
|
9
|
+
Summary: PySpark-like DataFrame API in Rust (Polars backend), with Python bindings via PyO3
|
|
10
|
+
Author: Robin Sparkless contributors
|
|
11
|
+
License: MIT
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
14
|
+
|
|
15
|
+
# robin-sparkless (Python)
|
|
16
|
+
|
|
17
|
+
[PyPI](https://pypi.org/project/robin-sparkless/)
|
|
18
|
+
[Python 3.8+](https://www.python.org/downloads/)
|
|
19
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
20
|
+
[Documentation](https://robin-sparkless.readthedocs.io/)
|
|
21
|
+
[Source on GitHub](https://github.com/eddiethedean/robin-sparkless)
|
|
22
|
+
|
|
23
|
+
**PySpark-style DataFrames in Python—no JVM.** Uses [Polars](https://www.pola.rs/) under the hood for fast execution.
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install robin-sparkless
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
**Requirements:** Python 3.8+
|
|
32
|
+
|
|
33
|
+
## Quick start
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
import robin_sparkless as rs
|
|
37
|
+
|
|
38
|
+
spark = rs.SparkSession.builder().app_name("demo").get_or_create()
|
|
39
|
+
df = spark.create_dataframe(
|
|
40
|
+
[(1, 25, "Alice"), (2, 30, "Bob"), (3, 35, "Charlie")],
|
|
41
|
+
["id", "age", "name"],
|
|
42
|
+
)
|
|
43
|
+
filtered = df.filter(rs.col("age").gt(rs.lit(26)))
|
|
44
|
+
print(filtered.collect())
|
|
45
|
+
# [{"id": 2, "age": 30, "name": "Bob"}, {"id": 3, "age": 35, "name": "Charlie"}]
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Read from files:
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
df = spark.read_csv("data.csv")
|
|
52
|
+
df = spark.read_parquet("data.parquet")
|
|
53
|
+
df = spark.read_json("data.json")
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Filter, select, group, join, and use window functions with a PySpark-like API. See the [full documentation](https://robin-sparkless.readthedocs.io/) for details.
|
|
57
|
+
|
|
58
|
+
## Optional features (install from source)
|
|
59
|
+
|
|
60
|
+
Building from source requires [Rust](https://rustup.rs/) and [maturin](https://www.maturin.rs/). Clone the repo, then:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install maturin
|
|
64
|
+
maturin develop --features pyo3 # default: DataFrame API
|
|
65
|
+
maturin develop --features "pyo3,sql" # spark.sql() and temp views
|
|
66
|
+
maturin develop --features "pyo3,delta" # read_delta / write_delta
|
|
67
|
+
maturin develop --features "pyo3,sql,delta" # all optional features
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Type checking
|
|
71
|
+
|
|
72
|
+
The package ships with PEP 561 type stubs (`robin_sparkless/__init__.pyi` alongside a `py.typed` marker, plus a top-level `robin_sparkless.pyi`). Use mypy, pyright, or another checker:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install robin-sparkless mypy
|
|
76
|
+
mypy your_script.py
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For **Python 3.8** compatibility, use mypy <1.10 (newer mypy drops support for `python_version = "3.8"` in config). The project’s `pyproject.toml` includes `[tool.mypy]` and `[tool.ruff]` with `target-version` / `python_version` set for 3.8.
|
|
80
|
+
|
|
81
|
+
## Development
|
|
82
|
+
|
|
83
|
+
From a clone of the repo:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
python -m venv .venv
|
|
87
|
+
source .venv/bin/activate # or .venv\Scripts\activate on Windows
|
|
88
|
+
pip install -r requirements-ci.txt
|
|
89
|
+
maturin develop --features pyo3 --release
|
|
90
|
+
pytest tests/python/ -v
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Lint and format:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
ruff format .
|
|
97
|
+
ruff check .
|
|
98
|
+
mypy .
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
CI uses `requirements-ci.txt` (maturin, pytest, mypy<1.10 for Python 3.8).
|
|
102
|
+
|
|
103
|
+
## Links
|
|
104
|
+
|
|
105
|
+
- **Documentation:** [robin-sparkless.readthedocs.io](https://robin-sparkless.readthedocs.io/)
|
|
106
|
+
- **Source:** [github.com/eddiethedean/robin-sparkless](https://github.com/eddiethedean/robin-sparkless)
|
|
107
|
+
- **Rust crate:** [crates.io/crates/robin-sparkless](https://crates.io/crates/robin-sparkless)
|
|
108
|
+
|
|
109
|
+
## License
|
|
110
|
+
|
|
111
|
+
MIT
|
|
112
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
robin_sparkless\__init__.py,sha256=h26i0kv9czksgzJ0zbFJvX-5IZV8hrWVEerFQOVG3fA,143
|
|
2
|
+
robin_sparkless\__init__.pyi,sha256=RUzDhVjxwH14YRwqwwqsIPaiLNDWbYGoc51vYjhbo54,15677
|
|
3
|
+
robin_sparkless\py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
robin_sparkless\robin_sparkless.pyd,sha256=4YCUz0p_2-JN8_1FZ1R-YH9aCeQzkzY-YWRbMtryZgg,58458624
|
|
5
|
+
robin_sparkless-0.1.1.dist-info\METADATA,sha256=BqIYlQkQG4LeSX-yh-FdHc5v1YrI2EnttpuZGx8Y1PQ,3748
|
|
6
|
+
robin_sparkless-0.1.1.dist-info\WHEEL,sha256=gPqN4EsdiAyGvmfrYy_ONrF276O8o0hPitI2CKZrEFA,95
|
|
7
|
+
robin_sparkless.pyi,sha256=RUzDhVjxwH14YRwqwwqsIPaiLNDWbYGoc51vYjhbo54,15677
|
|
8
|
+
robin_sparkless-0.1.1.dist-info\RECORD,,
|
robin_sparkless.pyi
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Type stubs for robin_sparkless (PySpark-like DataFrame API on Polars).
|
|
3
|
+
Use for static type checking (mypy, pyright, etc.).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
# --- Classes ---
|
|
11
|
+
|
|
12
|
+
class SparkSession:
    """Typed surface of the session object.

    Declares DataFrame constructors, path-based readers and the
    temp-view / SQL entry points. Signatures only; the behaviour lives in
    the compiled extension module.
    """

    @classmethod
    def builder(cls) -> SparkSessionBuilder: ...
    def is_case_sensitive(self) -> bool: ...

    # Constructors from in-memory data. `data` is typed Any, so the
    # accepted row layout is not expressed in the types.
    def create_dataframe(self, data: Any, column_names: list[str]) -> DataFrame: ...
    # Same signature as create_dataframe -- presumably a camelCase alias; confirm.
    def createDataFrame(self, data: Any, column_names: list[str]) -> DataFrame: ...
    # `schema` is a list of (str, str) pairs -- presumably (name, dtype); confirm.
    def create_dataframe_from_rows(
        self, data: Any, schema: list[tuple[str, str]]
    ) -> DataFrame: ...

    # Readers: each takes a path string and is typed to return a DataFrame.
    def read_csv(self, path: str) -> DataFrame: ...
    def read_parquet(self, path: str) -> DataFrame: ...
    def read_json(self, path: str) -> DataFrame: ...
    # NOTE(review): the packaged README lists delta and sql as optional build
    # features, so the delta/sql members may be absent in some builds -- confirm.
    def read_delta(self, path: str) -> DataFrame: ...
    def read_delta_version(self, path: str, version: int | None = None) -> DataFrame: ...

    # Temp-view / SQL entry points.
    def create_or_replace_temp_view(self, name: str, df: DataFrame) -> None: ...
    def table(self, name: str) -> DataFrame: ...
    def sql(self, query: str) -> DataFrame: ...
|
|
41
|
+
|
|
42
|
+
class SparkSessionBuilder:
    """Typed surface of the session builder.

    The three setters are typed to return SparkSessionBuilder, so calls
    can be chained; get_or_create is typed to return the SparkSession.
    """

    def app_name(self, name: str) -> SparkSessionBuilder: ...
    def master(self, master: str) -> SparkSessionBuilder: ...
    # Both key and value are typed as str.
    def config(self, key: str, value: str) -> SparkSessionBuilder: ...
    def get_or_create(self) -> SparkSession: ...
|
|
47
|
+
|
|
48
|
+
class DataFrame:
    """Typed surface of the DataFrame object.

    Declares materialising calls, column/row transformations, set
    operations, joins and accessor methods. Signatures only; the
    behaviour lives in the compiled extension module.
    """

    # Calls typed with non-DataFrame results.
    def count(self) -> int: ...
    def show(self, n: int | None = None) -> None: ...
    def collect(self) -> list[dict[str, Any]]: ...

    # Row filtering and column projection / derivation.
    def filter(self, condition: Column) -> DataFrame: ...
    def select(self, cols: list[str]) -> DataFrame: ...
    def select_expr(self, exprs: list[str]) -> DataFrame: ...
    def with_column(self, column_name: str, expr: Column) -> DataFrame: ...
    # `mapping` may be a dict or a list of (name, Column) pairs.
    def with_columns(
        self, mapping: dict[str, Column] | list[tuple[str, Column]]
    ) -> DataFrame: ...
    # `mapping` may be a dict or a list of (str, str) pairs --
    # presumably (old, new); confirm.
    def with_columns_renamed(
        self, mapping: dict[str, str] | list[tuple[str, str]]
    ) -> DataFrame: ...
    def with_column_renamed(self, old: str, new: str) -> DataFrame: ...

    # Ordering, grouping, limiting, de-duplication.
    def order_by(
        self, cols: list[str], ascending: list[bool] | None = None
    ) -> DataFrame: ...
    def order_by_exprs(self, sort_orders: list[SortOrder]) -> DataFrame: ...
    def group_by(self, cols: list[str]) -> GroupedData: ...
    def limit(self, n: int) -> DataFrame: ...
    def distinct(self, subset: list[str] | None = None) -> DataFrame: ...

    # Combining frames.
    def union(self, other: DataFrame) -> DataFrame: ...
    def union_by_name(self, other: DataFrame) -> DataFrame: ...
    def join(
        self, other: DataFrame, on: str | list[str], how: str = "inner"
    ) -> DataFrame: ...

    # Column / null removal.
    def drop(self, *cols: str) -> DataFrame: ...
    def dropna(self, subset: list[str] | None = None) -> DataFrame: ...
    def col_regex(self, pattern: str) -> DataFrame: ...

    # Accessors declared as methods (called with parentheses).
    def stat(self) -> DataFrameStat: ...
    def na(self) -> DataFrameNa: ...
    # NOTE(review): typed as a list of dicts, not a pandas object -- confirm intended.
    def to_pandas(self) -> list[dict[str, Any]]: ...
    def corr(self) -> DataFrame: ...
    def write(self) -> DataFrameWriter: ...
|
|
89
|
+
|
|
90
|
+
class DataFrameStat:
    """Typed surface of the object returned by DataFrame.stat().

    The pairwise calls take two column names and are typed to return a
    float; corr_matrix is typed to return a DataFrame.
    """

    def cov(self, col1: str, col2: str) -> float: ...
    def corr(self, col1: str, col2: str) -> float: ...
    def corr_matrix(self) -> DataFrame: ...
|
|
94
|
+
|
|
95
|
+
class DataFrameNa:
    """Typed surface of the object returned by DataFrame.na().

    Both members are typed to return a DataFrame.
    """

    # The fill value is typed as a Column expression, not a bare scalar.
    def fill(self, value: Column) -> DataFrame: ...
    # `subset` is optional and defaults to None.
    def drop(self, subset: list[str] | None = None) -> DataFrame: ...
|
|
98
|
+
|
|
99
|
+
class Column:
    """Typed surface of a column expression.

    Every member is typed to return a Column. Apart from alias (which
    takes a str) and the equality dunders (which take object), operands
    are typed as Column.
    """

    def alias(self, name: str) -> Column: ...

    # Named comparison methods.
    def gt(self, other: Column) -> Column: ...
    def ge(self, other: Column) -> Column: ...
    def lt(self, other: Column) -> Column: ...
    def le(self, other: Column) -> Column: ...
    def eq(self, other: Column) -> Column: ...
    def neq(self, other: Column) -> Column: ...

    # Comparison operators. __eq__/__ne__ return Column rather than bool,
    # hence the override ignores.
    def __eq__(self, other: object) -> Column: ...  # type: ignore[override]
    def __ne__(self, other: object) -> Column: ...  # type: ignore[override]
    def __lt__(self, other: Column) -> Column: ...
    def __le__(self, other: Column) -> Column: ...
    def __gt__(self, other: Column) -> Column: ...
    def __ge__(self, other: Column) -> Column: ...

    # Arithmetic operators.
    def __add__(self, other: Column) -> Column: ...
    def __sub__(self, other: Column) -> Column: ...
    def __mul__(self, other: Column) -> Column: ...
    def __truediv__(self, other: Column) -> Column: ...
    def __mod__(self, other: Column) -> Column: ...

    # `&` and `|` operators.
    def __and__(self, other: Column) -> Column: ...
    def __or__(self, other: Column) -> Column: ...

    def map_concat(self, other: Column) -> Column: ...
|
|
121
|
+
|
|
122
|
+
class SortOrder:
    """Opaque sort-specification type; the stub declares no members."""
|
|
123
|
+
|
|
124
|
+
class WhenBuilder:
    """Typed surface of one stage of the when/then chain.

    `when` is typed to return a ThenBuilder; `otherwise` is typed to
    return the resulting Column.
    """

    def when(self, condition: Column) -> ThenBuilder: ...
    def otherwise(self, value: Column) -> Column: ...
|
|
127
|
+
|
|
128
|
+
class ThenBuilder:
    """Typed surface of one stage of the when/then chain.

    `then` is typed to return a WhenBuilder; `otherwise` is typed to
    return the resulting Column.
    """

    def then(self, value: Column) -> WhenBuilder: ...
    def otherwise(self, value: Column) -> Column: ...
|
|
131
|
+
|
|
132
|
+
class GroupedData:
    """Typed surface of the object returned by DataFrame.group_by().

    count takes no argument, agg takes a list of Column expressions, and
    every other member takes a single column name (str). All members are
    typed to return a DataFrame.
    """

    def count(self) -> DataFrame: ...
    def sum(self, column: str) -> DataFrame: ...
    def avg(self, column: str) -> DataFrame: ...
    def min(self, column: str) -> DataFrame: ...
    def max(self, column: str) -> DataFrame: ...
    # The only member that accepts Column expressions instead of a name.
    def agg(self, exprs: list[Column]) -> DataFrame: ...
    def collect_list(self, column: str) -> DataFrame: ...
    def collect_set(self, column: str) -> DataFrame: ...
    def count_if(self, column: str) -> DataFrame: ...
    def any_value(self, column: str) -> DataFrame: ...
    def bool_and(self, column: str) -> DataFrame: ...
    def bool_or(self, column: str) -> DataFrame: ...
    def product(self, column: str) -> DataFrame: ...
    def kurtosis(self, column: str) -> DataFrame: ...
    def skewness(self, column: str) -> DataFrame: ...
|
|
148
|
+
|
|
149
|
+
class CubeRollupData:
|
|
150
|
+
def agg(self, exprs: list[Column]) -> DataFrame: ...
|
|
151
|
+
|
|
152
|
+
class DataFrameWriter:
|
|
153
|
+
def mode(self, mode: str) -> DataFrameWriter: ...
|
|
154
|
+
def format(self, format: str) -> DataFrameWriter: ...
|
|
155
|
+
def save(self, path: str) -> None: ...
|
|
156
|
+
|
|
157
|
+
# --- Module-level functions (expression builders return Column) ---
|
|
158
|
+
|
|
159
|
+
def col(name: str) -> Column: ...
|
|
160
|
+
def lit(
    value: None | int | float | bool | str,
) -> Column:
    """Build a ``Column`` from a Python scalar.

    Per this signature the accepted literal types are ``None``, ``int``,
    ``float``, ``bool`` and ``str``.
    """
    ...
|
|
161
|
+
def when(
    condition: Column,
) -> ThenBuilder:
    """Module-level entry point of the when/then builder chain.

    Takes a ``condition`` column and returns a ``ThenBuilder``, whose
    ``then`` / ``otherwise`` methods continue the expression.
    """
    ...
|
|
162
|
+
def coalesce(*cols: Column) -> Column: ...
|
|
163
|
+
def sum(column: Column) -> Column: ...
|
|
164
|
+
def avg(column: Column) -> Column: ...
|
|
165
|
+
def min(column: Column) -> Column: ...
|
|
166
|
+
def max(column: Column) -> Column: ...
|
|
167
|
+
def count(column: Column) -> Column: ...
|
|
168
|
+
def execute_plan(data: Any, schema: list[tuple[str, str]], plan_json: str) -> DataFrame:
    """Return a ``DataFrame`` produced from ``data``, ``schema`` and a JSON plan.

    Args:
        data: Input rows; typed ``Any`` in this stub.
        schema: List of ``(str, str)`` pairs -- presumably (column name,
            type name), matching ``create_dataframe_from_rows``; confirm.
        plan_json: The plan, passed as a JSON string.
    """
    ...
|
|
173
|
+
|
|
174
|
+
# Single-column functions (Column in; Column out, or SortOrder for the asc*/desc* helpers; cast/try_cast also take a dtype)
|
|
175
|
+
def ascii(column: Column) -> Column: ...
|
|
176
|
+
def base64(column: Column) -> Column: ...
|
|
177
|
+
def unbase64(column: Column) -> Column: ...
|
|
178
|
+
def cast(column: Column, dtype: str) -> Column: ...
|
|
179
|
+
def try_cast(column: Column, dtype: str) -> Column: ...
|
|
180
|
+
def isnull(column: Column) -> Column: ...
|
|
181
|
+
def isnotnull(column: Column) -> Column: ...
|
|
182
|
+
def isnan(column: Column) -> Column: ...
|
|
183
|
+
def asc(column: Column) -> SortOrder: ...
|
|
184
|
+
def asc_nulls_first(column: Column) -> SortOrder: ...
|
|
185
|
+
def asc_nulls_last(column: Column) -> SortOrder: ...
|
|
186
|
+
def desc(column: Column) -> SortOrder: ...
|
|
187
|
+
def desc_nulls_first(column: Column) -> SortOrder: ...
|
|
188
|
+
def desc_nulls_last(column: Column) -> SortOrder: ...
|
|
189
|
+
|
|
190
|
+
# Binary / variadic (common patterns; others follow same style)
|
|
191
|
+
def greatest(*columns: Column) -> Column: ...
|
|
192
|
+
def least(*columns: Column) -> Column: ...
|
|
193
|
+
def add_months(column: Column, n: int) -> Column: ...
|
|
194
|
+
def substring(column: Column, pos: int, len: int) -> Column: ...
|
|
195
|
+
def overlay(
    src: Column,
    replace: Column,
    pos: int,
    len: int = -1,
) -> Column:
    """Return a ``Column`` built from ``src``, ``replace``, ``pos`` and ``len``.

    ``len`` is optional and defaults to ``-1``.

    NOTE(review): presumably follows PySpark's ``overlay``, where a negative
    ``len`` means "use the length of ``replace``" -- confirm against the
    extension module.
    """
    ...
|
|
196
|
+
def format_number(column: Column, d: int) -> Column: ...
|
|
197
|
+
def format_string(fmt: str, *columns: Column) -> Column: ...
|
|
198
|
+
def concat(*columns: Column) -> Column: ...
|
|
199
|
+
def concat_ws(sep: str, *columns: Column) -> Column: ...
|
|
200
|
+
def array(*columns: Column) -> Column: ...
|
|
201
|
+
def struct(*columns: Column) -> Column: ...
|
|
202
|
+
def named_struct(*name_column_pairs: Any) -> Column: ...
|
|
203
|
+
def create_map(*key_value_columns: Column) -> Column: ...
|
|
204
|
+
def map_concat(a: Column, b: Column) -> Column: ...
|
|
205
|
+
def equal_null(a: Column, b: Column) -> Column: ...
|
|
206
|
+
def get(column: Column, key: Any) -> Column: ...
|
|
207
|
+
def isin(column: Column, values: Any) -> Column: ...
|
|
208
|
+
def rand(seed: int | None = None) -> Column: ...
|
|
209
|
+
def randn(seed: int | None = None) -> Column: ...
|
|
210
|
+
|
|
211
|
+
# Partial listing of the remaining 200+ expression functions (typical pattern: Column in, Column out; exceptions are visible in the signatures)
|
|
212
|
+
def acos(column: Column) -> Column: ...
|
|
213
|
+
def acosh(column: Column) -> Column: ...
|
|
214
|
+
def asin(column: Column) -> Column: ...
|
|
215
|
+
def asinh(column: Column) -> Column: ...
|
|
216
|
+
def atan(column: Column) -> Column: ...
|
|
217
|
+
def atan2(y: Column, x: Column) -> Column: ...
|
|
218
|
+
def atanh(column: Column) -> Column: ...
|
|
219
|
+
def sin(column: Column) -> Column: ...
|
|
220
|
+
def cos(column: Column) -> Column: ...
|
|
221
|
+
def tan(column: Column) -> Column: ...
|
|
222
|
+
def sinh(column: Column) -> Column: ...
|
|
223
|
+
def cosh(column: Column) -> Column: ...
|
|
224
|
+
def tanh(column: Column) -> Column: ...
|
|
225
|
+
def degrees(column: Column) -> Column: ...
|
|
226
|
+
def radians(column: Column) -> Column: ...
|
|
227
|
+
def cbrt(column: Column) -> Column: ...
|
|
228
|
+
def ceiling(column: Column) -> Column: ...
|
|
229
|
+
def floor(column: Column) -> Column: ...
|
|
230
|
+
def exp(column: Column) -> Column: ...
|
|
231
|
+
def expm1(column: Column) -> Column: ...
|
|
232
|
+
def ln(column: Column) -> Column: ...
|
|
233
|
+
def log10(column: Column) -> Column: ...
|
|
234
|
+
def log1p(column: Column) -> Column: ...
|
|
235
|
+
def log2(column: Column) -> Column: ...
|
|
236
|
+
def power(base: Column, exp: Column) -> Column: ...
|
|
237
|
+
def rint(column: Column) -> Column: ...
|
|
238
|
+
def round(column: Column, scale: int = 0) -> Column: ...
|
|
239
|
+
def signum(column: Column) -> Column: ...
|
|
240
|
+
def sqrt(column: Column) -> Column: ...
|
|
241
|
+
def left(column: Column, n: int) -> Column: ...
|
|
242
|
+
def right(column: Column, n: int) -> Column: ...
|
|
243
|
+
def length(column: Column) -> Column: ...
|
|
244
|
+
def lower(column: Column) -> Column: ...
|
|
245
|
+
def upper(column: Column) -> Column: ...
|
|
246
|
+
def ltrim(column: Column) -> Column: ...
|
|
247
|
+
def rtrim(column: Column) -> Column: ...
|
|
248
|
+
def trim(column: Column) -> Column: ...
|
|
249
|
+
def reverse(column: Column) -> Column: ...
|
|
250
|
+
def repeat(column: Column, n: int) -> Column: ...
|
|
251
|
+
def replace(src: Column, search: str, replacement: str) -> Column: ...
|
|
252
|
+
def contains(column: Column, literal: str) -> Column: ...
|
|
253
|
+
def startswith(column: Column, literal: str) -> Column: ...
|
|
254
|
+
def endswith(column: Column, literal: str) -> Column: ...
|
|
255
|
+
def like(column: Column, pattern: str) -> Column: ...
|
|
256
|
+
def ilike(column: Column, pattern: str) -> Column: ...
|
|
257
|
+
def rlike(column: Column, pattern: str) -> Column: ...
|
|
258
|
+
def regexp_extract(column: Column, pattern: str, idx: int = 0) -> Column: ...
|
|
259
|
+
def regexp_replace(column: Column, pattern: str, replacement: str) -> Column: ...
|
|
260
|
+
def split(column: Column, pattern: str) -> Column: ...
|
|
261
|
+
def substring_index(column: Column, delim: str, count: int) -> Column: ...
|
|
262
|
+
def chr(column: Column) -> Column: ...
|
|
263
|
+
def char(column: Column) -> Column: ...
|
|
264
|
+
def md5(column: Column) -> Column: ...
|
|
265
|
+
def sha1(column: Column) -> Column: ...
|
|
266
|
+
def sha2(column: Column, numBits: int) -> Column: ...
|
|
267
|
+
def dayofmonth(column: Column) -> Column: ...
|
|
268
|
+
def dayofweek(column: Column) -> Column: ...
|
|
269
|
+
def dayofyear(column: Column) -> Column: ...
|
|
270
|
+
def hour(column: Column) -> Column: ...
|
|
271
|
+
def minute(column: Column) -> Column: ...
|
|
272
|
+
def month(column: Column) -> Column: ...
|
|
273
|
+
def quarter(column: Column) -> Column: ...
|
|
274
|
+
def second(column: Column) -> Column: ...
|
|
275
|
+
def weekofyear(column: Column) -> Column: ...
|
|
276
|
+
def year(column: Column) -> Column: ...
|
|
277
|
+
def date_add(column: Column, days: int) -> Column: ...
|
|
278
|
+
def date_sub(column: Column, days: int) -> Column: ...
|
|
279
|
+
def datediff(end: Column, start: Column) -> Column: ...
|
|
280
|
+
def months_between(end: Column, start: Column) -> Column: ...
|
|
281
|
+
def to_date(column: Column) -> Column: ...
|
|
282
|
+
def to_timestamp(column: Column) -> Column: ...
|
|
283
|
+
def unix_timestamp(
    column: Column,
    fmt: str | None = None,
) -> Column:
    """Return a ``Column`` derived from ``column`` and an optional format.

    ``fmt`` defaults to ``None``.

    NOTE(review): presumably ``None`` selects the implementation's default
    timestamp pattern -- confirm against the extension module.
    """
    ...
|
|
284
|
+
def from_unixtime(
    column: Column,
    fmt: str = "yyyy-MM-dd HH:mm:ss",
) -> Column:
    """Return a ``Column`` derived from ``column`` and a format string.

    ``fmt`` defaults to ``"yyyy-MM-dd HH:mm:ss"``.

    NOTE(review): presumably ``column`` holds epoch seconds, as in PySpark's
    ``from_unixtime`` -- confirm against the extension module.
    """
    ...
|
|
285
|
+
def current_timestamp() -> Column: ...
|
|
286
|
+
def current_date() -> Column: ...
|
|
287
|
+
def date_format(column: Column, fmt: str) -> Column: ...
|
|
288
|
+
def nvl(column: Column, replacement: Column) -> Column: ...
|
|
289
|
+
def nvl2(column: Column, if_not_null: Column, if_null: Column) -> Column: ...
|
|
290
|
+
def ifnull(column: Column, replacement: Column) -> Column: ...
|
|
291
|
+
def array_contains(column: Column, value: Any) -> Column: ...
|
|
292
|
+
def array_distinct(column: Column) -> Column: ...
|
|
293
|
+
def array_except(a: Column, b: Column) -> Column: ...
|
|
294
|
+
def array_intersect(a: Column, b: Column) -> Column: ...
|
|
295
|
+
def array_join(column: Column, delimiter: str, null_replacement: str | None = None) -> Column:
    """Return a ``Column`` built from ``column`` and a ``delimiter`` string.

    Args:
        column: The input column.
        delimiter: Separator string.
        null_replacement: Optional string, ``None`` by default. Presumably
            substituted for null elements when given -- confirm against the
            extension module.
    """
    ...
|
|
298
|
+
def array_max(column: Column) -> Column: ...
|
|
299
|
+
def array_min(column: Column) -> Column: ...
|
|
300
|
+
def array_position(column: Column, value: Any) -> Column: ...
|
|
301
|
+
def array_remove(column: Column, value: Any) -> Column: ...
|
|
302
|
+
def array_sort(column: Column) -> Column: ...
|
|
303
|
+
def array_union(a: Column, b: Column) -> Column: ...
|
|
304
|
+
def arrays_zip(*columns: Column) -> Column: ...
|
|
305
|
+
def explode(column: Column) -> Column: ...
|
|
306
|
+
def explode_outer(column: Column) -> Column: ...
|
|
307
|
+
def posexplode(column: Column) -> Column: ...
|
|
308
|
+
def size(column: Column) -> Column: ...
|
|
309
|
+
def slice(column: Column, start: int, length: int) -> Column: ...
|
|
310
|
+
def sort_array(column: Column, asc: bool = True) -> Column: ...
|
|
311
|
+
def element_at(column: Column, idx: int) -> Column: ...
|
|
312
|
+
def map_keys(column: Column) -> Column: ...
|
|
313
|
+
def map_values(column: Column) -> Column: ...
|
|
314
|
+
def map_contains_key(column: Column, key: Any) -> Column: ...
|
|
315
|
+
def row_number() -> Column: ...
|
|
316
|
+
def rank() -> Column: ...
|
|
317
|
+
def dense_rank() -> Column: ...
|
|
318
|
+
def ntile(n: int) -> Column: ...
|
|
319
|
+
def broadcast(df: DataFrame) -> DataFrame: ...
|
|
320
|
+
def input_file_name() -> Column: ...
|
|
321
|
+
def monotonically_increasing_id() -> Column: ...
|
|
322
|
+
def spark_partition_id() -> Column: ...
|
|
323
|
+
def version() -> Column: ...
|
|
324
|
+
def current_user() -> Column: ...
|
|
325
|
+
def user() -> Column: ...
|
|
326
|
+
def hash(*columns: Column) -> Column: ...
|
|
327
|
+
def crc32(column: Column) -> Column: ...
|
|
328
|
+
def xxhash64(*columns: Column) -> Column: ...
|
|
329
|
+
def assert_true(column: Column) -> Column: ...
|
|
330
|
+
def raise_error(column: Column) -> Column: ...
|
|
331
|
+
def bitwiseNOT(column: Column) -> Column: ...
|
|
332
|
+
def bitwise_not(column: Column) -> Column: ...
|
|
333
|
+
def bit_count(column: Column) -> Column: ...
|
|
334
|
+
def bit_get(column: Column, bit: int) -> Column: ...
|
|
335
|
+
def getbit(column: Column, bit: int) -> Column: ...
|
|
336
|
+
def shiftleft(column: Column, n: int) -> Column: ...
|
|
337
|
+
def shiftright(column: Column, n: int) -> Column: ...
|
|
338
|
+
def shift_left(column: Column, n: int) -> Column: ...
|
|
339
|
+
def shift_right(column: Column, n: int) -> Column: ...
|
|
340
|
+
def typeof(column: Column) -> Column: ...
|
|
341
|
+
def bround(column: Column, scale: int = 0) -> Column: ...
|
|
342
|
+
def negate(column: Column) -> Column: ...
|
|
343
|
+
def positive(column: Column) -> Column: ...
|
|
344
|
+
def cot(column: Column) -> Column: ...
|
|
345
|
+
def csc(column: Column) -> Column: ...
|
|
346
|
+
def sec(column: Column) -> Column: ...
|
|
347
|
+
def e() -> Column: ...
|
|
348
|
+
def pi() -> Column: ...
|
|
349
|
+
def median(column: Column) -> Column: ...
|
|
350
|
+
def mode(column: Column) -> Column: ...
|
|
351
|
+
def stddev_pop(column: Column) -> Column: ...
|
|
352
|
+
def var_pop(column: Column) -> Column: ...
|
|
353
|
+
def count_distinct(column: Column) -> Column: ...
|
|
354
|
+
def approx_count_distinct(column: Column) -> Column: ...
|
|
355
|
+
def first(column: Column) -> Column: ...
|
|
356
|
+
def last(column: Column) -> Column: ...
|
|
357
|
+
def collect_list(column: Column) -> Column: ...
|
|
358
|
+
def collect_set(column: Column) -> Column: ...
|
|
@@ -1,166 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: robin-sparkless
|
|
3
|
-
Version: 0.1.0
|
|
4
|
-
Classifier: Development Status :: 3 - Alpha
|
|
5
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
6
|
-
Classifier: Programming Language :: Python :: 3
|
|
7
|
-
Classifier: Programming Language :: Rust
|
|
8
|
-
Classifier: Topic :: Scientific/Engineering
|
|
9
|
-
Summary: PySpark-like DataFrame API in Rust (Polars backend), with Python bindings via PyO3
|
|
10
|
-
Author: Robin Sparkless contributors
|
|
11
|
-
License: MIT
|
|
12
|
-
Requires-Python: >=3.8
|
|
13
|
-
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
|
14
|
-
|
|
15
|
-
# Robin Sparkless
|
|
16
|
-
|
|
17
|
-
**PySpark-style DataFrames in Rust—no JVM.** A DataFrame library that mirrors PySpark’s API and semantics while using [Polars](https://www.pola.rs/) as the execution engine.
|
|
18
|
-
|
|
19
|
-
[crates.io](https://crates.io/crates/robin-sparkless)
|
|
20
|
-
[docs.rs](https://docs.rs/robin-sparkless)
|
|
21
|
-
[Documentation](https://robin-sparkless.readthedocs.io/en/latest/)
|
|
22
|
-
[License: MIT](LICENSE)
|
|
23
|
-
|
|
24
|
-
---
|
|
25
|
-
|
|
26
|
-
## Why Robin Sparkless?
|
|
27
|
-
|
|
28
|
-
- **Familiar API** — `SparkSession`, `DataFrame`, `Column`, and PySpark-like functions so you can reuse patterns without the JVM.
|
|
29
|
-
- **Polars under the hood** — Fast, native Rust execution with Polars for IO, expressions, and aggregations.
|
|
30
|
-
- **Rust-first, Python optional** — Use it as a Rust library, or build the Python extension via PyO3 for a drop-in-style API.
|
|
31
|
-
- **Sparkless backend target** — Designed to power [Sparkless](https://github.com/eddiethedean/sparkless) (the Python PySpark replacement) so Sparkless can run on this engine via PyO3.
|
|
32
|
-
|
|
33
|
-
---
|
|
34
|
-
|
|
35
|
-
## Features
|
|
36
|
-
|
|
37
|
-
| Area | What’s included |
|
|
38
|
-
|------|------------------|
|
|
39
|
-
| **Core** | `SparkSession`, `DataFrame`, `Column`; `filter`, `select`, `with_column`, `order_by`, `group_by`, joins |
|
|
40
|
-
| **IO** | CSV, Parquet, JSON via `SparkSession::read_*` |
|
|
41
|
-
| **Expressions** | `col()`, `lit()`, `when`/`then`/`otherwise`, `coalesce`, cast, type/conditional helpers |
|
|
42
|
-
| **Aggregates** | `count`, `sum`, `avg`, `min`, `max`, and more; multi-column groupBy |
|
|
43
|
-
| **Window** | `row_number`, `rank`, `dense_rank`, `lag`, `lead`, `first_value`, `last_value`, and others with `.over()` |
|
|
44
|
-
| **Arrays & maps** | `array_*`, `explode`, `create_map`, `map_keys`, `map_values`, and related functions |
|
|
45
|
-
| **Strings & JSON** | String functions (`upper`, `lower`, `substring`, `regexp_*`, etc.), `get_json_object`, `from_json`, `to_json` |
|
|
46
|
-
| **Datetime & math** | Date/time extractors and arithmetic, `year`/`month`/`day`, math (`sin`, `cos`, `sqrt`, `pow`, …) |
|
|
47
|
-
| **Optional SQL** | `spark.sql("SELECT ...")` with temp views (`createOrReplaceTempView`, `table`) — enable with `--features sql` |
|
|
48
|
-
| **Optional Delta** | `read_delta`, `read_delta_with_version`, `write_delta` — enable with `--features delta` |
|
|
49
|
-
|
|
50
|
-
Known differences from PySpark are documented in [docs/PYSPARK_DIFFERENCES.md](docs/PYSPARK_DIFFERENCES.md). Parity status and roadmap are in [docs/PARITY_STATUS.md](docs/PARITY_STATUS.md) and [docs/ROADMAP.md](docs/ROADMAP.md).
|
|
51
|
-
|
|
52
|
-
---
|
|
53
|
-
|
|
54
|
-
## Installation
|
|
55
|
-
|
|
56
|
-
### Rust
|
|
57
|
-
|
|
58
|
-
Add to your `Cargo.toml`:
|
|
59
|
-
|
|
60
|
-
```toml
|
|
61
|
-
[dependencies]
|
|
62
|
-
robin-sparkless = "0.1.0"
|
|
63
|
-
```
|
|
64
|
-
|
|
65
|
-
Optional features:
|
|
66
|
-
|
|
67
|
-
```toml
|
|
68
|
-
robin-sparkless = { version = "0.1.0", features = ["sql"] } # spark.sql(), temp views
|
|
69
|
-
robin-sparkless = { version = "0.1.0", features = ["delta"] } # Delta Lake read/write
|
|
70
|
-
```
|
|
71
|
-
|
|
72
|
-
### Python (PyO3)
|
|
73
|
-
|
|
74
|
-
Build the Python extension with [maturin](https://www.maturin.rs/) (Rust + Python 3.8+):
|
|
75
|
-
|
|
76
|
-
```bash
|
|
77
|
-
pip install maturin
|
|
78
|
-
maturin develop --features pyo3
|
|
79
|
-
# With optional SQL and/or Delta:
|
|
80
|
-
maturin develop --features "pyo3,sql"
|
|
81
|
-
maturin develop --features "pyo3,delta"
|
|
82
|
-
maturin develop --features "pyo3,sql,delta"
|
|
83
|
-
```
|
|
84
|
-
|
|
85
|
-
Then use the `robin_sparkless` module; see [docs/PYTHON_API.md](docs/PYTHON_API.md).
|
|
86
|
-
|
|
87
|
-
---
|
|
88
|
-
|
|
89
|
-
## Quick start
|
|
90
|
-
|
|
91
|
-
### Rust
|
|
92
|
-
|
|
93
|
-
```rust
|
|
94
|
-
use robin_sparkless::{col, lit_i64, SparkSession};
|
|
95
|
-
|
|
96
|
-
fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|
97
|
-
let spark = SparkSession::builder().app_name("demo").get_or_create();
|
|
98
|
-
|
|
99
|
-
// Create a DataFrame from rows (id, age, name)
|
|
100
|
-
let df = spark.create_dataframe(
|
|
101
|
-
vec![
|
|
102
|
-
(1, 25, "Alice".to_string()),
|
|
103
|
-
(2, 30, "Bob".to_string()),
|
|
104
|
-
(3, 35, "Charlie".to_string()),
|
|
105
|
-
],
|
|
106
|
-
vec!["id", "age", "name"],
|
|
107
|
-
)?;
|
|
108
|
-
|
|
109
|
-
// Filter and show
|
|
110
|
-
let adults = df.filter(col("age").gt(lit_i64(26)))?;
|
|
111
|
-
adults.show(Some(10))?;
|
|
112
|
-
|
|
113
|
-
Ok(())
|
|
114
|
-
}
|
|
115
|
-
```
|
|
116
|
-
|
|
117
|
-
You can also wrap an existing Polars `DataFrame` with `DataFrame::from_polars(polars_df)`. See [docs/QUICKSTART.md](docs/QUICKSTART.md) for joins, window functions, and more.
|
|
118
|
-
|
|
119
|
-
### Python
|
|
120
|
-
|
|
121
|
-
```python
|
|
122
|
-
import robin_sparkless as rs
|
|
123
|
-
|
|
124
|
-
spark = rs.SparkSession.builder().app_name("demo").get_or_create()
|
|
125
|
-
df = spark.create_dataframe([(1, 25, "Alice"), (2, 30, "Bob")], ["id", "age", "name"])
|
|
126
|
-
filtered = df.filter(rs.col("age").gt(rs.lit(26)))
|
|
127
|
-
print(filtered.collect()) # [{"id": 2, "age": 30, "name": "Bob"}]
|
|
128
|
-
```
|
|
129
|
-
|
|
130
|
-
---
|
|
131
|
-
|
|
132
|
-
## Development
|
|
133
|
-
|
|
134
|
-
**Prerequisites:** Rust (see [rust-toolchain.toml](rust-toolchain.toml)), and for Python tests: Python 3.8+, `maturin`, `pytest`.
|
|
135
|
-
|
|
136
|
-
| Command | Description |
|
|
137
|
-
|---------|-------------|
|
|
138
|
-
| `cargo build` | Build (Rust only) |
|
|
139
|
-
| `cargo build --features pyo3` | Build with Python extension |
|
|
140
|
-
| `cargo test` | Run Rust tests |
|
|
141
|
-
| `make test` | Run Rust + Python tests (creates venv, `maturin develop`, `pytest`) |
|
|
142
|
-
| `make check` | Format, clippy, audit, deny, tests |
|
|
143
|
-
| `cargo bench` | Benchmarks (robin-sparkless vs Polars) |
|
|
144
|
-
| `cargo doc --open` | Build and open API docs |
|
|
145
|
-
|
|
146
|
-
CI runs the same checks on push/PR (see [.github/workflows/ci.yml](.github/workflows/ci.yml)).
|
|
147
|
-
|
|
148
|
-
---
|
|
149
|
-
|
|
150
|
-
## Documentation
|
|
151
|
-
|
|
152
|
-
- [**Full documentation (Read the Docs)**](https://robin-sparkless.readthedocs.io/) — Quickstart, Python API, reference, and Sparkless integration (MkDocs)
|
|
153
|
-
- [**API reference (docs.rs)**](https://docs.rs/robin-sparkless) — Crate API
|
|
154
|
-
- [**QUICKSTART**](docs/QUICKSTART.md) — Build, usage, optional features, benchmarks
|
|
155
|
-
- [**ROADMAP**](docs/ROADMAP.md) — Development roadmap and Sparkless integration
|
|
156
|
-
- [**PYSPARK_DIFFERENCES**](docs/PYSPARK_DIFFERENCES.md) — Known divergences from PySpark
|
|
157
|
-
- [**RELEASING**](docs/RELEASING.md) — Releasing and publishing to crates.io
|
|
158
|
-
|
|
159
|
-
See also [CHANGELOG.md](CHANGELOG.md) for version history.
|
|
160
|
-
|
|
161
|
-
---
|
|
162
|
-
|
|
163
|
-
## License
|
|
164
|
-
|
|
165
|
-
MIT
|
|
166
|
-
|
|
@@ -1,5 +0,0 @@
|
|
|
1
|
-
robin_sparkless\__init__.py,sha256=h26i0kv9czksgzJ0zbFJvX-5IZV8hrWVEerFQOVG3fA,143
|
|
2
|
-
robin_sparkless\robin_sparkless.pyd,sha256=st7JPqkmNtOfSiCGQvS-d_95cMDUBy0zB5XX1bejwOI,58464256
|
|
3
|
-
robin_sparkless-0.1.0.dist-info\METADATA,sha256=oriVbTGk7qSpqb53OaL4l3-NG0almNl-CGrctUIxHtQ,6544
|
|
4
|
-
robin_sparkless-0.1.0.dist-info\WHEEL,sha256=gPqN4EsdiAyGvmfrYy_ONrF276O8o0hPitI2CKZrEFA,95
|
|
5
|
-
robin_sparkless-0.1.0.dist-info\RECORD,,
|
|
File without changes
|