duckdb-1.5.0.dev44-cp313-cp313-win_amd64.whl → duckdb-1.5.0.dev94-cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of duckdb might be problematic.
- _duckdb-stubs/__init__.pyi +1443 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- _duckdb.cp313-win_amd64.pyd +0 -0
- adbc_driver_duckdb/__init__.py +49 -0
- adbc_driver_duckdb/dbapi.py +115 -0
- duckdb/__init__.py +341 -435
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- duckdb/bytes_io_wrapper.py +12 -9
- duckdb/experimental/__init__.py +2 -1
- duckdb/experimental/spark/__init__.py +3 -4
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +7 -9
- duckdb/experimental/spark/conf.py +16 -15
- duckdb/experimental/spark/context.py +60 -44
- duckdb/experimental/spark/errors/__init__.py +33 -35
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +39 -88
- duckdb/experimental/spark/errors/utils.py +11 -16
- duckdb/experimental/spark/exception.py +9 -6
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +8 -15
- duckdb/experimental/spark/sql/catalog.py +21 -20
- duckdb/experimental/spark/sql/column.py +48 -55
- duckdb/experimental/spark/sql/conf.py +9 -8
- duckdb/experimental/spark/sql/dataframe.py +185 -233
- duckdb/experimental/spark/sql/functions.py +1222 -1248
- duckdb/experimental/spark/sql/group.py +56 -52
- duckdb/experimental/spark/sql/readwriter.py +80 -94
- duckdb/experimental/spark/sql/session.py +64 -59
- duckdb/experimental/spark/sql/streaming.py +9 -10
- duckdb/experimental/spark/sql/type_utils.py +67 -65
- duckdb/experimental/spark/sql/types.py +309 -345
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +26 -16
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +12 -16
- duckdb/polars_io.py +130 -83
- duckdb/query_graph/__main__.py +91 -96
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +18 -8
- duckdb/udf.py +10 -5
- duckdb/value/__init__.py +1 -0
- duckdb/value/constant/__init__.py +62 -60
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/METADATA +12 -4
- duckdb-1.5.0.dev94.dist-info/RECORD +52 -0
- duckdb/__init__.pyi +0 -713
- duckdb/functional/__init__.pyi +0 -31
- duckdb/typing/__init__.pyi +0 -36
- duckdb/value/constant/__init__.pyi +0 -115
- duckdb-1.5.0.dev44.dist-info/RECORD +0 -47
- /duckdb/{value/__init__.pyi → py.typed} +0 -0
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/WHEEL +0 -0
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/licenses/LICENSE +0 -0
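The new adbc_driver_duckdb/dbapi.py suggests the wheel now bundles a DB-API layer over the ADBC driver. A minimal usage sketch, assuming the module follows the PEP 249-style connect() convention of other ADBC driver packages (the ":memory:" target and the connect() signature are assumptions, not taken from this diff):

# Hedged sketch: assumes adbc_driver_duckdb.dbapi exposes a PEP 249-style
# connect(), as other ADBC driver packages do; the ":memory:" target is assumed.
import adbc_driver_duckdb.dbapi

with adbc_driver_duckdb.dbapi.connect(":memory:") as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT 42 AS answer")
        print(cur.fetchall())  # [(42,)]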
duckdb/experimental/spark/sql/dataframe.py

@@ -1,24 +1,20 @@
+import uuid  # noqa: D100
 from functools import reduce
+from keyword import iskeyword
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
-    List,
-    Dict,
     Optional,
-    Tuple,
     Union,
     cast,
     overload,
 )
-import uuid
-from keyword import iskeyword

 import duckdb
 from duckdb import ColumnExpression, Expression, StarExpression

-from
-from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
+from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
 from ..exception import ContributionsAcceptedError
 from .column import Column
 from .readwriter import DataFrameWriter
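The dropped List, Dict, and Tuple imports above reflect a file-wide move to PEP 585 builtin generics; the contrast below is illustrative only and not code from the package:

# Illustrative contrast of the two annotation styles (not package code).
from typing import Dict, List  # only the old style needs these imports


def head_old(rows: List[tuple], names: Dict[str, str]) -> List[tuple]:
    return rows


def head_new(rows: list[tuple], names: dict[str, str]) -> list[tuple]:
    return rows  # builtin generics, no typing import required (Python 3.9+)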
@@ -29,43 +25,42 @@ if TYPE_CHECKING:
-    from .
+    from ._typing import ColumnOrName
+    from .group import GroupedData
     from .session import SparkSession

-from
-from .functions import _to_column_expr, col, lit
+from duckdb.experimental.spark.sql import functions as spark_sql_functions

-class DataFrame:
-    def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
+class DataFrame:  # noqa: D101
+    def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None:  # noqa: D107

-    def show(self, **kwargs) -> None:
+    def show(self, **kwargs) -> None:  # noqa: D102

-    def toPandas(self) -> "PandasDataFrame":
+    def toPandas(self) -> "PandasDataFrame":  # noqa: D102

     def toArrow(self) -> "pa.Table":
-        """
-        Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
+        """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
-        Notes
+        Notes:
-        Examples
+        Examples:
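The hunk above shows that DataFrame is a thin wrapper over a duckdb.DuckDBPyRelation plus a session. A hedged usage sketch follows; the SparkSession.builder.getOrCreate() call mirrors PySpark and is assumed rather than shown in this diff:

# Hedged usage sketch; only createDataFrame/show/toPandas/toArrow appear in the
# diff itself, the builder call is assumed to mirror PySpark.
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
df.show()                   # delegates to the underlying DuckDBPyRelation
pandas_df = df.toPandas()   # relation.df()
arrow_table = df.toArrow()  # requires pyarrow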
@@ -88,7 +83,7 @@ class DataFrame:
-        Examples
+        Examples:

@@ -108,12 +103,13 @@ class DataFrame:
-    def createGlobalTempView(self, name: str) -> None:
+    def createGlobalTempView(self, name: str) -> None:  # noqa: D102

-    def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
+    def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":  # noqa: D102
         if columnName not in self.relation:
-
+            msg = f"DataFrame does not contain a column named {columnName}"
+            raise ValueError(msg)

@@ -123,7 +119,7 @@ class DataFrame:
-    def withColumn(self, columnName: str, col: Column) -> "DataFrame":
+    def withColumn(self, columnName: str, col: Column) -> "DataFrame":  # noqa: D102

@@ -143,9 +139,8 @@ class DataFrame:
-    def withColumns(self, *colsMap:
-        """
-        Returns a new :class:`DataFrame` by adding multiple columns or replacing the
+    def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
+        """Returns a new :class:`DataFrame` by adding multiple columns or replacing the

@@ -162,22 +157,22 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df.withColumns({
+        >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
-        """
+        """  # noqa: D205

@@ -218,9 +213,8 @@ class DataFrame:
-    def withColumnsRenamed(self, colsMap:
-        """
-        Returns a new :class:`DataFrame` by renaming multiple columns.
+    def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
+        """Returns a new :class:`DataFrame` by renaming multiple columns.

@@ -232,31 +226,31 @@ class DataFrame:
-        Returns
+        Returns:
-        See Also
+        See Also:
-        Notes
+        Notes:
-        Examples
+        Examples:
-        >>> df = df.withColumns({
-        >>> df.withColumnsRenamed({
+        >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
+        >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
-        """
+        """  # noqa: D205

@@ -265,9 +259,8 @@ class DataFrame:
         if unknown_columns:
-
-
-            )
+            msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
+            raise ValueError(msg)
@@ -289,11 +282,7 @@ class DataFrame:
-
-
-    def transform(
-        self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
-    ) -> "DataFrame":
+    def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame":  # noqa: ANN401

@@ -314,21 +303,19 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
         ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
-        ...
         ...     return input_df.select(*sorted(input_df.columns))
-        ...

@@ -338,8 +325,9 @@ class DataFrame:
         >>> def add_n(input_df, n):
-        ...     return input_df.select(
-        ...
+        ...     return input_df.select(
+        ...         [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
+        ...     )

@@ -350,14 +338,11 @@ class DataFrame:
         assert isinstance(result, DataFrame), (
-            "Func returned an instance of type [
-            "should have been DataFrame." % type(result)
+            f"Func returned an instance of type [{type(result)}], should have been DataFrame."
         )

-    def sort(
-        self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
-    ) -> "DataFrame":
+    def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame":  # noqa: ANN401
@@ -372,16 +357,15 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame([
-        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])
+        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

@@ -419,8 +403,9 @@ class DataFrame:
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
+        ... )

@@ -453,7 +438,7 @@ class DataFrame:
             if isinstance(c, str):
-                _c = col(c)
+                _c = spark_sql_functions.col(c)

@@ -481,13 +466,13 @@ class DataFrame:
-        columns = [_to_column_expr(c) for c in columns]
+        columns = [spark_sql_functions._to_column_expr(c) for c in columns]

-    def head(self, n: Optional[int] = None) -> Union[Optional[Row],
+    def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]:  # noqa: D102

@@ -495,7 +480,7 @@ class DataFrame:
-    def take(self, num: int) ->
+    def take(self, num: int) -> list[Row]:  # noqa: D102

@@ -509,15 +494,14 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame([
-        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])
+        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
@@ -563,38 +547,34 @@ class DataFrame:
-    def select(self, *cols) -> "DataFrame":
+    def select(self, *cols) -> "DataFrame":  # noqa: D102
         if isinstance(cols, list):
-            projections = [
-                x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
-            ]
+            projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
         else:
-            projections = [
-                cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
-            ]
+            projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]

     @property
-    def columns(self) ->
+    def columns(self) -> list[str]:
-        Examples
+        Examples:

-    def _ipython_key_completions_(self) ->
+    def _ipython_key_completions_(self) -> list[str]:

-    def __dir__(self) ->
+    def __dir__(self) -> list[str]:  # noqa: D105

@@ -602,7 +582,7 @@ class DataFrame:
     def join(
         self,
         other: "DataFrame",
-        on: Optional[Union[str,
+        on: Optional[Union[str, list[str], Column, list[Column]]] = None,
         how: Optional[str] = None,
     ) -> "DataFrame":
@@ -622,12 +602,12 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:

@@ -636,22 +616,24 @@ class DataFrame:
-        >>> df4 = spark.createDataFrame(
-        ...
-        ...
-        ...
-        ...
-        ...
+        >>> df4 = spark.createDataFrame(
+        ...     [
+        ...         Row(age=10, height=80, name="Alice"),
+        ...         Row(age=5, height=None, name="Bob"),
+        ...         Row(age=None, height=None, name="Tom"),
+        ...         Row(age=None, height=None, name=None),
+        ...     ]
+        ... )
-        >>> df.join(df2,
+        >>> df.join(df2, "name").select(df.name, df2.height).show()
-        >>> df.join(df4, [
+        >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()

@@ -660,8 +642,9 @@ class DataFrame:
-        >>> df.join(df2, df.name == df2.name,
-        ...
+        >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
+        ...     desc("name")
+        ... ).show()

@@ -669,7 +652,7 @@ class DataFrame:
-        >>> df.join(df2,
+        >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()

@@ -680,11 +663,9 @@ class DataFrame:
-        >>> df.join(
-        ...     df3
-        ...
-        ...     'outer'
-        ... ).select(df.name, df3.age).show()
+        >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
+        ...     df.name, df3.age
+        ... ).show()

@@ -692,20 +673,16 @@ class DataFrame:
-
-        if on is not None and not all(
+        if on is not None and not all(isinstance(x, str) for x in on):
-            on = [_to_column_expr(x) for x in on]
+            on = [spark_sql_functions._to_column_expr(x) for x in on]
-            assert isinstance(
-
-            ), "on should be Column or list of Column"
-            on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
-
+            assert isinstance(on[0], Expression), "on should be Column or list of Column"
+            on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))

@@ -714,14 +691,14 @@ class DataFrame:
-        elif isinstance(on, list) and all(
+        elif isinstance(on, list) and all(isinstance(x, str) for x in on):
-        def map_to_recognized_jointype(how):
+        def map_to_recognized_jointype(how: str) -> str:

@@ -730,15 +707,10 @@ class DataFrame:
-            mapped_type = None
             for type, aliases in known_aliases.items():
                 if how == type or how in aliases:
-
-
-
-            if not mapped_type:
-                mapped_type = how
-            return mapped_type
+                    return type
+            return how
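The simplified map_to_recognized_jointype above returns the canonical join type as soon as an alias matches. A standalone sketch of the same idea, abridged to the aliases visible in this hunk (not the packaged implementation):

# Standalone sketch of the alias-to-canonical join-type mapping; abridged to
# the aliases visible in the diff.
def map_to_recognized_jointype(how: str) -> str:
    known_aliases = {
        "inner": [],
        "outer": ["full", "fullouter", "full_outer"],
        "anti": ["leftanti", "left_anti"],
        "semi": ["leftsemi", "left_semi"],
    }
    for canonical, aliases in known_aliases.items():
        if how == canonical or how in aliases:
            return canonical
    return how  # unrecognized join types pass through unchanged


assert map_to_recognized_jointype("full_outer") == "outer"
assert map_to_recognized_jointype("left_semi") == "semi"
assert map_to_recognized_jointype("cross") == "cross"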
@@ -757,18 +729,16 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-
-        >>> df2 = spark.createDataFrame(
-        ...     [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])

@@ -791,21 +761,21 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
-        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"),
-        >>> joined_df.select(
-        ...
+        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
+        >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
+        ...     desc("df_as1.name")
+        ... ).show()

@@ -817,7 +787,7 @@ class DataFrame:
-    def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
+    def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc] # noqa: D102

@@ -834,7 +804,7 @@ class DataFrame:
-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # noqa: D105
@@ -846,15 +816,14 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

@@ -870,17 +839,15 @@ class DataFrame:
-    def __contains__(self, item: str):
-        """
-        Check if the :class:`DataFrame` contains a column by the name of `item`
-        """
+    def __contains__(self, item: str) -> bool:
+        """Check if the :class:`DataFrame` contains a column by the name of `item`."""
-        Examples
+        Examples:

@@ -889,25 +856,21 @@ class DataFrame:
     @overload
-    def __getitem__(self, item: Union[int, str]) -> Column:
-        ...
+    def __getitem__(self, item: Union[int, str]) -> Column: ...

     @overload
-    def __getitem__(self, item: Union[Column,
-        ...
+    def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...

-    def __getitem__(
-        self, item: Union[int, str, Column, List, Tuple]
-    ) -> Union[Column, "DataFrame"]:
+    def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
-        Examples
+        Examples:
-        >>> df.select(df[
+        >>> df.select(df["age"]).collect()
-        >>> df[
+        >>> df[["name", "age"]].collect()
-        >>> df[
+        >>> df[df.age > 3].collect()

@@ -919,31 +882,29 @@ class DataFrame:
         elif isinstance(item, int):
-            return col(self._schema[item].name)
+            return spark_sql_functions.col(self._schema[item].name)
         else:
-
+            msg = f"Unexpected item type: {type(item)}"
+            raise TypeError(msg)

-        Examples
+        Examples:

         if name not in self.relation.columns:
-
-
-            )
+            msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
+            raise AttributeError(msg)

     @overload
-    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
-        ...
+    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...

     @overload
-    def groupBy(self, __cols: Union[
-        ...
+    def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ...  # noqa: PYI063
@@ -959,15 +920,16 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
+        ... )

@@ -1008,22 +970,19 @@ class DataFrame:
-        """
+        """  # noqa: D205
-        if len(cols) == 1 and isinstance(cols[0], list)
-            columns = cols[0]
-        else:
-            columns = cols
+        columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols

     @property
-    def write(self) -> DataFrameWriter:
+    def write(self) -> DataFrameWriter:  # noqa: D102

-    def printSchema(self):
+    def printSchema(self) -> None:  # noqa: D102
@@ -1035,22 +994,22 @@ class DataFrame:
-        Returns
+        Returns:
-        See Also
+        See Also:
-        Notes
+        Notes:
-        Examples
+        Examples:

@@ -1068,14 +1027,12 @@ class DataFrame:
-        """
+        """  # noqa: D205

-    def unionByName(
-        self, other: "DataFrame", allowMissingColumns: bool = False
-    ) -> "DataFrame":
+    def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":

@@ -1096,12 +1053,12 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:

@@ -1130,14 +1087,14 @@ class DataFrame:
-        """
+        """  # noqa: D205
                 else:
-                    cols.append(lit(None))
+                    cols.append(spark_sql_functions.lit(None))
@@ -1160,16 +1117,16 @@ class DataFrame:
-        Returns
+        Returns:
-        Notes
+        Notes:
-        Examples
+        Examples:

@@ -1180,7 +1137,7 @@ class DataFrame:
-        """
+        """  # noqa: D205

@@ -1200,12 +1157,12 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:

@@ -1217,7 +1174,7 @@ class DataFrame:
-        """
+        """  # noqa: D205
@@ -1237,14 +1194,15 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
         >>> df1 = spark.createDataFrame(
-        ...
+        ...     [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
+        ... )

@@ -1256,10 +1214,10 @@ class DataFrame:
-        """
+        """  # noqa: D205

-    def dropDuplicates(self, subset: Optional[
+    def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":

@@ -1276,19 +1234,21 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...
-        ...
-        ...
-        ...
+        >>> df = spark.createDataFrame(
+        ...     [
+        ...         Row(name="Alice", age=5, height=80),
+        ...         Row(name="Alice", age=5, height=80),
+        ...         Row(name="Alice", age=10, height=80),
+        ...     ]
+        ... )
@@ -1302,16 +1262,16 @@ class DataFrame:
-        >>> df.dropDuplicates([
+        >>> df.dropDuplicates(["name", "height"]).show()
-        """
+        """  # noqa: D205
         if subset:
             rn_col = f"tmp_col_{uuid.uuid1().hex}"
-            subset_str =
+            subset_str = ", ".join([f'"{c}"' for c in subset])
             window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
             df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
             return df.filter(f"{rn_col} = 1").drop(rn_col)
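dropDuplicates with a subset builds a row_number() OVER (PARTITION BY ...) column and keeps only the rows where it equals 1, as the hunk above shows. A rough plain-SQL equivalent on a throwaway table (table and column names here are illustrative, not taken from the package):

# Rough equivalent of the window-function dedup built by dropDuplicates(subset);
# which duplicate survives is unspecified without an ORDER BY in the window.
import duckdb

con = duckdb.connect()
con.execute(
    "CREATE TABLE people AS "
    "SELECT * FROM (VALUES ('Alice', 5, 80), ('Alice', 5, 80), ('Alice', 10, 80)) "
    "t(name, age, height)"
)
rows = con.execute(
    """
    SELECT name, age, height
    FROM (SELECT *, row_number() OVER (PARTITION BY name, height) AS rn FROM people)
    WHERE rn = 1
    """
).fetchall()
print(rows)  # one surviving row per (name, height) pair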
@@ -1320,19 +1280,17 @@ class DataFrame:
-
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

@@ -1345,15 +1303,14 @@ class DataFrame:
-        Returns
+        Returns:
-        Examples
+        Examples:
-        >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
@@ -1369,33 +1326,28 @@ class DataFrame:
         cast_expressions = [
-            f"{existing}::{target_type} as {existing}"
-            for existing, target_type in zip(existing_columns, types)
+            f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
         ]

-    def toDF(self, *cols) -> "DataFrame":
+    def toDF(self, *cols) -> "DataFrame":  # noqa: D102
-            raise PySparkValueError(
-                message="Provided column names and number of columns in the DataFrame don't match"
-            )
+            raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")
-        projections = [
-            existing.alias(new) for existing, new in zip(existing_columns, cols)
-        ]
+        projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]

-    def collect(self) ->
+    def collect(self) -> list[Row]:  # noqa: D102
-        def construct_row(values, names) -> Row:
+        def construct_row(values: list, names: list[str]) -> Row:

@@ -1411,16 +1363,16 @@ class DataFrame:
-        Notes
+        Notes:
-        Returns
+        Returns:
-        Examples
+        Examples: