duckdb-1.4.1.dev125-cp39-cp39-win_amd64.whl → duckdb-1.5.0.dev44-cp39-cp39-win_amd64.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
- _duckdb.cp39-win_amd64.pyd +0 -0
- duckdb/__init__.py +374 -373
- duckdb/__init__.pyi +180 -604
- duckdb/bytes_io_wrapper.py +7 -6
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +54 -47
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +64 -66
- duckdb/experimental/spark/sql/types.py +344 -308
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +8 -13
- duckdb/functional/__init__.py +16 -2
- duckdb/polars_io.py +57 -66
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -8
- duckdb/typing/__init__.pyi +2 -4
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +59 -61
- duckdb/value/constant/__init__.pyi +4 -3
- duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
- adbc_driver_duckdb/__init__.py +0 -50
- adbc_driver_duckdb/dbapi.py +0 -115
- duckdb-1.4.1.dev125.dist-info/METADATA +0 -326
- duckdb-1.4.1.dev125.dist-info/RECORD +0 -49
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev44.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev44.dist-info}/licenses/LICENSE +0 -0
@@ -1,20 +1,24 @@
-import uuid  # noqa: D100
 from functools import reduce
-from keyword import iskeyword
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
+    List,
+    Dict,
     Optional,
+    Tuple,
     Union,
     cast,
     overload,
 )
+import uuid
+from keyword import iskeyword

 import duckdb
 from duckdb import ColumnExpression, Expression, StarExpression

-from
+from ._typing import ColumnOrName
+from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
 from ..exception import ContributionsAcceptedError
 from .column import Column
 from .readwriter import DataFrameWriter
@@ -25,42 +29,43 @@ if TYPE_CHECKING:
     import pyarrow as pa
     from pandas.core.frame import DataFrame as PandasDataFrame

-    from .
-    from .group import GroupedData
+    from .group import GroupedData, Grouping
     from .session import SparkSession

-from
+from ..errors import PySparkValueError
+from .functions import _to_column_expr, col, lit


-class DataFrame:
-    def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession")
+class DataFrame:
+    def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
         self.relation = relation
         self.session = session
         self._schema = None
         if self.relation is not None:
             self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)

-    def show(self, **kwargs) -> None:
+    def show(self, **kwargs) -> None:
         self.relation.show()

-    def toPandas(self) -> "PandasDataFrame":
+    def toPandas(self) -> "PandasDataFrame":
         return self.relation.df()

     def toArrow(self) -> "pa.Table":
-        """
+        """
+        Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.

         This is only available if PyArrow is installed and available.

         .. versionadded:: 4.0.0

-        Notes
+        Notes
         -----
         This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
         expected to be small, as all the data is loaded into the driver's memory.

         This API is a developer API.

-        Examples
+        Examples
         --------
         >>> df.toArrow()  # doctest: +SKIP
         pyarrow.Table
@@ -83,7 +88,7 @@ class DataFrame: # noqa: D101
         name : str
             Name of the view.

-        Examples
+        Examples
         --------
         Create a local temporary view named 'people'.

@@ -103,13 +108,12 @@ class DataFrame: # noqa: D101
         """
         self.relation.create_view(name, True)

-    def createGlobalTempView(self, name: str) -> None:
+    def createGlobalTempView(self, name: str) -> None:
         raise NotImplementedError

-    def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
+    def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
         if columnName not in self.relation:
-
-            raise ValueError(msg)
+            raise ValueError(f"DataFrame does not contain a column named {columnName}")
         cols = []
         for x in self.relation.columns:
             col = ColumnExpression(x)
@@ -119,7 +123,7 @@ class DataFrame: # noqa: D101
         rel = self.relation.select(*cols)
         return DataFrame(rel, self.session)

-    def withColumn(self, columnName: str, col: Column) -> "DataFrame":
+    def withColumn(self, columnName: str, col: Column) -> "DataFrame":
         if not isinstance(col, Column):
             raise PySparkTypeError(
                 error_class="NOT_COLUMN",
@@ -139,8 +143,9 @@ class DataFrame: # noqa: D101
         rel = self.relation.select(*cols)
         return DataFrame(rel, self.session)

-    def withColumns(self, *colsMap:
-        """
+    def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame":
+        """
+        Returns a new :class:`DataFrame` by adding multiple columns or replacing the
         existing columns that have the same names.

         The colsMap is a map of column name and column, the column must only refer to attributes
@@ -157,22 +162,22 @@ class DataFrame: # noqa: D101
         colsMap : dict
             a dict of column name and :class:`Column`. Currently, only a single map is supported.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             DataFrame with new or replaced columns.

-        Examples
+        Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df.withColumns({
+        >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
         +---+-----+----+----+
         |age| name|age2|age3|
         +---+-----+----+----+
         |  2|Alice|   4|   5|
         |  5|  Bob|   7|   8|
         +---+-----+----+----+
-        """
+        """
         # Below code is to help enable kwargs in future.
         assert len(colsMap) == 1
         colsMap = colsMap[0]  # type: ignore[assignment]
@@ -213,8 +218,9 @@ class DataFrame: # noqa: D101
         rel = self.relation.select(*cols)
         return DataFrame(rel, self.session)

-    def withColumnsRenamed(self, colsMap:
-        """
+    def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
+        """
+        Returns a new :class:`DataFrame` by renaming multiple columns.
         This is a no-op if the schema doesn't contain the given column names.

         .. versionadded:: 3.4.0
@@ -226,31 +232,31 @@ class DataFrame: # noqa: D101
             a dict of existing column names and corresponding desired column names.
             Currently, only a single map is supported.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             DataFrame with renamed columns.

-        See Also
+        See Also
         --------
         :meth:`withColumnRenamed`

-        Notes
+        Notes
         -----
         Support Spark Connect

-        Examples
+        Examples
         --------
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
-        >>> df = df.withColumns({
-        >>> df.withColumnsRenamed({
+        >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
+        >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
         +---+-----+----+----+
         |age| name|age4|age5|
         +---+-----+----+----+
         |  2|Alice|   4|   5|
         |  5|  Bob|   7|   8|
         +---+-----+----+----+
-        """
+        """
         if not isinstance(colsMap, dict):
             raise PySparkTypeError(
                 error_class="NOT_DICT",
@@ -259,8 +265,9 @@ class DataFrame: # noqa: D101

         unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
         if unknown_columns:
-
-
+            raise ValueError(
+                f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
+            )

         # Compute this only once
         old_column_names = list(colsMap.keys())
@@ -282,7 +289,11 @@ class DataFrame: # noqa: D101
         rel = self.relation.select(*cols)
         return DataFrame(rel, self.session)

-
+
+
+    def transform(
+        self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
+    ) -> "DataFrame":
         """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.

         .. versionadded:: 3.0.0
@@ -303,19 +314,21 @@ class DataFrame: # noqa: D101

         .. versionadded:: 3.3.0

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Transformed DataFrame.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql.functions import col
         >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
         >>> def cast_all_to_int(input_df):
         ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
+        ...
         >>> def sort_columns_asc(input_df):
         ...     return input_df.select(*sorted(input_df.columns))
+        ...
         >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
         +-----+---+
         |float|int|
@@ -325,9 +338,8 @@ class DataFrame: # noqa: D101
         +-----+---+

         >>> def add_n(input_df, n):
-        ...     return input_df.select(
-        ...
-        ...     )
+        ...     return input_df.select([(col(col_name) + n).alias(col_name)
+        ...                             for col_name in input_df.columns])
         >>> df.transform(add_n, 1).transform(add_n, n=10).show()
         +---+-----+
         |int|float|
@@ -338,11 +350,14 @@ class DataFrame: # noqa: D101
         """
         result = func(self, *args, **kwargs)
         assert isinstance(result, DataFrame), (
-
+            "Func returned an instance of type [%s], "
+            "should have been DataFrame." % type(result)
         )
         return result

-    def sort(
+    def sort(
+        self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
+    ) -> "DataFrame":
         """Returns a new :class:`DataFrame` sorted by the specified column(s).

         Parameters
@@ -357,15 +372,16 @@ class DataFrame: # noqa: D101
             Sort ascending vs. descending. Specify list for multiple sort orders.
             If a list is specified, the length of the list must equal the length of the `cols`.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Sorted DataFrame.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql.functions import desc, asc
-        >>> df = spark.createDataFrame([
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])

         Sort the DataFrame in ascending order.

@@ -403,9 +419,8 @@ class DataFrame: # noqa: D101

         Specify multiple columns

-        >>> df = spark.createDataFrame(
-        ...
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
         >>> df.orderBy(desc("age"), "name").show()
         +---+-----+
         |age| name|
@@ -438,7 +453,7 @@ class DataFrame: # noqa: D101
         for c in cols:
             _c = c
             if isinstance(c, str):
-                _c =
+                _c = col(c)
             elif isinstance(c, int) and not isinstance(c, bool):
                 # ordinal is 1-based
                 if c > 0:
@@ -466,13 +481,13 @@ class DataFrame: # noqa: D101
             message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
         )

-        columns = [
+        columns = [_to_column_expr(c) for c in columns]
         rel = self.relation.sort(*columns)
         return DataFrame(rel, self.session)

     orderBy = sort

-    def head(self, n: Optional[int] = None) -> Union[Optional[Row],
+    def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
         if n is None:
             rs = self.head(1)
             return rs[0] if rs else None
@@ -480,7 +495,7 @@ class DataFrame: # noqa: D101

     first = head

-    def take(self, num: int) ->
+    def take(self, num: int) -> List[Row]:
         return self.limit(num).collect()

     def filter(self, condition: "ColumnOrName") -> "DataFrame":
@@ -494,14 +509,15 @@ class DataFrame: # noqa: D101
             a :class:`Column` of :class:`types.BooleanType`
             or a string of SQL expressions.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Filtered DataFrame.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame([
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice"), (5, "Bob")], schema=["age", "name"])

         Filter by :class:`Column` instances.

@@ -547,34 +563,38 @@ class DataFrame: # noqa: D101

     where = filter

-    def select(self, *cols) -> "DataFrame":
+    def select(self, *cols) -> "DataFrame":
         cols = list(cols)
         if len(cols) == 1:
             cols = cols[0]
         if isinstance(cols, list):
-            projections = [
+            projections = [
+                x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
+            ]
         else:
-            projections = [
+            projections = [
+                cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
+            ]
         rel = self.relation.select(*projections)
         return DataFrame(rel, self.session)

     @property
-    def columns(self) ->
+    def columns(self) -> List[str]:
         """Returns all column names as a list.

-        Examples
+        Examples
         --------
         >>> df.columns
         ['age', 'name']
         """
         return [f.name for f in self.schema.fields]

-    def _ipython_key_completions_(self) ->
+    def _ipython_key_completions_(self) -> List[str]:
         # Provides tab-completion for column names in PySpark DataFrame
         # when accessed in bracket notation, e.g. df['<TAB>]
         return self.columns

-    def __dir__(self) ->
+    def __dir__(self) -> List[str]:
         out = set(super().__dir__())
         out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
         return sorted(out)
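The select() path in the hunk above reduces every accepted argument shape to duckdb expressions. A minimal usage sketch of the three call forms, assuming a local DuckDB-backed session (the session setup and data here are illustrative, not taken from the diff):

    from duckdb.experimental.spark.sql import SparkSession
    from duckdb.experimental.spark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

    df.select("age").show()            # plain string -> ColumnExpression("age")
    df.select(col("age") + 1).show()   # Column -> its underlying .expr
    df.select(["age", "name"]).show()  # a single list argument is unpacked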
@@ -582,7 +602,7 @@ class DataFrame: # noqa: D101
     def join(
         self,
         other: "DataFrame",
-        on: Optional[Union[str,
+        on: Optional[Union[str, List[str], Column, List[Column]]] = None,
         how: Optional[str] = None,
     ) -> "DataFrame":
         """Joins with another :class:`DataFrame`, using the given join expression.
@@ -602,12 +622,12 @@ class DataFrame: # noqa: D101
             ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
             ``anti``, ``leftanti`` and ``left_anti``.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Joined DataFrame.

-        Examples
+        Examples
         --------
         The following performs a full outer join between ``df1`` and ``df2``.

@@ -616,24 +636,22 @@ class DataFrame: # noqa: D101
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
         >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
         >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
-        >>> df4 = spark.createDataFrame(
-        ...
-        ...
-        ...
-        ...
-        ...
-        ... ]
-        ... )
+        >>> df4 = spark.createDataFrame([
+        ...     Row(age=10, height=80, name="Alice"),
+        ...     Row(age=5, height=None, name="Bob"),
+        ...     Row(age=None, height=None, name="Tom"),
+        ...     Row(age=None, height=None, name=None),
+        ... ])

         Inner join on columns (default)

-        >>> df.join(df2,
+        >>> df.join(df2, 'name').select(df.name, df2.height).show()
         +----+------+
         |name|height|
         +----+------+
         | Bob|    85|
         +----+------+
-        >>> df.join(df4, [
+        >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
         +----+---+
         |name|age|
         +----+---+
@@ -642,9 +660,8 @@ class DataFrame: # noqa: D101

         Outer join for both DataFrames on the 'name' column.

-        >>> df.join(df2, df.name == df2.name,
-        ...     desc("name")
-        ... ).show()
+        >>> df.join(df2, df.name == df2.name, 'outer').select(
+        ...     df.name, df2.height).sort(desc("name")).show()
         +-----+------+
         | name|height|
         +-----+------+
@@ -652,7 +669,7 @@ class DataFrame: # noqa: D101
         |Alice|  NULL|
         | NULL|    80|
         +-----+------+
-        >>> df.join(df2,
+        >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
         +-----+------+
         | name|height|
         +-----+------+
@@ -663,9 +680,11 @@ class DataFrame: # noqa: D101

         Outer join for both DataFrams with multiple columns.

-        >>> df.join(
-        ...
-        ...
+        >>> df.join(
+        ...     df3,
+        ...     [df.name == df3.name, df.age == df3.age],
+        ...     'outer'
+        ... ).select(df.name, df3.age).show()
         +-----+---+
         | name|age|
         +-----+---+
@@ -673,16 +692,20 @@ class DataFrame: # noqa: D101
         | Bob|  5|
         +-----+---+
         """
+
         if on is not None and not isinstance(on, list):
             on = [on]  # type: ignore[assignment]
-        if on is not None and not all(isinstance(x, str) for x in on):
+        if on is not None and not all([isinstance(x, str) for x in on]):
             assert isinstance(on, list)
             # Get (or create) the Expressions from the list of Columns
-            on = [
+            on = [_to_column_expr(x) for x in on]

             # & all the Expressions together to form one Expression
-            assert isinstance(
-
+            assert isinstance(
+                on[0], Expression
+            ), "on should be Column or list of Column"
+            on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
+

         if on is None and how is None:
             result = self.relation.join(other.relation)
@@ -691,14 +714,14 @@ class DataFrame: # noqa: D101
             how = "inner"
             if on is None:
                 on = "true"
-            elif isinstance(on, list) and all(isinstance(x, str) for x in on):
+            elif isinstance(on, list) and all([isinstance(x, str) for x in on]):
                 # Passed directly through as a list of strings
                 on = on
             else:
                 on = str(on)
             assert isinstance(how, str), "how should be a string"

-            def map_to_recognized_jointype(how
+            def map_to_recognized_jointype(how):
                 known_aliases = {
                     "inner": [],
                     "outer": ["full", "fullouter", "full_outer"],
@@ -707,10 +730,15 @@ class DataFrame: # noqa: D101
                     "anti": ["leftanti", "left_anti"],
                     "semi": ["leftsemi", "left_semi"],
                 }
+                mapped_type = None
                 for type, aliases in known_aliases.items():
                     if how == type or how in aliases:
-
-
+                        mapped_type = type
+                        break
+
+                if not mapped_type:
+                    mapped_type = how
+                return mapped_type

             how = map_to_recognized_jointype(how)
             result = self.relation.join(other.relation, on, how)
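For reference, the alias normalization in the hunk above behaves like the following standalone sketch. Only the alias rows visible in these hunks are reproduced (the entries elided between the two hunks are omitted), and the demo asserts are illustrative:

    def map_to_recognized_jointype(how):
        # alias table as shown in the hunks; rows between the hunks omitted
        known_aliases = {
            "inner": [],
            "outer": ["full", "fullouter", "full_outer"],
            "anti": ["leftanti", "left_anti"],
            "semi": ["leftsemi", "left_semi"],
        }
        mapped_type = None
        for jointype, aliases in known_aliases.items():
            if how == jointype or how in aliases:
                mapped_type = jointype
                break
        if not mapped_type:
            mapped_type = how  # unrecognized names pass through unchanged
        return mapped_type

    assert map_to_recognized_jointype("full_outer") == "outer"
    assert map_to_recognized_jointype("left_semi") == "semi"
    assert map_to_recognized_jointype("cross") == "cross"  # passthrough

The mapped_type variable makes the fallthrough explicit: any name not found in the table is handed to the underlying relational join unchanged.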
@@ -729,16 +757,18 @@ class DataFrame: # noqa: D101
         other : :class:`DataFrame`
             Right side of the cartesian product.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Joined DataFrame.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql import Row
-        >>> df = spark.createDataFrame(
-
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        >>> df2 = spark.createDataFrame(
+        ...     [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
         >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
         +---+-----+------+
         |age| name|height|
@@ -761,21 +791,21 @@ class DataFrame: # noqa: D101
         alias : str
             an alias name to be set for the :class:`DataFrame`.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Aliased DataFrame.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql.functions import col, desc
-        >>> df = spark.createDataFrame(
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
         >>> df_as1 = df.alias("df_as1")
         >>> df_as2 = df.alias("df_as2")
-        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"),
-        >>> joined_df.select(
-        ...     desc("df_as1.name")
-        ... ).show()
+        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
+        >>> joined_df.select(
+        ...     "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
         +-----+-----+---+
         | name| name|age|
         +-----+-----+---+
@@ -787,7 +817,7 @@ class DataFrame: # noqa: D101
         assert isinstance(alias, str), "alias should be a string"
         return DataFrame(self.relation.set_alias(alias), self.session)

-    def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
+    def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc]
         exclude = []
         for col in cols:
             if isinstance(col, str):
@@ -804,7 +834,7 @@ class DataFrame: # noqa: D101
         expr = StarExpression(exclude=exclude)
         return DataFrame(self.relation.select(expr), self.session)

-    def __repr__(self) -> str:
+    def __repr__(self) -> str:
         return str(self.relation)

     def limit(self, num: int) -> "DataFrame":
@@ -816,14 +846,15 @@ class DataFrame: # noqa: D101
             Number of records to return. Will return this number of records
             or all records if the DataFrame contains less than this number of records.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Subset of the records

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
         >>> df.limit(1).show()
         +---+----+
         |age|name|
@@ -839,15 +870,17 @@ class DataFrame: # noqa: D101
         rel = self.relation.limit(num)
         return DataFrame(rel, self.session)

-    def __contains__(self, item: str)
-        """
+    def __contains__(self, item: str):
+        """
+        Check if the :class:`DataFrame` contains a column by the name of `item`
+        """
         return item in self.relation

     @property
     def schema(self) -> StructType:
         """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.

-        Examples
+        Examples
         --------
         >>> df.schema
         StructType([StructField('age', IntegerType(), True),
@@ -856,21 +889,25 @@ class DataFrame: # noqa: D101
         return self._schema

     @overload
-    def __getitem__(self, item: Union[int, str]) -> Column:
+    def __getitem__(self, item: Union[int, str]) -> Column:
+        ...

     @overload
-    def __getitem__(self, item: Union[Column,
+    def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
+        ...

-    def __getitem__(
+    def __getitem__(
+        self, item: Union[int, str, Column, List, Tuple]
+    ) -> Union[Column, "DataFrame"]:
         """Returns the column as a :class:`Column`.

-        Examples
+        Examples
         --------
-        >>> df.select(df[
+        >>> df.select(df['age']).collect()
         [Row(age=2), Row(age=5)]
-        >>> df[["name", "age"]].collect()
+        >>> df[ ["name", "age"]].collect()
         [Row(name='Alice', age=2), Row(name='Bob', age=5)]
-        >>> df[df.age > 3].collect()
+        >>> df[ df.age > 3 ].collect()
         [Row(age=5, name='Bob')]
         >>> df[df[0] > 3].collect()
         [Row(age=5, name='Bob')]
@@ -882,29 +919,31 @@ class DataFrame: # noqa: D101
         elif isinstance(item, (list, tuple)):
             return self.select(*item)
         elif isinstance(item, int):
-            return
+            return col(self._schema[item].name)
         else:
-
-            raise TypeError(msg)
+            raise TypeError(f"Unexpected item type: {type(item)}")

     def __getattr__(self, name: str) -> Column:
         """Returns the :class:`Column` denoted by ``name``.

-        Examples
+        Examples
         --------
         >>> df.select(df.age).collect()
         [Row(age=2), Row(age=5)]
         """
         if name not in self.relation.columns:
-
-
+            raise AttributeError(
+                "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
+            )
         return Column(duckdb.ColumnExpression(self.relation.alias, name))

     @overload
-    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
+    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
+        ...

     @overload
-    def groupBy(self, __cols: Union[
+    def groupBy(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
+        ...

     def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":  # type: ignore[misc]
         """Groups the :class:`DataFrame` using the specified columns,
@@ -920,16 +959,15 @@ class DataFrame: # noqa: D101
             Each element should be a column name (string) or an expression (:class:`Column`)
             or list of them.

-        Returns
+        Returns
         -------
         :class:`GroupedData`
             Grouped data by given columns.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
-        ...
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])

         Empty grouping columns triggers a global aggregation.

@@ -970,19 +1008,22 @@ class DataFrame: # noqa: D101
         | Bob|  2|    2|
         | Bob|  5|    1|
         +-----+---+-----+
-        """
+        """
         from .group import GroupedData, Grouping

-
+        if len(cols) == 1 and isinstance(cols[0], list):
+            columns = cols[0]
+        else:
+            columns = cols
         return GroupedData(Grouping(*columns), self)

     groupby = groupBy

     @property
-    def write(self) -> DataFrameWriter:
+    def write(self) -> DataFrameWriter:
         return DataFrameWriter(self)

-    def printSchema(self)
+    def printSchema(self):
         raise ContributionsAcceptedError

     def union(self, other: "DataFrame") -> "DataFrame":
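A short usage sketch of the list handling added to groupBy in the hunk above: the varargs and single-list call forms normalize to the same Grouping. The session and data are illustrative assumptions, not taken from the diff:

    from duckdb.experimental.spark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])

    a = df.groupBy("name", "age").count()    # varargs form
    b = df.groupBy(["name", "age"]).count()  # single-list form
    assert sorted(a.collect()) == sorted(b.collect())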
@@ -994,22 +1035,22 @@ class DataFrame: # noqa: D101
         other : :class:`DataFrame`
             Another :class:`DataFrame` that needs to be unioned

-        Returns
+        Returns
         -------
         :class:`DataFrame`

-        See Also
+        See Also
         --------
         DataFrame.unionAll

-        Notes
+        Notes
         -----
         This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
         (that does deduplication of elements), use this function followed by :func:`distinct`.

         Also as standard in SQL, this function resolves columns by position (not by name).

-        Examples
+        Examples
         --------
         >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
         >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
@@ -1027,12 +1068,14 @@ class DataFrame: # noqa: D101
         |   1|   2|   3|
         |   1|   2|   3|
         +----+----+----+
-        """
+        """
         return DataFrame(self.relation.union(other.relation), self.session)

     unionAll = union

-    def unionByName(
+    def unionByName(
+        self, other: "DataFrame", allowMissingColumns: bool = False
+    ) -> "DataFrame":
         """Returns a new :class:`DataFrame` containing union of rows in this and another
         :class:`DataFrame`.

@@ -1053,12 +1096,12 @@ class DataFrame: # noqa: D101

         .. versionadded:: 3.1.0

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Combined DataFrame.

-        Examples
+        Examples
         --------
         The difference between this function and :func:`union` is that this function
         resolves columns by name (not by position):
@@ -1087,14 +1130,14 @@ class DataFrame: # noqa: D101
         |   1|   2|   3|NULL|
         |NULL|   4|   5|   6|
         +----+----+----+----+
-        """
+        """
         if allowMissingColumns:
             cols = []
             for col in self.relation.columns:
                 if col in other.relation.columns:
                     cols.append(col)
                 else:
-                    cols.append(
+                    cols.append(lit(None))
             other = other.select(*cols)
         else:
             other = other.select(*self.relation.columns)
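A usage sketch of the allowMissingColumns branch above, mirroring the docstring example: columns the right-hand frame lacks are padded with lit(None) before the positional union. The session setup is an assumption:

    from duckdb.experimental.spark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
    df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
    # col3 is missing from df1 and col0 from df2; both come back as NULL
    df1.unionByName(df2, allowMissingColumns=True).show()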
@@ -1117,16 +1160,16 @@ class DataFrame: # noqa: D101
         other : :class:`DataFrame`
             Another :class:`DataFrame` that needs to be combined.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Combined DataFrame.

-        Notes
+        Notes
         -----
         This is equivalent to `INTERSECT` in SQL.

-        Examples
+        Examples
         --------
         >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
         >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1137,7 +1180,7 @@ class DataFrame: # noqa: D101
         |  b|  3|
         |  a|  1|
         +---+---+
-        """
+        """
         return self.intersectAll(other).drop_duplicates()

     def intersectAll(self, other: "DataFrame") -> "DataFrame":
@@ -1157,12 +1200,12 @@ class DataFrame: # noqa: D101
         other : :class:`DataFrame`
             Another :class:`DataFrame` that needs to be combined.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Combined DataFrame.

-        Examples
+        Examples
         --------
         >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
         >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1174,7 +1217,7 @@ class DataFrame: # noqa: D101
         |  a|  1|
         |  b|  3|
         +---+---+
-        """
+        """
         return DataFrame(self.relation.intersect(other.relation), self.session)

     def exceptAll(self, other: "DataFrame") -> "DataFrame":
@@ -1194,15 +1237,14 @@ class DataFrame: # noqa: D101
         other : :class:`DataFrame`
             The other :class:`DataFrame` to compare to.

-        Returns
+        Returns
         -------
         :class:`DataFrame`

-        Examples
+        Examples
         --------
         >>> df1 = spark.createDataFrame(
-        ...
-        ... )
+        ...     [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])
         >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
         >>> df1.exceptAll(df2).show()
         +---+---+
@@ -1214,10 +1256,10 @@ class DataFrame: # noqa: D101
         |  c|  4|
         +---+---+

-        """
+        """
         return DataFrame(self.relation.except_(other.relation), self.session)

-    def dropDuplicates(self, subset: Optional[
+    def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
         """Return a new :class:`DataFrame` with duplicate rows removed,
         optionally only considering certain columns.

@@ -1234,21 +1276,19 @@ class DataFrame: # noqa: D101
         subset : List of column names, optional
             List of columns to use for duplicate comparison (default All columns).

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             DataFrame without duplicates.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql import Row
-        >>> df = spark.createDataFrame(
-        ...
-        ...
-        ...
-        ...
-        ... ]
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     Row(name='Alice', age=5, height=80),
+        ...     Row(name='Alice', age=5, height=80),
+        ...     Row(name='Alice', age=10, height=80)
+        ... ])

         Deduplicate the same rows.

@@ -1262,16 +1302,16 @@ class DataFrame: # noqa: D101

         Deduplicate values on 'name' and 'height' columns.

-        >>> df.dropDuplicates([
+        >>> df.dropDuplicates(['name', 'height']).show()
         +-----+---+------+
         | name|age|height|
         +-----+---+------+
         |Alice|  5|    80|
         +-----+---+------+
-        """
+        """
         if subset:
             rn_col = f"tmp_col_{uuid.uuid1().hex}"
-            subset_str =
+            subset_str = ', '.join([f'"{c}"' for c in subset])
             window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
             df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
             return df.filter(f"{rn_col} = 1").drop(rn_col)
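The subset branch above implements deduplication with a ROW_NUMBER() window: rows are numbered per partition of the subset columns and only the first survives. A hedged sketch of the equivalent raw DuckDB SQL (table and values are illustrative):

    import duckdb

    rel = duckdb.sql("""
        SELECT name, age, height
        FROM (
            SELECT *, row_number() OVER (PARTITION BY name, height) AS rn
            FROM (VALUES ('Alice', 5, 80), ('Alice', 5, 80), ('Alice', 10, 80))
                 AS t(name, age, height)
        )
        WHERE rn = 1
    """)
    print(rel)

Note that without an ORDER BY in the window, which row of each partition is kept is not deterministic, which matches the window spec built in the code above.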
@@ -1280,17 +1320,19 @@ class DataFrame: # noqa: D101

     drop_duplicates = dropDuplicates

+
     def distinct(self) -> "DataFrame":
         """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             DataFrame with distinct records.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

         Return the number of distinct rows in the :class:`DataFrame`

@@ -1303,14 +1345,15 @@ class DataFrame: # noqa: D101
     def count(self) -> int:
         """Returns the number of rows in this :class:`DataFrame`.

-        Returns
+        Returns
         -------
         int
             Number of rows.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
+        >>> df = spark.createDataFrame(
+        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

         Return the number of rows in the :class:`DataFrame`.

@@ -1326,28 +1369,33 @@ class DataFrame: # noqa: D101
         assert types_count == len(existing_columns)

         cast_expressions = [
-            f"{existing}::{target_type} as {existing}"
+            f"{existing}::{target_type} as {existing}"
+            for existing, target_type in zip(existing_columns, types)
         ]
         cast_expressions = ", ".join(cast_expressions)
         new_rel = self.relation.project(cast_expressions)
         return DataFrame(new_rel, self.session)

-    def toDF(self, *cols) -> "DataFrame":
+    def toDF(self, *cols) -> "DataFrame":
         existing_columns = self.relation.columns
         column_count = len(cols)
         if column_count != len(existing_columns):
-            raise PySparkValueError(
+            raise PySparkValueError(
+                message="Provided column names and number of columns in the DataFrame don't match"
+            )

         existing_columns = [ColumnExpression(x) for x in existing_columns]
-        projections = [
+        projections = [
+            existing.alias(new) for existing, new in zip(existing_columns, cols)
+        ]
         new_rel = self.relation.project(*projections)
         return DataFrame(new_rel, self.session)

-    def collect(self) ->
+    def collect(self) -> List[Row]:
         columns = self.relation.columns
         result = self.relation.fetchall()

-        def construct_row(values
+        def construct_row(values, names) -> Row:
             row = tuple.__new__(Row, list(values))
             row.__fields__ = list(names)
             return row
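The collect() hunk above builds each result Row by allocating a bare tuple and attaching the column names. A minimal standalone sketch of that mechanism (the import path is an assumption based on this package's layout):

    from duckdb.experimental.spark.sql.types import Row

    def construct_row(values, names) -> Row:
        # shape copied from the hunk: promote a tuple to Row, then name it
        row = tuple.__new__(Row, list(values))
        row.__fields__ = list(names)
        return row

    r = construct_row((2, "Alice"), ("age", "name"))
    print(r)      # Row(age=2, name='Alice')
    print(r.age)  # 2 -- attribute access resolves through __fields__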
@@ -1363,16 +1411,16 @@ class DataFrame: # noqa: D101
         .. versionchanged:: 3.4.0
             Supports Spark Connect.

-        Notes
+        Notes
         -----
         The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.

-        Returns
+        Returns
         -------
         :class:`DataFrame`
             Cached DataFrame.

-        Examples
+        Examples
         --------
         >>> df = spark.range(1)
         >>> df.cache()