duckdb-1.5.0.dev56-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckdb might be problematic.
- _duckdb-stubs/__init__.pyi +1443 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- _duckdb.cpython-314-x86_64-linux-gnu.so +0 -0
- adbc_driver_duckdb/__init__.py +50 -0
- adbc_driver_duckdb/dbapi.py +115 -0
- duckdb/__init__.py +381 -0
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- duckdb/bytes_io_wrapper.py +69 -0
- duckdb/experimental/__init__.py +3 -0
- duckdb/experimental/spark/LICENSE +260 -0
- duckdb/experimental/spark/__init__.py +6 -0
- duckdb/experimental/spark/_globals.py +77 -0
- duckdb/experimental/spark/_typing.py +46 -0
- duckdb/experimental/spark/conf.py +46 -0
- duckdb/experimental/spark/context.py +180 -0
- duckdb/experimental/spark/errors/__init__.py +70 -0
- duckdb/experimental/spark/errors/error_classes.py +918 -0
- duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
- duckdb/experimental/spark/errors/exceptions/base.py +168 -0
- duckdb/experimental/spark/errors/utils.py +111 -0
- duckdb/experimental/spark/exception.py +18 -0
- duckdb/experimental/spark/sql/__init__.py +7 -0
- duckdb/experimental/spark/sql/_typing.py +86 -0
- duckdb/experimental/spark/sql/catalog.py +79 -0
- duckdb/experimental/spark/sql/column.py +361 -0
- duckdb/experimental/spark/sql/conf.py +24 -0
- duckdb/experimental/spark/sql/dataframe.py +1389 -0
- duckdb/experimental/spark/sql/functions.py +6195 -0
- duckdb/experimental/spark/sql/group.py +424 -0
- duckdb/experimental/spark/sql/readwriter.py +435 -0
- duckdb/experimental/spark/sql/session.py +297 -0
- duckdb/experimental/spark/sql/streaming.py +36 -0
- duckdb/experimental/spark/sql/type_utils.py +107 -0
- duckdb/experimental/spark/sql/types.py +1239 -0
- duckdb/experimental/spark/sql/udf.py +37 -0
- duckdb/filesystem.py +33 -0
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +13 -0
- duckdb/polars_io.py +284 -0
- duckdb/py.typed +0 -0
- duckdb/query_graph/__main__.py +358 -0
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +71 -0
- duckdb/udf.py +24 -0
- duckdb/value/__init__.py +1 -0
- duckdb/value/constant/__init__.py +270 -0
- duckdb-1.5.0.dev56.dist-info/METADATA +87 -0
- duckdb-1.5.0.dev56.dist-info/RECORD +52 -0
- duckdb-1.5.0.dev56.dist-info/WHEEL +6 -0
- duckdb-1.5.0.dev56.dist-info/licenses/LICENSE +7 -0
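
The bulk of this release is the new experimental Spark-compatible API under duckdb/experimental/spark/. As a quick orientation before the file diff below, here is a minimal usage sketch. It assumes the import paths from the listing above and the entry points documented for DuckDB's experimental Spark API (SparkSession.builder, createDataFrame, functions.col), so treat it as illustrative for this dev build rather than authoritative.

# Hedged sketch: exercising the experimental Spark-style API shipped in this wheel.
# Entry points follow DuckDB's experimental Spark API docs; details may differ
# in a dev build.
from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
df.filter(col("age") > 3).show()  # filter/show come from the dataframe.py diffed below
print(df.withColumn("age2", col("age") + 2).collect())

The DataFrame class whose full source follows wraps a DuckDBPyRelation and translates these calls into DuckDB relational operations.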
duckdb/experimental/spark/sql/dataframe.py (new file, hunk @@ -0,0 +1,1389 @@):

import uuid  # noqa: D100
from functools import reduce
from keyword import iskeyword
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Optional,
    Union,
    cast,
    overload,
)

import duckdb
from duckdb import ColumnExpression, Expression, StarExpression

from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
from ..exception import ContributionsAcceptedError
from .column import Column
from .readwriter import DataFrameWriter
from .type_utils import duckdb_to_spark_schema
from .types import Row, StructType

if TYPE_CHECKING:
    import pyarrow as pa
    from pandas.core.frame import DataFrame as PandasDataFrame

    from ._typing import ColumnOrName
    from .group import GroupedData
    from .session import SparkSession

from duckdb.experimental.spark.sql import functions as spark_sql_functions


class DataFrame:  # noqa: D101
    def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None:  # noqa: D107
        self.relation = relation
        self.session = session
        self._schema = None
        if self.relation is not None:
            self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)

    def show(self, **kwargs) -> None:  # noqa: D102
        self.relation.show()

    def toPandas(self) -> "PandasDataFrame":  # noqa: D102
        return self.relation.df()

    def toArrow(self) -> "pa.Table":
        """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.

        This is only available if PyArrow is installed and available.

        .. versionadded:: 4.0.0

        Notes:
        -----
        This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
        expected to be small, as all the data is loaded into the driver's memory.

        This API is a developer API.

        Examples:
        --------
        >>> df.toArrow()  # doctest: +SKIP
        pyarrow.Table
        age: int64
        name: string
        ----
        age: [[2,5]]
        name: [["Alice","Bob"]]
        """
        return self.relation.to_arrow_table()

    def createOrReplaceTempView(self, name: str) -> None:
        """Creates or replaces a local temporary view with this :class:`DataFrame`.

        The lifetime of this temporary table is tied to the :class:`SparkSession`
        that was used to create this :class:`DataFrame`.

        Parameters
        ----------
        name : str
            Name of the view.

        Examples:
        --------
        Create a local temporary view named 'people'.

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df.createOrReplaceTempView("people")

        Replace the local temporary view.

        >>> df2 = df.filter(df.age > 3)
        >>> df2.createOrReplaceTempView("people")
        >>> df3 = spark.sql("SELECT * FROM people")
        >>> sorted(df3.collect()) == sorted(df2.collect())
        True
        >>> spark.catalog.dropTempView("people")
        True

        """
        self.relation.create_view(name, True)

    def createGlobalTempView(self, name: str) -> None:  # noqa: D102
        raise NotImplementedError

    def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":  # noqa: D102
        if columnName not in self.relation:
            msg = f"DataFrame does not contain a column named {columnName}"
            raise ValueError(msg)
        cols = []
        for x in self.relation.columns:
            col = ColumnExpression(x)
            if x.casefold() == columnName.casefold():
                col = col.alias(newName)
            cols.append(col)
        rel = self.relation.select(*cols)
        return DataFrame(rel, self.session)

    def withColumn(self, columnName: str, col: Column) -> "DataFrame":  # noqa: D102
        if not isinstance(col, Column):
            raise PySparkTypeError(
                error_class="NOT_COLUMN",
                message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
            )
        if columnName in self.relation:
            # We want to replace the existing column with this new expression
            cols = []
            for x in self.relation.columns:
                if x.casefold() == columnName.casefold():
                    cols.append(col.expr.alias(columnName))
                else:
                    cols.append(ColumnExpression(x))
        else:
            cols = [ColumnExpression(x) for x in self.relation.columns]
            cols.append(col.expr.alias(columnName))
        rel = self.relation.select(*cols)
        return DataFrame(rel, self.session)

    def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
        """Returns a new :class:`DataFrame` by adding multiple columns or replacing the
        existing columns that have the same names.

        The colsMap is a map of column name and column, the column must only refer to attributes
        supplied by this Dataset. It is an error to add columns that refer to some other Dataset.

        .. versionadded:: 3.3.0
            Added support for multiple columns adding

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        colsMap : dict
            a dict of column name and :class:`Column`. Currently, only a single map is supported.

        Returns:
        -------
        :class:`DataFrame`
            DataFrame with new or replaced columns.

        Examples:
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
        +---+-----+----+----+
        |age| name|age2|age3|
        +---+-----+----+----+
        |  2|Alice|   4|   5|
        |  5|  Bob|   7|   8|
        +---+-----+----+----+
        """  # noqa: D205
        # Below code is to help enable kwargs in future.
        assert len(colsMap) == 1
        colsMap = colsMap[0]  # type: ignore[assignment]

        if not isinstance(colsMap, dict):
            raise PySparkTypeError(
                error_class="NOT_DICT",
                message_parameters={
                    "arg_name": "colsMap",
                    "arg_type": type(colsMap).__name__,
                },
            )

        column_names = list(colsMap.keys())
        columns = list(colsMap.values())

        # Compute this only once
        column_names_for_comparison = [x.casefold() for x in column_names]

        cols = []
        for x in self.relation.columns:
            if x.casefold() in column_names_for_comparison:
                idx = column_names_for_comparison.index(x)
                # We extract the column name from the originally passed
                # in ones, as the casing might be different than the one
                # in the relation
                col_name = column_names.pop(idx)
                col = columns.pop(idx)
                cols.append(col.expr.alias(col_name))
            else:
                cols.append(ColumnExpression(x))

        # In case anything is remaining, these are new columns
        # that we need to add to the DataFrame
        for col_name, col in zip(column_names, columns):
            cols.append(col.expr.alias(col_name))

        rel = self.relation.select(*cols)
        return DataFrame(rel, self.session)

    def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
        """Returns a new :class:`DataFrame` by renaming multiple columns.
        This is a no-op if the schema doesn't contain the given column names.

        .. versionadded:: 3.4.0
            Added support for multiple columns renaming

        Parameters
        ----------
        colsMap : dict
            a dict of existing column names and corresponding desired column names.
            Currently, only a single map is supported.

        Returns:
        -------
        :class:`DataFrame`
            DataFrame with renamed columns.

        See Also:
        --------
        :meth:`withColumnRenamed`

        Notes:
        -----
        Support Spark Connect

        Examples:
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
        >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
        >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
        +---+-----+----+----+
        |age| name|age4|age5|
        +---+-----+----+----+
        |  2|Alice|   4|   5|
        |  5|  Bob|   7|   8|
        +---+-----+----+----+
        """  # noqa: D205
        if not isinstance(colsMap, dict):
            raise PySparkTypeError(
                error_class="NOT_DICT",
                message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__},
            )

        unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
        if unknown_columns:
            msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
            raise ValueError(msg)

        # Compute this only once
        old_column_names = list(colsMap.keys())
        old_column_names_for_comparison = [x.casefold() for x in old_column_names]

        cols = []
        for x in self.relation.columns:
            col = ColumnExpression(x)
            if x.casefold() in old_column_names_for_comparison:
                idx = old_column_names.index(x)
                # We extract the column name from the originally passed
                # in ones, as the casing might be different than the one
                # in the relation
                col_name = old_column_names.pop(idx)
                new_col_name = colsMap[col_name]
                col = col.alias(new_col_name)
            cols.append(col)

        rel = self.relation.select(*cols)
        return DataFrame(rel, self.session)

    def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame":  # noqa: ANN401
        """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.

        .. versionadded:: 3.0.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        func : function
            a function that takes and returns a :class:`DataFrame`.
        *args
            Positional arguments to pass to func.

            .. versionadded:: 3.3.0
        **kwargs
            Keyword arguments to pass to func.

            .. versionadded:: 3.3.0

        Returns:
        -------
        :class:`DataFrame`
            Transformed DataFrame.

        Examples:
        --------
        >>> from pyspark.sql.functions import col
        >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
        >>> def cast_all_to_int(input_df):
        ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
        >>> def sort_columns_asc(input_df):
        ...     return input_df.select(*sorted(input_df.columns))
        >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
        +-----+---+
        |float|int|
        +-----+---+
        |    1|  1|
        |    2|  2|
        +-----+---+

        >>> def add_n(input_df, n):
        ...     return input_df.select(
        ...         [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
        ...     )
        >>> df.transform(add_n, 1).transform(add_n, n=10).show()
        +---+-----+
        |int|float|
        +---+-----+
        | 12| 12.0|
        | 13| 13.0|
        +---+-----+
        """
        result = func(self, *args, **kwargs)
        assert isinstance(result, DataFrame), (
            f"Func returned an instance of type [{type(result)}], should have been DataFrame."
        )
        return result

    def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame":  # noqa: ANN401
        """Returns a new :class:`DataFrame` sorted by the specified column(s).

        Parameters
        ----------
        cols : str, list, or :class:`Column`, optional
            list of :class:`Column` or column names to sort by.

        Other Parameters
        ----------------
        ascending : bool or list, optional, default True
            boolean or list of boolean.
            Sort ascending vs. descending. Specify list for multiple sort orders.
            If a list is specified, the length of the list must equal the length of the `cols`.

        Returns:
        -------
        :class:`DataFrame`
            Sorted DataFrame.

        Examples:
        --------
        >>> from pyspark.sql.functions import desc, asc
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

        Sort the DataFrame in ascending order.

        >>> df.sort(asc("age")).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        |  5|  Bob|
        +---+-----+

        Sort the DataFrame in descending order.

        >>> df.sort(df.age.desc()).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+
        >>> df.orderBy(df.age.desc()).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+
        >>> df.sort("age", ascending=False).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        +---+-----+

        Specify multiple columns

        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
        ... )
        >>> df.orderBy(desc("age"), "name").show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|Alice|
        |  2|  Bob|
        +---+-----+

        Specify multiple columns for sorting order at `ascending`.

        >>> df.orderBy(["age", "name"], ascending=[False, False]).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  5|  Bob|
        |  2|  Bob|
        |  2|Alice|
        +---+-----+
        """
        if not cols:
            raise PySparkValueError(
                error_class="CANNOT_BE_EMPTY",
                message_parameters={"item": "column"},
            )
        if len(cols) == 1 and isinstance(cols[0], list):
            cols = cols[0]

        columns = []
        for c in cols:
            _c = c
            if isinstance(c, str):
                _c = spark_sql_functions.col(c)
            elif isinstance(c, int) and not isinstance(c, bool):
                # ordinal is 1-based
                if c > 0:
                    _c = self[c - 1]
                # negative ordinal means sort by desc
                elif c < 0:
                    _c = self[-c - 1].desc()
                else:
                    raise PySparkIndexError(
                        error_class="ZERO_INDEX",
                        message_parameters={},
                    )
            columns.append(_c)

        ascending = kwargs.get("ascending", True)

        if isinstance(ascending, (bool, int)):
            if not ascending:
                columns = [c.desc() for c in columns]
        elif isinstance(ascending, list):
            columns = [c if asc else c.desc() for asc, c in zip(ascending, columns)]
        else:
            raise PySparkTypeError(
                error_class="NOT_BOOL_OR_LIST",
                message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
            )

        columns = [spark_sql_functions._to_column_expr(c) for c in columns]
        rel = self.relation.sort(*columns)
        return DataFrame(rel, self.session)

    orderBy = sort

    def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]:  # noqa: D102
        if n is None:
            rs = self.head(1)
            return rs[0] if rs else None
        return self.take(n)

    first = head

    def take(self, num: int) -> list[Row]:  # noqa: D102
        return self.limit(num).collect()

    def filter(self, condition: "ColumnOrName") -> "DataFrame":
        """Filters rows using the given condition.

        :func:`where` is an alias for :func:`filter`.

        Parameters
        ----------
        condition : :class:`Column` or str
            a :class:`Column` of :class:`types.BooleanType`
            or a string of SQL expressions.

        Returns:
        -------
        :class:`DataFrame`
            Filtered DataFrame.

        Examples:
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])

        Filter by :class:`Column` instances.

        >>> df.filter(df.age > 3).show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        +---+----+
        >>> df.where(df.age == 2).show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+

        Filter by SQL expression in a string.

        >>> df.filter("age > 3").show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        +---+----+
        >>> df.where("age = 2").show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+
        """
        if isinstance(condition, Column):
            cond = condition.expr
        elif isinstance(condition, str):
            cond = condition
        else:
            raise PySparkTypeError(
                error_class="NOT_COLUMN_OR_STR",
                message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__},
            )
        rel = self.relation.filter(cond)
        return DataFrame(rel, self.session)

    where = filter

    def select(self, *cols) -> "DataFrame":  # noqa: D102
        cols = list(cols)
        if len(cols) == 1:
            cols = cols[0]
        if isinstance(cols, list):
            projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
        else:
            projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]
        rel = self.relation.select(*projections)
        return DataFrame(rel, self.session)

    @property
    def columns(self) -> list[str]:
        """Returns all column names as a list.

        Examples:
        --------
        >>> df.columns
        ['age', 'name']
        """
        return [f.name for f in self.schema.fields]

    def _ipython_key_completions_(self) -> list[str]:
        # Provides tab-completion for column names in PySpark DataFrame
        # when accessed in bracket notation, e.g. df['<TAB>]
        return self.columns

    def __dir__(self) -> list[str]:  # noqa: D105
        out = set(super().__dir__())
        out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
        return sorted(out)

    def join(
        self,
        other: "DataFrame",
        on: Optional[Union[str, list[str], Column, list[Column]]] = None,
        how: Optional[str] = None,
    ) -> "DataFrame":
        """Joins with another :class:`DataFrame`, using the given join expression.

        Parameters
        ----------
        other : :class:`DataFrame`
            Right side of the join
        on : str, list or :class:`Column`, optional
            a string for the join column name, a list of column names,
            a join expression (Column), or a list of Columns.
            If `on` is a string or a list of strings indicating the name of the join column(s),
            the column(s) must exist on both sides, and this performs an equi-join.
        how : str, optional
            default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
            ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,
            ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
            ``anti``, ``leftanti`` and ``left_anti``.

        Returns:
        -------
        :class:`DataFrame`
            Joined DataFrame.

        Examples:
        --------
        The following performs a full outer join between ``df1`` and ``df2``.

        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import desc
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
        >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
        >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
        >>> df4 = spark.createDataFrame(
        ...     [
        ...         Row(age=10, height=80, name="Alice"),
        ...         Row(age=5, height=None, name="Bob"),
        ...         Row(age=None, height=None, name="Tom"),
        ...         Row(age=None, height=None, name=None),
        ...     ]
        ... )

        Inner join on columns (default)

        >>> df.join(df2, "name").select(df.name, df2.height).show()
        +----+------+
        |name|height|
        +----+------+
        | Bob|    85|
        +----+------+
        >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()
        +----+---+
        |name|age|
        +----+---+
        | Bob|  5|
        +----+---+

        Outer join for both DataFrames on the 'name' column.

        >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
        ...     desc("name")
        ... ).show()
        +-----+------+
        | name|height|
        +-----+------+
        |  Bob|    85|
        |Alice|  NULL|
        | NULL|    80|
        +-----+------+
        >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()
        +-----+------+
        | name|height|
        +-----+------+
        |  Tom|    80|
        |  Bob|    85|
        |Alice|  NULL|
        +-----+------+

        Outer join for both DataFrams with multiple columns.

        >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
        ...     df.name, df3.age
        ... ).show()
        +-----+---+
        | name|age|
        +-----+---+
        |Alice|  2|
        |  Bob|  5|
        +-----+---+
        """
        if on is not None and not isinstance(on, list):
            on = [on]  # type: ignore[assignment]
        if on is not None and not all(isinstance(x, str) for x in on):
            assert isinstance(on, list)
            # Get (or create) the Expressions from the list of Columns
            on = [spark_sql_functions._to_column_expr(x) for x in on]

            # & all the Expressions together to form one Expression
            assert isinstance(on[0], Expression), "on should be Column or list of Column"
            on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))

        if on is None and how is None:
            result = self.relation.join(other.relation)
        else:
            if how is None:
                how = "inner"
            if on is None:
                on = "true"
            elif isinstance(on, list) and all(isinstance(x, str) for x in on):
                # Passed directly through as a list of strings
                on = on
            else:
                on = str(on)
            assert isinstance(how, str), "how should be a string"

            def map_to_recognized_jointype(how: str) -> str:
                known_aliases = {
                    "inner": [],
                    "outer": ["full", "fullouter", "full_outer"],
                    "left": ["leftouter", "left_outer"],
                    "right": ["rightouter", "right_outer"],
                    "anti": ["leftanti", "left_anti"],
                    "semi": ["leftsemi", "left_semi"],
                }
                for type, aliases in known_aliases.items():
                    if how == type or how in aliases:
                        return type
                return how

            how = map_to_recognized_jointype(how)
            result = self.relation.join(other.relation, on, how)
        return DataFrame(result, self.session)

    def crossJoin(self, other: "DataFrame") -> "DataFrame":
        """Returns the cartesian product with another :class:`DataFrame`.

        .. versionadded:: 2.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Right side of the cartesian product.

        Returns:
        -------
        :class:`DataFrame`
            Joined DataFrame.

        Examples:
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
        >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
        +---+-----+------+
        |age| name|height|
        +---+-----+------+
        | 14|  Tom|    80|
        | 14|  Tom|    85|
        | 23|Alice|    80|
        | 23|Alice|    85|
        | 16|  Bob|    80|
        | 16|  Bob|    85|
        +---+-----+------+
        """
        return DataFrame(self.relation.cross(other.relation), self.session)

    def alias(self, alias: str) -> "DataFrame":
        """Returns a new :class:`DataFrame` with an alias set.

        Parameters
        ----------
        alias : str
            an alias name to be set for the :class:`DataFrame`.

        Returns:
        -------
        :class:`DataFrame`
            Aliased DataFrame.

        Examples:
        --------
        >>> from pyspark.sql.functions import col, desc
        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df_as1 = df.alias("df_as1")
        >>> df_as2 = df.alias("df_as2")
        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
        >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
        ...     desc("df_as1.name")
        ... ).show()
        +-----+-----+---+
        | name| name|age|
        +-----+-----+---+
        |  Tom|  Tom| 14|
        |  Bob|  Bob| 16|
        |Alice|Alice| 23|
        +-----+-----+---+
        """
        assert isinstance(alias, str), "alias should be a string"
        return DataFrame(self.relation.set_alias(alias), self.session)

    def drop(self, *cols: "ColumnOrName") -> "DataFrame":  # type: ignore[misc] # noqa: D102
        exclude = []
        for col in cols:
            if isinstance(col, str):
                exclude.append(col)
            elif isinstance(col, Column):
                exclude.append(col.expr.get_name())
            else:
                raise PySparkTypeError(
                    error_class="NOT_COLUMN_OR_STR",
                    message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
                )
        # Filter out the columns that don't exist in the relation
        exclude = [x for x in exclude if x in self.relation.columns]
        expr = StarExpression(exclude=exclude)
        return DataFrame(self.relation.select(expr), self.session)

    def __repr__(self) -> str:  # noqa: D105
        return str(self.relation)

    def limit(self, num: int) -> "DataFrame":
        """Limits the result count to the number specified.

        Parameters
        ----------
        num : int
            Number of records to return. Will return this number of records
            or all records if the DataFrame contains less than this number of records.

        Returns:
        -------
        :class:`DataFrame`
            Subset of the records

        Examples:
        --------
        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
        >>> df.limit(1).show()
        +---+----+
        |age|name|
        +---+----+
        | 14| Tom|
        +---+----+
        >>> df.limit(0).show()
        +---+----+
        |age|name|
        +---+----+
        +---+----+
        """
        rel = self.relation.limit(num)
        return DataFrame(rel, self.session)

    def __contains__(self, item: str) -> bool:
        """Check if the :class:`DataFrame` contains a column by the name of `item`."""
        return item in self.relation

    @property
    def schema(self) -> StructType:
        """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.

        Examples:
        --------
        >>> df.schema
        StructType([StructField('age', IntegerType(), True),
                    StructField('name', StringType(), True)])
        """
        return self._schema

    @overload
    def __getitem__(self, item: Union[int, str]) -> Column: ...

    @overload
    def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...

    def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
        """Returns the column as a :class:`Column`.

        Examples:
        --------
        >>> df.select(df["age"]).collect()
        [Row(age=2), Row(age=5)]
        >>> df[["name", "age"]].collect()
        [Row(name='Alice', age=2), Row(name='Bob', age=5)]
        >>> df[df.age > 3].collect()
        [Row(age=5, name='Bob')]
        >>> df[df[0] > 3].collect()
        [Row(age=5, name='Bob')]
        """
        if isinstance(item, str):
            return Column(duckdb.ColumnExpression(self.relation.alias, item))
        elif isinstance(item, Column):
            return self.filter(item)
        elif isinstance(item, (list, tuple)):
            return self.select(*item)
        elif isinstance(item, int):
            return spark_sql_functions.col(self._schema[item].name)
        else:
            msg = f"Unexpected item type: {type(item)}"
            raise TypeError(msg)

    def __getattr__(self, name: str) -> Column:
        """Returns the :class:`Column` denoted by ``name``.

        Examples:
        --------
        >>> df.select(df.age).collect()
        [Row(age=2), Row(age=5)]
        """
        if name not in self.relation.columns:
            msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
            raise AttributeError(msg)
        return Column(duckdb.ColumnExpression(self.relation.alias, name))

    @overload
    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...

    @overload
    def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ...  # noqa: PYI063

    def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":  # type: ignore[misc]
        """Groups the :class:`DataFrame` using the specified columns,
        so we can run aggregation on them. See :class:`GroupedData`
        for all the available aggregate functions.

        :func:`groupby` is an alias for :func:`groupBy`.

        Parameters
        ----------
        cols : list, str or :class:`Column`
            columns to group by.
            Each element should be a column name (string) or an expression (:class:`Column`)
            or list of them.

        Returns:
        -------
        :class:`GroupedData`
            Grouped data by given columns.

        Examples:
        --------
        >>> df = spark.createDataFrame(
        ...     [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
        ... )

        Empty grouping columns triggers a global aggregation.

        >>> df.groupBy().avg().show()
        +--------+
        |avg(age)|
        +--------+
        |    2.75|
        +--------+

        Group-by 'name', and specify a dictionary to calculate the summation of 'age'.

        >>> df.groupBy("name").agg({"age": "sum"}).sort("name").show()
        +-----+--------+
        | name|sum(age)|
        +-----+--------+
        |Alice|       2|
        |  Bob|       9|
        +-----+--------+

        Group-by 'name', and calculate maximum values.

        >>> df.groupBy(df.name).max().sort("name").show()
        +-----+--------+
        | name|max(age)|
        +-----+--------+
        |Alice|       2|
        |  Bob|       5|
        +-----+--------+

        Group-by 'name' and 'age', and calculate the number of rows in each group.

        >>> df.groupBy(["name", df.age]).count().sort("name", "age").show()
        +-----+---+-----+
        | name|age|count|
        +-----+---+-----+
        |Alice|  2|    1|
        |  Bob|  2|    2|
        |  Bob|  5|    1|
        +-----+---+-----+
        """  # noqa: D205
        from .group import GroupedData, Grouping

        columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols
        return GroupedData(Grouping(*columns), self)

    groupby = groupBy

    @property
    def write(self) -> DataFrameWriter:  # noqa: D102
        return DataFrameWriter(self)

    def printSchema(self) -> None:  # noqa: D102
        raise ContributionsAcceptedError

    def union(self, other: "DataFrame") -> "DataFrame":
        """Return a new :class:`DataFrame` containing union of rows in this and another
        :class:`DataFrame`.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be unioned

        Returns:
        -------
        :class:`DataFrame`

        See Also:
        --------
        DataFrame.unionAll

        Notes:
        -----
        This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
        (that does deduplication of elements), use this function followed by :func:`distinct`.

        Also as standard in SQL, this function resolves columns by position (not by name).

        Examples:
        --------
        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
        >>> df1.union(df2).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   4|   5|   6|
        +----+----+----+
        >>> df1.union(df1).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   1|   2|   3|
        +----+----+----+
        """  # noqa: D205
        return DataFrame(self.relation.union(other.relation), self.session)

    unionAll = union

    def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
        """Returns a new :class:`DataFrame` containing union of rows in this and another
        :class:`DataFrame`.

        This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
        union (that does deduplication of elements), use this function followed by :func:`distinct`.

        .. versionadded:: 2.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.
        allowMissingColumns : bool, optional, default False
            Specify whether to allow missing columns.

            .. versionadded:: 3.1.0

        Returns:
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Examples:
        --------
        The difference between this function and :func:`union` is that this function
        resolves columns by name (not by position):

        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
        >>> df1.unionByName(df2).show()
        +----+----+----+
        |col0|col1|col2|
        +----+----+----+
        |   1|   2|   3|
        |   6|   4|   5|
        +----+----+----+

        When the parameter `allowMissingColumns` is ``True``, the set of column names
        in this and other :class:`DataFrame` can differ; missing columns will be filled with null.
        Further, the missing columns of this :class:`DataFrame` will be added at the end
        in the schema of the union result:

        >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
        >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
        >>> df1.unionByName(df2, allowMissingColumns=True).show()
        +----+----+----+----+
        |col0|col1|col2|col3|
        +----+----+----+----+
        |   1|   2|   3|NULL|
        |NULL|   4|   5|   6|
        +----+----+----+----+
        """  # noqa: D205
        if allowMissingColumns:
            cols = []
            for col in self.relation.columns:
                if col in other.relation.columns:
                    cols.append(col)
                else:
                    cols.append(spark_sql_functions.lit(None))
            other = other.select(*cols)
        else:
            other = other.select(*self.relation.columns)

        return DataFrame(self.relation.union(other.relation), self.session)

    def intersect(self, other: "DataFrame") -> "DataFrame":
        """Return a new :class:`DataFrame` containing rows only in
        both this :class:`DataFrame` and another :class:`DataFrame`.
        Note that any duplicates are removed. To preserve duplicates
        use :func:`intersectAll`.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.

        Returns:
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Notes:
        -----
        This is equivalent to `INTERSECT` in SQL.

        Examples:
        --------
        >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
        >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.intersect(df2).sort(df1.C1.desc()).show()
        +---+---+
        | C1| C2|
        +---+---+
        |  b|  3|
        |  a|  1|
        +---+---+
        """  # noqa: D205
        return self.intersectAll(other).drop_duplicates()

    def intersectAll(self, other: "DataFrame") -> "DataFrame":
        """Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
        and another :class:`DataFrame` while preserving duplicates.

        This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function
        resolves columns by position (not by name).

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            Another :class:`DataFrame` that needs to be combined.

        Returns:
        -------
        :class:`DataFrame`
            Combined DataFrame.

        Examples:
        --------
        >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
        >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.intersectAll(df2).sort("C1", "C2").show()
        +---+---+
        | C1| C2|
        +---+---+
        |  a|  1|
        |  a|  1|
        |  b|  3|
        +---+---+
        """  # noqa: D205
        return DataFrame(self.relation.intersect(other.relation), self.session)

    def exceptAll(self, other: "DataFrame") -> "DataFrame":
        """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but
        not in another :class:`DataFrame` while preserving duplicates.

        This is equivalent to `EXCEPT ALL` in SQL.
        As standard in SQL, this function resolves columns by position (not by name).

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`DataFrame`
            The other :class:`DataFrame` to compare to.

        Returns:
        -------
        :class:`DataFrame`

        Examples:
        --------
        >>> df1 = spark.createDataFrame(
        ...     [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
        ... )
        >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
        >>> df1.exceptAll(df2).show()
        +---+---+
        | C1| C2|
        +---+---+
        |  a|  1|
        |  a|  1|
        |  a|  2|
        |  c|  4|
        +---+---+

        """  # noqa: D205
        return DataFrame(self.relation.except_(other.relation), self.session)

    def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":
        """Return a new :class:`DataFrame` with duplicate rows removed,
        optionally only considering certain columns.

        For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
        :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
        duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can
        be and the system will accordingly limit the state. In addition, data older than
        watermark will be dropped to avoid any possibility of duplicates.

        :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.

        Parameters
        ----------
        subset : List of column names, optional
            List of columns to use for duplicate comparison (default All columns).

        Returns:
        -------
        :class:`DataFrame`
            DataFrame without duplicates.

        Examples:
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [
        ...         Row(name="Alice", age=5, height=80),
        ...         Row(name="Alice", age=5, height=80),
        ...         Row(name="Alice", age=10, height=80),
        ...     ]
        ... )

        Deduplicate the same rows.

        >>> df.dropDuplicates().show()
        +-----+---+------+
        | name|age|height|
        +-----+---+------+
        |Alice|  5|    80|
        |Alice| 10|    80|
        +-----+---+------+

        Deduplicate values on 'name' and 'height' columns.

        >>> df.dropDuplicates(["name", "height"]).show()
        +-----+---+------+
        | name|age|height|
        +-----+---+------+
        |Alice|  5|    80|
        +-----+---+------+
        """  # noqa: D205
        if subset:
            rn_col = f"tmp_col_{uuid.uuid1().hex}"
            subset_str = ", ".join([f'"{c}"' for c in subset])
            window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
            df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
            return df.filter(f"{rn_col} = 1").drop(rn_col)

        return self.distinct()

    drop_duplicates = dropDuplicates

    def distinct(self) -> "DataFrame":
        """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.

        Returns:
        -------
        :class:`DataFrame`
            DataFrame with distinct records.

        Examples:
        --------
        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

        Return the number of distinct rows in the :class:`DataFrame`

        >>> df.distinct().count()
        2
        """
        distinct_rel = self.relation.distinct()
        return DataFrame(distinct_rel, self.session)

    def count(self) -> int:
        """Returns the number of rows in this :class:`DataFrame`.

        Returns:
        -------
        int
            Number of rows.

        Examples:
        --------
        >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

        Return the number of rows in the :class:`DataFrame`.

        >>> df.count()
        3
        """
        count_rel = self.relation.count("*")
        return int(count_rel.fetchone()[0])

    def _cast_types(self, *types) -> "DataFrame":
        existing_columns = self.relation.columns
        types_count = len(types)
        assert types_count == len(existing_columns)

        cast_expressions = [
            f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
        ]
        cast_expressions = ", ".join(cast_expressions)
        new_rel = self.relation.project(cast_expressions)
        return DataFrame(new_rel, self.session)

    def toDF(self, *cols) -> "DataFrame":  # noqa: D102
        existing_columns = self.relation.columns
        column_count = len(cols)
        if column_count != len(existing_columns):
            raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")

        existing_columns = [ColumnExpression(x) for x in existing_columns]
        projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]
        new_rel = self.relation.project(*projections)
        return DataFrame(new_rel, self.session)

    def collect(self) -> list[Row]:  # noqa: D102
        columns = self.relation.columns
        result = self.relation.fetchall()

        def construct_row(values: list, names: list[str]) -> Row:
            row = tuple.__new__(Row, list(values))
            row.__fields__ = list(names)
            return row

        rows = [construct_row(x, columns) for x in result]
        return rows

    def cache(self) -> "DataFrame":
        """Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Notes:
        -----
        The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.

        Returns:
        -------
        :class:`DataFrame`
            Cached DataFrame.

        Examples:
        --------
        >>> df = spark.range(1)
        >>> df.cache()
        DataFrame[id: bigint]

        >>> df.explain()
        == Physical Plan ==
        InMemoryTableScan ...
        """
        cached_relation = self.relation.execute()
        return DataFrame(cached_relation, self.session)


__all__ = ["DataFrame"]