duckdb-1.4.4.dev18-cp314-cp314-macosx_10_15_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. _duckdb-stubs/__init__.pyi +1479 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cpython-314-darwin.so +0 -0
  5. adbc_driver_duckdb/__init__.py +49 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +381 -0
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +69 -0
  11. duckdb/experimental/__init__.py +5 -0
  12. duckdb/experimental/spark/LICENSE +260 -0
  13. duckdb/experimental/spark/__init__.py +6 -0
  14. duckdb/experimental/spark/_globals.py +77 -0
  15. duckdb/experimental/spark/_typing.py +46 -0
  16. duckdb/experimental/spark/conf.py +46 -0
  17. duckdb/experimental/spark/context.py +180 -0
  18. duckdb/experimental/spark/errors/__init__.py +70 -0
  19. duckdb/experimental/spark/errors/error_classes.py +918 -0
  20. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  21. duckdb/experimental/spark/errors/exceptions/base.py +168 -0
  22. duckdb/experimental/spark/errors/utils.py +111 -0
  23. duckdb/experimental/spark/exception.py +18 -0
  24. duckdb/experimental/spark/sql/__init__.py +7 -0
  25. duckdb/experimental/spark/sql/_typing.py +86 -0
  26. duckdb/experimental/spark/sql/catalog.py +79 -0
  27. duckdb/experimental/spark/sql/column.py +361 -0
  28. duckdb/experimental/spark/sql/conf.py +24 -0
  29. duckdb/experimental/spark/sql/dataframe.py +1423 -0
  30. duckdb/experimental/spark/sql/functions.py +6216 -0
  31. duckdb/experimental/spark/sql/group.py +424 -0
  32. duckdb/experimental/spark/sql/readwriter.py +435 -0
  33. duckdb/experimental/spark/sql/session.py +297 -0
  34. duckdb/experimental/spark/sql/streaming.py +36 -0
  35. duckdb/experimental/spark/sql/type_utils.py +113 -0
  36. duckdb/experimental/spark/sql/types.py +1310 -0
  37. duckdb/experimental/spark/sql/udf.py +37 -0
  38. duckdb/filesystem.py +33 -0
  39. duckdb/func/__init__.py +3 -0
  40. duckdb/functional/__init__.py +13 -0
  41. duckdb/polars_io.py +285 -0
  42. duckdb/py.typed +0 -0
  43. duckdb/query_graph/__main__.py +358 -0
  44. duckdb/sqltypes/__init__.py +63 -0
  45. duckdb/typing/__init__.py +71 -0
  46. duckdb/udf.py +24 -0
  47. duckdb/value/__init__.py +1 -0
  48. duckdb/value/constant/__init__.py +270 -0
  49. duckdb-1.4.4.dev18.dist-info/METADATA +88 -0
  50. duckdb-1.4.4.dev18.dist-info/RECORD +52 -0
  51. duckdb-1.4.4.dev18.dist-info/WHEEL +6 -0
  52. duckdb-1.4.4.dev18.dist-info/licenses/LICENSE +7 -0
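Files 11-37 above ship DuckDB's experimental Spark-compatible API under duckdb/experimental/spark. A minimal, hedged sketch of how that API is typically driven, assuming the SparkSession builder entry point documented for this package; the sample data is illustrative only:

import pandas as pd

from duckdb.experimental.spark.sql import SparkSession
from duckdb.experimental.spark.sql.functions import col, lit

# Build a DuckDB-backed session; no JVM or Spark cluster is involved.
spark = SparkSession.builder.getOrCreate()

# Create a DataFrame from a pandas frame and transform it with the
# Spark-style API implemented on top of duckdb.DuckDBPyRelation.
pandas_df = pd.DataFrame({"age": [34, 45, 23], "name": ["Joan", "Peter", "John"]})
df = spark.createDataFrame(pandas_df)
df = df.withColumn("location", lit("Seattle"))
rows = df.select(col("age"), col("location")).collect()

The diff hunk below corresponds to file 29, duckdb/experimental/spark/sql/dataframe.py (+1423 lines).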
@@ -0,0 +1,1423 @@
1
+ import uuid # noqa: D100
2
+ from functools import reduce
3
+ from keyword import iskeyword
4
+ from typing import (
5
+ TYPE_CHECKING,
6
+ Any,
7
+ Callable,
8
+ Optional,
9
+ Union,
10
+ cast,
11
+ overload,
12
+ )
13
+
14
+ import duckdb
15
+ from duckdb import ColumnExpression, Expression, StarExpression
16
+
17
+ from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
18
+ from .column import Column
19
+ from .readwriter import DataFrameWriter
20
+ from .type_utils import duckdb_to_spark_schema
21
+ from .types import Row, StructType
22
+
23
+ if TYPE_CHECKING:
24
+ import pyarrow as pa
25
+ from pandas.core.frame import DataFrame as PandasDataFrame
26
+
27
+ from ._typing import ColumnOrName
28
+ from .group import GroupedData
29
+ from .session import SparkSession
30
+
31
+ from duckdb.experimental.spark.sql import functions as spark_sql_functions
32
+
33
+
34
+ class DataFrame: # noqa: D101
35
+ def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None: # noqa: D107
36
+ self.relation = relation
37
+ self.session = session
38
+ self._schema = None
39
+ if self.relation is not None:
40
+ self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)
41
+
42
+ def show(self, **kwargs) -> None: # noqa: D102
43
+ self.relation.show()
44
+
45
+ def toPandas(self) -> "PandasDataFrame": # noqa: D102
46
+ return self.relation.df()
47
+
48
+ def toArrow(self) -> "pa.Table":
49
+ """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
50
+
51
+ This is only available if PyArrow is installed and available.
52
+
53
+ .. versionadded:: 4.0.0
54
+
55
+ Notes:
56
+ -----
57
+ This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
58
+ expected to be small, as all the data is loaded into the driver's memory.
59
+
60
+ This API is a developer API.
61
+
62
+ Examples:
63
+ --------
64
+ >>> df.toArrow() # doctest: +SKIP
65
+ pyarrow.Table
66
+ age: int64
67
+ name: string
68
+ ----
69
+ age: [[2,5]]
70
+ name: [["Alice","Bob"]]
71
+ """
72
+ return self.relation.to_arrow_table()
73
+
74
+ def createOrReplaceTempView(self, name: str) -> None:
75
+ """Creates or replaces a local temporary view with this :class:`DataFrame`.
76
+
77
+ The lifetime of this temporary table is tied to the :class:`SparkSession`
78
+ that was used to create this :class:`DataFrame`.
79
+
80
+ Parameters
81
+ ----------
82
+ name : str
83
+ Name of the view.
84
+
85
+ Examples:
86
+ --------
87
+ Create a local temporary view named 'people'.
88
+
89
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
90
+ >>> df.createOrReplaceTempView("people")
91
+
92
+ Replace the local temporary view.
93
+
94
+ >>> df2 = df.filter(df.age > 3)
95
+ >>> df2.createOrReplaceTempView("people")
96
+ >>> df3 = spark.sql("SELECT * FROM people")
97
+ >>> sorted(df3.collect()) == sorted(df2.collect())
98
+ True
99
+ >>> spark.catalog.dropTempView("people")
100
+ True
101
+
102
+ """
103
+ self.relation.create_view(name, True)
104
+
105
+ def createGlobalTempView(self, name: str) -> None: # noqa: D102
106
+ raise NotImplementedError
107
+
108
+ def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame": # noqa: D102
109
+ if columnName not in self.relation:
110
+ msg = f"DataFrame does not contain a column named {columnName}"
111
+ raise ValueError(msg)
112
+ cols = []
113
+ for x in self.relation.columns:
114
+ col = ColumnExpression(x)
115
+ if x.casefold() == columnName.casefold():
116
+ col = col.alias(newName)
117
+ cols.append(col)
118
+ rel = self.relation.select(*cols)
119
+ return DataFrame(rel, self.session)
120
+
121
+ def withColumn(self, columnName: str, col: Column) -> "DataFrame": # noqa: D102
122
+ if not isinstance(col, Column):
123
+ raise PySparkTypeError(
124
+ error_class="NOT_COLUMN",
125
+ message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
126
+ )
127
+ if columnName in self.relation:
128
+ # We want to replace the existing column with this new expression
129
+ cols = []
130
+ for x in self.relation.columns:
131
+ if x.casefold() == columnName.casefold():
132
+ cols.append(col.expr.alias(columnName))
133
+ else:
134
+ cols.append(ColumnExpression(x))
135
+ else:
136
+ cols = [ColumnExpression(x) for x in self.relation.columns]
137
+ cols.append(col.expr.alias(columnName))
138
+ rel = self.relation.select(*cols)
139
+ return DataFrame(rel, self.session)
140
+
141
+ def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
142
+ """Returns a new :class:`DataFrame` by adding multiple columns or replacing the
143
+ existing columns that have the same names.
144
+
145
+ The colsMap is a map of column name to column; the column must only refer to attributes
146
+ supplied by this Dataset. It is an error to add columns that refer to some other Dataset.
147
+
148
+ .. versionadded:: 3.3.0
149
+ Added support for adding multiple columns
150
+
151
+ .. versionchanged:: 3.4.0
152
+ Supports Spark Connect.
153
+
154
+ Parameters
155
+ ----------
156
+ colsMap : dict
157
+ a dict of column name and :class:`Column`. Currently, only a single map is supported.
158
+
159
+ Returns:
160
+ -------
161
+ :class:`DataFrame`
162
+ DataFrame with new or replaced columns.
163
+
164
+ Examples:
165
+ --------
166
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
167
+ >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
168
+ +---+-----+----+----+
169
+ |age| name|age2|age3|
170
+ +---+-----+----+----+
171
+ | 2|Alice| 4| 5|
172
+ | 5| Bob| 7| 8|
173
+ +---+-----+----+----+
174
+ """ # noqa: D205
175
+ # Below code is to help enable kwargs in future.
176
+ assert len(colsMap) == 1
177
+ colsMap = colsMap[0] # type: ignore[assignment]
178
+
179
+ if not isinstance(colsMap, dict):
180
+ raise PySparkTypeError(
181
+ error_class="NOT_DICT",
182
+ message_parameters={
183
+ "arg_name": "colsMap",
184
+ "arg_type": type(colsMap).__name__,
185
+ },
186
+ )
187
+
188
+ column_names = list(colsMap.keys())
189
+ columns = list(colsMap.values())
190
+
191
+ # Compute this only once
192
+ column_names_for_comparison = [x.casefold() for x in column_names]
193
+
194
+ cols = []
195
+ for x in self.relation.columns:
196
+ if x.casefold() in column_names_for_comparison:
197
+ idx = column_names_for_comparison.index(x.casefold())
198
+ # Pop from all three lists so they stay in sync; the casing
199
+ # of the passed-in name may differ from the one in the
200
+ # relation, so compare and index on the casefolded names
201
+ column_names_for_comparison.pop(idx)
202
+ col_name = column_names.pop(idx)
203
+ cols.append(columns.pop(idx).expr.alias(col_name))
204
+ else:
205
+ cols.append(ColumnExpression(x))
206
+
207
+ # In case anything is remaining, these are new columns
208
+ # that we need to add to the DataFrame
209
+ for col_name, col in zip(column_names, columns):
210
+ cols.append(col.expr.alias(col_name))
211
+
212
+ rel = self.relation.select(*cols)
213
+ return DataFrame(rel, self.session)
214
+
215
+ def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
216
+ """Returns a new :class:`DataFrame` by renaming multiple columns.
217
+ This is a no-op if the schema doesn't contain the given column names.
218
+
219
+ .. versionadded:: 3.4.0
220
+ Added support for renaming multiple columns
221
+
222
+ Parameters
223
+ ----------
224
+ colsMap : dict
225
+ a dict of existing column names and corresponding desired column names.
226
+ Currently, only a single map is supported.
227
+
228
+ Returns:
229
+ -------
230
+ :class:`DataFrame`
231
+ DataFrame with renamed columns.
232
+
233
+ See Also:
234
+ --------
235
+ :meth:`withColumnRenamed`
236
+
237
+ Notes:
238
+ -----
239
+ Supports Spark Connect.
240
+
241
+ Examples:
242
+ --------
243
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
244
+ >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
245
+ >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
246
+ +---+-----+----+----+
247
+ |age| name|age4|age5|
248
+ +---+-----+----+----+
249
+ | 2|Alice| 4| 5|
250
+ | 5| Bob| 7| 8|
251
+ +---+-----+----+----+
252
+ """ # noqa: D205
253
+ if not isinstance(colsMap, dict):
254
+ raise PySparkTypeError(
255
+ error_class="NOT_DICT",
256
+ message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__},
257
+ )
258
+
259
+ unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
260
+ if unknown_columns:
261
+ msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
262
+ raise ValueError(msg)
263
+
264
+ # Compute this only once
265
+ old_column_names = list(colsMap.keys())
266
+ old_column_names_for_comparison = [x.casefold() for x in old_column_names]
267
+
268
+ cols = []
269
+ for x in self.relation.columns:
270
+ col = ColumnExpression(x)
271
+ if x.casefold() in old_column_names_for_comparison:
272
+ idx = old_column_names.index(x)
273
+ # We extract the column name from the originally passed
274
+ # in ones, as the casing might be different than the one
275
+ # in the relation
276
+ col_name = old_column_names.pop(idx)
277
+ new_col_name = colsMap[col_name]
278
+ col = col.alias(new_col_name)
279
+ cols.append(col)
280
+
281
+ rel = self.relation.select(*cols)
282
+ return DataFrame(rel, self.session)
283
+
284
+ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": # noqa: ANN401
285
+ """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
286
+
287
+ .. versionadded:: 3.0.0
288
+
289
+ .. versionchanged:: 3.4.0
290
+ Supports Spark Connect.
291
+
292
+ Parameters
293
+ ----------
294
+ func : function
295
+ a function that takes and returns a :class:`DataFrame`.
296
+ *args
297
+ Positional arguments to pass to func.
298
+
299
+ .. versionadded:: 3.3.0
300
+ **kwargs
301
+ Keyword arguments to pass to func.
302
+
303
+ .. versionadded:: 3.3.0
304
+
305
+ Returns:
306
+ -------
307
+ :class:`DataFrame`
308
+ Transformed DataFrame.
309
+
310
+ Examples:
311
+ --------
312
+ >>> from pyspark.sql.functions import col
313
+ >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
314
+ >>> def cast_all_to_int(input_df):
315
+ ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
316
+ >>> def sort_columns_asc(input_df):
317
+ ... return input_df.select(*sorted(input_df.columns))
318
+ >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
319
+ +-----+---+
320
+ |float|int|
321
+ +-----+---+
322
+ | 1| 1|
323
+ | 2| 2|
324
+ +-----+---+
325
+
326
+ >>> def add_n(input_df, n):
327
+ ... return input_df.select(
328
+ ... [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
329
+ ... )
330
+ >>> df.transform(add_n, 1).transform(add_n, n=10).show()
331
+ +---+-----+
332
+ |int|float|
333
+ +---+-----+
334
+ | 12| 12.0|
335
+ | 13| 13.0|
336
+ +---+-----+
337
+ """
338
+ result = func(self, *args, **kwargs)
339
+ assert isinstance(result, DataFrame), (
340
+ f"Func returned an instance of type [{type(result)}], should have been DataFrame."
341
+ )
342
+ return result
343
+
344
+ def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame": # noqa: ANN401
345
+ """Returns a new :class:`DataFrame` sorted by the specified column(s).
346
+
347
+ Parameters
348
+ ----------
349
+ cols : str, list, or :class:`Column`, optional
350
+ list of :class:`Column` or column names to sort by.
351
+
352
+ Other Parameters
353
+ ----------------
354
+ ascending : bool or list, optional, default True
355
+ boolean or list of boolean.
356
+ Sort ascending vs. descending. Specify list for multiple sort orders.
357
+ If a list is specified, the length of the list must equal the length of the `cols`.
358
+
359
+ Returns:
360
+ -------
361
+ :class:`DataFrame`
362
+ Sorted DataFrame.
363
+
364
+ Examples:
365
+ --------
366
+ >>> from pyspark.sql.functions import desc, asc
367
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
368
+
369
+ Sort the DataFrame in ascending order.
370
+
371
+ >>> df.sort(asc("age")).show()
372
+ +---+-----+
373
+ |age| name|
374
+ +---+-----+
375
+ | 2|Alice|
376
+ | 5| Bob|
377
+ +---+-----+
378
+
379
+ Sort the DataFrame in descending order.
380
+
381
+ >>> df.sort(df.age.desc()).show()
382
+ +---+-----+
383
+ |age| name|
384
+ +---+-----+
385
+ | 5| Bob|
386
+ | 2|Alice|
387
+ +---+-----+
388
+ >>> df.orderBy(df.age.desc()).show()
389
+ +---+-----+
390
+ |age| name|
391
+ +---+-----+
392
+ | 5| Bob|
393
+ | 2|Alice|
394
+ +---+-----+
395
+ >>> df.sort("age", ascending=False).show()
396
+ +---+-----+
397
+ |age| name|
398
+ +---+-----+
399
+ | 5| Bob|
400
+ | 2|Alice|
401
+ +---+-----+
402
+
403
+ Specify multiple columns
404
+
405
+ >>> df = spark.createDataFrame(
406
+ ... [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
407
+ ... )
408
+ >>> df.orderBy(desc("age"), "name").show()
409
+ +---+-----+
410
+ |age| name|
411
+ +---+-----+
412
+ | 5| Bob|
413
+ | 2|Alice|
414
+ | 2| Bob|
415
+ +---+-----+
416
+
417
+ Specify the sort order for multiple columns via `ascending`.
418
+
419
+ >>> df.orderBy(["age", "name"], ascending=[False, False]).show()
420
+ +---+-----+
421
+ |age| name|
422
+ +---+-----+
423
+ | 5| Bob|
424
+ | 2| Bob|
425
+ | 2|Alice|
426
+ +---+-----+
427
+ """
428
+ if not cols:
429
+ raise PySparkValueError(
430
+ error_class="CANNOT_BE_EMPTY",
431
+ message_parameters={"item": "column"},
432
+ )
433
+ if len(cols) == 1 and isinstance(cols[0], list):
434
+ cols = cols[0]
435
+
436
+ columns = []
437
+ for c in cols:
438
+ _c = c
439
+ if isinstance(c, str):
440
+ _c = spark_sql_functions.col(c)
441
+ elif isinstance(c, int) and not isinstance(c, bool):
442
+ # ordinal is 1-based
443
+ if c > 0:
444
+ _c = self[c - 1]
445
+ # negative ordinal means sort by desc
446
+ elif c < 0:
447
+ _c = self[-c - 1].desc()
448
+ else:
449
+ raise PySparkIndexError(
450
+ error_class="ZERO_INDEX",
451
+ message_parameters={},
452
+ )
453
+ columns.append(_c)
454
+
455
+ ascending = kwargs.get("ascending", True)
456
+
457
+ if isinstance(ascending, (bool, int)):
458
+ if not ascending:
459
+ columns = [c.desc() for c in columns]
460
+ elif isinstance(ascending, list):
461
+ columns = [c if asc else c.desc() for asc, c in zip(ascending, columns)]
462
+ else:
463
+ raise PySparkTypeError(
464
+ error_class="NOT_BOOL_OR_LIST",
465
+ message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
466
+ )
467
+
468
+ columns = [spark_sql_functions._to_column_expr(c) for c in columns]
469
+ rel = self.relation.sort(*columns)
470
+ return DataFrame(rel, self.session)
471
+
472
+ orderBy = sort
473
+
474
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]: # noqa: D102
475
+ if n is None:
476
+ rs = self.head(1)
477
+ return rs[0] if rs else None
478
+ return self.take(n)
479
+
480
+ first = head
481
+
482
+ def take(self, num: int) -> list[Row]: # noqa: D102
483
+ return self.limit(num).collect()
484
+
485
+ def filter(self, condition: "ColumnOrName") -> "DataFrame":
486
+ """Filters rows using the given condition.
487
+
488
+ :func:`where` is an alias for :func:`filter`.
489
+
490
+ Parameters
491
+ ----------
492
+ condition : :class:`Column` or str
493
+ a :class:`Column` of :class:`types.BooleanType`
494
+ or a string of SQL expressions.
495
+
496
+ Returns:
497
+ -------
498
+ :class:`DataFrame`
499
+ Filtered DataFrame.
500
+
501
+ Examples:
502
+ --------
503
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
504
+
505
+ Filter by :class:`Column` instances.
506
+
507
+ >>> df.filter(df.age > 3).show()
508
+ +---+----+
509
+ |age|name|
510
+ +---+----+
511
+ | 5| Bob|
512
+ +---+----+
513
+ >>> df.where(df.age == 2).show()
514
+ +---+-----+
515
+ |age| name|
516
+ +---+-----+
517
+ | 2|Alice|
518
+ +---+-----+
519
+
520
+ Filter by SQL expression in a string.
521
+
522
+ >>> df.filter("age > 3").show()
523
+ +---+----+
524
+ |age|name|
525
+ +---+----+
526
+ | 5| Bob|
527
+ +---+----+
528
+ >>> df.where("age = 2").show()
529
+ +---+-----+
530
+ |age| name|
531
+ +---+-----+
532
+ | 2|Alice|
533
+ +---+-----+
534
+ """
535
+ if isinstance(condition, Column):
536
+ cond = condition.expr
537
+ elif isinstance(condition, str):
538
+ cond = condition
539
+ else:
540
+ raise PySparkTypeError(
541
+ error_class="NOT_COLUMN_OR_STR",
542
+ message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__},
543
+ )
544
+ rel = self.relation.filter(cond)
545
+ return DataFrame(rel, self.session)
546
+
547
+ where = filter
548
+
549
+ def select(self, *cols) -> "DataFrame": # noqa: D102
550
+ cols = list(cols)
551
+ if len(cols) == 1:
552
+ cols = cols[0]
553
+ if isinstance(cols, list):
554
+ projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
555
+ else:
556
+ projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]
557
+ rel = self.relation.select(*projections)
558
+ return DataFrame(rel, self.session)
559
+
560
+ @property
561
+ def columns(self) -> list[str]:
562
+ """Returns all column names as a list.
563
+
564
+ Examples:
565
+ --------
566
+ >>> df.columns
567
+ ['age', 'name']
568
+ """
569
+ return [f.name for f in self.schema.fields]
570
+
571
+ @property
572
+ def dtypes(self) -> list[tuple[str, str]]:
573
+ """Returns all column names and their data types as a list of tuples.
574
+
575
+ Returns:
576
+ -------
577
+ list of tuple
578
+ List of tuples, each tuple containing a column name and its data type as strings.
579
+
580
+ Examples:
581
+ --------
582
+ >>> df.dtypes
583
+ [('age', 'bigint'), ('name', 'string')]
584
+ """
585
+ return [(f.name, f.dataType.simpleString()) for f in self.schema.fields]
586
+
587
+ def _ipython_key_completions_(self) -> list[str]:
588
+ # Provides tab-completion for column names in PySpark DataFrame
589
+ # when accessed in bracket notation, e.g. df['<TAB>]
590
+ return self.columns
591
+
592
+ def __dir__(self) -> list[str]: # noqa: D105
593
+ out = set(super().__dir__())
594
+ out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
595
+ return sorted(out)
596
+
597
+ def join(
598
+ self,
599
+ other: "DataFrame",
600
+ on: Optional[Union[str, list[str], Column, list[Column]]] = None,
601
+ how: Optional[str] = None,
602
+ ) -> "DataFrame":
603
+ """Joins with another :class:`DataFrame`, using the given join expression.
604
+
605
+ Parameters
606
+ ----------
607
+ other : :class:`DataFrame`
608
+ Right side of the join
609
+ on : str, list or :class:`Column`, optional
610
+ a string for the join column name, a list of column names,
611
+ a join expression (Column), or a list of Columns.
612
+ If `on` is a string or a list of strings indicating the name of the join column(s),
613
+ the column(s) must exist on both sides, and this performs an equi-join.
614
+ how : str, optional
615
+ default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
616
+ ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,
617
+ ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
618
+ ``anti``, ``leftanti`` and ``left_anti``.
619
+
620
+ Returns:
621
+ -------
622
+ :class:`DataFrame`
623
+ Joined DataFrame.
624
+
625
+ Examples:
626
+ --------
627
+ The following performs a full outer join between ``df1`` and ``df2``.
628
+
629
+ >>> from pyspark.sql import Row
630
+ >>> from pyspark.sql.functions import desc
631
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
632
+ >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
633
+ >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
634
+ >>> df4 = spark.createDataFrame(
635
+ ... [
636
+ ... Row(age=10, height=80, name="Alice"),
637
+ ... Row(age=5, height=None, name="Bob"),
638
+ ... Row(age=None, height=None, name="Tom"),
639
+ ... Row(age=None, height=None, name=None),
640
+ ... ]
641
+ ... )
642
+
643
+ Inner join on columns (default)
644
+
645
+ >>> df.join(df2, "name").select(df.name, df2.height).show()
646
+ +----+------+
647
+ |name|height|
648
+ +----+------+
649
+ | Bob| 85|
650
+ +----+------+
651
+ >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()
652
+ +----+---+
653
+ |name|age|
654
+ +----+---+
655
+ | Bob| 5|
656
+ +----+---+
657
+
658
+ Outer join for both DataFrames on the 'name' column.
659
+
660
+ >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
661
+ ... desc("name")
662
+ ... ).show()
663
+ +-----+------+
664
+ | name|height|
665
+ +-----+------+
666
+ | Bob| 85|
667
+ |Alice| NULL|
668
+ | NULL| 80|
669
+ +-----+------+
670
+ >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()
671
+ +-----+------+
672
+ | name|height|
673
+ +-----+------+
674
+ | Tom| 80|
675
+ | Bob| 85|
676
+ |Alice| NULL|
677
+ +-----+------+
678
+
679
+ Outer join for both DataFrames with multiple columns.
680
+
681
+ >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
682
+ ... df.name, df3.age
683
+ ... ).show()
684
+ +-----+---+
685
+ | name|age|
686
+ +-----+---+
687
+ |Alice| 2|
688
+ | Bob| 5|
689
+ +-----+---+
690
+ """
691
+ if on is not None and not isinstance(on, list):
692
+ on = [on] # type: ignore[assignment]
693
+ if on is not None and not all(isinstance(x, str) for x in on):
694
+ assert isinstance(on, list)
695
+ # Get (or create) the Expressions from the list of Columns
696
+ on = [spark_sql_functions._to_column_expr(x) for x in on]
697
+
698
+ # & all the Expressions together to form one Expression
699
+ assert isinstance(on[0], Expression), "on should be Column or list of Column"
700
+ on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))
701
+
702
+ if on is None and how is None:
703
+ result = self.relation.join(other.relation)
704
+ else:
705
+ if how is None:
706
+ how = "inner"
707
+ if on is None:
708
+ on = "true"
709
+ elif isinstance(on, list) and all(isinstance(x, str) for x in on):
710
+ # Passed directly through as a list of strings
711
+ pass
712
+ else:
713
+ on = str(on)
714
+ assert isinstance(how, str), "how should be a string"
715
+
716
+ def map_to_recognized_jointype(how: str) -> str:
717
+ known_aliases = {
718
+ "inner": [],
719
+ "outer": ["full", "fullouter", "full_outer"],
720
+ "left": ["leftouter", "left_outer"],
721
+ "right": ["rightouter", "right_outer"],
722
+ "anti": ["leftanti", "left_anti"],
723
+ "semi": ["leftsemi", "left_semi"],
724
+ }
725
+ for type, aliases in known_aliases.items():
726
+ if how == type or how in aliases:
727
+ return type
728
+ return how
729
+
730
+ how = map_to_recognized_jointype(how)
731
+ result = self.relation.join(other.relation, on, how)
732
+ return DataFrame(result, self.session)
733
+
734
+ def crossJoin(self, other: "DataFrame") -> "DataFrame":
735
+ """Returns the cartesian product with another :class:`DataFrame`.
736
+
737
+ .. versionadded:: 2.1.0
738
+
739
+ .. versionchanged:: 3.4.0
740
+ Supports Spark Connect.
741
+
742
+ Parameters
743
+ ----------
744
+ other : :class:`DataFrame`
745
+ Right side of the cartesian product.
746
+
747
+ Returns:
748
+ -------
749
+ :class:`DataFrame`
750
+ Joined DataFrame.
751
+
752
+ Examples:
753
+ --------
754
+ >>> from pyspark.sql import Row
755
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
756
+ >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
757
+ >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
758
+ +---+-----+------+
759
+ |age| name|height|
760
+ +---+-----+------+
761
+ | 14| Tom| 80|
762
+ | 14| Tom| 85|
763
+ | 23|Alice| 80|
764
+ | 23|Alice| 85|
765
+ | 16| Bob| 80|
766
+ | 16| Bob| 85|
767
+ +---+-----+------+
768
+ """
769
+ return DataFrame(self.relation.cross(other.relation), self.session)
770
+
771
+ def alias(self, alias: str) -> "DataFrame":
772
+ """Returns a new :class:`DataFrame` with an alias set.
773
+
774
+ Parameters
775
+ ----------
776
+ alias : str
777
+ an alias name to be set for the :class:`DataFrame`.
778
+
779
+ Returns:
780
+ -------
781
+ :class:`DataFrame`
782
+ Aliased DataFrame.
783
+
784
+ Examples:
785
+ --------
786
+ >>> from pyspark.sql.functions import col, desc
787
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
788
+ >>> df_as1 = df.alias("df_as1")
789
+ >>> df_as2 = df.alias("df_as2")
790
+ >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
791
+ >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
792
+ ... desc("df_as1.name")
793
+ ... ).show()
794
+ +-----+-----+---+
795
+ | name| name|age|
796
+ +-----+-----+---+
797
+ | Tom| Tom| 14|
798
+ | Bob| Bob| 16|
799
+ |Alice|Alice| 23|
800
+ +-----+-----+---+
801
+ """
802
+ assert isinstance(alias, str), "alias should be a string"
803
+ return DataFrame(self.relation.set_alias(alias), self.session)
804
+
805
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] # noqa: D102
806
+ exclude = []
807
+ for col in cols:
808
+ if isinstance(col, str):
809
+ exclude.append(col)
810
+ elif isinstance(col, Column):
811
+ exclude.append(col.expr.get_name())
812
+ else:
813
+ raise PySparkTypeError(
814
+ error_class="NOT_COLUMN_OR_STR",
815
+ message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
816
+ )
817
+ # Filter out the columns that don't exist in the relation
818
+ exclude = [x for x in exclude if x in self.relation.columns]
819
+ expr = StarExpression(exclude=exclude)
820
+ return DataFrame(self.relation.select(expr), self.session)
821
+
822
+ def __repr__(self) -> str: # noqa: D105
823
+ return str(self.relation)
824
+
825
+ def limit(self, num: int) -> "DataFrame":
826
+ """Limits the result count to the number specified.
827
+
828
+ Parameters
829
+ ----------
830
+ num : int
831
+ Number of records to return. Will return this number of records
832
+ or all records if the DataFrame contains fewer than this number of records.
833
+
834
+ Returns:
835
+ -------
836
+ :class:`DataFrame`
837
+ Subset of the records
838
+
839
+ Examples:
840
+ --------
841
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
842
+ >>> df.limit(1).show()
843
+ +---+----+
844
+ |age|name|
845
+ +---+----+
846
+ | 14| Tom|
847
+ +---+----+
848
+ >>> df.limit(0).show()
849
+ +---+----+
850
+ |age|name|
851
+ +---+----+
852
+ +---+----+
853
+ """
854
+ rel = self.relation.limit(num)
855
+ return DataFrame(rel, self.session)
856
+
857
+ def __contains__(self, item: str) -> bool:
858
+ """Check if the :class:`DataFrame` contains a column by the name of `item`."""
859
+ return item in self.relation
860
+
861
+ @property
862
+ def schema(self) -> StructType:
863
+ """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.
864
+
865
+ Examples:
866
+ --------
867
+ >>> df.schema
868
+ StructType([StructField('age', IntegerType(), True),
869
+ StructField('name', StringType(), True)])
870
+ """
871
+ return self._schema
872
+
873
+ @overload
874
+ def __getitem__(self, item: Union[int, str]) -> Column: ...
875
+
876
+ @overload
877
+ def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...
878
+
879
+ def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
880
+ """Returns the column as a :class:`Column`.
881
+
882
+ Examples:
883
+ --------
884
+ >>> df.select(df["age"]).collect()
885
+ [Row(age=2), Row(age=5)]
886
+ >>> df[["name", "age"]].collect()
887
+ [Row(name='Alice', age=2), Row(name='Bob', age=5)]
888
+ >>> df[df.age > 3].collect()
889
+ [Row(age=5, name='Bob')]
890
+ >>> df[df[0] > 3].collect()
891
+ [Row(age=5, name='Bob')]
892
+ """
893
+ if isinstance(item, str):
894
+ return Column(duckdb.ColumnExpression(self.relation.alias, item))
895
+ elif isinstance(item, Column):
896
+ return self.filter(item)
897
+ elif isinstance(item, (list, tuple)):
898
+ return self.select(*item)
899
+ elif isinstance(item, int):
900
+ return spark_sql_functions.col(self._schema[item].name)
901
+ else:
902
+ msg = f"Unexpected item type: {type(item)}"
903
+ raise TypeError(msg)
904
+
905
+ def __getattr__(self, name: str) -> Column:
906
+ """Returns the :class:`Column` denoted by ``name``.
907
+
908
+ Examples:
909
+ --------
910
+ >>> df.select(df.age).collect()
911
+ [Row(age=2), Row(age=5)]
912
+ """
913
+ if name not in self.relation.columns:
914
+ msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
915
+ raise AttributeError(msg)
916
+ return Column(duckdb.ColumnExpression(self.relation.alias, name))
917
+
918
+ @overload
919
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...
920
+
921
+ @overload
922
+ def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ... # noqa: PYI063
923
+
924
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
925
+ """Groups the :class:`DataFrame` using the specified columns,
926
+ so we can run aggregation on them. See :class:`GroupedData`
927
+ for all the available aggregate functions.
928
+
929
+ :func:`groupby` is an alias for :func:`groupBy`.
930
+
931
+ Parameters
932
+ ----------
933
+ cols : list, str or :class:`Column`
934
+ columns to group by.
935
+ Each element should be a column name (string) or an expression (:class:`Column`)
936
+ or list of them.
937
+
938
+ Returns:
939
+ -------
940
+ :class:`GroupedData`
941
+ Grouped data by given columns.
942
+
943
+ Examples:
944
+ --------
945
+ >>> df = spark.createDataFrame(
946
+ ... [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
947
+ ... )
948
+
949
+ Empty grouping columns triggers a global aggregation.
950
+
951
+ >>> df.groupBy().avg().show()
952
+ +--------+
953
+ |avg(age)|
954
+ +--------+
955
+ | 2.75|
956
+ +--------+
957
+
958
+ Group-by 'name', and specify a dictionary to calculate the summation of 'age'.
959
+
960
+ >>> df.groupBy("name").agg({"age": "sum"}).sort("name").show()
961
+ +-----+--------+
962
+ | name|sum(age)|
963
+ +-----+--------+
964
+ |Alice| 2|
965
+ | Bob| 9|
966
+ +-----+--------+
967
+
968
+ Group-by 'name', and calculate maximum values.
969
+
970
+ >>> df.groupBy(df.name).max().sort("name").show()
971
+ +-----+--------+
972
+ | name|max(age)|
973
+ +-----+--------+
974
+ |Alice| 2|
975
+ | Bob| 5|
976
+ +-----+--------+
977
+
978
+ Group-by 'name' and 'age', and calculate the number of rows in each group.
979
+
980
+ >>> df.groupBy(["name", df.age]).count().sort("name", "age").show()
981
+ +-----+---+-----+
982
+ | name|age|count|
983
+ +-----+---+-----+
984
+ |Alice| 2| 1|
985
+ | Bob| 2| 2|
986
+ | Bob| 5| 1|
987
+ +-----+---+-----+
988
+ """ # noqa: D205
989
+ from .group import GroupedData, Grouping
990
+
991
+ columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols
992
+ return GroupedData(Grouping(*columns), self)
993
+
994
+ groupby = groupBy
995
+
996
+ @property
997
+ def write(self) -> DataFrameWriter: # noqa: D102
998
+ return DataFrameWriter(self)
999
+
1000
+ def printSchema(self, level: Optional[int] = None) -> None:
1001
+ """Prints out the schema in the tree format.
1002
+
1003
+ Parameters
1004
+ ----------
1005
+ level : int, optional
1006
+ How many levels to print for nested schemas. Prints all levels by default.
1007
+
1008
+ Examples:
1009
+ --------
1010
+ >>> df.printSchema()
1011
+ root
1012
+ |-- age: bigint (nullable = true)
1013
+ |-- name: string (nullable = true)
1014
+ """
1015
+ if level is not None and level < 0:
1016
+ raise PySparkValueError(
1017
+ error_class="NEGATIVE_VALUE",
1018
+ message_parameters={"arg_name": "level", "arg_value": str(level)},
1019
+ )
1020
+ print(self.schema.treeString(level))
1021
+
1022
+ def union(self, other: "DataFrame") -> "DataFrame":
1023
+ """Return a new :class:`DataFrame` containing union of rows in this and another
1024
+ :class:`DataFrame`.
1025
+
1026
+ Parameters
1027
+ ----------
1028
+ other : :class:`DataFrame`
1029
+ Another :class:`DataFrame` that needs to be unioned
1030
+
1031
+ Returns:
1032
+ -------
1033
+ :class:`DataFrame`
1034
+
1035
+ See Also:
1036
+ --------
1037
+ DataFrame.unionAll
1038
+
1039
+ Notes:
1040
+ -----
1041
+ This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
1042
+ (that does deduplication of elements), use this function followed by :func:`distinct`.
1043
+
1044
+ Also as standard in SQL, this function resolves columns by position (not by name).
1045
+
1046
+ Examples:
1047
+ --------
1048
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1049
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
1050
+ >>> df1.union(df2).show()
1051
+ +----+----+----+
1052
+ |col0|col1|col2|
1053
+ +----+----+----+
1054
+ | 1| 2| 3|
1055
+ | 4| 5| 6|
1056
+ +----+----+----+
1057
+ >>> df1.union(df1).show()
1058
+ +----+----+----+
1059
+ |col0|col1|col2|
1060
+ +----+----+----+
1061
+ | 1| 2| 3|
1062
+ | 1| 2| 3|
1063
+ +----+----+----+
1064
+ """ # noqa: D205
1065
+ return DataFrame(self.relation.union(other.relation), self.session)
1066
+
1067
+ unionAll = union
1068
+
1069
+ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
1070
+ """Returns a new :class:`DataFrame` containing union of rows in this and another
1071
+ :class:`DataFrame`.
1072
+
1073
+ This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
1074
+ union (that does deduplication of elements), use this function followed by :func:`distinct`.
1075
+
1076
+ .. versionadded:: 2.3.0
1077
+
1078
+ .. versionchanged:: 3.4.0
1079
+ Supports Spark Connect.
1080
+
1081
+ Parameters
1082
+ ----------
1083
+ other : :class:`DataFrame`
1084
+ Another :class:`DataFrame` that needs to be combined.
1085
+ allowMissingColumns : bool, optional, default False
1086
+ Specify whether to allow missing columns.
1087
+
1088
+ .. versionadded:: 3.1.0
1089
+
1090
+ Returns:
1091
+ -------
1092
+ :class:`DataFrame`
1093
+ Combined DataFrame.
1094
+
1095
+ Examples:
1096
+ --------
1097
+ The difference between this function and :func:`union` is that this function
1098
+ resolves columns by name (not by position):
1099
+
1100
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1101
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
1102
+ >>> df1.unionByName(df2).show()
1103
+ +----+----+----+
1104
+ |col0|col1|col2|
1105
+ +----+----+----+
1106
+ | 1| 2| 3|
1107
+ | 6| 4| 5|
1108
+ +----+----+----+
1109
+
1110
+ When the parameter `allowMissingColumns` is ``True``, the set of column names
1111
+ in this and other :class:`DataFrame` can differ; missing columns will be filled with null.
1112
+ Further, the missing columns of this :class:`DataFrame` will be added at the end
1113
+ in the schema of the union result:
1114
+
1115
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1116
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
1117
+ >>> df1.unionByName(df2, allowMissingColumns=True).show()
1118
+ +----+----+----+----+
1119
+ |col0|col1|col2|col3|
1120
+ +----+----+----+----+
1121
+ | 1| 2| 3|NULL|
1122
+ |NULL| 4| 5| 6|
1123
+ +----+----+----+----+
1124
+ """ # noqa: D205
1125
+ if allowMissingColumns:
1126
+ cols = []
1127
+ for col in self.relation.columns:
1128
+ if col in other.relation.columns:
1129
+ cols.append(col)
1130
+ else:
1131
+ cols.append(spark_sql_functions.lit(None))
1132
+ other = other.select(*cols)
1133
+ else:
1134
+ other = other.select(*self.relation.columns)
1135
+
1136
+ return DataFrame(self.relation.union(other.relation), self.session)
1137
+
1138
+ def intersect(self, other: "DataFrame") -> "DataFrame":
1139
+ """Return a new :class:`DataFrame` containing rows only in
1140
+ both this :class:`DataFrame` and another :class:`DataFrame`.
1141
+ Note that any duplicates are removed. To preserve duplicates
1142
+ use :func:`intersectAll`.
1143
+
1144
+ .. versionadded:: 1.3.0
1145
+
1146
+ .. versionchanged:: 3.4.0
1147
+ Supports Spark Connect.
1148
+
1149
+ Parameters
1150
+ ----------
1151
+ other : :class:`DataFrame`
1152
+ Another :class:`DataFrame` that needs to be combined.
1153
+
1154
+ Returns:
1155
+ -------
1156
+ :class:`DataFrame`
1157
+ Combined DataFrame.
1158
+
1159
+ Notes:
1160
+ -----
1161
+ This is equivalent to `INTERSECT` in SQL.
1162
+
1163
+ Examples:
1164
+ --------
1165
+ >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1166
+ >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
1167
+ >>> df1.intersect(df2).sort(df1.C1.desc()).show()
1168
+ +---+---+
1169
+ | C1| C2|
1170
+ +---+---+
1171
+ | b| 3|
1172
+ | a| 1|
1173
+ +---+---+
1174
+ """ # noqa: D205
1175
+ return self.intersectAll(other).drop_duplicates()
1176
+
1177
+ def intersectAll(self, other: "DataFrame") -> "DataFrame":
1178
+ """Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
1179
+ and another :class:`DataFrame` while preserving duplicates.
1180
+
1181
+ This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function
1182
+ resolves columns by position (not by name).
1183
+
1184
+ .. versionadded:: 2.4.0
1185
+
1186
+ .. versionchanged:: 3.4.0
1187
+ Supports Spark Connect.
1188
+
1189
+ Parameters
1190
+ ----------
1191
+ other : :class:`DataFrame`
1192
+ Another :class:`DataFrame` that needs to be combined.
1193
+
1194
+ Returns:
1195
+ -------
1196
+ :class:`DataFrame`
1197
+ Combined DataFrame.
1198
+
1199
+ Examples:
1200
+ --------
1201
+ >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1202
+ >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
1203
+ >>> df1.intersectAll(df2).sort("C1", "C2").show()
1204
+ +---+---+
1205
+ | C1| C2|
1206
+ +---+---+
1207
+ | a| 1|
1208
+ | a| 1|
1209
+ | b| 3|
1210
+ +---+---+
1211
+ """ # noqa: D205
1212
+ return DataFrame(self.relation.intersect(other.relation), self.session)
1213
+
1214
+ def exceptAll(self, other: "DataFrame") -> "DataFrame":
1215
+ """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but
1216
+ not in another :class:`DataFrame` while preserving duplicates.
1217
+
1218
+ This is equivalent to `EXCEPT ALL` in SQL.
1219
+ As standard in SQL, this function resolves columns by position (not by name).
1220
+
1221
+ .. versionadded:: 2.4.0
1222
+
1223
+ .. versionchanged:: 3.4.0
1224
+ Supports Spark Connect.
1225
+
1226
+ Parameters
1227
+ ----------
1228
+ other : :class:`DataFrame`
1229
+ The other :class:`DataFrame` to compare to.
1230
+
1231
+ Returns:
1232
+ -------
1233
+ :class:`DataFrame`
1234
+
1235
+ Examples:
1236
+ --------
1237
+ >>> df1 = spark.createDataFrame(
1238
+ ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
1239
+ ... )
1240
+ >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
1241
+ >>> df1.exceptAll(df2).show()
1242
+ +---+---+
1243
+ | C1| C2|
1244
+ +---+---+
1245
+ | a| 1|
1246
+ | a| 1|
1247
+ | a| 2|
1248
+ | c| 4|
1249
+ +---+---+
1250
+
1251
+ """ # noqa: D205
1252
+ return DataFrame(self.relation.except_(other.relation), self.session)
1253
+
1254
+ def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":
1255
+ """Return a new :class:`DataFrame` with duplicate rows removed,
1256
+ optionally only considering certain columns.
1257
+
1258
+ For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
1259
+ :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
1260
+ duplicate rows. You can use :func:`withWatermark` to limit how late the duplicate data can
1261
+ be and the system will accordingly limit the state. In addition, data older than
1262
+ watermark will be dropped to avoid any possibility of duplicates.
1263
+
1264
+ :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
1265
+
1266
+ Parameters
1267
+ ----------
1268
+ subset : List of column names, optional
1269
+ List of columns to use for duplicate comparison (default All columns).
1270
+
1271
+ Returns:
1272
+ -------
1273
+ :class:`DataFrame`
1274
+ DataFrame without duplicates.
1275
+
1276
+ Examples:
1277
+ --------
1278
+ >>> from pyspark.sql import Row
1279
+ >>> df = spark.createDataFrame(
1280
+ ... [
1281
+ ... Row(name="Alice", age=5, height=80),
1282
+ ... Row(name="Alice", age=5, height=80),
1283
+ ... Row(name="Alice", age=10, height=80),
1284
+ ... ]
1285
+ ... )
1286
+
1287
+ Deduplicate the same rows.
1288
+
1289
+ >>> df.dropDuplicates().show()
1290
+ +-----+---+------+
1291
+ | name|age|height|
1292
+ +-----+---+------+
1293
+ |Alice| 5| 80|
1294
+ |Alice| 10| 80|
1295
+ +-----+---+------+
1296
+
1297
+ Deduplicate values on 'name' and 'height' columns.
1298
+
1299
+ >>> df.dropDuplicates(["name", "height"]).show()
1300
+ +-----+---+------+
1301
+ | name|age|height|
1302
+ +-----+---+------+
1303
+ |Alice| 5| 80|
1304
+ +-----+---+------+
1305
+ """ # noqa: D205
1306
+ if subset:
1307
+ rn_col = f"tmp_col_{uuid.uuid1().hex}"
1308
+ subset_str = ", ".join([f'"{c}"' for c in subset])
1309
+ window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
1310
+ df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
1311
+ return df.filter(f"{rn_col} = 1").drop(rn_col)
1312
+
1313
+ return self.distinct()
1314
+
1315
+ drop_duplicates = dropDuplicates
1316
+
1317
+ def distinct(self) -> "DataFrame":
1318
+ """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
1319
+
1320
+ Returns:
1321
+ -------
1322
+ :class:`DataFrame`
1323
+ DataFrame with distinct records.
1324
+
1325
+ Examples:
1326
+ --------
1327
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
1328
+
1329
+ Return the number of distinct rows in the :class:`DataFrame`
1330
+
1331
+ >>> df.distinct().count()
1332
+ 2
1333
+ """
1334
+ distinct_rel = self.relation.distinct()
1335
+ return DataFrame(distinct_rel, self.session)
1336
+
1337
+ def count(self) -> int:
1338
+ """Returns the number of rows in this :class:`DataFrame`.
1339
+
1340
+ Returns:
1341
+ -------
1342
+ int
1343
+ Number of rows.
1344
+
1345
+ Examples:
1346
+ --------
1347
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
1348
+
1349
+ Return the number of rows in the :class:`DataFrame`.
1350
+
1351
+ >>> df.count()
1352
+ 3
1353
+ """
1354
+ count_rel = self.relation.count("*")
1355
+ return int(count_rel.fetchone()[0])
1356
+
1357
+ def _cast_types(self, *types) -> "DataFrame":
1358
+ existing_columns = self.relation.columns
1359
+ types_count = len(types)
1360
+ assert types_count == len(existing_columns)
1361
+
1362
+ cast_expressions = [
1363
+ f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
1364
+ ]
1365
+ cast_expressions = ", ".join(cast_expressions)
1366
+ new_rel = self.relation.project(cast_expressions)
1367
+ return DataFrame(new_rel, self.session)
1368
+
1369
+ def toDF(self, *cols) -> "DataFrame": # noqa: D102
1370
+ existing_columns = self.relation.columns
1371
+ column_count = len(cols)
1372
+ if column_count != len(existing_columns):
1373
+ raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")
1374
+
1375
+ existing_columns = [ColumnExpression(x) for x in existing_columns]
1376
+ projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]
1377
+ new_rel = self.relation.project(*projections)
1378
+ return DataFrame(new_rel, self.session)
1379
+
1380
+ def collect(self) -> list[Row]: # noqa: D102
1381
+ columns = self.relation.columns
1382
+ result = self.relation.fetchall()
1383
+
1384
+ def construct_row(values: list, names: list[str]) -> Row:
1385
+ row = tuple.__new__(Row, list(values))
1386
+ row.__fields__ = list(names)
1387
+ return row
1388
+
1389
+ rows = [construct_row(x, columns) for x in result]
1390
+ return rows
1391
+
1392
+ def cache(self) -> "DataFrame":
1393
+ """Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).
1394
+
1395
+ .. versionadded:: 1.3.0
1396
+
1397
+ .. versionchanged:: 3.4.0
1398
+ Supports Spark Connect.
1399
+
1400
+ Notes:
1401
+ -----
1402
+ The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.
1403
+
1404
+ Returns:
1405
+ -------
1406
+ :class:`DataFrame`
1407
+ Cached DataFrame.
1408
+
1409
+ Examples:
1410
+ --------
1411
+ >>> df = spark.range(1)
1412
+ >>> df.cache()
1413
+ DataFrame[id: bigint]
1414
+
1415
+ >>> df.explain()
1416
+ == Physical Plan ==
1417
+ InMemoryTableScan ...
1418
+ """
1419
+ cached_relation = self.relation.execute()
1420
+ return DataFrame(cached_relation, self.session)
1421
+
1422
+
1423
+ __all__ = ["DataFrame"]