duckdb 1.5.0.dev37__cp314-cp314t-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of duckdb has been marked as a potentially problematic release.

Files changed (47)
  1. _duckdb.cpython-314t-darwin.so +0 -0
  2. duckdb/__init__.py +475 -0
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +66 -0
  5. duckdb/experimental/__init__.py +2 -0
  6. duckdb/experimental/spark/LICENSE +260 -0
  7. duckdb/experimental/spark/__init__.py +7 -0
  8. duckdb/experimental/spark/_globals.py +77 -0
  9. duckdb/experimental/spark/_typing.py +48 -0
  10. duckdb/experimental/spark/conf.py +45 -0
  11. duckdb/experimental/spark/context.py +164 -0
  12. duckdb/experimental/spark/errors/__init__.py +72 -0
  13. duckdb/experimental/spark/errors/error_classes.py +918 -0
  14. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  15. duckdb/experimental/spark/errors/exceptions/base.py +217 -0
  16. duckdb/experimental/spark/errors/utils.py +116 -0
  17. duckdb/experimental/spark/exception.py +15 -0
  18. duckdb/experimental/spark/sql/__init__.py +7 -0
  19. duckdb/experimental/spark/sql/_typing.py +93 -0
  20. duckdb/experimental/spark/sql/catalog.py +78 -0
  21. duckdb/experimental/spark/sql/column.py +368 -0
  22. duckdb/experimental/spark/sql/conf.py +23 -0
  23. duckdb/experimental/spark/sql/dataframe.py +1437 -0
  24. duckdb/experimental/spark/sql/functions.py +6221 -0
  25. duckdb/experimental/spark/sql/group.py +420 -0
  26. duckdb/experimental/spark/sql/readwriter.py +449 -0
  27. duckdb/experimental/spark/sql/session.py +292 -0
  28. duckdb/experimental/spark/sql/streaming.py +37 -0
  29. duckdb/experimental/spark/sql/type_utils.py +105 -0
  30. duckdb/experimental/spark/sql/types.py +1275 -0
  31. duckdb/experimental/spark/sql/udf.py +37 -0
  32. duckdb/filesystem.py +23 -0
  33. duckdb/functional/__init__.py +17 -0
  34. duckdb/functional/__init__.pyi +31 -0
  35. duckdb/polars_io.py +237 -0
  36. duckdb/query_graph/__main__.py +363 -0
  37. duckdb/typing/__init__.py +61 -0
  38. duckdb/typing/__init__.pyi +36 -0
  39. duckdb/udf.py +19 -0
  40. duckdb/value/__init__.py +0 -0
  41. duckdb/value/__init__.pyi +0 -0
  42. duckdb/value/constant/__init__.py +268 -0
  43. duckdb/value/constant/__init__.pyi +115 -0
  44. duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
  45. duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
  46. duckdb-1.5.0.dev37.dist-info/WHEEL +6 -0
  47. duckdb-1.5.0.dev37.dist-info/licenses/LICENSE +7 -0
duckdb/experimental/spark/sql/dataframe.py
@@ -0,0 +1,1437 @@
1
+ from functools import reduce
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Callable,
6
+ List,
7
+ Dict,
8
+ Optional,
9
+ Tuple,
10
+ Union,
11
+ cast,
12
+ overload,
13
+ )
14
+ import uuid
15
+ from keyword import iskeyword
16
+
17
+ import duckdb
18
+ from duckdb import ColumnExpression, Expression, StarExpression
19
+
20
+ from ._typing import ColumnOrName
21
+ from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
22
+ from ..exception import ContributionsAcceptedError
23
+ from .column import Column
24
+ from .readwriter import DataFrameWriter
25
+ from .type_utils import duckdb_to_spark_schema
26
+ from .types import Row, StructType
27
+
28
+ if TYPE_CHECKING:
29
+ import pyarrow as pa
30
+ from pandas.core.frame import DataFrame as PandasDataFrame
31
+
32
+ from .group import GroupedData, Grouping
33
+ from .session import SparkSession
34
+
35
+ from ..errors import PySparkValueError
36
+ from .functions import _to_column_expr, col, lit
37
+
38
+
39
+ class DataFrame:
40
+ def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
41
+ self.relation = relation
42
+ self.session = session
43
+ self._schema = None
44
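+ # Eagerly derive a Spark-style StructType schema from the relation's DuckDB column names and types.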
+ if self.relation is not None:
45
+ self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)
46
+
47
+ def show(self, **kwargs) -> None:
48
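+ # NOTE: keyword arguments such as n/truncate are accepted for PySpark API compatibility but are not forwarded; DuckDB's default relation formatting is used.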
+ self.relation.show()
49
+
50
+ def toPandas(self) -> "PandasDataFrame":
51
+ return self.relation.df()
52
+
53
+ def toArrow(self) -> "pa.Table":
54
+ """
55
+ Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
56
+
57
+ This is only available if PyArrow is installed and available.
58
+
59
+ .. versionadded:: 4.0.0
60
+
61
+ Notes
62
+ -----
63
+ This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
64
+ expected to be small, as all the data is loaded into the driver's memory.
65
+
66
+ This API is a developer API.
67
+
68
+ Examples
69
+ --------
70
+ >>> df.toArrow() # doctest: +SKIP
71
+ pyarrow.Table
72
+ age: int64
73
+ name: string
74
+ ----
75
+ age: [[2,5]]
76
+ name: [["Alice","Bob"]]
77
+ """
78
+ return self.relation.to_arrow_table()
79
+
80
+ def createOrReplaceTempView(self, name: str) -> None:
81
+ """Creates or replaces a local temporary view with this :class:`DataFrame`.
82
+
83
+ The lifetime of this temporary table is tied to the :class:`SparkSession`
84
+ that was used to create this :class:`DataFrame`.
85
+
86
+ Parameters
87
+ ----------
88
+ name : str
89
+ Name of the view.
90
+
91
+ Examples
92
+ --------
93
+ Create a local temporary view named 'people'.
94
+
95
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
96
+ >>> df.createOrReplaceTempView("people")
97
+
98
+ Replace the local temporary view.
99
+
100
+ >>> df2 = df.filter(df.age > 3)
101
+ >>> df2.createOrReplaceTempView("people")
102
+ >>> df3 = spark.sql("SELECT * FROM people")
103
+ >>> sorted(df3.collect()) == sorted(df2.collect())
104
+ True
105
+ >>> spark.catalog.dropTempView("people")
106
+ True
107
+
108
+ """
109
+ self.relation.create_view(name, True)
110
+
111
+ def createGlobalTempView(self, name: str) -> None:
112
+ raise NotImplementedError
113
+
114
+ def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
115
+ if columnName not in self.relation:
116
+ raise ValueError(f"DataFrame does not contain a column named {columnName}")
117
+ cols = []
118
+ for x in self.relation.columns:
119
+ col = ColumnExpression(x)
120
+ if x.casefold() == columnName.casefold():
121
+ col = col.alias(newName)
122
+ cols.append(col)
123
+ rel = self.relation.select(*cols)
124
+ return DataFrame(rel, self.session)
125
+
126
+ def withColumn(self, columnName: str, col: Column) -> "DataFrame":
127
+ if not isinstance(col, Column):
128
+ raise PySparkTypeError(
129
+ error_class="NOT_COLUMN",
130
+ message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
131
+ )
132
+ if columnName in self.relation:
133
+ # We want to replace the existing column with this new expression
134
+ cols = []
135
+ for x in self.relation.columns:
136
+ if x.casefold() == columnName.casefold():
137
+ cols.append(col.expr.alias(columnName))
138
+ else:
139
+ cols.append(ColumnExpression(x))
140
+ else:
141
+ cols = [ColumnExpression(x) for x in self.relation.columns]
142
+ cols.append(col.expr.alias(columnName))
143
+ rel = self.relation.select(*cols)
144
+ return DataFrame(rel, self.session)
145
+
146
+ def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame":
147
+ """
148
+ Returns a new :class:`DataFrame` by adding multiple columns or replacing the
149
+ existing columns that have the same names.
150
+
151
+ The colsMap is a map of column name and column, the column must only refer to attributes
152
+ supplied by this Dataset. It is an error to add columns that refer to some other Dataset.
153
+
154
+ .. versionadded:: 3.3.0
155
+ Added support for multiple columns adding
156
+
157
+ .. versionchanged:: 3.4.0
158
+ Supports Spark Connect.
159
+
160
+ Parameters
161
+ ----------
162
+ colsMap : dict
163
+ a dict of column name and :class:`Column`. Currently, only a single map is supported.
164
+
165
+ Returns
166
+ -------
167
+ :class:`DataFrame`
168
+ DataFrame with new or replaced columns.
169
+
170
+ Examples
171
+ --------
172
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
173
+ >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
174
+ +---+-----+----+----+
175
+ |age| name|age2|age3|
176
+ +---+-----+----+----+
177
+ | 2|Alice| 4| 5|
178
+ | 5| Bob| 7| 8|
179
+ +---+-----+----+----+
180
+ """
181
+ # The code below is structured this way to help enable kwargs in the future.
182
+ assert len(colsMap) == 1
183
+ colsMap = colsMap[0] # type: ignore[assignment]
184
+
185
+ if not isinstance(colsMap, dict):
186
+ raise PySparkTypeError(
187
+ error_class="NOT_DICT",
188
+ message_parameters={
189
+ "arg_name": "colsMap",
190
+ "arg_type": type(colsMap).__name__,
191
+ },
192
+ )
193
+
194
+ column_names = list(colsMap.keys())
195
+ columns = list(colsMap.values())
196
+
197
+ # Compute this only once
198
+ column_names_for_comparison = [x.casefold() for x in column_names]
199
+
200
+ cols = []
201
+ for x in self.relation.columns:
202
+ if x.casefold() in column_names_for_comparison:
+ idx = column_names_for_comparison.index(x.casefold())
+ # We extract the column name from the originally passed
+ # in ones, as the casing might be different than the one
+ # in the relation
+ col_name = column_names.pop(idx)
+ col = columns.pop(idx)
+ column_names_for_comparison.pop(idx)
+ cols.append(col.expr.alias(col_name))
210
+ else:
211
+ cols.append(ColumnExpression(x))
212
+
213
+ # In case anything is remaining, these are new columns
214
+ # that we need to add to the DataFrame
215
+ for col_name, col in zip(column_names, columns):
216
+ cols.append(col.expr.alias(col_name))
217
+
218
+ rel = self.relation.select(*cols)
219
+ return DataFrame(rel, self.session)
220
+
221
+ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
222
+ """
223
+ Returns a new :class:`DataFrame` by renaming multiple columns.
224
+ This is a no-op if the schema doesn't contain the given column names.
225
+
226
+ .. versionadded:: 3.4.0
227
+ Added support for multiple columns renaming
228
+
229
+ Parameters
230
+ ----------
231
+ colsMap : dict
232
+ a dict of existing column names and corresponding desired column names.
233
+ Currently, only a single map is supported.
234
+
235
+ Returns
236
+ -------
237
+ :class:`DataFrame`
238
+ DataFrame with renamed columns.
239
+
240
+ See Also
241
+ --------
242
+ :meth:`withColumnRenamed`
243
+
244
+ Notes
245
+ -----
246
+ Supports Spark Connect.
247
+
248
+ Examples
249
+ --------
250
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
251
+ >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
252
+ >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
253
+ +---+-----+----+----+
254
+ |age| name|age4|age5|
255
+ +---+-----+----+----+
256
+ | 2|Alice| 4| 5|
257
+ | 5| Bob| 7| 8|
258
+ +---+-----+----+----+
259
+ """
260
+ if not isinstance(colsMap, dict):
261
+ raise PySparkTypeError(
262
+ error_class="NOT_DICT",
263
+ message_parameters={"arg_name": "colsMap", "arg_type": type(colsMap).__name__},
264
+ )
265
+
266
+ unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
267
+ if unknown_columns:
268
+ raise ValueError(
269
+ f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
270
+ )
271
+
272
+ # Compute this only once
273
+ old_column_names = list(colsMap.keys())
274
+ old_column_names_for_comparison = [x.casefold() for x in old_column_names]
275
+
276
+ cols = []
277
+ for x in self.relation.columns:
278
+ col = ColumnExpression(x)
279
+ if x.casefold() in old_column_names_for_comparison:
280
+ idx = old_column_names.index(x)
281
+ # We extract the column name from the originally passed
282
+ # in ones, as the casing might be different than the one
283
+ # in the relation
284
+ col_name = old_column_names.pop(idx)
285
+ new_col_name = colsMap[col_name]
286
+ col = col.alias(new_col_name)
287
+ cols.append(col)
288
+
289
+ rel = self.relation.select(*cols)
290
+ return DataFrame(rel, self.session)
291
+
294
+ def transform(
295
+ self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
296
+ ) -> "DataFrame":
297
+ """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
298
+
299
+ .. versionadded:: 3.0.0
300
+
301
+ .. versionchanged:: 3.4.0
302
+ Supports Spark Connect.
303
+
304
+ Parameters
305
+ ----------
306
+ func : function
307
+ a function that takes and returns a :class:`DataFrame`.
308
+ *args
309
+ Positional arguments to pass to func.
310
+
311
+ .. versionadded:: 3.3.0
312
+ **kwargs
313
+ Keyword arguments to pass to func.
314
+
315
+ .. versionadded:: 3.3.0
316
+
317
+ Returns
318
+ -------
319
+ :class:`DataFrame`
320
+ Transformed DataFrame.
321
+
322
+ Examples
323
+ --------
324
+ >>> from pyspark.sql.functions import col
325
+ >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
326
+ >>> def cast_all_to_int(input_df):
327
+ ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
328
+ ...
329
+ >>> def sort_columns_asc(input_df):
330
+ ... return input_df.select(*sorted(input_df.columns))
331
+ ...
332
+ >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
333
+ +-----+---+
334
+ |float|int|
335
+ +-----+---+
336
+ | 1| 1|
337
+ | 2| 2|
338
+ +-----+---+
339
+
340
+ >>> def add_n(input_df, n):
341
+ ... return input_df.select([(col(col_name) + n).alias(col_name)
342
+ ... for col_name in input_df.columns])
343
+ >>> df.transform(add_n, 1).transform(add_n, n=10).show()
344
+ +---+-----+
345
+ |int|float|
346
+ +---+-----+
347
+ | 12| 12.0|
348
+ | 13| 13.0|
349
+ +---+-----+
350
+ """
351
+ result = func(self, *args, **kwargs)
352
+ assert isinstance(result, DataFrame), (
353
+ "Func returned an instance of type [%s], "
354
+ "should have been DataFrame." % type(result)
355
+ )
356
+ return result
357
+
358
+ def sort(
359
+ self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
360
+ ) -> "DataFrame":
361
+ """Returns a new :class:`DataFrame` sorted by the specified column(s).
362
+
363
+ Parameters
364
+ ----------
365
+ cols : str, list, or :class:`Column`, optional
366
+ list of :class:`Column` or column names to sort by.
367
+
368
+ Other Parameters
369
+ ----------------
370
+ ascending : bool or list, optional, default True
371
+ boolean or list of boolean.
372
+ Sort ascending vs. descending. Specify list for multiple sort orders.
373
+ If a list is specified, the length of the list must equal the length of the `cols`.
374
+
375
+ Returns
376
+ -------
377
+ :class:`DataFrame`
378
+ Sorted DataFrame.
379
+
380
+ Examples
381
+ --------
382
+ >>> from pyspark.sql.functions import desc, asc
383
+ >>> df = spark.createDataFrame([
384
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])
385
+
386
+ Sort the DataFrame in ascending order.
387
+
388
+ >>> df.sort(asc("age")).show()
389
+ +---+-----+
390
+ |age| name|
391
+ +---+-----+
392
+ | 2|Alice|
393
+ | 5| Bob|
394
+ +---+-----+
395
+
396
+ Sort the DataFrame in descending order.
397
+
398
+ >>> df.sort(df.age.desc()).show()
399
+ +---+-----+
400
+ |age| name|
401
+ +---+-----+
402
+ | 5| Bob|
403
+ | 2|Alice|
404
+ +---+-----+
405
+ >>> df.orderBy(df.age.desc()).show()
406
+ +---+-----+
407
+ |age| name|
408
+ +---+-----+
409
+ | 5| Bob|
410
+ | 2|Alice|
411
+ +---+-----+
412
+ >>> df.sort("age", ascending=False).show()
413
+ +---+-----+
414
+ |age| name|
415
+ +---+-----+
416
+ | 5| Bob|
417
+ | 2|Alice|
418
+ +---+-----+
419
+
420
+ Specify multiple columns
421
+
422
+ >>> df = spark.createDataFrame([
423
+ ... (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
424
+ >>> df.orderBy(desc("age"), "name").show()
425
+ +---+-----+
426
+ |age| name|
427
+ +---+-----+
428
+ | 5| Bob|
429
+ | 2|Alice|
430
+ | 2| Bob|
431
+ +---+-----+
432
+
433
+ Specify multiple columns for sorting order at `ascending`.
434
+
435
+ >>> df.orderBy(["age", "name"], ascending=[False, False]).show()
436
+ +---+-----+
437
+ |age| name|
438
+ +---+-----+
439
+ | 5| Bob|
440
+ | 2| Bob|
441
+ | 2|Alice|
442
+ +---+-----+
443
+ """
444
+ if not cols:
445
+ raise PySparkValueError(
446
+ error_class="CANNOT_BE_EMPTY",
447
+ message_parameters={"item": "column"},
448
+ )
449
+ if len(cols) == 1 and isinstance(cols[0], list):
450
+ cols = cols[0]
451
+
452
+ columns = []
453
+ for c in cols:
454
+ _c = c
455
+ if isinstance(c, str):
456
+ _c = col(c)
457
+ elif isinstance(c, int) and not isinstance(c, bool):
458
+ # ordinal is 1-based
459
+ if c > 0:
460
+ _c = self[c - 1]
461
+ # negative ordinal means sort by desc
462
+ elif c < 0:
463
+ _c = self[-c - 1].desc()
464
+ else:
465
+ raise PySparkIndexError(
466
+ error_class="ZERO_INDEX",
467
+ message_parameters={},
468
+ )
469
+ columns.append(_c)
470
+
471
+ ascending = kwargs.get("ascending", True)
472
+
473
+ if isinstance(ascending, (bool, int)):
474
+ if not ascending:
475
+ columns = [c.desc() for c in columns]
476
+ elif isinstance(ascending, list):
477
+ columns = [c if asc else c.desc() for asc, c in zip(ascending, columns)]
478
+ else:
479
+ raise PySparkTypeError(
480
+ error_class="NOT_BOOL_OR_LIST",
481
+ message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
482
+ )
483
+
484
+ columns = [_to_column_expr(c) for c in columns]
485
+ rel = self.relation.sort(*columns)
486
+ return DataFrame(rel, self.session)
487
+
488
+ orderBy = sort
489
+
490
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
491
+ if n is None:
492
+ rs = self.head(1)
493
+ return rs[0] if rs else None
494
+ return self.take(n)
495
+
496
+ first = head
497
+
498
+ def take(self, num: int) -> List[Row]:
499
+ return self.limit(num).collect()
500
+
501
+ def filter(self, condition: "ColumnOrName") -> "DataFrame":
502
+ """Filters rows using the given condition.
503
+
504
+ :func:`where` is an alias for :func:`filter`.
505
+
506
+ Parameters
507
+ ----------
508
+ condition : :class:`Column` or str
509
+ a :class:`Column` of :class:`types.BooleanType`
510
+ or a string of SQL expressions.
511
+
512
+ Returns
513
+ -------
514
+ :class:`DataFrame`
515
+ Filtered DataFrame.
516
+
517
+ Examples
518
+ --------
519
+ >>> df = spark.createDataFrame([
520
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])
521
+
522
+ Filter by :class:`Column` instances.
523
+
524
+ >>> df.filter(df.age > 3).show()
525
+ +---+----+
526
+ |age|name|
527
+ +---+----+
528
+ | 5| Bob|
529
+ +---+----+
530
+ >>> df.where(df.age == 2).show()
531
+ +---+-----+
532
+ |age| name|
533
+ +---+-----+
534
+ | 2|Alice|
535
+ +---+-----+
536
+
537
+ Filter by SQL expression in a string.
538
+
539
+ >>> df.filter("age > 3").show()
540
+ +---+----+
541
+ |age|name|
542
+ +---+----+
543
+ | 5| Bob|
544
+ +---+----+
545
+ >>> df.where("age = 2").show()
546
+ +---+-----+
547
+ |age| name|
548
+ +---+-----+
549
+ | 2|Alice|
550
+ +---+-----+
551
+ """
552
+ if isinstance(condition, Column):
553
+ cond = condition.expr
554
+ elif isinstance(condition, str):
555
+ cond = condition
556
+ else:
557
+ raise PySparkTypeError(
558
+ error_class="NOT_COLUMN_OR_STR",
559
+ message_parameters={"arg_name": "condition", "arg_type": type(condition).__name__},
560
+ )
561
+ rel = self.relation.filter(cond)
562
+ return DataFrame(rel, self.session)
563
+
564
+ where = filter
565
+
566
+ def select(self, *cols) -> "DataFrame":
567
+ cols = list(cols)
568
+ if len(cols) == 1:
569
+ cols = cols[0]
570
+ if isinstance(cols, list):
571
+ projections = [
572
+ x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
573
+ ]
574
+ else:
575
+ projections = [
576
+ cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
577
+ ]
578
+ rel = self.relation.select(*projections)
579
+ return DataFrame(rel, self.session)
580
+
581
+ @property
582
+ def columns(self) -> List[str]:
583
+ """Returns all column names as a list.
584
+
585
+ Examples
586
+ --------
587
+ >>> df.columns
588
+ ['age', 'name']
589
+ """
590
+ return [f.name for f in self.schema.fields]
591
+
592
+ def _ipython_key_completions_(self) -> List[str]:
593
+ # Provides tab-completion for column names in PySpark DataFrame
594
+ # when accessed in bracket notation, e.g. df['<TAB>]
595
+ return self.columns
596
+
597
+ def __dir__(self) -> List[str]:
598
+ out = set(super().__dir__())
599
+ out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
600
+ return sorted(out)
601
+
602
+ def join(
603
+ self,
604
+ other: "DataFrame",
605
+ on: Optional[Union[str, List[str], Column, List[Column]]] = None,
606
+ how: Optional[str] = None,
607
+ ) -> "DataFrame":
608
+ """Joins with another :class:`DataFrame`, using the given join expression.
609
+
610
+ Parameters
611
+ ----------
612
+ other : :class:`DataFrame`
613
+ Right side of the join
614
+ on : str, list or :class:`Column`, optional
615
+ a string for the join column name, a list of column names,
616
+ a join expression (Column), or a list of Columns.
617
+ If `on` is a string or a list of strings indicating the name of the join column(s),
618
+ the column(s) must exist on both sides, and this performs an equi-join.
619
+ how : str, optional
620
+ default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
621
+ ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,
622
+ ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
623
+ ``anti``, ``leftanti`` and ``left_anti``.
624
+
625
+ Returns
626
+ -------
627
+ :class:`DataFrame`
628
+ Joined DataFrame.
629
+
630
+ Examples
631
+ --------
632
+ The following performs a full outer join between ``df1`` and ``df2``.
633
+
634
+ >>> from pyspark.sql import Row
635
+ >>> from pyspark.sql.functions import desc
636
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
637
+ >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
638
+ >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
639
+ >>> df4 = spark.createDataFrame([
640
+ ... Row(age=10, height=80, name="Alice"),
641
+ ... Row(age=5, height=None, name="Bob"),
642
+ ... Row(age=None, height=None, name="Tom"),
643
+ ... Row(age=None, height=None, name=None),
644
+ ... ])
645
+
646
+ Inner join on columns (default)
647
+
648
+ >>> df.join(df2, 'name').select(df.name, df2.height).show()
649
+ +----+------+
650
+ |name|height|
651
+ +----+------+
652
+ | Bob| 85|
653
+ +----+------+
654
+ >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
655
+ +----+---+
656
+ |name|age|
657
+ +----+---+
658
+ | Bob| 5|
659
+ +----+---+
660
+
661
+ Outer join for both DataFrames on the 'name' column.
662
+
663
+ >>> df.join(df2, df.name == df2.name, 'outer').select(
664
+ ... df.name, df2.height).sort(desc("name")).show()
665
+ +-----+------+
666
+ | name|height|
667
+ +-----+------+
668
+ | Bob| 85|
669
+ |Alice| NULL|
670
+ | NULL| 80|
671
+ +-----+------+
672
+ >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
673
+ +-----+------+
674
+ | name|height|
675
+ +-----+------+
676
+ | Tom| 80|
677
+ | Bob| 85|
678
+ |Alice| NULL|
679
+ +-----+------+
680
+
681
+ Outer join for both DataFrames with multiple columns.
682
+
683
+ >>> df.join(
684
+ ... df3,
685
+ ... [df.name == df3.name, df.age == df3.age],
686
+ ... 'outer'
687
+ ... ).select(df.name, df3.age).show()
688
+ +-----+---+
689
+ | name|age|
690
+ +-----+---+
691
+ |Alice| 2|
692
+ | Bob| 5|
693
+ +-----+---+
694
+ """
695
+
696
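+ # Normalize 'on': wrap a single column/name in a list; lists of Column expressions are AND-ed below into one join predicate, while lists of strings pass through as USING-style column names.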
+ if on is not None and not isinstance(on, list):
697
+ on = [on] # type: ignore[assignment]
698
+ if on is not None and not all([isinstance(x, str) for x in on]):
699
+ assert isinstance(on, list)
700
+ # Get (or create) the Expressions from the list of Columns
701
+ on = [_to_column_expr(x) for x in on]
702
+
703
+ # & all the Expressions together to form one Expression
704
+ assert isinstance(
705
+ on[0], Expression
706
+ ), "on should be Column or list of Column"
707
+ on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
708
+
709
+
710
+ if on is None and how is None:
711
+ result = self.relation.join(other.relation)
712
+ else:
713
+ if how is None:
714
+ how = "inner"
715
+ if on is None:
716
+ on = "true"
717
+ elif isinstance(on, list) and all([isinstance(x, str) for x in on]):
718
+ # Passed directly through as a list of strings
719
+ on = on
720
+ else:
721
+ on = str(on)
722
+ assert isinstance(how, str), "how should be a string"
723
+
724
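+ # Spark accepts several aliases per join type (e.g. "full_outer", "leftsemi"); collapse them to the names DuckDB's join() recognizes and pass anything unknown through unchanged for DuckDB to validate.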
+ def map_to_recognized_jointype(how):
725
+ known_aliases = {
726
+ "inner": [],
727
+ "outer": ["full", "fullouter", "full_outer"],
728
+ "left": ["leftouter", "left_outer"],
729
+ "right": ["rightouter", "right_outer"],
730
+ "anti": ["leftanti", "left_anti"],
731
+ "semi": ["leftsemi", "left_semi"],
732
+ }
733
+ mapped_type = None
734
+ for type, aliases in known_aliases.items():
735
+ if how == type or how in aliases:
736
+ mapped_type = type
737
+ break
738
+
739
+ if not mapped_type:
740
+ mapped_type = how
741
+ return mapped_type
742
+
743
+ how = map_to_recognized_jointype(how)
744
+ result = self.relation.join(other.relation, on, how)
745
+ return DataFrame(result, self.session)
746
+
747
+ def crossJoin(self, other: "DataFrame") -> "DataFrame":
748
+ """Returns the cartesian product with another :class:`DataFrame`.
749
+
750
+ .. versionadded:: 2.1.0
751
+
752
+ .. versionchanged:: 3.4.0
753
+ Supports Spark Connect.
754
+
755
+ Parameters
756
+ ----------
757
+ other : :class:`DataFrame`
758
+ Right side of the cartesian product.
759
+
760
+ Returns
761
+ -------
762
+ :class:`DataFrame`
763
+ Joined DataFrame.
764
+
765
+ Examples
766
+ --------
767
+ >>> from pyspark.sql import Row
768
+ >>> df = spark.createDataFrame(
769
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
770
+ >>> df2 = spark.createDataFrame(
771
+ ... [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
772
+ >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
773
+ +---+-----+------+
774
+ |age| name|height|
775
+ +---+-----+------+
776
+ | 14| Tom| 80|
777
+ | 14| Tom| 85|
778
+ | 23|Alice| 80|
779
+ | 23|Alice| 85|
780
+ | 16| Bob| 80|
781
+ | 16| Bob| 85|
782
+ +---+-----+------+
783
+ """
784
+ return DataFrame(self.relation.cross(other.relation), self.session)
785
+
786
+ def alias(self, alias: str) -> "DataFrame":
787
+ """Returns a new :class:`DataFrame` with an alias set.
788
+
789
+ Parameters
790
+ ----------
791
+ alias : str
792
+ an alias name to be set for the :class:`DataFrame`.
793
+
794
+ Returns
795
+ -------
796
+ :class:`DataFrame`
797
+ Aliased DataFrame.
798
+
799
+ Examples
800
+ --------
801
+ >>> from pyspark.sql.functions import col, desc
802
+ >>> df = spark.createDataFrame(
803
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
804
+ >>> df_as1 = df.alias("df_as1")
805
+ >>> df_as2 = df.alias("df_as2")
806
+ >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
807
+ >>> joined_df.select(
808
+ ... "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
809
+ +-----+-----+---+
810
+ | name| name|age|
811
+ +-----+-----+---+
812
+ | Tom| Tom| 14|
813
+ | Bob| Bob| 16|
814
+ |Alice|Alice| 23|
815
+ +-----+-----+---+
816
+ """
817
+ assert isinstance(alias, str), "alias should be a string"
818
+ return DataFrame(self.relation.set_alias(alias), self.session)
819
+
820
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc]
821
+ exclude = []
822
+ for col in cols:
823
+ if isinstance(col, str):
824
+ exclude.append(col)
825
+ elif isinstance(col, Column):
826
+ exclude.append(col.expr.get_name())
827
+ else:
828
+ raise PySparkTypeError(
829
+ error_class="NOT_COLUMN_OR_STR",
830
+ message_parameters={"arg_name": "col", "arg_type": type(col).__name__},
831
+ )
832
+ # Filter out the columns that don't exist in the relation
833
+ exclude = [x for x in exclude if x in self.relation.columns]
834
+ expr = StarExpression(exclude=exclude)
835
+ return DataFrame(self.relation.select(expr), self.session)
836
+
837
+ def __repr__(self) -> str:
838
+ return str(self.relation)
839
+
840
+ def limit(self, num: int) -> "DataFrame":
841
+ """Limits the result count to the number specified.
842
+
843
+ Parameters
844
+ ----------
845
+ num : int
846
+ Number of records to return. Will return this number of records
847
+ or all records if the DataFrame contains fewer than this number of records.
848
+
849
+ Returns
850
+ -------
851
+ :class:`DataFrame`
852
+ Subset of the records
853
+
854
+ Examples
855
+ --------
856
+ >>> df = spark.createDataFrame(
857
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
858
+ >>> df.limit(1).show()
859
+ +---+----+
860
+ |age|name|
861
+ +---+----+
862
+ | 14| Tom|
863
+ +---+----+
864
+ >>> df.limit(0).show()
865
+ +---+----+
866
+ |age|name|
867
+ +---+----+
868
+ +---+----+
869
+ """
870
+ rel = self.relation.limit(num)
871
+ return DataFrame(rel, self.session)
872
+
873
+ def __contains__(self, item: str):
874
+ """
875
+ Check if the :class:`DataFrame` contains a column by the name of `item`
876
+ """
877
+ return item in self.relation
878
+
879
+ @property
880
+ def schema(self) -> StructType:
881
+ """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.
882
+
883
+ Examples
884
+ --------
885
+ >>> df.schema
886
+ StructType([StructField('age', IntegerType(), True),
887
+ StructField('name', StringType(), True)])
888
+ """
889
+ return self._schema
890
+
891
+ @overload
892
+ def __getitem__(self, item: Union[int, str]) -> Column:
893
+ ...
894
+
895
+ @overload
896
+ def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
897
+ ...
898
+
899
+ def __getitem__(
900
+ self, item: Union[int, str, Column, List, Tuple]
901
+ ) -> Union[Column, "DataFrame"]:
902
+ """Returns the column as a :class:`Column`.
903
+
904
+ Examples
905
+ --------
906
+ >>> df.select(df['age']).collect()
907
+ [Row(age=2), Row(age=5)]
908
+ >>> df[ ["name", "age"]].collect()
909
+ [Row(name='Alice', age=2), Row(name='Bob', age=5)]
910
+ >>> df[ df.age > 3 ].collect()
911
+ [Row(age=5, name='Bob')]
912
+ >>> df[df[0] > 3].collect()
913
+ [Row(age=5, name='Bob')]
914
+ """
915
+ if isinstance(item, str):
916
+ return Column(duckdb.ColumnExpression(self.relation.alias, item))
917
+ elif isinstance(item, Column):
918
+ return self.filter(item)
919
+ elif isinstance(item, (list, tuple)):
920
+ return self.select(*item)
921
+ elif isinstance(item, int):
922
+ return col(self._schema[item].name)
923
+ else:
924
+ raise TypeError(f"Unexpected item type: {type(item)}")
925
+
926
+ def __getattr__(self, name: str) -> Column:
927
+ """Returns the :class:`Column` denoted by ``name``.
928
+
929
+ Examples
930
+ --------
931
+ >>> df.select(df.age).collect()
932
+ [Row(age=2), Row(age=5)]
933
+ """
934
+ if name not in self.relation.columns:
935
+ raise AttributeError(
936
+ "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
937
+ )
938
+ return Column(duckdb.ColumnExpression(self.relation.alias, name))
939
+
940
+ @overload
941
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
942
+ ...
943
+
944
+ @overload
945
+ def groupBy(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
946
+ ...
947
+
948
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
949
+ """Groups the :class:`DataFrame` using the specified columns,
950
+ so we can run aggregation on them. See :class:`GroupedData`
951
+ for all the available aggregate functions.
952
+
953
+ :func:`groupby` is an alias for :func:`groupBy`.
954
+
955
+ Parameters
956
+ ----------
957
+ cols : list, str or :class:`Column`
958
+ columns to group by.
959
+ Each element should be a column name (string) or an expression (:class:`Column`)
960
+ or list of them.
961
+
962
+ Returns
963
+ -------
964
+ :class:`GroupedData`
965
+ Grouped data by given columns.
966
+
967
+ Examples
968
+ --------
969
+ >>> df = spark.createDataFrame([
970
+ ... (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
971
+
972
+ Empty grouping columns triggers a global aggregation.
973
+
974
+ >>> df.groupBy().avg().show()
975
+ +--------+
976
+ |avg(age)|
977
+ +--------+
978
+ | 2.75|
979
+ +--------+
980
+
981
+ Group-by 'name', and specify a dictionary to calculate the summation of 'age'.
982
+
983
+ >>> df.groupBy("name").agg({"age": "sum"}).sort("name").show()
984
+ +-----+--------+
985
+ | name|sum(age)|
986
+ +-----+--------+
987
+ |Alice| 2|
988
+ | Bob| 9|
989
+ +-----+--------+
990
+
991
+ Group-by 'name', and calculate maximum values.
992
+
993
+ >>> df.groupBy(df.name).max().sort("name").show()
994
+ +-----+--------+
995
+ | name|max(age)|
996
+ +-----+--------+
997
+ |Alice| 2|
998
+ | Bob| 5|
999
+ +-----+--------+
1000
+
1001
+ Group-by 'name' and 'age', and calculate the number of rows in each group.
1002
+
1003
+ >>> df.groupBy(["name", df.age]).count().sort("name", "age").show()
1004
+ +-----+---+-----+
1005
+ | name|age|count|
1006
+ +-----+---+-----+
1007
+ |Alice| 2| 1|
1008
+ | Bob| 2| 2|
1009
+ | Bob| 5| 1|
1010
+ +-----+---+-----+
1011
+ """
1012
+ from .group import GroupedData, Grouping
1013
+
1014
+ if len(cols) == 1 and isinstance(cols[0], list):
1015
+ columns = cols[0]
1016
+ else:
1017
+ columns = cols
1018
+ return GroupedData(Grouping(*columns), self)
1019
+
1020
+ groupby = groupBy
1021
+
1022
+ @property
1023
+ def write(self) -> DataFrameWriter:
1024
+ return DataFrameWriter(self)
1025
+
1026
+ def printSchema(self):
1027
+ raise ContributionsAcceptedError
1028
+
1029
+ def union(self, other: "DataFrame") -> "DataFrame":
1030
+ """Return a new :class:`DataFrame` containing union of rows in this and another
1031
+ :class:`DataFrame`.
1032
+
1033
+ Parameters
1034
+ ----------
1035
+ other : :class:`DataFrame`
1036
+ Another :class:`DataFrame` that needs to be unioned
1037
+
1038
+ Returns
1039
+ -------
1040
+ :class:`DataFrame`
1041
+
1042
+ See Also
1043
+ --------
1044
+ DataFrame.unionAll
1045
+
1046
+ Notes
1047
+ -----
1048
+ This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
1049
+ (that does deduplication of elements), use this function followed by :func:`distinct`.
1050
+
1051
+ Also as standard in SQL, this function resolves columns by position (not by name).
1052
+
1053
+ Examples
1054
+ --------
1055
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1056
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
1057
+ >>> df1.union(df2).show()
1058
+ +----+----+----+
1059
+ |col0|col1|col2|
1060
+ +----+----+----+
1061
+ | 1| 2| 3|
1062
+ | 4| 5| 6|
1063
+ +----+----+----+
1064
+ >>> df1.union(df1).show()
1065
+ +----+----+----+
1066
+ |col0|col1|col2|
1067
+ +----+----+----+
1068
+ | 1| 2| 3|
1069
+ | 1| 2| 3|
1070
+ +----+----+----+
1071
+ """
1072
+ return DataFrame(self.relation.union(other.relation), self.session)
1073
+
1074
+ unionAll = union
1075
+
1076
+ def unionByName(
1077
+ self, other: "DataFrame", allowMissingColumns: bool = False
1078
+ ) -> "DataFrame":
1079
+ """Returns a new :class:`DataFrame` containing union of rows in this and another
1080
+ :class:`DataFrame`.
1081
+
1082
+ This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set
1083
+ union (that does deduplication of elements), use this function followed by :func:`distinct`.
1084
+
1085
+ .. versionadded:: 2.3.0
1086
+
1087
+ .. versionchanged:: 3.4.0
1088
+ Supports Spark Connect.
1089
+
1090
+ Parameters
1091
+ ----------
1092
+ other : :class:`DataFrame`
1093
+ Another :class:`DataFrame` that needs to be combined.
1094
+ allowMissingColumns : bool, optional, default False
1095
+ Specify whether to allow missing columns.
1096
+
1097
+ .. versionadded:: 3.1.0
1098
+
1099
+ Returns
1100
+ -------
1101
+ :class:`DataFrame`
1102
+ Combined DataFrame.
1103
+
1104
+ Examples
1105
+ --------
1106
+ The difference between this function and :func:`union` is that this function
1107
+ resolves columns by name (not by position):
1108
+
1109
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1110
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
1111
+ >>> df1.unionByName(df2).show()
1112
+ +----+----+----+
1113
+ |col0|col1|col2|
1114
+ +----+----+----+
1115
+ | 1| 2| 3|
1116
+ | 6| 4| 5|
1117
+ +----+----+----+
1118
+
1119
+ When the parameter `allowMissingColumns` is ``True``, the set of column names
1120
+ in this and other :class:`DataFrame` can differ; missing columns will be filled with null.
1121
+ Further, the missing columns of this :class:`DataFrame` will be added at the end
1122
+ in the schema of the union result:
1123
+
1124
+ >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1125
+ >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])
1126
+ >>> df1.unionByName(df2, allowMissingColumns=True).show()
1127
+ +----+----+----+----+
1128
+ |col0|col1|col2|col3|
1129
+ +----+----+----+----+
1130
+ | 1| 2| 3|NULL|
1131
+ |NULL| 4| 5| 6|
1132
+ +----+----+----+----+
1133
+ """
1134
+ if allowMissingColumns:
1135
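+ # Re-project 'other' onto this DataFrame's column order, substituting NULL literals for columns it lacks, so the positional union below lines up by name.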
+ cols = []
1136
+ for col in self.relation.columns:
1137
+ if col in other.relation.columns:
1138
+ cols.append(col)
1139
+ else:
1140
+ cols.append(lit(None))
1141
+ other = other.select(*cols)
1142
+ else:
1143
+ other = other.select(*self.relation.columns)
1144
+
1145
+ return DataFrame(self.relation.union(other.relation), self.session)
1146
+
1147
+ def intersect(self, other: "DataFrame") -> "DataFrame":
1148
+ """Return a new :class:`DataFrame` containing rows only in
1149
+ both this :class:`DataFrame` and another :class:`DataFrame`.
1150
+ Note that any duplicates are removed. To preserve duplicates
1151
+ use :func:`intersectAll`.
1152
+
1153
+ .. versionadded:: 1.3.0
1154
+
1155
+ .. versionchanged:: 3.4.0
1156
+ Supports Spark Connect.
1157
+
1158
+ Parameters
1159
+ ----------
1160
+ other : :class:`DataFrame`
1161
+ Another :class:`DataFrame` that needs to be combined.
1162
+
1163
+ Returns
1164
+ -------
1165
+ :class:`DataFrame`
1166
+ Combined DataFrame.
1167
+
1168
+ Notes
1169
+ -----
1170
+ This is equivalent to `INTERSECT` in SQL.
1171
+
1172
+ Examples
1173
+ --------
1174
+ >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1175
+ >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
1176
+ >>> df1.intersect(df2).sort(df1.C1.desc()).show()
1177
+ +---+---+
1178
+ | C1| C2|
1179
+ +---+---+
1180
+ | b| 3|
1181
+ | a| 1|
1182
+ +---+---+
1183
+ """
1184
+ return self.intersectAll(other).drop_duplicates()
1185
+
1186
+ def intersectAll(self, other: "DataFrame") -> "DataFrame":
1187
+ """Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`
1188
+ and another :class:`DataFrame` while preserving duplicates.
1189
+
1190
+ This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function
1191
+ resolves columns by position (not by name).
1192
+
1193
+ .. versionadded:: 2.4.0
1194
+
1195
+ .. versionchanged:: 3.4.0
1196
+ Supports Spark Connect.
1197
+
1198
+ Parameters
1199
+ ----------
1200
+ other : :class:`DataFrame`
1201
+ Another :class:`DataFrame` that needs to be combined.
1202
+
1203
+ Returns
1204
+ -------
1205
+ :class:`DataFrame`
1206
+ Combined DataFrame.
1207
+
1208
+ Examples
1209
+ --------
1210
+ >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1211
+ >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
1212
+ >>> df1.intersectAll(df2).sort("C1", "C2").show()
1213
+ +---+---+
1214
+ | C1| C2|
1215
+ +---+---+
1216
+ | a| 1|
1217
+ | a| 1|
1218
+ | b| 3|
1219
+ +---+---+
1220
+ """
1221
+ return DataFrame(self.relation.intersect(other.relation), self.session)
1222
+
1223
+ def exceptAll(self, other: "DataFrame") -> "DataFrame":
1224
+ """Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but
1225
+ not in another :class:`DataFrame` while preserving duplicates.
1226
+
1227
+ This is equivalent to `EXCEPT ALL` in SQL.
1228
+ As standard in SQL, this function resolves columns by position (not by name).
1229
+
1230
+ .. versionadded:: 2.4.0
1231
+
1232
+ .. versionchanged:: 3.4.0
1233
+ Supports Spark Connect.
1234
+
1235
+ Parameters
1236
+ ----------
1237
+ other : :class:`DataFrame`
1238
+ The other :class:`DataFrame` to compare to.
1239
+
1240
+ Returns
1241
+ -------
1242
+ :class:`DataFrame`
1243
+
1244
+ Examples
1245
+ --------
1246
+ >>> df1 = spark.createDataFrame(
1247
+ ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])
1248
+ >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
1249
+ >>> df1.exceptAll(df2).show()
1250
+ +---+---+
1251
+ | C1| C2|
1252
+ +---+---+
1253
+ | a| 1|
1254
+ | a| 1|
1255
+ | a| 2|
1256
+ | c| 4|
1257
+ +---+---+
1258
+
1259
+ """
1260
+ return DataFrame(self.relation.except_(other.relation), self.session)
1261
+
1262
+ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
1263
+ """Return a new :class:`DataFrame` with duplicate rows removed,
1264
+ optionally only considering certain columns.
1265
+
1266
+ For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming
1267
+ :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop
1268
+ duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can
1269
+ be and the system will accordingly limit the state. In addition, data older than
1270
+ watermark will be dropped to avoid any possibility of duplicates.
1271
+
1272
+ :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.
1273
+
1274
+ Parameters
1275
+ ----------
1276
+ subset : List of column names, optional
1277
+ List of columns to use for duplicate comparison (default All columns).
1278
+
1279
+ Returns
1280
+ -------
1281
+ :class:`DataFrame`
1282
+ DataFrame without duplicates.
1283
+
1284
+ Examples
1285
+ --------
1286
+ >>> from pyspark.sql import Row
1287
+ >>> df = spark.createDataFrame([
1288
+ ... Row(name='Alice', age=5, height=80),
1289
+ ... Row(name='Alice', age=5, height=80),
1290
+ ... Row(name='Alice', age=10, height=80)
1291
+ ... ])
1292
+
1293
+ Deduplicate the same rows.
1294
+
1295
+ >>> df.dropDuplicates().show()
1296
+ +-----+---+------+
1297
+ | name|age|height|
1298
+ +-----+---+------+
1299
+ |Alice| 5| 80|
1300
+ |Alice| 10| 80|
1301
+ +-----+---+------+
1302
+
1303
+ Deduplicate values on 'name' and 'height' columns.
1304
+
1305
+ >>> df.dropDuplicates(['name', 'height']).show()
1306
+ +-----+---+------+
1307
+ | name|age|height|
1308
+ +-----+---+------+
1309
+ |Alice| 5| 80|
1310
+ +-----+---+------+
1311
+ """
1312
+ if subset:
1313
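+ # Emulate subset-based deduplication with a window function: number rows per subset partition via a uuid-suffixed temporary column and keep only the first row of each group (with no ORDER BY, which row survives per group is arbitrary).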
+ rn_col = f"tmp_col_{uuid.uuid1().hex}"
1314
+ subset_str = ', '.join([f'"{c}"' for c in subset])
1315
+ window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
1316
+ df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
1317
+ return df.filter(f"{rn_col} = 1").drop(rn_col)
1318
+
1319
+ return self.distinct()
1320
+
1321
+ drop_duplicates = dropDuplicates
1322
+
1323
+
1324
+ def distinct(self) -> "DataFrame":
1325
+ """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
1326
+
1327
+ Returns
1328
+ -------
1329
+ :class:`DataFrame`
1330
+ DataFrame with distinct records.
1331
+
1332
+ Examples
1333
+ --------
1334
+ >>> df = spark.createDataFrame(
1335
+ ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
1336
+
1337
+ Return the number of distinct rows in the :class:`DataFrame`
1338
+
1339
+ >>> df.distinct().count()
1340
+ 2
1341
+ """
1342
+ distinct_rel = self.relation.distinct()
1343
+ return DataFrame(distinct_rel, self.session)
1344
+
1345
+ def count(self) -> int:
1346
+ """Returns the number of rows in this :class:`DataFrame`.
1347
+
1348
+ Returns
1349
+ -------
1350
+ int
1351
+ Number of rows.
1352
+
1353
+ Examples
1354
+ --------
1355
+ >>> df = spark.createDataFrame(
1356
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
1357
+
1358
+ Return the number of rows in the :class:`DataFrame`.
1359
+
1360
+ >>> df.count()
1361
+ 3
1362
+ """
1363
+ count_rel = self.relation.count("*")
1364
+ return int(count_rel.fetchone()[0])
1365
+
1366
+ def _cast_types(self, *types) -> "DataFrame":
1367
+ existing_columns = self.relation.columns
1368
+ types_count = len(types)
1369
+ assert types_count == len(existing_columns)
1370
+
1371
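+ # Build one "col::TYPE AS col" cast expression per column, applying the provided types positionally.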
+ cast_expressions = [
1372
+ f"{existing}::{target_type} as {existing}"
1373
+ for existing, target_type in zip(existing_columns, types)
1374
+ ]
1375
+ cast_expressions = ", ".join(cast_expressions)
1376
+ new_rel = self.relation.project(cast_expressions)
1377
+ return DataFrame(new_rel, self.session)
1378
+
1379
+ def toDF(self, *cols) -> "DataFrame":
1380
+ existing_columns = self.relation.columns
1381
+ column_count = len(cols)
1382
+ if column_count != len(existing_columns):
1383
+ raise PySparkValueError(
1384
+ message="Provided column names and number of columns in the DataFrame don't match"
1385
+ )
1386
+
1387
+ existing_columns = [ColumnExpression(x) for x in existing_columns]
1388
+ projections = [
1389
+ existing.alias(new) for existing, new in zip(existing_columns, cols)
1390
+ ]
1391
+ new_rel = self.relation.project(*projections)
1392
+ return DataFrame(new_rel, self.session)
1393
+
1394
+ def collect(self) -> List[Row]:
1395
+ columns = self.relation.columns
1396
+ result = self.relation.fetchall()
1397
+
1398
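+ # Row is a tuple subclass; build it via tuple.__new__ and attach the column names as __fields__ so that name-based access works on the result.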
+ def construct_row(values, names) -> Row:
1399
+ row = tuple.__new__(Row, list(values))
1400
+ row.__fields__ = list(names)
1401
+ return row
1402
+
1403
+ rows = [construct_row(x, columns) for x in result]
1404
+ return rows
1405
+
1406
+ def cache(self) -> "DataFrame":
1407
+ """Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).
1408
+
1409
+ .. versionadded:: 1.3.0
1410
+
1411
+ .. versionchanged:: 3.4.0
1412
+ Supports Spark Connect.
1413
+
1414
+ Notes
1415
+ -----
1416
+ The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.
1417
+
1418
+ Returns
1419
+ -------
1420
+ :class:`DataFrame`
1421
+ Cached DataFrame.
1422
+
1423
+ Examples
1424
+ --------
1425
+ >>> df = spark.range(1)
1426
+ >>> df.cache()
1427
+ DataFrame[id: bigint]
1428
+
1429
+ >>> df.explain()
1430
+ == Physical Plan ==
1431
+ InMemoryTableScan ...
1432
+ """
1433
+ cached_relation = self.relation.execute()
1434
+ return DataFrame(cached_relation, self.session)
1435
+
1436
+
1437
+ __all__ = ["DataFrame"]
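
A minimal usage sketch of the DataFrame wrapper defined above, assuming the PySpark-style entry points provided elsewhere in this wheel (the import paths and builder usage are inferred from the package layout in the file list, not from this diff):

    from duckdb.experimental.spark.sql import SparkSession
    from duckdb.experimental.spark.sql.functions import col

    spark = SparkSession.builder.getOrCreate()   # typically backed by an in-memory DuckDB connection
    df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
    df = df.withColumn("age_plus_one", col("age") + 1)   # rebuilds the projection list, as implemented above
    print(df.filter(col("age") > 3).collect())           # roughly [Row(age=5, name='Bob', age_plus_one=6)]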