duckdb 1.4.1.dev135-cp311-cp311-macosx_10_9_universal2.whl → 1.5.0.dev44-cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of duckdb might be problematic.
Files changed (57)
  1. _duckdb.cpython-311-darwin.so +0 -0
  2. duckdb/__init__.py +435 -341
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +9 -12
  5. duckdb/experimental/__init__.py +1 -2
  6. duckdb/experimental/spark/__init__.py +4 -3
  7. duckdb/experimental/spark/_globals.py +8 -8
  8. duckdb/experimental/spark/_typing.py +9 -7
  9. duckdb/experimental/spark/conf.py +15 -16
  10. duckdb/experimental/spark/context.py +44 -60
  11. duckdb/experimental/spark/errors/__init__.py +35 -33
  12. duckdb/experimental/spark/errors/error_classes.py +1 -1
  13. duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
  14. duckdb/experimental/spark/errors/exceptions/base.py +88 -39
  15. duckdb/experimental/spark/errors/utils.py +16 -11
  16. duckdb/experimental/spark/exception.py +6 -9
  17. duckdb/experimental/spark/sql/__init__.py +5 -5
  18. duckdb/experimental/spark/sql/_typing.py +15 -8
  19. duckdb/experimental/spark/sql/catalog.py +20 -21
  20. duckdb/experimental/spark/sql/column.py +55 -48
  21. duckdb/experimental/spark/sql/conf.py +8 -9
  22. duckdb/experimental/spark/sql/dataframe.py +233 -185
  23. duckdb/experimental/spark/sql/functions.py +1248 -1222
  24. duckdb/experimental/spark/sql/group.py +52 -56
  25. duckdb/experimental/spark/sql/readwriter.py +94 -80
  26. duckdb/experimental/spark/sql/session.py +59 -64
  27. duckdb/experimental/spark/sql/streaming.py +10 -9
  28. duckdb/experimental/spark/sql/type_utils.py +65 -67
  29. duckdb/experimental/spark/sql/types.py +345 -309
  30. duckdb/experimental/spark/sql/udf.py +6 -6
  31. duckdb/filesystem.py +16 -26
  32. duckdb/functional/__init__.py +16 -12
  33. duckdb/functional/__init__.pyi +31 -0
  34. duckdb/polars_io.py +82 -124
  35. duckdb/query_graph/__main__.py +96 -91
  36. duckdb/typing/__init__.py +8 -18
  37. duckdb/typing/__init__.pyi +36 -0
  38. duckdb/udf.py +5 -10
  39. duckdb/value/__init__.py +0 -1
  40. duckdb/value/constant/__init__.py +60 -62
  41. duckdb/value/constant/__init__.pyi +115 -0
  42. duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
  43. duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
  44. _duckdb-stubs/__init__.pyi +0 -1443
  45. _duckdb-stubs/_func.pyi +0 -46
  46. _duckdb-stubs/_sqltypes.pyi +0 -75
  47. adbc_driver_duckdb/__init__.py +0 -50
  48. adbc_driver_duckdb/dbapi.py +0 -115
  49. duckdb/_dbapi_type_object.py +0 -231
  50. duckdb/_version.py +0 -22
  51. duckdb/func/__init__.py +0 -3
  52. duckdb/sqltypes/__init__.py +0 -63
  53. duckdb-1.4.1.dev135.dist-info/METADATA +0 -326
  54. duckdb-1.4.1.dev135.dist-info/RECORD +0 -52
  55. /duckdb/{py.typed → value/__init__.pyi} +0 -0
  56. {duckdb-1.4.1.dev135.dist-info → duckdb-1.5.0.dev44.dist-info}/WHEEL +0 -0
  57. {duckdb-1.4.1.dev135.dist-info → duckdb-1.5.0.dev44.dist-info}/licenses/LICENSE +0 -0
@@ -1,20 +1,24 @@
- import uuid # noqa: D100
  from functools import reduce
- from keyword import iskeyword
  from typing import (
  TYPE_CHECKING,
  Any,
  Callable,
+ List,
+ Dict,
  Optional,
+ Tuple,
  Union,
  cast,
  overload,
  )
+ import uuid
+ from keyword import iskeyword

  import duckdb
  from duckdb import ColumnExpression, Expression, StarExpression

- from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
+ from ._typing import ColumnOrName
+ from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
  from ..exception import ContributionsAcceptedError
  from .column import Column
  from .readwriter import DataFrameWriter
@@ -25,42 +29,43 @@ if TYPE_CHECKING:
  import pyarrow as pa
  from pandas.core.frame import DataFrame as PandasDataFrame

- from ._typing import ColumnOrName
- from .group import GroupedData
+ from .group import GroupedData, Grouping
  from .session import SparkSession

- from duckdb.experimental.spark.sql import functions as spark_sql_functions
+ from ..errors import PySparkValueError
+ from .functions import _to_column_expr, col, lit


- class DataFrame: # noqa: D101
- def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None: # noqa: D107
+ class DataFrame:
+ def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
  self.relation = relation
  self.session = session
  self._schema = None
  if self.relation is not None:
  self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)

- def show(self, **kwargs) -> None: # noqa: D102
+ def show(self, **kwargs) -> None:
  self.relation.show()

- def toPandas(self) -> "PandasDataFrame": # noqa: D102
+ def toPandas(self) -> "PandasDataFrame":
  return self.relation.df()

  def toArrow(self) -> "pa.Table":
- """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
+ """
+ Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.

  This is only available if PyArrow is installed and available.

  .. versionadded:: 4.0.0

- Notes:
+ Notes
  -----
  This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
  expected to be small, as all the data is loaded into the driver's memory.

  This API is a developer API.

- Examples:
+ Examples
  --------
  >>> df.toArrow() # doctest: +SKIP
  pyarrow.Table
@@ -83,7 +88,7 @@ class DataFrame: # noqa: D101
  name : str
  Name of the view.

- Examples:
+ Examples
  --------
  Create a local temporary view named 'people'.

@@ -103,13 +108,12 @@ class DataFrame: # noqa: D101
  """
  self.relation.create_view(name, True)

- def createGlobalTempView(self, name: str) -> None: # noqa: D102
+ def createGlobalTempView(self, name: str) -> None:
  raise NotImplementedError

- def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame": # noqa: D102
+ def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
  if columnName not in self.relation:
- msg = f"DataFrame does not contain a column named {columnName}"
- raise ValueError(msg)
+ raise ValueError(f"DataFrame does not contain a column named {columnName}")
  cols = []
  for x in self.relation.columns:
  col = ColumnExpression(x)
@@ -119,7 +123,7 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumn(self, columnName: str, col: Column) -> "DataFrame": # noqa: D102
+ def withColumn(self, columnName: str, col: Column) -> "DataFrame":
  if not isinstance(col, Column):
  raise PySparkTypeError(
  error_class="NOT_COLUMN",
@@ -139,8 +143,9 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
- """Returns a new :class:`DataFrame` by adding multiple columns or replacing the
+ def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame":
+ """
+ Returns a new :class:`DataFrame` by adding multiple columns or replacing the
  existing columns that have the same names.

  The colsMap is a map of column name and column, the column must only refer to attributes
@@ -157,22 +162,22 @@ class DataFrame: # noqa: D101
  colsMap : dict
  a dict of column name and :class:`Column`. Currently, only a single map is supported.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with new or replaced columns.

- Examples:
+ Examples
  --------
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
- >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
+ >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
  +---+-----+----+----+
  |age| name|age2|age3|
  +---+-----+----+----+
  | 2|Alice| 4| 5|
  | 5| Bob| 7| 8|
  +---+-----+----+----+
- """ # noqa: D205
+ """
  # Below code is to help enable kwargs in future.
  assert len(colsMap) == 1
  colsMap = colsMap[0] # type: ignore[assignment]
@@ -213,8 +218,9 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
- """Returns a new :class:`DataFrame` by renaming multiple columns.
+ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
+ """
+ Returns a new :class:`DataFrame` by renaming multiple columns.
  This is a no-op if the schema doesn't contain the given column names.

  .. versionadded:: 3.4.0
@@ -226,31 +232,31 @@ class DataFrame: # noqa: D101
  a dict of existing column names and corresponding desired column names.
  Currently, only a single map is supported.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with renamed columns.

- See Also:
+ See Also
  --------
  :meth:`withColumnRenamed`

- Notes:
+ Notes
  -----
  Support Spark Connect

- Examples:
+ Examples
  --------
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
- >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
- >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
+ >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
+ >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
  +---+-----+----+----+
  |age| name|age4|age5|
  +---+-----+----+----+
  | 2|Alice| 4| 5|
  | 5| Bob| 7| 8|
  +---+-----+----+----+
- """ # noqa: D205
+ """
  if not isinstance(colsMap, dict):
  raise PySparkTypeError(
  error_class="NOT_DICT",
@@ -259,8 +265,9 @@ class DataFrame: # noqa: D101

  unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
  if unknown_columns:
- msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
- raise ValueError(msg)
+ raise ValueError(
+ f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
+ )

  # Compute this only once
  old_column_names = list(colsMap.keys())
@@ -282,7 +289,11 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": # noqa: ANN401
+
+
+ def transform(
+ self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.

  .. versionadded:: 3.0.0
@@ -303,19 +314,21 @@ class DataFrame: # noqa: D101

  .. versionadded:: 3.3.0

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Transformed DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import col
  >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
  >>> def cast_all_to_int(input_df):
  ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
+ ...
  >>> def sort_columns_asc(input_df):
  ... return input_df.select(*sorted(input_df.columns))
+ ...
  >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
  +-----+---+
  |float|int|
@@ -325,9 +338,8 @@ class DataFrame: # noqa: D101
  +-----+---+

  >>> def add_n(input_df, n):
- ... return input_df.select(
- ... [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
- ... )
+ ... return input_df.select([(col(col_name) + n).alias(col_name)
+ ... for col_name in input_df.columns])
  >>> df.transform(add_n, 1).transform(add_n, n=10).show()
  +---+-----+
  |int|float|
@@ -338,11 +350,14 @@ class DataFrame: # noqa: D101
  """
  result = func(self, *args, **kwargs)
  assert isinstance(result, DataFrame), (
- f"Func returned an instance of type [{type(result)}], should have been DataFrame."
+ "Func returned an instance of type [%s], "
+ "should have been DataFrame." % type(result)
  )
  return result

- def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame": # noqa: ANN401
+ def sort(
+ self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame` sorted by the specified column(s).

  Parameters
@@ -357,15 +372,16 @@ class DataFrame: # noqa: D101
  Sort ascending vs. descending. Specify list for multiple sort orders.
  If a list is specified, the length of the list must equal the length of the `cols`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Sorted DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import desc, asc
- >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])

  Sort the DataFrame in ascending order.

@@ -403,9 +419,8 @@ class DataFrame: # noqa: D101

  Specify multiple columns

- >>> df = spark.createDataFrame(
- ... [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
- ... )
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
  >>> df.orderBy(desc("age"), "name").show()
  +---+-----+
  |age| name|
@@ -438,7 +453,7 @@ class DataFrame: # noqa: D101
  for c in cols:
  _c = c
  if isinstance(c, str):
- _c = spark_sql_functions.col(c)
+ _c = col(c)
  elif isinstance(c, int) and not isinstance(c, bool):
  # ordinal is 1-based
  if c > 0:
@@ -466,13 +481,13 @@ class DataFrame: # noqa: D101
  message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
  )

- columns = [spark_sql_functions._to_column_expr(c) for c in columns]
+ columns = [_to_column_expr(c) for c in columns]
  rel = self.relation.sort(*columns)
  return DataFrame(rel, self.session)

  orderBy = sort

- def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]: # noqa: D102
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
  if n is None:
  rs = self.head(1)
  return rs[0] if rs else None
@@ -480,7 +495,7 @@ class DataFrame: # noqa: D101

  first = head

- def take(self, num: int) -> list[Row]: # noqa: D102
+ def take(self, num: int) -> List[Row]:
  return self.limit(num).collect()

  def filter(self, condition: "ColumnOrName") -> "DataFrame":
@@ -494,14 +509,15 @@ class DataFrame: # noqa: D101
  a :class:`Column` of :class:`types.BooleanType`
  or a string of SQL expressions.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Filtered DataFrame.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])

  Filter by :class:`Column` instances.

@@ -547,34 +563,38 @@ class DataFrame: # noqa: D101

  where = filter

- def select(self, *cols) -> "DataFrame": # noqa: D102
+ def select(self, *cols) -> "DataFrame":
  cols = list(cols)
  if len(cols) == 1:
  cols = cols[0]
  if isinstance(cols, list):
- projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
+ projections = [
+ x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
+ ]
  else:
- projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]
+ projections = [
+ cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
+ ]
  rel = self.relation.select(*projections)
  return DataFrame(rel, self.session)

  @property
- def columns(self) -> list[str]:
+ def columns(self) -> List[str]:
  """Returns all column names as a list.

- Examples:
+ Examples
  --------
  >>> df.columns
  ['age', 'name']
  """
  return [f.name for f in self.schema.fields]

- def _ipython_key_completions_(self) -> list[str]:
+ def _ipython_key_completions_(self) -> List[str]:
  # Provides tab-completion for column names in PySpark DataFrame
  # when accessed in bracket notation, e.g. df['<TAB>]
  return self.columns

- def __dir__(self) -> list[str]: # noqa: D105
+ def __dir__(self) -> List[str]:
  out = set(super().__dir__())
  out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
  return sorted(out)
@@ -582,7 +602,7 @@ class DataFrame: # noqa: D101
  def join(
  self,
  other: "DataFrame",
- on: Optional[Union[str, list[str], Column, list[Column]]] = None,
+ on: Optional[Union[str, List[str], Column, List[Column]]] = None,
  how: Optional[str] = None,
  ) -> "DataFrame":
  """Joins with another :class:`DataFrame`, using the given join expression.
@@ -602,12 +622,12 @@ class DataFrame: # noqa: D101
  ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
  ``anti``, ``leftanti`` and ``left_anti``.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Joined DataFrame.

- Examples:
+ Examples
  --------
  The following performs a full outer join between ``df1`` and ``df2``.

@@ -616,24 +636,22 @@ class DataFrame: # noqa: D101
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
  >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
  >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
- >>> df4 = spark.createDataFrame(
- ... [
- ... Row(age=10, height=80, name="Alice"),
- ... Row(age=5, height=None, name="Bob"),
- ... Row(age=None, height=None, name="Tom"),
- ... Row(age=None, height=None, name=None),
- ... ]
- ... )
+ >>> df4 = spark.createDataFrame([
+ ... Row(age=10, height=80, name="Alice"),
+ ... Row(age=5, height=None, name="Bob"),
+ ... Row(age=None, height=None, name="Tom"),
+ ... Row(age=None, height=None, name=None),
+ ... ])

  Inner join on columns (default)

- >>> df.join(df2, "name").select(df.name, df2.height).show()
+ >>> df.join(df2, 'name').select(df.name, df2.height).show()
  +----+------+
  |name|height|
  +----+------+
  | Bob| 85|
  +----+------+
- >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()
+ >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
  +----+---+
  |name|age|
  +----+---+
@@ -642,9 +660,8 @@ class DataFrame: # noqa: D101

  Outer join for both DataFrames on the 'name' column.

- >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
- ... desc("name")
- ... ).show()
+ >>> df.join(df2, df.name == df2.name, 'outer').select(
+ ... df.name, df2.height).sort(desc("name")).show()
  +-----+------+
  | name|height|
  +-----+------+
@@ -652,7 +669,7 @@ class DataFrame: # noqa: D101
  |Alice| NULL|
  | NULL| 80|
  +-----+------+
- >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()
+ >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
  +-----+------+
  | name|height|
  +-----+------+
@@ -663,9 +680,11 @@ class DataFrame: # noqa: D101

  Outer join for both DataFrams with multiple columns.

- >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
- ... df.name, df3.age
- ... ).show()
+ >>> df.join(
+ ... df3,
+ ... [df.name == df3.name, df.age == df3.age],
+ ... 'outer'
+ ... ).select(df.name, df3.age).show()
  +-----+---+
  | name|age|
  +-----+---+
@@ -673,16 +692,20 @@ class DataFrame: # noqa: D101
  | Bob| 5|
  +-----+---+
  """
+
  if on is not None and not isinstance(on, list):
  on = [on] # type: ignore[assignment]
- if on is not None and not all(isinstance(x, str) for x in on):
+ if on is not None and not all([isinstance(x, str) for x in on]):
  assert isinstance(on, list)
  # Get (or create) the Expressions from the list of Columns
- on = [spark_sql_functions._to_column_expr(x) for x in on]
+ on = [_to_column_expr(x) for x in on]

  # & all the Expressions together to form one Expression
- assert isinstance(on[0], Expression), "on should be Column or list of Column"
- on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))
+ assert isinstance(
+ on[0], Expression
+ ), "on should be Column or list of Column"
+ on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
+

  if on is None and how is None:
  result = self.relation.join(other.relation)
@@ -691,14 +714,14 @@ class DataFrame: # noqa: D101
  how = "inner"
  if on is None:
  on = "true"
- elif isinstance(on, list) and all(isinstance(x, str) for x in on):
+ elif isinstance(on, list) and all([isinstance(x, str) for x in on]):
  # Passed directly through as a list of strings
  on = on
  else:
  on = str(on)
  assert isinstance(how, str), "how should be a string"

- def map_to_recognized_jointype(how: str) -> str:
+ def map_to_recognized_jointype(how):
  known_aliases = {
  "inner": [],
  "outer": ["full", "fullouter", "full_outer"],
@@ -707,10 +730,15 @@ class DataFrame: # noqa: D101
  "anti": ["leftanti", "left_anti"],
  "semi": ["leftsemi", "left_semi"],
  }
+ mapped_type = None
  for type, aliases in known_aliases.items():
  if how == type or how in aliases:
- return type
- return how
+ mapped_type = type
+ break
+
+ if not mapped_type:
+ mapped_type = how
+ return mapped_type

  how = map_to_recognized_jointype(how)
  result = self.relation.join(other.relation, on, how)
@@ -729,16 +757,18 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Right side of the cartesian product.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Joined DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql import Row
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
- >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df2 = spark.createDataFrame(
+ ... [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
  >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
  +---+-----+------+
  |age| name|height|
@@ -761,21 +791,21 @@ class DataFrame: # noqa: D101
  alias : str
  an alias name to be set for the :class:`DataFrame`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Aliased DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import col, desc
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
  >>> df_as1 = df.alias("df_as1")
  >>> df_as2 = df.alias("df_as2")
- >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
- >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
- ... desc("df_as1.name")
- ... ).show()
+ >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
+ >>> joined_df.select(
+ ... "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
  +-----+-----+---+
  | name| name|age|
  +-----+-----+---+
@@ -787,7 +817,7 @@ class DataFrame: # noqa: D101
  assert isinstance(alias, str), "alias should be a string"
  return DataFrame(self.relation.set_alias(alias), self.session)

- def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] # noqa: D102
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc]
  exclude = []
  for col in cols:
  if isinstance(col, str):
@@ -804,7 +834,7 @@ class DataFrame: # noqa: D101
  expr = StarExpression(exclude=exclude)
  return DataFrame(self.relation.select(expr), self.session)

- def __repr__(self) -> str: # noqa: D105
+ def __repr__(self) -> str:
  return str(self.relation)

  def limit(self, num: int) -> "DataFrame":
@@ -816,14 +846,15 @@ class DataFrame: # noqa: D101
  Number of records to return. Will return this number of records
  or all records if the DataFrame contains less than this number of records.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Subset of the records

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
  >>> df.limit(1).show()
  +---+----+
  |age|name|
@@ -839,15 +870,17 @@ class DataFrame: # noqa: D101
  rel = self.relation.limit(num)
  return DataFrame(rel, self.session)

- def __contains__(self, item: str) -> bool:
- """Check if the :class:`DataFrame` contains a column by the name of `item`."""
+ def __contains__(self, item: str):
+ """
+ Check if the :class:`DataFrame` contains a column by the name of `item`
+ """
  return item in self.relation

  @property
  def schema(self) -> StructType:
  """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.

- Examples:
+ Examples
  --------
  >>> df.schema
  StructType([StructField('age', IntegerType(), True),
@@ -856,21 +889,25 @@ class DataFrame: # noqa: D101
  return self._schema

  @overload
- def __getitem__(self, item: Union[int, str]) -> Column: ...
+ def __getitem__(self, item: Union[int, str]) -> Column:
+ ...

  @overload
- def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...
+ def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
+ ...

- def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
+ def __getitem__(
+ self, item: Union[int, str, Column, List, Tuple]
+ ) -> Union[Column, "DataFrame"]:
  """Returns the column as a :class:`Column`.

- Examples:
+ Examples
  --------
- >>> df.select(df["age"]).collect()
+ >>> df.select(df['age']).collect()
  [Row(age=2), Row(age=5)]
- >>> df[["name", "age"]].collect()
+ >>> df[ ["name", "age"]].collect()
  [Row(name='Alice', age=2), Row(name='Bob', age=5)]
- >>> df[df.age > 3].collect()
+ >>> df[ df.age > 3 ].collect()
  [Row(age=5, name='Bob')]
  >>> df[df[0] > 3].collect()
  [Row(age=5, name='Bob')]
@@ -882,29 +919,31 @@ class DataFrame: # noqa: D101
  elif isinstance(item, (list, tuple)):
  return self.select(*item)
  elif isinstance(item, int):
- return spark_sql_functions.col(self._schema[item].name)
+ return col(self._schema[item].name)
  else:
- msg = f"Unexpected item type: {type(item)}"
- raise TypeError(msg)
+ raise TypeError(f"Unexpected item type: {type(item)}")

  def __getattr__(self, name: str) -> Column:
  """Returns the :class:`Column` denoted by ``name``.

- Examples:
+ Examples
  --------
  >>> df.select(df.age).collect()
  [Row(age=2), Row(age=5)]
  """
  if name not in self.relation.columns:
- msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
- raise AttributeError(msg)
+ raise AttributeError(
+ "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
+ )
  return Column(duckdb.ColumnExpression(self.relation.alias, name))

  @overload
- def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
+ ...

  @overload
- def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ... # noqa: PYI063
+ def groupBy(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
+ ...

  def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
  """Groups the :class:`DataFrame` using the specified columns,
@@ -920,16 +959,15 @@ class DataFrame: # noqa: D101
  Each element should be a column name (string) or an expression (:class:`Column`)
  or list of them.

- Returns:
+ Returns
  -------
  :class:`GroupedData`
  Grouped data by given columns.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame(
- ... [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
- ... )
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])

  Empty grouping columns triggers a global aggregation.

@@ -970,19 +1008,22 @@ class DataFrame: # noqa: D101
  | Bob| 2| 2|
  | Bob| 5| 1|
  +-----+---+-----+
- """ # noqa: D205
+ """
  from .group import GroupedData, Grouping

- columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols
+ if len(cols) == 1 and isinstance(cols[0], list):
+ columns = cols[0]
+ else:
+ columns = cols
  return GroupedData(Grouping(*columns), self)

  groupby = groupBy

  @property
- def write(self) -> DataFrameWriter: # noqa: D102
+ def write(self) -> DataFrameWriter:
  return DataFrameWriter(self)

- def printSchema(self) -> None: # noqa: D102
+ def printSchema(self):
  raise ContributionsAcceptedError

  def union(self, other: "DataFrame") -> "DataFrame":
@@ -994,22 +1035,22 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be unioned

- Returns:
+ Returns
  -------
  :class:`DataFrame`

- See Also:
+ See Also
  --------
  DataFrame.unionAll

- Notes:
+ Notes
  -----
  This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
  (that does deduplication of elements), use this function followed by :func:`distinct`.

  Also as standard in SQL, this function resolves columns by position (not by name).

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
  >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
@@ -1027,12 +1068,14 @@ class DataFrame: # noqa: D101
  | 1| 2| 3|
  | 1| 2| 3|
  +----+----+----+
- """ # noqa: D205
+ """
  return DataFrame(self.relation.union(other.relation), self.session)

  unionAll = union

- def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
+ def unionByName(
+ self, other: "DataFrame", allowMissingColumns: bool = False
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame` containing union of rows in this and another
  :class:`DataFrame`.

@@ -1053,12 +1096,12 @@ class DataFrame: # noqa: D101

  .. versionadded:: 3.1.0

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Examples:
+ Examples
  --------
  The difference between this function and :func:`union` is that this function
  resolves columns by name (not by position):
@@ -1087,14 +1130,14 @@ class DataFrame: # noqa: D101
  | 1| 2| 3|NULL|
  |NULL| 4| 5| 6|
  +----+----+----+----+
- """ # noqa: D205
+ """
  if allowMissingColumns:
  cols = []
  for col in self.relation.columns:
  if col in other.relation.columns:
  cols.append(col)
  else:
- cols.append(spark_sql_functions.lit(None))
+ cols.append(lit(None))
  other = other.select(*cols)
  else:
  other = other.select(*self.relation.columns)
@@ -1117,16 +1160,16 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be combined.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Notes:
+ Notes
  -----
  This is equivalent to `INTERSECT` in SQL.

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1137,7 +1180,7 @@ class DataFrame: # noqa: D101
  | b| 3|
  | a| 1|
  +---+---+
- """ # noqa: D205
+ """
  return self.intersectAll(other).drop_duplicates()

  def intersectAll(self, other: "DataFrame") -> "DataFrame":
@@ -1157,12 +1200,12 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be combined.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1174,7 +1217,7 @@ class DataFrame: # noqa: D101
  | a| 1|
  | b| 3|
  +---+---+
- """ # noqa: D205
+ """
  return DataFrame(self.relation.intersect(other.relation), self.session)

  def exceptAll(self, other: "DataFrame") -> "DataFrame":
@@ -1194,15 +1237,14 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  The other :class:`DataFrame` to compare to.

- Returns:
+ Returns
  -------
  :class:`DataFrame`

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame(
- ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
- ... )
+ ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
  >>> df1.exceptAll(df2).show()
  +---+---+
@@ -1214,10 +1256,10 @@ class DataFrame: # noqa: D101
  | c| 4|
  +---+---+

- """ # noqa: D205
+ """
  return DataFrame(self.relation.except_(other.relation), self.session)

- def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":
+ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
  """Return a new :class:`DataFrame` with duplicate rows removed,
  optionally only considering certain columns.

@@ -1234,21 +1276,19 @@ class DataFrame: # noqa: D101
  subset : List of column names, optional
  List of columns to use for duplicate comparison (default All columns).

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame without duplicates.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql import Row
- >>> df = spark.createDataFrame(
- ... [
- ... Row(name="Alice", age=5, height=80),
- ... Row(name="Alice", age=5, height=80),
- ... Row(name="Alice", age=10, height=80),
- ... ]
- ... )
+ >>> df = spark.createDataFrame([
+ ... Row(name='Alice', age=5, height=80),
+ ... Row(name='Alice', age=5, height=80),
+ ... Row(name='Alice', age=10, height=80)
+ ... ])

  Deduplicate the same rows.

@@ -1262,16 +1302,16 @@ class DataFrame: # noqa: D101

  Deduplicate values on 'name' and 'height' columns.

- >>> df.dropDuplicates(["name", "height"]).show()
+ >>> df.dropDuplicates(['name', 'height']).show()
  +-----+---+------+
  | name|age|height|
  +-----+---+------+
  |Alice| 5| 80|
  +-----+---+------+
- """ # noqa: D205
+ """
  if subset:
  rn_col = f"tmp_col_{uuid.uuid1().hex}"
- subset_str = ", ".join([f'"{c}"' for c in subset])
+ subset_str = ', '.join([f'"{c}"' for c in subset])
  window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
  df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
  return df.filter(f"{rn_col} = 1").drop(rn_col)
@@ -1280,17 +1320,19 @@ class DataFrame: # noqa: D101

  drop_duplicates = dropDuplicates

+
  def distinct(self) -> "DataFrame":
  """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with distinct records.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

  Return the number of distinct rows in the :class:`DataFrame`

@@ -1303,14 +1345,15 @@ class DataFrame: # noqa: D101
  def count(self) -> int:
  """Returns the number of rows in this :class:`DataFrame`.

- Returns:
+ Returns
  -------
  int
  Number of rows.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

  Return the number of rows in the :class:`DataFrame`.

@@ -1326,28 +1369,33 @@ class DataFrame: # noqa: D101
  assert types_count == len(existing_columns)

  cast_expressions = [
- f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
+ f"{existing}::{target_type} as {existing}"
+ for existing, target_type in zip(existing_columns, types)
  ]
  cast_expressions = ", ".join(cast_expressions)
  new_rel = self.relation.project(cast_expressions)
  return DataFrame(new_rel, self.session)

- def toDF(self, *cols) -> "DataFrame": # noqa: D102
+ def toDF(self, *cols) -> "DataFrame":
  existing_columns = self.relation.columns
  column_count = len(cols)
  if column_count != len(existing_columns):
- raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")
+ raise PySparkValueError(
+ message="Provided column names and number of columns in the DataFrame don't match"
+ )

  existing_columns = [ColumnExpression(x) for x in existing_columns]
- projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]
+ projections = [
+ existing.alias(new) for existing, new in zip(existing_columns, cols)
+ ]
  new_rel = self.relation.project(*projections)
  return DataFrame(new_rel, self.session)

- def collect(self) -> list[Row]: # noqa: D102
+ def collect(self) -> List[Row]:
  columns = self.relation.columns
  result = self.relation.fetchall()

- def construct_row(values: list, names: list[str]) -> Row:
+ def construct_row(values, names) -> Row:
  row = tuple.__new__(Row, list(values))
  row.__fields__ = list(names)
  return row
@@ -1363,16 +1411,16 @@ class DataFrame: # noqa: D101
  .. versionchanged:: 3.4.0
  Supports Spark Connect.

- Notes:
+ Notes
  -----
  The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Cached DataFrame.

- Examples:
+ Examples
  --------
  >>> df = spark.range(1)
  >>> df.cache()