duckdb 1.5.0.dev44-cp313-cp313-win_amd64.whl → 1.5.0.dev94-cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckdb might be problematic.

Files changed (56)
  1. _duckdb-stubs/__init__.pyi +1443 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cp313-win_amd64.pyd +0 -0
  5. adbc_driver_duckdb/__init__.py +49 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +341 -435
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +12 -9
  11. duckdb/experimental/__init__.py +2 -1
  12. duckdb/experimental/spark/__init__.py +3 -4
  13. duckdb/experimental/spark/_globals.py +8 -8
  14. duckdb/experimental/spark/_typing.py +7 -9
  15. duckdb/experimental/spark/conf.py +16 -15
  16. duckdb/experimental/spark/context.py +60 -44
  17. duckdb/experimental/spark/errors/__init__.py +33 -35
  18. duckdb/experimental/spark/errors/error_classes.py +1 -1
  19. duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
  20. duckdb/experimental/spark/errors/exceptions/base.py +39 -88
  21. duckdb/experimental/spark/errors/utils.py +11 -16
  22. duckdb/experimental/spark/exception.py +9 -6
  23. duckdb/experimental/spark/sql/__init__.py +5 -5
  24. duckdb/experimental/spark/sql/_typing.py +8 -15
  25. duckdb/experimental/spark/sql/catalog.py +21 -20
  26. duckdb/experimental/spark/sql/column.py +48 -55
  27. duckdb/experimental/spark/sql/conf.py +9 -8
  28. duckdb/experimental/spark/sql/dataframe.py +185 -233
  29. duckdb/experimental/spark/sql/functions.py +1222 -1248
  30. duckdb/experimental/spark/sql/group.py +56 -52
  31. duckdb/experimental/spark/sql/readwriter.py +80 -94
  32. duckdb/experimental/spark/sql/session.py +64 -59
  33. duckdb/experimental/spark/sql/streaming.py +9 -10
  34. duckdb/experimental/spark/sql/type_utils.py +67 -65
  35. duckdb/experimental/spark/sql/types.py +309 -345
  36. duckdb/experimental/spark/sql/udf.py +6 -6
  37. duckdb/filesystem.py +26 -16
  38. duckdb/func/__init__.py +3 -0
  39. duckdb/functional/__init__.py +12 -16
  40. duckdb/polars_io.py +130 -83
  41. duckdb/query_graph/__main__.py +91 -96
  42. duckdb/sqltypes/__init__.py +63 -0
  43. duckdb/typing/__init__.py +18 -8
  44. duckdb/udf.py +10 -5
  45. duckdb/value/__init__.py +1 -0
  46. duckdb/value/constant/__init__.py +62 -60
  47. {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/METADATA +12 -4
  48. duckdb-1.5.0.dev94.dist-info/RECORD +52 -0
  49. duckdb/__init__.pyi +0 -713
  50. duckdb/functional/__init__.pyi +0 -31
  51. duckdb/typing/__init__.pyi +0 -36
  52. duckdb/value/constant/__init__.pyi +0 -115
  53. duckdb-1.5.0.dev44.dist-info/RECORD +0 -47
  54. /duckdb/{value/__init__.pyi → py.typed} +0 -0
  55. {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/WHEEL +0 -0
  56. {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/licenses/LICENSE +0 -0
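
Two packaging changes stand out in the list above: the type stubs for the compiled `_duckdb` extension now live in a dedicated `_duckdb-stubs` package (the inline `duckdb/*.pyi` files are removed and a `py.typed` marker is added), and a small `adbc_driver_duckdb` module with a `dbapi.py` shim is now bundled. As a rough illustration only, the sketch below shows how such a DBAPI shim is typically used; it assumes `adbc_driver_duckdb.dbapi` exposes the standard PEP 249 `connect()` factory like other ADBC driver packages, which this diff does not itself confirm.

    # Sketch only: assumes the bundled adbc_driver_duckdb.dbapi module follows the
    # usual ADBC pattern of exposing a PEP 249-style connect() factory; the exact
    # signature and defaults are assumptions, not confirmed by this diff.
    import adbc_driver_duckdb.dbapi

    conn = adbc_driver_duckdb.dbapi.connect()  # assumed to default to an in-memory database
    try:
        cur = conn.cursor()
        cur.execute("SELECT 42 AS answer")     # plain PEP 249 cursor usage
        print(cur.fetchall())                  # -> [(42,)]
        cur.close()
    finally:
        conn.close()

The stubs split matches the PEP 561 convention of shipping annotations for a binary module as a separate `*-stubs` distribution, while the pure-Python `duckdb` package is marked inline-typed via `py.typed`.
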
@@ -1,24 +1,20 @@
1
+ import uuid # noqa: D100
1
2
  from functools import reduce
3
+ from keyword import iskeyword
2
4
  from typing import (
3
5
  TYPE_CHECKING,
4
6
  Any,
5
7
  Callable,
6
- List,
7
- Dict,
8
8
  Optional,
9
- Tuple,
10
9
  Union,
11
10
  cast,
12
11
  overload,
13
12
  )
14
- import uuid
15
- from keyword import iskeyword
16
13
 
17
14
  import duckdb
18
15
  from duckdb import ColumnExpression, Expression, StarExpression
19
16
 
20
- from ._typing import ColumnOrName
21
- from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
17
+ from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
22
18
  from ..exception import ContributionsAcceptedError
23
19
  from .column import Column
24
20
  from .readwriter import DataFrameWriter
@@ -29,43 +25,42 @@ if TYPE_CHECKING:
29
25
  import pyarrow as pa
30
26
  from pandas.core.frame import DataFrame as PandasDataFrame
31
27
 
32
- from .group import GroupedData, Grouping
28
+ from ._typing import ColumnOrName
29
+ from .group import GroupedData
33
30
  from .session import SparkSession
34
31
 
35
- from ..errors import PySparkValueError
36
- from .functions import _to_column_expr, col, lit
32
+ from duckdb.experimental.spark.sql import functions as spark_sql_functions
37
33
 
38
34
 
39
- class DataFrame:
40
- def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
35
+ class DataFrame: # noqa: D101
36
+ def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None: # noqa: D107
41
37
  self.relation = relation
42
38
  self.session = session
43
39
  self._schema = None
44
40
  if self.relation is not None:
45
41
  self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)
46
42
 
47
- def show(self, **kwargs) -> None:
43
+ def show(self, **kwargs) -> None: # noqa: D102
48
44
  self.relation.show()
49
45
 
50
- def toPandas(self) -> "PandasDataFrame":
46
+ def toPandas(self) -> "PandasDataFrame": # noqa: D102
51
47
  return self.relation.df()
52
48
 
53
49
  def toArrow(self) -> "pa.Table":
54
- """
55
- Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
50
+ """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
56
51
 
57
52
  This is only available if PyArrow is installed and available.
58
53
 
59
54
  .. versionadded:: 4.0.0
60
55
 
61
- Notes
56
+ Notes:
62
57
  -----
63
58
  This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
64
59
  expected to be small, as all the data is loaded into the driver's memory.
65
60
 
66
61
  This API is a developer API.
67
62
 
68
- Examples
63
+ Examples:
69
64
  --------
70
65
  >>> df.toArrow() # doctest: +SKIP
71
66
  pyarrow.Table
@@ -88,7 +83,7 @@ class DataFrame:
88
83
  name : str
89
84
  Name of the view.
90
85
 
91
- Examples
86
+ Examples:
92
87
  --------
93
88
  Create a local temporary view named 'people'.
94
89
 
@@ -108,12 +103,13 @@ class DataFrame:
108
103
  """
109
104
  self.relation.create_view(name, True)
110
105
 
111
- def createGlobalTempView(self, name: str) -> None:
106
+ def createGlobalTempView(self, name: str) -> None: # noqa: D102
112
107
  raise NotImplementedError
113
108
 
114
- def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
109
+ def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame": # noqa: D102
115
110
  if columnName not in self.relation:
116
- raise ValueError(f"DataFrame does not contain a column named {columnName}")
111
+ msg = f"DataFrame does not contain a column named {columnName}"
112
+ raise ValueError(msg)
117
113
  cols = []
118
114
  for x in self.relation.columns:
119
115
  col = ColumnExpression(x)
@@ -123,7 +119,7 @@ class DataFrame:
123
119
  rel = self.relation.select(*cols)
124
120
  return DataFrame(rel, self.session)
125
121
 
126
- def withColumn(self, columnName: str, col: Column) -> "DataFrame":
122
+ def withColumn(self, columnName: str, col: Column) -> "DataFrame": # noqa: D102
127
123
  if not isinstance(col, Column):
128
124
  raise PySparkTypeError(
129
125
  error_class="NOT_COLUMN",
@@ -143,9 +139,8 @@ class DataFrame:
143
139
  rel = self.relation.select(*cols)
144
140
  return DataFrame(rel, self.session)
145
141
 
146
- def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame":
147
- """
148
- Returns a new :class:`DataFrame` by adding multiple columns or replacing the
142
+ def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
143
+ """Returns a new :class:`DataFrame` by adding multiple columns or replacing the
149
144
  existing columns that have the same names.
150
145
 
151
146
  The colsMap is a map of column name and column, the column must only refer to attributes
@@ -162,22 +157,22 @@ class DataFrame:
162
157
  colsMap : dict
163
158
  a dict of column name and :class:`Column`. Currently, only a single map is supported.
164
159
 
165
- Returns
160
+ Returns:
166
161
  -------
167
162
  :class:`DataFrame`
168
163
  DataFrame with new or replaced columns.
169
164
 
170
- Examples
165
+ Examples:
171
166
  --------
172
167
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
173
- >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
168
+ >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
174
169
  +---+-----+----+----+
175
170
  |age| name|age2|age3|
176
171
  +---+-----+----+----+
177
172
  | 2|Alice| 4| 5|
178
173
  | 5| Bob| 7| 8|
179
174
  +---+-----+----+----+
180
- """
175
+ """ # noqa: D205
181
176
  # Below code is to help enable kwargs in future.
182
177
  assert len(colsMap) == 1
183
178
  colsMap = colsMap[0] # type: ignore[assignment]
@@ -218,9 +213,8 @@ class DataFrame:
218
213
  rel = self.relation.select(*cols)
219
214
  return DataFrame(rel, self.session)
220
215
 
221
- def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
222
- """
223
- Returns a new :class:`DataFrame` by renaming multiple columns.
216
+ def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
217
+ """Returns a new :class:`DataFrame` by renaming multiple columns.
224
218
  This is a no-op if the schema doesn't contain the given column names.
225
219
 
226
220
  .. versionadded:: 3.4.0
@@ -232,31 +226,31 @@ class DataFrame:
232
226
  a dict of existing column names and corresponding desired column names.
233
227
  Currently, only a single map is supported.
234
228
 
235
- Returns
229
+ Returns:
236
230
  -------
237
231
  :class:`DataFrame`
238
232
  DataFrame with renamed columns.
239
233
 
240
- See Also
234
+ See Also:
241
235
  --------
242
236
  :meth:`withColumnRenamed`
243
237
 
244
- Notes
238
+ Notes:
245
239
  -----
246
240
  Support Spark Connect
247
241
 
248
- Examples
242
+ Examples:
249
243
  --------
250
244
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
251
- >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
252
- >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
245
+ >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
246
+ >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
253
247
  +---+-----+----+----+
254
248
  |age| name|age4|age5|
255
249
  +---+-----+----+----+
256
250
  | 2|Alice| 4| 5|
257
251
  | 5| Bob| 7| 8|
258
252
  +---+-----+----+----+
259
- """
253
+ """ # noqa: D205
260
254
  if not isinstance(colsMap, dict):
261
255
  raise PySparkTypeError(
262
256
  error_class="NOT_DICT",
@@ -265,9 +259,8 @@ class DataFrame:
265
259
 
266
260
  unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
267
261
  if unknown_columns:
268
- raise ValueError(
269
- f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
270
- )
262
+ msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
263
+ raise ValueError(msg)
271
264
 
272
265
  # Compute this only once
273
266
  old_column_names = list(colsMap.keys())
@@ -289,11 +282,7 @@ class DataFrame:
289
282
  rel = self.relation.select(*cols)
290
283
  return DataFrame(rel, self.session)
291
284
 
292
-
293
-
294
- def transform(
295
- self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
296
- ) -> "DataFrame":
285
+ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": # noqa: ANN401
297
286
  """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.
298
287
 
299
288
  .. versionadded:: 3.0.0
@@ -314,21 +303,19 @@ class DataFrame:
314
303
 
315
304
  .. versionadded:: 3.3.0
316
305
 
317
- Returns
306
+ Returns:
318
307
  -------
319
308
  :class:`DataFrame`
320
309
  Transformed DataFrame.
321
310
 
322
- Examples
311
+ Examples:
323
312
  --------
324
313
  >>> from pyspark.sql.functions import col
325
314
  >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
326
315
  >>> def cast_all_to_int(input_df):
327
316
  ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
328
- ...
329
317
  >>> def sort_columns_asc(input_df):
330
318
  ... return input_df.select(*sorted(input_df.columns))
331
- ...
332
319
  >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
333
320
  +-----+---+
334
321
  |float|int|
@@ -338,8 +325,9 @@ class DataFrame:
338
325
  +-----+---+
339
326
 
340
327
  >>> def add_n(input_df, n):
341
- ... return input_df.select([(col(col_name) + n).alias(col_name)
342
- ... for col_name in input_df.columns])
328
+ ... return input_df.select(
329
+ ... [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
330
+ ... )
343
331
  >>> df.transform(add_n, 1).transform(add_n, n=10).show()
344
332
  +---+-----+
345
333
  |int|float|
@@ -350,14 +338,11 @@ class DataFrame:
350
338
  """
351
339
  result = func(self, *args, **kwargs)
352
340
  assert isinstance(result, DataFrame), (
353
- "Func returned an instance of type [%s], "
354
- "should have been DataFrame." % type(result)
341
+ f"Func returned an instance of type [{type(result)}], should have been DataFrame."
355
342
  )
356
343
  return result
357
344
 
358
- def sort(
359
- self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
360
- ) -> "DataFrame":
345
+ def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame": # noqa: ANN401
361
346
  """Returns a new :class:`DataFrame` sorted by the specified column(s).
362
347
 
363
348
  Parameters
@@ -372,16 +357,15 @@ class DataFrame:
372
357
  Sort ascending vs. descending. Specify list for multiple sort orders.
373
358
  If a list is specified, the length of the list must equal the length of the `cols`.
374
359
 
375
- Returns
360
+ Returns:
376
361
  -------
377
362
  :class:`DataFrame`
378
363
  Sorted DataFrame.
379
364
 
380
- Examples
365
+ Examples:
381
366
  --------
382
367
  >>> from pyspark.sql.functions import desc, asc
383
- >>> df = spark.createDataFrame([
384
- ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])
368
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
385
369
 
386
370
  Sort the DataFrame in ascending order.
387
371
 
@@ -419,8 +403,9 @@ class DataFrame:
419
403
 
420
404
  Specify multiple columns
421
405
 
422
- >>> df = spark.createDataFrame([
423
- ... (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
406
+ >>> df = spark.createDataFrame(
407
+ ... [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
408
+ ... )
424
409
  >>> df.orderBy(desc("age"), "name").show()
425
410
  +---+-----+
426
411
  |age| name|
@@ -453,7 +438,7 @@ class DataFrame:
453
438
  for c in cols:
454
439
  _c = c
455
440
  if isinstance(c, str):
456
- _c = col(c)
441
+ _c = spark_sql_functions.col(c)
457
442
  elif isinstance(c, int) and not isinstance(c, bool):
458
443
  # ordinal is 1-based
459
444
  if c > 0:
@@ -481,13 +466,13 @@ class DataFrame:
481
466
  message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
482
467
  )
483
468
 
484
- columns = [_to_column_expr(c) for c in columns]
469
+ columns = [spark_sql_functions._to_column_expr(c) for c in columns]
485
470
  rel = self.relation.sort(*columns)
486
471
  return DataFrame(rel, self.session)
487
472
 
488
473
  orderBy = sort
489
474
 
490
- def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
475
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]: # noqa: D102
491
476
  if n is None:
492
477
  rs = self.head(1)
493
478
  return rs[0] if rs else None
@@ -495,7 +480,7 @@ class DataFrame:
495
480
 
496
481
  first = head
497
482
 
498
- def take(self, num: int) -> List[Row]:
483
+ def take(self, num: int) -> list[Row]: # noqa: D102
499
484
  return self.limit(num).collect()
500
485
 
501
486
  def filter(self, condition: "ColumnOrName") -> "DataFrame":
@@ -509,15 +494,14 @@ class DataFrame:
509
494
  a :class:`Column` of :class:`types.BooleanType`
510
495
  or a string of SQL expressions.
511
496
 
512
- Returns
497
+ Returns:
513
498
  -------
514
499
  :class:`DataFrame`
515
500
  Filtered DataFrame.
516
501
 
517
- Examples
502
+ Examples:
518
503
  --------
519
- >>> df = spark.createDataFrame([
520
- ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])
504
+ >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
521
505
 
522
506
  Filter by :class:`Column` instances.
523
507
 
@@ -563,38 +547,34 @@ class DataFrame:
563
547
 
564
548
  where = filter
565
549
 
566
- def select(self, *cols) -> "DataFrame":
550
+ def select(self, *cols) -> "DataFrame": # noqa: D102
567
551
  cols = list(cols)
568
552
  if len(cols) == 1:
569
553
  cols = cols[0]
570
554
  if isinstance(cols, list):
571
- projections = [
572
- x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
573
- ]
555
+ projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
574
556
  else:
575
- projections = [
576
- cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
577
- ]
557
+ projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]
578
558
  rel = self.relation.select(*projections)
579
559
  return DataFrame(rel, self.session)
580
560
 
581
561
  @property
582
- def columns(self) -> List[str]:
562
+ def columns(self) -> list[str]:
583
563
  """Returns all column names as a list.
584
564
 
585
- Examples
565
+ Examples:
586
566
  --------
587
567
  >>> df.columns
588
568
  ['age', 'name']
589
569
  """
590
570
  return [f.name for f in self.schema.fields]
591
571
 
592
- def _ipython_key_completions_(self) -> List[str]:
572
+ def _ipython_key_completions_(self) -> list[str]:
593
573
  # Provides tab-completion for column names in PySpark DataFrame
594
574
  # when accessed in bracket notation, e.g. df['<TAB>]
595
575
  return self.columns
596
576
 
597
- def __dir__(self) -> List[str]:
577
+ def __dir__(self) -> list[str]: # noqa: D105
598
578
  out = set(super().__dir__())
599
579
  out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
600
580
  return sorted(out)
@@ -602,7 +582,7 @@ class DataFrame:
602
582
  def join(
603
583
  self,
604
584
  other: "DataFrame",
605
- on: Optional[Union[str, List[str], Column, List[Column]]] = None,
585
+ on: Optional[Union[str, list[str], Column, list[Column]]] = None,
606
586
  how: Optional[str] = None,
607
587
  ) -> "DataFrame":
608
588
  """Joins with another :class:`DataFrame`, using the given join expression.
@@ -622,12 +602,12 @@ class DataFrame:
622
602
  ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
623
603
  ``anti``, ``leftanti`` and ``left_anti``.
624
604
 
625
- Returns
605
+ Returns:
626
606
  -------
627
607
  :class:`DataFrame`
628
608
  Joined DataFrame.
629
609
 
630
- Examples
610
+ Examples:
631
611
  --------
632
612
  The following performs a full outer join between ``df1`` and ``df2``.
633
613
 
@@ -636,22 +616,24 @@ class DataFrame:
636
616
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
637
617
  >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
638
618
  >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
639
- >>> df4 = spark.createDataFrame([
640
- ... Row(age=10, height=80, name="Alice"),
641
- ... Row(age=5, height=None, name="Bob"),
642
- ... Row(age=None, height=None, name="Tom"),
643
- ... Row(age=None, height=None, name=None),
644
- ... ])
619
+ >>> df4 = spark.createDataFrame(
620
+ ... [
621
+ ... Row(age=10, height=80, name="Alice"),
622
+ ... Row(age=5, height=None, name="Bob"),
623
+ ... Row(age=None, height=None, name="Tom"),
624
+ ... Row(age=None, height=None, name=None),
625
+ ... ]
626
+ ... )
645
627
 
646
628
  Inner join on columns (default)
647
629
 
648
- >>> df.join(df2, 'name').select(df.name, df2.height).show()
630
+ >>> df.join(df2, "name").select(df.name, df2.height).show()
649
631
  +----+------+
650
632
  |name|height|
651
633
  +----+------+
652
634
  | Bob| 85|
653
635
  +----+------+
654
- >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
636
+ >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()
655
637
  +----+---+
656
638
  |name|age|
657
639
  +----+---+
@@ -660,8 +642,9 @@ class DataFrame:
660
642
 
661
643
  Outer join for both DataFrames on the 'name' column.
662
644
 
663
- >>> df.join(df2, df.name == df2.name, 'outer').select(
664
- ... df.name, df2.height).sort(desc("name")).show()
645
+ >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
646
+ ... desc("name")
647
+ ... ).show()
665
648
  +-----+------+
666
649
  | name|height|
667
650
  +-----+------+
@@ -669,7 +652,7 @@ class DataFrame:
669
652
  |Alice| NULL|
670
653
  | NULL| 80|
671
654
  +-----+------+
672
- >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
655
+ >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()
673
656
  +-----+------+
674
657
  | name|height|
675
658
  +-----+------+
@@ -680,11 +663,9 @@ class DataFrame:
680
663
 
681
664
  Outer join for both DataFrams with multiple columns.
682
665
 
683
- >>> df.join(
684
- ... df3,
685
- ... [df.name == df3.name, df.age == df3.age],
686
- ... 'outer'
687
- ... ).select(df.name, df3.age).show()
666
+ >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
667
+ ... df.name, df3.age
668
+ ... ).show()
688
669
  +-----+---+
689
670
  | name|age|
690
671
  +-----+---+
@@ -692,20 +673,16 @@ class DataFrame:
692
673
  | Bob| 5|
693
674
  +-----+---+
694
675
  """
695
-
696
676
  if on is not None and not isinstance(on, list):
697
677
  on = [on] # type: ignore[assignment]
698
- if on is not None and not all([isinstance(x, str) for x in on]):
678
+ if on is not None and not all(isinstance(x, str) for x in on):
699
679
  assert isinstance(on, list)
700
680
  # Get (or create) the Expressions from the list of Columns
701
- on = [_to_column_expr(x) for x in on]
681
+ on = [spark_sql_functions._to_column_expr(x) for x in on]
702
682
 
703
683
  # & all the Expressions together to form one Expression
704
- assert isinstance(
705
- on[0], Expression
706
- ), "on should be Column or list of Column"
707
- on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
708
-
684
+ assert isinstance(on[0], Expression), "on should be Column or list of Column"
685
+ on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))
709
686
 
710
687
  if on is None and how is None:
711
688
  result = self.relation.join(other.relation)
@@ -714,14 +691,14 @@ class DataFrame:
714
691
  how = "inner"
715
692
  if on is None:
716
693
  on = "true"
717
- elif isinstance(on, list) and all([isinstance(x, str) for x in on]):
694
+ elif isinstance(on, list) and all(isinstance(x, str) for x in on):
718
695
  # Passed directly through as a list of strings
719
696
  on = on
720
697
  else:
721
698
  on = str(on)
722
699
  assert isinstance(how, str), "how should be a string"
723
700
 
724
- def map_to_recognized_jointype(how):
701
+ def map_to_recognized_jointype(how: str) -> str:
725
702
  known_aliases = {
726
703
  "inner": [],
727
704
  "outer": ["full", "fullouter", "full_outer"],
@@ -730,15 +707,10 @@ class DataFrame:
730
707
  "anti": ["leftanti", "left_anti"],
731
708
  "semi": ["leftsemi", "left_semi"],
732
709
  }
733
- mapped_type = None
734
710
  for type, aliases in known_aliases.items():
735
711
  if how == type or how in aliases:
736
- mapped_type = type
737
- break
738
-
739
- if not mapped_type:
740
- mapped_type = how
741
- return mapped_type
712
+ return type
713
+ return how
742
714
 
743
715
  how = map_to_recognized_jointype(how)
744
716
  result = self.relation.join(other.relation, on, how)
@@ -757,18 +729,16 @@ class DataFrame:
757
729
  other : :class:`DataFrame`
758
730
  Right side of the cartesian product.
759
731
 
760
- Returns
732
+ Returns:
761
733
  -------
762
734
  :class:`DataFrame`
763
735
  Joined DataFrame.
764
736
 
765
- Examples
737
+ Examples:
766
738
  --------
767
739
  >>> from pyspark.sql import Row
768
- >>> df = spark.createDataFrame(
769
- ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
770
- >>> df2 = spark.createDataFrame(
771
- ... [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
740
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
741
+ >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
772
742
  >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
773
743
  +---+-----+------+
774
744
  |age| name|height|
@@ -791,21 +761,21 @@ class DataFrame:
791
761
  alias : str
792
762
  an alias name to be set for the :class:`DataFrame`.
793
763
 
794
- Returns
764
+ Returns:
795
765
  -------
796
766
  :class:`DataFrame`
797
767
  Aliased DataFrame.
798
768
 
799
- Examples
769
+ Examples:
800
770
  --------
801
771
  >>> from pyspark.sql.functions import col, desc
802
- >>> df = spark.createDataFrame(
803
- ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
772
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
804
773
  >>> df_as1 = df.alias("df_as1")
805
774
  >>> df_as2 = df.alias("df_as2")
806
- >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
807
- >>> joined_df.select(
808
- ... "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
775
+ >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
776
+ >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
777
+ ... desc("df_as1.name")
778
+ ... ).show()
809
779
  +-----+-----+---+
810
780
  | name| name|age|
811
781
  +-----+-----+---+
@@ -817,7 +787,7 @@ class DataFrame:
817
787
  assert isinstance(alias, str), "alias should be a string"
818
788
  return DataFrame(self.relation.set_alias(alias), self.session)
819
789
 
820
- def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc]
790
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] # noqa: D102
821
791
  exclude = []
822
792
  for col in cols:
823
793
  if isinstance(col, str):
@@ -834,7 +804,7 @@ class DataFrame:
834
804
  expr = StarExpression(exclude=exclude)
835
805
  return DataFrame(self.relation.select(expr), self.session)
836
806
 
837
- def __repr__(self) -> str:
807
+ def __repr__(self) -> str: # noqa: D105
838
808
  return str(self.relation)
839
809
 
840
810
  def limit(self, num: int) -> "DataFrame":
@@ -846,15 +816,14 @@ class DataFrame:
846
816
  Number of records to return. Will return this number of records
847
817
  or all records if the DataFrame contains less than this number of records.
848
818
 
849
- Returns
819
+ Returns:
850
820
  -------
851
821
  :class:`DataFrame`
852
822
  Subset of the records
853
823
 
854
- Examples
824
+ Examples:
855
825
  --------
856
- >>> df = spark.createDataFrame(
857
- ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
826
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
858
827
  >>> df.limit(1).show()
859
828
  +---+----+
860
829
  |age|name|
@@ -870,17 +839,15 @@ class DataFrame:
870
839
  rel = self.relation.limit(num)
871
840
  return DataFrame(rel, self.session)
872
841
 
873
- def __contains__(self, item: str):
874
- """
875
- Check if the :class:`DataFrame` contains a column by the name of `item`
876
- """
842
+ def __contains__(self, item: str) -> bool:
843
+ """Check if the :class:`DataFrame` contains a column by the name of `item`."""
877
844
  return item in self.relation
878
845
 
879
846
  @property
880
847
  def schema(self) -> StructType:
881
848
  """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.
882
849
 
883
- Examples
850
+ Examples:
884
851
  --------
885
852
  >>> df.schema
886
853
  StructType([StructField('age', IntegerType(), True),
@@ -889,25 +856,21 @@ class DataFrame:
889
856
  return self._schema
890
857
 
891
858
  @overload
892
- def __getitem__(self, item: Union[int, str]) -> Column:
893
- ...
859
+ def __getitem__(self, item: Union[int, str]) -> Column: ...
894
860
 
895
861
  @overload
896
- def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
897
- ...
862
+ def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...
898
863
 
899
- def __getitem__(
900
- self, item: Union[int, str, Column, List, Tuple]
901
- ) -> Union[Column, "DataFrame"]:
864
+ def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
902
865
  """Returns the column as a :class:`Column`.
903
866
 
904
- Examples
867
+ Examples:
905
868
  --------
906
- >>> df.select(df['age']).collect()
869
+ >>> df.select(df["age"]).collect()
907
870
  [Row(age=2), Row(age=5)]
908
- >>> df[ ["name", "age"]].collect()
871
+ >>> df[["name", "age"]].collect()
909
872
  [Row(name='Alice', age=2), Row(name='Bob', age=5)]
910
- >>> df[ df.age > 3 ].collect()
873
+ >>> df[df.age > 3].collect()
911
874
  [Row(age=5, name='Bob')]
912
875
  >>> df[df[0] > 3].collect()
913
876
  [Row(age=5, name='Bob')]
@@ -919,31 +882,29 @@ class DataFrame:
919
882
  elif isinstance(item, (list, tuple)):
920
883
  return self.select(*item)
921
884
  elif isinstance(item, int):
922
- return col(self._schema[item].name)
885
+ return spark_sql_functions.col(self._schema[item].name)
923
886
  else:
924
- raise TypeError(f"Unexpected item type: {type(item)}")
887
+ msg = f"Unexpected item type: {type(item)}"
888
+ raise TypeError(msg)
925
889
 
926
890
  def __getattr__(self, name: str) -> Column:
927
891
  """Returns the :class:`Column` denoted by ``name``.
928
892
 
929
- Examples
893
+ Examples:
930
894
  --------
931
895
  >>> df.select(df.age).collect()
932
896
  [Row(age=2), Row(age=5)]
933
897
  """
934
898
  if name not in self.relation.columns:
935
- raise AttributeError(
936
- "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
937
- )
899
+ msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
900
+ raise AttributeError(msg)
938
901
  return Column(duckdb.ColumnExpression(self.relation.alias, name))
939
902
 
940
903
  @overload
941
- def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
942
- ...
904
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...
943
905
 
944
906
  @overload
945
- def groupBy(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
946
- ...
907
+ def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ... # noqa: PYI063
947
908
 
948
909
  def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
949
910
  """Groups the :class:`DataFrame` using the specified columns,
@@ -959,15 +920,16 @@ class DataFrame:
959
920
  Each element should be a column name (string) or an expression (:class:`Column`)
960
921
  or list of them.
961
922
 
962
- Returns
923
+ Returns:
963
924
  -------
964
925
  :class:`GroupedData`
965
926
  Grouped data by given columns.
966
927
 
967
- Examples
928
+ Examples:
968
929
  --------
969
- >>> df = spark.createDataFrame([
970
- ... (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
930
+ >>> df = spark.createDataFrame(
931
+ ... [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
932
+ ... )
971
933
 
972
934
  Empty grouping columns triggers a global aggregation.
973
935
 
@@ -1008,22 +970,19 @@ class DataFrame:
1008
970
  | Bob| 2| 2|
1009
971
  | Bob| 5| 1|
1010
972
  +-----+---+-----+
1011
- """
973
+ """ # noqa: D205
1012
974
  from .group import GroupedData, Grouping
1013
975
 
1014
- if len(cols) == 1 and isinstance(cols[0], list):
1015
- columns = cols[0]
1016
- else:
1017
- columns = cols
976
+ columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols
1018
977
  return GroupedData(Grouping(*columns), self)
1019
978
 
1020
979
  groupby = groupBy
1021
980
 
1022
981
  @property
1023
- def write(self) -> DataFrameWriter:
982
+ def write(self) -> DataFrameWriter: # noqa: D102
1024
983
  return DataFrameWriter(self)
1025
984
 
1026
- def printSchema(self):
985
+ def printSchema(self) -> None: # noqa: D102
1027
986
  raise ContributionsAcceptedError
1028
987
 
1029
988
  def union(self, other: "DataFrame") -> "DataFrame":
@@ -1035,22 +994,22 @@ class DataFrame:
1035
994
  other : :class:`DataFrame`
1036
995
  Another :class:`DataFrame` that needs to be unioned
1037
996
 
1038
- Returns
997
+ Returns:
1039
998
  -------
1040
999
  :class:`DataFrame`
1041
1000
 
1042
- See Also
1001
+ See Also:
1043
1002
  --------
1044
1003
  DataFrame.unionAll
1045
1004
 
1046
- Notes
1005
+ Notes:
1047
1006
  -----
1048
1007
  This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
1049
1008
  (that does deduplication of elements), use this function followed by :func:`distinct`.
1050
1009
 
1051
1010
  Also as standard in SQL, this function resolves columns by position (not by name).
1052
1011
 
1053
- Examples
1012
+ Examples:
1054
1013
  --------
1055
1014
  >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
1056
1015
  >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
@@ -1068,14 +1027,12 @@ class DataFrame:
1068
1027
  | 1| 2| 3|
1069
1028
  | 1| 2| 3|
1070
1029
  +----+----+----+
1071
- """
1030
+ """ # noqa: D205
1072
1031
  return DataFrame(self.relation.union(other.relation), self.session)
1073
1032
 
1074
1033
  unionAll = union
1075
1034
 
1076
- def unionByName(
1077
- self, other: "DataFrame", allowMissingColumns: bool = False
1078
- ) -> "DataFrame":
1035
+ def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
1079
1036
  """Returns a new :class:`DataFrame` containing union of rows in this and another
1080
1037
  :class:`DataFrame`.
1081
1038
 
@@ -1096,12 +1053,12 @@ class DataFrame:
1096
1053
 
1097
1054
  .. versionadded:: 3.1.0
1098
1055
 
1099
- Returns
1056
+ Returns:
1100
1057
  -------
1101
1058
  :class:`DataFrame`
1102
1059
  Combined DataFrame.
1103
1060
 
1104
- Examples
1061
+ Examples:
1105
1062
  --------
1106
1063
  The difference between this function and :func:`union` is that this function
1107
1064
  resolves columns by name (not by position):
@@ -1130,14 +1087,14 @@ class DataFrame:
1130
1087
  | 1| 2| 3|NULL|
1131
1088
  |NULL| 4| 5| 6|
1132
1089
  +----+----+----+----+
1133
- """
1090
+ """ # noqa: D205
1134
1091
  if allowMissingColumns:
1135
1092
  cols = []
1136
1093
  for col in self.relation.columns:
1137
1094
  if col in other.relation.columns:
1138
1095
  cols.append(col)
1139
1096
  else:
1140
- cols.append(lit(None))
1097
+ cols.append(spark_sql_functions.lit(None))
1141
1098
  other = other.select(*cols)
1142
1099
  else:
1143
1100
  other = other.select(*self.relation.columns)
@@ -1160,16 +1117,16 @@ class DataFrame:
1160
1117
  other : :class:`DataFrame`
1161
1118
  Another :class:`DataFrame` that needs to be combined.
1162
1119
 
1163
- Returns
1120
+ Returns:
1164
1121
  -------
1165
1122
  :class:`DataFrame`
1166
1123
  Combined DataFrame.
1167
1124
 
1168
- Notes
1125
+ Notes:
1169
1126
  -----
1170
1127
  This is equivalent to `INTERSECT` in SQL.
1171
1128
 
1172
- Examples
1129
+ Examples:
1173
1130
  --------
1174
1131
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1175
1132
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1180,7 +1137,7 @@ class DataFrame:
1180
1137
  | b| 3|
1181
1138
  | a| 1|
1182
1139
  +---+---+
1183
- """
1140
+ """ # noqa: D205
1184
1141
  return self.intersectAll(other).drop_duplicates()
1185
1142
 
1186
1143
  def intersectAll(self, other: "DataFrame") -> "DataFrame":
@@ -1200,12 +1157,12 @@ class DataFrame:
1200
1157
  other : :class:`DataFrame`
1201
1158
  Another :class:`DataFrame` that needs to be combined.
1202
1159
 
1203
- Returns
1160
+ Returns:
1204
1161
  -------
1205
1162
  :class:`DataFrame`
1206
1163
  Combined DataFrame.
1207
1164
 
1208
- Examples
1165
+ Examples:
1209
1166
  --------
1210
1167
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
1211
1168
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1217,7 +1174,7 @@ class DataFrame:
1217
1174
  | a| 1|
1218
1175
  | b| 3|
1219
1176
  +---+---+
1220
- """
1177
+ """ # noqa: D205
1221
1178
  return DataFrame(self.relation.intersect(other.relation), self.session)
1222
1179
 
1223
1180
  def exceptAll(self, other: "DataFrame") -> "DataFrame":
@@ -1237,14 +1194,15 @@ class DataFrame:
1237
1194
  other : :class:`DataFrame`
1238
1195
  The other :class:`DataFrame` to compare to.
1239
1196
 
1240
- Returns
1197
+ Returns:
1241
1198
  -------
1242
1199
  :class:`DataFrame`
1243
1200
 
1244
- Examples
1201
+ Examples:
1245
1202
  --------
1246
1203
  >>> df1 = spark.createDataFrame(
1247
- ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])
1204
+ ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
1205
+ ... )
1248
1206
  >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
1249
1207
  >>> df1.exceptAll(df2).show()
1250
1208
  +---+---+
@@ -1256,10 +1214,10 @@ class DataFrame:
1256
1214
  | c| 4|
1257
1215
  +---+---+
1258
1216
 
1259
- """
1217
+ """ # noqa: D205
1260
1218
  return DataFrame(self.relation.except_(other.relation), self.session)
1261
1219
 
1262
- def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
1220
+ def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":
1263
1221
  """Return a new :class:`DataFrame` with duplicate rows removed,
1264
1222
  optionally only considering certain columns.
1265
1223
 
@@ -1276,19 +1234,21 @@ class DataFrame:
1276
1234
  subset : List of column names, optional
1277
1235
  List of columns to use for duplicate comparison (default All columns).
1278
1236
 
1279
- Returns
1237
+ Returns:
1280
1238
  -------
1281
1239
  :class:`DataFrame`
1282
1240
  DataFrame without duplicates.
1283
1241
 
1284
- Examples
1242
+ Examples:
1285
1243
  --------
1286
1244
  >>> from pyspark.sql import Row
1287
- >>> df = spark.createDataFrame([
1288
- ... Row(name='Alice', age=5, height=80),
1289
- ... Row(name='Alice', age=5, height=80),
1290
- ... Row(name='Alice', age=10, height=80)
1291
- ... ])
1245
+ >>> df = spark.createDataFrame(
1246
+ ... [
1247
+ ... Row(name="Alice", age=5, height=80),
1248
+ ... Row(name="Alice", age=5, height=80),
1249
+ ... Row(name="Alice", age=10, height=80),
1250
+ ... ]
1251
+ ... )
1292
1252
 
1293
1253
  Deduplicate the same rows.
1294
1254
 
@@ -1302,16 +1262,16 @@ class DataFrame:
1302
1262
 
1303
1263
  Deduplicate values on 'name' and 'height' columns.
1304
1264
 
1305
- >>> df.dropDuplicates(['name', 'height']).show()
1265
+ >>> df.dropDuplicates(["name", "height"]).show()
1306
1266
  +-----+---+------+
1307
1267
  | name|age|height|
1308
1268
  +-----+---+------+
1309
1269
  |Alice| 5| 80|
1310
1270
  +-----+---+------+
1311
- """
1271
+ """ # noqa: D205
1312
1272
  if subset:
1313
1273
  rn_col = f"tmp_col_{uuid.uuid1().hex}"
1314
- subset_str = ', '.join([f'"{c}"' for c in subset])
1274
+ subset_str = ", ".join([f'"{c}"' for c in subset])
1315
1275
  window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
1316
1276
  df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
1317
1277
  return df.filter(f"{rn_col} = 1").drop(rn_col)
@@ -1320,19 +1280,17 @@ class DataFrame:
1320
1280
 
1321
1281
  drop_duplicates = dropDuplicates
1322
1282
 
1323
-
1324
1283
  def distinct(self) -> "DataFrame":
1325
1284
  """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.
1326
1285
 
1327
- Returns
1286
+ Returns:
1328
1287
  -------
1329
1288
  :class:`DataFrame`
1330
1289
  DataFrame with distinct records.
1331
1290
 
1332
- Examples
1291
+ Examples:
1333
1292
  --------
1334
- >>> df = spark.createDataFrame(
1335
- ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
1293
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
1336
1294
 
1337
1295
  Return the number of distinct rows in the :class:`DataFrame`
1338
1296
 
@@ -1345,15 +1303,14 @@ class DataFrame:
1345
1303
  def count(self) -> int:
1346
1304
  """Returns the number of rows in this :class:`DataFrame`.
1347
1305
 
1348
- Returns
1306
+ Returns:
1349
1307
  -------
1350
1308
  int
1351
1309
  Number of rows.
1352
1310
 
1353
- Examples
1311
+ Examples:
1354
1312
  --------
1355
- >>> df = spark.createDataFrame(
1356
- ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
1313
+ >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
1357
1314
 
1358
1315
  Return the number of rows in the :class:`DataFrame`.
1359
1316
 
@@ -1369,33 +1326,28 @@ class DataFrame:
1369
1326
  assert types_count == len(existing_columns)
1370
1327
 
1371
1328
  cast_expressions = [
1372
- f"{existing}::{target_type} as {existing}"
1373
- for existing, target_type in zip(existing_columns, types)
1329
+ f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
1374
1330
  ]
1375
1331
  cast_expressions = ", ".join(cast_expressions)
1376
1332
  new_rel = self.relation.project(cast_expressions)
1377
1333
  return DataFrame(new_rel, self.session)
1378
1334
 
1379
- def toDF(self, *cols) -> "DataFrame":
1335
+ def toDF(self, *cols) -> "DataFrame": # noqa: D102
1380
1336
  existing_columns = self.relation.columns
1381
1337
  column_count = len(cols)
1382
1338
  if column_count != len(existing_columns):
1383
- raise PySparkValueError(
1384
- message="Provided column names and number of columns in the DataFrame don't match"
1385
- )
1339
+ raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")
1386
1340
 
1387
1341
  existing_columns = [ColumnExpression(x) for x in existing_columns]
1388
- projections = [
1389
- existing.alias(new) for existing, new in zip(existing_columns, cols)
1390
- ]
1342
+ projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]
1391
1343
  new_rel = self.relation.project(*projections)
1392
1344
  return DataFrame(new_rel, self.session)
1393
1345
 
1394
- def collect(self) -> List[Row]:
1346
+ def collect(self) -> list[Row]: # noqa: D102
1395
1347
  columns = self.relation.columns
1396
1348
  result = self.relation.fetchall()
1397
1349
 
1398
- def construct_row(values, names) -> Row:
1350
+ def construct_row(values: list, names: list[str]) -> Row:
1399
1351
  row = tuple.__new__(Row, list(values))
1400
1352
  row.__fields__ = list(names)
1401
1353
  return row
@@ -1411,16 +1363,16 @@ class DataFrame:
1411
1363
  .. versionchanged:: 3.4.0
1412
1364
  Supports Spark Connect.
1413
1365
 
1414
- Notes
1366
+ Notes:
1415
1367
  -----
1416
1368
  The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.
1417
1369
 
1418
- Returns
1370
+ Returns:
1419
1371
  -------
1420
1372
  :class:`DataFrame`
1421
1373
  Cached DataFrame.
1422
1374
 
1423
- Examples
1375
+ Examples:
1424
1376
  --------
1425
1377
  >>> df = spark.range(1)
1426
1378
  >>> df.cache()