duckdb 1.4.1.dev135-cp311-cp311-macosx_10_9_universal2.whl → 1.5.0.dev44-cp311-cp311-macosx_10_9_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of duckdb might be problematic.
Files changed (57)
  1. _duckdb.cpython-311-darwin.so +0 -0
  2. duckdb/__init__.py +435 -341
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +9 -12
  5. duckdb/experimental/__init__.py +1 -2
  6. duckdb/experimental/spark/__init__.py +4 -3
  7. duckdb/experimental/spark/_globals.py +8 -8
  8. duckdb/experimental/spark/_typing.py +9 -7
  9. duckdb/experimental/spark/conf.py +15 -16
  10. duckdb/experimental/spark/context.py +44 -60
  11. duckdb/experimental/spark/errors/__init__.py +35 -33
  12. duckdb/experimental/spark/errors/error_classes.py +1 -1
  13. duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
  14. duckdb/experimental/spark/errors/exceptions/base.py +88 -39
  15. duckdb/experimental/spark/errors/utils.py +16 -11
  16. duckdb/experimental/spark/exception.py +6 -9
  17. duckdb/experimental/spark/sql/__init__.py +5 -5
  18. duckdb/experimental/spark/sql/_typing.py +15 -8
  19. duckdb/experimental/spark/sql/catalog.py +20 -21
  20. duckdb/experimental/spark/sql/column.py +55 -48
  21. duckdb/experimental/spark/sql/conf.py +8 -9
  22. duckdb/experimental/spark/sql/dataframe.py +233 -185
  23. duckdb/experimental/spark/sql/functions.py +1248 -1222
  24. duckdb/experimental/spark/sql/group.py +52 -56
  25. duckdb/experimental/spark/sql/readwriter.py +94 -80
  26. duckdb/experimental/spark/sql/session.py +59 -64
  27. duckdb/experimental/spark/sql/streaming.py +10 -9
  28. duckdb/experimental/spark/sql/type_utils.py +65 -67
  29. duckdb/experimental/spark/sql/types.py +345 -309
  30. duckdb/experimental/spark/sql/udf.py +6 -6
  31. duckdb/filesystem.py +16 -26
  32. duckdb/functional/__init__.py +16 -12
  33. duckdb/functional/__init__.pyi +31 -0
  34. duckdb/polars_io.py +82 -124
  35. duckdb/query_graph/__main__.py +96 -91
  36. duckdb/typing/__init__.py +8 -18
  37. duckdb/typing/__init__.pyi +36 -0
  38. duckdb/udf.py +5 -10
  39. duckdb/value/__init__.py +0 -1
  40. duckdb/value/constant/__init__.py +60 -62
  41. duckdb/value/constant/__init__.pyi +115 -0
  42. duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
  43. duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
  44. _duckdb-stubs/__init__.pyi +0 -1443
  45. _duckdb-stubs/_func.pyi +0 -46
  46. _duckdb-stubs/_sqltypes.pyi +0 -75
  47. adbc_driver_duckdb/__init__.py +0 -50
  48. adbc_driver_duckdb/dbapi.py +0 -115
  49. duckdb/_dbapi_type_object.py +0 -231
  50. duckdb/_version.py +0 -22
  51. duckdb/func/__init__.py +0 -3
  52. duckdb/sqltypes/__init__.py +0 -63
  53. duckdb-1.4.1.dev135.dist-info/METADATA +0 -326
  54. duckdb-1.4.1.dev135.dist-info/RECORD +0 -52
  55. /duckdb/{py.typed → value/__init__.pyi} +0 -0
  56. {duckdb-1.4.1.dev135.dist-info → duckdb-1.5.0.dev44.dist-info}/WHEEL +0 -0
  57. {duckdb-1.4.1.dev135.dist-info → duckdb-1.5.0.dev44.dist-info}/licenses/LICENSE +0 -0
@@ -1,20 +1,24 @@
- import uuid # noqa: D100
  from functools import reduce
- from keyword import iskeyword
  from typing import (
  TYPE_CHECKING,
  Any,
  Callable,
+ List,
+ Dict,
  Optional,
+ Tuple,
  Union,
  cast,
  overload,
  )
+ import uuid
+ from keyword import iskeyword

  import duckdb
  from duckdb import ColumnExpression, Expression, StarExpression

- from ..errors import PySparkIndexError, PySparkTypeError, PySparkValueError
+ from ._typing import ColumnOrName
+ from ..errors import PySparkTypeError, PySparkValueError, PySparkIndexError
  from ..exception import ContributionsAcceptedError
  from .column import Column
  from .readwriter import DataFrameWriter
@@ -25,42 +29,43 @@ if TYPE_CHECKING:
  import pyarrow as pa
  from pandas.core.frame import DataFrame as PandasDataFrame

- from ._typing import ColumnOrName
- from .group import GroupedData
+ from .group import GroupedData, Grouping
  from .session import SparkSession

- from duckdb.experimental.spark.sql import functions as spark_sql_functions
+ from ..errors import PySparkValueError
+ from .functions import _to_column_expr, col, lit


- class DataFrame: # noqa: D101
- def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession") -> None: # noqa: D107
+ class DataFrame:
+ def __init__(self, relation: duckdb.DuckDBPyRelation, session: "SparkSession"):
  self.relation = relation
  self.session = session
  self._schema = None
  if self.relation is not None:
  self._schema = duckdb_to_spark_schema(self.relation.columns, self.relation.types)

- def show(self, **kwargs) -> None: # noqa: D102
+ def show(self, **kwargs) -> None:
  self.relation.show()

- def toPandas(self) -> "PandasDataFrame": # noqa: D102
+ def toPandas(self) -> "PandasDataFrame":
  return self.relation.df()

  def toArrow(self) -> "pa.Table":
- """Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.
+ """
+ Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.

  This is only available if PyArrow is installed and available.

  .. versionadded:: 4.0.0

- Notes:
+ Notes
  -----
  This method should only be used if the resulting PyArrow ``pyarrow.Table`` is
  expected to be small, as all the data is loaded into the driver's memory.

  This API is a developer API.

- Examples:
+ Examples
  --------
  >>> df.toArrow() # doctest: +SKIP
  pyarrow.Table
@@ -83,7 +88,7 @@ class DataFrame: # noqa: D101
  name : str
  Name of the view.

- Examples:
+ Examples
  --------
  Create a local temporary view named 'people'.

@@ -103,13 +108,12 @@ class DataFrame: # noqa: D101
  """
  self.relation.create_view(name, True)

- def createGlobalTempView(self, name: str) -> None: # noqa: D102
+ def createGlobalTempView(self, name: str) -> None:
  raise NotImplementedError

- def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame": # noqa: D102
+ def withColumnRenamed(self, columnName: str, newName: str) -> "DataFrame":
  if columnName not in self.relation:
- msg = f"DataFrame does not contain a column named {columnName}"
- raise ValueError(msg)
+ raise ValueError(f"DataFrame does not contain a column named {columnName}")
  cols = []
  for x in self.relation.columns:
  col = ColumnExpression(x)
@@ -119,7 +123,7 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumn(self, columnName: str, col: Column) -> "DataFrame": # noqa: D102
+ def withColumn(self, columnName: str, col: Column) -> "DataFrame":
  if not isinstance(col, Column):
  raise PySparkTypeError(
  error_class="NOT_COLUMN",
@@ -139,8 +143,9 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumns(self, *colsMap: dict[str, Column]) -> "DataFrame":
- """Returns a new :class:`DataFrame` by adding multiple columns or replacing the
+ def withColumns(self, *colsMap: Dict[str, Column]) -> "DataFrame":
+ """
+ Returns a new :class:`DataFrame` by adding multiple columns or replacing the
  existing columns that have the same names.

  The colsMap is a map of column name and column, the column must only refer to attributes
@@ -157,22 +162,22 @@ class DataFrame: # noqa: D101
  colsMap : dict
  a dict of column name and :class:`Column`. Currently, only a single map is supported.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with new or replaced columns.

- Examples:
+ Examples
  --------
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
- >>> df.withColumns({"age2": df.age + 2, "age3": df.age + 3}).show()
+ >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()
  +---+-----+----+----+
  |age| name|age2|age3|
  +---+-----+----+----+
  | 2|Alice| 4| 5|
  | 5| Bob| 7| 8|
  +---+-----+----+----+
- """ # noqa: D205
+ """
  # Below code is to help enable kwargs in future.
  assert len(colsMap) == 1
  colsMap = colsMap[0] # type: ignore[assignment]
@@ -213,8 +218,9 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def withColumnsRenamed(self, colsMap: dict[str, str]) -> "DataFrame":
- """Returns a new :class:`DataFrame` by renaming multiple columns.
+ def withColumnsRenamed(self, colsMap: Dict[str, str]) -> "DataFrame":
+ """
+ Returns a new :class:`DataFrame` by renaming multiple columns.
  This is a no-op if the schema doesn't contain the given column names.

  .. versionadded:: 3.4.0
@@ -226,31 +232,31 @@ class DataFrame: # noqa: D101
  a dict of existing column names and corresponding desired column names.
  Currently, only a single map is supported.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with renamed columns.

- See Also:
+ See Also
  --------
  :meth:`withColumnRenamed`

- Notes:
+ Notes
  -----
  Support Spark Connect

- Examples:
+ Examples
  --------
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
- >>> df = df.withColumns({"age2": df.age + 2, "age3": df.age + 3})
- >>> df.withColumnsRenamed({"age2": "age4", "age3": "age5"}).show()
+ >>> df = df.withColumns({'age2': df.age + 2, 'age3': df.age + 3})
+ >>> df.withColumnsRenamed({'age2': 'age4', 'age3': 'age5'}).show()
  +---+-----+----+----+
  |age| name|age4|age5|
  +---+-----+----+----+
  | 2|Alice| 4| 5|
  | 5| Bob| 7| 8|
  +---+-----+----+----+
- """ # noqa: D205
+ """
  if not isinstance(colsMap, dict):
  raise PySparkTypeError(
  error_class="NOT_DICT",
@@ -259,8 +265,9 @@ class DataFrame: # noqa: D101

  unknown_columns = set(colsMap.keys()) - set(self.relation.columns)
  if unknown_columns:
- msg = f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
- raise ValueError(msg)
+ raise ValueError(
+ f"DataFrame does not contain column(s): {', '.join(unknown_columns)}"
+ )

  # Compute this only once
  old_column_names = list(colsMap.keys())
@@ -282,7 +289,11 @@ class DataFrame: # noqa: D101
  rel = self.relation.select(*cols)
  return DataFrame(rel, self.session)

- def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any) -> "DataFrame": # noqa: ANN401
+
+
+ def transform(
+ self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.

  .. versionadded:: 3.0.0
@@ -303,19 +314,21 @@ class DataFrame: # noqa: D101

  .. versionadded:: 3.3.0

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Transformed DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import col
  >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
  >>> def cast_all_to_int(input_df):
  ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
+ ...
  >>> def sort_columns_asc(input_df):
  ... return input_df.select(*sorted(input_df.columns))
+ ...
  >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()
  +-----+---+
  |float|int|
@@ -325,9 +338,8 @@ class DataFrame: # noqa: D101
  +-----+---+

  >>> def add_n(input_df, n):
- ... return input_df.select(
- ... [(col(col_name) + n).alias(col_name) for col_name in input_df.columns]
- ... )
+ ... return input_df.select([(col(col_name) + n).alias(col_name)
+ ... for col_name in input_df.columns])
  >>> df.transform(add_n, 1).transform(add_n, n=10).show()
  +---+-----+
  |int|float|
@@ -338,11 +350,14 @@ class DataFrame: # noqa: D101
  """
  result = func(self, *args, **kwargs)
  assert isinstance(result, DataFrame), (
- f"Func returned an instance of type [{type(result)}], should have been DataFrame."
+ "Func returned an instance of type [%s], "
+ "should have been DataFrame." % type(result)
  )
  return result

- def sort(self, *cols: Union[str, Column, list[Union[str, Column]]], **kwargs: Any) -> "DataFrame": # noqa: ANN401
+ def sort(
+ self, *cols: Union[str, Column, List[Union[str, Column]]], **kwargs: Any
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame` sorted by the specified column(s).

  Parameters
@@ -357,15 +372,16 @@ class DataFrame: # noqa: D101
  Sort ascending vs. descending. Specify list for multiple sort orders.
  If a list is specified, the length of the list must equal the length of the `cols`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Sorted DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import desc, asc
- >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])

  Sort the DataFrame in ascending order.

@@ -403,9 +419,8 @@ class DataFrame: # noqa: D101

  Specify multiple columns

- >>> df = spark.createDataFrame(
- ... [(2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
- ... )
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])
  >>> df.orderBy(desc("age"), "name").show()
  +---+-----+
  |age| name|
@@ -438,7 +453,7 @@ class DataFrame: # noqa: D101
  for c in cols:
  _c = c
  if isinstance(c, str):
- _c = spark_sql_functions.col(c)
+ _c = col(c)
  elif isinstance(c, int) and not isinstance(c, bool):
  # ordinal is 1-based
  if c > 0:
@@ -466,13 +481,13 @@ class DataFrame: # noqa: D101
  message_parameters={"arg_name": "ascending", "arg_type": type(ascending).__name__},
  )

- columns = [spark_sql_functions._to_column_expr(c) for c in columns]
+ columns = [_to_column_expr(c) for c in columns]
  rel = self.relation.sort(*columns)
  return DataFrame(rel, self.session)

  orderBy = sort

- def head(self, n: Optional[int] = None) -> Union[Optional[Row], list[Row]]: # noqa: D102
+ def head(self, n: Optional[int] = None) -> Union[Optional[Row], List[Row]]:
  if n is None:
  rs = self.head(1)
  return rs[0] if rs else None
@@ -480,7 +495,7 @@ class DataFrame: # noqa: D101

  first = head

- def take(self, num: int) -> list[Row]: # noqa: D102
+ def take(self, num: int) -> List[Row]:
  return self.limit(num).collect()

  def filter(self, condition: "ColumnOrName") -> "DataFrame":
@@ -494,14 +509,15 @@ class DataFrame: # noqa: D101
  a :class:`Column` of :class:`types.BooleanType`
  or a string of SQL expressions.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Filtered DataFrame.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])

  Filter by :class:`Column` instances.

@@ -547,34 +563,38 @@ class DataFrame: # noqa: D101

  where = filter

- def select(self, *cols) -> "DataFrame": # noqa: D102
+ def select(self, *cols) -> "DataFrame":
  cols = list(cols)
  if len(cols) == 1:
  cols = cols[0]
  if isinstance(cols, list):
- projections = [x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols]
+ projections = [
+ x.expr if isinstance(x, Column) else ColumnExpression(x) for x in cols
+ ]
  else:
- projections = [cols.expr if isinstance(cols, Column) else ColumnExpression(cols)]
+ projections = [
+ cols.expr if isinstance(cols, Column) else ColumnExpression(cols)
+ ]
  rel = self.relation.select(*projections)
  return DataFrame(rel, self.session)

  @property
- def columns(self) -> list[str]:
+ def columns(self) -> List[str]:
  """Returns all column names as a list.

- Examples:
+ Examples
  --------
  >>> df.columns
  ['age', 'name']
  """
  return [f.name for f in self.schema.fields]

- def _ipython_key_completions_(self) -> list[str]:
+ def _ipython_key_completions_(self) -> List[str]:
  # Provides tab-completion for column names in PySpark DataFrame
  # when accessed in bracket notation, e.g. df['<TAB>]
  return self.columns

- def __dir__(self) -> list[str]: # noqa: D105
+ def __dir__(self) -> List[str]:
  out = set(super().__dir__())
  out.update(c for c in self.columns if c.isidentifier() and not iskeyword(c))
  return sorted(out)
@@ -582,7 +602,7 @@ class DataFrame: # noqa: D101
  def join(
  self,
  other: "DataFrame",
- on: Optional[Union[str, list[str], Column, list[Column]]] = None,
+ on: Optional[Union[str, List[str], Column, List[Column]]] = None,
  how: Optional[str] = None,
  ) -> "DataFrame":
  """Joins with another :class:`DataFrame`, using the given join expression.
@@ -602,12 +622,12 @@ class DataFrame: # noqa: D101
  ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,
  ``anti``, ``leftanti`` and ``left_anti``.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Joined DataFrame.

- Examples:
+ Examples
  --------
  The following performs a full outer join between ``df1`` and ``df2``.

@@ -616,24 +636,22 @@ class DataFrame: # noqa: D101
  >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")]).toDF("age", "name")
  >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
  >>> df3 = spark.createDataFrame([Row(age=2, name="Alice"), Row(age=5, name="Bob")])
- >>> df4 = spark.createDataFrame(
- ... [
- ... Row(age=10, height=80, name="Alice"),
- ... Row(age=5, height=None, name="Bob"),
- ... Row(age=None, height=None, name="Tom"),
- ... Row(age=None, height=None, name=None),
- ... ]
- ... )
+ >>> df4 = spark.createDataFrame([
+ ... Row(age=10, height=80, name="Alice"),
+ ... Row(age=5, height=None, name="Bob"),
+ ... Row(age=None, height=None, name="Tom"),
+ ... Row(age=None, height=None, name=None),
+ ... ])

  Inner join on columns (default)

- >>> df.join(df2, "name").select(df.name, df2.height).show()
+ >>> df.join(df2, 'name').select(df.name, df2.height).show()
  +----+------+
  |name|height|
  +----+------+
  | Bob| 85|
  +----+------+
- >>> df.join(df4, ["name", "age"]).select(df.name, df.age).show()
+ >>> df.join(df4, ['name', 'age']).select(df.name, df.age).show()
  +----+---+
  |name|age|
  +----+---+
@@ -642,9 +660,8 @@ class DataFrame: # noqa: D101

  Outer join for both DataFrames on the 'name' column.

- >>> df.join(df2, df.name == df2.name, "outer").select(df.name, df2.height).sort(
- ... desc("name")
- ... ).show()
+ >>> df.join(df2, df.name == df2.name, 'outer').select(
+ ... df.name, df2.height).sort(desc("name")).show()
  +-----+------+
  | name|height|
  +-----+------+
@@ -652,7 +669,7 @@ class DataFrame: # noqa: D101
  |Alice| NULL|
  | NULL| 80|
  +-----+------+
- >>> df.join(df2, "name", "outer").select("name", "height").sort(desc("name")).show()
+ >>> df.join(df2, 'name', 'outer').select('name', 'height').sort(desc("name")).show()
  +-----+------+
  | name|height|
  +-----+------+
@@ -663,9 +680,11 @@ class DataFrame: # noqa: D101

  Outer join for both DataFrams with multiple columns.

- >>> df.join(df3, [df.name == df3.name, df.age == df3.age], "outer").select(
- ... df.name, df3.age
- ... ).show()
+ >>> df.join(
+ ... df3,
+ ... [df.name == df3.name, df.age == df3.age],
+ ... 'outer'
+ ... ).select(df.name, df3.age).show()
  +-----+---+
  | name|age|
  +-----+---+
@@ -673,16 +692,20 @@ class DataFrame: # noqa: D101
  | Bob| 5|
  +-----+---+
  """
+
  if on is not None and not isinstance(on, list):
  on = [on] # type: ignore[assignment]
- if on is not None and not all(isinstance(x, str) for x in on):
+ if on is not None and not all([isinstance(x, str) for x in on]):
  assert isinstance(on, list)
  # Get (or create) the Expressions from the list of Columns
- on = [spark_sql_functions._to_column_expr(x) for x in on]
+ on = [_to_column_expr(x) for x in on]

  # & all the Expressions together to form one Expression
- assert isinstance(on[0], Expression), "on should be Column or list of Column"
- on = reduce(lambda x, y: x.__and__(y), cast("list[Expression]", on))
+ assert isinstance(
+ on[0], Expression
+ ), "on should be Column or list of Column"
+ on = reduce(lambda x, y: x.__and__(y), cast(List[Expression], on))
+

  if on is None and how is None:
  result = self.relation.join(other.relation)
@@ -691,14 +714,14 @@ class DataFrame: # noqa: D101
  how = "inner"
  if on is None:
  on = "true"
- elif isinstance(on, list) and all(isinstance(x, str) for x in on):
+ elif isinstance(on, list) and all([isinstance(x, str) for x in on]):
  # Passed directly through as a list of strings
  on = on
  else:
  on = str(on)
  assert isinstance(how, str), "how should be a string"

- def map_to_recognized_jointype(how: str) -> str:
+ def map_to_recognized_jointype(how):
  known_aliases = {
  "inner": [],
  "outer": ["full", "fullouter", "full_outer"],
@@ -707,10 +730,15 @@ class DataFrame: # noqa: D101
  "anti": ["leftanti", "left_anti"],
  "semi": ["leftsemi", "left_semi"],
  }
+ mapped_type = None
  for type, aliases in known_aliases.items():
  if how == type or how in aliases:
- return type
- return how
+ mapped_type = type
+ break
+
+ if not mapped_type:
+ mapped_type = how
+ return mapped_type

  how = map_to_recognized_jointype(how)
  result = self.relation.join(other.relation, on, how)
@@ -729,16 +757,18 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Right side of the cartesian product.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Joined DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql import Row
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
- >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df2 = spark.createDataFrame(
+ ... [Row(height=80, name="Tom"), Row(height=85, name="Bob")])
  >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()
  +---+-----+------+
  |age| name|height|
@@ -761,21 +791,21 @@ class DataFrame: # noqa: D101
  alias : str
  an alias name to be set for the :class:`DataFrame`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Aliased DataFrame.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql.functions import col, desc
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
  >>> df_as1 = df.alias("df_as1")
  >>> df_as2 = df.alias("df_as2")
- >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
- >>> joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").sort(
- ... desc("df_as1.name")
- ... ).show()
+ >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
+ >>> joined_df.select(
+ ... "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
  +-----+-----+---+
  | name| name|age|
  +-----+-----+---+
@@ -787,7 +817,7 @@ class DataFrame: # noqa: D101
  assert isinstance(alias, str), "alias should be a string"
  return DataFrame(self.relation.set_alias(alias), self.session)

- def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc] # noqa: D102
+ def drop(self, *cols: "ColumnOrName") -> "DataFrame": # type: ignore[misc]
  exclude = []
  for col in cols:
  if isinstance(col, str):
@@ -804,7 +834,7 @@ class DataFrame: # noqa: D101
  expr = StarExpression(exclude=exclude)
  return DataFrame(self.relation.select(expr), self.session)

- def __repr__(self) -> str: # noqa: D105
+ def __repr__(self) -> str:
  return str(self.relation)

  def limit(self, num: int) -> "DataFrame":
@@ -816,14 +846,15 @@ class DataFrame: # noqa: D101
  Number of records to return. Will return this number of records
  or all records if the DataFrame contains less than this number of records.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Subset of the records

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
  >>> df.limit(1).show()
  +---+----+
  |age|name|
@@ -839,15 +870,17 @@ class DataFrame: # noqa: D101
  rel = self.relation.limit(num)
  return DataFrame(rel, self.session)

- def __contains__(self, item: str) -> bool:
- """Check if the :class:`DataFrame` contains a column by the name of `item`."""
+ def __contains__(self, item: str):
+ """
+ Check if the :class:`DataFrame` contains a column by the name of `item`
+ """
  return item in self.relation

  @property
  def schema(self) -> StructType:
  """Returns the schema of this :class:`DataFrame` as a :class:`duckdb.experimental.spark.sql.types.StructType`.

- Examples:
+ Examples
  --------
  >>> df.schema
  StructType([StructField('age', IntegerType(), True),
@@ -856,21 +889,25 @@ class DataFrame: # noqa: D101
  return self._schema

  @overload
- def __getitem__(self, item: Union[int, str]) -> Column: ...
+ def __getitem__(self, item: Union[int, str]) -> Column:
+ ...

  @overload
- def __getitem__(self, item: Union[Column, list, tuple]) -> "DataFrame": ...
+ def __getitem__(self, item: Union[Column, List, Tuple]) -> "DataFrame":
+ ...

- def __getitem__(self, item: Union[int, str, Column, list, tuple]) -> Union[Column, "DataFrame"]:
+ def __getitem__(
+ self, item: Union[int, str, Column, List, Tuple]
+ ) -> Union[Column, "DataFrame"]:
  """Returns the column as a :class:`Column`.

- Examples:
+ Examples
  --------
- >>> df.select(df["age"]).collect()
+ >>> df.select(df['age']).collect()
  [Row(age=2), Row(age=5)]
- >>> df[["name", "age"]].collect()
+ >>> df[ ["name", "age"]].collect()
  [Row(name='Alice', age=2), Row(name='Bob', age=5)]
- >>> df[df.age > 3].collect()
+ >>> df[ df.age > 3 ].collect()
  [Row(age=5, name='Bob')]
  >>> df[df[0] > 3].collect()
  [Row(age=5, name='Bob')]
@@ -882,29 +919,31 @@ class DataFrame: # noqa: D101
  elif isinstance(item, (list, tuple)):
  return self.select(*item)
  elif isinstance(item, int):
- return spark_sql_functions.col(self._schema[item].name)
+ return col(self._schema[item].name)
  else:
- msg = f"Unexpected item type: {type(item)}"
- raise TypeError(msg)
+ raise TypeError(f"Unexpected item type: {type(item)}")

  def __getattr__(self, name: str) -> Column:
  """Returns the :class:`Column` denoted by ``name``.

- Examples:
+ Examples
  --------
  >>> df.select(df.age).collect()
  [Row(age=2), Row(age=5)]
  """
  if name not in self.relation.columns:
- msg = f"'{self.__class__.__name__}' object has no attribute '{name}'"
- raise AttributeError(msg)
+ raise AttributeError(
+ "'%s' object has no attribute '%s'" % (self.__class__.__name__, name)
+ )
  return Column(duckdb.ColumnExpression(self.relation.alias, name))

  @overload
- def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": ...
+ def groupBy(self, *cols: "ColumnOrName") -> "GroupedData":
+ ...

  @overload
- def groupBy(self, __cols: Union[list[Column], list[str]]) -> "GroupedData": ... # noqa: PYI063
+ def groupBy(self, __cols: Union[List[Column], List[str]]) -> "GroupedData":
+ ...

  def groupBy(self, *cols: "ColumnOrName") -> "GroupedData": # type: ignore[misc]
  """Groups the :class:`DataFrame` using the specified columns,
@@ -920,16 +959,15 @@ class DataFrame: # noqa: D101
  Each element should be a column name (string) or an expression (:class:`Column`)
  or list of them.

- Returns:
+ Returns
  -------
  :class:`GroupedData`
  Grouped data by given columns.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame(
- ... [(2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"]
- ... )
+ >>> df = spark.createDataFrame([
+ ... (2, "Alice"), (2, "Bob"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])

  Empty grouping columns triggers a global aggregation.

@@ -970,19 +1008,22 @@ class DataFrame: # noqa: D101
  | Bob| 2| 2|
  | Bob| 5| 1|
  +-----+---+-----+
- """ # noqa: D205
+ """
  from .group import GroupedData, Grouping

- columns = cols[0] if len(cols) == 1 and isinstance(cols[0], list) else cols
+ if len(cols) == 1 and isinstance(cols[0], list):
+ columns = cols[0]
+ else:
+ columns = cols
  return GroupedData(Grouping(*columns), self)

  groupby = groupBy

  @property
- def write(self) -> DataFrameWriter: # noqa: D102
+ def write(self) -> DataFrameWriter:
  return DataFrameWriter(self)

- def printSchema(self) -> None: # noqa: D102
+ def printSchema(self):
  raise ContributionsAcceptedError

  def union(self, other: "DataFrame") -> "DataFrame":
@@ -994,22 +1035,22 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be unioned

- Returns:
+ Returns
  -------
  :class:`DataFrame`

- See Also:
+ See Also
  --------
  DataFrame.unionAll

- Notes:
+ Notes
  -----
  This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union
  (that does deduplication of elements), use this function followed by :func:`distinct`.

  Also as standard in SQL, this function resolves columns by position (not by name).

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])
  >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])
@@ -1027,12 +1068,14 @@ class DataFrame: # noqa: D101
  | 1| 2| 3|
  | 1| 2| 3|
  +----+----+----+
- """ # noqa: D205
+ """
  return DataFrame(self.relation.union(other.relation), self.session)

  unionAll = union

- def unionByName(self, other: "DataFrame", allowMissingColumns: bool = False) -> "DataFrame":
+ def unionByName(
+ self, other: "DataFrame", allowMissingColumns: bool = False
+ ) -> "DataFrame":
  """Returns a new :class:`DataFrame` containing union of rows in this and another
  :class:`DataFrame`.

@@ -1053,12 +1096,12 @@ class DataFrame: # noqa: D101

  .. versionadded:: 3.1.0

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Examples:
+ Examples
  --------
  The difference between this function and :func:`union` is that this function
  resolves columns by name (not by position):
@@ -1087,14 +1130,14 @@ class DataFrame: # noqa: D101
  | 1| 2| 3|NULL|
  |NULL| 4| 5| 6|
  +----+----+----+----+
- """ # noqa: D205
+ """
  if allowMissingColumns:
  cols = []
  for col in self.relation.columns:
  if col in other.relation.columns:
  cols.append(col)
  else:
- cols.append(spark_sql_functions.lit(None))
+ cols.append(lit(None))
  other = other.select(*cols)
  else:
  other = other.select(*self.relation.columns)
@@ -1117,16 +1160,16 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be combined.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Notes:
+ Notes
  -----
  This is equivalent to `INTERSECT` in SQL.

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1137,7 +1180,7 @@ class DataFrame: # noqa: D101
  | b| 3|
  | a| 1|
  +---+---+
- """ # noqa: D205
+ """
  return self.intersectAll(other).drop_duplicates()

  def intersectAll(self, other: "DataFrame") -> "DataFrame":
@@ -1157,12 +1200,12 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  Another :class:`DataFrame` that needs to be combined.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Combined DataFrame.

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])
@@ -1174,7 +1217,7 @@ class DataFrame: # noqa: D101
  | a| 1|
  | b| 3|
  +---+---+
- """ # noqa: D205
+ """
  return DataFrame(self.relation.intersect(other.relation), self.session)

  def exceptAll(self, other: "DataFrame") -> "DataFrame":
@@ -1194,15 +1237,14 @@ class DataFrame: # noqa: D101
  other : :class:`DataFrame`
  The other :class:`DataFrame` to compare to.

- Returns:
+ Returns
  -------
  :class:`DataFrame`

- Examples:
+ Examples
  --------
  >>> df1 = spark.createDataFrame(
- ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"]
- ... )
+ ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])
  >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])
  >>> df1.exceptAll(df2).show()
  +---+---+
@@ -1214,10 +1256,10 @@ class DataFrame: # noqa: D101
  | c| 4|
  +---+---+

- """ # noqa: D205
+ """
  return DataFrame(self.relation.except_(other.relation), self.session)

- def dropDuplicates(self, subset: Optional[list[str]] = None) -> "DataFrame":
+ def dropDuplicates(self, subset: Optional[List[str]] = None) -> "DataFrame":
  """Return a new :class:`DataFrame` with duplicate rows removed,
  optionally only considering certain columns.

@@ -1234,21 +1276,19 @@ class DataFrame: # noqa: D101
  subset : List of column names, optional
  List of columns to use for duplicate comparison (default All columns).

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame without duplicates.

- Examples:
+ Examples
  --------
  >>> from pyspark.sql import Row
- >>> df = spark.createDataFrame(
- ... [
- ... Row(name="Alice", age=5, height=80),
- ... Row(name="Alice", age=5, height=80),
- ... Row(name="Alice", age=10, height=80),
- ... ]
- ... )
+ >>> df = spark.createDataFrame([
+ ... Row(name='Alice', age=5, height=80),
+ ... Row(name='Alice', age=5, height=80),
+ ... Row(name='Alice', age=10, height=80)
+ ... ])

  Deduplicate the same rows.

@@ -1262,16 +1302,16 @@ class DataFrame: # noqa: D101

  Deduplicate values on 'name' and 'height' columns.

- >>> df.dropDuplicates(["name", "height"]).show()
+ >>> df.dropDuplicates(['name', 'height']).show()
  +-----+---+------+
  | name|age|height|
  +-----+---+------+
  |Alice| 5| 80|
  +-----+---+------+
- """ # noqa: D205
+ """
  if subset:
  rn_col = f"tmp_col_{uuid.uuid1().hex}"
- subset_str = ", ".join([f'"{c}"' for c in subset])
+ subset_str = ', '.join([f'"{c}"' for c in subset])
  window_spec = f"OVER(PARTITION BY {subset_str}) AS {rn_col}"
  df = DataFrame(self.relation.row_number(window_spec, "*"), self.session)
  return df.filter(f"{rn_col} = 1").drop(rn_col)
@@ -1280,17 +1320,19 @@ class DataFrame: # noqa: D101

  drop_duplicates = dropDuplicates

+
  def distinct(self) -> "DataFrame":
  """Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  DataFrame with distinct records.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])

  Return the number of distinct rows in the :class:`DataFrame`

@@ -1303,14 +1345,15 @@ class DataFrame: # noqa: D101
  def count(self) -> int:
  """Returns the number of rows in this :class:`DataFrame`.

- Returns:
+ Returns
  -------
  int
  Number of rows.

- Examples:
+ Examples
  --------
- >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+ >>> df = spark.createDataFrame(
+ ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])

  Return the number of rows in the :class:`DataFrame`.

@@ -1326,28 +1369,33 @@ class DataFrame: # noqa: D101
  assert types_count == len(existing_columns)

  cast_expressions = [
- f"{existing}::{target_type} as {existing}" for existing, target_type in zip(existing_columns, types)
+ f"{existing}::{target_type} as {existing}"
+ for existing, target_type in zip(existing_columns, types)
  ]
  cast_expressions = ", ".join(cast_expressions)
  new_rel = self.relation.project(cast_expressions)
  return DataFrame(new_rel, self.session)

- def toDF(self, *cols) -> "DataFrame": # noqa: D102
+ def toDF(self, *cols) -> "DataFrame":
  existing_columns = self.relation.columns
  column_count = len(cols)
  if column_count != len(existing_columns):
- raise PySparkValueError(message="Provided column names and number of columns in the DataFrame don't match")
+ raise PySparkValueError(
+ message="Provided column names and number of columns in the DataFrame don't match"
+ )

  existing_columns = [ColumnExpression(x) for x in existing_columns]
- projections = [existing.alias(new) for existing, new in zip(existing_columns, cols)]
+ projections = [
+ existing.alias(new) for existing, new in zip(existing_columns, cols)
+ ]
  new_rel = self.relation.project(*projections)
  return DataFrame(new_rel, self.session)

- def collect(self) -> list[Row]: # noqa: D102
+ def collect(self) -> List[Row]:
  columns = self.relation.columns
  result = self.relation.fetchall()

- def construct_row(values: list, names: list[str]) -> Row:
+ def construct_row(values, names) -> Row:
  row = tuple.__new__(Row, list(values))
  row.__fields__ = list(names)
  return row
@@ -1363,16 +1411,16 @@ class DataFrame: # noqa: D101
  .. versionchanged:: 3.4.0
  Supports Spark Connect.

- Notes:
+ Notes
  -----
  The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.

- Returns:
+ Returns
  -------
  :class:`DataFrame`
  Cached DataFrame.

- Examples:
+ Examples
  --------
  >>> df = spark.range(1)
  >>> df.cache()