duckdb 1.4.1.dev113__cp39-cp39-macosx_10_9_universal2.whl → 1.5.0.dev37__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _duckdb.cpython-39-darwin.so +0 -0
- duckdb/__init__.py +374 -373
- duckdb/__init__.pyi +180 -604
- duckdb/bytes_io_wrapper.py +7 -6
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +54 -47
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +64 -66
- duckdb/experimental/spark/sql/types.py +344 -308
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +8 -13
- duckdb/functional/__init__.py +16 -2
- duckdb/polars_io.py +57 -66
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -8
- duckdb/typing/__init__.pyi +2 -4
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +59 -61
- duckdb/value/constant/__init__.pyi +4 -3
- duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
- duckdb-1.4.1.dev113.dist-info/METADATA +0 -326
- duckdb-1.4.1.dev113.dist-info/RECORD +0 -47
- {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/session.py

```diff
@@ -1,31 +1,32 @@
-import
-
-from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
-
-import duckdb
+from typing import Optional, List, Any, Union, Iterable, TYPE_CHECKING
+import uuid

 if TYPE_CHECKING:
-    from pandas.core.frame import DataFrame as PandasDataFrame
-
     from .catalog import Catalog
+    from pandas.core.frame import DataFrame as PandasDataFrame

-
-from ..conf import SparkConf
-from ..context import SparkContext
-from ..errors import PySparkTypeError
 from ..exception import ContributionsAcceptedError
-from .
+from .types import StructType, AtomicType, DataType
+from ..conf import SparkConf
 from .dataframe import DataFrame
+from .conf import RuntimeConfig
 from .readwriter import DataFrameReader
-from
-from .types import StructType
+from ..context import SparkContext
 from .udf import UDFRegistration
+from .streaming import DataStreamReader
+import duckdb
+
+from ..errors import (
+    PySparkTypeError,
+    PySparkValueError
+)
+
+from ..errors.error_classes import *

 # In spark:
 # SparkSession holds a SparkContext
 # SparkContext gets created from SparkConf
-# At this level the check is made to determine whether the instance already exists and just needs
-# to be retrieved or it needs to be created.
+# At this level the check is made to determine whether the instance already exists and just needs to be retrieved or it needs to be created

 # For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
 # SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
@@ -33,7 +34,7 @@ from .udf import UDFRegistration

 # data is a List of rows
 # every value in each row needs to be turned into a Value
-def _combine_data_and_schema(data: Iterable[Any], schema: StructType)
+def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
     from duckdb import Value

     new_data = []
@@ -43,8 +44,8 @@ def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[du
     return new_data


-class SparkSession:
-    def __init__(self, context: SparkContext)
+class SparkSession:
+    def __init__(self, context: SparkContext):
         self.conn = context.connection
         self._context = context
         self._conf = RuntimeConfig(self.conn)
@@ -52,16 +53,15 @@ class SparkSession: # noqa: D101
     def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
         try:
             import pandas
-
             has_pandas = True
         except ImportError:
             has_pandas = False
         if has_pandas and isinstance(data, pandas.DataFrame):
-            unique_name = f
+            unique_name = f'pyspark_pandas_df_{uuid.uuid1()}'
             self.conn.register(unique_name, data)
             return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)

-        def verify_tuple_integrity(tuples
+        def verify_tuple_integrity(tuples):
             if len(tuples) <= 1:
                 return
             expected_length = len(tuples[0])
@@ -73,9 +73,9 @@ class SparkSession: # noqa: D101
                     error_class="LENGTH_SHOULD_BE_THE_SAME",
                     message_parameters={
                         "arg1": f"data{i}",
-                        "arg2": f"data{i
+                        "arg2": f"data{i+1}",
                         "arg1_length": str(expected_length),
-                        "arg2_length": str(actual_length)
+                        "arg2_length": str(actual_length)
                     },
                 )

@@ -83,16 +83,16 @@ class SparkSession: # noqa: D101
             data = list(data)
         verify_tuple_integrity(data)

-        def construct_query(tuples
-            def construct_values_list(row
+        def construct_query(tuples) -> str:
+            def construct_values_list(row, start_param_idx):
                 parameter_count = len(row)
-                parameters = [f
-                parameters =
+                parameters = [f'${x+start_param_idx}' for x in range(parameter_count)]
+                parameters = '(' + ', '.join(parameters) + ')'
                 return parameters

             row_size = len(tuples[0])
             values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
-            values_list =
+            values_list = ', '.join(values_list)

             query = f"""
                 select * from (values {values_list})
@@ -101,7 +101,7 @@ class SparkSession: # noqa: D101

         query = construct_query(data)

-        def construct_parameters(tuples
+        def construct_parameters(tuples):
             parameters = []
             for row in tuples:
                 parameters.extend(list(row))
@@ -112,9 +112,7 @@ class SparkSession: # noqa: D101
         rel = self.conn.sql(query, params=parameters)
         return DataFrame(rel, self)

-    def _createDataFrameFromPandas(
-        self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None]
-    ) -> DataFrame:
+    def _createDataFrameFromPandas(self, data: "PandasDataFrame", types, names) -> DataFrame:
         df = self._create_dataframe(data)

         # Cast to types
@@ -125,10 +123,10 @@ class SparkSession: # noqa: D101
             df = df.toDF(*names)
         return df

-    def createDataFrame(
+    def createDataFrame(
         self,
         data: Union["PandasDataFrame", Iterable[Any]],
-        schema: Optional[Union[StructType,
+        schema: Optional[Union[StructType, List[str]]] = None,
         samplingRatio: Optional[float] = None,
         verifySchema: bool = True,
     ) -> DataFrame:
@@ -177,7 +175,7 @@ class SparkSession: # noqa: D101
         if is_empty:
             rel = df.relation
             # Add impossible where clause
-            rel = rel.filter(
+            rel = rel.filter('1=0')
             df = DataFrame(rel, self)

         # Cast to types
@@ -188,10 +186,10 @@ class SparkSession: # noqa: D101
             df = df.toDF(*names)
         return df

-    def newSession(self) -> "SparkSession":
+    def newSession(self) -> "SparkSession":
         return SparkSession(self._context)

-    def range(
+    def range(
         self,
         start: int,
         end: Optional[int] = None,
@@ -205,26 +203,26 @@ class SparkSession: # noqa: D101
             end = start
             start = 0

-        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),
+        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),self)

-    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
+    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
         if kwargs:
             raise NotImplementedError
         relation = self.conn.sql(sqlQuery)
         return DataFrame(relation, self)

-    def stop(self) -> None:
+    def stop(self) -> None:
         self._context.stop()

-    def table(self, tableName: str) -> DataFrame:
+    def table(self, tableName: str) -> DataFrame:
         relation = self.conn.table(tableName)
         return DataFrame(relation, self)

-    def getActiveSession(self) -> "SparkSession":
+    def getActiveSession(self) -> "SparkSession":
         return self

     @property
-    def catalog(self) -> "Catalog":
+    def catalog(self) -> "Catalog":
         if not hasattr(self, "_catalog"):
             from duckdb.experimental.spark.sql.catalog import Catalog

@@ -232,62 +230,59 @@ class SparkSession: # noqa: D101
         return self._catalog

     @property
-    def conf(self) -> RuntimeConfig:
+    def conf(self) -> RuntimeConfig:
         return self._conf

     @property
-    def read(self) -> DataFrameReader:
+    def read(self) -> DataFrameReader:
         return DataFrameReader(self)

     @property
-    def readStream(self) -> DataStreamReader:
+    def readStream(self) -> DataStreamReader:
         return DataStreamReader(self)

     @property
-    def sparkContext(self) -> SparkContext:
+    def sparkContext(self) -> SparkContext:
         return self._context

     @property
-    def streams(self) ->
+    def streams(self) -> Any:
         raise ContributionsAcceptedError

     @property
-    def udf(self) -> UDFRegistration:
+    def udf(self) -> UDFRegistration:
         return UDFRegistration(self)

     @property
-    def version(self) -> str:
-        return
+    def version(self) -> str:
+        return '1.0.0'

-    class Builder:
-        def __init__(self)
+    class Builder:
+        def __init__(self):
             pass

-        def master(self, name: str) -> "SparkSession.Builder":
+        def master(self, name: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def appName(self, name: str) -> "SparkSession.Builder":
+        def appName(self, name: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def remote(self, url: str) -> "SparkSession.Builder":
+        def remote(self, url: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def getOrCreate(self) -> "SparkSession":
+        def getOrCreate(self) -> "SparkSession":
             context = SparkContext("__ignored__")
             return SparkSession(context)

-        def config(
-            self,
-            key: Optional[str] = None,
-            value: Optional[Any] = None, # noqa: ANN401
-            conf: Optional[SparkConf] = None,
+        def config(
+            self, key: Optional[str] = None, value: Optional[Any] = None, conf: Optional[SparkConf] = None
         ) -> "SparkSession.Builder":
             return self

-        def enableHiveSupport(self) -> "SparkSession.Builder":
+        def enableHiveSupport(self) -> "SparkSession.Builder":
             # no-op
             return self

```
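For orientation, here is a minimal usage sketch of the session code shown above. `Builder.getOrCreate()`, `createDataFrame()` (which builds a parameterized VALUES query via `construct_query`/`construct_parameters`) and `sql()` all appear in the hunks; the import path and the sample data are assumptions about this wheel, not part of the diff.

```python
# Sketch only -- method names come from the hunks above; everything else is assumed.
from duckdb.experimental.spark.sql.session import SparkSession

spark = SparkSession.Builder().getOrCreate()

# createDataFrame() turns the rows into
#   select * from (values ($1, $2), ($3, $4))
# and binds the flattened tuple values via conn.sql(query, params=...).
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

answer = spark.sql("select 42 as answer")
```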
duckdb/experimental/spark/sql/streaming.py

```diff
@@ -1,5 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union
-
+from typing import TYPE_CHECKING, Optional, Union
 from .types import StructType

 if TYPE_CHECKING:
@@ -10,26 +9,28 @@ PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]


-class DataStreamWriter:
-    def __init__(self, dataframe: "DataFrame")
+class DataStreamWriter:
+    def __init__(self, dataframe: "DataFrame"):
         self.dataframe = dataframe

-    def toTable(self, table_name: str) -> None:
+    def toTable(self, table_name: str) -> None:
         # Should we register the dataframe or create a table from the contents?
         raise NotImplementedError


-class DataStreamReader:
-    def __init__(self, session: "SparkSession")
+class DataStreamReader:
+    def __init__(self, session: "SparkSession"):
         self.session = session

-    def load(
+    def load(
         self,
         path: Optional[str] = None,
         format: Optional[str] = None,
         schema: Union[StructType, str, None] = None,
-        **options: OptionalPrimitiveType
+        **options: OptionalPrimitiveType
     ) -> "DataFrame":
+        from duckdb.experimental.spark.sql.dataframe import DataFrame
+
         raise NotImplementedError


```
duckdb/experimental/spark/sql/type_utils.py

```diff
@@ -1,107 +1,105 @@
-from typing import cast # noqa: D100
-
 from duckdb.typing import DuckDBPyType
-
+from typing import List, Tuple, cast
 from .types import (
-
+    DataType,
+    StringType,
     BinaryType,
     BitstringType,
+    UUIDType,
     BooleanType,
-    ByteType,
-    DataType,
     DateType,
-
+    TimestampType,
+    TimestampNTZType,
+    TimeType,
+    TimeNTZType,
+    TimestampNanosecondNTZType,
+    TimestampMilisecondNTZType,
+    TimestampSecondNTZType,
     DecimalType,
     DoubleType,
     FloatType,
-
+    ByteType,
+    UnsignedByteType,
+    ShortType,
+    UnsignedShortType,
     IntegerType,
+    UnsignedIntegerType,
     LongType,
+    UnsignedLongType,
+    HugeIntegerType,
+    UnsignedHugeIntegerType,
+    DayTimeIntervalType,
+    ArrayType,
     MapType,
-    ShortType,
-    StringType,
     StructField,
     StructType,
-    TimeNTZType,
-    TimestampMilisecondNTZType,
-    TimestampNanosecondNTZType,
-    TimestampNTZType,
-    TimestampSecondNTZType,
-    TimestampType,
-    TimeType,
-    UnsignedByteType,
-    UnsignedHugeIntegerType,
-    UnsignedIntegerType,
-    UnsignedLongType,
-    UnsignedShortType,
-    UUIDType,
 )

 _sqltype_to_spark_class = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    'boolean': BooleanType,
+    'utinyint': UnsignedByteType,
+    'tinyint': ByteType,
+    'usmallint': UnsignedShortType,
+    'smallint': ShortType,
+    'uinteger': UnsignedIntegerType,
+    'integer': IntegerType,
+    'ubigint': UnsignedLongType,
+    'bigint': LongType,
+    'hugeint': HugeIntegerType,
+    'uhugeint': UnsignedHugeIntegerType,
+    'varchar': StringType,
+    'blob': BinaryType,
+    'bit': BitstringType,
+    'uuid': UUIDType,
+    'date': DateType,
+    'time': TimeNTZType,
+    'time with time zone': TimeType,
+    'timestamp': TimestampNTZType,
+    'timestamp with time zone': TimestampType,
+    'timestamp_ms': TimestampNanosecondNTZType,
+    'timestamp_ns': TimestampMilisecondNTZType,
+    'timestamp_s': TimestampSecondNTZType,
+    'interval': DayTimeIntervalType,
+    'list': ArrayType,
+    'struct': StructType,
+    'map': MapType,
     # union
     # enum
     # null (???)
-
-
-
+    'float': FloatType,
+    'double': DoubleType,
+    'decimal': DecimalType,
 }


-def convert_nested_type(dtype: DuckDBPyType) -> DataType:
+def convert_nested_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id ==
+    if id == 'list' or id == 'array':
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'
-    if id ==
-        children:
+    # TODO: add support for 'union'
+    if id == 'struct':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
         return StructType(fields)
-    if id ==
+    if id == 'map':
         return MapType(convert_type(dtype.key), convert_type(dtype.value))
     raise NotImplementedError


-def convert_type(dtype: DuckDBPyType) -> DataType:
+def convert_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id in [
+    if id in ['list', 'struct', 'map', 'array']:
         return convert_nested_type(dtype)
-    if id ==
-        children:
-        precision = cast(
-        scale = cast(
+    if id == 'decimal':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
+        precision = cast(int, children[0][1])
+        scale = cast(int, children[1][1])
         return DecimalType(precision, scale)
     spark_type = _sqltype_to_spark_class[id]
     return spark_type()


-def duckdb_to_spark_schema(names:
+def duckdb_to_spark_schema(names: List[str], types: List[DuckDBPyType]) -> StructType:
     fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
     return StructType(fields)
```