duckdb 1.4.1__cp39-cp39-macosx_10_9_universal2.whl → 1.5.0.dev44__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckdb might be problematic.
- _duckdb.cpython-39-darwin.so +0 -0
- duckdb/__init__.py +435 -341
- duckdb/__init__.pyi +713 -0
- duckdb/bytes_io_wrapper.py +9 -12
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +55 -48
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +65 -67
- duckdb/experimental/spark/sql/types.py +345 -309
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +16 -26
- duckdb/functional/__init__.py +16 -12
- duckdb/functional/__init__.pyi +31 -0
- duckdb/polars_io.py +83 -130
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -18
- duckdb/typing/__init__.pyi +36 -0
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +60 -62
- duckdb/value/constant/__init__.pyi +115 -0
- duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
- _duckdb-stubs/__init__.pyi +0 -1443
- _duckdb-stubs/_func.pyi +0 -46
- _duckdb-stubs/_sqltypes.pyi +0 -75
- adbc_driver_duckdb/__init__.py +0 -50
- adbc_driver_duckdb/dbapi.py +0 -115
- duckdb/_dbapi_type_object.py +0 -231
- duckdb/_version.py +0 -22
- duckdb/func/__init__.py +0 -3
- duckdb/sqltypes/__init__.py +0 -63
- duckdb-1.4.1.dist-info/METADATA +0 -326
- duckdb-1.4.1.dist-info/RECORD +0 -52
- /duckdb/{py.typed → value/__init__.pyi} +0 -0
- {duckdb-1.4.1.dist-info → duckdb-1.5.0.dev44.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dist-info → duckdb-1.5.0.dev44.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/udf.py
CHANGED

@@ -1,4 +1,4 @@
-# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
+# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
 from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
 
 from .types import DataType
@@ -10,11 +10,11 @@ DataTypeOrString = Union[DataType, str]
 UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike")
 
 
-class UDFRegistration:
-    def __init__(self, sparkSession: "SparkSession")
+class UDFRegistration:
+    def __init__(self, sparkSession: "SparkSession"):
         self.sparkSession = sparkSession
 
-    def register(
+    def register(
         self,
         name: str,
         f: Union[Callable[..., Any], "UserDefinedFunctionLike"],
@@ -22,7 +22,7 @@ class UDFRegistration: # noqa: D101
     ) -> "UserDefinedFunctionLike":
         self.sparkSession.conn.create_function(name, f, return_type=returnType)
 
-    def registerJavaFunction(
+    def registerJavaFunction(
         self,
         name: str,
         javaClassName: str,
@@ -30,7 +30,7 @@ class UDFRegistration: # noqa: D101
     ) -> None:
         raise NotImplementedError
 
-    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
+    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
         raise NotImplementedError
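For context, `UDFRegistration.register` above is a thin Spark-compatibility shim: it forwards straight to DuckDB's native `create_function` on the session's connection. A minimal sketch of the equivalent direct call (the function name and types are illustrative, not taken from this diff):

```python
import duckdb
from duckdb.typing import BIGINT

con = duckdb.connect()

# register(name, f, returnType=...) in the Spark shim boils down to a
# create_function call on the underlying connection, as the hunk above shows.
con.create_function("plus_one", lambda x: x + 1, [BIGINT], BIGINT)

print(con.sql("SELECT plus_one(41) AS answer").fetchall())  # [(42,)]
```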
duckdb/filesystem.py
CHANGED

@@ -1,33 +1,23 @@
-
-
-Warning: Not for external use. May change at any moment. Likely to be made internal.
-"""
-
-from __future__ import annotations
-
-import io
-import typing
-
-from fsspec import AbstractFileSystem
-from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
-
+from fsspec import filesystem, AbstractFileSystem
+from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
 from .bytes_io_wrapper import BytesIOWrapper
+from io import TextIOBase
 
+def is_file_like(obj):
+    # We only care that we can read from the file
+    return hasattr(obj, "read") and hasattr(obj, "seek")
 
-class ModifiedMemoryFileSystem(MemoryFileSystem):
-    """In-memory filesystem implementation that uses its own protocol."""
 
-
+class ModifiedMemoryFileSystem(MemoryFileSystem):
+    protocol = ('DUCKDB_INTERNAL_OBJECTSTORE',)
     # defer to the original implementation that doesn't hardcode the protocol
-    _strip_protocol
+    _strip_protocol = classmethod(AbstractFileSystem._strip_protocol.__func__)
 
-    def add_file(self,
-
-
-            msg = "Can not read from a non file-like object"
-            raise TypeError(msg)
-        if isinstance(obj, io.TextIOBase):
-            # Wrap this so that we can return a bytes object from 'read'
-            obj = BytesIOWrapper(obj)
+    def add_file(self, object, path):
+        if not is_file_like(object):
+            raise ValueError("Can not read from a non file-like object")
         path = self._strip_protocol(path)
-
+        if isinstance(object, TextIOBase):
+            # Wrap this so that we can return a bytes object from 'read'
+            object = BytesIOWrapper(object)
+        self.store[path] = MemoryFile(self, path, object.read())
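The rewritten `add_file` is what lets DuckDB read from arbitrary Python file-like objects: anything exposing `read`/`seek` is copied into the internal `DUCKDB_INTERNAL_OBJECTSTORE` memory filesystem, with text streams first wrapped in `BytesIOWrapper`. A rough sketch of a code path that exercises it, assuming the usual file-object support in `read_csv` (the internals routed through this module are an assumption):

```python
import io
import duckdb

# A text stream: add_file wraps it in BytesIOWrapper so read() yields bytes,
# then stores it as an fsspec MemoryFile keyed by the stripped path.
buf = io.StringIO("a,b\n1,2\n3,4\n")

print(duckdb.read_csv(buf).fetchall())  # [(1, 2), (3, 4)]
```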
duckdb/functional/__init__.py
CHANGED

@@ -1,13 +1,17 @@
-
-
-
-
-
-
-
-
-warnings.warn(
-    "`duckdb.functional` is deprecated and will be removed in a future version. Please use `duckdb.func` instead.",
-    DeprecationWarning,
-    stacklevel=2,
+from _duckdb.functional import (
+    FunctionNullHandling,
+    PythonUDFType,
+    SPECIAL,
+    DEFAULT,
+    NATIVE,
+    ARROW
 )
+
+__all__ = [
+    "FunctionNullHandling",
+    "PythonUDFType",
+    "SPECIAL",
+    "DEFAULT",
+    "NATIVE",
+    "ARROW"
+]
duckdb/functional/__init__.pyi
ADDED

@@ -0,0 +1,31 @@
+from typing import Dict
+
+SPECIAL: FunctionNullHandling
+DEFAULT: FunctionNullHandling
+
+NATIVE: PythonUDFType
+ARROW: PythonUDFType
+
+class FunctionNullHandling:
+    DEFAULT: FunctionNullHandling
+    SPECIAL: FunctionNullHandling
+    def __int__(self) -> int: ...
+    def __index__(self) -> int: ...
+    @property
+    def __members__(self) -> Dict[str, FunctionNullHandling]: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def value(self) -> int: ...
+
+class PythonUDFType:
+    NATIVE: PythonUDFType
+    ARROW: PythonUDFType
+    def __int__(self) -> int: ...
+    def __index__(self) -> int: ...
+    @property
+    def __members__(self) -> Dict[str, PythonUDFType]: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def value(self) -> int: ...
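`duckdb.functional` now simply re-exports the enums from the `_duckdb.functional` extension module, and the new stub declares them for type checkers. In user code they are typically passed to `create_function`; a minimal sketch, assuming the usual `type` and `null_handling` keyword arguments of `create_function` (the UDF itself is made up for illustration):

```python
import duckdb
from duckdb.functional import PythonUDFType, FunctionNullHandling
from duckdb.typing import BIGINT

def double_or_none(x):
    # With SPECIAL null handling the UDF receives NULLs instead of being skipped.
    return None if x is None else x * 2

con = duckdb.connect()
con.create_function(
    "double_or_none",
    double_or_none,
    [BIGINT],
    BIGINT,
    type=PythonUDFType.NATIVE,
    null_handling=FunctionNullHandling.SPECIAL,
)
print(con.sql("SELECT double_or_none(21), double_or_none(NULL)").fetchall())
```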
duckdb/polars_io.py
CHANGED

@@ -1,30 +1,20 @@
-
+import duckdb
+import polars as pl
+from typing import Iterator, Optional
 
-import
-import
+from polars.io.plugins import register_io_source
+from duckdb import SQLExpression
 import json
-import typing
 from decimal import Decimal
+import datetime
 
-
-
-
-
-
-if typing.TYPE_CHECKING:
-    from collections.abc import Iterator
-
-    import typing_extensions
-
-    _ExpressionTree: typing_extensions.TypeAlias = typing.Dict[str, typing.Union[str, int, "_ExpressionTree", typing.Any]]  # noqa: UP006
-
-
-def _predicate_to_expression(predicate: pl.Expr) -> duckdb.Expression | None:
-    """Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
-
+def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
+    """
+    Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
+
     Parameters:
         predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)
-
+
     Returns:
         SQLExpression: A DuckDB SQL expression string equivalent.
         None: If conversion fails.
@@ -35,19 +25,20 @@ def _predicate_to_expression(predicate: pl.Expr) -> duckdb.Expression | None:
     """
     # Serialize the Polars expression tree to JSON
     tree = json.loads(predicate.meta.serialize(format="json"))
-
+
     try:
         # Convert the tree to SQL
         sql_filter = _pl_tree_to_sql(tree)
-        return
-    except
+        return SQLExpression(sql_filter)
+    except:
         # If the conversion fails, we return None
         return None
 
 
 def _pl_operation_to_sql(op: str) -> str:
-    """
-
+    """
+    Map Polars binary operation strings to SQL equivalents.
+
     Example:
         >>> _pl_operation_to_sql("Eq")
         '='
@@ -64,11 +55,12 @@ def _pl_operation_to_sql(op: str) -> str:
             "Or": "OR",
         }[op]
     except KeyError:
-        raise NotImplementedError(op)
+        raise NotImplementedError(op)
 
 
 def _escape_sql_identifier(identifier: str) -> str:
-    """
+    """
+    Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
 
     Example:
         >>> _escape_sql_identifier('column"name')
@@ -78,15 +70,16 @@ def _escape_sql_identifier(identifier: str) -> str:
     return f'"{escaped}"'
 
 
-def _pl_tree_to_sql(tree:
-    """
-
+def _pl_tree_to_sql(tree: dict) -> str:
+    """
+    Recursively convert a Polars expression tree (as JSON) to a SQL string.
+
     Parameters:
         tree (dict): JSON-deserialized expression tree from Polars
-
+
     Returns:
         str: SQL expression string
-
+
     Example:
         Input tree:
         {
@@ -99,51 +92,36 @@ def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
         Output: "(foo > 5)"
     """
     [node_type] = tree.keys()
+    subtree = tree[node_type]
 
     if node_type == "BinaryExpr":
         # Binary expressions: left OP right
-
-
-
-
-
-
-
+        return (
+            "(" +
+            " ".join((
+                _pl_tree_to_sql(subtree['left']),
+                _pl_operation_to_sql(subtree['op']),
+                _pl_tree_to_sql(subtree['right'])
+            )) +
+            ")"
+        )
     if node_type == "Column":
         # A reference to a column name
         # Wrap in quotes to handle special characters
-
-        assert isinstance(col_name, str), f"The col name of a {node_type} should be a str but got {type(col_name)}"
-        return _escape_sql_identifier(col_name)
+        return _escape_sql_identifier(subtree)
 
     if node_type in ("Literal", "Dyn"):
         # Recursively process dynamic or literal values
-
-        assert isinstance(val_tree, dict), f"A {node_type} should be a dict but got {type(val_tree)}"
-        return _pl_tree_to_sql(val_tree)
+        return _pl_tree_to_sql(subtree)
 
     if node_type == "Int":
         # Direct integer literals
-
-        assert isinstance(int_literal, (int, str)), (
-            f"The value of an Int should be an int or str but got {type(int_literal)}"
-        )
-        return str(int_literal)
+        return str(subtree)
 
     if node_type == "Function":
         # Handle boolean functions like IsNull, IsNotNull
-
-
-        inputs = func_tree["input"]
-        assert isinstance(inputs, list), f"A {node_type} should have a list of dicts as input but got {type(inputs)}"
-        input_tree = inputs[0]
-        assert isinstance(input_tree, dict), (
-            f"A {node_type} should have a list of dicts as input but got {type(input_tree)}"
-        )
-        func_dict = func_tree["function"]
-        assert isinstance(func_dict, dict), (
-            f"A {node_type} should have a function dict as input but got {type(func_dict)}"
-        )
+        inputs = subtree["input"]
+        func_dict = subtree["function"]
 
         if "Boolean" in func_dict:
             func = func_dict["Boolean"]
@@ -153,107 +131,80 @@ def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
                 return f"({arg_sql} IS NULL)"
             if func == "IsNotNull":
                 return f"({arg_sql} IS NOT NULL)"
-
-            raise NotImplementedError(msg)
+            raise NotImplementedError(f"Boolean function not supported: {func}")
 
-
-        raise NotImplementedError(msg)
+        raise NotImplementedError(f"Unsupported function type: {func_dict}")
 
     if node_type == "Scalar":
         # Detect format: old style (dtype/value) or new style (direct type key)
-
-
-
-            dtype = str(scalar_tree["dtype"])
-            value = scalar_tree["value"]
+        if "dtype" in subtree and "value" in subtree:
+            dtype = str(subtree["dtype"])
+            value = subtree["value"]
         else:
             # New style: dtype is the single key in the dict
-            dtype = next(iter(
-            value =
-            assert isinstance(dtype, str), f"A {node_type} should have a str dtype but got {type(dtype)}"
-            assert isinstance(value, dict), f"A {node_type} should have a dict value but got {type(value)}"
+            dtype = next(iter(subtree.keys()))
+            value = subtree
 
         # Decimal support
         if dtype.startswith("{'Decimal'") or dtype == "Decimal":
-            decimal_value = value[
-
-
-            )
-            assert 2 <= len(decimal_value) <= 3, (
-                f"A {dtype} should be a two or three member list but got {len(decimal_value)} member list"
-            )
-            return str(Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[-1]))
+            decimal_value = value['Decimal']
+            decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
+            return str(decimal_value)
 
         # Datetime with microseconds since epoch
         if dtype.startswith("{'Datetime'") or dtype == "Datetime":
-            micros = value[
-
-
-            return f"'{dt_timestamp!s}'::TIMESTAMP"
+            micros = value['Datetime'][0]
+            dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
+            return f"'{str(dt_timestamp)}'::TIMESTAMP"
 
         # Match simple numeric/boolean types
-        if dtype in (
-
-
-            "Int32",
-            "Int64",
-            "UInt8",
-            "UInt16",
-            "UInt32",
-            "UInt64",
-            "Float32",
-            "Float64",
-            "Boolean",
-        ):
+        if dtype in ("Int8", "Int16", "Int32", "Int64",
+                     "UInt8", "UInt16", "UInt32", "UInt64",
+                     "Float32", "Float64", "Boolean"):
            return str(value[dtype])
 
         # Time type
         if dtype == "Time":
            nanoseconds = value["Time"]
-            assert isinstance(nanoseconds, int), f"A {dtype} should be an int but got {type(nanoseconds)}"
            seconds = nanoseconds // 1_000_000_000
            microseconds = (nanoseconds % 1_000_000_000) // 1_000
-            dt_time = (datetime.datetime.min + datetime.timedelta(
+            dt_time = (datetime.datetime.min + datetime.timedelta(
+                seconds=seconds, microseconds=microseconds
+            )).time()
            return f"'{dt_time}'::TIME"
 
         # Date type
         if dtype == "Date":
            days_since_epoch = value["Date"]
-            assert isinstance(days_since_epoch, (float, int)), (
-                f"A {dtype} should be a number but got {type(days_since_epoch)}"
-            )
            date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
            return f"'{date}'::DATE"
 
         # Binary type
        if dtype == "Binary":
-
-
-            binary_data = bytes(bin_value)
-            escaped = "".join(f"\\x{b:02x}" for b in binary_data)
+            binary_data = bytes(value["Binary"])
+            escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
            return f"'{escaped}'::BLOB"
 
         # String type
        if dtype == "String" or dtype == "StringOwned":
            # Some new formats may store directly under StringOwned
-            string_val
+            string_val = value.get("StringOwned", value.get("String", None))
            return f"'{string_val}'"
 
-        msg = f"Unsupported scalar type {dtype!s}, with value {value}"
-        raise NotImplementedError(msg)
 
-
-        raise NotImplementedError(msg)
+        raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")
 
+    raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")
 
 def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
-    """
-
+    """
+    A polars IO plugin for DuckDB.
+    """
     def source_generator(
-        with_columns: list[str]
-        predicate: pl.Expr
-        n_rows: int
-        batch_size: int
+        with_columns: Optional[list[str]],
+        predicate: Optional[pl.Expr],
+        n_rows: Optional[int],
+        batch_size: Optional[int],
     ) -> Iterator[pl.DataFrame]:
         duck_predicate = None
         relation_final = relation
@@ -264,8 +215,7 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
            relation_final = relation_final.limit(n_rows)
         if predicate is not None:
            # We have a predicate, if possible, we push it down to DuckDB
-
-            duck_predicate = _predicate_to_expression(predicate)
+            duck_predicate = _predicate_to_expression(predicate)
         # Try to pushdown filter, if one exists
         if duck_predicate is not None:
            relation_final = relation_final.filter(duck_predicate)
@@ -273,12 +223,15 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
            results = relation_final.fetch_arrow_reader()
         else:
            results = relation_final.fetch_arrow_reader(batch_size)
-
-
-
-
-
-
-
+        while True:
+            try:
+                record_batch = results.read_next_batch()
+                if predicate is not None and duck_predicate is None:
+                    # We have a predicate, but did not manage to push it down, we fallback here
+                    yield pl.from_arrow(record_batch).filter(predicate)
+                else:
+                    yield pl.from_arrow(record_batch)
+            except StopIteration:
+                break
 
     return register_io_source(source_generator, schema=schema)
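`duckdb_source` wires `source_generator` into polars' `register_io_source`, so a filter applied to the resulting LazyFrame is serialized by polars, translated to SQL by `_predicate_to_expression`, and pushed into the DuckDB relation when possible, with per-batch filtering in polars as the fallback. A small usage sketch, assuming a recent polars with `pl.Schema` and calling this internal module directly (the query and schema are illustrative):

```python
import duckdb
import polars as pl
from duckdb.polars_io import duckdb_source

rel = duckdb.sql("SELECT range AS i, range * 2 AS j FROM range(1000)")

# Build a LazyFrame backed by the relation; the schema must match the relation's columns.
lf = duckdb_source(rel, pl.Schema({"i": pl.Int64(), "j": pl.Int64()}))

# col("i") > 995 is converted to SQL and applied as a DuckDB-side filter when possible.
print(lf.filter(pl.col("i") > 995).collect())
```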