duckdb 1.5.0.dev53__cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckdb might be problematic. Click here for more details.

Files changed (52) hide show
  1. _duckdb-stubs/__init__.pyi +1443 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cpython-314-x86_64-linux-gnu.so +0 -0
  5. adbc_driver_duckdb/__init__.py +50 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +381 -0
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +69 -0
  11. duckdb/experimental/__init__.py +3 -0
  12. duckdb/experimental/spark/LICENSE +260 -0
  13. duckdb/experimental/spark/__init__.py +6 -0
  14. duckdb/experimental/spark/_globals.py +77 -0
  15. duckdb/experimental/spark/_typing.py +46 -0
  16. duckdb/experimental/spark/conf.py +46 -0
  17. duckdb/experimental/spark/context.py +180 -0
  18. duckdb/experimental/spark/errors/__init__.py +70 -0
  19. duckdb/experimental/spark/errors/error_classes.py +918 -0
  20. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  21. duckdb/experimental/spark/errors/exceptions/base.py +168 -0
  22. duckdb/experimental/spark/errors/utils.py +111 -0
  23. duckdb/experimental/spark/exception.py +18 -0
  24. duckdb/experimental/spark/sql/__init__.py +7 -0
  25. duckdb/experimental/spark/sql/_typing.py +86 -0
  26. duckdb/experimental/spark/sql/catalog.py +79 -0
  27. duckdb/experimental/spark/sql/column.py +361 -0
  28. duckdb/experimental/spark/sql/conf.py +24 -0
  29. duckdb/experimental/spark/sql/dataframe.py +1389 -0
  30. duckdb/experimental/spark/sql/functions.py +6195 -0
  31. duckdb/experimental/spark/sql/group.py +424 -0
  32. duckdb/experimental/spark/sql/readwriter.py +435 -0
  33. duckdb/experimental/spark/sql/session.py +297 -0
  34. duckdb/experimental/spark/sql/streaming.py +36 -0
  35. duckdb/experimental/spark/sql/type_utils.py +107 -0
  36. duckdb/experimental/spark/sql/types.py +1239 -0
  37. duckdb/experimental/spark/sql/udf.py +37 -0
  38. duckdb/filesystem.py +33 -0
  39. duckdb/func/__init__.py +3 -0
  40. duckdb/functional/__init__.py +13 -0
  41. duckdb/polars_io.py +284 -0
  42. duckdb/py.typed +0 -0
  43. duckdb/query_graph/__main__.py +358 -0
  44. duckdb/sqltypes/__init__.py +63 -0
  45. duckdb/typing/__init__.py +71 -0
  46. duckdb/udf.py +24 -0
  47. duckdb/value/__init__.py +1 -0
  48. duckdb/value/constant/__init__.py +270 -0
  49. duckdb-1.5.0.dev53.dist-info/METADATA +87 -0
  50. duckdb-1.5.0.dev53.dist-info/RECORD +52 -0
  51. duckdb-1.5.0.dev53.dist-info/WHEEL +6 -0
  52. duckdb-1.5.0.dev53.dist-info/licenses/LICENSE +7 -0
@@ -0,0 +1,37 @@
1
+ # https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/ # noqa: D100
2
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
3
+
4
+ from .types import DataType
5
+
6
+ if TYPE_CHECKING:
7
+ from .session import SparkSession
8
+
9
+ DataTypeOrString = Union[DataType, str]
10
+ UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike")
11
+
12
+
13
class UDFRegistration:
    """Register Python callables as SQL functions on a session's DuckDB connection.

    Mirrors the shape of PySpark's ``UDFRegistration``; the Java-based
    registration entry points are present for API compatibility only and
    always raise ``NotImplementedError``.
    """

    def __init__(self, sparkSession: "SparkSession") -> None:
        """Store the session whose ``conn`` is used to create functions."""
        self.sparkSession = sparkSession

    def register(
        self,
        name: str,
        f: Union[Callable[..., Any], "UserDefinedFunctionLike"],
        returnType: Optional["DataTypeOrString"] = None,
    ) -> "UserDefinedFunctionLike":
        """Register ``f`` under ``name`` and return it.

        Parameters:
            name: Name the function is registered under.
            f: The callable to register.
            returnType: Optional return type, forwarded to the connection's
                ``create_function`` as ``return_type``.

        Returns:
            The callable that was registered, so the call can be used in
            assignment form (``fn = spark.udf.register(...)``).
        """
        self.sparkSession.conn.create_function(name, f, return_type=returnType)
        # The annotated return type promises the registered function; the
        # method previously fell off the end and implicitly returned None.
        return f

    def registerJavaFunction(
        self,
        name: str,
        javaClassName: str,
        returnType: Optional["DataTypeOrString"] = None,
    ) -> None:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError
35
+
36
+
37
+ __all__ = ["UDFRegistration"]
duckdb/filesystem.py ADDED
@@ -0,0 +1,33 @@
1
+ """In-memory filesystem to store ephemeral dependencies.
2
+
3
+ Warning: Not for external use. May change at any moment. Likely to be made internal.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import io
9
+ import typing
10
+
11
+ from fsspec import AbstractFileSystem
12
+ from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
13
+
14
+ from .bytes_io_wrapper import BytesIOWrapper
15
+
16
+
17
class ModifiedMemoryFileSystem(MemoryFileSystem):
    """Memory-backed filesystem registered under a DuckDB-specific protocol name."""

    protocol = ("DUCKDB_INTERNAL_OBJECTSTORE",)
    # Reuse the generic AbstractFileSystem implementation, which does not
    # hardcode the protocol, instead of the one inherited from MemoryFileSystem.
    _strip_protocol: typing.Callable[[str], str] = classmethod(AbstractFileSystem._strip_protocol.__func__)  # type: ignore[assignment]

    def add_file(self, obj: io.IOBase | BytesIOWrapper | object, path: str) -> None:
        """Read ``obj`` fully and store its contents under ``path``.

        Parameters:
            obj: A file-like object; it must expose both ``read`` and ``seek``.
                Text streams are wrapped in ``BytesIOWrapper`` before reading.
            path: Destination path; the protocol prefix is stripped before storing.

        Raises:
            TypeError: If ``obj`` lacks a ``read`` or ``seek`` attribute.
        """
        is_file_like = hasattr(obj, "read") and hasattr(obj, "seek")
        if not is_file_like:
            msg = "Can not read from a non file-like object"
            raise TypeError(msg)
        # Text streams are wrapped so that 'read' hands back a bytes object.
        source = BytesIOWrapper(obj) if isinstance(obj, io.TextIOBase) else obj
        key = self._strip_protocol(path)
        self.store[key] = MemoryFile(self, key, source.read())
@@ -0,0 +1,3 @@
1
+ from _duckdb._func import ARROW, DEFAULT, NATIVE, SPECIAL, FunctionNullHandling, PythonUDFType # noqa: D104
2
+
3
+ __all__ = ["ARROW", "DEFAULT", "NATIVE", "SPECIAL", "FunctionNullHandling", "PythonUDFType"]
@@ -0,0 +1,13 @@
1
+ """DuckDB function constants and types. DEPRECATED: please use `duckdb.func` instead."""
2
+
3
+ import warnings
4
+
5
+ from duckdb.func import ARROW, DEFAULT, NATIVE, SPECIAL, FunctionNullHandling, PythonUDFType
6
+
7
+ __all__ = ["ARROW", "DEFAULT", "NATIVE", "SPECIAL", "FunctionNullHandling", "PythonUDFType"]
8
+
9
+ warnings.warn(
10
+ "`duckdb.functional` is deprecated and will be removed in a future version. Please use `duckdb.func` instead.",
11
+ DeprecationWarning,
12
+ stacklevel=2,
13
+ )
duckdb/polars_io.py ADDED
@@ -0,0 +1,284 @@
1
+ from __future__ import annotations # noqa: D100
2
+
3
+ import contextlib
4
+ import datetime
5
+ import json
6
+ import typing
7
+ from decimal import Decimal
8
+
9
+ import polars as pl
10
+ from polars.io.plugins import register_io_source
11
+
12
+ import duckdb
13
+
14
+ if typing.TYPE_CHECKING:
15
+ from collections.abc import Iterator
16
+
17
+ import typing_extensions
18
+
19
+ _ExpressionTree: typing_extensions.TypeAlias = typing.Dict[str, typing.Union[str, int, "_ExpressionTree", typing.Any]] # noqa: UP006
20
+
21
+
22
def _predicate_to_expression(predicate: pl.Expr) -> duckdb.Expression | None:
    """Translate a Polars predicate into a DuckDB expression, if possible.

    The predicate is serialized to JSON, parsed into a tree, rendered to SQL
    text, and wrapped in ``duckdb.SQLExpression``.

    Parameters:
        predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)

    Returns:
        The resulting ``duckdb.SQLExpression``, or ``None`` when rendering the
        tree to SQL or building the expression raises any ``Exception``.

    Example:
        >>> _predicate_to_expression(pl.col("foo") > 5)
        SQLExpression("(foo > 5)")
    """
    # Serialization and JSON parsing happen outside the try block, so errors
    # raised there propagate to the caller.
    serialized = predicate.meta.serialize(format="json")
    expression_tree = json.loads(serialized)

    try:
        return duckdb.SQLExpression(_pl_tree_to_sql(expression_tree))
    except Exception:
        # Best effort: an untranslatable predicate is reported as None.
        return None
46
+
47
+
48
+ def _pl_operation_to_sql(op: str) -> str:
49
+ """Map Polars binary operation strings to SQL equivalents.
50
+
51
+ Example:
52
+ >>> _pl_operation_to_sql("Eq")
53
+ '='
54
+ """
55
+ try:
56
+ return {
57
+ "Lt": "<",
58
+ "LtEq": "<=",
59
+ "Gt": ">",
60
+ "GtEq": ">=",
61
+ "Eq": "=",
62
+ "Modulus": "%",
63
+ "And": "AND",
64
+ "Or": "OR",
65
+ }[op]
66
+ except KeyError:
67
+ raise NotImplementedError(op) # noqa: B904
68
+
69
+
70
+ def _escape_sql_identifier(identifier: str) -> str:
71
+ """Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
72
+
73
+ Example:
74
+ >>> _escape_sql_identifier('column"name')
75
+ '"column""name"'
76
+ """
77
+ escaped = identifier.replace('"', '""')
78
+ return f'"{escaped}"'
79
+
80
+
81
+ def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
82
+ """Recursively convert a Polars expression tree (as JSON) to a SQL string.
83
+
84
+ Parameters:
85
+ tree (dict): JSON-deserialized expression tree from Polars
86
+
87
+ Returns:
88
+ str: SQL expression string
89
+
90
+ Example:
91
+ Input tree:
92
+ {
93
+ "BinaryExpr": {
94
+ "left": { "Column": "foo" },
95
+ "op": "Gt",
96
+ "right": { "Literal": { "Int": 5 } }
97
+ }
98
+ }
99
+ Output: "(foo > 5)"
100
+ """
101
+ [node_type] = tree.keys()
102
+
103
+ if node_type == "BinaryExpr":
104
+ # Binary expressions: left OP right
105
+ bin_expr_tree = tree[node_type]
106
+ assert isinstance(bin_expr_tree, dict), f"A {node_type} should be a dict but got {type(bin_expr_tree)}"
107
+ lhs, op, rhs = bin_expr_tree["left"], bin_expr_tree["op"], bin_expr_tree["right"]
108
+ assert isinstance(lhs, dict), f"LHS of a {node_type} should be a dict but got {type(lhs)}"
109
+ assert isinstance(op, str), f"The op of a {node_type} should be a str but got {type(op)}"
110
+ assert isinstance(rhs, dict), f"RHS of a {node_type} should be a dict but got {type(rhs)}"
111
+ return f"({_pl_tree_to_sql(lhs)} {_pl_operation_to_sql(op)} {_pl_tree_to_sql(rhs)})"
112
+ if node_type == "Column":
113
+ # A reference to a column name
114
+ # Wrap in quotes to handle special characters
115
+ col_name = tree[node_type]
116
+ assert isinstance(col_name, str), f"The col name of a {node_type} should be a str but got {type(col_name)}"
117
+ return _escape_sql_identifier(col_name)
118
+
119
+ if node_type in ("Literal", "Dyn"):
120
+ # Recursively process dynamic or literal values
121
+ val_tree = tree[node_type]
122
+ assert isinstance(val_tree, dict), f"A {node_type} should be a dict but got {type(val_tree)}"
123
+ return _pl_tree_to_sql(val_tree)
124
+
125
+ if node_type == "Int":
126
+ # Direct integer literals
127
+ int_literal = tree[node_type]
128
+ assert isinstance(int_literal, (int, str)), (
129
+ f"The value of an Int should be an int or str but got {type(int_literal)}"
130
+ )
131
+ return str(int_literal)
132
+
133
+ if node_type == "Function":
134
+ # Handle boolean functions like IsNull, IsNotNull
135
+ func_tree = tree[node_type]
136
+ assert isinstance(func_tree, dict), f"A {node_type} should be a dict but got {type(func_tree)}"
137
+ inputs = func_tree["input"]
138
+ assert isinstance(inputs, list), f"A {node_type} should have a list of dicts as input but got {type(inputs)}"
139
+ input_tree = inputs[0]
140
+ assert isinstance(input_tree, dict), (
141
+ f"A {node_type} should have a list of dicts as input but got {type(input_tree)}"
142
+ )
143
+ func_dict = func_tree["function"]
144
+ assert isinstance(func_dict, dict), (
145
+ f"A {node_type} should have a function dict as input but got {type(func_dict)}"
146
+ )
147
+
148
+ if "Boolean" in func_dict:
149
+ func = func_dict["Boolean"]
150
+ arg_sql = _pl_tree_to_sql(inputs[0])
151
+
152
+ if func == "IsNull":
153
+ return f"({arg_sql} IS NULL)"
154
+ if func == "IsNotNull":
155
+ return f"({arg_sql} IS NOT NULL)"
156
+ msg = f"Boolean function not supported: {func}"
157
+ raise NotImplementedError(msg)
158
+
159
+ msg = f"Unsupported function type: {func_dict}"
160
+ raise NotImplementedError(msg)
161
+
162
+ if node_type == "Scalar":
163
+ # Detect format: old style (dtype/value) or new style (direct type key)
164
+ scalar_tree = tree[node_type]
165
+ assert isinstance(scalar_tree, dict), f"A {node_type} should be a dict but got {type(scalar_tree)}"
166
+ if "dtype" in scalar_tree and "value" in scalar_tree:
167
+ dtype = str(scalar_tree["dtype"])
168
+ value = scalar_tree["value"]
169
+ else:
170
+ # New style: dtype is the single key in the dict
171
+ dtype = next(iter(scalar_tree.keys()))
172
+ value = scalar_tree
173
+ assert isinstance(dtype, str), f"A {node_type} should have a str dtype but got {type(dtype)}"
174
+ assert isinstance(value, dict), f"A {node_type} should have a dict value but got {type(value)}"
175
+
176
+ # Decimal support
177
+ if dtype.startswith("{'Decimal'") or dtype == "Decimal":
178
+ decimal_value = value["Decimal"]
179
+ assert isinstance(decimal_value, list), (
180
+ f"A {dtype} should be a two or three member list but got {type(decimal_value)}"
181
+ )
182
+ assert 2 <= len(decimal_value) <= 3, (
183
+ f"A {dtype} should be a two or three member list but got {len(decimal_value)} member list"
184
+ )
185
+ return str(Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[-1]))
186
+
187
+ # Datetime with microseconds since epoch
188
+ if dtype.startswith("{'Datetime'") or dtype == "Datetime":
189
+ micros = value["Datetime"]
190
+ assert isinstance(micros, list), f"A {dtype} should be a one member list but got {type(micros)}"
191
+ dt_timestamp = datetime.datetime.fromtimestamp(micros[0] / 1_000_000, tz=datetime.timezone.utc)
192
+ return f"'{dt_timestamp!s}'::TIMESTAMP"
193
+
194
+ # Match simple numeric/boolean types
195
+ if dtype in (
196
+ "Int8",
197
+ "Int16",
198
+ "Int32",
199
+ "Int64",
200
+ "UInt8",
201
+ "UInt16",
202
+ "UInt32",
203
+ "UInt64",
204
+ "Float32",
205
+ "Float64",
206
+ "Boolean",
207
+ ):
208
+ return str(value[dtype])
209
+
210
+ # Time type
211
+ if dtype == "Time":
212
+ nanoseconds = value["Time"]
213
+ assert isinstance(nanoseconds, int), f"A {dtype} should be an int but got {type(nanoseconds)}"
214
+ seconds = nanoseconds // 1_000_000_000
215
+ microseconds = (nanoseconds % 1_000_000_000) // 1_000
216
+ dt_time = (datetime.datetime.min + datetime.timedelta(seconds=seconds, microseconds=microseconds)).time()
217
+ return f"'{dt_time}'::TIME"
218
+
219
+ # Date type
220
+ if dtype == "Date":
221
+ days_since_epoch = value["Date"]
222
+ assert isinstance(days_since_epoch, (float, int)), (
223
+ f"A {dtype} should be a number but got {type(days_since_epoch)}"
224
+ )
225
+ date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
226
+ return f"'{date}'::DATE"
227
+
228
+ # Binary type
229
+ if dtype == "Binary":
230
+ bin_value = value["Binary"]
231
+ assert isinstance(bin_value, list), f"A {dtype} should be a list but got {type(bin_value)}"
232
+ binary_data = bytes(bin_value)
233
+ escaped = "".join(f"\\x{b:02x}" for b in binary_data)
234
+ return f"'{escaped}'::BLOB"
235
+
236
+ # String type
237
+ if dtype == "String" or dtype == "StringOwned":
238
+ # Some new formats may store directly under StringOwned
239
+ string_val: object | None = value.get("StringOwned", value.get("String", None))
240
+ return f"'{string_val}'"
241
+
242
+ msg = f"Unsupported scalar type {dtype!s}, with value {value}"
243
+ raise NotImplementedError(msg)
244
+
245
+ msg = f"Node type: {node_type} is not implemented. {tree[node_type]}"
246
+ raise NotImplementedError(msg)
247
+
248
+
249
def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
    """Register ``relation`` as a Polars IO source and return the resulting lazy frame.

    Parameters:
        relation: The DuckDB relation to read from.
        schema: Schema passed through to ``register_io_source``.

    Returns:
        Whatever ``register_io_source`` returns for the generator defined below.
    """

    def source_generator(
        with_columns: list[str] | None,
        predicate: pl.Expr | None,
        n_rows: int | None,
        batch_size: int | None,
    ) -> Iterator[pl.DataFrame]:
        """Yield Polars frames, pushing projection, limit and filter into DuckDB where possible."""
        pushed_predicate = None
        rel = relation

        if with_columns is not None:
            projection = ",".join(_escape_sql_identifier(name) for name in with_columns)
            rel = rel.project(projection)

        if n_rows is not None:
            # NOTE(review): the limit is applied before the filter below, so with both
            # set, rows are limited first and filtered second — confirm this matches
            # what Polars expects from an IO source.
            rel = rel.limit(n_rows)

        if predicate is not None:
            # Try to translate the predicate so DuckDB can evaluate it
            with contextlib.suppress(AssertionError, KeyError):
                pushed_predicate = _predicate_to_expression(predicate)

        if pushed_predicate is not None:
            rel = rel.filter(pushed_predicate)

        # Only forward batch_size when the caller supplied one
        reader_args = () if batch_size is None else (batch_size,)
        reader = rel.fetch_arrow_reader(*reader_args)

        # When the predicate could not be pushed down, filter each batch in Polars instead
        filter_in_polars = predicate is not None and pushed_predicate is None
        for record_batch in iter(reader.read_next_batch, None):
            frame = pl.from_arrow(record_batch)  # type: ignore[misc,unused-ignore]
            if filter_in_polars:
                yield frame.filter(predicate)  # type: ignore[arg-type,misc,unused-ignore]
            else:
                yield frame

    return register_io_source(source_generator, schema=schema)
duckdb/py.typed ADDED
File without changes