duckdb-1.4.1.dev125-cp313-cp313-macosx_10_13_universal2.whl → duckdb-1.5.0.dev37-cp313-cp313-macosx_10_13_universal2.whl
This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in those registries.
Potentially problematic release: this version of duckdb has been flagged as potentially problematic.
- _duckdb.cpython-313-darwin.so +0 -0
- duckdb/__init__.py +374 -373
- duckdb/__init__.pyi +180 -604
- duckdb/bytes_io_wrapper.py +7 -6
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +54 -47
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +64 -66
- duckdb/experimental/spark/sql/types.py +344 -308
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +8 -13
- duckdb/functional/__init__.py +16 -2
- duckdb/polars_io.py +57 -66
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -8
- duckdb/typing/__init__.pyi +2 -4
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +59 -61
- duckdb/value/constant/__init__.pyi +4 -3
- duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
- adbc_driver_duckdb/__init__.py +0 -50
- adbc_driver_duckdb/dbapi.py +0 -115
- duckdb-1.4.1.dev125.dist-info/METADATA +0 -326
- duckdb-1.4.1.dev125.dist-info/RECORD +0 -49
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev37.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev37.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/udf.py CHANGED

@@ -1,4 +1,4 @@
-# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
+# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
 from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
 
 from .types import DataType
@@ -10,11 +10,11 @@ DataTypeOrString = Union[DataType, str]
 UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike")
 
 
-class UDFRegistration:
-    def __init__(self, sparkSession: "SparkSession")
+class UDFRegistration:
+    def __init__(self, sparkSession: "SparkSession"):
         self.sparkSession = sparkSession
 
-    def register(
+    def register(
         self,
         name: str,
         f: Union[Callable[..., Any], "UserDefinedFunctionLike"],
@@ -22,7 +22,7 @@ class UDFRegistration: # noqa: D101
     ) -> "UserDefinedFunctionLike":
         self.sparkSession.conn.create_function(name, f, return_type=returnType)
 
-    def registerJavaFunction(
+    def registerJavaFunction(
         self,
         name: str,
         javaClassName: str,
@@ -30,7 +30,7 @@ class UDFRegistration: # noqa: D101
     ) -> None:
         raise NotImplementedError
 
-    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
+    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
         raise NotImplementedError
 
 
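For orientation, a minimal sketch (not part of the diff) of the underlying DuckDB call that UDFRegistration.register forwards to, per the hunk above; the function and query are illustrative only.

import duckdb

def plus_one(x: int) -> int:
    return x + 1

con = duckdb.connect()
# Equivalent to what register() does on the session's connection:
# DuckDB infers the SQL parameter/return types from the Python annotations.
con.create_function("plus_one", plus_one)
print(con.sql("SELECT plus_one(41)").fetchall())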
duckdb/filesystem.py CHANGED

@@ -1,26 +1,21 @@
-from
-from
-
-from fsspec import AbstractFileSystem
-from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
-
+from fsspec import filesystem, AbstractFileSystem
+from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
 from .bytes_io_wrapper import BytesIOWrapper
+from io import TextIOBase
 
-
-def is_file_like(obj) -> bool: # noqa: D103, ANN001
+def is_file_like(obj):
     # We only care that we can read from the file
     return hasattr(obj, "read") and hasattr(obj, "seek")
 
 
-class ModifiedMemoryFileSystem(MemoryFileSystem):
-    protocol = (
+class ModifiedMemoryFileSystem(MemoryFileSystem):
+    protocol = ('DUCKDB_INTERNAL_OBJECTSTORE',)
     # defer to the original implementation that doesn't hardcode the protocol
     _strip_protocol = classmethod(AbstractFileSystem._strip_protocol.__func__)
 
-    def add_file(self, object
+    def add_file(self, object, path):
         if not is_file_like(object):
-
-            raise ValueError(msg)
+            raise ValueError("Can not read from a non file-like object")
         path = self._strip_protocol(path)
         if isinstance(object, TextIOBase):
             # Wrap this so that we can return a bytes object from 'read'
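For context, a minimal sketch (not part of the diff) of the public path that exercises this module: passing a file-like object to read_csv. Per add_file above, a text stream (TextIOBase) is wrapped so that reads return bytes; the sample data is illustrative.

import io
import duckdb

buf = io.StringIO("a,b\n1,2\n3,4\n")  # text stream, gets wrapped by BytesIOWrapper
print(duckdb.read_csv(buf).fetchall())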
duckdb/functional/__init__.py CHANGED

@@ -1,3 +1,17 @@
-from _duckdb.functional import
+from _duckdb.functional import (
+    FunctionNullHandling,
+    PythonUDFType,
+    SPECIAL,
+    DEFAULT,
+    NATIVE,
+    ARROW
+)
 
-__all__ = [
+__all__ = [
+    "FunctionNullHandling",
+    "PythonUDFType",
+    "SPECIAL",
+    "DEFAULT",
+    "NATIVE",
+    "ARROW"
+]
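For context, a hedged sketch (not part of the diff) of where the re-exported names are used: they are the UDF type and null-handling options accepted by create_function. Assumes pyarrow is installed; the UDF itself is illustrative.

import duckdb
import pyarrow.compute as pc
from duckdb.functional import PythonUDFType, FunctionNullHandling
from duckdb.typing import BIGINT

def add_one(col):
    # With type=PythonUDFType.ARROW the UDF receives and returns Arrow arrays.
    return pc.add(col, 1)

con = duckdb.connect()
con.create_function(
    "add_one",
    add_one,
    [BIGINT],
    BIGINT,
    type=PythonUDFType.ARROW,
    null_handling=FunctionNullHandling.SPECIAL,  # the UDF handles NULLs itself
)
print(con.sql("SELECT add_one(41)").fetchall())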
duckdb/polars_io.py CHANGED

@@ -1,22 +1,20 @@
-import
-import json
-from collections.abc import Iterator
-from decimal import Decimal
-from typing import Optional
-
+import duckdb
 import polars as pl
-from
+from typing import Iterator, Optional
 
-import
+from polars.io.plugins import register_io_source
 from duckdb import SQLExpression
-
+import json
+from decimal import Decimal
+import datetime
 
 def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
-    """
-
+    """
+    Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
+
     Parameters:
         predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)
-
+
     Returns:
         SQLExpression: A DuckDB SQL expression string equivalent.
         None: If conversion fails.
@@ -27,19 +25,20 @@ def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
     """
     # Serialize the Polars expression tree to JSON
     tree = json.loads(predicate.meta.serialize(format="json"))
-
+
     try:
         # Convert the tree to SQL
         sql_filter = _pl_tree_to_sql(tree)
         return SQLExpression(sql_filter)
-    except
+    except:
        # If the conversion fails, we return None
        return None
 
 
 def _pl_operation_to_sql(op: str) -> str:
-    """
-
+    """
+    Map Polars binary operation strings to SQL equivalents.
+
     Example:
         >>> _pl_operation_to_sql("Eq")
         '='
@@ -56,11 +55,12 @@ def _pl_operation_to_sql(op: str) -> str:
             "Or": "OR",
         }[op]
     except KeyError:
-        raise NotImplementedError(op)
+        raise NotImplementedError(op)
 
 
 def _escape_sql_identifier(identifier: str) -> str:
-    """
+    """
+    Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
 
     Example:
         >>> _escape_sql_identifier('column"name')
@@ -71,14 +71,15 @@ def _escape_sql_identifier(identifier: str) -> str:
 
 
 def _pl_tree_to_sql(tree: dict) -> str:
-    """
-
+    """
+    Recursively convert a Polars expression tree (as JSON) to a SQL string.
+
     Parameters:
         tree (dict): JSON-deserialized expression tree from Polars
-
+
     Returns:
         str: SQL expression string
-
+
     Example:
         Input tree:
         {
@@ -96,15 +97,13 @@ def _pl_tree_to_sql(tree: dict) -> str:
     if node_type == "BinaryExpr":
         # Binary expressions: left OP right
         return (
-
-
-
-
-
-
-            )
-        )
-        + ")"
+            "(" +
+            " ".join((
+                _pl_tree_to_sql(subtree['left']),
+                _pl_operation_to_sql(subtree['op']),
+                _pl_tree_to_sql(subtree['right'])
+            )) +
+            ")"
         )
     if node_type == "Column":
         # A reference to a column name
@@ -132,11 +131,9 @@ def _pl_tree_to_sql(tree: dict) -> str:
                 return f"({arg_sql} IS NULL)"
             if func == "IsNotNull":
                 return f"({arg_sql} IS NOT NULL)"
-
-            raise NotImplementedError(msg)
+            raise NotImplementedError(f"Boolean function not supported: {func}")
 
-
-        raise NotImplementedError(msg)
+        raise NotImplementedError(f"Unsupported function type: {func_dict}")
 
     if node_type == "Scalar":
         # Detect format: old style (dtype/value) or new style (direct type key)
@@ -150,30 +147,20 @@ def _pl_tree_to_sql(tree: dict) -> str:
 
         # Decimal support
         if dtype.startswith("{'Decimal'") or dtype == "Decimal":
-            decimal_value = value[
+            decimal_value = value['Decimal']
             decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
             return str(decimal_value)
 
         # Datetime with microseconds since epoch
         if dtype.startswith("{'Datetime'") or dtype == "Datetime":
-            micros = value[
+            micros = value['Datetime'][0]
             dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
-            return f"'{dt_timestamp
+            return f"'{str(dt_timestamp)}'::TIMESTAMP"
 
         # Match simple numeric/boolean types
-        if dtype in (
-
-
-            "Int32",
-            "Int64",
-            "UInt8",
-            "UInt16",
-            "UInt32",
-            "UInt64",
-            "Float32",
-            "Float64",
-            "Boolean",
-        ):
+        if dtype in ("Int8", "Int16", "Int32", "Int64",
+                     "UInt8", "UInt16", "UInt32", "UInt64",
+                     "Float32", "Float64", "Boolean"):
             return str(value[dtype])
 
         # Time type
@@ -181,7 +168,9 @@ def _pl_tree_to_sql(tree: dict) -> str:
             nanoseconds = value["Time"]
             seconds = nanoseconds // 1_000_000_000
             microseconds = (nanoseconds % 1_000_000_000) // 1_000
-            dt_time = (datetime.datetime.min + datetime.timedelta(
+            dt_time = (datetime.datetime.min + datetime.timedelta(
+                seconds=seconds, microseconds=microseconds
+            )).time()
             return f"'{dt_time}'::TIME"
 
         # Date type
@@ -193,7 +182,7 @@ def _pl_tree_to_sql(tree: dict) -> str:
         # Binary type
         if dtype == "Binary":
             binary_data = bytes(value["Binary"])
-            escaped =
+            escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
             return f"'{escaped}'::BLOB"
 
         # String type
@@ -202,16 +191,15 @@ def _pl_tree_to_sql(tree: dict) -> str:
             string_val = value.get("StringOwned", value.get("String", None))
             return f"'{string_val}'"
 
-        msg = f"Unsupported scalar type {dtype!s}, with value {value}"
-        raise NotImplementedError(msg)
 
-
-        raise NotImplementedError(msg)
+        raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")
 
+    raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")
 
 def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
-    """
-
+    """
+    A polars IO plugin for DuckDB.
+    """
     def source_generator(
         with_columns: Optional[list[str]],
         predicate: Optional[pl.Expr],
@@ -235,12 +223,15 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
             results = relation_final.fetch_arrow_reader()
         else:
             results = relation_final.fetch_arrow_reader(batch_size)
-
-
-
-
-
-
-
+        while True:
+            try:
+                record_batch = results.read_next_batch()
+                if predicate is not None and duck_predicate is None:
+                    # We have a predicate, but did not manage to push it down, we fallback here
+                    yield pl.from_arrow(record_batch).filter(predicate)
+                else:
+                    yield pl.from_arrow(record_batch)
+            except StopIteration:
+                break
 
     return register_io_source(source_generator, schema=schema)