duckdb 1.5.0.dev32__cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckdb might be problematic.
- _duckdb.cpython-314t-aarch64-linux-gnu.so +0 -0
- duckdb/__init__.py +475 -0
- duckdb/__init__.pyi +713 -0
- duckdb/bytes_io_wrapper.py +66 -0
- duckdb/experimental/__init__.py +2 -0
- duckdb/experimental/spark/LICENSE +260 -0
- duckdb/experimental/spark/__init__.py +7 -0
- duckdb/experimental/spark/_globals.py +77 -0
- duckdb/experimental/spark/_typing.py +48 -0
- duckdb/experimental/spark/conf.py +45 -0
- duckdb/experimental/spark/context.py +164 -0
- duckdb/experimental/spark/errors/__init__.py +72 -0
- duckdb/experimental/spark/errors/error_classes.py +918 -0
- duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
- duckdb/experimental/spark/errors/exceptions/base.py +217 -0
- duckdb/experimental/spark/errors/utils.py +116 -0
- duckdb/experimental/spark/exception.py +15 -0
- duckdb/experimental/spark/sql/__init__.py +7 -0
- duckdb/experimental/spark/sql/_typing.py +93 -0
- duckdb/experimental/spark/sql/catalog.py +78 -0
- duckdb/experimental/spark/sql/column.py +368 -0
- duckdb/experimental/spark/sql/conf.py +23 -0
- duckdb/experimental/spark/sql/dataframe.py +1437 -0
- duckdb/experimental/spark/sql/functions.py +6221 -0
- duckdb/experimental/spark/sql/group.py +420 -0
- duckdb/experimental/spark/sql/readwriter.py +449 -0
- duckdb/experimental/spark/sql/session.py +292 -0
- duckdb/experimental/spark/sql/streaming.py +37 -0
- duckdb/experimental/spark/sql/type_utils.py +105 -0
- duckdb/experimental/spark/sql/types.py +1275 -0
- duckdb/experimental/spark/sql/udf.py +37 -0
- duckdb/filesystem.py +23 -0
- duckdb/functional/__init__.py +17 -0
- duckdb/functional/__init__.pyi +31 -0
- duckdb/polars_io.py +237 -0
- duckdb/query_graph/__main__.py +363 -0
- duckdb/typing/__init__.py +61 -0
- duckdb/typing/__init__.pyi +36 -0
- duckdb/udf.py +19 -0
- duckdb/value/__init__.py +0 -0
- duckdb/value/__init__.pyi +0 -0
- duckdb/value/constant/__init__.py +268 -0
- duckdb/value/constant/__init__.pyi +115 -0
- duckdb-1.5.0.dev32.dist-info/METADATA +326 -0
- duckdb-1.5.0.dev32.dist-info/RECORD +47 -0
- duckdb-1.5.0.dev32.dist-info/WHEEL +6 -0
- duckdb-1.5.0.dev32.dist-info/licenses/LICENSE +7 -0
duckdb/experimental/spark/sql/udf.py
ADDED

@@ -0,0 +1,37 @@
+# https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
+from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
+
+from .types import DataType
+
+if TYPE_CHECKING:
+    from .session import SparkSession
+
+DataTypeOrString = Union[DataType, str]
+UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike")
+
+
+class UDFRegistration:
+    def __init__(self, sparkSession: "SparkSession"):
+        self.sparkSession = sparkSession
+
+    def register(
+        self,
+        name: str,
+        f: Union[Callable[..., Any], "UserDefinedFunctionLike"],
+        returnType: Optional["DataTypeOrString"] = None,
+    ) -> "UserDefinedFunctionLike":
+        self.sparkSession.conn.create_function(name, f, return_type=returnType)
+
+    def registerJavaFunction(
+        self,
+        name: str,
+        javaClassName: str,
+        returnType: Optional["DataTypeOrString"] = None,
+    ) -> None:
+        raise NotImplementedError
+
+    def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
+        raise NotImplementedError
+
+
+__all__ = ["UDFRegistration"]
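Usage note (not part of the diff): UDFRegistration.register above simply forwards to create_function on the underlying DuckDB connection, so the same UDF can be registered directly against a connection. A minimal sketch, assuming DuckDB's documented create_function API; the plus_one function and the BIGINT types are made up for illustration:

import duckdb
from duckdb.typing import BIGINT

def plus_one(x: int) -> int:
    return x + 1

con = duckdb.connect()
# Equivalent of UDFRegistration.register("plus_one", plus_one, returnType=...)
con.create_function("plus_one", plus_one, [BIGINT], BIGINT)
print(con.sql("SELECT plus_one(41)").fetchall())  # [(42,)]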
duckdb/filesystem.py
ADDED

@@ -0,0 +1,23 @@
+from fsspec import filesystem, AbstractFileSystem
+from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
+from .bytes_io_wrapper import BytesIOWrapper
+from io import TextIOBase
+
+def is_file_like(obj):
+    # We only care that we can read from the file
+    return hasattr(obj, "read") and hasattr(obj, "seek")
+
+
+class ModifiedMemoryFileSystem(MemoryFileSystem):
+    protocol = ('DUCKDB_INTERNAL_OBJECTSTORE',)
+    # defer to the original implementation that doesn't hardcode the protocol
+    _strip_protocol = classmethod(AbstractFileSystem._strip_protocol.__func__)
+
+    def add_file(self, object, path):
+        if not is_file_like(object):
+            raise ValueError("Can not read from a non file-like object")
+        path = self._strip_protocol(path)
+        if isinstance(object, TextIOBase):
+            # Wrap this so that we can return a bytes object from 'read'
+            object = BytesIOWrapper(object)
+        self.store[path] = MemoryFile(self, path, object.read())
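Usage note (not part of the diff): ModifiedMemoryFileSystem is the in-memory fsspec filesystem used to hold registered file-like objects; add_file copies a readable object into its store, wrapping text streams so reads return bytes. A minimal sketch, assuming fsspec is installed; the stream contents and path are made up for illustration:

from io import StringIO
from duckdb.filesystem import ModifiedMemoryFileSystem

fs = ModifiedMemoryFileSystem()

# Text streams are wrapped by BytesIOWrapper so 'read' yields bytes.
csv_stream = StringIO("a,b\n1,2\n")
fs.add_file(csv_stream, "people.csv")

# The data now sits in the in-memory object store under the stripped path.
print(fs.store["people.csv"].getvalue())  # b'a,b\n1,2\n'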
duckdb/functional/__init__.pyi
ADDED

@@ -0,0 +1,31 @@
+from typing import Dict
+
+SPECIAL: FunctionNullHandling
+DEFAULT: FunctionNullHandling
+
+NATIVE: PythonUDFType
+ARROW: PythonUDFType
+
+class FunctionNullHandling:
+    DEFAULT: FunctionNullHandling
+    SPECIAL: FunctionNullHandling
+    def __int__(self) -> int: ...
+    def __index__(self) -> int: ...
+    @property
+    def __members__(self) -> Dict[str, FunctionNullHandling]: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def value(self) -> int: ...
+
+class PythonUDFType:
+    NATIVE: PythonUDFType
+    ARROW: PythonUDFType
+    def __int__(self) -> int: ...
+    def __index__(self) -> int: ...
+    @property
+    def __members__(self) -> Dict[str, PythonUDFType]: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def value(self) -> int: ...
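Usage note (not part of the diff): these stubs describe the FunctionNullHandling and PythonUDFType enums exposed by duckdb.functional, which control how a registered Python UDF is invoked. A minimal sketch, assuming create_function accepts the documented type and null_handling keyword arguments; the shout function is made up for illustration:

import duckdb
from duckdb.functional import FunctionNullHandling, PythonUDFType
from duckdb.typing import VARCHAR

def shout(s):
    # With SPECIAL null handling the UDF sees NULL inputs instead of short-circuiting to NULL.
    return "NULL!" if s is None else s.upper()

con = duckdb.connect()
con.create_function(
    "shout", shout, [VARCHAR], VARCHAR,
    type=PythonUDFType.NATIVE,
    null_handling=FunctionNullHandling.SPECIAL,
)
print(con.sql("SELECT shout(NULL), shout('hi')").fetchall())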
duckdb/polars_io.py
ADDED

@@ -0,0 +1,237 @@
+import duckdb
+import polars as pl
+from typing import Iterator, Optional
+
+from polars.io.plugins import register_io_source
+from duckdb import SQLExpression
+import json
+from decimal import Decimal
+import datetime
+
+def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
+    """
+    Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
+
+    Parameters:
+        predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)
+
+    Returns:
+        SQLExpression: A DuckDB SQL expression string equivalent.
+        None: If conversion fails.
+
+    Example:
+        >>> _predicate_to_expression(pl.col("foo") > 5)
+        SQLExpression("(foo > 5)")
+    """
+    # Serialize the Polars expression tree to JSON
+    tree = json.loads(predicate.meta.serialize(format="json"))
+
+    try:
+        # Convert the tree to SQL
+        sql_filter = _pl_tree_to_sql(tree)
+        return SQLExpression(sql_filter)
+    except:
+        # If the conversion fails, we return None
+        return None
+
+
+def _pl_operation_to_sql(op: str) -> str:
+    """
+    Map Polars binary operation strings to SQL equivalents.
+
+    Example:
+        >>> _pl_operation_to_sql("Eq")
+        '='
+    """
+    try:
+        return {
+            "Lt": "<",
+            "LtEq": "<=",
+            "Gt": ">",
+            "GtEq": ">=",
+            "Eq": "=",
+            "Modulus": "%",
+            "And": "AND",
+            "Or": "OR",
+        }[op]
+    except KeyError:
+        raise NotImplementedError(op)
+
+
+def _escape_sql_identifier(identifier: str) -> str:
+    """
+    Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
+
+    Example:
+        >>> _escape_sql_identifier('column"name')
+        '"column""name"'
+    """
+    escaped = identifier.replace('"', '""')
+    return f'"{escaped}"'
+
+
+def _pl_tree_to_sql(tree: dict) -> str:
+    """
+    Recursively convert a Polars expression tree (as JSON) to a SQL string.
+
+    Parameters:
+        tree (dict): JSON-deserialized expression tree from Polars
+
+    Returns:
+        str: SQL expression string
+
+    Example:
+        Input tree:
+        {
+            "BinaryExpr": {
+                "left": { "Column": "foo" },
+                "op": "Gt",
+                "right": { "Literal": { "Int": 5 } }
+            }
+        }
+        Output: "(foo > 5)"
+    """
+    [node_type] = tree.keys()
+    subtree = tree[node_type]
+
+    if node_type == "BinaryExpr":
+        # Binary expressions: left OP right
+        return (
+            "(" +
+            " ".join((
+                _pl_tree_to_sql(subtree['left']),
+                _pl_operation_to_sql(subtree['op']),
+                _pl_tree_to_sql(subtree['right'])
+            )) +
+            ")"
+        )
+    if node_type == "Column":
+        # A reference to a column name
+        # Wrap in quotes to handle special characters
+        return _escape_sql_identifier(subtree)
+
+    if node_type in ("Literal", "Dyn"):
+        # Recursively process dynamic or literal values
+        return _pl_tree_to_sql(subtree)
+
+    if node_type == "Int":
+        # Direct integer literals
+        return str(subtree)
+
+    if node_type == "Function":
+        # Handle boolean functions like IsNull, IsNotNull
+        inputs = subtree["input"]
+        func_dict = subtree["function"]
+
+        if "Boolean" in func_dict:
+            func = func_dict["Boolean"]
+            arg_sql = _pl_tree_to_sql(inputs[0])
+
+            if func == "IsNull":
+                return f"({arg_sql} IS NULL)"
+            if func == "IsNotNull":
+                return f"({arg_sql} IS NOT NULL)"
+            raise NotImplementedError(f"Boolean function not supported: {func}")
+
+        raise NotImplementedError(f"Unsupported function type: {func_dict}")
+
+    if node_type == "Scalar":
+        # Detect format: old style (dtype/value) or new style (direct type key)
+        if "dtype" in subtree and "value" in subtree:
+            dtype = str(subtree["dtype"])
+            value = subtree["value"]
+        else:
+            # New style: dtype is the single key in the dict
+            dtype = next(iter(subtree.keys()))
+            value = subtree
+
+        # Decimal support
+        if dtype.startswith("{'Decimal'") or dtype == "Decimal":
+            decimal_value = value['Decimal']
+            decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
+            return str(decimal_value)
+
+        # Datetime with microseconds since epoch
+        if dtype.startswith("{'Datetime'") or dtype == "Datetime":
+            micros = value['Datetime'][0]
+            dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
+            return f"'{str(dt_timestamp)}'::TIMESTAMP"
+
+        # Match simple numeric/boolean types
+        if dtype in ("Int8", "Int16", "Int32", "Int64",
+                     "UInt8", "UInt16", "UInt32", "UInt64",
+                     "Float32", "Float64", "Boolean"):
+            return str(value[dtype])
+
+        # Time type
+        if dtype == "Time":
+            nanoseconds = value["Time"]
+            seconds = nanoseconds // 1_000_000_000
+            microseconds = (nanoseconds % 1_000_000_000) // 1_000
+            dt_time = (datetime.datetime.min + datetime.timedelta(
+                seconds=seconds, microseconds=microseconds
+            )).time()
+            return f"'{dt_time}'::TIME"
+
+        # Date type
+        if dtype == "Date":
+            days_since_epoch = value["Date"]
+            date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
+            return f"'{date}'::DATE"
+
+        # Binary type
+        if dtype == "Binary":
+            binary_data = bytes(value["Binary"])
+            escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
+            return f"'{escaped}'::BLOB"
+
+        # String type
+        if dtype == "String" or dtype == "StringOwned":
+            # Some new formats may store directly under StringOwned
+            string_val = value.get("StringOwned", value.get("String", None))
+            return f"'{string_val}'"
+
+
+        raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")
+
+    raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")
+
+def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
+    """
+    A polars IO plugin for DuckDB.
+    """
+    def source_generator(
+        with_columns: Optional[list[str]],
+        predicate: Optional[pl.Expr],
+        n_rows: Optional[int],
+        batch_size: Optional[int],
+    ) -> Iterator[pl.DataFrame]:
+        duck_predicate = None
+        relation_final = relation
+        if with_columns is not None:
+            cols = ",".join(map(_escape_sql_identifier, with_columns))
+            relation_final = relation_final.project(cols)
+        if n_rows is not None:
+            relation_final = relation_final.limit(n_rows)
+        if predicate is not None:
+            # We have a predicate, if possible, we push it down to DuckDB
+            duck_predicate = _predicate_to_expression(predicate)
+        # Try to pushdown filter, if one exists
+        if duck_predicate is not None:
+            relation_final = relation_final.filter(duck_predicate)
+        if batch_size is None:
+            results = relation_final.fetch_arrow_reader()
+        else:
+            results = relation_final.fetch_arrow_reader(batch_size)
+        while True:
+            try:
+                record_batch = results.read_next_batch()
+                if predicate is not None and duck_predicate is None:
+                    # We have a predicate, but did not manage to push it down, we fallback here
+                    yield pl.from_arrow(record_batch).filter(predicate)
+                else:
+                    yield pl.from_arrow(record_batch)
+            except StopIteration:
+                break
+
+    return register_io_source(source_generator, schema=schema)
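Usage note (not part of the diff): duckdb_source wires a DuckDB relation into Polars as a lazy IO source, translating Polars predicates into SQL via _predicate_to_expression so projections and filters run inside DuckDB when possible. A minimal sketch, assuming polars is installed; the relation, column name, and schema are made up for illustration:

import duckdb
import polars as pl
from duckdb.polars_io import duckdb_source

con = duckdb.connect()
rel = con.sql("SELECT * FROM range(1000) t(i)")

# LazyFrame backed by the DuckDB relation; the schema must describe the relation's columns.
lf = duckdb_source(rel, pl.Schema({"i": pl.Int64()}))

# The filter is serialized, converted to SQL, and pushed down to DuckDB when supported;
# otherwise it is applied to each Arrow batch on the Polars side (see the fallback above).
print(lf.filter(pl.col("i") > 995).collect())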