duckdb 1.5.0.dev37__cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of duckdb might be problematic.

Files changed (47)
  1. _duckdb.cpython-314-aarch64-linux-gnu.so +0 -0
  2. duckdb/__init__.py +475 -0
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +66 -0
  5. duckdb/experimental/__init__.py +2 -0
  6. duckdb/experimental/spark/LICENSE +260 -0
  7. duckdb/experimental/spark/__init__.py +7 -0
  8. duckdb/experimental/spark/_globals.py +77 -0
  9. duckdb/experimental/spark/_typing.py +48 -0
  10. duckdb/experimental/spark/conf.py +45 -0
  11. duckdb/experimental/spark/context.py +164 -0
  12. duckdb/experimental/spark/errors/__init__.py +72 -0
  13. duckdb/experimental/spark/errors/error_classes.py +918 -0
  14. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  15. duckdb/experimental/spark/errors/exceptions/base.py +217 -0
  16. duckdb/experimental/spark/errors/utils.py +116 -0
  17. duckdb/experimental/spark/exception.py +15 -0
  18. duckdb/experimental/spark/sql/__init__.py +7 -0
  19. duckdb/experimental/spark/sql/_typing.py +93 -0
  20. duckdb/experimental/spark/sql/catalog.py +78 -0
  21. duckdb/experimental/spark/sql/column.py +368 -0
  22. duckdb/experimental/spark/sql/conf.py +23 -0
  23. duckdb/experimental/spark/sql/dataframe.py +1437 -0
  24. duckdb/experimental/spark/sql/functions.py +6221 -0
  25. duckdb/experimental/spark/sql/group.py +420 -0
  26. duckdb/experimental/spark/sql/readwriter.py +449 -0
  27. duckdb/experimental/spark/sql/session.py +292 -0
  28. duckdb/experimental/spark/sql/streaming.py +37 -0
  29. duckdb/experimental/spark/sql/type_utils.py +105 -0
  30. duckdb/experimental/spark/sql/types.py +1275 -0
  31. duckdb/experimental/spark/sql/udf.py +37 -0
  32. duckdb/filesystem.py +23 -0
  33. duckdb/functional/__init__.py +17 -0
  34. duckdb/functional/__init__.pyi +31 -0
  35. duckdb/polars_io.py +237 -0
  36. duckdb/query_graph/__main__.py +363 -0
  37. duckdb/typing/__init__.py +61 -0
  38. duckdb/typing/__init__.pyi +36 -0
  39. duckdb/udf.py +19 -0
  40. duckdb/value/__init__.py +0 -0
  41. duckdb/value/__init__.pyi +0 -0
  42. duckdb/value/constant/__init__.py +268 -0
  43. duckdb/value/constant/__init__.pyi +115 -0
  44. duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
  45. duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
  46. duckdb-1.5.0.dev37.dist-info/WHEEL +6 -0
  47. duckdb-1.5.0.dev37.dist-info/licenses/LICENSE +7 -0
duckdb/experimental/spark/sql/udf.py ADDED
@@ -0,0 +1,37 @@
+ # https://sparkbyexamples.com/pyspark/pyspark-udf-user-defined-function/
+ from typing import TYPE_CHECKING, Any, Callable, Optional, TypeVar, Union
+
+ from .types import DataType
+
+ if TYPE_CHECKING:
+     from .session import SparkSession
+
+ DataTypeOrString = Union[DataType, str]
+ UserDefinedFunctionLike = TypeVar("UserDefinedFunctionLike")
+
+
+ class UDFRegistration:
+     def __init__(self, sparkSession: "SparkSession"):
+         self.sparkSession = sparkSession
+
+     def register(
+         self,
+         name: str,
+         f: Union[Callable[..., Any], "UserDefinedFunctionLike"],
+         returnType: Optional["DataTypeOrString"] = None,
+     ) -> "UserDefinedFunctionLike":
+         self.sparkSession.conn.create_function(name, f, return_type=returnType)
+
+     def registerJavaFunction(
+         self,
+         name: str,
+         javaClassName: str,
+         returnType: Optional["DataTypeOrString"] = None,
+     ) -> None:
+         raise NotImplementedError
+
+     def registerJavaUDAF(self, name: str, javaClassName: str) -> None:
+         raise NotImplementedError
+
+
+ __all__ = ["UDFRegistration"]
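Note: register() above simply forwards the callable and return type to DuckDB's create_function on the session's underlying connection. A minimal usage sketch (not part of the wheel contents; it assumes the experimental SparkSession exposes a PySpark-style udf property and that a SQL type string is accepted as the return type):

import duckdb
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Register a plain Python callable; returnType is handed straight to
# duckdb's create_function as `return_type`.
spark.udf.register("plus_one", lambda x: x + 1, "BIGINT")

spark.sql("SELECT plus_one(41) AS answer").show()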
duckdb/filesystem.py ADDED
@@ -0,0 +1,23 @@
+ from fsspec import filesystem, AbstractFileSystem
+ from fsspec.implementations.memory import MemoryFileSystem, MemoryFile
+ from .bytes_io_wrapper import BytesIOWrapper
+ from io import TextIOBase
+
+ def is_file_like(obj):
+     # We only care that we can read from the file
+     return hasattr(obj, "read") and hasattr(obj, "seek")
+
+
+ class ModifiedMemoryFileSystem(MemoryFileSystem):
+     protocol = ('DUCKDB_INTERNAL_OBJECTSTORE',)
+     # defer to the original implementation that doesn't hardcode the protocol
+     _strip_protocol = classmethod(AbstractFileSystem._strip_protocol.__func__)
+
+     def add_file(self, object, path):
+         if not is_file_like(object):
+             raise ValueError("Can not read from a non file-like object")
+         path = self._strip_protocol(path)
+         if isinstance(object, TextIOBase):
+             # Wrap this so that we can return a bytes object from 'read'
+             object = BytesIOWrapper(object)
+         self.store[path] = MemoryFile(self, path, object.read())
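Note: ModifiedMemoryFileSystem registers Python file-like objects under the internal DUCKDB_INTERNAL_OBJECTSTORE protocol, which is how the package surfaces in-memory handles to the engine. A rough illustration of the user-facing behaviour this enables (a sketch, not part of the diff):

import duckdb
from io import StringIO

# A text handle is accepted directly; text streams are wrapped in
# BytesIOWrapper so the reader sees bytes.
handle = StringIO("a,b\n1,2\n3,4\n")
print(duckdb.read_csv(handle).fetchall())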
duckdb/functional/__init__.py ADDED
@@ -0,0 +1,17 @@
+ from _duckdb.functional import (
+     FunctionNullHandling,
+     PythonUDFType,
+     SPECIAL,
+     DEFAULT,
+     NATIVE,
+     ARROW
+ )
+
+ __all__ = [
+     "FunctionNullHandling",
+     "PythonUDFType",
+     "SPECIAL",
+     "DEFAULT",
+     "NATIVE",
+     "ARROW"
+ ]
duckdb/functional/__init__.pyi ADDED
@@ -0,0 +1,31 @@
+ from typing import Dict
+
+ SPECIAL: FunctionNullHandling
+ DEFAULT: FunctionNullHandling
+
+ NATIVE: PythonUDFType
+ ARROW: PythonUDFType
+
+ class FunctionNullHandling:
+     DEFAULT: FunctionNullHandling
+     SPECIAL: FunctionNullHandling
+     def __int__(self) -> int: ...
+     def __index__(self) -> int: ...
+     @property
+     def __members__(self) -> Dict[str, FunctionNullHandling]: ...
+     @property
+     def name(self) -> str: ...
+     @property
+     def value(self) -> int: ...
+
+ class PythonUDFType:
+     NATIVE: PythonUDFType
+     ARROW: PythonUDFType
+     def __int__(self) -> int: ...
+     def __index__(self) -> int: ...
+     @property
+     def __members__(self) -> Dict[str, PythonUDFType]: ...
+     @property
+     def name(self) -> str: ...
+     @property
+     def value(self) -> int: ...
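Note: these stubs describe the enums re-exported by duckdb.functional. A minimal sketch of how they are typically passed to a connection's create_function (assuming the enum values are accepted for the type and null_handling keywords; string forms such as 'native' and 'special' are also commonly used):

import duckdb
from duckdb.functional import PythonUDFType, FunctionNullHandling
from duckdb.typing import VARCHAR

con = duckdb.connect()

# Row-at-a-time UDF that handles NULL itself: SPECIAL null handling means
# the function is invoked even when the argument is NULL.
con.create_function(
    "my_upper",
    lambda s: s.upper() if s is not None else None,
    [VARCHAR],
    VARCHAR,
    type=PythonUDFType.NATIVE,
    null_handling=FunctionNullHandling.SPECIAL,
)

print(con.sql("SELECT my_upper('abc'), my_upper(NULL)").fetchall())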
duckdb/polars_io.py ADDED
@@ -0,0 +1,237 @@
+ import duckdb
+ import polars as pl
+ from typing import Iterator, Optional
+
+ from polars.io.plugins import register_io_source
+ from duckdb import SQLExpression
+ import json
+ from decimal import Decimal
+ import datetime
+
+ def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
+     """
+     Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
+
+     Parameters:
+         predicate (pl.Expr): A Polars expression (e.g., col("foo") > 5)
+
+     Returns:
+         SQLExpression: A DuckDB SQL expression string equivalent.
+         None: If conversion fails.
+
+     Example:
+         >>> _predicate_to_expression(pl.col("foo") > 5)
+         SQLExpression("(foo > 5)")
+     """
+     # Serialize the Polars expression tree to JSON
+     tree = json.loads(predicate.meta.serialize(format="json"))
+
+     try:
+         # Convert the tree to SQL
+         sql_filter = _pl_tree_to_sql(tree)
+         return SQLExpression(sql_filter)
+     except:
+         # If the conversion fails, we return None
+         return None
+
+
+ def _pl_operation_to_sql(op: str) -> str:
+     """
+     Map Polars binary operation strings to SQL equivalents.
+
+     Example:
+         >>> _pl_operation_to_sql("Eq")
+         '='
+     """
+     try:
+         return {
+             "Lt": "<",
+             "LtEq": "<=",
+             "Gt": ">",
+             "GtEq": ">=",
+             "Eq": "=",
+             "Modulus": "%",
+             "And": "AND",
+             "Or": "OR",
+         }[op]
+     except KeyError:
+         raise NotImplementedError(op)
+
+
+ def _escape_sql_identifier(identifier: str) -> str:
+     """
+     Escape SQL identifiers by doubling any double quotes and wrapping in double quotes.
+
+     Example:
+         >>> _escape_sql_identifier('column"name')
+         '"column""name"'
+     """
+     escaped = identifier.replace('"', '""')
+     return f'"{escaped}"'
+
+
+ def _pl_tree_to_sql(tree: dict) -> str:
+     """
+     Recursively convert a Polars expression tree (as JSON) to a SQL string.
+
+     Parameters:
+         tree (dict): JSON-deserialized expression tree from Polars
+
+     Returns:
+         str: SQL expression string
+
+     Example:
+         Input tree:
+         {
+             "BinaryExpr": {
+                 "left": { "Column": "foo" },
+                 "op": "Gt",
+                 "right": { "Literal": { "Int": 5 } }
+             }
+         }
+         Output: "(foo > 5)"
+     """
+     [node_type] = tree.keys()
+     subtree = tree[node_type]
+
+     if node_type == "BinaryExpr":
+         # Binary expressions: left OP right
+         return (
+             "(" +
+             " ".join((
+                 _pl_tree_to_sql(subtree['left']),
+                 _pl_operation_to_sql(subtree['op']),
+                 _pl_tree_to_sql(subtree['right'])
+             )) +
+             ")"
+         )
+     if node_type == "Column":
+         # A reference to a column name
+         # Wrap in quotes to handle special characters
+         return _escape_sql_identifier(subtree)
+
+     if node_type in ("Literal", "Dyn"):
+         # Recursively process dynamic or literal values
+         return _pl_tree_to_sql(subtree)
+
+     if node_type == "Int":
+         # Direct integer literals
+         return str(subtree)
+
+     if node_type == "Function":
+         # Handle boolean functions like IsNull, IsNotNull
+         inputs = subtree["input"]
+         func_dict = subtree["function"]
+
+         if "Boolean" in func_dict:
+             func = func_dict["Boolean"]
+             arg_sql = _pl_tree_to_sql(inputs[0])
+
+             if func == "IsNull":
+                 return f"({arg_sql} IS NULL)"
+             if func == "IsNotNull":
+                 return f"({arg_sql} IS NOT NULL)"
+             raise NotImplementedError(f"Boolean function not supported: {func}")
+
+         raise NotImplementedError(f"Unsupported function type: {func_dict}")
+
+     if node_type == "Scalar":
+         # Detect format: old style (dtype/value) or new style (direct type key)
+         if "dtype" in subtree and "value" in subtree:
+             dtype = str(subtree["dtype"])
+             value = subtree["value"]
+         else:
+             # New style: dtype is the single key in the dict
+             dtype = next(iter(subtree.keys()))
+             value = subtree
+
+         # Decimal support
+         if dtype.startswith("{'Decimal'") or dtype == "Decimal":
+             decimal_value = value['Decimal']
+             decimal_value = Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[1])
+             return str(decimal_value)
+
+         # Datetime with microseconds since epoch
+         if dtype.startswith("{'Datetime'") or dtype == "Datetime":
+             micros = value['Datetime'][0]
+             dt_timestamp = datetime.datetime.fromtimestamp(micros / 1_000_000, tz=datetime.UTC)
+             return f"'{str(dt_timestamp)}'::TIMESTAMP"
+
+         # Match simple numeric/boolean types
+         if dtype in ("Int8", "Int16", "Int32", "Int64",
+                      "UInt8", "UInt16", "UInt32", "UInt64",
+                      "Float32", "Float64", "Boolean"):
+             return str(value[dtype])
+
+         # Time type
+         if dtype == "Time":
+             nanoseconds = value["Time"]
+             seconds = nanoseconds // 1_000_000_000
+             microseconds = (nanoseconds % 1_000_000_000) // 1_000
+             dt_time = (datetime.datetime.min + datetime.timedelta(
+                 seconds=seconds, microseconds=microseconds
+             )).time()
+             return f"'{dt_time}'::TIME"
+
+         # Date type
+         if dtype == "Date":
+             days_since_epoch = value["Date"]
+             date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
+             return f"'{date}'::DATE"
+
+         # Binary type
+         if dtype == "Binary":
+             binary_data = bytes(value["Binary"])
+             escaped = ''.join(f'\\x{b:02x}' for b in binary_data)
+             return f"'{escaped}'::BLOB"
+
+         # String type
+         if dtype == "String" or dtype == "StringOwned":
+             # Some new formats may store directly under StringOwned
+             string_val = value.get("StringOwned", value.get("String", None))
+             return f"'{string_val}'"
+
+
+         raise NotImplementedError(f"Unsupported scalar type {str(dtype)}, with value {value}")
+
+     raise NotImplementedError(f"Node type: {node_type} is not implemented. {subtree}")
+
+ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -> pl.LazyFrame:
+     """
+     A polars IO plugin for DuckDB.
+     """
+     def source_generator(
+         with_columns: Optional[list[str]],
+         predicate: Optional[pl.Expr],
+         n_rows: Optional[int],
+         batch_size: Optional[int],
+     ) -> Iterator[pl.DataFrame]:
+         duck_predicate = None
+         relation_final = relation
+         if with_columns is not None:
+             cols = ",".join(map(_escape_sql_identifier, with_columns))
+             relation_final = relation_final.project(cols)
+         if n_rows is not None:
+             relation_final = relation_final.limit(n_rows)
+         if predicate is not None:
+             # We have a predicate, if possible, we push it down to DuckDB
+             duck_predicate = _predicate_to_expression(predicate)
+         # Try to pushdown filter, if one exists
+         if duck_predicate is not None:
+             relation_final = relation_final.filter(duck_predicate)
+         if batch_size is None:
+             results = relation_final.fetch_arrow_reader()
+         else:
+             results = relation_final.fetch_arrow_reader(batch_size)
+         while True:
+             try:
+                 record_batch = results.read_next_batch()
+                 if predicate is not None and duck_predicate is None:
+                     # We have a predicate, but did not manage to push it down, we fallback here
+                     yield pl.from_arrow(record_batch).filter(predicate)
+                 else:
+                     yield pl.from_arrow(record_batch)
+             except StopIteration:
+                 break
+
+     return register_io_source(source_generator, schema=schema)
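Note: duckdb_source wires a DuckDB relation into Polars' IO-plugin interface, so projections, limits and (where the expression tree converts cleanly) filter predicates run inside DuckDB rather than in Polars. A small usage sketch, illustrative only: the direct import and the hand-written schema are assumptions, since the module is normally driven by the duckdb package itself.

import duckdb
import polars as pl
from duckdb.polars_io import duckdb_source

con = duckdb.connect()
rel = con.sql("SELECT * FROM range(1000) t(foo)")

# The schema the resulting LazyFrame should advertise for this relation.
lf = duckdb_source(rel, pl.Schema({"foo": pl.Int64}))

# The filter is serialized by _predicate_to_expression and, when convertible,
# pushed down into DuckDB; otherwise it is applied per record batch in Polars.
print(lf.filter(pl.col("foo") > 995).collect())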