duckdb 1.4.1.dev125-cp313-cp313-win_amd64.whl → 1.5.0.dev94-cp313-cp313-win_amd64.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- _duckdb-stubs/__init__.pyi +1443 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- _duckdb.cp313-win_amd64.pyd +0 -0
- adbc_driver_duckdb/__init__.py +1 -2
- duckdb/__init__.py +248 -341
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- duckdb/bytes_io_wrapper.py +10 -6
- duckdb/experimental/spark/sql/column.py +1 -1
- duckdb/experimental/spark/sql/type_utils.py +1 -1
- duckdb/experimental/spark/sql/types.py +1 -1
- duckdb/filesystem.py +20 -15
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +11 -1
- duckdb/polars_io.py +81 -43
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +11 -1
- duckdb/udf.py +2 -2
- duckdb/value/constant/__init__.py +1 -1
- duckdb-1.5.0.dev94.dist-info/METADATA +88 -0
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev94.dist-info}/RECORD +25 -22
- duckdb/__init__.pyi +0 -1137
- duckdb/functional/__init__.pyi +0 -31
- duckdb/typing/__init__.pyi +0 -38
- duckdb/value/constant/__init__.pyi +0 -114
- duckdb-1.4.1.dev125.dist-info/METADATA +0 -326
- /duckdb/{value/__init__.pyi → py.typed} +0 -0
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev94.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev125.dist-info → duckdb-1.5.0.dev94.dist-info}/licenses/LICENSE +0 -0
duckdb/_dbapi_type_object.py
ADDED

@@ -0,0 +1,231 @@
+"""DuckDB DB API 2.0 Type Objects Module.
+
+This module provides DB API 2.0 compliant type objects for DuckDB, allowing applications
+to check column types returned by queries against standard database API categories.
+
+Example:
+    >>> import duckdb
+    >>>
+    >>> conn = duckdb.connect()
+    >>> cursor = conn.cursor()
+    >>> cursor.execute("SELECT 'hello' as text_col, 42 as num_col, CURRENT_DATE as date_col")
+    >>>
+    >>> # Check column types using DB API type objects
+    >>> for i, desc in enumerate(cursor.description):
+    >>>     col_name, col_type = desc[0], desc[1]
+    >>>     if col_type == duckdb.STRING:
+    >>>         print(f"{col_name} is a string type")
+    >>>     elif col_type == duckdb.NUMBER:
+    >>>         print(f"{col_name} is a numeric type")
+    >>>     elif col_type == duckdb.DATETIME:
+    >>>         print(f"{col_name} is a date/time type")
+
+See Also:
+    - PEP 249: https://peps.python.org/pep-0249/
+    - DuckDB Type System: https://duckdb.org/docs/sql/data_types/overview
+"""
+
+from duckdb import sqltypes
+
+
+class DBAPITypeObject:
+    """DB API 2.0 type object for categorizing database column types.
+
+    This class implements the type objects defined in PEP 249 (DB API 2.0).
+    It allows checking whether a specific DuckDB type belongs to a broader
+    category like STRING, NUMBER, DATETIME, etc.
+
+    The type object supports equality comparison with DuckDBPyType instances,
+    returning True if the type belongs to this category.
+
+    Args:
+        types: A list of DuckDBPyType instances that belong to this type category.
+
+    Example:
+        >>> string_types = DBAPITypeObject([sqltypes.VARCHAR, sqltypes.CHAR])
+        >>> result = sqltypes.VARCHAR == string_types  # True
+        >>> result = sqltypes.INTEGER == string_types  # False
+
+    Note:
+        This follows the DB API 2.0 specification where type objects are compared
+        using equality operators rather than isinstance() checks.
+    """
+
+    def __init__(self, types: list[sqltypes.DuckDBPyType]) -> None:
+        """Initialize a DB API type object.
+
+        Args:
+            types: List of DuckDB types that belong to this category.
+        """
+        self.types = types
+
+    def __eq__(self, other: object) -> bool:
+        """Check if a DuckDB type belongs to this type category.
+
+        This method implements the DB API 2.0 type checking mechanism.
+        It returns True if the other object is a DuckDBPyType that
+        is contained in this type category.
+
+        Args:
+            other: The object to compare, typically a DuckDBPyType instance.
+
+        Returns:
+            True if other is a DuckDBPyType in this category, False otherwise.
+
+        Example:
+            >>> NUMBER == sqltypes.INTEGER  # True
+            >>> NUMBER == sqltypes.VARCHAR  # False
+        """
+        if isinstance(other, sqltypes.DuckDBPyType):
+            return other in self.types
+        return False
+
+    def __repr__(self) -> str:
+        """Return a string representation of this type object.
+
+        Returns:
+            A string showing the type object and its contained DuckDB types.
+
+        Example:
+            >>> repr(STRING)
+            '<DBAPITypeObject [VARCHAR]>'
+        """
+        return f"<DBAPITypeObject [{','.join(str(x) for x in self.types)}]>"
+
+
+# Define the standard DB API 2.0 type objects for DuckDB
+
+STRING = DBAPITypeObject([sqltypes.VARCHAR])
+"""
+STRING type object for text-based database columns.
+
+This type object represents all string/text types in DuckDB. Currently includes:
+- VARCHAR: Variable-length character strings
+
+Use this to check if a column contains textual data that should be handled
+as Python strings.
+
+DB API 2.0 Reference:
+    https://peps.python.org/pep-0249/#string
+
+Example:
+    >>> cursor.description[0][1] == STRING  # Check if first column is text
+"""
+
+NUMBER = DBAPITypeObject(
+    [
+        sqltypes.TINYINT,
+        sqltypes.UTINYINT,
+        sqltypes.SMALLINT,
+        sqltypes.USMALLINT,
+        sqltypes.INTEGER,
+        sqltypes.UINTEGER,
+        sqltypes.BIGINT,
+        sqltypes.UBIGINT,
+        sqltypes.HUGEINT,
+        sqltypes.UHUGEINT,
+        sqltypes.DuckDBPyType("BIGNUM"),
+        sqltypes.DuckDBPyType("DECIMAL"),
+        sqltypes.FLOAT,
+        sqltypes.DOUBLE,
+    ]
+)
+"""
+NUMBER type object for numeric database columns.
+
+This type object represents all numeric types in DuckDB, including:
+
+Integer Types:
+    - TINYINT, UTINYINT: 8-bit signed/unsigned integers
+    - SMALLINT, USMALLINT: 16-bit signed/unsigned integers
+    - INTEGER, UINTEGER: 32-bit signed/unsigned integers
+    - BIGINT, UBIGINT: 64-bit signed/unsigned integers
+    - HUGEINT, UHUGEINT: 128-bit signed/unsigned integers
+
+Decimal Types:
+    - BIGNUM: Arbitrary precision integers
+    - DECIMAL: Fixed-point decimal numbers
+
+Floating Point Types:
+    - FLOAT: 32-bit floating point
+    - DOUBLE: 64-bit floating point
+
+Use this to check if a column contains numeric data that should be handled
+as Python int, float, or Decimal objects.
+
+DB API 2.0 Reference:
+    https://peps.python.org/pep-0249/#number
+
+Example:
+    >>> cursor.description[1][1] == NUMBER  # Check if second column is numeric
+"""
+
+DATETIME = DBAPITypeObject(
+    [
+        sqltypes.DATE,
+        sqltypes.TIME,
+        sqltypes.TIME_TZ,
+        sqltypes.TIMESTAMP,
+        sqltypes.TIMESTAMP_TZ,
+        sqltypes.TIMESTAMP_NS,
+        sqltypes.TIMESTAMP_MS,
+        sqltypes.TIMESTAMP_S,
+    ]
+)
+"""
+DATETIME type object for date and time database columns.
+
+This type object represents all date/time types in DuckDB, including:
+
+Date Types:
+    - DATE: Calendar dates (year, month, day)
+
+Time Types:
+    - TIME: Time of day without timezone
+    - TIME_TZ: Time of day with timezone
+
+Timestamp Types:
+    - TIMESTAMP: Date and time without timezone (microsecond precision)
+    - TIMESTAMP_TZ: Date and time with timezone
+    - TIMESTAMP_NS: Nanosecond precision timestamps
+    - TIMESTAMP_MS: Millisecond precision timestamps
+    - TIMESTAMP_S: Second precision timestamps
+
+Use this to check if a column contains temporal data that should be handled
+as Python datetime, date, or time objects.
+
+DB API 2.0 Reference:
+    https://peps.python.org/pep-0249/#datetime
+
+Example:
+    >>> cursor.description[2][1] == DATETIME  # Check if third column is date/time
+"""
+
+BINARY = DBAPITypeObject([sqltypes.BLOB])
+"""
+BINARY type object for binary data database columns.
+
+This type object represents binary data types in DuckDB:
+- BLOB: Binary Large Objects for storing arbitrary binary data
+
+Use this to check if a column contains binary data that should be handled
+as Python bytes objects.
+
+DB API 2.0 Reference:
+    https://peps.python.org/pep-0249/#binary
+
+Example:
+    >>> cursor.description[3][1] == BINARY  # Check if fourth column is binary
+"""
+
+ROWID = None
+"""
+ROWID type object for row identifier columns.
+
+DB API 2.0 Reference:
+    https://peps.python.org/pep-0249/#rowid
+
+Note:
+    This will always be None for DuckDB connections. Applications should not
+    rely on ROWID functionality when using DuckDB.
+"""
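The new module above gives DuckDB a PEP 249-style type surface. A minimal sketch of how it is meant to be used against cursor.description, assuming the wheel re-exports STRING, NUMBER, and DATETIME at the top level as the module docstring suggests:

import duckdb

conn = duckdb.connect()
cur = conn.cursor()
cur.execute("SELECT 'hello' AS t, 42 AS n, CURRENT_DATE AS d")

# PEP 249 type objects are compared with ==, not isinstance()
for name, type_code, *_ in cur.description:
    if type_code == duckdb.STRING:
        print(f"{name} is a string column")
    elif type_code == duckdb.NUMBER:
        print(f"{name} is a numeric column")
    elif type_code == duckdb.DATETIME:
        print(f"{name} is a date/time column")
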
duckdb/_version.py
ADDED

@@ -0,0 +1,22 @@
+# ----------------------------------------------------------------------
+# Version API
+#
+# We provide three symbols:
+# - duckdb.__version__: The version of this package
+# - duckdb.__duckdb_version__: The version of duckdb that is bundled
+# - duckdb.version(): A human-readable version string containing both of the above
+# ----------------------------------------------------------------------
+from importlib.metadata import version as _dist_version
+
+import _duckdb
+
+__version__: str = _dist_version("duckdb")
+"""Version of the DuckDB Python Package."""
+
+__duckdb_version__: str = _duckdb.__version__
+"""Version of DuckDB that is bundled."""
+
+
+def version() -> str:
+    """Human-friendly formatted version string of both the distribution package and the bundled DuckDB engine."""
+    return f"{__version__} (with duckdb {_duckdb.__version__})"
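A short sketch of the resulting API, assuming duckdb/__init__.py re-exports these symbols as the comment block describes:

import duckdb

print(duckdb.__version__)         # version of the Python package, e.g. "1.5.0.dev94"
print(duckdb.__duckdb_version__)  # version of the bundled DuckDB engine
print(duckdb.version())           # both combined: "<package> (with duckdb <engine>)"
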
duckdb/bytes_io_wrapper.py
CHANGED

@@ -1,7 +1,5 @@
-
-from typing import Any, Union
+"""StringIO buffer wrapper.
 
-"""
 BSD 3-Clause License
 
 Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team

@@ -35,10 +33,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 
+from io import StringIO, TextIOBase
+from typing import Any, Union
+
+
+class BytesIOWrapper:
+    """Wrapper that wraps a StringIO buffer and reads bytes from it.
+
+    Created for compat with pyarrow read_csv.
+    """
 
-class BytesIOWrapper:  # noqa: D101
-    # Wrapper that wraps a StringIO buffer and reads bytes from it
-    # Created for compat with pyarrow read_csv
     def __init__(self, buffer: Union[StringIO, TextIOBase], encoding: str = "utf-8") -> None:  # noqa: D107
         self.buffer = buffer
         self.encoding = encoding
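For context, the wrapper exists because pyarrow's CSV reader wants bytes while StringIO yields str. A minimal sketch, assuming read() with no arguments drains the buffer the way filesystem.py (below) uses it:

from io import StringIO

from duckdb.bytes_io_wrapper import BytesIOWrapper

buf = StringIO("a,b\n1,2\n")
wrapped = BytesIOWrapper(buf, encoding="utf-8")
data = wrapped.read()  # bytes, e.g. b"a,b\n1,2\n"
assert isinstance(data, bytes)
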
duckdb/experimental/spark/sql/column.py
CHANGED

@@ -8,7 +8,7 @@ if TYPE_CHECKING:
     from ._typing import DateTimeLiteral, DecimalLiteral, LiteralType
 
     from duckdb import ColumnExpression, ConstantExpression, Expression, FunctionExpression
-    from duckdb.
+    from duckdb.sqltypes import DuckDBPyType
 
 __all__ = ["Column"]
 
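The only change here is the import path: DuckDBPyType now lives in the new duckdb.sqltypes module (the old duckdb/typing/__init__.pyi stubs are gone from this wheel). A sketch, assuming sqltypes exposes the usual type singletons as DuckDBPyType instances, as the _dbapi_type_object.py diff above implies:

from duckdb.sqltypes import VARCHAR, DuckDBPyType

assert isinstance(VARCHAR, DuckDBPyType)
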
duckdb/filesystem.py
CHANGED

@@ -1,5 +1,12 @@
-
-
+"""In-memory filesystem to store ephemeral dependencies.
+
+Warning: Not for external use. May change at any moment. Likely to be made internal.
+"""
+
+from __future__ import annotations
+
+import io
+import typing
 
 from fsspec import AbstractFileSystem
 from fsspec.implementations.memory import MemoryFile, MemoryFileSystem

@@ -7,22 +14,20 @@ from fsspec.implementations.memory import MemoryFile, MemoryFileSystem
 from .bytes_io_wrapper import BytesIOWrapper
 
 
-
-
-    return hasattr(obj, "read") and hasattr(obj, "seek")
+class ModifiedMemoryFileSystem(MemoryFileSystem):
+    """In-memory filesystem implementation that uses its own protocol."""
 
-
-class ModifiedMemoryFileSystem(MemoryFileSystem):  # noqa: D101
     protocol = ("DUCKDB_INTERNAL_OBJECTSTORE",)
     # defer to the original implementation that doesn't hardcode the protocol
-    _strip_protocol = classmethod(AbstractFileSystem._strip_protocol.__func__)
+    _strip_protocol: typing.Callable[[str], str] = classmethod(AbstractFileSystem._strip_protocol.__func__)  # type: ignore[assignment]
 
-    def add_file(self,
-
+    def add_file(self, obj: io.IOBase | BytesIOWrapper | object, path: str) -> None:
+        """Add a file to the filesystem."""
+        if not (hasattr(obj, "read") and hasattr(obj, "seek")):
             msg = "Can not read from a non file-like object"
-            raise
-
-            if isinstance(object, TextIOBase):
+            raise TypeError(msg)
+        if isinstance(obj, io.TextIOBase):
             # Wrap this so that we can return a bytes object from 'read'
-
-
+            obj = BytesIOWrapper(obj)
+        path = self._strip_protocol(path)
+        self.store[path] = MemoryFile(self, path, obj.read())
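A sketch of the reworked add_file flow; the module docstring warns this is not for external use, so treat it as illustration only:

from io import StringIO

from duckdb.filesystem import ModifiedMemoryFileSystem

fs = ModifiedMemoryFileSystem()
fs.add_file(StringIO("col\n1\n"), "data.csv")  # TextIOBase input is wrapped to yield bytes
print(list(fs.store))  # the stored path, after _strip_protocol

try:
    fs.add_file(42, "nope")  # no read()/seek(): now a proper TypeError, not a bare raise
except TypeError as exc:
    print(exc)  # Can not read from a non file-like object
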
duckdb/func/__init__.py
ADDED
duckdb/functional/__init__.py
CHANGED

@@ -1,3 +1,13 @@
-
+"""DuckDB function constants and types. DEPRECATED: please use `duckdb.func` instead."""
+
+import warnings
+
+from duckdb.func import ARROW, DEFAULT, NATIVE, SPECIAL, FunctionNullHandling, PythonUDFType
 
 __all__ = ["ARROW", "DEFAULT", "NATIVE", "SPECIAL", "FunctionNullHandling", "PythonUDFType"]
+
+warnings.warn(
+    "`duckdb.functional` is deprecated and will be removed in a future version. Please use `duckdb.func` instead.",
+    DeprecationWarning,
+    stacklevel=2,
+)
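The shim keeps the old names importable while steering callers to duckdb.func. A sketch; note the warning only fires on first import, since Python caches modules:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import duckdb.functional  # noqa: F401

assert any(issubclass(w.category, DeprecationWarning) for w in caught)

# the preferred spelling going forward
from duckdb.func import ARROW, DEFAULT, NATIVE, SPECIAL
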
duckdb/polars_io.py
CHANGED

@@ -1,17 +1,25 @@
-import
+from __future__ import annotations  # noqa: D100
+
+import contextlib
+import datetime
 import json
-
+import typing
 from decimal import Decimal
-from typing import Optional
 
 import polars as pl
 from polars.io.plugins import register_io_source
 
 import duckdb
-from duckdb import SQLExpression
 
+if typing.TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    import typing_extensions
+
+    _ExpressionTree: typing_extensions.TypeAlias = typing.Dict[str, typing.Union[str, int, "_ExpressionTree", typing.Any]]  # noqa: UP006
 
-def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
+
+def _predicate_to_expression(predicate: pl.Expr) -> duckdb.Expression | None:
     """Convert a Polars predicate expression to a DuckDB-compatible SQL expression.
 
     Parameters:

@@ -31,7 +39,7 @@ def _predicate_to_expression(predicate: pl.Expr) -> Optional[SQLExpression]:
     try:
         # Convert the tree to SQL
         sql_filter = _pl_tree_to_sql(tree)
-        return SQLExpression(sql_filter)
+        return duckdb.SQLExpression(sql_filter)
     except Exception:
         # If the conversion fails, we return None
         return None

@@ -70,7 +78,7 @@ def _escape_sql_identifier(identifier: str) -> str:
     return f'"{escaped}"'
 
 
-def _pl_tree_to_sql(tree:
+def _pl_tree_to_sql(tree: _ExpressionTree) -> str:
     """Recursively convert a Polars expression tree (as JSON) to a SQL string.
 
     Parameters:

@@ -91,38 +99,51 @@ def _pl_tree_to_sql(tree: dict) -> str:
     Output: "(foo > 5)"
     """
     [node_type] = tree.keys()
-    subtree = tree[node_type]
 
     if node_type == "BinaryExpr":
         # Binary expressions: left OP right
-
-
-
-
-
-
-
-                )
-            )
-            + ")"
-        )
+        bin_expr_tree = tree[node_type]
+        assert isinstance(bin_expr_tree, dict), f"A {node_type} should be a dict but got {type(bin_expr_tree)}"
+        lhs, op, rhs = bin_expr_tree["left"], bin_expr_tree["op"], bin_expr_tree["right"]
+        assert isinstance(lhs, dict), f"LHS of a {node_type} should be a dict but got {type(lhs)}"
+        assert isinstance(op, str), f"The op of a {node_type} should be a str but got {type(op)}"
+        assert isinstance(rhs, dict), f"RHS of a {node_type} should be a dict but got {type(rhs)}"
+        return f"({_pl_tree_to_sql(lhs)} {_pl_operation_to_sql(op)} {_pl_tree_to_sql(rhs)})"
     if node_type == "Column":
         # A reference to a column name
         # Wrap in quotes to handle special characters
-
+        col_name = tree[node_type]
+        assert isinstance(col_name, str), f"The col name of a {node_type} should be a str but got {type(col_name)}"
+        return _escape_sql_identifier(col_name)
 
     if node_type in ("Literal", "Dyn"):
        # Recursively process dynamic or literal values
-
+        val_tree = tree[node_type]
+        assert isinstance(val_tree, dict), f"A {node_type} should be a dict but got {type(val_tree)}"
+        return _pl_tree_to_sql(val_tree)
 
     if node_type == "Int":
         # Direct integer literals
-
+        int_literal = tree[node_type]
+        assert isinstance(int_literal, (int, str)), (
+            f"The value of an Int should be an int or str but got {type(int_literal)}"
+        )
+        return str(int_literal)
 
     if node_type == "Function":
         # Handle boolean functions like IsNull, IsNotNull
-
-
+        func_tree = tree[node_type]
+        assert isinstance(func_tree, dict), f"A {node_type} should be a dict but got {type(func_tree)}"
+        inputs = func_tree["input"]
+        assert isinstance(inputs, list), f"A {node_type} should have a list of dicts as input but got {type(inputs)}"
+        input_tree = inputs[0]
+        assert isinstance(input_tree, dict), (
+            f"A {node_type} should have a list of dicts as input but got {type(input_tree)}"
+        )
+        func_dict = func_tree["function"]
+        assert isinstance(func_dict, dict), (
+            f"A {node_type} should have a function dict as input but got {type(func_dict)}"
+        )
 
         if "Boolean" in func_dict:
             func = func_dict["Boolean"]

@@ -140,24 +161,34 @@ def _pl_tree_to_sql(tree: dict) -> str:
 
     if node_type == "Scalar":
         # Detect format: old style (dtype/value) or new style (direct type key)
-
-
-
+        scalar_tree = tree[node_type]
+        assert isinstance(scalar_tree, dict), f"A {node_type} should be a dict but got {type(scalar_tree)}"
+        if "dtype" in scalar_tree and "value" in scalar_tree:
+            dtype = str(scalar_tree["dtype"])
+            value = scalar_tree["value"]
         else:
             # New style: dtype is the single key in the dict
-            dtype = next(iter(
-            value =
+            dtype = next(iter(scalar_tree.keys()))
+            value = scalar_tree
+        assert isinstance(dtype, str), f"A {node_type} should have a str dtype but got {type(dtype)}"
+        assert isinstance(value, dict), f"A {node_type} should have a dict value but got {type(value)}"
 
         # Decimal support
         if dtype.startswith("{'Decimal'") or dtype == "Decimal":
             decimal_value = value["Decimal"]
-
-
+            assert isinstance(decimal_value, list), (
+                f"A {dtype} should be a two or three member list but got {type(decimal_value)}"
+            )
+            assert 2 <= len(decimal_value) <= 3, (
+                f"A {dtype} should be a two or three member list but got {len(decimal_value)} member list"
+            )
+            return str(Decimal(decimal_value[0]) / Decimal(10 ** decimal_value[-1]))
 
         # Datetime with microseconds since epoch
         if dtype.startswith("{'Datetime'") or dtype == "Datetime":
-            micros = value["Datetime"]
-
+            micros = value["Datetime"]
+            assert isinstance(micros, list), f"A {dtype} should be a one member list but got {type(micros)}"
+            dt_timestamp = datetime.datetime.fromtimestamp(micros[0] / 1_000_000, tz=datetime.timezone.utc)
             return f"'{dt_timestamp!s}'::TIMESTAMP"
 
         # Match simple numeric/boolean types

@@ -179,6 +210,7 @@ def _pl_tree_to_sql(tree: dict) -> str:
         # Time type
         if dtype == "Time":
             nanoseconds = value["Time"]
+            assert isinstance(nanoseconds, int), f"A {dtype} should be an int but got {type(nanoseconds)}"
             seconds = nanoseconds // 1_000_000_000
             microseconds = (nanoseconds % 1_000_000_000) // 1_000
             dt_time = (datetime.datetime.min + datetime.timedelta(seconds=seconds, microseconds=microseconds)).time()

@@ -187,25 +219,30 @@ def _pl_tree_to_sql(tree: dict) -> str:
         # Date type
         if dtype == "Date":
             days_since_epoch = value["Date"]
+            assert isinstance(days_since_epoch, (float, int)), (
+                f"A {dtype} should be a number but got {type(days_since_epoch)}"
+            )
             date = datetime.date(1970, 1, 1) + datetime.timedelta(days=days_since_epoch)
             return f"'{date}'::DATE"
 
         # Binary type
         if dtype == "Binary":
-
+            bin_value = value["Binary"]
+            assert isinstance(bin_value, list), f"A {dtype} should be a list but got {type(bin_value)}"
+            binary_data = bytes(bin_value)
             escaped = "".join(f"\\x{b:02x}" for b in binary_data)
             return f"'{escaped}'::BLOB"
 
         # String type
         if dtype == "String" or dtype == "StringOwned":
             # Some new formats may store directly under StringOwned
-            string_val = value.get("StringOwned", value.get("String", None))
+            string_val: object | None = value.get("StringOwned", value.get("String", None))
             return f"'{string_val}'"
 
         msg = f"Unsupported scalar type {dtype!s}, with value {value}"
         raise NotImplementedError(msg)
 
-    msg = f"Node type: {node_type} is not implemented. {
+    msg = f"Node type: {node_type} is not implemented. {tree[node_type]}"
     raise NotImplementedError(msg)
 
 

@@ -213,10 +250,10 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
     """A polars IO plugin for DuckDB."""
 
     def source_generator(
-        with_columns:
-        predicate:
-        n_rows:
-        batch_size:
+        with_columns: list[str] | None,
+        predicate: pl.Expr | None,
+        n_rows: int | None,
+        batch_size: int | None,
     ) -> Iterator[pl.DataFrame]:
         duck_predicate = None
         relation_final = relation

@@ -227,7 +264,8 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
             relation_final = relation_final.limit(n_rows)
         if predicate is not None:
             # We have a predicate, if possible, we push it down to DuckDB
-
+            with contextlib.suppress(AssertionError, KeyError):
+                duck_predicate = _predicate_to_expression(predicate)
         # Try to pushdown filter, if one exists
         if duck_predicate is not None:
             relation_final = relation_final.filter(duck_predicate)

@@ -239,8 +277,8 @@ def duckdb_source(relation: duckdb.DuckDBPyRelation, schema: pl.schema.Schema) -
         for record_batch in iter(results.read_next_batch, None):
             if predicate is not None and duck_predicate is None:
                 # We have a predicate, but did not manage to push it down, we fallback here
-                yield pl.from_arrow(record_batch).filter(predicate)
+                yield pl.from_arrow(record_batch).filter(predicate)  # type: ignore[arg-type,misc,unused-ignore]
             else:
-                yield pl.from_arrow(record_batch)
+                yield pl.from_arrow(record_batch)  # type: ignore[misc,unused-ignore]
 
     return register_io_source(source_generator, schema=schema)
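End to end, duckdb_source registers source_generator as a polars IO plugin, and filters are pushed into DuckDB whenever _predicate_to_expression succeeds. A sketch calling it directly, assuming a recent polars with pl.Schema (in the wheel it is normally wired up for you when converting a relation to a lazy frame):

import duckdb
import polars as pl

from duckdb.polars_io import duckdb_source

conn = duckdb.connect()
rel = conn.sql("SELECT range AS i, range * 2 AS j FROM range(100)")

lf = duckdb_source(rel, schema=pl.Schema({"i": pl.Int64, "j": pl.Int64}))

# This filter is translated via _predicate_to_expression; if translation
# succeeds it runs inside DuckDB, otherwise each batch is filtered in polars.
print(lf.filter(pl.col("i") > 90).collect())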