duckdb 1.5.0.dev37-cp310-cp310-win_amd64.whl → 1.5.0.dev94-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckdb has been flagged as possibly problematic.
- _duckdb-stubs/__init__.pyi +1443 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- _duckdb.cp310-win_amd64.pyd +0 -0
- adbc_driver_duckdb/__init__.py +49 -0
- adbc_driver_duckdb/dbapi.py +115 -0
- duckdb/__init__.py +341 -435
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- duckdb/bytes_io_wrapper.py +12 -9
- duckdb/experimental/__init__.py +2 -1
- duckdb/experimental/spark/__init__.py +3 -4
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +7 -9
- duckdb/experimental/spark/conf.py +16 -15
- duckdb/experimental/spark/context.py +60 -44
- duckdb/experimental/spark/errors/__init__.py +33 -35
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +39 -88
- duckdb/experimental/spark/errors/utils.py +11 -16
- duckdb/experimental/spark/exception.py +9 -6
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +8 -15
- duckdb/experimental/spark/sql/catalog.py +21 -20
- duckdb/experimental/spark/sql/column.py +48 -55
- duckdb/experimental/spark/sql/conf.py +9 -8
- duckdb/experimental/spark/sql/dataframe.py +185 -233
- duckdb/experimental/spark/sql/functions.py +1222 -1248
- duckdb/experimental/spark/sql/group.py +56 -52
- duckdb/experimental/spark/sql/readwriter.py +80 -94
- duckdb/experimental/spark/sql/session.py +64 -59
- duckdb/experimental/spark/sql/streaming.py +9 -10
- duckdb/experimental/spark/sql/type_utils.py +67 -65
- duckdb/experimental/spark/sql/types.py +309 -345
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +26 -16
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +12 -16
- duckdb/polars_io.py +130 -83
- duckdb/query_graph/__main__.py +91 -96
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +18 -8
- duckdb/udf.py +10 -5
- duckdb/value/__init__.py +1 -0
- duckdb/value/constant/__init__.py +62 -60
- {duckdb-1.5.0.dev37.dist-info → duckdb-1.5.0.dev94.dist-info}/METADATA +12 -4
- duckdb-1.5.0.dev94.dist-info/RECORD +52 -0
- duckdb/__init__.pyi +0 -713
- duckdb/functional/__init__.pyi +0 -31
- duckdb/typing/__init__.pyi +0 -36
- duckdb/value/constant/__init__.pyi +0 -115
- duckdb-1.5.0.dev37.dist-info/RECORD +0 -47
- /duckdb/{value/__init__.pyi → py.typed} +0 -0
- {duckdb-1.5.0.dev37.dist-info → duckdb-1.5.0.dev94.dist-info}/WHEEL +0 -0
- {duckdb-1.5.0.dev37.dist-info → duckdb-1.5.0.dev94.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/session.py

@@ -1,32 +1,31 @@
-
-import
+import uuid  # noqa: D100
+from collections.abc import Iterable, Sized
+from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
+
+import duckdb
 
 if TYPE_CHECKING:
-    from .catalog import Catalog
     from pandas.core.frame import DataFrame as PandasDataFrame
 
-from
-
+    from .catalog import Catalog
+
+
 from ..conf import SparkConf
-from
+from ..context import SparkContext
+from ..errors import PySparkTypeError
+from ..exception import ContributionsAcceptedError
 from .conf import RuntimeConfig
+from .dataframe import DataFrame
 from .readwriter import DataFrameReader
-from ..context import SparkContext
-from .udf import UDFRegistration
 from .streaming import DataStreamReader
-import
-
-from ..errors import (
-    PySparkTypeError,
-    PySparkValueError
-)
-
-from ..errors.error_classes import *
+from .types import StructType
+from .udf import UDFRegistration
 
 # In spark:
 # SparkSession holds a SparkContext
 # SparkContext gets created from SparkConf
-# At this level the check is made to determine whether the instance already exists and just needs
+# At this level the check is made to determine whether the instance already exists and just needs
+# to be retrieved or it needs to be created.
 
 # For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
 # SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
@@ -34,7 +33,7 @@ from ..errors.error_classes import *
 
 # data is a List of rows
 # every value in each row needs to be turned into a Value
-def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
+def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[duckdb.Value]:
     from duckdb import Value
 
     new_data = []
@@ -44,8 +43,8 @@ def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
     return new_data
 
 
-class SparkSession:
-    def __init__(self, context: SparkContext):
+class SparkSession:  # noqa: D101
+    def __init__(self, context: SparkContext) -> None:  # noqa: D107
         self.conn = context.connection
         self._context = context
         self._conf = RuntimeConfig(self.conn)
@@ -53,15 +52,16 @@ class SparkSession:
     def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
         try:
             import pandas
+
             has_pandas = True
         except ImportError:
             has_pandas = False
         if has_pandas and isinstance(data, pandas.DataFrame):
-            unique_name = f
+            unique_name = f"pyspark_pandas_df_{uuid.uuid1()}"
             self.conn.register(unique_name, data)
             return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)
 
-        def verify_tuple_integrity(tuples):
+        def verify_tuple_integrity(tuples: list[tuple]) -> None:
             if len(tuples) <= 1:
                 return
             expected_length = len(tuples[0])
@@ -73,9 +73,9 @@ class SparkSession:
                     error_class="LENGTH_SHOULD_BE_THE_SAME",
                     message_parameters={
                         "arg1": f"data{i}",
-                        "arg2": f"data{i+1}",
+                        "arg2": f"data{i + 1}",
                         "arg1_length": str(expected_length),
-                        "arg2_length": str(actual_length)
+                        "arg2_length": str(actual_length),
                     },
                 )
 
@@ -83,16 +83,16 @@ class SparkSession:
         data = list(data)
         verify_tuple_integrity(data)
 
-        def construct_query(tuples) -> str:
-            def construct_values_list(row, start_param_idx):
+        def construct_query(tuples: Iterable) -> str:
+            def construct_values_list(row: Sized, start_param_idx: int) -> str:
                 parameter_count = len(row)
-                parameters = [f
-                parameters =
+                parameters = [f"${x + start_param_idx}" for x in range(parameter_count)]
+                parameters = "(" + ", ".join(parameters) + ")"
                 return parameters
 
             row_size = len(tuples[0])
             values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
-            values_list =
+            values_list = ", ".join(values_list)
 
             query = f"""
                 select * from (values {values_list})
@@ -101,7 +101,7 @@ class SparkSession:
 
         query = construct_query(data)
 
-        def construct_parameters(tuples):
+        def construct_parameters(tuples: Iterable) -> list[list]:
             parameters = []
             for row in tuples:
                 parameters.extend(list(row))
@@ -112,7 +112,9 @@ class SparkSession:
         rel = self.conn.sql(query, params=parameters)
         return DataFrame(rel, self)
 
-    def _createDataFrameFromPandas(
+    def _createDataFrameFromPandas(
+        self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None]
+    ) -> DataFrame:
         df = self._create_dataframe(data)
 
         # Cast to types
@@ -123,10 +125,10 @@ class SparkSession:
             df = df.toDF(*names)
         return df
 
-    def createDataFrame(
+    def createDataFrame(  # noqa: D102
         self,
         data: Union["PandasDataFrame", Iterable[Any]],
-        schema: Optional[Union[StructType,
+        schema: Optional[Union[StructType, list[str]]] = None,
         samplingRatio: Optional[float] = None,
         verifySchema: bool = True,
     ) -> DataFrame:
@@ -175,7 +177,7 @@ class SparkSession:
         if is_empty:
             rel = df.relation
             # Add impossible where clause
-            rel = rel.filter(
+            rel = rel.filter("1=0")
             df = DataFrame(rel, self)
 
         # Cast to types
@@ -186,10 +188,10 @@ class SparkSession:
             df = df.toDF(*names)
         return df
 
-    def newSession(self) -> "SparkSession":
+    def newSession(self) -> "SparkSession":  # noqa: D102
         return SparkSession(self._context)
 
-    def range(
+    def range(  # noqa: D102
         self,
         start: int,
         end: Optional[int] = None,
@@ -203,26 +205,26 @@ class SparkSession:
             end = start
             start = 0
 
-        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),self)
+        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]), self)
 
-    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
+    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:  # noqa: D102, ANN401
         if kwargs:
             raise NotImplementedError
         relation = self.conn.sql(sqlQuery)
         return DataFrame(relation, self)
 
-    def stop(self) -> None:
+    def stop(self) -> None:  # noqa: D102
         self._context.stop()
 
-    def table(self, tableName: str) -> DataFrame:
+    def table(self, tableName: str) -> DataFrame:  # noqa: D102
         relation = self.conn.table(tableName)
         return DataFrame(relation, self)
 
-    def getActiveSession(self) -> "SparkSession":
+    def getActiveSession(self) -> "SparkSession":  # noqa: D102
         return self
 
     @property
-    def catalog(self) -> "Catalog":
+    def catalog(self) -> "Catalog":  # noqa: D102
         if not hasattr(self, "_catalog"):
             from duckdb.experimental.spark.sql.catalog import Catalog
 
@@ -230,59 +232,62 @@ class SparkSession:
         return self._catalog
 
     @property
-    def conf(self) -> RuntimeConfig:
+    def conf(self) -> RuntimeConfig:  # noqa: D102
         return self._conf
 
     @property
-    def read(self) -> DataFrameReader:
+    def read(self) -> DataFrameReader:  # noqa: D102
         return DataFrameReader(self)
 
     @property
-    def readStream(self) -> DataStreamReader:
+    def readStream(self) -> DataStreamReader:  # noqa: D102
         return DataStreamReader(self)
 
     @property
-    def sparkContext(self) -> SparkContext:
+    def sparkContext(self) -> SparkContext:  # noqa: D102
         return self._context
 
     @property
-    def streams(self) ->
+    def streams(self) -> NoReturn:  # noqa: D102
         raise ContributionsAcceptedError
 
     @property
-    def udf(self) -> UDFRegistration:
+    def udf(self) -> UDFRegistration:  # noqa: D102
         return UDFRegistration(self)
 
     @property
-    def version(self) -> str:
-        return
+    def version(self) -> str:  # noqa: D102
+        return "1.0.0"
 
-    class Builder:
-        def __init__(self):
+    class Builder:  # noqa: D106
+        def __init__(self) -> None:  # noqa: D107
             pass
 
-        def master(self, name: str) -> "SparkSession.Builder":
+        def master(self, name: str) -> "SparkSession.Builder":  # noqa: D102
            # no-op
            return self
 
-        def appName(self, name: str) -> "SparkSession.Builder":
+        def appName(self, name: str) -> "SparkSession.Builder":  # noqa: D102
            # no-op
            return self
 
-        def remote(self, url: str) -> "SparkSession.Builder":
+        def remote(self, url: str) -> "SparkSession.Builder":  # noqa: D102
            # no-op
            return self
 
-        def getOrCreate(self) -> "SparkSession":
+        def getOrCreate(self) -> "SparkSession":  # noqa: D102
            context = SparkContext("__ignored__")
            return SparkSession(context)
 
-        def config(
-            self,
+        def config(  # noqa: D102
+            self,
+            key: Optional[str] = None,
+            value: Optional[Any] = None,  # noqa: ANN401
+            conf: Optional[SparkConf] = None,
         ) -> "SparkSession.Builder":
            return self
 
-        def enableHiveSupport(self) -> "SparkSession.Builder":
+        def enableHiveSupport(self) -> "SparkSession.Builder":  # noqa: D102
            # no-op
            return self
 
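For orientation, a minimal usage sketch of the Spark-compatible session API that the hunks above modify. It only relies on names visible in this diff (SparkSession, Builder, getOrCreate, createDataFrame, sql); the sample data and the df.show() call are illustrative assumptions about the surrounding DataFrame wrapper, not part of this diff:

    from duckdb.experimental.spark.sql import SparkSession

    # Builder().getOrCreate() constructs a SparkContext("__ignored__") and wraps it,
    # as shown in SparkSession.Builder.getOrCreate() above.
    spark = SparkSession.Builder().getOrCreate()

    # createDataFrame() accepts an iterable of equally sized tuples plus column names;
    # internally it builds a parameterized VALUES query (see construct_query above).
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    df.show()  # assumed to exist on the Spark-style DataFrame wrapper

    # sql() forwards to DuckDBPyConnection.sql() and wraps the relation in a DataFrame.
    spark.sql("select 42 as answer").show()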
duckdb/experimental/spark/sql/streaming.py

@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union  # noqa: D100
+
 from .types import StructType
 
 if TYPE_CHECKING:
@@ -9,28 +10,26 @@ PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]
 
 
-class DataStreamWriter:
-    def __init__(self, dataframe: "DataFrame"):
+class DataStreamWriter:  # noqa: D101
+    def __init__(self, dataframe: "DataFrame") -> None:  # noqa: D107
         self.dataframe = dataframe
 
-    def toTable(self, table_name: str) -> None:
+    def toTable(self, table_name: str) -> None:  # noqa: D102
         # Should we register the dataframe or create a table from the contents?
         raise NotImplementedError
 
 
-class DataStreamReader:
-    def __init__(self, session: "SparkSession"):
+class DataStreamReader:  # noqa: D101
+    def __init__(self, session: "SparkSession") -> None:  # noqa: D107
         self.session = session
 
-    def load(
+    def load(  # noqa: D102
         self,
         path: Optional[str] = None,
         format: Optional[str] = None,
         schema: Union[StructType, str, None] = None,
-        **options: OptionalPrimitiveType
+        **options: OptionalPrimitiveType,
     ) -> "DataFrame":
-        from duckdb.experimental.spark.sql.dataframe import DataFrame
-
         raise NotImplementedError
 
 
duckdb/experimental/spark/sql/type_utils.py

@@ -1,105 +1,107 @@
-from
-
+from typing import cast  # noqa: D100
+
+from duckdb.sqltypes import DuckDBPyType
+
 from .types import (
-
-    StringType,
+    ArrayType,
     BinaryType,
     BitstringType,
-    UUIDType,
     BooleanType,
+    ByteType,
+    DataType,
     DateType,
-
-    TimestampNTZType,
-    TimeType,
-    TimeNTZType,
-    TimestampNanosecondNTZType,
-    TimestampMilisecondNTZType,
-    TimestampSecondNTZType,
+    DayTimeIntervalType,
     DecimalType,
     DoubleType,
     FloatType,
-
-    UnsignedByteType,
-    ShortType,
-    UnsignedShortType,
+    HugeIntegerType,
     IntegerType,
-    UnsignedIntegerType,
     LongType,
-    UnsignedLongType,
-    HugeIntegerType,
-    UnsignedHugeIntegerType,
-    DayTimeIntervalType,
-    ArrayType,
     MapType,
+    ShortType,
+    StringType,
     StructField,
     StructType,
+    TimeNTZType,
+    TimestampMilisecondNTZType,
+    TimestampNanosecondNTZType,
+    TimestampNTZType,
+    TimestampSecondNTZType,
+    TimestampType,
+    TimeType,
+    UnsignedByteType,
+    UnsignedHugeIntegerType,
+    UnsignedIntegerType,
+    UnsignedLongType,
+    UnsignedShortType,
+    UUIDType,
 )
 
 _sqltype_to_spark_class = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    "boolean": BooleanType,
+    "utinyint": UnsignedByteType,
+    "tinyint": ByteType,
+    "usmallint": UnsignedShortType,
+    "smallint": ShortType,
+    "uinteger": UnsignedIntegerType,
+    "integer": IntegerType,
+    "ubigint": UnsignedLongType,
+    "bigint": LongType,
+    "hugeint": HugeIntegerType,
+    "uhugeint": UnsignedHugeIntegerType,
+    "varchar": StringType,
+    "blob": BinaryType,
+    "bit": BitstringType,
+    "uuid": UUIDType,
+    "date": DateType,
+    "time": TimeNTZType,
+    "time with time zone": TimeType,
+    "timestamp": TimestampNTZType,
+    "timestamp with time zone": TimestampType,
+    "timestamp_ms": TimestampNanosecondNTZType,
+    "timestamp_ns": TimestampMilisecondNTZType,
+    "timestamp_s": TimestampSecondNTZType,
+    "interval": DayTimeIntervalType,
+    "list": ArrayType,
+    "struct": StructType,
+    "map": MapType,
     # union
     # enum
     # null (???)
-
-
-
+    "float": FloatType,
+    "double": DoubleType,
+    "decimal": DecimalType,
 }
 
 
-def convert_nested_type(dtype: DuckDBPyType) -> DataType:
+def convert_nested_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
     id = dtype.id
-    if id ==
+    if id == "list" or id == "array":
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'
-    if id ==
-        children:
+    # TODO: add support for 'union'  # noqa: TD002, TD003
+    if id == "struct":
+        children: list[tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
         return StructType(fields)
-    if id ==
+    if id == "map":
         return MapType(convert_type(dtype.key), convert_type(dtype.value))
     raise NotImplementedError
 
 
-def convert_type(dtype: DuckDBPyType) -> DataType:
+def convert_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
     id = dtype.id
-    if id in [
+    if id in ["list", "struct", "map", "array"]:
         return convert_nested_type(dtype)
-    if id ==
-        children:
-        precision = cast(int, children[0][1])
-        scale = cast(int, children[1][1])
+    if id == "decimal":
+        children: list[tuple[str, DuckDBPyType]] = dtype.children
+        precision = cast("int", children[0][1])
+        scale = cast("int", children[1][1])
         return DecimalType(precision, scale)
     spark_type = _sqltype_to_spark_class[id]
     return spark_type()
 
 
-def duckdb_to_spark_schema(names:
+def duckdb_to_spark_schema(names: list[str], types: list[DuckDBPyType]) -> StructType:  # noqa: D103
     fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
     return StructType(fields)