duckdb 1.4.1.dev113__cp39-cp39-macosx_10_9_universal2.whl → 1.5.0.dev37__cp39-cp39-macosx_10_9_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _duckdb.cpython-39-darwin.so +0 -0
- duckdb/__init__.py +374 -373
- duckdb/__init__.pyi +180 -604
- duckdb/bytes_io_wrapper.py +7 -6
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +54 -47
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +64 -66
- duckdb/experimental/spark/sql/types.py +344 -308
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +8 -13
- duckdb/functional/__init__.py +16 -2
- duckdb/polars_io.py +57 -66
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -8
- duckdb/typing/__init__.pyi +2 -4
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +59 -61
- duckdb/value/constant/__init__.pyi +4 -3
- duckdb-1.5.0.dev37.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev37.dist-info/RECORD +47 -0
- duckdb-1.4.1.dev113.dist-info/METADATA +0 -326
- duckdb-1.4.1.dev113.dist-info/RECORD +0 -47
- {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev113.dist-info → duckdb-1.5.0.dev37.dist-info}/licenses/LICENSE +0 -0
duckdb/experimental/spark/sql/session.py

```diff
@@ -1,31 +1,32 @@
-import
-
-from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
-
-import duckdb
+from typing import Optional, List, Any, Union, Iterable, TYPE_CHECKING
+import uuid

 if TYPE_CHECKING:
-    from pandas.core.frame import DataFrame as PandasDataFrame
-
     from .catalog import Catalog
+    from pandas.core.frame import DataFrame as PandasDataFrame

-
-from ..conf import SparkConf
-from ..context import SparkContext
-from ..errors import PySparkTypeError
 from ..exception import ContributionsAcceptedError
-from .
+from .types import StructType, AtomicType, DataType
+from ..conf import SparkConf
 from .dataframe import DataFrame
+from .conf import RuntimeConfig
 from .readwriter import DataFrameReader
-from
-from .types import StructType
+from ..context import SparkContext
 from .udf import UDFRegistration
+from .streaming import DataStreamReader
+import duckdb
+
+from ..errors import (
+    PySparkTypeError,
+    PySparkValueError
+)
+
+from ..errors.error_classes import *

 # In spark:
 # SparkSession holds a SparkContext
 # SparkContext gets created from SparkConf
-# At this level the check is made to determine whether the instance already exists and just needs
-# to be retrieved or it needs to be created.
+# At this level the check is made to determine whether the instance already exists and just needs to be retrieved or it needs to be created

 # For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
 # SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
@@ -33,7 +34,7 @@ from .udf import UDFRegistration

 # data is a List of rows
 # every value in each row needs to be turned into a Value
-def _combine_data_and_schema(data: Iterable[Any], schema: StructType)
+def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
     from duckdb import Value

     new_data = []
@@ -43,8 +44,8 @@ def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[du
     return new_data


-class SparkSession:
-    def __init__(self, context: SparkContext)
+class SparkSession:
+    def __init__(self, context: SparkContext):
         self.conn = context.connection
         self._context = context
         self._conf = RuntimeConfig(self.conn)
@@ -52,16 +53,15 @@ class SparkSession: # noqa: D101
     def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
         try:
             import pandas
-
             has_pandas = True
         except ImportError:
             has_pandas = False
         if has_pandas and isinstance(data, pandas.DataFrame):
-            unique_name = f
+            unique_name = f'pyspark_pandas_df_{uuid.uuid1()}'
             self.conn.register(unique_name, data)
             return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)

-        def verify_tuple_integrity(tuples
+        def verify_tuple_integrity(tuples):
             if len(tuples) <= 1:
                 return
             expected_length = len(tuples[0])
@@ -73,9 +73,9 @@ class SparkSession: # noqa: D101
                     error_class="LENGTH_SHOULD_BE_THE_SAME",
                     message_parameters={
                         "arg1": f"data{i}",
-                        "arg2": f"data{i
+                        "arg2": f"data{i+1}",
                         "arg1_length": str(expected_length),
-                        "arg2_length": str(actual_length)
+                        "arg2_length": str(actual_length)
                     },
                 )

@@ -83,16 +83,16 @@ class SparkSession: # noqa: D101
             data = list(data)
         verify_tuple_integrity(data)

-        def construct_query(tuples
-            def construct_values_list(row
+        def construct_query(tuples) -> str:
+            def construct_values_list(row, start_param_idx):
                 parameter_count = len(row)
-                parameters = [f
-                parameters =
+                parameters = [f'${x+start_param_idx}' for x in range(parameter_count)]
+                parameters = '(' + ', '.join(parameters) + ')'
                 return parameters

             row_size = len(tuples[0])
             values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
-            values_list =
+            values_list = ', '.join(values_list)

             query = f"""
                 select * from (values {values_list})
@@ -101,7 +101,7 @@ class SparkSession: # noqa: D101

         query = construct_query(data)

-        def construct_parameters(tuples
+        def construct_parameters(tuples):
             parameters = []
             for row in tuples:
                 parameters.extend(list(row))
@@ -112,9 +112,7 @@ class SparkSession: # noqa: D101
         rel = self.conn.sql(query, params=parameters)
         return DataFrame(rel, self)

-    def _createDataFrameFromPandas(
-        self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None]
-    ) -> DataFrame:
+    def _createDataFrameFromPandas(self, data: "PandasDataFrame", types, names) -> DataFrame:
         df = self._create_dataframe(data)

         # Cast to types
@@ -125,10 +123,10 @@ class SparkSession: # noqa: D101
             df = df.toDF(*names)
         return df

-    def createDataFrame(
+    def createDataFrame(
         self,
         data: Union["PandasDataFrame", Iterable[Any]],
-        schema: Optional[Union[StructType,
+        schema: Optional[Union[StructType, List[str]]] = None,
         samplingRatio: Optional[float] = None,
         verifySchema: bool = True,
     ) -> DataFrame:
@@ -177,7 +175,7 @@ class SparkSession: # noqa: D101
         if is_empty:
             rel = df.relation
             # Add impossible where clause
-            rel = rel.filter(
+            rel = rel.filter('1=0')
             df = DataFrame(rel, self)

         # Cast to types
@@ -188,10 +186,10 @@ class SparkSession: # noqa: D101
             df = df.toDF(*names)
         return df

-    def newSession(self) -> "SparkSession":
+    def newSession(self) -> "SparkSession":
         return SparkSession(self._context)

-    def range(
+    def range(
         self,
         start: int,
         end: Optional[int] = None,
@@ -205,26 +203,26 @@ class SparkSession: # noqa: D101
             end = start
             start = 0

-        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),
+        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]),self)

-    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
+    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
         if kwargs:
             raise NotImplementedError
         relation = self.conn.sql(sqlQuery)
         return DataFrame(relation, self)

-    def stop(self) -> None:
+    def stop(self) -> None:
         self._context.stop()

-    def table(self, tableName: str) -> DataFrame:
+    def table(self, tableName: str) -> DataFrame:
         relation = self.conn.table(tableName)
         return DataFrame(relation, self)

-    def getActiveSession(self) -> "SparkSession":
+    def getActiveSession(self) -> "SparkSession":
         return self

     @property
-    def catalog(self) -> "Catalog":
+    def catalog(self) -> "Catalog":
         if not hasattr(self, "_catalog"):
             from duckdb.experimental.spark.sql.catalog import Catalog

@@ -232,62 +230,59 @@ class SparkSession: # noqa: D101
         return self._catalog

     @property
-    def conf(self) -> RuntimeConfig:
+    def conf(self) -> RuntimeConfig:
         return self._conf

     @property
-    def read(self) -> DataFrameReader:
+    def read(self) -> DataFrameReader:
         return DataFrameReader(self)

     @property
-    def readStream(self) -> DataStreamReader:
+    def readStream(self) -> DataStreamReader:
         return DataStreamReader(self)

     @property
-    def sparkContext(self) -> SparkContext:
+    def sparkContext(self) -> SparkContext:
         return self._context

     @property
-    def streams(self) ->
+    def streams(self) -> Any:
         raise ContributionsAcceptedError

     @property
-    def udf(self) -> UDFRegistration:
+    def udf(self) -> UDFRegistration:
         return UDFRegistration(self)

     @property
-    def version(self) -> str:
-        return
+    def version(self) -> str:
+        return '1.0.0'

-    class Builder:
-        def __init__(self)
+    class Builder:
+        def __init__(self):
             pass

-        def master(self, name: str) -> "SparkSession.Builder":
+        def master(self, name: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def appName(self, name: str) -> "SparkSession.Builder":
+        def appName(self, name: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def remote(self, url: str) -> "SparkSession.Builder":
+        def remote(self, url: str) -> "SparkSession.Builder":
             # no-op
             return self

-        def getOrCreate(self) -> "SparkSession":
+        def getOrCreate(self) -> "SparkSession":
             context = SparkContext("__ignored__")
             return SparkSession(context)

-        def config(
-            self,
-            key: Optional[str] = None,
-            value: Optional[Any] = None, # noqa: ANN401
-            conf: Optional[SparkConf] = None,
+        def config(
+            self, key: Optional[str] = None, value: Optional[Any] = None, conf: Optional[SparkConf] = None
         ) -> "SparkSession.Builder":
             return self

-        def enableHiveSupport(self) -> "SparkSession.Builder":
+        def enableHiveSupport(self) -> "SparkSession.Builder":
             # no-op
             return self

```
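For orientation, here is a minimal usage sketch of the session code shown above. `Builder.getOrCreate()`, `createDataFrame()` (which builds a parameterized VALUES query via `construct_query`/`construct_parameters`) and `sql()` all appear in the hunks; the import path and the sample data are assumptions about this wheel, not part of the diff.

```python
# Sketch only -- method names come from the hunks above; everything else is assumed.
from duckdb.experimental.spark.sql.session import SparkSession

spark = SparkSession.Builder().getOrCreate()

# createDataFrame() turns the rows into
#   select * from (values ($1, $2), ($3, $4))
# and binds the flattened tuple values via conn.sql(query, params=...).
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])

answer = spark.sql("select 42 as answer")
```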
duckdb/experimental/spark/sql/streaming.py

```diff
@@ -1,5 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union
-
+from typing import TYPE_CHECKING, Optional, Union
 from .types import StructType

 if TYPE_CHECKING:
@@ -10,26 +9,28 @@ PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]


-class DataStreamWriter:
-    def __init__(self, dataframe: "DataFrame")
+class DataStreamWriter:
+    def __init__(self, dataframe: "DataFrame"):
         self.dataframe = dataframe

-    def toTable(self, table_name: str) -> None:
+    def toTable(self, table_name: str) -> None:
         # Should we register the dataframe or create a table from the contents?
         raise NotImplementedError


-class DataStreamReader:
-    def __init__(self, session: "SparkSession")
+class DataStreamReader:
+    def __init__(self, session: "SparkSession"):
         self.session = session

-    def load(
+    def load(
         self,
         path: Optional[str] = None,
         format: Optional[str] = None,
         schema: Union[StructType, str, None] = None,
-        **options: OptionalPrimitiveType
+        **options: OptionalPrimitiveType
     ) -> "DataFrame":
+        from duckdb.experimental.spark.sql.dataframe import DataFrame
+
         raise NotImplementedError


```
duckdb/experimental/spark/sql/type_utils.py

```diff
@@ -1,107 +1,105 @@
-from typing import cast # noqa: D100
-
 from duckdb.typing import DuckDBPyType
-
+from typing import List, Tuple, cast
 from .types import (
-
+    DataType,
+    StringType,
     BinaryType,
     BitstringType,
+    UUIDType,
     BooleanType,
-    ByteType,
-    DataType,
     DateType,
-
+    TimestampType,
+    TimestampNTZType,
+    TimeType,
+    TimeNTZType,
+    TimestampNanosecondNTZType,
+    TimestampMilisecondNTZType,
+    TimestampSecondNTZType,
     DecimalType,
     DoubleType,
     FloatType,
-
+    ByteType,
+    UnsignedByteType,
+    ShortType,
+    UnsignedShortType,
     IntegerType,
+    UnsignedIntegerType,
     LongType,
+    UnsignedLongType,
+    HugeIntegerType,
+    UnsignedHugeIntegerType,
+    DayTimeIntervalType,
+    ArrayType,
     MapType,
-    ShortType,
-    StringType,
     StructField,
     StructType,
-    TimeNTZType,
-    TimestampMilisecondNTZType,
-    TimestampNanosecondNTZType,
-    TimestampNTZType,
-    TimestampSecondNTZType,
-    TimestampType,
-    TimeType,
-    UnsignedByteType,
-    UnsignedHugeIntegerType,
-    UnsignedIntegerType,
-    UnsignedLongType,
-    UnsignedShortType,
-    UUIDType,
 )

 _sqltype_to_spark_class = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    'boolean': BooleanType,
+    'utinyint': UnsignedByteType,
+    'tinyint': ByteType,
+    'usmallint': UnsignedShortType,
+    'smallint': ShortType,
+    'uinteger': UnsignedIntegerType,
+    'integer': IntegerType,
+    'ubigint': UnsignedLongType,
+    'bigint': LongType,
+    'hugeint': HugeIntegerType,
+    'uhugeint': UnsignedHugeIntegerType,
+    'varchar': StringType,
+    'blob': BinaryType,
+    'bit': BitstringType,
+    'uuid': UUIDType,
+    'date': DateType,
+    'time': TimeNTZType,
+    'time with time zone': TimeType,
+    'timestamp': TimestampNTZType,
+    'timestamp with time zone': TimestampType,
+    'timestamp_ms': TimestampNanosecondNTZType,
+    'timestamp_ns': TimestampMilisecondNTZType,
+    'timestamp_s': TimestampSecondNTZType,
+    'interval': DayTimeIntervalType,
+    'list': ArrayType,
+    'struct': StructType,
+    'map': MapType,
     # union
     # enum
     # null (???)
-
-
-
+    'float': FloatType,
+    'double': DoubleType,
+    'decimal': DecimalType,
 }


-def convert_nested_type(dtype: DuckDBPyType) -> DataType:
+def convert_nested_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id ==
+    if id == 'list' or id == 'array':
         children = dtype.children
         return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'
-    if id ==
-        children:
+    # TODO: add support for 'union'
+    if id == 'struct':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
         fields = [StructField(x[0], convert_type(x[1])) for x in children]
         return StructType(fields)
-    if id ==
+    if id == 'map':
         return MapType(convert_type(dtype.key), convert_type(dtype.value))
     raise NotImplementedError


-def convert_type(dtype: DuckDBPyType) -> DataType:
+def convert_type(dtype: DuckDBPyType) -> DataType:
     id = dtype.id
-    if id in [
+    if id in ['list', 'struct', 'map', 'array']:
         return convert_nested_type(dtype)
-    if id ==
-        children:
-        precision = cast(
-        scale = cast(
+    if id == 'decimal':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
+        precision = cast(int, children[0][1])
+        scale = cast(int, children[1][1])
         return DecimalType(precision, scale)
     spark_type = _sqltype_to_spark_class[id]
     return spark_type()


-def duckdb_to_spark_schema(names:
+def duckdb_to_spark_schema(names: List[str], types: List[DuckDBPyType]) -> StructType:
     fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
     return StructType(fields)
```