duckdb 0.8.2.dev3007__cp311-cp311-win_amd64.whl → 1.4.3.dev8__cp311-cp311-win_amd64.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- _duckdb-stubs/__init__.pyi +1478 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- duckdb/duckdb.cp311-win_amd64.pyd → _duckdb.cp311-win_amd64.pyd +0 -0
- adbc_driver_duckdb/__init__.py +10 -8
- adbc_driver_duckdb/dbapi.py +4 -5
- duckdb/__init__.py +250 -196
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- {pyduckdb → duckdb}/bytes_io_wrapper.py +12 -8
- duckdb/experimental/__init__.py +5 -0
- duckdb/experimental/spark/__init__.py +6 -0
- {pyduckdb → duckdb/experimental}/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +46 -0
- duckdb/experimental/spark/conf.py +46 -0
- duckdb/experimental/spark/context.py +180 -0
- duckdb/experimental/spark/errors/__init__.py +70 -0
- duckdb/experimental/spark/errors/error_classes.py +918 -0
- duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
- duckdb/experimental/spark/errors/exceptions/base.py +168 -0
- duckdb/experimental/spark/errors/utils.py +111 -0
- duckdb/experimental/spark/exception.py +18 -0
- {pyduckdb → duckdb/experimental}/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +86 -0
- duckdb/experimental/spark/sql/catalog.py +79 -0
- duckdb/experimental/spark/sql/column.py +361 -0
- duckdb/experimental/spark/sql/conf.py +24 -0
- duckdb/experimental/spark/sql/dataframe.py +1389 -0
- duckdb/experimental/spark/sql/functions.py +6195 -0
- duckdb/experimental/spark/sql/group.py +424 -0
- duckdb/experimental/spark/sql/readwriter.py +435 -0
- duckdb/experimental/spark/sql/session.py +297 -0
- duckdb/experimental/spark/sql/streaming.py +36 -0
- duckdb/experimental/spark/sql/type_utils.py +107 -0
- {pyduckdb → duckdb/experimental}/spark/sql/types.py +323 -342
- duckdb/experimental/spark/sql/udf.py +37 -0
- duckdb/filesystem.py +33 -0
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +12 -16
- duckdb/polars_io.py +284 -0
- duckdb/py.typed +0 -0
- duckdb/query_graph/__main__.py +358 -0
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +18 -6
- {pyduckdb → duckdb}/udf.py +10 -5
- duckdb/value/__init__.py +1 -0
- pyduckdb/value/constant.py → duckdb/value/constant/__init__.py +66 -57
- duckdb-1.4.3.dev8.dist-info/METADATA +88 -0
- duckdb-1.4.3.dev8.dist-info/RECORD +52 -0
- {duckdb-0.8.2.dev3007.dist-info → duckdb-1.4.3.dev8.dist-info}/WHEEL +1 -1
- duckdb-1.4.3.dev8.dist-info/licenses/LICENSE +7 -0
- duckdb-0.8.2.dev3007.dist-info/METADATA +0 -20
- duckdb-0.8.2.dev3007.dist-info/RECORD +0 -34
- duckdb-0.8.2.dev3007.dist-info/top_level.txt +0 -4
- duckdb-stubs/__init__.pyi +0 -574
- duckdb-stubs/functional/__init__.pyi +0 -33
- duckdb-stubs/typing/__init__.pyi +0 -35
- pyduckdb/__init__.py +0 -61
- pyduckdb/filesystem.py +0 -64
- pyduckdb/spark/__init__.py +0 -7
- pyduckdb/spark/conf.py +0 -45
- pyduckdb/spark/context.py +0 -162
- pyduckdb/spark/exception.py +0 -9
- pyduckdb/spark/sql/catalog.py +0 -78
- pyduckdb/spark/sql/conf.py +0 -23
- pyduckdb/spark/sql/dataframe.py +0 -75
- pyduckdb/spark/sql/readwriter.py +0 -180
- pyduckdb/spark/sql/session.py +0 -249
- pyduckdb/spark/sql/streaming.py +0 -37
- pyduckdb/spark/sql/type_utils.py +0 -104
- pyduckdb/spark/sql/udf.py +0 -9
- {pyduckdb → duckdb/experimental}/spark/LICENSE +0 -0
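The practical effect of these renames is that the PySpark-compatibility shim and the `Value` constants move from the top-level `pyduckdb` package into the `duckdb` package itself. A minimal before/after import sketch follows; the module paths are inferred from the file moves listed above, not from release notes, so treat them as assumptions.

```python
# Import paths inferred from the file renames above; a sketch, not an
# authoritative migration guide.

# duckdb 0.8.2.dev3007 layout:
#   from pyduckdb.spark.sql.session import SparkSession
#   from pyduckdb.value.constant import Value

# duckdb 1.4.3.dev8 layout:
from duckdb.experimental.spark.sql.session import SparkSession
from duckdb.value.constant import Value
```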
pyduckdb/spark/sql/readwriter.py
DELETED
@@ -1,180 +0,0 @@
-from typing import TYPE_CHECKING, Iterable, Union, List, Optional, cast
-from pyduckdb.spark.sql.types import StructType
-from ..exception import ContributionsAcceptedError
-
-PrimitiveType = Union[bool, float, int, str]
-OptionalPrimitiveType = Optional[PrimitiveType]
-
-if TYPE_CHECKING:
-    from pyduckdb.spark.sql.dataframe import DataFrame
-    from pyduckdb.spark.sql.session import SparkSession
-
-
-class DataFrameWriter:
-    def __init__(self, dataframe: "DataFrame"):
-        self.dataframe = dataframe
-
-    def saveAsTable(self, table_name: str) -> None:
-        relation = self.dataframe.relation
-        relation.create(table_name)
-
-
-class DataFrameReader:
-    def __init__(self, session: "SparkSession"):
-        self.session = session
-
-    def load(
-        self,
-        path: Optional[Union[str, List[str]]] = None,
-        format: Optional[str] = None,
-        schema: Optional[Union[StructType, str]] = None,
-        **options: OptionalPrimitiveType,
-    ) -> "DataFrame":
-        from pyduckdb.spark.sql.dataframe import DataFrame
-
-        if not isinstance(path, str):
-            raise ImportError
-        if options:
-            raise ContributionsAcceptedError
-
-        rel = None
-        if format:
-            format = format.lower()
-            if format == 'csv' or format == 'tsv':
-                rel = self.session.conn.read_csv(path)
-            elif format == 'json':
-                rel = self.session.conn.read_json(path)
-            elif format == 'parquet':
-                rel = self.session.conn.read_parquet(path)
-            else:
-                raise ContributionsAcceptedError
-        else:
-            rel = self.session.conn.sql(f'select * from {path}')
-        df = DataFrame(rel, self.session)
-        if schema:
-            if not isinstance(schema, StructType):
-                raise ContributionsAcceptedError
-            schema = cast(StructType, schema)
-            types, names = schema.extract_types_and_names()
-            df = df._cast_types(types)
-            df = df.toDF(names)
-        raise NotImplementedError
-
-    def csv(
-        self,
-        path: Union[str, List[str]],
-        schema: Optional[Union[StructType, str]] = None,
-        sep: Optional[str] = None,
-        encoding: Optional[str] = None,
-        quote: Optional[str] = None,
-        escape: Optional[str] = None,
-        comment: Optional[str] = None,
-        header: Optional[Union[bool, str]] = None,
-        inferSchema: Optional[Union[bool, str]] = None,
-        ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
-        ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
-        nullValue: Optional[str] = None,
-        nanValue: Optional[str] = None,
-        positiveInf: Optional[str] = None,
-        negativeInf: Optional[str] = None,
-        dateFormat: Optional[str] = None,
-        timestampFormat: Optional[str] = None,
-        maxColumns: Optional[Union[int, str]] = None,
-        maxCharsPerColumn: Optional[Union[int, str]] = None,
-        maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
-        mode: Optional[str] = None,
-        columnNameOfCorruptRecord: Optional[str] = None,
-        multiLine: Optional[Union[bool, str]] = None,
-        charToEscapeQuoteEscaping: Optional[str] = None,
-        samplingRatio: Optional[Union[float, str]] = None,
-        enforceSchema: Optional[Union[bool, str]] = None,
-        emptyValue: Optional[str] = None,
-        locale: Optional[str] = None,
-        lineSep: Optional[str] = None,
-        pathGlobFilter: Optional[Union[bool, str]] = None,
-        recursiveFileLookup: Optional[Union[bool, str]] = None,
-        modifiedBefore: Optional[Union[bool, str]] = None,
-        modifiedAfter: Optional[Union[bool, str]] = None,
-        unescapedQuoteHandling: Optional[str] = None,
-    ) -> "DataFrame":
-        if not isinstance(path, str):
-            raise NotImplementedError
-        if schema and not isinstance(schema, StructType):
-            raise ContributionsAcceptedError
-        if comment:
-            raise ContributionsAcceptedError
-        if inferSchema:
-            raise ContributionsAcceptedError
-        if ignoreLeadingWhiteSpace:
-            raise ContributionsAcceptedError
-        if ignoreTrailingWhiteSpace:
-            raise ContributionsAcceptedError
-        if nanValue:
-            raise ConnectionAbortedError
-        if positiveInf:
-            raise ConnectionAbortedError
-        if negativeInf:
-            raise ConnectionAbortedError
-        if negativeInf:
-            raise ConnectionAbortedError
-        if maxColumns:
-            raise ContributionsAcceptedError
-        if maxCharsPerColumn:
-            raise ContributionsAcceptedError
-        if maxMalformedLogPerPartition:
-            raise ContributionsAcceptedError
-        if mode:
-            raise ContributionsAcceptedError
-        if columnNameOfCorruptRecord:
-            raise ContributionsAcceptedError
-        if multiLine:
-            raise ContributionsAcceptedError
-        if charToEscapeQuoteEscaping:
-            raise ContributionsAcceptedError
-        if samplingRatio:
-            raise ContributionsAcceptedError
-        if enforceSchema:
-            raise ContributionsAcceptedError
-        if emptyValue:
-            raise ContributionsAcceptedError
-        if locale:
-            raise ContributionsAcceptedError
-        if pathGlobFilter:
-            raise ContributionsAcceptedError
-        if recursiveFileLookup:
-            raise ContributionsAcceptedError
-        if modifiedBefore:
-            raise ContributionsAcceptedError
-        if modifiedAfter:
-            raise ContributionsAcceptedError
-        if unescapedQuoteHandling:
-            raise ContributionsAcceptedError
-        if lineSep:
-            # We have support for custom newline, just needs to be ported to 'read_csv'
-            raise NotImplementedError
-
-        dtype = None
-        names = None
-        if schema:
-            schema = cast(StructType, schema)
-            dtype, names = schema.extract_types_and_names()
-
-        rel = self.session.conn.read_csv(
-            path,
-            header=header if isinstance(header, bool) else header == "True",
-            sep=sep,
-            dtype=dtype,
-            na_values=nullValue,
-            quotechar=quote,
-            escapechar=escape,
-            encoding=encoding,
-            date_format=dateFormat,
-            timestamp_format=timestampFormat,
-        )
-        df = DataFrame(rel, self.session)
-        if names:
-            df = df.toDF(*names)
-        return df
-
-
-__all__ = ["DataFrameWriter", "DataFrameReader"]
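For context, the deleted shim above mapped the PySpark-style `DataFrameReader.csv()` arguments onto DuckDB's `read_csv()`. A hedged usage sketch, assuming the replacement under `duckdb/experimental/spark/sql/readwriter.py` keeps the same PySpark-style surface; the file name and column names are hypothetical.

```python
# Sketch only: the method names come from the deleted pyduckdb shim and are
# assumed to carry over to duckdb.experimental.spark.
from duckdb.experimental.spark.sql.session import SparkSession

spark = SparkSession.builder.master(":memory:").getOrCreate()

# DataFrameReader.csv() forwarded header/sep/quote/escape/... to duckdb's read_csv()
df = spark.read.csv("people.csv", header=True, sep=",")

# toDF() renames the columns, as the shim itself does when a schema is supplied
df = df.toDF("id", "name")
```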
pyduckdb/spark/sql/session.py
DELETED
@@ -1,249 +0,0 @@
-from typing import Optional, List, Tuple, Any, Union, Iterable, TYPE_CHECKING
-import uuid
-
-if TYPE_CHECKING:
-    from pyduckdb.spark.sql.catalog import Catalog
-    from pandas.core.frame import DataFrame as PandasDataFrame
-
-from pyduckdb.spark.exception import ContributionsAcceptedError
-
-from pyduckdb.spark.sql.types import StructType, AtomicType, DataType
-from pyduckdb.spark.conf import SparkConf
-from pyduckdb.spark.sql.dataframe import DataFrame
-from pyduckdb.spark.sql.conf import RuntimeConfig
-from pyduckdb.spark.sql.readwriter import DataFrameReader
-from pyduckdb.spark.context import SparkContext
-from pyduckdb.spark.sql.udf import UDFRegistration
-from pyduckdb.spark.sql.streaming import DataStreamReader
-import duckdb
-
-# In spark:
-# SparkSession holds a SparkContext
-# SparkContext gets created from SparkConf
-# At this level the check is made to determine whether the instance already exists and just needs to be retrieved or it needs to be created
-
-# For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
-# SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
-
-
-# data is a List of rows
-# every value in each row needs to be turned into a Value
-def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
-    from pyduckdb import Value
-
-    new_data = []
-    for row in data:
-        new_row = [Value(x, dtype.duckdb_type) for x, dtype in zip(row, [y.dataType for y in schema])]
-        new_data.append(new_row)
-    return new_data
-
-
-class SparkSession:
-    def __init__(self, context: SparkContext):
-        self.conn = context.connection
-        self._context = context
-        self._conf = RuntimeConfig(self.conn)
-
-    def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
-        try:
-            import pandas
-
-            has_pandas = True
-        except:
-            has_pandas = False
-        if has_pandas and isinstance(data, pandas.DataFrame):
-            unique_name = f'pyspark_pandas_df_{uuid.uuid1()}'
-            self.conn.register(unique_name, data)
-            return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)
-
-        def verify_tuple_integrity(tuples):
-            if len(tuples) <= 1:
-                return
-            assert all([len(x) == len(tuples[0]) for x in tuples[1:]])
-
-        if not isinstance(data, list):
-            data = list(data)
-        verify_tuple_integrity(data)
-
-        def construct_query(tuples) -> str:
-            def construct_values_list(row, start_param_idx):
-                parameter_count = len(row)
-                parameters = [f'${x+start_param_idx}' for x in range(parameter_count)]
-                parameters = '(' + ', '.join(parameters) + ')'
-                return parameters
-
-            row_size = len(tuples[0])
-            values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
-            values_list = ', '.join(values_list)
-
-            query = f"""
-                select * from (values {values_list})
-            """
-            return query
-
-        query = construct_query(data)
-
-        def construct_parameters(tuples):
-            parameters = []
-            for row in tuples:
-                parameters.extend(list(row))
-            return parameters
-
-        parameters = construct_parameters(data)
-
-        rel = self.conn.sql(query, params=parameters)
-        return DataFrame(rel, self)
-
-    def createDataFrame(
-        self,
-        data: Union["PandasDataFrame", Iterable[Any]],
-        schema: Optional[Union[StructType, List[str]]] = None,
-        samplingRatio: Optional[float] = None,
-        verifySchema: bool = True,
-    ) -> DataFrame:
-        if samplingRatio:
-            raise NotImplementedError
-        if not verifySchema:
-            raise NotImplementedError
-        types = None
-        names = None
-        if schema:
-            if isinstance(schema, StructType):
-                types, names = schema.extract_types_and_names()
-            else:
-                names = schema
-
-        try:
-            import pandas
-
-            has_pandas = True
-        except:
-            has_pandas = False
-        # Falsey check on pandas dataframe is not defined, so first check if it's not a pandas dataframe
-        # Then check if 'data' is None or []
-        # Finally check if a schema was provided
-        is_empty = False
-        if (not has_pandas or (has_pandas and not isinstance(data, pandas.DataFrame))) and not data and names:
-            # Create NULLs for every type in our the dataframe
-            is_empty = True
-            data = [tuple(None for _ in names)]
-
-        if schema and isinstance(schema, StructType):
-            # Transform the data into Values to combine the data+schema
-            data = _combine_data_and_schema(data, schema)
-
-        df = self._create_dataframe(data)
-        if is_empty:
-            rel = df.relation
-            # Add impossible where clause
-            rel = rel.filter('1=0')
-            df = DataFrame(rel, self)
-
-        # Cast to types
-        if types:
-            df = df._cast_types(*types)
-        # Alias to names
-        if names:
-            df = df.toDF(*names)
-        return df
-
-    def newSession(self) -> "SparkSession":
-        return SparkSession(self._context)
-
-    def range(
-        self, start: int, end: Optional[int] = None, step: int = 1, numPartitions: Optional[int] = None
-    ) -> "DataFrame":
-        raise ContributionsAcceptedError
-
-    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
-        if kwargs:
-            raise NotImplementedError
-        relation = self.conn.sql(sqlQuery)
-        return DataFrame(relation, self)
-
-    def stop(self) -> None:
-        self._context.stop()
-
-    def table(self, tableName: str) -> DataFrame:
-        relation = self.conn.table(tableName)
-        return DataFrame(relation, self)
-
-    def getActiveSession(self) -> "SparkSession":
-        return self
-
-    @property
-    def catalog(self) -> "Catalog":
-        if not hasattr(self, "_catalog"):
-            from pyduckdb.spark.sql.catalog import Catalog
-
-            self._catalog = Catalog(self)
-        return self._catalog
-
-    @property
-    def conf(self) -> RuntimeConfig:
-        return self._conf
-
-    @property
-    def read(self) -> DataFrameReader:
-        return DataFrameReader(self)
-
-    @property
-    def readStream(self) -> DataStreamReader:
-        return DataStreamReader(self)
-
-    @property
-    def sparkContext(self) -> SparkContext:
-        return self._context
-
-    @property
-    def streams(self) -> Any:
-        raise ContributionsAcceptedError
-
-    @property
-    def udf(self) -> UDFRegistration:
-        return UDFRegistration()
-
-    @property
-    def version(self) -> str:
-        return '1.0.0'
-
-    class Builder:
-        def __init__(self):
-            self.name = "builder"
-            self._master = ':memory:'
-            self._config = {}
-
-        def master(self, name: str) -> "SparkSession.Builder":
-            self._master = name
-            return self
-
-        def appName(self, name: str) -> "SparkSession.Builder":
-            # no-op
-            return self
-
-        def remote(self, url: str) -> "SparkSession.Builder":
-            # no-op
-            return self
-
-        def getOrCreate(self) -> "SparkSession":
-            # TODO: use the config to pass in methods to 'connect'
-            context = SparkContext(self._master)
-            return SparkSession(context)
-
-        def config(
-            self, key: Optional[str] = None, value: Optional[Any] = None, conf: Optional[SparkConf] = None
-        ) -> "SparkSession.Builder":
-            if conf:
-                raise NotImplementedError
-            if key and value:
-                self._config[key] = value
-            return self
-
-        def enableHiveSupport(self) -> "SparkSession.Builder":
-            # no-op
-            return self
-
-    builder = Builder()
-
-
-__all__ = ["SparkSession"]
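The deleted `SparkSession` above is a thin wrapper over a DuckDB connection: per its own comments, the builder's `master()` path ends up in `duckdb.connect()` via `SparkContext`, and `createDataFrame()` is implemented as a single parameterized `VALUES` query. A hedged sketch of that surface, assuming the replacement in `duckdb.experimental.spark` keeps the same method names:

```python
# Sketch only; method names come from the deleted shim above and are assumed
# to carry over to duckdb.experimental.spark.
from duckdb.experimental.spark.sql.session import SparkSession

spark = SparkSession.builder.master(":memory:").appName("demo").getOrCreate()

# createDataFrame() builds one parameterized "select * from (values ...)" query
df = spark.createDataFrame([(1, "duck"), (2, "goose")], ["id", "name"])

answer = spark.sql("select 21 * 2 as answer")
spark.stop()
```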
pyduckdb/spark/sql/streaming.py
DELETED
@@ -1,37 +0,0 @@
-from typing import TYPE_CHECKING, Optional, Union
-from pyduckdb.spark.sql.types import StructType
-
-if TYPE_CHECKING:
-    from pyduckdb.spark.sql.dataframe import DataFrame
-    from pyduckdb.spark.sql.session import SparkSession
-
-PrimitiveType = Union[bool, float, int, str]
-OptionalPrimitiveType = Optional[PrimitiveType]
-
-
-class DataStreamWriter:
-    def __init__(self, dataframe: "DataFrame"):
-        self.dataframe = dataframe
-
-    def toTable(self, table_name: str) -> None:
-        # Should we register the dataframe or create a table from the contents?
-        raise NotImplementedError
-
-
-class DataStreamReader:
-    def __init__(self, session: "SparkSession"):
-        self.session = session
-
-    def load(
-        self,
-        path: Optional[str] = None,
-        format: Optional[str] = None,
-        schema: Union[StructType, str, None] = None,
-        **options: OptionalPrimitiveType
-    ) -> "DataFrame":
-        from pyduckdb.spark.sql.dataframe import DataFrame
-
-        raise NotImplementedError
-
-
-__all__ = ["DataStreamReader", "DataStreamWriter"]
pyduckdb/spark/sql/type_utils.py
DELETED
@@ -1,104 +0,0 @@
-import typing
-from duckdb.typing import DuckDBPyType
-from typing import List, Tuple, cast
-from .types import (
-    DataType,
-    StringType,
-    BinaryType,
-    BitstringType,
-    UUIDType,
-    BooleanType,
-    DateType,
-    TimestampType,
-    TimestampNTZType,
-    TimeType,
-    TimeNTZType,
-    TimestampNanosecondNTZType,
-    TimestampMilisecondNTZType,
-    TimestampSecondNTZType,
-    DecimalType,
-    DoubleType,
-    FloatType,
-    ByteType,
-    UnsignedByteType,
-    ShortType,
-    UnsignedShortType,
-    IntegerType,
-    UnsignedIntegerType,
-    LongType,
-    UnsignedLongType,
-    HugeIntegerType,
-    DayTimeIntervalType,
-    ArrayType,
-    MapType,
-    StructField,
-    StructType,
-)
-
-_sqltype_to_spark_class = {
-    'boolean': BooleanType,
-    'utinyint': UnsignedByteType,
-    'tinyint': ByteType,
-    'usmallint': UnsignedShortType,
-    'smallint': ShortType,
-    'uinteger': UnsignedIntegerType,
-    'integer': IntegerType,
-    'ubigint': UnsignedLongType,
-    'bigint': LongType,
-    'hugeint': HugeIntegerType,
-    'varchar': StringType,
-    'blob': BinaryType,
-    'bit': BitstringType,
-    'uuid': UUIDType,
-    'date': DateType,
-    'time': TimeNTZType,
-    'time with time zone': TimeType,
-    'timestamp': TimestampNTZType,
-    'timestamp with time zone': TimestampType,
-    'timestamp_ms': TimestampNanosecondNTZType,
-    'timestamp_ns': TimestampMilisecondNTZType,
-    'timestamp_s': TimestampSecondNTZType,
-    'interval': DayTimeIntervalType,
-    'list': ArrayType,
-    'struct': StructType,
-    'map': MapType,
-    # union
-    # enum
-    # null (???)
-    'float': FloatType,
-    'double': DoubleType,
-    'decimal': DecimalType,
-}
-
-
-def convert_nested_type(dtype: DuckDBPyType) -> DataType:
-    id = dtype.id
-    if id == 'list':
-        children = dtype.children
-        return ArrayType(convert_type(children[0][1]))
-    # TODO: add support for 'union'
-    if id == 'struct':
-        children: List[Tuple[str, DuckDBPyType]] = dtype.children
-        fields = [StructField(x[0], convert_type(x[1])) for x in children]
-        return StructType(fields)
-    if id == 'map':
-        return MapType(convert_type(dtype.key), convert_type(dtype.value))
-    raise NotImplementedError
-
-
-def convert_type(dtype: DuckDBPyType) -> DataType:
-    id = dtype.id
-    if id in ['list', 'struct', 'map']:
-        return convert_nested_type(dtype)
-    if id == 'decimal':
-        children: List[Tuple[str, DuckDBPyType]] = dtype.children
-        precision = cast(int, children[0][1])
-        scale = cast(int, children[1][1])
-        return DecimalType(precision, scale)
-    spark_type = _sqltype_to_spark_class[id]
-    return spark_type()
-
-
-def duckdb_to_spark_schema(names: List[str], types: List[DuckDBPyType]) -> StructType:
-    fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
-    return StructType(fields)
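`type_utils.py` above is the bridge between DuckDB's type system and the Spark-style type classes: `convert_type()` looks up scalar type ids in `_sqltype_to_spark_class`, recurses into list/struct/map, and `duckdb_to_spark_schema()` zips column names with the converted types into a `StructType`. A hedged sketch of how it is used, assuming the helper keeps its name under `duckdb/experimental/spark/sql/type_utils.py`:

```python
# Sketch only; duckdb_to_spark_schema is assumed to survive the move unchanged.
import duckdb
from duckdb.experimental.spark.sql.type_utils import duckdb_to_spark_schema

rel = duckdb.sql("select 42::integer as id, 'duck' as name")

# rel.columns -> ['id', 'name']; rel.types -> the DuckDBPyType of each column
schema = duckdb_to_spark_schema(rel.columns, rel.types)
print(schema)  # StructType with IntegerType and StringType fields
```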
pyduckdb/spark/sql/udf.py
DELETED
File without changes