duckdb-1.5.0.dev53-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of duckdb might be problematic.

Files changed (52)
  1. _duckdb-stubs/__init__.pyi +1443 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cpython-314-x86_64-linux-gnu.so +0 -0
  5. adbc_driver_duckdb/__init__.py +50 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +381 -0
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +69 -0
  11. duckdb/experimental/__init__.py +3 -0
  12. duckdb/experimental/spark/LICENSE +260 -0
  13. duckdb/experimental/spark/__init__.py +6 -0
  14. duckdb/experimental/spark/_globals.py +77 -0
  15. duckdb/experimental/spark/_typing.py +46 -0
  16. duckdb/experimental/spark/conf.py +46 -0
  17. duckdb/experimental/spark/context.py +180 -0
  18. duckdb/experimental/spark/errors/__init__.py +70 -0
  19. duckdb/experimental/spark/errors/error_classes.py +918 -0
  20. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  21. duckdb/experimental/spark/errors/exceptions/base.py +168 -0
  22. duckdb/experimental/spark/errors/utils.py +111 -0
  23. duckdb/experimental/spark/exception.py +18 -0
  24. duckdb/experimental/spark/sql/__init__.py +7 -0
  25. duckdb/experimental/spark/sql/_typing.py +86 -0
  26. duckdb/experimental/spark/sql/catalog.py +79 -0
  27. duckdb/experimental/spark/sql/column.py +361 -0
  28. duckdb/experimental/spark/sql/conf.py +24 -0
  29. duckdb/experimental/spark/sql/dataframe.py +1389 -0
  30. duckdb/experimental/spark/sql/functions.py +6195 -0
  31. duckdb/experimental/spark/sql/group.py +424 -0
  32. duckdb/experimental/spark/sql/readwriter.py +435 -0
  33. duckdb/experimental/spark/sql/session.py +297 -0
  34. duckdb/experimental/spark/sql/streaming.py +36 -0
  35. duckdb/experimental/spark/sql/type_utils.py +107 -0
  36. duckdb/experimental/spark/sql/types.py +1239 -0
  37. duckdb/experimental/spark/sql/udf.py +37 -0
  38. duckdb/filesystem.py +33 -0
  39. duckdb/func/__init__.py +3 -0
  40. duckdb/functional/__init__.py +13 -0
  41. duckdb/polars_io.py +284 -0
  42. duckdb/py.typed +0 -0
  43. duckdb/query_graph/__main__.py +358 -0
  44. duckdb/sqltypes/__init__.py +63 -0
  45. duckdb/typing/__init__.py +71 -0
  46. duckdb/udf.py +24 -0
  47. duckdb/value/__init__.py +1 -0
  48. duckdb/value/constant/__init__.py +270 -0
  49. duckdb-1.5.0.dev53.dist-info/METADATA +87 -0
  50. duckdb-1.5.0.dev53.dist-info/RECORD +52 -0
  51. duckdb-1.5.0.dev53.dist-info/WHEEL +6 -0
  52. duckdb-1.5.0.dev53.dist-info/licenses/LICENSE +7 -0
duckdb/experimental/spark/sql/session.py
@@ -0,0 +1,297 @@
+ import uuid  # noqa: D100
+ from collections.abc import Iterable, Sized
+ from typing import TYPE_CHECKING, Any, NoReturn, Optional, Union
+
+ import duckdb
+
+ if TYPE_CHECKING:
+     from pandas.core.frame import DataFrame as PandasDataFrame
+
+     from .catalog import Catalog
+
+
+ from ..conf import SparkConf
+ from ..context import SparkContext
+ from ..errors import PySparkTypeError
+ from ..exception import ContributionsAcceptedError
+ from .conf import RuntimeConfig
+ from .dataframe import DataFrame
+ from .readwriter import DataFrameReader
+ from .streaming import DataStreamReader
+ from .types import StructType
+ from .udf import UDFRegistration
+
+ # In Spark:
+ # SparkSession holds a SparkContext
+ # SparkContext gets created from SparkConf
+ # At this level the check is made to determine whether the instance already exists and just needs
+ # to be retrieved or it needs to be created.
+
+ # For us this is done inside of `duckdb.connect`, based on the passed-in path + configuration.
+ # SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class.
+
+
+ # data is a list of rows
+ # every value in each row needs to be turned into a Value
+ def _combine_data_and_schema(data: Iterable[Any], schema: StructType) -> list[duckdb.Value]:
+     from duckdb import Value
+
+     new_data = []
+     for row in data:
+         new_row = [Value(x, dtype.duckdb_type) for x, dtype in zip(row, [y.dataType for y in schema])]
+         new_data.append(new_row)
+     return new_data
+
+
+ class SparkSession:  # noqa: D101
+     def __init__(self, context: SparkContext) -> None:  # noqa: D107
+         self.conn = context.connection
+         self._context = context
+         self._conf = RuntimeConfig(self.conn)
+
+     def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
+         try:
+             import pandas
+
+             has_pandas = True
+         except ImportError:
+             has_pandas = False
+         if has_pandas and isinstance(data, pandas.DataFrame):
+             unique_name = f"pyspark_pandas_df_{uuid.uuid1()}"
+             self.conn.register(unique_name, data)
+             return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)
+
+         def verify_tuple_integrity(tuples: list[tuple]) -> None:
+             if len(tuples) <= 1:
+                 return
+             expected_length = len(tuples[0])
+             for i, item in enumerate(tuples[1:]):
+                 actual_length = len(item)
+                 if expected_length == actual_length:
+                     continue
+                 raise PySparkTypeError(
+                     error_class="LENGTH_SHOULD_BE_THE_SAME",
+                     message_parameters={
+                         "arg1": f"data{i}",
+                         "arg2": f"data{i + 1}",
+                         "arg1_length": str(expected_length),
+                         "arg2_length": str(actual_length),
+                     },
+                 )
+
+         if not isinstance(data, list):
+             data = list(data)
+         verify_tuple_integrity(data)
+
+         def construct_query(tuples: Iterable) -> str:
+             def construct_values_list(row: Sized, start_param_idx: int) -> str:
+                 parameter_count = len(row)
+                 parameters = [f"${x + start_param_idx}" for x in range(parameter_count)]
+                 parameters = "(" + ", ".join(parameters) + ")"
+                 return parameters
+
+             row_size = len(tuples[0])
+             values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
+             values_list = ", ".join(values_list)
+
+             query = f"""
+                 select * from (values {values_list})
+             """
+             return query
+
+         query = construct_query(data)
+
+         def construct_parameters(tuples: Iterable) -> list[list]:
+             parameters = []
+             for row in tuples:
+                 parameters.extend(list(row))
+             return parameters
+
+         parameters = construct_parameters(data)
+
+         rel = self.conn.sql(query, params=parameters)
+         return DataFrame(rel, self)
+
+     def _createDataFrameFromPandas(
+         self, data: "PandasDataFrame", types: Union[list[str], None], names: Union[list[str], None]
+     ) -> DataFrame:
+         df = self._create_dataframe(data)
+
+         # Cast to types
+         if types:
+             df = df._cast_types(*types)
+         # Alias to names
+         if names:
+             df = df.toDF(*names)
+         return df
+
+     def createDataFrame(  # noqa: D102
+         self,
+         data: Union["PandasDataFrame", Iterable[Any]],
+         schema: Optional[Union[StructType, list[str]]] = None,
+         samplingRatio: Optional[float] = None,
+         verifySchema: bool = True,
+     ) -> DataFrame:
+         if samplingRatio:
+             raise NotImplementedError
+         if not verifySchema:
+             raise NotImplementedError
+         types = None
+         names = None
+
+         if isinstance(data, DataFrame):
+             raise PySparkTypeError(
+                 error_class="SHOULD_NOT_DATAFRAME",
+                 message_parameters={"arg_name": "data"},
+             )
+
+         if schema:
+             if isinstance(schema, StructType):
+                 types, names = schema.extract_types_and_names()
+             else:
+                 names = schema
+
+         try:
+             import pandas
+
+             has_pandas = True
+         except ImportError:
+             has_pandas = False
+         # A falsy check on a pandas DataFrame is not defined, so first check whether it's a pandas DataFrame,
+         # then check whether 'data' is None or []
+         if has_pandas and isinstance(data, pandas.DataFrame):
+             return self._createDataFrameFromPandas(data, types, names)
+
+         # Finally check if a schema was provided
+         is_empty = False
+         if not data and names:
+             # Create NULLs for every type in our dataframe
+             is_empty = True
+             data = [tuple(None for _ in names)]
+
+         if schema and isinstance(schema, StructType):
+             # Transform the data into Values to combine the data+schema
+             data = _combine_data_and_schema(data, schema)
+
+         df = self._create_dataframe(data)
+         if is_empty:
+             rel = df.relation
+             # Add impossible where clause
+             rel = rel.filter("1=0")
+             df = DataFrame(rel, self)
+
+         # Cast to types
+         if types:
+             df = df._cast_types(*types)
+         # Alias to names
+         if names:
+             df = df.toDF(*names)
+         return df
+
+     def newSession(self) -> "SparkSession":  # noqa: D102
+         return SparkSession(self._context)
+
+     def range(  # noqa: D102
+         self,
+         start: int,
+         end: Optional[int] = None,
+         step: int = 1,
+         numPartitions: Optional[int] = None,
+     ) -> "DataFrame":
+         if numPartitions:
+             raise ContributionsAcceptedError
+
+         if end is None:
+             end = start
+             start = 0
+
+         return DataFrame(self.conn.table_function("range", parameters=[start, end, step]), self)
+
+     def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:  # noqa: D102, ANN401
+         if kwargs:
+             raise NotImplementedError
+         relation = self.conn.sql(sqlQuery)
+         return DataFrame(relation, self)
+
+     def stop(self) -> None:  # noqa: D102
+         self._context.stop()
+
+     def table(self, tableName: str) -> DataFrame:  # noqa: D102
+         relation = self.conn.table(tableName)
+         return DataFrame(relation, self)
+
+     def getActiveSession(self) -> "SparkSession":  # noqa: D102
+         return self
+
+     @property
+     def catalog(self) -> "Catalog":  # noqa: D102
+         if not hasattr(self, "_catalog"):
+             from duckdb.experimental.spark.sql.catalog import Catalog
+
+             self._catalog = Catalog(self)
+         return self._catalog
+
+     @property
+     def conf(self) -> RuntimeConfig:  # noqa: D102
+         return self._conf
+
+     @property
+     def read(self) -> DataFrameReader:  # noqa: D102
+         return DataFrameReader(self)
+
+     @property
+     def readStream(self) -> DataStreamReader:  # noqa: D102
+         return DataStreamReader(self)
+
+     @property
+     def sparkContext(self) -> SparkContext:  # noqa: D102
+         return self._context
+
+     @property
+     def streams(self) -> NoReturn:  # noqa: D102
+         raise ContributionsAcceptedError
+
+     @property
+     def udf(self) -> UDFRegistration:  # noqa: D102
+         return UDFRegistration(self)
+
+     @property
+     def version(self) -> str:  # noqa: D102
+         return "1.0.0"
+
+     class Builder:  # noqa: D106
+         def __init__(self) -> None:  # noqa: D107
+             pass
+
+         def master(self, name: str) -> "SparkSession.Builder":  # noqa: D102
+             # no-op
+             return self
+
+         def appName(self, name: str) -> "SparkSession.Builder":  # noqa: D102
+             # no-op
+             return self
+
+         def remote(self, url: str) -> "SparkSession.Builder":  # noqa: D102
+             # no-op
+             return self
+
+         def getOrCreate(self) -> "SparkSession":  # noqa: D102
+             context = SparkContext("__ignored__")
+             return SparkSession(context)
+
+         def config(  # noqa: D102
+             self,
+             key: Optional[str] = None,
+             value: Optional[Any] = None,  # noqa: ANN401
+             conf: Optional[SparkConf] = None,
+         ) -> "SparkSession.Builder":
+             return self
+
+         def enableHiveSupport(self) -> "SparkSession.Builder":  # noqa: D102
+             # no-op
+             return self
+
+     builder = Builder()
+
+
+ __all__ = ["SparkSession"]
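
The session.py hunk above re-implements a subset of PySpark's SparkSession on top of a DuckDB connection: the Builder methods are no-ops, getOrCreate() wraps a SparkContext, and createDataFrame turns rows into a parameterized VALUES query. The following is an editor's usage sketch, not part of the diff; it only assumes the modules this wheel adds:

from duckdb.experimental.spark.sql.session import SparkSession

# Builder.master/appName/remote are no-ops; getOrCreate() constructs a SparkContext
# (which, per the comments above, ultimately goes through duckdb.connect).
spark = SparkSession.builder.appName("demo").getOrCreate()

# createDataFrame builds "select * from (values ($1, $2), ...)" with the row values as
# parameters, then aliases the columns since a plain list[str] schema only carries names.
df = spark.createDataFrame([(1, "a"), (2, "b")], schema=["id", "label"])

# sql() delegates to the underlying connection's sql() and wraps the relation in a DataFrame.
answer = spark.sql("select 42 as answer")
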
duckdb/experimental/spark/sql/streaming.py
@@ -0,0 +1,36 @@
+ from typing import TYPE_CHECKING, Optional, Union  # noqa: D100
+
+ from .types import StructType
+
+ if TYPE_CHECKING:
+     from .dataframe import DataFrame
+     from .session import SparkSession
+
+ PrimitiveType = Union[bool, float, int, str]
+ OptionalPrimitiveType = Optional[PrimitiveType]
+
+
+ class DataStreamWriter:  # noqa: D101
+     def __init__(self, dataframe: "DataFrame") -> None:  # noqa: D107
+         self.dataframe = dataframe
+
+     def toTable(self, table_name: str) -> None:  # noqa: D102
+         # Should we register the dataframe or create a table from the contents?
+         raise NotImplementedError
+
+
+ class DataStreamReader:  # noqa: D101
+     def __init__(self, session: "SparkSession") -> None:  # noqa: D107
+         self.session = session
+
+     def load(  # noqa: D102
+         self,
+         path: Optional[str] = None,
+         format: Optional[str] = None,
+         schema: Union[StructType, str, None] = None,
+         **options: OptionalPrimitiveType,
+     ) -> "DataFrame":
+         raise NotImplementedError
+
+
+ __all__ = ["DataStreamReader", "DataStreamWriter"]
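
The streaming.py hunk only adds placeholders: DataStreamWriter.toTable and DataStreamReader.load both raise NotImplementedError. A short, hypothetical sketch (not part of the diff) of what that means for callers today:

from duckdb.experimental.spark.sql.session import SparkSession

spark = SparkSession.builder.getOrCreate()
reader = spark.readStream  # the readStream property in session.py returns a DataStreamReader

try:
    reader.load(path="events.csv", format="csv")  # stub: every load() call raises
except NotImplementedError:
    print("streaming reads are not available in this build")
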
duckdb/experimental/spark/sql/type_utils.py
@@ -0,0 +1,107 @@
+ from typing import cast  # noqa: D100
+
+ from duckdb.sqltypes import DuckDBPyType
+
+ from .types import (
+     ArrayType,
+     BinaryType,
+     BitstringType,
+     BooleanType,
+     ByteType,
+     DataType,
+     DateType,
+     DayTimeIntervalType,
+     DecimalType,
+     DoubleType,
+     FloatType,
+     HugeIntegerType,
+     IntegerType,
+     LongType,
+     MapType,
+     ShortType,
+     StringType,
+     StructField,
+     StructType,
+     TimeNTZType,
+     TimestampMilisecondNTZType,
+     TimestampNanosecondNTZType,
+     TimestampNTZType,
+     TimestampSecondNTZType,
+     TimestampType,
+     TimeType,
+     UnsignedByteType,
+     UnsignedHugeIntegerType,
+     UnsignedIntegerType,
+     UnsignedLongType,
+     UnsignedShortType,
+     UUIDType,
+ )
+
+ _sqltype_to_spark_class = {
+     "boolean": BooleanType,
+     "utinyint": UnsignedByteType,
+     "tinyint": ByteType,
+     "usmallint": UnsignedShortType,
+     "smallint": ShortType,
+     "uinteger": UnsignedIntegerType,
+     "integer": IntegerType,
+     "ubigint": UnsignedLongType,
+     "bigint": LongType,
+     "hugeint": HugeIntegerType,
+     "uhugeint": UnsignedHugeIntegerType,
+     "varchar": StringType,
+     "blob": BinaryType,
+     "bit": BitstringType,
+     "uuid": UUIDType,
+     "date": DateType,
+     "time": TimeNTZType,
+     "time with time zone": TimeType,
+     "timestamp": TimestampNTZType,
+     "timestamp with time zone": TimestampType,
+     "timestamp_ms": TimestampNanosecondNTZType,
+     "timestamp_ns": TimestampMilisecondNTZType,
+     "timestamp_s": TimestampSecondNTZType,
+     "interval": DayTimeIntervalType,
+     "list": ArrayType,
+     "struct": StructType,
+     "map": MapType,
+     # union
+     # enum
+     # null (???)
+     "float": FloatType,
+     "double": DoubleType,
+     "decimal": DecimalType,
+ }
+
+
+ def convert_nested_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
+     id = dtype.id
+     if id == "list" or id == "array":
+         children = dtype.children
+         return ArrayType(convert_type(children[0][1]))
+     # TODO: add support for 'union'  # noqa: TD002, TD003
+     if id == "struct":
+         children: list[tuple[str, DuckDBPyType]] = dtype.children
+         fields = [StructField(x[0], convert_type(x[1])) for x in children]
+         return StructType(fields)
+     if id == "map":
+         return MapType(convert_type(dtype.key), convert_type(dtype.value))
+     raise NotImplementedError
+
+
+ def convert_type(dtype: DuckDBPyType) -> DataType:  # noqa: D103
+     id = dtype.id
+     if id in ["list", "struct", "map", "array"]:
+         return convert_nested_type(dtype)
+     if id == "decimal":
+         children: list[tuple[str, DuckDBPyType]] = dtype.children
+         precision = cast("int", children[0][1])
+         scale = cast("int", children[1][1])
+         return DecimalType(precision, scale)
+     spark_type = _sqltype_to_spark_class[id]
+     return spark_type()
+
+
+ def duckdb_to_spark_schema(names: list[str], types: list[DuckDBPyType]) -> StructType:  # noqa: D103
+     fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
+     return StructType(fields)
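
The type_utils.py hunk maps DuckDB type identifiers onto the Spark-style type classes from types.py, recursing into list, struct, and map children and reading decimal precision/scale from the type's children. A hedged sketch of driving the conversion from a relation (it assumes DuckDBPyRelation.columns and .types from the public duckdb API, not anything shown in this diff):

import duckdb
from duckdb.experimental.spark.sql.type_utils import duckdb_to_spark_schema

conn = duckdb.connect()
rel = conn.sql("select 1::integer as id, 'x'::varchar as label, [1, 2] as xs")

# convert_type resolves scalar ids via the lookup table and defers nested ids
# ('list', 'struct', 'map', 'array') to convert_nested_type.
schema = duckdb_to_spark_schema(rel.columns, rel.types)
print(schema)  # a StructType with IntegerType, StringType and ArrayType(IntegerType) fields
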