duckdb-1.5.0.dev32-cp314-cp314-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Warning: this version of duckdb has been flagged as potentially problematic.

Files changed (47)
  1. _duckdb.cpython-314-darwin.so +0 -0
  2. duckdb/__init__.py +475 -0
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +66 -0
  5. duckdb/experimental/__init__.py +2 -0
  6. duckdb/experimental/spark/LICENSE +260 -0
  7. duckdb/experimental/spark/__init__.py +7 -0
  8. duckdb/experimental/spark/_globals.py +77 -0
  9. duckdb/experimental/spark/_typing.py +48 -0
  10. duckdb/experimental/spark/conf.py +45 -0
  11. duckdb/experimental/spark/context.py +164 -0
  12. duckdb/experimental/spark/errors/__init__.py +72 -0
  13. duckdb/experimental/spark/errors/error_classes.py +918 -0
  14. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  15. duckdb/experimental/spark/errors/exceptions/base.py +217 -0
  16. duckdb/experimental/spark/errors/utils.py +116 -0
  17. duckdb/experimental/spark/exception.py +15 -0
  18. duckdb/experimental/spark/sql/__init__.py +7 -0
  19. duckdb/experimental/spark/sql/_typing.py +93 -0
  20. duckdb/experimental/spark/sql/catalog.py +78 -0
  21. duckdb/experimental/spark/sql/column.py +368 -0
  22. duckdb/experimental/spark/sql/conf.py +23 -0
  23. duckdb/experimental/spark/sql/dataframe.py +1437 -0
  24. duckdb/experimental/spark/sql/functions.py +6221 -0
  25. duckdb/experimental/spark/sql/group.py +420 -0
  26. duckdb/experimental/spark/sql/readwriter.py +449 -0
  27. duckdb/experimental/spark/sql/session.py +292 -0
  28. duckdb/experimental/spark/sql/streaming.py +37 -0
  29. duckdb/experimental/spark/sql/type_utils.py +105 -0
  30. duckdb/experimental/spark/sql/types.py +1275 -0
  31. duckdb/experimental/spark/sql/udf.py +37 -0
  32. duckdb/filesystem.py +23 -0
  33. duckdb/functional/__init__.py +17 -0
  34. duckdb/functional/__init__.pyi +31 -0
  35. duckdb/polars_io.py +237 -0
  36. duckdb/query_graph/__main__.py +363 -0
  37. duckdb/typing/__init__.py +61 -0
  38. duckdb/typing/__init__.pyi +36 -0
  39. duckdb/udf.py +19 -0
  40. duckdb/value/__init__.py +0 -0
  41. duckdb/value/__init__.pyi +0 -0
  42. duckdb/value/constant/__init__.py +268 -0
  43. duckdb/value/constant/__init__.pyi +115 -0
  44. duckdb-1.5.0.dev32.dist-info/METADATA +326 -0
  45. duckdb-1.5.0.dev32.dist-info/RECORD +47 -0
  46. duckdb-1.5.0.dev32.dist-info/WHEEL +6 -0
  47. duckdb-1.5.0.dev32.dist-info/licenses/LICENSE +7 -0

duckdb/experimental/spark/sql/session.py
@@ -0,0 +1,292 @@
+from typing import Optional, List, Any, Union, Iterable, TYPE_CHECKING
+import uuid
+
+if TYPE_CHECKING:
+    from .catalog import Catalog
+    from pandas.core.frame import DataFrame as PandasDataFrame
+
+from ..exception import ContributionsAcceptedError
+from .types import StructType, AtomicType, DataType
+from ..conf import SparkConf
+from .dataframe import DataFrame
+from .conf import RuntimeConfig
+from .readwriter import DataFrameReader
+from ..context import SparkContext
+from .udf import UDFRegistration
+from .streaming import DataStreamReader
+import duckdb
+
+from ..errors import (
+    PySparkTypeError,
+    PySparkValueError
+)
+
+from ..errors.error_classes import *
+
+# In spark:
+# SparkSession holds a SparkContext
+# SparkContext gets created from SparkConf
+# At this level the check is made to determine whether the instance already exists and just needs to be retrieved or it needs to be created
+
+# For us this is done inside of `duckdb.connect`, based on the passed in path + configuration
+# SparkContext can be compared to our Connection class, and SparkConf to our ClientContext class
+
+
+# data is a List of rows
+# every value in each row needs to be turned into a Value
+def _combine_data_and_schema(data: Iterable[Any], schema: StructType):
+    from duckdb import Value
+
+    new_data = []
+    for row in data:
+        new_row = [Value(x, dtype.duckdb_type) for x, dtype in zip(row, [y.dataType for y in schema])]
+        new_data.append(new_row)
+    return new_data
+
+
+class SparkSession:
+    def __init__(self, context: SparkContext):
+        self.conn = context.connection
+        self._context = context
+        self._conf = RuntimeConfig(self.conn)
+
+    def _create_dataframe(self, data: Union[Iterable[Any], "PandasDataFrame"]) -> DataFrame:
+        try:
+            import pandas
+            has_pandas = True
+        except ImportError:
+            has_pandas = False
+        if has_pandas and isinstance(data, pandas.DataFrame):
+            unique_name = f'pyspark_pandas_df_{uuid.uuid1()}'
+            self.conn.register(unique_name, data)
+            return DataFrame(self.conn.sql(f'select * from "{unique_name}"'), self)
+
+        def verify_tuple_integrity(tuples):
+            if len(tuples) <= 1:
+                return
+            expected_length = len(tuples[0])
+            for i, item in enumerate(tuples[1:]):
+                actual_length = len(item)
+                if expected_length == actual_length:
+                    continue
+                raise PySparkTypeError(
+                    error_class="LENGTH_SHOULD_BE_THE_SAME",
+                    message_parameters={
+                        "arg1": f"data{i}",
+                        "arg2": f"data{i+1}",
+                        "arg1_length": str(expected_length),
+                        "arg2_length": str(actual_length)
+                    },
+                )
+
+        if not isinstance(data, list):
+            data = list(data)
+        verify_tuple_integrity(data)
+
+        def construct_query(tuples) -> str:
+            def construct_values_list(row, start_param_idx):
+                parameter_count = len(row)
+                parameters = [f'${x+start_param_idx}' for x in range(parameter_count)]
+                parameters = '(' + ', '.join(parameters) + ')'
+                return parameters
+
+            row_size = len(tuples[0])
+            values_list = [construct_values_list(x, 1 + (i * row_size)) for i, x in enumerate(tuples)]
+            values_list = ', '.join(values_list)
+
+            query = f"""
+                select * from (values {values_list})
+            """
+            return query
+
+        query = construct_query(data)
+
+        def construct_parameters(tuples):
+            parameters = []
+            for row in tuples:
+                parameters.extend(list(row))
+            return parameters
+
+        parameters = construct_parameters(data)
+
+        rel = self.conn.sql(query, params=parameters)
+        return DataFrame(rel, self)
+
+    def _createDataFrameFromPandas(self, data: "PandasDataFrame", types, names) -> DataFrame:
+        df = self._create_dataframe(data)
+
+        # Cast to types
+        if types:
+            df = df._cast_types(*types)
+        # Alias to names
+        if names:
+            df = df.toDF(*names)
+        return df
+
+    def createDataFrame(
+        self,
+        data: Union["PandasDataFrame", Iterable[Any]],
+        schema: Optional[Union[StructType, List[str]]] = None,
+        samplingRatio: Optional[float] = None,
+        verifySchema: bool = True,
+    ) -> DataFrame:
+        if samplingRatio:
+            raise NotImplementedError
+        if not verifySchema:
+            raise NotImplementedError
+        types = None
+        names = None
+
+        if isinstance(data, DataFrame):
+            raise PySparkTypeError(
+                error_class="SHOULD_NOT_DATAFRAME",
+                message_parameters={"arg_name": "data"},
+            )
+
+        if schema:
+            if isinstance(schema, StructType):
+                types, names = schema.extract_types_and_names()
+            else:
+                names = schema
+
+        try:
+            import pandas
+
+            has_pandas = True
+        except ImportError:
+            has_pandas = False
+        # Falsey check on pandas dataframe is not defined, so first check if it's not a pandas dataframe
+        # Then check if 'data' is None or []
+        if has_pandas and isinstance(data, pandas.DataFrame):
+            return self._createDataFrameFromPandas(data, types, names)
+
+        # Finally check if a schema was provided
+        is_empty = False
+        if not data and names:
+            # Create NULLs for every type in our dataframe
+            is_empty = True
+            data = [tuple(None for _ in names)]
+
+        if schema and isinstance(schema, StructType):
+            # Transform the data into Values to combine the data+schema
+            data = _combine_data_and_schema(data, schema)
+
+        df = self._create_dataframe(data)
+        if is_empty:
+            rel = df.relation
+            # Add impossible where clause
+            rel = rel.filter('1=0')
+            df = DataFrame(rel, self)
+
+        # Cast to types
+        if types:
+            df = df._cast_types(*types)
+        # Alias to names
+        if names:
+            df = df.toDF(*names)
+        return df
+
+    def newSession(self) -> "SparkSession":
+        return SparkSession(self._context)
+
+    def range(
+        self,
+        start: int,
+        end: Optional[int] = None,
+        step: int = 1,
+        numPartitions: Optional[int] = None,
+    ) -> "DataFrame":
+        if numPartitions:
+            raise ContributionsAcceptedError
+
+        if end is None:
+            end = start
+            start = 0
+
+        return DataFrame(self.conn.table_function("range", parameters=[start, end, step]), self)
+
+    def sql(self, sqlQuery: str, **kwargs: Any) -> DataFrame:
+        if kwargs:
+            raise NotImplementedError
+        relation = self.conn.sql(sqlQuery)
+        return DataFrame(relation, self)
+
+    def stop(self) -> None:
+        self._context.stop()
+
+    def table(self, tableName: str) -> DataFrame:
+        relation = self.conn.table(tableName)
+        return DataFrame(relation, self)
+
+    def getActiveSession(self) -> "SparkSession":
+        return self
+
+    @property
+    def catalog(self) -> "Catalog":
+        if not hasattr(self, "_catalog"):
+            from duckdb.experimental.spark.sql.catalog import Catalog
+
+            self._catalog = Catalog(self)
+        return self._catalog

+    @property
+    def conf(self) -> RuntimeConfig:
+        return self._conf
+
+    @property
+    def read(self) -> DataFrameReader:
+        return DataFrameReader(self)
+
+    @property
+    def readStream(self) -> DataStreamReader:
+        return DataStreamReader(self)
+
+    @property
+    def sparkContext(self) -> SparkContext:
+        return self._context
+
+    @property
+    def streams(self) -> Any:
+        raise ContributionsAcceptedError
+
+    @property
+    def udf(self) -> UDFRegistration:
+        return UDFRegistration(self)
+
+    @property
+    def version(self) -> str:
+        return '1.0.0'
+
+    class Builder:
+        def __init__(self):
+            pass
+
+        def master(self, name: str) -> "SparkSession.Builder":
+            # no-op
+            return self
+
+        def appName(self, name: str) -> "SparkSession.Builder":
+            # no-op
+            return self
+
+        def remote(self, url: str) -> "SparkSession.Builder":
+            # no-op
+            return self
+
+        def getOrCreate(self) -> "SparkSession":
+            context = SparkContext("__ignored__")
+            return SparkSession(context)
+
+        def config(
+            self, key: Optional[str] = None, value: Optional[Any] = None, conf: Optional[SparkConf] = None
+        ) -> "SparkSession.Builder":
+            return self
+
+        def enableHiveSupport(self) -> "SparkSession.Builder":
+            # no-op
+            return self
+
+    builder = Builder()
+
+
+__all__ = ["SparkSession"]
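
The session.py hunk above is the heart of the Spark shim: as its comments explain, SparkContext wraps the DuckDB connection and SparkSession exposes the familiar PySpark entry points on top of it. A minimal usage sketch, limited to the API visible in this hunk (the app name and sample rows are illustrative, not part of the released diff):

    from duckdb.experimental.spark.sql.session import SparkSession

    # Builder.master()/appName()/config() are no-ops; getOrCreate() opens a
    # DuckDB connection via SparkContext and wraps it in a SparkSession.
    spark = SparkSession.builder.appName("demo").getOrCreate()

    # createDataFrame() turns the rows into a parameterized VALUES query and
    # aliases the columns to the given names.
    df = spark.createDataFrame([(1, "a"), (2, "b")], schema=["id", "label"])

    # sql() and range() delegate to the underlying DuckDB connection.
    spark.sql("select 42 as answer")
    spark.range(5)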

duckdb/experimental/spark/sql/streaming.py
@@ -0,0 +1,37 @@
+from typing import TYPE_CHECKING, Optional, Union
+from .types import StructType
+
+if TYPE_CHECKING:
+    from .dataframe import DataFrame
+    from .session import SparkSession
+
+PrimitiveType = Union[bool, float, int, str]
+OptionalPrimitiveType = Optional[PrimitiveType]
+
+
+class DataStreamWriter:
+    def __init__(self, dataframe: "DataFrame"):
+        self.dataframe = dataframe
+
+    def toTable(self, table_name: str) -> None:
+        # Should we register the dataframe or create a table from the contents?
+        raise NotImplementedError
+
+
+class DataStreamReader:
+    def __init__(self, session: "SparkSession"):
+        self.session = session
+
+    def load(
+        self,
+        path: Optional[str] = None,
+        format: Optional[str] = None,
+        schema: Union[StructType, str, None] = None,
+        **options: OptionalPrimitiveType
+    ) -> "DataFrame":
+        from duckdb.experimental.spark.sql.dataframe import DataFrame
+
+        raise NotImplementedError
+
+
+__all__ = ["DataStreamReader", "DataStreamWriter"]

duckdb/experimental/spark/sql/type_utils.py
@@ -0,0 +1,105 @@
+from duckdb.typing import DuckDBPyType
+from typing import List, Tuple, cast
+from .types import (
+    DataType,
+    StringType,
+    BinaryType,
+    BitstringType,
+    UUIDType,
+    BooleanType,
+    DateType,
+    TimestampType,
+    TimestampNTZType,
+    TimeType,
+    TimeNTZType,
+    TimestampNanosecondNTZType,
+    TimestampMilisecondNTZType,
+    TimestampSecondNTZType,
+    DecimalType,
+    DoubleType,
+    FloatType,
+    ByteType,
+    UnsignedByteType,
+    ShortType,
+    UnsignedShortType,
+    IntegerType,
+    UnsignedIntegerType,
+    LongType,
+    UnsignedLongType,
+    HugeIntegerType,
+    UnsignedHugeIntegerType,
+    DayTimeIntervalType,
+    ArrayType,
+    MapType,
+    StructField,
+    StructType,
+)
+
+_sqltype_to_spark_class = {
+    'boolean': BooleanType,
+    'utinyint': UnsignedByteType,
+    'tinyint': ByteType,
+    'usmallint': UnsignedShortType,
+    'smallint': ShortType,
+    'uinteger': UnsignedIntegerType,
+    'integer': IntegerType,
+    'ubigint': UnsignedLongType,
+    'bigint': LongType,
+    'hugeint': HugeIntegerType,
+    'uhugeint': UnsignedHugeIntegerType,
+    'varchar': StringType,
+    'blob': BinaryType,
+    'bit': BitstringType,
+    'uuid': UUIDType,
+    'date': DateType,
+    'time': TimeNTZType,
+    'time with time zone': TimeType,
+    'timestamp': TimestampNTZType,
+    'timestamp with time zone': TimestampType,
+    'timestamp_ms': TimestampMilisecondNTZType,
+    'timestamp_ns': TimestampNanosecondNTZType,
+    'timestamp_s': TimestampSecondNTZType,
+    'interval': DayTimeIntervalType,
+    'list': ArrayType,
+    'struct': StructType,
+    'map': MapType,
+    # union
+    # enum
+    # null (???)
+    'float': FloatType,
+    'double': DoubleType,
+    'decimal': DecimalType,
+}
+
+
+def convert_nested_type(dtype: DuckDBPyType) -> DataType:
+    id = dtype.id
+    if id == 'list' or id == 'array':
+        children = dtype.children
+        return ArrayType(convert_type(children[0][1]))
+    # TODO: add support for 'union'
+    if id == 'struct':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
+        fields = [StructField(x[0], convert_type(x[1])) for x in children]
+        return StructType(fields)
+    if id == 'map':
+        return MapType(convert_type(dtype.key), convert_type(dtype.value))
+    raise NotImplementedError
+
+
+def convert_type(dtype: DuckDBPyType) -> DataType:
+    id = dtype.id
+    if id in ['list', 'struct', 'map', 'array']:
+        return convert_nested_type(dtype)
+    if id == 'decimal':
+        children: List[Tuple[str, DuckDBPyType]] = dtype.children
+        precision = cast(int, children[0][1])
+        scale = cast(int, children[1][1])
+        return DecimalType(precision, scale)
+    spark_type = _sqltype_to_spark_class[id]
+    return spark_type()
+
+
+def duckdb_to_spark_schema(names: List[str], types: List[DuckDBPyType]) -> StructType:
+    fields = [StructField(name, dtype) for name, dtype in zip(names, [convert_type(x) for x in types])]
+    return StructType(fields)
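
The type_utils.py hunk is the type-mapping layer: convert_type() looks scalar types up in _sqltype_to_spark_class, convert_nested_type() recurses into list/array, struct, and map children, and duckdb_to_spark_schema() zips column names and types into a StructType. A minimal sketch of driving it from a DuckDB relation (the query is illustrative; it relies only on the relation's columns and types properties):

    import duckdb
    from duckdb.experimental.spark.sql.type_utils import duckdb_to_spark_schema

    rel = duckdb.sql("select 42::integer as i, 'x' as s, [1, 2] as l, {'a': 1} as st")
    # 'integer' -> IntegerType, 'varchar' -> StringType,
    # 'list' -> ArrayType, 'struct' -> StructType (with nested fields).
    schema = duckdb_to_spark_schema(rel.columns, rel.types)
    print(schema)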