duckdb-1.5.0.dev56-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff shows the contents of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of duckdb might be problematic.

Files changed (52)
  1. _duckdb-stubs/__init__.pyi +1443 -0
  2. _duckdb-stubs/_func.pyi +46 -0
  3. _duckdb-stubs/_sqltypes.pyi +75 -0
  4. _duckdb.cpython-314-x86_64-linux-gnu.so +0 -0
  5. adbc_driver_duckdb/__init__.py +50 -0
  6. adbc_driver_duckdb/dbapi.py +115 -0
  7. duckdb/__init__.py +381 -0
  8. duckdb/_dbapi_type_object.py +231 -0
  9. duckdb/_version.py +22 -0
  10. duckdb/bytes_io_wrapper.py +69 -0
  11. duckdb/experimental/__init__.py +3 -0
  12. duckdb/experimental/spark/LICENSE +260 -0
  13. duckdb/experimental/spark/__init__.py +6 -0
  14. duckdb/experimental/spark/_globals.py +77 -0
  15. duckdb/experimental/spark/_typing.py +46 -0
  16. duckdb/experimental/spark/conf.py +46 -0
  17. duckdb/experimental/spark/context.py +180 -0
  18. duckdb/experimental/spark/errors/__init__.py +70 -0
  19. duckdb/experimental/spark/errors/error_classes.py +918 -0
  20. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  21. duckdb/experimental/spark/errors/exceptions/base.py +168 -0
  22. duckdb/experimental/spark/errors/utils.py +111 -0
  23. duckdb/experimental/spark/exception.py +18 -0
  24. duckdb/experimental/spark/sql/__init__.py +7 -0
  25. duckdb/experimental/spark/sql/_typing.py +86 -0
  26. duckdb/experimental/spark/sql/catalog.py +79 -0
  27. duckdb/experimental/spark/sql/column.py +361 -0
  28. duckdb/experimental/spark/sql/conf.py +24 -0
  29. duckdb/experimental/spark/sql/dataframe.py +1389 -0
  30. duckdb/experimental/spark/sql/functions.py +6195 -0
  31. duckdb/experimental/spark/sql/group.py +424 -0
  32. duckdb/experimental/spark/sql/readwriter.py +435 -0
  33. duckdb/experimental/spark/sql/session.py +297 -0
  34. duckdb/experimental/spark/sql/streaming.py +36 -0
  35. duckdb/experimental/spark/sql/type_utils.py +107 -0
  36. duckdb/experimental/spark/sql/types.py +1239 -0
  37. duckdb/experimental/spark/sql/udf.py +37 -0
  38. duckdb/filesystem.py +33 -0
  39. duckdb/func/__init__.py +3 -0
  40. duckdb/functional/__init__.py +13 -0
  41. duckdb/polars_io.py +284 -0
  42. duckdb/py.typed +0 -0
  43. duckdb/query_graph/__main__.py +358 -0
  44. duckdb/sqltypes/__init__.py +63 -0
  45. duckdb/typing/__init__.py +71 -0
  46. duckdb/udf.py +24 -0
  47. duckdb/value/__init__.py +1 -0
  48. duckdb/value/constant/__init__.py +270 -0
  49. duckdb-1.5.0.dev56.dist-info/METADATA +87 -0
  50. duckdb-1.5.0.dev56.dist-info/RECORD +52 -0
  51. duckdb-1.5.0.dev56.dist-info/WHEEL +6 -0
  52. duckdb-1.5.0.dev56.dist-info/licenses/LICENSE +7 -0
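
The only file expanded in the diff below is the new duckdb/experimental/spark/sql/readwriter.py (file 32, +435 lines), which adds PySpark-style DataFrameReader and DataFrameWriter classes to DuckDB's experimental Spark API. As orientation, here is a minimal, hypothetical usage sketch of the reader side; it assumes the SparkSession in duckdb.experimental.spark.sql.session exposes a PySpark-style builder and a read property wired to this DataFrameReader, neither of which is shown in this diff.

    # Usage sketch (hypothetical): reading a CSV through the DataFrameReader added below.
    # Assumes SparkSession exposes a PySpark-style builder and a read property that
    # returns this DataFrameReader; those pieces live in session.py, not in this diff.
    from duckdb.experimental.spark.sql.session import SparkSession

    spark = SparkSession.builder.getOrCreate()

    # DataFrameReader.csv() forwards to DuckDB's read_csv(); only a single string
    # path is accepted, and unsupported options raise ContributionsAcceptedError.
    df = spark.read.csv("data/events.csv", header=True, sep=",")
    df.show()
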
@@ -0,0 +1,435 @@
+from typing import TYPE_CHECKING, Optional, Union, cast  # noqa: D100
+
+from ..errors import PySparkNotImplementedError, PySparkTypeError
+from ..exception import ContributionsAcceptedError
+from .types import StructType
+
+PrimitiveType = Union[bool, float, int, str]
+OptionalPrimitiveType = Optional[PrimitiveType]
+
+if TYPE_CHECKING:
+    from duckdb.experimental.spark.sql.dataframe import DataFrame
+    from duckdb.experimental.spark.sql.session import SparkSession
+
+
+class DataFrameWriter:  # noqa: D101
+    def __init__(self, dataframe: "DataFrame") -> None:  # noqa: D107
+        self.dataframe = dataframe
+
+    def saveAsTable(self, table_name: str) -> None:  # noqa: D102
+        relation = self.dataframe.relation
+        relation.create(table_name)
+
+    def parquet(  # noqa: D102
+        self,
+        path: str,
+        mode: Optional[str] = None,
+        partitionBy: Union[str, list[str], None] = None,
+        compression: Optional[str] = None,
+    ) -> None:
+        relation = self.dataframe.relation
+        if mode:
+            raise NotImplementedError
+        if partitionBy:
+            raise NotImplementedError
+
+        relation.write_parquet(path, compression=compression)
+
+    def csv(  # noqa: D102
+        self,
+        path: str,
+        mode: Optional[str] = None,
+        compression: Optional[str] = None,
+        sep: Optional[str] = None,
+        quote: Optional[str] = None,
+        escape: Optional[str] = None,
+        header: Optional[Union[bool, str]] = None,
+        nullValue: Optional[str] = None,
+        escapeQuotes: Optional[Union[bool, str]] = None,
+        quoteAll: Optional[Union[bool, str]] = None,
+        dateFormat: Optional[str] = None,
+        timestampFormat: Optional[str] = None,
+        ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
+        ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
+        charToEscapeQuoteEscaping: Optional[str] = None,
+        encoding: Optional[str] = None,
+        emptyValue: Optional[str] = None,
+        lineSep: Optional[str] = None,
+    ) -> None:
+        if mode not in (None, "overwrite"):
+            raise NotImplementedError
+        if escapeQuotes:
+            raise NotImplementedError
+        if ignoreLeadingWhiteSpace:
+            raise NotImplementedError
+        if ignoreTrailingWhiteSpace:
+            raise NotImplementedError
+        if charToEscapeQuoteEscaping:
+            raise NotImplementedError
+        if emptyValue:
+            raise NotImplementedError
+        if lineSep:
+            raise NotImplementedError
+        relation = self.dataframe.relation
+        relation.write_csv(
+            path,
+            sep=sep,
+            na_rep=nullValue,
+            quotechar=quote,
+            compression=compression,
+            escapechar=escape,
+            header=header if isinstance(header, bool) else header == "True",
+            encoding=encoding,
+            quoting=quoteAll,
+            date_format=dateFormat,
+            timestamp_format=timestampFormat,
+        )
+
+
+class DataFrameReader:  # noqa: D101
+    def __init__(self, session: "SparkSession") -> None:  # noqa: D107
+        self.session = session
+
+    def load(  # noqa: D102
+        self,
+        path: Optional[Union[str, list[str]]] = None,
+        format: Optional[str] = None,
+        schema: Optional[Union[StructType, str]] = None,
+        **options: OptionalPrimitiveType,
+    ) -> "DataFrame":
+        from duckdb.experimental.spark.sql.dataframe import DataFrame
+
+        if not isinstance(path, str):
+            raise TypeError
+        if options:
+            raise ContributionsAcceptedError
+
+        rel = None
+        if format:
+            format = format.lower()
+            if format == "csv" or format == "tsv":
+                rel = self.session.conn.read_csv(path)
+            elif format == "json":
+                rel = self.session.conn.read_json(path)
+            elif format == "parquet":
+                rel = self.session.conn.read_parquet(path)
+            else:
+                raise ContributionsAcceptedError
+        else:
+            rel = self.session.conn.sql(f"select * from {path}")
+        df = DataFrame(rel, self.session)
+        if schema:
+            if not isinstance(schema, StructType):
+                raise ContributionsAcceptedError
+            schema = cast("StructType", schema)
+            types, names = schema.extract_types_and_names()
+            df = df._cast_types(types)
+            df = df.toDF(*names)
+        return df
+
+    def csv(  # noqa: D102
+        self,
+        path: Union[str, list[str]],
+        schema: Optional[Union[StructType, str]] = None,
+        sep: Optional[str] = None,
+        encoding: Optional[str] = None,
+        quote: Optional[str] = None,
+        escape: Optional[str] = None,
+        comment: Optional[str] = None,
+        header: Optional[Union[bool, str]] = None,
+        inferSchema: Optional[Union[bool, str]] = None,
+        ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
+        ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
+        nullValue: Optional[str] = None,
+        nanValue: Optional[str] = None,
+        positiveInf: Optional[str] = None,
+        negativeInf: Optional[str] = None,
+        dateFormat: Optional[str] = None,
+        timestampFormat: Optional[str] = None,
+        maxColumns: Optional[Union[int, str]] = None,
+        maxCharsPerColumn: Optional[Union[int, str]] = None,
+        maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
+        mode: Optional[str] = None,
+        columnNameOfCorruptRecord: Optional[str] = None,
+        multiLine: Optional[Union[bool, str]] = None,
+        charToEscapeQuoteEscaping: Optional[str] = None,
+        samplingRatio: Optional[Union[float, str]] = None,
+        enforceSchema: Optional[Union[bool, str]] = None,
+        emptyValue: Optional[str] = None,
+        locale: Optional[str] = None,
+        lineSep: Optional[str] = None,
+        pathGlobFilter: Optional[Union[bool, str]] = None,
+        recursiveFileLookup: Optional[Union[bool, str]] = None,
+        modifiedBefore: Optional[Union[bool, str]] = None,
+        modifiedAfter: Optional[Union[bool, str]] = None,
+        unescapedQuoteHandling: Optional[str] = None,
+    ) -> "DataFrame":
+        if not isinstance(path, str):
+            raise NotImplementedError
+        if schema and not isinstance(schema, StructType):
+            raise ContributionsAcceptedError
+        if comment:
+            raise ContributionsAcceptedError
+        if inferSchema:
+            raise ContributionsAcceptedError
+        if ignoreLeadingWhiteSpace:
+            raise ContributionsAcceptedError
+        if ignoreTrailingWhiteSpace:
+            raise ContributionsAcceptedError
+        if nanValue:
+            raise ContributionsAcceptedError
+        if positiveInf:
+            raise ContributionsAcceptedError
+        if negativeInf:
+            raise ContributionsAcceptedError
+        if maxColumns:
+            raise ContributionsAcceptedError
+        if maxCharsPerColumn:
+            raise ContributionsAcceptedError
+        if maxMalformedLogPerPartition:
+            raise ContributionsAcceptedError
+        if mode:
+            raise ContributionsAcceptedError
+        if columnNameOfCorruptRecord:
+            raise ContributionsAcceptedError
+        if multiLine:
+            raise ContributionsAcceptedError
+        if charToEscapeQuoteEscaping:
+            raise ContributionsAcceptedError
+        if samplingRatio:
+            raise ContributionsAcceptedError
+        if enforceSchema:
+            raise ContributionsAcceptedError
+        if emptyValue:
+            raise ContributionsAcceptedError
+        if locale:
+            raise ContributionsAcceptedError
+        if pathGlobFilter:
+            raise ContributionsAcceptedError
+        if recursiveFileLookup:
+            raise ContributionsAcceptedError
+        if modifiedBefore:
+            raise ContributionsAcceptedError
+        if modifiedAfter:
+            raise ContributionsAcceptedError
+        if unescapedQuoteHandling:
+            raise ContributionsAcceptedError
+        if lineSep:
+            # We have support for custom newline, just needs to be ported to 'read_csv'
+            raise NotImplementedError
+
+        dtype = None
+        names = None
+        if schema:
+            schema = cast("StructType", schema)
+            dtype, names = schema.extract_types_and_names()
+
+        rel = self.session.conn.read_csv(
+            path,
+            header=header if isinstance(header, bool) else header == "True",
+            sep=sep,
+            dtype=dtype,
+            na_values=nullValue,
+            quotechar=quote,
+            escapechar=escape,
+            encoding=encoding,
+            date_format=dateFormat,
+            timestamp_format=timestampFormat,
+        )
+        from ..sql.dataframe import DataFrame
+
+        df = DataFrame(rel, self.session)
+        if names:
+            df = df.toDF(*names)
+        return df
+
+    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":  # noqa: D102
+        input = list(paths)
+        if len(input) != 1:
+            msg = "Only single paths are supported for now"
+            raise NotImplementedError(msg)
+        option_amount = len(options.keys())
+        if option_amount != 0:
+            msg = "Options are not supported"
+            raise ContributionsAcceptedError(msg)
+        path = input[0]
+        rel = self.session.conn.read_parquet(path)
+        from ..sql.dataframe import DataFrame
+
+        df = DataFrame(rel, self.session)
+        return df
+
+    def json(
+        self,
+        path: Union[str, list[str]],
+        schema: Optional[Union[StructType, str]] = None,
+        primitivesAsString: Optional[Union[bool, str]] = None,
+        prefersDecimal: Optional[Union[bool, str]] = None,
+        allowComments: Optional[Union[bool, str]] = None,
+        allowUnquotedFieldNames: Optional[Union[bool, str]] = None,
+        allowSingleQuotes: Optional[Union[bool, str]] = None,
+        allowNumericLeadingZero: Optional[Union[bool, str]] = None,
+        allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None,
+        mode: Optional[str] = None,
+        columnNameOfCorruptRecord: Optional[str] = None,
+        dateFormat: Optional[str] = None,
+        timestampFormat: Optional[str] = None,
+        multiLine: Optional[Union[bool, str]] = None,
+        allowUnquotedControlChars: Optional[Union[bool, str]] = None,
+        lineSep: Optional[str] = None,
+        samplingRatio: Optional[Union[float, str]] = None,
+        dropFieldIfAllNull: Optional[Union[bool, str]] = None,
+        encoding: Optional[str] = None,
+        locale: Optional[str] = None,
+        pathGlobFilter: Optional[Union[bool, str]] = None,
+        recursiveFileLookup: Optional[Union[bool, str]] = None,
+        modifiedBefore: Optional[Union[bool, str]] = None,
+        modifiedAfter: Optional[Union[bool, str]] = None,
+        allowNonNumericNumbers: Optional[Union[bool, str]] = None,
+    ) -> "DataFrame":
+        """Loads JSON files and returns the results as a :class:`DataFrame`.
+
+        `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+        For JSON (one record per file), set the ``multiLine`` parameter to ``true``.
+
+        If the ``schema`` parameter is not specified, this function goes
+        through the input once to determine the input schema.
+
+        .. versionadded:: 1.4.0
+
+        .. versionchanged:: 3.4.0
+            Supports Spark Connect.
+
+        Parameters
+        ----------
+        path : str, list or :class:`RDD`
+            string represents path to the JSON dataset, or a list of paths,
+            or RDD of Strings storing JSON objects.
+        schema : :class:`pyspark.sql.types.StructType` or str, optional
+            an optional :class:`pyspark.sql.types.StructType` for the input schema or
+            a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+
+        Other Parameters
+        ----------------
+        Extra options
+            For the extra options, refer to
+            `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
+            for the version you use.
+
+        .. # noqa
+
+        Examples:
+        --------
+        Write a DataFrame into a JSON file and read it back.
+
+        >>> import tempfile
+        >>> with tempfile.TemporaryDirectory() as d:
+        ...     # Write a DataFrame into a JSON file
+        ...     spark.createDataFrame([{"age": 100, "name": "Hyukjin Kwon"}]).write.mode(
+        ...         "overwrite"
+        ...     ).format("json").save(d)
+        ...
+        ...     # Read the JSON file as a DataFrame.
+        ...     spark.read.json(d).show()
+        +---+------------+
+        |age|        name|
+        +---+------------+
+        |100|Hyukjin Kwon|
+        +---+------------+
+        """
+        if schema is not None:
+            msg = "The 'schema' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if primitivesAsString is not None:
+            msg = "The 'primitivesAsString' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if prefersDecimal is not None:
+            msg = "The 'prefersDecimal' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowComments is not None:
+            msg = "The 'allowComments' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowUnquotedFieldNames is not None:
+            msg = "The 'allowUnquotedFieldNames' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowSingleQuotes is not None:
+            msg = "The 'allowSingleQuotes' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowNumericLeadingZero is not None:
+            msg = "The 'allowNumericLeadingZero' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowBackslashEscapingAnyCharacter is not None:
+            msg = "The 'allowBackslashEscapingAnyCharacter' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if mode is not None:
+            msg = "The 'mode' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if columnNameOfCorruptRecord is not None:
+            msg = "The 'columnNameOfCorruptRecord' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if dateFormat is not None:
+            msg = "The 'dateFormat' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if timestampFormat is not None:
+            msg = "The 'timestampFormat' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if multiLine is not None:
+            msg = "The 'multiLine' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowUnquotedControlChars is not None:
+            msg = "The 'allowUnquotedControlChars' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if lineSep is not None:
+            msg = "The 'lineSep' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if samplingRatio is not None:
+            msg = "The 'samplingRatio' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if dropFieldIfAllNull is not None:
+            msg = "The 'dropFieldIfAllNull' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if encoding is not None:
+            msg = "The 'encoding' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if locale is not None:
+            msg = "The 'locale' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if pathGlobFilter is not None:
+            msg = "The 'pathGlobFilter' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if recursiveFileLookup is not None:
+            msg = "The 'recursiveFileLookup' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if modifiedBefore is not None:
+            msg = "The 'modifiedBefore' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if modifiedAfter is not None:
+            msg = "The 'modifiedAfter' option is not supported"
+            raise ContributionsAcceptedError(msg)
+        if allowNonNumericNumbers is not None:
+            msg = "The 'allowNonNumericNumbers' option is not supported"
+            raise ContributionsAcceptedError(msg)
+
+        if isinstance(path, str):
+            path = [path]
+        if isinstance(path, list):
+            if len(path) == 1:
+                rel = self.session.conn.read_json(path[0])
+                from .dataframe import DataFrame
+
+                df = DataFrame(rel, self.session)
+                return df
+            raise PySparkNotImplementedError(message="Only a single path is supported for now")
+        else:
+            raise PySparkTypeError(
+                error_class="NOT_STR_OR_LIST_OF_RDD",
+                message_parameters={
+                    "arg_name": "path",
+                    "arg_type": type(path).__name__,
+                },
+            )
+
+
+__all__ = ["DataFrameReader", "DataFrameWriter"]
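
And the writer side, as a minimal hypothetical sketch under the same assumptions: a DataFrame obtained from the session above, exposing a PySpark-style write property that returns the DataFrameWriter from this file (that wiring is in dataframe.py, not in this diff). Note that the hunk above shows mode and partitionBy raising NotImplementedError for parquet(), and several csv() options being rejected the same way.

    # Usage sketch (hypothetical): writing with the DataFrameWriter added above.
    # Assumes df.write returns the DataFrameWriter from this file.
    df.write.saveAsTable("events")          # materializes the relation as a table
    df.write.parquet("out/events.parquet")  # passing mode= or partitionBy= would raise NotImplementedError
    df.write.csv("out/events.csv", sep=",", header=True)  # unsupported options raise NotImplementedError
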