duckdb-1.5.0.dev44-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of duckdb might be problematic.

Files changed (47)
  1. _duckdb.cpython-314t-x86_64-linux-gnu.so +0 -0
  2. duckdb/__init__.py +475 -0
  3. duckdb/__init__.pyi +713 -0
  4. duckdb/bytes_io_wrapper.py +66 -0
  5. duckdb/experimental/__init__.py +2 -0
  6. duckdb/experimental/spark/LICENSE +260 -0
  7. duckdb/experimental/spark/__init__.py +7 -0
  8. duckdb/experimental/spark/_globals.py +77 -0
  9. duckdb/experimental/spark/_typing.py +48 -0
  10. duckdb/experimental/spark/conf.py +45 -0
  11. duckdb/experimental/spark/context.py +164 -0
  12. duckdb/experimental/spark/errors/__init__.py +72 -0
  13. duckdb/experimental/spark/errors/error_classes.py +918 -0
  14. duckdb/experimental/spark/errors/exceptions/__init__.py +16 -0
  15. duckdb/experimental/spark/errors/exceptions/base.py +217 -0
  16. duckdb/experimental/spark/errors/utils.py +116 -0
  17. duckdb/experimental/spark/exception.py +15 -0
  18. duckdb/experimental/spark/sql/__init__.py +7 -0
  19. duckdb/experimental/spark/sql/_typing.py +93 -0
  20. duckdb/experimental/spark/sql/catalog.py +78 -0
  21. duckdb/experimental/spark/sql/column.py +368 -0
  22. duckdb/experimental/spark/sql/conf.py +23 -0
  23. duckdb/experimental/spark/sql/dataframe.py +1437 -0
  24. duckdb/experimental/spark/sql/functions.py +6221 -0
  25. duckdb/experimental/spark/sql/group.py +420 -0
  26. duckdb/experimental/spark/sql/readwriter.py +449 -0
  27. duckdb/experimental/spark/sql/session.py +292 -0
  28. duckdb/experimental/spark/sql/streaming.py +37 -0
  29. duckdb/experimental/spark/sql/type_utils.py +105 -0
  30. duckdb/experimental/spark/sql/types.py +1275 -0
  31. duckdb/experimental/spark/sql/udf.py +37 -0
  32. duckdb/filesystem.py +23 -0
  33. duckdb/functional/__init__.py +17 -0
  34. duckdb/functional/__init__.pyi +31 -0
  35. duckdb/polars_io.py +237 -0
  36. duckdb/query_graph/__main__.py +363 -0
  37. duckdb/typing/__init__.py +61 -0
  38. duckdb/typing/__init__.pyi +36 -0
  39. duckdb/udf.py +19 -0
  40. duckdb/value/__init__.py +0 -0
  41. duckdb/value/__init__.pyi +0 -0
  42. duckdb/value/constant/__init__.py +268 -0
  43. duckdb/value/constant/__init__.pyi +115 -0
  44. duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
  45. duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
  46. duckdb-1.5.0.dev44.dist-info/WHEEL +6 -0
  47. duckdb-1.5.0.dev44.dist-info/licenses/LICENSE +7 -0
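
The diff that follows covers duckdb/experimental/spark/sql/readwriter.py, the reader/writer shim of the experimental PySpark-compatible layer this wheel ships under duckdb.experimental.spark (see the session.py, dataframe.py, and functions.py entries above). For orientation, a minimal sketch of how that layer is typically driven is included here; it is not part of the wheel contents, and the builder-style session entry point and pandas import are assumptions based on the module layout rather than anything shown in this diff.

# Hedged sketch (not part of the package contents): driving the experimental
# Spark-compatible API provided by the files listed above. Assumes the
# SparkSession builder mirrors PySpark's builder pattern and opens an
# in-process DuckDB connection under the hood.
import pandas as pd
from duckdb.experimental.spark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
pandas_df = pd.DataFrame({"id": [1, 2], "label": ["a", "b"]})
df = spark.createDataFrame(pandas_df)   # wraps a DuckDB relation
df.show()                               # prints via the underlying relation
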
duckdb/experimental/spark/sql/readwriter.py
@@ -0,0 +1,449 @@
+ from typing import TYPE_CHECKING, List, Optional, Union, cast
+
+ from ..exception import ContributionsAcceptedError
+ from .types import StructType
+
+
+ from ..errors import PySparkNotImplementedError, PySparkTypeError
+
+ PrimitiveType = Union[bool, float, int, str]
+ OptionalPrimitiveType = Optional[PrimitiveType]
+
+ if TYPE_CHECKING:
+     from duckdb.experimental.spark.sql.dataframe import DataFrame
+     from duckdb.experimental.spark.sql.session import SparkSession
+
+
+ class DataFrameWriter:
+     def __init__(self, dataframe: "DataFrame"):
+         self.dataframe = dataframe
+
+     def saveAsTable(self, table_name: str) -> None:
+         relation = self.dataframe.relation
+         relation.create(table_name)
+
+     def parquet(
+         self,
+         path: str,
+         mode: Optional[str] = None,
+         partitionBy: Union[str, List[str], None] = None,
+         compression: Optional[str] = None,
+     ) -> None:
+         relation = self.dataframe.relation
+         if mode:
+             raise NotImplementedError
+         if partitionBy:
+             raise NotImplementedError
+
+         relation.write_parquet(path, compression=compression)
+
+     def csv(
+         self,
+         path: str,
+         mode: Optional[str] = None,
+         compression: Optional[str] = None,
+         sep: Optional[str] = None,
+         quote: Optional[str] = None,
+         escape: Optional[str] = None,
+         header: Optional[Union[bool, str]] = None,
+         nullValue: Optional[str] = None,
+         escapeQuotes: Optional[Union[bool, str]] = None,
+         quoteAll: Optional[Union[bool, str]] = None,
+         dateFormat: Optional[str] = None,
+         timestampFormat: Optional[str] = None,
+         ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
+         ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
+         charToEscapeQuoteEscaping: Optional[str] = None,
+         encoding: Optional[str] = None,
+         emptyValue: Optional[str] = None,
+         lineSep: Optional[str] = None,
+     ):
+         if mode not in (None, "overwrite"):
+             raise NotImplementedError
+         if escapeQuotes:
+             raise NotImplementedError
+         if ignoreLeadingWhiteSpace:
+             raise NotImplementedError
+         if ignoreTrailingWhiteSpace:
+             raise NotImplementedError
+         if charToEscapeQuoteEscaping:
+             raise NotImplementedError
+         if emptyValue:
+             raise NotImplementedError
+         if lineSep:
+             raise NotImplementedError
+         relation = self.dataframe.relation
+         relation.write_csv(
+             path,
+             sep=sep,
+             na_rep=nullValue,
+             quotechar=quote,
+             compression=compression,
+             escapechar=escape,
+             header=header if isinstance(header, bool) else header == "True",
+             encoding=encoding,
+             quoting=quoteAll,
+             date_format=dateFormat,
+             timestamp_format=timestampFormat,
+         )
+
+
+ class DataFrameReader:
+     def __init__(self, session: "SparkSession"):
+         self.session = session
+
+     def load(
+         self,
+         path: Optional[Union[str, List[str]]] = None,
+         format: Optional[str] = None,
+         schema: Optional[Union[StructType, str]] = None,
+         **options: OptionalPrimitiveType,
+     ) -> "DataFrame":
+         from duckdb.experimental.spark.sql.dataframe import DataFrame
+
+         if not isinstance(path, str):
+             raise NotImplementedError
+         if options:
+             raise ContributionsAcceptedError
+
+         rel = None
+         if format:
+             format = format.lower()
+             if format == "csv" or format == "tsv":
+                 rel = self.session.conn.read_csv(path)
+             elif format == "json":
+                 rel = self.session.conn.read_json(path)
+             elif format == "parquet":
+                 rel = self.session.conn.read_parquet(path)
+             else:
+                 raise ContributionsAcceptedError
+         else:
+             rel = self.session.conn.sql(f"select * from {path}")
+         df = DataFrame(rel, self.session)
+         if schema:
+             if not isinstance(schema, StructType):
+                 raise ContributionsAcceptedError
+             schema = cast(StructType, schema)
+             types, names = schema.extract_types_and_names()
+             df = df._cast_types(types)
+             df = df.toDF(*names)
+         return df
+
+     def csv(
+         self,
+         path: Union[str, List[str]],
+         schema: Optional[Union[StructType, str]] = None,
+         sep: Optional[str] = None,
+         encoding: Optional[str] = None,
+         quote: Optional[str] = None,
+         escape: Optional[str] = None,
+         comment: Optional[str] = None,
+         header: Optional[Union[bool, str]] = None,
+         inferSchema: Optional[Union[bool, str]] = None,
+         ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,
+         ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,
+         nullValue: Optional[str] = None,
+         nanValue: Optional[str] = None,
+         positiveInf: Optional[str] = None,
+         negativeInf: Optional[str] = None,
+         dateFormat: Optional[str] = None,
+         timestampFormat: Optional[str] = None,
+         maxColumns: Optional[Union[int, str]] = None,
+         maxCharsPerColumn: Optional[Union[int, str]] = None,
+         maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
+         mode: Optional[str] = None,
+         columnNameOfCorruptRecord: Optional[str] = None,
+         multiLine: Optional[Union[bool, str]] = None,
+         charToEscapeQuoteEscaping: Optional[str] = None,
+         samplingRatio: Optional[Union[float, str]] = None,
+         enforceSchema: Optional[Union[bool, str]] = None,
+         emptyValue: Optional[str] = None,
+         locale: Optional[str] = None,
+         lineSep: Optional[str] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+         modifiedBefore: Optional[Union[bool, str]] = None,
+         modifiedAfter: Optional[Union[bool, str]] = None,
+         unescapedQuoteHandling: Optional[str] = None,
+     ) -> "DataFrame":
+         if not isinstance(path, str):
+             raise NotImplementedError
+         if schema and not isinstance(schema, StructType):
+             raise ContributionsAcceptedError
+         if comment:
+             raise ContributionsAcceptedError
+         if inferSchema:
+             raise ContributionsAcceptedError
+         if ignoreLeadingWhiteSpace:
+             raise ContributionsAcceptedError
+         if ignoreTrailingWhiteSpace:
+             raise ContributionsAcceptedError
+         if nanValue:
+             raise ContributionsAcceptedError
+         if positiveInf:
+             raise ContributionsAcceptedError
+         if negativeInf:
+             raise ContributionsAcceptedError
+         if maxColumns:
+             raise ContributionsAcceptedError
+         if maxCharsPerColumn:
+             raise ContributionsAcceptedError
+         if maxMalformedLogPerPartition:
+             raise ContributionsAcceptedError
+         if mode:
+             raise ContributionsAcceptedError
+         if columnNameOfCorruptRecord:
+             raise ContributionsAcceptedError
+         if multiLine:
+             raise ContributionsAcceptedError
+         if charToEscapeQuoteEscaping:
+             raise ContributionsAcceptedError
+         if samplingRatio:
+             raise ContributionsAcceptedError
+         if enforceSchema:
+             raise ContributionsAcceptedError
+         if emptyValue:
+             raise ContributionsAcceptedError
+         if locale:
+             raise ContributionsAcceptedError
+         if pathGlobFilter:
+             raise ContributionsAcceptedError
+         if recursiveFileLookup:
+             raise ContributionsAcceptedError
+         if modifiedBefore:
+             raise ContributionsAcceptedError
+         if modifiedAfter:
+             raise ContributionsAcceptedError
+         if unescapedQuoteHandling:
+             raise ContributionsAcceptedError
+         if lineSep:
+             # We have support for custom newline, just needs to be ported to 'read_csv'
+             raise NotImplementedError
+
+         dtype = None
+         names = None
+         if schema:
+             schema = cast(StructType, schema)
+             dtype, names = schema.extract_types_and_names()
+
+         rel = self.session.conn.read_csv(
+             path,
+             header=header if isinstance(header, bool) else header == "True",
+             sep=sep,
+             dtype=dtype,
+             na_values=nullValue,
+             quotechar=quote,
+             escapechar=escape,
+             encoding=encoding,
+             date_format=dateFormat,
+             timestamp_format=timestampFormat,
+         )
+         from ..sql.dataframe import DataFrame
+
+         df = DataFrame(rel, self.session)
+         if names:
+             df = df.toDF(*names)
+         return df
+
+     def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":
+         input = list(paths)
+         if len(input) != 1:
+             raise NotImplementedError("Only single paths are supported for now")
+         option_amount = len(options.keys())
+         if option_amount != 0:
+             raise ContributionsAcceptedError("Options are not supported")
+         path = input[0]
+         rel = self.session.conn.read_parquet(path)
+         from ..sql.dataframe import DataFrame
+
+         df = DataFrame(rel, self.session)
+         return df
+
+     def json(
+         self,
+         path: Union[str, List[str]],
+         schema: Optional[Union[StructType, str]] = None,
+         primitivesAsString: Optional[Union[bool, str]] = None,
+         prefersDecimal: Optional[Union[bool, str]] = None,
+         allowComments: Optional[Union[bool, str]] = None,
+         allowUnquotedFieldNames: Optional[Union[bool, str]] = None,
+         allowSingleQuotes: Optional[Union[bool, str]] = None,
+         allowNumericLeadingZero: Optional[Union[bool, str]] = None,
+         allowBackslashEscapingAnyCharacter: Optional[Union[bool, str]] = None,
+         mode: Optional[str] = None,
+         columnNameOfCorruptRecord: Optional[str] = None,
+         dateFormat: Optional[str] = None,
+         timestampFormat: Optional[str] = None,
+         multiLine: Optional[Union[bool, str]] = None,
+         allowUnquotedControlChars: Optional[Union[bool, str]] = None,
+         lineSep: Optional[str] = None,
+         samplingRatio: Optional[Union[float, str]] = None,
+         dropFieldIfAllNull: Optional[Union[bool, str]] = None,
+         encoding: Optional[str] = None,
+         locale: Optional[str] = None,
+         pathGlobFilter: Optional[Union[bool, str]] = None,
+         recursiveFileLookup: Optional[Union[bool, str]] = None,
+         modifiedBefore: Optional[Union[bool, str]] = None,
+         modifiedAfter: Optional[Union[bool, str]] = None,
+         allowNonNumericNumbers: Optional[Union[bool, str]] = None,
+     ) -> "DataFrame":
+         """
+         Loads JSON files and returns the results as a :class:`DataFrame`.
+
+         `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
+         For JSON (one record per file), set the ``multiLine`` parameter to ``true``.
+
+         If the ``schema`` parameter is not specified, this function goes
+         through the input once to determine the input schema.
+
+         .. versionadded:: 1.4.0
+
+         .. versionchanged:: 3.4.0
+             Supports Spark Connect.
+
+         Parameters
+         ----------
+         path : str, list or :class:`RDD`
+             string represents path to the JSON dataset, or a list of paths,
+             or RDD of Strings storing JSON objects.
+         schema : :class:`pyspark.sql.types.StructType` or str, optional
+             an optional :class:`pyspark.sql.types.StructType` for the input schema or
+             a DDL-formatted string (For example ``col0 INT, col1 DOUBLE``).
+
+         Other Parameters
+         ----------------
+         Extra options
+             For the extra options, refer to
+             `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_
+             for the version you use.
+
+             .. # noqa
+
+         Examples
+         --------
+         Write a DataFrame into a JSON file and read it back.
+
+         >>> import tempfile
+         >>> with tempfile.TemporaryDirectory() as d:
+         ...     # Write a DataFrame into a JSON file
+         ...     spark.createDataFrame(
+         ...         [{"age": 100, "name": "Hyukjin Kwon"}]
+         ...     ).write.mode("overwrite").format("json").save(d)
+         ...
+         ...     # Read the JSON file as a DataFrame.
+         ...     spark.read.json(d).show()
+         +---+------------+
+         |age|        name|
+         +---+------------+
+         |100|Hyukjin Kwon|
+         +---+------------+
+         """
+
+         if schema is not None:
+             raise ContributionsAcceptedError("The 'schema' option is not supported")
+         if primitivesAsString is not None:
+             raise ContributionsAcceptedError(
+                 "The 'primitivesAsString' option is not supported"
+             )
+         if prefersDecimal is not None:
+             raise ContributionsAcceptedError(
+                 "The 'prefersDecimal' option is not supported"
+             )
+         if allowComments is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowComments' option is not supported"
+             )
+         if allowUnquotedFieldNames is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowUnquotedFieldNames' option is not supported"
+             )
+         if allowSingleQuotes is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowSingleQuotes' option is not supported"
+             )
+         if allowNumericLeadingZero is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowNumericLeadingZero' option is not supported"
+             )
+         if allowBackslashEscapingAnyCharacter is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowBackslashEscapingAnyCharacter' option is not supported"
+             )
+         if mode is not None:
+             raise ContributionsAcceptedError("The 'mode' option is not supported")
+         if columnNameOfCorruptRecord is not None:
+             raise ContributionsAcceptedError(
+                 "The 'columnNameOfCorruptRecord' option is not supported"
+             )
+         if dateFormat is not None:
+             raise ContributionsAcceptedError("The 'dateFormat' option is not supported")
+         if timestampFormat is not None:
+             raise ContributionsAcceptedError(
+                 "The 'timestampFormat' option is not supported"
+             )
+         if multiLine is not None:
+             raise ContributionsAcceptedError("The 'multiLine' option is not supported")
+         if allowUnquotedControlChars is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowUnquotedControlChars' option is not supported"
+             )
+         if lineSep is not None:
+             raise ContributionsAcceptedError("The 'lineSep' option is not supported")
+         if samplingRatio is not None:
+             raise ContributionsAcceptedError(
+                 "The 'samplingRatio' option is not supported"
+             )
+         if dropFieldIfAllNull is not None:
+             raise ContributionsAcceptedError(
+                 "The 'dropFieldIfAllNull' option is not supported"
+             )
+         if encoding is not None:
+             raise ContributionsAcceptedError("The 'encoding' option is not supported")
+         if locale is not None:
+             raise ContributionsAcceptedError("The 'locale' option is not supported")
+         if pathGlobFilter is not None:
+             raise ContributionsAcceptedError(
+                 "The 'pathGlobFilter' option is not supported"
+             )
+         if recursiveFileLookup is not None:
+             raise ContributionsAcceptedError(
+                 "The 'recursiveFileLookup' option is not supported"
+             )
+         if modifiedBefore is not None:
+             raise ContributionsAcceptedError(
+                 "The 'modifiedBefore' option is not supported"
+             )
+         if modifiedAfter is not None:
+             raise ContributionsAcceptedError(
+                 "The 'modifiedAfter' option is not supported"
+             )
+         if allowNonNumericNumbers is not None:
+             raise ContributionsAcceptedError(
+                 "The 'allowNonNumericNumbers' option is not supported"
+             )
+
+         if isinstance(path, str):
+             path = [path]
+         if isinstance(path, list):
+             if len(path) == 1:
+                 rel = self.session.conn.read_json(path[0])
+                 from .dataframe import DataFrame
+
+                 df = DataFrame(rel, self.session)
+                 return df
+             raise PySparkNotImplementedError(
+                 message="Only a single path is supported for now"
+             )
+         else:
+             raise PySparkTypeError(
+                 error_class="NOT_STR_OR_LIST_OF_RDD",
+                 message_parameters={
+                     "arg_name": "path",
+                     "arg_type": type(path).__name__,
+                 },
+             )
+
+
+ __all__ = ["DataFrameWriter", "DataFrameReader"]
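
Taken together, the reader and writer in this file are thin shims over DuckDB's relational API: reads go through self.session.conn.read_csv / read_json / read_parquet, writes through relation.write_csv / write_parquet / create, and most PySpark options are rejected with ContributionsAcceptedError or NotImplementedError rather than silently ignored. A short usage sketch follows; it assumes that spark.read and df.write are wired to DataFrameReader and DataFrameWriter in session.py and dataframe.py (not shown in this excerpt), and the file paths are placeholders.

# Hedged sketch of the readwriter.py surface shown in the diff above.
# Assumes `spark` is a duckdb.experimental.spark SparkSession whose .read/.write
# properties return the DataFrameReader/DataFrameWriter defined in this file.
df = spark.read.csv("people.csv", header=True, sep=",")    # -> conn.read_csv(...)
df.write.saveAsTable("people")                             # -> relation.create("people")
df.write.parquet("people.parquet", compression="zstd")     # -> relation.write_parquet(...)

pq = spark.read.parquet("people.parquet")                  # single path only
js = spark.read.json("records.json")                       # single path only
# Unsupported knobs fail loudly, e.g. spark.read.csv("people.csv", multiLine=True)
# raises ContributionsAcceptedError.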