duckdb-1.5.0.dev44-cp39-cp39-macosx_11_0_arm64.whl → duckdb-1.5.0.dev94-cp39-cp39-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of duckdb might be problematic.
- _duckdb-stubs/__init__.pyi +1443 -0
- _duckdb-stubs/_func.pyi +46 -0
- _duckdb-stubs/_sqltypes.pyi +75 -0
- _duckdb.cpython-39-darwin.so +0 -0
- adbc_driver_duckdb/__init__.py +49 -0
- adbc_driver_duckdb/dbapi.py +115 -0
- duckdb/__init__.py +341 -435
- duckdb/_dbapi_type_object.py +231 -0
- duckdb/_version.py +22 -0
- duckdb/bytes_io_wrapper.py +12 -9
- duckdb/experimental/__init__.py +2 -1
- duckdb/experimental/spark/__init__.py +3 -4
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +7 -9
- duckdb/experimental/spark/conf.py +16 -15
- duckdb/experimental/spark/context.py +60 -44
- duckdb/experimental/spark/errors/__init__.py +33 -35
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +39 -88
- duckdb/experimental/spark/errors/utils.py +11 -16
- duckdb/experimental/spark/exception.py +9 -6
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +8 -15
- duckdb/experimental/spark/sql/catalog.py +21 -20
- duckdb/experimental/spark/sql/column.py +48 -55
- duckdb/experimental/spark/sql/conf.py +9 -8
- duckdb/experimental/spark/sql/dataframe.py +185 -233
- duckdb/experimental/spark/sql/functions.py +1222 -1248
- duckdb/experimental/spark/sql/group.py +56 -52
- duckdb/experimental/spark/sql/readwriter.py +80 -94
- duckdb/experimental/spark/sql/session.py +64 -59
- duckdb/experimental/spark/sql/streaming.py +9 -10
- duckdb/experimental/spark/sql/type_utils.py +67 -65
- duckdb/experimental/spark/sql/types.py +309 -345
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +26 -16
- duckdb/func/__init__.py +3 -0
- duckdb/functional/__init__.py +12 -16
- duckdb/polars_io.py +130 -83
- duckdb/query_graph/__main__.py +91 -96
- duckdb/sqltypes/__init__.py +63 -0
- duckdb/typing/__init__.py +18 -8
- duckdb/udf.py +10 -5
- duckdb/value/__init__.py +1 -0
- duckdb/value/constant/__init__.py +62 -60
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/METADATA +12 -4
- duckdb-1.5.0.dev94.dist-info/RECORD +52 -0
- duckdb/__init__.pyi +0 -713
- duckdb/functional/__init__.pyi +0 -31
- duckdb/typing/__init__.pyi +0 -36
- duckdb/value/constant/__init__.pyi +0 -115
- duckdb-1.5.0.dev44.dist-info/RECORD +0 -47
- /duckdb/{value/__init__.pyi → py.typed} +0 -0
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/WHEEL +0 -0
- {duckdb-1.5.0.dev44.dist-info → duckdb-1.5.0.dev94.dist-info}/licenses/LICENSE +0 -0
--- a/duckdb/experimental/spark/sql/group.py
+++ b/duckdb/experimental/spark/sql/group.py
@@ -1,4 +1,4 @@
-#
+# # noqa: D100
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements. See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
@@ -15,26 +15,27 @@
 # limitations under the License.
 #

-from
-from typing import Callable, TYPE_CHECKING, overload, Dict, Union, List
+from typing import TYPE_CHECKING, Callable, Union, overload

+from ..exception import ContributionsAcceptedError
 from .column import Column
-from .session import SparkSession
 from .dataframe import DataFrame
 from .functions import _to_column_expr
-from ._typing import ColumnOrName
 from .types import NumericType

+# Only import symbols needed for type checking if something is type checking
 if TYPE_CHECKING:
-    from ._typing import
+    from ._typing import ColumnOrName
+    from .session import SparkSession

 __all__ = ["GroupedData", "Grouping"]

+
 def _api_internal(self: "GroupedData", name: str, *cols: str) -> DataFrame:
     expressions = ",".join(list(cols))
     group_by = str(self._grouping) if self._grouping else ""
     projections = self._grouping.get_columns()
-    jdf =
+    jdf = self._df.relation.apply(
         function_name=name,  # aggregate function
         function_aggr=expressions,  # inputs to aggregate
         group_expr=group_by,  # groups
@@ -42,6 +43,7 @@ def _api_internal(self: "GroupedData", name: str, *cols: str) -> DataFrame:
     )
     return DataFrame(jdf, self.session)

+
 def df_varargs_api(f: Callable[..., DataFrame]) -> Callable[..., DataFrame]:
     def _api(self: "GroupedData", *cols: str) -> DataFrame:
         name = f.__name__
@@ -52,49 +54,49 @@ def df_varargs_api(f: Callable[..., DataFrame]) -> Callable[..., DataFrame]:
     return _api


-class Grouping:
-    def __init__(self, *cols: "ColumnOrName", **kwargs):
+class Grouping:  # noqa: D101
+    def __init__(self, *cols: "ColumnOrName", **kwargs) -> None:  # noqa: D107
         self._type = ""
         self._cols = [_to_column_expr(x) for x in cols]
-        if
-            special = kwargs[
+        if "special" in kwargs:
+            special = kwargs["special"]
             accepted_special = ["cube", "rollup"]
             assert special in accepted_special
             self._type = special

-    def get_columns(self) -> str:
+    def get_columns(self) -> str:  # noqa: D102
         columns = ",".join([str(x) for x in self._cols])
         return columns

-    def __str__(self):
+    def __str__(self) -> str:  # noqa: D105
         columns = self.get_columns()
         if self._type:
-            return self._type +
+            return self._type + "(" + columns + ")"
         return columns


 class GroupedData:
-    """
-    A set of methods for aggregations on a :class:`DataFrame`,
+    """A set of methods for aggregations on a :class:`DataFrame`,
     created by :func:`DataFrame.groupBy`.

-    """
+    """  # noqa: D205

-    def __init__(self, grouping: Grouping, df: DataFrame):
+    def __init__(self, grouping: Grouping, df: DataFrame) -> None:  # noqa: D107
         self._grouping = grouping
         self._df = df
         self.session: SparkSession = df.session

-    def __repr__(self) -> str:
+    def __repr__(self) -> str:  # noqa: D105
         return str(self._df)

     def count(self) -> DataFrame:
         """Counts the number of records for each group.

-        Examples
+        Examples:
         --------
         >>> df = spark.createDataFrame(
-        ...
+        ...     [(2, "Alice"), (3, "Alice"), (5, "Bob"), (10, "Bob")], ["age", "name"]
+        ... )
         >>> df.show()
         +---+-----+
         |age| name|
@@ -115,7 +117,7 @@ class GroupedData:
         |  Bob|    2|
         +-----+-----+
         """
-        return _api_internal(self, "count").withColumnRenamed(
+        return _api_internal(self, "count").withColumnRenamed("count_star()", "count")

     @df_varargs_api
     def mean(self, *cols: str) -> DataFrame:
@@ -139,11 +141,12 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples:
         --------
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice", 80), (3, "Alice", 100),
-        ...
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
+        ...     ["age", "name", "height"],
+        ... )
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -156,7 +159,7 @@ class GroupedData:

         Group-by name, and calculate the mean of the age in each group.

-        >>> df.groupBy("name").avg(
+        >>> df.groupBy("name").avg("age").sort("name").show()
         +-----+--------+
         | name|avg(age)|
         +-----+--------+
@@ -166,7 +169,7 @@ class GroupedData:

         Calculate the mean of the age and height in all data.

-        >>> df.groupBy().avg(
+        >>> df.groupBy().avg("age", "height").show()
         +--------+-----------+
         |avg(age)|avg(height)|
         +--------+-----------+
@@ -177,18 +180,19 @@ class GroupedData:
         if len(columns) == 0:
             schema = self._df.schema
             # Take only the numeric types of the relation
-            columns:
+            columns: list[str] = [x.name for x in schema.fields if isinstance(x.dataType, NumericType)]
         return _api_internal(self, "avg", *columns)

     @df_varargs_api
     def max(self, *cols: str) -> DataFrame:
         """Computes the max value for each numeric columns for each group.

-        Examples
+        Examples:
         --------
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice", 80), (3, "Alice", 100),
-        ...
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
+        ...     ["age", "name", "height"],
+        ... )
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -228,11 +232,12 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples:
         --------
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice", 80), (3, "Alice", 100),
-        ...
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
+        ...     ["age", "name", "height"],
+        ... )
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -272,11 +277,12 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples:
         --------
-        >>> df = spark.createDataFrame(
-        ...     (2, "Alice", 80), (3, "Alice", 100),
-        ...
+        >>> df = spark.createDataFrame(
+        ...     [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
+        ...     ["age", "name", "height"],
+        ... )
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -308,14 +314,12 @@ class GroupedData:
         """

     @overload
-    def agg(self, *exprs: Column) -> DataFrame:
-        ...
+    def agg(self, *exprs: Column) -> DataFrame: ...

     @overload
-    def agg(self, __exprs:
-        ...
+    def agg(self, __exprs: dict[str, str]) -> DataFrame: ...  # noqa: PYI063

-    def agg(self, *exprs: Union[Column,
+    def agg(self, *exprs: Union[Column, dict[str, str]]) -> DataFrame:
         """Compute aggregates and returns the result as a :class:`DataFrame`.

         The available aggregate functions can be:
@@ -347,17 +351,18 @@ class GroupedData:
             a dict mapping from column name (string) to aggregate functions (string),
             or a list of :class:`Column`.

-        Notes
+        Notes:
         -----
         Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
         in a single call to this function.

-        Examples
+        Examples:
         --------
         >>> from pyspark.sql import functions as F
         >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
         >>> df = spark.createDataFrame(
-        ...
+        ...     [(2, "Alice"), (3, "Alice"), (5, "Bob"), (10, "Bob")], ["age", "name"]
+        ... )
         >>> df.show()
         +---+-----+
         |age| name|
@@ -393,10 +398,9 @@ class GroupedData:

         Same as above but uses pandas UDF.

-        >>> @pandas_udf(
+        >>> @pandas_udf("int", PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
         ... def min_udf(v):
         ...     return v.min()
-        ...
         >>> df.groupBy(df.name).agg(min_udf(df.age)).sort("name").show()  # doctest: +SKIP
         +-----+------------+
         | name|min_udf(age)|
@@ -417,4 +421,4 @@ class GroupedData:
         rel = self._df.relation.select(*expressions, groups=group_by)
         return DataFrame(rel, self.session)

-    # TODO: add 'pivot'
+    # TODO: add 'pivot'  # noqa: TD002, TD003
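For orientation, a minimal usage sketch of the aggregation API whose docstrings appear in the group.py diff above. The createDataFrame/groupBy/avg calls are lifted from those doctest examples; building the session via SparkSession.builder.getOrCreate() is an assumption borrowed from PySpark's pattern and is not shown in this diff.

    # Sketch, not part of the diff: exercises the GroupedData API changed above.
    from duckdb.experimental.spark.sql.session import SparkSession

    spark = SparkSession.builder.getOrCreate()  # assumed PySpark-style builder
    df = spark.createDataFrame(
        [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
        ["age", "name", "height"],
    )
    df.groupBy("name").avg("age").sort("name").show()  # per-group averages
    df.groupBy().avg("age", "height").show()  # aggregates over all rows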
--- a/duckdb/experimental/spark/sql/readwriter.py
+++ b/duckdb/experimental/spark/sql/readwriter.py
@@ -1,11 +1,9 @@
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional, Union, cast  # noqa: D100

+from ..errors import PySparkNotImplementedError, PySparkTypeError
 from ..exception import ContributionsAcceptedError
 from .types import StructType

-
-from ..errors import PySparkNotImplementedError, PySparkTypeError
-
 PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]

@@ -14,19 +12,19 @@ if TYPE_CHECKING:
     from duckdb.experimental.spark.sql.session import SparkSession


-class DataFrameWriter:
-    def __init__(self, dataframe: "DataFrame"):
+class DataFrameWriter:  # noqa: D101
+    def __init__(self, dataframe: "DataFrame") -> None:  # noqa: D107
         self.dataframe = dataframe

-    def saveAsTable(self, table_name: str) -> None:
+    def saveAsTable(self, table_name: str) -> None:  # noqa: D102
         relation = self.dataframe.relation
         relation.create(table_name)

-    def parquet(
+    def parquet(  # noqa: D102
         self,
         path: str,
         mode: Optional[str] = None,
-        partitionBy: Union[str,
+        partitionBy: Union[str, list[str], None] = None,
         compression: Optional[str] = None,
     ) -> None:
         relation = self.dataframe.relation
@@ -37,7 +35,7 @@ class DataFrameWriter:

         relation.write_parquet(path, compression=compression)

-    def csv(
+    def csv(  # noqa: D102
         self,
         path: str,
         mode: Optional[str] = None,
@@ -57,7 +55,7 @@ class DataFrameWriter:
         encoding: Optional[str] = None,
         emptyValue: Optional[str] = None,
         lineSep: Optional[str] = None,
-    ):
+    ) -> None:
         if mode not in (None, "overwrite"):
             raise NotImplementedError
         if escapeQuotes:
@@ -88,13 +86,13 @@ class DataFrameWriter:
         )


-class DataFrameReader:
-    def __init__(self, session: "SparkSession"):
+class DataFrameReader:  # noqa: D101
+    def __init__(self, session: "SparkSession") -> None:  # noqa: D107
         self.session = session

-    def load(
+    def load(  # noqa: D102
         self,
-        path: Optional[Union[str,
+        path: Optional[Union[str, list[str]]] = None,
         format: Optional[str] = None,
         schema: Optional[Union[StructType, str]] = None,
         **options: OptionalPrimitiveType,
@@ -102,7 +100,7 @@ class DataFrameReader:
         from duckdb.experimental.spark.sql.dataframe import DataFrame

         if not isinstance(path, str):
-            raise
+            raise TypeError
         if options:
             raise ContributionsAcceptedError

@@ -123,15 +121,15 @@ class DataFrameReader:
         if schema:
             if not isinstance(schema, StructType):
                 raise ContributionsAcceptedError
-            schema = cast(StructType, schema)
+            schema = cast("StructType", schema)
             types, names = schema.extract_types_and_names()
             df = df._cast_types(types)
             df = df.toDF(names)
         raise NotImplementedError

-    def csv(
+    def csv(  # noqa: D102
         self,
-        path: Union[str,
+        path: Union[str, list[str]],
         schema: Optional[Union[StructType, str]] = None,
         sep: Optional[str] = None,
         encoding: Optional[str] = None,
@@ -225,7 +223,7 @@ class DataFrameReader:
         dtype = None
         names = None
         if schema:
-            schema = cast(StructType, schema)
+            schema = cast("StructType", schema)
             dtype, names = schema.extract_types_and_names()

         rel = self.session.conn.read_csv(
@@ -247,13 +245,15 @@ class DataFrameReader:
             df = df.toDF(*names)
         return df

-    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":
+    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":  # noqa: D102
         input = list(paths)
         if len(input) != 1:
-
+            msg = "Only single paths are supported for now"
+            raise NotImplementedError(msg)
         option_amount = len(options.keys())
         if option_amount != 0:
-
+            msg = "Options are not supported"
+            raise ContributionsAcceptedError(msg)
         path = input[0]
         rel = self.session.conn.read_parquet(path)
         from ..sql.dataframe import DataFrame
@@ -263,7 +263,7 @@ class DataFrameReader:

     def json(
         self,
-        path: Union[str,
+        path: Union[str, list[str]],
         schema: Optional[Union[StructType, str]] = None,
         primitivesAsString: Optional[Union[bool, str]] = None,
         prefersDecimal: Optional[Union[bool, str]] = None,
@@ -289,8 +289,7 @@ class DataFrameReader:
         modifiedAfter: Optional[Union[bool, str]] = None,
         allowNonNumericNumbers: Optional[Union[bool, str]] = None,
     ) -> "DataFrame":
-        """
-        Loads JSON files and returns the results as a :class:`DataFrame`.
+        """Loads JSON files and returns the results as a :class:`DataFrame`.

         `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
         For JSON (one record per file), set the ``multiLine`` parameter to ``true``.
@@ -321,16 +320,16 @@ class DataFrameReader:

         .. # noqa

-        Examples
+        Examples:
         --------
         Write a DataFrame into a JSON file and read it back.

         >>> import tempfile
         >>> with tempfile.TemporaryDirectory() as d:
         ...     # Write a DataFrame into a JSON file
-        ...     spark.createDataFrame(
-        ...
-        ...     ).
+        ...     spark.createDataFrame([{"age": 100, "name": "Hyukjin Kwon"}]).write.mode(
+        ...         "overwrite"
+        ...     ).format("json").save(d)
         ...
         ...     # Read the JSON file as a DataFrame.
         ...     spark.read.json(d).show()
@@ -340,102 +339,89 @@ class DataFrameReader:
         |100|Hyukjin Kwon|
         +---+------------+
         """
-
         if schema is not None:
-
+            msg = "The 'schema' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if primitivesAsString is not None:
-
-
-            )
+            msg = "The 'primitivesAsString' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if prefersDecimal is not None:
-
-
-            )
+            msg = "The 'prefersDecimal' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowComments is not None:
-
-
-            )
+            msg = "The 'allowComments' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowUnquotedFieldNames is not None:
-
-
-            )
+            msg = "The 'allowUnquotedFieldNames' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowSingleQuotes is not None:
-
-
-            )
+            msg = "The 'allowSingleQuotes' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowNumericLeadingZero is not None:
-
-
-            )
+            msg = "The 'allowNumericLeadingZero' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowBackslashEscapingAnyCharacter is not None:
-
-
-            )
+            msg = "The 'allowBackslashEscapingAnyCharacter' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if mode is not None:
-
+            msg = "The 'mode' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if columnNameOfCorruptRecord is not None:
-
-
-            )
+            msg = "The 'columnNameOfCorruptRecord' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if dateFormat is not None:
-
+            msg = "The 'dateFormat' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if timestampFormat is not None:
-
-
-            )
+            msg = "The 'timestampFormat' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if multiLine is not None:
-
+            msg = "The 'multiLine' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowUnquotedControlChars is not None:
-
-
-            )
+            msg = "The 'allowUnquotedControlChars' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if lineSep is not None:
-
+            msg = "The 'lineSep' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if samplingRatio is not None:
-
-
-            )
+            msg = "The 'samplingRatio' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if dropFieldIfAllNull is not None:
-
-
-            )
+            msg = "The 'dropFieldIfAllNull' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if encoding is not None:
-
+            msg = "The 'encoding' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if locale is not None:
-
+            msg = "The 'locale' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if pathGlobFilter is not None:
-
-
-            )
+            msg = "The 'pathGlobFilter' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if recursiveFileLookup is not None:
-
-
-            )
+            msg = "The 'recursiveFileLookup' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if modifiedBefore is not None:
-
-
-            )
+            msg = "The 'modifiedBefore' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if modifiedAfter is not None:
-
-
-            )
+            msg = "The 'modifiedAfter' option is not supported"
+            raise ContributionsAcceptedError(msg)
         if allowNonNumericNumbers is not None:
-
-
-            )
+            msg = "The 'allowNonNumericNumbers' option is not supported"
+            raise ContributionsAcceptedError(msg)

         if isinstance(path, str):
             path = [path]
-        if
+        if isinstance(path, list):
             if len(path) == 1:
                 rel = self.session.conn.read_json(path[0])
                 from .dataframe import DataFrame

                 df = DataFrame(rel, self.session)
                 return df
-            raise PySparkNotImplementedError(
-                message="Only a single path is supported for now"
-            )
+            raise PySparkNotImplementedError(message="Only a single path is supported for now")
         else:
             raise PySparkTypeError(
                 error_class="NOT_STR_OR_LIST_OF_RDD",
@@ -446,4 +432,4 @@ class DataFrameReader:
             )


-__all__ = ["
+__all__ = ["DataFrameReader", "DataFrameWriter"]