duckdb-1.4.1.dev141-cp310-cp310-macosx_10_9_universal2.whl → duckdb-1.5.0.dev44-cp310-cp310-macosx_10_9_universal2.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- _duckdb.cpython-310-darwin.so +0 -0
- duckdb/__init__.py +435 -341
- duckdb/__init__.pyi +713 -0
- duckdb/bytes_io_wrapper.py +9 -12
- duckdb/experimental/__init__.py +1 -2
- duckdb/experimental/spark/__init__.py +4 -3
- duckdb/experimental/spark/_globals.py +8 -8
- duckdb/experimental/spark/_typing.py +9 -7
- duckdb/experimental/spark/conf.py +15 -16
- duckdb/experimental/spark/context.py +44 -60
- duckdb/experimental/spark/errors/__init__.py +35 -33
- duckdb/experimental/spark/errors/error_classes.py +1 -1
- duckdb/experimental/spark/errors/exceptions/__init__.py +1 -1
- duckdb/experimental/spark/errors/exceptions/base.py +88 -39
- duckdb/experimental/spark/errors/utils.py +16 -11
- duckdb/experimental/spark/exception.py +6 -9
- duckdb/experimental/spark/sql/__init__.py +5 -5
- duckdb/experimental/spark/sql/_typing.py +15 -8
- duckdb/experimental/spark/sql/catalog.py +20 -21
- duckdb/experimental/spark/sql/column.py +55 -48
- duckdb/experimental/spark/sql/conf.py +8 -9
- duckdb/experimental/spark/sql/dataframe.py +233 -185
- duckdb/experimental/spark/sql/functions.py +1248 -1222
- duckdb/experimental/spark/sql/group.py +52 -56
- duckdb/experimental/spark/sql/readwriter.py +94 -80
- duckdb/experimental/spark/sql/session.py +59 -64
- duckdb/experimental/spark/sql/streaming.py +10 -9
- duckdb/experimental/spark/sql/type_utils.py +65 -67
- duckdb/experimental/spark/sql/types.py +345 -309
- duckdb/experimental/spark/sql/udf.py +6 -6
- duckdb/filesystem.py +16 -26
- duckdb/functional/__init__.py +16 -12
- duckdb/functional/__init__.pyi +31 -0
- duckdb/polars_io.py +83 -130
- duckdb/query_graph/__main__.py +96 -91
- duckdb/typing/__init__.py +8 -18
- duckdb/typing/__init__.pyi +36 -0
- duckdb/udf.py +5 -10
- duckdb/value/__init__.py +0 -1
- duckdb/value/constant/__init__.py +60 -62
- duckdb/value/constant/__init__.pyi +115 -0
- duckdb-1.5.0.dev44.dist-info/METADATA +80 -0
- duckdb-1.5.0.dev44.dist-info/RECORD +47 -0
- _duckdb-stubs/__init__.pyi +0 -1443
- _duckdb-stubs/_func.pyi +0 -46
- _duckdb-stubs/_sqltypes.pyi +0 -75
- adbc_driver_duckdb/__init__.py +0 -50
- adbc_driver_duckdb/dbapi.py +0 -115
- duckdb/_dbapi_type_object.py +0 -231
- duckdb/_version.py +0 -22
- duckdb/func/__init__.py +0 -3
- duckdb/sqltypes/__init__.py +0 -63
- duckdb-1.4.1.dev141.dist-info/METADATA +0 -326
- duckdb-1.4.1.dev141.dist-info/RECORD +0 -52
- /duckdb/{py.typed → value/__init__.pyi} +0 -0
- {duckdb-1.4.1.dev141.dist-info → duckdb-1.5.0.dev44.dist-info}/WHEEL +0 -0
- {duckdb-1.4.1.dev141.dist-info → duckdb-1.5.0.dev44.dist-info}/licenses/LICENSE +0 -0
--- duckdb/experimental/spark/sql/group.py (1.4.1.dev141)
+++ duckdb/experimental/spark/sql/group.py (1.5.0.dev44)
@@ -1,4 +1,4 @@
-#
+#
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements. See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
@@ -15,27 +15,26 @@
 # limitations under the License.
 #

-from typing import TYPE_CHECKING, Callable, Union, overload
-
 from ..exception import ContributionsAcceptedError
+from typing import Callable, TYPE_CHECKING, overload, Dict, Union, List
+
 from .column import Column
+from .session import SparkSession
 from .dataframe import DataFrame
 from .functions import _to_column_expr
+from ._typing import ColumnOrName
 from .types import NumericType

-# Only import symbols needed for type checking if something is type checking
 if TYPE_CHECKING:
-    from ._typing import
-    from .session import SparkSession
+    from ._typing import LiteralType

 __all__ = ["GroupedData", "Grouping"]

-
 def _api_internal(self: "GroupedData", name: str, *cols: str) -> DataFrame:
     expressions = ",".join(list(cols))
     group_by = str(self._grouping) if self._grouping else ""
     projections = self._grouping.get_columns()
-    jdf = self._df.relation
+    jdf = getattr(self._df.relation, "apply")(
         function_name=name,  # aggregate function
         function_aggr=expressions,  # inputs to aggregate
         group_expr=group_by,  # groups
@@ -43,7 +42,6 @@ def _api_internal(self: "GroupedData", name: str, *cols: str) -> DataFrame:
     )
     return DataFrame(jdf, self.session)

-
 def df_varargs_api(f: Callable[..., DataFrame]) -> Callable[..., DataFrame]:
     def _api(self: "GroupedData", *cols: str) -> DataFrame:
         name = f.__name__
@@ -54,49 +52,49 @@ def df_varargs_api(f: Callable[..., DataFrame]) -> Callable[..., DataFrame]:
     return _api


-class Grouping:
-    def __init__(self, *cols: "ColumnOrName", **kwargs)
+class Grouping:
+    def __init__(self, *cols: "ColumnOrName", **kwargs):
         self._type = ""
         self._cols = [_to_column_expr(x) for x in cols]
-        if
-            special = kwargs[
+        if 'special' in kwargs:
+            special = kwargs['special']
             accepted_special = ["cube", "rollup"]
             assert special in accepted_special
             self._type = special

-    def get_columns(self) -> str:
+    def get_columns(self) -> str:
         columns = ",".join([str(x) for x in self._cols])
         return columns

-    def __str__(self)
+    def __str__(self):
         columns = self.get_columns()
         if self._type:
-            return self._type +
+            return self._type + '(' + columns + ')'
         return columns


 class GroupedData:
-    """
+    """
+    A set of methods for aggregations on a :class:`DataFrame`,
     created by :func:`DataFrame.groupBy`.

-    """
+    """

-    def __init__(self, grouping: Grouping, df: DataFrame)
+    def __init__(self, grouping: Grouping, df: DataFrame):
         self._grouping = grouping
         self._df = df
         self.session: SparkSession = df.session

-    def __repr__(self) -> str:
+    def __repr__(self) -> str:
         return str(self._df)

     def count(self) -> DataFrame:
         """Counts the number of records for each group.

-        Examples
+        Examples
         --------
         >>> df = spark.createDataFrame(
-        ...
-        ... )
+        ...     [(2, "Alice"), (3, "Alice"), (5, "Bob"), (10, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -117,7 +115,7 @@ class GroupedData:
         | Bob| 2|
         +-----+-----+
         """
-        return _api_internal(self, "count").withColumnRenamed(
+        return _api_internal(self, "count").withColumnRenamed('count_star()', 'count')

     @df_varargs_api
     def mean(self, *cols: str) -> DataFrame:
@@ -141,12 +139,11 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
-        ...
-        ...     ["age", "name", "height"]
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice", 80), (3, "Alice", 100),
+        ...     (5, "Bob", 120), (10, "Bob", 140)], ["age", "name", "height"])
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -159,7 +156,7 @@ class GroupedData:

         Group-by name, and calculate the mean of the age in each group.

-        >>> df.groupBy("name").avg(
+        >>> df.groupBy("name").avg('age').sort("name").show()
         +-----+--------+
         | name|avg(age)|
         +-----+--------+
@@ -169,7 +166,7 @@ class GroupedData:

         Calculate the mean of the age and height in all data.

-        >>> df.groupBy().avg(
+        >>> df.groupBy().avg('age', 'height').show()
         +--------+-----------+
         |avg(age)|avg(height)|
         +--------+-----------+
@@ -180,19 +177,18 @@ class GroupedData:
         if len(columns) == 0:
             schema = self._df.schema
             # Take only the numeric types of the relation
-            columns:
+            columns: List[str] = [x.name for x in schema.fields if isinstance(x.dataType, NumericType)]
         return _api_internal(self, "avg", *columns)

     @df_varargs_api
     def max(self, *cols: str) -> DataFrame:
         """Computes the max value for each numeric columns for each group.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
-        ...
-        ...     ["age", "name", "height"]
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice", 80), (3, "Alice", 100),
+        ...     (5, "Bob", 120), (10, "Bob", 140)], ["age", "name", "height"])
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -232,12 +228,11 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
-        ...
-        ...     ["age", "name", "height"]
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice", 80), (3, "Alice", 100),
+        ...     (5, "Bob", 120), (10, "Bob", 140)], ["age", "name", "height"])
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -277,12 +272,11 @@ class GroupedData:
         cols : str
             column names. Non-numeric columns are ignored.

-        Examples
+        Examples
         --------
-        >>> df = spark.createDataFrame(
-        ...
-        ...     ["age", "name", "height"]
-        ... )
+        >>> df = spark.createDataFrame([
+        ...     (2, "Alice", 80), (3, "Alice", 100),
+        ...     (5, "Bob", 120), (10, "Bob", 140)], ["age", "name", "height"])
         >>> df.show()
         +---+-----+------+
         |age| name|height|
@@ -314,12 +308,14 @@ class GroupedData:
         """

     @overload
-    def agg(self, *exprs: Column) -> DataFrame:
+    def agg(self, *exprs: Column) -> DataFrame:
+        ...

     @overload
-    def agg(self, __exprs:
+    def agg(self, __exprs: Dict[str, str]) -> DataFrame:
+        ...

-    def agg(self, *exprs: Union[Column,
+    def agg(self, *exprs: Union[Column, Dict[str, str]]) -> DataFrame:
         """Compute aggregates and returns the result as a :class:`DataFrame`.

         The available aggregate functions can be:
@@ -351,18 +347,17 @@ class GroupedData:
         a dict mapping from column name (string) to aggregate functions (string),
         or a list of :class:`Column`.

-        Notes
+        Notes
         -----
         Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
         in a single call to this function.

-        Examples
+        Examples
         --------
         >>> from pyspark.sql import functions as F
         >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
         >>> df = spark.createDataFrame(
-        ...
-        ... )
+        ...     [(2, "Alice"), (3, "Alice"), (5, "Bob"), (10, "Bob")], ["age", "name"])
         >>> df.show()
         +---+-----+
         |age| name|
@@ -398,9 +393,10 @@ class GroupedData:

         Same as above but uses pandas UDF.

-        >>> @pandas_udf(
+        >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
         ... def min_udf(v):
         ...     return v.min()
+        ...
         >>> df.groupBy(df.name).agg(min_udf(df.age)).sort("name").show()  # doctest: +SKIP
         +-----+------------+
         | name|min_udf(age)|
@@ -421,4 +417,4 @@ class GroupedData:
         rel = self._df.relation.select(*expressions, groups=group_by)
         return DataFrame(rel, self.session)

-    # TODO: add 'pivot'
+    # TODO: add 'pivot'
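The rewritten group.py mainly reflows the GroupedData and Grouping code and fills in the doctest examples. For orientation, here is a minimal, illustrative sketch of the grouping API exercised above; the SparkSession.builder entry point is assumed (it mirrors PySpark in duckdb.experimental.spark) and the snippet is not taken verbatim from the package.

from duckdb.experimental.spark.sql import SparkSession

# Assumed entry point: the PySpark-compatible builder exposed by the experimental Spark API.
spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(2, "Alice", 80), (3, "Alice", 100), (5, "Bob", 120), (10, "Bob", 140)],
    ["age", "name", "height"],
)

# count() renames DuckDB's count_star() result column to "count".
df.groupBy("name").count().show()

# avg()/mean() with no arguments aggregates every NumericType column.
df.groupBy("name").avg().sort("name").show()

# The dict overload of agg() maps a column name to an aggregate function.
df.groupBy("name").agg({"age": "max"}).show()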
--- duckdb/experimental/spark/sql/readwriter.py (1.4.1.dev141)
+++ duckdb/experimental/spark/sql/readwriter.py (1.5.0.dev44)
@@ -1,9 +1,11 @@
-from typing import TYPE_CHECKING, Optional, Union, cast
+from typing import TYPE_CHECKING, List, Optional, Union, cast

-from ..errors import PySparkNotImplementedError, PySparkTypeError
 from ..exception import ContributionsAcceptedError
 from .types import StructType

+
+from ..errors import PySparkNotImplementedError, PySparkTypeError
+
 PrimitiveType = Union[bool, float, int, str]
 OptionalPrimitiveType = Optional[PrimitiveType]

@@ -12,19 +14,19 @@ if TYPE_CHECKING:
     from duckdb.experimental.spark.sql.session import SparkSession


-class DataFrameWriter:
-    def __init__(self, dataframe: "DataFrame")
+class DataFrameWriter:
+    def __init__(self, dataframe: "DataFrame"):
         self.dataframe = dataframe

-    def saveAsTable(self, table_name: str) -> None:
+    def saveAsTable(self, table_name: str) -> None:
         relation = self.dataframe.relation
         relation.create(table_name)

-    def parquet(
+    def parquet(
         self,
         path: str,
         mode: Optional[str] = None,
-        partitionBy: Union[str,
+        partitionBy: Union[str, List[str], None] = None,
         compression: Optional[str] = None,
     ) -> None:
         relation = self.dataframe.relation
@@ -35,7 +37,7 @@ class DataFrameWriter: # noqa: D101

         relation.write_parquet(path, compression=compression)

-    def csv(
+    def csv(
         self,
         path: str,
         mode: Optional[str] = None,
@@ -55,7 +57,7 @@ class DataFrameWriter: # noqa: D101
         encoding: Optional[str] = None,
         emptyValue: Optional[str] = None,
         lineSep: Optional[str] = None,
-    )
+    ):
         if mode not in (None, "overwrite"):
             raise NotImplementedError
         if escapeQuotes:
@@ -86,13 +88,13 @@ class DataFrameWriter: # noqa: D101
         )


-class DataFrameReader:
-    def __init__(self, session: "SparkSession")
+class DataFrameReader:
+    def __init__(self, session: "SparkSession"):
         self.session = session

-    def load(
+    def load(
         self,
-        path: Optional[Union[str,
+        path: Optional[Union[str, List[str]]] = None,
         format: Optional[str] = None,
         schema: Optional[Union[StructType, str]] = None,
         **options: OptionalPrimitiveType,
@@ -100,7 +102,7 @@ class DataFrameReader: # noqa: D101
         from duckdb.experimental.spark.sql.dataframe import DataFrame

         if not isinstance(path, str):
-            raise
+            raise ImportError
         if options:
             raise ContributionsAcceptedError

@@ -121,15 +123,15 @@ class DataFrameReader: # noqa: D101
         if schema:
             if not isinstance(schema, StructType):
                 raise ContributionsAcceptedError
-            schema = cast(
+            schema = cast(StructType, schema)
             types, names = schema.extract_types_and_names()
             df = df._cast_types(types)
             df = df.toDF(names)
         raise NotImplementedError

-    def csv(
+    def csv(
         self,
-        path: Union[str,
+        path: Union[str, List[str]],
         schema: Optional[Union[StructType, str]] = None,
         sep: Optional[str] = None,
         encoding: Optional[str] = None,
@@ -223,7 +225,7 @@ class DataFrameReader: # noqa: D101
         dtype = None
         names = None
         if schema:
-            schema = cast(
+            schema = cast(StructType, schema)
             dtype, names = schema.extract_types_and_names()

         rel = self.session.conn.read_csv(
@@ -245,15 +247,13 @@ class DataFrameReader: # noqa: D101
             df = df.toDF(*names)
         return df

-    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":
+    def parquet(self, *paths: str, **options: "OptionalPrimitiveType") -> "DataFrame":
         input = list(paths)
         if len(input) != 1:
-
-            raise NotImplementedError(msg)
+            raise NotImplementedError("Only single paths are supported for now")
         option_amount = len(options.keys())
         if option_amount != 0:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("Options are not supported")
         path = input[0]
         rel = self.session.conn.read_parquet(path)
         from ..sql.dataframe import DataFrame
@@ -263,7 +263,7 @@ class DataFrameReader: # noqa: D101

     def json(
         self,
-        path: Union[str,
+        path: Union[str, List[str]],
         schema: Optional[Union[StructType, str]] = None,
         primitivesAsString: Optional[Union[bool, str]] = None,
         prefersDecimal: Optional[Union[bool, str]] = None,
@@ -289,7 +289,8 @@ class DataFrameReader: # noqa: D101
         modifiedAfter: Optional[Union[bool, str]] = None,
         allowNonNumericNumbers: Optional[Union[bool, str]] = None,
     ) -> "DataFrame":
-        """
+        """
+        Loads JSON files and returns the results as a :class:`DataFrame`.

         `JSON Lines <http://jsonlines.org/>`_ (newline-delimited JSON) is supported by default.
         For JSON (one record per file), set the ``multiLine`` parameter to ``true``.
@@ -320,16 +321,16 @@ class DataFrameReader: # noqa: D101

         .. # noqa

-        Examples
+        Examples
         --------
         Write a DataFrame into a JSON file and read it back.

         >>> import tempfile
         >>> with tempfile.TemporaryDirectory() as d:
         ...     # Write a DataFrame into a JSON file
-        ...     spark.createDataFrame(
-        ...     "
-        ...     ).format("json").save(d)
+        ...     spark.createDataFrame(
+        ...         [{"age": 100, "name": "Hyukjin Kwon"}]
+        ...     ).write.mode("overwrite").format("json").save(d)
         ...
         ...     # Read the JSON file as a DataFrame.
         ...     spark.read.json(d).show()
@@ -339,89 +340,102 @@ class DataFrameReader: # noqa: D101
         |100|Hyukjin Kwon|
         +---+------------+
         """
+
         if schema is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'schema' option is not supported")
         if primitivesAsString is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'primitivesAsString' option is not supported"
+            )
         if prefersDecimal is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'prefersDecimal' option is not supported"
+            )
         if allowComments is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowComments' option is not supported"
+            )
         if allowUnquotedFieldNames is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowUnquotedFieldNames' option is not supported"
+            )
         if allowSingleQuotes is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowSingleQuotes' option is not supported"
+            )
         if allowNumericLeadingZero is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowNumericLeadingZero' option is not supported"
+            )
         if allowBackslashEscapingAnyCharacter is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowBackslashEscapingAnyCharacter' option is not supported"
+            )
         if mode is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'mode' option is not supported")
         if columnNameOfCorruptRecord is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'columnNameOfCorruptRecord' option is not supported"
+            )
         if dateFormat is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'dateFormat' option is not supported")
         if timestampFormat is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'timestampFormat' option is not supported"
+            )
         if multiLine is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'multiLine' option is not supported")
         if allowUnquotedControlChars is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowUnquotedControlChars' option is not supported"
+            )
         if lineSep is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'lineSep' option is not supported")
         if samplingRatio is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'samplingRatio' option is not supported"
+            )
         if dropFieldIfAllNull is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'dropFieldIfAllNull' option is not supported"
+            )
         if encoding is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'encoding' option is not supported")
         if locale is not None:
-
-            raise ContributionsAcceptedError(msg)
+            raise ContributionsAcceptedError("The 'locale' option is not supported")
         if pathGlobFilter is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'pathGlobFilter' option is not supported"
+            )
         if recursiveFileLookup is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'recursiveFileLookup' option is not supported"
+            )
         if modifiedBefore is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'modifiedBefore' option is not supported"
+            )
         if modifiedAfter is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'modifiedAfter' option is not supported"
+            )
         if allowNonNumericNumbers is not None:
-
-
+            raise ContributionsAcceptedError(
+                "The 'allowNonNumericNumbers' option is not supported"
+            )

         if isinstance(path, str):
             path = [path]
-        if
+        if isinstance(path, list):
             if len(path) == 1:
                 rel = self.session.conn.read_json(path[0])
                 from .dataframe import DataFrame

                 df = DataFrame(rel, self.session)
                 return df
-            raise PySparkNotImplementedError(
+            raise PySparkNotImplementedError(
+                message="Only a single path is supported for now"
+            )
         else:
             raise PySparkTypeError(
                 error_class="NOT_STR_OR_LIST_OF_RDD",
@@ -432,4 +446,4 @@ class DataFrameReader: # noqa: D101
             )


-__all__ = ["
+__all__ = ["DataFrameWriter", "DataFrameReader"]