fugue 0.8.7.dev4-py3-none-any.whl → 0.8.7.dev6-py3-none-any.whl
- fugue/api.py +1 -0
- fugue/dataframe/api.py +51 -15
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +72 -4
- fugue/execution/execution_engine.py +1 -1
- fugue/execution/native_execution_engine.py +1 -1
- fugue/plugins.py +1 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/METADATA +5 -4
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD +30 -30
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt +1 -1
- fugue_dask/_io.py +5 -0
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_duckdb/dataframe.py +87 -29
- fugue_ibis/dataframe.py +13 -0
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/convert.py +32 -7
- fugue_spark/_utils/io.py +3 -1
- fugue_spark/dataframe.py +94 -22
- fugue_spark/execution_engine.py +7 -3
- fugue_test/builtin_suite.py +1 -1
- fugue_test/dataframe_suite.py +14 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt +0 -0
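The common thread in this diff is a new dict-shaped conversion layer: `as_dicts` joins `as_array`, `as_array_iterable`, and `as_dict_iterable` in `fugue.api` and `fugue.plugins`, backed by new `pa_table_as_*` helpers in `fugue/dataframe/utils.py`, with per-backend candidates registered for pandas, Polars, Ray, and Spark. A minimal sketch of the function-level API, mirroring the new tests in `fugue_test/dataframe_suite.py` (the pandas input is an assumption; any backend with registered candidates should dispatch the same way):

import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
fa.as_dicts(df)                       # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
list(fa.as_dict_iterable(df, ["b"]))  # [{'b': 'x'}, {'b': 'y'}]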
fugue_polars/polars_dataframe.py
CHANGED
@@ -14,22 +14,32 @@ from triad.utils.pyarrow import (
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
     is_df,
+    is_empty,
     rename,
     select_columns,
 )
 from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.dataset.api import (
     as_local,
     as_local_bounded,
     count,
     get_num_partitions,
     is_bounded,
-    is_empty,
     is_local,
 )
 from fugue.exceptions import FugueDataFrameOperationError

@@ -52,7 +62,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
-            self._native:
+            self._native: pl.DataFrame = build_empty_pl(schema)
             super().__init__(schema)
             return
         else:

@@ -73,7 +83,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     @property
     def empty(self) -> bool:
-        return self._native.
+        return self._native.is_empty()
 
     def peek_array(self) -> List[Any]:
         self.assert_not_empty()

@@ -118,26 +128,20 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        if columns is not None:
-            tdf = tdf.select(columns)
-        return [list(row) for row in tdf.rows()]
+        return _pl_as_array(self.native, columns=columns)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
+        yield from _pl_as_array_iterable(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _pl_as_dicts(self.native, columns=columns)
 
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-
-        yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
-            columns=columns
-        )
+        yield from _pl_as_dict_iterable(self.native, columns=columns)
 
 
 @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))

@@ -174,7 +178,7 @@ def _pl_is_bounded(df: pl.DataFrame) -> bool:
 
 @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_empty(df: pl.DataFrame) -> bool:
-    return df.
+    return df.is_empty()
 
 
 @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))

@@ -228,6 +232,39 @@ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
     return df.select(columns)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    adf = _pl_as_arrow(_df)
+    return pa_table_as_array(adf, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_array_iterable(_df.to_arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dicts(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    return pa_table_as_dicts(_df.to_arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dict_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_dict_iterable(_df.to_arrow(), columns=columns)
+
+
 def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.schema.keys()]
     if len(missing) > 0:
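All four new Polars candidates follow one pattern: narrow columns with `_select_pa_columns`, convert once to Arrow, then delegate to the shared `pa_table_as_*` helpers, replacing the old row-by-row `rows()` path. A hedged usage sketch (the data is illustrative):

import polars as pl
import fugue.api as fa

pdf = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
fa.as_array(pdf, columns=["a"])  # [[1], [2]], via Arrow rather than Polars rows
fa.as_dicts(pdf)                 # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
fa.is_empty(pdf)                 # False, now answered by pl.DataFrame.is_empty()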
fugue_ray/dataframe.py
CHANGED
@@ -4,14 +4,22 @@ import pandas as pd
 import pyarrow as pa
 import ray
 import ray.data as rd
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pyarrow import cast_pa_table
 
 from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     get_column_names,
     get_num_partitions,
     is_df,

@@ -141,13 +149,11 @@ class RayDataFrame(DataFrame):
     def _select_cols(self, cols: List[Any]) -> DataFrame:
         if cols == self.columns:
             return self
-
-
-
-
-            **self._remote_args(),
+        return RayDataFrame(
+            self.native.select_columns(cols),
+            self.schema.extract(cols),
+            internal_schema=True,
         )
-        return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
 
     def peek_array(self) -> List[Any]:
         data = self.native.limit(1).to_pandas().values.tolist()

@@ -164,10 +170,10 @@ class RayDataFrame(DataFrame):
         return self.native.count()
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return
+        return _rd_as_arrow(self.native)
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.
+        return _rd_as_pandas(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:

@@ -201,18 +207,20 @@ class RayDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        if columns is not None:
-            df = df[columns]
-        adf = df.as_arrow()
-        if adf.shape[0] == 0:
-            return []
-        return ArrowDataFrame(adf).as_array(type_safe=type_safe)
+        return _rd_as_array(self.native, columns, type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.
+        yield from _rd_as_array_iterable(self.native, columns, type_safe)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _rd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _rd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None

@@ -259,8 +267,8 @@ def _rd_num_partitions(df: rd.Dataset) -> int:
 
 
 @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset))
-def _rd_as_local(df: rd.Dataset) ->
-    return
+def _rd_as_local(df: rd.Dataset) -> pa.Table:
+    return _rd_as_arrow(df)
 
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))

@@ -290,10 +298,54 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     )
 
 
+@as_pandas.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_pandas(df: rd.Dataset) -> pd.DataFrame:
+    return _rd_as_arrow(df).to_pandas()
+
+
+@as_arrow.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_arrow(df: rd.Dataset) -> pa.Table:
+    return pa.concat_tables(_get_arrow_tables(df))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_array(adf)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from _rd_as_array(df, columns, type_safe)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dicts(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_dicts(adf)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dict_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    yield from _rd_as_dicts(df, columns, type_safe)
+
+
 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
     last_empty: Any = None
     empty = True
-    for block in df.
+    for block in df.to_arrow_refs():
         tb = ray.get(block)
         if tb.shape[0] > 0:
             yield tb
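On Ray, every conversion funnels through `_rd_as_arrow`, which fetches the dataset's Arrow block references via `df.to_arrow_refs()` and concatenates them into one table; note the "iterable" variants are not streaming, they materialize through the list forms first. A rough sketch, assuming a local Ray runtime:

import ray.data as rd
import fugue.api as fa

ds = rd.from_items([{"a": 1}, {"a": 2}])
tbl = fa.as_arrow(ds)  # one pa.Table concatenated from the dataset's blocks
fa.as_dicts(ds)        # [{'a': 1}, {'a': 2}]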
fugue_spark/_utils/convert.py
CHANGED
@@ -1,5 +1,6 @@
 import pickle
-from typing import Any, Iterable, List, Tuple
+from typing import Any, Iterable, List, Tuple, Optional
+
 
 import pandas as pd
 import pyarrow as pa

@@ -16,7 +17,7 @@ from pyspark.sql.pandas.types import (
 )
 from triad.collections import Schema
 from triad.utils.assertion import assert_arg_not_none, assert_or_throw
-from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP
+from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP, cast_pa_table
 from triad.utils.schema import quote_name
 
 import fugue.api as fa

@@ -41,7 +42,7 @@ def pandas_udf_can_accept(schema: Schema, is_input: bool) -> bool:
             return False
         to_arrow_schema(from_arrow_schema(schema.pa_schema))
         return True
-    except Exception:
+    except Exception:  # pragma: no cover
         return False
 
 

@@ -132,7 +133,7 @@ def to_type_safe_input(rows: Iterable[ps.Row], schema: Schema) -> Iterable[List[
                 if r[i] is not None:
                     r[i] = r[i].asDict(recursive=True)
             yield r
-    else:
+    else:  # pragma: no cover
         for row in rows:
             data = row.asDict(recursive=True)
             r = [data[n] for n in schema.names]

@@ -173,14 +174,14 @@ def pd_to_spark_df(
 
 
 def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
-    if pd.__version__ < "2" or not any(
+    if version.parse(pd.__version__) < version.parse("2.0.0") or not any(
         isinstance(x.dataType, (pt.TimestampType, TimestampNTZType))
         for x in df.schema.fields
     ):
         return df.toPandas()
-    else:
+    else:  # pragma: no cover
 
-        def serialize(dfs):
+        def serialize(dfs):
             for df in dfs:
                 data = pickle.dumps(df)
                 yield pd.DataFrame([[data]], columns=["data"])

@@ -189,6 +190,30 @@ def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
         return pd.concat(pickle.loads(x.data) for x in sdf.collect())
 
 
+def to_arrow(df: ps.DataFrame) -> pa.Table:
+    schema = to_schema(df.schema)
+    destruct: Optional[bool] = None
+    try:
+        jconf = df.sparkSession._jconf
+        if jconf.arrowPySparkEnabled() and pandas_udf_can_accept(
+            schema, is_input=False
+        ):
+            destruct = jconf.arrowPySparkSelfDestructEnabled()
+    except Exception:  # pragma: no cover
+        # older spark does not have this config
+        pass
+    if destruct is not None and hasattr(df, "_collect_as_arrow"):
+        batches = df._collect_as_arrow(split_batches=destruct)
+        if len(batches) == 0:
+            return schema.create_empty_arrow_table()
+        table = pa.Table.from_batches(batches)
+        del batches
+        return cast_pa_table(table, schema.pa_schema)
+    else:  # pragma: no cover
+        # df.toPandas has bugs on nested types
+        return pa.Table.from_pylist(df.collect(), schema=schema.pa_schema)
+
+
 # TODO: the following function always set nullable to true,
 # but should we use field.nullable?
 def _to_arrow_type(dt: pt.DataType) -> pa.DataType:
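The new `to_arrow` prefers Spark's Arrow collection path: when Arrow execution is enabled and the schema survives the pandas-UDF type round trip, it collects record batches directly (`_collect_as_arrow`, honoring the self-destruct setting) and casts them to the Fugue-inferred schema; otherwise it falls back to `pa.Table.from_pylist` over `df.collect()`, sidestepping known `toPandas` problems with nested types. A usage sketch under those assumptions:

from pyspark.sql import SparkSession

from fugue_spark._utils.convert import to_arrow

spark = SparkSession.builder.master("local[1]").getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
sdf = spark.createDataFrame([(1, "x")], "a long, b string")
tbl = to_arrow(sdf)  # pa.Table cast to the schema Fugue infers from sdf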
fugue_spark/_utils/io.py
CHANGED
@@ -9,7 +9,7 @@ from triad.utils.assertion import assert_or_throw
 
 from fugue._utils.io import FileParser, save_df
 from fugue.collections.partition import PartitionSpec
-from fugue.dataframe import DataFrame
+from fugue.dataframe import DataFrame, PandasDataFrame
 from fugue_spark.dataframe import SparkDataFrame
 
 from .convert import to_schema, to_spark_schema

@@ -62,6 +62,8 @@ class SparkIO(object):
             writer.save(uri)
         else:
             ldf = df.as_local()
+            if isinstance(ldf, PandasDataFrame) and hasattr(ldf.native, "attrs"):
+                ldf.native.attrs = {}  # pragma: no cover
             save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
 
     def _get_writer(
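The added guard clears `DataFrame.attrs` on the collected pandas frame before `save_df`; a plausible motivation (an assumption, the diff does not say) is that populated `attrs` can confuse metadata handling in downstream writers. In effect:

import pandas as pd

pdf = pd.DataFrame({"a": [1]})
pdf.attrs["lineage"] = object()  # hypothetical metadata a writer may choke on
pdf.attrs = {}                   # what SparkIO now does right before save_df(...)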
fugue_spark/dataframe.py
CHANGED
@@ -9,15 +9,21 @@ from triad.collections.schema import SchemaError
 from triad.utils.assertion import assert_or_throw
 
 from fugue.dataframe import (
-
+    ArrowDataFrame,
     DataFrame,
     IterableDataFrame,
     LocalBoundedDataFrame,
-    PandasDataFrame,
 )
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     count,
     drop_columns,
     get_column_names,

@@ -31,7 +37,13 @@ from fugue.plugins import (
     select_columns,
 )
 
-from ._utils.convert import
+from ._utils.convert import (
+    to_arrow,
+    to_cast_expression,
+    to_pandas,
+    to_schema,
+    to_type_safe_input,
+)
 from ._utils.misc import is_spark_connect, is_spark_dataframe
 
 

@@ -92,11 +104,7 @@ class SparkDataFrame(DataFrame):
         return True
 
     def as_local_bounded(self) -> LocalBoundedDataFrame:
-
-            data = list(to_type_safe_input(self.native.collect(), self.schema))
-            res: LocalBoundedDataFrame = ArrayDataFrame(data, self.schema)
-        else:
-            res = PandasDataFrame(self.as_pandas(), self.schema)
+        res = ArrowDataFrame(self.as_arrow())
         if self.has_metadata:
             res.reset_metadata(self.metadata)
         return res

@@ -128,7 +136,10 @@ class SparkDataFrame(DataFrame):
         return SparkDataFrame(self.native[schema.names])
 
     def as_pandas(self) -> pd.DataFrame:
-        return
+        return _spark_df_as_pandas(self.native)
+
+    def as_arrow(self, type_safe: bool = False) -> pa.Table:
+        return _spark_df_as_arrow(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:

@@ -146,23 +157,22 @@ class SparkDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        return sdf.as_local().as_array(type_safe=type_safe)
+        return _spark_as_array(self.native, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
-
-
-
-
-
-
-
+        yield from _spark_as_array_iterable(
+            self.native, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _spark_as_dicts(self.native, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _spark_as_dict_iterable(self.native, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None

@@ -192,6 +202,16 @@ def _spark_is_df(df: ps.DataFrame) -> bool:
     return True
 
 
+@as_arrow.candidate(lambda df: isinstance(df, ps.DataFrame))
+def _spark_df_as_arrow(df: ps.DataFrame) -> pa.Table:
+    return to_arrow(df)
+
+
+@as_pandas.candidate(lambda df: isinstance(df, ps.DataFrame))
+def _spark_df_as_pandas(df: ps.DataFrame) -> pd.DataFrame:
+    return to_pandas(df)
+
+
 @get_num_partitions.candidate(lambda df: is_spark_dataframe(df))
 def _spark_num_partitions(df: ps.DataFrame) -> int:
     return df.rdd.getNumPartitions()

@@ -272,6 +292,58 @@ def _spark_df_head(
     return SparkDataFrame(res).as_local() if as_fugue else to_pandas(res)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_array(to_arrow(_df), columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    if is_spark_connect(df):  # pragma: no cover
+        yield from _spark_as_array(df, columns, type_safe=type_safe)
+    else:
+        assert_or_throw(
+            columns is None or len(columns) > 0, ValueError("empty columns")
+        )
+        _df = df if columns is None or len(columns) == 0 else df[columns]
+        if not type_safe:
+            for row in to_type_safe_input(
+                _df.rdd.toLocalIterator(), to_schema(_df.schema)
+            ):
+                yield list(row)
+        else:
+            tdf = IterableDataFrame(
+                _spark_as_array_iterable(_df, type_safe=False), to_schema(_df.schema)
+            )
+            yield from tdf.as_array_iterable(type_safe=True)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dicts(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_dicts(to_arrow(_df), columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dict_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    cols = list(_df.columns)
+    for row in _spark_as_array_iterable(_df, type_safe=type_safe):
+        yield dict(zip(cols, row))
+
+
 def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame:
     cols: List[ps.Column] = []
     for f in df.schema:
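Spark keeps one genuinely streaming path: `as_array_iterable` (off Spark Connect) walks `rdd.toLocalIterator()` through `to_type_safe_input`, and for `type_safe=True` wraps the raw stream in an `IterableDataFrame` so casting stays lazy; the list-shaped forms (`as_array`, `as_dicts`) instead collect once through `to_arrow`. A hedged sketch from the function API:

from pyspark.sql import SparkSession
import fugue.api as fa

spark = SparkSession.builder.master("local[1]").getOrCreate()
sdf = spark.createDataFrame([(1, "x"), (2, "y")], "a long, b string")

fa.as_array(sdf, columns=["a"])       # [[1], [2]], one Arrow collection
for row in fa.as_dict_iterable(sdf):  # streamed off the driver row by row
    print(row)                        # {'a': 1, 'b': 'x'} ...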
fugue_spark/execution_engine.py
CHANGED
@@ -5,6 +5,7 @@ from uuid import uuid4
 import pandas as pd
 import pyarrow as pa
 import pyspark.sql as ps
+from py4j.protocol import Py4JError
 from pyspark import StorageLevel
 from pyspark.rdd import RDD
 from pyspark.sql import SparkSession

@@ -350,9 +351,12 @@ class SparkExecutionEngine(ExecutionEngine):
         self._spark_session = spark_session
         cf = dict(FUGUE_SPARK_DEFAULT_CONF)
         if not self.is_spark_connect:
-
-
-
+            try:
+                spark_conf = spark_session.sparkContext.getConf()
+                cf.update({x[0]: x[1] for x in spark_conf.getAll()})
+            except Py4JError:  # pragma: no cover
+                # edge case: https://github.com/fugue-project/fugue/issues/517
+                pass
         cf.update(ParamDict(conf))
         super().__init__(cf)
         self._lock = SerializableRLock()
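Wrapping the `sparkContext.getConf()` read in `try/except Py4JError` lets engine construction proceed when the JVM-side conf is unreachable (the linked issue #517); the merge order is unchanged. Restated as a runnable sketch (the default-conf value is illustrative, not fugue's exact constant):

from py4j.protocol import Py4JError
from pyspark.sql import SparkSession
from triad import ParamDict

spark_session = SparkSession.builder.master("local[1]").getOrCreate()
FUGUE_SPARK_DEFAULT_CONF = {"fugue.spark.use_pandas_udf": True}  # illustrative

cf = dict(FUGUE_SPARK_DEFAULT_CONF)  # Fugue defaults first
try:
    spark_conf = spark_session.sparkContext.getConf()
    cf.update({x[0]: x[1] for x in spark_conf.getAll()})  # then cluster conf
except Py4JError:  # JVM conf not reachable (edge case)
    pass
cf.update(ParamDict({"fugue.spark.use_pandas_udf": False}))  # user conf wins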
fugue_test/builtin_suite.py
CHANGED
@@ -1329,7 +1329,7 @@ class BuiltInTests(object):
             [[datetime.date(2020, 1, 1), datetime.datetime(2020, 1, 2)]],
             "a:date,b:datetime",
         )
-        b.assert_eq(a)
+        b.assert_eq(a, no_pandas=True)
         c = dag.df([["2020-01-01", "2020-01-01 00:00:00"]], "a:date,b:datetime")
         c.transform(T2).assert_eq(c)
         c.partition(by=["a"]).transform(T2).assert_eq(c)
fugue_test/dataframe_suite.py
CHANGED
@@ -208,8 +208,22 @@ class DataFrameTests(object):
     def test_as_dict_iterable(self):
         df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
         assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
         df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
         assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
+
+    def test_as_dicts(self):
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(a=None, b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(a=datetime(2020, 1, 1), b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
 
     def test_list_type(self):
         data = [[[30, 40]]]
{fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE
File without changes

{fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt
File without changes