fugue-0.8.7.dev5-py3-none-any.whl → fugue-0.8.7.dev6-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- fugue/api.py +1 -0
- fugue/dataframe/api.py +19 -2
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +68 -2
- fugue/execution/execution_engine.py +1 -1
- fugue/plugins.py +1 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/METADATA +4 -4
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD +24 -24
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt +1 -1
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_duckdb/dataframe.py +87 -29
- fugue_ibis/dataframe.py +13 -0
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/dataframe.py +71 -19
- fugue_spark/dataframe.py +69 -13
- fugue_test/dataframe_suite.py +14 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/WHEEL +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt +0 -0
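
The substance of this release is row-oriented conversion support: new `as_dicts` / `as_dict_iterable` plugin entry points (with optional column subsetting) wired into every backend (pandas, Arrow, Dask, DuckDB, Ibis, Polars, Ray, Spark), backed by new `pa_table_as_array` / `pa_table_as_dicts` helpers in `fugue/dataframe/utils.py`. A minimal usage sketch inferred from the new test cases further below; the `fi` alias for `fugue.api` follows the test suite's convention, and the sample data is illustrative:

import pandas as pd
import fugue.api as fi

# illustrative single-row frame; the fugue.api functions accept any supported backend
df = pd.DataFrame([[pd.Timestamp("2020-01-01"), 1]], columns=["a", "b"])

print(fi.as_dicts(df))               # one dict per row
print(fi.as_dicts(df, ["b"]))        # column subset, per the new tests: [{'b': 1}]
for row in fi.as_dict_iterable(df):  # lazy row-by-row variant
    print(row)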
fugue_spark/dataframe.py
CHANGED
@@ -14,9 +14,14 @@ from fugue.dataframe import (
     IterableDataFrame,
     LocalBoundedDataFrame,
 )
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
     as_pandas,
     count,
@@ -152,23 +157,22 @@ class SparkDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        ... (removed line not captured in this view)
-        return sdf.as_local().as_array(type_safe=type_safe)
+        return _spark_as_array(self.native, columns=columns, type_safe=type_safe)

     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        ... (11 removed lines not captured in this view)
+        yield from _spark_as_array_iterable(
+            self.native, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _spark_as_dicts(self.native, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _spark_as_dict_iterable(self.native, columns=columns)

     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -288,6 +292,58 @@ def _spark_df_head(
     return SparkDataFrame(res).as_local() if as_fugue else to_pandas(res)


+@as_array.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_array(to_arrow(_df), columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    if is_spark_connect(df):  # pragma: no cover
+        yield from _spark_as_array(df, columns, type_safe=type_safe)
+    else:
+        assert_or_throw(
+            columns is None or len(columns) > 0, ValueError("empty columns")
+        )
+        _df = df if columns is None or len(columns) == 0 else df[columns]
+        if not type_safe:
+            for row in to_type_safe_input(
+                _df.rdd.toLocalIterator(), to_schema(_df.schema)
+            ):
+                yield list(row)
+        else:
+            tdf = IterableDataFrame(
+                _spark_as_array_iterable(_df, type_safe=False), to_schema(_df.schema)
+            )
+            yield from tdf.as_array_iterable(type_safe=True)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dicts(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_dicts(to_arrow(_df), columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dict_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    cols = list(_df.columns)
+    for row in _spark_as_array_iterable(_df, type_safe=type_safe):
+        yield dict(zip(cols, row))
+
+
 def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame:
     cols: List[ps.Column] = []
     for f in df.schema:
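
Each `_spark_as_*` function registers itself as a backend candidate for the corresponding `fugue.plugins` entry point: the lambda handed to `.candidate(...)` claims a call whenever its first argument is a Spark DataFrame. For readers unfamiliar with the pattern, here is a self-contained toy sketch of how such a conditional dispatcher works; it is illustrative only, not fugue's actual implementation (which comes from the triad library):

from typing import Any, Callable, List, Tuple

class ConditionalDispatcher:
    # Candidates are tried in registration order; the first whose matcher
    # returns True handles the call.
    def __init__(self) -> None:
        self._candidates: List[Tuple[Callable[..., bool], Callable[..., Any]]] = []

    def candidate(self, matcher: Callable[..., bool]):
        def deco(func: Callable[..., Any]) -> Callable[..., Any]:
            self._candidates.append((matcher, func))
            return func
        return deco

    def __call__(self, *args: Any, **kwargs: Any) -> Any:
        for matcher, func in self._candidates:
            if matcher(*args, **kwargs):
                return func(*args, **kwargs)
        raise NotImplementedError("no registered backend matches these arguments")

as_dicts = ConditionalDispatcher()

@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, list))
def _rows_as_dicts(df, columns=None):
    # toy backend: treat df as a list of (key, value)-pair rows
    return [dict(row) for row in df]

print(as_dicts([[("a", 1), ("b", 2)]]))  # [{'a': 1, 'b': 2}]

The payoff of this design is that a backend package can add support for a core API simply by being imported and registering a candidate; the core never has to import pyspark.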
fugue_test/dataframe_suite.py
CHANGED
@@ -208,8 +208,22 @@ class DataFrameTests(object):
     def test_as_dict_iterable(self):
         df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
         assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
         df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
         assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
+
+    def test_as_dicts(self):
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(a=None, b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(a=datetime(2020, 1, 1), b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])

     def test_list_type(self):
         data = [[[30, 40]]]
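
Note that the Spark candidates above (and, per the file list, their Ray, Dask, and DuckDB counterparts) funnel through Arrow: convert the native frame with `to_arrow`, then flatten it with the new `pa_table_as_array` / `pa_table_as_dicts` helpers. The helpers' bodies are not part of this excerpt; the following is only a plausible sketch of their behavior, inferred from the call sites and built on public pyarrow APIs (the real implementations in `fugue/dataframe/utils.py` may differ):

from typing import Any, Dict, List, Optional

import pyarrow as pa

def pa_table_as_dicts(
    tb: pa.Table, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    if columns is not None and len(columns) > 0:
        tb = tb.select(columns)  # keep only the requested columns
    return tb.to_pylist()  # one dict per row, keys in schema order

def pa_table_as_array(
    tb: pa.Table, columns: Optional[List[str]] = None
) -> List[Any]:
    # row-major list of lists, relying on dicts preserving schema column order
    return [list(d.values()) for d in pa_table_as_dicts(tb, columns)]

Going through Arrow also normalizes nulls, which is why the tests above expect `pd.NaT` to come back as `None`.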