fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
fugue_spark/dataframe.py CHANGED
@@ -14,9 +14,14 @@ from fugue.dataframe import (
     IterableDataFrame,
     LocalBoundedDataFrame,
 )
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
     as_pandas,
     count,
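
The names newly imported from fugue.plugins are conditional dispatchers: a backend opts in by registering a candidate whose matcher decides when its implementation applies, which is exactly what the new _spark_as_* functions later in this diff do. A minimal sketch of the pattern, using a hypothetical FakeBackendDF stand-in (not part of fugue):

from typing import Any, Dict, List, Optional

from fugue.plugins import as_dicts


class FakeBackendDF:  # hypothetical backend-native frame, for illustration only
    def __init__(self, rows: List[Dict[str, Any]]):
        self.rows = rows


@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, FakeBackendDF))
def _fake_as_dicts(
    df: FakeBackendDF, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    # project to the requested columns, mirroring the Spark candidates in this diff
    if columns is None or len(columns) == 0:
        return list(df.rows)
    return [{k: r[k] for k in columns} for r in df.rows]


# as_dicts(FakeBackendDF([{"a": 1, "b": 2}]), ["b"]) would then return [{"b": 2}]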
@@ -152,23 +157,22 @@ class SparkDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        sdf = self._select_columns(columns)
-        return sdf.as_local().as_array(type_safe=type_safe)
+        return _spark_as_array(self.native, columns=columns, type_safe=type_safe)

     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if is_spark_connect(self.native):  # pragma: no cover
-            yield from self.as_array(columns, type_safe=type_safe)
-            return
-        sdf = self._select_columns(columns)
-        if not type_safe:
-            for row in to_type_safe_input(sdf.native.rdd.toLocalIterator(), sdf.schema):
-                yield row
-        else:
-            df = IterableDataFrame(sdf.as_array_iterable(type_safe=False), sdf.schema)
-            for row in df.as_array_iterable(type_safe=True):
-                yield row
+        yield from _spark_as_array_iterable(
+            self.native, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _spark_as_dicts(self.native, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _spark_as_dict_iterable(self.native, columns=columns)

     def head(
         self, n: int, columns: Optional[List[str]] = None
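
For orientation, a short usage sketch of the new methods (assumes a local SparkSession; SparkDataFrame is the wrapper class defined in this file):

from pyspark.sql import SparkSession

from fugue_spark.dataframe import SparkDataFrame

spark = SparkSession.builder.master("local[*]").getOrCreate()
sdf = spark.createDataFrame([(1, "x"), (2, "y")], "a long, b string")
df = SparkDataFrame(sdf)

df.as_dicts(["a"])                 # [{'a': 1}, {'a': 2}]
next(iter(df.as_dict_iterable()))  # {'a': 1, 'b': 'x'}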
@@ -288,6 +292,58 @@ def _spark_df_head(
     return SparkDataFrame(res).as_local() if as_fugue else to_pandas(res)


+@as_array.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_array(to_arrow(_df), columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    if is_spark_connect(df):  # pragma: no cover
+        yield from _spark_as_array(df, columns, type_safe=type_safe)
+    else:
+        assert_or_throw(
+            columns is None or len(columns) > 0, ValueError("empty columns")
+        )
+        _df = df if columns is None or len(columns) == 0 else df[columns]
+        if not type_safe:
+            for row in to_type_safe_input(
+                _df.rdd.toLocalIterator(), to_schema(_df.schema)
+            ):
+                yield list(row)
+        else:
+            tdf = IterableDataFrame(
+                _spark_as_array_iterable(_df, type_safe=False), to_schema(_df.schema)
+            )
+            yield from tdf.as_array_iterable(type_safe=True)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dicts(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_dicts(to_arrow(_df), columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dict_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    cols = list(_df.columns)
+    for row in _spark_as_array_iterable(_df, type_safe=type_safe):
+        yield dict(zip(cols, row))
+
+
 def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame:
     cols: List[ps.Column] = []
     for f in df.schema:
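
The pa_table_as_array and pa_table_as_dicts helpers come from fugue.dataframe.utils and their bodies are not part of this diff; the sketch below is an assumption about their behavior, inferred only from how the candidates above call them and written against plain pyarrow:

from typing import Any, Dict, List, Optional

import pyarrow as pa


def _table_as_dicts_sketch(
    tb: pa.Table, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    # assumed behavior: optional column projection, then one dict per row
    if columns is not None and len(columns) > 0:
        tb = tb.select(columns)
    return tb.to_pylist()


def _table_as_array_sketch(
    tb: pa.Table, columns: Optional[List[str]] = None
) -> List[List[Any]]:
    # assumed behavior: same projection, rows as lists in column order
    return [list(d.values()) for d in _table_as_dicts_sketch(tb, columns)]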
@@ -208,8 +208,22 @@ class DataFrameTests(object):
     def test_as_dict_iterable(self):
         df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
         assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
         df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
         assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
+
+    def test_as_dicts(self):
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(a=None, b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(a=datetime(2020, 1, 1), b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])

     def test_list_type(self):
         data = [[[30, 40]]]
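
The pd.NaT -> None expectation in these tests matches Arrow's null semantics, which the new Spark candidates go through via to_arrow; a self-contained illustration using only pandas and pyarrow:

import pandas as pd
import pyarrow as pa

pdf = pd.DataFrame({"a": [pd.NaT], "b": [1]})
# NaT becomes an Arrow null, which surfaces as None in Python
pa.Table.from_pandas(pdf).to_pylist()  # [{'a': None, 'b': 1}]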