fugue 0.8.7.dev4__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -14,22 +14,32 @@ from triad.utils.pyarrow import (
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
     is_df,
+    is_empty,
     rename,
     select_columns,
 )
 from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.dataset.api import (
     as_local,
     as_local_bounded,
     count,
     get_num_partitions,
     is_bounded,
-    is_empty,
     is_local,
 )
 from fugue.exceptions import FugueDataFrameOperationError
@@ -52,7 +62,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
-            self._native: pa.Table = build_empty_pl(schema)
+            self._native: pl.DataFrame = build_empty_pl(schema)
             super().__init__(schema)
             return
         else:
@@ -73,7 +83,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     @property
     def empty(self) -> bool:
-        return self._native.shape[0] == 0
+        return self._native.is_empty()
 
     def peek_array(self) -> List[Any]:
         self.assert_not_empty()
@@ -118,26 +128,20 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        tdf = self.native
-        if columns is not None:
-            tdf = tdf.select(columns)
-        return [list(row) for row in tdf.rows()]
+        return _pl_as_array(self.native, columns=columns)
 
     def as_array_iterable(
        self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
-                columns=columns
-            )
+        yield from _pl_as_array_iterable(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _pl_as_dicts(self.native, columns=columns)
 
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
-                columns=columns
-            )
+        yield from _pl_as_dict_iterable(self.native, columns=columns)
 
 
 @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -174,7 +178,7 @@ def _pl_is_bounded(df: pl.DataFrame) -> bool:
 
 @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_empty(df: pl.DataFrame) -> bool:
-    return df.shape[0] == 0
+    return df.is_empty()
 
 
 @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -228,6 +232,39 @@ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
     return df.select(columns)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    adf = _pl_as_arrow(_df)
+    return pa_table_as_array(adf, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_array_iterable(_df.to_arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dicts(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    return pa_table_as_dicts(_df.to_arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dict_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_dict_iterable(_df.to_arrow(), columns=columns)
+
+
 def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.schema.keys()]
     if len(missing) > 0:
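The net effect of the Polars changes above is that the generic conversion functions in fugue.api now dispatch directly on a pl.DataFrame (and is_empty is now backed by pl.DataFrame.is_empty). A minimal usage sketch, assuming the Polars backend is registered (e.g. fugue[polars] is installed); the sample data and expected outputs are illustrative, not taken from the diff:

```python
import polars as pl
import fugue.api as fa

# illustrative data; any pl.DataFrame works the same way
df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

fa.is_empty(df)         # False, via pl.DataFrame.is_empty()
fa.as_array(df)         # expected: [[1, "x"], [2, "y"]]
fa.as_dicts(df, ["b"])  # expected: [{"b": "x"}, {"b": "y"}]
```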
fugue_ray/dataframe.py CHANGED
@@ -4,14 +4,22 @@ import pandas as pd
 import pyarrow as pa
 import ray
 import ray.data as rd
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pyarrow import cast_pa_table
 
 from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     get_column_names,
     get_num_partitions,
     is_df,
@@ -141,13 +149,11 @@ class RayDataFrame(DataFrame):
     def _select_cols(self, cols: List[Any]) -> DataFrame:
         if cols == self.columns:
             return self
-        rdf = self.native.map_batches(
-            lambda b: b.select(cols),
-            batch_format="pyarrow",
-            **_ZERO_COPY,
-            **self._remote_args(),
+        return RayDataFrame(
+            self.native.select_columns(cols),
+            self.schema.extract(cols),
+            internal_schema=True,
         )
-        return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
 
     def peek_array(self) -> List[Any]:
         data = self.native.limit(1).to_pandas().values.tolist()
@@ -164,10 +170,10 @@
         return self.native.count()
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return pa.concat_tables(_get_arrow_tables(self.native))
+        return _rd_as_arrow(self.native)
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.as_arrow().to_pandas()
+        return _rd_as_pandas(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -201,18 +207,20 @@
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        df: DataFrame = self
-        if columns is not None:
-            df = df[columns]
-        adf = df.as_arrow()
-        if adf.shape[0] == 0:
-            return []
-        return ArrowDataFrame(adf).as_array(type_safe=type_safe)
+        return _rd_as_array(self.native, columns, type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.as_array(columns=columns, type_safe=type_safe)
+        yield from _rd_as_array_iterable(self.native, columns, type_safe)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _rd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _rd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -259,8 +267,8 @@ def _rd_num_partitions(df: rd.Dataset) -> int:
 
 
 @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset))
-def _rd_as_local(df: rd.Dataset) -> bool:
-    return pa.concat_tables(_get_arrow_tables(df))
+def _rd_as_local(df: rd.Dataset) -> pa.Table:
+    return _rd_as_arrow(df)
 
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
@@ -290,10 +298,54 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     )
 
 
+@as_pandas.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_pandas(df: rd.Dataset) -> pd.DataFrame:
+    return _rd_as_arrow(df).to_pandas()
+
+
+@as_arrow.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_arrow(df: rd.Dataset) -> pa.Table:
+    return pa.concat_tables(_get_arrow_tables(df))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_array(adf)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from _rd_as_array(df, columns, type_safe)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dicts(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_dicts(adf)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dict_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    yield from _rd_as_dicts(df, columns, type_safe)
+
+
 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
     last_empty: Any = None
     empty = True
-    for block in df.get_internal_block_refs():
+    for block in df.to_arrow_refs():
         tb = ray.get(block)
         if tb.shape[0] > 0:
             yield tb
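For the Ray backend the same fugue.api entry points now accept a bare rd.Dataset, with column pruning pushed into Dataset.select_columns and Arrow collection switched from get_internal_block_refs to to_arrow_refs. A rough usage sketch, assuming a running Ray session and the fugue Ray extra; the data and expected outputs are illustrative:

```python
import ray.data as rd
import fugue.api as fa

ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])  # illustrative data

fa.as_arrow(ds)         # one pa.Table concatenated from the dataset's Arrow refs
fa.as_array(ds)         # expected: [[1, "x"], [2, "y"]]
fa.as_dicts(ds, ["a"])  # expected: [{"a": 1}, {"a": 2}]
```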
@@ -1,5 +1,6 @@
 import pickle
-from typing import Any, Iterable, List, Tuple
+from typing import Any, Iterable, List, Tuple, Optional
+
 
 import pandas as pd
 import pyarrow as pa
@@ -16,7 +17,7 @@ from pyspark.sql.pandas.types import (
 )
 from triad.collections import Schema
 from triad.utils.assertion import assert_arg_not_none, assert_or_throw
-from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP
+from triad.utils.pyarrow import TRIAD_DEFAULT_TIMESTAMP, cast_pa_table
 from triad.utils.schema import quote_name
 
 import fugue.api as fa
@@ -41,7 +42,7 @@ def pandas_udf_can_accept(schema: Schema, is_input: bool) -> bool:
             return False
         to_arrow_schema(from_arrow_schema(schema.pa_schema))
         return True
-    except Exception:
+    except Exception:  # pragma: no cover
         return False
 
 
@@ -132,7 +133,7 @@ def to_type_safe_input(rows: Iterable[ps.Row], schema: Schema) -> Iterable[List[
                 if r[i] is not None:
                     r[i] = r[i].asDict(recursive=True)
             yield r
-    else:
+    else:  # pragma: no cover
         for row in rows:
             data = row.asDict(recursive=True)
             r = [data[n] for n in schema.names]
@@ -173,14 +174,14 @@ def pd_to_spark_df(
 
 
 def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
-    if pd.__version__ < "2" or not any(
+    if version.parse(pd.__version__) < version.parse("2.0.0") or not any(
         isinstance(x.dataType, (pt.TimestampType, TimestampNTZType))
         for x in df.schema.fields
     ):
         return df.toPandas()
-    else:
+    else:  # pragma: no cover
 
-        def serialize(dfs):  # pragma: no cover
+        def serialize(dfs):
             for df in dfs:
                 data = pickle.dumps(df)
                 yield pd.DataFrame([[data]], columns=["data"])
@@ -189,6 +190,30 @@ def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
         return pd.concat(pickle.loads(x.data) for x in sdf.collect())
 
 
+def to_arrow(df: ps.DataFrame) -> pa.Table:
+    schema = to_schema(df.schema)
+    destruct: Optional[bool] = None
+    try:
+        jconf = df.sparkSession._jconf
+        if jconf.arrowPySparkEnabled() and pandas_udf_can_accept(
+            schema, is_input=False
+        ):
+            destruct = jconf.arrowPySparkSelfDestructEnabled()
+    except Exception:  # pragma: no cover
+        # older spark does not have this config
+        pass
+    if destruct is not None and hasattr(df, "_collect_as_arrow"):
+        batches = df._collect_as_arrow(split_batches=destruct)
+        if len(batches) == 0:
+            return schema.create_empty_arrow_table()
+        table = pa.Table.from_batches(batches)
+        del batches
+        return cast_pa_table(table, schema.pa_schema)
+    else:  # pragma: no cover
+        # df.toPandas has bugs on nested types
+        return pa.Table.from_pylist(df.collect(), schema=schema.pa_schema)
+
+
 # TODO: the following function always set nullable to true,
 # but should we use field.nullable?
 def _to_arrow_type(dt: pt.DataType) -> pa.DataType:
fugue_spark/_utils/io.py CHANGED
@@ -9,7 +9,7 @@ from triad.utils.assertion import assert_or_throw
 
 from fugue._utils.io import FileParser, save_df
 from fugue.collections.partition import PartitionSpec
-from fugue.dataframe import DataFrame
+from fugue.dataframe import DataFrame, PandasDataFrame
 from fugue_spark.dataframe import SparkDataFrame
 
 from .convert import to_schema, to_spark_schema
@@ -62,6 +62,8 @@ class SparkIO(object):
             writer.save(uri)
         else:
             ldf = df.as_local()
+            if isinstance(ldf, PandasDataFrame) and hasattr(ldf.native, "attrs"):
+                ldf.native.attrs = {}  # pragma: no cover
             save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
 
     def _get_writer(
fugue_spark/dataframe.py CHANGED
@@ -9,15 +9,21 @@ from triad.collections.schema import SchemaError
 from triad.utils.assertion import assert_or_throw
 
 from fugue.dataframe import (
-    ArrayDataFrame,
+    ArrowDataFrame,
     DataFrame,
     IterableDataFrame,
     LocalBoundedDataFrame,
-    PandasDataFrame,
 )
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     count,
     drop_columns,
     get_column_names,
@@ -31,7 +37,13 @@ from fugue.plugins import (
     select_columns,
 )
 
-from ._utils.convert import to_cast_expression, to_pandas, to_schema, to_type_safe_input
+from ._utils.convert import (
+    to_arrow,
+    to_cast_expression,
+    to_pandas,
+    to_schema,
+    to_type_safe_input,
+)
 from ._utils.misc import is_spark_connect, is_spark_dataframe
 
 
@@ -92,11 +104,7 @@ class SparkDataFrame(DataFrame):
         return True
 
     def as_local_bounded(self) -> LocalBoundedDataFrame:
-        if any(pa.types.is_nested(t) for t in self.schema.types):
-            data = list(to_type_safe_input(self.native.collect(), self.schema))
-            res: LocalBoundedDataFrame = ArrayDataFrame(data, self.schema)
-        else:
-            res = PandasDataFrame(self.as_pandas(), self.schema)
+        res = ArrowDataFrame(self.as_arrow())
         if self.has_metadata:
             res.reset_metadata(self.metadata)
         return res
@@ -128,7 +136,10 @@ class SparkDataFrame(DataFrame):
         return SparkDataFrame(self.native[schema.names])
 
     def as_pandas(self) -> pd.DataFrame:
-        return to_pandas(self.native)
+        return _spark_df_as_pandas(self.native)
+
+    def as_arrow(self, type_safe: bool = False) -> pa.Table:
+        return _spark_df_as_arrow(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -146,23 +157,22 @@
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        sdf = self._select_columns(columns)
-        return sdf.as_local().as_array(type_safe=type_safe)
+        return _spark_as_array(self.native, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if is_spark_connect(self.native):  # pragma: no cover
-            yield from self.as_array(columns, type_safe=type_safe)
-            return
-        sdf = self._select_columns(columns)
-        if not type_safe:
-            for row in to_type_safe_input(sdf.native.rdd.toLocalIterator(), sdf.schema):
-                yield row
-        else:
-            df = IterableDataFrame(sdf.as_array_iterable(type_safe=False), sdf.schema)
-            for row in df.as_array_iterable(type_safe=True):
-                yield row
+        yield from _spark_as_array_iterable(
+            self.native, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _spark_as_dicts(self.native, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _spark_as_dict_iterable(self.native, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -192,6 +202,16 @@ def _spark_is_df(df: ps.DataFrame) -> bool:
     return True
 
 
+@as_arrow.candidate(lambda df: isinstance(df, ps.DataFrame))
+def _spark_df_as_arrow(df: ps.DataFrame) -> pd.DataFrame:
+    return to_arrow(df)
+
+
+@as_pandas.candidate(lambda df: isinstance(df, ps.DataFrame))
+def _spark_df_as_pandas(df: ps.DataFrame) -> pd.DataFrame:
+    return to_pandas(df)
+
+
 @get_num_partitions.candidate(lambda df: is_spark_dataframe(df))
 def _spark_num_partitions(df: ps.DataFrame) -> int:
     return df.rdd.getNumPartitions()
@@ -272,6 +292,58 @@ def _spark_df_head(
     return SparkDataFrame(res).as_local() if as_fugue else to_pandas(res)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_array(to_arrow(_df), columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_array_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    if is_spark_connect(df):  # pragma: no cover
+        yield from _spark_as_array(df, columns, type_safe=type_safe)
+    else:
+        assert_or_throw(
+            columns is None or len(columns) > 0, ValueError("empty columns")
+        )
+        _df = df if columns is None or len(columns) == 0 else df[columns]
+        if not type_safe:
+            for row in to_type_safe_input(
+                _df.rdd.toLocalIterator(), to_schema(_df.schema)
+            ):
+                yield list(row)
+        else:
+            tdf = IterableDataFrame(
+                _spark_as_array_iterable(_df, type_safe=False), to_schema(_df.schema)
+            )
+            yield from tdf.as_array_iterable(type_safe=True)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dicts(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    return pa_table_as_dicts(to_arrow(_df), columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: is_spark_dataframe(df))
+def _spark_as_dict_iterable(
+    df: ps.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    cols = list(_df.columns)
+    for row in _spark_as_array_iterable(_df, type_safe=type_safe):
+        yield dict(zip(cols, row))
+
+
 def _rename_spark_dataframe(df: ps.DataFrame, names: Dict[str, Any]) -> ps.DataFrame:
     cols: List[ps.Column] = []
     for f in df.schema:
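Likewise for Spark: as_arrow, as_pandas, and the row/dict conversions become plugin candidates on a plain pyspark DataFrame, routed through the new to_arrow collection path when Arrow is enabled. A hedged sketch (requires an active SparkSession; the data and expected outputs are illustrative):

```python
import fugue.api as fa
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1, "x"), (2, "y")], ["a", "b"])  # illustrative data

fa.as_arrow(sdf)         # pa.Table, collected via _collect_as_arrow when possible
fa.as_dicts(sdf, ["b"])  # expected: [{"b": "x"}, {"b": "y"}]
```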
@@ -5,6 +5,7 @@ from uuid import uuid4
 import pandas as pd
 import pyarrow as pa
 import pyspark.sql as ps
+from py4j.protocol import Py4JError
 from pyspark import StorageLevel
 from pyspark.rdd import RDD
 from pyspark.sql import SparkSession
@@ -350,9 +351,12 @@ class SparkExecutionEngine(ExecutionEngine):
         self._spark_session = spark_session
         cf = dict(FUGUE_SPARK_DEFAULT_CONF)
         if not self.is_spark_connect:
-            cf.update(
-                {x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()}
-            )
+            try:
+                spark_conf = spark_session.sparkContext.getConf()
+                cf.update({x[0]: x[1] for x in spark_conf.getAll()})
+            except Py4JError:  # pragma: no cover
+                # edge case: https://github.com/fugue-project/fugue/issues/517
+                pass
         cf.update(ParamDict(conf))
         super().__init__(cf)
         self._lock = SerializableRLock()
@@ -1329,7 +1329,7 @@ class BuiltInTests(object):
                 [[datetime.date(2020, 1, 1), datetime.datetime(2020, 1, 2)]],
                 "a:date,b:datetime",
             )
-            b.assert_eq(a)
+            b.assert_eq(a, no_pandas=True)
             c = dag.df([["2020-01-01", "2020-01-01 00:00:00"]], "a:date,b:datetime")
             c.transform(T2).assert_eq(c)
             c.partition(by=["a"]).transform(T2).assert_eq(c)
@@ -208,8 +208,22 @@ class DataFrameTests(object):
     def test_as_dict_iterable(self):
         df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
         assert [dict(a=None, b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
         df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
         assert [dict(a=datetime(2020, 1, 1), b=1)] == list(fi.as_dict_iterable(df))
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == list(fi.as_dict_iterable(df, ["b"]))
+
+    def test_as_dicts(self):
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(a=None, b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.NaT, 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(a=datetime(2020, 1, 1), b=1)] == fi.as_dicts(df)
+        df = self.df([[pd.Timestamp("2020-01-01"), 1]], "a:datetime,b:int")
+        assert [dict(b=1)] == fi.as_dicts(df, ["b"])
 
     def test_list_type(self):
         data = [[[30, 40]]]