fugue-0.8.7.dev5-py3-none-any.whl → fugue-0.8.7.dev6-py3-none-any.whl

This diff compares the contents of two package versions publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
fugue_dask/dataframe.py CHANGED
@@ -3,20 +3,21 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
 import dask.dataframe as dd
 import pandas as pd
 import pyarrow as pa
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.pandas_like import PD_UTILS
 from triad.utils.pyarrow import cast_pa_table
 
-from fugue.dataframe import (
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, LocalBoundedDataFrame, PandasDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.pandas_dataframe import _pd_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
     count,
     drop_columns,
@@ -32,7 +33,7 @@ from fugue.plugins import (
 )
 
 from ._constants import FUGUE_DASK_USE_ARROW
-from ._utils import DASK_UTILS, get_default_partitions
+from ._utils import DASK_UTILS, collect, get_default_partitions
 
 
 class DaskDataFrame(DataFrame):
@@ -150,8 +151,16 @@ class DaskDataFrame(DataFrame):
         )
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        adf = pa.Table.from_pandas(self.native.compute().reset_index(drop=True))
-        return cast_pa_table(adf, self.schema.pa_schema)
+        schema = self.schema.pa_schema
+        return pa.concat_tables(
+            collect(
+                self.native,
+                lambda df: cast_pa_table(
+                    pa.Table.from_pandas(df.reset_index(drop=True), schema=schema),
+                    schema=schema,
+                ),
+            )
+        )
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
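Note: `collect` is imported above from `fugue_dask/_utils.py`, which is not part of this diff. A minimal sketch of what a partition-wise collect helper could look like, assuming it applies a function to each partition (as pandas) and computes all results in one pass; the real signature and implementation in `_utils.py` may differ:

```python
from typing import Any, Callable, List

import dask
import dask.dataframe as dd
import pandas as pd


def collect_sketch(df: dd.DataFrame, func: Callable[[pd.DataFrame], Any]) -> List[Any]:
    # to_delayed() yields one Delayed pandas DataFrame per partition;
    # wrap `func` lazily, then compute everything in a single pass
    tasks = [dask.delayed(func)(part) for part in df.to_delayed()]
    return list(dask.compute(*tasks))
```

With a helper like this, `as_arrow` above converts each partition to an Arrow table independently and concatenates the results, instead of materializing the whole frame as one pandas DataFrame first.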
@@ -170,17 +179,28 @@ class DaskDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        df: DataFrame = self
-        if columns is not None:
-            df = df[columns]
-        return ArrowDataFrame(df.as_pandas(), schema=df.schema).as_array(
-            type_safe=type_safe
-        )
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        res: List[List[Any]] = []
+        for x in chunks:
+            res += x
+        return res
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.as_array(columns=columns, type_safe=type_safe)
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        for x in chunks:
+            yield from x
+
+    def as_dicts(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> List[Dict[str, Any]]:
+        return _dd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _dd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
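A usage sketch of the new dict-oriented accessors (the two-column schema and data are illustrative):

```python
import dask.dataframe as dd
import pandas as pd

from fugue_dask import DaskDataFrame

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}), npartitions=2)
fdf = DaskDataFrame(ddf, "a:long,b:str")

fdf.as_dicts()                        # expected: [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
for row in fdf.as_dict_iterable(columns=["b"]):
    print(row)                        # {'b': 'x'} then {'b': 'y'}
```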
@@ -197,8 +217,11 @@ class DaskDataFrame(DataFrame):
             assert_arg_not_none(schema, "schema")
             return pdf, schema
         DASK_UTILS.ensure_compatible(pdf)
-        pschema = Schema(DASK_UTILS.to_schema(pdf))
-        if schema is None or pschema == schema:
+        # when pdf contains bytes or other object types and the schema expects str,
+        # there is no way to infer the real schema of pdf (pschema would report
+        # str instead of the real types), so we have to force cast to the schema
+        if schema is None:
+            pschema = Schema(DASK_UTILS.to_schema(pdf))
             return pdf, pschema.assert_not_empty()
         pdf = pdf[schema.assert_not_empty().names]
         return (
@@ -295,6 +318,48 @@ def _dd_head(
     return PandasDataFrame(res) if as_fugue else res
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    res: List[List[Any]] = []
+    for x in chunks:
+        res += x
+    return res
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    for x in chunks:
+        yield from x
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dicts(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    res: List[Dict[str, Any]] = []
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        res += x
+    return res
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dict_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        yield from x
+
+
 def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
@@ -303,3 +368,25 @@ def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
 
 def _adjust_df(res: dd.DataFrame, as_fugue: bool):
     return res if not as_fugue else DaskDataFrame(res)
+
+
+def _to_array_chunks(
+    df: dd.DataFrame,
+    columns: Optional[List[str]] = None,
+    type_safe: bool = False,
+    schema: Optional[Schema] = None,
+) -> Tuple[List[Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+
+    def _to_list(pdf: pd.DataFrame) -> List[Any]:
+        return list(
+            PD_UTILS.as_array_iterable(
+                pdf,
+                schema=None if schema is None else schema.pa_schema,
+                columns=columns,
+                type_safe=type_safe,
+            )
+        )
+
+    return collect(_df, _to_list)
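The `*.candidate` registrations above also make the top-level `fugue.api` functions dispatch directly on a native `dd.DataFrame`. A sketch, assuming the Dask plugin module has been imported (fugue normally wires this up through its plugin entry points):

```python
import dask.dataframe as dd
import pandas as pd

import fugue.api as fa
import fugue_dask  # noqa: F401  registers the Dask candidates shown above

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=2)
fa.as_array(ddf)   # [[1], [2]], routed to _dd_as_array
fa.as_dicts(ddf)   # [{'a': 1}, {'a': 2}], routed to _dd_as_dicts
```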
fugue_duckdb/dataframe.py CHANGED
@@ -3,21 +3,33 @@ from typing import Any, Dict, Iterable, List, Optional
 import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
-from triad import Schema
+from triad import Schema, assert_or_throw
 from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
+from fugue import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.arrow_dataframe import _pa_table_as_pandas
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_fugue_dataset,
     as_local_bounded,
     as_pandas,
+    drop_columns,
     get_column_names,
     get_num_partitions,
     get_schema,
     is_df,
+    select_columns,
 )
 
 from ._utils import encode_column_name, to_duck_type, to_pa_type
@@ -59,13 +71,10 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return len(self._rel)
 
     def _drop_cols(self, cols: List[str]) -> DataFrame:
-        cols = [col for col in self._rel.columns if col not in cols]
-        rel = self._rel.project(",".join(encode_column_name(n) for n in cols))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_drop_duckdb_columns(self._rel, cols))
 
     def _select_cols(self, keys: List[Any]) -> DataFrame:
-        rel = self._rel.project(",".join(encode_column_name(n) for n in keys))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_select_duckdb_columns(self._rel, keys))
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         _assert_no_missing(self._rel, columns.keys())
@@ -109,38 +118,29 @@ class DuckDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        if columns is not None:
-            return self[columns].as_array(type_safe=type_safe)
-        return self._fetchall(self._rel)
+        return _duck_as_array(self._rel, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if columns is not None:
-            yield from self[columns].as_array_iterable(type_safe=type_safe)
-        else:
-            yield from self._fetchall(self._rel)
+        yield from _duck_as_array_iterable(
+            self._rel, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _duck_as_dicts(self._rel, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _duck_as_dict_iterable(self._rel, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return ArrayDataFrame(self._fetchall(self._rel.limit(n)), schema=self.schema)
-
-    def _fetchall(self, rel: DuckDBPyRelation) -> List[List[Any]]:
-        map_pos = [i for i, t in enumerate(self.schema.types) if pa.types.is_map(t)]
-        if len(map_pos) == 0:
-            return [list(x) for x in rel.fetchall()]
-        else:
-
-            def to_list(row: Any) -> List[Any]:
-                res = list(row)
-                for p in map_pos:
-                    res[p] = list(zip(row[p]["key"], row[p]["value"]))
-                return res
-
-            return [to_list(x) for x in rel.fetchall()]
+        return ArrowDataFrame(_duck_as_arrow(self._rel.limit(n)))
 
 
 @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation))
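The removed `_fetchall` existed to repair MAP columns, which DuckDB's `fetchall` returns as a `{"key": [...], "value": [...]}` struct; the new code instead converts the relation to Arrow once and reuses the shared `pa_table_as_*` utilities. A minimal illustration of the new route (the query is only for demonstration):

```python
import duckdb

rel = duckdb.sql("SELECT 1 AS a, 'x' AS b UNION ALL SELECT 2, 'y'")
tbl = rel.arrow()                                   # single DuckDB -> Arrow conversion
rows = [list(r.values()) for r in tbl.to_pylist()]  # [[1, 'x'], [2, 'y']]
```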
@@ -186,6 +186,64 @@ def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
 
 
+@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _select_duckdb_columns(
+    df: DuckDBPyRelation, columns: List[Any]
+) -> DuckDBPyRelation:
+    if len(columns) == 0:
+        raise FugueDataFrameOperationError("must select at least one column")
+    _assert_no_missing(df, columns)
+    return df.project(",".join(encode_column_name(n) for n in columns))
+
+
+@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRelation:
+    # if len(columns) == 0:
+    #     return df
+    _columns = {c: 1 for c in columns}
+    cols = [col for col in df.columns if _columns.pop(col, None) is None]
+    assert_or_throw(
+        len(cols) > 0, FugueDataFrameOperationError("must keep at least one column")
+    )
+    assert_or_throw(
+        len(_columns) == 0,
+        FugueDataFrameOperationError(f"found nonexistent columns {_columns}"),
+    )
+    return df.project(",".join(encode_column_name(n) for n in cols))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_array(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return pa_table_as_array(df.arrow(), columns=columns)
+
+
+@as_array_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_array_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_dicts(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    return pa_table_as_dicts(df.arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_dict_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
+
+
 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
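With these candidates registered, `fugue.api` functions work directly on a `DuckDBPyRelation`. A sketch, assuming the DuckDB plugin module has been imported:

```python
import duckdb

import fugue.api as fa
import fugue_duckdb  # noqa: F401  registers the DuckDB candidates shown above

rel = duckdb.sql("SELECT 1 AS a, 'x' AS b")
fa.as_dicts(rel)               # [{'a': 1, 'b': 'x'}]
fa.select_columns(rel, ["a"])  # a DuckDBPyRelation containing only column a
fa.drop_columns(rel, ["b"])    # a DuckDBPyRelation without column b
```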
fugue_ibis/dataframe.py CHANGED
@@ -143,6 +143,19 @@ class IbisDataFrame(DataFrame):
             type_safe=type_safe
         )
 
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        if columns is not None:
+            return self[columns].as_dicts()
+        return self.as_local().as_dicts()
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        if columns is not None:
+            yield from self[columns].as_dict_iterable()
+        else:
+            yield from self._to_iterable_df(self._table).as_dict_iterable()
+
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
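The Ibis backend adds no native dict conversion; it narrows columns first so the projection is pushed into the backend, then delegates to the local (or iterable) representation. The same pushdown idea in plain ibis, as a hedged sketch (assuming a recent ibis with a default DuckDB backend available):

```python
import ibis

t = ibis.memtable({"a": [1, 2], "b": ["x", "y"]})
# selecting columns before execute() keeps the projection in the backend,
# mirroring the columns-first branch of IbisDataFrame.as_dicts above
rows = t[["a"]].execute().to_dict("records")  # [{'a': 1}, {'a': 2}]
```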
fugue_polars/polars_dataframe.py CHANGED
@@ -14,22 +14,32 @@ from triad.utils.pyarrow import (
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
     is_df,
+    is_empty,
     rename,
     select_columns,
 )
 from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.dataset.api import (
     as_local,
     as_local_bounded,
     count,
     get_num_partitions,
     is_bounded,
-    is_empty,
     is_local,
 )
 from fugue.exceptions import FugueDataFrameOperationError
@@ -52,7 +62,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
-            self._native: pa.Table = build_empty_pl(schema)
+            self._native: pl.DataFrame = build_empty_pl(schema)
             super().__init__(schema)
             return
         else:
@@ -73,7 +83,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     @property
     def empty(self) -> bool:
-        return self._native.shape[0] == 0
+        return self._native.is_empty()
 
     def peek_array(self) -> List[Any]:
         self.assert_not_empty()
@@ -118,26 +128,20 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        tdf = self.native
-        if columns is not None:
-            tdf = tdf.select(columns)
-        return [list(row) for row in tdf.rows()]
+        return _pl_as_array(self.native, columns=columns)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
-                columns=columns
-            )
+        yield from _pl_as_array_iterable(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _pl_as_dicts(self.native, columns=columns)
 
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
-                columns=columns
-            )
+        yield from _pl_as_dict_iterable(self.native, columns=columns)
 
 
 @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -174,7 +178,7 @@ def _pl_is_bounded(df: pl.DataFrame) -> bool:
 
 @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_empty(df: pl.DataFrame) -> bool:
-    return df.shape[0] == 0
+    return df.is_empty()
 
 
 @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -228,6 +232,39 @@ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
     return df.select(columns)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    adf = _pl_as_arrow(_df)
+    return pa_table_as_array(adf, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_array_iterable(_df.to_arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dicts(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    return pa_table_as_dicts(_df.to_arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dict_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_dict_iterable(_df.to_arrow(), columns=columns)
+
+
 def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.schema.keys()]
     if len(missing) > 0:
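A usage sketch for the Polars candidates, assuming the plugin module has been imported (the data is illustrative):

```python
import polars as pl

import fugue.api as fa
import fugue_polars  # noqa: F401  registers the Polars candidates shown above

df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
fa.as_dicts(df, columns=["a"])  # [{'a': 1}, {'a': 2}]
fa.is_empty(df)                 # False, now answered via pl.DataFrame.is_empty()
```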
fugue_ray/dataframe.py CHANGED
@@ -4,14 +4,22 @@ import pandas as pd
 import pyarrow as pa
 import ray
 import ray.data as rd
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pyarrow import cast_pa_table
 
 from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     get_column_names,
     get_num_partitions,
     is_df,
@@ -141,13 +149,11 @@ class RayDataFrame(DataFrame):
     def _select_cols(self, cols: List[Any]) -> DataFrame:
         if cols == self.columns:
             return self
-        rdf = self.native.map_batches(
-            lambda b: b.select(cols),
-            batch_format="pyarrow",
-            **_ZERO_COPY,
-            **self._remote_args(),
+        return RayDataFrame(
+            self.native.select_columns(cols),
+            self.schema.extract(cols),
+            internal_schema=True,
         )
-        return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
 
     def peek_array(self) -> List[Any]:
         data = self.native.limit(1).to_pandas().values.tolist()
@@ -164,10 +170,10 @@ class RayDataFrame(DataFrame):
         return self.native.count()
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return pa.concat_tables(_get_arrow_tables(self.native))
+        return _rd_as_arrow(self.native)
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.as_arrow().to_pandas()
+        return _rd_as_pandas(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
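`Dataset.select_columns` is the public Ray Data projection API, replacing the hand-rolled `map_batches` projection removed above. A minimal sketch of the underlying call (assuming Ray 2.x):

```python
import ray.data as rd

ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
ds.select_columns(["a"]).take_all()  # [{'a': 1}, {'a': 2}]
```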
@@ -201,18 +207,20 @@ class RayDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        df: DataFrame = self
-        if columns is not None:
-            df = df[columns]
-        adf = df.as_arrow()
-        if adf.shape[0] == 0:
-            return []
-        return ArrowDataFrame(adf).as_array(type_safe=type_safe)
+        return _rd_as_array(self.native, columns, type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.as_array(columns=columns, type_safe=type_safe)
+        yield from _rd_as_array_iterable(self.native, columns, type_safe)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _rd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _rd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -259,8 +267,8 @@ def _rd_num_partitions(df: rd.Dataset) -> int:
 
 
 @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset))
-def _rd_as_local(df: rd.Dataset) -> bool:
-    return pa.concat_tables(_get_arrow_tables(df))
+def _rd_as_local(df: rd.Dataset) -> pa.Table:
+    return _rd_as_arrow(df)
 
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
@@ -290,10 +298,54 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     )
 
 
+@as_pandas.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_pandas(df: rd.Dataset) -> pd.DataFrame:
+    return _rd_as_arrow(df).to_pandas()
+
+
+@as_arrow.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_arrow(df: rd.Dataset) -> pa.Table:
+    return pa.concat_tables(_get_arrow_tables(df))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_array(adf)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from _rd_as_array(df, columns, type_safe)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dicts(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_dicts(adf)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dict_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    yield from _rd_as_dicts(df, columns, type_safe)
+
+
 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
     last_empty: Any = None
     empty = True
-    for block in df.get_internal_block_refs():
+    for block in df.to_arrow_refs():
         tb = ray.get(block)
         if tb.shape[0] > 0:
             yield tb
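`to_arrow_refs` is the public Ray Data API that returns object references to Arrow tables, replacing the internal `get_internal_block_refs`. A minimal sketch of the consumption pattern used by `_get_arrow_tables` (assuming a local Ray runtime):

```python
import pyarrow as pa
import ray
import ray.data as rd

ray.init(ignore_reinit_error=True)
ds = rd.from_items([{"a": 1}, {"a": 2}])
tables = [ray.get(ref) for ref in ds.to_arrow_refs()]  # one pa.Table per block
print(pa.concat_tables(tables).to_pylist())            # [{'a': 1}, {'a': 2}]
```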