fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +84 -89
  3. fugue/api.py +1 -0
  4. fugue/dataframe/api.py +19 -2
  5. fugue/dataframe/arrow_dataframe.py +48 -11
  6. fugue/dataframe/dataframe.py +20 -2
  7. fugue/dataframe/function_wrapper.py +1 -1
  8. fugue/dataframe/iterable_dataframe.py +3 -0
  9. fugue/dataframe/pandas_dataframe.py +73 -0
  10. fugue/dataframe/utils.py +78 -25
  11. fugue/execution/execution_engine.py +1 -8
  12. fugue/execution/native_execution_engine.py +5 -11
  13. fugue/plugins.py +1 -0
  14. fugue/workflow/_checkpoint.py +9 -9
  15. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
  16. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
  17. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
  18. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
  19. fugue_dask/_io.py +22 -29
  20. fugue_dask/_utils.py +15 -2
  21. fugue_dask/dataframe.py +105 -18
  22. fugue_dask/execution_engine.py +5 -12
  23. fugue_duckdb/_io.py +21 -37
  24. fugue_duckdb/dataframe.py +87 -29
  25. fugue_duckdb/execution_engine.py +2 -7
  26. fugue_ibis/dataframe.py +13 -0
  27. fugue_ibis/execution_engine.py +1 -5
  28. fugue_polars/polars_dataframe.py +53 -16
  29. fugue_ray/_utils/io.py +15 -17
  30. fugue_ray/dataframe.py +71 -19
  31. fugue_spark/_utils/io.py +3 -5
  32. fugue_spark/dataframe.py +69 -13
  33. fugue_spark/execution_engine.py +2 -7
  34. fugue_test/builtin_suite.py +12 -12
  35. fugue_test/dataframe_suite.py +14 -0
  36. fugue_test/execution_suite.py +13 -18
  37. fugue_test/plugins/misc/__init__.py +2 -0
  38. fugue_test/plugins/misc/fixtures.py +18 -0
  39. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
  40. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue_duckdb/dataframe.py CHANGED
@@ -3,21 +3,33 @@ from typing import Any, Dict, Iterable, List, Optional
 import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
-from triad import Schema
+from triad import Schema, assert_or_throw
 from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import ArrayDataFrame, ArrowDataFrame, DataFrame, LocalBoundedDataFrame
+from fugue import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.arrow_dataframe import _pa_table_as_pandas
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_fugue_dataset,
     as_local_bounded,
     as_pandas,
+    drop_columns,
     get_column_names,
     get_num_partitions,
     get_schema,
     is_df,
+    select_columns,
 )
 
 from ._utils import encode_column_name, to_duck_type, to_pa_type
@@ -59,13 +71,10 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return len(self._rel)
 
     def _drop_cols(self, cols: List[str]) -> DataFrame:
-        cols = [col for col in self._rel.columns if col not in cols]
-        rel = self._rel.project(",".join(encode_column_name(n) for n in cols))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_drop_duckdb_columns(self._rel, cols))
 
     def _select_cols(self, keys: List[Any]) -> DataFrame:
-        rel = self._rel.project(",".join(encode_column_name(n) for n in keys))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_select_duckdb_columns(self._rel, keys))
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         _assert_no_missing(self._rel, columns.keys())
@@ -109,38 +118,29 @@ class DuckDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        if columns is not None:
-            return self[columns].as_array(type_safe=type_safe)
-        return self._fetchall(self._rel)
+        return _duck_as_array(self._rel, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if columns is not None:
-            yield from self[columns].as_array_iterable(type_safe=type_safe)
-        else:
-            yield from self._fetchall(self._rel)
+        yield from _duck_as_array_iterable(
+            self._rel, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _duck_as_dicts(self._rel, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _duck_as_dict_iterable(self._rel, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return ArrayDataFrame(self._fetchall(self._rel.limit(n)), schema=self.schema)
-
-    def _fetchall(self, rel: DuckDBPyRelation) -> List[List[Any]]:
-        map_pos = [i for i, t in enumerate(self.schema.types) if pa.types.is_map(t)]
-        if len(map_pos) == 0:
-            return [list(x) for x in rel.fetchall()]
-        else:
-
-            def to_list(row: Any) -> List[Any]:
-                res = list(row)
-                for p in map_pos:
-                    res[p] = list(zip(row[p]["key"], row[p]["value"]))
-                return res
-
-            return [to_list(x) for x in rel.fetchall()]
+        return ArrowDataFrame(_duck_as_arrow(self._rel.limit(n)))
 
 
 @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation))
@@ -186,6 +186,64 @@ def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
 
 
+@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _select_duckdb_columns(
+    df: DuckDBPyRelation, columns: List[Any]
+) -> DuckDBPyRelation:
+    if len(columns) == 0:
+        raise FugueDataFrameOperationError("must select at least one column")
+    _assert_no_missing(df, columns)
+    return df.project(",".join(encode_column_name(n) for n in columns))
+
+
+@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRelation:
+    # if len(columns) == 0:
+    #     return df
+    _columns = {c: 1 for c in columns}
+    cols = [col for col in df.columns if _columns.pop(col, None) is None]
+    assert_or_throw(
+        len(cols) > 0, FugueDataFrameOperationError("must keep at least one column")
+    )
+    assert_or_throw(
+        len(_columns) == 0,
+        FugueDataFrameOperationError("found nonexistent columns {_columns}"),
+    )
+    return df.project(",".join(encode_column_name(n) for n in cols))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_array(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return pa_table_as_array(df.arrow(), columns=columns)
+
+
+@as_array_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_array_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_dicts(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    return pa_table_as_dicts(df.arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_dict_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
+
+
 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
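
The hunks above register fugue plugin candidates (as_array, as_dicts, select_columns, drop_columns, and friends) directly for DuckDBPyRelation. Below is a minimal usage sketch, not part of the diff: the connection and sample data are made up, and it assumes a duckdb version whose con.sql() returns a DuckDBPyRelation and that the fugue_duckdb plugin is importable.

    import duckdb
    import fugue.api as fa
    import fugue_duckdb  # noqa: F401  ensure the candidates above are registered

    con = duckdb.connect()
    rel = con.sql("SELECT 1 AS a, 'x' AS b UNION ALL SELECT 2, 'y'")

    # dispatches to _duck_as_dicts -> pa_table_as_dicts(rel.arrow())
    fa.as_dicts(rel)                 # e.g. [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
    # dispatches to _duck_as_array with a column subset
    fa.as_array(rel, columns=["a"])  # e.g. [[1], [2]]
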
fugue_duckdb/execution_engine.py CHANGED
@@ -4,7 +4,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import duckdb
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import SerializableRLock
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
 from triad.utils.schema import quote_name
 
@@ -195,10 +194,6 @@ class DuckExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self._native_engine.log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self._native_engine.fs
-
     def create_default_sql_engine(self) -> SQLEngine:
         return DuckDBEngine(self)
 
@@ -488,7 +483,7 @@ class DuckExecutionEngine(ExecutionEngine):
         columns: Any = None,
         **kwargs: Any,
     ) -> LocalBoundedDataFrame:
-        dio = DuckDBIO(self.fs, self.connection)
+        dio = DuckDBIO(self.connection)
        return dio.load_df(path, format_hint, columns, **kwargs)
 
     def save_df(
@@ -504,7 +499,7 @@ class DuckExecutionEngine(ExecutionEngine):
         partition_spec = partition_spec or PartitionSpec()
         if not partition_spec.empty and not force_single:
             kwargs["partition_cols"] = partition_spec.partition_by
-        dio = DuckDBIO(self.fs, self.connection)
+        dio = DuckDBIO(self.connection)
         dio.save_df(_to_duck_df(self, df), path, format_hint, mode, **kwargs)
 
     def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame:
fugue_ibis/dataframe.py CHANGED
@@ -143,6 +143,19 @@ class IbisDataFrame(DataFrame):
             type_safe=type_safe
         )
 
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        if columns is not None:
+            return self[columns].as_dicts()
+        return self.as_local().as_dicts()
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        if columns is not None:
+            yield from self[columns].as_dict_iterable()
+        else:
+            yield from self._to_iterable_df(self._table).as_dict_iterable()
+
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
fugue_ibis/execution_engine.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Type
 
 import ibis
 from ibis import BaseBackend
-from triad import FileSystem, assert_or_throw
+from triad import assert_or_throw
 
 from fugue import StructuredRawSQL
 from fugue.bag import Bag, LocalBag
@@ -375,10 +375,6 @@ class IbisExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self.non_ibis_engine.log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self.non_ibis_engine.fs
-
     def get_current_parallelism(self) -> int:
         return self.non_ibis_engine.get_current_parallelism()
 
fugue_polars/polars_dataframe.py CHANGED
@@ -14,22 +14,32 @@ from triad.utils.pyarrow import (
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
     is_df,
+    is_empty,
     rename,
     select_columns,
 )
 from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.dataset.api import (
     as_local,
     as_local_bounded,
     count,
     get_num_partitions,
     is_bounded,
-    is_empty,
     is_local,
 )
 from fugue.exceptions import FugueDataFrameOperationError
@@ -52,7 +62,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
-            self._native: pa.Table = build_empty_pl(schema)
+            self._native: pl.DataFrame = build_empty_pl(schema)
             super().__init__(schema)
             return
         else:
@@ -73,7 +83,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     @property
     def empty(self) -> bool:
-        return self._native.shape[0] == 0
+        return self._native.is_empty()
 
     def peek_array(self) -> List[Any]:
         self.assert_not_empty()
@@ -118,26 +128,20 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
    ) -> List[Any]:
-        tdf = self.native
-        if columns is not None:
-            tdf = tdf.select(columns)
-        return [list(row) for row in tdf.rows()]
+        return _pl_as_array(self.native, columns=columns)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
-                columns=columns
-            )
+        yield from _pl_as_array_iterable(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _pl_as_dicts(self.native, columns=columns)
 
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-        if not self.empty:
-            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
-                columns=columns
-            )
+        yield from _pl_as_dict_iterable(self.native, columns=columns)
 
 
 @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -174,7 +178,7 @@ def _pl_is_bounded(df: pl.DataFrame) -> bool:
 
 @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_empty(df: pl.DataFrame) -> bool:
-    return df.shape[0] == 0
+    return df.is_empty()
 
 
 @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -228,6 +232,39 @@ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
     return df.select(columns)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    adf = _pl_as_arrow(_df)
+    return pa_table_as_array(adf, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_array_iterable(_df.to_arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dicts(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    return pa_table_as_dicts(_df.to_arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dict_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_dict_iterable(_df.to_arrow(), columns=columns)
+
+
 def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.schema.keys()]
     if len(missing) > 0:
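
fugue_polars now registers the same family of candidates for pl.DataFrame, so fugue.api can operate on a bare Polars frame. A minimal sketch, not part of the diff; the sample data is made up and it assumes the fugue_polars plugin is importable.

    import polars as pl
    import fugue.api as fa
    import fugue_polars  # noqa: F401  ensure the candidates above are registered

    pdf = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    fa.is_empty(pdf)                 # False, via _pl_is_empty
    fa.as_dicts(pdf, columns=["a"])  # e.g. [{'a': 1}, {'a': 2}]
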
fugue_ray/_utils/io.py CHANGED
@@ -4,23 +4,24 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pyarrow as pa
 import ray.data as rd
-from fugue import ExecutionEngine
-from fugue._utils.io import FileParser, save_df
-from fugue.collections.partition import PartitionSpec
-from fugue.dataframe import DataFrame
-from fugue_ray.dataframe import RayDataFrame
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
 from ray.data.datasource import FileExtensionFilter
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import exists, makedirs, rm
+
+from fugue import ExecutionEngine
+from fugue._utils.io import FileParser, save_df
+from fugue.collections.partition import PartitionSpec
+from fugue.dataframe import DataFrame
+from fugue_ray.dataframe import RayDataFrame
 
 
 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
         self._engine = engine
-        self._fs = engine.fs
         self._logger = engine.log
         self._loads: Dict[str, Callable[..., DataFrame]] = {
             "csv": self._load_csv,
@@ -49,7 +50,7 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.uri for f in fp]
+        files = [f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
 
     def save_df(
@@ -63,24 +64,21 @@ class RayIO(object):
         **kwargs: Any,
     ) -> None:
         partition_spec = partition_spec or PartitionSpec()
-        if self._fs.exists(uri):
+        if exists(uri):
             assert_or_throw(mode == "overwrite", FileExistsError(uri))
             try:
-                self._fs.remove(uri)
-            except Exception:
-                try:
-                    self._fs.removetree(uri)
-                except Exception:  # pragma: no cover
-                    pass
+                rm(uri, recursive=True)
+            except Exception:  # pragma: no cover
+                pass
         p = FileParser(uri, format_hint)
         if not force_single:
             df = self._prepartition(df, partition_spec=partition_spec)
 
-            self._saves[p.file_format](df=df, uri=p.uri, **kwargs)
+            self._saves[p.file_format](df=df, uri=p.path, **kwargs)
         else:
             ldf = df.as_local()
-            self._fs.makedirs(os.path.dirname(uri), recreate=True)
-            save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
+            makedirs(os.path.dirname(uri), exist_ok=True)
+            save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)
 
     def _save_parquet(
         self,
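
The fs-based file operations in RayIO are replaced by plain functions from triad.utils.io. A rough before/after sketch, not part of the diff; the path is hypothetical and the mapping follows the calls shown in the hunk above.

    from triad.utils.io import exists, makedirs, rm

    path = "/tmp/fugue_out"            # hypothetical output location
    if exists(path):                   # was: self._fs.exists(path)
        rm(path, recursive=True)       # was: self._fs.remove / self._fs.removetree
    makedirs(path, exist_ok=True)      # was: self._fs.makedirs(..., recreate=True)
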
fugue_ray/dataframe.py CHANGED
@@ -4,14 +4,22 @@ import pandas as pd
 import pyarrow as pa
 import ray
 import ray.data as rd
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pyarrow import cast_pa_table
 
 from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     get_column_names,
     get_num_partitions,
     is_df,
@@ -141,13 +149,11 @@ class RayDataFrame(DataFrame):
     def _select_cols(self, cols: List[Any]) -> DataFrame:
         if cols == self.columns:
             return self
-        rdf = self.native.map_batches(
-            lambda b: b.select(cols),
-            batch_format="pyarrow",
-            **_ZERO_COPY,
-            **self._remote_args(),
+        return RayDataFrame(
+            self.native.select_columns(cols),
+            self.schema.extract(cols),
+            internal_schema=True,
         )
-        return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
 
     def peek_array(self) -> List[Any]:
         data = self.native.limit(1).to_pandas().values.tolist()
@@ -164,10 +170,10 @@ class RayDataFrame(DataFrame):
         return self.native.count()
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return pa.concat_tables(_get_arrow_tables(self.native))
+        return _rd_as_arrow(self.native)
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.as_arrow().to_pandas()
+        return _rd_as_pandas(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -201,18 +207,20 @@ class RayDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        df: DataFrame = self
-        if columns is not None:
-            df = df[columns]
-        adf = df.as_arrow()
-        if adf.shape[0] == 0:
-            return []
-        return ArrowDataFrame(adf).as_array(type_safe=type_safe)
+        return _rd_as_array(self.native, columns, type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.as_array(columns=columns, type_safe=type_safe)
+        yield from _rd_as_array_iterable(self.native, columns, type_safe)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _rd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _rd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -259,8 +267,8 @@ def _rd_num_partitions(df: rd.Dataset) -> int:
 
 
 @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset))
-def _rd_as_local(df: rd.Dataset) -> bool:
-    return pa.concat_tables(_get_arrow_tables(df))
+def _rd_as_local(df: rd.Dataset) -> pa.Table:
+    return _rd_as_arrow(df)
 
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
@@ -290,10 +298,54 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     )
 
 
+@as_pandas.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_pandas(df: rd.Dataset) -> pd.DataFrame:
+    return _rd_as_arrow(df).to_pandas()
+
+
+@as_arrow.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_arrow(df: rd.Dataset) -> pa.Table:
+    return pa.concat_tables(_get_arrow_tables(df))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_array(adf)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from _rd_as_array(df, columns, type_safe)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dicts(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_dicts(adf)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dict_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    yield from _rd_as_dicts(df, columns, type_safe)
+
+
 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
     last_empty: Any = None
     empty = True
-    for block in df.get_internal_block_refs():
+    for block in df.to_arrow_refs():
         tb = ray.get(block)
         if tb.shape[0] > 0:
             yield tb
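
The rd.Dataset candidates above let fugue.api functions accept a Ray Dataset directly. A minimal sketch, not part of the diff; it assumes a local Ray runtime, made-up data, and that the fugue_ray plugin is importable.

    import ray.data as rd
    import fugue.api as fa
    import fugue_ray  # noqa: F401  ensure the candidates above are registered

    ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

    fa.as_arrow(ds)               # one pyarrow.Table, via _rd_as_arrow
    fa.as_dicts(ds, columns=["a"])  # e.g. [{'a': 1}, {'a': 2}]
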
fugue_spark/_utils/io.py CHANGED
@@ -4,7 +4,6 @@ import pyspark.sql as ps
 from pyspark.sql import SparkSession
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
 
 from fugue._utils.io import FileParser, save_df
@@ -16,9 +15,8 @@ from .convert import to_schema, to_spark_schema
 
 
 class SparkIO(object):
-    def __init__(self, spark_session: SparkSession, fs: FileSystem):
+    def __init__(self, spark_session: SparkSession):
         self._session = spark_session
-        self._fs = fs
         self._loads: Dict[str, Callable[..., DataFrame]] = {
             "csv": self._load_csv,
             "parquet": self._load_parquet,
@@ -41,7 +39,7 @@ class SparkIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.uri for f in fp]
+        files = [f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
 
     def save_df(
@@ -64,7 +62,7 @@ class SparkIO(object):
         ldf = df.as_local()
         if isinstance(ldf, PandasDataFrame) and hasattr(ldf.native, "attrs"):
             ldf.native.attrs = {}  # pragma: no cover
-        save_df(ldf, uri, format_hint=format_hint, mode=mode, fs=self._fs, **kwargs)
+        save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)
 
     def _get_writer(
         self, sdf: ps.DataFrame, partition_spec: PartitionSpec
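
SparkIO drops its FileSystem parameter, in line with the removal of the engines' fs property elsewhere in this release. A minimal sketch of the new constructor, not part of the diff; the SparkSession setup is made up.

    from pyspark.sql import SparkSession
    from fugue_spark._utils.io import SparkIO

    spark = SparkSession.builder.master("local[*]").getOrCreate()
    sio = SparkIO(spark)  # previously: SparkIO(spark, fs)
    # sio.save_df(...) no longer forwards an fs object to fugue._utils.io.save_df
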