fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +84 -89
- fugue/api.py +1 -0
- fugue/dataframe/api.py +19 -2
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +78 -25
- fugue/execution/execution_engine.py +1 -8
- fugue/execution/native_execution_engine.py +5 -11
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +9 -9
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
- fugue_dask/_io.py +22 -29
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_dask/execution_engine.py +5 -12
- fugue_duckdb/_io.py +21 -37
- fugue_duckdb/dataframe.py +87 -29
- fugue_duckdb/execution_engine.py +2 -7
- fugue_ibis/dataframe.py +13 -0
- fugue_ibis/execution_engine.py +1 -5
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/_utils/io.py +15 -17
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/io.py +3 -5
- fugue_spark/dataframe.py +69 -13
- fugue_spark/execution_engine.py +2 -7
- fugue_test/builtin_suite.py +12 -12
- fugue_test/dataframe_suite.py +14 -0
- fugue_test/execution_suite.py +13 -18
- fugue_test/plugins/misc/__init__.py +2 -0
- fugue_test/plugins/misc/fixtures.py +18 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue_duckdb/dataframe.py
CHANGED
@@ -3,21 +3,33 @@ from typing import Any, Dict, Iterable, List, Optional
 import pandas as pd
 import pyarrow as pa
 from duckdb import DuckDBPyRelation
-from triad import Schema
+from triad import Schema, assert_or_throw
 from triad.utils.pyarrow import LARGE_TYPES_REPLACEMENT, replace_types_in_table
 
-from fugue import
+from fugue import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.arrow_dataframe import _pa_table_as_pandas
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_fugue_dataset,
     as_local_bounded,
     as_pandas,
+    drop_columns,
     get_column_names,
     get_num_partitions,
     get_schema,
     is_df,
+    select_columns,
 )
 
 from ._utils import encode_column_name, to_duck_type, to_pa_type
@@ -59,13 +71,10 @@ class DuckDataFrame(LocalBoundedDataFrame):
         return len(self._rel)
 
     def _drop_cols(self, cols: List[str]) -> DataFrame:
-
-        rel = self._rel.project(",".join(encode_column_name(n) for n in cols))
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_drop_duckdb_columns(self._rel, cols))
 
     def _select_cols(self, keys: List[Any]) -> DataFrame:
-
-        return DuckDataFrame(rel)
+        return DuckDataFrame(_select_duckdb_columns(self._rel, keys))
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         _assert_no_missing(self._rel, columns.keys())
@@ -109,38 +118,29 @@ class DuckDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        return self[columns].as_array(type_safe=type_safe)
-        return self._fetchall(self._rel)
+        return _duck_as_array(self._rel, columns=columns, type_safe=type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
+        yield from _duck_as_array_iterable(
+            self._rel, columns=columns, type_safe=type_safe
+        )
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _duck_as_dicts(self._rel, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _duck_as_dict_iterable(self._rel, columns=columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
         if columns is not None:
             return self[columns].head(n)
-        return
-
-    def _fetchall(self, rel: DuckDBPyRelation) -> List[List[Any]]:
-        map_pos = [i for i, t in enumerate(self.schema.types) if pa.types.is_map(t)]
-        if len(map_pos) == 0:
-            return [list(x) for x in rel.fetchall()]
-        else:
-
-            def to_list(row: Any) -> List[Any]:
-                res = list(row)
-                for p in map_pos:
-                    res[p] = list(zip(row[p]["key"], row[p]["value"]))
-                return res
-
-            return [to_list(x) for x in rel.fetchall()]
+        return ArrowDataFrame(_duck_as_arrow(self._rel.limit(n)))
 
 
 @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, DuckDBPyRelation))
@@ -186,6 +186,64 @@ def _get_duckdb_columns(df: DuckDBPyRelation) -> List[Any]:
     return list(df.columns)
 
 
+@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _select_duckdb_columns(
+    df: DuckDBPyRelation, columns: List[Any]
+) -> DuckDBPyRelation:
+    if len(columns) == 0:
+        raise FugueDataFrameOperationError("must select at least one column")
+    _assert_no_missing(df, columns)
+    return df.project(",".join(encode_column_name(n) for n in columns))
+
+
+@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRelation:
+    # if len(columns) == 0:
+    #     return df
+    _columns = {c: 1 for c in columns}
+    cols = [col for col in df.columns if _columns.pop(col, None) is None]
+    assert_or_throw(
+        len(cols) > 0, FugueDataFrameOperationError("must keep at least one column")
+    )
+    assert_or_throw(
+        len(_columns) == 0,
+        FugueDataFrameOperationError("found nonexistent columns {_columns}"),
+    )
+    return df.project(",".join(encode_column_name(n) for n in cols))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_array(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return pa_table_as_array(df.arrow(), columns=columns)
+
+
+@as_array_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_array_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
+    yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
+def _duck_as_dicts(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    return pa_table_as_dicts(df.arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(
+    lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation)
+)
+def _duck_as_dict_iterable(
+    df: DuckDBPyRelation, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
+
+
 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
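
The conversion helpers above are registered as candidates of fugue's conditional dispatchers, so the same logic is now reachable when a raw DuckDBPyRelation is passed to the fugue.api functions. A minimal sketch of that dispatch path, assuming duckdb and this wheel are installed (data and outputs are illustrative):

import duckdb
import fugue.api as fa

con = duckdb.connect()
rel = con.sql("SELECT 1 AS a, 'x' AS b")  # a DuckDBPyRelation

# These route to the _duck_as_* candidates above, which convert through
# rel.arrow() instead of the removed fetchall-based _fetchall helper.
print(fa.as_array(rel))   # expected: [[1, 'x']]
print(fa.as_dicts(rel))   # expected: [{'a': 1, 'b': 'x'}]
for row in fa.as_dict_iterable(rel):
    print(row)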
fugue_duckdb/execution_engine.py
CHANGED
@@ -4,7 +4,6 @@ from typing import Any, Dict, Iterable, List, Optional, Union
 import duckdb
 from duckdb import DuckDBPyConnection, DuckDBPyRelation
 from triad import SerializableRLock
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
 from triad.utils.schema import quote_name
 
@@ -195,10 +194,6 @@ class DuckExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self._native_engine.log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self._native_engine.fs
-
     def create_default_sql_engine(self) -> SQLEngine:
         return DuckDBEngine(self)
 
@@ -488,7 +483,7 @@ class DuckExecutionEngine(ExecutionEngine):
         columns: Any = None,
         **kwargs: Any,
     ) -> LocalBoundedDataFrame:
-        dio = DuckDBIO(self.
+        dio = DuckDBIO(self.connection)
         return dio.load_df(path, format_hint, columns, **kwargs)
 
     def save_df(
@@ -504,7 +499,7 @@ class DuckExecutionEngine(ExecutionEngine):
         partition_spec = partition_spec or PartitionSpec()
         if not partition_spec.empty and not force_single:
             kwargs["partition_cols"] = partition_spec.partition_by
-        dio = DuckDBIO(self.
+        dio = DuckDBIO(self.connection)
         dio.save_df(_to_duck_df(self, df), path, format_hint, mode, **kwargs)
 
     def convert_yield_dataframe(self, df: DataFrame, as_local: bool) -> DataFrame:
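
Both I/O paths now build DuckDBIO from the connection alone, and the engine no longer exposes a fs property. A hedged round-trip sketch through fugue.api that exercises the save_df/load_df methods above (path and data are illustrative, and the exact fa.save/fa.load signatures are assumed):

import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
with fa.engine_context("duckdb"):
    fa.save(df, "/tmp/demo.parquet")     # -> DuckExecutionEngine.save_df
    back = fa.load("/tmp/demo.parquet")  # -> DuckExecutionEngine.load_df
print(fa.as_array(back))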
fugue_ibis/dataframe.py
CHANGED
@@ -143,6 +143,19 @@ class IbisDataFrame(DataFrame):
             type_safe=type_safe
         )
 
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        if columns is not None:
+            return self[columns].as_dicts()
+        return self.as_local().as_dicts()
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        if columns is not None:
+            yield from self[columns].as_dict_iterable()
+        else:
+            yield from self._to_iterable_df(self._table).as_dict_iterable()
+
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
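
as_dicts and as_dict_iterable are part of a dict-oriented conversion surface this release adds across backends (see the fugue/dataframe entries in the file list above); the Ibis version prunes columns in the backend via self[columns] before collecting. A quick sketch of the same methods on a backend that needs no extra setup; PandasDataFrame is used purely for illustration:

import pandas as pd
from fugue import PandasDataFrame

fdf = PandasDataFrame(pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}))
print(fdf.as_dicts(columns=["a"]))    # pruning happens before conversion
print(list(fdf.as_dict_iterable()))   # row-by-row dict view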
fugue_ibis/execution_engine.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, List, Optional, Type
 
 import ibis
 from ibis import BaseBackend
-from triad import
+from triad import assert_or_throw
 
 from fugue import StructuredRawSQL
 from fugue.bag import Bag, LocalBag
@@ -375,10 +375,6 @@ class IbisExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self.non_ibis_engine.log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self.non_ibis_engine.fs
-
     def get_current_parallelism(self) -> int:
         return self.non_ibis_engine.get_current_parallelism()
 
fugue_polars/polars_dataframe.py
CHANGED
@@ -14,22 +14,32 @@ from triad.utils.pyarrow import (
 
 from fugue import ArrowDataFrame
 from fugue.api import (
+    as_array,
+    as_array_iterable,
     as_arrow,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
     is_df,
+    is_empty,
     rename,
     select_columns,
 )
 from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataframe.utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)
 from fugue.dataset.api import (
     as_local,
     as_local_bounded,
     count,
     get_num_partitions,
     is_bounded,
-    is_empty,
     is_local,
 )
 from fugue.exceptions import FugueDataFrameOperationError
@@ -52,7 +62,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     ):
         if df is None:
             schema = _input_schema(schema).assert_not_empty()
-            self._native:
+            self._native: pl.DataFrame = build_empty_pl(schema)
             super().__init__(schema)
             return
         else:
@@ -73,7 +83,7 @@ class PolarsDataFrame(LocalBoundedDataFrame):
 
     @property
     def empty(self) -> bool:
-        return self._native.
+        return self._native.is_empty()
 
     def peek_array(self) -> List[Any]:
         self.assert_not_empty()
@@ -118,26 +128,20 @@ class PolarsDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        if columns is not None:
-            tdf = tdf.select(columns)
-        return [list(row) for row in tdf.rows()]
+        return _pl_as_array(self.native, columns=columns)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
+        yield from _pl_as_array_iterable(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _pl_as_dicts(self.native, columns=columns)
 
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-
-        yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
-            columns=columns
-        )
+        yield from _pl_as_dict_iterable(self.native, columns=columns)
 
 
 @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -174,7 +178,7 @@ def _pl_is_bounded(df: pl.DataFrame) -> bool:
 
 @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
 def _pl_is_empty(df: pl.DataFrame) -> bool:
-    return df.
+    return df.is_empty()
 
 
 @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
@@ -228,6 +232,39 @@ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
     return df.select(columns)
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    adf = _pl_as_arrow(_df)
+    return pa_table_as_array(adf, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_array_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[List[Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_array_iterable(_df.to_arrow(), columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dicts(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    return pa_table_as_dicts(_df.to_arrow(), columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_dict_iterable(
+    df: pl.DataFrame, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    _df = df if columns is None else _select_pa_columns(df, columns)
+    yield from pa_table_as_dict_iterable(_df.to_arrow(), columns=columns)
+
+
 def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.schema.keys()]
     if len(missing) > 0:
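
The registration pattern here mirrors the DuckDB and Ray modules: each dispatcher exposed by fugue.plugins (and re-exported through fugue.api) accepts .candidate registrations guarded by a matcher lambda, and a matching candidate handles the call. A toy sketch of that mechanism with a made-up container type (MyRows is hypothetical):

from typing import Any, Dict, List, Optional

import fugue.api as fa
from fugue.plugins import as_dicts

class MyRows:
    def __init__(self, rows: List[Dict[str, Any]]):
        self.rows = rows

@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, MyRows))
def _my_as_dicts(
    df: MyRows, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    # keep only the requested columns, matching the as_dicts contract
    if columns is None:
        return df.rows
    return [{k: r[k] for k in columns} for r in df.rows]

print(fa.as_dicts(MyRows([{"a": 1}])))  # dispatches to _my_as_dicts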
fugue_ray/_utils/io.py
CHANGED
@@ -4,23 +4,24 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import pyarrow as pa
 import ray.data as rd
-from fugue import ExecutionEngine
-from fugue._utils.io import FileParser, save_df
-from fugue.collections.partition import PartitionSpec
-from fugue.dataframe import DataFrame
-from fugue_ray.dataframe import RayDataFrame
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
 from ray.data.datasource import FileExtensionFilter
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import exists, makedirs, rm
+
+from fugue import ExecutionEngine
+from fugue._utils.io import FileParser, save_df
+from fugue.collections.partition import PartitionSpec
+from fugue.dataframe import DataFrame
+from fugue_ray.dataframe import RayDataFrame
 
 
 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
         self._engine = engine
-        self._fs = engine.fs
         self._logger = engine.log
         self._loads: Dict[str, Callable[..., DataFrame]] = {
             "csv": self._load_csv,
@@ -49,7 +50,7 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.
+        files = [f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
 
     def save_df(
@@ -63,24 +64,21 @@ class RayIO(object):
         **kwargs: Any,
     ) -> None:
         partition_spec = partition_spec or PartitionSpec()
-        if
+        if exists(uri):
             assert_or_throw(mode == "overwrite", FileExistsError(uri))
             try:
-
-            except Exception:
-
-                    self._fs.removetree(uri)
-                except Exception:  # pragma: no cover
-                    pass
+                rm(uri, recursive=True)
+            except Exception:  # pragma: no cover
+                pass
         p = FileParser(uri, format_hint)
         if not force_single:
             df = self._prepartition(df, partition_spec=partition_spec)
-            self._saves[p.file_format](df=df, uri=p.
+            self._saves[p.file_format](df=df, uri=p.path, **kwargs)
         else:
             ldf = df.as_local()
-
-            save_df(ldf, uri, format_hint=format_hint, mode=mode,
+            makedirs(os.path.dirname(uri), exist_ok=True)
+            save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)
 
     def _save_parquet(
         self,
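
RayIO no longer caches an engine.fs filesystem object; it calls triad's path helpers directly, imported above from triad.utils.io. A short sketch of those helpers with an arbitrary local path (fsspec-style remote URIs should behave the same way):

from triad.utils.io import exists, makedirs, rm

path = "/tmp/fugue_demo/out.parquet"
makedirs("/tmp/fugue_demo", exist_ok=True)  # as used above for the parent dir
if exists(path):                            # replaces the old self._fs checks
    rm(path, recursive=True)                # replaces self._fs.removetree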
fugue_ray/dataframe.py
CHANGED
@@ -4,14 +4,22 @@ import pandas as pd
 import pyarrow as pa
 import ray
 import ray.data as rd
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pyarrow import cast_pa_table
 
 from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.utils import pa_table_as_array, pa_table_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_arrow,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
+    as_pandas,
     get_column_names,
     get_num_partitions,
     is_df,
@@ -141,13 +149,11 @@ class RayDataFrame(DataFrame):
     def _select_cols(self, cols: List[Any]) -> DataFrame:
         if cols == self.columns:
             return self
-
-
-
-
-            **self._remote_args(),
+        return RayDataFrame(
+            self.native.select_columns(cols),
+            self.schema.extract(cols),
+            internal_schema=True,
         )
-        return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
 
     def peek_array(self) -> List[Any]:
         data = self.native.limit(1).to_pandas().values.tolist()
@@ -164,10 +170,10 @@ class RayDataFrame(DataFrame):
         return self.native.count()
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-        return
+        return _rd_as_arrow(self.native)
 
     def as_pandas(self) -> pd.DataFrame:
-        return self.
+        return _rd_as_pandas(self.native)
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -201,18 +207,20 @@ class RayDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-        if columns is not None:
-            df = df[columns]
-        adf = df.as_arrow()
-        if adf.shape[0] == 0:
-            return []
-        return ArrowDataFrame(adf).as_array(type_safe=type_safe)
+        return _rd_as_array(self.native, columns, type_safe)
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-        yield from self.
+        yield from _rd_as_array_iterable(self.native, columns, type_safe)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return _rd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _rd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -259,8 +267,8 @@ def _rd_num_partitions(df: rd.Dataset) -> int:
 
 
 @as_local_bounded.candidate(lambda df: isinstance(df, rd.Dataset))
-def _rd_as_local(df: rd.Dataset) ->
-    return
+def _rd_as_local(df: rd.Dataset) -> pa.Table:
+    return _rd_as_arrow(df)
 
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
@@ -290,10 +298,54 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     )
 
 
+@as_pandas.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_pandas(df: rd.Dataset) -> pd.DataFrame:
+    return _rd_as_arrow(df).to_pandas()
+
+
+@as_arrow.candidate(lambda df: isinstance(df, rd.Dataset))
+def _rd_as_arrow(df: rd.Dataset) -> pa.Table:
+    return pa.concat_tables(_get_arrow_tables(df))
+
+
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_array(adf)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_array_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from _rd_as_array(df, columns, type_safe)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dicts(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df.select_columns(columns)
+    adf = _rd_as_arrow(_df)
+    return pa_table_as_dicts(adf)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
+def _rd_as_dict_iterable(
+    df: rd.Dataset, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    yield from _rd_as_dicts(df, columns, type_safe)
+
+
 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
     last_empty: Any = None
     empty = True
-    for block in df.
+    for block in df.to_arrow_refs():
         tb = ray.get(block)
         if tb.shape[0] > 0:
             yield tb
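
The new module-level helpers collect a Ray Dataset into a single Arrow table via to_arrow_refs() plus ray.get, and the candidate registrations expose the conversions through fugue.api as well. An illustrative sketch, assuming a running Ray session:

import ray
import ray.data as rd
import fugue.api as fa

ray.init(ignore_reinit_error=True)
ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])

tbl = fa.as_arrow(ds)                  # pa.concat_tables over the arrow refs
pdf = fa.as_pandas(ds)                 # arrow -> pandas via _rd_as_pandas
rows = fa.as_dicts(ds, columns=["a"])  # column pruning via select_columns
print(tbl.num_rows, len(pdf), rows)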
fugue_spark/_utils/io.py
CHANGED
@@ -4,7 +4,6 @@ import pyspark.sql as ps
 from pyspark.sql import SparkSession
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
 
 from fugue._utils.io import FileParser, save_df
@@ -16,9 +15,8 @@ from .convert import to_schema, to_spark_schema
 
 
 class SparkIO(object):
-    def __init__(self, spark_session: SparkSession
+    def __init__(self, spark_session: SparkSession):
         self._session = spark_session
-        self._fs = fs
         self._loads: Dict[str, Callable[..., DataFrame]] = {
             "csv": self._load_csv,
             "parquet": self._load_parquet,
@@ -41,7 +39,7 @@ class SparkIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.
+        files = [f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
 
     def save_df(
@@ -64,7 +62,7 @@ class SparkIO(object):
         ldf = df.as_local()
         if isinstance(ldf, PandasDataFrame) and hasattr(ldf.native, "attrs"):
             ldf.native.attrs = {}  # pragma: no cover
-        save_df(ldf, uri, format_hint=format_hint, mode=mode,
+        save_df(ldf, uri, format_hint=format_hint, mode=mode, **kwargs)
 
     def _get_writer(
         self, sdf: ps.DataFrame, partition_spec: PartitionSpec