fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue_polars/polars_dataframe.py
ADDED
@@ -0,0 +1,234 @@
+from typing import Any, Dict, Iterable, List, Optional
+
+import pandas as pd
+import polars as pl
+import pyarrow as pa
+from triad.collections.schema import Schema
+from triad.exceptions import InvalidOperationError
+from triad.utils.assertion import assert_or_throw
+from triad.utils.pyarrow import (
+    LARGE_TYPES_REPLACEMENT,
+    replace_types_in_schema,
+    replace_types_in_table,
+)
+
+from fugue import ArrowDataFrame
+from fugue.api import (
+    as_arrow,
+    drop_columns,
+    get_column_names,
+    get_schema,
+    is_df,
+    rename,
+    select_columns,
+)
+from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from fugue.dataset.api import (
+    as_local,
+    as_local_bounded,
+    count,
+    get_num_partitions,
+    is_bounded,
+    is_empty,
+    is_local,
+)
+from fugue.exceptions import FugueDataFrameOperationError
+
+from ._utils import build_empty_pl
+
+
+class PolarsDataFrame(LocalBoundedDataFrame):
+    """DataFrame that wraps :func:`pyarrow.Table <pa:pyarrow.table>`. Please also read
+    |DataFrameTutorial| to understand this Fugue concept
+
+    :param df: polars DataFrame or None, defaults to None
+    :param schema: |SchemaLikeObject|
+    """
+
+    def __init__(
+        self,
+        df: Optional[pl.DataFrame] = None,
+        schema: Any = None,
+    ):
+        if df is None:
+            schema = _input_schema(schema).assert_not_empty()
+            self._native: pa.Table = build_empty_pl(schema)
+            super().__init__(schema)
+            return
+        else:
+            assert_or_throw(
+                schema is None,
+                InvalidOperationError("can't reset schema for pl.DataFrame"),
+            )
+            self._native = df
+            super().__init__(_get_pl_schema(df))
+
+    @property
+    def native(self) -> pl.DataFrame:
+        """:func:`pyarrow.Table <pa:pyarrow.table>`"""
+        return self._native
+
+    def native_as_df(self) -> pl.DataFrame:
+        return self._native
+
+    @property
+    def empty(self) -> bool:
+        return self._native.shape[0] == 0
+
+    def peek_array(self) -> List[Any]:
+        self.assert_not_empty()
+        return list(self._native.row(0))
+
+    def peek_dict(self) -> Dict[str, Any]:
+        self.assert_not_empty()
+        return self._native.row(0, named=True)
+
+    def count(self) -> int:
+        return self.native.shape[0]
+
+    def as_pandas(self) -> pd.DataFrame:
+        return self.native.to_pandas()
+
+    def head(
+        self, n: int, columns: Optional[List[str]] = None
+    ) -> LocalBoundedDataFrame:
+        adf = self.native if columns is None else self.native.select(columns)
+        n = min(n, self.count())
+        if n == 0:
+            schema = self.schema if columns is None else self.schema.extract(columns)
+            return PolarsDataFrame(None, schema=schema)
+        return PolarsDataFrame(adf.head(n))
+
+    def _drop_cols(self, cols: List[str]) -> DataFrame:
+        return PolarsDataFrame(self.native.drop(cols))
+
+    def _select_cols(self, keys: List[Any]) -> DataFrame:
+        return PolarsDataFrame(self.native.select(keys))
+
+    def rename(self, columns: Dict[str, str]) -> DataFrame:
+        return PolarsDataFrame(_rename_pl_dataframe(self.native, columns))
+
+    def alter_columns(self, columns: Any) -> DataFrame:
+        adf = ArrowDataFrame(self.as_arrow()).alter_columns(columns)
+        return PolarsDataFrame(pl.from_arrow(adf.native))
+
+    def as_arrow(self, type_safe: bool = False) -> pa.Table:
+        return _pl_as_arrow(self.native)
+
+    def as_array(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> List[Any]:
+        tdf = self.native
+        if columns is not None:
+            tdf = tdf.select(columns)
+        return [list(row) for row in tdf.rows()]
+
+    def as_array_iterable(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> Iterable[Any]:
+        if not self.empty:
+            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
+                columns=columns
+            )
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        if not self.empty:
+            yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
+                columns=columns
+            )
+
+
+@as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_as_local(df: pl.DataFrame) -> pl.DataFrame:
+    return df
+
+
+@as_local_bounded.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_as_local_bounded(df: pl.DataFrame) -> pl.DataFrame:
+    return df
+
+
+@as_arrow.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_as_arrow(df: pl.DataFrame) -> pa.Table:
+    adf = df.to_arrow()
+    adf = replace_types_in_table(adf, LARGE_TYPES_REPLACEMENT)
+    return adf
+
+
+@is_df.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_is_df(df: pl.DataFrame) -> bool:
+    return True
+
+
+@count.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_count(df: pl.DataFrame) -> int:
+    return df.shape[0]
+
+
+@is_bounded.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_is_bounded(df: pl.DataFrame) -> bool:
+    return True
+
+
+@is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_is_empty(df: pl.DataFrame) -> bool:
+    return df.shape[0] == 0
+
+
+@is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_is_local(df: pl.DataFrame) -> bool:
+    return True
+
+
+@get_num_partitions.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _pl_get_num_partitions(df: pl.DataFrame) -> int:
+    return 1
+
+
+@get_column_names.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _get_pl_columns(df: pl.DataFrame) -> List[Any]:
+    return list(df.schema.keys())
+
+
+@get_schema.candidate(lambda df: isinstance(df, pl.DataFrame))
+def _get_pl_schema(df: pl.DataFrame) -> Schema:
+    adf = df.to_arrow()
+    schema = replace_types_in_schema(adf.schema, LARGE_TYPES_REPLACEMENT)
+    return Schema(schema)
+
+
+@rename.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _rename_pl_dataframe(df: pl.DataFrame, columns: Dict[str, Any]) -> pl.DataFrame:
+    if len(columns) == 0:
+        return df
+    assert_or_throw(
+        set(columns.keys()).issubset(set(df.columns)),
+        FugueDataFrameOperationError(f"invalid {columns}"),
+    )
+    return df.rename(columns)
+
+
+@drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _drop_pa_columns(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
+    cols = [x for x in df.schema.keys() if x not in columns]
+    if len(cols) == 0:
+        raise FugueDataFrameOperationError("cannot drop all columns")
+    if len(cols) + len(columns) != len(df.columns):
+        _assert_no_missing(df, columns)
+    return df.select(cols)
+
+
+@select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
+    if len(columns) == 0:
+        raise FugueDataFrameOperationError("must select at least one column")
+    _assert_no_missing(df, columns=columns)
+    return df.select(columns)
+
+
+def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
+    missing = [x for x in columns if x not in df.schema.keys()]
+    if len(missing) > 0:
+        raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
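The file above also registers pl.DataFrame with Fugue's dataset-level functions (as_local, count, get_schema, rename, and friends) through the .candidate plugin hooks. A minimal sketch of what this enables, assuming fugue 0.8.4 and polars are installed and the fugue_polars plugin has been loaded (it is registered through the wheel's entry points; the sample data below is made up):

    import polars as pl
    import fugue.api as fa

    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    fa.get_schema(df)          # triad Schema, e.g. a:long,b:str
    fa.rename(df, {"a": "c"})  # a new pl.DataFrame with column a renamed to c
    fa.is_local(df)            # True: a pl.DataFrame is always local and bounded

Note how _pl_as_arrow and _get_pl_schema route through replace_types_in_table/replace_types_in_schema with LARGE_TYPES_REPLACEMENT: polars emits large_string/large_binary Arrow types, and the replacement maps them back to the standard types the rest of Fugue expects.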
fugue_polars/registry.py
ADDED
@@ -0,0 +1,86 @@
+from typing import Any, Iterable, Iterator, Optional, no_type_check
+
+import polars as pl
+import pyarrow as pa
+from triad import Schema, make_empty_aware
+from triad.utils.pyarrow import get_alter_func
+
+from fugue import (
+    ArrowDataFrame,
+    DataFrame,
+    IterableArrowDataFrame,
+    LocalDataFrameIterableDataFrame,
+)
+from fugue.dev import LocalDataFrameParam, fugue_annotated_param
+from .polars_dataframe import PolarsDataFrame
+from fugue.plugins import as_fugue_dataset
+
+
+@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pl.DataFrame))
+def _pl_as_fugue_df(df: pl.DataFrame, **kwargs: Any) -> PolarsDataFrame:
+    return PolarsDataFrame(df, **kwargs)
+
+
+@fugue_annotated_param(pl.DataFrame)
+class _PolarsParam(LocalDataFrameParam):
+    def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
+        return pl.from_arrow(df.as_arrow())
+
+    def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
+        assert isinstance(output, pl.DataFrame)
+        return _to_adf(output, schema=schema)
+
+    def count(self, df: Any) -> int:  # pragma: no cover
+        return df.shape[0]
+
+    def format_hint(self) -> Optional[str]:
+        return "pyarrow"
+
+
+@fugue_annotated_param(
+    Iterable[pl.DataFrame],
+    matcher=lambda x: x == Iterable[pl.DataFrame] or x == Iterator[pl.DataFrame],
+)
+class _IterablePolarsParam(LocalDataFrameParam):
+    @no_type_check
+    def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
+        if not isinstance(df, LocalDataFrameIterableDataFrame):
+            yield pl.from_arrow(df.as_arrow())
+        else:  # pragma: no cover # spark code coverage can't be included
+            for sub in df.native:
+                yield pl.from_arrow(sub.as_arrow())
+
+    @no_type_check
+    def to_output_df(
+        self, output: Iterable[pl.DataFrame], schema: Any, ctx: Any
+    ) -> DataFrame:
+        def dfs(_schema: Schema) -> Iterable[ArrowDataFrame]:
+            if output is not None:
+                for df in output:
+                    yield _to_adf(df, _schema)
+
+        _schema: Optional[Schema] = (
+            None
+            if schema is None
+            else (schema if isinstance(schema, Schema) else Schema(schema))
+        )
+        _dfs = make_empty_aware(dfs(_schema))
+        if not _dfs.empty:
+            return IterableArrowDataFrame(_dfs)
+        return IterableArrowDataFrame([], schema=_schema)
+
+    @no_type_check
+    def count(self, df: Iterable[pl.DataFrame]) -> int:  # pragma: no cover
+        return sum(_.shape[0] for _ in df)
+
+    def format_hint(self) -> Optional[str]:
+        return "pyarrow"
+
+
+def _to_adf(output: pl.DataFrame, schema: Any) -> ArrowDataFrame:
+    adf = output.to_arrow()
+    if schema is None:  # pragma: no cover
+        return ArrowDataFrame(adf)
+    _schema = schema if isinstance(schema, Schema) else Schema(schema)
+    f = get_alter_func(adf.schema, _schema.pa_schema, safe=False)
+    return ArrowDataFrame(f(adf))
fugue_ray/_constants.py
CHANGED
@@ -1,10 +1,19 @@
-from typing import
+from typing import Any, Dict
+
+import ray

 FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
 FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
 FUGUE_RAY_DEFAULT_BATCH_SIZE = "fugue.ray.default.batch_size"
+FUGUE_RAY_ZERO_COPY = "fugue.ray.zero_copy"

 FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
     FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1,
     FUGUE_RAY_DEFAULT_PARTITIONS: 0,
+    FUGUE_RAY_ZERO_COPY: True,
 }
+
+if ray.__version__ >= "2.3":
+    _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
+else:  # pragma: no cover
+    _ZERO_COPY = {}
fugue_ray/_utils/dataframe.py
CHANGED
@@ -1,21 +1,27 @@
 import pickle
-from typing import
+from typing import Any, Dict, List, Optional, Tuple

+import pandas as pd
 import pyarrow as pa
 import ray.data as rd
-from fugue.dataframe.arrow_dataframe import _build_empty_arrow
 from triad import Schema

+from fugue.dataframe.arrow_dataframe import _build_empty_arrow
+
+from .._constants import _ZERO_COPY
+
 _RAY_NULL_REPR = "__RAY_NULL__"


 def get_dataset_format(df: rd.Dataset) -> Optional[str]:
-    try:
-        if hasattr(df, "_dataset_format"):
-            return df._dataset_format()
-        return df.dataset_format()  # ray>=2.2
-    except Exception:
+    df.fully_executed()
+    if df.count() == 0:
         return None
+    if hasattr(df, "_dataset_format"):  # pragma: no cover
+        return df._dataset_format()  # ray<2.2
+    ctx = rd.context.DatasetContext.get_current()
+    ctx.use_streaming_executor = False
+    return df.dataset_format()  # ray>=2.2


 def build_empty(schema: Schema) -> rd.Dataset:
@@ -50,7 +56,7 @@ def add_partition_key(
     )

     return df.map_batches(
-        add_simple_key, batch_format="pyarrow", **ray_remote_args
+        add_simple_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
     ), input_schema + (
         output_key,
         str,
@@ -67,8 +73,29 @@ def add_partition_key(
         return fdf.append_column(output_key, sarr)

     return df.map_batches(
-        add_key, batch_format="pyarrow", **ray_remote_args
+        add_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
     ), input_schema + (
         output_key,
         pa.binary(),
     )
+
+
+def add_coarse_partition_key(
+    df: rd.Dataset,
+    keys: List[str],
+    output_key: str,
+    bucket: int,
+) -> rd.Dataset:
+    ray_remote_args: Dict[str, Any] = {"num_cpus": 1}
+
+    def add_coarse_key(arrow_df: pa.Table) -> pa.Table:  # pragma: no cover
+        hdf = arrow_df.select(keys).to_pandas()
+        _hash = pd.util.hash_pandas_object(hdf, index=False).mod(bucket)
+        return arrow_df.append_column(output_key, pa.Array.from_pandas(_hash))
+
+    return df.map_batches(
+        add_coarse_key,
+        batch_format="pyarrow",
+        **_ZERO_COPY,
+        **ray_remote_args,
+    )
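The new add_coarse_partition_key helper assigns each row a bucket id by hashing the key columns with pandas and taking the result modulo the bucket count, so rows with equal keys always land in the same bucket even across batches. The hashing step in isolation (a standalone sketch; the column name and bucket count are made up):

    import pandas as pd

    hdf = pd.DataFrame({"k": ["a", "b", "a", "c"]})
    buckets = pd.util.hash_pandas_object(hdf, index=False).mod(4)
    # deterministic per value: both "a" rows receive the same bucket id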
fugue_ray/_utils/io.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 import pyarrow as pa
 import ray.data as rd
 from fugue import ExecutionEngine
-from fugue._utils.io import FileParser,
+from fugue._utils.io import FileParser, save_df
 from fugue.collections.partition import PartitionSpec
 from fugue.dataframe import DataFrame
 from fugue_ray.dataframe import RayDataFrame
@@ -49,8 +49,6 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        if fmt == "avro":  # TODO: remove avro support
-            return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
         files = [f.uri for f in fp]
         return self._loads[fmt](files, columns, **kwargs)
@@ -75,7 +73,7 @@ class RayIO(object):
         except Exception:  # pragma: no cover
             pass
         p = FileParser(uri, format_hint)
-        if not force_single
+        if not force_single:
             df = self._prepartition(df, partition_spec=partition_spec)

         self._saves[p.file_format](df=df, uri=p.uri, **kwargs)
fugue_ray/dataframe.py
CHANGED
@@ -6,12 +6,7 @@ import ray
 import ray.data as rd
 from triad.collections.schema import Schema

-from fugue.dataframe import (
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    LocalDataFrame,
-)
+from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
 from fugue.dataframe.dataframe import _input_schema
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
 from fugue.plugins import (
@@ -22,6 +17,7 @@ from fugue.plugins import (
     rename,
 )

+from ._constants import _ZERO_COPY
 from ._utils.dataframe import build_empty, get_dataset_format
@@ -115,7 +111,7 @@ class RayDataFrame(DataFrame):
     def is_local(self) -> bool:
         return False

-    def
+    def as_local_bounded(self) -> LocalBoundedDataFrame:
         adf = self.as_arrow()
         if adf.shape[0] == 0:
             res = ArrowDataFrame([], self.schema)
@@ -145,7 +141,10 @@ class RayDataFrame(DataFrame):
         if cols == self.columns:
             return self
         rdf = self.native.map_batches(
-            lambda b: b.select(cols),
+            lambda b: b.select(cols),
+            batch_format="pyarrow",
+            **_ZERO_COPY,
+            **self._remote_args(),
         )
         return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
@@ -158,7 +157,7 @@ class RayDataFrame(DataFrame):
     def persist(self, **kwargs: Any) -> "RayDataFrame":
         # TODO: it mutates the dataframe, is this a good bahavior
         if not self.native.is_fully_executed():  # pragma: no cover
-            self.
+            self.native.fully_executed()
         return self

     def count(self) -> int:
@@ -179,6 +178,7 @@ class RayDataFrame(DataFrame):
         rdf = self.native.map_batches(
             lambda b: b.rename_columns(new_cols),
             batch_format="pyarrow",
+            **_ZERO_COPY,
             **self._remote_args(),
         )
         return RayDataFrame(rdf, schema=new_schema, internal_schema=True)
@@ -193,7 +193,7 @@ class RayDataFrame(DataFrame):
         if self.schema == new_schema:
             return self
         rdf = self.native.map_batches(
-            _alter, batch_format="pyarrow", **self._remote_args()
+            _alter, batch_format="pyarrow", **_ZERO_COPY, **self._remote_args()
         )
         return RayDataFrame(rdf, schema=new_schema, internal_schema=True)
@@ -236,7 +236,9 @@ class RayDataFrame(DataFrame):
         return ArrowDataFrame(table).alter_columns(schema).native  # type: ignore

     return (
-        rdf.map_batches(
+        rdf.map_batches(
+            _alter, batch_format="pyarrow", **_ZERO_COPY, **self._remote_args()
+        ),
         schema,
     )
@@ -278,7 +280,9 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
     if len(missing) > 0:
         raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
     new_cols = [columns.get(name, name) for name in cols]
-    return df.map_batches(
+    return df.map_batches(
+        lambda b: b.rename_columns(new_cols), batch_format="pyarrow", **_ZERO_COPY
+    )


 def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]: