fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. fugue/__init__.py +9 -5
  2. fugue/_utils/interfaceless.py +1 -558
  3. fugue/_utils/io.py +2 -91
  4. fugue/_utils/registry.py +3 -2
  5. fugue/api.py +1 -0
  6. fugue/bag/bag.py +8 -4
  7. fugue/collections/__init__.py +0 -7
  8. fugue/collections/partition.py +21 -9
  9. fugue/constants.py +3 -1
  10. fugue/dataframe/__init__.py +7 -8
  11. fugue/dataframe/arrow_dataframe.py +1 -2
  12. fugue/dataframe/dataframe.py +17 -18
  13. fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
  14. fugue/dataframe/function_wrapper.py +432 -0
  15. fugue/dataframe/iterable_dataframe.py +3 -0
  16. fugue/dataframe/utils.py +11 -79
  17. fugue/dataset/api.py +0 -4
  18. fugue/dev.py +47 -0
  19. fugue/execution/__init__.py +1 -5
  20. fugue/execution/api.py +36 -14
  21. fugue/execution/execution_engine.py +30 -4
  22. fugue/execution/factory.py +0 -6
  23. fugue/execution/native_execution_engine.py +44 -67
  24. fugue/extensions/_builtins/creators.py +4 -2
  25. fugue/extensions/_builtins/outputters.py +4 -3
  26. fugue/extensions/_builtins/processors.py +3 -3
  27. fugue/extensions/creator/convert.py +5 -2
  28. fugue/extensions/outputter/convert.py +2 -2
  29. fugue/extensions/processor/convert.py +3 -2
  30. fugue/extensions/transformer/convert.py +22 -9
  31. fugue/extensions/transformer/transformer.py +15 -1
  32. fugue/plugins.py +2 -0
  33. fugue/registry.py +0 -39
  34. fugue/sql/_utils.py +1 -1
  35. fugue/workflow/_checkpoint.py +1 -1
  36. fugue/workflow/api.py +13 -13
  37. fugue/workflow/module.py +30 -37
  38. fugue/workflow/workflow.py +6 -0
  39. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
  40. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
  41. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
  42. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
  43. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
  44. fugue_contrib/contrib.py +1 -0
  45. fugue_contrib/viz/_ext.py +7 -1
  46. fugue_dask/_io.py +0 -13
  47. fugue_dask/_utils.py +10 -4
  48. fugue_dask/dataframe.py +1 -2
  49. fugue_dask/execution_engine.py +45 -18
  50. fugue_dask/registry.py +8 -33
  51. fugue_duckdb/_io.py +8 -2
  52. fugue_duckdb/_utils.py +7 -2
  53. fugue_duckdb/dask.py +1 -1
  54. fugue_duckdb/dataframe.py +23 -19
  55. fugue_duckdb/execution_engine.py +19 -22
  56. fugue_duckdb/registry.py +11 -34
  57. fugue_ibis/dataframe.py +6 -10
  58. fugue_ibis/execution_engine.py +7 -1
  59. fugue_notebook/env.py +5 -10
  60. fugue_polars/__init__.py +2 -0
  61. fugue_polars/_utils.py +8 -0
  62. fugue_polars/polars_dataframe.py +234 -0
  63. fugue_polars/registry.py +86 -0
  64. fugue_ray/_constants.py +10 -1
  65. fugue_ray/_utils/dataframe.py +36 -9
  66. fugue_ray/_utils/io.py +2 -4
  67. fugue_ray/dataframe.py +16 -12
  68. fugue_ray/execution_engine.py +53 -32
  69. fugue_ray/registry.py +8 -32
  70. fugue_spark/_utils/convert.py +22 -11
  71. fugue_spark/_utils/io.py +0 -13
  72. fugue_spark/_utils/misc.py +27 -0
  73. fugue_spark/_utils/partition.py +11 -18
  74. fugue_spark/dataframe.py +26 -22
  75. fugue_spark/execution_engine.py +136 -54
  76. fugue_spark/registry.py +29 -78
  77. fugue_test/builtin_suite.py +36 -14
  78. fugue_test/dataframe_suite.py +9 -5
  79. fugue_test/execution_suite.py +100 -122
  80. fugue_version/__init__.py +1 -1
  81. tests/fugue/bag/test_array_bag.py +0 -9
  82. tests/fugue/collections/test_partition.py +10 -3
  83. tests/fugue/dataframe/test_function_wrapper.py +293 -0
  84. tests/fugue/dataframe/test_utils.py +2 -34
  85. tests/fugue/execution/test_factory.py +7 -9
  86. tests/fugue/execution/test_naive_execution_engine.py +35 -80
  87. tests/fugue/extensions/test_utils.py +12 -7
  88. tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
  89. tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
  90. tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
  91. tests/fugue/sql/test_workflow.py +1 -1
  92. tests/fugue/sql/test_workflow_parse.py +3 -5
  93. tests/fugue/utils/test_interfaceless.py +1 -325
  94. tests/fugue/utils/test_io.py +0 -80
  95. tests/fugue_dask/test_execution_engine.py +48 -0
  96. tests/fugue_dask/test_io.py +0 -55
  97. tests/fugue_duckdb/test_dataframe.py +2 -2
  98. tests/fugue_duckdb/test_execution_engine.py +16 -1
  99. tests/fugue_duckdb/test_utils.py +1 -1
  100. tests/fugue_ibis/test_dataframe.py +6 -3
  101. tests/fugue_polars/__init__.py +0 -0
  102. tests/fugue_polars/test_api.py +13 -0
  103. tests/fugue_polars/test_dataframe.py +82 -0
  104. tests/fugue_polars/test_transform.py +100 -0
  105. tests/fugue_ray/test_execution_engine.py +40 -4
  106. tests/fugue_spark/test_dataframe.py +0 -8
  107. tests/fugue_spark/test_execution_engine.py +50 -11
  108. tests/fugue_spark/test_importless.py +4 -4
  109. tests/fugue_spark/test_spark_connect.py +82 -0
  110. tests/fugue_spark/utils/test_convert.py +6 -8
  111. tests/fugue_spark/utils/test_io.py +0 -17
  112. fugue/_utils/register.py +0 -3
  113. fugue_test/_utils.py +0 -13
  114. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue_polars/polars_dataframe.py ADDED
@@ -0,0 +1,234 @@
+ from typing import Any, Dict, Iterable, List, Optional
+
+ import pandas as pd
+ import polars as pl
+ import pyarrow as pa
+ from triad.collections.schema import Schema
+ from triad.exceptions import InvalidOperationError
+ from triad.utils.assertion import assert_or_throw
+ from triad.utils.pyarrow import (
+     LARGE_TYPES_REPLACEMENT,
+     replace_types_in_schema,
+     replace_types_in_table,
+ )
+
+ from fugue import ArrowDataFrame
+ from fugue.api import (
+     as_arrow,
+     drop_columns,
+     get_column_names,
+     get_schema,
+     is_df,
+     rename,
+     select_columns,
+ )
+ from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+ from fugue.dataset.api import (
+     as_local,
+     as_local_bounded,
+     count,
+     get_num_partitions,
+     is_bounded,
+     is_empty,
+     is_local,
+ )
+ from fugue.exceptions import FugueDataFrameOperationError
+
+ from ._utils import build_empty_pl
+
+
+ class PolarsDataFrame(LocalBoundedDataFrame):
+     """DataFrame that wraps a polars DataFrame. Please also read
+     |DataFrameTutorial| to understand this Fugue concept
+
+     :param df: polars DataFrame or None, defaults to None
+     :param schema: |SchemaLikeObject|
+     """
+
+     def __init__(
+         self,
+         df: Optional[pl.DataFrame] = None,
+         schema: Any = None,
+     ):
+         if df is None:
+             schema = _input_schema(schema).assert_not_empty()
+             self._native: pl.DataFrame = build_empty_pl(schema)
+             super().__init__(schema)
+             return
+         else:
+             assert_or_throw(
+                 schema is None,
+                 InvalidOperationError("can't reset schema for pl.DataFrame"),
+             )
+             self._native = df
+         super().__init__(_get_pl_schema(df))
+
+     @property
+     def native(self) -> pl.DataFrame:
+         """The wrapped polars DataFrame"""
+         return self._native
+
+     def native_as_df(self) -> pl.DataFrame:
+         return self._native
+
+     @property
+     def empty(self) -> bool:
+         return self._native.shape[0] == 0
+
+     def peek_array(self) -> List[Any]:
+         self.assert_not_empty()
+         return list(self._native.row(0))
+
+     def peek_dict(self) -> Dict[str, Any]:
+         self.assert_not_empty()
+         return self._native.row(0, named=True)
+
+     def count(self) -> int:
+         return self.native.shape[0]
+
+     def as_pandas(self) -> pd.DataFrame:
+         return self.native.to_pandas()
+
+     def head(
+         self, n: int, columns: Optional[List[str]] = None
+     ) -> LocalBoundedDataFrame:
+         adf = self.native if columns is None else self.native.select(columns)
+         n = min(n, self.count())
+         if n == 0:
+             schema = self.schema if columns is None else self.schema.extract(columns)
+             return PolarsDataFrame(None, schema=schema)
+         return PolarsDataFrame(adf.head(n))
+
+     def _drop_cols(self, cols: List[str]) -> DataFrame:
+         return PolarsDataFrame(self.native.drop(cols))
+
+     def _select_cols(self, keys: List[Any]) -> DataFrame:
+         return PolarsDataFrame(self.native.select(keys))
+
+     def rename(self, columns: Dict[str, str]) -> DataFrame:
+         return PolarsDataFrame(_rename_pl_dataframe(self.native, columns))
+
+     def alter_columns(self, columns: Any) -> DataFrame:
+         adf = ArrowDataFrame(self.as_arrow()).alter_columns(columns)
+         return PolarsDataFrame(pl.from_arrow(adf.native))
+
+     def as_arrow(self, type_safe: bool = False) -> pa.Table:
+         return _pl_as_arrow(self.native)
+
+     def as_array(
+         self, columns: Optional[List[str]] = None, type_safe: bool = False
+     ) -> List[Any]:
+         tdf = self.native
+         if columns is not None:
+             tdf = tdf.select(columns)
+         return [list(row) for row in tdf.rows()]
+
+     def as_array_iterable(
+         self, columns: Optional[List[str]] = None, type_safe: bool = False
+     ) -> Iterable[Any]:
+         if not self.empty:
+             yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_array_iterable(
+                 columns=columns
+             )
+
+     def as_dict_iterable(
+         self, columns: Optional[List[str]] = None
+     ) -> Iterable[Dict[str, Any]]:
+         if not self.empty:
+             yield from ArrowDataFrame(_pl_as_arrow(self.native)).as_dict_iterable(
+                 columns=columns
+             )
+
+
+ @as_local.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_as_local(df: pl.DataFrame) -> pl.DataFrame:
+     return df
+
+
+ @as_local_bounded.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_as_local_bounded(df: pl.DataFrame) -> pl.DataFrame:
+     return df
+
+
+ @as_arrow.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_as_arrow(df: pl.DataFrame) -> pa.Table:
+     adf = df.to_arrow()
+     adf = replace_types_in_table(adf, LARGE_TYPES_REPLACEMENT)
+     return adf
+
+
+ @is_df.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_is_df(df: pl.DataFrame) -> bool:
+     return True
+
+
+ @count.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_count(df: pl.DataFrame) -> int:
+     return df.shape[0]
+
+
+ @is_bounded.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_is_bounded(df: pl.DataFrame) -> bool:
+     return True
+
+
+ @is_empty.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_is_empty(df: pl.DataFrame) -> bool:
+     return df.shape[0] == 0
+
+
+ @is_local.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_is_local(df: pl.DataFrame) -> bool:
+     return True
+
+
+ @get_num_partitions.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _pl_get_num_partitions(df: pl.DataFrame) -> int:
+     return 1
+
+
+ @get_column_names.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _get_pl_columns(df: pl.DataFrame) -> List[Any]:
+     return list(df.schema.keys())
+
+
+ @get_schema.candidate(lambda df: isinstance(df, pl.DataFrame))
+ def _get_pl_schema(df: pl.DataFrame) -> Schema:
+     adf = df.to_arrow()
+     schema = replace_types_in_schema(adf.schema, LARGE_TYPES_REPLACEMENT)
+     return Schema(schema)
+
+
+ @rename.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+ def _rename_pl_dataframe(df: pl.DataFrame, columns: Dict[str, Any]) -> pl.DataFrame:
+     if len(columns) == 0:
+         return df
+     assert_or_throw(
+         set(columns.keys()).issubset(set(df.columns)),
+         FugueDataFrameOperationError(f"invalid {columns}"),
+     )
+     return df.rename(columns)
+
+
+ @drop_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+ def _drop_pa_columns(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
+     cols = [x for x in df.schema.keys() if x not in columns]
+     if len(cols) == 0:
+         raise FugueDataFrameOperationError("cannot drop all columns")
+     if len(cols) + len(columns) != len(df.columns):
+         _assert_no_missing(df, columns)
+     return df.select(cols)
+
+
+ @select_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pl.DataFrame))
+ def _select_pa_columns(df: pl.DataFrame, columns: List[Any]) -> pl.DataFrame:
+     if len(columns) == 0:
+         raise FugueDataFrameOperationError("must select at least one column")
+     _assert_no_missing(df, columns=columns)
+     return df.select(columns)
+
+
+ def _assert_no_missing(df: pl.DataFrame, columns: Iterable[Any]) -> None:
+     missing = [x for x in columns if x not in df.schema.keys()]
+     if len(missing) > 0:
+         raise FugueDataFrameOperationError(f"found nonexistent columns: {missing}")
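For orientation, a minimal usage sketch of the new wrapper (assumes fugue 0.8.4 with polars installed; the sample data is made up):

    import polars as pl

    import fugue.api as fa
    from fugue_polars.polars_dataframe import PolarsDataFrame

    df = pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})

    # wrapping infers the schema through to_arrow(), with large_string/
    # large_binary replaced by their plain Arrow counterparts
    fdf = PolarsDataFrame(df)
    assert fdf.count() == 2 and not fdf.empty

    # the registered candidates make fugue.api work on pl.DataFrame directly
    assert fa.is_df(df)
    print(fa.get_schema(df))                   # a:long,b:str
    print(fa.rename(df, {"a": "aa"}).columns)  # ['aa', 'b']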
fugue_polars/registry.py ADDED
@@ -0,0 +1,86 @@
+ from typing import Any, Iterable, Iterator, Optional, no_type_check
+
+ import polars as pl
+ import pyarrow as pa
+ from triad import Schema, make_empty_aware
+ from triad.utils.pyarrow import get_alter_func
+
+ from fugue import (
+     ArrowDataFrame,
+     DataFrame,
+     IterableArrowDataFrame,
+     LocalDataFrameIterableDataFrame,
+ )
+ from fugue.dev import LocalDataFrameParam, fugue_annotated_param
+ from fugue.plugins import as_fugue_dataset
+
+ from .polars_dataframe import PolarsDataFrame
+
+
+ @as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, pl.DataFrame))
+ def _pl_as_fugue_df(df: pl.DataFrame, **kwargs: Any) -> PolarsDataFrame:
+     return PolarsDataFrame(df, **kwargs)
+
+
+ @fugue_annotated_param(pl.DataFrame)
+ class _PolarsParam(LocalDataFrameParam):
+     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
+         return pl.from_arrow(df.as_arrow())
+
+     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
+         assert isinstance(output, pl.DataFrame)
+         return _to_adf(output, schema=schema)
+
+     def count(self, df: Any) -> int:  # pragma: no cover
+         return df.shape[0]
+
+     def format_hint(self) -> Optional[str]:
+         return "pyarrow"
+
+
+ @fugue_annotated_param(
+     Iterable[pl.DataFrame],
+     matcher=lambda x: x == Iterable[pl.DataFrame] or x == Iterator[pl.DataFrame],
+ )
+ class _IterablePolarsParam(LocalDataFrameParam):
+     @no_type_check
+     def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pl.DataFrame]:
+         if not isinstance(df, LocalDataFrameIterableDataFrame):
+             yield pl.from_arrow(df.as_arrow())
+         else:  # pragma: no cover # spark code coverage can't be included
+             for sub in df.native:
+                 yield pl.from_arrow(sub.as_arrow())
+
+     @no_type_check
+     def to_output_df(
+         self, output: Iterable[pl.DataFrame], schema: Any, ctx: Any
+     ) -> DataFrame:
+         def dfs(_schema: Schema) -> Iterable[ArrowDataFrame]:
+             if output is not None:
+                 for df in output:
+                     yield _to_adf(df, _schema)
+
+         _schema: Optional[Schema] = (
+             None
+             if schema is None
+             else (schema if isinstance(schema, Schema) else Schema(schema))
+         )
+         _dfs = make_empty_aware(dfs(_schema))
+         if not _dfs.empty:
+             return IterableArrowDataFrame(_dfs)
+         return IterableArrowDataFrame([], schema=_schema)
+
+     @no_type_check
+     def count(self, df: Iterable[pl.DataFrame]) -> int:  # pragma: no cover
+         return sum(_.shape[0] for _ in df)
+
+     def format_hint(self) -> Optional[str]:
+         return "pyarrow"
+
+
+ def _to_adf(output: pl.DataFrame, schema: Any) -> ArrowDataFrame:
+     adf = output.to_arrow()
+     if schema is None:  # pragma: no cover
+         return ArrowDataFrame(adf)
+     _schema = schema if isinstance(schema, Schema) else Schema(schema)
+     f = get_alter_func(adf.schema, _schema.pa_schema, safe=False)
+     return ArrowDataFrame(f(adf))
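With these annotated params registered, Fugue extensions should be able to consume and produce polars DataFrames directly; a hedged sketch (the function and data are illustrative, not from this release):

    import polars as pl

    import fugue.api as fa

    def add_one(df: pl.DataFrame) -> pl.DataFrame:
        # executed per partition; the input arrives as a polars DataFrame
        # because _PolarsParam converts through Arrow on the way in and out
        return df.with_columns(pl.col("a") + 1)

    res = fa.transform(pl.DataFrame({"a": [1, 2, 3]}), add_one, schema="*")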
fugue_ray/_constants.py CHANGED
@@ -1,10 +1,19 @@
- from typing import Dict, Any
+ from typing import Any, Dict
+
+ import ray
  
  FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
  FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
  FUGUE_RAY_DEFAULT_BATCH_SIZE = "fugue.ray.default.batch_size"
+ FUGUE_RAY_ZERO_COPY = "fugue.ray.zero_copy"
  
  FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
      FUGUE_RAY_CONF_SHUFFLE_PARTITIONS: -1,
      FUGUE_RAY_DEFAULT_PARTITIONS: 0,
+     FUGUE_RAY_ZERO_COPY: True,
  }
+
+ if ray.__version__ >= "2.3":
+     _ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
+ else:  # pragma: no cover
+     _ZERO_COPY = {}
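`_ZERO_COPY` exists so callers can splat the flag into `map_batches` unconditionally: on ray>=2.3 it expands to `zero_copy_batch=True`, on older versions to nothing. (Note the lexicographic string comparison would misread a hypothetical Ray "2.10"; a proper version parser would be more robust.) A sketch of the calling pattern, with made-up data:

    import ray.data as rd

    from fugue_ray._constants import _ZERO_COPY

    ds = rd.from_items([{"a": 1}, {"a": 2}])
    # identical call on every supported Ray version; the kwarg only
    # appears where Ray understands it
    ds = ds.map_batches(lambda t: t, batch_format="pyarrow", **_ZERO_COPY)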
fugue_ray/_utils/dataframe.py CHANGED
@@ -1,21 +1,27 @@
  import pickle
- from typing import List, Optional, Tuple, Dict, Any
+ from typing import Any, Dict, List, Optional, Tuple
  
+ import pandas as pd
  import pyarrow as pa
  import ray.data as rd
- from fugue.dataframe.arrow_dataframe import _build_empty_arrow
  from triad import Schema
  
+ from fugue.dataframe.arrow_dataframe import _build_empty_arrow
+
+ from .._constants import _ZERO_COPY
+
  _RAY_NULL_REPR = "__RAY_NULL__"
  
  
  def get_dataset_format(df: rd.Dataset) -> Optional[str]:
-     try:  # pragma: no cover
-         if hasattr(df, "_dataset_format"):  # ray<2.2
-             return df._dataset_format()
-         return df.dataset_format()  # ray>=2.2
-     except Exception:
+     df.fully_executed()
+     if df.count() == 0:
          return None
+     if hasattr(df, "_dataset_format"):  # pragma: no cover
+         return df._dataset_format()  # ray<2.2
+     ctx = rd.context.DatasetContext.get_current()
+     ctx.use_streaming_executor = False
+     return df.dataset_format()  # ray>=2.2
  
  
  def build_empty(schema: Schema) -> rd.Dataset:
@@ -50,7 +56,7 @@ def add_partition_key(
      )
  
      return df.map_batches(
-         add_simple_key, batch_format="pyarrow", **ray_remote_args
+         add_simple_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
      ), input_schema + (
          output_key,
          str,
@@ -67,8 +73,29 @@ def add_partition_key(
          return fdf.append_column(output_key, sarr)
  
      return df.map_batches(
-         add_key, batch_format="pyarrow", **ray_remote_args
+         add_key, batch_format="pyarrow", **_ZERO_COPY, **ray_remote_args
      ), input_schema + (
          output_key,
          pa.binary(),
      )
+
+
+ def add_coarse_partition_key(
+     df: rd.Dataset,
+     keys: List[str],
+     output_key: str,
+     bucket: int,
+ ) -> rd.Dataset:
+     ray_remote_args: Dict[str, Any] = {"num_cpus": 1}
+
+     def add_coarse_key(arrow_df: pa.Table) -> pa.Table:  # pragma: no cover
+         hdf = arrow_df.select(keys).to_pandas()
+         _hash = pd.util.hash_pandas_object(hdf, index=False).mod(bucket)
+         return arrow_df.append_column(output_key, pa.Array.from_pandas(_hash))
+
+     return df.map_batches(
+         add_coarse_key,
+         batch_format="pyarrow",
+         **_ZERO_COPY,
+         **ray_remote_args,
+     )
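The new `add_coarse_partition_key` buckets rows by a stable hash of the key columns rather than by exact key values; a self-contained sketch of the per-batch logic (table, column, and bucket names are made up):

    import pandas as pd
    import pyarrow as pa

    table = pa.table({"k": ["a", "b", "a"], "v": [1, 2, 3]})
    bucket = 4

    # hash the key columns per row, then fold into `bucket` partitions
    hdf = table.select(["k"]).to_pandas()
    h = pd.util.hash_pandas_object(hdf, index=False).mod(bucket)
    print(table.append_column("__bucket__", pa.Array.from_pandas(h)))

Rows sharing a key always land in the same bucket, which is enough for coarse repartitioning without a full shuffle on the exact keys.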
fugue_ray/_utils/io.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
  import pyarrow as pa
  import ray.data as rd
  from fugue import ExecutionEngine
- from fugue._utils.io import FileParser, load_df, save_df
+ from fugue._utils.io import FileParser, save_df
  from fugue.collections.partition import PartitionSpec
  from fugue.dataframe import DataFrame
  from fugue_ray.dataframe import RayDataFrame
@@ -49,8 +49,6 @@ class RayIO(object):
              len(fmts) == 1, NotImplementedError("can't support multiple formats")
          )
          fmt = fmts[0]
-         if fmt == "avro":  # TODO: remove avro support
-             return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
          files = [f.uri for f in fp]
          return self._loads[fmt](files, columns, **kwargs)
@@ -75,7 +73,7 @@ class RayIO(object):
          except Exception:  # pragma: no cover
              pass
          p = FileParser(uri, format_hint)
-         if not force_single and p.file_format != "avro":
+         if not force_single:
              df = self._prepartition(df, partition_spec=partition_spec)
  
          self._saves[p.file_format](df=df, uri=p.uri, **kwargs)
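With the avro special case gone, every format flows through the same `FileParser` dispatch seen above; a hedged sketch of how the parser resolves a format (the paths are hypothetical, and the exact inference rules are fugue internals):

    from fugue._utils.io import FileParser

    p = FileParser("/tmp/data.parquet")     # format inferred from the extension
    print(p.file_format, p.uri)
    p = FileParser("/tmp/data", "parquet")  # or forced via the format hint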
fugue_ray/dataframe.py CHANGED
@@ -6,12 +6,7 @@ import ray
  import ray.data as rd
  from triad.collections.schema import Schema
  
- from fugue.dataframe import (
-     ArrowDataFrame,
-     DataFrame,
-     LocalBoundedDataFrame,
-     LocalDataFrame,
- )
+ from fugue.dataframe import ArrowDataFrame, DataFrame, LocalBoundedDataFrame
  from fugue.dataframe.dataframe import _input_schema
  from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError
  from fugue.plugins import (
@@ -22,6 +17,7 @@ from fugue.plugins import (
      rename,
  )
  
+ from ._constants import _ZERO_COPY
  from ._utils.dataframe import build_empty, get_dataset_format
  
  
@@ -115,7 +111,7 @@ class RayDataFrame(DataFrame):
      def is_local(self) -> bool:
          return False
  
-     def as_local(self) -> LocalDataFrame:
+     def as_local_bounded(self) -> LocalBoundedDataFrame:
          adf = self.as_arrow()
          if adf.shape[0] == 0:
              res = ArrowDataFrame([], self.schema)
@@ -145,7 +141,10 @@ class RayDataFrame(DataFrame):
          if cols == self.columns:
              return self
          rdf = self.native.map_batches(
-             lambda b: b.select(cols), batch_format="pyarrow", **self._remote_args()
+             lambda b: b.select(cols),
+             batch_format="pyarrow",
+             **_ZERO_COPY,
+             **self._remote_args(),
          )
          return RayDataFrame(rdf, self.schema.extract(cols), internal_schema=True)
  
@@ -158,7 +157,7 @@ class RayDataFrame(DataFrame):
      def persist(self, **kwargs: Any) -> "RayDataFrame":
          # TODO: it mutates the dataframe, is this a good behavior
          if not self.native.is_fully_executed():  # pragma: no cover
-             self._native = self.native.fully_executed()
+             self.native.fully_executed()
          return self
  
      def count(self) -> int:
@@ -179,6 +178,7 @@ class RayDataFrame(DataFrame):
          rdf = self.native.map_batches(
              lambda b: b.rename_columns(new_cols),
              batch_format="pyarrow",
+             **_ZERO_COPY,
              **self._remote_args(),
          )
          return RayDataFrame(rdf, schema=new_schema, internal_schema=True)
@@ -193,7 +193,7 @@ class RayDataFrame(DataFrame):
          if self.schema == new_schema:
              return self
          rdf = self.native.map_batches(
-             _alter, batch_format="pyarrow", **self._remote_args()
+             _alter, batch_format="pyarrow", **_ZERO_COPY, **self._remote_args()
          )
          return RayDataFrame(rdf, schema=new_schema, internal_schema=True)
  
@@ -236,7 +236,9 @@ class RayDataFrame(DataFrame):
              return ArrowDataFrame(table).alter_columns(schema).native  # type: ignore
  
          return (
-             rdf.map_batches(_alter, batch_format="pyarrow", **self._remote_args()),
+             rdf.map_batches(
+                 _alter, batch_format="pyarrow", **_ZERO_COPY, **self._remote_args()
+             ),
              schema,
          )
  
@@ -278,7 +280,9 @@ def _rename_ray_dataframe(df: rd.Dataset, columns: Dict[str, Any]) -> rd.Dataset
      if len(missing) > 0:
          raise FugueDataFrameOperationError(f"found nonexistent columns: {missing}")
      new_cols = [columns.get(name, name) for name in cols]
-     return df.map_batches(lambda b: b.rename_columns(new_cols), batch_format="pyarrow")
+     return df.map_batches(
+         lambda b: b.rename_columns(new_cols), batch_format="pyarrow", **_ZERO_COPY
+     )
  
  
  def _get_arrow_tables(df: rd.Dataset) -> Iterable[pa.Table]:
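Tying the RayDataFrame changes together, a hedged end-to-end sketch (requires a Ray runtime; the data is made up):

    import ray.data as rd

    from fugue_ray.dataframe import RayDataFrame

    rdf = RayDataFrame(rd.from_items([{"a": 1, "b": "x"}]))

    renamed = rdf.rename({"a": "aa"})   # map_batches now passes **_ZERO_COPY
    local = renamed.as_local_bounded()  # the local-conversion hook in 0.8.4
    print(local.as_array())             # [[1, 'x']]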