fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
import inspect
|
|
2
|
+
from typing import (
|
|
3
|
+
Any,
|
|
4
|
+
Callable,
|
|
5
|
+
Dict,
|
|
6
|
+
Iterable,
|
|
7
|
+
Iterator,
|
|
8
|
+
List,
|
|
9
|
+
Optional,
|
|
10
|
+
no_type_check,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
import pandas as pd
|
|
14
|
+
import pyarrow as pa
|
|
15
|
+
from triad import Schema, assert_or_throw
|
|
16
|
+
from triad.collections.function_wrapper import (
|
|
17
|
+
AnnotatedParam,
|
|
18
|
+
FunctionWrapper,
|
|
19
|
+
KeywordParam,
|
|
20
|
+
PositionalParam,
|
|
21
|
+
function_wrapper,
|
|
22
|
+
)
|
|
23
|
+
from triad.utils.iter import EmptyAwareIterable, make_empty_aware
|
|
24
|
+
|
|
25
|
+
from ..constants import FUGUE_ENTRYPOINT
|
|
26
|
+
from .array_dataframe import ArrayDataFrame
|
|
27
|
+
from .arrow_dataframe import ArrowDataFrame
|
|
28
|
+
from .dataframe import DataFrame, LocalDataFrame
|
|
29
|
+
from .dataframe_iterable_dataframe import (
|
|
30
|
+
IterableArrowDataFrame,
|
|
31
|
+
IterablePandasDataFrame,
|
|
32
|
+
LocalDataFrameIterableDataFrame,
|
|
33
|
+
)
|
|
34
|
+
from .dataframes import DataFrames
|
|
35
|
+
from .iterable_dataframe import IterableDataFrame
|
|
36
|
+
from .pandas_dataframe import PandasDataFrame
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@function_wrapper(FUGUE_ENTRYPOINT)
|
|
40
|
+
class DataFrameFunctionWrapper(FunctionWrapper):
|
|
41
|
+
@property
|
|
42
|
+
def need_output_schema(self) -> Optional[bool]:
|
|
43
|
+
return (
|
|
44
|
+
self._rt.need_schema()
|
|
45
|
+
if isinstance(self._rt, _DataFrameParamBase)
|
|
46
|
+
else False
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
def get_format_hint(self) -> Optional[str]:
|
|
50
|
+
for v in self._params.values():
|
|
51
|
+
if isinstance(v, _DataFrameParamBase):
|
|
52
|
+
if v.format_hint() is not None:
|
|
53
|
+
return v.format_hint()
|
|
54
|
+
if isinstance(self._rt, _DataFrameParamBase):
|
|
55
|
+
return self._rt.format_hint()
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
def run( # noqa: C901
|
|
59
|
+
self,
|
|
60
|
+
args: List[Any],
|
|
61
|
+
kwargs: Dict[str, Any],
|
|
62
|
+
ignore_unknown: bool = False,
|
|
63
|
+
output_schema: Any = None,
|
|
64
|
+
output: bool = True,
|
|
65
|
+
ctx: Any = None,
|
|
66
|
+
) -> Any:
|
|
67
|
+
p: Dict[str, Any] = {}
|
|
68
|
+
for i in range(len(args)):
|
|
69
|
+
p[self._params.get_key_by_index(i)] = args[i]
|
|
70
|
+
p.update(kwargs)
|
|
71
|
+
has_kw = False
|
|
72
|
+
rargs: Dict[str, Any] = {}
|
|
73
|
+
for k, v in self._params.items():
|
|
74
|
+
if isinstance(v, (PositionalParam, KeywordParam)):
|
|
75
|
+
if isinstance(v, KeywordParam):
|
|
76
|
+
has_kw = True
|
|
77
|
+
elif k in p:
|
|
78
|
+
if isinstance(v, _DataFrameParamBase):
|
|
79
|
+
assert_or_throw(
|
|
80
|
+
isinstance(p[k], DataFrame),
|
|
81
|
+
lambda: TypeError(f"{p[k]} is not a DataFrame"),
|
|
82
|
+
)
|
|
83
|
+
rargs[k] = v.to_input_data(p[k], ctx=ctx)
|
|
84
|
+
else:
|
|
85
|
+
rargs[k] = p[k] # TODO: should we do auto type conversion?
|
|
86
|
+
del p[k]
|
|
87
|
+
elif v.required:
|
|
88
|
+
raise ValueError(f"{k} is required by not given")
|
|
89
|
+
if has_kw:
|
|
90
|
+
rargs.update(p)
|
|
91
|
+
elif not ignore_unknown and len(p) > 0:
|
|
92
|
+
raise ValueError(f"{p} are not acceptable parameters")
|
|
93
|
+
rt = self._func(**rargs)
|
|
94
|
+
if not output:
|
|
95
|
+
if isinstance(self._rt, _DataFrameParamBase):
|
|
96
|
+
self._rt.count(rt)
|
|
97
|
+
return
|
|
98
|
+
if isinstance(self._rt, _DataFrameParamBase):
|
|
99
|
+
return self._rt.to_output_df(rt, output_schema, ctx=ctx)
|
|
100
|
+
return rt
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
@fugue_annotated_param(
|
|
107
|
+
"Callable",
|
|
108
|
+
"F",
|
|
109
|
+
lambda annotation: (
|
|
110
|
+
annotation == Callable
|
|
111
|
+
or annotation == callable # pylint: disable=comparison-with-callable
|
|
112
|
+
or str(annotation).startswith("typing.Callable")
|
|
113
|
+
),
|
|
114
|
+
)
|
|
115
|
+
class _CallableParam(AnnotatedParam):
|
|
116
|
+
pass
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@fugue_annotated_param(
|
|
120
|
+
"Callable",
|
|
121
|
+
"f",
|
|
122
|
+
lambda annotation: (
|
|
123
|
+
annotation == Optional[Callable]
|
|
124
|
+
or annotation == Optional[callable]
|
|
125
|
+
or str(annotation).startswith("typing.Union[typing.Callable") # 3.8-
|
|
126
|
+
or str(annotation).startswith("typing.Optional[typing.Callable") # 3.9+
|
|
127
|
+
),
|
|
128
|
+
)
|
|
129
|
+
class _OptionalCallableParam(AnnotatedParam):
|
|
130
|
+
pass
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class _DataFrameParamBase(AnnotatedParam):
|
|
134
|
+
def __init__(self, param: Optional[inspect.Parameter]):
|
|
135
|
+
super().__init__(param)
|
|
136
|
+
assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
|
|
137
|
+
|
|
138
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Any: # pragma: no cover
|
|
139
|
+
raise NotImplementedError
|
|
140
|
+
|
|
141
|
+
def to_output_df(
|
|
142
|
+
self, df: Any, schema: Any, ctx: Any
|
|
143
|
+
) -> DataFrame: # pragma: no cover
|
|
144
|
+
raise NotImplementedError
|
|
145
|
+
|
|
146
|
+
def count(self, df: Any) -> int: # pragma: no cover
|
|
147
|
+
raise NotImplementedError
|
|
148
|
+
|
|
149
|
+
def need_schema(self) -> Optional[bool]:
|
|
150
|
+
return False
|
|
151
|
+
|
|
152
|
+
def format_hint(self) -> Optional[str]:
|
|
153
|
+
return None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@fugue_annotated_param(DataFrame, "d", child_can_reuse_code=True)
|
|
157
|
+
class DataFrameParam(_DataFrameParamBase):
|
|
158
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
|
|
159
|
+
return df
|
|
160
|
+
|
|
161
|
+
def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
|
|
162
|
+
assert_or_throw(
|
|
163
|
+
schema is None or output.schema == schema,
|
|
164
|
+
lambda: f"Output schema mismatch {output.schema} vs {schema}",
|
|
165
|
+
)
|
|
166
|
+
return output
|
|
167
|
+
|
|
168
|
+
def count(self, df: Any) -> int:
|
|
169
|
+
if df.is_bounded:
|
|
170
|
+
return df.count()
|
|
171
|
+
else:
|
|
172
|
+
return sum(1 for _ in df.as_array_iterable())
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
@fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
|
|
176
|
+
class LocalDataFrameParam(DataFrameParam):
|
|
177
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
|
|
178
|
+
return df.as_local()
|
|
179
|
+
|
|
180
|
+
def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
|
|
181
|
+
assert_or_throw(
|
|
182
|
+
schema is None or output.schema == schema,
|
|
183
|
+
lambda: f"Output schema mismatch {output.schema} vs {schema}",
|
|
184
|
+
)
|
|
185
|
+
return output
|
|
186
|
+
|
|
187
|
+
def count(self, df: LocalDataFrame) -> int:
|
|
188
|
+
if df.is_bounded:
|
|
189
|
+
return df.count()
|
|
190
|
+
else:
|
|
191
|
+
return sum(1 for _ in df.as_array_iterable())
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
@fugue_annotated_param(
|
|
195
|
+
"[NoSchema]", "s", matcher=lambda x: False, child_can_reuse_code=True
|
|
196
|
+
)
|
|
197
|
+
class _LocalNoSchemaDataFrameParam(LocalDataFrameParam):
|
|
198
|
+
def need_schema(self) -> Optional[bool]:
|
|
199
|
+
return True
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@fugue_annotated_param(List[List[Any]])
|
|
203
|
+
class _ListListParam(_LocalNoSchemaDataFrameParam):
|
|
204
|
+
@no_type_check
|
|
205
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> List[List[Any]]:
|
|
206
|
+
return df.as_array(type_safe=True)
|
|
207
|
+
|
|
208
|
+
@no_type_check
|
|
209
|
+
def to_output_df(self, output: List[List[Any]], schema: Any, ctx: Any) -> DataFrame:
|
|
210
|
+
return ArrayDataFrame(output, schema)
|
|
211
|
+
|
|
212
|
+
@no_type_check
|
|
213
|
+
def count(self, df: List[List[Any]]) -> int:
|
|
214
|
+
return len(df)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@fugue_annotated_param(
|
|
218
|
+
Iterable[List[Any]],
|
|
219
|
+
matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]],
|
|
220
|
+
)
|
|
221
|
+
class _IterableListParam(_LocalNoSchemaDataFrameParam):
|
|
222
|
+
@no_type_check
|
|
223
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]:
|
|
224
|
+
return df.as_array_iterable(type_safe=True)
|
|
225
|
+
|
|
226
|
+
@no_type_check
|
|
227
|
+
def to_output_df(
|
|
228
|
+
self, output: Iterable[List[Any]], schema: Any, ctx: Any
|
|
229
|
+
) -> DataFrame:
|
|
230
|
+
return IterableDataFrame(output, schema)
|
|
231
|
+
|
|
232
|
+
@no_type_check
|
|
233
|
+
def count(self, df: Iterable[List[Any]]) -> int:
|
|
234
|
+
return sum(1 for _ in df)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
@fugue_annotated_param(EmptyAwareIterable[List[Any]])
|
|
238
|
+
class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
|
|
239
|
+
@no_type_check
|
|
240
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> EmptyAwareIterable[List[Any]]:
|
|
241
|
+
return make_empty_aware(df.as_array_iterable(type_safe=True))
|
|
242
|
+
|
|
243
|
+
@no_type_check
|
|
244
|
+
def to_output_df(
|
|
245
|
+
self, output: EmptyAwareIterable[List[Any]], schema: Any, ctx: Any
|
|
246
|
+
) -> DataFrame:
|
|
247
|
+
return IterableDataFrame(output, schema)
|
|
248
|
+
|
|
249
|
+
@no_type_check
|
|
250
|
+
def count(self, df: EmptyAwareIterable[List[Any]]) -> int:
|
|
251
|
+
return sum(1 for _ in df)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
@fugue_annotated_param(List[Dict[str, Any]])
|
|
255
|
+
class _ListDictParam(_LocalNoSchemaDataFrameParam):
|
|
256
|
+
@no_type_check
|
|
257
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
|
|
258
|
+
return list(df.as_local().as_dict_iterable())
|
|
259
|
+
|
|
260
|
+
@no_type_check
|
|
261
|
+
def to_output_df(
|
|
262
|
+
self, output: List[Dict[str, Any]], schema: Any, ctx: Any
|
|
263
|
+
) -> DataFrame:
|
|
264
|
+
schema = schema if isinstance(schema, Schema) else Schema(schema)
|
|
265
|
+
|
|
266
|
+
def get_all() -> Iterable[List[Any]]:
|
|
267
|
+
for row in output:
|
|
268
|
+
yield [row[x] for x in schema.names]
|
|
269
|
+
|
|
270
|
+
return IterableDataFrame(get_all(), schema)
|
|
271
|
+
|
|
272
|
+
@no_type_check
|
|
273
|
+
def count(self, df: List[Dict[str, Any]]) -> int:
|
|
274
|
+
return len(df)
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
@fugue_annotated_param(
|
|
278
|
+
Iterable[Dict[str, Any]],
|
|
279
|
+
matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]],
|
|
280
|
+
)
|
|
281
|
+
class _IterableDictParam(_LocalNoSchemaDataFrameParam):
|
|
282
|
+
@no_type_check
|
|
283
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]:
|
|
284
|
+
return df.as_dict_iterable()
|
|
285
|
+
|
|
286
|
+
@no_type_check
|
|
287
|
+
def to_output_df(
|
|
288
|
+
self, output: Iterable[Dict[str, Any]], schema: Any, ctx: Any
|
|
289
|
+
) -> DataFrame:
|
|
290
|
+
schema = schema if isinstance(schema, Schema) else Schema(schema)
|
|
291
|
+
|
|
292
|
+
def get_all() -> Iterable[List[Any]]:
|
|
293
|
+
for row in output:
|
|
294
|
+
yield [row[x] for x in schema.names]
|
|
295
|
+
|
|
296
|
+
return IterableDataFrame(get_all(), schema)
|
|
297
|
+
|
|
298
|
+
@no_type_check
|
|
299
|
+
def count(self, df: Iterable[Dict[str, Any]]) -> int:
|
|
300
|
+
return sum(1 for _ in df)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@fugue_annotated_param(EmptyAwareIterable[Dict[str, Any]])
|
|
304
|
+
class _EmptyAwareIterableDictParam(_LocalNoSchemaDataFrameParam):
|
|
305
|
+
@no_type_check
|
|
306
|
+
def to_input_data(
|
|
307
|
+
self, df: DataFrame, ctx: Any
|
|
308
|
+
) -> EmptyAwareIterable[Dict[str, Any]]:
|
|
309
|
+
return make_empty_aware(df.as_dict_iterable())
|
|
310
|
+
|
|
311
|
+
@no_type_check
|
|
312
|
+
def to_output_df(
|
|
313
|
+
self, output: EmptyAwareIterable[Dict[str, Any]], schema: Any, ctx: Any
|
|
314
|
+
) -> DataFrame:
|
|
315
|
+
schema = schema if isinstance(schema, Schema) else Schema(schema)
|
|
316
|
+
|
|
317
|
+
def get_all() -> Iterable[List[Any]]:
|
|
318
|
+
for row in output:
|
|
319
|
+
yield [row[x] for x in schema.names]
|
|
320
|
+
|
|
321
|
+
return IterableDataFrame(get_all(), schema)
|
|
322
|
+
|
|
323
|
+
@no_type_check
|
|
324
|
+
def count(self, df: EmptyAwareIterable[Dict[str, Any]]) -> int:
|
|
325
|
+
return sum(1 for _ in df)
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
@fugue_annotated_param(pd.DataFrame, "p")
|
|
329
|
+
class _PandasParam(LocalDataFrameParam):
|
|
330
|
+
@no_type_check
|
|
331
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> pd.DataFrame:
|
|
332
|
+
return df.as_pandas()
|
|
333
|
+
|
|
334
|
+
@no_type_check
|
|
335
|
+
def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
|
|
336
|
+
return PandasDataFrame(output, schema)
|
|
337
|
+
|
|
338
|
+
@no_type_check
|
|
339
|
+
def count(self, df: pd.DataFrame) -> int:
|
|
340
|
+
return df.shape[0]
|
|
341
|
+
|
|
342
|
+
def format_hint(self) -> Optional[str]:
|
|
343
|
+
return "pandas"
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
@fugue_annotated_param(
|
|
347
|
+
Iterable[pd.DataFrame],
|
|
348
|
+
matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame],
|
|
349
|
+
)
|
|
350
|
+
class _IterablePandasParam(LocalDataFrameParam):
|
|
351
|
+
@no_type_check
|
|
352
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]:
|
|
353
|
+
if not isinstance(df, LocalDataFrameIterableDataFrame):
|
|
354
|
+
yield df.as_pandas()
|
|
355
|
+
else:
|
|
356
|
+
for sub in df.native:
|
|
357
|
+
yield sub.as_pandas()
|
|
358
|
+
|
|
359
|
+
@no_type_check
|
|
360
|
+
def to_output_df(
|
|
361
|
+
self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
|
|
362
|
+
) -> DataFrame:
|
|
363
|
+
def dfs():
|
|
364
|
+
for df in output:
|
|
365
|
+
yield PandasDataFrame(df, schema)
|
|
366
|
+
|
|
367
|
+
return IterablePandasDataFrame(dfs())
|
|
368
|
+
|
|
369
|
+
@no_type_check
|
|
370
|
+
def count(self, df: Iterable[pd.DataFrame]) -> int:
|
|
371
|
+
return sum(_.shape[0] for _ in df)
|
|
372
|
+
|
|
373
|
+
def format_hint(self) -> Optional[str]:
|
|
374
|
+
return "pandas"
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
@fugue_annotated_param(pa.Table)
|
|
378
|
+
class _PyArrowTableParam(LocalDataFrameParam):
|
|
379
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
|
|
380
|
+
return df.as_arrow()
|
|
381
|
+
|
|
382
|
+
def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
|
|
383
|
+
assert isinstance(output, pa.Table)
|
|
384
|
+
return ArrowDataFrame(output, schema=schema)
|
|
385
|
+
|
|
386
|
+
def count(self, df: Any) -> int: # pragma: no cover
|
|
387
|
+
return df.count()
|
|
388
|
+
|
|
389
|
+
def format_hint(self) -> Optional[str]:
|
|
390
|
+
return "pyarrow"
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
@fugue_annotated_param(
|
|
394
|
+
Iterable[pa.Table],
|
|
395
|
+
matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table],
|
|
396
|
+
)
|
|
397
|
+
class _IterableArrowParam(LocalDataFrameParam):
|
|
398
|
+
@no_type_check
|
|
399
|
+
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
|
|
400
|
+
if not isinstance(df, LocalDataFrameIterableDataFrame):
|
|
401
|
+
yield df.as_arrow()
|
|
402
|
+
else:
|
|
403
|
+
for sub in df.native:
|
|
404
|
+
yield sub.as_arrow()
|
|
405
|
+
|
|
406
|
+
@no_type_check
|
|
407
|
+
def to_output_df(
|
|
408
|
+
self, output: Iterable[pa.Table], schema: Any, ctx: Any
|
|
409
|
+
) -> DataFrame:
|
|
410
|
+
def dfs():
|
|
411
|
+
_schema: Optional[Schema] = None if schema is None else Schema(schema)
|
|
412
|
+
for df in output:
|
|
413
|
+
adf = ArrowDataFrame(df)
|
|
414
|
+
if _schema is not None and not ( # pylint: disable-all
|
|
415
|
+
adf.schema == schema
|
|
416
|
+
):
|
|
417
|
+
adf = adf[_schema.names].alter_columns(_schema)
|
|
418
|
+
yield adf
|
|
419
|
+
|
|
420
|
+
return IterableArrowDataFrame(dfs())
|
|
421
|
+
|
|
422
|
+
@no_type_check
|
|
423
|
+
def count(self, df: Iterable[pa.Table]) -> int:
|
|
424
|
+
return sum(_.shape[0] for _ in df)
|
|
425
|
+
|
|
426
|
+
def format_hint(self) -> Optional[str]:
|
|
427
|
+
return "pyarrow"
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
@fugue_annotated_param(DataFrames, "c")
|
|
431
|
+
class _DataFramesParam(AnnotatedParam):
|
|
432
|
+
pass
|
|
@@ -97,6 +97,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
|
|
|
97
97
|
return self
|
|
98
98
|
return IterableDataFrame(self.native, new_schema)
|
|
99
99
|
|
|
100
|
+
def as_local_bounded(self) -> LocalBoundedDataFrame:
|
|
101
|
+
return ArrayDataFrame(self.as_array(), schema=self.schema)
|
|
102
|
+
|
|
100
103
|
def as_array(
|
|
101
104
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
102
105
|
) -> List[Any]:
|
fugue/dataframe/utils.py
CHANGED
|
@@ -13,11 +13,9 @@ from triad.exceptions import InvalidOperationError
|
|
|
13
13
|
from triad.utils.assertion import assert_arg_not_none
|
|
14
14
|
from triad.utils.assertion import assert_or_throw as aot
|
|
15
15
|
|
|
16
|
-
from .api import get_column_names, normalize_column_names, rename
|
|
16
|
+
from .api import get_column_names, normalize_column_names, rename, as_fugue_df
|
|
17
17
|
from .array_dataframe import ArrayDataFrame
|
|
18
|
-
from .
|
|
19
|
-
from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
|
|
20
|
-
from .iterable_dataframe import IterableDataFrame
|
|
18
|
+
from .dataframe import DataFrame, LocalBoundedDataFrame
|
|
21
19
|
from .pandas_dataframe import PandasDataFrame
|
|
22
20
|
|
|
23
21
|
# For backward compatibility, TODO: remove!
|
|
@@ -29,7 +27,7 @@ rename_dataframe_column_names = rename
|
|
|
29
27
|
def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool:
|
|
30
28
|
# should ignore the name difference of list
|
|
31
29
|
# e.g. list<item: string> == list<l: string>
|
|
32
|
-
if pa.types.is_list(t1) and pa.types.is_list(t2):
|
|
30
|
+
if pa.types.is_list(t1) and pa.types.is_list(t2): # pragma: no cover
|
|
33
31
|
return _pa_type_eq(t1.value_type, t2.value_type)
|
|
34
32
|
return t1 == t2
|
|
35
33
|
|
|
@@ -74,8 +72,11 @@ def _df_eq(
|
|
|
74
72
|
:param throw: if to throw error if not equal, defaults to False
|
|
75
73
|
:return: if they equal
|
|
76
74
|
"""
|
|
77
|
-
df1 =
|
|
78
|
-
|
|
75
|
+
df1 = as_fugue_df(df).as_local_bounded()
|
|
76
|
+
if schema is not None:
|
|
77
|
+
df2 = as_fugue_df(data, schema=schema).as_local_bounded()
|
|
78
|
+
else:
|
|
79
|
+
df2 = as_fugue_df(data).as_local_bounded()
|
|
79
80
|
try:
|
|
80
81
|
assert (
|
|
81
82
|
df1.count() == df2.count()
|
|
@@ -99,7 +100,7 @@ def _df_eq(
|
|
|
99
100
|
d1 = d1.reset_index(drop=True)
|
|
100
101
|
d2 = d2.reset_index(drop=True)
|
|
101
102
|
pd.testing.assert_frame_equal(
|
|
102
|
-
d1, d2,
|
|
103
|
+
d1, d2, rtol=0, atol=10 ** (-digits), check_dtype=False, check_exact=False
|
|
103
104
|
)
|
|
104
105
|
return True
|
|
105
106
|
except AssertionError:
|
|
@@ -108,78 +109,9 @@ def _df_eq(
|
|
|
108
109
|
return False
|
|
109
110
|
|
|
110
111
|
|
|
111
|
-
def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame:
|
|
112
|
-
"""Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`
|
|
113
|
-
|
|
114
|
-
:param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
|
|
115
|
-
list or iterable of arrays
|
|
116
|
-
:param schema: |SchemaLikeObject|, defaults to None, it should not be set for
|
|
117
|
-
:class:`~fugue.dataframe.dataframe.DataFrame` type
|
|
118
|
-
:raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
|
|
119
|
-
but you set ``schema``
|
|
120
|
-
:raises TypeError: if ``df`` is not compatible
|
|
121
|
-
:return: the dataframe itself if it's
|
|
122
|
-
:class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one
|
|
123
|
-
|
|
124
|
-
.. admonition:: Examples
|
|
125
|
-
|
|
126
|
-
>>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
|
|
127
|
-
>>> assert to_local_df(a) is a
|
|
128
|
-
>>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
|
|
129
|
-
"""
|
|
130
|
-
assert_arg_not_none(df, "df")
|
|
131
|
-
if isinstance(df, DataFrame):
|
|
132
|
-
aot(
|
|
133
|
-
schema is None,
|
|
134
|
-
ValueError("schema and metadata must be None when df is a DataFrame"),
|
|
135
|
-
)
|
|
136
|
-
return df.as_local()
|
|
137
|
-
if isinstance(df, pd.DataFrame):
|
|
138
|
-
return PandasDataFrame(df, schema)
|
|
139
|
-
if isinstance(df, pa.Table):
|
|
140
|
-
return ArrowDataFrame(df, schema)
|
|
141
|
-
if isinstance(df, List):
|
|
142
|
-
return ArrayDataFrame(df, schema)
|
|
143
|
-
if isinstance(df, Iterable):
|
|
144
|
-
return IterableDataFrame(df, schema)
|
|
145
|
-
raise TypeError(f"{df} cannot convert to a LocalDataFrame")
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def to_local_bounded_df(df: Any, schema: Any = None) -> LocalBoundedDataFrame:
|
|
149
|
-
"""Convert a data structure to
|
|
150
|
-
:class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
|
|
151
|
-
|
|
152
|
-
:param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
|
|
153
|
-
list or iterable of arrays
|
|
154
|
-
:param schema: |SchemaLikeObject|, defaults to None, it should not be set for
|
|
155
|
-
:class:`~fugue.dataframe.dataframe.DataFrame` type
|
|
156
|
-
:raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
|
|
157
|
-
but you set ``schema``
|
|
158
|
-
:raises TypeError: if ``df`` is not compatible
|
|
159
|
-
:return: the dataframe itself if it's
|
|
160
|
-
:class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one
|
|
161
|
-
|
|
162
|
-
.. admonition:: Examples
|
|
163
|
-
|
|
164
|
-
>>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
|
|
165
|
-
>>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
|
|
166
|
-
>>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
|
|
167
|
-
|
|
168
|
-
.. note::
|
|
169
|
-
|
|
170
|
-
Compared to :func:`.to_local_df`, this function makes sure the dataframe is also
|
|
171
|
-
bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will
|
|
172
|
-
be converted although it's local.
|
|
173
|
-
"""
|
|
174
|
-
df = to_local_df(df, schema)
|
|
175
|
-
if isinstance(df, LocalBoundedDataFrame):
|
|
176
|
-
return df
|
|
177
|
-
return ArrayDataFrame(df.as_array(), df.schema)
|
|
178
|
-
|
|
179
|
-
|
|
180
112
|
def pickle_df(df: DataFrame) -> bytes:
|
|
181
113
|
"""Pickles a dataframe to bytes array. It firstly converts the dataframe
|
|
182
|
-
|
|
114
|
+
local bounded, and then serialize the underlying data.
|
|
183
115
|
|
|
184
116
|
:param df: input DataFrame
|
|
185
117
|
:return: pickled binary data
|
|
@@ -189,7 +121,7 @@ def pickle_df(df: DataFrame) -> bytes:
|
|
|
189
121
|
Be careful to use on large dataframes or non-local, un-materialized dataframes,
|
|
190
122
|
it can be slow. You should always use :func:`.unpickle_df` to deserialize.
|
|
191
123
|
"""
|
|
192
|
-
df =
|
|
124
|
+
df = df.as_local_bounded()
|
|
193
125
|
o: List[Any] = [df.schema]
|
|
194
126
|
if isinstance(df, PandasDataFrame):
|
|
195
127
|
o.append("p")
|
fugue/dataset/api.py
CHANGED
|
@@ -41,8 +41,6 @@ def as_local(data: AnyDataset) -> AnyDataset:
|
|
|
41
41
|
|
|
42
42
|
:param data: the dataset that can be recognized by Fugue
|
|
43
43
|
"""
|
|
44
|
-
if isinstance(data, Dataset) and data.is_local:
|
|
45
|
-
return data
|
|
46
44
|
return as_local_bounded(data)
|
|
47
45
|
|
|
48
46
|
|
|
@@ -52,8 +50,6 @@ def as_local_bounded(data: AnyDataset) -> AnyDataset:
|
|
|
52
50
|
|
|
53
51
|
:param data: the dataset that can be recognized by Fugue
|
|
54
52
|
"""
|
|
55
|
-
if isinstance(data, Dataset) and data.is_local and data.is_bounded:
|
|
56
|
-
return data
|
|
57
53
|
raise NotImplementedError(
|
|
58
54
|
f"no registered function to convert {type(data)} to a local bounded dataset"
|
|
59
55
|
)
|
fugue/dev.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
All modeuls for developing and extending Fugue
|
|
3
|
+
"""
|
|
4
|
+
# flake8: noqa
|
|
5
|
+
# pylint: disable-all
|
|
6
|
+
|
|
7
|
+
from triad.collections.function_wrapper import AnnotatedParam
|
|
8
|
+
|
|
9
|
+
from fugue.bag.bag import BagDisplay
|
|
10
|
+
from fugue.collections.partition import PartitionCursor, PartitionSpec
|
|
11
|
+
from fugue.collections.sql import StructuredRawSQL, TempTableName
|
|
12
|
+
from fugue.collections.yielded import PhysicalYielded, Yielded
|
|
13
|
+
from fugue.dataframe.function_wrapper import (
|
|
14
|
+
DataFrameFunctionWrapper,
|
|
15
|
+
DataFrameParam,
|
|
16
|
+
LocalDataFrameParam,
|
|
17
|
+
fugue_annotated_param,
|
|
18
|
+
)
|
|
19
|
+
from fugue.dataset import DatasetDisplay
|
|
20
|
+
from fugue.execution.execution_engine import (
|
|
21
|
+
EngineFacet,
|
|
22
|
+
ExecutionEngineParam,
|
|
23
|
+
MapEngine,
|
|
24
|
+
SQLEngine,
|
|
25
|
+
)
|
|
26
|
+
from fugue.execution.factory import (
|
|
27
|
+
is_pandas_or,
|
|
28
|
+
make_execution_engine,
|
|
29
|
+
make_sql_engine,
|
|
30
|
+
register_default_execution_engine,
|
|
31
|
+
register_default_sql_engine,
|
|
32
|
+
register_execution_engine,
|
|
33
|
+
register_sql_engine,
|
|
34
|
+
)
|
|
35
|
+
from fugue.execution.native_execution_engine import PandasMapEngine, QPDPandasEngine
|
|
36
|
+
from fugue.rpc import (
|
|
37
|
+
EmptyRPCHandler,
|
|
38
|
+
RPCClient,
|
|
39
|
+
RPCFunc,
|
|
40
|
+
RPCHandler,
|
|
41
|
+
RPCServer,
|
|
42
|
+
make_rpc_server,
|
|
43
|
+
to_rpc_handler,
|
|
44
|
+
)
|
|
45
|
+
from fugue.workflow._workflow_context import FugueWorkflowContext
|
|
46
|
+
from fugue.workflow.module import module
|
|
47
|
+
from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames
|
fugue/execution/__init__.py
CHANGED
|
@@ -10,8 +10,4 @@ from .factory import (
|
|
|
10
10
|
register_execution_engine,
|
|
11
11
|
register_sql_engine,
|
|
12
12
|
)
|
|
13
|
-
from .native_execution_engine import
|
|
14
|
-
NativeExecutionEngine,
|
|
15
|
-
QPDPandasEngine,
|
|
16
|
-
SqliteEngine,
|
|
17
|
-
)
|
|
13
|
+
from .native_execution_engine import NativeExecutionEngine, QPDPandasEngine
|