fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. fugue/__init__.py +9 -5
  2. fugue/_utils/interfaceless.py +1 -558
  3. fugue/_utils/io.py +2 -91
  4. fugue/_utils/registry.py +3 -2
  5. fugue/api.py +1 -0
  6. fugue/bag/bag.py +8 -4
  7. fugue/collections/__init__.py +0 -7
  8. fugue/collections/partition.py +21 -9
  9. fugue/constants.py +3 -1
  10. fugue/dataframe/__init__.py +7 -8
  11. fugue/dataframe/arrow_dataframe.py +1 -2
  12. fugue/dataframe/dataframe.py +17 -18
  13. fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
  14. fugue/dataframe/function_wrapper.py +432 -0
  15. fugue/dataframe/iterable_dataframe.py +3 -0
  16. fugue/dataframe/utils.py +11 -79
  17. fugue/dataset/api.py +0 -4
  18. fugue/dev.py +47 -0
  19. fugue/execution/__init__.py +1 -5
  20. fugue/execution/api.py +36 -14
  21. fugue/execution/execution_engine.py +30 -4
  22. fugue/execution/factory.py +0 -6
  23. fugue/execution/native_execution_engine.py +44 -67
  24. fugue/extensions/_builtins/creators.py +4 -2
  25. fugue/extensions/_builtins/outputters.py +4 -3
  26. fugue/extensions/_builtins/processors.py +3 -3
  27. fugue/extensions/creator/convert.py +5 -2
  28. fugue/extensions/outputter/convert.py +2 -2
  29. fugue/extensions/processor/convert.py +3 -2
  30. fugue/extensions/transformer/convert.py +22 -9
  31. fugue/extensions/transformer/transformer.py +15 -1
  32. fugue/plugins.py +2 -0
  33. fugue/registry.py +0 -39
  34. fugue/sql/_utils.py +1 -1
  35. fugue/workflow/_checkpoint.py +1 -1
  36. fugue/workflow/api.py +13 -13
  37. fugue/workflow/module.py +30 -37
  38. fugue/workflow/workflow.py +6 -0
  39. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
  40. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
  41. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
  42. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
  43. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
  44. fugue_contrib/contrib.py +1 -0
  45. fugue_contrib/viz/_ext.py +7 -1
  46. fugue_dask/_io.py +0 -13
  47. fugue_dask/_utils.py +10 -4
  48. fugue_dask/dataframe.py +1 -2
  49. fugue_dask/execution_engine.py +45 -18
  50. fugue_dask/registry.py +8 -33
  51. fugue_duckdb/_io.py +8 -2
  52. fugue_duckdb/_utils.py +7 -2
  53. fugue_duckdb/dask.py +1 -1
  54. fugue_duckdb/dataframe.py +23 -19
  55. fugue_duckdb/execution_engine.py +19 -22
  56. fugue_duckdb/registry.py +11 -34
  57. fugue_ibis/dataframe.py +6 -10
  58. fugue_ibis/execution_engine.py +7 -1
  59. fugue_notebook/env.py +5 -10
  60. fugue_polars/__init__.py +2 -0
  61. fugue_polars/_utils.py +8 -0
  62. fugue_polars/polars_dataframe.py +234 -0
  63. fugue_polars/registry.py +86 -0
  64. fugue_ray/_constants.py +10 -1
  65. fugue_ray/_utils/dataframe.py +36 -9
  66. fugue_ray/_utils/io.py +2 -4
  67. fugue_ray/dataframe.py +16 -12
  68. fugue_ray/execution_engine.py +53 -32
  69. fugue_ray/registry.py +8 -32
  70. fugue_spark/_utils/convert.py +22 -11
  71. fugue_spark/_utils/io.py +0 -13
  72. fugue_spark/_utils/misc.py +27 -0
  73. fugue_spark/_utils/partition.py +11 -18
  74. fugue_spark/dataframe.py +26 -22
  75. fugue_spark/execution_engine.py +136 -54
  76. fugue_spark/registry.py +29 -78
  77. fugue_test/builtin_suite.py +36 -14
  78. fugue_test/dataframe_suite.py +9 -5
  79. fugue_test/execution_suite.py +100 -122
  80. fugue_version/__init__.py +1 -1
  81. tests/fugue/bag/test_array_bag.py +0 -9
  82. tests/fugue/collections/test_partition.py +10 -3
  83. tests/fugue/dataframe/test_function_wrapper.py +293 -0
  84. tests/fugue/dataframe/test_utils.py +2 -34
  85. tests/fugue/execution/test_factory.py +7 -9
  86. tests/fugue/execution/test_naive_execution_engine.py +35 -80
  87. tests/fugue/extensions/test_utils.py +12 -7
  88. tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
  89. tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
  90. tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
  91. tests/fugue/sql/test_workflow.py +1 -1
  92. tests/fugue/sql/test_workflow_parse.py +3 -5
  93. tests/fugue/utils/test_interfaceless.py +1 -325
  94. tests/fugue/utils/test_io.py +0 -80
  95. tests/fugue_dask/test_execution_engine.py +48 -0
  96. tests/fugue_dask/test_io.py +0 -55
  97. tests/fugue_duckdb/test_dataframe.py +2 -2
  98. tests/fugue_duckdb/test_execution_engine.py +16 -1
  99. tests/fugue_duckdb/test_utils.py +1 -1
  100. tests/fugue_ibis/test_dataframe.py +6 -3
  101. tests/fugue_polars/__init__.py +0 -0
  102. tests/fugue_polars/test_api.py +13 -0
  103. tests/fugue_polars/test_dataframe.py +82 -0
  104. tests/fugue_polars/test_transform.py +100 -0
  105. tests/fugue_ray/test_execution_engine.py +40 -4
  106. tests/fugue_spark/test_dataframe.py +0 -8
  107. tests/fugue_spark/test_execution_engine.py +50 -11
  108. tests/fugue_spark/test_importless.py +4 -4
  109. tests/fugue_spark/test_spark_connect.py +82 -0
  110. tests/fugue_spark/utils/test_convert.py +6 -8
  111. tests/fugue_spark/utils/test_io.py +0 -17
  112. fugue/_utils/register.py +0 -3
  113. fugue_test/_utils.py +0 -13
  114. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
@@ -0,0 +1,432 @@
1
+ import inspect
2
+ from typing import (
3
+ Any,
4
+ Callable,
5
+ Dict,
6
+ Iterable,
7
+ Iterator,
8
+ List,
9
+ Optional,
10
+ no_type_check,
11
+ )
12
+
13
+ import pandas as pd
14
+ import pyarrow as pa
15
+ from triad import Schema, assert_or_throw
16
+ from triad.collections.function_wrapper import (
17
+ AnnotatedParam,
18
+ FunctionWrapper,
19
+ KeywordParam,
20
+ PositionalParam,
21
+ function_wrapper,
22
+ )
23
+ from triad.utils.iter import EmptyAwareIterable, make_empty_aware
24
+
25
+ from ..constants import FUGUE_ENTRYPOINT
26
+ from .array_dataframe import ArrayDataFrame
27
+ from .arrow_dataframe import ArrowDataFrame
28
+ from .dataframe import DataFrame, LocalDataFrame
29
+ from .dataframe_iterable_dataframe import (
30
+ IterableArrowDataFrame,
31
+ IterablePandasDataFrame,
32
+ LocalDataFrameIterableDataFrame,
33
+ )
34
+ from .dataframes import DataFrames
35
+ from .iterable_dataframe import IterableDataFrame
36
+ from .pandas_dataframe import PandasDataFrame
37
+
38
+
39
+ @function_wrapper(FUGUE_ENTRYPOINT)
40
+ class DataFrameFunctionWrapper(FunctionWrapper):
41
+ @property
42
+ def need_output_schema(self) -> Optional[bool]:
43
+ return (
44
+ self._rt.need_schema()
45
+ if isinstance(self._rt, _DataFrameParamBase)
46
+ else False
47
+ )
48
+
49
+ def get_format_hint(self) -> Optional[str]:
50
+ for v in self._params.values():
51
+ if isinstance(v, _DataFrameParamBase):
52
+ if v.format_hint() is not None:
53
+ return v.format_hint()
54
+ if isinstance(self._rt, _DataFrameParamBase):
55
+ return self._rt.format_hint()
56
+ return None
57
+
58
+ def run( # noqa: C901
59
+ self,
60
+ args: List[Any],
61
+ kwargs: Dict[str, Any],
62
+ ignore_unknown: bool = False,
63
+ output_schema: Any = None,
64
+ output: bool = True,
65
+ ctx: Any = None,
66
+ ) -> Any:
67
+ p: Dict[str, Any] = {}
68
+ for i in range(len(args)):
69
+ p[self._params.get_key_by_index(i)] = args[i]
70
+ p.update(kwargs)
71
+ has_kw = False
72
+ rargs: Dict[str, Any] = {}
73
+ for k, v in self._params.items():
74
+ if isinstance(v, (PositionalParam, KeywordParam)):
75
+ if isinstance(v, KeywordParam):
76
+ has_kw = True
77
+ elif k in p:
78
+ if isinstance(v, _DataFrameParamBase):
79
+ assert_or_throw(
80
+ isinstance(p[k], DataFrame),
81
+ lambda: TypeError(f"{p[k]} is not a DataFrame"),
82
+ )
83
+ rargs[k] = v.to_input_data(p[k], ctx=ctx)
84
+ else:
85
+ rargs[k] = p[k] # TODO: should we do auto type conversion?
86
+ del p[k]
87
+ elif v.required:
88
+ raise ValueError(f"{k} is required by not given")
89
+ if has_kw:
90
+ rargs.update(p)
91
+ elif not ignore_unknown and len(p) > 0:
92
+ raise ValueError(f"{p} are not acceptable parameters")
93
+ rt = self._func(**rargs)
94
+ if not output:
95
+ if isinstance(self._rt, _DataFrameParamBase):
96
+ self._rt.count(rt)
97
+ return
98
+ if isinstance(self._rt, _DataFrameParamBase):
99
+ return self._rt.to_output_df(rt, output_schema, ctx=ctx)
100
+ return rt
101
+
102
+
103
+ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
104
+
105
+
106
+ @fugue_annotated_param(
107
+ "Callable",
108
+ "F",
109
+ lambda annotation: (
110
+ annotation == Callable
111
+ or annotation == callable # pylint: disable=comparison-with-callable
112
+ or str(annotation).startswith("typing.Callable")
113
+ ),
114
+ )
115
+ class _CallableParam(AnnotatedParam):
116
+ pass
117
+
118
+
119
+ @fugue_annotated_param(
120
+ "Callable",
121
+ "f",
122
+ lambda annotation: (
123
+ annotation == Optional[Callable]
124
+ or annotation == Optional[callable]
125
+ or str(annotation).startswith("typing.Union[typing.Callable") # 3.8-
126
+ or str(annotation).startswith("typing.Optional[typing.Callable") # 3.9+
127
+ ),
128
+ )
129
+ class _OptionalCallableParam(AnnotatedParam):
130
+ pass
131
+
132
+
133
+ class _DataFrameParamBase(AnnotatedParam):
134
+ def __init__(self, param: Optional[inspect.Parameter]):
135
+ super().__init__(param)
136
+ assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
137
+
138
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Any: # pragma: no cover
139
+ raise NotImplementedError
140
+
141
+ def to_output_df(
142
+ self, df: Any, schema: Any, ctx: Any
143
+ ) -> DataFrame: # pragma: no cover
144
+ raise NotImplementedError
145
+
146
+ def count(self, df: Any) -> int: # pragma: no cover
147
+ raise NotImplementedError
148
+
149
+ def need_schema(self) -> Optional[bool]:
150
+ return False
151
+
152
+ def format_hint(self) -> Optional[str]:
153
+ return None
154
+
155
+
156
+ @fugue_annotated_param(DataFrame, "d", child_can_reuse_code=True)
157
+ class DataFrameParam(_DataFrameParamBase):
158
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
159
+ return df
160
+
161
+ def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
162
+ assert_or_throw(
163
+ schema is None or output.schema == schema,
164
+ lambda: f"Output schema mismatch {output.schema} vs {schema}",
165
+ )
166
+ return output
167
+
168
+ def count(self, df: Any) -> int:
169
+ if df.is_bounded:
170
+ return df.count()
171
+ else:
172
+ return sum(1 for _ in df.as_array_iterable())
173
+
174
+
175
+ @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
176
+ class LocalDataFrameParam(DataFrameParam):
177
+ def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
178
+ return df.as_local()
179
+
180
+ def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
181
+ assert_or_throw(
182
+ schema is None or output.schema == schema,
183
+ lambda: f"Output schema mismatch {output.schema} vs {schema}",
184
+ )
185
+ return output
186
+
187
+ def count(self, df: LocalDataFrame) -> int:
188
+ if df.is_bounded:
189
+ return df.count()
190
+ else:
191
+ return sum(1 for _ in df.as_array_iterable())
192
+
193
+
194
+ @fugue_annotated_param(
195
+ "[NoSchema]", "s", matcher=lambda x: False, child_can_reuse_code=True
196
+ )
197
+ class _LocalNoSchemaDataFrameParam(LocalDataFrameParam):
198
+ def need_schema(self) -> Optional[bool]:
199
+ return True
200
+
201
+
202
+ @fugue_annotated_param(List[List[Any]])
203
+ class _ListListParam(_LocalNoSchemaDataFrameParam):
204
+ @no_type_check
205
+ def to_input_data(self, df: DataFrame, ctx: Any) -> List[List[Any]]:
206
+ return df.as_array(type_safe=True)
207
+
208
+ @no_type_check
209
+ def to_output_df(self, output: List[List[Any]], schema: Any, ctx: Any) -> DataFrame:
210
+ return ArrayDataFrame(output, schema)
211
+
212
+ @no_type_check
213
+ def count(self, df: List[List[Any]]) -> int:
214
+ return len(df)
215
+
216
+
217
+ @fugue_annotated_param(
218
+ Iterable[List[Any]],
219
+ matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]],
220
+ )
221
+ class _IterableListParam(_LocalNoSchemaDataFrameParam):
222
+ @no_type_check
223
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]:
224
+ return df.as_array_iterable(type_safe=True)
225
+
226
+ @no_type_check
227
+ def to_output_df(
228
+ self, output: Iterable[List[Any]], schema: Any, ctx: Any
229
+ ) -> DataFrame:
230
+ return IterableDataFrame(output, schema)
231
+
232
+ @no_type_check
233
+ def count(self, df: Iterable[List[Any]]) -> int:
234
+ return sum(1 for _ in df)
235
+
236
+
237
+ @fugue_annotated_param(EmptyAwareIterable[List[Any]])
238
+ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
239
+ @no_type_check
240
+ def to_input_data(self, df: DataFrame, ctx: Any) -> EmptyAwareIterable[List[Any]]:
241
+ return make_empty_aware(df.as_array_iterable(type_safe=True))
242
+
243
+ @no_type_check
244
+ def to_output_df(
245
+ self, output: EmptyAwareIterable[List[Any]], schema: Any, ctx: Any
246
+ ) -> DataFrame:
247
+ return IterableDataFrame(output, schema)
248
+
249
+ @no_type_check
250
+ def count(self, df: EmptyAwareIterable[List[Any]]) -> int:
251
+ return sum(1 for _ in df)
252
+
253
+
254
+ @fugue_annotated_param(List[Dict[str, Any]])
255
+ class _ListDictParam(_LocalNoSchemaDataFrameParam):
256
+ @no_type_check
257
+ def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
258
+ return list(df.as_local().as_dict_iterable())
259
+
260
+ @no_type_check
261
+ def to_output_df(
262
+ self, output: List[Dict[str, Any]], schema: Any, ctx: Any
263
+ ) -> DataFrame:
264
+ schema = schema if isinstance(schema, Schema) else Schema(schema)
265
+
266
+ def get_all() -> Iterable[List[Any]]:
267
+ for row in output:
268
+ yield [row[x] for x in schema.names]
269
+
270
+ return IterableDataFrame(get_all(), schema)
271
+
272
+ @no_type_check
273
+ def count(self, df: List[Dict[str, Any]]) -> int:
274
+ return len(df)
275
+
276
+
277
+ @fugue_annotated_param(
278
+ Iterable[Dict[str, Any]],
279
+ matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]],
280
+ )
281
+ class _IterableDictParam(_LocalNoSchemaDataFrameParam):
282
+ @no_type_check
283
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]:
284
+ return df.as_dict_iterable()
285
+
286
+ @no_type_check
287
+ def to_output_df(
288
+ self, output: Iterable[Dict[str, Any]], schema: Any, ctx: Any
289
+ ) -> DataFrame:
290
+ schema = schema if isinstance(schema, Schema) else Schema(schema)
291
+
292
+ def get_all() -> Iterable[List[Any]]:
293
+ for row in output:
294
+ yield [row[x] for x in schema.names]
295
+
296
+ return IterableDataFrame(get_all(), schema)
297
+
298
+ @no_type_check
299
+ def count(self, df: Iterable[Dict[str, Any]]) -> int:
300
+ return sum(1 for _ in df)
301
+
302
+
303
+ @fugue_annotated_param(EmptyAwareIterable[Dict[str, Any]])
304
+ class _EmptyAwareIterableDictParam(_LocalNoSchemaDataFrameParam):
305
+ @no_type_check
306
+ def to_input_data(
307
+ self, df: DataFrame, ctx: Any
308
+ ) -> EmptyAwareIterable[Dict[str, Any]]:
309
+ return make_empty_aware(df.as_dict_iterable())
310
+
311
+ @no_type_check
312
+ def to_output_df(
313
+ self, output: EmptyAwareIterable[Dict[str, Any]], schema: Any, ctx: Any
314
+ ) -> DataFrame:
315
+ schema = schema if isinstance(schema, Schema) else Schema(schema)
316
+
317
+ def get_all() -> Iterable[List[Any]]:
318
+ for row in output:
319
+ yield [row[x] for x in schema.names]
320
+
321
+ return IterableDataFrame(get_all(), schema)
322
+
323
+ @no_type_check
324
+ def count(self, df: EmptyAwareIterable[Dict[str, Any]]) -> int:
325
+ return sum(1 for _ in df)
326
+
327
+
328
+ @fugue_annotated_param(pd.DataFrame, "p")
329
+ class _PandasParam(LocalDataFrameParam):
330
+ @no_type_check
331
+ def to_input_data(self, df: DataFrame, ctx: Any) -> pd.DataFrame:
332
+ return df.as_pandas()
333
+
334
+ @no_type_check
335
+ def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
336
+ return PandasDataFrame(output, schema)
337
+
338
+ @no_type_check
339
+ def count(self, df: pd.DataFrame) -> int:
340
+ return df.shape[0]
341
+
342
+ def format_hint(self) -> Optional[str]:
343
+ return "pandas"
344
+
345
+
346
+ @fugue_annotated_param(
347
+ Iterable[pd.DataFrame],
348
+ matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame],
349
+ )
350
+ class _IterablePandasParam(LocalDataFrameParam):
351
+ @no_type_check
352
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]:
353
+ if not isinstance(df, LocalDataFrameIterableDataFrame):
354
+ yield df.as_pandas()
355
+ else:
356
+ for sub in df.native:
357
+ yield sub.as_pandas()
358
+
359
+ @no_type_check
360
+ def to_output_df(
361
+ self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
362
+ ) -> DataFrame:
363
+ def dfs():
364
+ for df in output:
365
+ yield PandasDataFrame(df, schema)
366
+
367
+ return IterablePandasDataFrame(dfs())
368
+
369
+ @no_type_check
370
+ def count(self, df: Iterable[pd.DataFrame]) -> int:
371
+ return sum(_.shape[0] for _ in df)
372
+
373
+ def format_hint(self) -> Optional[str]:
374
+ return "pandas"
375
+
376
+
377
+ @fugue_annotated_param(pa.Table)
378
+ class _PyArrowTableParam(LocalDataFrameParam):
379
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Any:
380
+ return df.as_arrow()
381
+
382
+ def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
383
+ assert isinstance(output, pa.Table)
384
+ return ArrowDataFrame(output, schema=schema)
385
+
386
+ def count(self, df: Any) -> int: # pragma: no cover
387
+ return df.count()
388
+
389
+ def format_hint(self) -> Optional[str]:
390
+ return "pyarrow"
391
+
392
+
393
+ @fugue_annotated_param(
394
+ Iterable[pa.Table],
395
+ matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table],
396
+ )
397
+ class _IterableArrowParam(LocalDataFrameParam):
398
+ @no_type_check
399
+ def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
400
+ if not isinstance(df, LocalDataFrameIterableDataFrame):
401
+ yield df.as_arrow()
402
+ else:
403
+ for sub in df.native:
404
+ yield sub.as_arrow()
405
+
406
+ @no_type_check
407
+ def to_output_df(
408
+ self, output: Iterable[pa.Table], schema: Any, ctx: Any
409
+ ) -> DataFrame:
410
+ def dfs():
411
+ _schema: Optional[Schema] = None if schema is None else Schema(schema)
412
+ for df in output:
413
+ adf = ArrowDataFrame(df)
414
+ if _schema is not None and not ( # pylint: disable-all
415
+ adf.schema == schema
416
+ ):
417
+ adf = adf[_schema.names].alter_columns(_schema)
418
+ yield adf
419
+
420
+ return IterableArrowDataFrame(dfs())
421
+
422
+ @no_type_check
423
+ def count(self, df: Iterable[pa.Table]) -> int:
424
+ return sum(_.shape[0] for _ in df)
425
+
426
+ def format_hint(self) -> Optional[str]:
427
+ return "pyarrow"
428
+
429
+
430
+ @fugue_annotated_param(DataFrames, "c")
431
+ class _DataFramesParam(AnnotatedParam):
432
+ pass
@@ -97,6 +97,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
97
97
  return self
98
98
  return IterableDataFrame(self.native, new_schema)
99
99
 
100
+ def as_local_bounded(self) -> LocalBoundedDataFrame:
101
+ return ArrayDataFrame(self.as_array(), schema=self.schema)
102
+
100
103
  def as_array(
101
104
  self, columns: Optional[List[str]] = None, type_safe: bool = False
102
105
  ) -> List[Any]:
fugue/dataframe/utils.py CHANGED
@@ -13,11 +13,9 @@ from triad.exceptions import InvalidOperationError
13
13
  from triad.utils.assertion import assert_arg_not_none
14
14
  from triad.utils.assertion import assert_or_throw as aot
15
15
 
16
- from .api import get_column_names, normalize_column_names, rename
16
+ from .api import get_column_names, normalize_column_names, rename, as_fugue_df
17
17
  from .array_dataframe import ArrayDataFrame
18
- from .arrow_dataframe import ArrowDataFrame
19
- from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
20
- from .iterable_dataframe import IterableDataFrame
18
+ from .dataframe import DataFrame, LocalBoundedDataFrame
21
19
  from .pandas_dataframe import PandasDataFrame
22
20
 
23
21
  # For backward compatibility, TODO: remove!
@@ -29,7 +27,7 @@ rename_dataframe_column_names = rename
29
27
  def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool:
30
28
  # should ignore the name difference of list
31
29
  # e.g. list<item: string> == list<l: string>
32
- if pa.types.is_list(t1) and pa.types.is_list(t2):
30
+ if pa.types.is_list(t1) and pa.types.is_list(t2): # pragma: no cover
33
31
  return _pa_type_eq(t1.value_type, t2.value_type)
34
32
  return t1 == t2
35
33
 
@@ -74,8 +72,11 @@ def _df_eq(
74
72
  :param throw: if to throw error if not equal, defaults to False
75
73
  :return: if they equal
76
74
  """
77
- df1 = to_local_bounded_df(df)
78
- df2 = to_local_bounded_df(data, schema)
75
+ df1 = as_fugue_df(df).as_local_bounded()
76
+ if schema is not None:
77
+ df2 = as_fugue_df(data, schema=schema).as_local_bounded()
78
+ else:
79
+ df2 = as_fugue_df(data).as_local_bounded()
79
80
  try:
80
81
  assert (
81
82
  df1.count() == df2.count()
@@ -99,7 +100,7 @@ def _df_eq(
99
100
  d1 = d1.reset_index(drop=True)
100
101
  d2 = d2.reset_index(drop=True)
101
102
  pd.testing.assert_frame_equal(
102
- d1, d2, check_less_precise=digits, check_dtype=False
103
+ d1, d2, rtol=0, atol=10 ** (-digits), check_dtype=False, check_exact=False
103
104
  )
104
105
  return True
105
106
  except AssertionError:
@@ -108,78 +109,9 @@ def _df_eq(
108
109
  return False
109
110
 
110
111
 
111
- def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame:
112
- """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`
113
-
114
- :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
115
- list or iterable of arrays
116
- :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
117
- :class:`~fugue.dataframe.dataframe.DataFrame` type
118
- :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
119
- but you set ``schema``
120
- :raises TypeError: if ``df`` is not compatible
121
- :return: the dataframe itself if it's
122
- :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one
123
-
124
- .. admonition:: Examples
125
-
126
- >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
127
- >>> assert to_local_df(a) is a
128
- >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
129
- """
130
- assert_arg_not_none(df, "df")
131
- if isinstance(df, DataFrame):
132
- aot(
133
- schema is None,
134
- ValueError("schema and metadata must be None when df is a DataFrame"),
135
- )
136
- return df.as_local()
137
- if isinstance(df, pd.DataFrame):
138
- return PandasDataFrame(df, schema)
139
- if isinstance(df, pa.Table):
140
- return ArrowDataFrame(df, schema)
141
- if isinstance(df, List):
142
- return ArrayDataFrame(df, schema)
143
- if isinstance(df, Iterable):
144
- return IterableDataFrame(df, schema)
145
- raise TypeError(f"{df} cannot convert to a LocalDataFrame")
146
-
147
-
148
- def to_local_bounded_df(df: Any, schema: Any = None) -> LocalBoundedDataFrame:
149
- """Convert a data structure to
150
- :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
151
-
152
- :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
153
- list or iterable of arrays
154
- :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
155
- :class:`~fugue.dataframe.dataframe.DataFrame` type
156
- :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
157
- but you set ``schema``
158
- :raises TypeError: if ``df`` is not compatible
159
- :return: the dataframe itself if it's
160
- :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one
161
-
162
- .. admonition:: Examples
163
-
164
- >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
165
- >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
166
- >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
167
-
168
- .. note::
169
-
170
- Compared to :func:`.to_local_df`, this function makes sure the dataframe is also
171
- bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will
172
- be converted although it's local.
173
- """
174
- df = to_local_df(df, schema)
175
- if isinstance(df, LocalBoundedDataFrame):
176
- return df
177
- return ArrayDataFrame(df.as_array(), df.schema)
178
-
179
-
180
112
  def pickle_df(df: DataFrame) -> bytes:
181
113
  """Pickles a dataframe to bytes array. It firstly converts the dataframe
182
- using :func:`.to_local_bounded_df`, and then serialize the underlying data.
114
+ to a local bounded dataframe, and then serializes the underlying data.
183
115
 
184
116
  :param df: input DataFrame
185
117
  :return: pickled binary data
@@ -189,7 +121,7 @@ def pickle_df(df: DataFrame) -> bytes:
189
121
  Be careful to use on large dataframes or non-local, un-materialized dataframes,
190
122
  it can be slow. You should always use :func:`.unpickle_df` to deserialize.
191
123
  """
192
- df = to_local_bounded_df(df)
124
+ df = df.as_local_bounded()
193
125
  o: List[Any] = [df.schema]
194
126
  if isinstance(df, PandasDataFrame):
195
127
  o.append("p")
fugue/dataset/api.py CHANGED
@@ -41,8 +41,6 @@ def as_local(data: AnyDataset) -> AnyDataset:
41
41
 
42
42
  :param data: the dataset that can be recognized by Fugue
43
43
  """
44
- if isinstance(data, Dataset) and data.is_local:
45
- return data
46
44
  return as_local_bounded(data)
47
45
 
48
46
 
@@ -52,8 +50,6 @@ def as_local_bounded(data: AnyDataset) -> AnyDataset:
52
50
 
53
51
  :param data: the dataset that can be recognized by Fugue
54
52
  """
55
- if isinstance(data, Dataset) and data.is_local and data.is_bounded:
56
- return data
57
53
  raise NotImplementedError(
58
54
  f"no registered function to convert {type(data)} to a local bounded dataset"
59
55
  )
fugue/dev.py ADDED
@@ -0,0 +1,47 @@
1
+ """
2
+ All modules for developing and extending Fugue
3
+ """
4
+ # flake8: noqa
5
+ # pylint: disable-all
6
+
7
+ from triad.collections.function_wrapper import AnnotatedParam
8
+
9
+ from fugue.bag.bag import BagDisplay
10
+ from fugue.collections.partition import PartitionCursor, PartitionSpec
11
+ from fugue.collections.sql import StructuredRawSQL, TempTableName
12
+ from fugue.collections.yielded import PhysicalYielded, Yielded
13
+ from fugue.dataframe.function_wrapper import (
14
+ DataFrameFunctionWrapper,
15
+ DataFrameParam,
16
+ LocalDataFrameParam,
17
+ fugue_annotated_param,
18
+ )
19
+ from fugue.dataset import DatasetDisplay
20
+ from fugue.execution.execution_engine import (
21
+ EngineFacet,
22
+ ExecutionEngineParam,
23
+ MapEngine,
24
+ SQLEngine,
25
+ )
26
+ from fugue.execution.factory import (
27
+ is_pandas_or,
28
+ make_execution_engine,
29
+ make_sql_engine,
30
+ register_default_execution_engine,
31
+ register_default_sql_engine,
32
+ register_execution_engine,
33
+ register_sql_engine,
34
+ )
35
+ from fugue.execution.native_execution_engine import PandasMapEngine, QPDPandasEngine
36
+ from fugue.rpc import (
37
+ EmptyRPCHandler,
38
+ RPCClient,
39
+ RPCFunc,
40
+ RPCHandler,
41
+ RPCServer,
42
+ make_rpc_server,
43
+ to_rpc_handler,
44
+ )
45
+ from fugue.workflow._workflow_context import FugueWorkflowContext
46
+ from fugue.workflow.module import module
47
+ from fugue.workflow.workflow import FugueWorkflow, WorkflowDataFrame, WorkflowDataFrames
@@ -10,8 +10,4 @@ from .factory import (
10
10
  register_execution_engine,
11
11
  register_sql_engine,
12
12
  )
13
- from .native_execution_engine import (
14
- NativeExecutionEngine,
15
- QPDPandasEngine,
16
- SqliteEngine,
17
- )
13
+ from .native_execution_engine import NativeExecutionEngine, QPDPandasEngine