fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. fugue/__init__.py +9 -5
  2. fugue/_utils/interfaceless.py +1 -558
  3. fugue/_utils/io.py +2 -91
  4. fugue/_utils/registry.py +3 -2
  5. fugue/api.py +1 -0
  6. fugue/bag/bag.py +8 -4
  7. fugue/collections/__init__.py +0 -7
  8. fugue/collections/partition.py +21 -9
  9. fugue/constants.py +3 -1
  10. fugue/dataframe/__init__.py +7 -8
  11. fugue/dataframe/arrow_dataframe.py +1 -2
  12. fugue/dataframe/dataframe.py +17 -18
  13. fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
  14. fugue/dataframe/function_wrapper.py +432 -0
  15. fugue/dataframe/iterable_dataframe.py +3 -0
  16. fugue/dataframe/utils.py +11 -79
  17. fugue/dataset/api.py +0 -4
  18. fugue/dev.py +47 -0
  19. fugue/execution/__init__.py +1 -5
  20. fugue/execution/api.py +36 -14
  21. fugue/execution/execution_engine.py +30 -4
  22. fugue/execution/factory.py +0 -6
  23. fugue/execution/native_execution_engine.py +44 -67
  24. fugue/extensions/_builtins/creators.py +4 -2
  25. fugue/extensions/_builtins/outputters.py +4 -3
  26. fugue/extensions/_builtins/processors.py +3 -3
  27. fugue/extensions/creator/convert.py +5 -2
  28. fugue/extensions/outputter/convert.py +2 -2
  29. fugue/extensions/processor/convert.py +3 -2
  30. fugue/extensions/transformer/convert.py +22 -9
  31. fugue/extensions/transformer/transformer.py +15 -1
  32. fugue/plugins.py +2 -0
  33. fugue/registry.py +0 -39
  34. fugue/sql/_utils.py +1 -1
  35. fugue/workflow/_checkpoint.py +1 -1
  36. fugue/workflow/api.py +13 -13
  37. fugue/workflow/module.py +30 -37
  38. fugue/workflow/workflow.py +6 -0
  39. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
  40. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
  41. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
  42. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
  43. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
  44. fugue_contrib/contrib.py +1 -0
  45. fugue_contrib/viz/_ext.py +7 -1
  46. fugue_dask/_io.py +0 -13
  47. fugue_dask/_utils.py +10 -4
  48. fugue_dask/dataframe.py +1 -2
  49. fugue_dask/execution_engine.py +45 -18
  50. fugue_dask/registry.py +8 -33
  51. fugue_duckdb/_io.py +8 -2
  52. fugue_duckdb/_utils.py +7 -2
  53. fugue_duckdb/dask.py +1 -1
  54. fugue_duckdb/dataframe.py +23 -19
  55. fugue_duckdb/execution_engine.py +19 -22
  56. fugue_duckdb/registry.py +11 -34
  57. fugue_ibis/dataframe.py +6 -10
  58. fugue_ibis/execution_engine.py +7 -1
  59. fugue_notebook/env.py +5 -10
  60. fugue_polars/__init__.py +2 -0
  61. fugue_polars/_utils.py +8 -0
  62. fugue_polars/polars_dataframe.py +234 -0
  63. fugue_polars/registry.py +86 -0
  64. fugue_ray/_constants.py +10 -1
  65. fugue_ray/_utils/dataframe.py +36 -9
  66. fugue_ray/_utils/io.py +2 -4
  67. fugue_ray/dataframe.py +16 -12
  68. fugue_ray/execution_engine.py +53 -32
  69. fugue_ray/registry.py +8 -32
  70. fugue_spark/_utils/convert.py +22 -11
  71. fugue_spark/_utils/io.py +0 -13
  72. fugue_spark/_utils/misc.py +27 -0
  73. fugue_spark/_utils/partition.py +11 -18
  74. fugue_spark/dataframe.py +26 -22
  75. fugue_spark/execution_engine.py +136 -54
  76. fugue_spark/registry.py +29 -78
  77. fugue_test/builtin_suite.py +36 -14
  78. fugue_test/dataframe_suite.py +9 -5
  79. fugue_test/execution_suite.py +100 -122
  80. fugue_version/__init__.py +1 -1
  81. tests/fugue/bag/test_array_bag.py +0 -9
  82. tests/fugue/collections/test_partition.py +10 -3
  83. tests/fugue/dataframe/test_function_wrapper.py +293 -0
  84. tests/fugue/dataframe/test_utils.py +2 -34
  85. tests/fugue/execution/test_factory.py +7 -9
  86. tests/fugue/execution/test_naive_execution_engine.py +35 -80
  87. tests/fugue/extensions/test_utils.py +12 -7
  88. tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
  89. tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
  90. tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
  91. tests/fugue/sql/test_workflow.py +1 -1
  92. tests/fugue/sql/test_workflow_parse.py +3 -5
  93. tests/fugue/utils/test_interfaceless.py +1 -325
  94. tests/fugue/utils/test_io.py +0 -80
  95. tests/fugue_dask/test_execution_engine.py +48 -0
  96. tests/fugue_dask/test_io.py +0 -55
  97. tests/fugue_duckdb/test_dataframe.py +2 -2
  98. tests/fugue_duckdb/test_execution_engine.py +16 -1
  99. tests/fugue_duckdb/test_utils.py +1 -1
  100. tests/fugue_ibis/test_dataframe.py +6 -3
  101. tests/fugue_polars/__init__.py +0 -0
  102. tests/fugue_polars/test_api.py +13 -0
  103. tests/fugue_polars/test_dataframe.py +82 -0
  104. tests/fugue_polars/test_transform.py +100 -0
  105. tests/fugue_ray/test_execution_engine.py +40 -4
  106. tests/fugue_spark/test_dataframe.py +0 -8
  107. tests/fugue_spark/test_execution_engine.py +50 -11
  108. tests/fugue_spark/test_importless.py +4 -4
  109. tests/fugue_spark/test_spark_connect.py +82 -0
  110. tests/fugue_spark/utils/test_convert.py +6 -8
  111. tests/fugue_spark/utils/test_io.py +0 -17
  112. fugue/_utils/register.py +0 -3
  113. fugue_test/_utils.py +0 -13
  114. {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue/execution/api.py CHANGED
@@ -15,6 +15,7 @@ from .execution_engine import (
15
15
  ExecutionEngine,
16
16
  )
17
17
  from .factory import make_execution_engine, try_get_context_execution_engine
18
+ from .._utils.registry import fugue_plugin
18
19
 
19
20
 
20
21
  @contextmanager
@@ -120,6 +121,27 @@ def get_current_parallelism() -> int:
120
121
  return make_execution_engine().get_current_parallelism()
121
122
 
122
123
 
124
+ @fugue_plugin
125
+ def as_fugue_engine_df(
126
+ engine: ExecutionEngine, df: AnyDataFrame, schema: Any = None
127
+ ) -> DataFrame:
128
+ """Convert a dataframe to a Fugue engine dependent DataFrame.
129
+ This function is used internally by Fugue. It is not recommended
130
+ to use it directly.
131
+
132
+ :param engine: the ExecutionEngine to use, must not be None
133
+ :param df: a dataframe like object
134
+ :param schema: the schema of the dataframe, defaults to None
135
+
136
+ :return: the engine-dependent DataFrame
137
+ """
138
+ if schema is None:
139
+ fdf = as_fugue_df(df)
140
+ else:
141
+ fdf = as_fugue_df(df, schema=schema)
142
+ return engine.to_df(fdf)
143
+
144
+
123
145
  def run_engine_function(
124
146
  func: Callable[[ExecutionEngine], Any],
125
147
  engine: AnyExecutionEngine = None,
@@ -199,10 +221,10 @@ def broadcast(
199
221
  as_fugue: bool = False,
200
222
  as_local: bool = False,
201
223
  ) -> AnyDataFrame:
202
- """Broadcast the dataframe to all workers for a distributed computing framework
224
+ """Broadcast the dataframe to all workers of a distributed computing backend
203
225
 
204
226
  :param df: an input dataframe that can be recognized by Fugue
205
- :param engine: an engine like object, defaults to None
227
+ :param engine: an engine-like object, defaults to None
206
228
  :param engine_conf: the configs for the engine, defaults to None
207
229
  :param as_fugue: whether to force return a Fugue DataFrame, defaults to False
208
230
  :param as_local: whether to force return a local DataFrame, defaults to False
@@ -549,11 +571,11 @@ def join(
549
571
  """
550
572
 
551
573
  def _join(e: ExecutionEngine):
552
- edf1 = e.to_df(df1)
553
- edf2 = e.to_df(df2)
574
+ edf1 = as_fugue_engine_df(e, df1)
575
+ edf2 = as_fugue_engine_df(e, df2)
554
576
  res = e.join(edf1, edf2, how=how, on=on)
555
577
  for odf in dfs:
556
- res = e.join(res, e.to_df(odf), how=how, on=on)
578
+ res = e.join(res, as_fugue_engine_df(e, odf), how=how, on=on)
557
579
  return res
558
580
 
559
581
  return run_engine_function(
@@ -837,11 +859,11 @@ def union(
837
859
  """
838
860
 
839
861
  def _union(e: ExecutionEngine):
840
- edf1 = e.to_df(df1)
841
- edf2 = e.to_df(df2)
862
+ edf1 = as_fugue_engine_df(e, df1)
863
+ edf2 = as_fugue_engine_df(e, df2)
842
864
  res = e.union(edf1, edf2, distinct=distinct)
843
865
  for odf in dfs:
844
- res = e.union(res, e.to_df(odf), distinct=distinct)
866
+ res = e.union(res, as_fugue_engine_df(e, odf), distinct=distinct)
845
867
  return res
846
868
 
847
869
  return run_engine_function(
@@ -885,11 +907,11 @@ def subtract(
885
907
  """
886
908
 
887
909
  def _subtract(e: ExecutionEngine):
888
- edf1 = e.to_df(df1)
889
- edf2 = e.to_df(df2)
910
+ edf1 = as_fugue_engine_df(e, df1)
911
+ edf2 = as_fugue_engine_df(e, df2)
890
912
  res = e.subtract(edf1, edf2, distinct=distinct)
891
913
  for odf in dfs:
892
- res = e.subtract(res, e.to_df(odf), distinct=distinct)
914
+ res = e.subtract(res, as_fugue_engine_df(e, odf), distinct=distinct)
893
915
  return res
894
916
 
895
917
  return run_engine_function(
@@ -933,11 +955,11 @@ def intersect(
933
955
  """
934
956
 
935
957
  def _intersect(e: ExecutionEngine):
936
- edf1 = e.to_df(df1)
937
- edf2 = e.to_df(df2)
958
+ edf1 = as_fugue_engine_df(e, df1)
959
+ edf2 = as_fugue_engine_df(e, df2)
938
960
  res = e.intersect(edf1, edf2, distinct=distinct)
939
961
  for odf in dfs:
940
- res = e.intersect(res, e.to_df(odf), distinct=distinct)
962
+ res = e.intersect(res, as_fugue_engine_df(e, odf), distinct=distinct)
941
963
  return res
942
964
 
943
965
  return run_engine_function(
@@ -1,3 +1,4 @@
1
+ import inspect
1
2
  import logging
2
3
  from abc import ABC, abstractmethod
3
4
  from contextlib import contextmanager
@@ -17,8 +18,9 @@ from typing import (
17
18
  )
18
19
  from uuid import uuid4
19
20
 
20
- from triad import ParamDict, Schema, SerializableRLock, assert_or_throw
21
+ from triad import ParamDict, Schema, SerializableRLock, assert_or_throw, to_uuid
21
22
  from triad.collections.fs import FileSystem
23
+ from triad.collections.function_wrapper import AnnotatedParam
22
24
  from triad.exceptions import InvalidOperationError
23
25
  from triad.utils.convert import to_size
24
26
  from triad.utils.string import validate_triad_var_name
@@ -30,7 +32,7 @@ from fugue.collections.partition import (
30
32
  PartitionSpec,
31
33
  )
32
34
  from fugue.collections.sql import StructuredRawSQL, TempTableName
33
- from fugue.collections.yielded import Yielded, PhysicalYielded
35
+ from fugue.collections.yielded import PhysicalYielded, Yielded
34
36
  from fugue.column import (
35
37
  ColumnExpr,
36
38
  SelectColumns,
@@ -40,11 +42,11 @@ from fugue.column import (
40
42
  is_agg,
41
43
  )
42
44
  from fugue.constants import _FUGUE_GLOBAL_CONF, FUGUE_SQL_DEFAULT_DIALECT
43
- from fugue.dataframe import AnyDataFrame, DataFrame, DataFrames
45
+ from fugue.dataframe import AnyDataFrame, DataFrame, DataFrames, fugue_annotated_param
44
46
  from fugue.dataframe.array_dataframe import ArrayDataFrame
45
47
  from fugue.dataframe.dataframe import LocalDataFrame
46
48
  from fugue.dataframe.utils import deserialize_df, serialize_df
47
- from fugue.exceptions import FugueBug
49
+ from fugue.exceptions import FugueBug, FugueWorkflowRuntimeError
48
50
 
49
51
  AnyExecutionEngine = TypeVar("AnyExecutionEngine", object, None)
50
52
 
@@ -275,6 +277,7 @@ class MapEngine(EngineFacet):
275
277
  output_schema: Any,
276
278
  partition_spec: PartitionSpec,
277
279
  on_init: Optional[Callable[[int, DataFrame], Any]] = None,
280
+ map_func_format_hint: Optional[str] = None,
278
281
  ) -> DataFrame: # pragma: no cover
279
282
  """Apply a function to each partition after you partition the dataframe in a
280
283
  specified way.
@@ -287,6 +290,9 @@ class MapEngine(EngineFacet):
287
290
  :param partition_spec: partition specification
288
291
  :param on_init: callback function when the physical partition is initializing,
289
292
  defaults to None
293
+ :param map_func_format_hint: the preferred data format for ``map_func``, it can
294
+ be ``pandas``, ``pyarrow``, etc., defaults to None. Certain engines can provide
295
+ the most efficient map operations based on the hint.
290
296
  :return: the dataframe after the map operation
291
297
 
292
298
  .. note::
@@ -1298,6 +1304,26 @@ class ExecutionEngine(FugueEngineBase):
1298
1304
  return res
1299
1305
 
1300
1306
 
1307
+ @fugue_annotated_param(ExecutionEngine, "e", child_can_reuse_code=True)
1308
+ class ExecutionEngineParam(AnnotatedParam):
1309
+ def __init__(
1310
+ self,
1311
+ param: Optional[inspect.Parameter],
1312
+ ):
1313
+ super().__init__(param)
1314
+ self._type = self.annotation
1315
+
1316
+ def to_input(self, engine: Any) -> Any:
1317
+ assert_or_throw(
1318
+ isinstance(engine, self._type),
1319
+ FugueWorkflowRuntimeError(f"{engine} is not of type {self._type}"),
1320
+ )
1321
+ return engine
1322
+
1323
+ def __uuid__(self) -> str:
1324
+ return to_uuid(self.code, self.annotation, self._type)
1325
+
1326
+
1301
1327
  def _get_file_threshold(size: Any) -> int:
1302
1328
  if size is None:
1303
1329
  return -1
@@ -492,9 +492,6 @@ def make_sql_engine(
492
492
 
493
493
  # S2(engine)
494
494
  make_sql_engine("s2", engine)
495
-
496
- # SqliteEngine(engine)
497
- make_sql_engine(SqliteEngine)
498
495
  """
499
496
  if isinstance(engine, SQLEngine):
500
497
  assert_or_throw(
@@ -554,9 +551,6 @@ def parse_sql_engine(
554
551
 
555
552
  # S2(engine)
556
553
  make_sql_engine("s2", engine)
557
-
558
- # SqliteEngine(engine)
559
- make_sql_engine(SqliteEngine)
560
554
  """
561
555
  if engine is None or (isinstance(engine, str) and engine == ""):
562
556
  assert_or_throw(
@@ -1,4 +1,3 @@
1
- import inspect
2
1
  import logging
3
2
  import os
4
3
  from typing import Any, Callable, Dict, List, Optional, Type, Union
@@ -6,17 +5,11 @@ from typing import Any, Callable, Dict, List, Optional, Type, Union
6
5
  import pandas as pd
7
6
  from qpd_pandas import run_sql_on_pandas
8
7
  from qpd_pandas.engine import PandasUtils
9
- from sqlalchemy import create_engine
10
8
  from triad import Schema
11
9
  from triad.collections.dict import IndexedOrderedDict
12
10
  from triad.collections.fs import FileSystem
13
11
  from triad.utils.assertion import assert_or_throw
14
12
 
15
- from fugue._utils.interfaceless import (
16
- ExecutionEngineParam,
17
- SimpleAnnotationConverter,
18
- register_annotation_converter,
19
- )
20
13
  from fugue._utils.io import load_df, save_df
21
14
  from fugue.collections.partition import (
22
15
  PartitionCursor,
@@ -31,34 +24,17 @@ from fugue.dataframe import (
31
24
  LocalBoundedDataFrame,
32
25
  LocalDataFrame,
33
26
  PandasDataFrame,
34
- to_local_bounded_df,
27
+ fugue_annotated_param,
35
28
  )
36
- from fugue.dataframe.utils import get_join_schemas, to_local_df
37
-
38
- from .execution_engine import ExecutionEngine, MapEngine, SQLEngine
39
-
40
-
41
- class SqliteEngine(SQLEngine):
42
- """Sqlite execution implementation.
43
-
44
- :param execution_engine: the execution engine this sql engine will run on
45
- """
29
+ from fugue.dataframe.dataframe import as_fugue_df
30
+ from fugue.dataframe.utils import get_join_schemas
46
31
 
47
- @property
48
- def is_distributed(self) -> bool:
49
- return False
50
-
51
- @property
52
- def dialect(self) -> Optional[str]:
53
- return "sqlite"
54
-
55
- def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
56
- _dfs, _sql = self.encode(dfs, statement)
57
- sql_engine = create_engine("sqlite:///:memory:")
58
- for k, v in _dfs.items():
59
- v.as_pandas().to_sql(k, sql_engine, if_exists="replace", index=False)
60
- df = pd.read_sql_query(_sql, sql_engine)
61
- return PandasDataFrame(df)
32
+ from .execution_engine import (
33
+ ExecutionEngine,
34
+ ExecutionEngineParam,
35
+ MapEngine,
36
+ SQLEngine,
37
+ )
62
38
 
63
39
 
64
40
  class QPDPandasEngine(SQLEngine):
@@ -105,20 +81,38 @@ class PandasMapEngine(MapEngine):
105
81
  output_schema: Any,
106
82
  partition_spec: PartitionSpec,
107
83
  on_init: Optional[Callable[[int, DataFrame], Any]] = None,
84
+ map_func_format_hint: Optional[str] = None,
108
85
  ) -> DataFrame:
109
- if partition_spec.num_partitions != "0":
110
- self.log.warning(
111
- "%s doesn't respect num_partitions %s",
112
- self,
113
- partition_spec.num_partitions,
114
- )
86
+ # if partition_spec.num_partitions != "0":
87
+ # self.log.warning(
88
+ # "%s doesn't respect num_partitions %s",
89
+ # self,
90
+ # partition_spec.num_partitions,
91
+ # )
92
+ is_coarse = partition_spec.algo == "coarse"
93
+ presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
94
+ presort_keys = list(presort.keys())
95
+ presort_asc = list(presort.values())
96
+ output_schema = Schema(output_schema)
115
97
  cursor = partition_spec.get_cursor(df.schema, 0)
116
98
  if on_init is not None:
117
99
  on_init(0, df)
118
- if len(partition_spec.partition_by) == 0: # no partition
119
- df = to_local_df(df)
120
- cursor.set(df.peek_array(), 0, 0)
121
- output_df = map_func(cursor, df)
100
+ if (
101
+ len(partition_spec.partition_by) == 0 or partition_spec.algo == "coarse"
102
+ ): # no partition
103
+ if len(partition_spec.presort) > 0:
104
+ pdf = (
105
+ df.as_pandas()
106
+ .sort_values(presort_keys, ascending=presort_asc)
107
+ .reset_index(drop=True)
108
+ )
109
+ input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
110
+ cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
111
+ output_df = map_func(cursor, input_df)
112
+ else:
113
+ df = df.as_local()
114
+ cursor.set(lambda: df.peek_array(), 0, 0)
115
+ output_df = map_func(cursor, df)
122
116
  if (
123
117
  isinstance(output_df, PandasDataFrame)
124
118
  and output_df.schema != output_schema
@@ -130,18 +124,14 @@ class PandasMapEngine(MapEngine):
130
124
  f"mismatches given {output_schema}",
131
125
  )
132
126
  return self.to_df(output_df) # type: ignore
133
- presort = partition_spec.presort
134
- presort_keys = list(presort.keys())
135
- presort_asc = list(presort.values())
136
- output_schema = Schema(output_schema)
137
127
 
138
128
  def _map(pdf: pd.DataFrame) -> pd.DataFrame:
139
- if len(presort_keys) > 0:
129
+ if len(partition_spec.presort) > 0:
140
130
  pdf = pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(
141
131
  drop=True
142
132
  )
143
133
  input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
144
- cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
134
+ cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
145
135
  output_df = map_func(cursor, input_df)
146
136
  return output_df.as_pandas()
147
137
 
@@ -200,7 +190,7 @@ class NativeExecutionEngine(ExecutionEngine):
200
190
  def repartition(
201
191
  self, df: DataFrame, partition_spec: PartitionSpec
202
192
  ) -> DataFrame: # pragma: no cover
203
- self.log.warning("%s doesn't respect repartition", self)
193
+ # self.log.warning("%s doesn't respect repartition", self)
204
194
  return df
205
195
 
206
196
  def broadcast(self, df: DataFrame) -> DataFrame:
@@ -401,24 +391,11 @@ class NativeExecutionEngine(ExecutionEngine):
401
391
  save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
402
392
 
403
393
 
394
+ @fugue_annotated_param(NativeExecutionEngine)
404
395
  class _NativeExecutionEngineParam(ExecutionEngineParam):
405
- def __init__(
406
- self,
407
- param: Optional[inspect.Parameter],
408
- ):
409
- super().__init__(
410
- param, annotation="NativeExecutionEngine", engine_type=NativeExecutionEngine
411
- )
396
+ pass
412
397
 
413
398
 
414
399
  def _to_native_execution_engine_df(df: AnyDataFrame, schema: Any = None) -> DataFrame:
415
- return to_local_bounded_df(df, schema)
416
-
417
-
418
- register_annotation_converter(
419
- 0.8,
420
- SimpleAnnotationConverter(
421
- NativeExecutionEngine,
422
- lambda param: _NativeExecutionEngineParam(param),
423
- ),
424
- )
400
+ fdf = as_fugue_df(df) if schema is None else as_fugue_df(df, schema=schema)
401
+ return fdf.as_local_bounded()
@@ -1,10 +1,12 @@
1
1
  from typing import Any, Callable, Optional
2
2
 
3
+ from triad import Schema, assert_or_throw, to_uuid
4
+
3
5
  from fugue.collections.yielded import Yielded
4
6
  from fugue.dataframe import DataFrame
5
7
  from fugue.exceptions import FugueWorkflowCompileError
8
+ from fugue.execution.api import as_fugue_engine_df
6
9
  from fugue.extensions.creator import Creator
7
- from triad import Schema, assert_or_throw, to_uuid
8
10
 
9
11
 
10
12
  class Load(Creator):
@@ -39,7 +41,7 @@ class CreateData(Creator):
39
41
  def create(self) -> DataFrame:
40
42
  if isinstance(self._df, Yielded):
41
43
  return self.execution_engine.load_yielded(self._df)
42
- return self.execution_engine.to_df(self._df, schema=self._schema)
44
+ return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)
43
45
 
44
46
  def _df_uid(self):
45
47
  if self._data_determiner is not None:
@@ -6,7 +6,7 @@ from triad.utils.convert import to_type
6
6
  from fugue.collections.partition import PartitionCursor
7
7
  from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
8
8
  from fugue.dataframe.array_dataframe import ArrayDataFrame
9
- from fugue.dataframe.utils import _df_eq, to_local_bounded_df
9
+ from fugue.dataframe.utils import _df_eq
10
10
  from fugue.exceptions import FugueWorkflowError
11
11
  from fugue.execution.execution_engine import _generate_comap_empty_dfs
12
12
  from fugue.rpc import EmptyRPCHandler, to_rpc_handler
@@ -99,6 +99,7 @@ class RunOutputTransformer(Outputter):
99
99
  output_schema=tf.output_schema, # type: ignore
100
100
  partition_spec=tf.partition_spec,
101
101
  on_init=tr.on_init,
102
+ map_func_format_hint=tf.get_format_hint(),
102
103
  )
103
104
  self.execution_engine.persist(df, lazy=False)
104
105
 
@@ -135,7 +136,7 @@ class _TransformerRunner(object):
135
136
  def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
136
137
  self.transformer._cursor = cursor # type: ignore
137
138
  try:
138
- to_local_bounded_df(self.transformer.transform(df))
139
+ self.transformer.transform(df).as_local_bounded()
139
140
  return ArrayDataFrame([], self.transformer.output_schema)
140
141
  except self.ignore_errors: # type: ignore
141
142
  return ArrayDataFrame([], self.transformer.output_schema)
@@ -159,7 +160,7 @@ class _CoTransformerRunner(object):
159
160
  def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
160
161
  self.transformer._cursor = cursor # type: ignore
161
162
  try:
162
- to_local_bounded_df(self.transformer.transform(dfs))
163
+ self.transformer.transform(dfs).as_local_bounded()
163
164
  return ArrayDataFrame([], self.transformer.output_schema)
164
165
  except self.ignore_errors: # type: ignore
165
166
  return ArrayDataFrame([], self.transformer.output_schema)
@@ -6,7 +6,6 @@ from fugue.dataframe import (
6
6
  DataFrame,
7
7
  DataFrames,
8
8
  LocalDataFrame,
9
- to_local_bounded_df,
10
9
  )
11
10
  from fugue.column import ColumnExpr, SelectColumns as ColumnsSelect
12
11
  from fugue.exceptions import FugueWorkflowError
@@ -53,6 +52,7 @@ class RunTransformer(Processor):
53
52
  output_schema=tf.output_schema, # type: ignore
54
53
  partition_spec=tf.partition_spec,
55
54
  on_init=tr.on_init,
55
+ map_func_format_hint=tf.get_format_hint(),
56
56
  )
57
57
 
58
58
  @no_type_check
@@ -333,7 +333,7 @@ class _TransformerRunner(object):
333
333
  return self.transformer.transform(df)
334
334
  else:
335
335
  try:
336
- return to_local_bounded_df(self.transformer.transform(df))
336
+ return self.transformer.transform(df).as_local_bounded()
337
337
  except self.ignore_errors: # type: ignore # pylint: disable=E0712
338
338
  return ArrayDataFrame([], self.transformer.output_schema)
339
339
 
@@ -363,7 +363,7 @@ class _CoTransformerRunner(object):
363
363
 
364
364
  else:
365
365
  try:
366
- return to_local_bounded_df(self.transformer.transform(dfs))
366
+ return self.transformer.transform(dfs).as_local_bounded()
367
367
  except self.ignore_errors: # type: ignore # pylint: disable=E0712
368
368
  return ArrayDataFrame([], self.transformer.output_schema)
369
369
 
@@ -7,9 +7,10 @@ from triad.utils.assertion import assert_or_throw
7
7
  from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
8
8
  from triad.utils.hash import to_uuid
9
9
 
10
- from fugue._utils.interfaceless import FunctionWrapper, parse_output_schema_from_comment
10
+ from fugue._utils.interfaceless import parse_output_schema_from_comment
11
11
  from fugue._utils.registry import fugue_plugin
12
12
  from fugue.dataframe import DataFrame
13
+ from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
13
14
  from fugue.exceptions import FugueInterfacelessError
14
15
  from fugue.extensions.creator.creator import Creator
15
16
 
@@ -200,7 +201,9 @@ class _FuncAsCreator(Creator):
200
201
  if schema is None:
201
202
  schema = parse_output_schema_from_comment(func)
202
203
  tr = _FuncAsCreator()
203
- tr._wrapper = FunctionWrapper(func, "^e?x*z?$", "^[dlspq]$") # type: ignore
204
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
205
+ func, "^e?x*z?$", "^[dlspq]$"
206
+ )
204
207
  tr._engine_param = (
205
208
  tr._wrapper._params.get_value_by_index(0)
206
209
  if tr._wrapper.input_code.startswith("e")
@@ -4,9 +4,9 @@ from typing import Any, Callable, Dict, List, Optional, no_type_check
4
4
  from triad import ParamDict, to_uuid
5
5
  from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
6
6
 
7
- from fugue._utils.interfaceless import FunctionWrapper
8
7
  from fugue._utils.registry import fugue_plugin
9
8
  from fugue.dataframe import DataFrames
9
+ from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
10
10
  from fugue.exceptions import FugueInterfacelessError
11
11
  from fugue.extensions._utils import (
12
12
  load_namespace_extensions,
@@ -204,7 +204,7 @@ class _FuncAsOutputter(Outputter):
204
204
  ) -> "_FuncAsOutputter":
205
205
  validation_rules.update(parse_validation_rules_from_comment(func))
206
206
  tr = _FuncAsOutputter()
207
- tr._wrapper = FunctionWrapper( # type: ignore
207
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
208
208
  func, "^e?(c|[dlspq]+)x*z?$", "^n$"
209
209
  )
210
210
  tr._engine_param = (
@@ -6,9 +6,10 @@ from triad.collections import Schema
6
6
  from triad.utils.assertion import assert_or_throw
7
7
  from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
8
8
 
9
- from fugue._utils.interfaceless import FunctionWrapper, parse_output_schema_from_comment
9
+ from fugue._utils.interfaceless import parse_output_schema_from_comment
10
10
  from fugue._utils.registry import fugue_plugin
11
11
  from fugue.dataframe import DataFrame, DataFrames
12
+ from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
12
13
  from fugue.exceptions import FugueInterfacelessError
13
14
  from fugue.extensions.processor.processor import Processor
14
15
 
@@ -223,7 +224,7 @@ class _FuncAsProcessor(Processor):
223
224
  schema = parse_output_schema_from_comment(func)
224
225
  validation_rules.update(parse_validation_rules_from_comment(func))
225
226
  tr = _FuncAsProcessor()
226
- tr._wrapper = FunctionWrapper(
227
+ tr._wrapper = DataFrameFunctionWrapper(
227
228
  func, "^e?(c|[dlspq]+)x*z?$", "^[dlspq]$"
228
229
  ) # type: ignore
229
230
  tr._engine_param = (
@@ -6,13 +6,10 @@ from triad.utils.assertion import assert_arg_not_none, assert_or_throw
6
6
  from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
7
7
  from triad.utils.hash import to_uuid
8
8
 
9
- from fugue._utils.interfaceless import (
10
- FunctionWrapper,
11
- is_class_method,
12
- parse_output_schema_from_comment,
13
- )
9
+ from fugue._utils.interfaceless import is_class_method, parse_output_schema_from_comment
14
10
  from fugue._utils.registry import fugue_plugin
15
11
  from fugue.dataframe import ArrayDataFrame, DataFrame, DataFrames, LocalDataFrame
12
+ from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
16
13
  from fugue.exceptions import FugueInterfacelessError
17
14
  from fugue.extensions.transformer.constants import OUTPUT_TRANSFORMER_DUMMY_SCHEMA
18
15
  from fugue.extensions.transformer.transformer import CoTransformer, Transformer
@@ -336,6 +333,9 @@ class _FuncAsTransformer(Transformer):
336
333
  def get_output_schema(self, df: DataFrame) -> Any:
337
334
  return self._parse_schema(self._output_schema_arg, df) # type: ignore
338
335
 
336
+ def get_format_hint(self) -> Optional[str]:
337
+ return self._format_hint # type: ignore
338
+
339
339
  @property
340
340
  def validation_rules(self) -> Dict[str, Any]:
341
341
  return self._validation_rules # type: ignore
@@ -374,13 +374,14 @@ class _FuncAsTransformer(Transformer):
374
374
  validation_rules.update(parse_validation_rules_from_comment(func))
375
375
  assert_arg_not_none(schema, "schema")
376
376
  tr = _FuncAsTransformer()
377
- tr._wrapper = FunctionWrapper( # type: ignore
377
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
378
378
  func, "^[lspq][fF]?x*z?$", "^[lspq]$"
379
379
  )
380
380
  tr._output_schema_arg = schema # type: ignore
381
381
  tr._validation_rules = validation_rules # type: ignore
382
382
  tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
383
383
  tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
384
+ tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
384
385
  return tr
385
386
 
386
387
 
@@ -392,6 +393,9 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
392
393
  def get_output_schema(self, df: DataFrame) -> Any:
393
394
  return OUTPUT_TRANSFORMER_DUMMY_SCHEMA
394
395
 
396
+ def get_format_hint(self) -> Optional[str]:
397
+ return self._format_hint # type: ignore
398
+
395
399
  @no_type_check
396
400
  def transform(self, df: LocalDataFrame) -> LocalDataFrame:
397
401
  args = [df] + _get_callback(self)
@@ -405,13 +409,14 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
405
409
  assert_or_throw(schema is None, "schema must be None for output transformers")
406
410
  validation_rules.update(parse_validation_rules_from_comment(func))
407
411
  tr = _FuncAsOutputTransformer()
408
- tr._wrapper = FunctionWrapper( # type: ignore
412
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
409
413
  func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
410
414
  )
411
415
  tr._output_schema_arg = None # type: ignore
412
416
  tr._validation_rules = validation_rules # type: ignore
413
417
  tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
414
418
  tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
419
+ tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
415
420
  return tr
416
421
 
417
422
 
@@ -423,6 +428,9 @@ class _FuncAsCoTransformer(CoTransformer):
423
428
  def get_output_schema(self, dfs: DataFrames) -> Any:
424
429
  return self._parse_schema(self._output_schema_arg, dfs) # type: ignore
425
430
 
431
+ def get_format_hint(self) -> Optional[str]:
432
+ return self._format_hint # type: ignore
433
+
426
434
  @property
427
435
  def validation_rules(self) -> ParamDict:
428
436
  return self._validation_rules # type: ignore
@@ -494,7 +502,7 @@ class _FuncAsCoTransformer(CoTransformer):
494
502
  )
495
503
  assert_arg_not_none(schema, "schema")
496
504
  tr = _FuncAsCoTransformer()
497
- tr._wrapper = FunctionWrapper( # type: ignore
505
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
498
506
  func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
499
507
  )
500
508
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
@@ -502,6 +510,7 @@ class _FuncAsCoTransformer(CoTransformer):
502
510
  tr._validation_rules = {} # type: ignore
503
511
  tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
504
512
  tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
513
+ tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
505
514
  return tr
506
515
 
507
516
 
@@ -513,6 +522,9 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
513
522
  def get_output_schema(self, dfs: DataFrames) -> Any:
514
523
  return OUTPUT_TRANSFORMER_DUMMY_SCHEMA
515
524
 
525
+ def get_format_hint(self) -> Optional[str]:
526
+ return self._format_hint # type: ignore
527
+
516
528
  @no_type_check
517
529
  def transform(self, dfs: DataFrames) -> LocalDataFrame:
518
530
  cb = _get_callback(self)
@@ -549,7 +561,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
549
561
  )
550
562
 
551
563
  tr = _FuncAsOutputCoTransformer()
552
- tr._wrapper = FunctionWrapper( # type: ignore
564
+ tr._wrapper = DataFrameFunctionWrapper( # type: ignore
553
565
  func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
554
566
  )
555
567
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
@@ -557,6 +569,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
557
569
  tr._validation_rules = {} # type: ignore
558
570
  tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
559
571
  tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
572
+ tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
560
573
  return tr
561
574
 
562
575