fugue 0.8.2.dev1__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +9 -5
- fugue/_utils/interfaceless.py +1 -558
- fugue/_utils/io.py +2 -91
- fugue/_utils/registry.py +3 -2
- fugue/api.py +1 -0
- fugue/bag/bag.py +8 -4
- fugue/collections/__init__.py +0 -7
- fugue/collections/partition.py +21 -9
- fugue/constants.py +3 -1
- fugue/dataframe/__init__.py +7 -8
- fugue/dataframe/arrow_dataframe.py +1 -2
- fugue/dataframe/dataframe.py +17 -18
- fugue/dataframe/dataframe_iterable_dataframe.py +22 -6
- fugue/dataframe/function_wrapper.py +432 -0
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/utils.py +11 -79
- fugue/dataset/api.py +0 -4
- fugue/dev.py +47 -0
- fugue/execution/__init__.py +1 -5
- fugue/execution/api.py +36 -14
- fugue/execution/execution_engine.py +30 -4
- fugue/execution/factory.py +0 -6
- fugue/execution/native_execution_engine.py +44 -67
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +4 -3
- fugue/extensions/_builtins/processors.py +3 -3
- fugue/extensions/creator/convert.py +5 -2
- fugue/extensions/outputter/convert.py +2 -2
- fugue/extensions/processor/convert.py +3 -2
- fugue/extensions/transformer/convert.py +22 -9
- fugue/extensions/transformer/transformer.py +15 -1
- fugue/plugins.py +2 -0
- fugue/registry.py +0 -39
- fugue/sql/_utils.py +1 -1
- fugue/workflow/_checkpoint.py +1 -1
- fugue/workflow/api.py +13 -13
- fugue/workflow/module.py +30 -37
- fugue/workflow/workflow.py +6 -0
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/METADATA +37 -23
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/RECORD +112 -101
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/WHEEL +1 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -1
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/top_level.txt +1 -0
- fugue_contrib/contrib.py +1 -0
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/dataframe.py +1 -2
- fugue_dask/execution_engine.py +45 -18
- fugue_dask/registry.py +8 -33
- fugue_duckdb/_io.py +8 -2
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +23 -19
- fugue_duckdb/execution_engine.py +19 -22
- fugue_duckdb/registry.py +11 -34
- fugue_ibis/dataframe.py +6 -10
- fugue_ibis/execution_engine.py +7 -1
- fugue_notebook/env.py +5 -10
- fugue_polars/__init__.py +2 -0
- fugue_polars/_utils.py +8 -0
- fugue_polars/polars_dataframe.py +234 -0
- fugue_polars/registry.py +86 -0
- fugue_ray/_constants.py +10 -1
- fugue_ray/_utils/dataframe.py +36 -9
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +16 -12
- fugue_ray/execution_engine.py +53 -32
- fugue_ray/registry.py +8 -32
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +26 -22
- fugue_spark/execution_engine.py +136 -54
- fugue_spark/registry.py +29 -78
- fugue_test/builtin_suite.py +36 -14
- fugue_test/dataframe_suite.py +9 -5
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/bag/test_array_bag.py +0 -9
- tests/fugue/collections/test_partition.py +10 -3
- tests/fugue/dataframe/test_function_wrapper.py +293 -0
- tests/fugue/dataframe/test_utils.py +2 -34
- tests/fugue/execution/test_factory.py +7 -9
- tests/fugue/execution/test_naive_execution_engine.py +35 -80
- tests/fugue/extensions/test_utils.py +12 -7
- tests/fugue/extensions/transformer/test_convert_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_output_cotransformer.py +1 -0
- tests/fugue/extensions/transformer/test_convert_transformer.py +2 -0
- tests/fugue/sql/test_workflow.py +1 -1
- tests/fugue/sql/test_workflow_parse.py +3 -5
- tests/fugue/utils/test_interfaceless.py +1 -325
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +48 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_execution_engine.py +16 -1
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_ibis/test_dataframe.py +6 -3
- tests/fugue_polars/__init__.py +0 -0
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_dataframe.py +82 -0
- tests/fugue_polars/test_transform.py +100 -0
- tests/fugue_ray/test_execution_engine.py +40 -4
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +50 -11
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue/_utils/register.py +0 -3
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev1.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
fugue/execution/api.py
CHANGED
|
@@ -15,6 +15,7 @@ from .execution_engine import (
|
|
|
15
15
|
ExecutionEngine,
|
|
16
16
|
)
|
|
17
17
|
from .factory import make_execution_engine, try_get_context_execution_engine
|
|
18
|
+
from .._utils.registry import fugue_plugin
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
@contextmanager
|
|
@@ -120,6 +121,27 @@ def get_current_parallelism() -> int:
|
|
|
120
121
|
return make_execution_engine().get_current_parallelism()
|
|
121
122
|
|
|
122
123
|
|
|
124
|
+
@fugue_plugin
|
|
125
|
+
def as_fugue_engine_df(
|
|
126
|
+
engine: ExecutionEngine, df: AnyDataFrame, schema: Any = None
|
|
127
|
+
) -> DataFrame:
|
|
128
|
+
"""Convert a dataframe to a Fugue engine dependent DataFrame.
|
|
129
|
+
This function is used internally by Fugue. It is not recommended
|
|
130
|
+
to use
|
|
131
|
+
|
|
132
|
+
:param engine: the ExecutionEngine to use, must not be None
|
|
133
|
+
:param df: a dataframe like object
|
|
134
|
+
:param schema: the schema of the dataframe, defaults to None
|
|
135
|
+
|
|
136
|
+
:return: the engine dependent DataFrame
|
|
137
|
+
"""
|
|
138
|
+
if schema is None:
|
|
139
|
+
fdf = as_fugue_df(df)
|
|
140
|
+
else:
|
|
141
|
+
fdf = as_fugue_df(df, schema=schema)
|
|
142
|
+
return engine.to_df(fdf)
|
|
143
|
+
|
|
144
|
+
|
|
123
145
|
def run_engine_function(
|
|
124
146
|
func: Callable[[ExecutionEngine], Any],
|
|
125
147
|
engine: AnyExecutionEngine = None,
|
|
@@ -199,10 +221,10 @@ def broadcast(
|
|
|
199
221
|
as_fugue: bool = False,
|
|
200
222
|
as_local: bool = False,
|
|
201
223
|
) -> AnyDataFrame:
|
|
202
|
-
"""Broadcast the dataframe to all workers
|
|
224
|
+
"""Broadcast the dataframe to all workers of a distributed computing backend
|
|
203
225
|
|
|
204
226
|
:param df: an input dataframe that can be recognized by Fugue
|
|
205
|
-
:param engine: an engine
|
|
227
|
+
:param engine: an engine-like object, defaults to None
|
|
206
228
|
:param engine_conf: the configs for the engine, defaults to None
|
|
207
229
|
:param as_fugue: whether to force return a Fugue DataFrame, defaults to False
|
|
208
230
|
:param as_local: whether to force return a local DataFrame, defaults to False
|
|
@@ -549,11 +571,11 @@ def join(
|
|
|
549
571
|
"""
|
|
550
572
|
|
|
551
573
|
def _join(e: ExecutionEngine):
|
|
552
|
-
edf1 = e
|
|
553
|
-
edf2 = e
|
|
574
|
+
edf1 = as_fugue_engine_df(e, df1)
|
|
575
|
+
edf2 = as_fugue_engine_df(e, df2)
|
|
554
576
|
res = e.join(edf1, edf2, how=how, on=on)
|
|
555
577
|
for odf in dfs:
|
|
556
|
-
res = e.join(res, e
|
|
578
|
+
res = e.join(res, as_fugue_engine_df(e, odf), how=how, on=on)
|
|
557
579
|
return res
|
|
558
580
|
|
|
559
581
|
return run_engine_function(
|
|
@@ -837,11 +859,11 @@ def union(
|
|
|
837
859
|
"""
|
|
838
860
|
|
|
839
861
|
def _union(e: ExecutionEngine):
|
|
840
|
-
edf1 = e
|
|
841
|
-
edf2 = e
|
|
862
|
+
edf1 = as_fugue_engine_df(e, df1)
|
|
863
|
+
edf2 = as_fugue_engine_df(e, df2)
|
|
842
864
|
res = e.union(edf1, edf2, distinct=distinct)
|
|
843
865
|
for odf in dfs:
|
|
844
|
-
res = e.union(res, e
|
|
866
|
+
res = e.union(res, as_fugue_engine_df(e, odf), distinct=distinct)
|
|
845
867
|
return res
|
|
846
868
|
|
|
847
869
|
return run_engine_function(
|
|
@@ -885,11 +907,11 @@ def subtract(
|
|
|
885
907
|
"""
|
|
886
908
|
|
|
887
909
|
def _subtract(e: ExecutionEngine):
|
|
888
|
-
edf1 = e
|
|
889
|
-
edf2 = e
|
|
910
|
+
edf1 = as_fugue_engine_df(e, df1)
|
|
911
|
+
edf2 = as_fugue_engine_df(e, df2)
|
|
890
912
|
res = e.subtract(edf1, edf2, distinct=distinct)
|
|
891
913
|
for odf in dfs:
|
|
892
|
-
res = e.subtract(res, e
|
|
914
|
+
res = e.subtract(res, as_fugue_engine_df(e, odf), distinct=distinct)
|
|
893
915
|
return res
|
|
894
916
|
|
|
895
917
|
return run_engine_function(
|
|
@@ -933,11 +955,11 @@ def intersect(
|
|
|
933
955
|
"""
|
|
934
956
|
|
|
935
957
|
def _intersect(e: ExecutionEngine):
|
|
936
|
-
edf1 = e
|
|
937
|
-
edf2 = e
|
|
958
|
+
edf1 = as_fugue_engine_df(e, df1)
|
|
959
|
+
edf2 = as_fugue_engine_df(e, df2)
|
|
938
960
|
res = e.intersect(edf1, edf2, distinct=distinct)
|
|
939
961
|
for odf in dfs:
|
|
940
|
-
res = e.intersect(res, e
|
|
962
|
+
res = e.intersect(res, as_fugue_engine_df(e, odf), distinct=distinct)
|
|
941
963
|
return res
|
|
942
964
|
|
|
943
965
|
return run_engine_function(
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
2
3
|
from abc import ABC, abstractmethod
|
|
3
4
|
from contextlib import contextmanager
|
|
@@ -17,8 +18,9 @@ from typing import (
|
|
|
17
18
|
)
|
|
18
19
|
from uuid import uuid4
|
|
19
20
|
|
|
20
|
-
from triad import ParamDict, Schema, SerializableRLock, assert_or_throw
|
|
21
|
+
from triad import ParamDict, Schema, SerializableRLock, assert_or_throw, to_uuid
|
|
21
22
|
from triad.collections.fs import FileSystem
|
|
23
|
+
from triad.collections.function_wrapper import AnnotatedParam
|
|
22
24
|
from triad.exceptions import InvalidOperationError
|
|
23
25
|
from triad.utils.convert import to_size
|
|
24
26
|
from triad.utils.string import validate_triad_var_name
|
|
@@ -30,7 +32,7 @@ from fugue.collections.partition import (
|
|
|
30
32
|
PartitionSpec,
|
|
31
33
|
)
|
|
32
34
|
from fugue.collections.sql import StructuredRawSQL, TempTableName
|
|
33
|
-
from fugue.collections.yielded import
|
|
35
|
+
from fugue.collections.yielded import PhysicalYielded, Yielded
|
|
34
36
|
from fugue.column import (
|
|
35
37
|
ColumnExpr,
|
|
36
38
|
SelectColumns,
|
|
@@ -40,11 +42,11 @@ from fugue.column import (
|
|
|
40
42
|
is_agg,
|
|
41
43
|
)
|
|
42
44
|
from fugue.constants import _FUGUE_GLOBAL_CONF, FUGUE_SQL_DEFAULT_DIALECT
|
|
43
|
-
from fugue.dataframe import AnyDataFrame, DataFrame, DataFrames
|
|
45
|
+
from fugue.dataframe import AnyDataFrame, DataFrame, DataFrames, fugue_annotated_param
|
|
44
46
|
from fugue.dataframe.array_dataframe import ArrayDataFrame
|
|
45
47
|
from fugue.dataframe.dataframe import LocalDataFrame
|
|
46
48
|
from fugue.dataframe.utils import deserialize_df, serialize_df
|
|
47
|
-
from fugue.exceptions import FugueBug
|
|
49
|
+
from fugue.exceptions import FugueBug, FugueWorkflowRuntimeError
|
|
48
50
|
|
|
49
51
|
AnyExecutionEngine = TypeVar("AnyExecutionEngine", object, None)
|
|
50
52
|
|
|
@@ -275,6 +277,7 @@ class MapEngine(EngineFacet):
|
|
|
275
277
|
output_schema: Any,
|
|
276
278
|
partition_spec: PartitionSpec,
|
|
277
279
|
on_init: Optional[Callable[[int, DataFrame], Any]] = None,
|
|
280
|
+
map_func_format_hint: Optional[str] = None,
|
|
278
281
|
) -> DataFrame: # pragma: no cover
|
|
279
282
|
"""Apply a function to each partition after you partition the dataframe in a
|
|
280
283
|
specified way.
|
|
@@ -287,6 +290,9 @@ class MapEngine(EngineFacet):
|
|
|
287
290
|
:param partition_spec: partition specification
|
|
288
291
|
:param on_init: callback function when the physical partition is initializaing,
|
|
289
292
|
defaults to None
|
|
293
|
+
:param map_func_format_hint: the preferred data format for ``map_func``, it can
|
|
294
|
+
be ``pandas``, `pyarrow`, etc, defaults to None. Certain engines can provide
|
|
295
|
+
the most efficient map operations based on the hint.
|
|
290
296
|
:return: the dataframe after the map operation
|
|
291
297
|
|
|
292
298
|
.. note::
|
|
@@ -1298,6 +1304,26 @@ class ExecutionEngine(FugueEngineBase):
|
|
|
1298
1304
|
return res
|
|
1299
1305
|
|
|
1300
1306
|
|
|
1307
|
+
@fugue_annotated_param(ExecutionEngine, "e", child_can_reuse_code=True)
|
|
1308
|
+
class ExecutionEngineParam(AnnotatedParam):
|
|
1309
|
+
def __init__(
|
|
1310
|
+
self,
|
|
1311
|
+
param: Optional[inspect.Parameter],
|
|
1312
|
+
):
|
|
1313
|
+
super().__init__(param)
|
|
1314
|
+
self._type = self.annotation
|
|
1315
|
+
|
|
1316
|
+
def to_input(self, engine: Any) -> Any:
|
|
1317
|
+
assert_or_throw(
|
|
1318
|
+
isinstance(engine, self._type),
|
|
1319
|
+
FugueWorkflowRuntimeError(f"{engine} is not of type {self._type}"),
|
|
1320
|
+
)
|
|
1321
|
+
return engine
|
|
1322
|
+
|
|
1323
|
+
def __uuid__(self) -> str:
|
|
1324
|
+
return to_uuid(self.code, self.annotation, self._type)
|
|
1325
|
+
|
|
1326
|
+
|
|
1301
1327
|
def _get_file_threshold(size: Any) -> int:
|
|
1302
1328
|
if size is None:
|
|
1303
1329
|
return -1
|
fugue/execution/factory.py
CHANGED
|
@@ -492,9 +492,6 @@ def make_sql_engine(
|
|
|
492
492
|
|
|
493
493
|
# S2(engine)
|
|
494
494
|
make_sql_engine("s2", engine)
|
|
495
|
-
|
|
496
|
-
# SqliteEngine(engine)
|
|
497
|
-
make_sql_engine(SqliteEngine)
|
|
498
495
|
"""
|
|
499
496
|
if isinstance(engine, SQLEngine):
|
|
500
497
|
assert_or_throw(
|
|
@@ -554,9 +551,6 @@ def parse_sql_engine(
|
|
|
554
551
|
|
|
555
552
|
# S2(engine)
|
|
556
553
|
make_sql_engine("s2", engine)
|
|
557
|
-
|
|
558
|
-
# SqliteEngine(engine)
|
|
559
|
-
make_sql_engine(SqliteEngine)
|
|
560
554
|
"""
|
|
561
555
|
if engine is None or (isinstance(engine, str) and engine == ""):
|
|
562
556
|
assert_or_throw(
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import inspect
|
|
2
1
|
import logging
|
|
3
2
|
import os
|
|
4
3
|
from typing import Any, Callable, Dict, List, Optional, Type, Union
|
|
@@ -6,17 +5,11 @@ from typing import Any, Callable, Dict, List, Optional, Type, Union
|
|
|
6
5
|
import pandas as pd
|
|
7
6
|
from qpd_pandas import run_sql_on_pandas
|
|
8
7
|
from qpd_pandas.engine import PandasUtils
|
|
9
|
-
from sqlalchemy import create_engine
|
|
10
8
|
from triad import Schema
|
|
11
9
|
from triad.collections.dict import IndexedOrderedDict
|
|
12
10
|
from triad.collections.fs import FileSystem
|
|
13
11
|
from triad.utils.assertion import assert_or_throw
|
|
14
12
|
|
|
15
|
-
from fugue._utils.interfaceless import (
|
|
16
|
-
ExecutionEngineParam,
|
|
17
|
-
SimpleAnnotationConverter,
|
|
18
|
-
register_annotation_converter,
|
|
19
|
-
)
|
|
20
13
|
from fugue._utils.io import load_df, save_df
|
|
21
14
|
from fugue.collections.partition import (
|
|
22
15
|
PartitionCursor,
|
|
@@ -31,34 +24,17 @@ from fugue.dataframe import (
|
|
|
31
24
|
LocalBoundedDataFrame,
|
|
32
25
|
LocalDataFrame,
|
|
33
26
|
PandasDataFrame,
|
|
34
|
-
|
|
27
|
+
fugue_annotated_param,
|
|
35
28
|
)
|
|
36
|
-
from fugue.dataframe.
|
|
37
|
-
|
|
38
|
-
from .execution_engine import ExecutionEngine, MapEngine, SQLEngine
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
class SqliteEngine(SQLEngine):
|
|
42
|
-
"""Sqlite execution implementation.
|
|
43
|
-
|
|
44
|
-
:param execution_engine: the execution engine this sql engine will run on
|
|
45
|
-
"""
|
|
29
|
+
from fugue.dataframe.dataframe import as_fugue_df
|
|
30
|
+
from fugue.dataframe.utils import get_join_schemas
|
|
46
31
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
return "sqlite"
|
|
54
|
-
|
|
55
|
-
def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
|
|
56
|
-
_dfs, _sql = self.encode(dfs, statement)
|
|
57
|
-
sql_engine = create_engine("sqlite:///:memory:")
|
|
58
|
-
for k, v in _dfs.items():
|
|
59
|
-
v.as_pandas().to_sql(k, sql_engine, if_exists="replace", index=False)
|
|
60
|
-
df = pd.read_sql_query(_sql, sql_engine)
|
|
61
|
-
return PandasDataFrame(df)
|
|
32
|
+
from .execution_engine import (
|
|
33
|
+
ExecutionEngine,
|
|
34
|
+
ExecutionEngineParam,
|
|
35
|
+
MapEngine,
|
|
36
|
+
SQLEngine,
|
|
37
|
+
)
|
|
62
38
|
|
|
63
39
|
|
|
64
40
|
class QPDPandasEngine(SQLEngine):
|
|
@@ -105,20 +81,38 @@ class PandasMapEngine(MapEngine):
|
|
|
105
81
|
output_schema: Any,
|
|
106
82
|
partition_spec: PartitionSpec,
|
|
107
83
|
on_init: Optional[Callable[[int, DataFrame], Any]] = None,
|
|
84
|
+
map_func_format_hint: Optional[str] = None,
|
|
108
85
|
) -> DataFrame:
|
|
109
|
-
if partition_spec.num_partitions != "0":
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
86
|
+
# if partition_spec.num_partitions != "0":
|
|
87
|
+
# self.log.warning(
|
|
88
|
+
# "%s doesn't respect num_partitions %s",
|
|
89
|
+
# self,
|
|
90
|
+
# partition_spec.num_partitions,
|
|
91
|
+
# )
|
|
92
|
+
is_coarse = partition_spec.algo == "coarse"
|
|
93
|
+
presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
|
|
94
|
+
presort_keys = list(presort.keys())
|
|
95
|
+
presort_asc = list(presort.values())
|
|
96
|
+
output_schema = Schema(output_schema)
|
|
115
97
|
cursor = partition_spec.get_cursor(df.schema, 0)
|
|
116
98
|
if on_init is not None:
|
|
117
99
|
on_init(0, df)
|
|
118
|
-
if
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
100
|
+
if (
|
|
101
|
+
len(partition_spec.partition_by) == 0 or partition_spec.algo == "coarse"
|
|
102
|
+
): # no partition
|
|
103
|
+
if len(partition_spec.presort) > 0:
|
|
104
|
+
pdf = (
|
|
105
|
+
df.as_pandas()
|
|
106
|
+
.sort_values(presort_keys, ascending=presort_asc)
|
|
107
|
+
.reset_index(drop=True)
|
|
108
|
+
)
|
|
109
|
+
input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
|
|
110
|
+
cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
|
|
111
|
+
output_df = map_func(cursor, input_df)
|
|
112
|
+
else:
|
|
113
|
+
df = df.as_local()
|
|
114
|
+
cursor.set(lambda: df.peek_array(), 0, 0)
|
|
115
|
+
output_df = map_func(cursor, df)
|
|
122
116
|
if (
|
|
123
117
|
isinstance(output_df, PandasDataFrame)
|
|
124
118
|
and output_df.schema != output_schema
|
|
@@ -130,18 +124,14 @@ class PandasMapEngine(MapEngine):
|
|
|
130
124
|
f"mismatches given {output_schema}",
|
|
131
125
|
)
|
|
132
126
|
return self.to_df(output_df) # type: ignore
|
|
133
|
-
presort = partition_spec.presort
|
|
134
|
-
presort_keys = list(presort.keys())
|
|
135
|
-
presort_asc = list(presort.values())
|
|
136
|
-
output_schema = Schema(output_schema)
|
|
137
127
|
|
|
138
128
|
def _map(pdf: pd.DataFrame) -> pd.DataFrame:
|
|
139
|
-
if len(
|
|
129
|
+
if len(partition_spec.presort) > 0:
|
|
140
130
|
pdf = pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(
|
|
141
131
|
drop=True
|
|
142
132
|
)
|
|
143
133
|
input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
|
|
144
|
-
cursor.set(input_df.peek_array(), cursor.partition_no + 1, 0)
|
|
134
|
+
cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
|
|
145
135
|
output_df = map_func(cursor, input_df)
|
|
146
136
|
return output_df.as_pandas()
|
|
147
137
|
|
|
@@ -200,7 +190,7 @@ class NativeExecutionEngine(ExecutionEngine):
|
|
|
200
190
|
def repartition(
|
|
201
191
|
self, df: DataFrame, partition_spec: PartitionSpec
|
|
202
192
|
) -> DataFrame: # pragma: no cover
|
|
203
|
-
self.log.warning("%s doesn't respect repartition", self)
|
|
193
|
+
# self.log.warning("%s doesn't respect repartition", self)
|
|
204
194
|
return df
|
|
205
195
|
|
|
206
196
|
def broadcast(self, df: DataFrame) -> DataFrame:
|
|
@@ -401,24 +391,11 @@ class NativeExecutionEngine(ExecutionEngine):
|
|
|
401
391
|
save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
|
|
402
392
|
|
|
403
393
|
|
|
394
|
+
@fugue_annotated_param(NativeExecutionEngine)
|
|
404
395
|
class _NativeExecutionEngineParam(ExecutionEngineParam):
|
|
405
|
-
|
|
406
|
-
self,
|
|
407
|
-
param: Optional[inspect.Parameter],
|
|
408
|
-
):
|
|
409
|
-
super().__init__(
|
|
410
|
-
param, annotation="NativeExecutionEngine", engine_type=NativeExecutionEngine
|
|
411
|
-
)
|
|
396
|
+
pass
|
|
412
397
|
|
|
413
398
|
|
|
414
399
|
def _to_native_execution_engine_df(df: AnyDataFrame, schema: Any = None) -> DataFrame:
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
register_annotation_converter(
|
|
419
|
-
0.8,
|
|
420
|
-
SimpleAnnotationConverter(
|
|
421
|
-
NativeExecutionEngine,
|
|
422
|
-
lambda param: _NativeExecutionEngineParam(param),
|
|
423
|
-
),
|
|
424
|
-
)
|
|
400
|
+
fdf = as_fugue_df(df) if schema is None else as_fugue_df(df, schema=schema)
|
|
401
|
+
return fdf.as_local_bounded()
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
from typing import Any, Callable, Optional
|
|
2
2
|
|
|
3
|
+
from triad import Schema, assert_or_throw, to_uuid
|
|
4
|
+
|
|
3
5
|
from fugue.collections.yielded import Yielded
|
|
4
6
|
from fugue.dataframe import DataFrame
|
|
5
7
|
from fugue.exceptions import FugueWorkflowCompileError
|
|
8
|
+
from fugue.execution.api import as_fugue_engine_df
|
|
6
9
|
from fugue.extensions.creator import Creator
|
|
7
|
-
from triad import Schema, assert_or_throw, to_uuid
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class Load(Creator):
|
|
@@ -39,7 +41,7 @@ class CreateData(Creator):
|
|
|
39
41
|
def create(self) -> DataFrame:
|
|
40
42
|
if isinstance(self._df, Yielded):
|
|
41
43
|
return self.execution_engine.load_yielded(self._df)
|
|
42
|
-
return self.execution_engine
|
|
44
|
+
return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)
|
|
43
45
|
|
|
44
46
|
def _df_uid(self):
|
|
45
47
|
if self._data_determiner is not None:
|
|
@@ -6,7 +6,7 @@ from triad.utils.convert import to_type
|
|
|
6
6
|
from fugue.collections.partition import PartitionCursor
|
|
7
7
|
from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
|
|
8
8
|
from fugue.dataframe.array_dataframe import ArrayDataFrame
|
|
9
|
-
from fugue.dataframe.utils import _df_eq
|
|
9
|
+
from fugue.dataframe.utils import _df_eq
|
|
10
10
|
from fugue.exceptions import FugueWorkflowError
|
|
11
11
|
from fugue.execution.execution_engine import _generate_comap_empty_dfs
|
|
12
12
|
from fugue.rpc import EmptyRPCHandler, to_rpc_handler
|
|
@@ -99,6 +99,7 @@ class RunOutputTransformer(Outputter):
|
|
|
99
99
|
output_schema=tf.output_schema, # type: ignore
|
|
100
100
|
partition_spec=tf.partition_spec,
|
|
101
101
|
on_init=tr.on_init,
|
|
102
|
+
map_func_format_hint=tf.get_format_hint(),
|
|
102
103
|
)
|
|
103
104
|
self.execution_engine.persist(df, lazy=False)
|
|
104
105
|
|
|
@@ -135,7 +136,7 @@ class _TransformerRunner(object):
|
|
|
135
136
|
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
|
|
136
137
|
self.transformer._cursor = cursor # type: ignore
|
|
137
138
|
try:
|
|
138
|
-
|
|
139
|
+
self.transformer.transform(df).as_local_bounded()
|
|
139
140
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
140
141
|
except self.ignore_errors: # type: ignore
|
|
141
142
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
@@ -159,7 +160,7 @@ class _CoTransformerRunner(object):
|
|
|
159
160
|
def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
|
|
160
161
|
self.transformer._cursor = cursor # type: ignore
|
|
161
162
|
try:
|
|
162
|
-
|
|
163
|
+
self.transformer.transform(dfs).as_local_bounded()
|
|
163
164
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
164
165
|
except self.ignore_errors: # type: ignore
|
|
165
166
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
@@ -6,7 +6,6 @@ from fugue.dataframe import (
|
|
|
6
6
|
DataFrame,
|
|
7
7
|
DataFrames,
|
|
8
8
|
LocalDataFrame,
|
|
9
|
-
to_local_bounded_df,
|
|
10
9
|
)
|
|
11
10
|
from fugue.column import ColumnExpr, SelectColumns as ColumnsSelect
|
|
12
11
|
from fugue.exceptions import FugueWorkflowError
|
|
@@ -53,6 +52,7 @@ class RunTransformer(Processor):
|
|
|
53
52
|
output_schema=tf.output_schema, # type: ignore
|
|
54
53
|
partition_spec=tf.partition_spec,
|
|
55
54
|
on_init=tr.on_init,
|
|
55
|
+
map_func_format_hint=tf.get_format_hint(),
|
|
56
56
|
)
|
|
57
57
|
|
|
58
58
|
@no_type_check
|
|
@@ -333,7 +333,7 @@ class _TransformerRunner(object):
|
|
|
333
333
|
return self.transformer.transform(df)
|
|
334
334
|
else:
|
|
335
335
|
try:
|
|
336
|
-
return
|
|
336
|
+
return self.transformer.transform(df).as_local_bounded()
|
|
337
337
|
except self.ignore_errors: # type: ignore # pylint: disable=E0712
|
|
338
338
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
339
339
|
|
|
@@ -363,7 +363,7 @@ class _CoTransformerRunner(object):
|
|
|
363
363
|
|
|
364
364
|
else:
|
|
365
365
|
try:
|
|
366
|
-
return
|
|
366
|
+
return self.transformer.transform(dfs).as_local_bounded()
|
|
367
367
|
except self.ignore_errors: # type: ignore # pylint: disable=E0712
|
|
368
368
|
return ArrayDataFrame([], self.transformer.output_schema)
|
|
369
369
|
|
|
@@ -7,9 +7,10 @@ from triad.utils.assertion import assert_or_throw
|
|
|
7
7
|
from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
|
|
8
8
|
from triad.utils.hash import to_uuid
|
|
9
9
|
|
|
10
|
-
from fugue._utils.interfaceless import
|
|
10
|
+
from fugue._utils.interfaceless import parse_output_schema_from_comment
|
|
11
11
|
from fugue._utils.registry import fugue_plugin
|
|
12
12
|
from fugue.dataframe import DataFrame
|
|
13
|
+
from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
|
|
13
14
|
from fugue.exceptions import FugueInterfacelessError
|
|
14
15
|
from fugue.extensions.creator.creator import Creator
|
|
15
16
|
|
|
@@ -200,7 +201,9 @@ class _FuncAsCreator(Creator):
|
|
|
200
201
|
if schema is None:
|
|
201
202
|
schema = parse_output_schema_from_comment(func)
|
|
202
203
|
tr = _FuncAsCreator()
|
|
203
|
-
tr._wrapper =
|
|
204
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
205
|
+
func, "^e?x*z?$", "^[dlspq]$"
|
|
206
|
+
)
|
|
204
207
|
tr._engine_param = (
|
|
205
208
|
tr._wrapper._params.get_value_by_index(0)
|
|
206
209
|
if tr._wrapper.input_code.startswith("e")
|
|
@@ -4,9 +4,9 @@ from typing import Any, Callable, Dict, List, Optional, no_type_check
|
|
|
4
4
|
from triad import ParamDict, to_uuid
|
|
5
5
|
from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
|
|
6
6
|
|
|
7
|
-
from fugue._utils.interfaceless import FunctionWrapper
|
|
8
7
|
from fugue._utils.registry import fugue_plugin
|
|
9
8
|
from fugue.dataframe import DataFrames
|
|
9
|
+
from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
|
|
10
10
|
from fugue.exceptions import FugueInterfacelessError
|
|
11
11
|
from fugue.extensions._utils import (
|
|
12
12
|
load_namespace_extensions,
|
|
@@ -204,7 +204,7 @@ class _FuncAsOutputter(Outputter):
|
|
|
204
204
|
) -> "_FuncAsOutputter":
|
|
205
205
|
validation_rules.update(parse_validation_rules_from_comment(func))
|
|
206
206
|
tr = _FuncAsOutputter()
|
|
207
|
-
tr._wrapper =
|
|
207
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
208
208
|
func, "^e?(c|[dlspq]+)x*z?$", "^n$"
|
|
209
209
|
)
|
|
210
210
|
tr._engine_param = (
|
|
@@ -6,9 +6,10 @@ from triad.collections import Schema
|
|
|
6
6
|
from triad.utils.assertion import assert_or_throw
|
|
7
7
|
from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
|
|
8
8
|
|
|
9
|
-
from fugue._utils.interfaceless import
|
|
9
|
+
from fugue._utils.interfaceless import parse_output_schema_from_comment
|
|
10
10
|
from fugue._utils.registry import fugue_plugin
|
|
11
11
|
from fugue.dataframe import DataFrame, DataFrames
|
|
12
|
+
from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
|
|
12
13
|
from fugue.exceptions import FugueInterfacelessError
|
|
13
14
|
from fugue.extensions.processor.processor import Processor
|
|
14
15
|
|
|
@@ -223,7 +224,7 @@ class _FuncAsProcessor(Processor):
|
|
|
223
224
|
schema = parse_output_schema_from_comment(func)
|
|
224
225
|
validation_rules.update(parse_validation_rules_from_comment(func))
|
|
225
226
|
tr = _FuncAsProcessor()
|
|
226
|
-
tr._wrapper =
|
|
227
|
+
tr._wrapper = DataFrameFunctionWrapper(
|
|
227
228
|
func, "^e?(c|[dlspq]+)x*z?$", "^[dlspq]$"
|
|
228
229
|
) # type: ignore
|
|
229
230
|
tr._engine_param = (
|
|
@@ -6,13 +6,10 @@ from triad.utils.assertion import assert_arg_not_none, assert_or_throw
|
|
|
6
6
|
from triad.utils.convert import get_caller_global_local_vars, to_function, to_instance
|
|
7
7
|
from triad.utils.hash import to_uuid
|
|
8
8
|
|
|
9
|
-
from fugue._utils.interfaceless import
|
|
10
|
-
FunctionWrapper,
|
|
11
|
-
is_class_method,
|
|
12
|
-
parse_output_schema_from_comment,
|
|
13
|
-
)
|
|
9
|
+
from fugue._utils.interfaceless import is_class_method, parse_output_schema_from_comment
|
|
14
10
|
from fugue._utils.registry import fugue_plugin
|
|
15
11
|
from fugue.dataframe import ArrayDataFrame, DataFrame, DataFrames, LocalDataFrame
|
|
12
|
+
from fugue.dataframe.function_wrapper import DataFrameFunctionWrapper
|
|
16
13
|
from fugue.exceptions import FugueInterfacelessError
|
|
17
14
|
from fugue.extensions.transformer.constants import OUTPUT_TRANSFORMER_DUMMY_SCHEMA
|
|
18
15
|
from fugue.extensions.transformer.transformer import CoTransformer, Transformer
|
|
@@ -336,6 +333,9 @@ class _FuncAsTransformer(Transformer):
|
|
|
336
333
|
def get_output_schema(self, df: DataFrame) -> Any:
|
|
337
334
|
return self._parse_schema(self._output_schema_arg, df) # type: ignore
|
|
338
335
|
|
|
336
|
+
def get_format_hint(self) -> Optional[str]:
|
|
337
|
+
return self._format_hint # type: ignore
|
|
338
|
+
|
|
339
339
|
@property
|
|
340
340
|
def validation_rules(self) -> Dict[str, Any]:
|
|
341
341
|
return self._validation_rules # type: ignore
|
|
@@ -374,13 +374,14 @@ class _FuncAsTransformer(Transformer):
|
|
|
374
374
|
validation_rules.update(parse_validation_rules_from_comment(func))
|
|
375
375
|
assert_arg_not_none(schema, "schema")
|
|
376
376
|
tr = _FuncAsTransformer()
|
|
377
|
-
tr._wrapper =
|
|
377
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
378
378
|
func, "^[lspq][fF]?x*z?$", "^[lspq]$"
|
|
379
379
|
)
|
|
380
380
|
tr._output_schema_arg = schema # type: ignore
|
|
381
381
|
tr._validation_rules = validation_rules # type: ignore
|
|
382
382
|
tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
|
|
383
383
|
tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
|
|
384
|
+
tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
|
|
384
385
|
return tr
|
|
385
386
|
|
|
386
387
|
|
|
@@ -392,6 +393,9 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
|
|
|
392
393
|
def get_output_schema(self, df: DataFrame) -> Any:
|
|
393
394
|
return OUTPUT_TRANSFORMER_DUMMY_SCHEMA
|
|
394
395
|
|
|
396
|
+
def get_format_hint(self) -> Optional[str]:
|
|
397
|
+
return self._format_hint # type: ignore
|
|
398
|
+
|
|
395
399
|
@no_type_check
|
|
396
400
|
def transform(self, df: LocalDataFrame) -> LocalDataFrame:
|
|
397
401
|
args = [df] + _get_callback(self)
|
|
@@ -405,13 +409,14 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
|
|
|
405
409
|
assert_or_throw(schema is None, "schema must be None for output transformers")
|
|
406
410
|
validation_rules.update(parse_validation_rules_from_comment(func))
|
|
407
411
|
tr = _FuncAsOutputTransformer()
|
|
408
|
-
tr._wrapper =
|
|
412
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
409
413
|
func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
|
|
410
414
|
)
|
|
411
415
|
tr._output_schema_arg = None # type: ignore
|
|
412
416
|
tr._validation_rules = validation_rules # type: ignore
|
|
413
417
|
tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
|
|
414
418
|
tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
|
|
419
|
+
tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
|
|
415
420
|
return tr
|
|
416
421
|
|
|
417
422
|
|
|
@@ -423,6 +428,9 @@ class _FuncAsCoTransformer(CoTransformer):
|
|
|
423
428
|
def get_output_schema(self, dfs: DataFrames) -> Any:
|
|
424
429
|
return self._parse_schema(self._output_schema_arg, dfs) # type: ignore
|
|
425
430
|
|
|
431
|
+
def get_format_hint(self) -> Optional[str]:
|
|
432
|
+
return self._format_hint # type: ignore
|
|
433
|
+
|
|
426
434
|
@property
|
|
427
435
|
def validation_rules(self) -> ParamDict:
|
|
428
436
|
return self._validation_rules # type: ignore
|
|
@@ -494,7 +502,7 @@ class _FuncAsCoTransformer(CoTransformer):
|
|
|
494
502
|
)
|
|
495
503
|
assert_arg_not_none(schema, "schema")
|
|
496
504
|
tr = _FuncAsCoTransformer()
|
|
497
|
-
tr._wrapper =
|
|
505
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
498
506
|
func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
|
|
499
507
|
)
|
|
500
508
|
tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
|
|
@@ -502,6 +510,7 @@ class _FuncAsCoTransformer(CoTransformer):
|
|
|
502
510
|
tr._validation_rules = {} # type: ignore
|
|
503
511
|
tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
|
|
504
512
|
tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
|
|
513
|
+
tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
|
|
505
514
|
return tr
|
|
506
515
|
|
|
507
516
|
|
|
@@ -513,6 +522,9 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
|
|
|
513
522
|
def get_output_schema(self, dfs: DataFrames) -> Any:
|
|
514
523
|
return OUTPUT_TRANSFORMER_DUMMY_SCHEMA
|
|
515
524
|
|
|
525
|
+
def get_format_hint(self) -> Optional[str]:
|
|
526
|
+
return self._format_hint # type: ignore
|
|
527
|
+
|
|
516
528
|
@no_type_check
|
|
517
529
|
def transform(self, dfs: DataFrames) -> LocalDataFrame:
|
|
518
530
|
cb = _get_callback(self)
|
|
@@ -549,7 +561,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
|
|
|
549
561
|
)
|
|
550
562
|
|
|
551
563
|
tr = _FuncAsOutputCoTransformer()
|
|
552
|
-
tr._wrapper =
|
|
564
|
+
tr._wrapper = DataFrameFunctionWrapper( # type: ignore
|
|
553
565
|
func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
|
|
554
566
|
)
|
|
555
567
|
tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
|
|
@@ -557,6 +569,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
|
|
|
557
569
|
tr._validation_rules = {} # type: ignore
|
|
558
570
|
tr._uses_callback = "f" in tr._wrapper.input_code.lower() # type: ignore
|
|
559
571
|
tr._requires_callback = "F" in tr._wrapper.input_code # type: ignore
|
|
572
|
+
tr._format_hint = tr._wrapper.get_format_hint() # type: ignore
|
|
560
573
|
return tr
|
|
561
574
|
|
|
562
575
|
|