fugue 0.8.7.dev7__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/collections/sql.py +1 -1
- fugue/dataframe/utils.py +4 -18
- fugue/test/__init__.py +11 -0
- fugue/test/pandas_tester.py +24 -0
- fugue/test/plugins.py +393 -0
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/METADATA +24 -15
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/RECORD +38 -47
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/WHEEL +1 -1
- fugue-0.9.0.dist-info/entry_points.txt +12 -0
- fugue_dask/_io.py +8 -5
- fugue_dask/_utils.py +4 -4
- fugue_dask/execution_engine.py +11 -0
- fugue_dask/registry.py +2 -0
- fugue_dask/tester.py +24 -0
- fugue_duckdb/__init__.py +0 -5
- fugue_duckdb/_io.py +1 -0
- fugue_duckdb/registry.py +30 -2
- fugue_duckdb/tester.py +49 -0
- fugue_ibis/__init__.py +0 -3
- fugue_ibis/dataframe.py +2 -2
- fugue_ibis/execution_engine.py +14 -7
- fugue_ray/_constants.py +3 -4
- fugue_ray/_utils/dataframe.py +10 -21
- fugue_ray/_utils/io.py +38 -9
- fugue_ray/execution_engine.py +1 -2
- fugue_ray/registry.py +1 -0
- fugue_ray/tester.py +22 -0
- fugue_spark/execution_engine.py +5 -5
- fugue_spark/registry.py +13 -1
- fugue_spark/tester.py +78 -0
- fugue_test/__init__.py +82 -0
- fugue_test/builtin_suite.py +26 -43
- fugue_test/dataframe_suite.py +5 -14
- fugue_test/execution_suite.py +170 -143
- fugue_test/fixtures.py +61 -0
- fugue_version/__init__.py +1 -1
- fugue-0.8.7.dev7.dist-info/entry_points.txt +0 -17
- fugue_dask/ibis_engine.py +0 -62
- fugue_duckdb/ibis_engine.py +0 -56
- fugue_ibis/execution/__init__.py +0 -0
- fugue_ibis/execution/ibis_engine.py +0 -49
- fugue_ibis/execution/pandas_backend.py +0 -54
- fugue_ibis/extensions.py +0 -203
- fugue_spark/ibis_engine.py +0 -45
- fugue_test/ibis_suite.py +0 -92
- fugue_test/plugins/__init__.py +0 -0
- fugue_test/plugins/dask/__init__.py +0 -2
- fugue_test/plugins/dask/fixtures.py +0 -12
- fugue_test/plugins/duckdb/__init__.py +0 -2
- fugue_test/plugins/duckdb/fixtures.py +0 -9
- fugue_test/plugins/misc/__init__.py +0 -2
- fugue_test/plugins/misc/fixtures.py +0 -18
- fugue_test/plugins/ray/__init__.py +0 -2
- fugue_test/plugins/ray/fixtures.py +0 -9
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/top_level.txt +0 -0
fugue_test/fixtures.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
_DEFAULT_SCOPE = "module"
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def pandas_session():
    """Module-scoped fixture yielding the session token for the pandas backend.

    Pandas needs no real session object, so the backend name string stands in.
    """
    backend_token = "pandas"
    yield backend_token
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def native_session():
    """Module-scoped fixture yielding the session token for the native backend.

    The native (pure Python) engine needs no real session, so the backend
    name string stands in.
    """
    backend_token = "native"
    yield backend_token
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def dask_session():
    """Module-scoped fixture yielding a Dask test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``dask`` extra to be installed.
    """
    from fugue_dask.tester import DaskTestBackend

    session_cm = DaskTestBackend.generate_session_fixture()
    with session_cm as dask_sess:
        yield dask_sess
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def duckdb_session():
    """Module-scoped fixture yielding a DuckDB test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``duckdb`` extra to be installed.
    """
    from fugue_duckdb.tester import DuckDBTestBackend

    session_cm = DuckDBTestBackend.generate_session_fixture()
    with session_cm as duck_sess:
        yield duck_sess
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def duckdask_session():
    """Module-scoped fixture yielding a combined DuckDB+Dask test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``duckdb``/``dask`` extras to be installed.
    """
    from fugue_duckdb.tester import DuckDaskTestBackend

    session_cm = DuckDaskTestBackend.generate_session_fixture()
    with session_cm as duckdask_sess:
        yield duckdask_sess
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def ray_session():
    """Module-scoped fixture yielding a Ray test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``ray`` extra to be installed.
    """
    from fugue_ray.tester import RayTestBackend

    session_cm = RayTestBackend.generate_session_fixture()
    with session_cm as ray_sess:
        yield ray_sess
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def spark_session():
    """Module-scoped fixture yielding a Spark test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``spark`` extra to be installed.
    """
    from fugue_spark.tester import SparkTestBackend

    session_cm = SparkTestBackend.generate_session_fixture()
    with session_cm as spark_sess:
        yield spark_sess
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@pytest.fixture(scope=_DEFAULT_SCOPE)
def sparkconnect_session():
    """Module-scoped fixture yielding a Spark Connect test session.

    The import is deferred into the fixture body so that collecting this
    module does not require the ``spark`` extra to be installed.
    """
    from fugue_spark.tester import SparkConnectTestBackend

    session_cm = SparkConnectTestBackend.generate_session_fixture()
    with session_cm as spark_connect_sess:
        yield spark_connect_sess
|
fugue_version/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.
|
|
1
|
+
__version__ = "0.9.0"  # release version; this diff bumps it from 0.8.7.dev7
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
[fugue.plugins]
|
|
2
|
-
dask = fugue_dask.registry [dask]
|
|
3
|
-
dask_ibis = fugue_dask.ibis_engine [dask,ibis]
|
|
4
|
-
duckdb = fugue_duckdb.registry [duckdb]
|
|
5
|
-
duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
|
|
6
|
-
ibis = fugue_ibis [ibis]
|
|
7
|
-
polars = fugue_polars.registry [polars]
|
|
8
|
-
ray = fugue_ray.registry [ray]
|
|
9
|
-
spark = fugue_spark.registry [spark]
|
|
10
|
-
spark_ibis = fugue_spark.ibis_engine [spark,ibis]
|
|
11
|
-
|
|
12
|
-
[pytest11]
|
|
13
|
-
fugue_test_dask = fugue_test.plugins.dask [dask]
|
|
14
|
-
fugue_test_duckdb = fugue_test.plugins.duckdb [duckdb]
|
|
15
|
-
fugue_test_misc = fugue_test.plugins.misc
|
|
16
|
-
fugue_test_ray = fugue_test.plugins.ray [ray]
|
|
17
|
-
|
fugue_dask/ibis_engine.py
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
from typing import Any, Callable
|
|
2
|
-
|
|
3
|
-
import dask.dataframe as dd
|
|
4
|
-
import ibis
|
|
5
|
-
from ibis.backends.dask import Backend
|
|
6
|
-
from triad.utils.assertion import assert_or_throw
|
|
7
|
-
|
|
8
|
-
from fugue import DataFrame, DataFrames, ExecutionEngine
|
|
9
|
-
from fugue_dask.dataframe import DaskDataFrame
|
|
10
|
-
from fugue_dask.execution_engine import DaskExecutionEngine
|
|
11
|
-
from fugue_ibis import IbisTable
|
|
12
|
-
from fugue_ibis._utils import to_ibis_schema, to_schema
|
|
13
|
-
from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class DaskIbisEngine(IbisEngine):
|
|
17
|
-
def __init__(self, execution_engine: ExecutionEngine) -> None:
|
|
18
|
-
assert_or_throw(
|
|
19
|
-
isinstance(execution_engine, DaskExecutionEngine),
|
|
20
|
-
lambda: ValueError(
|
|
21
|
-
f"DaskIbisEngine must use DaskExecutionEngine ({execution_engine})"
|
|
22
|
-
),
|
|
23
|
-
)
|
|
24
|
-
super().__init__(execution_engine)
|
|
25
|
-
|
|
26
|
-
def select(
|
|
27
|
-
self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable]
|
|
28
|
-
) -> DataFrame:
|
|
29
|
-
pdfs = {
|
|
30
|
-
k: self.execution_engine.to_df(v).native # type: ignore
|
|
31
|
-
for k, v in dfs.items()
|
|
32
|
-
}
|
|
33
|
-
be = _BackendWrapper().connect(pdfs)
|
|
34
|
-
be.set_schemas(dfs)
|
|
35
|
-
expr = ibis_func(be)
|
|
36
|
-
schema = to_schema(expr.schema())
|
|
37
|
-
result = expr.compile()
|
|
38
|
-
assert_or_throw(
|
|
39
|
-
isinstance(result, dd.DataFrame),
|
|
40
|
-
lambda: ValueError(f"result must be a Dask DataFrame ({type(result)})"),
|
|
41
|
-
)
|
|
42
|
-
return DaskDataFrame(result, schema=schema)
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
@parse_ibis_engine.candidate(
|
|
46
|
-
lambda obj, *args, **kwargs: isinstance(obj, DaskExecutionEngine)
|
|
47
|
-
)
|
|
48
|
-
def _to_dask_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine:
|
|
49
|
-
return DaskIbisEngine(engine)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
class _BackendWrapper(Backend):
|
|
53
|
-
def set_schemas(self, dfs: DataFrames) -> None:
|
|
54
|
-
self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()}
|
|
55
|
-
|
|
56
|
-
def table(self, name: str, schema: Any = None):
|
|
57
|
-
return super().table(
|
|
58
|
-
name,
|
|
59
|
-
schema=self._schemas[name]
|
|
60
|
-
if schema is None and name in self._schemas
|
|
61
|
-
else schema,
|
|
62
|
-
)
|
fugue_duckdb/ibis_engine.py
DELETED
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
from typing import Any, Callable, Dict, Optional, Tuple
|
|
2
|
-
|
|
3
|
-
import ibis
|
|
4
|
-
from ibis.backends.pandas import Backend
|
|
5
|
-
|
|
6
|
-
from fugue import DataFrame, DataFrames, ExecutionEngine
|
|
7
|
-
from fugue.collections.sql import StructuredRawSQL, TempTableName
|
|
8
|
-
from fugue_ibis import IbisTable
|
|
9
|
-
from fugue_ibis._utils import to_ibis_schema
|
|
10
|
-
from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine
|
|
11
|
-
|
|
12
|
-
from .execution_engine import DuckDBEngine, DuckExecutionEngine
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class DuckDBIbisEngine(IbisEngine):
|
|
16
|
-
def select(
|
|
17
|
-
self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable]
|
|
18
|
-
) -> DataFrame:
|
|
19
|
-
be = _BackendWrapper().connect({})
|
|
20
|
-
be.set_schemas(dfs)
|
|
21
|
-
expr = ibis_func(be)
|
|
22
|
-
sql = StructuredRawSQL.from_expr(
|
|
23
|
-
str(
|
|
24
|
-
ibis.postgres.compile(expr).compile(
|
|
25
|
-
compile_kwargs={"literal_binds": True}
|
|
26
|
-
)
|
|
27
|
-
),
|
|
28
|
-
prefix='"<tmpdf:',
|
|
29
|
-
suffix='>"',
|
|
30
|
-
dialect="postgres",
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
engine = DuckDBEngine(self.execution_engine)
|
|
34
|
-
_dfs = DataFrames({be._name_map[k][0].key: v for k, v in dfs.items()})
|
|
35
|
-
return engine.select(_dfs, sql)
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
@parse_ibis_engine.candidate(
|
|
39
|
-
lambda obj, *args, **kwargs: isinstance(obj, DuckExecutionEngine)
|
|
40
|
-
or (isinstance(obj, str) and obj in ["duck", "duckdb"])
|
|
41
|
-
)
|
|
42
|
-
def _to_duck_ibis_engine(obj: Any, engine: ExecutionEngine) -> Optional[IbisEngine]:
|
|
43
|
-
return DuckDBIbisEngine(engine)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class _BackendWrapper(Backend):
|
|
47
|
-
def set_schemas(self, dfs: DataFrames) -> None:
|
|
48
|
-
self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()}
|
|
49
|
-
self._name_map: Dict[str, Tuple[TempTableName, IbisTable]] = {}
|
|
50
|
-
|
|
51
|
-
def table(self, name: str, schema: Any = None) -> IbisTable:
|
|
52
|
-
if name not in self._name_map:
|
|
53
|
-
tn = TempTableName()
|
|
54
|
-
tb = ibis.table(self._schemas[name], name=(str(tn)))
|
|
55
|
-
self._name_map[name] = (tn, tb)
|
|
56
|
-
return self._name_map[name][1]
|
fugue_ibis/execution/__init__.py
DELETED
|
File without changes
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
from abc import abstractmethod
|
|
2
|
-
from typing import Any, Callable
|
|
3
|
-
|
|
4
|
-
import ibis
|
|
5
|
-
|
|
6
|
-
from fugue import AnyDataFrame, DataFrame, DataFrames, EngineFacet, ExecutionEngine
|
|
7
|
-
from fugue._utils.registry import fugue_plugin
|
|
8
|
-
|
|
9
|
-
from .._compat import IbisTable
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
@fugue_plugin
|
|
13
|
-
def parse_ibis_engine(obj: Any, engine: ExecutionEngine) -> "IbisEngine":
|
|
14
|
-
if isinstance(obj, IbisEngine):
|
|
15
|
-
return obj
|
|
16
|
-
raise NotImplementedError(
|
|
17
|
-
f"Ibis execution engine can't be parsed from {obj}."
|
|
18
|
-
" You may need to register a parser for it."
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class IbisEngine(EngineFacet):
|
|
23
|
-
"""The abstract base class for different ibis execution implementations.
|
|
24
|
-
|
|
25
|
-
:param execution_engine: the execution engine this ibis engine will run on
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
@property
|
|
29
|
-
def is_distributed(self) -> bool: # pragma: no cover
|
|
30
|
-
return self.execution_engine.is_distributed
|
|
31
|
-
|
|
32
|
-
def to_df(self, df: AnyDataFrame, schema: Any = None) -> DataFrame:
|
|
33
|
-
raise NotImplementedError # pragma: no cover
|
|
34
|
-
|
|
35
|
-
@abstractmethod
|
|
36
|
-
def select(
|
|
37
|
-
self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable]
|
|
38
|
-
) -> DataFrame: # pragma: no cover
|
|
39
|
-
"""Execute the ibis select expression.
|
|
40
|
-
|
|
41
|
-
:param dfs: a collection of dataframes that must have keys
|
|
42
|
-
:param ibis_func: the ibis compute function
|
|
43
|
-
:return: result of the ibis function
|
|
44
|
-
|
|
45
|
-
.. note::
|
|
46
|
-
|
|
47
|
-
This interface is experimental, so it is subjected to change.
|
|
48
|
-
"""
|
|
49
|
-
raise NotImplementedError
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from typing import Any, Callable
|
|
2
|
-
|
|
3
|
-
import ibis
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from ibis.backends.pandas import Backend
|
|
6
|
-
from triad.utils.assertion import assert_or_throw
|
|
7
|
-
|
|
8
|
-
from fugue import (
|
|
9
|
-
DataFrame,
|
|
10
|
-
DataFrames,
|
|
11
|
-
ExecutionEngine,
|
|
12
|
-
NativeExecutionEngine,
|
|
13
|
-
PandasDataFrame,
|
|
14
|
-
)
|
|
15
|
-
from fugue_ibis._utils import to_ibis_schema, to_schema
|
|
16
|
-
|
|
17
|
-
from .._compat import IbisTable
|
|
18
|
-
from .ibis_engine import IbisEngine, parse_ibis_engine
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
class PandasIbisEngine(IbisEngine):
|
|
22
|
-
def select(
|
|
23
|
-
self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable]
|
|
24
|
-
) -> DataFrame: # pragma: no cover
|
|
25
|
-
pdfs = {k: v.as_pandas() for k, v in dfs.items()}
|
|
26
|
-
be = _BackendWrapper().connect(pdfs)
|
|
27
|
-
be.set_schemas(dfs)
|
|
28
|
-
expr = ibis_func(be)
|
|
29
|
-
schema = to_schema(expr.schema())
|
|
30
|
-
result = expr.execute()
|
|
31
|
-
assert_or_throw(
|
|
32
|
-
isinstance(result, pd.DataFrame), "result must be a pandas DataFrame"
|
|
33
|
-
)
|
|
34
|
-
return PandasDataFrame(result, schema=schema)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
@parse_ibis_engine.candidate(
|
|
38
|
-
lambda obj, *args, **kwargs: isinstance(obj, NativeExecutionEngine)
|
|
39
|
-
)
|
|
40
|
-
def _pd_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine:
|
|
41
|
-
return PandasIbisEngine(engine)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class _BackendWrapper(Backend):
|
|
45
|
-
def set_schemas(self, dfs: DataFrames) -> None:
|
|
46
|
-
self._schemas = {k: to_ibis_schema(v.schema) for k, v in dfs.items()}
|
|
47
|
-
|
|
48
|
-
def table(self, name: str, schema: Any = None):
|
|
49
|
-
return super().table(
|
|
50
|
-
name,
|
|
51
|
-
schema=self._schemas[name]
|
|
52
|
-
if schema is None and name in self._schemas
|
|
53
|
-
else schema,
|
|
54
|
-
)
|
fugue_ibis/extensions.py
DELETED
|
@@ -1,203 +0,0 @@
|
|
|
1
|
-
from typing import Any, Callable, Dict
|
|
2
|
-
|
|
3
|
-
import ibis
|
|
4
|
-
from fugue import DataFrame, DataFrames, Processor, WorkflowDataFrame
|
|
5
|
-
from fugue.exceptions import FugueWorkflowCompileError
|
|
6
|
-
from fugue.workflow.workflow import WorkflowDataFrames
|
|
7
|
-
from triad import assert_or_throw, extension_method
|
|
8
|
-
|
|
9
|
-
from ._utils import LazyIbisObject, _materialize
|
|
10
|
-
from .execution.ibis_engine import parse_ibis_engine
|
|
11
|
-
|
|
12
|
-
from ._compat import IbisTable
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def run_ibis(
|
|
16
|
-
ibis_func: Callable[[ibis.BaseBackend], IbisTable],
|
|
17
|
-
ibis_engine: Any = None,
|
|
18
|
-
**dfs: WorkflowDataFrame,
|
|
19
|
-
) -> WorkflowDataFrame:
|
|
20
|
-
"""Run an ibis workflow wrapped in ``ibis_func``
|
|
21
|
-
|
|
22
|
-
:param ibis_func: the function taking in an ibis backend, and returning
|
|
23
|
-
an Ibis TableExpr
|
|
24
|
-
:param ibis_engine: an object that together with |ExecutionEngine|
|
|
25
|
-
can determine :class:`~fugue_ibis.execution.ibis_engine.IbisEngine`
|
|
26
|
-
, defaults to None
|
|
27
|
-
:param dfs: dataframes in the same workflow
|
|
28
|
-
:return: the output workflow dataframe
|
|
29
|
-
|
|
30
|
-
.. admonition:: Examples
|
|
31
|
-
|
|
32
|
-
.. code-block:: python
|
|
33
|
-
|
|
34
|
-
import fugue as FugueWorkflow
|
|
35
|
-
from fugue_ibis import run_ibis
|
|
36
|
-
|
|
37
|
-
def func(backend):
|
|
38
|
-
t = backend.table("tb")
|
|
39
|
-
return t.mutate(b=t.a+1)
|
|
40
|
-
|
|
41
|
-
dag = FugueWorkflow()
|
|
42
|
-
df = dag.df([[0]], "a:int")
|
|
43
|
-
result = run_ibis(func, tb=df)
|
|
44
|
-
result.show()
|
|
45
|
-
"""
|
|
46
|
-
wdfs = WorkflowDataFrames(**dfs)
|
|
47
|
-
return wdfs.workflow.process(
|
|
48
|
-
wdfs,
|
|
49
|
-
using=_IbisProcessor,
|
|
50
|
-
params=dict(ibis_func=ibis_func, ibis_engine=ibis_engine),
|
|
51
|
-
)
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@extension_method
|
|
55
|
-
def as_ibis(df: WorkflowDataFrame) -> IbisTable:
|
|
56
|
-
"""Convert the Fugue workflow dataframe to an ibis table for ibis
|
|
57
|
-
operations.
|
|
58
|
-
|
|
59
|
-
:param df: the Fugue workflow dataframe
|
|
60
|
-
:return: the object representing the ibis table
|
|
61
|
-
|
|
62
|
-
.. admonition:: Examples
|
|
63
|
-
|
|
64
|
-
.. code-block:: python
|
|
65
|
-
|
|
66
|
-
# non-magical approach
|
|
67
|
-
import fugue as FugueWorkflow
|
|
68
|
-
from fugue_ibis import as_ibis, as_fugue
|
|
69
|
-
|
|
70
|
-
dag = FugueWorkflow()
|
|
71
|
-
df1 = dag.df([[0]], "a:int")
|
|
72
|
-
df2 = dag.df([[1]], "a:int")
|
|
73
|
-
idf1 = as_ibis(df1)
|
|
74
|
-
idf2 = as_ibis(df2)
|
|
75
|
-
idf3 = idf1.union(idf2)
|
|
76
|
-
result = idf3.mutate(b=idf3.a+1)
|
|
77
|
-
as_fugue(result).show()
|
|
78
|
-
|
|
79
|
-
.. code-block:: python
|
|
80
|
-
|
|
81
|
-
# magical approach
|
|
82
|
-
import fugue as FugueWorkflow
|
|
83
|
-
import fugue_ibis # must import
|
|
84
|
-
|
|
85
|
-
dag = FugueWorkflow()
|
|
86
|
-
idf1 = dag.df([[0]], "a:int").as_ibis()
|
|
87
|
-
idf2 = dag.df([[1]], "a:int").as_ibis()
|
|
88
|
-
idf3 = idf1.union(idf2)
|
|
89
|
-
result = idf3.mutate(b=idf3.a+1).as_fugue()
|
|
90
|
-
result.show()
|
|
91
|
-
|
|
92
|
-
.. note::
|
|
93
|
-
|
|
94
|
-
The magic is that when importing ``fugue_ibis``, the functions
|
|
95
|
-
``as_ibis`` and ``as_fugue`` are added to the correspondent classes
|
|
96
|
-
so you can use them as if they are parts of the original classes.
|
|
97
|
-
|
|
98
|
-
This is an idea similar to patching. Ibis uses this programming model
|
|
99
|
-
a lot. Fugue provides this as an option.
|
|
100
|
-
|
|
101
|
-
.. note::
|
|
102
|
-
|
|
103
|
-
The returned object is not really a ``TableExpr``, it's a 'super lazy'
|
|
104
|
-
object that will be translated into ``TableExpr`` at run time.
|
|
105
|
-
This is because to compile an ibis execution graph, the input schemas
|
|
106
|
-
must be known. However, in Fugue, this is not always true. For example
|
|
107
|
-
if the previous step is to pivot a table, then the output schema can be
|
|
108
|
-
known at runtime. So in order to be a part of Fugue, we need to be able to
|
|
109
|
-
construct ibis expressions before knowing the input schemas.
|
|
110
|
-
"""
|
|
111
|
-
return LazyIbisObject(df) # type: ignore
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
@extension_method(class_type=LazyIbisObject)
|
|
115
|
-
def as_fugue(
|
|
116
|
-
expr: IbisTable,
|
|
117
|
-
ibis_engine: Any = None,
|
|
118
|
-
) -> WorkflowDataFrame:
|
|
119
|
-
"""Convert a lazy ibis object to Fugue workflow dataframe
|
|
120
|
-
|
|
121
|
-
:param expr: the actual instance should be LazyIbisObject
|
|
122
|
-
:return: the Fugue workflow dataframe
|
|
123
|
-
|
|
124
|
-
.. admonition:: Examples
|
|
125
|
-
|
|
126
|
-
.. code-block:: python
|
|
127
|
-
|
|
128
|
-
# non-magical approach
|
|
129
|
-
import fugue as FugueWorkflow
|
|
130
|
-
from fugue_ibis import as_ibis, as_fugue
|
|
131
|
-
|
|
132
|
-
dag = FugueWorkflow()
|
|
133
|
-
df1 = dag.df([[0]], "a:int")
|
|
134
|
-
df2 = dag.df([[1]], "a:int")
|
|
135
|
-
idf1 = as_ibis(df1)
|
|
136
|
-
idf2 = as_ibis(df2)
|
|
137
|
-
idf3 = idf1.union(idf2)
|
|
138
|
-
result = idf3.mutate(b=idf3.a+1)
|
|
139
|
-
as_fugue(result).show()
|
|
140
|
-
|
|
141
|
-
.. code-block:: python
|
|
142
|
-
|
|
143
|
-
# magical approach
|
|
144
|
-
import fugue as FugueWorkflow
|
|
145
|
-
import fugue_ibis # must import
|
|
146
|
-
|
|
147
|
-
dag = FugueWorkflow()
|
|
148
|
-
idf1 = dag.df([[0]], "a:int").as_ibis()
|
|
149
|
-
idf2 = dag.df([[1]], "a:int").as_ibis()
|
|
150
|
-
idf3 = idf1.union(idf2)
|
|
151
|
-
result = idf3.mutate(b=idf3.a+1).as_fugue()
|
|
152
|
-
result.show()
|
|
153
|
-
|
|
154
|
-
.. note::
|
|
155
|
-
|
|
156
|
-
The magic is that when importing ``fugue_ibis``, the functions
|
|
157
|
-
``as_ibis`` and ``as_fugue`` are added to the correspondent classes
|
|
158
|
-
so you can use them as if they are parts of the original classes.
|
|
159
|
-
|
|
160
|
-
This is an idea similar to patching. Ibis uses this programming model
|
|
161
|
-
a lot. Fugue provides this as an option.
|
|
162
|
-
|
|
163
|
-
.. note::
|
|
164
|
-
|
|
165
|
-
The returned object is not really a ``TableExpr``, it's a 'super lazy'
|
|
166
|
-
object that will be translated into ``TableExpr`` at run time.
|
|
167
|
-
This is because to compile an ibis execution graph, the input schemas
|
|
168
|
-
must be known. However, in Fugue, this is not always true. For example
|
|
169
|
-
if the previous step is to pivot a table, then the output schema can be
|
|
170
|
-
known at runtime. So in order to be a part of Fugue, we need to be able to
|
|
171
|
-
construct ibis expressions before knowing the input schemas.
|
|
172
|
-
"""
|
|
173
|
-
|
|
174
|
-
def _func(
|
|
175
|
-
be: ibis.BaseBackend,
|
|
176
|
-
lazy_expr: LazyIbisObject,
|
|
177
|
-
ctx: Dict[int, Any],
|
|
178
|
-
) -> IbisTable:
|
|
179
|
-
return _materialize(
|
|
180
|
-
lazy_expr, {k: be.table(f"_{id(v)}") for k, v in ctx.items()}
|
|
181
|
-
)
|
|
182
|
-
|
|
183
|
-
assert_or_throw(
|
|
184
|
-
isinstance(expr, LazyIbisObject),
|
|
185
|
-
FugueWorkflowCompileError("expr must be a LazyIbisObject"),
|
|
186
|
-
)
|
|
187
|
-
_lazy_expr: LazyIbisObject = expr # type: ignore
|
|
188
|
-
_ctx = _lazy_expr._super_lazy_internal_ctx
|
|
189
|
-
_dfs = {f"_{id(v)}": v for _, v in _ctx.items()}
|
|
190
|
-
return run_ibis(
|
|
191
|
-
lambda be: _func(be, _lazy_expr, _ctx), ibis_engine=ibis_engine, **_dfs
|
|
192
|
-
)
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
class _IbisProcessor(Processor):
|
|
196
|
-
def process(self, dfs: DataFrames) -> DataFrame:
|
|
197
|
-
ibis_func = self.params.get_or_throw("ibis_func", Callable)
|
|
198
|
-
ibis_engine = self.params.get_or_none("ibis_engine", object)
|
|
199
|
-
ie = parse_ibis_engine(
|
|
200
|
-
self.execution_engine if ibis_engine is None else ibis_engine,
|
|
201
|
-
self.execution_engine,
|
|
202
|
-
)
|
|
203
|
-
return ie.select(dfs, ibis_func)
|
fugue_spark/ibis_engine.py
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
from typing import Any, Callable
|
|
2
|
-
|
|
3
|
-
import ibis
|
|
4
|
-
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
5
|
-
from triad.utils.assertion import assert_or_throw
|
|
6
|
-
|
|
7
|
-
from fugue import DataFrame, DataFrames, ExecutionEngine
|
|
8
|
-
from fugue_ibis import IbisTable
|
|
9
|
-
from fugue_ibis._utils import to_schema
|
|
10
|
-
from fugue_ibis.execution.ibis_engine import IbisEngine, parse_ibis_engine
|
|
11
|
-
from fugue_spark.dataframe import SparkDataFrame
|
|
12
|
-
from fugue_spark.execution_engine import SparkExecutionEngine
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class SparkIbisEngine(IbisEngine):
|
|
16
|
-
def __init__(self, execution_engine: ExecutionEngine) -> None:
|
|
17
|
-
assert_or_throw(
|
|
18
|
-
isinstance(execution_engine, SparkExecutionEngine),
|
|
19
|
-
lambda: ValueError(
|
|
20
|
-
f"SparkIbisEngine must use SparkExecutionEngine ({execution_engine})"
|
|
21
|
-
),
|
|
22
|
-
)
|
|
23
|
-
super().__init__(execution_engine)
|
|
24
|
-
|
|
25
|
-
def select(
|
|
26
|
-
self, dfs: DataFrames, ibis_func: Callable[[ibis.BaseBackend], IbisTable]
|
|
27
|
-
) -> DataFrame:
|
|
28
|
-
for k, v in dfs.items():
|
|
29
|
-
self.execution_engine.register(v, k) # type: ignore
|
|
30
|
-
con = ibis.pyspark.connect(self.execution_engine.spark_session) # type: ignore
|
|
31
|
-
expr = ibis_func(con)
|
|
32
|
-
schema = to_schema(expr.schema())
|
|
33
|
-
result = expr.compile()
|
|
34
|
-
assert_or_throw(
|
|
35
|
-
isinstance(result, PySparkDataFrame),
|
|
36
|
-
lambda: ValueError(f"result must be a PySpark DataFrame ({type(result)})"),
|
|
37
|
-
)
|
|
38
|
-
return SparkDataFrame(result, schema=schema)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
@parse_ibis_engine.candidate(
|
|
42
|
-
lambda obj, *args, **kwargs: isinstance(obj, SparkExecutionEngine)
|
|
43
|
-
)
|
|
44
|
-
def _spark_to_ibis_engine(obj: Any, engine: ExecutionEngine) -> IbisEngine:
|
|
45
|
-
return SparkIbisEngine(engine)
|
fugue_test/ibis_suite.py
DELETED
|
@@ -1,92 +0,0 @@
|
|
|
1
|
-
# pylint: disable-all
|
|
2
|
-
from unittest import TestCase
|
|
3
|
-
|
|
4
|
-
import ibis
|
|
5
|
-
from fugue import ExecutionEngine, FugueWorkflow, register_default_sql_engine
|
|
6
|
-
from fugue_ibis import IbisEngine, as_fugue, as_ibis, run_ibis
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class IbisTests(object):
|
|
10
|
-
"""Ibis test suite.
|
|
11
|
-
Any new engine from :class:`~fugue_ibis.execution.ibis_engine.IbisEngine`
|
|
12
|
-
should also pass this test suite.
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
class Tests(TestCase):
|
|
16
|
-
@classmethod
|
|
17
|
-
def setUpClass(cls):
|
|
18
|
-
register_default_sql_engine(lambda engine: engine.sql_engine)
|
|
19
|
-
cls._engine = cls.make_engine(cls)
|
|
20
|
-
cls._ibis_engine = cls.make_ibis_engine(cls)
|
|
21
|
-
|
|
22
|
-
@property
|
|
23
|
-
def engine(self) -> ExecutionEngine:
|
|
24
|
-
return self._engine # type: ignore
|
|
25
|
-
|
|
26
|
-
@property
|
|
27
|
-
def ibis_engine(self) -> ExecutionEngine:
|
|
28
|
-
return self._ibis_engine # type: ignore
|
|
29
|
-
|
|
30
|
-
@classmethod
|
|
31
|
-
def tearDownClass(cls):
|
|
32
|
-
cls._engine.stop()
|
|
33
|
-
|
|
34
|
-
def make_engine(self) -> ExecutionEngine: # pragma: no cover
|
|
35
|
-
raise NotImplementedError
|
|
36
|
-
|
|
37
|
-
def make_ibis_engine(self) -> IbisEngine: # pragma: no cover
|
|
38
|
-
raise NotImplementedError
|
|
39
|
-
|
|
40
|
-
def test_run_ibis(self):
|
|
41
|
-
def _test1(con: ibis.BaseBackend) -> ibis.Expr:
|
|
42
|
-
tb = con.table("a")
|
|
43
|
-
return tb
|
|
44
|
-
|
|
45
|
-
def _test2(con: ibis.BaseBackend) -> ibis.Expr:
|
|
46
|
-
tb = con.table("a")
|
|
47
|
-
return tb.mutate(c=tb.a + tb.b)
|
|
48
|
-
|
|
49
|
-
dag = FugueWorkflow()
|
|
50
|
-
df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
|
|
51
|
-
res = run_ibis(_test1, ibis_engine=self.ibis_engine, a=df)
|
|
52
|
-
res.assert_eq(df)
|
|
53
|
-
df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
|
|
54
|
-
res = run_ibis(_test2, ibis_engine=self.ibis_engine, a=df)
|
|
55
|
-
df2 = dag.df([[0, 1, 1], [2, 3, 5]], "a:long,b:long,c:long")
|
|
56
|
-
res.assert_eq(df2)
|
|
57
|
-
dag.run(self.engine)
|
|
58
|
-
|
|
59
|
-
def test_run_as_ibis(self):
|
|
60
|
-
dag = FugueWorkflow()
|
|
61
|
-
df = dag.df([[0, 1], [2, 3]], "a:long,b:long")
|
|
62
|
-
idf = as_ibis(df)
|
|
63
|
-
res = as_fugue(idf)
|
|
64
|
-
res.assert_eq(df)
|
|
65
|
-
dag.run(self.engine)
|
|
66
|
-
|
|
67
|
-
dag = FugueWorkflow()
|
|
68
|
-
df1 = dag.df([[0, 1], [2, 3]], "a:long,b:long")
|
|
69
|
-
df2 = dag.df([[0, ["x"]], [3, ["y"]]], "a:long,c:[str]")
|
|
70
|
-
idf1 = as_ibis(df1)
|
|
71
|
-
idf2 = as_ibis(df2)
|
|
72
|
-
idf = idf1.inner_join(idf2, idf1.a == idf2.a)[idf1, idf2.c]
|
|
73
|
-
res = as_fugue(idf)
|
|
74
|
-
expected = dag.df([[0, 1, ["x"]]], "a:long,b:long,c:[str]")
|
|
75
|
-
res.assert_eq(expected, check_order=True, check_schema=True)
|
|
76
|
-
dag.run(self.engine)
|
|
77
|
-
|
|
78
|
-
dag = FugueWorkflow()
|
|
79
|
-
idf1 = dag.df([[0, 1], [2, 3]], "a:long,b:long").as_ibis()
|
|
80
|
-
idf2 = dag.df([[0, ["x"]], [3, ["y"]]], "a:long,c:[str]").as_ibis()
|
|
81
|
-
res = idf1.inner_join(idf2, idf1.a == idf2.a)[idf1, idf2.c].as_fugue()
|
|
82
|
-
expected = dag.df([[0, 1, ["x"]]], "a:long,b:long,c:[str]")
|
|
83
|
-
res.assert_eq(expected, check_order=True, check_schema=True)
|
|
84
|
-
dag.run(self.engine)
|
|
85
|
-
|
|
86
|
-
def test_literal(self):
|
|
87
|
-
dag = FugueWorkflow()
|
|
88
|
-
idf1 = dag.df([[0, 1], [2, 3]], "a:long,b:long").as_ibis()
|
|
89
|
-
res = idf1.mutate(c=idf1.b + 10).as_fugue()
|
|
90
|
-
expected = dag.df([[0, 1, 11], [2, 3, 13]], "a:long,b:long,c:long")
|
|
91
|
-
res.assert_eq(expected, check_order=True, check_schema=True)
|
|
92
|
-
dag.run(self.engine)
|
fugue_test/plugins/__init__.py
DELETED
|
File without changes
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
@pytest.fixture(scope="session")
|
|
5
|
-
def fugue_dask_client():
|
|
6
|
-
from dask.distributed import Client
|
|
7
|
-
import dask
|
|
8
|
-
|
|
9
|
-
with Client(processes=True, n_workers=3, threads_per_worker=1) as client:
|
|
10
|
-
dask.config.set({"dataframe.shuffle.method": "tasks"})
|
|
11
|
-
dask.config.set({"dataframe.convert-string": False})
|
|
12
|
-
yield client
|