fugue 0.8.5.dev1__tar.gz → 0.8.6.dev2__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/PKG-INFO +9 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/README.md +8 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe.py +7 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py +1 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/function_wrapper.py +38 -7
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/PKG-INFO +9 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/SOURCES.txt +1 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/entry_points.txt +2 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/requires.txt +11 -11
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution_engine.py +12 -19
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/dataframe.py +45 -8
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/dataframe.py +17 -14
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/execution_engine.py +14 -4
- fugue-0.8.6.dev2/fugue_test/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/builtin_suite.py +55 -5
- fugue-0.8.6.dev2/fugue_version/__init__.py +1 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/setup.py +12 -12
- fugue-0.8.5.dev1/fugue_version/__init__.py +0 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/LICENSE +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/display.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/exception.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/interfaceless.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/misc.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/array_bag.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/bag.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/partition.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/sql.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/yielded.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/expressions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/functions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/sql.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/array_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/arrow_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframes.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/iterable_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/pandas_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/dataset.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dev.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/exceptions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/factory.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/native_execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/creators.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/outputters.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/processors.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/context.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/creator.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/outputter.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/processor.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/transformer.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/plugins.py +0 -0
- /fugue-0.8.5.dev1/fugue/sql/__init__.py → /fugue-0.8.6.dev2/fugue/py.typed +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/base.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/flask.py +0 -0
- {fugue-0.8.5.dev1/fugue_ibis/execution → fugue-0.8.6.dev2/fugue/sql}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/_visitors.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/workflow.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_checkpoint.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_tasks.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_workflow_context.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/input.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/module.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/workflow.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/dependency_links.txt +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/top_level.txt +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/contrib.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/seaborn/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/_ext.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dask.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_compat.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/dataframe.py +0 -0
- {fugue-0.8.5.dev1/fugue_notebook/nbextension → fugue-0.8.6.dev2/fugue_ibis/execution}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/pandas_backend.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/extensions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/env.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/README.md +0 -0
- {fugue-0.8.5.dev1/fugue_ray/_utils → fugue-0.8.6.dev2/fugue_notebook/nbextension}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/description.yaml +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/main.js +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/polars_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_constants.py +0 -0
- {fugue-0.8.5.dev1/fugue_spark → fugue-0.8.6.dev2/fugue_ray}/_utils/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/cluster.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_constants.py +0 -0
- {fugue-0.8.5.dev1/fugue_test → fugue-0.8.6.dev2/fugue_spark/_utils}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/misc.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/partition.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_sql/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_sql/exceptions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/bag_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/dataframe_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/execution_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/ibis_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/setup.cfg +0 -0
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.5.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team

@@ -152,9 +152,16 @@ Description: # Fugue
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/README.md

@@ -144,9 +144,16 @@ Fugue can be installed through pip or conda. For example:
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
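For context (not part of the diff): the snippet below sketches the kind of code that starts requiring the `sql` extra, assuming fugue 0.8.x's `fugue.api.fugue_sql`. Treat it as an illustrative sketch, not package documentation.

```python
# Illustrative sketch only: assumes `pip install "fugue[sql]"` so that the
# qpd / fugue-sql-antlr dependencies pulled in by the `sql` extra are present.
import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Fugue SQL runs against named dataframes; without the `sql` extra this
# call would fail on the missing SQL parser dependencies.
res = fa.fugue_sql("SELECT a, b FROM df WHERE a > 1", df=df)
print(res)
```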
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe.py

@@ -113,6 +113,13 @@ class DataFrame(Dataset):
     def as_pandas(self) -> pd.DataFrame:
         """Convert to pandas DataFrame"""
         pdf = pd.DataFrame(self.as_array(), columns=self.columns)
+        if len(pdf) == 0:  # TODO: move to triad
+            return pd.DataFrame(
+                {
+                    k: pd.Series(dtype=v.type.to_pandas_dtype())
+                    for k, v in self.schema.items()
+                }
+            )
         return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
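The added branch changes what an empty DataFrame converts to: instead of building pandas frames from an empty array (which loses column types), it constructs typed empty columns from the Fugue schema. A hedged sketch of the effect, assuming fugue 0.8.6:

```python
# Sketch of the behavior the new branch provides (assumes fugue 0.8.6).
import fugue.api as fa

empty = fa.as_fugue_df([], schema="a:int,b:str")
pdf = empty.as_pandas()
# With the fix, the dtypes follow the schema instead of every column
# collapsing to object on the empty-array path.
print(pdf.dtypes)
```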
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py

@@ -165,7 +165,7 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
 
     def as_pandas(self) -> pd.DataFrame:
         if self.empty:
-            return
+            return PandasDataFrame(schema=self.schema).as_pandas()
 
         return pd.concat(df.as_pandas() for df in self.native)
 
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/function_wrapper.py

@@ -23,9 +23,10 @@ from triad.collections.function_wrapper import (
 from triad.utils.iter import EmptyAwareIterable, make_empty_aware
 
 from ..constants import FUGUE_ENTRYPOINT
+from ..dataset.api import count as df_count
 from .array_dataframe import ArrayDataFrame
 from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalDataFrame
+from .dataframe import AnyDataFrame, DataFrame, LocalDataFrame, as_fugue_df
 from .dataframe_iterable_dataframe import (
     IterableArrowDataFrame,
     IterablePandasDataFrame,

@@ -172,6 +173,19 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())
 
 
+@fugue_annotated_param(AnyDataFrame)
+class _AnyDataFrameParam(DataFrameParam):
+    def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
+        return (
+            as_fugue_df(output)
+            if schema is None
+            else as_fugue_df(output, schema=schema)
+        )
+
+    def count(self, df: Any) -> int:
+        return df_count(df)
+
+
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:

@@ -333,6 +347,9 @@ class _PandasParam(LocalDataFrameParam):
 
     @no_type_check
     def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
+        _schema: Optional[Schema] = None if schema is None else Schema(schema)
+        if _schema is not None and _schema.names != list(output.columns):
+            output = output[_schema.names]
         return PandasDataFrame(output, schema)
 
     @no_type_check

@@ -361,8 +378,15 @@ class _IterablePandasParam(LocalDataFrameParam):
         self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
     ) -> DataFrame:
         def dfs():
+            _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-
+                if _schema is not None and _schema.names != list(df.columns):
+                    df = df[_schema.names]
+                yield PandasDataFrame(df, _schema)
+                has_return = True
+            if not has_return and _schema is not None:
+                yield PandasDataFrame(schema=_schema)
 
         return IterablePandasDataFrame(dfs())

@@ -381,7 +405,12 @@ class _PyArrowTableParam(LocalDataFrameParam):
 
     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
         assert isinstance(output, pa.Table)
-
+        adf: DataFrame = ArrowDataFrame(output)
+        if schema is not None:
+            _schema = Schema(schema)
+            if adf.schema != _schema:
+                adf = adf[_schema.names].alter_columns(_schema)
+        return adf
 
     def count(self, df: Any) -> int:  # pragma: no cover
         return df.count()

@@ -409,13 +438,15 @@ class _IterableArrowParam(LocalDataFrameParam):
     ) -> DataFrame:
         def dfs():
             _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                adf = ArrowDataFrame(df)
-                if _schema is not None and
-                    adf.schema == schema
-                ):
+                adf: DataFrame = ArrowDataFrame(df)
+                if _schema is not None and adf.schema != _schema:
                     adf = adf[_schema.names].alter_columns(_schema)
                 yield adf
+                has_return = True
+            if not has_return and _schema is not None:
+                yield ArrowDataFrame(schema=_schema)
 
         return IterableArrowDataFrame(dfs())
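The new `_AnyDataFrameParam` registration means extension functions can annotate inputs and outputs as `AnyDataFrame` and let Fugue convert whatever dataframe object flows through. A minimal sketch, mirroring the `mock_creator2`/`mock_processor4` test helpers added later in this diff (assumes fugue 0.8.6):

```python
# Minimal sketch of an AnyDataFrame-annotated extension (assumes fugue 0.8.6).
import fugue.api as fa
from fugue import AnyDataFrame, ArrayDataFrame, FugueWorkflow

def merge_counts(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
    # inputs arrive as dataframe objects that fa.count understands
    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")

with FugueWorkflow() as dag:
    a = dag.df([[0]], "a:int")
    b = dag.df([[1], [2]], "a:int")
    dag.process(a, b, using=merge_counts).show()
    dag.run()  # defaults to the native execution engine
```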
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.5.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team

@@ -152,9 +152,16 @@ Description: # Fugue
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/entry_points.txt

@@ -2,10 +2,10 @@
 dask = fugue_dask.registry [dask]
 dask_ibis = fugue_dask.ibis_engine [dask,ibis]
 duckdb = fugue_duckdb.registry [duckdb]
-duckdb_ibis = fugue_duckdb.ibis_engine [duckdb
+duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
 ibis = fugue_ibis [ibis]
 polars = fugue_polars.registry [polars]
 ray = fugue_ray.registry [ray]
 spark = fugue_spark.registry [spark]
-spark_ibis = fugue_spark.ibis_engine [spark
+spark_ibis = fugue_spark.ibis_engine [ibis,spark]
 
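Aside (not from the diff): these are setuptools entry points with extras, and the fix makes `duckdb_ibis`/`spark_ibis` declare both of their required extras. A hypothetical way to inspect them at runtime; the group name (line 1 of the file) is elided from this hunk, so `fugue.plugins` below is an assumption:

```python
# Hypothetical inspection sketch; assumes Python 3.10+ importlib.metadata
# and that these entries live under the "fugue.plugins" group (the group
# header is not shown in the hunk above).
from importlib.metadata import entry_points

for ep in entry_points().select(group="fugue.plugins"):
    print(ep.name, "->", ep.value, ep.extras)
```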
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/requires.txt

@@ -1,8 +1,8 @@
-triad>=0.9.
+triad>=0.9.1
 adagio>=0.2.4
 pyarrow>=0.15.1
 pandas>=1.2.0
-qpd>=0.4.
+qpd>=0.4.4
 fugue-sql-antlr>=0.1.6
 sqlglot
 jinja2

@@ -12,8 +12,8 @@ sqlglot
 jinja2
 fugue-sql-antlr[cpp]>=0.1.6
 pyspark>=3.1.1
-ray[data]>=2.
-qpd[dask]>=0.4.
+ray[data]>=2.1.0
+qpd[dask]>=0.4.4
 notebook
 jupyterlab
 ipython>=7.10.0

@@ -26,20 +26,20 @@ dask[dataframe,distributed]
 ibis-framework>=2.1.1
 
 [all:python_version >= "3.8"]
-dask[dataframe,distributed]
-ibis-framework
+dask[dataframe,distributed]<2023.7.1,>=2022.9.0
+ibis-framework<6,>=3.2.0
 
 [cpp_sql_parser]
 fugue-sql-antlr[cpp]>=0.1.6
 
 [dask]
-qpd[dask]>=0.4.
+qpd[dask]>=0.4.4
 
 [dask:python_version < "3.8"]
 dask[dataframe,distributed]
 
 [dask:python_version >= "3.8"]
-dask[dataframe,distributed]
+dask[dataframe,distributed]<2023.7.1,>=2022.9.0
 
 [duckdb]
 duckdb>=0.5.0

@@ -52,7 +52,7 @@ numpy
 ibis-framework>=2.1.1
 
 [ibis:python_version >= "3.8"]
-ibis-framework
+ibis-framework<6,>=3.2.0
 
 [notebook]
 notebook

@@ -63,7 +63,7 @@ ipython>=7.10.0
 polars
 
 [ray]
-ray[data]>=2.
+ray[data]>=2.1.0
 duckdb>=0.5.0
 pyarrow>=6.0.1
 

@@ -71,7 +71,7 @@ pyarrow>=6.0.1
 pyspark>=3.1.1
 
 [sql]
-qpd>=0.4.
+qpd>=0.4.4
 fugue-sql-antlr>=0.1.6
 sqlglot
 jinja2
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution_engine.py

@@ -92,20 +92,19 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
+        if ibis.__version__ < "6":
+            suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
+        else:  # pragma: no cover
+            # breaking change in ibis 6.0
+            suffixes = dict(lname="", rname=_JOIN_RIGHT_SUFFIX)
         if how.lower() == "cross":
-            tb = _df1.native.cross_join(_df2.native, suffixes
+            tb = _df1.native.cross_join(_df2.native, **suffixes)
         elif how.lower() == "right_outer":
-            tb = _df2.native.left_join(
-                _df1.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df2.native.left_join(_df1.native, on_fields, **suffixes)
         elif how.lower() == "left_outer":
-            tb = _df1.native.left_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.left_join(_df2.native, on_fields, **suffixes)
         elif how.lower() == "full_outer":
-            tb = _df1.native.outer_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.outer_join(_df2.native, on_fields, **suffixes)
         cols: List[Any] = []
         for k in end_schema.names:
             if k not in key_schema:

@@ -116,17 +115,11 @@ class IbisSQLEngine(SQLEngine):
                 )
             tb = tb[cols]
         elif how.lower() in ["semi", "left_semi"]:
-            tb = _df1.native.semi_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
-            tb = _df1.native.anti_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.anti_join(_df2.native, on_fields, **suffixes)
         else:
-            tb = _df1.native.inner_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.inner_join(_df2.native, on_fields, **suffixes)
         return self.to_df(tb[end_schema.names], schema=end_schema)
 
     def union(self, df1: DataFrame, df2: DataFrame, distinct: bool = True) -> DataFrame:
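The version gate reflects a breaking change in ibis 6.0, where join methods replaced the `suffixes=` tuple with `lname=`/`rname=` name templates; building the kwargs dict once keeps each join call on one line. A hedged sketch of the two call styles (simplified; `"_right"` stands in for fugue's `_JOIN_RIGHT_SUFFIX`):

```python
# Sketch of the ibis API difference handled above; ibis.memtable needs a
# reasonably recent ibis, and the suffix string here is illustrative.
import ibis

t1 = ibis.memtable({"a": [1, 2], "b": ["x", "y"]})
t2 = ibis.memtable({"a": [1, 2], "b": ["u", "v"]})

if ibis.__version__ < "6":
    # pre-6.0: overlapping columns are disambiguated via a suffixes tuple
    joined = t1.inner_join(t2, t1.a == t2.a, suffixes=("", "_right"))
else:
    # 6.0+: lname/rname templates replace the suffixes tuple
    joined = t1.inner_join(t2, t1.a == t2.a, lname="", rname="{name}_right")
```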
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/dataframe.py

@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
 import pyarrow as pa
+import ray
 import ray.data as rd
 from triad import Schema
 

@@ -13,15 +14,51 @@ from .._constants import _ZERO_COPY
 _RAY_NULL_REPR = "__RAY_NULL__"
 
 
-def
-
+def is_materialized(df: rd.Dataset) -> bool:
+    if hasattr(rd.dataset, "MaterializedDataset"):
+        return isinstance(df, rd.dataset.MaterializedDataset)
+    return df.is_fully_executed()  # pragma: no cover
+
+
+def materialize(df: rd.Dataset) -> rd.Dataset:
+    if not is_materialized(df):
+        if hasattr(df, "materialize"):
+            df = df.materialize()
+        else:  # pragma: no cover
+            df = df.fully_executed()
+    return df
+
+
+def get_dataset_format(df: rd.Dataset) -> Tuple[Optional[str], rd.Dataset]:
+    df = materialize(df)
     if df.count() == 0:
-        return None
-    if
-
-
-
-
+        return None, df
+    if ray.__version__ < "2.5.0":  # pragma: no cover
+        if hasattr(df, "_dataset_format"):  # pragma: no cover
+            return df._dataset_format(), df  # ray<2.2
+        ctx = rd.context.DatasetContext.get_current()
+        ctx.use_streaming_executor = False
+        return df.dataset_format(), df  # ray>=2.2
+    else:
+        schema = df.schema(fetch_if_missing=True)
+        if schema is None:  # pragma: no cover
+            return None, df
+        if isinstance(schema.base_schema, pa.Schema):
+            return "arrow", df
+        return "pandas", df
+
+
+def to_schema(schema: Any) -> Schema:  # pragma: no cover
+    if isinstance(schema, pa.Schema):
+        return Schema(schema)
+    if ray.__version__ >= "2.5.0":
+        if isinstance(schema, rd.Schema):
+            if hasattr(schema, "base_schema") and isinstance(
+                schema.base_schema, pa.Schema
+            ):
+                return Schema(schema.base_schema)
+            return Schema(list(zip(schema.names, schema.types)))
+    raise ValueError(f"{schema} is not supported")
 
 
 def build_empty(schema: Schema) -> rd.Dataset:
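These helpers paper over Ray API churn: `fully_executed()`/`is_fully_executed()` were replaced by `materialize()` and `MaterializedDataset` in newer Ray releases, and the dataset-format probe differs before and after Ray 2.5 (per the version checks above), so the code feature-detects instead of pinning. A hedged usage sketch; `fugue_ray._utils` is a private module, so the import path is illustrative:

```python
# Illustrative only: the underscored module path may change between releases.
import ray.data as rd
from fugue_ray._utils.dataframe import get_dataset_format, materialize

ds = rd.from_items([{"a": 1}, {"a": 2}])
ds = materialize(ds)              # no-op when already materialized
fmt, ds = get_dataset_format(ds)  # also hands back the materialized dataset
print(fmt)                        # "arrow" or "pandas" (None when empty)
```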
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/dataframe.py

@@ -18,7 +18,7 @@ from fugue.plugins import (
 )
 
 from ._constants import _ZERO_COPY
-from ._utils.dataframe import build_empty, get_dataset_format
+from ._utils.dataframe import build_empty, get_dataset_format, materialize, to_schema
 
 
 class RayDataFrame(DataFrame):

@@ -52,7 +52,7 @@ class RayDataFrame(DataFrame):
             self._native = build_empty(schema)
             return
         if isinstance(df, rd.Dataset):
-            fmt = get_dataset_format(df)
+            fmt, df = get_dataset_format(df)
             if fmt is None:  # empty:
                 schema = _input_schema(schema).assert_not_empty()
                 super().__init__(schema)

@@ -62,7 +62,7 @@ class RayDataFrame(DataFrame):
                 rdf = rd.from_arrow_refs(df.to_arrow_refs())
             elif fmt == "arrow":
                 rdf = df
-            else:
+            else:  # pragma: no cover
                 raise NotImplementedError(
                     f"Ray Dataset in {fmt} format is not supported"
                 )

@@ -156,8 +156,7 @@ class RayDataFrame(DataFrame):
 
     def persist(self, **kwargs: Any) -> "RayDataFrame":
         # TODO: it mutates the dataframe, is this a good bahavior
-
-        self.native.fully_executed()
+        self._native = materialize(self._native)
         return self
 
     def count(self) -> int:

@@ -226,11 +225,12 @@ class RayDataFrame(DataFrame):
     ) -> Tuple[rd.Dataset, Schema]:
         if internal_schema:
             return rdf, schema
-
+        fmt, rdf = get_dataset_format(rdf)
+        if fmt is None:  # empty
             schema = _input_schema(schema).assert_not_empty()
             return build_empty(schema), schema
-        if schema is None or schema == rdf.schema(fetch_if_missing=True):
-            return rdf, rdf.schema(fetch_if_missing=True)
+        if schema is None or schema == to_schema(rdf.schema(fetch_if_missing=True)):
+            return rdf, to_schema(rdf.schema(fetch_if_missing=True))
 
         def _alter(table: pa.Table) -> pa.Table:  # pragma: no cover
             return ArrowDataFrame(table).alter_columns(schema).native  # type: ignore

@@ -263,12 +263,15 @@ def _rd_as_local(df: rd.Dataset) -> bool:
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
 def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]:
-
-
-
-
-
-
+    if hasattr(df, "columns"):  # higher version of ray
+        return df.columns(fetch_if_missing=True)
+    else:  # pragma: no cover
+        fmt, _ = get_dataset_format(df)
+        if fmt == "pandas":
+            return list(df.schema(True).names)
+        elif fmt == "arrow":
+            return df.schema(fetch_if_missing=True).names
+        raise NotImplementedError(f"{fmt} is not supported")  # pragma: no cover
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
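Downstream effect: `get_dataset_format` now hands back the materialized dataset, so `RayDataFrame` reuses it instead of re-executing, and `persist()` rebinds `_native` through `materialize`. A sketch under the assumption that `fugue[ray]` 0.8.6 is installed:

```python
# Sketch only; assumes ray and fugue[ray] 0.8.6.
import ray.data as rd
from fugue_ray import RayDataFrame

ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
fdf = RayDataFrame(ds)  # format detection materializes once and reuses it
print(fdf.schema)       # derived via to_schema from the Ray schema object
fdf = fdf.persist()     # backed by materialize() instead of fully_executed()
```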
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/execution_engine.py

@@ -134,9 +134,8 @@ class SparkMapEngine(MapEngine):
     ) -> DataFrame:
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
-            # pandas udf can only be used for pyspark > 3
             if len(partition_spec.partition_by) > 0:
-                if partition_spec.algo
+                if partition_spec.algo in ["coarse", "even"]:
                     return self._map_by_pandas_udf(
                         df,
                         map_func=map_func,

@@ -145,7 +144,18 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-
+                else:
+                    if (  # not simple partitioning
+                        partition_spec.algo != "hash"
+                        or partition_spec.num_partitions != "0"
+                    ):
+                        # TODO: not sure if presort should be done
+                        # on physical partition level
+                        df = self.to_df(
+                            self.execution_engine.repartition(
+                                df, PartitionSpec(partition_spec, presort=[])
+                            )
+                        )
                 return self._group_map_by_pandas_udf(
                     df,
                     map_func=map_func,

@@ -154,7 +164,7 @@ class SparkMapEngine(MapEngine):
                     on_init=on_init,
                     map_func_format_hint=map_func_format_hint,
                 )
-
+        else:
             return self._map_by_pandas_udf(
                 df,
                 map_func=map_func,
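In practice the branch choice is driven by the partition spec: `coarse`/`even` algos take the `mapInPandas`-style route (`_map_by_pandas_udf`), anything else goes through group mapping (`_group_map_by_pandas_udf`), with an explicit repartition first unless the spec is plain hash with the default partition count. A hedged sketch of specs that exercise each path:

```python
# Sketch of partition specs hitting each path (assumes fugue[spark]).
import pandas as pd
import fugue.api as fa

def add_ct(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(ct=df.shape[0])

df = pd.DataFrame({"a": [0, 0, 1], "b": [1, 2, 3]})

# "coarse" (or "even") algo -> the mapInPandas path
fa.transform(df, add_ct, schema="*,ct:int",
             partition={"by": ["a"], "algo": "coarse"}, engine="spark")

# explicit partition count -> repartition + group mapping path
fa.transform(df, add_ct, schema="*,ct:int",
             partition={"by": ["a"], "num": 16}, engine="spark")
```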
fugue-0.8.6.dev2/fugue_test/__init__.py (new empty file) — File without changes
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/builtin_suite.py

@@ -22,6 +22,7 @@ from triad import SerializableRLock
 
 import fugue.api as fa
 from fugue import (
+    AnyDataFrame,
     ArrayDataFrame,
     CoTransformer,
     DataFrame,

@@ -365,6 +366,12 @@ class BuiltInTests(object):
             dag.output(dict(df=a), using=mock_outputter2)
             a.partition(num=3).output(MockOutputter3)
             dag.output(dict(aa=a, bb=b), using=MockOutputter4)
+
+            a = dag.create(mock_creator2, params=dict(p=2))
+            b = dag.create(mock_creator2, params=dict(p=2))
+            c = dag.process(a, b, using=mock_processor4)
+            c.assert_eq(ArrayDataFrame([[2]], "a:int"))
+            dag.output(a, b, using=mock_outputter4)
             dag.run(self.engine)
 
     def test_zip(self):

@@ -435,9 +442,14 @@ class BuiltInTests(object):
         # this test is important for using mapInPandas in spark
 
         # schema: *,c:int
-        def mt_pandas(
+        def mt_pandas(
+            dfs: Iterable[pd.DataFrame], empty: bool = False
+        ) -> Iterator[pd.DataFrame]:
             for df in dfs:
-
+                if not empty:
+                    df = df.assign(c=2)
+                    df = df[reversed(list(df.columns))]
+                    yield df
 
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:int,b:int")

@@ -445,10 +457,25 @@ class BuiltInTests(object):
             dag.df([[1, 2, 2], [3, 4, 2]], "a:int,b:int,c:int").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            dag.run(self.engine)
+
         # schema: *
-        def mt_arrow(
+        def mt_arrow(
+            dfs: Iterable[pa.Table], empty: bool = False
+        ) -> Iterator[pa.Table]:
             for df in dfs:
-
+                if not empty:
+                    df = df.select(reversed(df.schema.names))
+                    yield df
 
         # schema: a:long
         def mt_arrow_2(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:

@@ -463,6 +490,17 @@ class BuiltInTests(object):
             dag.df([[1], [3]], "a:long").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            dag.run(self.engine)
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")

@@ -1829,6 +1867,10 @@ def mock_creator(p: int) -> DataFrame:
     return ArrayDataFrame([[p]], "a:int")
 
 
+def mock_creator2(p: int) -> AnyDataFrame:
+    return fa.as_fugue_df([[p]], schema="a:int")
+
+
 def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:
     return ArrayDataFrame([[len(df1) + len(df2)]], "a:int")
 

@@ -1844,6 +1886,10 @@ class MockProcessor3(Processor):
         return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
 
 
+def mock_processor4(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
+    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")
+
+
 def mock_outputter(df1: List[List[Any]], df2: List[List[Any]]) -> None:
     assert len(df1) == len(df2)
 

@@ -1857,6 +1903,10 @@ class MockOutputter3(Outputter):
         assert "3" == self.partition_spec.num_partitions
 
 
+def mock_outputter4(df1: AnyDataFrame, df2: AnyDataFrame) -> None:
+    assert fa.count(df1) == fa.count(df2)
+
+
 class MockOutputter4(Outputter):
     def process(self, dfs):
         for k, v in dfs.items():

@@ -1895,8 +1945,8 @@ def mock_tf0(df: pd.DataFrame, p=1, col="p") -> pd.DataFrame:
 
 # schema: *,ct:int,p:int
 def mock_tf1(df: pd.DataFrame, p=1) -> pd.DataFrame:
-    df["ct"] = df.shape[0]
     df["p"] = p
+    df["ct"] = df.shape[0]
    return df
 
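The new tests pin down a subtle contract: an `Iterable` → `Iterator` transformer may yield nothing, and the declared output schema is what lets the engine still produce a correctly typed empty result (via the `has_return` fallbacks added to `function_wrapper.py` above). A hedged standalone sketch of the same behavior, assuming fugue 0.8.6:

```python
# Sketch of the empty-iterable contract (assumes fugue 0.8.6).
from typing import Iterable, Iterator
import pandas as pd
import fugue.api as fa

# schema: *,c:int
def keep_large(dfs: Iterable[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    for df in dfs:
        sub = df[df["a"] > 100]  # may filter out every row
        if len(sub) > 0:
            yield sub.assign(c=2)

df = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
out = fa.transform(df, keep_large)  # the generator yields nothing here
print(out)  # empty, but with columns a, b, c typed per the schema hint
```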
fugue-0.8.6.dev2/fugue_version/__init__.py (new file)

@@ -0,0 +1 @@
+__version__ = "0.8.6"