fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
tests/fugue/utils/test_io.py
CHANGED

@@ -223,83 +223,3 @@ def test_json(tmpdir):
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
 
 
-def test_avro_io(tmpdir):
-    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
-    path1 = os.path.join(tmpdir, "df1.avro")
-    path2 = os.path.join(tmpdir, "df2.avro")
-    save_df(df1, path1)
-    actual = load_df(path1)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path1, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(
-        path1, columns="a:str,b:int,c:long", infer_schema=True
-    )  # TODO raise error when both provided?
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # save in append mode
-    path3 = os.path.join(tmpdir, "append.avro")
-    save_df(df1, path3)
-    save_df(df2, path3, append=True)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3], ["hello", 2, 3]], "a:str,b:int,c:long")
-
-    # save times_as_micros =False (i.e milliseconds instead)
-    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    path4 = os.path.join(tmpdir, "df4.avro")
-    save_df(df4, path4)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    save_df(df4, path4, times_as_micros=False)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-
-    # provide avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "string"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    save_df(df1, path1, schema=schema)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong types in columns arg
-    save_df(df2, path2, schema=schema)
-    raises(
-        FugueDataFrameOperationError,
-        lambda: load_df(df2, path2, columns="a:int,b:int,c:long"),
-    )
-
-    # load with process_record function
-    actual = load_df(
-        path2,
-        columns="a:str,b:int,c:long",
-        process_record=lambda s: {"a": str.upper(s["a"]), "b": s["b"], "c": s["c"]},
-    )
-    df_eq(actual, [["HELLO", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong type in avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "int"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    raises(TypeError, lambda: save_df(df2, path2, schema=schema))
tests/fugue_dask/test_execution_engine.py
CHANGED

@@ -121,9 +121,11 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
     @classmethod
     def setUpClass(cls):
         cls._engine = cls.make_engine(cls)
+        fa.set_global_engine(cls._engine)
 
     @classmethod
     def tearDownClass(cls):
+        fa.clear_global_engine()
        cls._engine.dask_client.close()
 
     def make_engine(self):
@@ -153,6 +155,49 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
         df.output(m_o)
         dag.run(self.engine)
 
+    def test_bool_bytes_union(self):
+        # this is to verify a bug in enforce type is fixed
+        def tr(df: pd.DataFrame) -> pd.DataFrame:
+            return df.assign(data=b"asdf")
+
+        df = pd.DataFrame(dict(a=[True, False], b=[1, 2]))
+
+        r1 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r2 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r3 = fa.union(r1, r2, distinct=False)
+        r3.show()
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+
 
 def test_transform():
     class CB:
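For reference, a minimal sketch of the global-engine pattern that setUpClass/tearDownClass now use. fa.set_global_engine and fa.clear_global_engine are the calls added above; the "pandas" engine name and the toy transformer are illustrative assumptions:

    import pandas as pd
    import fugue.api as fa

    def add_one(df: pd.DataFrame) -> pd.DataFrame:
        # plain pandas function; Fugue infers the interface from the type annotations
        return df.assign(b=df["a"] + 1)

    fa.set_global_engine("pandas")  # engine name is an assumption; an engine object also works
    try:
        # with a global engine set, fa.* calls no longer need an explicit engine argument
        res = fa.transform(pd.DataFrame({"a": [1, 2]}), add_one, schema="*,b:long")
        print(fa.as_array(res))  # [[1, 2], [2, 3]]
    finally:
        fa.clear_global_engine()  # restore default engine resolution, as tearDownClass does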
tests/fugue_dask/test_io.py
CHANGED

@@ -117,58 +117,3 @@ def test_json(tmpdir):
     actual = load_df(path, columns="b:str,a:int")
     df_eq(actual, [["2", 1]], "b:str,a:int")
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
-
-
-@mark.skip(reason="Unable to test due to spark jars not being downloaded properly")
-def test_avro_io(tmpdir):
-    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    save_df(df1, path)
-    actual = load_df(path)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # provide schema and columns -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns="a:str,b:int,c:long",
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-        ),
-    )
-
-    # provide schema and infer_schema is True -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns=None,
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-            infer_schema=True,
-        ),
-    )
tests/fugue_duckdb/test_dataframe.py
CHANGED

@@ -17,7 +17,7 @@ class DuckDataFrameTests(DataFrameTests.Tests):
 
     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con))
 
     def test_as_array_special_values(self):
         for func in [
@@ -74,7 +74,7 @@ class NativeDuckDataFrameTests(DataFrameTests.NativeTests):
 
     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con)).native
 
     def to_native_df(self, pdf: pd.DataFrame) -> Any:
         return duckdb.from_df(pdf)
tests/fugue_duckdb/test_utils.py
CHANGED

@@ -42,7 +42,7 @@ def test_type_conversion():
     con = duckdb.connect()
 
     def assert_(tp):
-        dt = duckdb.
+        dt = duckdb.from_arrow(pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), con).types[0]
         assert to_pa_type(dt) == tp
         dt = to_duck_type(tp)
         assert to_pa_type(dt) == tp
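For context, both rewritten call sites use duckdb.from_arrow, which wraps an Arrow object as a DuckDB relation bound to a connection; a minimal standalone sketch (the sample table is illustrative):

    import duckdb
    import pyarrow as pa

    con = duckdb.connect()
    tbl = pa.Table.from_pydict({"a": [1, 2, 3]})
    rel = duckdb.from_arrow(tbl, con)  # relation backed by the Arrow table
    print(rel.types)       # per-column DuckDB types, as used in assert_ above
    print(rel.fetchall())  # [(1,), (2,), (3,)]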
tests/fugue_polars/test_api.py
ADDED

@@ -0,0 +1,13 @@
+import fugue.api as fa
+import pandas as pd
+import polars as pl
+
+
+def test_to_df():
+    df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+    res = fa.fugue_sql("SELECT * FROM df", df=df, engine="duckdb")
+    assert fa.as_array(res) == [[0], [1]]
+
+    df2 = pl.from_pandas(pd.DataFrame({"a": [0]}))
+    res = fa.inner_join(df, df2, engine="duckdb")
+    assert fa.as_array(res) == [[0]]
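The new test feeds polars frames straight into the functional API on the DuckDB engine; a minimal sketch of the same SQL path with a non-trivial query (the SELECT is illustrative):

    import pandas as pd
    import polars as pl
    import fugue.api as fa

    df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
    # the keyword name ("df") is the table name the SQL refers to
    res = fa.fugue_sql("SELECT a + 1 AS a FROM df", df=df, engine="duckdb")
    print(fa.as_array(res))  # [[1], [2]]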
tests/fugue_polars/test_transform.py
CHANGED

@@ -5,18 +5,19 @@ import polars as pl
 import ray
 from dask.distributed import Client
 from pyspark.sql import SparkSession
+import pandas as pd
 
 import fugue.api as fa
 
 
 def test_transform_common():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
             yield tdf
 
     for tr in [tr1, tr2]:
@@ -41,10 +42,15 @@ def test_transform_common():
         assert fdf.schema == "a:int,b:int"
         assert fdf.as_array() == []
 
+        df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+        fdf = fa.transform(df, tr, schema="a:int,b:int", as_fugue=True)
+        assert fdf.schema == "a:int,b:int"
+        assert fdf.as_array() == [[0, 1], [1, 1]]
+
 
 def test_transform_empty_result():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf.head(0)
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
@@ -63,12 +69,12 @@ def test_transform_empty_result():
 
 def test_polars_on_engines():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
         return tdf
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
             yield tdf
 
     def test(engine):
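All the rewritten transformer bodies use the current polars column-adding API; a standalone sketch:

    import polars as pl

    df = pl.DataFrame({"a": [0, 1]})
    # with_columns appends (or overwrites) columns; pl.lit(1, pl.Int32()) is a typed constant
    tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
    print(tdf.columns)  # ['a', 'b']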
tests/fugue_ray/test_execution_engine.py
CHANGED

@@ -1,10 +1,10 @@
 import os
+from typing import Any, List
 
 import duckdb
 import pandas as pd
 import ray
 import ray.data as rd
-from pytest import raises
 from triad import FileSystem
 
 import fugue.api as fa
@@ -237,3 +237,34 @@ class RayBuiltInTests(BuiltInTests.Tests):
         #     ),
         #     check_like=True,
         # )
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
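The same test_coarse_partition body is added to the Dask suite above and the Spark suite below. Judging by its assertions, algo="coarse" hashes the "by" keys into at most "num" physical partitions, so each partition holds many keys, while "presort" keeps rows ordered within a partition. A sketch of the same option through the functional API (the engine is left to the default; the toy data and count_keys helper are illustrative assumptions):

    import pandas as pd
    import fugue.api as fa

    def count_keys(df: pd.DataFrame) -> pd.DataFrame:
        # one output row per physical partition: how many distinct "a" keys it received
        return pd.DataFrame({"ct": [df["a"].nunique()]})

    df = pd.DataFrame({"a": list(range(10)) * 3, "b": range(30)})
    res = fa.transform(
        df,
        count_keys,
        schema="ct:long",
        partition=dict(algo="coarse", by="a", presort="b", num=3),
    )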
tests/fugue_spark/test_dataframe.py
CHANGED

@@ -31,10 +31,6 @@ class SparkDataFrameTests(DataFrameTests.Tests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass
 
-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-
 
 class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
     @pytest.fixture(autouse=True)
@@ -55,10 +51,6 @@ class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass
 
-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-
 
 def test_init(spark_session):
     sdf = spark_session.createDataFrame([["a", 1]])
tests/fugue_spark/test_execution_engine.py
CHANGED

@@ -26,6 +26,8 @@ from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.extensions.transformer import Transformer, transformer
 from fugue.plugins import infer_execution_engine
 from fugue.workflow.workflow import FugueWorkflow
+from fugue_spark._utils.convert import to_pandas
+from fugue_spark._utils.misc import is_spark_dataframe, is_spark_session
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
 from fugue_test.builtin_suite import BuiltInTests
@@ -86,6 +88,11 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
         res = a.as_array(type_safe=True)
         assert res[0][0] == {"a": "b"}
 
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        pdf = pdf[pdf.a < 1]
+        a = e.to_df(pdf)
+        assert fa.get_schema(a) == "a:long,b:long"
+
     def test_persist(self):
         e = self.engine
 
@@ -115,13 +122,12 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
 
     def test_infer_engine(self):
         df = self.spark_session.createDataFrame(pd.DataFrame([[0]], columns=["a"]))
-        assert
+        assert is_spark_session(infer_execution_engine([df]))
 
         fdf = SparkDataFrame(df)
-        assert
+        assert is_spark_session(infer_execution_engine([fdf]))
 
 
-@pytest.mark.skipif(pyspark.__version__ < "3", reason="pyspark < 3")
 class SparkExecutionEnginePandasUDFTests(ExecutionEngineTests.Tests):
     @pytest.fixture(autouse=True)
     def init_session(self, spark_session):
@@ -192,6 +198,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             session,
             {
                 "test": True,
+                "fugue.spark.use_pandas_udf": False,
                 "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
                 "fugue.rpc.flask_server.host": "127.0.0.1",
                 "fugue.rpc.flask_server.port": "1234",
@@ -258,10 +265,41 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
         dag.output(c, using=assert_match, params=dict(values=[100]))
         dag.run(self.engine)
 
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+
     def test_session_as_engine(self):
         dag = FugueWorkflow()
         a = dag.df([[p, 0] for p in range(100)], "a:int,b:int")
-        a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
+        # a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
         dag.run(self.spark_session)
 
     def test_interfaceless(self):
@@ -274,8 +312,8 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df.sort_values("b").head(1)
 
         result = transform(sdf, f1, partition=dict(by=["a"]), engine=self.engine)
-        assert
-        assert result
+        assert is_spark_dataframe(result)
+        assert to_pandas(result).sort_values(["a"]).values.tolist() == [[0, 0], [1, 1]]
 
     def test_annotation_1(self):
         def m_c(engine: SparkExecutionEngine) -> ps.DataFrame:
@@ -285,7 +323,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df
 
         def m_o(engine: SparkExecutionEngine, df: ps.DataFrame) -> None:
-            assert 1 == df
+            assert 1 == to_pandas(df).shape[0]
 
         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)
@@ -298,12 +336,12 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return session.createDataFrame([[0]], "a:long")
 
         def m_p(session: SparkSession, df: ps.DataFrame) -> ps.DataFrame:
-            assert
+            assert is_spark_session(session)
             return df
 
         def m_o(session: SparkSession, df: ps.DataFrame) -> None:
-            assert
-            assert 1 == df
+            assert is_spark_session(session)
+            assert 1 == to_pandas(df).shape[0]
 
         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)
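For reference, a sketch of the interfaceless transform pattern the rewritten assertions verify: a Spark DataFrame goes in, a native Spark DataFrame comes out, and the pandas function runs once per "a" partition. The explicit schema="*" and the toy data (mirroring the expected [[0, 0], [1, 1]]) are assumptions:

    import pandas as pd
    from pyspark.sql import SparkSession

    from fugue import transform

    spark = SparkSession.builder.getOrCreate()

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        # keep the row with the smallest "b" within each "a" partition
        return df.sort_values("b").head(1)

    sdf = spark.createDataFrame(pd.DataFrame({"a": [0, 0, 1, 1], "b": [1, 0, 2, 1]}))
    result = transform(sdf, f1, schema="*", partition=dict(by=["a"]), engine=spark)
    print(result.toPandas().sort_values("a").values.tolist())  # [[0, 0], [1, 1]]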
tests/fugue_spark/test_importless.py
CHANGED

@@ -1,8 +1,8 @@
-from fugue import FugueWorkflow, transform
-from fugue import fsql
-from pyspark.sql import SparkSession, DataFrame
 import pandas as pd
+from pyspark.sql import DataFrame, SparkSession
 
+from fugue import FugueWorkflow, fsql, transform
+from fugue_spark._utils.convert import to_pandas
 from fugue_spark.registry import _is_sparksql
 
 
@@ -41,4 +41,4 @@ def test_transform_from_sparksql(spark_session):
 
     res = transform(("sparksql", "SELECT 1 AS a, 'b' AS aa"), t)
     assert isinstance(res, DataFrame)  # engine inference
-    assert res
+    assert to_pandas(res).to_dict("records") == [{"a": 1, "aa": "b"}]
tests/fugue_spark/test_spark_connect.py
ADDED

@@ -0,0 +1,82 @@
+import pytest
+from pyspark.sql import SparkSession
+
+import fugue.api as fa
+from fugue_spark.execution_engine import SparkExecutionEngine
+
+from .test_dataframe import NativeSparkDataFrameTests as _NativeDataFrameTests
+from .test_dataframe import SparkDataFrameTests as _DataFrameTests
+from .test_execution_engine import (
+    SparkExecutionEnginePandasUDFBuiltInTests as _WorkflowTests,
+)
+from .test_execution_engine import SparkExecutionEnginePandasUDFTests as _EngineTests
+
+
+class SparkConnectDataFrameTests(_DataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectNativeDataFrameTests(_NativeDataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectExecutionEngineTests(_EngineTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session, {"test": True, "fugue.spark.use_pandas_udf": False}
+        )
+        return e
+
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism() == 200
+
+    def test_using_pandas_udf(self):
+        return
+
+    def test_map_with_dict_col(self):
+        return  # spark connect has a bug
+
+
+class SparkConnectBuiltInTests(_WorkflowTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session,
+            {
+                "test": True,
+                "fugue.spark.use_pandas_udf": True,
+                "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
+                "fugue.rpc.flask_server.host": "127.0.0.1",
+                "fugue.rpc.flask_server.port": "1234",
+                "fugue.rpc.flask_server.timeout": "2 sec",
+                "spark.sql.shuffle.partitions": "10",
+            },
+        )
+        assert e.conf.get_or_throw("fugue.spark.use_pandas_udf", bool)
+        return e
+
+    def test_annotation_3(self):
+        return  # RDD is not implemented in spark connect
+
+    def test_repartition(self):
+        return  # spark connect doesn't support even repartitioning
+
+    def test_repartition_large(self):
+        return  # spark connect doesn't support even repartitioning
+
+
+def _connect():
+    return SparkSession.builder.remote("sc://localhost").getOrCreate()
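Everything in the new suite goes through _connect() above; a minimal sketch of pointing Fugue at Spark Connect. This assumes pyspark>=3.4 and a Spark Connect server already listening on sc://localhost (for example one started with Spark's start-connect-server.sh script):

    from pyspark.sql import SparkSession

    import fugue.api as fa
    from fugue_spark.execution_engine import SparkExecutionEngine

    session = SparkSession.builder.remote("sc://localhost").getOrCreate()
    # pandas UDFs disabled here, matching one of the engine configs used in the tests
    engine = SparkExecutionEngine(session, {"fugue.spark.use_pandas_udf": False})
    res = fa.fugue_sql("SELECT 1 AS a", engine=engine)
    print(fa.as_array(res))  # [[1]]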
tests/fugue_spark/utils/test_convert.py
CHANGED

@@ -1,4 +1,3 @@
-import pyspark
 from fugue_spark._utils.convert import (
     to_cast_expression,
     to_schema,
@@ -53,13 +52,12 @@ def test_schema_conversion(spark_session):
     assert to_schema(df) == "name:[{nest_name:str,nest_value:int}]"
     assert to_spark_schema("name:[{nest_name:str,nest_value:int}]") == schema
 
-
-
-
-
-
-
-    assert to_spark_schema("a:<str,int>") == schema
+    schema = StructType(
+        [StructField("a", MapType(StringType(), IntegerType(), True), True)],
+    )
+    df = spark_session.createDataFrame([[{"x": 1}], [{"y": 2}]], schema)
+    assert to_schema(df) == "a:<str,int>"
+    assert to_spark_schema("a:<str,int>") == schema
 
 
 def test_to_cast_expression():
tests/fugue_spark/utils/test_io.py
CHANGED

@@ -105,23 +105,6 @@ def test_json_io(tmpdir, spark_session):
     raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
 
 
-def test_avro_io(tmpdir, spark_session):
-    if spark_session.version < "3.0.0":
-        return
-    fs = FileSystem()
-    si = SparkIO(spark_session, fs)
-    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    si.save_df(df1, path)
-    actual = si.load_df(path)
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-    actual = si.load_df(path, columns=["b", "a"])
-    df_eq(actual, [[2, "1"]], "b:int,a:str")
-    actual = si.load_df(path, columns="b:str,a:int")
-    df_eq(actual, [["2", 1]], "b:str,a:int")
-    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
-
-
 def test_save_with_partition(tmpdir, spark_session):
     si = SparkIO(spark_session, FileSystem())
     df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
fugue_test/_utils.py
DELETED

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE
File without changes

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL
File without changes

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt
File without changes