fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_test/execution_suite.py
CHANGED
@@ -26,7 +26,6 @@ from fugue import (
 from fugue.column import all_cols, col, lit
 from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.execution.native_execution_engine import NativeExecutionEngine
-from fugue_test._utils import skip_spark2


 class ExecutionEngineTests(object):
@@ -72,20 +71,20 @@ class ExecutionEngineTests(object):
             )
             # all engines should accept these types of inputs
             # should take fugue.DataFrame
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, o), throw=True)
             # should take array, shema
             df_eq(
                 o,
-
+                fa.as_fugue_engine_df(e, [[1.1, 2.2], [3.3, 4.4]], "a:double,b:double"),
                 throw=True,
             )
             # should take pandas dataframe
             pdf = pd.DataFrame([[1.1, 2.2], [3.3, 4.4]], columns=["a", "b"])
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)

             # should convert string to datetime in to_df
             df_eq(
-
+                fa.as_fugue_engine_df(e, [["2020-01-01"]], "a:datetime"),
                 [[datetime(2020, 1, 1)]],
                 "a:datetime",
                 throw=True,
@@ -95,7 +94,7 @@
             o = ArrayDataFrame([], "a:double,b:str")
             pdf = pd.DataFrame([[0.1, "a"]], columns=["a", "b"])
             pdf = pdf[pdf.a < 0]
-            df_eq(o,
+            df_eq(o, fa.as_fugue_engine_df(e, pdf), throw=True)

         def test_filter(self):
             a = ArrayDataFrame(
@@ -230,7 +229,7 @@
             o = ArrayDataFrame(
                 [[1, 2], [None, 2], [None, 1], [3, 4], [None, 4]], "a:double,b:int"
             )
-            a =
+            a = fa.as_fugue_engine_df(e, o)
             # no partition
             c = e.map_engine.map_dataframe(a, noop, a.schema, PartitionSpec())
             df_eq(c, o, throw=True)
@@ -353,9 +352,9 @@

         def test_join_multiple(self):
             e = self.engine
-            a =
-            b =
-            c =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[1, 20], [3, 40]], "a:int,c:int")
+            c = fa.as_fugue_engine_df(e, [[1, 200], [3, 400]], "a:int,d:int")
             d = fa.inner_join(a, b, c)
             df_eq(
                 d,
@@ -366,8 +365,8 @@

         def test__join_cross(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6], [7]], "c:int")
             c = fa.join(a, b, how="Cross")
             df_eq(
                 c,
@@ -376,56 +375,56 @@
                 throw=True,
             )

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int")
             c = fa.cross_join(a, b)
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int")
             c = fa.join(a, b, how="Cross")
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

         def test__join_inner(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="INNER", on=["a"])
             df_eq(c, [[1, 2, 6]], "a:int,b:int,c:int", throw=True)
             c = fa.inner_join(b, a)
             df_eq(c, [[6, 1, 2]], "c:int,a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="INNER", on=["a"])
             df_eq(c, [], "a:int,b:int,c:int", throw=True)

         def test__join_outer(self):
             e = self.engine

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
             c = fa.left_outer_join(a, b)
             df_eq(c, [], "a:int,b:int,c:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.right_outer_join(a, b)
             df_eq(c, [], "a:int,b:str,c:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [], "c:str,a:int")
             c = fa.full_outer_join(a, b)
             df_eq(c, [], "a:int,b:str,c:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(c, [[1, "2", "6"], [3, "4", None]], "a:int,b:str,c:str", throw=True)
             c = fa.join(b, a, how="left_outer", on=["a"])
             df_eq(c, [["6", 1, "2"], ["2", 7, None]], "c:str,a:int,b:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:double,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(
                 c, [[1, "2", 6.0], [3, "4", None]], "a:int,b:str,c:double", throw=True
@@ -436,8 +435,8 @@
                 c, [[6.0, 1, "2"], [2.0, 7, None]], "c:double,a:int,b:str", throw=True
             )

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [["6", 1], ["2", 7]], "c:str,a:int")
             c = fa.join(a, b, how="right_outer", on=["a"])
             # assert c.as_pandas().values.tolist()[1][1] is None
             df_eq(c, [[1, "2", "6"], [7, None, "2"]], "a:int,b:str,c:str", throw=True)
@@ -453,8 +452,8 @@
         def test__join_outer_pandas_incompatible(self):
             e = self.engine

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(
                 c,
@@ -465,8 +464,8 @@
             c = fa.join(b, a, how="left_outer", on=["a"])
             df_eq(c, [[6, 1, "2"], [2, 7, None]], "c:int,a:int,b:str", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, "2"], [3, "4"]], "a:int,b:str")
+            b = fa.as_fugue_engine_df(e, [[True, 1], [False, 7]], "c:bool,a:int")
             c = fa.join(a, b, how="left_OUTER", on=["a"])
             df_eq(c, [[1, "2", True], [3, "4", None]], "a:int,b:str,c:bool", throw=True)
             c = fa.join(b, a, how="left_outer", on=["a"])
@@ -476,52 +475,60 @@

         def test__join_semi(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [[1, 2]], "a:int,b:int", throw=True)
             c = fa.semi_join(b, a)
             df_eq(c, [[6, 1]], "c:int,a:int", throw=True)

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="semi", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

         def test__join_anti(self):
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [[3, 4]], "a:int,b:int", throw=True)
             c = fa.anti_join(b, a)
             df_eq(c, [[2, 7]], "c:int,a:int", throw=True)

-            b =
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [[1, 2], [3, 4]], "a:int,b:int", throw=True)

-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [], "c:int,a:int")
             c = fa.join(a, b, how="anti", on=["a"])
             df_eq(c, [], "a:int,b:int", throw=True)

         def test__join_with_null_keys(self):
             # SQL will not match null values
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 63]], "a:double,b:double,d:int"
+            )
             c = fa.join(a, b, how="INNER")
             df_eq(c, [[1, 2, 3, 33]], "a:double,b:double,c:int,d:int", throw=True)

         def test_union(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
+            )
             c = fa.union(a, b)
             df_eq(
                 c,
@@ -555,8 +562,12 @@

         def test_subtract(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            )
+            b = fa.as_fugue_engine_df(
+                e, [[1, 2, 33], [4, None, 6]], "a:double,b:double,c:int"
+            )
             c = fa.subtract(a, b)
             df_eq(
                 c,
@@ -564,8 +575,8 @@
                 "a:double,b:double,c:int",
                 throw=True,
             )
-            x =
-            y =
+            x = fa.as_fugue_engine_df(e, [[1, 2, 33]], "a:double,b:double,c:int")
+            y = fa.as_fugue_engine_df(e, [[4, None, 6]], "a:double,b:double,c:int")
             z = fa.subtract(a, x, y)
             df_eq(
                 z,
@@ -584,10 +595,11 @@

         def test_intersect(self):
             e = self.engine
-            a =
-                [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
+            a = fa.as_fugue_engine_df(
+                e, [[1, 2, 3], [4, None, 6], [4, None, 6]], "a:double,b:double,c:int"
             )
-            b =
+            b = fa.as_fugue_engine_df(
+                e,
                 [[1, 2, 33], [4, None, 6], [4, None, 6], [4, None, 6]],
                 "a:double,b:double,c:int",
             )
@@ -598,11 +610,13 @@
                 "a:double,b:double,c:int",
                 throw=True,
             )
-            x =
+            x = fa.as_fugue_engine_df(
+                e,
                 [[1, 2, 33]],
                 "a:double,b:double,c:int",
             )
-            y =
+            y = fa.as_fugue_engine_df(
+                e,
                 [[4, None, 6], [4, None, 6], [4, None, 6]],
                 "a:double,b:double,c:int",
             )
@@ -624,8 +638,8 @@

         def test_distinct(self):
             e = self.engine
-            a =
-                [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
+            a = fa.as_fugue_engine_df(
+                e, [[4, None, 6], [1, 2, 3], [4, None, 6]], "a:double,b:double,c:int"
             )
             c = fa.distinct(a)
             df_eq(
@@ -637,8 +651,10 @@

         def test_dropna(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e,
+                [[4, None, 6], [1, 2, 3], [4, None, None]],
+                "a:double,b:double,c:double",
             )
             c = fa.dropna(a)  # default
             d = fa.dropna(a, how="all")
@@ -672,8 +688,10 @@

         def test_fillna(self):
             e = self.engine
-            a =
-
+            a = fa.as_fugue_engine_df(
+                e,
+                [[4, None, 6], [1, 2, 3], [4, None, None]],
+                "a:double,b:double,c:double",
             )
             c = fa.fillna(a, value=1)
             d = fa.fillna(a, {"b": 99, "c": -99})
@@ -703,8 +721,8 @@
             # raises(ValueError, lambda: fa.fillna(a, ["b"]))

         def test_sample(self):
-
-            a =
+            e = self.engine
+            a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")

             with raises(ValueError):
                 fa.sample(a)  # must set one
@@ -725,7 +743,8 @@
             e = self.engine
             ps = dict(by=["a"], presort="b DESC,c DESC")
             ps2 = dict(by=["c"], presort="b ASC")
-            a =
+            a = fa.as_fugue_engine_df(
+                e,
                 [
                     ["a", 2, 3],
                     ["a", 3, 4],
@@ -784,8 +803,8 @@
             raises(ValueError, lambda: fa.take(a, n=0.5, presort=None))

         def test_sample_n(self):
-
-            a =
+            e = self.engine
+            a = fa.as_fugue_engine_df(e, [[x] for x in range(100)], "a:int")

             b = fa.sample(a, n=90, replace=False)
             c = fa.sample(a, n=90, replace=True)
@@ -799,7 +818,7 @@

         def test__serialize_by_partition(self):
             e = self.engine
-            a =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
             s = e._serialize_by_partition(
                 a, PartitionSpec(by=["a"], presort="b"), df_name="_0"
             )
@@ -814,8 +833,8 @@
         def test_zip(self):
             ps = PartitionSpec(by=["a"], presort="b DESC,c DESC")
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             sa = e._serialize_by_partition(a, ps, df_name="_0")
             sb = e._serialize_by_partition(b, ps, df_name="_1")
             # test zip with serialized dfs
@@ -874,7 +893,7 @@

         def test_zip_all(self):
             e = self.engine
-            a =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
             z = fa.persist(e.zip_all(DataFrames(a)))
             assert 1 == z.count()
             assert z.metadata.get("serialized", False)
@@ -890,8 +909,8 @@
             assert z.metadata.get("serialized", False)
             assert z.metadata.get("serialized_has_name", False)

-            b =
-            c =
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
+            c = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "d:int,a:int")
             z = fa.persist(e.zip_all(DataFrames(a, b, c)))
             assert 1 == z.count()
             assert not z.metadata.get("serialized_has_name", False)
@@ -918,8 +937,8 @@
         def test_comap(self):
             ps = PartitionSpec(presort="b,c")
             e = self.engine
-            a =
-            b =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
             z1 = fa.persist(e.zip(a, b))
             z2 = fa.persist(e.zip(a, b, partition_spec=ps, how="left_outer"))
             z3 = fa.persist(
@@ -966,9 +985,9 @@

         def test_comap_with_key(self):
             e = self.engine
-            a =
-            b =
-            c =
+            a = fa.as_fugue_engine_df(e, [[1, 2], [3, 4], [1, 5]], "a:int,b:int")
+            b = fa.as_fugue_engine_df(e, [[6, 1], [2, 7]], "c:int,a:int")
+            c = fa.as_fugue_engine_df(e, [[6, 1]], "c:int,a:int")
             z1 = fa.persist(e.zip(a, b, df1_name="x", df2_name="y"))
             z2 = fa.persist(e.zip_all(DataFrames(x=a, y=b, z=b)))
             z3 = fa.persist(
@@ -1068,47 +1087,6 @@
             )
             df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:int", throw=True)

-        @skip_spark2
-        def test_save_single_and_load_avro(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            e = self.engine
-            b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            e.fs.makedirs(path, recreate=True)
-            # over write folder with single file
-            fa.save(b, path, format_hint="avro", force_single=True)
-            assert e.fs.isfile(path)
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
-
-            # overwirte single with folder (if applicable)
-            b = ArrayDataFrame([[60, 1], [20, 7]], "c:long,a:long")
-            fa.save(b, path, format_hint="avro", mode="overwrite")
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 60], [7, 20]], "a:long,c:long", throw=True)
-
-        @skip_spark2
-        def test_save_and_load_avro(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            b = ArrayDataFrame([[6, 1], [2, 7]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            fa.save(b, path, format_hint="avro")
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2]], "a:long,c:long", throw=True)
-
-        @skip_spark2
-        def test_load_avro_folder(self):
-            # TODO: switch to c:int,a:long when we can preserve schema to avro
-            native = NativeExecutionEngine()
-            a = ArrayDataFrame([[6, 1]], "c:long,a:long")
-            b = ArrayDataFrame([[2, 7], [4, 8]], "c:long,a:long")
-            path = os.path.join(self.tmpdir, "a", "b")
-            fa.save(a, os.path.join(path, "a.avro"), engine=native)
-            fa.save(b, os.path.join(path, "b.avro"), engine=native)
-            FileSystem().touch(os.path.join(path, "_SUCCESS"))
-            c = fa.load(path, format_hint="avro", columns=["a", "c"], as_fugue=True)
-            df_eq(c, [[1, 6], [7, 2], [8, 4]], "a:long,c:long", throw=True)
-
         def test_save_single_and_load_csv(self):
             e = self.engine
             b = ArrayDataFrame([[6.1, 1.1], [2.1, 7.1]], "c:double,a:double")
@@ -1297,7 +1275,7 @@
             b = ArrayDataFrame([[6, 1], [3, 4], [2, 7], [4, 8], [6, 7]], "c:int,a:long")
             path = os.path.join(self.tmpdir, "a", "b")
             fa.save(
-                e.repartition(
+                e.repartition(fa.as_fugue_engine_df(e, b), PartitionSpec(num=2)),
                 path,
                 format_hint="json",
             )
fugue_version/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.8.2.dev4"
+__version__ = "0.8.4"
tests/fugue/collections/test_partition.py
CHANGED
@@ -1,7 +1,7 @@
 import json

 from fugue.collections.partition import parse_presort_exp, PartitionSpec
-from fugue.constants import
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from pytest import raises
 from triad.collections.schema import Schema
 from triad.utils.hash import to_uuid
@@ -148,6 +148,9 @@ def test_partition_spec():
     assert dict(a=True, d=True, e=False) == p.get_sorts(
         Schema("a:int,b:int,d:int,e:int")
     )
+    assert dict(d=True, e=False) == p.get_sorts(
+        Schema("a:int,b:int,d:int,e:int"), with_partition_keys=False
+    )
     p = PartitionSpec(dict(partition_by=["e", "a"], presort="d asc"))
     assert p.get_key_schema(Schema("a:int,b:int,d:int,e:int")) == "e:int,a:int"

@@ -228,9 +231,9 @@ def test_get_num_partitions():
     assert 6 == p.get_num_partitions(x=lambda: 1, Y=lambda: 2)
     raises(Exception, lambda: p.get_num_partitions(x=lambda: 1))

-    p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,
+    p = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CONCURRENCY)"))
     assert 90 == p.get_num_partitions(
-        **{KEYWORD_ROWCOUNT: lambda: 100,
+        **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_PARALLELISM: lambda: 90}
     )


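
The new assertions exercise two `PartitionSpec` behaviors: `get_sorts(..., with_partition_keys=False)` drops the partition keys from the returned sort map, and `get_num_partitions` can evaluate an expression over the `ROWCOUNT` and `CONCURRENCY` keywords. A rough sketch of both; the spec `p` below is hypothetical (the one in the test is built earlier in that function), while the second half mirrors the updated test directly:

```python
from fugue.collections.partition import PartitionSpec
from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
from triad.collections.schema import Schema

# hypothetical spec: partition key "a", presort "d asc,e desc"
p = PartitionSpec(by=["a"], presort="d asc,e desc")
schema = Schema("a:int,b:int,d:int,e:int")
p.get_sorts(schema)                             # {"a": True, "d": True, "e": False}
p.get_sorts(schema, with_partition_keys=False)  # {"d": True, "e": False}

# num can be an expression over runtime keywords, supplied lazily as callables
p2 = PartitionSpec(dict(partition_by=["b", "a"], num="min(ROWCOUNT,CONCURRENCY)"))
p2.get_num_partitions(
    **{KEYWORD_ROWCOUNT: lambda: 100, KEYWORD_PARALLELISM: lambda: 90}
)  # -> 90
```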
tests/fugue/dataframe/test_utils.py
CHANGED
@@ -8,8 +8,7 @@ from triad import FileSystem, Schema
 from triad.collections.schema import SchemaError
 from triad.exceptions import InvalidOperationError, NoneArgumentError

-from fugue import ArrayDataFrame,
-from fugue.dataframe import to_local_bounded_df, to_local_df
+from fugue import ArrayDataFrame, IterableDataFrame, PandasDataFrame
 from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.dataframe.utils import (
     _schema_eq,
@@ -24,46 +23,6 @@ from fugue.dataframe.utils import (
 )


-def test_to_local_df():
-    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
-    pdf = PandasDataFrame(df.as_pandas(), "a:int,b:int")
-    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
-    assert to_local_df(df) is df
-    assert to_local_df(pdf) is pdf
-    assert to_local_df(idf) is idf
-    assert isinstance(to_local_df(df.native, "a:int,b:int"), ArrayDataFrame)
-    assert isinstance(to_local_df(pdf.native, "a:int,b:int"), PandasDataFrame)
-    assert isinstance(to_local_df(idf.native, "a:int,b:int"), IterableDataFrame)
-    raises(ValueError, lambda: to_local_df(123))
-
-    raises(NoneArgumentError, lambda: to_local_df(None))
-    raises(ValueError, lambda: to_local_df(df, "a:int,b:int"))
-
-
-def test_to_local_bounded_df():
-    df = ArrayDataFrame([[0, 1]], "a:int,b:int")
-    idf = IterableDataFrame([[0, 1]], "a:int,b:int")
-    adf = ArrowDataFrame(df.as_array(), "a:int,b:int")
-    assert to_local_bounded_df(df) is df
-    r = to_local_bounded_df(idf)
-    assert r is not idf
-    assert r.as_array() == [[0, 1]]
-    assert r.schema == "a:int,b:int"
-    r = to_local_bounded_df(adf.native)
-    assert isinstance(r, ArrowDataFrame)
-    assert r.as_array() == [[0, 1]]
-    assert r.schema == "a:int,b:int"
-
-    raises(ValueError, lambda: to_local_bounded_df(123))
-
-    def rows():
-        yield [0]
-        yield [1]
-
-    with raises(ValueError):
-        to_local_bounded_df(rows(), schema="a:int")
-
-
 def test_schema_eq():
     assert not _schema_eq(Schema("a:int"), Schema("a:int8"))
     assert not _schema_eq(Schema("a:int"), Schema("b:int"))
@@ -85,7 +44,7 @@ def test_df_eq():
     df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str")
     df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str")
     assert df_eq(df1, df1)
-    assert df_eq(df1, df2, digits=
+    assert df_eq(df1, df2, digits=2)
     # precision
     assert not df_eq(df1, df2, digits=6)
     # no content
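
The remaining change adjusts the precision argument of the comparison helper: `df_eq` compares floating point values only up to the given number of digits, so the 0.001 difference passes at `digits=2` but fails at `digits=6`. For illustration, using the frames from the test:

```python
from fugue import ArrayDataFrame
from fugue.dataframe.utils import _df_eq as df_eq

df1 = ArrayDataFrame([[0, 100.0, "a"]], "a:int,b:double,c:str")
df2 = ArrayDataFrame([[0, 100.001, "a"]], "a:int,b:double,c:str")

assert df_eq(df1, df2, digits=2)      # equal when compared to 2 digits
assert not df_eq(df1, df2, digits=6)  # the 0.001 difference is detected
```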
tests/fugue/execution/test_naive_execution_engine.py
CHANGED
@@ -1,3 +1,5 @@
+from typing import Any, List
+
 import pandas as pd
 import pyarrow as pa

@@ -34,6 +36,37 @@ class NativeExecutionEngineBuiltInQPDTests(BuiltInTests.Tests):
     def test_yield_table(self):
         pass

+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=0, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+

 def test_get_file_threshold():
     assert -1 == _get_file_threshold(None)
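
The added test covers the "coarse" partitioning algorithm: rows sharing a partition key stay in the same physical partition, the number of physical partitions is controlled by `num`, and the presort orders rows within each partition. A hedged sketch of the same idea through `fugue.api.transform` instead of `FugueWorkflow`; the function, frame, and schema here are illustrative:

```python
import pandas as pd

import fugue.api as fa


def summarize(df: pd.DataFrame) -> pd.DataFrame:
    # called once per physical partition; with algo="coarse" each call sees
    # several complete "a" groups, ordered by the presort column "b"
    return pd.DataFrame({"keys": [df.a.nunique()], "rows": [len(df)]})


pdf = pd.DataFrame(dict(a=list(range(100)) * 10, b=range(1000)))
res = fa.transform(
    pdf,
    summarize,
    schema="keys:int,rows:int",
    partition=dict(algo="coarse", by="a", presort="b", num=6),
)
print(res)  # roughly one row per physical partition; the exact count is engine-dependent
```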