fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff compares the contents of two package versions as they were publicly released to a supported registry. It is provided for informational purposes only.
Files changed (68)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +2 -91
  3. fugue/api.py +1 -0
  4. fugue/collections/partition.py +12 -6
  5. fugue/constants.py +1 -1
  6. fugue/dataframe/__init__.py +1 -7
  7. fugue/dataframe/arrow_dataframe.py +1 -1
  8. fugue/dataframe/function_wrapper.py +2 -3
  9. fugue/dataframe/utils.py +10 -84
  10. fugue/execution/api.py +34 -12
  11. fugue/execution/native_execution_engine.py +33 -19
  12. fugue/extensions/_builtins/creators.py +4 -2
  13. fugue/extensions/_builtins/outputters.py +3 -3
  14. fugue/extensions/_builtins/processors.py +2 -3
  15. fugue/plugins.py +1 -0
  16. fugue/workflow/_checkpoint.py +1 -1
  17. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
  18. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
  19. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
  20. fugue_contrib/viz/_ext.py +7 -1
  21. fugue_dask/_io.py +0 -13
  22. fugue_dask/_utils.py +10 -4
  23. fugue_dask/execution_engine.py +42 -16
  24. fugue_duckdb/_utils.py +7 -2
  25. fugue_duckdb/dask.py +1 -1
  26. fugue_duckdb/dataframe.py +17 -10
  27. fugue_duckdb/execution_engine.py +12 -22
  28. fugue_ibis/dataframe.py +2 -7
  29. fugue_notebook/env.py +5 -10
  30. fugue_polars/_utils.py +0 -40
  31. fugue_polars/polars_dataframe.py +22 -7
  32. fugue_ray/_constants.py +8 -1
  33. fugue_ray/_utils/dataframe.py +31 -4
  34. fugue_ray/_utils/io.py +2 -4
  35. fugue_ray/dataframe.py +13 -4
  36. fugue_ray/execution_engine.py +39 -21
  37. fugue_spark/_utils/convert.py +22 -11
  38. fugue_spark/_utils/io.py +0 -13
  39. fugue_spark/_utils/misc.py +27 -0
  40. fugue_spark/_utils/partition.py +11 -18
  41. fugue_spark/dataframe.py +24 -19
  42. fugue_spark/execution_engine.py +61 -35
  43. fugue_spark/registry.py +15 -3
  44. fugue_test/builtin_suite.py +7 -9
  45. fugue_test/dataframe_suite.py +7 -3
  46. fugue_test/execution_suite.py +100 -122
  47. fugue_version/__init__.py +1 -1
  48. tests/fugue/collections/test_partition.py +6 -3
  49. tests/fugue/dataframe/test_utils.py +2 -43
  50. tests/fugue/execution/test_naive_execution_engine.py +33 -0
  51. tests/fugue/utils/test_io.py +0 -80
  52. tests/fugue_dask/test_execution_engine.py +45 -0
  53. tests/fugue_dask/test_io.py +0 -55
  54. tests/fugue_duckdb/test_dataframe.py +2 -2
  55. tests/fugue_duckdb/test_utils.py +1 -1
  56. tests/fugue_polars/test_api.py +13 -0
  57. tests/fugue_polars/test_transform.py +11 -5
  58. tests/fugue_ray/test_execution_engine.py +32 -1
  59. tests/fugue_spark/test_dataframe.py +0 -8
  60. tests/fugue_spark/test_execution_engine.py +48 -10
  61. tests/fugue_spark/test_importless.py +4 -4
  62. tests/fugue_spark/test_spark_connect.py +82 -0
  63. tests/fugue_spark/utils/test_convert.py +6 -8
  64. tests/fugue_spark/utils/test_io.py +0 -17
  65. fugue_test/_utils.py +0 -13
  66. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
  67. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
  68. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0

tests/fugue/utils/test_io.py CHANGED
@@ -223,83 +223,3 @@ def test_json(tmpdir):
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))


-def test_avro_io(tmpdir):
-    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
-    path1 = os.path.join(tmpdir, "df1.avro")
-    path2 = os.path.join(tmpdir, "df2.avro")
-    save_df(df1, path1)
-    actual = load_df(path1)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path1, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(
-        path1, columns="a:str,b:int,c:long", infer_schema=True
-    ) # TODO raise error when both provided?
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # save in append mode
-    path3 = os.path.join(tmpdir, "append.avro")
-    save_df(df1, path3)
-    save_df(df2, path3, append=True)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3], ["hello", 2, 3]], "a:str,b:int,c:long")
-
-    # save times_as_micros =False (i.e milliseconds instead)
-    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    path4 = os.path.join(tmpdir, "df4.avro")
-    save_df(df4, path4)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    save_df(df4, path4, times_as_micros=False)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-
-    # provide avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "string"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    save_df(df1, path1, schema=schema)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong types in columns arg
-    save_df(df2, path2, schema=schema)
-    raises(
-        FugueDataFrameOperationError,
-        lambda: load_df(df2, path2, columns="a:int,b:int,c:long"),
-    )
-
-    # load with process_record function
-    actual = load_df(
-        path2,
-        columns="a:str,b:int,c:long",
-        process_record=lambda s: {"a": str.upper(s["a"]), "b": s["b"], "c": s["c"]},
-    )
-    df_eq(actual, [["HELLO", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong type in avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "int"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    raises(TypeError, lambda: save_df(df2, path2, schema=schema))

tests/fugue_dask/test_execution_engine.py CHANGED
@@ -121,9 +121,11 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
     @classmethod
     def setUpClass(cls):
         cls._engine = cls.make_engine(cls)
+        fa.set_global_engine(cls._engine)

     @classmethod
     def tearDownClass(cls):
+        fa.clear_global_engine()
         cls._engine.dask_client.close()

     def make_engine(self):
@@ -153,6 +155,49 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
         df.output(m_o)
         dag.run(self.engine)

+    def test_bool_bytes_union(self):
+        # this is to verify a bug in enforce type is fixed
+        def tr(df: pd.DataFrame) -> pd.DataFrame:
+            return df.assign(data=b"asdf")
+
+        df = pd.DataFrame(dict(a=[True, False], b=[1, 2]))
+
+        r1 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r2 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r3 = fa.union(r1, r2, distinct=False)
+        r3.show()
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+

 def test_transform():
     class CB:

tests/fugue_dask/test_io.py CHANGED
@@ -117,58 +117,3 @@ def test_json(tmpdir):
     actual = load_df(path, columns="b:str,a:int")
     df_eq(actual, [["2", 1]], "b:str,a:int")
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
-
-
-@mark.skip(reason="Unable to test due to spark jars not being downloaded properly")
-def test_avro_io(tmpdir):
-    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    save_df(df1, path)
-    actual = load_df(path)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # provide schema and columns -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns="a:str,b:int,c:long",
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-        ),
-    )
-
-    # provide schema and infer_schema is True -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns=None,
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-            infer_schema=True,
-        ),
-    )

tests/fugue_duckdb/test_dataframe.py CHANGED
@@ -17,7 +17,7 @@ class DuckDataFrameTests(DataFrameTests.Tests):

     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.arrow(df.native, self._con))
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con))

     def test_as_array_special_values(self):
         for func in [
@@ -74,7 +74,7 @@ class NativeDuckDataFrameTests(DataFrameTests.NativeTests):

     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.arrow(df.native, self._con)).native
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con)).native

     def to_native_df(self, pdf: pd.DataFrame) -> Any:
         return duckdb.from_df(pdf)

tests/fugue_duckdb/test_utils.py CHANGED
@@ -42,7 +42,7 @@ def test_type_conversion():
     con = duckdb.connect()

     def assert_(tp):
-        dt = duckdb.arrow(pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), con).types[0]
+        dt = duckdb.from_arrow(pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), con).types[0]
         assert to_pa_type(dt) == tp
         dt = to_duck_type(tp)
         assert to_pa_type(dt) == tp

tests/fugue_polars/test_api.py ADDED
@@ -0,0 +1,13 @@
+import fugue.api as fa
+import pandas as pd
+import polars as pl
+
+
+def test_to_df():
+    df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+    res = fa.fugue_sql("SELECT * FROM df", df=df, engine="duckdb")
+    assert fa.as_array(res) == [[0], [1]]
+
+    df2 = pl.from_pandas(pd.DataFrame({"a": [0]}))
+    res = fa.inner_join(df, df2, engine="duckdb")
+    assert fa.as_array(res) == [[0]]

tests/fugue_polars/test_transform.py CHANGED
@@ -5,18 +5,19 @@ import polars as pl
 import ray
 from dask.distributed import Client
 from pyspark.sql import SparkSession
+import pandas as pd

 import fugue.api as fa


 def test_transform_common():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.with_column(pl.lit(1, pl.Int32()).alias("b"))
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf

     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.with_column(pl.lit(1, pl.Int32()).alias("b"))
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
             yield tdf

     for tr in [tr1, tr2]:
@@ -41,10 +42,15 @@ def test_transform_common():
         assert fdf.schema == "a:int,b:int"
         assert fdf.as_array() == []

+        df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+        fdf = fa.transform(df, tr, schema="a:int,b:int", as_fugue=True)
+        assert fdf.schema == "a:int,b:int"
+        assert fdf.as_array() == [[0, 1], [1, 1]]
+

 def test_transform_empty_result():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.with_column(pl.lit(1, pl.Int32()).alias("b"))
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf.head(0)

     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
@@ -63,12 +69,12 @@ def test_transform_empty_result():

 def test_polars_on_engines():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.with_column(pl.lit(1, pl.Int32()).alias("c"))
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
         return tdf

     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.with_column(pl.lit(1, pl.Int32()).alias("c"))
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
             yield tdf

     def test(engine):

tests/fugue_ray/test_execution_engine.py CHANGED
@@ -1,10 +1,10 @@
 import os
+from typing import Any, List

 import duckdb
 import pandas as pd
 import ray
 import ray.data as rd
-from pytest import raises
 from triad import FileSystem

 import fugue.api as fa
@@ -237,3 +237,34 @@ class RayBuiltInTests(BuiltInTests.Tests):
         # ),
         # check_like=True,
         # )
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)

tests/fugue_spark/test_dataframe.py CHANGED
@@ -31,10 +31,6 @@ class SparkDataFrameTests(DataFrameTests.Tests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass

-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-

 class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
     @pytest.fixture(autouse=True)
@@ -55,10 +51,6 @@ class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass

-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-

 def test_init(spark_session):
     sdf = spark_session.createDataFrame([["a", 1]])

tests/fugue_spark/test_execution_engine.py CHANGED
@@ -26,6 +26,8 @@ from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.extensions.transformer import Transformer, transformer
 from fugue.plugins import infer_execution_engine
 from fugue.workflow.workflow import FugueWorkflow
+from fugue_spark._utils.convert import to_pandas
+from fugue_spark._utils.misc import is_spark_dataframe, is_spark_session
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
 from fugue_test.builtin_suite import BuiltInTests
@@ -86,6 +88,11 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
         res = a.as_array(type_safe=True)
         assert res[0][0] == {"a": "b"}

+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        pdf = pdf[pdf.a < 1]
+        a = e.to_df(pdf)
+        assert fa.get_schema(a) == "a:long,b:long"
+
     def test_persist(self):
         e = self.engine

@@ -115,13 +122,12 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):

     def test_infer_engine(self):
         df = self.spark_session.createDataFrame(pd.DataFrame([[0]], columns=["a"]))
-        assert isinstance(infer_execution_engine([df]), SparkSession)
+        assert is_spark_session(infer_execution_engine([df]))

         fdf = SparkDataFrame(df)
-        assert isinstance(infer_execution_engine([fdf]), SparkSession)
+        assert is_spark_session(infer_execution_engine([fdf]))


-@pytest.mark.skipif(pyspark.__version__ < "3", reason="pyspark < 3")
 class SparkExecutionEnginePandasUDFTests(ExecutionEngineTests.Tests):
     @pytest.fixture(autouse=True)
     def init_session(self, spark_session):
@@ -192,6 +198,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             session,
             {
                 "test": True,
+                "fugue.spark.use_pandas_udf": False,
                 "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
                 "fugue.rpc.flask_server.host": "127.0.0.1",
                 "fugue.rpc.flask_server.port": "1234",
@@ -258,10 +265,41 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             dag.output(c, using=assert_match, params=dict(values=[100]))
         dag.run(self.engine)

+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+
     def test_session_as_engine(self):
         dag = FugueWorkflow()
         a = dag.df([[p, 0] for p in range(100)], "a:int,b:int")
-        a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
+        # a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
         dag.run(self.spark_session)

     def test_interfaceless(self):
@@ -274,8 +312,8 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df.sort_values("b").head(1)

         result = transform(sdf, f1, partition=dict(by=["a"]), engine=self.engine)
-        assert isinstance(result, SDataFrame)
-        assert result.toPandas().sort_values(["a"]).values.tolist() == [[0, 0], [1, 1]]
+        assert is_spark_dataframe(result)
+        assert to_pandas(result).sort_values(["a"]).values.tolist() == [[0, 0], [1, 1]]

     def test_annotation_1(self):
         def m_c(engine: SparkExecutionEngine) -> ps.DataFrame:
@@ -285,7 +323,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df

         def m_o(engine: SparkExecutionEngine, df: ps.DataFrame) -> None:
-            assert 1 == df.toPandas().shape[0]
+            assert 1 == to_pandas(df).shape[0]

         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)
@@ -298,12 +336,12 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return session.createDataFrame([[0]], "a:long")

         def m_p(session: SparkSession, df: ps.DataFrame) -> ps.DataFrame:
-            assert isinstance(session, SparkSession)
+            assert is_spark_session(session)
             return df

         def m_o(session: SparkSession, df: ps.DataFrame) -> None:
-            assert isinstance(session, SparkSession)
-            assert 1 == df.toPandas().shape[0]
+            assert is_spark_session(session)
+            assert 1 == to_pandas(df).shape[0]

         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)

tests/fugue_spark/test_importless.py CHANGED
@@ -1,8 +1,8 @@
-from fugue import FugueWorkflow, transform
-from fugue import fsql
-from pyspark.sql import SparkSession, DataFrame
 import pandas as pd
+from pyspark.sql import DataFrame, SparkSession

+from fugue import FugueWorkflow, fsql, transform
+from fugue_spark._utils.convert import to_pandas
 from fugue_spark.registry import _is_sparksql


@@ -41,4 +41,4 @@ def test_transform_from_sparksql(spark_session):

     res = transform(("sparksql", "SELECT 1 AS a, 'b' AS aa"), t)
     assert isinstance(res, DataFrame) # engine inference
-    assert res.toPandas().to_dict("records") == [{"a": 1, "aa": "b"}]
+    assert to_pandas(res).to_dict("records") == [{"a": 1, "aa": "b"}]

tests/fugue_spark/test_spark_connect.py ADDED
@@ -0,0 +1,82 @@
+import pytest
+from pyspark.sql import SparkSession
+
+import fugue.api as fa
+from fugue_spark.execution_engine import SparkExecutionEngine
+
+from .test_dataframe import NativeSparkDataFrameTests as _NativeDataFrameTests
+from .test_dataframe import SparkDataFrameTests as _DataFrameTests
+from .test_execution_engine import (
+    SparkExecutionEnginePandasUDFBuiltInTests as _WorkflowTests,
+)
+from .test_execution_engine import SparkExecutionEnginePandasUDFTests as _EngineTests
+
+
+class SparkConnectDataFrameTests(_DataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectNativeDataFrameTests(_NativeDataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectExecutionEngineTests(_EngineTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session, {"test": True, "fugue.spark.use_pandas_udf": False}
+        )
+        return e
+
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism() == 200
+
+    def test_using_pandas_udf(self):
+        return
+
+    def test_map_with_dict_col(self):
+        return # spark connect has a bug
+
+
+class SparkConnectBuiltInTests(_WorkflowTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session,
+            {
+                "test": True,
+                "fugue.spark.use_pandas_udf": True,
+                "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
+                "fugue.rpc.flask_server.host": "127.0.0.1",
+                "fugue.rpc.flask_server.port": "1234",
+                "fugue.rpc.flask_server.timeout": "2 sec",
+                "spark.sql.shuffle.partitions": "10",
+            },
+        )
+        assert e.conf.get_or_throw("fugue.spark.use_pandas_udf", bool)
+        return e
+
+    def test_annotation_3(self):
+        return # RDD is not implemented in spark connect
+
+    def test_repartition(self):
+        return # spark connect doesn't support even repartitioning
+
+    def test_repartition_large(self):
+        return # spark connect doesn't support even repartitioning
+
+
+def _connect():
+    return SparkSession.builder.remote("sc://localhost").getOrCreate()

tests/fugue_spark/utils/test_convert.py CHANGED
@@ -1,4 +1,3 @@
-import pyspark
 from fugue_spark._utils.convert import (
     to_cast_expression,
     to_schema,
@@ -53,13 +52,12 @@ def test_schema_conversion(spark_session):
     assert to_schema(df) == "name:[{nest_name:str,nest_value:int}]"
     assert to_spark_schema("name:[{nest_name:str,nest_value:int}]") == schema

-    if pyspark.__version__ >= "3":
-        schema = StructType(
-            [StructField("a", MapType(StringType(), IntegerType(), True), True)],
-        )
-        df = spark_session.createDataFrame([[{"x": 1}], [{"y": 2}]], schema)
-        assert to_schema(df) == "a:<str,int>"
-        assert to_spark_schema("a:<str,int>") == schema
+    schema = StructType(
+        [StructField("a", MapType(StringType(), IntegerType(), True), True)],
+    )
+    df = spark_session.createDataFrame([[{"x": 1}], [{"y": 2}]], schema)
+    assert to_schema(df) == "a:<str,int>"
+    assert to_spark_schema("a:<str,int>") == schema


 def test_to_cast_expression():

tests/fugue_spark/utils/test_io.py CHANGED
@@ -105,23 +105,6 @@ def test_json_io(tmpdir, spark_session):
     raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))


-def test_avro_io(tmpdir, spark_session):
-    if spark_session.version < "3.0.0":
-        return
-    fs = FileSystem()
-    si = SparkIO(spark_session, fs)
-    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    si.save_df(df1, path)
-    actual = si.load_df(path)
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-    actual = si.load_df(path, columns=["b", "a"])
-    df_eq(actual, [[2, "1"]], "b:int,a:str")
-    actual = si.load_df(path, columns="b:str,a:int")
-    df_eq(actual, [["2", 1]], "b:str,a:int")
-    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
-
-
 def test_save_with_partition(tmpdir, spark_session):
     si = SparkIO(spark_session, FileSystem())
     df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")

fugue_test/_utils.py DELETED
@@ -1,13 +0,0 @@
-import pytest
-
-
-def _is_spark2():
-    try:
-        import pyspark
-
-        return pyspark.__version__ < "3.0.0"
-    except Exception: # pragma: no cover
-        return False
-
-
-skip_spark2 = pytest.mark.skipif(_is_spark2(), reason="Skip Spark<3")