fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
tests/fugue/utils/test_io.py
CHANGED

@@ -223,83 +223,3 @@ def test_json(tmpdir):
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
 
 
-def test_avro_io(tmpdir):
-    df1 = PandasDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    df2 = PandasDataFrame([["hello", 2, 3]], "a:str,b:int,c:long")
-    path1 = os.path.join(tmpdir, "df1.avro")
-    path2 = os.path.join(tmpdir, "df2.avro")
-    save_df(df1, path1)
-    actual = load_df(path1)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path1, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(
-        path1, columns="a:str,b:int,c:long", infer_schema=True
-    )  # TODO raise error when both provided?
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path1, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # save in append mode
-    path3 = os.path.join(tmpdir, "append.avro")
-    save_df(df1, path3)
-    save_df(df2, path3, append=True)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3], ["hello", 2, 3]], "a:str,b:int,c:long")
-
-    # save times_as_micros =False (i.e milliseconds instead)
-    df4 = PandasDataFrame([["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    path4 = os.path.join(tmpdir, "df4.avro")
-    save_df(df4, path4)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-    save_df(df4, path4, times_as_micros=False)
-    actual = load_df(path4, columns="a:datetime,b:int,c:long")
-    df_eq(actual, [["2021-05-04", 2, 3]], "a:datetime,b:int,c:long")
-
-    # provide avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "string"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    save_df(df1, path1, schema=schema)
-    actual = load_df(path1, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong types in columns arg
-    save_df(df2, path2, schema=schema)
-    raises(
-        FugueDataFrameOperationError,
-        lambda: load_df(df2, path2, columns="a:int,b:int,c:long"),
-    )
-
-    # load with process_record function
-    actual = load_df(
-        path2,
-        columns="a:str,b:int,c:long",
-        process_record=lambda s: {"a": str.upper(s["a"]), "b": s["b"], "c": s["c"]},
-    )
-    df_eq(actual, [["HELLO", 2, 3]], "a:str,b:int,c:long")
-
-    # provide wrong type in avro schema
-    schema = {
-        "type": "record",
-        "name": "Root",
-        "fields": [
-            {"name": "a", "type": "int"},
-            {"name": "b", "type": "int"},
-            {"name": "c", "type": "long"},
-        ],
-    }
-    raises(TypeError, lambda: save_df(df2, path2, schema=schema))
tests/fugue_dask/test_execution_engine.py
CHANGED

@@ -121,9 +121,11 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
     @classmethod
     def setUpClass(cls):
         cls._engine = cls.make_engine(cls)
+        fa.set_global_engine(cls._engine)
 
     @classmethod
     def tearDownClass(cls):
+        fa.clear_global_engine()
        cls._engine.dask_client.close()
 
     def make_engine(self):
@@ -153,6 +155,49 @@ class DaskExecutionEngineBuiltInTests(BuiltInTests.Tests):
         df.output(m_o)
         dag.run(self.engine)
 
+    def test_bool_bytes_union(self):
+        # this is to verify a bug in enforce type is fixed
+        def tr(df: pd.DataFrame) -> pd.DataFrame:
+            return df.assign(data=b"asdf")
+
+        df = pd.DataFrame(dict(a=[True, False], b=[1, 2]))
+
+        r1 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r2 = fa.transform(df, tr, schema="*,data:bytes", as_fugue=True)
+        r3 = fa.union(r1, r2, distinct=False)
+        r3.show()
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+
 
 def test_transform():
     class CB:
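For reference, a minimal sketch of the global-engine pattern that setUpClass/tearDownClass now use. fa.set_global_engine and fa.clear_global_engine are the calls added above; the "pandas" engine name and the toy transformer are illustrative assumptions:

    import pandas as pd
    import fugue.api as fa

    def add_one(df: pd.DataFrame) -> pd.DataFrame:
        # plain pandas function; Fugue infers the interface from the type annotations
        return df.assign(b=df["a"] + 1)

    fa.set_global_engine("pandas")  # engine name is an assumption; an engine object also works
    try:
        # with a global engine set, fa.* calls no longer need an explicit engine argument
        res = fa.transform(pd.DataFrame({"a": [1, 2]}), add_one, schema="*,b:long")
        print(fa.as_array(res))  # [[1, 2], [2, 3]]
    finally:
        fa.clear_global_engine()  # restore default engine resolution, as tearDownClass does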
tests/fugue_dask/test_io.py
CHANGED

@@ -117,58 +117,3 @@ def test_json(tmpdir):
     actual = load_df(path, columns="b:str,a:int")
     df_eq(actual, [["2", 1]], "b:str,a:int")
     raises(KeyError, lambda: load_df(path, columns="bb:str,a:int"))
-
-
-@mark.skip(reason="Unable to test due to spark jars not being downloaded properly")
-def test_avro_io(tmpdir):
-    df1 = DaskDataFrame([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    save_df(df1, path)
-    actual = load_df(path)
-
-    df_eq(actual, [["1", 2, 3]], "a:str,b:long,c:long")
-    actual = load_df(path, columns=["a", "b"])
-    df_eq(actual, [["1", 3]], "a:str,b:long")
-
-    actual = load_df(path, columns="a:str,b:int,c:long")
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-
-    actual = load_df(path, columns=["b", "c"], infer_schema=True)
-    df_eq(actual, [[2, 3]], "b:long,c:long")
-
-    # provide schema and columns -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns="a:str,b:int,c:long",
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-        ),
-    )
-
-    # provide schema and infer_schema is True -> throw error
-    raises(
-        Exception,
-        lambda: save_df(
-            path,
-            columns=None,
-            schema={
-                "type": "record",
-                "name": "Root",
-                "fields": [
-                    {"name": "station", "type": "string"},
-                    {"name": "time", "type": "long"},
-                    {"name": "temp", "type": "int"},
-                ],
-            },
-            infer_schema=True,
-        ),
-    )
tests/fugue_duckdb/test_dataframe.py
CHANGED

@@ -17,7 +17,7 @@ class DuckDataFrameTests(DataFrameTests.Tests):
 
     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con))
 
     def test_as_array_special_values(self):
         for func in [
@@ -74,7 +74,7 @@ class NativeDuckDataFrameTests(DataFrameTests.NativeTests):
 
     def df(self, data: Any = None, schema: Any = None) -> DuckDataFrame:
         df = ArrowDataFrame(data, schema)
-        return DuckDataFrame(duckdb.
+        return DuckDataFrame(duckdb.from_arrow(df.native, self._con)).native
 
     def to_native_df(self, pdf: pd.DataFrame) -> Any:
         return duckdb.from_df(pdf)
tests/fugue_duckdb/test_utils.py
CHANGED

@@ -42,7 +42,7 @@ def test_type_conversion():
     con = duckdb.connect()
 
     def assert_(tp):
-        dt = duckdb.
+        dt = duckdb.from_arrow(pa.Table.from_pydict(dict(a=pa.nulls(2, tp))), con).types[0]
         assert to_pa_type(dt) == tp
         dt = to_duck_type(tp)
         assert to_pa_type(dt) == tp
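For context, both rewritten call sites use duckdb.from_arrow, which wraps an Arrow object as a DuckDB relation bound to a connection; a minimal standalone sketch (the sample table is illustrative):

    import duckdb
    import pyarrow as pa

    con = duckdb.connect()
    tbl = pa.Table.from_pydict({"a": [1, 2, 3]})
    rel = duckdb.from_arrow(tbl, con)  # relation backed by the Arrow table
    print(rel.types)       # per-column DuckDB types, as used in assert_ above
    print(rel.fetchall())  # [(1,), (2,), (3,)]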
tests/fugue_polars/test_api.py
ADDED

@@ -0,0 +1,13 @@
+import fugue.api as fa
+import pandas as pd
+import polars as pl
+
+
+def test_to_df():
+    df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+    res = fa.fugue_sql("SELECT * FROM df", df=df, engine="duckdb")
+    assert fa.as_array(res) == [[0], [1]]
+
+    df2 = pl.from_pandas(pd.DataFrame({"a": [0]}))
+    res = fa.inner_join(df, df2, engine="duckdb")
+    assert fa.as_array(res) == [[0]]
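The new test feeds polars frames straight into the functional API on the DuckDB engine; a minimal sketch of the same SQL path with a non-trivial query (the SELECT is illustrative):

    import pandas as pd
    import polars as pl
    import fugue.api as fa

    df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
    # the keyword name ("df") is the table name the SQL refers to
    res = fa.fugue_sql("SELECT a + 1 AS a FROM df", df=df, engine="duckdb")
    print(fa.as_array(res))  # [[1], [2]]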
tests/fugue_polars/test_transform.py
CHANGED

@@ -5,18 +5,19 @@ import polars as pl
 import ray
 from dask.distributed import Client
 from pyspark.sql import SparkSession
+import pandas as pd
 
 import fugue.api as fa
 
 
 def test_transform_common():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
             yield tdf
 
     for tr in [tr1, tr2]:
@@ -41,10 +42,15 @@ def test_transform_common():
         assert fdf.schema == "a:int,b:int"
         assert fdf.as_array() == []
 
+        df = pl.from_pandas(pd.DataFrame({"a": [0, 1]}))
+        fdf = fa.transform(df, tr, schema="a:int,b:int", as_fugue=True)
+        assert fdf.schema == "a:int,b:int"
+        assert fdf.as_array() == [[0, 1], [1, 1]]
+
 
 def test_transform_empty_result():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
         return tdf.head(0)
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
@@ -63,12 +69,12 @@ def test_transform_empty_result():
 
 def test_polars_on_engines():
     def tr1(df: pl.DataFrame) -> pl.DataFrame:
-        tdf = df.
+        tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
         return tdf
 
     def tr2(dfs: Iterable[pl.DataFrame]) -> Iterator[pl.DataFrame]:
         for df in dfs:
-            tdf = df.
+            tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("c"))
             yield tdf
 
     def test(engine):
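All the rewritten transformer bodies use the current polars column-adding API; a standalone sketch:

    import polars as pl

    df = pl.DataFrame({"a": [0, 1]})
    # with_columns appends (or overwrites) columns; pl.lit(1, pl.Int32()) is a typed constant
    tdf = df.with_columns(pl.lit(1, pl.Int32()).alias("b"))
    print(tdf.columns)  # ['a', 'b']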
tests/fugue_ray/test_execution_engine.py
CHANGED

@@ -1,10 +1,10 @@
 import os
+from typing import Any, List
 
 import duckdb
 import pandas as pd
 import ray
 import ray.data as rd
-from pytest import raises
 from triad import FileSystem
 
 import fugue.api as fa
@@ -237,3 +237,34 @@ class RayBuiltInTests(BuiltInTests.Tests):
         #     ),
         #     check_like=True,
         # )
+
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
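The same test_coarse_partition body is added to the Dask suite above and the Spark suite below. Judging by its assertions, algo="coarse" hashes the "by" keys into at most "num" physical partitions, so each partition holds many keys, while "presort" keeps rows ordered within a partition. A sketch of the same option through the functional API (the engine is left to the default; the toy data and count_keys helper are illustrative assumptions):

    import pandas as pd
    import fugue.api as fa

    def count_keys(df: pd.DataFrame) -> pd.DataFrame:
        # one output row per physical partition: how many distinct "a" keys it received
        return pd.DataFrame({"ct": [df["a"].nunique()]})

    df = pd.DataFrame({"a": list(range(10)) * 3, "b": range(30)})
    res = fa.transform(
        df,
        count_keys,
        schema="ct:long",
        partition=dict(algo="coarse", by="a", presort="b", num=3),
    )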
tests/fugue_spark/test_dataframe.py
CHANGED

@@ -31,10 +31,6 @@ class SparkDataFrameTests(DataFrameTests.Tests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass
 
-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-
 
 class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
     @pytest.fixture(autouse=True)
@@ -55,10 +51,6 @@ class NativeSparkDataFrameTests(DataFrameTests.NativeTests):
         # TODO: Spark will silently cast invalid data to nulls without exceptions
         pass
 
-    def test_map_type(self):
-        if pyspark.__version__ >= "3":
-            return super().test_map_type()
-
 
 def test_init(spark_session):
     sdf = spark_session.createDataFrame([["a", 1]])
tests/fugue_spark/test_execution_engine.py
CHANGED

@@ -26,6 +26,8 @@ from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.extensions.transformer import Transformer, transformer
 from fugue.plugins import infer_execution_engine
 from fugue.workflow.workflow import FugueWorkflow
+from fugue_spark._utils.convert import to_pandas
+from fugue_spark._utils.misc import is_spark_dataframe, is_spark_session
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
 from fugue_test.builtin_suite import BuiltInTests
@@ -86,6 +88,11 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
         res = a.as_array(type_safe=True)
         assert res[0][0] == {"a": "b"}
 
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        pdf = pdf[pdf.a < 1]
+        a = e.to_df(pdf)
+        assert fa.get_schema(a) == "a:long,b:long"
+
     def test_persist(self):
         e = self.engine
 
@@ -115,13 +122,12 @@ class SparkExecutionEngineTests(ExecutionEngineTests.Tests):
 
     def test_infer_engine(self):
         df = self.spark_session.createDataFrame(pd.DataFrame([[0]], columns=["a"]))
-        assert
+        assert is_spark_session(infer_execution_engine([df]))
 
         fdf = SparkDataFrame(df)
-        assert
+        assert is_spark_session(infer_execution_engine([fdf]))
 
 
-@pytest.mark.skipif(pyspark.__version__ < "3", reason="pyspark < 3")
 class SparkExecutionEnginePandasUDFTests(ExecutionEngineTests.Tests):
     @pytest.fixture(autouse=True)
     def init_session(self, spark_session):
@@ -192,6 +198,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             session,
             {
                 "test": True,
+                "fugue.spark.use_pandas_udf": False,
                 "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
                 "fugue.rpc.flask_server.host": "127.0.0.1",
                 "fugue.rpc.flask_server.port": "1234",
@@ -258,10 +265,41 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
         dag.output(c, using=assert_match, params=dict(values=[100]))
         dag.run(self.engine)
 
+    def test_coarse_partition(self):
+        def verify_coarse_partition(df: pd.DataFrame) -> List[List[Any]]:
+            ct = df.a.nunique()
+            s = df.a * 1000 + df.b
+            ordered = ((s - s.shift(1)).dropna() >= 0).all(axis=None)
+            return [[ct, ordered]]
+
+        def assert_(df: pd.DataFrame, rc: int, n: int, check_ordered: bool) -> None:
+            if rc > 0:
+                assert len(df) == rc
+            assert df.ct.sum() == n
+            if check_ordered:
+                assert (df.ordered == True).all()
+
+        gps = 100
+        partition_num = 6
+        df = pd.DataFrame(dict(a=list(range(gps)) * 10, b=range(gps * 10))).sample(
+            frac=1.0
+        )
+        with FugueWorkflow() as dag:
+            a = dag.df(df)
+            c = a.partition(
+                algo="coarse", by="a", presort="b", num=partition_num
+            ).transform(verify_coarse_partition, schema="ct:int,ordered:bool")
+            dag.output(
+                c,
+                using=assert_,
+                params=dict(rc=partition_num, n=gps, check_ordered=True),
+            )
+        dag.run(self.engine)
+
     def test_session_as_engine(self):
         dag = FugueWorkflow()
         a = dag.df([[p, 0] for p in range(100)], "a:int,b:int")
-        a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
+        # a.partition(algo="even", by=["a"]).transform(AssertMaxNTransform).persist()
         dag.run(self.spark_session)
 
     def test_interfaceless(self):
@@ -274,8 +312,8 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df.sort_values("b").head(1)
 
         result = transform(sdf, f1, partition=dict(by=["a"]), engine=self.engine)
-        assert
-        assert result
+        assert is_spark_dataframe(result)
+        assert to_pandas(result).sort_values(["a"]).values.tolist() == [[0, 0], [1, 1]]
 
     def test_annotation_1(self):
         def m_c(engine: SparkExecutionEngine) -> ps.DataFrame:
@@ -285,7 +323,7 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return df
 
         def m_o(engine: SparkExecutionEngine, df: ps.DataFrame) -> None:
-            assert 1 == df
+            assert 1 == to_pandas(df).shape[0]
 
         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)
@@ -298,12 +336,12 @@ class SparkExecutionEngineBuiltInTests(BuiltInTests.Tests):
             return session.createDataFrame([[0]], "a:long")
 
         def m_p(session: SparkSession, df: ps.DataFrame) -> ps.DataFrame:
-            assert
+            assert is_spark_session(session)
             return df
 
         def m_o(session: SparkSession, df: ps.DataFrame) -> None:
-            assert
-            assert 1 == df
+            assert is_spark_session(session)
+            assert 1 == to_pandas(df).shape[0]
 
         with FugueWorkflow() as dag:
             df = dag.create(m_c).process(m_p)
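For reference, a sketch of the interfaceless transform pattern the rewritten assertions verify: a Spark DataFrame goes in, a native Spark DataFrame comes out, and the pandas function runs once per "a" partition. The explicit schema="*" and the toy data (mirroring the expected [[0, 0], [1, 1]]) are assumptions:

    import pandas as pd
    from pyspark.sql import SparkSession

    from fugue import transform

    spark = SparkSession.builder.getOrCreate()

    def f1(df: pd.DataFrame) -> pd.DataFrame:
        # keep the row with the smallest "b" within each "a" partition
        return df.sort_values("b").head(1)

    sdf = spark.createDataFrame(pd.DataFrame({"a": [0, 0, 1, 1], "b": [1, 0, 2, 1]}))
    result = transform(sdf, f1, schema="*", partition=dict(by=["a"]), engine=spark)
    print(result.toPandas().sort_values("a").values.tolist())  # [[0, 0], [1, 1]]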
tests/fugue_spark/test_importless.py
CHANGED

@@ -1,8 +1,8 @@
-from fugue import FugueWorkflow, transform
-from fugue import fsql
-from pyspark.sql import SparkSession, DataFrame
 import pandas as pd
+from pyspark.sql import DataFrame, SparkSession
 
+from fugue import FugueWorkflow, fsql, transform
+from fugue_spark._utils.convert import to_pandas
 from fugue_spark.registry import _is_sparksql
 
 
@@ -41,4 +41,4 @@ def test_transform_from_sparksql(spark_session):
 
     res = transform(("sparksql", "SELECT 1 AS a, 'b' AS aa"), t)
     assert isinstance(res, DataFrame)  # engine inference
-    assert res
+    assert to_pandas(res).to_dict("records") == [{"a": 1, "aa": "b"}]
tests/fugue_spark/test_spark_connect.py
ADDED

@@ -0,0 +1,82 @@
+import pytest
+from pyspark.sql import SparkSession
+
+import fugue.api as fa
+from fugue_spark.execution_engine import SparkExecutionEngine
+
+from .test_dataframe import NativeSparkDataFrameTests as _NativeDataFrameTests
+from .test_dataframe import SparkDataFrameTests as _DataFrameTests
+from .test_execution_engine import (
+    SparkExecutionEnginePandasUDFBuiltInTests as _WorkflowTests,
+)
+from .test_execution_engine import SparkExecutionEnginePandasUDFTests as _EngineTests
+
+
+class SparkConnectDataFrameTests(_DataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectNativeDataFrameTests(_NativeDataFrameTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+
+class SparkConnectExecutionEngineTests(_EngineTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session, {"test": True, "fugue.spark.use_pandas_udf": False}
+        )
+        return e
+
+    def test_get_parallelism(self):
+        assert fa.get_current_parallelism() == 200
+
+    def test_using_pandas_udf(self):
+        return
+
+    def test_map_with_dict_col(self):
+        return  # spark connect has a bug
+
+
+class SparkConnectBuiltInTests(_WorkflowTests):
+    @pytest.fixture(autouse=True)
+    def init_session(self):
+        self.spark_session = _connect()
+
+    def make_engine(self):
+        session = _connect()
+        e = SparkExecutionEngine(
+            session,
+            {
+                "test": True,
+                "fugue.spark.use_pandas_udf": True,
+                "fugue.rpc.server": "fugue.rpc.flask.FlaskRPCServer",
+                "fugue.rpc.flask_server.host": "127.0.0.1",
+                "fugue.rpc.flask_server.port": "1234",
+                "fugue.rpc.flask_server.timeout": "2 sec",
+                "spark.sql.shuffle.partitions": "10",
+            },
+        )
+        assert e.conf.get_or_throw("fugue.spark.use_pandas_udf", bool)
+        return e
+
+    def test_annotation_3(self):
+        return  # RDD is not implemented in spark connect
+
+    def test_repartition(self):
+        return  # spark connect doesn't support even repartitioning
+
+    def test_repartition_large(self):
+        return  # spark connect doesn't support even repartitioning
+
+
+def _connect():
+    return SparkSession.builder.remote("sc://localhost").getOrCreate()
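Everything in the new suite goes through _connect() above; a minimal sketch of pointing Fugue at Spark Connect. This assumes pyspark>=3.4 and a Spark Connect server already listening on sc://localhost (for example one started with Spark's start-connect-server.sh script):

    from pyspark.sql import SparkSession

    import fugue.api as fa
    from fugue_spark.execution_engine import SparkExecutionEngine

    session = SparkSession.builder.remote("sc://localhost").getOrCreate()
    # pandas UDFs disabled here, matching one of the engine configs used in the tests
    engine = SparkExecutionEngine(session, {"fugue.spark.use_pandas_udf": False})
    res = fa.fugue_sql("SELECT 1 AS a", engine=engine)
    print(fa.as_array(res))  # [[1]]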
tests/fugue_spark/utils/test_convert.py
CHANGED

@@ -1,4 +1,3 @@
-import pyspark
 from fugue_spark._utils.convert import (
     to_cast_expression,
     to_schema,
@@ -53,13 +52,12 @@ def test_schema_conversion(spark_session):
     assert to_schema(df) == "name:[{nest_name:str,nest_value:int}]"
     assert to_spark_schema("name:[{nest_name:str,nest_value:int}]") == schema
 
-
-
-
-
-
-
-    assert to_spark_schema("a:<str,int>") == schema
+    schema = StructType(
+        [StructField("a", MapType(StringType(), IntegerType(), True), True)],
+    )
+    df = spark_session.createDataFrame([[{"x": 1}], [{"y": 2}]], schema)
+    assert to_schema(df) == "a:<str,int>"
+    assert to_spark_schema("a:<str,int>") == schema
 
 
 def test_to_cast_expression():
tests/fugue_spark/utils/test_io.py
CHANGED

@@ -105,23 +105,6 @@ def test_json_io(tmpdir, spark_session):
     raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
 
 
-def test_avro_io(tmpdir, spark_session):
-    if spark_session.version < "3.0.0":
-        return
-    fs = FileSystem()
-    si = SparkIO(spark_session, fs)
-    df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
-    path = os.path.join(tmpdir, "a.avro")
-    si.save_df(df1, path)
-    actual = si.load_df(path)
-    df_eq(actual, [["1", 2, 3]], "a:str,b:int,c:long")
-    actual = si.load_df(path, columns=["b", "a"])
-    df_eq(actual, [[2, "1"]], "b:int,a:str")
-    actual = si.load_df(path, columns="b:str,a:int")
-    df_eq(actual, [["2", 1]], "b:str,a:int")
-    raises(Exception, lambda: si.load_df(path, columns="bb:str,a:int"))
-
-
 def test_save_with_partition(tmpdir, spark_session):
     si = SparkIO(spark_session, FileSystem())
     df1 = _df([["1", 2, 3]], "a:str,b:int,c:long")
fugue_test/_utils.py
DELETED

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE
File without changes

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL
File without changes

{fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt
File without changes