fugue 0.8.7.dev7__py3-none-any.whl → 0.9.0__py3-none-any.whl
- fugue/collections/sql.py +1 -1
- fugue/dataframe/utils.py +4 -18
- fugue/test/__init__.py +11 -0
- fugue/test/pandas_tester.py +24 -0
- fugue/test/plugins.py +393 -0
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/METADATA +24 -15
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/RECORD +38 -47
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/WHEEL +1 -1
- fugue-0.9.0.dist-info/entry_points.txt +12 -0
- fugue_dask/_io.py +8 -5
- fugue_dask/_utils.py +4 -4
- fugue_dask/execution_engine.py +11 -0
- fugue_dask/registry.py +2 -0
- fugue_dask/tester.py +24 -0
- fugue_duckdb/__init__.py +0 -5
- fugue_duckdb/_io.py +1 -0
- fugue_duckdb/registry.py +30 -2
- fugue_duckdb/tester.py +49 -0
- fugue_ibis/__init__.py +0 -3
- fugue_ibis/dataframe.py +2 -2
- fugue_ibis/execution_engine.py +14 -7
- fugue_ray/_constants.py +3 -4
- fugue_ray/_utils/dataframe.py +10 -21
- fugue_ray/_utils/io.py +38 -9
- fugue_ray/execution_engine.py +1 -2
- fugue_ray/registry.py +1 -0
- fugue_ray/tester.py +22 -0
- fugue_spark/execution_engine.py +5 -5
- fugue_spark/registry.py +13 -1
- fugue_spark/tester.py +78 -0
- fugue_test/__init__.py +82 -0
- fugue_test/builtin_suite.py +26 -43
- fugue_test/dataframe_suite.py +5 -14
- fugue_test/execution_suite.py +170 -143
- fugue_test/fixtures.py +61 -0
- fugue_version/__init__.py +1 -1
- fugue-0.8.7.dev7.dist-info/entry_points.txt +0 -17
- fugue_dask/ibis_engine.py +0 -62
- fugue_duckdb/ibis_engine.py +0 -56
- fugue_ibis/execution/__init__.py +0 -0
- fugue_ibis/execution/ibis_engine.py +0 -49
- fugue_ibis/execution/pandas_backend.py +0 -54
- fugue_ibis/extensions.py +0 -203
- fugue_spark/ibis_engine.py +0 -45
- fugue_test/ibis_suite.py +0 -92
- fugue_test/plugins/__init__.py +0 -0
- fugue_test/plugins/dask/__init__.py +0 -2
- fugue_test/plugins/dask/fixtures.py +0 -12
- fugue_test/plugins/duckdb/__init__.py +0 -2
- fugue_test/plugins/duckdb/fixtures.py +0 -9
- fugue_test/plugins/misc/__init__.py +0 -2
- fugue_test/plugins/misc/fixtures.py +0 -18
- fugue_test/plugins/ray/__init__.py +0 -2
- fugue_test/plugins/ray/fixtures.py +0 -9
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev7.dist-info → fugue-0.9.0.dist-info}/top_level.txt +0 -0
fugue_ray/_utils/io.py
CHANGED
@@ -4,13 +4,14 @@ from typing import Any, Callable, Dict, Iterable, List, Optional, Union

 import pyarrow as pa
 import ray.data as rd
+from packaging import version
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
 from ray.data.datasource import FileExtensionFilter
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
-from triad.utils.io import exists, makedirs, rm
+from triad.utils.io import exists, makedirs, rm, isfile

 from fugue import ExecutionEngine
 from fugue._utils.io import FileParser, save_df
@@ -18,6 +19,8 @@ from fugue.collections.partition import PartitionSpec
 from fugue.dataframe import DataFrame
 from fugue_ray.dataframe import RayDataFrame

+from .._constants import RAY_VERSION
+

 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
@@ -148,6 +151,18 @@ class RayIO(object):
         if infer_schema and columns is not None and not isinstance(columns, list):
             raise ValueError("can't set columns as a schema when infer schema is true")

+        if RAY_VERSION >= version.parse("2.10"):
+            if len(p) == 1 and isfile(p[0]):  # TODO: very hacky
+                params: Dict[str, Any] = {}
+            else:
+                params = {"file_extensions": ["csv"]}
+        else:  # pragma: no cover
+            params = {
+                "partition_filter": _FileFiler(
+                    file_extensions=["csv"], exclude=["_SUCCESS"]
+                ),
+            }
+
         def _read_csv(to_str: bool) -> RayDataFrame:
             res = rd.read_csv(
                 p,
@@ -155,9 +170,7 @@
                 read_options=pacsv.ReadOptions(**read_options),
                 parse_options=pacsv.ParseOptions(**parse_options),
                 convert_options=pacsv.ConvertOptions(**convert_options),
-                partition_filter=_FileFiler(
-                    file_extensions=["csv"], exclude=["_SUCCESS"]
-                ),
+                **params,
             )
             if to_str:
                 _schema = res.schema(fetch_if_missing=True)
@@ -195,16 +208,32 @@ class RayIO(object):
         read_options: Dict[str, Any] = {"use_threads": False}
         parse_options: Dict[str, Any] = {}

-        def _read_json() -> RayDataFrame:
+        def _read_json() -> RayDataFrame:  # pragma: no cover
+            if RAY_VERSION >= version.parse("2.10"):
+                if len(p) == 1 and isfile(p[0]):  # TODO: very hacky
+                    params: Dict[str, Any] = {"file_extensions": None}
+                else:
+                    params = {"file_extensions": ["json"]}
+            elif RAY_VERSION >= version.parse("2.9"):  # pragma: no cover
+                params = {
+                    "file_extensions": None,
+                    "partition_filter": _FileFiler(
+                        file_extensions=["json"], exclude=["_SUCCESS"]
+                    ),
+                }
+            else:  # pragma: no cover
+                params = {
+                    "partition_filter": _FileFiler(
+                        file_extensions=["json"], exclude=["_SUCCESS"]
+                    ),
+                }
             return RayDataFrame(
                 rd.read_json(
                     p,
                     ray_remote_args=self._remote_args(),
                     read_options=pajson.ReadOptions(**read_options),
                     parse_options=pajson.ParseOptions(**parse_options),
-                    partition_filter=_FileFiler(
-                        file_extensions=["json"], exclude=["_SUCCESS"]
-                    ),
+                    **params,
                 )
             )

@@ -221,7 +250,7 @@
         return {"num_cpus": 1}


-class _FileFiler(FileExtensionFilter):
+class _FileFiler(FileExtensionFilter):  # pragma: no cover
    def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
        super().__init__(file_extensions, allow_if_no_extension=True)
        self._exclude = set(exclude)
fugue_ray/execution_engine.py
CHANGED
@@ -191,8 +191,7 @@ class RayMapEngine(MapEngine):
             mb_args["batch_size"] = self.conf.get_or_throw(
                 FUGUE_RAY_DEFAULT_BATCH_SIZE, int
             )
-
-            mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
+        mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
         sdf = rdf.native.map_batches(
             _udf,
             batch_format="pyarrow",
fugue_ray/registry.py
CHANGED
@@ -14,6 +14,7 @@ from fugue.plugins import as_fugue_dataset, infer_execution_engine

 from .dataframe import RayDataFrame
 from .execution_engine import RayExecutionEngine
+from .tester import RayTestBackend  # noqa: F401  # pylint: disable-all


 @infer_execution_engine.candidate(
fugue_ray/tester.py
ADDED
@@ -0,0 +1,22 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Iterator
+
+import ray
+
+import fugue.test as ft
+
+
+@ft.fugue_test_backend
+class RayTestBackend(ft.FugueTestBackend):
+    name = "ray"
+    default_session_conf = {"num_cpus": 2}
+    default_fugue_conf = {
+        "fugue.ray.zero_copy": True,
+        "fugue.ray.default.batch_size": 10000,
+    }
+
+    @classmethod
+    @contextmanager
+    def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
+        with ray.init(**session_conf):
+            yield "ray"
fugue_spark/execution_engine.py
CHANGED
@@ -492,11 +492,6 @@ class SparkExecutionEngine(ExecutionEngine):
         res.reset_metadata(df.metadata)
         return res

-    def register(self, df: DataFrame, name: str) -> SparkDataFrame:
-        sdf = self._to_spark_df(df)
-        sdf.native.createOrReplaceTempView(name)
-        return sdf
-
     def join(
         self,
         df1: DataFrame,
@@ -679,6 +674,11 @@ class SparkExecutionEngine(ExecutionEngine):

         # If partition exists
         else:
+            if len(_presort.keys()) == 0 and n == 1:
+                return self._to_spark_df(
+                    d.dropDuplicates(subset=partition_spec.partition_by), df.schema
+                )
+
             w = Window.partitionBy([col(x) for x in partition_spec.partition_by])

             if len(_presort.keys()) > 0:
fugue_spark/registry.py
CHANGED
@@ -19,6 +19,13 @@ from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine

 from ._utils.misc import SparkConnectDataFrame, SparkConnectSession, is_spark_dataframe
+from .tester import SparkTestBackend  # noqa: F401  # pylint: disable-all
+
+try:
+    from .tester import SparkConnectTestBackend  # noqa: F401  # pylint: disable-all
+except ImportError:  # pragma: no cover
+    pass
+

 _is_sparksql = namespace_candidate("sparksql", lambda x: isinstance(x, str))

@@ -31,7 +38,12 @@ _is_sparksql = namespace_candidate("sparksql", lambda x: isinstance(x, str))
     )
     or any(_is_sparksql(obj) for obj in objs)
 )
-def _infer_spark_client(
+def _infer_spark_client(objs: Any) -> Any:
+    obj = objs[0]
+    if isinstance(obj, SparkDataFrame):
+        obj = obj.native
+    if hasattr(obj, "sparkSession"):
+        return obj.sparkSession
     return SparkSession.builder.getOrCreate()

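For context: _infer_spark_client now prefers the session already attached to the first object, unwrapping Fugue's SparkDataFrame first, and only falls back to SparkSession.builder.getOrCreate(). The probed attribute exists on pyspark DataFrames (Spark 3.3+):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
sdf = spark.createDataFrame([(1,)], ["a"])

# This is what the hasattr(obj, "sparkSession") branch picks up:
assert sdf.sparkSession is spark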
fugue_spark/tester.py
ADDED
@@ -0,0 +1,78 @@
+from contextlib import contextmanager
+from typing import Any, Dict, Iterator
+
+from pyspark.sql import SparkSession
+
+import fugue.test as ft
+
+from ._utils.misc import SparkConnectSession
+
+
+@ft.fugue_test_backend
+class SparkTestBackend(ft.FugueTestBackend):
+    name = "spark"
+    default_session_conf = {
+        "spark.app.name": "fugue-test-spark",
+        "spark.master": "local[*]",
+        "spark.default.parallelism": 4,
+        "spark.dynamicAllocation.enabled": "false",
+        "spark.executor.cores": 4,
+        "spark.executor.instances": 1,
+        "spark.io.compression.codec": "lz4",
+        "spark.rdd.compress": "false",
+        "spark.sql.shuffle.partitions": 4,
+        "spark.shuffle.compress": "false",
+        "spark.sql.catalogImplementation": "in-memory",
+        "spark.sql.execution.arrow.pyspark.enabled": True,
+        "spark.sql.adaptive.enabled": False,
+    }
+
+    @classmethod
+    def transform_session_conf(cls, conf: Dict[str, Any]) -> Dict[str, Any]:
+        return ft.extract_conf(conf, "spark.", remove_prefix=False)
+
+    @classmethod
+    @contextmanager
+    def session_context(cls, session_conf: Dict[str, Any]) -> Iterator[Any]:
+        with _create_session(session_conf).getOrCreate() as spark:
+            yield spark
+
+
+if SparkConnectSession is not None:
+
+    @ft.fugue_test_backend
+    class SparkConnectTestBackend(SparkTestBackend):
+        name = "sparkconnect"
+        default_session_conf = {
+            "spark.default.parallelism": 4,
+            "spark.sql.shuffle.partitions": 4,
+            "spark.sql.execution.arrow.pyspark.enabled": True,
+            "spark.sql.adaptive.enabled": False,
+        }
+
+        @classmethod
+        def transform_session_conf(
+            cls, conf: Dict[str, Any]
+        ) -> Dict[str, Any]:  # pragma: no cover
+            # replace sparkconnect. with spark.
+            return {
+                "spark." + k: v
+                for k, v in ft.extract_conf(
+                    conf, cls.name + ".", remove_prefix=True
+                ).items()
+            }
+
+        @classmethod
+        @contextmanager
+        def session_context(
+            cls, session_conf: Dict[str, Any]
+        ) -> Iterator[Any]:  # pragma: no cover
+            spark = _create_session(session_conf).remote("sc://localhost").getOrCreate()
+            yield spark
+
+
+def _create_session(conf: Dict[str, Any]) -> Any:
+    sb = SparkSession.builder
+    for k, v in conf.items():
+        sb = sb.config(k, v)
+    return sb
fugue_test/__init__.py
CHANGED
@@ -0,0 +1,82 @@
+from typing import Any, Dict, Tuple
+
+import pyarrow as pa
+import pytest
+from triad.utils.pyarrow import to_pa_datatype
+
+_FUGUE_TEST_CONF_NAME = "fugue_test_conf"
+
+
+def pytest_addoption(parser: Any):  # pragma: no cover
+    parser.addini(
+        _FUGUE_TEST_CONF_NAME,
+        help="Configs for fugue testing execution engines",
+        type="linelist",
+    )
+
+
+def pytest_configure(config: Any):
+    from fugue.test.plugins import _set_global_conf
+
+    options = config.getini(_FUGUE_TEST_CONF_NAME)
+    conf: Dict[str, Any] = {}
+    if options:
+        for line in options:
+            line = line.strip()
+            if not line.startswith("#"):
+                k, v = _parse_line(line)
+                conf[k] = v
+    _set_global_conf(conf)
+
+
+def pytest_report_header(config, start_path):
+    from fugue.test.plugins import _get_all_ini_conf
+
+    header_lines = []
+    header_lines.append("Fugue tests will be initialized with options:")
+    for k, v in _get_all_ini_conf().items():
+        header_lines.append(f"\t{k} = {v}")
+    return "\n".join(header_lines)
+
+
+def _parse_line(line: str) -> Tuple[str, Any]:
+    try:
+        kv = line.split("=", 1)
+        if len(kv) == 1:
+            raise ValueError()
+        kt = kv[0].split(":", 1)
+        if len(kt) == 1:
+            tp = pa.string()
+        else:
+            tp = to_pa_datatype(kt[1].strip())
+        key = kt[0].strip()
+        if key == "":
+            raise ValueError()
+        value = pa.compute.cast([kv[1].strip()], tp).to_pylist()[0]
+        return key, value
+    except Exception:
+        raise ValueError(
+            f"Invalid config line: {line}, it must be in format: key[:type]=value"
+        )
+
+
+@pytest.fixture(scope="class")
+def backend_context(request: Any):
+    from fugue.test.plugins import _make_backend_context, _parse_backend
+
+    c, _ = _parse_backend(request.param)
+    session = request.getfixturevalue(c + "_session")
+    with _make_backend_context(request.param, session) as ctx:
+        yield ctx
+
+
+@pytest.fixture(scope="class")
+def _class_backend_context(request, backend_context):
+    from fugue.test.plugins import FugueTestContext
+
+    request.cls._test_context = FugueTestContext(
+        engine=backend_context.engine,
+        session=backend_context.session,
+        name=backend_context.name,
+    )
+    yield
fugue_test/builtin_suite.py
CHANGED
@@ -10,9 +10,8 @@ import datetime
 import os
 import pickle
 from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
-from unittest import TestCase
 from uuid import uuid4
-
+
 import numpy as np
 import pandas as pd
 import pyarrow as pa
@@ -20,8 +19,10 @@ import pytest
 from fsspec.implementations.local import LocalFileSystem
 from pytest import raises
 from triad import SerializableRLock
+from triad.utils.io import join, write_text

 import fugue.api as fa
+import fugue.test as ft
 from fugue import (
     AnyDataFrame,
     ArrayDataFrame,
@@ -46,7 +47,6 @@ from fugue import (
     outputter,
     processor,
     register_creator,
-    register_default_sql_engine,
     register_output_transformer,
     register_outputter,
     register_processor,
@@ -56,7 +56,6 @@ from fugue import (
 from fugue.column import col
 from fugue.column import functions as ff
 from fugue.column import lit
-from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.exceptions import (
     FugueInterfacelessError,
     FugueWorkflowCompileError,
@@ -78,26 +77,10 @@ class BuiltInTests(object):
     add correspondent tests here
     """

-    class Tests(TestCase):
-        @classmethod
-        def setUpClass(cls):
-            register_default_sql_engine(lambda engine: engine.sql_engine)
-            cls._engine = cls.make_engine(cls)
-
-        @property
-        def engine(self) -> ExecutionEngine:
-            return self._engine  # type: ignore
-
-        @classmethod
-        def tearDownClass(cls):
-            cls._engine.stop()
-
-        def make_engine(self) -> ExecutionEngine:  # pragma: no cover
-            raise NotImplementedError
-
+    class Tests(ft.FugueTestSuite):
         def test_workflows(self):
             a = FugueWorkflow().df([[0]], "a:int")
-            df_eq(a.compute(self.engine), [[0]], "a:int")
+            self.df_eq(a.compute(self.engine), [[0]], "a:int")

         def test_create_show(self):
             with FugueWorkflow() as dag:
@@ -1706,7 +1689,7 @@
                 """,
                 x=sdf3,
             ).run()
-            df_eq(
+            self.df_eq(
                 res["res"],
                 [[3, 4, 13]],
                 schema="a:long,b:int,c:long",
@@ -1739,9 +1722,9 @@
            df1 = pd.DataFrame([[0, 1], [2, 3]], columns=["a b", " "])
            df2 = pd.DataFrame([[0, 10], [20, 3]], columns=["a b", "d"])
            r = fa.inner_join(df1, df2, as_fugue=True)
-           df_eq(r, [[0, 1, 10]], "`a b`:long,` `:long,d:long", throw=True)
+           self.df_eq(r, [[0, 1, 10]], "`a b`:long,` `:long,d:long", throw=True)
            r = fa.transform(r, tr)
-           df_eq(
+           self.df_eq(
                r,
                [[0, 1, 10, 2]],
                "`a b`:long,` `:long,d:long,`c *`:long",
@@ -1755,7 +1738,7 @@
                col("d"),
                col("c *").cast(int),
            )
-           df_eq(
+           self.df_eq(
                r,
                [[0, 1, 10, 2]],
                "`a b `:long,`x y`:long,d:long,`c *`:long",
@@ -1764,13 +1747,13 @@
            r = fa.rename(r, {"a b ": "a b"})
            fa.save(r, f_csv, header=True, force_single=True)
            fa.save(r, f_parquet)
-           df_eq(
+           self.df_eq(
                fa.load(f_parquet, columns=["x y", "d", "c *"], as_fugue=True),
                [[1, 10, 2]],
                "`x y`:long,d:long,`c *`:long",
                throw=True,
            )
-           df_eq(
+           self.df_eq(
                fa.load(
                    f_csv,
                    header=True,
@@ -1782,7 +1765,7 @@
                "d:str,`c *`:str",
                throw=True,
            )
-           df_eq(
+           self.df_eq(
                fa.load(
                    f_csv,
                    header=True,
@@ -1802,14 +1785,14 @@
                """,
                as_fugue=True,
            )
-           df_eq(r, [[0, 1, 10]], "`a b`:long,` `:long,d:long", throw=True)
+           self.df_eq(r, [[0, 1, 10]], "`a b`:long,` `:long,d:long", throw=True)
            r = fa.fugue_sql(
                """
                TRANSFORM r USING tr SCHEMA *,`c *`:long
                """,
                as_fugue=True,
            )
-           df_eq(
+           self.df_eq(
                r,
                [[0, 1, 10, 2]],
                "`a b`:long,` `:long,d:long,`c *`:long",
@@ -1821,7 +1804,7 @@
                """,
                as_fugue=True,
            )
-           df_eq(
+           self.df_eq(
                r,
                [[0, 1, 10, 2]],
                "`a b`:long,` `:long,d:long,`c *`:long",
@@ -1842,19 +1825,19 @@
                f_parquet=f_parquet,
                f_csv=f_csv,
            ).run()
-           df_eq(
+           self.df_eq(
                res["r1"],
                [[1, 10, 2]],
                "`x y`:long,d:long,`c *`:long",
                throw=True,
            )
-           df_eq(
+           self.df_eq(
                res["r2"],
                [["1", "10", "2"]],
                "`x y`:str,d:str,`c *`:str",
                throw=True,
            )
-           df_eq(
+           self.df_eq(
                res["r3"],
                [[0, 1, 10, 2]],
                "`a b`:long,`x y`:long,d:long,`c *`:long",
@@ -1875,13 +1858,13 @@ def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:


 def mock_processor2(e: ExecutionEngine, dfs: DataFrames) -> DataFrame:
-    assert "test" in e.conf
+    assert "fugue.test" in e.conf
     return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")


 class MockProcessor3(Processor):
     def process(self, dfs):
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")

@@ -1915,11 +1898,11 @@ class MockOutputter4(Outputter):

 class MockTransform1(Transformer):
     def get_output_schema(self, df: DataFrame) -> Any:
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         return [df.schema, "ct:int,p:int"]

     def on_init(self, df: DataFrame) -> None:
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         self.pn = self.cursor.physical_partition_no
         self.ks = self.key_schema
         if "on_init_called" not in self.__dict__:
@@ -1929,7 +1912,7 @@ class MockTransform1(Transformer):

     def transform(self, df: LocalDataFrame) -> LocalDataFrame:
         assert 1 == self.on_init_called
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         pdf = df.as_pandas()
         pdf["p"] = self.params.get("p", 1)
         pdf["ct"] = pdf.shape[0]
@@ -1971,7 +1954,7 @@ def mock_tf3(df: Iterable[Dict[str, Any]]) -> Iterable[Dict[str, Any]]:

 class MockCoTransform1(CoTransformer):
     def get_output_schema(self, dfs: DataFrames) -> Any:
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         assert 2 == len(dfs)
         if self.params.get("named", False):
             assert dfs.has_key
@@ -1980,7 +1963,7 @@ class MockCoTransform1(CoTransformer):
         return [self.key_schema, "ct1:int,ct2:int,p:int"]

     def on_init(self, dfs: DataFrames) -> None:
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         assert 2 == len(dfs)
         if self.params.get("named", False):
             assert dfs.has_key
@@ -1995,7 +1978,7 @@ class MockCoTransform1(CoTransformer):

     def transform(self, dfs: DataFrames) -> LocalDataFrame:
         assert 1 == self.on_init_called
-        assert "test" in self.workflow_conf
+        assert "fugue.test" in self.workflow_conf
         assert 2 == len(dfs)
         if self.params.get("named", False):
             assert dfs.has_key
fugue_test/dataframe_suite.py
CHANGED
@@ -2,15 +2,14 @@

 from datetime import date, datetime
 from typing import Any
-from unittest import TestCase

 import numpy as np
 import pandas as pd
 from pytest import raises

 import fugue.api as fi
+import fugue.test as ft
 from fugue.dataframe import ArrowDataFrame, DataFrame
-from fugue.dataframe.utils import _df_eq as df_eq
 from fugue.exceptions import FugueDataFrameOperationError, FugueDatasetEmptyError

@@ -19,15 +18,7 @@ class DataFrameTests(object):
     All new DataFrame types should pass this test suite.
     """

-    class Tests(TestCase):
-        @classmethod
-        def setUpClass(cls):
-            pass
-
-        @classmethod
-        def tearDownClass(cls):
-            pass
-
+    class Tests(ft.FugueTestSuite):
         def df(self, data: Any = None, schema: Any = None) -> Any:  # pragma: no cover
             raise NotImplementedError

@@ -129,7 +120,7 @@ class DataFrameTests(object):
             assert [[1]] == fi.as_array(df, type_safe=True)

             df = self.df([["a", 1, 2]], "a:str,b:int,c:int")
-            df_eq(
+            self.df_eq(
                 fi.as_fugue_df(fi.select_columns(df, ["c", "a"])),
                 [[2, "a"]],
                 "a:str,c:int",
@@ -140,13 +131,13 @@ class DataFrameTests(object):
             df = self.df(data, "a:str,b:int")
             df2 = fi.rename(df, columns=dict(a="aa"))
             assert fi.get_schema(df) == "a:str,b:int"
-            df_eq(fi.as_fugue_df(df2), data, "aa:str,b:int", throw=True)
+            self.df_eq(fi.as_fugue_df(df2), data, "aa:str,b:int", throw=True)

             for data in [[["a", 1]], []]:
                 df = self.df(data, "a:str,b:int")
                 df3 = fi.rename(df, columns={})
                 assert fi.get_schema(df3) == "a:str,b:int"
-                df_eq(fi.as_fugue_df(df3), data, "a:str,b:int", throw=True)
+                self.df_eq(fi.as_fugue_df(df3), data, "a:str,b:int", throw=True)

         def test_rename_invalid(self):
             df = self.df([["a", 1]], "a:str,b:int")