fugue 0.9.0.dev3__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/collections/sql.py +1 -1
- fugue/dataframe/function_wrapper.py +13 -16
- fugue/dataframe/utils.py +4 -18
- fugue/test/plugins.py +11 -1
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/METADATA +7 -5
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/RECORD +23 -23
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/WHEEL +1 -1
- fugue_dask/_io.py +8 -5
- fugue_dask/_utils.py +4 -4
- fugue_duckdb/_io.py +1 -0
- fugue_ibis/execution_engine.py +11 -4
- fugue_ray/_constants.py +3 -4
- fugue_ray/_utils/dataframe.py +10 -21
- fugue_ray/_utils/io.py +36 -13
- fugue_ray/execution_engine.py +1 -2
- fugue_spark/_utils/misc.py +1 -1
- fugue_test/builtin_suite.py +14 -15
- fugue_test/dataframe_suite.py +3 -4
- fugue_test/execution_suite.py +130 -123
- fugue_version/__init__.py +1 -1
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/LICENSE +0 -0
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/entry_points.txt +0 -0
- {fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/top_level.txt +0 -0
fugue/collections/sql.py
CHANGED
|
@@ -15,7 +15,7 @@ class TempTableName:
|
|
|
15
15
|
"""Generating a temporary, random and globaly unique table name"""
|
|
16
16
|
|
|
17
17
|
def __init__(self):
|
|
18
|
-
self.key = "_" + str(uuid4())[:5]
|
|
18
|
+
self.key = "_" + str(uuid4())[:5].upper()
|
|
19
19
|
|
|
20
20
|
def __repr__(self) -> str:
|
|
21
21
|
return _TEMP_TABLE_EXPR_PREFIX + self.key + _TEMP_TABLE_EXPR_SUFFIX
|
fugue/dataframe/function_wrapper.py
CHANGED
|
@@ -20,6 +20,7 @@ from triad.collections.function_wrapper import (
|
|
|
20
20
|
PositionalParam,
|
|
21
21
|
function_wrapper,
|
|
22
22
|
)
|
|
23
|
+
from triad.utils.convert import compare_annotations
|
|
23
24
|
from triad.utils.iter import EmptyAwareIterable, make_empty_aware
|
|
24
25
|
|
|
25
26
|
from ..constants import FUGUE_ENTRYPOINT
|
|
@@ -37,6 +38,14 @@ from .iterable_dataframe import IterableDataFrame
|
|
|
37
38
|
from .pandas_dataframe import PandasDataFrame
|
|
38
39
|
|
|
39
40
|
|
|
41
|
+
def _compare_iter(tp: Any) -> Any:
|
|
42
|
+
return lambda x: compare_annotations(
|
|
43
|
+
x, Iterable[tp] # type:ignore
|
|
44
|
+
) or compare_annotations(
|
|
45
|
+
x, Iterator[tp] # type:ignore
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
40
49
|
@function_wrapper(FUGUE_ENTRYPOINT)
|
|
41
50
|
class DataFrameFunctionWrapper(FunctionWrapper):
|
|
42
51
|
@property
|
|
@@ -228,10 +237,7 @@ class _ListListParam(_LocalNoSchemaDataFrameParam):
|
|
|
228
237
|
return len(df)
|
|
229
238
|
|
|
230
239
|
|
|
231
|
-
@fugue_annotated_param(
|
|
232
|
-
Iterable[List[Any]],
|
|
233
|
-
matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]],
|
|
234
|
-
)
|
|
240
|
+
@fugue_annotated_param(Iterable[List[Any]], matcher=_compare_iter(List[Any]))
|
|
235
241
|
class _IterableListParam(_LocalNoSchemaDataFrameParam):
|
|
236
242
|
@no_type_check
|
|
237
243
|
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]:
|
|
@@ -288,10 +294,7 @@ class _ListDictParam(_LocalNoSchemaDataFrameParam):
|
|
|
288
294
|
return len(df)
|
|
289
295
|
|
|
290
296
|
|
|
291
|
-
@fugue_annotated_param(
|
|
292
|
-
Iterable[Dict[str, Any]],
|
|
293
|
-
matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]],
|
|
294
|
-
)
|
|
297
|
+
@fugue_annotated_param(Iterable[Dict[str, Any]], matcher=_compare_iter(Dict[str, Any]))
|
|
295
298
|
class _IterableDictParam(_LocalNoSchemaDataFrameParam):
|
|
296
299
|
@no_type_check
|
|
297
300
|
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]:
|
|
@@ -360,10 +363,7 @@ class _PandasParam(LocalDataFrameParam):
|
|
|
360
363
|
return "pandas"
|
|
361
364
|
|
|
362
365
|
|
|
363
|
-
@fugue_annotated_param(
|
|
364
|
-
Iterable[pd.DataFrame],
|
|
365
|
-
matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame],
|
|
366
|
-
)
|
|
366
|
+
@fugue_annotated_param(Iterable[pd.DataFrame], matcher=_compare_iter(pd.DataFrame))
|
|
367
367
|
class _IterablePandasParam(LocalDataFrameParam):
|
|
368
368
|
@no_type_check
|
|
369
369
|
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]:
|
|
@@ -419,10 +419,7 @@ class _PyArrowTableParam(LocalDataFrameParam):
|
|
|
419
419
|
return "pyarrow"
|
|
420
420
|
|
|
421
421
|
|
|
422
|
-
@fugue_annotated_param(
|
|
423
|
-
Iterable[pa.Table],
|
|
424
|
-
matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table],
|
|
425
|
-
)
|
|
422
|
+
@fugue_annotated_param(Iterable[pa.Table], matcher=_compare_iter(pa.Table))
|
|
426
423
|
class _IterableArrowParam(LocalDataFrameParam):
|
|
427
424
|
@no_type_check
|
|
428
425
|
def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
|
fugue/dataframe/utils.py
CHANGED
|
@@ -21,22 +21,6 @@ normalize_dataframe_column_names = normalize_column_names
|
|
|
21
21
|
rename_dataframe_column_names = rename
|
|
22
22
|
|
|
23
23
|
|
|
24
|
-
def _pa_type_eq(t1: pa.DataType, t2: pa.DataType) -> bool:
|
|
25
|
-
# should ignore the name difference of list
|
|
26
|
-
# e.g. list<item: string> == list<l: string>
|
|
27
|
-
if pa.types.is_list(t1) and pa.types.is_list(t2): # pragma: no cover
|
|
28
|
-
return _pa_type_eq(t1.value_type, t2.value_type)
|
|
29
|
-
return t1 == t2
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def _schema_eq(s1: Schema, s2: Schema) -> bool:
|
|
33
|
-
if s1 == s2:
|
|
34
|
-
return True
|
|
35
|
-
return s1.names == s2.names and all(
|
|
36
|
-
_pa_type_eq(f1.type, f2.type) for f1, f2 in zip(s1.fields, s2.fields)
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
|
|
40
24
|
def _df_eq(
|
|
41
25
|
df: DataFrame,
|
|
42
26
|
data: Any,
|
|
@@ -46,6 +30,7 @@ def _df_eq(
|
|
|
46
30
|
check_schema: bool = True,
|
|
47
31
|
check_content: bool = True,
|
|
48
32
|
no_pandas: bool = False,
|
|
33
|
+
equal_type_groups: Optional[List[List[Any]]] = None,
|
|
49
34
|
throw=False,
|
|
50
35
|
) -> bool:
|
|
51
36
|
"""Compare if two dataframes are equal. Is for internal, unit test
|
|
@@ -66,6 +51,7 @@ def _df_eq(
|
|
|
66
51
|
:param no_pandas: if true, it will compare the string representations of the
|
|
67
52
|
dataframes, otherwise, it will convert both to pandas dataframe to compare,
|
|
68
53
|
defaults to False
|
|
54
|
+
:param equal_type_groups: the groups to treat as equal types, defaults to None.
|
|
69
55
|
:param throw: if to throw error if not equal, defaults to False
|
|
70
56
|
:return: if they equal
|
|
71
57
|
"""
|
|
@@ -78,8 +64,8 @@ def _df_eq(
|
|
|
78
64
|
assert (
|
|
79
65
|
df1.count() == df2.count()
|
|
80
66
|
), f"count mismatch {df1.count()}, {df2.count()}"
|
|
81
|
-
assert not check_schema or
|
|
82
|
-
|
|
67
|
+
assert not check_schema or df.schema.is_like(
|
|
68
|
+
df2.schema, equal_groups=equal_type_groups
|
|
83
69
|
), f"schema mismatch {df.schema.pa_schema}, {df2.schema.pa_schema}"
|
|
84
70
|
if not check_content:
|
|
85
71
|
return True
|
fugue/test/plugins.py
CHANGED
|
@@ -2,7 +2,7 @@ from contextlib import contextmanager
|
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type
|
|
5
|
-
|
|
5
|
+
from fugue.dataframe.utils import _df_eq
|
|
6
6
|
from triad import assert_or_throw, run_once
|
|
7
7
|
from triad.utils.entry_points import load_entry_point
|
|
8
8
|
|
|
@@ -160,6 +160,7 @@ class FugueTestSuite:
|
|
|
160
160
|
|
|
161
161
|
backend: Any
|
|
162
162
|
tmp_path: Path
|
|
163
|
+
equal_type_groups: Any = None
|
|
163
164
|
|
|
164
165
|
__test__ = False
|
|
165
166
|
_test_context: Any = None
|
|
@@ -180,6 +181,15 @@ class FugueTestSuite:
|
|
|
180
181
|
"""The engine object inside the ``FugueTestContext``"""
|
|
181
182
|
return self.context.engine
|
|
182
183
|
|
|
184
|
+
def get_equal_type_groups(self) -> Optional[List[List[Any]]]:
|
|
185
|
+
return None # pragma: no cover
|
|
186
|
+
|
|
187
|
+
def df_eq(self, *args: Any, **kwargs: Any) -> bool:
|
|
188
|
+
"""A wrapper of :func:`~fugue.dataframe.utils.df_eq`"""
|
|
189
|
+
if "equal_type_groups" not in kwargs:
|
|
190
|
+
kwargs["equal_type_groups"] = self.equal_type_groups
|
|
191
|
+
return _df_eq(*args, **kwargs)
|
|
192
|
+
|
|
183
193
|
|
|
184
194
|
def fugue_test_suite(backend: Any, mark_test: Optional[bool] = None) -> Any:
|
|
185
195
|
def deco(cls: Type["FugueTestSuite"]) -> Type["FugueTestSuite"]:
|
{fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fugue
|
|
3
|
-
Version: 0.9.0.dev3
|
|
3
|
+
Version: 0.9.1
|
|
4
4
|
Summary: An abstraction layer for distributed computation
|
|
5
5
|
Home-page: http://github.com/fugue-project/fugue
|
|
6
6
|
Author: The Fugue Development Team
|
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
21
|
Requires-Python: >=3.8
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: triad >=0.9.
|
|
23
|
+
Requires-Dist: triad >=0.9.7
|
|
24
24
|
Requires-Dist: adagio >=0.2.4
|
|
25
25
|
Provides-Extra: all
|
|
26
26
|
Requires-Dist: qpd >=0.4.4 ; extra == 'all'
|
|
@@ -30,7 +30,7 @@ Requires-Dist: jinja2 ; extra == 'all'
|
|
|
30
30
|
Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
|
|
31
31
|
Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
|
|
32
32
|
Requires-Dist: dask-sql ; extra == 'all'
|
|
33
|
-
Requires-Dist: ray[data] >=2.
|
|
33
|
+
Requires-Dist: ray[data] >=2.5.0 ; extra == 'all'
|
|
34
34
|
Requires-Dist: notebook ; extra == 'all'
|
|
35
35
|
Requires-Dist: jupyterlab ; extra == 'all'
|
|
36
36
|
Requires-Dist: ipython >=7.10.0 ; extra == 'all'
|
|
@@ -45,6 +45,7 @@ Provides-Extra: dask
|
|
|
45
45
|
Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'dask'
|
|
46
46
|
Requires-Dist: pyarrow >=7.0.0 ; extra == 'dask'
|
|
47
47
|
Requires-Dist: pandas >=2.0.2 ; extra == 'dask'
|
|
48
|
+
Requires-Dist: dask[dataframe,distributed] >=2024.4.0 ; (python_version >= "3.11.9") and extra == 'dask'
|
|
48
49
|
Provides-Extra: duckdb
|
|
49
50
|
Requires-Dist: qpd >=0.4.4 ; extra == 'duckdb'
|
|
50
51
|
Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'duckdb'
|
|
@@ -58,6 +59,7 @@ Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'ibis'
|
|
|
58
59
|
Requires-Dist: sqlglot ; extra == 'ibis'
|
|
59
60
|
Requires-Dist: jinja2 ; extra == 'ibis'
|
|
60
61
|
Requires-Dist: ibis-framework ; extra == 'ibis'
|
|
62
|
+
Requires-Dist: pandas <2.2 ; extra == 'ibis'
|
|
61
63
|
Provides-Extra: notebook
|
|
62
64
|
Requires-Dist: notebook ; extra == 'notebook'
|
|
63
65
|
Requires-Dist: jupyterlab ; extra == 'notebook'
|
|
@@ -65,9 +67,9 @@ Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
|
|
|
65
67
|
Provides-Extra: polars
|
|
66
68
|
Requires-Dist: polars ; extra == 'polars'
|
|
67
69
|
Provides-Extra: ray
|
|
68
|
-
Requires-Dist: ray[data] >=2.
|
|
70
|
+
Requires-Dist: ray[data] >=2.5.0 ; extra == 'ray'
|
|
69
71
|
Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
|
|
70
|
-
Requires-Dist: pyarrow >=
|
|
72
|
+
Requires-Dist: pyarrow >=7.0.0 ; extra == 'ray'
|
|
71
73
|
Requires-Dist: pandas <2.2 ; extra == 'ray'
|
|
72
74
|
Provides-Extra: spark
|
|
73
75
|
Requires-Dist: pyspark >=3.1.1 ; extra == 'spark'
|
{fugue-0.9.0.dev3.dist-info → fugue-0.9.1.dist-info}/RECORD
CHANGED
|
@@ -18,7 +18,7 @@ fugue/bag/array_bag.py,sha256=b0UdDPmZpEAI3R0SBbZVOLVLAwMQnBCFeYDEpFWen14,1111
|
|
|
18
18
|
fugue/bag/bag.py,sha256=sNBAzPmEh5fEm8ME8NEEOOre6l58ri6oouVBWwafqTc,3018
|
|
19
19
|
fugue/collections/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
20
|
fugue/collections/partition.py,sha256=rPkU-3y6E598Q7wvE-jTSbSwWh3fzIVxdwPpbQvWS-M,17257
|
|
21
|
-
fugue/collections/sql.py,sha256=
|
|
21
|
+
fugue/collections/sql.py,sha256=3MjnuQMPuUMq55n-EypikkRqcpOCZtOjp7S2fs7ujAA,4955
|
|
22
22
|
fugue/collections/yielded.py,sha256=KAvCXAZpeuErGww7Y217_F7M2zv9G5hfdl2AWiO7wEM,2040
|
|
23
23
|
fugue/column/__init__.py,sha256=aoZwwzyJtNL-duLxzU2sNGoaKikWd-yesbigE_Wj29s,208
|
|
24
24
|
fugue/column/expressions.py,sha256=fdGX9oPCqJBuROFZqrOYVcwkjghdXT9ngaSTG5tW_i8,26544
|
|
@@ -31,10 +31,10 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
|
|
|
31
31
|
fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
|
|
32
32
|
fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
|
|
33
33
|
fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
|
|
34
|
-
fugue/dataframe/function_wrapper.py,sha256=
|
|
34
|
+
fugue/dataframe/function_wrapper.py,sha256=hOZF3GmwpxqwqKi9-pEOAPZSW1ZFyB47hLxRrGyOiuM,14855
|
|
35
35
|
fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
|
|
36
36
|
fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
|
|
37
|
-
fugue/dataframe/utils.py,sha256=
|
|
37
|
+
fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
|
|
38
38
|
fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
|
|
39
39
|
fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
|
|
40
40
|
fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
|
|
@@ -73,7 +73,7 @@ fugue/sql/api.py,sha256=l2I9CAy_W2oFFTct9fDPLyXF0LiDxQhMx5O8jBHTAxU,10050
|
|
|
73
73
|
fugue/sql/workflow.py,sha256=S1pOhp0b0t6johFAJWmj6xUB7Ti5LQgNABpAzmLGjrQ,3010
|
|
74
74
|
fugue/test/__init__.py,sha256=hvVrNbJYkWI_6otpILneyTjUafxURaA4obK6AoDyCUw,250
|
|
75
75
|
fugue/test/pandas_tester.py,sha256=_w6rFqlzZKjBtmFf-08a4C97W5xtqGw5XorLhj6Zyoo,622
|
|
76
|
-
fugue/test/plugins.py,sha256=
|
|
76
|
+
fugue/test/plugins.py,sha256=GLZia5GCmy0eBVGNbIqTbX7Ou3euf2SY4litKgdigwY,12318
|
|
77
77
|
fugue/workflow/__init__.py,sha256=tXM_KYO8Q358W6qAVlwhIQIaYNRDgZtTubrIEX4QMgM,229
|
|
78
78
|
fugue/workflow/_checkpoint.py,sha256=tt5Iv7c5ZStC0MD1inItksQ0GuK0ViniA3nvrgym-5c,5681
|
|
79
79
|
fugue/workflow/_tasks.py,sha256=Zq_jXJO_VaF8DrWUuBiwO2Y3OVuhsiOQdzP4VBsp7Fo,11826
|
|
@@ -89,14 +89,14 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
|
|
|
89
89
|
fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
|
|
90
90
|
fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
|
|
91
91
|
fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
|
|
92
|
-
fugue_dask/_io.py,sha256=
|
|
93
|
-
fugue_dask/_utils.py,sha256=
|
|
92
|
+
fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
|
|
93
|
+
fugue_dask/_utils.py,sha256=1uplEqvpCDZDp2YdwJxa6cuGScpgG9VvN3057J02bys,8956
|
|
94
94
|
fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
|
|
95
95
|
fugue_dask/execution_engine.py,sha256=60IiwYRBVhN-pX3v6i9BZ8Pa4bcSh5UoklvCScM_XAM,21361
|
|
96
96
|
fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
|
|
97
97
|
fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
|
|
98
98
|
fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
|
|
99
|
-
fugue_duckdb/_io.py,sha256=
|
|
99
|
+
fugue_duckdb/_io.py,sha256=vnd8m8C6XeMCBJBbAdA5h695NMfsduQrvONyS0HcEFA,8475
|
|
100
100
|
fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
|
|
101
101
|
fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
|
|
102
102
|
fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
|
|
@@ -107,7 +107,7 @@ fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
|
|
|
107
107
|
fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
|
|
108
108
|
fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
|
|
109
109
|
fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
|
|
110
|
-
fugue_ibis/execution_engine.py,sha256=
|
|
110
|
+
fugue_ibis/execution_engine.py,sha256=5I-ou5xPdomVu-srdvidvP8f7wDYbGrCV_lGffZa_ac,18679
|
|
111
111
|
fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
|
|
112
112
|
fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
|
|
113
113
|
fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123
|
|
@@ -119,15 +119,15 @@ fugue_polars/_utils.py,sha256=7rGGWgB1-VqFwh4PcBLYk_5VNjd8FNOS4TDFyDVz2sg,159
|
|
|
119
119
|
fugue_polars/polars_dataframe.py,sha256=8LQ0IB-JFFdjW2ltDzq8DfIbUC_jjjDr1YM29usJag0,8831
|
|
120
120
|
fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
|
|
121
121
|
fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
|
|
122
|
-
fugue_ray/_constants.py,sha256=
|
|
122
|
+
fugue_ray/_constants.py,sha256=RHlaVKyjQnwdbo5mFO_GBtQZcz5GvWcCbkOkLfVTQ1A,565
|
|
123
123
|
fugue_ray/dataframe.py,sha256=7asw2qf9vm6vLBSzqghm9pUcNAppJOz5CkT7XyR0S5g,12514
|
|
124
|
-
fugue_ray/execution_engine.py,sha256=
|
|
124
|
+
fugue_ray/execution_engine.py,sha256=PZlWbmdCwTPfZJhN2I-44JW7so8NVCFFumaKIhJLfoI,12566
|
|
125
125
|
fugue_ray/registry.py,sha256=TS-HWy2IUozp6_A0vqc8_ZdVUT_Z9yVjG6e1gbbgy2A,1757
|
|
126
126
|
fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
|
|
127
127
|
fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
128
|
fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
|
|
129
|
-
fugue_ray/_utils/dataframe.py,sha256=
|
|
130
|
-
fugue_ray/_utils/io.py,sha256=
|
|
129
|
+
fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
|
|
130
|
+
fugue_ray/_utils/io.py,sha256=3hFNDeBuh4bfCud40ZsGrGZLSvCSuxL_1VlqCTnn6RA,9794
|
|
131
131
|
fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
|
|
132
132
|
fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
|
|
133
133
|
fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
|
|
@@ -137,20 +137,20 @@ fugue_spark/tester.py,sha256=VX003yGNlBukaZTQSN-w7XvgSk4rqxrWQIzno0dWrXg,2481
|
|
|
137
137
|
fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
138
138
|
fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
|
|
139
139
|
fugue_spark/_utils/io.py,sha256=OdUezKpB29Lx9aUS2k9x0xUAGZrmgMZyQYGPEeHk7rQ,5574
|
|
140
|
-
fugue_spark/_utils/misc.py,sha256=
|
|
140
|
+
fugue_spark/_utils/misc.py,sha256=9LsbBp6nOEhqXFLr8oWTc3VKzKk-vuVyixlRoquGnEs,858
|
|
141
141
|
fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
|
|
142
142
|
fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
|
|
143
143
|
fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
|
|
144
144
|
fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
|
|
145
145
|
fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
|
|
146
|
-
fugue_test/builtin_suite.py,sha256=
|
|
147
|
-
fugue_test/dataframe_suite.py,sha256=
|
|
148
|
-
fugue_test/execution_suite.py,sha256=
|
|
146
|
+
fugue_test/builtin_suite.py,sha256=cOkZG6w1RHhWWxtjQhZClZQaGT6haNd576BoUmNC_cA,77960
|
|
147
|
+
fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
|
|
148
|
+
fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
|
|
149
149
|
fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
|
|
150
|
-
fugue_version/__init__.py,sha256=
|
|
151
|
-
fugue-0.9.
|
|
152
|
-
fugue-0.9.
|
|
153
|
-
fugue-0.9.
|
|
154
|
-
fugue-0.9.
|
|
155
|
-
fugue-0.9.
|
|
156
|
-
fugue-0.9.
|
|
150
|
+
fugue_version/__init__.py,sha256=UwJXM8JY2T3tE2id0K2k_lEaVThbRTrGO1mNibyzIz8,22
|
|
151
|
+
fugue-0.9.1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
152
|
+
fugue-0.9.1.dist-info/METADATA,sha256=zu44QGPIwk28QyKe9H4Si2ANByy1sJ9cmauNrhCg4bc,18380
|
|
153
|
+
fugue-0.9.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
|
154
|
+
fugue-0.9.1.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
|
|
155
|
+
fugue-0.9.1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
|
|
156
|
+
fugue-0.9.1.dist-info/RECORD,,
|
fugue_dask/_io.py
CHANGED
|
@@ -6,7 +6,7 @@ from fsspec import AbstractFileSystem
|
|
|
6
6
|
from triad.collections.dict import ParamDict
|
|
7
7
|
from triad.collections.schema import Schema
|
|
8
8
|
from triad.utils.assertion import assert_or_throw
|
|
9
|
-
from triad.utils.io import join, makedirs, url_to_fs
|
|
9
|
+
from triad.utils.io import isfile, join, makedirs, url_to_fs
|
|
10
10
|
|
|
11
11
|
from fugue._utils.io import FileParser, _get_single_files
|
|
12
12
|
from fugue_dask.dataframe import DaskDataFrame
|
|
@@ -100,9 +100,11 @@ def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
|
|
|
100
100
|
|
|
101
101
|
|
|
102
102
|
def _safe_load_csv(path: str, **kwargs: Any) -> dd.DataFrame:
|
|
103
|
+
if not isfile(path):
|
|
104
|
+
return dd.read_csv(join(path, "*.csv"), **kwargs)
|
|
103
105
|
try:
|
|
104
106
|
return dd.read_csv(path, **kwargs)
|
|
105
|
-
except (IsADirectoryError, PermissionError):
|
|
107
|
+
except (IsADirectoryError, PermissionError): # pragma: no cover
|
|
106
108
|
return dd.read_csv(join(path, "*.csv"), **kwargs)
|
|
107
109
|
|
|
108
110
|
|
|
@@ -148,11 +150,12 @@ def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
|
|
|
148
150
|
|
|
149
151
|
|
|
150
152
|
def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
|
|
153
|
+
if not isfile(path):
|
|
154
|
+
return dd.read_json(join(path, "*.json"), **kwargs)
|
|
151
155
|
try:
|
|
152
156
|
return dd.read_json(path, **kwargs)
|
|
153
|
-
except (IsADirectoryError, PermissionError):
|
|
154
|
-
|
|
155
|
-
return x
|
|
157
|
+
except (IsADirectoryError, PermissionError): # pragma: no cover
|
|
158
|
+
return dd.read_json(join(path, "*.json"), **kwargs)
|
|
156
159
|
|
|
157
160
|
|
|
158
161
|
def _load_json(
|
fugue_dask/_utils.py
CHANGED
|
@@ -53,7 +53,7 @@ def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFram
|
|
|
53
53
|
if num < 1:
|
|
54
54
|
return df
|
|
55
55
|
if num == 1:
|
|
56
|
-
return df.repartition(1)
|
|
56
|
+
return df.repartition(npartitions=1)
|
|
57
57
|
df = df.reset_index(drop=True).clear_divisions()
|
|
58
58
|
idf, ct = _add_hash_index(df, num, cols)
|
|
59
59
|
return _postprocess(idf, ct, num)
|
|
@@ -76,7 +76,7 @@ def even_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFram
|
|
|
76
76
|
the number of partitions will be the number of groups.
|
|
77
77
|
"""
|
|
78
78
|
if num == 1:
|
|
79
|
-
return df.repartition(1)
|
|
79
|
+
return df.repartition(npartitions=1)
|
|
80
80
|
if len(cols) == 0 and num <= 0:
|
|
81
81
|
return df
|
|
82
82
|
df = df.reset_index(drop=True).clear_divisions()
|
|
@@ -111,7 +111,7 @@ def rand_repartition(
|
|
|
111
111
|
if num < 1:
|
|
112
112
|
return df
|
|
113
113
|
if num == 1:
|
|
114
|
-
return df.repartition(1)
|
|
114
|
+
return df.repartition(npartitions=1)
|
|
115
115
|
df = df.reset_index(drop=True).clear_divisions()
|
|
116
116
|
if len(cols) == 0:
|
|
117
117
|
idf, ct = _add_random_index(df, num=num, seed=seed)
|
|
@@ -124,7 +124,7 @@ def rand_repartition(
|
|
|
124
124
|
def _postprocess(idf: dd.DataFrame, ct: int, num: int) -> dd.DataFrame:
|
|
125
125
|
parts = min(ct, num)
|
|
126
126
|
if parts <= 1:
|
|
127
|
-
return idf.repartition(1)
|
|
127
|
+
return idf.repartition(npartitions=1)
|
|
128
128
|
divisions = list(np.arange(ct, step=math.ceil(ct / parts)))
|
|
129
129
|
divisions.append(ct - 1)
|
|
130
130
|
return idf.repartition(divisions=divisions, force=True)
|
fugue_duckdb/_io.py
CHANGED
fugue_ibis/execution_engine.py
CHANGED
|
@@ -23,8 +23,8 @@ from ._compat import IbisTable
|
|
|
23
23
|
from ._utils import to_ibis_schema
|
|
24
24
|
from .dataframe import IbisDataFrame
|
|
25
25
|
|
|
26
|
-
_JOIN_RIGHT_SUFFIX = "_ibis_y__"
|
|
27
|
-
_GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}" for i in itertools.count())
|
|
26
|
+
_JOIN_RIGHT_SUFFIX = "_ibis_y__".upper()
|
|
27
|
+
_GEN_TABLE_NAMES = (f"_fugue_temp_table_{i:d}".upper() for i in itertools.count())
|
|
28
28
|
|
|
29
29
|
|
|
30
30
|
class IbisSQLEngine(SQLEngine):
|
|
@@ -224,7 +224,7 @@ class IbisSQLEngine(SQLEngine):
|
|
|
224
224
|
_presort = parse_presort_exp(presort)
|
|
225
225
|
else:
|
|
226
226
|
_presort = partition_spec.presort
|
|
227
|
-
tbn = "
|
|
227
|
+
tbn = "_TEMP"
|
|
228
228
|
idf = self.to_df(df)
|
|
229
229
|
|
|
230
230
|
if len(_presort) == 0:
|
|
@@ -233,9 +233,10 @@ class IbisSQLEngine(SQLEngine):
|
|
|
233
233
|
pcols = ", ".join(
|
|
234
234
|
self.encode_column_name(x) for x in partition_spec.partition_by
|
|
235
235
|
)
|
|
236
|
+
dummy_order_by = self._dummy_window_order_by()
|
|
236
237
|
sql = (
|
|
237
238
|
f"SELECT * FROM ("
|
|
238
|
-
f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols}) "
|
|
239
|
+
f"SELECT *, ROW_NUMBER() OVER (PARTITION BY {pcols} {dummy_order_by}) "
|
|
239
240
|
f"AS __fugue_take_param FROM {tbn}"
|
|
240
241
|
f") WHERE __fugue_take_param<={n}"
|
|
241
242
|
)
|
|
@@ -290,6 +291,12 @@ class IbisSQLEngine(SQLEngine):
|
|
|
290
291
|
def load_table(self, table: str, **kwargs: Any) -> DataFrame:
|
|
291
292
|
return self.to_df(self.backend.table(table))
|
|
292
293
|
|
|
294
|
+
def _dummy_window_order_by(self) -> str:
|
|
295
|
+
"""Return a dummy window order by clause, this is required for
|
|
296
|
+
some SQL backends when there is no real order by clause in window
|
|
297
|
+
"""
|
|
298
|
+
return ""
|
|
299
|
+
|
|
293
300
|
|
|
294
301
|
class IbisMapEngine(MapEngine):
|
|
295
302
|
"""IbisExecutionEngine's MapEngine, it is a wrapper of the map engine
|
fugue_ray/_constants.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import Any, Dict
|
|
2
2
|
|
|
3
3
|
import ray
|
|
4
|
+
from packaging import version
|
|
4
5
|
|
|
5
6
|
FUGUE_RAY_CONF_SHUFFLE_PARTITIONS = "fugue.ray.shuffle.partitions"
|
|
6
7
|
FUGUE_RAY_DEFAULT_PARTITIONS = "fugue.ray.default.partitions"
|
|
@@ -12,8 +13,6 @@ FUGUE_RAY_DEFAULT_CONF: Dict[str, Any] = {
|
|
|
12
13
|
FUGUE_RAY_DEFAULT_PARTITIONS: 0,
|
|
13
14
|
FUGUE_RAY_ZERO_COPY: True,
|
|
14
15
|
}
|
|
16
|
+
RAY_VERSION = version.parse(ray.__version__)
|
|
15
17
|
|
|
16
|
-
|
|
17
|
-
_ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
|
|
18
|
-
else: # pragma: no cover
|
|
19
|
-
_ZERO_COPY = {}
|
|
18
|
+
_ZERO_COPY: Dict[str, Any] = {"zero_copy_batch": True}
|
fugue_ray/_utils/dataframe.py
CHANGED
|
@@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
import pyarrow as pa
|
|
6
|
-
import ray
|
|
7
6
|
import ray.data as rd
|
|
8
7
|
from triad import Schema
|
|
9
8
|
|
|
@@ -31,31 +30,21 @@ def get_dataset_format(df: rd.Dataset) -> Tuple[Optional[str], rd.Dataset]:
|
|
|
31
30
|
df = materialize(df)
|
|
32
31
|
if df.count() == 0:
|
|
33
32
|
return None, df
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
else:
|
|
41
|
-
schema = df.schema(fetch_if_missing=True)
|
|
42
|
-
if schema is None: # pragma: no cover
|
|
43
|
-
return None, df
|
|
44
|
-
if isinstance(schema.base_schema, pa.Schema):
|
|
45
|
-
return "arrow", df
|
|
46
|
-
return "pandas", df
|
|
33
|
+
schema = df.schema(fetch_if_missing=True)
|
|
34
|
+
if schema is None: # pragma: no cover
|
|
35
|
+
return None, df
|
|
36
|
+
if isinstance(schema.base_schema, pa.Schema):
|
|
37
|
+
return "arrow", df
|
|
38
|
+
return "pandas", df
|
|
47
39
|
|
|
48
40
|
|
|
49
41
|
def to_schema(schema: Any) -> Schema: # pragma: no cover
|
|
50
42
|
if isinstance(schema, pa.Schema):
|
|
51
43
|
return Schema(schema)
|
|
52
|
-
if
|
|
53
|
-
if isinstance(schema,
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
):
|
|
57
|
-
return Schema(schema.base_schema)
|
|
58
|
-
return Schema(list(zip(schema.names, schema.types)))
|
|
44
|
+
if isinstance(schema, rd.Schema):
|
|
45
|
+
if hasattr(schema, "base_schema") and isinstance(schema.base_schema, pa.Schema):
|
|
46
|
+
return Schema(schema.base_schema)
|
|
47
|
+
return Schema(list(zip(schema.names, schema.types)))
|
|
59
48
|
raise ValueError(f"{schema} is not supported")
|
|
60
49
|
|
|
61
50
|
|
fugue_ray/_utils/io.py
CHANGED
|
@@ -3,15 +3,15 @@ import pathlib
|
|
|
3
3
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Union
|
|
4
4
|
|
|
5
5
|
import pyarrow as pa
|
|
6
|
-
import ray
|
|
7
6
|
import ray.data as rd
|
|
7
|
+
from packaging import version
|
|
8
8
|
from pyarrow import csv as pacsv
|
|
9
9
|
from pyarrow import json as pajson
|
|
10
10
|
from ray.data.datasource import FileExtensionFilter
|
|
11
11
|
from triad.collections import Schema
|
|
12
12
|
from triad.collections.dict import ParamDict
|
|
13
13
|
from triad.utils.assertion import assert_or_throw
|
|
14
|
-
from triad.utils.io import exists, makedirs, rm
|
|
14
|
+
from triad.utils.io import exists, makedirs, rm, isfile
|
|
15
15
|
|
|
16
16
|
from fugue import ExecutionEngine
|
|
17
17
|
from fugue._utils.io import FileParser, save_df
|
|
@@ -19,6 +19,8 @@ from fugue.collections.partition import PartitionSpec
|
|
|
19
19
|
from fugue.dataframe import DataFrame
|
|
20
20
|
from fugue_ray.dataframe import RayDataFrame
|
|
21
21
|
|
|
22
|
+
from .._constants import RAY_VERSION
|
|
23
|
+
|
|
22
24
|
|
|
23
25
|
class RayIO(object):
|
|
24
26
|
def __init__(self, engine: ExecutionEngine):
|
|
@@ -149,6 +151,18 @@ class RayIO(object):
|
|
|
149
151
|
if infer_schema and columns is not None and not isinstance(columns, list):
|
|
150
152
|
raise ValueError("can't set columns as a schema when infer schema is true")
|
|
151
153
|
|
|
154
|
+
if RAY_VERSION >= version.parse("2.10"):
|
|
155
|
+
if len(p) == 1 and isfile(p[0]): # TODO: very hacky
|
|
156
|
+
params: Dict[str, Any] = {}
|
|
157
|
+
else:
|
|
158
|
+
params = {"file_extensions": ["csv"]}
|
|
159
|
+
else: # pragma: no cover
|
|
160
|
+
params = {
|
|
161
|
+
"partition_filter": _FileFiler(
|
|
162
|
+
file_extensions=["csv"], exclude=["_SUCCESS"]
|
|
163
|
+
),
|
|
164
|
+
}
|
|
165
|
+
|
|
152
166
|
def _read_csv(to_str: bool) -> RayDataFrame:
|
|
153
167
|
res = rd.read_csv(
|
|
154
168
|
p,
|
|
@@ -156,9 +170,7 @@ class RayIO(object):
|
|
|
156
170
|
read_options=pacsv.ReadOptions(**read_options),
|
|
157
171
|
parse_options=pacsv.ParseOptions(**parse_options),
|
|
158
172
|
convert_options=pacsv.ConvertOptions(**convert_options),
|
|
159
|
-
|
|
160
|
-
file_extensions=["csv"], exclude=["_SUCCESS"]
|
|
161
|
-
),
|
|
173
|
+
**params,
|
|
162
174
|
)
|
|
163
175
|
if to_str:
|
|
164
176
|
_schema = res.schema(fetch_if_missing=True)
|
|
@@ -196,20 +208,31 @@ class RayIO(object):
|
|
|
196
208
|
read_options: Dict[str, Any] = {"use_threads": False}
|
|
197
209
|
parse_options: Dict[str, Any] = {}
|
|
198
210
|
|
|
199
|
-
def _read_json() -> RayDataFrame:
|
|
200
|
-
if
|
|
201
|
-
|
|
211
|
+
def _read_json() -> RayDataFrame: # pragma: no cover
|
|
212
|
+
if RAY_VERSION >= version.parse("2.10"):
|
|
213
|
+
if len(p) == 1 and isfile(p[0]): # TODO: very hacky
|
|
214
|
+
params: Dict[str, Any] = {"file_extensions": None}
|
|
215
|
+
else:
|
|
216
|
+
params = {"file_extensions": ["json"]}
|
|
217
|
+
elif RAY_VERSION >= version.parse("2.9"): # pragma: no cover
|
|
218
|
+
params = {
|
|
219
|
+
"file_extensions": None,
|
|
220
|
+
"partition_filter": _FileFiler(
|
|
221
|
+
file_extensions=["json"], exclude=["_SUCCESS"]
|
|
222
|
+
),
|
|
223
|
+
}
|
|
202
224
|
else: # pragma: no cover
|
|
203
|
-
params = {
|
|
225
|
+
params = {
|
|
226
|
+
"partition_filter": _FileFiler(
|
|
227
|
+
file_extensions=["json"], exclude=["_SUCCESS"]
|
|
228
|
+
),
|
|
229
|
+
}
|
|
204
230
|
return RayDataFrame(
|
|
205
231
|
rd.read_json(
|
|
206
232
|
p,
|
|
207
233
|
ray_remote_args=self._remote_args(),
|
|
208
234
|
read_options=pajson.ReadOptions(**read_options),
|
|
209
235
|
parse_options=pajson.ParseOptions(**parse_options),
|
|
210
|
-
partition_filter=_FileFiler(
|
|
211
|
-
file_extensions=["json"], exclude=["_SUCCESS"]
|
|
212
|
-
),
|
|
213
236
|
**params,
|
|
214
237
|
)
|
|
215
238
|
)
|
|
@@ -227,7 +250,7 @@ class RayIO(object):
|
|
|
227
250
|
return {"num_cpus": 1}
|
|
228
251
|
|
|
229
252
|
|
|
230
|
-
class _FileFiler(FileExtensionFilter):
|
|
253
|
+
class _FileFiler(FileExtensionFilter): # pragma: no cover
|
|
231
254
|
def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
|
|
232
255
|
super().__init__(file_extensions, allow_if_no_extension=True)
|
|
233
256
|
self._exclude = set(exclude)
|
fugue_ray/execution_engine.py
CHANGED
|
@@ -191,8 +191,7 @@ class RayMapEngine(MapEngine):
|
|
|
191
191
|
mb_args["batch_size"] = self.conf.get_or_throw(
|
|
192
192
|
FUGUE_RAY_DEFAULT_BATCH_SIZE, int
|
|
193
193
|
)
|
|
194
|
-
|
|
195
|
-
mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
|
|
194
|
+
mb_args["zero_copy_batch"] = self.conf.get(FUGUE_RAY_ZERO_COPY, True)
|
|
196
195
|
sdf = rdf.native.map_batches(
|
|
197
196
|
_udf,
|
|
198
197
|
batch_format="pyarrow",
|
fugue_spark/_utils/misc.py
CHANGED
|
@@ -3,7 +3,7 @@ from typing import Any
|
|
|
3
3
|
try:
|
|
4
4
|
from pyspark.sql.connect.session import SparkSession as SparkConnectSession
|
|
5
5
|
from pyspark.sql.connect.dataframe import DataFrame as SparkConnectDataFrame
|
|
6
|
-
except
|
|
6
|
+
except Exception: # pragma: no cover
|
|
7
7
|
SparkConnectSession = None
|
|
8
8
|
SparkConnectDataFrame = None
|
|
9
9
|
import pyspark.sql as ps
|