fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_spark/execution_engine.py
CHANGED
@@ -4,7 +4,6 @@ from uuid import uuid4
 
 import pandas as pd
 import pyarrow as pa
-import pyspark
 import pyspark.sql as ps
 from pyspark import StorageLevel
 from pyspark.rdd import RDD
@@ -25,7 +24,7 @@ from fugue.collections.partition import (
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.constants import KEYWORD_ROWCOUNT
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     ArrayDataFrame,
     ArrowDataFrame,
@@ -42,18 +41,13 @@ from fugue.dataframe.arrow_dataframe import _build_empty_arrow
 from fugue.dataframe.utils import get_join_schemas
 from fugue.exceptions import FugueDataFrameInitError
 from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
-…
-…
-…
-…
-from …
-from …
-from …
-    even_repartition,
-    hash_repartition,
-    rand_repartition,
-)
-from fugue_spark.dataframe import SparkDataFrame
+
+from ._constants import FUGUE_SPARK_CONF_USE_PANDAS_UDF, FUGUE_SPARK_DEFAULT_CONF
+from ._utils.convert import to_schema, to_spark_schema, to_type_safe_input
+from ._utils.io import SparkIO
+from ._utils.misc import is_spark_connect as _is_spark_connect, is_spark_dataframe
+from ._utils.partition import even_repartition, hash_repartition, rand_repartition
+from .dataframe import SparkDataFrame
 
 _TO_SPARK_JOIN_MAP: Dict[str, str] = {
     "inner": "inner",
@@ -103,12 +97,15 @@ class SparkMapEngine(MapEngine):
     def is_distributed(self) -> bool:
         return True
 
+    @property
+    def is_spark_connect(self) -> bool:
+        """Whether the spark session is created by spark connect"""
+        return self.execution_engine.is_spark_connect  # type:ignore
+
     def _should_use_pandas_udf(self, schema: Schema) -> bool:
+        if self.is_spark_connect:  # pragma: no cover
+            return True
         possible = hasattr(ps.DataFrame, "mapInPandas")  # must be new version of Spark
-        if pyspark.__version__ < "3":  # pragma: no cover
-            possible &= self.execution_engine.conf.get(
-                "spark.sql.execution.arrow.enabled", False
-            )
         # else: # this condition seems to be unnecessary
         #     possible &= self.execution_engine.conf.get(
         #         "spark.sql.execution.arrow.pyspark.enabled", False
@@ -138,15 +135,25 @@ class SparkMapEngine(MapEngine):
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
             # pandas udf can only be used for pyspark > 3
-            if len(partition_spec.partition_by) > 0 …
-…
-…
-…
-…
-…
-…
-…
-…
+            if len(partition_spec.partition_by) > 0:
+                if partition_spec.algo == "coarse":
+                    return self._map_by_pandas_udf(
+                        df,
+                        map_func=map_func,
+                        output_schema=output_schema,
+                        partition_spec=partition_spec,
+                        on_init=on_init,
+                        map_func_format_hint=map_func_format_hint,
+                    )
+                elif partition_spec.algo != "even" or self.is_spark_connect:
+                    return self._group_map_by_pandas_udf(
+                        df,
+                        map_func=map_func,
+                        output_schema=output_schema,
+                        partition_spec=partition_spec,
+                        on_init=on_init,
+                        map_func_format_hint=map_func_format_hint,
+                    )
             elif len(partition_spec.partition_by) == 0:
                 return self._map_by_pandas_udf(
                     df,
@@ -187,7 +194,7 @@ class SparkMapEngine(MapEngine):
         def _udf_pandas(pdf: Any) -> pd.DataFrame:  # pragma: no cover
             if pdf.shape[0] == 0:
                 return PandasDataFrame([], output_schema).as_pandas()
-            if len(…
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
             input_df = PandasDataFrame(
                 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
@@ -239,6 +246,7 @@ class SparkMapEngine(MapEngine):
             )
             if not cursor_set:
                 cursor.set(lambda: pdf.peek_array(), 0, 0)
+                cursor_set = True
             yield pdf
 
         input_df = IterablePandasDataFrame(get_dfs(), input_schema)
@@ -273,6 +281,7 @@ class SparkMapEngine(MapEngine):
             pdf = ArrowDataFrame(func(adf))
             if not cursor_set:
                 cursor.set(lambda: pdf.peek_array(), 0, 0)
+                cursor_set = True
             yield pdf
 
         input_df = IterableArrowDataFrame(get_dfs(), input_schema)
@@ -316,7 +325,10 @@ class SparkExecutionEngine(ExecutionEngine):
             spark_session = SparkSession.builder.getOrCreate()
         self._spark_session = spark_session
         cf = dict(FUGUE_SPARK_DEFAULT_CONF)
-        …
+        if not self.is_spark_connect:
+            cf.update(
+                {x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()}
+            )
         cf.update(ParamDict(conf))
         super().__init__(cf)
         self._lock = SerializableRLock()
@@ -343,6 +355,10 @@ class SparkExecutionEngine(ExecutionEngine):
         )
         return self._spark_session
 
+    @property
+    def is_spark_connect(self) -> bool:
+        return _is_spark_connect(self.spark_session)
+
     @property
     def is_distributed(self) -> bool:
         return True
@@ -363,6 +379,11 @@ class SparkExecutionEngine(ExecutionEngine):
 
     def get_current_parallelism(self) -> int:
         spark = self.spark_session
+        if self.is_spark_connect:  # pragma: no cover
+            num = spark.conf.get("spark.default.parallelism", "")
+            if num != "":
+                return int(num)
+            return int(spark.conf.get("spark.sql.shuffle.partitions", "200"))
         e_cores = int(spark.conf.get("spark.executor.cores", "1"))
         tc = int(spark.conf.get("spark.task.cpus", "1"))
         sc = spark._jsc.sc()
@@ -403,10 +424,13 @@ class SparkExecutionEngine(ExecutionEngine):
             return df.count()
 
         df = self._to_spark_df(df)
-        num_funcs = {…
+        num_funcs = {
+            KEYWORD_ROWCOUNT: lambda: _persist_and_count(df),
+            KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
+        }
         num = partition_spec.get_num_partitions(**num_funcs)
 
-        if partition_spec.algo …
+        if partition_spec.algo in ["hash", "coarse"]:
             sdf = hash_repartition(
                 self.spark_session, df.native, num, partition_spec.partition_by
             )
@@ -712,14 +736,16 @@ class SparkExecutionEngine(ExecutionEngine):
         if isinstance(df, SparkDataFrame):
             return df
         if isinstance(df, ArrowDataFrame):
+            raw_df: Any = df.as_pandas()
             sdf = self.spark_session.createDataFrame(
-                …
+                raw_df, to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
         if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
             adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
+            raw_df = adf.as_pandas()
             sdf = self.spark_session.createDataFrame(
-                …
+                raw_df, to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
         if any(pa.types.is_struct(t) for t in df.schema.types):
@@ -731,7 +757,7 @@ class SparkExecutionEngine(ExecutionEngine):
                 df.as_pandas(), to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
-        if …
+        if is_spark_dataframe(df):
             return SparkDataFrame(df, None if schema is None else to_schema(schema))
         if isinstance(df, RDD):
             assert_arg_not_none(schema, "schema")
@@ -805,7 +831,7 @@ class _Mapper(object):  # pragma: no cover
             return
         if self.on_init is not None:
            self.on_init(no, df)
-        if self.partition_spec.empty:
+        if self.partition_spec.empty or self.partition_spec.algo == "coarse":
            partitions: Iterable[Tuple[int, int, EmptyAwareIterable]] = [
                (0, 0, df.native)
            ]
fugue_spark/registry.py
CHANGED
@@ -18,18 +18,24 @@ from fugue.plugins import as_fugue_dataset, infer_execution_engine, parse_creator
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
 
+from ._utils.misc import SparkConnectDataFrame, SparkConnectSession, is_spark_dataframe
+
 _is_sparksql = namespace_candidate("sparksql", lambda x: isinstance(x, str))
 
 
 @infer_execution_engine.candidate(
-    lambda objs: …
+    lambda objs: (
+        is_pandas_or(objs, (ps.DataFrame, SparkConnectDataFrame, SparkDataFrame))
+        if SparkConnectDataFrame is not None
+        else is_pandas_or(objs, (ps.DataFrame, SparkDataFrame))
+    )
     or any(_is_sparksql(obj) for obj in objs)
 )
 def _infer_spark_client(obj: Any) -> Any:
     return SparkSession.builder.getOrCreate()
 
 
-@as_fugue_dataset.candidate(lambda df, **kwargs: …
+@as_fugue_dataset.candidate(lambda df, **kwargs: is_spark_dataframe(df))
 def _spark_as_fugue_df(df: ps.DataFrame, **kwargs: Any) -> SparkDataFrame:
     return SparkDataFrame(df, **kwargs)
 
@@ -53,6 +59,12 @@ def _register_engines() -> None:
         lambda session, conf, **kwargs: SparkExecutionEngine(session, conf=conf),
         on_dup="ignore",
     )
+    if SparkConnectSession is not None:
+        register_execution_engine(
+            SparkConnectSession,
+            lambda session, conf, **kwargs: SparkExecutionEngine(session, conf=conf),
+            on_dup="ignore",
+        )
 
 
 @fugue_annotated_param(SparkExecutionEngine)
@@ -81,7 +93,7 @@ class _SparkDataFrameParam(DataFrameParam):
         return ctx.to_df(df).native
 
     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
-        assert …
+        assert is_spark_dataframe(output)
         assert isinstance(ctx, SparkExecutionEngine)
         return ctx.to_df(output, schema=schema)
 
fugue_test/builtin_suite.py
CHANGED
@@ -57,7 +57,6 @@ from fugue.exceptions import (
     FugueWorkflowError,
     FugueWorkflowRuntimeValidationError,
 )
-from fugue_test._utils import _is_spark2
 
 
 class BuiltInTests(object):
@@ -98,7 +97,7 @@ class BuiltInTests(object):
             dag.run(self.engine)
 
     def test_create_df_equivalence(self):
-        ndf = self.engine…
+        ndf = fa.as_fugue_engine_df(self.engine, pd.DataFrame([[0]], columns=["a"]))
         dag1 = FugueWorkflow()
         dag1.df(ndf).show()
         dag2 = FugueWorkflow()
@@ -1316,12 +1315,13 @@ class BuiltInTests(object):
         assert FileSystem().isdir(os.path.join(path3, "c=2"))
         # TODO: in test below, once issue #288 is fixed, use dag.load
         # instead of pd.read_parquet
+        pdf = pd.read_parquet(path3).sort_values("a").reset_index(drop=True)
+        pdf["c"] = pdf["c"].astype(int)
         pd.testing.assert_frame_equal(
-            …
-            pd.DataFrame({"c": …
-                drop=True…
-            ),
+            pdf,
+            pd.DataFrame({"c": [6, 2], "a": [1, 7]}).reset_index(drop=True),
             check_like=True,
+            check_dtype=False,
         )
 
     def test_save_and_use(self):
@@ -1675,9 +1675,7 @@ class BuiltInTests(object):
         assert not isinstance(sdf4, DataFrame)
         assert fa.is_local(sdf4)
 
-    @pytest.mark.skipif(
-        _is_spark2() or os.name == "nt", reason="Skip Spark<3 or Windows"
-    )
+    @pytest.mark.skipif(os.name == "nt", reason="Skip Windows")
     def test_any_column_name(self):
 
         f_parquet = os.path.join(str(self.tmpdir), "a.parquet")
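
The save-and-load assertion above now normalizes the partition column before comparing: a column used as a parquet partition key round-trips through directory names such as c=2 and is read back by pandas/pyarrow as a categorical rather than an integer, hence the astype(int) and check_dtype=False. A standalone illustration with plain pandas (path and values are made up):

import tempfile

import pandas as pd

with tempfile.TemporaryDirectory() as path:
    pd.DataFrame({"c": [6, 2], "a": [1, 7]}).to_parquet(path, partition_cols=["c"])
    back = pd.read_parquet(path).sort_values("a").reset_index(drop=True)
    print(back.dtypes)  # "c" typically comes back as a categorical, not int64
    back["c"] = back["c"].astype(int)  # the suite applies the same normalization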
fugue_test/dataframe_suite.py
CHANGED
@@ -415,7 +415,7 @@ class DataFrameTests(object):
 
         # str -> date
         df = self.df(
-            [["1", "2020-01-01"], ["2", "2020-01-02…
+            [["1", "2020-01-01"], ["2", "2020-01-02"], ["3", None]],
             "a:str,b:str",
         )
         ndf = fi.alter_columns(df, "b:date,a:int", as_fugue=True)
@@ -428,12 +428,16 @@ class DataFrameTests(object):
 
         # str -> datetime
         df = self.df(
-            […
+            [
+                ["1", "2020-01-01 01:02:03"],
+                ["2", "2020-01-02 01:02:03"],
+                ["3", None],
+            ],
             "a:str,b:str",
         )
         ndf = fi.alter_columns(df, "b:datetime,a:int", as_fugue=True)
         assert [
-            [1, datetime(2020, 1, 1)],
+            [1, datetime(2020, 1, 1, 1, 2, 3)],
             [2, datetime(2020, 1, 2, 1, 2, 3)],
             [3, None],
         ] == fi.as_array(ndf, type_safe=True)