fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
Files changed (68)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +2 -91
  3. fugue/api.py +1 -0
  4. fugue/collections/partition.py +12 -6
  5. fugue/constants.py +1 -1
  6. fugue/dataframe/__init__.py +1 -7
  7. fugue/dataframe/arrow_dataframe.py +1 -1
  8. fugue/dataframe/function_wrapper.py +2 -3
  9. fugue/dataframe/utils.py +10 -84
  10. fugue/execution/api.py +34 -12
  11. fugue/execution/native_execution_engine.py +33 -19
  12. fugue/extensions/_builtins/creators.py +4 -2
  13. fugue/extensions/_builtins/outputters.py +3 -3
  14. fugue/extensions/_builtins/processors.py +2 -3
  15. fugue/plugins.py +1 -0
  16. fugue/workflow/_checkpoint.py +1 -1
  17. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
  18. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
  19. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
  20. fugue_contrib/viz/_ext.py +7 -1
  21. fugue_dask/_io.py +0 -13
  22. fugue_dask/_utils.py +10 -4
  23. fugue_dask/execution_engine.py +42 -16
  24. fugue_duckdb/_utils.py +7 -2
  25. fugue_duckdb/dask.py +1 -1
  26. fugue_duckdb/dataframe.py +17 -10
  27. fugue_duckdb/execution_engine.py +12 -22
  28. fugue_ibis/dataframe.py +2 -7
  29. fugue_notebook/env.py +5 -10
  30. fugue_polars/_utils.py +0 -40
  31. fugue_polars/polars_dataframe.py +22 -7
  32. fugue_ray/_constants.py +8 -1
  33. fugue_ray/_utils/dataframe.py +31 -4
  34. fugue_ray/_utils/io.py +2 -4
  35. fugue_ray/dataframe.py +13 -4
  36. fugue_ray/execution_engine.py +39 -21
  37. fugue_spark/_utils/convert.py +22 -11
  38. fugue_spark/_utils/io.py +0 -13
  39. fugue_spark/_utils/misc.py +27 -0
  40. fugue_spark/_utils/partition.py +11 -18
  41. fugue_spark/dataframe.py +24 -19
  42. fugue_spark/execution_engine.py +61 -35
  43. fugue_spark/registry.py +15 -3
  44. fugue_test/builtin_suite.py +7 -9
  45. fugue_test/dataframe_suite.py +7 -3
  46. fugue_test/execution_suite.py +100 -122
  47. fugue_version/__init__.py +1 -1
  48. tests/fugue/collections/test_partition.py +6 -3
  49. tests/fugue/dataframe/test_utils.py +2 -43
  50. tests/fugue/execution/test_naive_execution_engine.py +33 -0
  51. tests/fugue/utils/test_io.py +0 -80
  52. tests/fugue_dask/test_execution_engine.py +45 -0
  53. tests/fugue_dask/test_io.py +0 -55
  54. tests/fugue_duckdb/test_dataframe.py +2 -2
  55. tests/fugue_duckdb/test_utils.py +1 -1
  56. tests/fugue_polars/test_api.py +13 -0
  57. tests/fugue_polars/test_transform.py +11 -5
  58. tests/fugue_ray/test_execution_engine.py +32 -1
  59. tests/fugue_spark/test_dataframe.py +0 -8
  60. tests/fugue_spark/test_execution_engine.py +48 -10
  61. tests/fugue_spark/test_importless.py +4 -4
  62. tests/fugue_spark/test_spark_connect.py +82 -0
  63. tests/fugue_spark/utils/test_convert.py +6 -8
  64. tests/fugue_spark/utils/test_io.py +0 -17
  65. fugue_test/_utils.py +0 -13
  66. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
  67. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
  68. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue_spark/execution_engine.py CHANGED
@@ -4,7 +4,6 @@ from uuid import uuid4
 
 import pandas as pd
 import pyarrow as pa
-import pyspark
 import pyspark.sql as ps
 from pyspark import StorageLevel
 from pyspark.rdd import RDD
@@ -25,7 +24,7 @@ from fugue.collections.partition import (
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.constants import KEYWORD_ROWCOUNT
+from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     ArrayDataFrame,
     ArrowDataFrame,
@@ -42,18 +41,13 @@ from fugue.dataframe.arrow_dataframe import _build_empty_arrow
 from fugue.dataframe.utils import get_join_schemas
 from fugue.exceptions import FugueDataFrameInitError
 from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
-from fugue_spark._constants import (
-    FUGUE_SPARK_CONF_USE_PANDAS_UDF,
-    FUGUE_SPARK_DEFAULT_CONF,
-)
-from fugue_spark._utils.convert import to_schema, to_spark_schema, to_type_safe_input
-from fugue_spark._utils.io import SparkIO
-from fugue_spark._utils.partition import (
-    even_repartition,
-    hash_repartition,
-    rand_repartition,
-)
-from fugue_spark.dataframe import SparkDataFrame
+
+from ._constants import FUGUE_SPARK_CONF_USE_PANDAS_UDF, FUGUE_SPARK_DEFAULT_CONF
+from ._utils.convert import to_schema, to_spark_schema, to_type_safe_input
+from ._utils.io import SparkIO
+from ._utils.misc import is_spark_connect as _is_spark_connect, is_spark_dataframe
+from ._utils.partition import even_repartition, hash_repartition, rand_repartition
+from .dataframe import SparkDataFrame
 
 _TO_SPARK_JOIN_MAP: Dict[str, str] = {
     "inner": "inner",
@@ -103,12 +97,15 @@ class SparkMapEngine(MapEngine):
     def is_distributed(self) -> bool:
         return True
 
+    @property
+    def is_spark_connect(self) -> bool:
+        """Whether the spark session is created by spark connect"""
+        return self.execution_engine.is_spark_connect  # type:ignore
+
     def _should_use_pandas_udf(self, schema: Schema) -> bool:
+        if self.is_spark_connect:  # pragma: no cover
+            return True
         possible = hasattr(ps.DataFrame, "mapInPandas")  # must be new version of Spark
-        if pyspark.__version__ < "3":  # pragma: no cover
-            possible &= self.execution_engine.conf.get(
-                "spark.sql.execution.arrow.enabled", False
-            )
         # else: # this condition seems to be unnecessary
         #     possible &= self.execution_engine.conf.get(
         #         "spark.sql.execution.arrow.pyspark.enabled", False
@@ -138,15 +135,25 @@ class SparkMapEngine(MapEngine):
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
             # pandas udf can only be used for pyspark > 3
-            if len(partition_spec.partition_by) > 0 and partition_spec.algo != "even":
-                return self._group_map_by_pandas_udf(
-                    df,
-                    map_func=map_func,
-                    output_schema=output_schema,
-                    partition_spec=partition_spec,
-                    on_init=on_init,
-                    map_func_format_hint=map_func_format_hint,
-                )
+            if len(partition_spec.partition_by) > 0:
+                if partition_spec.algo == "coarse":
+                    return self._map_by_pandas_udf(
+                        df,
+                        map_func=map_func,
+                        output_schema=output_schema,
+                        partition_spec=partition_spec,
+                        on_init=on_init,
+                        map_func_format_hint=map_func_format_hint,
+                    )
+                elif partition_spec.algo != "even" or self.is_spark_connect:
+                    return self._group_map_by_pandas_udf(
+                        df,
+                        map_func=map_func,
+                        output_schema=output_schema,
+                        partition_spec=partition_spec,
+                        on_init=on_init,
+                        map_func_format_hint=map_func_format_hint,
+                    )
             elif len(partition_spec.partition_by) == 0:
                 return self._map_by_pandas_udf(
                     df,
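Note: a minimal usage sketch of the new "coarse" branch, assuming fugue>=0.8.4 and pyspark are installed and that the partition spec accepts algo="coarse" as this release suggests; the data and parameter values are illustrative.

import pandas as pd
import fugue.api as fa

def add_one(df: pd.DataFrame) -> pd.DataFrame:
    # runs once per physical partition; "coarse" hash-repartitions by the keys
    # but does not regroup rows per key before calling this function
    return df.assign(b=df["a"] + 1)

out = fa.transform(
    pd.DataFrame({"a": [1, 2, 3]}),
    add_one,
    schema="a:long,b:long",
    partition={"by": ["a"], "algo": "coarse"},
    engine="spark",
)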
@@ -187,7 +194,7 @@ class SparkMapEngine(MapEngine):
         def _udf_pandas(pdf: Any) -> pd.DataFrame:  # pragma: no cover
             if pdf.shape[0] == 0:
                 return PandasDataFrame([], output_schema).as_pandas()
-            if len(presort_keys) > 0:
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc)
             input_df = PandasDataFrame(
                 pdf.reset_index(drop=True), input_schema, pandas_df_wrapper=True
@@ -239,6 +246,7 @@ class SparkMapEngine(MapEngine):
                 )
                 if not cursor_set:
                     cursor.set(lambda: pdf.peek_array(), 0, 0)
+                    cursor_set = True
                 yield pdf
 
         input_df = IterablePandasDataFrame(get_dfs(), input_schema)
@@ -273,6 +281,7 @@ class SparkMapEngine(MapEngine):
                 pdf = ArrowDataFrame(func(adf))
                 if not cursor_set:
                     cursor.set(lambda: pdf.peek_array(), 0, 0)
+                    cursor_set = True
                 yield pdf
 
         input_df = IterableArrowDataFrame(get_dfs(), input_schema)
@@ -316,7 +325,10 @@ class SparkExecutionEngine(ExecutionEngine):
             spark_session = SparkSession.builder.getOrCreate()
         self._spark_session = spark_session
         cf = dict(FUGUE_SPARK_DEFAULT_CONF)
-        cf.update({x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()})
+        if not self.is_spark_connect:
+            cf.update(
+                {x[0]: x[1] for x in spark_session.sparkContext.getConf().getAll()}
+            )
         cf.update(ParamDict(conf))
         super().__init__(cf)
         self._lock = SerializableRLock()
@@ -343,6 +355,10 @@ class SparkExecutionEngine(ExecutionEngine):
             )
         return self._spark_session
 
+    @property
+    def is_spark_connect(self) -> bool:
+        return _is_spark_connect(self.spark_session)
+
     @property
     def is_distributed(self) -> bool:
         return True
@@ -363,6 +379,11 @@ class SparkExecutionEngine(ExecutionEngine):
 
     def get_current_parallelism(self) -> int:
         spark = self.spark_session
+        if self.is_spark_connect:  # pragma: no cover
+            num = spark.conf.get("spark.default.parallelism", "")
+            if num != "":
+                return int(num)
+            return int(spark.conf.get("spark.sql.shuffle.partitions", "200"))
         e_cores = int(spark.conf.get("spark.executor.cores", "1"))
         tc = int(spark.conf.get("spark.task.cpus", "1"))
         sc = spark._jsc.sc()
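Note: a short usage sketch of the method above, assuming pyspark is installed; on a Spark Connect session the value now comes from the two configs instead of the JVM SparkContext.

from pyspark.sql import SparkSession
from fugue_spark import SparkExecutionEngine

spark = SparkSession.builder.getOrCreate()
engine = SparkExecutionEngine(spark)
# classic sessions: derived from executor cores / task cpus;
# Spark Connect sessions: config fallback shown in the diff above
print(engine.get_current_parallelism())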
@@ -403,10 +424,13 @@ class SparkExecutionEngine(ExecutionEngine):
             return df.count()
 
         df = self._to_spark_df(df)
-        num_funcs = {KEYWORD_ROWCOUNT: lambda: _persist_and_count(df)}
+        num_funcs = {
+            KEYWORD_ROWCOUNT: lambda: _persist_and_count(df),
+            KEYWORD_PARALLELISM: lambda: self.get_current_parallelism(),
+        }
         num = partition_spec.get_num_partitions(**num_funcs)
 
-        if partition_spec.algo == "hash":
+        if partition_spec.algo in ["hash", "coarse"]:
             sdf = hash_repartition(
                 self.spark_session, df.native, num, partition_spec.partition_by
             )
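Note: a hedged sketch of how the symbolic partition count above is resolved; the keyword strings live in fugue.constants, so they are imported rather than hard-coded here, and the numbers are illustrative.

from fugue.collections.partition import PartitionSpec
from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT

# The engine supplies one callable per keyword (as in num_funcs above);
# the spec evaluates its num expression with those values.
spec = PartitionSpec(num=f"{KEYWORD_ROWCOUNT}/4", by=["a"], algo="hash")
n = spec.get_num_partitions(
    **{KEYWORD_ROWCOUNT: lambda: 1000, KEYWORD_PARALLELISM: lambda: 8}
)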
@@ -712,14 +736,16 @@ class SparkExecutionEngine(ExecutionEngine):
         if isinstance(df, SparkDataFrame):
             return df
         if isinstance(df, ArrowDataFrame):
+            raw_df: Any = df.as_pandas()
             sdf = self.spark_session.createDataFrame(
-                df.as_array(), to_spark_schema(df.schema)
+                raw_df, to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
         if isinstance(df, (ArrayDataFrame, IterableDataFrame)):
             adf = ArrowDataFrame(df.as_array(type_safe=False), df.schema)
+            raw_df = adf.as_pandas()
             sdf = self.spark_session.createDataFrame(
-                adf.as_array(), to_spark_schema(df.schema)
+                raw_df, to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
         if any(pa.types.is_struct(t) for t in df.schema.types):
@@ -731,7 +757,7 @@ class SparkExecutionEngine(ExecutionEngine):
                 df.as_pandas(), to_spark_schema(df.schema)
             )
             return SparkDataFrame(sdf, df.schema)
-        if isinstance(df, ps.DataFrame):
+        if is_spark_dataframe(df):
             return SparkDataFrame(df, None if schema is None else to_schema(schema))
         if isinstance(df, RDD):
             assert_arg_not_none(schema, "schema")
@@ -805,7 +831,7 @@ class _Mapper(object):  # pragma: no cover
             return
         if self.on_init is not None:
             self.on_init(no, df)
-        if self.partition_spec.empty:
+        if self.partition_spec.empty or self.partition_spec.algo == "coarse":
            partitions: Iterable[Tuple[int, int, EmptyAwareIterable]] = [
                 (0, 0, df.native)
             ]
fugue_spark/registry.py CHANGED
@@ -18,18 +18,24 @@ from fugue.plugins import as_fugue_dataset, infer_execution_engine, parse_creato
 from fugue_spark.dataframe import SparkDataFrame
 from fugue_spark.execution_engine import SparkExecutionEngine
 
+from ._utils.misc import SparkConnectDataFrame, SparkConnectSession, is_spark_dataframe
+
 _is_sparksql = namespace_candidate("sparksql", lambda x: isinstance(x, str))
 
 
 @infer_execution_engine.candidate(
-    lambda objs: is_pandas_or(objs, (ps.DataFrame, SparkDataFrame))
+    lambda objs: (
+        is_pandas_or(objs, (ps.DataFrame, SparkConnectDataFrame, SparkDataFrame))
+        if SparkConnectDataFrame is not None
+        else is_pandas_or(objs, (ps.DataFrame, SparkDataFrame))
+    )
     or any(_is_sparksql(obj) for obj in objs)
 )
 def _infer_spark_client(obj: Any) -> Any:
     return SparkSession.builder.getOrCreate()
 
 
-@as_fugue_dataset.candidate(lambda df, **kwargs: isinstance(df, ps.DataFrame))
+@as_fugue_dataset.candidate(lambda df, **kwargs: is_spark_dataframe(df))
 def _spark_as_fugue_df(df: ps.DataFrame, **kwargs: Any) -> SparkDataFrame:
     return SparkDataFrame(df, **kwargs)
 
@@ -53,6 +59,12 @@ def _register_engines() -> None:
         lambda session, conf, **kwargs: SparkExecutionEngine(session, conf=conf),
         on_dup="ignore",
     )
+    if SparkConnectSession is not None:
+        register_execution_engine(
+            SparkConnectSession,
+            lambda session, conf, **kwargs: SparkExecutionEngine(session, conf=conf),
+            on_dup="ignore",
+        )
 
 
 @fugue_annotated_param(SparkExecutionEngine)
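Note: a hedged end-to-end sketch of what the Spark Connect registrations above enable, assuming pyspark>=3.4 with Spark Connect available and a reachable endpoint; the sc:// URL below is a placeholder.

import fugue.api as fa
from pyspark.sql import SparkSession

# A Spark Connect session/DataFrame is now recognized just like a classic one
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
sdf = spark.createDataFrame([(0,)], "a long")
fdf = fa.as_fugue_df(sdf)  # wrapped via the is_spark_dataframe candidate
fa.show(fdf)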
@@ -81,7 +93,7 @@ class _SparkDataFrameParam(DataFrameParam):
         return ctx.to_df(df).native
 
     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
-        assert isinstance(output, ps.DataFrame)
+        assert is_spark_dataframe(output)
         assert isinstance(ctx, SparkExecutionEngine)
         return ctx.to_df(output, schema=schema)
 
fugue_test/builtin_suite.py CHANGED
@@ -57,7 +57,6 @@ from fugue.exceptions import (
     FugueWorkflowError,
     FugueWorkflowRuntimeValidationError,
 )
-from fugue_test._utils import _is_spark2
 
 
 class BuiltInTests(object):
@@ -98,7 +97,7 @@ class BuiltInTests(object):
         dag.run(self.engine)
 
     def test_create_df_equivalence(self):
-        ndf = self.engine.to_df(pd.DataFrame([[0]], columns=["a"]))
+        ndf = fa.as_fugue_engine_df(self.engine, pd.DataFrame([[0]], columns=["a"]))
         dag1 = FugueWorkflow()
         dag1.df(ndf).show()
         dag2 = FugueWorkflow()
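Note: a brief sketch of the replacement API exercised by the updated test, assuming fugue.api.as_fugue_engine_df works as shown above; the engine choice here is illustrative.

import pandas as pd
import fugue.api as fa
from fugue import NativeExecutionEngine

engine = NativeExecutionEngine()
# converts a local DataFrame into the engine's native Fugue DataFrame,
# replacing direct calls to engine.to_df in the test suite
ndf = fa.as_fugue_engine_df(engine, pd.DataFrame([[0]], columns=["a"]))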
@@ -1316,12 +1315,13 @@ class BuiltInTests(object):
         assert FileSystem().isdir(os.path.join(path3, "c=2"))
         # TODO: in test below, once issue #288 is fixed, use dag.load
         # instead of pd.read_parquet
+        pdf = pd.read_parquet(path3).sort_values("a").reset_index(drop=True)
+        pdf["c"] = pdf["c"].astype(int)
         pd.testing.assert_frame_equal(
-            pd.read_parquet(path3).sort_values("a").reset_index(drop=True),
-            pd.DataFrame({"c": pd.Categorical([6, 2]), "a": [1, 7]}).reset_index(
-                drop=True
-            ),
+            pdf,
+            pd.DataFrame({"c": [6, 2], "a": [1, 7]}).reset_index(drop=True),
             check_like=True,
+            check_dtype=False,
         )
 
     def test_save_and_use(self):
@@ -1675,9 +1675,7 @@ class BuiltInTests(object):
         assert not isinstance(sdf4, DataFrame)
         assert fa.is_local(sdf4)
 
-    @pytest.mark.skipif(
-        _is_spark2() or os.name == "nt", reason="Skip Spark<3 or Windows"
-    )
+    @pytest.mark.skipif(os.name == "nt", reason="Skip Windows")
     def test_any_column_name(self):
 
         f_parquet = os.path.join(str(self.tmpdir), "a.parquet")
fugue_test/dataframe_suite.py CHANGED
@@ -415,7 +415,7 @@ class DataFrameTests(object):
 
         # str -> date
         df = self.df(
-            [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]],
+            [["1", "2020-01-01"], ["2", "2020-01-02"], ["3", None]],
             "a:str,b:str",
         )
         ndf = fi.alter_columns(df, "b:date,a:int", as_fugue=True)
@@ -428,12 +428,16 @@ class DataFrameTests(object):
 
         # str -> datetime
         df = self.df(
-            [["1", "2020-01-01"], ["2", "2020-01-02 01:02:03"], ["3", None]],
+            [
+                ["1", "2020-01-01 01:02:03"],
+                ["2", "2020-01-02 01:02:03"],
+                ["3", None],
+            ],
             "a:str,b:str",
         )
         ndf = fi.alter_columns(df, "b:datetime,a:int", as_fugue=True)
         assert [
-            [1, datetime(2020, 1, 1)],
+            [1, datetime(2020, 1, 1, 1, 2, 3)],
             [2, datetime(2020, 1, 2, 1, 2, 3)],
             [3, None],
         ] == fi.as_array(ndf, type_safe=True)
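Note: a hedged sketch of the behavior the updated assertions pin down, assuming fugue.api.alter_columns accepts the same schema expression used in the test; the sample data is illustrative.

import pandas as pd
import fugue.api as fa

df = pd.DataFrame(
    {"a": ["1", "2", "3"], "b": ["2020-01-01 01:02:03", "2020-01-02 01:02:03", None]}
)
# "b:datetime" keeps the time component, which is why the expected values
# above now include 01:02:03
res = fa.alter_columns(df, "b:datetime,a:int")
print(fa.as_array(res, type_safe=True))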