fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +2 -91
  3. fugue/api.py +1 -0
  4. fugue/collections/partition.py +12 -6
  5. fugue/constants.py +1 -1
  6. fugue/dataframe/__init__.py +1 -7
  7. fugue/dataframe/arrow_dataframe.py +1 -1
  8. fugue/dataframe/function_wrapper.py +2 -3
  9. fugue/dataframe/utils.py +10 -84
  10. fugue/execution/api.py +34 -12
  11. fugue/execution/native_execution_engine.py +33 -19
  12. fugue/extensions/_builtins/creators.py +4 -2
  13. fugue/extensions/_builtins/outputters.py +3 -3
  14. fugue/extensions/_builtins/processors.py +2 -3
  15. fugue/plugins.py +1 -0
  16. fugue/workflow/_checkpoint.py +1 -1
  17. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
  18. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
  19. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
  20. fugue_contrib/viz/_ext.py +7 -1
  21. fugue_dask/_io.py +0 -13
  22. fugue_dask/_utils.py +10 -4
  23. fugue_dask/execution_engine.py +42 -16
  24. fugue_duckdb/_utils.py +7 -2
  25. fugue_duckdb/dask.py +1 -1
  26. fugue_duckdb/dataframe.py +17 -10
  27. fugue_duckdb/execution_engine.py +12 -22
  28. fugue_ibis/dataframe.py +2 -7
  29. fugue_notebook/env.py +5 -10
  30. fugue_polars/_utils.py +0 -40
  31. fugue_polars/polars_dataframe.py +22 -7
  32. fugue_ray/_constants.py +8 -1
  33. fugue_ray/_utils/dataframe.py +31 -4
  34. fugue_ray/_utils/io.py +2 -4
  35. fugue_ray/dataframe.py +13 -4
  36. fugue_ray/execution_engine.py +39 -21
  37. fugue_spark/_utils/convert.py +22 -11
  38. fugue_spark/_utils/io.py +0 -13
  39. fugue_spark/_utils/misc.py +27 -0
  40. fugue_spark/_utils/partition.py +11 -18
  41. fugue_spark/dataframe.py +24 -19
  42. fugue_spark/execution_engine.py +61 -35
  43. fugue_spark/registry.py +15 -3
  44. fugue_test/builtin_suite.py +7 -9
  45. fugue_test/dataframe_suite.py +7 -3
  46. fugue_test/execution_suite.py +100 -122
  47. fugue_version/__init__.py +1 -1
  48. tests/fugue/collections/test_partition.py +6 -3
  49. tests/fugue/dataframe/test_utils.py +2 -43
  50. tests/fugue/execution/test_naive_execution_engine.py +33 -0
  51. tests/fugue/utils/test_io.py +0 -80
  52. tests/fugue_dask/test_execution_engine.py +45 -0
  53. tests/fugue_dask/test_io.py +0 -55
  54. tests/fugue_duckdb/test_dataframe.py +2 -2
  55. tests/fugue_duckdb/test_utils.py +1 -1
  56. tests/fugue_polars/test_api.py +13 -0
  57. tests/fugue_polars/test_transform.py +11 -5
  58. tests/fugue_ray/test_execution_engine.py +32 -1
  59. tests/fugue_spark/test_dataframe.py +0 -8
  60. tests/fugue_spark/test_execution_engine.py +48 -10
  61. tests/fugue_spark/test_importless.py +4 -4
  62. tests/fugue_spark/test_spark_connect.py +82 -0
  63. tests/fugue_spark/utils/test_convert.py +6 -8
  64. tests/fugue_spark/utils/test_io.py +0 -17
  65. fugue_test/_utils.py +0 -13
  66. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
  67. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
  68. {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue/__init__.py CHANGED
@@ -26,7 +26,6 @@ from fugue.dataframe.dataframe_iterable_dataframe import (
  from fugue.dataframe.dataframes import DataFrames
  from fugue.dataframe.iterable_dataframe import IterableDataFrame
  from fugue.dataframe.pandas_dataframe import PandasDataFrame
- from fugue.dataframe.utils import to_local_bounded_df, to_local_df
  from fugue.dataset import (
      AnyDataset,
      Dataset,
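
Note on the change above: `to_local_df` and `to_local_bounded_df` are no longer exported from the top-level `fugue` package in 0.8.4. A minimal migration sketch (assuming `as_fugue_df` remains a top-level export, as the rest of this diff suggests):

    from fugue import as_fugue_df

    df = as_fugue_df([[0, "a"], [1, "b"]], schema="a:int,b:str")
    local_df = df.as_local()            # replaces to_local_df(...)
    bounded_df = df.as_local_bounded()  # replaces to_local_bounded_df(...)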
fugue/_utils/io.py CHANGED
@@ -5,13 +5,13 @@ from urllib.parse import urlparse
 
  import fs as pfs
  import pandas as pd
- from fs.errors import FileExpected
- from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
  from triad.collections.dict import ParamDict
  from triad.collections.fs import FileSystem
  from triad.collections.schema import Schema
  from triad.utils.assertion import assert_or_throw
 
+ from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
+
 
  class FileParser(object):
      def __init__(self, path: str, format_hint: Optional[str] = None):
@@ -271,111 +271,22 @@ def _load_json(
      return pdf[schema.names], schema
 
 
- def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
-     """Save pandas dataframe as avro.
-     If providing your own schema, the usage of schema argument is preferred
-
-     :param schema: Avro Schema determines dtypes saved
-     """
-     import pandavro as pdx
-
-     kw = ParamDict(kwargs)
-
-     # pandavro defaults
-     schema = None
-     append = False
-     times_as_micros = True
-
-     if "schema" in kw:
-         schema = kw["schema"]
-         del kw["schema"]
-
-     if "append" in kw:
-         append = kw["append"]  # default is overwrite (False) instead of append (True)
-         del kw["append"]
-
-     if "times_as_micros" in kw:
-         times_as_micros = kw["times_as_micros"]
-         del kw["times_as_micros"]
-
-     pdf = df.as_pandas()
-     pdx.to_avro(
-         p.uri, pdf, schema=schema, append=append, times_as_micros=times_as_micros, **kw
-     )
-
-
- def _load_avro(
-     p: FileParser, columns: Any = None, **kwargs: Any
- ) -> Tuple[pd.DataFrame, Any]:
-     path = p.uri
-     try:
-         pdf = _load_single_avro(path, **kwargs)
-     except (IsADirectoryError, PermissionError, FileExpected):
-         fs = FileSystem()
-         pdf = pd.concat(
-             [
-                 _load_single_avro(
-                     pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs
-                 )
-                 for x in fs.opendir(path).glob("*.avro")
-             ]
-         )
-
-     if columns is None:
-         return pdf, None
-     if isinstance(columns, list):  # column names
-         return pdf[columns], None
-
-     schema = Schema(columns)
-
-     # Return created DataFrame
-     return pdf[schema.names], schema
-
-
- def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
-     from fastavro import reader
-
-     kw = ParamDict(kwargs)
-     process_record = None
-     if "process_record" in kw:
-         process_record = kw["process_record"]
-         del kw["process_record"]
-
-     fs = FileSystem()
-     with fs.openbin(path) as fp:
-         # Configure Avro reader
-         avro_reader = reader(fp)
-         # Load records in memory
-         if process_record:
-             records = [process_record(r) for r in avro_reader]
-
-         else:
-             records = list(avro_reader)
-
-         # Populate pandas.DataFrame with records
-         return pd.DataFrame.from_records(records)
-
-
  _FORMAT_MAP: Dict[str, str] = {
      ".csv": "csv",
      ".csv.gz": "csv",
      ".parquet": "parquet",
      ".json": "json",
      ".json.gz": "json",
-     ".avro": "avro",
-     ".avro.gz": "avro",
  }
 
  _FORMAT_LOAD: Dict[str, Callable[..., Tuple[pd.DataFrame, Any]]] = {
      "csv": _load_csv,
      "parquet": _load_parquet,
      "json": _load_json,
-     "avro": _load_avro,
  }
 
  _FORMAT_SAVE: Dict[str, Callable] = {
      "csv": _save_csv,
      "parquet": _save_parquet,
      "json": _save_json,
-     "avro": _save_avro,
  }
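
With the avro handlers deleted, `.avro` paths are no longer recognized by the loaders in this module. If avro support is still needed, one workaround is to read the file yourself and wrap the result, mirroring the removed `_load_single_avro` helper; a sketch (assumes fastavro is installed, and ignores the directory/glob case handled by the removed `_load_avro`):

    import pandas as pd
    from fastavro import reader
    from fugue.dataframe import PandasDataFrame

    def load_avro(path: str) -> PandasDataFrame:
        # read all records into memory, as the removed helper did
        with open(path, "rb") as fp:
            records = list(reader(fp))
        return PandasDataFrame(pd.DataFrame.from_records(records))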
fugue/api.py CHANGED
@@ -34,6 +34,7 @@ from .dataset.api import (
  from .execution.api import (
      aggregate,
      anti_join,
+     as_fugue_engine_df,
      assign,
      broadcast,
      clear_global_engine,
fugue/collections/partition.py CHANGED
@@ -98,7 +98,7 @@ class PartitionSpec(object):
 
      Partition consists for these specs:
 
-     * **algo**: can be one of ``hash`` (default), ``rand`` and ``even``
+     * **algo**: can be one of ``hash`` (default), ``rand``, ``even`` or ``coarse``
      * **num** or **num_partitions**: number of physical partitions, it can be an
        expression or integer numbers, e.g ``(ROWCOUNT+4) / 3``
      * **by** or **partition_by**: keys to partition on
@@ -208,7 +208,9 @@ class PartitionSpec(object):
 
      @property
      def algo(self) -> str:
-         """Get algo of the spec, one of ``hash`` (default), ``rand`` and ``even``"""
+         """Get algo of the spec, one of ``hash`` (default),
+         ``rand`` ``even`` or ``coarse``
+         """
          return self._algo if self._algo != "" else "hash"
 
      @property
@@ -258,11 +260,14 @@ class PartitionSpec(object):
          """Get deterministic unique id of this object"""
          return to_uuid(self.jsondict)
 
-     def get_sorts(self, schema: Schema) -> IndexedOrderedDict[str, bool]:
+     def get_sorts(
+         self, schema: Schema, with_partition_keys: bool = True
+     ) -> IndexedOrderedDict[str, bool]:
          """Get keys for sorting in a partition, it's the combination of partition
          keys plus the presort keys
 
          :param schema: the dataframe schema this partition spec to operate on
+         :param with_partition_keys: whether to include partition keys
          :return: an ordered dictionary of key, order pairs
 
          .. admonition:: Examples
@@ -272,9 +277,10 @@ class PartitionSpec(object):
              >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
          """
          d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
-         for p in self.partition_by:
-             aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
-             d[p] = True
+         if with_partition_keys:
+             for p in self.partition_by:
+                 aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
+                 d[p] = True
          for p, v in self.presort.items():
              aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
              d[p] = v
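
Both changes are observable through the public API; a small sketch of the 0.8.4 behavior (the schema literal is illustrative):

    from triad import Schema
    from fugue.collections.partition import PartitionSpec

    spec = PartitionSpec(algo="coarse", by=["a"], presort="b, c desc")
    schema = Schema("a:int,b:int,c:int")

    # partition keys still lead the sort keys by default...
    assert spec.get_sorts(schema) == {"a": True, "b": True, "c": False}
    # ...and can now be excluded
    assert spec.get_sorts(schema, with_partition_keys=False) == {"b": True, "c": False}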
fugue/constants.py CHANGED
@@ -2,7 +2,7 @@ from typing import Any, Dict
  from triad import ParamDict
 
  KEYWORD_ROWCOUNT = "ROWCOUNT"
- KEYWORD_CORECOUNT = "CORECOUNT"
+ KEYWORD_PARALLELISM = "CONCURRENCY"
 
  FUGUE_ENTRYPOINT = "fugue.plugins"
 
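The rename matters wherever partition expressions reference the old keyword; expressions written against 0.8.2 presumably need `CORECOUNT` replaced with `CONCURRENCY`:

    from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT

    assert KEYWORD_ROWCOUNT == "ROWCOUNT"
    assert KEYWORD_PARALLELISM == "CONCURRENCY"
    # e.g. a PartitionSpec num expression such as "CONCURRENCY * 2"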
fugue/dataframe/__init__.py CHANGED
@@ -18,10 +18,4 @@ from .dataframes import DataFrames
  from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
  from .iterable_dataframe import IterableDataFrame
  from .pandas_dataframe import PandasDataFrame
- from .utils import (
-     get_column_names,
-     normalize_dataframe_column_names,
-     rename,
-     to_local_bounded_df,
-     to_local_df,
- )
+ from .utils import get_column_names, normalize_dataframe_column_names, rename
fugue/dataframe/arrow_dataframe.py CHANGED
@@ -141,7 +141,7 @@ class ArrowDataFrame(LocalBoundedDataFrame):
          return self.native.shape[0]
 
      def as_pandas(self) -> pd.DataFrame:
-         return self.native.to_pandas()
+         return self.native.to_pandas(use_threads=False, date_as_object=False)
 
      def head(
          self, n: int, columns: Optional[List[str]] = None
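
The effect of the two new flags is at the pyarrow level: conversion runs single-threaded and date columns land as datetime64[ns] instead of Python date objects. For reference:

    import datetime
    import pyarrow as pa

    tbl = pa.table({"d": [datetime.date(2020, 1, 1)]})  # inferred as date32
    pdf = tbl.to_pandas(use_threads=False, date_as_object=False)
    assert str(pdf["d"].dtype) == "datetime64[ns]"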
fugue/dataframe/function_wrapper.py CHANGED
@@ -34,7 +34,6 @@ from .dataframe_iterable_dataframe import (
  from .dataframes import DataFrames
  from .iterable_dataframe import IterableDataFrame
  from .pandas_dataframe import PandasDataFrame
- from .utils import to_local_df
 
 
  @function_wrapper(FUGUE_ENTRYPOINT)
@@ -176,7 +175,7 @@ class DataFrameParam(_DataFrameParamBase):
  @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
  class LocalDataFrameParam(DataFrameParam):
      def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
-         return to_local_df(df)
+         return df.as_local()
 
      def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
          assert_or_throw(
@@ -256,7 +255,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
  class _ListDictParam(_LocalNoSchemaDataFrameParam):
      @no_type_check
      def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
-         return list(to_local_df(df).as_dict_iterable())
+         return list(df.as_local().as_dict_iterable())
 
      @no_type_check
      def to_output_df(
fugue/dataframe/utils.py CHANGED
@@ -13,11 +13,9 @@ from triad.exceptions import InvalidOperationError
  from triad.utils.assertion import assert_arg_not_none
  from triad.utils.assertion import assert_or_throw as aot
 
- from .api import get_column_names, normalize_column_names, rename
+ from .api import get_column_names, normalize_column_names, rename, as_fugue_df
  from .array_dataframe import ArrayDataFrame
- from .arrow_dataframe import ArrowDataFrame
- from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
- from .iterable_dataframe import IterableDataFrame
+ from .dataframe import DataFrame, LocalBoundedDataFrame
  from .pandas_dataframe import PandasDataFrame
 
  # For backward compatibility, TODO: remove!
@@ -74,8 +72,11 @@ def _df_eq(
      :param throw: if to throw error if not equal, defaults to False
      :return: if they equal
      """
-     df1 = to_local_bounded_df(df)
-     df2 = to_local_bounded_df(data, schema)
+     df1 = as_fugue_df(df).as_local_bounded()
+     if schema is not None:
+         df2 = as_fugue_df(data, schema=schema).as_local_bounded()
+     else:
+         df2 = as_fugue_df(data).as_local_bounded()
      try:
          assert (
              df1.count() == df2.count()
@@ -99,7 +100,7 @@ def _df_eq(
          d1 = d1.reset_index(drop=True)
          d2 = d2.reset_index(drop=True)
          pd.testing.assert_frame_equal(
-             d1, d2, check_less_precise=digits, check_dtype=False
+             d1, d2, rtol=0, atol=10 ** (-digits), check_dtype=False, check_exact=False
          )
          return True
      except AssertionError:
@@ -108,84 +109,9 @@
          return False
 
 
- def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame:
-     """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`
-
-     :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-         list or iterable of arrays
-     :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-         :class:`~fugue.dataframe.dataframe.DataFrame` type
-     :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-         but you set ``schema``
-     :raises TypeError: if ``df`` is not compatible
-     :return: the dataframe itself if it's
-         :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one
-
-     .. admonition:: Examples
-
-         >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
-         >>> assert to_local_df(a) is a
-         >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-     """
-     assert_arg_not_none(df, "df")
-     if isinstance(df, DataFrame):
-         aot(
-             schema is None,
-             ValueError("schema and metadata must be None when df is a DataFrame"),
-         )
-         return df.as_local()
-     if isinstance(df, pd.DataFrame):
-         return PandasDataFrame(df, schema)
-     if isinstance(df, pa.Table):
-         return ArrowDataFrame(df, schema)
-     if isinstance(df, List):
-         return ArrayDataFrame(df, schema)
-     if isinstance(df, Iterable):
-         return IterableDataFrame(df, schema)
-     raise ValueError(f"{df} cannot convert to a LocalDataFrame")
-
-
- def to_local_bounded_df(df: Any, schema: Any = None) -> LocalBoundedDataFrame:
-     """Convert a data structure to
-     :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
-
-     :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-         list or iterable of arrays
-     :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-         :class:`~fugue.dataframe.dataframe.DataFrame` type
-     :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-         but you set ``schema``
-     :raises TypeError: if ``df`` is not compatible
-     :return: the dataframe itself if it's
-         :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one
-
-     .. admonition:: Examples
-
-         >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
-         >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
-         >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-
-     .. note::
-
-         Compared to :func:`.to_local_df`, this function makes sure the dataframe is also
-         bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will
-         be converted although it's local.
-     """
-     if isinstance(df, DataFrame):
-         aot(
-             schema is None,
-             ValueError("schema and metadata must be None when df is a DataFrame"),
-         )
-         return df.as_local_bounded()
-     df = to_local_df(df, schema)
-     if isinstance(df, LocalBoundedDataFrame):
-         return df
-     raise ValueError(f"{df} cannot convert to a LocalBoundedDataFrame")
-
-
  def pickle_df(df: DataFrame) -> bytes:
      """Pickles a dataframe to bytes array. It firstly converts the dataframe
-     using :func:`.to_local_bounded_df`, and then serialize the underlying data.
+     local bounded, and then serialize the underlying data.
 
      :param df: input DataFrame
      :return: pickled binary data
@@ -195,7 +121,7 @@ def pickle_df(df: DataFrame) -> bytes:
      Be careful to use on large dataframes or non-local, un-materialized dataframes,
      it can be slow. You should always use :func:`.unpickle_df` to deserialize.
      """
-     df = to_local_bounded_df(df)
+     df = df.as_local_bounded()
      o: List[Any] = [df.schema]
      if isinstance(df, PandasDataFrame):
          o.append("p")
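
`pickle_df` now calls the dataframe's own `as_local_bounded` rather than the removed module-level helper; the round trip is unchanged for callers:

    from fugue.dataframe import ArrayDataFrame
    from fugue.dataframe.utils import pickle_df, unpickle_df

    df = ArrayDataFrame([[0, "a"], [1, "b"]], "a:int,b:str")
    data = pickle_df(df)  # converts to local bounded internally
    assert unpickle_df(data).as_array() == [[0, "a"], [1, "b"]]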
fugue/execution/api.py CHANGED
@@ -15,6 +15,7 @@ from .execution_engine import (
      ExecutionEngine,
  )
  from .factory import make_execution_engine, try_get_context_execution_engine
+ from .._utils.registry import fugue_plugin
 
 
  @contextmanager
@@ -120,6 +121,27 @@ def get_current_parallelism() -> int:
      return make_execution_engine().get_current_parallelism()
 
 
+ @fugue_plugin
+ def as_fugue_engine_df(
+     engine: ExecutionEngine, df: AnyDataFrame, schema: Any = None
+ ) -> DataFrame:
+     """Convert a dataframe to a Fugue engine dependent DataFrame.
+     This function is used internally by Fugue. It is not recommended
+     to use
+
+     :param engine: the ExecutionEngine to use, must not be None
+     :param df: a dataframe like object
+     :param schema: the schema of the dataframe, defaults to None
+
+     :return: the engine dependent DataFrame
+     """
+     if schema is None:
+         fdf = as_fugue_df(df)
+     else:
+         fdf = as_fugue_df(df, schema=schema)
+     return engine.to_df(fdf)
+
+
  def run_engine_function(
      func: Callable[[ExecutionEngine], Any],
      engine: AnyExecutionEngine = None,
@@ -549,11 +571,11 @@ def join(
      """
 
      def _join(e: ExecutionEngine):
-         edf1 = e.to_df(df1)
-         edf2 = e.to_df(df2)
+         edf1 = as_fugue_engine_df(e, df1)
+         edf2 = as_fugue_engine_df(e, df2)
          res = e.join(edf1, edf2, how=how, on=on)
          for odf in dfs:
-             res = e.join(res, e.to_df(odf), how=how, on=on)
+             res = e.join(res, as_fugue_engine_df(e, odf), how=how, on=on)
          return res
 
      return run_engine_function(
@@ -837,11 +859,11 @@ def union(
      """
 
      def _union(e: ExecutionEngine):
-         edf1 = e.to_df(df1)
-         edf2 = e.to_df(df2)
+         edf1 = as_fugue_engine_df(e, df1)
+         edf2 = as_fugue_engine_df(e, df2)
          res = e.union(edf1, edf2, distinct=distinct)
          for odf in dfs:
-             res = e.union(res, e.to_df(odf), distinct=distinct)
+             res = e.union(res, as_fugue_engine_df(e, odf), distinct=distinct)
          return res
 
      return run_engine_function(
@@ -885,11 +907,11 @@ def subtract(
      """
 
      def _subtract(e: ExecutionEngine):
-         edf1 = e.to_df(df1)
-         edf2 = e.to_df(df2)
+         edf1 = as_fugue_engine_df(e, df1)
+         edf2 = as_fugue_engine_df(e, df2)
          res = e.subtract(edf1, edf2, distinct=distinct)
          for odf in dfs:
-             res = e.subtract(res, e.to_df(odf), distinct=distinct)
+             res = e.subtract(res, as_fugue_engine_df(e, odf), distinct=distinct)
          return res
 
      return run_engine_function(
@@ -933,11 +955,11 @@ def intersect(
      """
 
      def _intersect(e: ExecutionEngine):
-         edf1 = e.to_df(df1)
-         edf2 = e.to_df(df2)
+         edf1 = as_fugue_engine_df(e, df1)
+         edf2 = as_fugue_engine_df(e, df2)
          res = e.intersect(edf1, edf2, distinct=distinct)
          for odf in dfs:
-             res = e.intersect(res, e.to_df(odf), distinct=distinct)
+             res = e.intersect(res, as_fugue_engine_df(e, odf), distinct=distinct)
          return res
 
      return run_engine_function(
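
Although the docstring marks `as_fugue_engine_df` as internal, it is importable and easy to exercise; a sketch on the native engine (the schema string is what pandas type inference should produce):

    import pandas as pd
    from fugue import NativeExecutionEngine
    from fugue.execution.api import as_fugue_engine_df

    engine = NativeExecutionEngine()
    pdf = pd.DataFrame({"a": [0, 1], "b": ["x", "y"]})
    # equivalent to engine.to_df(as_fugue_df(pdf)), but, being a
    # @fugue_plugin, overridable by backends
    fdf = as_fugue_engine_df(engine, pdf)
    assert fdf.schema == "a:long,b:str"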
fugue/execution/native_execution_engine.py CHANGED
@@ -25,9 +25,9 @@ from fugue.dataframe import (
      LocalDataFrame,
      PandasDataFrame,
      fugue_annotated_param,
-     to_local_bounded_df,
  )
- from fugue.dataframe.utils import get_join_schemas, to_local_df
+ from fugue.dataframe.dataframe import as_fugue_df
+ from fugue.dataframe.utils import get_join_schemas
 
  from .execution_engine import (
      ExecutionEngine,
@@ -83,19 +83,36 @@ class PandasMapEngine(MapEngine):
          on_init: Optional[Callable[[int, DataFrame], Any]] = None,
          map_func_format_hint: Optional[str] = None,
      ) -> DataFrame:
-         if partition_spec.num_partitions != "0":
-             self.log.warning(
-                 "%s doesn't respect num_partitions %s",
-                 self,
-                 partition_spec.num_partitions,
-             )
+         # if partition_spec.num_partitions != "0":
+         #     self.log.warning(
+         #         "%s doesn't respect num_partitions %s",
+         #         self,
+         #         partition_spec.num_partitions,
+         #     )
+         is_coarse = partition_spec.algo == "coarse"
+         presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
+         presort_keys = list(presort.keys())
+         presort_asc = list(presort.values())
+         output_schema = Schema(output_schema)
          cursor = partition_spec.get_cursor(df.schema, 0)
          if on_init is not None:
              on_init(0, df)
-         if len(partition_spec.partition_by) == 0:  # no partition
-             df = to_local_df(df)
-             cursor.set(lambda: df.peek_array(), 0, 0)
-             output_df = map_func(cursor, df)
+         if (
+             len(partition_spec.partition_by) == 0 or partition_spec.algo == "coarse"
+         ):  # no partition
+             if len(partition_spec.presort) > 0:
+                 pdf = (
+                     df.as_pandas()
+                     .sort_values(presort_keys, ascending=presort_asc)
+                     .reset_index(drop=True)
+                 )
+                 input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
+                 cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
+                 output_df = map_func(cursor, input_df)
+             else:
+                 df = df.as_local()
+                 cursor.set(lambda: df.peek_array(), 0, 0)
+                 output_df = map_func(cursor, df)
          if (
              isinstance(output_df, PandasDataFrame)
              and output_df.schema != output_schema
@@ -107,13 +124,9 @@ class PandasMapEngine(MapEngine):
              f"mismatches given {output_schema}",
          )
          return self.to_df(output_df)  # type: ignore
-         presort = partition_spec.presort
-         presort_keys = list(presort.keys())
-         presort_asc = list(presort.values())
-         output_schema = Schema(output_schema)
 
          def _map(pdf: pd.DataFrame) -> pd.DataFrame:
-             if len(presort_keys) > 0:
+             if len(partition_spec.presort) > 0:
                  pdf = pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(
                      drop=True
                  )
@@ -177,7 +190,7 @@ class NativeExecutionEngine(ExecutionEngine):
      def repartition(
          self, df: DataFrame, partition_spec: PartitionSpec
      ) -> DataFrame:  # pragma: no cover
-         self.log.warning("%s doesn't respect repartition", self)
+         # self.log.warning("%s doesn't respect repartition", self)
          return df
 
      def broadcast(self, df: DataFrame) -> DataFrame:
@@ -384,4 +397,5 @@ class _NativeExecutionEngineParam(ExecutionEngineParam):
 
 
  def _to_native_execution_engine_df(df: AnyDataFrame, schema: Any = None) -> DataFrame:
-     return to_local_bounded_df(df, schema)
+     fdf = as_fugue_df(df) if schema is None else as_fugue_df(df, schema=schema)
+     return fdf.as_local_bounded()
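
The new coarse path can be driven end to end through the transform API on the native engine; a hedged sketch (the toy transformer and data are made up):

    import pandas as pd
    import fugue.api as fa

    def top1(df: pd.DataFrame) -> pd.DataFrame:
        return df.head(1)

    pdf = pd.DataFrame({"a": [1, 1, 2], "b": [3, 1, 2]})
    # with algo="coarse" the engine presorts the whole frame once instead
    # of grouping by key before calling the transformer
    res = fa.transform(
        pdf, top1, schema="*",
        partition={"by": "a", "presort": "b", "algo": "coarse"},
    )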
fugue/extensions/_builtins/creators.py CHANGED
@@ -1,10 +1,12 @@
  from typing import Any, Callable, Optional
 
+ from triad import Schema, assert_or_throw, to_uuid
+
  from fugue.collections.yielded import Yielded
  from fugue.dataframe import DataFrame
  from fugue.exceptions import FugueWorkflowCompileError
+ from fugue.execution.api import as_fugue_engine_df
  from fugue.extensions.creator import Creator
- from triad import Schema, assert_or_throw, to_uuid
 
 
  class Load(Creator):
@@ -39,7 +41,7 @@ class CreateData(Creator):
      def create(self) -> DataFrame:
          if isinstance(self._df, Yielded):
              return self.execution_engine.load_yielded(self._df)
-         return self.execution_engine.to_df(self._df, schema=self._schema)
+         return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)
 
      def _df_uid(self):
          if self._data_determiner is not None:
fugue/extensions/_builtins/outputters.py CHANGED
@@ -6,7 +6,7 @@ from triad.utils.convert import to_type
  from fugue.collections.partition import PartitionCursor
  from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
  from fugue.dataframe.array_dataframe import ArrayDataFrame
- from fugue.dataframe.utils import _df_eq, to_local_bounded_df
+ from fugue.dataframe.utils import _df_eq
  from fugue.exceptions import FugueWorkflowError
  from fugue.execution.execution_engine import _generate_comap_empty_dfs
  from fugue.rpc import EmptyRPCHandler, to_rpc_handler
@@ -136,7 +136,7 @@ class _TransformerRunner(object):
      def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
          self.transformer._cursor = cursor  # type: ignore
          try:
-             to_local_bounded_df(self.transformer.transform(df))
+             self.transformer.transform(df).as_local_bounded()
              return ArrayDataFrame([], self.transformer.output_schema)
          except self.ignore_errors:  # type: ignore
              return ArrayDataFrame([], self.transformer.output_schema)
@@ -160,7 +160,7 @@ class _CoTransformerRunner(object):
      def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
          self.transformer._cursor = cursor  # type: ignore
          try:
-             to_local_bounded_df(self.transformer.transform(dfs))
+             self.transformer.transform(dfs).as_local_bounded()
              return ArrayDataFrame([], self.transformer.output_schema)
          except self.ignore_errors:  # type: ignore
              return ArrayDataFrame([], self.transformer.output_schema)
fugue/extensions/_builtins/processors.py CHANGED
@@ -6,7 +6,6 @@ from fugue.dataframe import (
      DataFrame,
      DataFrames,
      LocalDataFrame,
-     to_local_bounded_df,
  )
  from fugue.column import ColumnExpr, SelectColumns as ColumnsSelect
  from fugue.exceptions import FugueWorkflowError
@@ -334,7 +333,7 @@ class _TransformerRunner(object):
              return self.transformer.transform(df)
          else:
              try:
-                 return to_local_bounded_df(self.transformer.transform(df))
+                 return self.transformer.transform(df).as_local_bounded()
              except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                  return ArrayDataFrame([], self.transformer.output_schema)
 
@@ -364,7 +363,7 @@ class _CoTransformerRunner(object):
 
          else:
              try:
-                 return to_local_bounded_df(self.transformer.transform(dfs))
+                 return self.transformer.transform(dfs).as_local_bounded()
              except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                  return ArrayDataFrame([], self.transformer.output_schema)
 
fugue/plugins.py CHANGED
@@ -30,6 +30,7 @@ from fugue.dataset import (
      is_empty,
      is_local,
  )
+ from fugue.execution.api import as_fugue_engine_df
  from fugue.execution.factory import (
      infer_execution_engine,
      parse_execution_engine,
fugue/workflow/_checkpoint.py CHANGED
@@ -166,7 +166,7 @@ class CheckpointPath(object):
 
      def get_table_name(self, obj_id: str, permanent: bool) -> str:
          path = self._path if permanent else self._temp_path
-         return to_uuid(path, obj_id)[:5]
+         return "temp_" + to_uuid(path, obj_id)[:5]
 
      def temp_file_exists(self, path: str) -> bool:
          try:
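
The added prefix ensures the generated name is a valid SQL identifier even when the uuid slice starts with a digit; an illustrative standalone version of the changed method body:

    from triad import to_uuid

    def get_table_name(path: str, obj_id: str) -> str:
        # "temp_1a2b3" is a valid identifier; a bare "1a2b3" may not be
        return "temp_" + to_uuid(path, obj_id)[:5]

    assert get_table_name("/tmp/x", "obj1").startswith("temp_")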