fugue 0.8.2.dev4__py3-none-any.whl → 0.8.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +2 -91
- fugue/api.py +1 -0
- fugue/collections/partition.py +12 -6
- fugue/constants.py +1 -1
- fugue/dataframe/__init__.py +1 -7
- fugue/dataframe/arrow_dataframe.py +1 -1
- fugue/dataframe/function_wrapper.py +2 -3
- fugue/dataframe/utils.py +10 -84
- fugue/execution/api.py +34 -12
- fugue/execution/native_execution_engine.py +33 -19
- fugue/extensions/_builtins/creators.py +4 -2
- fugue/extensions/_builtins/outputters.py +3 -3
- fugue/extensions/_builtins/processors.py +2 -3
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +1 -1
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/METADATA +20 -10
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/RECORD +67 -65
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/entry_points.txt +2 -2
- fugue_contrib/viz/_ext.py +7 -1
- fugue_dask/_io.py +0 -13
- fugue_dask/_utils.py +10 -4
- fugue_dask/execution_engine.py +42 -16
- fugue_duckdb/_utils.py +7 -2
- fugue_duckdb/dask.py +1 -1
- fugue_duckdb/dataframe.py +17 -10
- fugue_duckdb/execution_engine.py +12 -22
- fugue_ibis/dataframe.py +2 -7
- fugue_notebook/env.py +5 -10
- fugue_polars/_utils.py +0 -40
- fugue_polars/polars_dataframe.py +22 -7
- fugue_ray/_constants.py +8 -1
- fugue_ray/_utils/dataframe.py +31 -4
- fugue_ray/_utils/io.py +2 -4
- fugue_ray/dataframe.py +13 -4
- fugue_ray/execution_engine.py +39 -21
- fugue_spark/_utils/convert.py +22 -11
- fugue_spark/_utils/io.py +0 -13
- fugue_spark/_utils/misc.py +27 -0
- fugue_spark/_utils/partition.py +11 -18
- fugue_spark/dataframe.py +24 -19
- fugue_spark/execution_engine.py +61 -35
- fugue_spark/registry.py +15 -3
- fugue_test/builtin_suite.py +7 -9
- fugue_test/dataframe_suite.py +7 -3
- fugue_test/execution_suite.py +100 -122
- fugue_version/__init__.py +1 -1
- tests/fugue/collections/test_partition.py +6 -3
- tests/fugue/dataframe/test_utils.py +2 -43
- tests/fugue/execution/test_naive_execution_engine.py +33 -0
- tests/fugue/utils/test_io.py +0 -80
- tests/fugue_dask/test_execution_engine.py +45 -0
- tests/fugue_dask/test_io.py +0 -55
- tests/fugue_duckdb/test_dataframe.py +2 -2
- tests/fugue_duckdb/test_utils.py +1 -1
- tests/fugue_polars/test_api.py +13 -0
- tests/fugue_polars/test_transform.py +11 -5
- tests/fugue_ray/test_execution_engine.py +32 -1
- tests/fugue_spark/test_dataframe.py +0 -8
- tests/fugue_spark/test_execution_engine.py +48 -10
- tests/fugue_spark/test_importless.py +4 -4
- tests/fugue_spark/test_spark_connect.py +82 -0
- tests/fugue_spark/utils/test_convert.py +6 -8
- tests/fugue_spark/utils/test_io.py +0 -17
- fugue_test/_utils.py +0 -13
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/LICENSE +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/WHEEL +0 -0
- {fugue-0.8.2.dev4.dist-info → fugue-0.8.4.dist-info}/top_level.txt +0 -0
fugue/__init__.py
CHANGED
@@ -26,7 +26,6 @@ from fugue.dataframe.dataframe_iterable_dataframe import (
 from fugue.dataframe.dataframes import DataFrames
 from fugue.dataframe.iterable_dataframe import IterableDataFrame
 from fugue.dataframe.pandas_dataframe import PandasDataFrame
-from fugue.dataframe.utils import to_local_bounded_df, to_local_df
 from fugue.dataset import (
     AnyDataset,
     Dataset,
fugue/_utils/io.py
CHANGED
@@ -5,13 +5,13 @@ from urllib.parse import urlparse

 import fs as pfs
 import pandas as pd
-from fs.errors import FileExpected
-from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
 from triad.collections.dict import ParamDict
 from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw

+from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
+

 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
@@ -271,111 +271,22 @@ def _load_json(
     return pdf[schema.names], schema


-def _save_avro(df: LocalDataFrame, p: FileParser, **kwargs: Any):
-    """Save pandas dataframe as avro.
-    If providing your own schema, the usage of schema argument is preferred
-
-    :param schema: Avro Schema determines dtypes saved
-    """
-    import pandavro as pdx
-
-    kw = ParamDict(kwargs)
-
-    # pandavro defaults
-    schema = None
-    append = False
-    times_as_micros = True
-
-    if "schema" in kw:
-        schema = kw["schema"]
-        del kw["schema"]
-
-    if "append" in kw:
-        append = kw["append"]  # default is overwrite (False) instead of append (True)
-        del kw["append"]
-
-    if "times_as_micros" in kw:
-        times_as_micros = kw["times_as_micros"]
-        del kw["times_as_micros"]
-
-    pdf = df.as_pandas()
-    pdx.to_avro(
-        p.uri, pdf, schema=schema, append=append, times_as_micros=times_as_micros, **kw
-    )
-
-
-def _load_avro(
-    p: FileParser, columns: Any = None, **kwargs: Any
-) -> Tuple[pd.DataFrame, Any]:
-    path = p.uri
-    try:
-        pdf = _load_single_avro(path, **kwargs)
-    except (IsADirectoryError, PermissionError, FileExpected):
-        fs = FileSystem()
-        pdf = pd.concat(
-            [
-                _load_single_avro(
-                    pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs
-                )
-                for x in fs.opendir(path).glob("*.avro")
-            ]
-        )
-
-    if columns is None:
-        return pdf, None
-    if isinstance(columns, list):  # column names
-        return pdf[columns], None
-
-    schema = Schema(columns)
-
-    # Return created DataFrame
-    return pdf[schema.names], schema
-
-
-def _load_single_avro(path: str, **kwargs: Any) -> pd.DataFrame:
-    from fastavro import reader
-
-    kw = ParamDict(kwargs)
-    process_record = None
-    if "process_record" in kw:
-        process_record = kw["process_record"]
-        del kw["process_record"]
-
-    fs = FileSystem()
-    with fs.openbin(path) as fp:
-        # Configure Avro reader
-        avro_reader = reader(fp)
-        # Load records in memory
-        if process_record:
-            records = [process_record(r) for r in avro_reader]
-
-        else:
-            records = list(avro_reader)
-
-    # Populate pandas.DataFrame with records
-    return pd.DataFrame.from_records(records)
-
-
 _FORMAT_MAP: Dict[str, str] = {
     ".csv": "csv",
     ".csv.gz": "csv",
     ".parquet": "parquet",
     ".json": "json",
     ".json.gz": "json",
-    ".avro": "avro",
-    ".avro.gz": "avro",
 }

 _FORMAT_LOAD: Dict[str, Callable[..., Tuple[pd.DataFrame, Any]]] = {
     "csv": _load_csv,
     "parquet": _load_parquet,
     "json": _load_json,
-    "avro": _load_avro,
 }

 _FORMAT_SAVE: Dict[str, Callable] = {
     "csv": _save_csv,
     "parquet": _save_parquet,
     "json": _save_json,
-    "avro": _save_avro,
 }
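Note: avro load/save support is removed from the built-in io registry in 0.8.4. A hedged migration sketch, assuming pandavro (the library the removed code wrapped) is still installed, is to load into pandas outside of Fugue and hand the result over; the file path below is illustrative:

    import pandavro as pdx
    import fugue.api as fa

    pdf = pdx.from_avro("/tmp/data.avro")  # plain pandas DataFrame
    fa.show(pdf)  # any Fugue API accepting a dataframe-like object works from here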
fugue/api.py
CHANGED
fugue/collections/partition.py
CHANGED
@@ -98,7 +98,7 @@ class PartitionSpec(object):

     Partition consists for these specs:

-    * **algo**: can be one of ``hash`` (default), ``rand``
+    * **algo**: can be one of ``hash`` (default), ``rand``, ``even`` or ``coarse``
     * **num** or **num_partitions**: number of physical partitions, it can be an
       expression or integer numbers, e.g ``(ROWCOUNT+4) / 3``
     * **by** or **partition_by**: keys to partition on
@@ -208,7 +208,9 @@ class PartitionSpec(object):

     @property
     def algo(self) -> str:
-        """Get algo of the spec, one of ``hash`` (default), ``rand``"""
+        """Get algo of the spec, one of ``hash`` (default),
+        ``rand`` ``even`` or ``coarse``
+        """
         return self._algo if self._algo != "" else "hash"

     @property
@@ -258,11 +260,14 @@ class PartitionSpec(object):
         """Get deterministic unique id of this object"""
         return to_uuid(self.jsondict)

-    def get_sorts(self, schema: Schema) -> IndexedOrderedDict[str, bool]:
+    def get_sorts(
+        self, schema: Schema, with_partition_keys: bool = True
+    ) -> IndexedOrderedDict[str, bool]:
         """Get keys for sorting in a partition, it's the combination of partition
         keys plus the presort keys

         :param schema: the dataframe schema this partition spec to operate on
+        :param with_partition_keys: whether to include partition keys
         :return: an ordered dictionary of key, order pairs

         .. admonition:: Examples
@@ -272,9 +277,10 @@ class PartitionSpec(object):
         >>> assert p.get_sorts(schema) == {"a":True, "b":True, "c": False}
         """
         d: IndexedOrderedDict[str, bool] = IndexedOrderedDict()
-        for p in self.partition_by:
-            aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
-            d[p] = True
+        if with_partition_keys:
+            for p in self.partition_by:
+                aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
+                d[p] = True
         for p, v in self.presort.items():
             aot(p in schema, lambda: KeyError(f"{p} not in {schema}"))
             d[p] = v
fugue/constants.py
CHANGED
fugue/dataframe/__init__.py
CHANGED
@@ -18,10 +18,4 @@ from .dataframes import DataFrames
 from .function_wrapper import DataFrameFunctionWrapper, fugue_annotated_param
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import (
-    get_column_names,
-    normalize_dataframe_column_names,
-    rename,
-    to_local_bounded_df,
-    to_local_df,
-)
+from .utils import get_column_names, normalize_dataframe_column_names, rename
fugue/dataframe/arrow_dataframe.py
CHANGED
@@ -141,7 +141,7 @@ class ArrowDataFrame(LocalBoundedDataFrame):
         return self.native.shape[0]

     def as_pandas(self) -> pd.DataFrame:
-        return self.native.to_pandas()
+        return self.native.to_pandas(use_threads=False, date_as_object=False)

     def head(
         self, n: int, columns: Optional[List[str]] = None
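Note on the as_pandas change: use_threads=False makes the pyarrow conversion single-threaded (deterministic ordering), and date_as_object=False keeps date columns as datetime64 instead of Python date objects. A quick sketch (the schema string is illustrative):

    import pyarrow as pa
    from fugue.dataframe.arrow_dataframe import ArrowDataFrame

    adf = ArrowDataFrame(pa.table({"a": [1, 2]}), "a:long")
    pdf = adf.as_pandas()  # now to_pandas(use_threads=False, date_as_object=False)
    assert list(pdf["a"]) == [1, 2]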
fugue/dataframe/function_wrapper.py
CHANGED
@@ -34,7 +34,6 @@ from .dataframe_iterable_dataframe import (
 from .dataframes import DataFrames
 from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
-from .utils import to_local_df


 @function_wrapper(FUGUE_ENTRYPOINT)
@@ -176,7 +175,7 @@ class DataFrameParam(_DataFrameParamBase):
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
-        return to_local_df(df)
+        return df.as_local()

     def to_output_df(self, output: LocalDataFrame, schema: Any, ctx: Any) -> DataFrame:
         assert_or_throw(
@@ -256,7 +255,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
 class _ListDictParam(_LocalNoSchemaDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
-        return list(to_local_df(df).as_dict_iterable())
+        return list(df.as_local().as_dict_iterable())

     @no_type_check
     def to_output_df(
fugue/dataframe/utils.py
CHANGED
@@ -13,11 +13,9 @@ from triad.exceptions import InvalidOperationError
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.assertion import assert_or_throw as aot

-from .api import get_column_names, normalize_column_names, rename
+from .api import get_column_names, normalize_column_names, rename, as_fugue_df
 from .array_dataframe import ArrayDataFrame
-from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
-from .iterable_dataframe import IterableDataFrame
+from .dataframe import DataFrame, LocalBoundedDataFrame
 from .pandas_dataframe import PandasDataFrame

 # For backward compatibility, TODO: remove!
@@ -74,8 +72,11 @@ def _df_eq(
     :param throw: if to throw error if not equal, defaults to False
     :return: if they equal
     """
-    df1 = to_local_bounded_df(df)
-    df2 = to_local_bounded_df(data, schema)
+    df1 = as_fugue_df(df).as_local_bounded()
+    if schema is not None:
+        df2 = as_fugue_df(data, schema=schema).as_local_bounded()
+    else:
+        df2 = as_fugue_df(data).as_local_bounded()
     try:
         assert (
             df1.count() == df2.count()
@@ -99,7 +100,7 @@ def _df_eq(
             d1 = d1.reset_index(drop=True)
             d2 = d2.reset_index(drop=True)
             pd.testing.assert_frame_equal(
-                d1, d2, check_less_precise=digits, check_dtype=False
+                d1, d2, rtol=0, atol=10 ** (-digits), check_dtype=False, check_exact=False
             )
             return True
         except AssertionError:
@@ -108,84 +109,9 @@ def _df_eq(
         return False


-def to_local_df(df: Any, schema: Any = None) -> LocalDataFrame:
-    """Convert a data structure to :class:`~fugue.dataframe.dataframe.LocalDataFrame`
-
-    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-        list or iterable of arrays
-    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-        :class:`~fugue.dataframe.dataframe.DataFrame` type
-    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-        but you set ``schema``
-    :raises TypeError: if ``df`` is not compatible
-    :return: the dataframe itself if it's
-        :class:`~fugue.dataframe.dataframe.LocalDataFrame` else a converted one
-
-    .. admonition:: Examples
-
-        >>> a = to_local_df([[0,'a'],[1,'b']],"a:int,b:str")
-        >>> assert to_local_df(a) is a
-        >>> to_local_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-    """
-    assert_arg_not_none(df, "df")
-    if isinstance(df, DataFrame):
-        aot(
-            schema is None,
-            ValueError("schema and metadata must be None when df is a DataFrame"),
-        )
-        return df.as_local()
-    if isinstance(df, pd.DataFrame):
-        return PandasDataFrame(df, schema)
-    if isinstance(df, pa.Table):
-        return ArrowDataFrame(df, schema)
-    if isinstance(df, List):
-        return ArrayDataFrame(df, schema)
-    if isinstance(df, Iterable):
-        return IterableDataFrame(df, schema)
-    raise ValueError(f"{df} cannot convert to a LocalDataFrame")
-
-
-def to_local_bounded_df(df: Any, schema: Any = None) -> LocalBoundedDataFrame:
-    """Convert a data structure to
-    :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
-
-    :param df: :class:`~fugue.dataframe.dataframe.DataFrame`, pandas DataFramme and
-        list or iterable of arrays
-    :param schema: |SchemaLikeObject|, defaults to None, it should not be set for
-        :class:`~fugue.dataframe.dataframe.DataFrame` type
-    :raises ValueError: if ``df`` is :class:`~fugue.dataframe.dataframe.DataFrame`
-        but you set ``schema``
-    :raises TypeError: if ``df`` is not compatible
-    :return: the dataframe itself if it's
-        :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` else a converted one
-
-    .. admonition:: Examples
-
-        >>> a = IterableDataFrame([[0,'a'],[1,'b']],"a:int,b:str")
-        >>> assert isinstance(to_local_bounded_df(a), LocalBoundedDataFrame)
-        >>> to_local_bounded_df(SparkDataFrame([[0,'a'],[1,'b']],"a:int,b:str"))
-
-    .. note::
-
-        Compared to :func:`.to_local_df`, this function makes sure the dataframe is also
-        bounded, so :class:`~fugue.dataframe.iterable_dataframe.IterableDataFrame` will
-        be converted although it's local.
-    """
-    if isinstance(df, DataFrame):
-        aot(
-            schema is None,
-            ValueError("schema and metadata must be None when df is a DataFrame"),
-        )
-        return df.as_local_bounded()
-    df = to_local_df(df, schema)
-    if isinstance(df, LocalBoundedDataFrame):
-        return df
-    raise ValueError(f"{df} cannot convert to a LocalBoundedDataFrame")
-
-
 def pickle_df(df: DataFrame) -> bytes:
     """Pickles a dataframe to bytes array. It firstly converts the dataframe
-    using :func:`.to_local_bounded_df`, and then serialize the underlying data.
+    local bounded, and then serialize the underlying data.

     :param df: input DataFrame
     :return: pickled binary data
@@ -195,7 +121,7 @@ def pickle_df(df: DataFrame) -> bytes:
     Be careful to use on large dataframes or non-local, un-materialized dataframes,
     it can be slow. You should always use :func:`.unpickle_df` to deserialize.
     """
-    df = to_local_bounded_df(df)
+    df = df.as_local_bounded()
     o: List[Any] = [df.schema]
     if isinstance(df, PandasDataFrame):
         o.append("p")
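Note: to_local_df and to_local_bounded_df are gone from the public surface. A rough replacement sketch using the calls the new code relies on:

    import pandas as pd
    import fugue.api as fa

    fdf = fa.as_fugue_df(pd.DataFrame({"a": [0, 1]}))  # roughly replaces to_local_df
    local = fdf.as_local_bounded()  # roughly replaces to_local_bounded_df
    assert local.count() == 2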
fugue/execution/api.py
CHANGED
@@ -15,6 +15,7 @@ from .execution_engine import (
     ExecutionEngine,
 )
 from .factory import make_execution_engine, try_get_context_execution_engine
+from .._utils.registry import fugue_plugin


 @contextmanager
@@ -120,6 +121,27 @@ def get_current_parallelism() -> int:
     return make_execution_engine().get_current_parallelism()


+@fugue_plugin
+def as_fugue_engine_df(
+    engine: ExecutionEngine, df: AnyDataFrame, schema: Any = None
+) -> DataFrame:
+    """Convert a dataframe to a Fugue engine dependent DataFrame.
+    This function is used internally by Fugue. It is not recommended
+    to use
+
+    :param engine: the ExecutionEngine to use, must not be None
+    :param df: a dataframe like object
+    :param schema: the schema of the dataframe, defaults to None
+
+    :return: the engine dependent DataFrame
+    """
+    if schema is None:
+        fdf = as_fugue_df(df)
+    else:
+        fdf = as_fugue_df(df, schema=schema)
+    return engine.to_df(fdf)
+
+
 def run_engine_function(
     func: Callable[[ExecutionEngine], Any],
     engine: AnyExecutionEngine = None,
@@ -549,11 +571,11 @@ def join(
     """

     def _join(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.join(edf1, edf2, how=how, on=on)
         for odf in dfs:
-            res = e.join(res, e.to_df(odf), how=how, on=on)
+            res = e.join(res, as_fugue_engine_df(e, odf), how=how, on=on)
         return res

     return run_engine_function(
@@ -837,11 +859,11 @@ def union(
     """

     def _union(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.union(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.union(res, e.to_df(odf), distinct=distinct)
+            res = e.union(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
@@ -885,11 +907,11 @@ def subtract(
     """

     def _subtract(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.subtract(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.subtract(res, e.to_df(odf), distinct=distinct)
+            res = e.subtract(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
@@ -933,11 +955,11 @@ def intersect(
     """

     def _intersect(e: ExecutionEngine):
-        edf1 = e.to_df(df1)
-        edf2 = e.to_df(df2)
+        edf1 = as_fugue_engine_df(e, df1)
+        edf2 = as_fugue_engine_df(e, df2)
         res = e.intersect(edf1, edf2, distinct=distinct)
         for odf in dfs:
-            res = e.intersect(res, e.to_df(odf), distinct=distinct)
+            res = e.intersect(res, as_fugue_engine_df(e, odf), distinct=distinct)
         return res

     return run_engine_function(
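Note: a short usage sketch of the new as_fugue_engine_df plugin defined above (the engine choice and data are made up):

    import pandas as pd
    from fugue import NativeExecutionEngine
    from fugue.execution.api import as_fugue_engine_df

    engine = NativeExecutionEngine()
    fdf = as_fugue_engine_df(engine, pd.DataFrame({"a": [1, 2]}))
    assert fdf.count() == 2  # an engine-dependent (here local) Fugue DataFrame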
fugue/execution/native_execution_engine.py
CHANGED
@@ -25,9 +25,9 @@ from fugue.dataframe import (
     LocalDataFrame,
     PandasDataFrame,
     fugue_annotated_param,
-    to_local_bounded_df,
 )
-from fugue.dataframe.utils import get_join_schemas, to_local_df
+from fugue.dataframe.dataframe import as_fugue_df
+from fugue.dataframe.utils import get_join_schemas

 from .execution_engine import (
     ExecutionEngine,
@@ -83,19 +83,36 @@ class PandasMapEngine(MapEngine):
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
         map_func_format_hint: Optional[str] = None,
     ) -> DataFrame:
-        if partition_spec.num_partitions != "0":
-            self.log.warning(
-                "%s doesn't respect num_partitions %s",
-                self,
-                partition_spec.num_partitions,
-            )
+        # if partition_spec.num_partitions != "0":
+        #     self.log.warning(
+        #         "%s doesn't respect num_partitions %s",
+        #         self,
+        #         partition_spec.num_partitions,
+        #     )
+        is_coarse = partition_spec.algo == "coarse"
+        presort = partition_spec.get_sorts(df.schema, with_partition_keys=is_coarse)
+        presort_keys = list(presort.keys())
+        presort_asc = list(presort.values())
+        output_schema = Schema(output_schema)
         cursor = partition_spec.get_cursor(df.schema, 0)
         if on_init is not None:
             on_init(0, df)
-        if len(partition_spec.partition_by) == 0:  # no partition
-            df = to_local_df(df)
-            cursor.set(lambda: df.peek_array(), 0, 0)
-            output_df = map_func(cursor, df)
+        if (
+            len(partition_spec.partition_by) == 0 or partition_spec.algo == "coarse"
+        ):  # no partition
+            if len(partition_spec.presort) > 0:
+                pdf = (
+                    df.as_pandas()
+                    .sort_values(presort_keys, ascending=presort_asc)
+                    .reset_index(drop=True)
+                )
+                input_df = PandasDataFrame(pdf, df.schema, pandas_df_wrapper=True)
+                cursor.set(lambda: input_df.peek_array(), cursor.partition_no + 1, 0)
+                output_df = map_func(cursor, input_df)
+            else:
+                df = df.as_local()
+                cursor.set(lambda: df.peek_array(), 0, 0)
+                output_df = map_func(cursor, df)
             if (
                 isinstance(output_df, PandasDataFrame)
                 and output_df.schema != output_schema
@@ -107,13 +124,9 @@ class PandasMapEngine(MapEngine):
                 f"mismatches given {output_schema}",
             )
             return self.to_df(output_df)  # type: ignore
-        presort = partition_spec.presort
-        presort_keys = list(presort.keys())
-        presort_asc = list(presort.values())
-        output_schema = Schema(output_schema)

         def _map(pdf: pd.DataFrame) -> pd.DataFrame:
-            if len(presort) > 0:
+            if len(partition_spec.presort) > 0:
                 pdf = pdf.sort_values(presort_keys, ascending=presort_asc).reset_index(
                     drop=True
                 )
@@ -177,7 +190,7 @@ class NativeExecutionEngine(ExecutionEngine):
     def repartition(
         self, df: DataFrame, partition_spec: PartitionSpec
     ) -> DataFrame:  # pragma: no cover
-        self.log.warning("%s doesn't respect repartition", self)
+        # self.log.warning("%s doesn't respect repartition", self)
         return df

     def broadcast(self, df: DataFrame) -> DataFrame:
@@ -384,4 +397,5 @@ class _NativeExecutionEngineParam(ExecutionEngineParam):


 def _to_native_execution_engine_df(df: AnyDataFrame, schema: Any = None) -> DataFrame:
-    return to_local_bounded_df(df, schema)
+    fdf = as_fugue_df(df) if schema is None else as_fugue_df(df, schema=schema)
+    return fdf.as_local_bounded()
fugue/extensions/_builtins/creators.py
CHANGED
@@ -1,10 +1,12 @@
 from typing import Any, Callable, Optional

+from triad import Schema, assert_or_throw, to_uuid
+
 from fugue.collections.yielded import Yielded
 from fugue.dataframe import DataFrame
 from fugue.exceptions import FugueWorkflowCompileError
+from fugue.execution.api import as_fugue_engine_df
 from fugue.extensions.creator import Creator
-from triad import Schema, assert_or_throw, to_uuid


 class Load(Creator):
@@ -39,7 +41,7 @@ class CreateData(Creator):
     def create(self) -> DataFrame:
         if isinstance(self._df, Yielded):
             return self.execution_engine.load_yielded(self._df)
-        return self.execution_engine.to_df(self._df, schema=self._schema)
+        return as_fugue_engine_df(self.execution_engine, self._df, schema=self._schema)

     def _df_uid(self):
         if self._data_determiner is not None:
fugue/extensions/_builtins/outputters.py
CHANGED
@@ -6,7 +6,7 @@ from triad.utils.convert import to_type
 from fugue.collections.partition import PartitionCursor
 from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
 from fugue.dataframe.array_dataframe import ArrayDataFrame
-from fugue.dataframe.utils import _df_eq, to_local_bounded_df
+from fugue.dataframe.utils import _df_eq
 from fugue.exceptions import FugueWorkflowError
 from fugue.execution.execution_engine import _generate_comap_empty_dfs
 from fugue.rpc import EmptyRPCHandler, to_rpc_handler
@@ -136,7 +136,7 @@ class _TransformerRunner(object):
     def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
         self.transformer._cursor = cursor  # type: ignore
         try:
-            to_local_bounded_df(self.transformer.transform(df))
+            self.transformer.transform(df).as_local_bounded()
             return ArrayDataFrame([], self.transformer.output_schema)
         except self.ignore_errors:  # type: ignore
             return ArrayDataFrame([], self.transformer.output_schema)
@@ -160,7 +160,7 @@ class _CoTransformerRunner(object):
     def run(self, cursor: PartitionCursor, dfs: DataFrames) -> LocalDataFrame:
         self.transformer._cursor = cursor  # type: ignore
         try:
-            to_local_bounded_df(self.transformer.transform(dfs))
+            self.transformer.transform(dfs).as_local_bounded()
             return ArrayDataFrame([], self.transformer.output_schema)
         except self.ignore_errors:  # type: ignore
             return ArrayDataFrame([], self.transformer.output_schema)
fugue/extensions/_builtins/processors.py
CHANGED
@@ -6,7 +6,6 @@ from fugue.dataframe import (
     DataFrame,
     DataFrames,
     LocalDataFrame,
-    to_local_bounded_df,
 )
 from fugue.column import ColumnExpr, SelectColumns as ColumnsSelect
 from fugue.exceptions import FugueWorkflowError
@@ -334,7 +333,7 @@ class _TransformerRunner(object):
             return self.transformer.transform(df)
         else:
             try:
-                return to_local_bounded_df(self.transformer.transform(df))
+                return self.transformer.transform(df).as_local_bounded()
             except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                 return ArrayDataFrame([], self.transformer.output_schema)

@@ -364,7 +363,7 @@ class _CoTransformerRunner(object):

         else:
             try:
-                return to_local_bounded_df(self.transformer.transform(dfs))
+                return self.transformer.transform(dfs).as_local_bounded()
             except self.ignore_errors:  # type: ignore  # pylint: disable=E0712
                 return ArrayDataFrame([], self.transformer.output_schema)
fugue/plugins.py
CHANGED
fugue/workflow/_checkpoint.py
CHANGED
@@ -166,7 +166,7 @@ class CheckpointPath(object):

     def get_table_name(self, obj_id: str, permanent: bool) -> str:
         path = self._path if permanent else self._temp_path
-        return to_uuid(path, obj_id)[:5]
+        return "temp_" + to_uuid(path, obj_id)[:5]

     def temp_file_exists(self, path: str) -> bool:
         try:
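Note: a guess at the motivation for the "temp_" prefix: to_uuid returns a hex string that can begin with a digit, which is not a valid first character for a table identifier in most SQL backends. A tiny check (inputs are illustrative):

    from triad import to_uuid

    name = "temp_" + to_uuid("/tmp/checkpoints", "obj-1")[:5]
    assert name.startswith("temp_") and not name[0].isdigit()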