fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/api.py +1 -0
- fugue/dataframe/api.py +19 -2
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +68 -2
- fugue/execution/execution_engine.py +1 -1
- fugue/plugins.py +1 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/METADATA +4 -4
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD +24 -24
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt +1 -1
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_duckdb/dataframe.py +87 -29
- fugue_ibis/dataframe.py +13 -0
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/dataframe.py +71 -19
- fugue_spark/dataframe.py +69 -13
- fugue_test/dataframe_suite.py +14 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/WHEEL +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt +0 -0
fugue/api.py
CHANGED
fugue/dataframe/api.py
CHANGED
|
@@ -116,15 +116,32 @@ def as_array_iterable(
|
|
|
116
116
|
return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe)
|
|
117
117
|
|
|
118
118
|
|
|
119
|
+
@fugue_plugin
def as_dicts(
    df: AnyDataFrame, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """Materialize any dataframe as a list of python dicts

    :param df: the object that can be recognized as a dataframe by Fugue
    :param columns: columns to extract, defaults to None
    :return: a list of python dicts

    .. note::

        This default implementation converts ``df`` with ``as_fugue_df`` and
        delegates to the ``as_dicts`` method of the resulting dataframe.
    """
    fugue_df = as_fugue_df(df)
    return fugue_df.as_dicts(columns=columns)
|
|
134
|
+
|
|
135
|
+
|
|
119
136
|
@fugue_plugin
|
|
120
137
|
def as_dict_iterable(
|
|
121
138
|
df: AnyDataFrame, columns: Optional[List[str]] = None
|
|
122
139
|
) -> Iterable[Dict[str, Any]]:
|
|
123
|
-
"""Convert any dataframe to iterable of
|
|
140
|
+
"""Convert any dataframe to iterable of python dicts
|
|
124
141
|
|
|
125
142
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
126
143
|
:param columns: columns to extract, defaults to None
|
|
127
|
-
:return: iterable of
|
|
144
|
+
:return: iterable of python dicts
|
|
128
145
|
|
|
129
146
|
.. note::
|
|
130
147
|
|
|
@@ -21,6 +21,10 @@ from fugue.exceptions import FugueDataFrameOperationError
|
|
|
21
21
|
|
|
22
22
|
from .api import (
|
|
23
23
|
alter_columns,
|
|
24
|
+
as_array,
|
|
25
|
+
as_array_iterable,
|
|
26
|
+
as_dict_iterable,
|
|
27
|
+
as_dicts,
|
|
24
28
|
as_pandas,
|
|
25
29
|
drop_columns,
|
|
26
30
|
get_column_names,
|
|
@@ -30,6 +34,12 @@ from .api import (
|
|
|
30
34
|
select_columns,
|
|
31
35
|
)
|
|
32
36
|
from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
|
|
37
|
+
from .utils import (
|
|
38
|
+
pa_table_as_array,
|
|
39
|
+
pa_table_as_array_iterable,
|
|
40
|
+
pa_table_as_dict_iterable,
|
|
41
|
+
pa_table_as_dicts,
|
|
42
|
+
)
|
|
33
43
|
|
|
34
44
|
|
|
35
45
|
class ArrowDataFrame(LocalBoundedDataFrame):
|
|
@@ -174,21 +184,20 @@ class ArrowDataFrame(LocalBoundedDataFrame):
|
|
|
174
184
|
def as_array(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> List[Any]:
    """Return the rows of the wrapped arrow table as a list of lists

    :param columns: columns to extract, defaults to None
    :param type_safe: accepted for interface compatibility; it is not
      forwarded to ``pa_table_as_array``
    :return: a list of rows
    """
    table = self.native
    return pa_table_as_array(table, columns=columns)
|
|
188
|
+
|
|
189
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Return the rows of the wrapped arrow table as a list of python dicts

    :param columns: columns to extract, defaults to None
    :return: a list of python dicts
    """
    table = self.native
    return pa_table_as_dicts(table, columns=columns)
|
|
178
191
|
|
|
179
192
|
def as_array_iterable(
    self, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """Lazily yield the rows of the wrapped arrow table as lists

    :param columns: columns to extract, defaults to None
    :param type_safe: accepted for interface compatibility; it is not
      forwarded to ``pa_table_as_array_iterable``
    :return: an iterable of rows
    """
    rows = pa_table_as_array_iterable(self.native, columns=columns)
    yield from rows
|
|
196
|
+
|
|
197
|
+
def as_dict_iterable(
    self, columns: Optional[List[str]] = None
) -> Iterable[Dict[str, Any]]:
    """Lazily yield the rows of the wrapped arrow table as python dicts

    :param columns: columns to extract, defaults to None
    :return: an iterable of python dicts
    """
    records = pa_table_as_dict_iterable(self.native, columns=columns)
    yield from records
|
|
192
201
|
|
|
193
202
|
|
|
194
203
|
@as_local.candidate(lambda df: isinstance(df, pa.Table))
|
|
@@ -212,6 +221,34 @@ def _pa_table_as_pandas(df: pa.Table) -> pd.DataFrame:
|
|
|
212
221
|
)
|
|
213
222
|
|
|
214
223
|
|
|
224
|
+
@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
def _pa_table_as_array(
    df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
) -> List[Any]:
    """``as_array`` candidate selected when the input is a ``pa.Table``

    :param df: pyarrow table
    :param columns: columns to extract, defaults to None
    :param type_safe: accepted for interface compatibility; not forwarded
    :return: a list of rows
    """
    rows = pa_table_as_array(df, columns=columns)
    return rows
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
def _pa_table_as_array_iterable(
    df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """``as_array_iterable`` candidate selected when the input is a ``pa.Table``

    :param df: pyarrow table
    :param columns: columns to extract, defaults to None
    :param type_safe: accepted for interface compatibility; not forwarded
    :return: an iterable of rows
    """
    rows = pa_table_as_array_iterable(df, columns=columns)
    yield from rows
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
def _pa_table_as_dicts(
    df: pa.Table, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """``as_dicts`` candidate selected when the input is a ``pa.Table``

    :param df: pyarrow table
    :param columns: columns to extract, defaults to None
    :return: a list of python dicts
    """
    records = pa_table_as_dicts(df, columns=columns)
    return records
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
def _pa_table_as_dict_iterable(
    df: pa.Table, columns: Optional[List[str]] = None
) -> Iterable[Dict[str, Any]]:
    """``as_dict_iterable`` candidate selected when the input is a ``pa.Table``

    :param df: pyarrow table
    :param columns: columns to extract, defaults to None
    :return: an iterable of python dicts
    """
    records = pa_table_as_dict_iterable(df, columns=columns)
    yield from records
|
|
250
|
+
|
|
251
|
+
|
|
215
252
|
@alter_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
216
253
|
def _pa_table_alter_columns(
|
|
217
254
|
df: pa.Table, columns: Any, as_fugue: bool = False
|
fugue/dataframe/dataframe.py
CHANGED
|
@@ -237,13 +237,31 @@ class DataFrame(Dataset):
|
|
|
237
237
|
"""
|
|
238
238
|
raise NotImplementedError
|
|
239
239
|
|
|
240
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Convert to a list of python dicts

    :param columns: columns to extract, defaults to None, in which case
      ``self.columns`` is used
    :return: a list of python dicts, one per row returned by ``as_array``

    .. note::

        The default implementation enforces ``type_safe`` True
    """
    names = self.columns if columns is None else columns
    # Pair every row positionally with the column names; this replaces the
    # manual ``range(len(...))`` indexing with the equivalent ``zip``.
    return [dict(zip(names, row)) for row in self.as_array(names, type_safe=True)]
|
|
257
|
+
|
|
240
258
|
def as_dict_iterable(
|
|
241
259
|
self, columns: Optional[List[str]] = None
|
|
242
260
|
) -> Iterable[Dict[str, Any]]:
|
|
243
|
-
"""Convert to iterable of
|
|
261
|
+
"""Convert to iterable of python dicts
|
|
244
262
|
|
|
245
263
|
:param columns: columns to extract, defaults to None
|
|
246
|
-
:return: iterable of
|
|
264
|
+
:return: iterable of python dicts
|
|
247
265
|
|
|
248
266
|
.. note::
|
|
249
267
|
|
|
@@ -269,7 +269,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
|
|
|
269
269
|
class _ListDictParam(_LocalNoSchemaDataFrameParam):
|
|
270
270
|
@no_type_check
def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
    """Turn the incoming dataframe into a list of python dicts

    :param df: the dataframe to convert; ``as_local()`` is called on it first
    :param ctx: not used by this implementation
    :return: the result of ``as_dicts()`` on the local dataframe
    """
    local_df = df.as_local()
    return local_df.as_dicts()
|
|
273
273
|
|
|
274
274
|
@no_type_check
|
|
275
275
|
def to_output_df(
|
|
@@ -105,6 +105,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
|
|
|
105
105
|
) -> List[Any]:
|
|
106
106
|
return list(self.as_array_iterable(columns, type_safe=type_safe))
|
|
107
107
|
|
|
108
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Collect everything produced by ``as_dict_iterable`` into a list

    :param columns: columns to extract, defaults to None
    :return: a list of python dicts
    """
    records = self.as_dict_iterable(columns)
    return list(records)
|
|
110
|
+
|
|
108
111
|
def as_array_iterable(
|
|
109
112
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
110
113
|
) -> Iterable[Any]:
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
from triad import assert_or_throw
|
|
4
6
|
from triad.collections.schema import Schema
|
|
5
7
|
from triad.utils.pandas_like import PD_UTILS
|
|
8
|
+
from triad.utils.pyarrow import pa_batch_to_dicts
|
|
6
9
|
|
|
7
10
|
from fugue.dataset.api import (
|
|
8
11
|
as_fugue_dataset,
|
|
@@ -17,6 +20,10 @@ from fugue.dataset.api import (
|
|
|
17
20
|
from fugue.exceptions import FugueDataFrameOperationError
|
|
18
21
|
|
|
19
22
|
from .api import (
|
|
23
|
+
as_array,
|
|
24
|
+
as_array_iterable,
|
|
25
|
+
as_dict_iterable,
|
|
26
|
+
as_dicts,
|
|
20
27
|
drop_columns,
|
|
21
28
|
get_column_names,
|
|
22
29
|
get_schema,
|
|
@@ -134,6 +141,9 @@ class PandasDataFrame(LocalBoundedDataFrame):
|
|
|
134
141
|
return self
|
|
135
142
|
return PandasDataFrame(self.native, new_schema)
|
|
136
143
|
|
|
144
|
+
def as_arrow(self, type_safe: bool = False) -> pa.Table:
    """Convert the wrapped pandas dataframe to a pyarrow table

    The conversion is done by ``PD_UTILS.as_arrow`` using the pyarrow schema
    of this dataframe (``self.schema.pa_schema``).

    :param type_safe: not used by this implementation
    :return: the converted pyarrow table
    """
    pdf = self.native
    arrow_schema = self.schema.pa_schema
    return PD_UTILS.as_arrow(pdf, schema=arrow_schema)
|
|
146
|
+
|
|
137
147
|
def as_array(
|
|
138
148
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
139
149
|
) -> List[Any]:
|
|
@@ -150,6 +160,18 @@ class PandasDataFrame(LocalBoundedDataFrame):
|
|
|
150
160
|
):
|
|
151
161
|
yield row
|
|
152
162
|
|
|
163
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Return the rows of the wrapped pandas dataframe as a list of python dicts

    :param columns: columns to extract, defaults to None
    :return: a list of python dicts, flattened from the chunks that
      ``_to_dicts`` yields
    """
    return [
        record
        for chunk in _to_dicts(self.native, columns, self.schema)
        for record in chunk
    ]
|
|
168
|
+
|
|
169
|
+
def as_dict_iterable(
    self, columns: Optional[List[str]] = None
) -> Iterable[Dict[str, Any]]:
    """Lazily yield the rows of the wrapped pandas dataframe as python dicts

    :param columns: columns to extract, defaults to None
    :return: an iterable of python dicts, flattened from the chunks that
      ``_to_dicts`` yields
    """
    chunks = _to_dicts(self.native, columns, self.schema)
    for chunk in chunks:
        for record in chunk:
            yield record
|
|
174
|
+
|
|
153
175
|
def head(
|
|
154
176
|
self, n: int, columns: Optional[List[str]] = None
|
|
155
177
|
) -> LocalBoundedDataFrame:
|
|
@@ -272,6 +294,43 @@ def _pd_head(
|
|
|
272
294
|
return _adjust_df(df.head(n), as_fugue=as_fugue)
|
|
273
295
|
|
|
274
296
|
|
|
297
|
+
@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
def _pd_as_array(
    df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
) -> List[Any]:
    """``as_array`` candidate selected when the input is a ``pd.DataFrame``

    :param df: pandas dataframe
    :param columns: columns to extract, defaults to None
    :param type_safe: forwarded to ``_pd_as_array_iterable``
    :return: a list of rows
    """
    rows = _pd_as_array_iterable(df, columns, type_safe=type_safe)
    return list(rows)
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
def _pd_as_array_iterable(
    df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
) -> Iterable[Any]:
    """``as_array_iterable`` candidate selected when the input is a ``pd.DataFrame``

    :param df: pandas dataframe
    :param columns: columns to extract, defaults to None
    :param type_safe: forwarded to ``PD_UTILS.as_array_iterable``
    :return: an iterable of rows
    """
    yield from PD_UTILS.as_array_iterable(df, columns=columns, type_safe=type_safe)
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
def _pd_as_dicts(
    df: pd.DataFrame, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """``as_dicts`` candidate selected when the input is a ``pd.DataFrame``

    :param df: pandas dataframe
    :param columns: columns to extract, defaults to None
    :return: a list of python dicts, flattened from the chunks that
      ``_to_dicts`` yields
    """
    return [record for chunk in _to_dicts(df, columns) for record in chunk]
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
def _pd_as_dict_iterable(
    df: pd.DataFrame, columns: Optional[List[str]] = None
) -> Iterable[Dict[str, Any]]:
    """``as_dict_iterable`` candidate selected when the input is a ``pd.DataFrame``

    :param df: pandas dataframe (the matcher above only accepts
      ``pd.DataFrame``, so the parameter is annotated accordingly)
    :param columns: columns to extract, defaults to None
    :return: an iterable of python dicts, flattened from the chunks that
      ``_to_dicts`` yields
    """
    for block in _to_dicts(df, columns):
        yield from block
|
|
332
|
+
|
|
333
|
+
|
|
275
334
|
def _adjust_df(res: pd.DataFrame, as_fugue: bool):
|
|
276
335
|
return res if not as_fugue else PandasDataFrame(res)
|
|
277
336
|
|
|
@@ -280,3 +339,17 @@ def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
|
|
|
280
339
|
# Body of ``_assert_no_missing(df, columns)``: collect every requested column
# that is absent from ``df`` and fail if there is at least one.
missing = [x for x in columns if x not in df.columns]
if missing:
    # The message must be an f-string; without the prefix the text
    # "{missing}" was emitted literally instead of the offending names.
    raise FugueDataFrameOperationError(f"found nonexistent columns: {missing}")
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _to_dicts(
    df: pd.DataFrame,
    columns: Optional[List[str]] = None,
    schema: Optional[Schema] = None,
) -> Iterable[List[Dict[str, Any]]]:
    """Yield the rows of a pandas dataframe as chunks of python dicts

    The selected columns are converted to a pyarrow table with
    ``PD_UTILS.as_arrow`` and each non-empty record batch of that table is
    turned into a list of dicts by ``pa_batch_to_dicts``.

    :param df: pandas dataframe
    :param columns: columns to extract, defaults to None (all columns of ``df``)
    :param schema: if provided, the sub-schema for the selected columns is
      passed to the arrow conversion, defaults to None
    :return: an iterable with one list of dicts per non-empty record batch
    """
    if columns is None:
        selected = list(df.columns)
    else:
        selected = columns
    assert_or_throw(len(selected) > 0, ValueError("columns cannot be empty"))
    arrow_schema = None
    if schema is not None:
        arrow_schema = schema.extract(selected).pa_schema
    table = PD_UTILS.as_arrow(df[selected], schema=arrow_schema)
    for chunk in table.to_batches():
        if chunk.num_rows <= 0:
            continue
        yield pa_batch_to_dicts(chunk)
|
fugue/dataframe/utils.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pickle
|
|
3
|
-
from typing import Any, Iterable, Optional, Tuple
|
|
3
|
+
from typing import Any, Iterable, Optional, Tuple, List, Dict
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import pyarrow as pa
|
|
7
7
|
from fs import open_fs
|
|
8
|
-
from triad import FileSystem, Schema
|
|
8
|
+
from triad import FileSystem, Schema, assert_or_throw
|
|
9
9
|
from triad.collections.schema import SchemaError
|
|
10
10
|
from triad.exceptions import InvalidOperationError
|
|
11
11
|
from triad.utils.assertion import assert_arg_not_none
|
|
12
12
|
from triad.utils.assertion import assert_or_throw as aot
|
|
13
|
+
from triad.utils.pyarrow import pa_batch_to_dicts
|
|
13
14
|
|
|
14
15
|
from .api import as_fugue_df, get_column_names, normalize_column_names, rename
|
|
15
16
|
from .dataframe import DataFrame, LocalBoundedDataFrame
|
|
@@ -250,3 +251,68 @@ def get_join_schemas(
|
|
|
250
251
|
else:
|
|
251
252
|
aot(len(on) > 0, SchemaError("join on columns must be specified"))
|
|
252
253
|
return cm, (df1.schema.union(schema2))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def pa_table_as_array_iterable(
    df: pa.Table, columns: Optional[List[str]] = None
) -> Iterable[List[Any]]:
    """Convert a pyarrow table to an iterable of list

    :param df: pyarrow table
    :param columns: if not None, only these columns will be returned, defaults to None
    :return: an iterable of list, one list per row

    .. note::

        This is a generator, so the check that rejects an empty ``columns``
        list only runs when iteration starts.
    """
    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
    # After the check above ``columns`` is either None or non-empty, so no
    # separate empty-list branch is needed.
    _df = df if columns is None else df.select(columns)
    for batch in _df.to_batches():
        # to_pydict() maps column name -> column values; zip transposes the
        # column value lists into rows.
        for x in zip(*batch.to_pydict().values()):
            yield list(x)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def pa_table_as_array(
    df: pa.Table, columns: Optional[List[str]] = None
) -> List[List[Any]]:
    """Convert a pyarrow table to a list of list

    :param df: pyarrow table
    :param columns: if not None, only these columns will be returned, defaults to None
    :return: a list of list, one inner list per row
    """
    return list(pa_table_as_array_iterable(df, columns=columns))
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def pa_table_as_dict_iterable(
    df: pa.Table, columns: Optional[List[str]] = None
) -> Iterable[Dict[str, Any]]:
    """Lazily convert a pyarrow table to python dicts

    :param df: pyarrow table
    :param columns: if not None, only these columns will be returned, defaults to None
    :return: an iterable of dict
    """
    for chunk in _pa_table_as_dicts_chunks(df, columns=columns):
        for record in chunk:
            yield record
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def pa_table_as_dicts(
    df: pa.Table, columns: Optional[List[str]] = None
) -> List[Dict[str, Any]]:
    """Materialize a pyarrow table as a list of python dicts

    :param df: pyarrow table
    :param columns: if not None, only these columns will be returned, defaults to None
    :return: a list of dict
    """
    return [
        record
        for chunk in _pa_table_as_dicts_chunks(df, columns=columns)
        for record in chunk
    ]
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _pa_table_as_dicts_chunks(
    df: pa.Table, columns: Optional[List[str]] = None
) -> Iterable[List[Dict[str, Any]]]:
    """Yield one list of python dicts per record batch of a pyarrow table

    :param df: pyarrow table
    :param columns: if not None, only these columns will be returned, defaults to None
    :return: an iterable of lists of dict, one list per record batch

    .. note::

        This is a generator, so the check that rejects an empty ``columns``
        list only runs when iteration starts.
    """
    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
    # After the check above ``columns`` is either None or non-empty, so no
    # separate empty-list branch is needed.
    _df = df if columns is None else df.select(columns)
    for batch in _df.to_batches():
        yield pa_batch_to_dicts(batch)
|
|
@@ -1323,7 +1323,7 @@ class _Comap:
|
|
|
1323
1323
|
self._on_init(partition_no, empty_dfs)
|
|
1324
1324
|
|
|
1325
1325
|
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
|
|
1326
|
-
data =
|
|
1326
|
+
data = df.as_dicts()
|
|
1327
1327
|
if self.how == "inner":
|
|
1328
1328
|
if len(data) < self.dfs_count:
|
|
1329
1329
|
return ArrayDataFrame([], self.output_schema)
|
fugue/plugins.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fugue
|
|
3
|
-
Version: 0.8.7.
|
|
3
|
+
Version: 0.8.7.dev6
|
|
4
4
|
Summary: An abstraction layer for distributed computation
|
|
5
5
|
Home-page: http://github.com/fugue-project/fugue
|
|
6
6
|
Author: The Fugue Development Team
|
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.8
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
|
-
Requires-Dist: triad ==0.9.2.
|
|
22
|
+
Requires-Dist: triad ==0.9.2.dev5
|
|
23
23
|
Requires-Dist: adagio >=0.2.4
|
|
24
24
|
Requires-Dist: qpd >=0.4.4
|
|
25
25
|
Requires-Dist: fugue-sql-antlr >=0.1.6
|
|
@@ -32,7 +32,7 @@ Requires-Dist: fugue-sql-antlr[cpp] >=0.1.6 ; extra == 'all'
|
|
|
32
32
|
Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
|
|
33
33
|
Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
|
|
34
34
|
Requires-Dist: dask-sql ; extra == 'all'
|
|
35
|
-
Requires-Dist: ray[data] >=2.
|
|
35
|
+
Requires-Dist: ray[data] >=2.4.0 ; extra == 'all'
|
|
36
36
|
Requires-Dist: notebook ; extra == 'all'
|
|
37
37
|
Requires-Dist: jupyterlab ; extra == 'all'
|
|
38
38
|
Requires-Dist: ipython >=7.10.0 ; extra == 'all'
|
|
@@ -59,7 +59,7 @@ Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
|
|
|
59
59
|
Provides-Extra: polars
|
|
60
60
|
Requires-Dist: polars ; extra == 'polars'
|
|
61
61
|
Provides-Extra: ray
|
|
62
|
-
Requires-Dist: ray[data] >=2.
|
|
62
|
+
Requires-Dist: ray[data] >=2.4.0 ; extra == 'ray'
|
|
63
63
|
Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
|
|
64
64
|
Requires-Dist: pyarrow >=6.0.1 ; extra == 'ray'
|
|
65
65
|
Provides-Extra: spark
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
fugue/__init__.py,sha256=xT5zuNZfRkjbA8a-uTT5oLK6hLGuezGZLWYBl6eS5J4,2749
|
|
2
|
-
fugue/api.py,sha256=
|
|
2
|
+
fugue/api.py,sha256=dLUrigFhDMB5x7cvlWSK8EyaY2o0AmhgPr0VRtfzSz0,1254
|
|
3
3
|
fugue/constants.py,sha256=crd0VqX8WtBcjSUNwZDi2LDIEkhUMWOlSn73H8JI9ds,3385
|
|
4
4
|
fugue/dev.py,sha256=GQCkezBBl4V0lVDWhGtUQKqomiCxgR9dMhfqj9C8cS8,1369
|
|
5
5
|
fugue/exceptions.py,sha256=ylP8gkZL8ao_ZLinNYKv16FPyO_n7c29dN-4QChUxi0,1544
|
|
6
|
-
fugue/plugins.py,sha256=
|
|
6
|
+
fugue/plugins.py,sha256=kao-H5z-cRbujBKW1QC9IHUOBKxXMhpVQ6saIE7cXm8,1012
|
|
7
7
|
fugue/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
8
|
fugue/registry.py,sha256=SNULGv08f37fRO-cIxFDmnVcod7ref2fNLSK6G7nVnI,868
|
|
9
9
|
fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -25,22 +25,22 @@ fugue/column/expressions.py,sha256=fdGX9oPCqJBuROFZqrOYVcwkjghdXT9ngaSTG5tW_i8,2
|
|
|
25
25
|
fugue/column/functions.py,sha256=ygLyn2gp5lTdGbYqJXeGeMmRNhbm4-vfJvAY_Zt0pb0,9774
|
|
26
26
|
fugue/column/sql.py,sha256=s_qTtHgnvRFqjhCWr7s595PTrHM-Pr9zHUQfU5xcTVA,17391
|
|
27
27
|
fugue/dataframe/__init__.py,sha256=zm7TbsaJLIvfm7zymWm2LGcuJd3nxfGsFnQiyrSnenM,678
|
|
28
|
-
fugue/dataframe/api.py,sha256=
|
|
28
|
+
fugue/dataframe/api.py,sha256=aWBvMaiSUxOvdQMfe79zHShWuPfLcgiWggC9HvVxvSE,11017
|
|
29
29
|
fugue/dataframe/array_dataframe.py,sha256=4scWnmQ6sjy1A6o7IYdRc0VVutBEfcJrA1f9wkph4Kg,4440
|
|
30
|
-
fugue/dataframe/arrow_dataframe.py,sha256=
|
|
31
|
-
fugue/dataframe/dataframe.py,sha256=
|
|
30
|
+
fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvrwRp_c,11485
|
|
31
|
+
fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
|
|
32
32
|
fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
|
|
33
33
|
fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
|
|
34
|
-
fugue/dataframe/function_wrapper.py,sha256=
|
|
35
|
-
fugue/dataframe/iterable_dataframe.py,sha256=
|
|
36
|
-
fugue/dataframe/pandas_dataframe.py,sha256=
|
|
37
|
-
fugue/dataframe/utils.py,sha256=
|
|
34
|
+
fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
|
|
35
|
+
fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
|
|
36
|
+
fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
|
|
37
|
+
fugue/dataframe/utils.py,sha256=VS1qLCr-9NEcEjaK-219rADJadDf6EfzYZCGRUpn1fY,11405
|
|
38
38
|
fugue/dataset/__init__.py,sha256=5f2CAJ4xst6Z2o9Q2e2twfDOGUw8ZJoE2ild4JEU2pg,112
|
|
39
39
|
fugue/dataset/api.py,sha256=DacI4L2w5NJ-eZ6nFxNMqmReEnb0WUXswbjVp7BeErk,2794
|
|
40
40
|
fugue/dataset/dataset.py,sha256=jWXZqy3msMPFFkhas2PYJEX55ZAI3gk3Txq5f4-Qya4,4759
|
|
41
41
|
fugue/execution/__init__.py,sha256=iZGxAznZz9piM3k4gp0tln97MDIBxdliLyNbD-0Zc48,427
|
|
42
42
|
fugue/execution/api.py,sha256=KsFOLGdWQMlXmlQ5JRgRsbUeB64qzTVHxSEaunjiojo,39818
|
|
43
|
-
fugue/execution/execution_engine.py,sha256=
|
|
43
|
+
fugue/execution/execution_engine.py,sha256=G_SsTmcuDcy6_azi_88lGzsOodiizu0JdWxebxgbqRg,47721
|
|
44
44
|
fugue/execution/factory.py,sha256=5ICzfNh2QqqABuVyYLijY5-7LZgfRqczlaZN32p78bE,21003
|
|
45
45
|
fugue/execution/native_execution_engine.py,sha256=Mm9BVC3dEMS3IWRZe4YvGKp6_mmW7dLmoLMK5HgAPcs,14408
|
|
46
46
|
fugue/extensions/__init__.py,sha256=y-uLKd6mZ8sZ_8-OdW6ELoBO_9IfC0gDmEbE_rMCvOA,599
|
|
@@ -87,8 +87,8 @@ fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,134
|
|
|
87
87
|
fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
|
|
88
88
|
fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
|
|
89
89
|
fugue_dask/_io.py,sha256=9G516yM6zQvSC5_JA6qHb3LwBDmhWcxK5sjFHrQ81zo,6012
|
|
90
|
-
fugue_dask/_utils.py,sha256=
|
|
91
|
-
fugue_dask/dataframe.py,sha256=
|
|
90
|
+
fugue_dask/_utils.py,sha256=n70N3wPPMz13Jh0GWJM3Je-TCYpU36yGP_YCwIHqUrc,8908
|
|
91
|
+
fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
|
|
92
92
|
fugue_dask/execution_engine.py,sha256=XJp6wrdkaNh5pOpwt-Hjoa2sxgCOgusFRWrcqoCcaNM,21153
|
|
93
93
|
fugue_dask/ibis_engine.py,sha256=kQdaG_KlZZ2AjtYETNCdTJOgtwI_eH0aGzLaAiIBbRI,2120
|
|
94
94
|
fugue_dask/registry.py,sha256=7UTg_eie7zKlHYKMCyOo0TNn5y2TiIjE8kiS2PruHFc,2200
|
|
@@ -96,14 +96,14 @@ fugue_duckdb/__init__.py,sha256=nSNv-fxBAKD6W23EbMeV4dVRIaSTqr9DzQUWuVOES8s,379
|
|
|
96
96
|
fugue_duckdb/_io.py,sha256=Sq228unVnroYTq4GX-Wnv22SLHC9Ji-aWgiqrfdu81w,8880
|
|
97
97
|
fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
|
|
98
98
|
fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
|
|
99
|
-
fugue_duckdb/dataframe.py,sha256=
|
|
99
|
+
fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
|
|
100
100
|
fugue_duckdb/execution_engine.py,sha256=fkkQb4Eh0m7SwKrTplVk2oQalLkNoj3CW0R12g01ofk,20536
|
|
101
101
|
fugue_duckdb/ibis_engine.py,sha256=MrypeABozqwetKOpqtrmWvCJX2QPfBXhbSEhvK9vqmI,1990
|
|
102
102
|
fugue_duckdb/registry.py,sha256=Dj0Tng1cXVT6Q7t-KxOky2k1dD9xSBjYGQmI26UgZPo,3095
|
|
103
103
|
fugue_ibis/__init__.py,sha256=PcUt66KlLyGGicad7asq5j2U567_fhR0HzvWQBhV1VM,362
|
|
104
104
|
fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
|
|
105
105
|
fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
|
|
106
|
-
fugue_ibis/dataframe.py,sha256=
|
|
106
|
+
fugue_ibis/dataframe.py,sha256=0Fb1vJjwEeffgoUCDfDGIMuSFaPgUJqcB-JqJOAALfs,7789
|
|
107
107
|
fugue_ibis/execution_engine.py,sha256=p5zy0IBXiJgLi67RBHCRcHgZsaJMANdNSpUxz0k_6C0,18453
|
|
108
108
|
fugue_ibis/extensions.py,sha256=H8l-SPfoqLuUoILtOuL2nccOpoL83zHeSoIhoqjtWQM,6905
|
|
109
109
|
fugue_ibis/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -117,11 +117,11 @@ fugue_notebook/nbextension/description.yaml,sha256=CsXgx9CSLbAlO4Z1kvX9ejYA_TImP
|
|
|
117
117
|
fugue_notebook/nbextension/main.js,sha256=Px2tQuBCNGEZOEBKsnfVruFEg-AxK7Tj0dY84ktub_U,3709
|
|
118
118
|
fugue_polars/__init__.py,sha256=NDkjlbLhHPTjUaCAw6mAwIqeK3HSeh-z88s9dqmwheQ,61
|
|
119
119
|
fugue_polars/_utils.py,sha256=7rGGWgB1-VqFwh4PcBLYk_5VNjd8FNOS4TDFyDVz2sg,159
|
|
120
|
-
fugue_polars/polars_dataframe.py,sha256=
|
|
120
|
+
fugue_polars/polars_dataframe.py,sha256=8LQ0IB-JFFdjW2ltDzq8DfIbUC_jjjDr1YM29usJag0,8831
|
|
121
121
|
fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
|
|
122
122
|
fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
|
|
123
123
|
fugue_ray/_constants.py,sha256=vu5l1w-Wi-2V_nm0HLXKOYhh5HdWRCc5yQktO2XzhOg,569
|
|
124
|
-
fugue_ray/dataframe.py,sha256=
|
|
124
|
+
fugue_ray/dataframe.py,sha256=7asw2qf9vm6vLBSzqghm9pUcNAppJOz5CkT7XyR0S5g,12514
|
|
125
125
|
fugue_ray/execution_engine.py,sha256=NT_mnacijp1zskFbtganUwA3JNRPU-FNNvJswA6U_Yg,12607
|
|
126
126
|
fugue_ray/registry.py,sha256=xJRAhbwNrg695EwghQDnVtTKi4YkqZ0_61BD4OAblSA,1685
|
|
127
127
|
fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -130,7 +130,7 @@ fugue_ray/_utils/dataframe.py,sha256=_EadzS4rPom1A_cF0pqoPlwrNYZTfTwcyyu86_fFsqU
|
|
|
130
130
|
fugue_ray/_utils/io.py,sha256=SFTU4qXubGEmO5IGZA5yHy8Hu4b9aFZ9-eTU4Qs-NsQ,8757
|
|
131
131
|
fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
|
|
132
132
|
fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
|
|
133
|
-
fugue_spark/dataframe.py,sha256=
|
|
133
|
+
fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
|
|
134
134
|
fugue_spark/execution_engine.py,sha256=rqgY9U1bpjh0GFNyNkuPcI7iV0xeipadURhNIir4w08,33147
|
|
135
135
|
fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
|
|
136
136
|
fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
|
|
@@ -144,7 +144,7 @@ fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
|
|
|
144
144
|
fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
145
145
|
fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
|
|
146
146
|
fugue_test/builtin_suite.py,sha256=o8aMZTKa74nKBmcUTTBbliTJMtNbsXE9SPKZopS504o,78400
|
|
147
|
-
fugue_test/dataframe_suite.py,sha256=
|
|
147
|
+
fugue_test/dataframe_suite.py,sha256=LgB931CkASbGOrRQ9j92DGk9wPb__FoNusOk-HeqU9E,19165
|
|
148
148
|
fugue_test/execution_suite.py,sha256=FI6UmwBvdoT1jkJRBqJT_Q0IDehFryvv00UL6jjxyAk,47689
|
|
149
149
|
fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
|
|
150
150
|
fugue_test/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -155,9 +155,9 @@ fugue_test/plugins/duckdb/fixtures.py,sha256=UxQbIMRbSrTZ3pgCmKZgd5wd1YvnVrqLSUP
|
|
|
155
155
|
fugue_test/plugins/ray/__init__.py,sha256=nyKGW6xgTXtMhSs7yjgFNKO7mVboCNg63Bvdf39fO_I,55
|
|
156
156
|
fugue_test/plugins/ray/fixtures.py,sha256=hZkvuo0AcD63XJl5JUroc9tm2LWHUPszg2zzY6FCSao,141
|
|
157
157
|
fugue_version/__init__.py,sha256=vTwvdJOZi8jZb9U-Em7-d50qNDNPS2z51IXqRoojeNM,22
|
|
158
|
-
fugue-0.8.7.
|
|
159
|
-
fugue-0.8.7.
|
|
160
|
-
fugue-0.8.7.
|
|
161
|
-
fugue-0.8.7.
|
|
162
|
-
fugue-0.8.7.
|
|
163
|
-
fugue-0.8.7.
|
|
158
|
+
fugue-0.8.7.dev6.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
|
|
159
|
+
fugue-0.8.7.dev6.dist-info/METADATA,sha256=0i4ibczIy_wEMtZ6vFvaCw40x5KmuQa6OsuBVWUTQyk,17860
|
|
160
|
+
fugue-0.8.7.dev6.dist-info/WHEEL,sha256=yQN5g4mg4AybRjkgi-9yy4iQEFibGQmlz78Pik5Or-A,92
|
|
161
|
+
fugue-0.8.7.dev6.dist-info/entry_points.txt,sha256=N_BIIy3lSvF6Z32QE0yXTucgdHrPbUrOwH1zj7bZ0ow,536
|
|
162
|
+
fugue-0.8.7.dev6.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
|
|
163
|
+
fugue-0.8.7.dev6.dist-info/RECORD,,
|
|
@@ -7,7 +7,7 @@ ibis = fugue_ibis [ibis]
|
|
|
7
7
|
polars = fugue_polars.registry [polars]
|
|
8
8
|
ray = fugue_ray.registry [ray]
|
|
9
9
|
spark = fugue_spark.registry [spark]
|
|
10
|
-
spark_ibis = fugue_spark.ibis_engine [ibis
|
|
10
|
+
spark_ibis = fugue_spark.ibis_engine [spark,ibis]
|
|
11
11
|
|
|
12
12
|
[pytest11]
|
|
13
13
|
fugue_test_dask = fugue_test.plugins.dask [dask]
|
fugue_dask/_utils.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import math
|
|
2
|
-
from typing import Any, List, Optional, Tuple
|
|
2
|
+
from typing import Any, Callable, List, Optional, Tuple, TypeVar
|
|
3
3
|
|
|
4
4
|
import dask.dataframe as dd
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
import pyarrow as pa
|
|
8
8
|
from dask.dataframe.core import DataFrame
|
|
9
|
+
from dask.delayed import delayed
|
|
9
10
|
from dask.distributed import Client, get_client
|
|
10
|
-
from triad.utils.pandas_like import
|
|
11
|
+
from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
|
|
11
12
|
from triad.utils.pyarrow import to_pandas_dtype
|
|
12
13
|
|
|
13
14
|
import fugue.api as fa
|
|
@@ -16,6 +17,7 @@ from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
|
|
|
16
17
|
from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS
|
|
17
18
|
|
|
18
19
|
_FUGUE_DASK_TEMP_IDX_COLUMN = "_fugue_dask_temp_index"
|
|
20
|
+
T = TypeVar("T")
|
|
19
21
|
|
|
20
22
|
|
|
21
23
|
def get_default_partitions() -> int:
|
|
@@ -28,6 +30,17 @@ def get_default_partitions() -> int:
|
|
|
28
30
|
return n if n > 0 else fa.get_current_parallelism() * 2
|
|
29
31
|
|
|
30
32
|
|
|
33
|
+
def collect(df: dd.DataFrame, func: Callable[[pd.DataFrame], T]) -> Tuple[T, ...]:
    """Compute each partition in parallel and collect the results

    :param df: dask dataframe
    :param func: the function applied to each delayed partition of ``df``
    :return: the collected results as returned by ``dd.compute``, one entry
      per delayed object
    """
    partitions = df.to_delayed()
    # A distinct loop name is used so the ``df`` parameter is not shadowed.
    tasks = [delayed(func)(part) for part in partitions]
    return dd.compute(*tasks)
|
|
42
|
+
|
|
43
|
+
|
|
31
44
|
def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFrame:
|
|
32
45
|
"""Repartition the dataframe by hashing the given columns
|
|
33
46
|
|