fugue 0.8.7.dev4__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/api.py +1 -0
- fugue/dataframe/api.py +51 -15
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +72 -4
- fugue/execution/execution_engine.py +1 -1
- fugue/execution/native_execution_engine.py +1 -1
- fugue/plugins.py +1 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/METADATA +5 -4
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/RECORD +30 -30
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/entry_points.txt +1 -1
- fugue_dask/_io.py +5 -0
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_duckdb/dataframe.py +87 -29
- fugue_ibis/dataframe.py +13 -0
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/convert.py +32 -7
- fugue_spark/_utils/io.py +3 -1
- fugue_spark/dataframe.py +94 -22
- fugue_spark/execution_engine.py +7 -3
- fugue_test/builtin_suite.py +1 -1
- fugue_test/dataframe_suite.py +14 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev4.dist-info → fugue-0.8.7.dev6.dist-info}/top_level.txt +0 -0
fugue/api.py
CHANGED
fugue/dataframe/api.py
CHANGED
|
@@ -11,12 +11,12 @@ from .dataframe import AnyDataFrame, DataFrame, as_fugue_df
|
|
|
11
11
|
|
|
12
12
|
@fugue_plugin
|
|
13
13
|
def is_df(df: Any) -> bool:
|
|
14
|
-
"""Whether
|
|
14
|
+
"""Whether the input object is any type of DataFrame"""
|
|
15
15
|
return isinstance(df, DataFrame)
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame:
|
|
19
|
-
"""Return the dataframe form of
|
|
19
|
+
"""Return the dataframe form of any dataframe.
|
|
20
20
|
If ``df`` is a :class:`~.DataFrame`, then call the
|
|
21
21
|
:meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is
|
|
22
22
|
a corresponding function handling it.
|
|
@@ -30,30 +30,49 @@ def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame:
|
|
|
30
30
|
|
|
31
31
|
@fugue_plugin
|
|
32
32
|
def get_schema(df: AnyDataFrame) -> Schema:
|
|
33
|
-
"""
|
|
33
|
+
"""The generic function to get the schema of any dataframe
|
|
34
34
|
|
|
35
35
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
36
36
|
:return: the Schema object
|
|
37
|
+
|
|
38
|
+
.. admonition:: Examples
|
|
39
|
+
|
|
40
|
+
.. code-block:: python
|
|
41
|
+
|
|
42
|
+
import fugue.api as fa
|
|
43
|
+
import pandas as pd
|
|
44
|
+
|
|
45
|
+
df = pd.DataFrame([[0,1],[2,3]], columns=["a","b"])
|
|
46
|
+
fa.get_schema(df) # == Schema("a:long,b:long")
|
|
47
|
+
|
|
48
|
+
.. related_topics
|
|
49
|
+
How to get schema of any dataframe using Fugue?
|
|
37
50
|
"""
|
|
38
51
|
return as_fugue_df(df).schema
|
|
39
52
|
|
|
40
53
|
|
|
41
54
|
@fugue_plugin
|
|
42
55
|
def as_pandas(df: AnyDataFrame) -> pd.DataFrame:
|
|
43
|
-
"""
|
|
56
|
+
"""The generic function to convert any dataframe to a Pandas DataFrame
|
|
44
57
|
|
|
45
58
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
46
59
|
:return: the Pandas DataFrame
|
|
60
|
+
|
|
61
|
+
.. related_topics
|
|
62
|
+
How to convert any dataframe to a pandas dataframe?
|
|
47
63
|
"""
|
|
48
64
|
return as_fugue_df(df).as_pandas()
|
|
49
65
|
|
|
50
66
|
|
|
51
67
|
@fugue_plugin
|
|
52
68
|
def as_arrow(df: AnyDataFrame) -> pa.Table:
|
|
53
|
-
"""
|
|
69
|
+
"""The generic function to convert any dataframe to a PyArrow Table
|
|
54
70
|
|
|
55
71
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
56
72
|
:return: the PyArrow Table
|
|
73
|
+
|
|
74
|
+
.. related_topics
|
|
75
|
+
How to convert any dataframe to a pyarrow dataframe?
|
|
57
76
|
"""
|
|
58
77
|
return as_fugue_df(df).as_arrow()
|
|
59
78
|
|
|
@@ -62,7 +81,7 @@ def as_arrow(df: AnyDataFrame) -> pa.Table:
|
|
|
62
81
|
def as_array(
|
|
63
82
|
df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
64
83
|
) -> List[Any]: # pragma: no cover
|
|
65
|
-
"""
|
|
84
|
+
"""The generic function to convert any dataframe to a 2-dimensional python array
|
|
66
85
|
|
|
67
86
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
68
87
|
:param columns: columns to extract, defaults to None
|
|
@@ -81,7 +100,7 @@ def as_array(
|
|
|
81
100
|
def as_array_iterable(
|
|
82
101
|
df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
83
102
|
) -> Iterable[Any]: # pragma: no cover
|
|
84
|
-
"""
|
|
103
|
+
"""The generic function to convert any dataframe to iterable of python arrays
|
|
85
104
|
|
|
86
105
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
87
106
|
:param columns: columns to extract, defaults to None
|
|
@@ -97,15 +116,32 @@ def as_array_iterable(
|
|
|
97
116
|
return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe)
|
|
98
117
|
|
|
99
118
|
|
|
119
|
+
@fugue_plugin
|
|
120
|
+
def as_dicts(
|
|
121
|
+
df: AnyDataFrame, columns: Optional[List[str]] = None
|
|
122
|
+
) -> List[Dict[str, Any]]:
|
|
123
|
+
"""Convert any dataframe to a list of python dicts
|
|
124
|
+
|
|
125
|
+
:param df: the object that can be recognized as a dataframe by Fugue
|
|
126
|
+
:param columns: columns to extract, defaults to None
|
|
127
|
+
:return: a list of python dicts
|
|
128
|
+
|
|
129
|
+
.. note::
|
|
130
|
+
|
|
131
|
+
The default implementation always uses ``type_safe=True``
|
|
132
|
+
"""
|
|
133
|
+
return as_fugue_df(df).as_dicts(columns=columns)
|
|
134
|
+
|
|
135
|
+
|
|
100
136
|
@fugue_plugin
|
|
101
137
|
def as_dict_iterable(
|
|
102
138
|
df: AnyDataFrame, columns: Optional[List[str]] = None
|
|
103
139
|
) -> Iterable[Dict[str, Any]]:
|
|
104
|
-
"""Convert
|
|
140
|
+
"""Convert any dataframe to iterable of python dicts
|
|
105
141
|
|
|
106
142
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
107
143
|
:param columns: columns to extract, defaults to None
|
|
108
|
-
:return: iterable of
|
|
144
|
+
:return: iterable of python dicts
|
|
109
145
|
|
|
110
146
|
.. note::
|
|
111
147
|
|
|
@@ -116,7 +152,7 @@ def as_dict_iterable(
|
|
|
116
152
|
|
|
117
153
|
@fugue_plugin
|
|
118
154
|
def peek_array(df: AnyDataFrame) -> List[Any]:
|
|
119
|
-
"""Peek the first row of
|
|
155
|
+
"""Peek the first row of any dataframe as an array
|
|
120
156
|
|
|
121
157
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
122
158
|
:return: the first row as an array
|
|
@@ -126,7 +162,7 @@ def peek_array(df: AnyDataFrame) -> List[Any]:
|
|
|
126
162
|
|
|
127
163
|
@fugue_plugin
|
|
128
164
|
def peek_dict(df: AnyDataFrame) -> Dict[str, Any]:
|
|
129
|
-
"""Peek the first row of
|
|
165
|
+
"""Peek the first row of any dataframe as a dict
|
|
130
166
|
|
|
131
167
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
132
168
|
:return: the first row as a dict
|
|
@@ -141,7 +177,7 @@ def head(
|
|
|
141
177
|
columns: Optional[List[str]] = None,
|
|
142
178
|
as_fugue: bool = False,
|
|
143
179
|
) -> AnyDataFrame:
|
|
144
|
-
"""Get first n rows of
|
|
180
|
+
"""Get first n rows of any dataframe as a new local bounded dataframe
|
|
145
181
|
|
|
146
182
|
:param n: number of rows
|
|
147
183
|
:param columns: selected columns, defaults to None (all columns)
|
|
@@ -160,7 +196,7 @@ def head(
|
|
|
160
196
|
def alter_columns(
|
|
161
197
|
df: AnyDataFrame, columns: Any, as_fugue: bool = False
|
|
162
198
|
) -> AnyDataFrame:
|
|
163
|
-
"""Change column types
|
|
199
|
+
"""Change column data types of any dataframe
|
|
164
200
|
|
|
165
201
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
166
202
|
:param columns: |SchemaLikeObject|,
|
|
@@ -178,7 +214,7 @@ def alter_columns(
|
|
|
178
214
|
def drop_columns(
|
|
179
215
|
df: AnyDataFrame, columns: List[str], as_fugue: bool = False
|
|
180
216
|
) -> AnyDataFrame:
|
|
181
|
-
"""Drop certain columns
|
|
217
|
+
"""Drop certain columns of any dataframe
|
|
182
218
|
|
|
183
219
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
184
220
|
:param columns: columns to drop
|
|
@@ -194,7 +230,7 @@ def drop_columns(
|
|
|
194
230
|
def select_columns(
|
|
195
231
|
df: AnyDataFrame, columns: List[Any], as_fugue: bool = False
|
|
196
232
|
) -> AnyDataFrame:
|
|
197
|
-
"""Select certain columns and return a new dataframe
|
|
233
|
+
"""Select certain columns of any dataframe and return a new dataframe
|
|
198
234
|
|
|
199
235
|
:param df: the object that can be recognized as a dataframe by Fugue
|
|
200
236
|
:param columns: columns to return
|
|
@@ -21,6 +21,10 @@ from fugue.exceptions import FugueDataFrameOperationError
|
|
|
21
21
|
|
|
22
22
|
from .api import (
|
|
23
23
|
alter_columns,
|
|
24
|
+
as_array,
|
|
25
|
+
as_array_iterable,
|
|
26
|
+
as_dict_iterable,
|
|
27
|
+
as_dicts,
|
|
24
28
|
as_pandas,
|
|
25
29
|
drop_columns,
|
|
26
30
|
get_column_names,
|
|
@@ -30,6 +34,12 @@ from .api import (
|
|
|
30
34
|
select_columns,
|
|
31
35
|
)
|
|
32
36
|
from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
|
|
37
|
+
from .utils import (
|
|
38
|
+
pa_table_as_array,
|
|
39
|
+
pa_table_as_array_iterable,
|
|
40
|
+
pa_table_as_dict_iterable,
|
|
41
|
+
pa_table_as_dicts,
|
|
42
|
+
)
|
|
33
43
|
|
|
34
44
|
|
|
35
45
|
class ArrowDataFrame(LocalBoundedDataFrame):
|
|
@@ -174,21 +184,20 @@ class ArrowDataFrame(LocalBoundedDataFrame):
|
|
|
174
184
|
def as_array(
|
|
175
185
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
176
186
|
) -> List[Any]:
|
|
177
|
-
return
|
|
187
|
+
return pa_table_as_array(self.native, columns=columns)
|
|
188
|
+
|
|
189
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
190
|
+
return pa_table_as_dicts(self.native, columns=columns)
|
|
178
191
|
|
|
179
192
|
def as_array_iterable(
|
|
180
193
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
181
194
|
) -> Iterable[Any]:
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
d = self.native.to_pydict()
|
|
189
|
-
cols = [d[n] for n in self.columns]
|
|
190
|
-
for arr in zip(*cols):
|
|
191
|
-
yield list(arr)
|
|
195
|
+
yield from pa_table_as_array_iterable(self.native, columns=columns)
|
|
196
|
+
|
|
197
|
+
def as_dict_iterable(
|
|
198
|
+
self, columns: Optional[List[str]] = None
|
|
199
|
+
) -> Iterable[Dict[str, Any]]:
|
|
200
|
+
yield from pa_table_as_dict_iterable(self.native, columns=columns)
|
|
192
201
|
|
|
193
202
|
|
|
194
203
|
@as_local.candidate(lambda df: isinstance(df, pa.Table))
|
|
@@ -212,6 +221,34 @@ def _pa_table_as_pandas(df: pa.Table) -> pd.DataFrame:
|
|
|
212
221
|
)
|
|
213
222
|
|
|
214
223
|
|
|
224
|
+
@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
225
|
+
def _pa_table_as_array(
|
|
226
|
+
df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
227
|
+
) -> List[Any]:
|
|
228
|
+
return pa_table_as_array(df, columns=columns)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
232
|
+
def _pa_table_as_array_iterable(
|
|
233
|
+
df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
234
|
+
) -> Iterable[Any]:
|
|
235
|
+
yield from pa_table_as_array_iterable(df, columns=columns)
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
239
|
+
def _pa_table_as_dicts(
|
|
240
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
241
|
+
) -> List[Dict[str, Any]]:
|
|
242
|
+
return pa_table_as_dicts(df, columns=columns)
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
246
|
+
def _pa_table_as_dict_iterable(
|
|
247
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
248
|
+
) -> Iterable[Dict[str, Any]]:
|
|
249
|
+
yield from pa_table_as_dict_iterable(df, columns=columns)
|
|
250
|
+
|
|
251
|
+
|
|
215
252
|
@alter_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
|
|
216
253
|
def _pa_table_alter_columns(
|
|
217
254
|
df: pa.Table, columns: Any, as_fugue: bool = False
|
fugue/dataframe/dataframe.py
CHANGED
|
@@ -237,13 +237,31 @@ class DataFrame(Dataset):
|
|
|
237
237
|
"""
|
|
238
238
|
raise NotImplementedError
|
|
239
239
|
|
|
240
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
241
|
+
"""Convert to a list of python dicts
|
|
242
|
+
|
|
243
|
+
:param columns: columns to extract, defaults to None
|
|
244
|
+
:return: a list of python dicts
|
|
245
|
+
|
|
246
|
+
.. note::
|
|
247
|
+
|
|
248
|
+
The default implementation always uses ``type_safe=True``
|
|
249
|
+
"""
|
|
250
|
+
if columns is None:
|
|
251
|
+
columns = self.columns
|
|
252
|
+
idx = range(len(columns))
|
|
253
|
+
return [
|
|
254
|
+
{columns[i]: x[i] for i in idx}
|
|
255
|
+
for x in self.as_array(columns, type_safe=True)
|
|
256
|
+
]
|
|
257
|
+
|
|
240
258
|
def as_dict_iterable(
|
|
241
259
|
self, columns: Optional[List[str]] = None
|
|
242
260
|
) -> Iterable[Dict[str, Any]]:
|
|
243
|
-
"""Convert to iterable of
|
|
261
|
+
"""Convert to iterable of python dicts
|
|
244
262
|
|
|
245
263
|
:param columns: columns to extract, defaults to None
|
|
246
|
-
:return: iterable of
|
|
264
|
+
:return: iterable of python dicts
|
|
247
265
|
|
|
248
266
|
.. note::
|
|
249
267
|
|
|
@@ -269,7 +269,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
|
|
|
269
269
|
class _ListDictParam(_LocalNoSchemaDataFrameParam):
|
|
270
270
|
@no_type_check
|
|
271
271
|
def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
|
|
272
|
-
return
|
|
272
|
+
return df.as_local().as_dicts()
|
|
273
273
|
|
|
274
274
|
@no_type_check
|
|
275
275
|
def to_output_df(
|
|
@@ -105,6 +105,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
|
|
|
105
105
|
) -> List[Any]:
|
|
106
106
|
return list(self.as_array_iterable(columns, type_safe=type_safe))
|
|
107
107
|
|
|
108
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
109
|
+
return list(self.as_dict_iterable(columns))
|
|
110
|
+
|
|
108
111
|
def as_array_iterable(
|
|
109
112
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
110
113
|
) -> Iterable[Any]:
|
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
+
import pyarrow as pa
|
|
5
|
+
from triad import assert_or_throw
|
|
4
6
|
from triad.collections.schema import Schema
|
|
5
7
|
from triad.utils.pandas_like import PD_UTILS
|
|
8
|
+
from triad.utils.pyarrow import pa_batch_to_dicts
|
|
6
9
|
|
|
7
10
|
from fugue.dataset.api import (
|
|
8
11
|
as_fugue_dataset,
|
|
@@ -17,6 +20,10 @@ from fugue.dataset.api import (
|
|
|
17
20
|
from fugue.exceptions import FugueDataFrameOperationError
|
|
18
21
|
|
|
19
22
|
from .api import (
|
|
23
|
+
as_array,
|
|
24
|
+
as_array_iterable,
|
|
25
|
+
as_dict_iterable,
|
|
26
|
+
as_dicts,
|
|
20
27
|
drop_columns,
|
|
21
28
|
get_column_names,
|
|
22
29
|
get_schema,
|
|
@@ -134,6 +141,9 @@ class PandasDataFrame(LocalBoundedDataFrame):
|
|
|
134
141
|
return self
|
|
135
142
|
return PandasDataFrame(self.native, new_schema)
|
|
136
143
|
|
|
144
|
+
def as_arrow(self, type_safe: bool = False) -> pa.Table:
|
|
145
|
+
return PD_UTILS.as_arrow(self.native, schema=self.schema.pa_schema)
|
|
146
|
+
|
|
137
147
|
def as_array(
|
|
138
148
|
self, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
139
149
|
) -> List[Any]:
|
|
@@ -150,6 +160,18 @@ class PandasDataFrame(LocalBoundedDataFrame):
|
|
|
150
160
|
):
|
|
151
161
|
yield row
|
|
152
162
|
|
|
163
|
+
def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
|
|
164
|
+
res: List[Dict[str, Any]] = []
|
|
165
|
+
for block in _to_dicts(self.native, columns, self.schema):
|
|
166
|
+
res += block
|
|
167
|
+
return res
|
|
168
|
+
|
|
169
|
+
def as_dict_iterable(
|
|
170
|
+
self, columns: Optional[List[str]] = None
|
|
171
|
+
) -> Iterable[Dict[str, Any]]:
|
|
172
|
+
for block in _to_dicts(self.native, columns, self.schema):
|
|
173
|
+
yield from block
|
|
174
|
+
|
|
153
175
|
def head(
|
|
154
176
|
self, n: int, columns: Optional[List[str]] = None
|
|
155
177
|
) -> LocalBoundedDataFrame:
|
|
@@ -272,6 +294,43 @@ def _pd_head(
|
|
|
272
294
|
return _adjust_df(df.head(n), as_fugue=as_fugue)
|
|
273
295
|
|
|
274
296
|
|
|
297
|
+
@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
|
|
298
|
+
def _pd_as_array(
|
|
299
|
+
df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
300
|
+
) -> List[Any]:
|
|
301
|
+
return list(_pd_as_array_iterable(df, columns, type_safe=type_safe))
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
|
|
305
|
+
def _pd_as_array_iterable(
|
|
306
|
+
df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
|
|
307
|
+
) -> Iterable[Any]:
|
|
308
|
+
for row in PD_UTILS.as_array_iterable(
|
|
309
|
+
df,
|
|
310
|
+
columns=columns,
|
|
311
|
+
type_safe=type_safe,
|
|
312
|
+
):
|
|
313
|
+
yield row
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
|
|
317
|
+
def _pd_as_dicts(
|
|
318
|
+
df: pd.DataFrame, columns: Optional[List[str]] = None
|
|
319
|
+
) -> List[Dict[str, Any]]:
|
|
320
|
+
res: List[Dict[str, Any]] = []
|
|
321
|
+
for block in _to_dicts(df, columns):
|
|
322
|
+
res += block
|
|
323
|
+
return res
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
|
|
327
|
+
def _pd_as_dict_iterable(
|
|
328
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
329
|
+
) -> Iterable[Dict[str, Any]]:
|
|
330
|
+
for block in _to_dicts(df, columns):
|
|
331
|
+
yield from block
|
|
332
|
+
|
|
333
|
+
|
|
275
334
|
def _adjust_df(res: pd.DataFrame, as_fugue: bool):
|
|
276
335
|
return res if not as_fugue else PandasDataFrame(res)
|
|
277
336
|
|
|
@@ -280,3 +339,17 @@ def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
|
|
|
280
339
|
missing = [x for x in columns if x not in df.columns]
|
|
281
340
|
if len(missing) > 0:
|
|
282
341
|
raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _to_dicts(
|
|
345
|
+
df: pd.DataFrame,
|
|
346
|
+
columns: Optional[List[str]] = None,
|
|
347
|
+
schema: Optional[Schema] = None,
|
|
348
|
+
) -> Iterable[List[Dict[str, Any]]]:
|
|
349
|
+
cols = list(df.columns) if columns is None else columns
|
|
350
|
+
assert_or_throw(len(cols) > 0, ValueError("columns cannot be empty"))
|
|
351
|
+
pa_schema = schema.extract(cols).pa_schema if schema is not None else None
|
|
352
|
+
adf = PD_UTILS.as_arrow(df[cols], schema=pa_schema)
|
|
353
|
+
for batch in adf.to_batches():
|
|
354
|
+
if batch.num_rows > 0:
|
|
355
|
+
yield pa_batch_to_dicts(batch)
|
fugue/dataframe/utils.py
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import pickle
|
|
3
|
-
from typing import Any, Iterable, Optional, Tuple
|
|
3
|
+
from typing import Any, Iterable, Optional, Tuple, List, Dict
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import pyarrow as pa
|
|
7
7
|
from fs import open_fs
|
|
8
|
-
from triad import FileSystem, Schema
|
|
8
|
+
from triad import FileSystem, Schema, assert_or_throw
|
|
9
9
|
from triad.collections.schema import SchemaError
|
|
10
10
|
from triad.exceptions import InvalidOperationError
|
|
11
11
|
from triad.utils.assertion import assert_arg_not_none
|
|
12
12
|
from triad.utils.assertion import assert_or_throw as aot
|
|
13
|
+
from triad.utils.pyarrow import pa_batch_to_dicts
|
|
13
14
|
|
|
14
15
|
from .api import as_fugue_df, get_column_names, normalize_column_names, rename
|
|
15
16
|
from .dataframe import DataFrame, LocalBoundedDataFrame
|
|
@@ -82,17 +83,19 @@ def _df_eq(
|
|
|
82
83
|
), f"schema mismatch {df.schema.pa_schema}, {df2.schema.pa_schema}"
|
|
83
84
|
if not check_content:
|
|
84
85
|
return True
|
|
86
|
+
cols: Any = df1.columns
|
|
85
87
|
if no_pandas:
|
|
86
88
|
dd1 = [[x.__repr__()] for x in df1.as_array_iterable(type_safe=True)]
|
|
87
89
|
dd2 = [[x.__repr__()] for x in df2.as_array_iterable(type_safe=True)]
|
|
88
90
|
d1 = pd.DataFrame(dd1, columns=["data"])
|
|
89
91
|
d2 = pd.DataFrame(dd2, columns=["data"])
|
|
92
|
+
cols = ["data"]
|
|
90
93
|
else:
|
|
91
94
|
d1 = df1.as_pandas()
|
|
92
95
|
d2 = df2.as_pandas()
|
|
93
96
|
if not check_order:
|
|
94
|
-
d1 = d1.sort_values(
|
|
95
|
-
d2 = d2.sort_values(
|
|
97
|
+
d1 = d1.sort_values(cols)
|
|
98
|
+
d2 = d2.sort_values(cols)
|
|
96
99
|
d1 = d1.reset_index(drop=True)
|
|
97
100
|
d2 = d2.reset_index(drop=True)
|
|
98
101
|
pd.testing.assert_frame_equal(
|
|
@@ -248,3 +251,68 @@ def get_join_schemas(
|
|
|
248
251
|
else:
|
|
249
252
|
aot(len(on) > 0, SchemaError("join on columns must be specified"))
|
|
250
253
|
return cm, (df1.schema.union(schema2))
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def pa_table_as_array_iterable(
|
|
257
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
258
|
+
) -> Iterable[List[List[Any]]]:
|
|
259
|
+
"""Convert a pyarrow table to an iterable of lists
|
|
260
|
+
|
|
261
|
+
:param df: pyarrow table
|
|
262
|
+
:param columns: if not None, only these columns will be returned, defaults to None
|
|
263
|
+
:return: an iterable of lists
|
|
264
|
+
"""
|
|
265
|
+
assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
|
|
266
|
+
_df = df if columns is None or len(columns) == 0 else df.select(columns)
|
|
267
|
+
for batch in _df.to_batches():
|
|
268
|
+
for x in zip(*batch.to_pydict().values()):
|
|
269
|
+
yield list(x)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def pa_table_as_array(
|
|
273
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
274
|
+
) -> List[List[List[Any]]]:
|
|
275
|
+
"""Convert a pyarrow table to a list of lists
|
|
276
|
+
|
|
277
|
+
:param df: pyarrow table
|
|
278
|
+
:param columns: if not None, only these columns will be returned, defaults to None
|
|
279
|
+
:return: a list of lists
|
|
280
|
+
"""
|
|
281
|
+
return list(pa_table_as_array_iterable(df, columns=columns))
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def pa_table_as_dict_iterable(
|
|
285
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
286
|
+
) -> Iterable[Dict[str, Any]]:
|
|
287
|
+
"""Convert a pyarrow table to an iterable of dicts
|
|
288
|
+
|
|
289
|
+
:param df: pyarrow table
|
|
290
|
+
:param columns: if not None, only these columns will be returned, defaults to None
|
|
291
|
+
:return: an iterable of dicts
|
|
292
|
+
"""
|
|
293
|
+
for ck in _pa_table_as_dicts_chunks(df, columns=columns):
|
|
294
|
+
yield from ck
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def pa_table_as_dicts(
|
|
298
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
299
|
+
) -> List[Dict[str, Any]]:
|
|
300
|
+
"""Convert a pyarrow table to a list of dicts
|
|
301
|
+
|
|
302
|
+
:param df: pyarrow table
|
|
303
|
+
:param columns: if not None, only these columns will be returned, defaults to None
|
|
304
|
+
:return: a list of dicts
|
|
305
|
+
"""
|
|
306
|
+
res: List[Dict[str, Any]] = []
|
|
307
|
+
for ck in _pa_table_as_dicts_chunks(df, columns=columns):
|
|
308
|
+
res += ck
|
|
309
|
+
return res
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _pa_table_as_dicts_chunks(
|
|
313
|
+
df: pa.Table, columns: Optional[List[str]] = None
|
|
314
|
+
) -> Iterable[List[Dict[str, Any]]]:
|
|
315
|
+
assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
|
|
316
|
+
_df = df if columns is None or len(columns) == 0 else df.select(columns)
|
|
317
|
+
for batch in _df.to_batches():
|
|
318
|
+
yield pa_batch_to_dicts(batch)
|
|
@@ -1323,7 +1323,7 @@ class _Comap:
|
|
|
1323
1323
|
self._on_init(partition_no, empty_dfs)
|
|
1324
1324
|
|
|
1325
1325
|
def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
|
|
1326
|
-
data =
|
|
1326
|
+
data = df.as_dicts()
|
|
1327
1327
|
if self.how == "inner":
|
|
1328
1328
|
if len(data) < self.dfs_count:
|
|
1329
1329
|
return ArrayDataFrame([], self.output_schema)
|
|
@@ -143,7 +143,7 @@ class PandasMapEngine(MapEngine):
|
|
|
143
143
|
if (
|
|
144
144
|
isinstance(output_df, PandasDataFrame)
|
|
145
145
|
and output_df.schema != output_schema
|
|
146
|
-
):
|
|
146
|
+
): # pragma: no cover
|
|
147
147
|
output_df = PandasDataFrame(output_df.native, output_schema)
|
|
148
148
|
assert_or_throw(
|
|
149
149
|
output_df.schema == output_schema,
|
fugue/plugins.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: fugue
|
|
3
|
-
Version: 0.8.7.
|
|
3
|
+
Version: 0.8.7.dev6
|
|
4
4
|
Summary: An abstraction layer for distributed computation
|
|
5
5
|
Home-page: http://github.com/fugue-project/fugue
|
|
6
6
|
Author: The Fugue Development Team
|
|
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.8
|
|
21
21
|
Description-Content-Type: text/markdown
|
|
22
|
-
Requires-Dist: triad ==0.9.2.
|
|
22
|
+
Requires-Dist: triad ==0.9.2.dev5
|
|
23
23
|
Requires-Dist: adagio >=0.2.4
|
|
24
24
|
Requires-Dist: qpd >=0.4.4
|
|
25
25
|
Requires-Dist: fugue-sql-antlr >=0.1.6
|
|
@@ -32,7 +32,7 @@ Requires-Dist: fugue-sql-antlr[cpp] >=0.1.6 ; extra == 'all'
|
|
|
32
32
|
Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
|
|
33
33
|
Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
|
|
34
34
|
Requires-Dist: dask-sql ; extra == 'all'
|
|
35
|
-
Requires-Dist: ray[data] >=2.
|
|
35
|
+
Requires-Dist: ray[data] >=2.4.0 ; extra == 'all'
|
|
36
36
|
Requires-Dist: notebook ; extra == 'all'
|
|
37
37
|
Requires-Dist: jupyterlab ; extra == 'all'
|
|
38
38
|
Requires-Dist: ipython >=7.10.0 ; extra == 'all'
|
|
@@ -59,7 +59,7 @@ Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
|
|
|
59
59
|
Provides-Extra: polars
|
|
60
60
|
Requires-Dist: polars ; extra == 'polars'
|
|
61
61
|
Provides-Extra: ray
|
|
62
|
-
Requires-Dist: ray[data] >=2.
|
|
62
|
+
Requires-Dist: ray[data] >=2.4.0 ; extra == 'ray'
|
|
63
63
|
Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
|
|
64
64
|
Requires-Dist: pyarrow >=6.0.1 ; extra == 'ray'
|
|
65
65
|
Provides-Extra: spark
|
|
@@ -323,6 +323,7 @@ Feel free to message us on [Slack](http://slack.fugue.ai). We also have [contrib
|
|
|
323
323
|
|
|
324
324
|
* [How LyftLearn Democratizes Distributed Compute through Kubernetes Spark and Fugue](https://eng.lyft.com/how-lyftlearn-democratizes-distributed-compute-through-kubernetes-spark-and-fugue-c0875b97c3d9)
|
|
325
325
|
* [Clobotics - Large Scale Image Processing with Spark through Fugue](https://medium.com/fugue-project/large-scale-image-processing-with-spark-through-fugue-e510b9813da8)
|
|
326
|
+
* [Architecture for a data lake REST API using Delta Lake, Fugue & Spark (article by bitsofinfo)](https://bitsofinfo.wordpress.com/2023/08/14/data-lake-rest-api-delta-lake-fugue-spark)
|
|
326
327
|
|
|
327
328
|
### Mentioned Uses
|
|
328
329
|
|