fugue 0.8.7.dev4__py3-none-any.whl → 0.8.7.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fugue/api.py CHANGED
@@ -6,6 +6,7 @@ from .dataframe.api import (
6
6
  as_array_iterable,
7
7
  as_arrow,
8
8
  as_dict_iterable,
9
+ as_dicts,
9
10
  as_fugue_df,
10
11
  as_pandas,
11
12
  drop_columns,
fugue/dataframe/api.py CHANGED
@@ -11,12 +11,12 @@ from .dataframe import AnyDataFrame, DataFrame, as_fugue_df
11
11
 
12
12
  @fugue_plugin
13
13
  def is_df(df: Any) -> bool:
14
- """Whether ``df`` is a DataFrame like object"""
14
+ """Whether the input object is any type of DataFrame"""
15
15
  return isinstance(df, DataFrame)
16
16
 
17
17
 
18
18
  def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame:
19
- """Return the dataframe form of the input ``df``.
19
+ """Return the dataframe form of any dataframe.
20
20
  If ``df`` is a :class:`~.DataFrame`, then call the
21
21
  :meth:`~.DataFrame.native_as_df`, otherwise, it depends on whether there is
22
22
  a correspondent function handling it.
@@ -30,30 +30,49 @@ def get_native_as_df(df: AnyDataFrame) -> AnyDataFrame:
30
30
 
31
31
  @fugue_plugin
32
32
  def get_schema(df: AnyDataFrame) -> Schema:
33
- """Get the schema of the ``df``
33
+ """The generic function to get the schema of any dataframe
34
34
 
35
35
  :param df: the object that can be recognized as a dataframe by Fugue
36
36
  :return: the Schema object
37
+
38
+ .. admonition:: Examples
39
+
40
+ .. code-block:: python
41
+
42
+ import fugue.api as fa
43
+ import pandas as pd
44
+
45
+ df = pd.DataFrame([[0,1],[2,3]], columns=["a","b"])
46
+ fa.get_schema(df) # == Schema("a:long,b:long")
47
+
48
+ .. related_topics
49
+ How to get schema of any dataframe using Fugue?
37
50
  """
38
51
  return as_fugue_df(df).schema
39
52
 
40
53
 
41
54
  @fugue_plugin
42
55
  def as_pandas(df: AnyDataFrame) -> pd.DataFrame:
43
- """Convert ``df`` to a Pandas DataFrame
56
+ """The generic function to convert any dataframe to a Pandas DataFrame
44
57
 
45
58
  :param df: the object that can be recognized as a dataframe by Fugue
46
59
  :return: the Pandas DataFrame
60
+
61
+ .. related_topics
62
+ How to convert any dataframe to a pandas dataframe?
47
63
  """
48
64
  return as_fugue_df(df).as_pandas()
49
65
 
50
66
 
51
67
  @fugue_plugin
52
68
  def as_arrow(df: AnyDataFrame) -> pa.Table:
53
- """Convert ``df`` to a PyArrow Table
69
+ """The generic function to convert any dataframe to a PyArrow Table
54
70
 
55
71
  :param df: the object that can be recognized as a dataframe by Fugue
56
72
  :return: the PyArrow Table
73
+
74
+ .. related_topics
75
+ How to convert any dataframe to a pyarrow dataframe?
57
76
  """
58
77
  return as_fugue_df(df).as_arrow()
59
78
 
@@ -62,7 +81,7 @@ def as_arrow(df: AnyDataFrame) -> pa.Table:
62
81
  def as_array(
63
82
  df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
64
83
  ) -> List[Any]: # pragma: no cover
65
- """Convert df to 2-dimensional native python array
84
+ """The generic function to convert any dataframe to a 2-dimensional python array
66
85
 
67
86
  :param df: the object that can be recognized as a dataframe by Fugue
68
87
  :param columns: columns to extract, defaults to None
@@ -81,7 +100,7 @@ def as_array(
81
100
  def as_array_iterable(
82
101
  df: AnyDataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
83
102
  ) -> Iterable[Any]: # pragma: no cover
84
- """Convert df to iterable of native python arrays
103
+ """The generic function to convert any dataframe to iterable of python arrays
85
104
 
86
105
  :param df: the object that can be recognized as a dataframe by Fugue
87
106
  :param columns: columns to extract, defaults to None
@@ -97,15 +116,32 @@ def as_array_iterable(
97
116
  return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe)
98
117
 
99
118
 
119
+ @fugue_plugin
120
+ def as_dicts(
121
+ df: AnyDataFrame, columns: Optional[List[str]] = None
122
+ ) -> List[Dict[str, Any]]:
123
+ """Convert any dataframe to a list of python dicts
124
+
125
+ :param df: the object that can be recognized as a dataframe by Fugue
126
+ :param columns: columns to extract, defaults to None
127
+ :return: a list of python dicts
128
+
129
+ .. note::
130
+
131
+ The default implementation enforces ``type_safe`` True
132
+ """
133
+ return as_fugue_df(df).as_dicts(columns=columns)
134
+
135
+
100
136
  @fugue_plugin
101
137
  def as_dict_iterable(
102
138
  df: AnyDataFrame, columns: Optional[List[str]] = None
103
139
  ) -> Iterable[Dict[str, Any]]:
104
- """Convert df to iterable of native python dicts
140
+ """Convert any dataframe to iterable of python dicts
105
141
 
106
142
  :param df: the object that can be recognized as a dataframe by Fugue
107
143
  :param columns: columns to extract, defaults to None
108
- :return: iterable of native python dicts
144
+ :return: iterable of python dicts
109
145
 
110
146
  .. note::
111
147
 
@@ -116,7 +152,7 @@ def as_dict_iterable(
116
152
 
117
153
  @fugue_plugin
118
154
  def peek_array(df: AnyDataFrame) -> List[Any]:
119
- """Peek the first row of the dataframe as an array
155
+ """Peek the first row of any dataframe as an array
120
156
 
121
157
  :param df: the object that can be recognized as a dataframe by Fugue
122
158
  :return: the first row as an array
@@ -126,7 +162,7 @@ def peek_array(df: AnyDataFrame) -> List[Any]:
126
162
 
127
163
  @fugue_plugin
128
164
  def peek_dict(df: AnyDataFrame) -> Dict[str, Any]:
129
- """Peek the first row of the dataframe as a array
165
+ """Peek the first row of any dataframe as a dict
130
166
 
131
167
  :param df: the object that can be recognized as a dataframe by Fugue
132
168
  :return: the first row as a dict
@@ -141,7 +177,7 @@ def head(
141
177
  columns: Optional[List[str]] = None,
142
178
  as_fugue: bool = False,
143
179
  ) -> AnyDataFrame:
144
- """Get first n rows of the dataframe as a new local bounded dataframe
180
+ """Get first n rows of any dataframe as a new local bounded dataframe
145
181
 
146
182
  :param n: number of rows
147
183
  :param columns: selected columns, defaults to None (all columns)
@@ -160,7 +196,7 @@ def head(
160
196
  def alter_columns(
161
197
  df: AnyDataFrame, columns: Any, as_fugue: bool = False
162
198
  ) -> AnyDataFrame:
163
- """Change column types
199
+ """Change column data types of any dataframe
164
200
 
165
201
  :param df: the object that can be recognized as a dataframe by Fugue
166
202
  :param columns: |SchemaLikeObject|,
@@ -178,7 +214,7 @@ def alter_columns(
178
214
  def drop_columns(
179
215
  df: AnyDataFrame, columns: List[str], as_fugue: bool = False
180
216
  ) -> AnyDataFrame:
181
- """Drop certain columns and return a new dataframe
217
+ """Drop certain columns of any dataframe
182
218
 
183
219
  :param df: the object that can be recognized as a dataframe by Fugue
184
220
  :param columns: columns to drop
@@ -194,7 +230,7 @@ def drop_columns(
194
230
  def select_columns(
195
231
  df: AnyDataFrame, columns: List[Any], as_fugue: bool = False
196
232
  ) -> AnyDataFrame:
197
- """Select certain columns and return a new dataframe
233
+ """Select certain columns of any dataframe and return a new dataframe
198
234
 
199
235
  :param df: the object that can be recognized as a dataframe by Fugue
200
236
  :param columns: columns to return
@@ -21,6 +21,10 @@ from fugue.exceptions import FugueDataFrameOperationError
21
21
 
22
22
  from .api import (
23
23
  alter_columns,
24
+ as_array,
25
+ as_array_iterable,
26
+ as_dict_iterable,
27
+ as_dicts,
24
28
  as_pandas,
25
29
  drop_columns,
26
30
  get_column_names,
@@ -30,6 +34,12 @@ from .api import (
30
34
  select_columns,
31
35
  )
32
36
  from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
37
+ from .utils import (
38
+ pa_table_as_array,
39
+ pa_table_as_array_iterable,
40
+ pa_table_as_dict_iterable,
41
+ pa_table_as_dicts,
42
+ )
33
43
 
34
44
 
35
45
  class ArrowDataFrame(LocalBoundedDataFrame):
@@ -174,21 +184,20 @@ class ArrowDataFrame(LocalBoundedDataFrame):
174
184
  def as_array(
175
185
  self, columns: Optional[List[str]] = None, type_safe: bool = False
176
186
  ) -> List[Any]:
177
- return list(self.as_array_iterable(columns, type_safe=type_safe))
187
+ return pa_table_as_array(self.native, columns=columns)
188
+
189
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
190
+ return pa_table_as_dicts(self.native, columns=columns)
178
191
 
179
192
  def as_array_iterable(
180
193
  self, columns: Optional[List[str]] = None, type_safe: bool = False
181
194
  ) -> Iterable[Any]:
182
- if self.empty:
183
- return
184
- if columns is not None:
185
- for x in self[columns].as_array_iterable(type_safe=type_safe):
186
- yield x
187
- else:
188
- d = self.native.to_pydict()
189
- cols = [d[n] for n in self.columns]
190
- for arr in zip(*cols):
191
- yield list(arr)
195
+ yield from pa_table_as_array_iterable(self.native, columns=columns)
196
+
197
+ def as_dict_iterable(
198
+ self, columns: Optional[List[str]] = None
199
+ ) -> Iterable[Dict[str, Any]]:
200
+ yield from pa_table_as_dict_iterable(self.native, columns=columns)
192
201
 
193
202
 
194
203
  @as_local.candidate(lambda df: isinstance(df, pa.Table))
@@ -212,6 +221,34 @@ def _pa_table_as_pandas(df: pa.Table) -> pd.DataFrame:
212
221
  )
213
222
 
214
223
 
224
+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
225
+ def _pa_table_as_array(
226
+ df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
227
+ ) -> List[Any]:
228
+ return pa_table_as_array(df, columns=columns)
229
+
230
+
231
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
232
+ def _pa_table_as_array_iterable(
233
+ df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
234
+ ) -> Iterable[Any]:
235
+ yield from pa_table_as_array_iterable(df, columns=columns)
236
+
237
+
238
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
239
+ def _pa_table_as_dicts(
240
+ df: pa.Table, columns: Optional[List[str]] = None
241
+ ) -> List[Dict[str, Any]]:
242
+ return pa_table_as_dicts(df, columns=columns)
243
+
244
+
245
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
246
+ def _pa_table_as_dict_iterable(
247
+ df: pa.Table, columns: Optional[List[str]] = None
248
+ ) -> Iterable[Dict[str, Any]]:
249
+ yield from pa_table_as_dict_iterable(df, columns=columns)
250
+
251
+
215
252
  @alter_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
216
253
  def _pa_table_alter_columns(
217
254
  df: pa.Table, columns: Any, as_fugue: bool = False
@@ -237,13 +237,31 @@ class DataFrame(Dataset):
237
237
  """
238
238
  raise NotImplementedError
239
239
 
240
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
241
+ """Convert to a list of python dicts
242
+
243
+ :param columns: columns to extract, defaults to None
244
+ :return: a list of python dicts
245
+
246
+ .. note::
247
+
248
+ The default implementation enforces ``type_safe`` True
249
+ """
250
+ if columns is None:
251
+ columns = self.columns
252
+ idx = range(len(columns))
253
+ return [
254
+ {columns[i]: x[i] for i in idx}
255
+ for x in self.as_array(columns, type_safe=True)
256
+ ]
257
+
240
258
  def as_dict_iterable(
241
259
  self, columns: Optional[List[str]] = None
242
260
  ) -> Iterable[Dict[str, Any]]:
243
- """Convert to iterable of native python dicts
261
+ """Convert to iterable of python dicts
244
262
 
245
263
  :param columns: columns to extract, defaults to None
246
- :return: iterable of native python dicts
264
+ :return: iterable of python dicts
247
265
 
248
266
  .. note::
249
267
 
@@ -269,7 +269,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
269
269
  class _ListDictParam(_LocalNoSchemaDataFrameParam):
270
270
  @no_type_check
271
271
  def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
272
- return list(df.as_local().as_dict_iterable())
272
+ return df.as_local().as_dicts()
273
273
 
274
274
  @no_type_check
275
275
  def to_output_df(
@@ -105,6 +105,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
105
105
  ) -> List[Any]:
106
106
  return list(self.as_array_iterable(columns, type_safe=type_safe))
107
107
 
108
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
109
+ return list(self.as_dict_iterable(columns))
110
+
108
111
  def as_array_iterable(
109
112
  self, columns: Optional[List[str]] = None, type_safe: bool = False
110
113
  ) -> Iterable[Any]:
@@ -1,8 +1,11 @@
1
1
  from typing import Any, Dict, Iterable, List, Optional, Tuple
2
2
 
3
3
  import pandas as pd
4
+ import pyarrow as pa
5
+ from triad import assert_or_throw
4
6
  from triad.collections.schema import Schema
5
7
  from triad.utils.pandas_like import PD_UTILS
8
+ from triad.utils.pyarrow import pa_batch_to_dicts
6
9
 
7
10
  from fugue.dataset.api import (
8
11
  as_fugue_dataset,
@@ -17,6 +20,10 @@ from fugue.dataset.api import (
17
20
  from fugue.exceptions import FugueDataFrameOperationError
18
21
 
19
22
  from .api import (
23
+ as_array,
24
+ as_array_iterable,
25
+ as_dict_iterable,
26
+ as_dicts,
20
27
  drop_columns,
21
28
  get_column_names,
22
29
  get_schema,
@@ -134,6 +141,9 @@ class PandasDataFrame(LocalBoundedDataFrame):
134
141
  return self
135
142
  return PandasDataFrame(self.native, new_schema)
136
143
 
144
+ def as_arrow(self, type_safe: bool = False) -> pa.Table:
145
+ return PD_UTILS.as_arrow(self.native, schema=self.schema.pa_schema)
146
+
137
147
  def as_array(
138
148
  self, columns: Optional[List[str]] = None, type_safe: bool = False
139
149
  ) -> List[Any]:
@@ -150,6 +160,18 @@ class PandasDataFrame(LocalBoundedDataFrame):
150
160
  ):
151
161
  yield row
152
162
 
163
+ def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
164
+ res: List[Dict[str, Any]] = []
165
+ for block in _to_dicts(self.native, columns, self.schema):
166
+ res += block
167
+ return res
168
+
169
+ def as_dict_iterable(
170
+ self, columns: Optional[List[str]] = None
171
+ ) -> Iterable[Dict[str, Any]]:
172
+ for block in _to_dicts(self.native, columns, self.schema):
173
+ yield from block
174
+
153
175
  def head(
154
176
  self, n: int, columns: Optional[List[str]] = None
155
177
  ) -> LocalBoundedDataFrame:
@@ -272,6 +294,43 @@ def _pd_head(
272
294
  return _adjust_df(df.head(n), as_fugue=as_fugue)
273
295
 
274
296
 
297
+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
298
+ def _pd_as_array(
299
+ df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
300
+ ) -> List[Any]:
301
+ return list(_pd_as_array_iterable(df, columns, type_safe=type_safe))
302
+
303
+
304
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
305
+ def _pd_as_array_iterable(
306
+ df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
307
+ ) -> Iterable[Any]:
308
+ for row in PD_UTILS.as_array_iterable(
309
+ df,
310
+ columns=columns,
311
+ type_safe=type_safe,
312
+ ):
313
+ yield row
314
+
315
+
316
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
317
+ def _pd_as_dicts(
318
+ df: pd.DataFrame, columns: Optional[List[str]] = None
319
+ ) -> List[Dict[str, Any]]:
320
+ res: List[Dict[str, Any]] = []
321
+ for block in _to_dicts(df, columns):
322
+ res += block
323
+ return res
324
+
325
+
326
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
327
+ def _pd_as_dict_iterable(
328
+ df: pa.Table, columns: Optional[List[str]] = None
329
+ ) -> Iterable[Dict[str, Any]]:
330
+ for block in _to_dicts(df, columns):
331
+ yield from block
332
+
333
+
275
334
  def _adjust_df(res: pd.DataFrame, as_fugue: bool):
276
335
  return res if not as_fugue else PandasDataFrame(res)
277
336
 
@@ -280,3 +339,17 @@ def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
280
339
  missing = [x for x in columns if x not in df.columns]
281
340
  if len(missing) > 0:
282
341
  raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
342
+
343
+
344
+ def _to_dicts(
345
+ df: pd.DataFrame,
346
+ columns: Optional[List[str]] = None,
347
+ schema: Optional[Schema] = None,
348
+ ) -> Iterable[List[Dict[str, Any]]]:
349
+ cols = list(df.columns) if columns is None else columns
350
+ assert_or_throw(len(cols) > 0, ValueError("columns cannot be empty"))
351
+ pa_schema = schema.extract(cols).pa_schema if schema is not None else None
352
+ adf = PD_UTILS.as_arrow(df[cols], schema=pa_schema)
353
+ for batch in adf.to_batches():
354
+ if batch.num_rows > 0:
355
+ yield pa_batch_to_dicts(batch)
fugue/dataframe/utils.py CHANGED
@@ -1,15 +1,16 @@
1
1
  import os
2
2
  import pickle
3
- from typing import Any, Iterable, Optional, Tuple
3
+ from typing import Any, Iterable, Optional, Tuple, List, Dict
4
4
 
5
5
  import pandas as pd
6
6
  import pyarrow as pa
7
7
  from fs import open_fs
8
- from triad import FileSystem, Schema
8
+ from triad import FileSystem, Schema, assert_or_throw
9
9
  from triad.collections.schema import SchemaError
10
10
  from triad.exceptions import InvalidOperationError
11
11
  from triad.utils.assertion import assert_arg_not_none
12
12
  from triad.utils.assertion import assert_or_throw as aot
13
+ from triad.utils.pyarrow import pa_batch_to_dicts
13
14
 
14
15
  from .api import as_fugue_df, get_column_names, normalize_column_names, rename
15
16
  from .dataframe import DataFrame, LocalBoundedDataFrame
@@ -82,17 +83,19 @@ def _df_eq(
82
83
  ), f"schema mismatch {df.schema.pa_schema}, {df2.schema.pa_schema}"
83
84
  if not check_content:
84
85
  return True
86
+ cols: Any = df1.columns
85
87
  if no_pandas:
86
88
  dd1 = [[x.__repr__()] for x in df1.as_array_iterable(type_safe=True)]
87
89
  dd2 = [[x.__repr__()] for x in df2.as_array_iterable(type_safe=True)]
88
90
  d1 = pd.DataFrame(dd1, columns=["data"])
89
91
  d2 = pd.DataFrame(dd2, columns=["data"])
92
+ cols = ["data"]
90
93
  else:
91
94
  d1 = df1.as_pandas()
92
95
  d2 = df2.as_pandas()
93
96
  if not check_order:
94
- d1 = d1.sort_values(df1.columns)
95
- d2 = d2.sort_values(df1.columns)
97
+ d1 = d1.sort_values(cols)
98
+ d2 = d2.sort_values(cols)
96
99
  d1 = d1.reset_index(drop=True)
97
100
  d2 = d2.reset_index(drop=True)
98
101
  pd.testing.assert_frame_equal(
@@ -248,3 +251,68 @@ def get_join_schemas(
248
251
  else:
249
252
  aot(len(on) > 0, SchemaError("join on columns must be specified"))
250
253
  return cm, (df1.schema.union(schema2))
254
+
255
+
256
+ def pa_table_as_array_iterable(
257
+ df: pa.Table, columns: Optional[List[str]] = None
258
+ ) -> Iterable[List[List[Any]]]:
259
+ """Convert a pyarrow table to an iterable of list
260
+
261
+ :param df: pyarrow table
262
+ :param columns: if not None, only these columns will be returned, defaults to None
263
+ :return: an iterable of list
264
+ """
265
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
266
+ _df = df if columns is None or len(columns) == 0 else df.select(columns)
267
+ for batch in _df.to_batches():
268
+ for x in zip(*batch.to_pydict().values()):
269
+ yield list(x)
270
+
271
+
272
+ def pa_table_as_array(
273
+ df: pa.Table, columns: Optional[List[str]] = None
274
+ ) -> List[List[List[Any]]]:
275
+ """Convert a pyarrow table to a list of list
276
+
277
+ :param df: pyarrow table
278
+ :param columns: if not None, only these columns will be returned, defaults to None
279
+ :return: a list of list
280
+ """
281
+ return list(pa_table_as_array_iterable(df, columns=columns))
282
+
283
+
284
+ def pa_table_as_dict_iterable(
285
+ df: pa.Table, columns: Optional[List[str]] = None
286
+ ) -> Iterable[Dict[str, Any]]:
287
+ """Convert a pyarrow table to an iterable of dict
288
+
289
+ :param df: pyarrow table
290
+ :param columns: if not None, only these columns will be returned, defaults to None
291
+ :return: an iterable of dict
292
+ """
293
+ for ck in _pa_table_as_dicts_chunks(df, columns=columns):
294
+ yield from ck
295
+
296
+
297
+ def pa_table_as_dicts(
298
+ df: pa.Table, columns: Optional[List[str]] = None
299
+ ) -> List[Dict[str, Any]]:
300
+ """Convert a pyarrow table to a list of dict
301
+
302
+ :param df: pyarrow table
303
+ :param columns: if not None, only these columns will be returned, defaults to None
304
+ :return: a list of dict
305
+ """
306
+ res: List[Dict[str, Any]] = []
307
+ for ck in _pa_table_as_dicts_chunks(df, columns=columns):
308
+ res += ck
309
+ return res
310
+
311
+
312
+ def _pa_table_as_dicts_chunks(
313
+ df: pa.Table, columns: Optional[List[str]] = None
314
+ ) -> Iterable[List[Dict[str, Any]]]:
315
+ assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
316
+ _df = df if columns is None or len(columns) == 0 else df.select(columns)
317
+ for batch in _df.to_batches():
318
+ yield pa_batch_to_dicts(batch)
@@ -1323,7 +1323,7 @@ class _Comap:
1323
1323
  self._on_init(partition_no, empty_dfs)
1324
1324
 
1325
1325
  def run(self, cursor: PartitionCursor, df: LocalDataFrame) -> LocalDataFrame:
1326
- data = list(df.as_dict_iterable())
1326
+ data = df.as_dicts()
1327
1327
  if self.how == "inner":
1328
1328
  if len(data) < self.dfs_count:
1329
1329
  return ArrayDataFrame([], self.output_schema)
@@ -143,7 +143,7 @@ class PandasMapEngine(MapEngine):
143
143
  if (
144
144
  isinstance(output_df, PandasDataFrame)
145
145
  and output_df.schema != output_schema
146
- ):
146
+ ): # pragma: no cover
147
147
  output_df = PandasDataFrame(output_df.native, output_schema)
148
148
  assert_or_throw(
149
149
  output_df.schema == output_schema,
fugue/plugins.py CHANGED
@@ -7,6 +7,7 @@ from fugue.dataframe import (
7
7
  as_array_iterable,
8
8
  as_arrow,
9
9
  as_dict_iterable,
10
+ as_dicts,
10
11
  as_pandas,
11
12
  drop_columns,
12
13
  fugue_annotated_param,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fugue
3
- Version: 0.8.7.dev4
3
+ Version: 0.8.7.dev6
4
4
  Summary: An abstraction layer for distributed computation
5
5
  Home-page: http://github.com/fugue-project/fugue
6
6
  Author: The Fugue Development Team
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.8
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: triad ==0.9.2.dev3
22
+ Requires-Dist: triad ==0.9.2.dev5
23
23
  Requires-Dist: adagio >=0.2.4
24
24
  Requires-Dist: qpd >=0.4.4
25
25
  Requires-Dist: fugue-sql-antlr >=0.1.6
@@ -32,7 +32,7 @@ Requires-Dist: fugue-sql-antlr[cpp] >=0.1.6 ; extra == 'all'
32
32
  Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
33
33
  Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
34
34
  Requires-Dist: dask-sql ; extra == 'all'
35
- Requires-Dist: ray[data] >=2.1.0 ; extra == 'all'
35
+ Requires-Dist: ray[data] >=2.4.0 ; extra == 'all'
36
36
  Requires-Dist: notebook ; extra == 'all'
37
37
  Requires-Dist: jupyterlab ; extra == 'all'
38
38
  Requires-Dist: ipython >=7.10.0 ; extra == 'all'
@@ -59,7 +59,7 @@ Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
59
59
  Provides-Extra: polars
60
60
  Requires-Dist: polars ; extra == 'polars'
61
61
  Provides-Extra: ray
62
- Requires-Dist: ray[data] >=2.1.0 ; extra == 'ray'
62
+ Requires-Dist: ray[data] >=2.4.0 ; extra == 'ray'
63
63
  Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
64
64
  Requires-Dist: pyarrow >=6.0.1 ; extra == 'ray'
65
65
  Provides-Extra: spark
@@ -323,6 +323,7 @@ Feel free to message us on [Slack](http://slack.fugue.ai). We also have [contrib
323
323
 
324
324
  * [How LyftLearn Democratizes Distributed Compute through Kubernetes Spark and Fugue](https://eng.lyft.com/how-lyftlearn-democratizes-distributed-compute-through-kubernetes-spark-and-fugue-c0875b97c3d9)
325
325
  * [Clobotics - Large Scale Image Processing with Spark through Fugue](https://medium.com/fugue-project/large-scale-image-processing-with-spark-through-fugue-e510b9813da8)
326
+ * [Architecture for a data lake REST API using Delta Lake, Fugue & Spark (article by bitsofinfo)](https://bitsofinfo.wordpress.com/2023/08/14/data-lake-rest-api-delta-lake-fugue-spark)
326
327
 
327
328
  ### Mentioned Uses
328
329