fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. fugue/__init__.py +0 -1
  2. fugue/_utils/io.py +84 -89
  3. fugue/api.py +1 -0
  4. fugue/dataframe/api.py +19 -2
  5. fugue/dataframe/arrow_dataframe.py +48 -11
  6. fugue/dataframe/dataframe.py +20 -2
  7. fugue/dataframe/function_wrapper.py +1 -1
  8. fugue/dataframe/iterable_dataframe.py +3 -0
  9. fugue/dataframe/pandas_dataframe.py +73 -0
  10. fugue/dataframe/utils.py +78 -25
  11. fugue/execution/execution_engine.py +1 -8
  12. fugue/execution/native_execution_engine.py +5 -11
  13. fugue/plugins.py +1 -0
  14. fugue/workflow/_checkpoint.py +9 -9
  15. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
  16. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
  17. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
  18. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
  19. fugue_dask/_io.py +22 -29
  20. fugue_dask/_utils.py +15 -2
  21. fugue_dask/dataframe.py +105 -18
  22. fugue_dask/execution_engine.py +5 -12
  23. fugue_duckdb/_io.py +21 -37
  24. fugue_duckdb/dataframe.py +87 -29
  25. fugue_duckdb/execution_engine.py +2 -7
  26. fugue_ibis/dataframe.py +13 -0
  27. fugue_ibis/execution_engine.py +1 -5
  28. fugue_polars/polars_dataframe.py +53 -16
  29. fugue_ray/_utils/io.py +15 -17
  30. fugue_ray/dataframe.py +71 -19
  31. fugue_spark/_utils/io.py +3 -5
  32. fugue_spark/dataframe.py +69 -13
  33. fugue_spark/execution_engine.py +2 -7
  34. fugue_test/builtin_suite.py +12 -12
  35. fugue_test/dataframe_suite.py +14 -0
  36. fugue_test/execution_suite.py +13 -18
  37. fugue_test/plugins/misc/__init__.py +2 -0
  38. fugue_test/plugins/misc/fixtures.py +18 -0
  39. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
  40. {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue/__init__.py CHANGED
@@ -1,6 +1,5 @@
  # flake8: noqa
  from triad.collections import Schema
- from triad.collections.fs import FileSystem

  from fugue.api import out_transform, transform
  from fugue.bag.array_bag import ArrayBag
fugue/_utils/io.py CHANGED
@@ -1,14 +1,14 @@
  import os
  import pathlib
  from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
- from urllib.parse import urlparse

- import fs as pfs
  import pandas as pd
+ from fsspec import AbstractFileSystem
+ from fsspec.implementations.local import LocalFileSystem
  from triad.collections.dict import ParamDict
- from triad.collections.fs import FileSystem
  from triad.collections.schema import Schema
  from triad.utils.assertion import assert_or_throw
+ from triad.utils.io import join, url_to_fs
  from triad.utils.pandas_like import PD_UTILS

  from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
@@ -16,23 +16,14 @@ from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFra

  class FileParser(object):
      def __init__(self, path: str, format_hint: Optional[str] = None):
-         last = len(path)
-         has_glob = False
          self._orig_format_hint = format_hint
-         for i in range(len(path)):
-             if path[i] in ["/", "\\"]:
-                 last = i
-             if path[i] in ["*", "?"]:
-                 has_glob = True
-                 break
-         if not has_glob:
-             self._uri = urlparse(path)
-             self._glob_pattern = ""
-             self._path = self._uri.path
+         self._has_glob = "*" in path or "?" in path
+         self._raw_path = path
+         self._fs, self._fs_path = url_to_fs(path)
+         if not self.is_local:
+             self._path = self._fs.unstrip_protocol(self._fs_path)
          else:
-             self._uri = urlparse(path[:last])
-             self._glob_pattern = path[last + 1 :]
-             self._path = pfs.path.combine(self._uri.path, self._glob_pattern)
+             self._path = os.path.abspath(self._fs._strip_protocol(path))

          if format_hint is None or format_hint == "":
              for k, v in _FORMAT_MAP.items():
@@ -48,56 +39,64 @@ class FileParser(object):
          self._format = format_hint

      def assert_no_glob(self) -> "FileParser":
-         assert_or_throw(self.glob_pattern == "", f"{self.path} has glob pattern")
+         assert_or_throw(not self.has_glob, f"{self.raw_path} has glob pattern")
          return self

-     def with_glob(self, glob: str, format_hint: Optional[str] = None) -> "FileParser":
-         uri = self.uri
-         if glob != "":
-             uri = pfs.path.combine(uri, glob)
-         return FileParser(uri, format_hint or self._orig_format_hint)
-
      @property
-     def glob_pattern(self) -> str:
-         return self._glob_pattern
+     def has_glob(self):
+         return self._has_glob

      @property
-     def uri(self) -> str:
-         return self._uri.geturl()
+     def is_local(self):
+         return isinstance(self._fs, LocalFileSystem)

-     @property
-     def uri_with_glob(self) -> str:
-         if self.glob_pattern == "":
-             return self.uri
-         return pfs.path.combine(self.uri, self.glob_pattern)
+     def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":
+         if not self.has_glob:
+             _path = join(self.path, path)
+         else:
+             _path = join(self.parent, path)
+         return FileParser(_path, format_hint or self._orig_format_hint)

      @property
      def parent(self) -> str:
-         dn = os.path.dirname(self.uri)
-         return dn if dn != "" else "."
-
-     @property
-     def scheme(self) -> str:
-         return self._uri.scheme
+         return self._fs.unstrip_protocol(self._fs._parent(self._fs_path))

      @property
      def path(self) -> str:
          return self._path

+     @property
+     def raw_path(self) -> str:
+         return self._raw_path
+
      @property
      def suffix(self) -> str:
-         return "".join(pathlib.Path(self.path.lower()).suffixes)
+         return "".join(pathlib.Path(self.raw_path.lower()).suffixes)

      @property
      def file_format(self) -> str:
          return self._format

+     def make_parent_dirs(self) -> None:
+         self._fs.makedirs(self._fs._parent(self._fs_path), exist_ok=True)
+
+     def find_all(self) -> Iterable["FileParser"]:
+         if self.has_glob:
+             for x in self._fs.glob(self._fs_path):
+                 yield FileParser(self._fs.unstrip_protocol(x))
+         else:
+             yield self
+
+     def open(self, *args: Any, **kwargs: Any) -> Any:
+         self.assert_no_glob()
+         return self._fs.open(self._fs_path, *args, **kwargs)
+

  def load_df(
      uri: Union[str, List[str]],
      format_hint: Optional[str] = None,
      columns: Any = None,
-     fs: Optional[FileSystem] = None,
+     fs: Optional[AbstractFileSystem] = None,
      **kwargs: Any,
  ) -> LocalBoundedDataFrame:
      if isinstance(uri, str):
@@ -117,7 +116,7 @@ def save_df(
      uri: str,
      format_hint: Optional[str] = None,
      mode: str = "overwrite",
-     fs: Optional[FileSystem] = None,
+     fs: Optional[AbstractFileSystem] = None,
      **kwargs: Any,
  ) -> None:
      assert_or_throw(
@@ -125,40 +124,28 @@ def save_df(
      )
      p = FileParser(uri, format_hint).assert_no_glob()
      if fs is None:
-         fs = FileSystem()
+         fs, _ = url_to_fs(uri)
      if fs.exists(uri):
          assert_or_throw(mode == "overwrite", FileExistsError(uri))
          try:
-             fs.remove(uri)
-         except Exception:
-             try:
-                 fs.removetree(uri)
-             except Exception:  # pragma: no cover
-                 pass
+             fs.rm(uri, recursive=True)
+         except Exception:  # pragma: no cover
+             pass
      _FORMAT_SAVE[p.file_format](df, p, **kwargs)


  def _get_single_files(
-     fp: Iterable[FileParser], fs: Optional[FileSystem]
+     fp: Iterable[FileParser], fs: Optional[AbstractFileSystem]
  ) -> Iterable[FileParser]:
-     if fs is None:
-         fs = FileSystem()
      for f in fp:
-         if f.glob_pattern != "":
-             files = [
-                 FileParser(pfs.path.combine(f.uri, pfs.path.basename(x.path)))
-                 for x in fs.opendir(f.uri).glob(f.glob_pattern)
-             ]
-             yield from _get_single_files(files, fs)
-         else:
-             yield f
+         yield from f.find_all()


  def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
      PD_UTILS.to_parquet_friendly(
          df.as_pandas(), partition_cols=kwargs.get("partition_cols", [])
      ).to_parquet(
-         p.uri,
+         p.assert_no_glob().path,
          **{
              "engine": "pyarrow",
              "schema": df.schema.pa_schema,
@@ -171,34 +158,36 @@ def _load_parquet(
      p: FileParser, columns: Any = None, **kwargs: Any
  ) -> Tuple[pd.DataFrame, Any]:
      if columns is None:
-         pdf = pd.read_parquet(p.uri, **{"engine": "pyarrow", **kwargs})
+         pdf = pd.read_parquet(p.path, **{"engine": "pyarrow", **kwargs})
          return pdf, None
      if isinstance(columns, list):  # column names
-         pdf = pd.read_parquet(p.uri, columns=columns, **{"engine": "pyarrow", **kwargs})
+         pdf = pd.read_parquet(
+             p.path, columns=columns, **{"engine": "pyarrow", **kwargs}
+         )
          return pdf, None
      schema = Schema(columns)
      pdf = pd.read_parquet(
-         p.uri, columns=schema.names, **{"engine": "pyarrow", **kwargs}
+         p.path, columns=schema.names, **{"engine": "pyarrow", **kwargs}
      )
      return pdf, schema


  def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-     df.as_pandas().to_csv(p.uri, **{"index": False, "header": False, **kwargs})
+     with p.open("w") as f:
+         df.as_pandas().to_csv(f, **{"index": False, "header": False, **kwargs})


- def _safe_load_csv(path: str, **kwargs: Any) -> pd.DataFrame:
+ def _safe_load_csv(p: FileParser, **kwargs: Any) -> pd.DataFrame:
      def load_dir() -> pd.DataFrame:
-         fs = FileSystem()
-         return pd.concat(
-             [
-                 pd.read_csv(pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs)
-                 for x in fs.opendir(path).glob("*.csv")
-             ]
-         )
+         dfs: List[pd.DataFrame] = []
+         for _p in p.join("*.csv").find_all():  # type: ignore
+             with _p.open("r") as f:
+                 dfs.append(pd.read_csv(f, **kwargs))
+         return pd.concat(dfs)

      try:
-         return pd.read_csv(path, **kwargs)
+         with p.open("r") as f:
+             return pd.read_csv(f, **kwargs)
      except IsADirectoryError:
          return load_dir()
      except pd.errors.ParserError:  # pragma: no cover
@@ -224,7 +213,7 @@ def _load_csv(  # noqa: C901
      header = kw["header"]
      del kw["header"]
      if str(header) in ["True", "0"]:
-         pdf = _safe_load_csv(p.uri, **{"index_col": False, "header": 0, **kw})
+         pdf = _safe_load_csv(p, **{"index_col": False, "header": 0, **kw})
          if columns is None:
              return pdf, None
          if isinstance(columns, list):  # column names
@@ -236,12 +225,14 @@ def _load_csv(  # noqa: C901
              raise ValueError("columns must be set if without header")
          if isinstance(columns, list):  # column names
              pdf = _safe_load_csv(
-                 p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
+                 p,
+                 **{"index_col": False, "header": None, "names": columns, **kw},
              )
              return pdf, None
          schema = Schema(columns)
          pdf = _safe_load_csv(
-             p.uri, **{"index_col": False, "header": None, "names": schema.names, **kw}
+             p,
+             **{"index_col": False, "header": None, "names": schema.names, **kw},
          )
          return pdf, schema
      else:
@@ -249,27 +240,31 @@ def _load_csv(  # noqa: C901


  def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-     df.as_pandas().to_json(p.uri, **{"orient": "records", "lines": True, **kwargs})
+     with p.open("w") as f:
+         df.as_pandas().to_json(f, **{"orient": "records", "lines": True, **kwargs})


- def _safe_load_json(path: str, **kwargs: Any) -> pd.DataFrame:
+ def _safe_load_json(p: FileParser, **kwargs: Any) -> pd.DataFrame:
      kw = {"orient": "records", "lines": True, **kwargs}
+
+     def load_dir() -> pd.DataFrame:
+         dfs: List[pd.DataFrame] = []
+         for _p in p.join("*.json").find_all():  # type: ignore
+             with _p.open("r") as f:
+                 dfs.append(pd.read_json(f, **kw))
+         return pd.concat(dfs)
+
      try:
-         return pd.read_json(path, **kw)
+         with p.open("r") as f:
+             return pd.read_json(f, **kw)
      except (IsADirectoryError, PermissionError):
-         fs = FileSystem()
-         return pd.concat(
-             [
-                 pd.read_json(pfs.path.combine(path, pfs.path.basename(x.path)), **kw)
-                 for x in fs.opendir(path).glob("*.json")
-             ]
-         )
+         return load_dir()


  def _load_json(
      p: FileParser, columns: Any = None, **kwargs: Any
  ) -> Tuple[pd.DataFrame, Any]:
-     pdf = _safe_load_json(p.uri, **kwargs).reset_index(drop=True)
+     pdf = _safe_load_json(p, **kwargs).reset_index(drop=True)
      if columns is None:
          return pdf, None
      if isinstance(columns, list):  # column names
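
The refactor above replaces pyfilesystem2 (`import fs as pfs`, `triad.collections.fs.FileSystem`) with fsspec. A minimal sketch of the fsspec behavior the new FileParser relies on; this uses fsspec's public fsspec.core.url_to_fs, which mirrors the triad.utils.io.url_to_fs imported in the diff, and the S3 line assumes s3fs is installed:

import fsspec
from fsspec.core import url_to_fs
from fsspec.implementations.local import LocalFileSystem

# a URL resolves to a filesystem object plus a protocol-stripped path
fs, fs_path = url_to_fs("s3://bucket/data/*.parquet")
print(fs_path)  # "bucket/data/*.parquet"
print(fs.unstrip_protocol("bucket/data/x.parquet"))  # "s3://bucket/data/x.parquet"

# a plain path resolves to the local filesystem, which is what
# FileParser.is_local checks for
fs2, p2 = url_to_fs("/tmp/data.csv")
print(isinstance(fs2, LocalFileSystem))  # True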
fugue/api.py CHANGED
@@ -6,6 +6,7 @@ from .dataframe.api import (
      as_array_iterable,
      as_arrow,
      as_dict_iterable,
+     as_dicts,
      as_fugue_df,
      as_pandas,
      drop_columns,
fugue/dataframe/api.py CHANGED
@@ -116,15 +116,32 @@ def as_array_iterable(
      return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe)


+ @fugue_plugin
+ def as_dicts(
+     df: AnyDataFrame, columns: Optional[List[str]] = None
+ ) -> List[Dict[str, Any]]:
+     """Convert any dataframe to a list of python dicts
+
+     :param df: the object that can be recognized as a dataframe by Fugue
+     :param columns: columns to extract, defaults to None
+     :return: a list of python dicts
+
+     .. note::
+
+         The default implementation enforces ``type_safe`` True
+     """
+     return as_fugue_df(df).as_dicts(columns=columns)
+
+
  @fugue_plugin
  def as_dict_iterable(
      df: AnyDataFrame, columns: Optional[List[str]] = None
  ) -> Iterable[Dict[str, Any]]:
-     """Convert any dataframe to iterable of native python dicts
+     """Convert any dataframe to iterable of python dicts

      :param df: the object that can be recognized as a dataframe by Fugue
      :param columns: columns to extract, defaults to None
-     :return: iterable of native python dicts
+     :return: iterable of python dicts

      .. note::

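A hedged usage sketch of the new as_dicts entry point added above (the function is re-exported by fugue.api per the fugue/api.py change; the sample data is made up):

import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
print(fa.as_dicts(df))                 # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
print(fa.as_dicts(df, columns=["b"]))  # [{'b': 'x'}, {'b': 'y'}]
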
fugue/dataframe/arrow_dataframe.py CHANGED
@@ -21,6 +21,10 @@ from fugue.exceptions import FugueDataFrameOperationError

  from .api import (
      alter_columns,
+     as_array,
+     as_array_iterable,
+     as_dict_iterable,
+     as_dicts,
      as_pandas,
      drop_columns,
      get_column_names,
@@ -30,6 +34,12 @@ from .api import (
      select_columns,
  )
  from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+ from .utils import (
+     pa_table_as_array,
+     pa_table_as_array_iterable,
+     pa_table_as_dict_iterable,
+     pa_table_as_dicts,
+ )


  class ArrowDataFrame(LocalBoundedDataFrame):
@@ -174,21 +184,20 @@ class ArrowDataFrame(LocalBoundedDataFrame):
      def as_array(
          self, columns: Optional[List[str]] = None, type_safe: bool = False
      ) -> List[Any]:
-         return list(self.as_array_iterable(columns, type_safe=type_safe))
+         return pa_table_as_array(self.native, columns=columns)
+
+     def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         return pa_table_as_dicts(self.native, columns=columns)

      def as_array_iterable(
          self, columns: Optional[List[str]] = None, type_safe: bool = False
      ) -> Iterable[Any]:
-         if self.empty:
-             return
-         if columns is not None:
-             for x in self[columns].as_array_iterable(type_safe=type_safe):
-                 yield x
-         else:
-             d = self.native.to_pydict()
-             cols = [d[n] for n in self.columns]
-             for arr in zip(*cols):
-                 yield list(arr)
+         yield from pa_table_as_array_iterable(self.native, columns=columns)
+
+     def as_dict_iterable(
+         self, columns: Optional[List[str]] = None
+     ) -> Iterable[Dict[str, Any]]:
+         yield from pa_table_as_dict_iterable(self.native, columns=columns)


  @as_local.candidate(lambda df: isinstance(df, pa.Table))
@@ -212,6 +221,34 @@ def _pa_table_as_pandas(df: pa.Table) -> pd.DataFrame:
      )


+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+ def _pa_table_as_array(
+     df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> List[Any]:
+     return pa_table_as_array(df, columns=columns)
+
+
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+ def _pa_table_as_array_iterable(
+     df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> Iterable[Any]:
+     yield from pa_table_as_array_iterable(df, columns=columns)
+
+
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+ def _pa_table_as_dicts(
+     df: pa.Table, columns: Optional[List[str]] = None
+ ) -> List[Dict[str, Any]]:
+     return pa_table_as_dicts(df, columns=columns)
+
+
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+ def _pa_table_as_dict_iterable(
+     df: pa.Table, columns: Optional[List[str]] = None
+ ) -> Iterable[Dict[str, Any]]:
+     yield from pa_table_as_dict_iterable(df, columns=columns)
+
+
  @alter_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
  def _pa_table_alter_columns(
      df: pa.Table, columns: Any, as_fugue: bool = False
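
With the .candidate registrations above, the same free functions now dispatch directly on pyarrow Tables. A small sketch (sample data made up):

import pyarrow as pa
import fugue.api as fa

tbl = pa.table({"a": [1, 2], "b": ["x", "y"]})
print(fa.as_array(tbl))                 # [[1, 'x'], [2, 'y']]
print(fa.as_dicts(tbl, columns=["a"]))  # [{'a': 1}, {'a': 2}]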
fugue/dataframe/dataframe.py CHANGED
@@ -237,13 +237,31 @@ class DataFrame(Dataset):
          """
          raise NotImplementedError

+     def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """Convert to a list of python dicts
+
+         :param columns: columns to extract, defaults to None
+         :return: a list of python dicts
+
+         .. note::
+
+             The default implementation enforces ``type_safe`` True
+         """
+         if columns is None:
+             columns = self.columns
+         idx = range(len(columns))
+         return [
+             {columns[i]: x[i] for i in idx}
+             for x in self.as_array(columns, type_safe=True)
+         ]
+
      def as_dict_iterable(
          self, columns: Optional[List[str]] = None
      ) -> Iterable[Dict[str, Any]]:
-         """Convert to iterable of native python dicts
+         """Convert to iterable of python dicts

          :param columns: columns to extract, defaults to None
-         :return: iterable of native python dicts
+         :return: iterable of python dicts

          .. note::

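The default as_dicts above zips column names with the rows returned by as_array. An equivalent standalone illustration of that comprehension (sample data made up):

columns = ["a", "b"]
rows = [[1, "x"], [2, "y"]]  # what as_array(columns, type_safe=True) would return
dicts = [{columns[i]: row[i] for i in range(len(columns))} for row in rows]
assert dicts == [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
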
fugue/dataframe/function_wrapper.py CHANGED
@@ -269,7 +269,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
  class _ListDictParam(_LocalNoSchemaDataFrameParam):
      @no_type_check
      def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
-         return list(df.as_local().as_dict_iterable())
+         return df.as_local().as_dicts()

      @no_type_check
      def to_output_df(
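
The practical effect of this one-line change: transformers that declare a List[Dict[str, Any]] input now receive it through the new bulk as_dicts call instead of materializing as_dict_iterable. A hedged usage sketch of that input style (sample data and function are made up):

from typing import Any, Dict, List

import pandas as pd
import fugue.api as fa

def add_bang(df: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # fugue passes each partition in as a list of dicts
    return [{"a": r["a"], "b": r["b"] + "!"} for r in df]

res = fa.transform(
    pd.DataFrame({"a": [1, 2], "b": ["x", "y"]}),
    add_bang,
    schema="a:long,b:str",
)
print(res)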
fugue/dataframe/iterable_dataframe.py CHANGED
@@ -105,6 +105,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
      ) -> List[Any]:
          return list(self.as_array_iterable(columns, type_safe=type_safe))

+     def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         return list(self.as_dict_iterable(columns))
+
      def as_array_iterable(
          self, columns: Optional[List[str]] = None, type_safe: bool = False
      ) -> Iterable[Any]:
fugue/dataframe/pandas_dataframe.py CHANGED
@@ -1,8 +1,11 @@
  from typing import Any, Dict, Iterable, List, Optional, Tuple

  import pandas as pd
+ import pyarrow as pa
+ from triad import assert_or_throw
  from triad.collections.schema import Schema
  from triad.utils.pandas_like import PD_UTILS
+ from triad.utils.pyarrow import pa_batch_to_dicts

  from fugue.dataset.api import (
      as_fugue_dataset,
@@ -17,6 +20,10 @@ from fugue.dataset.api import (
  from fugue.exceptions import FugueDataFrameOperationError

  from .api import (
+     as_array,
+     as_array_iterable,
+     as_dict_iterable,
+     as_dicts,
      drop_columns,
      get_column_names,
      get_schema,
@@ -134,6 +141,9 @@ class PandasDataFrame(LocalBoundedDataFrame):
              return self
          return PandasDataFrame(self.native, new_schema)

+     def as_arrow(self, type_safe: bool = False) -> pa.Table:
+         return PD_UTILS.as_arrow(self.native, schema=self.schema.pa_schema)
+
      def as_array(
          self, columns: Optional[List[str]] = None, type_safe: bool = False
      ) -> List[Any]:
@@ -150,6 +160,18 @@ class PandasDataFrame(LocalBoundedDataFrame):
          ):
              yield row

+     def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         res: List[Dict[str, Any]] = []
+         for block in _to_dicts(self.native, columns, self.schema):
+             res += block
+         return res
+
+     def as_dict_iterable(
+         self, columns: Optional[List[str]] = None
+     ) -> Iterable[Dict[str, Any]]:
+         for block in _to_dicts(self.native, columns, self.schema):
+             yield from block
+
      def head(
          self, n: int, columns: Optional[List[str]] = None
      ) -> LocalBoundedDataFrame:
@@ -272,6 +294,43 @@ def _pd_head(
      return _adjust_df(df.head(n), as_fugue=as_fugue)


+ @as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+ def _pd_as_array(
+     df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> List[Any]:
+     return list(_pd_as_array_iterable(df, columns, type_safe=type_safe))
+
+
+ @as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+ def _pd_as_array_iterable(
+     df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+ ) -> Iterable[Any]:
+     for row in PD_UTILS.as_array_iterable(
+         df,
+         columns=columns,
+         type_safe=type_safe,
+     ):
+         yield row
+
+
+ @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+ def _pd_as_dicts(
+     df: pd.DataFrame, columns: Optional[List[str]] = None
+ ) -> List[Dict[str, Any]]:
+     res: List[Dict[str, Any]] = []
+     for block in _to_dicts(df, columns):
+         res += block
+     return res
+
+
+ @as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+ def _pd_as_dict_iterable(
+     df: pa.Table, columns: Optional[List[str]] = None
+ ) -> Iterable[Dict[str, Any]]:
+     for block in _to_dicts(df, columns):
+         yield from block
+
+
  def _adjust_df(res: pd.DataFrame, as_fugue: bool):
      return res if not as_fugue else PandasDataFrame(res)

@@ -280,3 +339,17 @@ def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
      missing = [x for x in columns if x not in df.columns]
      if len(missing) > 0:
          raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
+
+
+ def _to_dicts(
+     df: pd.DataFrame,
+     columns: Optional[List[str]] = None,
+     schema: Optional[Schema] = None,
+ ) -> Iterable[List[Dict[str, Any]]]:
+     cols = list(df.columns) if columns is None else columns
+     assert_or_throw(len(cols) > 0, ValueError("columns cannot be empty"))
+     pa_schema = schema.extract(cols).pa_schema if schema is not None else None
+     adf = PD_UTILS.as_arrow(df[cols], schema=pa_schema)
+     for batch in adf.to_batches():
+         if batch.num_rows > 0:
+             yield pa_batch_to_dicts(batch)
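
A sketch of the batching idea behind _to_dicts: convert the pandas frame to Arrow once, then stream record batches into dicts. pa_batch_to_dicts is a triad helper not shown in this diff; plain pyarrow's RecordBatch.to_pylist (pyarrow >= 7) does the equivalent job in this illustration, and the sample data is made up:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({"a": [1, 2, 3], "b": list("xyz")})
adf = pa.Table.from_pandas(df, preserve_index=False)
dicts = []
for batch in adf.to_batches():
    if batch.num_rows > 0:  # skip empty batches, as _to_dicts does
        dicts.extend(batch.to_pylist())
print(dicts)  # [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}, {'a': 3, 'b': 'z'}]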