fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +84 -89
- fugue/api.py +1 -0
- fugue/dataframe/api.py +19 -2
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +78 -25
- fugue/execution/execution_engine.py +1 -8
- fugue/execution/native_execution_engine.py +5 -11
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +9 -9
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
- fugue_dask/_io.py +22 -29
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_dask/execution_engine.py +5 -12
- fugue_duckdb/_io.py +21 -37
- fugue_duckdb/dataframe.py +87 -29
- fugue_duckdb/execution_engine.py +2 -7
- fugue_ibis/dataframe.py +13 -0
- fugue_ibis/execution_engine.py +1 -5
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/_utils/io.py +15 -17
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/io.py +3 -5
- fugue_spark/dataframe.py +69 -13
- fugue_spark/execution_engine.py +2 -7
- fugue_test/builtin_suite.py +12 -12
- fugue_test/dataframe_suite.py +14 -0
- fugue_test/execution_suite.py +13 -18
- fugue_test/plugins/misc/__init__.py +2 -0
- fugue_test/plugins/misc/fixtures.py +18 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue/__init__.py
CHANGED
fugue/_utils/io.py
CHANGED
@@ -1,14 +1,14 @@
 import os
 import pathlib
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-from urllib.parse import urlparse

-import fs as pfs
 import pandas as pd
+from fsspec import AbstractFileSystem
+from fsspec.implementations.local import LocalFileSystem
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import join, url_to_fs
 from triad.utils.pandas_like import PD_UTILS

 from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
@@ -16,23 +16,14 @@ from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFra

 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
-        last = len(path)
-        has_glob = False
         self._orig_format_hint = format_hint
-        for i in range(len(path)):
-            if path[i] in ["/", "\\"]:
-                last = i
-            if path[i] in ["*", "?"]:
-                has_glob = True
-                break
-        if not has_glob:
-            self._uri = urlparse(path)
-            self._glob_pattern = ""
-            self._path = self._uri.path
+        self._has_glob = "*" in path or "?" in path
+        self._raw_path = path
+        self._fs, self._fs_path = url_to_fs(path)
+        if not self.is_local:
+            self._path = self._fs.unstrip_protocol(self._fs_path)
         else:
-            self._uri = urlparse(path[:last])
-            self._glob_pattern = path[last + 1 :]
-            self._path = pfs.path.combine(self._uri.path, self._glob_pattern)
+            self._path = os.path.abspath(self._fs._strip_protocol(path))

         if format_hint is None or format_hint == "":
             for k, v in _FORMAT_MAP.items():
@@ -48,56 +39,64 @@ class FileParser(object):
             self._format = format_hint

     def assert_no_glob(self) -> "FileParser":
-        assert_or_throw(self.glob_pattern == "", f"{self.uri} has glob pattern")
+        assert_or_throw(not self.has_glob, f"{self.raw_path} has glob pattern")
         return self

-    def with_glob(self, glob: str, format_hint: Optional[str] = None) -> "FileParser":
-        uri = self.uri
-        if glob != "":
-            uri = pfs.path.combine(uri, glob)
-        return FileParser(uri, format_hint or self._orig_format_hint)
-
     @property
-    def glob_pattern(self) -> str:
-        return self._glob_pattern
+    def has_glob(self):
+        return self._has_glob

     @property
-    def uri(self) -> str:
-        return self._uri.geturl()
+    def is_local(self):
+        return isinstance(self._fs, LocalFileSystem)

-    @property
-    def uri_with_glob(self) -> str:
-        if self.glob_pattern == "":
-            return self.uri
-        return pfs.path.combine(self.uri, self.glob_pattern)
+    def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":
+        if not self.has_glob:
+            _path = join(self.path, path)
+        else:
+            _path = join(self.parent, path)
+        return FileParser(_path, format_hint or self._orig_format_hint)

     @property
     def parent(self) -> str:
-        dn = pfs.path.dirname(self.uri)
-        return dn if dn != "" else "."
-
-    @property
-    def scheme(self) -> str:
-        return self._uri.scheme
+        return self._fs.unstrip_protocol(self._fs._parent(self._fs_path))

     @property
     def path(self) -> str:
         return self._path

+    @property
+    def raw_path(self) -> str:
+        return self._raw_path
+
     @property
     def suffix(self) -> str:
-        return "".join(pathlib.Path(self.path.lower()).suffixes)
+        return "".join(pathlib.Path(self.raw_path.lower()).suffixes)

     @property
     def file_format(self) -> str:
         return self._format

+    def make_parent_dirs(self) -> None:
+        self._fs.makedirs(self._fs._parent(self._fs_path), exist_ok=True)
+
+    def find_all(self) -> Iterable["FileParser"]:
+        if self.has_glob:
+            for x in self._fs.glob(self._fs_path):
+                yield FileParser(self._fs.unstrip_protocol(x))
+        else:
+            yield self
+
+    def open(self, *args: Any, **kwargs: Any) -> Any:
+        self.assert_no_glob()
+        return self._fs.open(self._fs_path, *args, **kwargs)
+

 def load_df(
     uri: Union[str, List[str]],
     format_hint: Optional[str] = None,
     columns: Any = None,
-    fs: Optional[FileSystem] = None,
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> LocalBoundedDataFrame:
     if isinstance(uri, str):
@@ -117,7 +116,7 @@ def save_df(
     uri: str,
     format_hint: Optional[str] = None,
     mode: str = "overwrite",
-    fs: Optional[FileSystem] = None,
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> None:
     assert_or_throw(
@@ -125,40 +124,28 @@ def save_df(
     )
     p = FileParser(uri, format_hint).assert_no_glob()
     if fs is None:
-        fs = FileSystem()
+        fs, _ = url_to_fs(uri)
     if fs.exists(uri):
         assert_or_throw(mode == "overwrite", FileExistsError(uri))
         try:
-            fs.remove(uri)
-        except Exception:
-            try:
-                fs.removetree(uri)
-            except Exception:  # pragma: no cover
-                pass
+            fs.rm(uri, recursive=True)
+        except Exception:  # pragma: no cover
+            pass
     _FORMAT_SAVE[p.file_format](df, p, **kwargs)


 def _get_single_files(
-    fp: Iterable[FileParser], fs: Optional[FileSystem]
+    fp: Iterable[FileParser], fs: Optional[AbstractFileSystem]
 ) -> Iterable[FileParser]:
-    if fs is None:
-        fs = FileSystem()
     for f in fp:
-        if f.glob_pattern != "":
-            files = [
-                FileParser(pfs.path.combine(f.uri, pfs.path.basename(x.path)))
-                for x in fs.opendir(f.uri).glob(f.glob_pattern)
-            ]
-            yield from _get_single_files(files, fs)
-        else:
-            yield f
+        yield from f.find_all()


 def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
     PD_UTILS.to_parquet_friendly(
         df.as_pandas(), partition_cols=kwargs.get("partition_cols", [])
     ).to_parquet(
-        p.uri,
+        p.assert_no_glob().path,
         **{
             "engine": "pyarrow",
             "schema": df.schema.pa_schema,
@@ -171,34 +158,36 @@ def _load_parquet(
     p: FileParser, columns: Any = None, **kwargs: Any
 ) -> Tuple[pd.DataFrame, Any]:
     if columns is None:
-        pdf = pd.read_parquet(p.uri, **{"engine": "pyarrow", **kwargs})
+        pdf = pd.read_parquet(p.path, **{"engine": "pyarrow", **kwargs})
         return pdf, None
     if isinstance(columns, list):  # column names
-        pdf = pd.read_parquet(p.uri, columns=columns, **{"engine": "pyarrow", **kwargs})
+        pdf = pd.read_parquet(
+            p.path, columns=columns, **{"engine": "pyarrow", **kwargs}
+        )
         return pdf, None
     schema = Schema(columns)
     pdf = pd.read_parquet(
-        p.uri, columns=schema.names, **{"engine": "pyarrow", **kwargs}
+        p.path, columns=schema.names, **{"engine": "pyarrow", **kwargs}
     )
     return pdf, schema


 def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-    df.as_pandas().to_csv(p.uri, **{"index": False, "header": False, **kwargs})
+    with p.open("w") as f:
+        df.as_pandas().to_csv(f, **{"index": False, "header": False, **kwargs})


-def _safe_load_csv(path: str, **kwargs: Any) -> pd.DataFrame:
+def _safe_load_csv(p: FileParser, **kwargs: Any) -> pd.DataFrame:
     def load_dir() -> pd.DataFrame:
-        fs = FileSystem()
-        return pd.concat(
-            [
-                pd.read_csv(pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs)
-                for x in fs.opendir(path).glob("*.csv")
-            ]
-        )
+        dfs: List[pd.DataFrame] = []
+        for _p in p.join("*.csv").find_all():  # type: ignore
+            with _p.open("r") as f:
+                dfs.append(pd.read_csv(f, **kwargs))
+        return pd.concat(dfs)

     try:
-        return pd.read_csv(path, **kwargs)
+        with p.open("r") as f:
+            return pd.read_csv(f, **kwargs)
     except IsADirectoryError:
         return load_dir()
     except pd.errors.ParserError:  # pragma: no cover
@@ -224,7 +213,7 @@ def _load_csv(  # noqa: C901
         header = kw["header"]
         del kw["header"]
     if str(header) in ["True", "0"]:
-        pdf = _safe_load_csv(p.uri, **{"index_col": False, "header": 0, **kw})
+        pdf = _safe_load_csv(p, **{"index_col": False, "header": 0, **kw})
         if columns is None:
             return pdf, None
         if isinstance(columns, list):  # column names
@@ -236,12 +225,14 @@ def _load_csv(  # noqa: C901
             raise ValueError("columns must be set if without header")
         if isinstance(columns, list):  # column names
             pdf = _safe_load_csv(
-                p.uri, **{"index_col": False, "header": None, "names": columns, **kw}
+                p,
+                **{"index_col": False, "header": None, "names": columns, **kw},
             )
             return pdf, None
         schema = Schema(columns)
         pdf = _safe_load_csv(
-            p.uri, **{"index_col": False, "header": None, "names": schema.names, **kw}
+            p,
+            **{"index_col": False, "header": None, "names": schema.names, **kw},
         )
         return pdf, schema
     else:
@@ -249,27 +240,31 @@ def _load_csv(  # noqa: C901


 def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-    df.as_pandas().to_json(p.uri, **{"orient": "records", "lines": True, **kwargs})
+    with p.open("w") as f:
+        df.as_pandas().to_json(f, **{"orient": "records", "lines": True, **kwargs})


-def _safe_load_json(path: str, **kwargs: Any) -> pd.DataFrame:
+def _safe_load_json(p: FileParser, **kwargs: Any) -> pd.DataFrame:
     kw = {"orient": "records", "lines": True, **kwargs}
+
+    def load_dir() -> pd.DataFrame:
+        dfs: List[pd.DataFrame] = []
+        for _p in p.join("*.json").find_all():  # type: ignore
+            with _p.open("r") as f:
+                dfs.append(pd.read_json(f, **kw))
+        return pd.concat(dfs)
+
     try:
-        return pd.read_json(path, **kw)
+        with p.open("r") as f:
+            return pd.read_json(f, **kw)
     except (IsADirectoryError, PermissionError):
-        fs = FileSystem()
-        return pd.concat(
-            [
-                pd.read_json(pfs.path.combine(path, pfs.path.basename(x.path)), **kw)
-                for x in fs.opendir(path).glob("*.json")
-            ]
-        )
+        return load_dir()


 def _load_json(
     p: FileParser, columns: Any = None, **kwargs: Any
 ) -> Tuple[pd.DataFrame, Any]:
-    pdf = _safe_load_json(p.uri, **kwargs).reset_index(drop=True)
+    pdf = _safe_load_json(p, **kwargs).reset_index(drop=True)
     if columns is None:
         return pdf, None
     if isinstance(columns, list):  # column names
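Note: the change above migrates fugue/_utils/io.py from pyfilesystem2 (import fs as pfs, triad.collections.fs.FileSystem) to fsspec. Below is a minimal illustrative sketch of the fsspec pattern the new FileParser builds on; it uses fsspec.core.url_to_fs as a stand-in for triad.utils.io.url_to_fs (assumed here to behave the same) and is not code from the package.

from fsspec.core import url_to_fs

def expand(path: str) -> list:
    # Resolve "scheme://..." to a filesystem object plus a bare path,
    # the same way the new FileParser.__init__ does
    fs, fs_path = url_to_fs(path)
    if "*" in path or "?" in path:  # the same glob test as the new FileParser
        # fs.glob returns bare paths; unstrip_protocol restores the scheme,
        # mirroring FileParser.find_all
        return [fs.unstrip_protocol(x) for x in fs.glob(fs_path)]
    return [path]

print(expand("memory://data/*.csv"))  # [] on an empty in-memory filesystem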
fugue/api.py
CHANGED
fugue/dataframe/api.py
CHANGED
@@ -116,15 +116,32 @@ def as_array_iterable(
     return as_fugue_df(df).as_array_iterable(columns=columns, type_safe=type_safe)


+@fugue_plugin
+def as_dicts(
+    df: AnyDataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    """Convert any dataframe to a list of python dicts
+
+    :param df: the object that can be recognized as a dataframe by Fugue
+    :param columns: columns to extract, defaults to None
+    :return: a list of python dicts
+
+    .. note::
+
+        The default implementation enforces ``type_safe`` True
+    """
+    return as_fugue_df(df).as_dicts(columns=columns)
+
+
 @fugue_plugin
 def as_dict_iterable(
     df: AnyDataFrame, columns: Optional[List[str]] = None
 ) -> Iterable[Dict[str, Any]]:
-    """Convert any dataframe to iterable of
+    """Convert any dataframe to iterable of python dicts

     :param df: the object that can be recognized as a dataframe by Fugue
     :param columns: columns to extract, defaults to None
-    :return: iterable of
+    :return: iterable of python dicts

     .. note::

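A short usage sketch for the new as_dicts plugin (illustrative, not from the package; it assumes pandas is installed and that importing fugue.dataframe registers the pd.DataFrame candidates added elsewhere in this release):

import pandas as pd
from fugue.dataframe.api import as_dicts

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
# expected: [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
print(as_dicts(df))
# expected: [{'b': 'x'}, {'b': 'y'}]
print(as_dicts(df, columns=["b"]))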
fugue/dataframe/arrow_dataframe.py
CHANGED
@@ -21,6 +21,10 @@ from fugue.exceptions import FugueDataFrameOperationError

 from .api import (
     alter_columns,
+    as_array,
+    as_array_iterable,
+    as_dict_iterable,
+    as_dicts,
     as_pandas,
     drop_columns,
     get_column_names,
@@ -30,6 +34,12 @@ from .api import (
     select_columns,
 )
 from .dataframe import DataFrame, LocalBoundedDataFrame, _input_schema
+from .utils import (
+    pa_table_as_array,
+    pa_table_as_array_iterable,
+    pa_table_as_dict_iterable,
+    pa_table_as_dicts,
+)


 class ArrowDataFrame(LocalBoundedDataFrame):
@@ -174,21 +184,20 @@ class ArrowDataFrame(LocalBoundedDataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-        return list(self.as_array_iterable(columns, type_safe=type_safe))
+        return pa_table_as_array(self.native, columns=columns)
+
+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return pa_table_as_dicts(self.native, columns=columns)

     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
-
-
-
-
-
-        d = self.native.to_pydict()
-        cols = [d[n] for n in self.columns]
-        for arr in zip(*cols):
-            yield list(arr)
+        yield from pa_table_as_array_iterable(self.native, columns=columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        yield from pa_table_as_dict_iterable(self.native, columns=columns)


 @as_local.candidate(lambda df: isinstance(df, pa.Table))
@@ -212,6 +221,34 @@ def _pa_table_as_pandas(df: pa.Table) -> pd.DataFrame:
     )


+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+def _pa_table_as_array(
+    df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return pa_table_as_array(df, columns=columns)
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+def _pa_table_as_array_iterable(
+    df: pa.Table, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    yield from pa_table_as_array_iterable(df, columns=columns)
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+def _pa_table_as_dicts(
+    df: pa.Table, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    return pa_table_as_dicts(df, columns=columns)
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
+def _pa_table_as_dict_iterable(
+    df: pa.Table, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    yield from pa_table_as_dict_iterable(df, columns=columns)
+
+
 @alter_columns.candidate(lambda df, *args, **kwargs: isinstance(df, pa.Table))
 def _pa_table_alter_columns(
     df: pa.Table, columns: Any, as_fugue: bool = False
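With the candidates above registered, the same conversion functions should accept pa.Table directly; an illustrative sketch (assumes pyarrow is installed and that importing fugue.dataframe registers the candidates):

import pyarrow as pa
from fugue.dataframe.api import as_dicts

tbl = pa.table({"a": [1, 2], "b": ["x", "y"]})
# expected: [{'a': 1}, {'a': 2}]
print(as_dicts(tbl, columns=["a"]))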
fugue/dataframe/dataframe.py
CHANGED
@@ -237,13 +237,31 @@ class DataFrame(Dataset):
         """
         raise NotImplementedError

+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Convert to a list of python dicts
+
+        :param columns: columns to extract, defaults to None
+        :return: a list of python dicts
+
+        .. note::
+
+            The default implementation enforces ``type_safe`` True
+        """
+        if columns is None:
+            columns = self.columns
+        idx = range(len(columns))
+        return [
+            {columns[i]: x[i] for i in idx}
+            for x in self.as_array(columns, type_safe=True)
+        ]
+
     def as_dict_iterable(
         self, columns: Optional[List[str]] = None
     ) -> Iterable[Dict[str, Any]]:
-        """Convert to iterable of
+        """Convert to iterable of python dicts

         :param columns: columns to extract, defaults to None
-        :return: iterable of
+        :return: iterable of python dicts

         .. note::

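The default DataFrame.as_dicts above zips column names with each type-safe row; a standalone restatement of that logic (illustrative only, not part of the package):

from typing import Any, Dict, List

def rows_to_dicts(columns: List[str], rows: List[List[Any]]) -> List[Dict[str, Any]]:
    # same dict-comprehension shape as DataFrame.as_dicts above
    idx = range(len(columns))
    return [{columns[i]: row[i] for i in idx} for row in rows]

assert rows_to_dicts(["a", "b"], [[1, "x"], [2, "y"]]) == [
    {"a": 1, "b": "x"},
    {"a": 2, "b": "y"},
]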
fugue/dataframe/function_wrapper.py
CHANGED
@@ -269,7 +269,7 @@ class _EmptyAwareIterableListParam(_LocalNoSchemaDataFrameParam):
 class _ListDictParam(_LocalNoSchemaDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> List[Dict[str, Any]]:
-        return list(df.as_local().as_dict_iterable())
+        return df.as_local().as_dicts()

     @no_type_check
     def to_output_df(
fugue/dataframe/iterable_dataframe.py
CHANGED
@@ -105,6 +105,9 @@ class IterableDataFrame(LocalUnboundedDataFrame):
     ) -> List[Any]:
         return list(self.as_array_iterable(columns, type_safe=type_safe))

+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        return list(self.as_dict_iterable(columns))
+
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
fugue/dataframe/pandas_dataframe.py
CHANGED
@@ -1,8 +1,11 @@
 from typing import Any, Dict, Iterable, List, Optional, Tuple

 import pandas as pd
+import pyarrow as pa
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.pandas_like import PD_UTILS
+from triad.utils.pyarrow import pa_batch_to_dicts

 from fugue.dataset.api import (
     as_fugue_dataset,
@@ -17,6 +20,10 @@ from fugue.dataset.api import (
 from fugue.exceptions import FugueDataFrameOperationError

 from .api import (
+    as_array,
+    as_array_iterable,
+    as_dict_iterable,
+    as_dicts,
     drop_columns,
     get_column_names,
     get_schema,
@@ -134,6 +141,9 @@ class PandasDataFrame(LocalBoundedDataFrame):
             return self
         return PandasDataFrame(self.native, new_schema)

+    def as_arrow(self, type_safe: bool = False) -> pa.Table:
+        return PD_UTILS.as_arrow(self.native, schema=self.schema.pa_schema)
+
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
@@ -150,6 +160,18 @@ class PandasDataFrame(LocalBoundedDataFrame):
         ):
             yield row

+    def as_dicts(self, columns: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        res: List[Dict[str, Any]] = []
+        for block in _to_dicts(self.native, columns, self.schema):
+            res += block
+        return res
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None
+    ) -> Iterable[Dict[str, Any]]:
+        for block in _to_dicts(self.native, columns, self.schema):
+            yield from block
+
     def head(
         self, n: int, columns: Optional[List[str]] = None
     ) -> LocalBoundedDataFrame:
@@ -272,6 +294,43 @@ def _pd_head(
     return _adjust_df(df.head(n), as_fugue=as_fugue)


+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+def _pd_as_array(
+    df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    return list(_pd_as_array_iterable(df, columns, type_safe=type_safe))
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+def _pd_as_array_iterable(
+    df: pd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    for row in PD_UTILS.as_array_iterable(
+        df,
+        columns=columns,
+        type_safe=type_safe,
+    ):
+        yield row
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+def _pd_as_dicts(
+    df: pd.DataFrame, columns: Optional[List[str]] = None
+) -> List[Dict[str, Any]]:
+    res: List[Dict[str, Any]] = []
+    for block in _to_dicts(df, columns):
+        res += block
+    return res
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, pd.DataFrame))
+def _pd_as_dict_iterable(
+    df: pa.Table, columns: Optional[List[str]] = None
+) -> Iterable[Dict[str, Any]]:
+    for block in _to_dicts(df, columns):
+        yield from block
+
+
 def _adjust_df(res: pd.DataFrame, as_fugue: bool):
     return res if not as_fugue else PandasDataFrame(res)

@@ -280,3 +339,17 @@ def _assert_no_missing(df: pd.DataFrame, columns: Iterable[Any]) -> None:
     missing = [x for x in columns if x not in df.columns]
     if len(missing) > 0:
         raise FugueDataFrameOperationError("found nonexistent columns: {missing}")
+
+
+def _to_dicts(
+    df: pd.DataFrame,
+    columns: Optional[List[str]] = None,
+    schema: Optional[Schema] = None,
+) -> Iterable[List[Dict[str, Any]]]:
+    cols = list(df.columns) if columns is None else columns
+    assert_or_throw(len(cols) > 0, ValueError("columns cannot be empty"))
+    pa_schema = schema.extract(cols).pa_schema if schema is not None else None
+    adf = PD_UTILS.as_arrow(df[cols], schema=pa_schema)
+    for batch in adf.to_batches():
+        if batch.num_rows > 0:
+            yield pa_batch_to_dicts(batch)
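The _to_dicts helper above converts the pandas data to Arrow and emits one block of dicts per record batch; pa_batch_to_dicts comes from triad and is not shown in this diff. A rough equivalent using only pyarrow's public API (RecordBatch.to_pylist is assumed here to match its behavior):

import pandas as pd
import pyarrow as pa

def to_dict_blocks(df: pd.DataFrame):
    # convert via Arrow, then yield one list of {column: value} dicts per batch
    table = pa.Table.from_pandas(df, preserve_index=False)
    for batch in table.to_batches():
        if batch.num_rows > 0:
            yield batch.to_pylist()

assert list(to_dict_blocks(pd.DataFrame({"a": [1, 2]}))) == [[{"a": 1}, {"a": 2}]]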