fugue 0.8.7.dev6__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +84 -89
- fugue/dataframe/utils.py +12 -25
- fugue/execution/execution_engine.py +0 -7
- fugue/execution/native_execution_engine.py +5 -11
- fugue/workflow/_checkpoint.py +9 -9
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +2 -2
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +24 -22
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +2 -1
- fugue_dask/_io.py +22 -29
- fugue_dask/execution_engine.py +5 -12
- fugue_duckdb/_io.py +21 -37
- fugue_duckdb/execution_engine.py +2 -7
- fugue_ibis/execution_engine.py +1 -5
- fugue_ray/_utils/io.py +15 -17
- fugue_spark/_utils/io.py +3 -5
- fugue_spark/execution_engine.py +2 -7
- fugue_test/builtin_suite.py +12 -12
- fugue_test/execution_suite.py +13 -18
- fugue_test/plugins/misc/__init__.py +2 -0
- fugue_test/plugins/misc/fixtures.py +18 -0
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue/__init__.py
CHANGED
fugue/_utils/io.py
CHANGED
@@ -1,14 +1,14 @@
 import os
 import pathlib
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
-from urllib.parse import urlparse
 
-import fs as pfs
 import pandas as pd
+from fsspec import AbstractFileSystem
+from fsspec.implementations.local import LocalFileSystem
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import join, url_to_fs
 from triad.utils.pandas_like import PD_UTILS
 
 from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
@@ -16,23 +16,14 @@ from fugue.dataframe import LocalBoundedDataFrame, LocalDataFrame, PandasDataFrame
 
 class FileParser(object):
     def __init__(self, path: str, format_hint: Optional[str] = None):
-        last = len(path)
-        has_glob = False
         self._orig_format_hint = format_hint
-        for i in range(len(path)):
-            if path[i] in ["/", "\\"]:
-                last = i
-            if path[i] in ["*", "?"]:
-                has_glob = True
-                break
-        if not has_glob:
-            self._uri = urlparse(path)
-            self._glob_pattern = ""
-            self._path = self._uri.path
+        self._has_glob = "*" in path or "?" in path
+        self._raw_path = path
+        self._fs, self._fs_path = url_to_fs(path)
+        if not self.is_local:
+            self._path = self._fs.unstrip_protocol(self._fs_path)
         else:
-            self._uri = urlparse(path[:last])
-            self._glob_pattern = path[last + 1 :]
-            self._path = pfs.path.combine(self._uri.path, self._glob_pattern)
+            self._path = os.path.abspath(self._fs._strip_protocol(path))
 
         if format_hint is None or format_hint == "":
             for k, v in _FORMAT_MAP.items():
@@ -48,56 +39,64 @@ class FileParser(object):
             self._format = format_hint
 
     def assert_no_glob(self) -> "FileParser":
-        assert_or_throw(self.glob_pattern == "", f"{self.uri} has glob pattern")
+        assert_or_throw(not self.has_glob, f"{self.raw_path} has glob pattern")
         return self
 
-    def with_glob(self, glob: str, format_hint: Optional[str] = None) -> "FileParser":
-        uri = self.uri
-        if glob != "":
-            uri = pfs.path.combine(uri, glob)
-        return FileParser(uri, format_hint or self._orig_format_hint)
-
     @property
-    def glob_pattern(self) -> str:
-        return self._glob_pattern
+    def has_glob(self):
+        return self._has_glob
 
     @property
-    def uri(self) -> str:
-        return self._uri.geturl()
+    def is_local(self):
+        return isinstance(self._fs, LocalFileSystem)
 
-
-
-
-
-
+    def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":
+        if not self.has_glob:
+            _path = join(self.path, path)
+        else:
+            _path = join(self.parent, path)
+        return FileParser(_path, format_hint or self._orig_format_hint)
 
     @property
     def parent(self) -> str:
-        dn = os.path.dirname(self.uri)
-        return dn if dn != "" else "."
-
-    @property
-    def scheme(self) -> str:
-        return self._uri.scheme
+        return self._fs.unstrip_protocol(self._fs._parent(self._fs_path))
 
     @property
     def path(self) -> str:
         return self._path
 
+    @property
+    def raw_path(self) -> str:
+        return self._raw_path
+
     @property
     def suffix(self) -> str:
-        return "".join(pathlib.Path(self.path.lower()).suffixes)
+        return "".join(pathlib.Path(self.raw_path.lower()).suffixes)
 
     @property
     def file_format(self) -> str:
         return self._format
 
+    def make_parent_dirs(self) -> None:
+        self._fs.makedirs(self._fs._parent(self._fs_path), exist_ok=True)
+
+    def find_all(self) -> Iterable["FileParser"]:
+        if self.has_glob:
+            for x in self._fs.glob(self._fs_path):
+                yield FileParser(self._fs.unstrip_protocol(x))
+        else:
+            yield self
+
+    def open(self, *args: Any, **kwargs: Any) -> Any:
+        self.assert_no_glob()
+        return self._fs.open(self._fs_path, *args, **kwargs)
+
 
 def load_df(
     uri: Union[str, List[str]],
     format_hint: Optional[str] = None,
     columns: Any = None,
-    fs: Optional[FileSystem] = None,
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> LocalBoundedDataFrame:
     if isinstance(uri, str):
@@ -117,7 +116,7 @@ def save_df(
     uri: str,
     format_hint: Optional[str] = None,
     mode: str = "overwrite",
-    fs: Optional[FileSystem] = None,
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> None:
     assert_or_throw(
@@ -125,40 +124,28 @@ def save_df(
     )
     p = FileParser(uri, format_hint).assert_no_glob()
     if fs is None:
-        fs = FileSystem()
+        fs, _ = url_to_fs(uri)
     if fs.exists(uri):
         assert_or_throw(mode == "overwrite", FileExistsError(uri))
         try:
-            fs.remove(uri)
-        except Exception:
-            try:
-                fs.removetree(uri)
-            except Exception:  # pragma: no cover
-                pass
+            fs.rm(uri, recursive=True)
+        except Exception:  # pragma: no cover
+            pass
     _FORMAT_SAVE[p.file_format](df, p, **kwargs)
 
 
 def _get_single_files(
-    fp: Iterable[FileParser], fs: Optional[FileSystem]
+    fp: Iterable[FileParser], fs: Optional[AbstractFileSystem]
 ) -> Iterable[FileParser]:
-    if fs is None:
-        fs = FileSystem()
     for f in fp:
-        if f.glob_pattern != "":
-            files = [
-                FileParser(pfs.path.combine(f.uri, pfs.path.basename(x.path)))
-                for x in fs.opendir(f.uri).glob(f.glob_pattern)
-            ]
-            yield from _get_single_files(files, fs)
-        else:
-            yield f
+        yield from f.find_all()
 
 
 def _save_parquet(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
     PD_UTILS.to_parquet_friendly(
         df.as_pandas(), partition_cols=kwargs.get("partition_cols", [])
     ).to_parquet(
-        p.uri,
+        p.assert_no_glob().path,
         **{
             "engine": "pyarrow",
             "schema": df.schema.pa_schema,
@@ -171,34 +158,36 @@ def _load_parquet(
     p: FileParser, columns: Any = None, **kwargs: Any
 ) -> Tuple[pd.DataFrame, Any]:
     if columns is None:
-        pdf = pd.read_parquet(p.uri, **{"engine": "pyarrow", **kwargs})
+        pdf = pd.read_parquet(p.path, **{"engine": "pyarrow", **kwargs})
         return pdf, None
     if isinstance(columns, list):  # column names
-        pdf = pd.read_parquet(p.uri, columns=columns, **{"engine": "pyarrow", **kwargs})
+        pdf = pd.read_parquet(
+            p.path, columns=columns, **{"engine": "pyarrow", **kwargs}
+        )
        return pdf, None
     schema = Schema(columns)
     pdf = pd.read_parquet(
-        p.uri, columns=schema.names, **{"engine": "pyarrow", **kwargs}
+        p.path, columns=schema.names, **{"engine": "pyarrow", **kwargs}
     )
     return pdf, schema
 
 
 def _save_csv(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-    df.as_pandas().to_csv(p.uri, **{"index": False, "header": False, **kwargs})
+    with p.open("w") as f:
+        df.as_pandas().to_csv(f, **{"index": False, "header": False, **kwargs})
 
 
-def _safe_load_csv(path: str, **kwargs: Any) -> pd.DataFrame:
+def _safe_load_csv(p: FileParser, **kwargs: Any) -> pd.DataFrame:
     def load_dir() -> pd.DataFrame:
-        fs = FileSystem()
-        return pd.concat(
-            [
-                pd.read_csv(pfs.path.combine(path, pfs.path.basename(x.path)), **kwargs)
-                for x in fs.opendir(path).glob("*.csv")
-            ]
-        )
+        dfs: List[pd.DataFrame] = []
+        for _p in p.join("*.csv").find_all():  # type: ignore
+            with _p.open("r") as f:
+                dfs.append(pd.read_csv(f, **kwargs))
+        return pd.concat(dfs)
 
     try:
-        return pd.read_csv(path, **kwargs)
+        with p.open("r") as f:
+            return pd.read_csv(f, **kwargs)
     except IsADirectoryError:
         return load_dir()
     except pd.errors.ParserError:  # pragma: no cover
@@ -224,7 +213,7 @@ def _load_csv(  # noqa: C901
     header = kw["header"]
     del kw["header"]
     if str(header) in ["True", "0"]:
-        pdf = _safe_load_csv(p
+        pdf = _safe_load_csv(p, **{"index_col": False, "header": 0, **kw})
         if columns is None:
             return pdf, None
         if isinstance(columns, list):  # column names
@@ -236,12 +225,14 @@ def _load_csv(  # noqa: C901
            raise ValueError("columns must be set if without header")
         if isinstance(columns, list):  # column names
             pdf = _safe_load_csv(
-                p
+                p,
+                **{"index_col": False, "header": None, "names": columns, **kw},
             )
             return pdf, None
         schema = Schema(columns)
         pdf = _safe_load_csv(
-            p
+            p,
+            **{"index_col": False, "header": None, "names": schema.names, **kw},
         )
         return pdf, schema
     else:
@@ -249,27 +240,31 @@ def _load_csv(  # noqa: C901
 
 
 def _save_json(df: LocalDataFrame, p: FileParser, **kwargs: Any) -> None:
-    df.as_pandas().to_json(p.uri, **{"orient": "records", "lines": True, **kwargs})
+    with p.open("w") as f:
+        df.as_pandas().to_json(f, **{"orient": "records", "lines": True, **kwargs})
 
 
-def _safe_load_json(path: str, **kwargs: Any) -> pd.DataFrame:
+def _safe_load_json(p: FileParser, **kwargs: Any) -> pd.DataFrame:
     kw = {"orient": "records", "lines": True, **kwargs}
+
+    def load_dir() -> pd.DataFrame:
+        dfs: List[pd.DataFrame] = []
+        for _p in p.join("*.json").find_all():  # type: ignore
+            with _p.open("r") as f:
+                dfs.append(pd.read_json(f, **kw))
+        return pd.concat(dfs)
+
     try:
-        return pd.read_json(path, **kw)
+        with p.open("r") as f:
+            return pd.read_json(f, **kw)
     except (IsADirectoryError, PermissionError):
-        fs = FileSystem()
-        return pd.concat(
-            [
-                pd.read_json(pfs.path.combine(path, pfs.path.basename(x.path)), **kw)
-                for x in fs.opendir(path).glob("*.json")
-            ]
-        )
+        return load_dir()
 
 
 def _load_json(
     p: FileParser, columns: Any = None, **kwargs: Any
 ) -> Tuple[pd.DataFrame, Any]:
-    pdf = _safe_load_json(p
+    pdf = _safe_load_json(p, **kwargs).reset_index(drop=True)
     if columns is None:
         return pdf, None
     if isinstance(columns, list):  # column names
fugue/dataframe/utils.py
CHANGED
@@ -1,15 +1,15 @@
-import os
 import pickle
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import pandas as pd
 import pyarrow as pa
-from fs import open_fs
-from triad import FileSystem, Schema, assert_or_throw
+from fsspec import AbstractFileSystem
+from triad import Schema, assert_or_throw
 from triad.collections.schema import SchemaError
 from triad.exceptions import InvalidOperationError
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.assertion import assert_or_throw as aot
+from triad.utils.io import url_to_fs
 from triad.utils.pyarrow import pa_batch_to_dicts
 
 from .api import as_fugue_df, get_column_names, normalize_column_names, rename
@@ -112,7 +112,6 @@ def serialize_df(
     df: Optional[DataFrame],
     threshold: int = -1,
     file_path: Optional[str] = None,
-    fs: Optional[FileSystem] = None,
 ) -> Optional[bytes]:
     """Serialize input dataframe to base64 string or to file
     if it's larger than threshold
@@ -121,15 +120,8 @@ def serialize_df(
     :param threshold: file byte size threshold, defaults to -1
     :param file_path: file path to store the data (used only if the serialized data
         is larger than ``threshold``), defaults to None
-    :param fs: :class:`~triad:triad.collections.fs.FileSystem`, defaults to None
     :raises InvalidOperationError: if file is large but ``file_path`` is not provided
     :return: a pickled blob either containing the data or the file path
-
-    .. note::
-
-        If fs is not provided but it needs to write to disk, then it will use
-        :meth:`~fs:fs.opener.registry.Registry.open_fs` to try to open the file to
-        write.
     """
@@ -140,24 +132,20 @@ def serialize_df(
     else:
         if file_path is None:
             raise InvalidOperationError("file_path is not provided")
-        if fs is None:
-            with open_fs(
-                os.path.dirname(file_path), writeable=True, create=True
-            ) as _fs:
-                _fs.writebytes(os.path.basename(file_path), data)
-        else:
-            fs.writebytes(file_path, data)
+        fs, path = url_to_fs(file_path)
+        with fs.open(path, "wb") as f:
+            f.write(data)
     return pickle.dumps(file_path)
 
 
 def deserialize_df(
-    data: Optional[bytes], fs: Optional[FileSystem] = None
+    data: Optional[bytes], fs: Optional[AbstractFileSystem] = None
 ) -> Optional[LocalBoundedDataFrame]:
     """Deserialize json string to
         :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame`
 
     :param json_str: json string containing the base64 data or a file path
-    :param fs: :class:`~triad:triad.collections.fs.FileSystem`, defaults to None
+    :param fs: the file system to use, defaults to None
     :raises ValueError: if the json string is invalid, not generated from
         :func:`~.serialize_df`
     :return: :class:`~fugue.dataframe.dataframe.LocalBoundedDataFrame` if ``json_str``
@@ -169,10 +157,9 @@ def deserialize_df(
     if isinstance(obj, LocalBoundedDataFrame):
         return obj
     elif isinstance(obj, str):
-
-
-
-        return pickle.loads(fs.readbytes(obj))
+        fs, path = url_to_fs(obj)
+        with fs.open(path, "rb") as f:
+            return pickle.load(f)
     raise ValueError("data is invalid")
 
 
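
`serialize_df` and `deserialize_df` now resolve the file path themselves and use a plain `open`/`write`/`read` round trip instead of pyfilesystem2's `writebytes`/`readbytes`. A rough sketch of that new path, assuming `triad.utils.io.url_to_fs` behaves like `fsspec.core.url_to_fs`; `write_blob`/`read_blob` are illustrative names, not fugue API:

```python
import os
import pickle
import tempfile

from fsspec.core import url_to_fs

def write_blob(file_path: str, data: bytes) -> None:
    fs, path = url_to_fs(file_path)
    with fs.open(path, "wb") as f:  # replaces _fs.writebytes(...)
        f.write(data)

def read_blob(file_path: str) -> bytes:
    fs, path = url_to_fs(file_path)
    with fs.open(path, "rb") as f:  # replaces fs.readbytes(obj)
        return f.read()

tmp = os.path.join(tempfile.mkdtemp(), "blob.pkl")
write_blob(tmp, pickle.dumps({"answer": 42}))
assert pickle.loads(read_blob(tmp)) == {"answer": 42}
```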
fugue/execution/execution_engine.py
CHANGED

@@ -18,7 +18,6 @@ from typing import (
 from uuid import uuid4
 
 from triad import ParamDict, Schema, SerializableRLock, assert_or_throw, to_uuid
-from triad.collections.fs import FileSystem
 from triad.collections.function_wrapper import AnnotatedParam
 from triad.exceptions import InvalidOperationError
 from triad.utils.convert import to_size
@@ -471,12 +470,6 @@ class ExecutionEngine(FugueEngineBase):
         """
         self._sql_engine = engine
 
-    @property
-    @abstractmethod
-    def fs(self) -> FileSystem:  # pragma: no cover
-        """File system of this engine instance"""
-        raise NotImplementedError
-
     @abstractmethod
     def create_default_map_engine(self) -> MapEngine:  # pragma: no cover
         """Default MapEngine if user doesn't specify"""
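
Removing the abstract `fs` property means engines no longer hand out a shared pyfilesystem2 object; callers resolve a filesystem from each path instead. A hedged before/after sketch (`engine` and `path` are placeholders, not identifiers from this diff):

```python
from fsspec.core import url_to_fs

# before (0.8.7.dev6): checks went through the engine-owned filesystem
#     if engine.fs.exists(path): ...
# after (0.8.7.dev7): resolve a filesystem from the path itself
def path_exists(path: str) -> bool:
    fs, p = url_to_fs(path)
    return fs.exists(p)

print(path_exists("/tmp"))  # True on most systems
```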
fugue/execution/native_execution_engine.py
CHANGED

@@ -1,12 +1,13 @@
 import logging
 import os
 from typing import Any, Callable, Dict, List, Optional, Type, Union
+
 import numpy as np
 import pandas as pd
 from triad import Schema
 from triad.collections.dict import IndexedOrderedDict
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import makedirs
 from triad.utils.pandas_like import PandasUtils
 
 from fugue._utils.io import load_df, save_df
@@ -179,7 +180,6 @@ class NativeExecutionEngine(ExecutionEngine):
 
     def __init__(self, conf: Any = None):
         super().__init__(conf)
-        self._fs = FileSystem()
         self._log = logging.getLogger()
 
     def __repr__(self) -> str:
@@ -189,10 +189,6 @@ class NativeExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self._log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self._fs
-
     @property
     def is_distributed(self) -> bool:
         return False
@@ -395,9 +391,7 @@ class NativeExecutionEngine(ExecutionEngine):
         **kwargs: Any,
     ) -> LocalBoundedDataFrame:
         return self.to_df(
-            load_df(
-                path, format_hint=format_hint, columns=columns, fs=self.fs, **kwargs
-            )
+            load_df(path, format_hint=format_hint, columns=columns, **kwargs)
         )
 
     def save_df(
@@ -413,9 +407,9 @@ class NativeExecutionEngine(ExecutionEngine):
         partition_spec = partition_spec or PartitionSpec()
         if not force_single and not partition_spec.empty:
             kwargs["partition_cols"] = partition_spec.partition_by
-        self.fs.makedirs(os.path.dirname(path), recreate=True)
+        makedirs(os.path.dirname(path), exist_ok=True)
         df = self.to_df(df)
-        save_df(df, path, format_hint=format_hint, mode=mode, fs=self.fs, **kwargs)
+        save_df(df, path, format_hint=format_hint, mode=mode, **kwargs)
 
 
 @fugue_annotated_param(NativeExecutionEngine)
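
`NativeExecutionEngine.save_df` now prepares the output directory with triad's `makedirs` helper rather than the engine-owned `FileSystem`. A rough fsspec equivalent of that single call (the real `triad.utils.io.makedirs` may differ internally; `ensure_parent_dir` is an illustrative name, and `_parent` is a private fsspec helper):

```python
from fsspec.core import url_to_fs

def ensure_parent_dir(file_url: str) -> None:
    # mirrors makedirs(os.path.dirname(path), exist_ok=True) in save_df above
    fs, path = url_to_fs(file_url)
    fs.makedirs(fs._parent(path), exist_ok=True)

ensure_parent_dir("/tmp/fugue_out/part-0.parquet")
```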
fugue/workflow/_checkpoint.py
CHANGED
@@ -1,14 +1,15 @@
 from typing import Any
 
-
+from triad.utils.assertion import assert_or_throw
+from triad.utils.hash import to_uuid
+from triad.utils.io import exists, join, makedirs, rm
+
 from fugue.collections.partition import PartitionSpec
 from fugue.collections.yielded import PhysicalYielded
 from fugue.constants import FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH
 from fugue.dataframe import DataFrame
 from fugue.exceptions import FugueWorkflowCompileError, FugueWorkflowRuntimeError
 from fugue.execution.execution_engine import ExecutionEngine
-from triad.utils.assertion import assert_or_throw
-from triad.utils.hash import to_uuid
 
 
 class Checkpoint(object):
@@ -130,7 +131,6 @@ class WeakCheckpoint(Checkpoint):
 class CheckpointPath(object):
     def __init__(self, engine: ExecutionEngine):
         self._engine = engine
-        self._fs = engine.fs
         self._log = engine.log
         self._path = engine.conf.get(FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH, "").strip()
         self._temp_path = ""
@@ -143,14 +143,14 @@ class CheckpointPath(object):
         if self._path == "":
             self._temp_path = ""
             return ""
-        self._temp_path =
-        self._fs.makedirs(self._temp_path, recreate=True)
+        self._temp_path = join(self._path, execution_id)
+        makedirs(self._temp_path, exist_ok=True)
         return self._temp_path
 
     def remove_temp_path(self):
         if self._temp_path != "":
             try:
-                self._fs.removetree(self._temp_path)
+                rm(self._temp_path, recursive=True)
             except Exception as e:  # pragma: no cover
                 self._log.info("Unable to remove " + self._temp_path, e)
 
@@ -162,7 +162,7 @@ class CheckpointPath(object):
                 f"{FUGUE_CONF_WORKFLOW_CHECKPOINT_PATH} is not set"
             ),
         )
-        return
+        return join(path, obj_id + ".parquet")
 
     def get_table_name(self, obj_id: str, permanent: bool) -> str:
         path = self._path if permanent else self._temp_path
@@ -170,6 +170,6 @@ class CheckpointPath(object):
 
     def temp_file_exists(self, path: str) -> bool:
         try:
-            return self._fs.exists(path)
+            return exists(path)
         except Exception:  # pragma: no cover
             return False
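
The checkpoint logic now leans on four small helpers from `triad.utils.io`. Their contracts, inferred from the call sites in this diff (treat the snippet as a sketch, with an illustrative path):

```python
from triad.utils.io import exists, join, makedirs, rm

tmp = join("/tmp/fugue_checkpoints", "execution-123")  # path join
makedirs(tmp, exist_ok=True)                           # recursive mkdir
assert exists(tmp)                                     # existence check
rm(tmp, recursive=True)                                # remove the tree
assert not exists(tmp)
```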
{fugue-0.8.7.dev6.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.7.dev6
+Version: 0.8.7.dev7
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: triad ==0.9.2.
+Requires-Dist: triad ==0.9.2.dev8
 Requires-Dist: adagio >=0.2.4
 Requires-Dist: qpd >=0.4.4
 Requires-Dist: fugue-sql-antlr >=0.1.6