fugue 0.8.7.dev5__py3-none-any.whl → 0.8.7.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/__init__.py +0 -1
- fugue/_utils/io.py +84 -89
- fugue/api.py +1 -0
- fugue/dataframe/api.py +19 -2
- fugue/dataframe/arrow_dataframe.py +48 -11
- fugue/dataframe/dataframe.py +20 -2
- fugue/dataframe/function_wrapper.py +1 -1
- fugue/dataframe/iterable_dataframe.py +3 -0
- fugue/dataframe/pandas_dataframe.py +73 -0
- fugue/dataframe/utils.py +78 -25
- fugue/execution/execution_engine.py +1 -8
- fugue/execution/native_execution_engine.py +5 -11
- fugue/plugins.py +1 -0
- fugue/workflow/_checkpoint.py +9 -9
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/METADATA +4 -4
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/RECORD +40 -38
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/WHEEL +1 -1
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/entry_points.txt +3 -2
- fugue_dask/_io.py +22 -29
- fugue_dask/_utils.py +15 -2
- fugue_dask/dataframe.py +105 -18
- fugue_dask/execution_engine.py +5 -12
- fugue_duckdb/_io.py +21 -37
- fugue_duckdb/dataframe.py +87 -29
- fugue_duckdb/execution_engine.py +2 -7
- fugue_ibis/dataframe.py +13 -0
- fugue_ibis/execution_engine.py +1 -5
- fugue_polars/polars_dataframe.py +53 -16
- fugue_ray/_utils/io.py +15 -17
- fugue_ray/dataframe.py +71 -19
- fugue_spark/_utils/io.py +3 -5
- fugue_spark/dataframe.py +69 -13
- fugue_spark/execution_engine.py +2 -7
- fugue_test/builtin_suite.py +12 -12
- fugue_test/dataframe_suite.py +14 -0
- fugue_test/execution_suite.py +13 -18
- fugue_test/plugins/misc/__init__.py +2 -0
- fugue_test/plugins/misc/fixtures.py +18 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/LICENSE +0 -0
- {fugue-0.8.7.dev5.dist-info → fugue-0.8.7.dev7.dist-info}/top_level.txt +0 -0
fugue_dask/_io.py
CHANGED
@@ -1,13 +1,12 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import fsspec
-import fs as pfs
 import pandas as pd
 from dask import dataframe as dd
+from fsspec import AbstractFileSystem
 from triad.collections.dict import ParamDict
-from triad.collections.fs import FileSystem
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_or_throw
+from triad.utils.io import join, makedirs, url_to_fs
 
 from fugue._utils.io import FileParser, _get_single_files
 from fugue_dask.dataframe import DaskDataFrame
@@ -19,7 +18,7 @@ def load_df(
     uri: Union[str, List[str]],
     format_hint: Optional[str] = None,
     columns: Any = None,
-    fs: Optional[
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> DaskDataFrame:
     if isinstance(uri, str):
@@ -39,7 +38,7 @@ def save_df(
     uri: str,
     format_hint: Optional[str] = None,
     mode: str = "overwrite",
-    fs: Optional[
+    fs: Optional[AbstractFileSystem] = None,
     **kwargs: Any,
 ) -> None:
     assert_or_throw(
@@ -48,16 +47,13 @@ def save_df(
     )
     p = FileParser(uri, format_hint).assert_no_glob()
     if fs is None:
-        fs =
+        fs, _ = url_to_fs(uri)
     if fs.exists(uri):
         assert_or_throw(mode == "overwrite", FileExistsError(uri))
         try:
-            fs.
-        except Exception:
-
-                fs.removetree(uri)
-            except Exception:  # pragma: no cover
-                pass
+            fs.rm(uri, recursive=True)
+        except Exception:  # pragma: no cover
+            pass
     _FORMAT_SAVE[p.file_format](df, p, **kwargs)
 
 
@@ -67,7 +63,7 @@ def _save_parquet(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
         "write_index": False,
         **kwargs,
     }
-    DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.
+    DASK_UTILS.to_parquet_friendly(df.native).to_parquet(p.path, **params)
 
 
 def _load_parquet(
@@ -80,27 +76,26 @@ def _load_parquet(
     if pd.__version__ >= "1.5":
         dtype_backend = "pyarrow"
     if columns is None:
-        pdf = dd.read_parquet(p.
+        pdf = dd.read_parquet(p.path, dtype_backend=dtype_backend, **params)
         schema = Schema(pdf.head(1))
         return pdf, schema
     if isinstance(columns, list):  # column names
         pdf = dd.read_parquet(
-            p.
+            p.path, columns=columns, dtype_backend=dtype_backend, **params
         )
         schema = Schema(pdf.head(1))
         return pdf, schema
     schema = Schema(columns)
     pdf = dd.read_parquet(
-        p.
+        p.path, columns=schema.names, dtype_backend=dtype_backend, **params
     )
     return pdf, schema
 
 
 def _save_csv(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
-
-    fs.makedirs(path, exist_ok=True)
+    makedirs(p.path, exist_ok=True)
     df.native.to_csv(
-
+        p.join("*.csv").path, **{"index": False, "header": False, **kwargs}
     )
 
 
@@ -108,7 +103,7 @@ def _safe_load_csv(path: str, **kwargs: Any) -> dd.DataFrame:
     try:
         return dd.read_csv(path, **kwargs)
     except (IsADirectoryError, PermissionError):
-        return dd.read_csv(
+        return dd.read_csv(join(path, "*.csv"), **kwargs)
 
 
 def _load_csv(  # noqa: C901
@@ -127,7 +122,7 @@ def _load_csv(  # noqa: C901
     header = kw["header"]
     del kw["header"]
     if str(header) in ["True", "0"]:
-        pdf = _safe_load_csv(p.
+        pdf = _safe_load_csv(p.path, **{"header": 0, **kw})
         if columns is None:
             return pdf, None
         if isinstance(columns, list):  # column names
@@ -138,34 +133,32 @@ def _load_csv(  # noqa: C901
         if columns is None:
             raise ValueError("columns must be set if without header")
         if isinstance(columns, list):  # column names
-            pdf = _safe_load_csv(p.
+            pdf = _safe_load_csv(p.path, **{"header": None, "names": columns, **kw})
            return pdf, None
         schema = Schema(columns)
-        pdf = _safe_load_csv(p.
+        pdf = _safe_load_csv(p.path, **{"header": None, "names": schema.names, **kw})
         return pdf, schema
     else:
         raise NotImplementedError(f"{header} is not supported")
 
 
 def _save_json(df: DaskDataFrame, p: FileParser, **kwargs: Any) -> None:
-
-
-    df.native.to_json(pfs.path.combine(p.uri, "*.json"), **kwargs)
+    makedirs(p.path, exist_ok=True)
+    df.native.to_json(p.join("*.json").path, **kwargs)
 
 
 def _safe_load_json(path: str, **kwargs: Any) -> dd.DataFrame:
     try:
         return dd.read_json(path, **kwargs)
     except (IsADirectoryError, PermissionError):
-        x = dd.read_json(
-        print(x.compute())
+        x = dd.read_json(join(path, "*.json"), **kwargs)
         return x
 
 
 def _load_json(
     p: FileParser, columns: Any = None, **kwargs: Any
 ) -> Tuple[dd.DataFrame, Any]:
-    pdf = _safe_load_json(p.
+    pdf = _safe_load_json(p.path, **kwargs).reset_index(drop=True)
     if columns is None:
         return pdf, None
     if isinstance(columns, list):  # column names
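The net effect in this module is a move from pyfilesystem (`fs as pfs`, `triad.collections.fs.FileSystem`) to fsspec-backed helpers, and from `FileParser.uri` to `FileParser.path`. As a rough illustration only (not Fugue's own helper), the overwrite step above can be reproduced with plain fsspec; this assumes `triad.utils.io.url_to_fs` mirrors `fsspec.core.url_to_fs`, and the target path is made up.

# Minimal sketch of the fsspec-based overwrite step, assuming triad.utils.io.url_to_fs
# behaves like fsspec.core.url_to_fs; the target path is hypothetical.
from fsspec.core import url_to_fs

def clear_target(uri: str) -> None:
    fs, path = url_to_fs(uri)  # resolve an AbstractFileSystem plus a normalized path
    if fs.exists(path):
        # one recursive rm covers both a file and a directory, replacing the
        # old remove()/removetree() fallback chain
        fs.rm(path, recursive=True)

clear_target("/tmp/fugue_io_demo/output.parquet")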
fugue_dask/_utils.py
CHANGED
@@ -1,13 +1,14 @@
 import math
-from typing import Any, List, Optional, Tuple
+from typing import Any, Callable, List, Optional, Tuple, TypeVar
 
 import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 from dask.dataframe.core import DataFrame
+from dask.delayed import delayed
 from dask.distributed import Client, get_client
-from triad.utils.pandas_like import
+from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
 from triad.utils.pyarrow import to_pandas_dtype
 
 import fugue.api as fa
@@ -16,6 +17,7 @@ from fugue.constants import FUGUE_CONF_DEFAULT_PARTITIONS
 from ._constants import FUGUE_DASK_CONF_DEFAULT_PARTITIONS
 
 _FUGUE_DASK_TEMP_IDX_COLUMN = "_fugue_dask_temp_index"
+T = TypeVar("T")
 
 
 def get_default_partitions() -> int:
@@ -28,6 +30,17 @@ def get_default_partitions() -> int:
     return n if n > 0 else fa.get_current_parallelism() * 2
 
 
+def collect(df: dd.DataFrame, func: Callable[[pd.DataFrame], T]) -> Tuple[T]:
+    """Compute each partition in parallel and collect the results
+
+    :param df: dask dataframe
+    :return: the collected result
+    """
+    dfs = df.to_delayed()
+    objs = [delayed(func)(df) for df in dfs]
+    return dd.compute(*objs)
+
+
 def hash_repartition(df: dd.DataFrame, num: int, cols: List[Any]) -> dd.DataFrame:
     """Repartition the dataframe by hashing the given columns
 
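The new `collect` helper is the building block the DaskDataFrame changes below rely on: it wraps a pandas-level function around every partition as a delayed call and computes them together. The snippet below reproduces the same pattern with plain dask; the sample frame and the `len` reducer are only for illustration.

# Standalone illustration of the per-partition collect pattern (sample data only).
import dask.dataframe as dd
import pandas as pd
from dask import compute, delayed

ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=3)

# run a pandas-level function on every partition, then gather the results
parts = ddf.to_delayed()
sizes = compute(*[delayed(len)(p) for p in parts])
print(sizes)  # a tuple with one entry per partition, summing to 10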
fugue_dask/dataframe.py
CHANGED
@@ -3,20 +3,21 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple
 import dask.dataframe as dd
 import pandas as pd
 import pyarrow as pa
+from triad import assert_or_throw
 from triad.collections.schema import Schema
 from triad.utils.assertion import assert_arg_not_none
 from triad.utils.pandas_like import PD_UTILS
 from triad.utils.pyarrow import cast_pa_table
 
-from fugue.dataframe import
-    ArrowDataFrame,
-    DataFrame,
-    LocalBoundedDataFrame,
-    PandasDataFrame,
-)
+from fugue.dataframe import DataFrame, LocalBoundedDataFrame, PandasDataFrame
 from fugue.dataframe.dataframe import _input_schema
+from fugue.dataframe.pandas_dataframe import _pd_as_dicts
 from fugue.exceptions import FugueDataFrameOperationError
 from fugue.plugins import (
+    as_array,
+    as_array_iterable,
+    as_dict_iterable,
+    as_dicts,
     as_local_bounded,
     count,
     drop_columns,
@@ -32,7 +33,7 @@ from fugue.plugins import (
 )
 
 from ._constants import FUGUE_DASK_USE_ARROW
-from ._utils import DASK_UTILS, get_default_partitions
+from ._utils import DASK_UTILS, collect, get_default_partitions
 
 
 class DaskDataFrame(DataFrame):
@@ -150,8 +151,16 @@ class DaskDataFrame(DataFrame):
         )
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
-
-        return
+        schema = self.schema.pa_schema
+        return pa.concat_tables(
+            collect(
+                self.native,
+                lambda df: cast_pa_table(
+                    pa.Table.from_pandas(df.reset_index(drop=True), schema=schema),
+                    schema=schema,
+                ),
+            )
+        )
 
     def rename(self, columns: Dict[str, str]) -> DataFrame:
         try:
@@ -170,17 +179,28 @@ class DaskDataFrame(DataFrame):
     def as_array(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> List[Any]:
-
-
-
-
-
-        )
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        res: List[List[Any]] = []
+        for x in chunks:
+            res += x
+        return res
 
     def as_array_iterable(
         self, columns: Optional[List[str]] = None, type_safe: bool = False
     ) -> Iterable[Any]:
-
+        chunks = _to_array_chunks(self.native, columns, type_safe, schema=self.schema)
+        for x in chunks:
+            yield from x
+
+    def as_dicts(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> List[Dict[str, Any]]:
+        return _dd_as_dicts(self.native, columns)
+
+    def as_dict_iterable(
+        self, columns: Optional[List[str]] = None, type_safe: bool = False
+    ) -> Iterable[Dict[str, Any]]:
+        yield from _dd_as_dict_iterable(self.native, columns)
 
     def head(
         self, n: int, columns: Optional[List[str]] = None
@@ -197,8 +217,11 @@ class DaskDataFrame(DataFrame):
             assert_arg_not_none(schema, "schema")
             return pdf, schema
         DASK_UTILS.ensure_compatible(pdf)
-
-
+        # when pdf contains bytes, or any object types, and schema contains str
+        # there is no way to get the real schema of the pdf, (pschema will contain
+        # strs instead of the real types) so we have to force cast it to the schema
+        if schema is None:
+            pschema = Schema(DASK_UTILS.to_schema(pdf))
             return pdf, pschema.assert_not_empty()
         pdf = pdf[schema.assert_not_empty().names]
         return (
@@ -295,6 +318,48 @@ def _dd_head(
     return PandasDataFrame(res) if as_fugue else res
 
 
+@as_array.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    res: List[List[Any]] = []
+    for x in chunks:
+        res += x
+    return res
+
+
+@as_array_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_array_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Any]:
+    chunks = _to_array_chunks(df, columns, type_safe)
+    for x in chunks:
+        yield from x
+
+
+@as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dicts(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> List[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    res: List[Dict[str, Any]] = []
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        res += x
+    return res
+
+
+@as_dict_iterable.candidate(lambda df, *args, **kwargs: isinstance(df, dd.DataFrame))
+def _dd_as_dict_iterable(
+    df: dd.DataFrame, columns: Optional[List[str]] = None, type_safe: bool = False
+) -> Iterable[Dict[str, Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+    for x in collect(_df, lambda df: _pd_as_dicts(df, columns)):
+        yield from x
+
+
 def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
     missing = set(columns) - set(df.columns)
     if len(missing) > 0:
@@ -303,3 +368,25 @@ def _assert_no_missing(df: dd.DataFrame, columns: Iterable[Any]) -> None:
 
 def _adjust_df(res: dd.DataFrame, as_fugue: bool):
     return res if not as_fugue else DaskDataFrame(res)
+
+
+def _to_array_chunks(
+    df: dd.DataFrame,
+    columns: Optional[List[str]] = None,
+    type_safe: bool = False,
+    schema: Optional[Schema] = None,
+) -> Tuple[List[Any]]:
+    assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
+    _df = df if columns is None or len(columns) == 0 else df[columns]
+
+    def _to_list(pdf: pd.DataFrame) -> List[Any]:
+        return list(
+            PD_UTILS.as_array_iterable(
+                pdf,
+                schema=None if schema is None else schema.pa_schema,
+                columns=columns,
+                type_safe=type_safe,
+            )
+        )
+
+    return collect(_df, _to_list)
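The conversion methods above now all funnel through the `collect` helper: each partition is converted on the workers and only the converted pieces are brought back and combined. A simplified, self-contained version of the `as_arrow` path (without Fugue's schema cast) looks like the sketch below; the sample data is made up.

# Simplified sketch of the per-partition Arrow conversion behind as_arrow
# (sample data; the real method also casts every table to the Fugue schema).
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa
from dask import compute, delayed

def to_table(pdf: pd.DataFrame) -> pa.Table:
    # drop the dask index so the per-partition tables concatenate cleanly
    return pa.Table.from_pandas(pdf.reset_index(drop=True))

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3, 4], "b": list("wxyz")}), npartitions=2)
tables = compute(*[delayed(to_table)(p) for p in ddf.to_delayed()])
print(pa.concat_tables(tables).num_rows)  # 4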
fugue_dask/execution_engine.py
CHANGED
@@ -7,18 +7,17 @@ import pandas as pd
 from distributed import Client
 from triad.collections import Schema
 from triad.collections.dict import IndexedOrderedDict, ParamDict
-from triad.collections.fs import FileSystem
 from triad.utils.assertion import assert_or_throw
 from triad.utils.hash import to_uuid
 from triad.utils.pandas_like import PandasUtils
 from triad.utils.threading import RunOnce
+from triad.utils.io import makedirs
 from fugue import StructuredRawSQL
 from fugue.collections.partition import (
     PartitionCursor,
     PartitionSpec,
     parse_presort_exp,
 )
-from fugue.exceptions import FugueBug
 from fugue.constants import KEYWORD_PARALLELISM, KEYWORD_ROWCOUNT
 from fugue.dataframe import (
     AnyDataFrame,
@@ -28,6 +27,7 @@ from fugue.dataframe import (
     PandasDataFrame,
 )
 from fugue.dataframe.utils import get_join_schemas
+from fugue.exceptions import FugueBug
 from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
 from fugue.execution.native_execution_engine import NativeExecutionEngine
 from fugue_dask._constants import FUGUE_DASK_DEFAULT_CONF
@@ -206,7 +206,6 @@ class DaskExecutionEngine(ExecutionEngine):
         p = ParamDict(FUGUE_DASK_DEFAULT_CONF)
         p.update(ParamDict(conf))
         super().__init__(p)
-        self._fs = FileSystem()
         self._log = logging.getLogger()
         self._client = DASK_UTILS.get_or_create_client(dask_client)
         self._native = NativeExecutionEngine(conf=conf)
@@ -227,10 +226,6 @@ class DaskExecutionEngine(ExecutionEngine):
     def log(self) -> logging.Logger:
         return self._log
 
-    @property
-    def fs(self) -> FileSystem:
-        return self._fs
-
     def create_default_sql_engine(self) -> SQLEngine:
         return DaskSQLEngine(self)
 
@@ -527,9 +522,7 @@ class DaskExecutionEngine(ExecutionEngine):
         **kwargs: Any,
     ) -> DaskDataFrame:
         return self.to_df(
-            load_df(
-                path, format_hint=format_hint, columns=columns, fs=self.fs, **kwargs
-            )
+            load_df(path, format_hint=format_hint, columns=columns, **kwargs)
         )
 
     def save_df(
@@ -556,9 +549,9 @@ class DaskExecutionEngine(ExecutionEngine):
         else:
             if not partition_spec.empty:
                 kwargs["partition_on"] = partition_spec.partition_by
-
+            makedirs(os.path.dirname(path), exist_ok=True)
             df = self.to_df(df)
-            save_df(df, path, format_hint=format_hint, mode=mode,
+            save_df(df, path, format_hint=format_hint, mode=mode, **kwargs)
 
 
 def to_dask_engine_df(df: Any, schema: Any = None) -> DaskDataFrame:
fugue_duckdb/_io.py
CHANGED
@@ -3,9 +3,9 @@ from typing import Any, Iterable, List, Optional, Union
 
 from duckdb import DuckDBPyConnection
 from triad import ParamDict, Schema
-from triad.collections.fs import FileSystem
-from triad.utils.assertion import assert_or_throw
 
+from triad.utils.assertion import assert_or_throw
+from triad.utils.io import isdir, makedirs, rm, exists
 from fugue._utils.io import FileParser, load_df, save_df
 from fugue.collections.sql import TempTableName
 from fugue.dataframe import ArrowDataFrame, LocalBoundedDataFrame
@@ -18,26 +18,17 @@ from fugue_duckdb._utils import (
 from fugue_duckdb.dataframe import DuckDataFrame
 
 
-def
-    fp: Iterable[FileParser], fs: FileSystem, fmt: str
-) -> Iterable[FileParser]:
-    def _isdir(d: str) -> bool:
-        try:
-            return fs.isdir(d)
-        except Exception:  # pragma: no cover
-            return False
-
+def _get_files(fp: Iterable[FileParser], fmt: str) -> Iterable[FileParser]:
     for f in fp:
-        if f.
-            yield f.
+        if not f.has_glob and isdir(f.path):
+            yield from f.join("*." + fmt, fmt).find_all()
         else:
             yield f
 
 
 class DuckDBIO:
-    def __init__(self,
+    def __init__(self, con: DuckDBPyConnection) -> None:
         self._con = con
-        self._fs = fs
         self._format_load = {"csv": self._load_csv, "parquet": self._load_parquet}
         self._format_save = {"csv": self._save_csv, "parquet": self._save_parquet}
 
@@ -55,11 +46,9 @@ class DuckDBIO:
         else:
             fp = [FileParser(u, format_hint) for u in uri]
         if fp[0].file_format not in self._format_load:
-            return load_df(
-                uri, format_hint=format_hint, columns=columns, fs=self._fs, **kwargs
-            )
+            return load_df(uri, format_hint=format_hint, columns=columns, **kwargs)
         dfs: List[DuckDataFrame] = []
-        for f in
+        for f in _get_files(fp, fp[0].file_format):
             df = self._format_load[f.file_format](f, columns, **kwargs)
             dfs.append(df)
         rel = dfs[0].native
@@ -83,26 +72,20 @@ class DuckDBIO:
         )
         p = FileParser(uri, format_hint).assert_no_glob()
         if (p.file_format not in self._format_save) or ("partition_cols" in kwargs):
-
+            makedirs(os.path.dirname(uri), exist_ok=True)
             ldf = ArrowDataFrame(df.as_arrow())
-            return save_df(
-
-            )
-        fs = self._fs
-        if fs.exists(uri):
+            return save_df(ldf, uri=uri, format_hint=format_hint, mode=mode, **kwargs)
+        if exists(uri):
             assert_or_throw(mode == "overwrite", FileExistsError(uri))
             try:
-
-            except Exception:
-
-
-                except Exception:  # pragma: no cover
-                    pass
-            if not fs.exists(p.parent):
-                fs.makedirs(p.parent, recreate=True)
+                rm(uri, recursive=True)
+            except Exception:  # pragma: no cover
+                pass
+        p.make_parent_dirs()
         self._format_save[p.file_format](df, p, **kwargs)
 
     def _save_csv(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
+        p.assert_no_glob()
         dn = TempTableName()
         df.native.create_view(dn.key)
         kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
@@ -111,7 +94,7 @@ class DuckDBIO:
         for k, v in kw.items():
             params.append(f"{k.upper()} " + encode_value_to_expr(v))
         pm = ", ".join(params)
-        query = f"COPY {dn.key} TO {encode_value_to_expr(p.
+        query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)} WITH ({pm})"
         self._con.execute(query)
 
     def _load_csv(  # noqa: C901
@@ -125,7 +108,7 @@ class DuckDBIO:
             ValueError("when csv has no header, columns must be specified"),
         )
         kw.pop("auto_detect", None)
-        params: List[str] = [encode_value_to_expr(p.
+        params: List[str] = [encode_value_to_expr(p.path)]
         kw["header"] = 1 if header else 0
         kw["auto_detect"] = 1 if infer_schema else 0
         if infer_schema:
@@ -188,6 +171,7 @@ class DuckDBIO:
         return DuckDataFrame(self._con.from_query(query))
 
     def _save_parquet(self, df: DuckDataFrame, p: FileParser, **kwargs: Any):
+        p.assert_no_glob()
         dn = TempTableName()
         df.native.create_view(dn.key)
         kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
@@ -196,7 +180,7 @@ class DuckDBIO:
         for k, v in kw.items():
             params.append(f"{k.upper()} " + encode_value_to_expr(v))
         pm = ", ".join(params)
-        query = f"COPY {dn.key} TO {encode_value_to_expr(p.
+        query = f"COPY {dn.key} TO {encode_value_to_expr(p.path)}"
         if len(params) > 0:
             query += f" WITH ({pm})"
         self._con.execute(query)
@@ -205,7 +189,7 @@ class DuckDBIO:
         self, p: FileParser, columns: Any = None, **kwargs: Any
     ) -> DuckDataFrame:
         kw = ParamDict({k.lower(): v for k, v in kwargs.items()})
-        params: List[str] = [encode_value_to_expr(p.
+        params: List[str] = [encode_value_to_expr(p.path)]
         if isinstance(columns, list):
             cols = ", ".join(encode_column_names(columns))
         else: