fugue 0.9.2.dev1-py3-none-any.whl → 0.9.3-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. The information is provided for informational purposes only.
- fugue/_utils/io.py +14 -2
- fugue/dataframe/function_wrapper.py +14 -4
- fugue/extensions/transformer/convert.py +2 -2
- fugue/rpc/flask.py +13 -5
- {fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/METADATA +70 -58
- {fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/RECORD +22 -21
- {fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/WHEEL +1 -1
- fugue-0.9.3.dist-info/entry_points.txt +11 -0
- fugue_dask/_dask_sql_wrapper.py +76 -0
- fugue_dask/_utils.py +9 -5
- fugue_dask/dataframe.py +1 -1
- fugue_dask/execution_engine.py +8 -11
- fugue_duckdb/dataframe.py +5 -5
- fugue_duckdb/execution_engine.py +1 -1
- fugue_ibis/execution_engine.py +7 -6
- fugue_ray/_utils/io.py +5 -1
- fugue_spark/_utils/convert.py +18 -12
- fugue_test/builtin_suite.py +6 -4
- fugue_test/execution_suite.py +2 -0
- fugue_version/__init__.py +1 -1
- fugue-0.9.2.dev1.dist-info/entry_points.txt +0 -12
- {fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info/licenses}/LICENSE +0 -0
- {fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/top_level.txt +0 -0
fugue/_utils/io.py
CHANGED
@@ -20,6 +20,10 @@ class FileParser(object):
         self._has_glob = "*" in path or "?" in path
         self._raw_path = path
         self._fs, self._fs_path = url_to_fs(path)
+        if not self._has_glob and self._fs.isdir(self._fs_path):
+            self._is_dir = True
+        else:
+            self._is_dir = False
         if not self.is_local:
             self._path = self._fs.unstrip_protocol(self._fs_path)
         else:

@@ -43,11 +47,15 @@ class FileParser(object):
         return self

     @property
-    def has_glob(self):
+    def is_dir(self) -> bool:
+        return self._is_dir
+
+    @property
+    def has_glob(self) -> bool:
         return self._has_glob

     @property
-    def is_local(self):
+    def is_local(self) -> bool:
         return isinstance(self._fs, LocalFileSystem)

     def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":

@@ -65,6 +73,10 @@ class FileParser(object):
     def path(self) -> str:
         return self._path

+    def as_dir_path(self) -> str:
+        assert_or_throw(self.is_dir, f"{self.raw_path} is not a directory")
+        return self.path + self._fs.sep
+
     @property
     def raw_path(self) -> str:
         return self._raw_path
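Note: the new `is_dir` property and `as_dir_path()` helper let callers detect directory inputs up front (this is used by the Ray loader later in this diff). A minimal usage sketch, assuming a local fsspec path and the constructor shown in the hunk above:

```python
# Sketch only: import path and constructor usage are inferred from this diff.
from fugue._utils.io import FileParser

fp = FileParser("/tmp/data")  # a directory path with no glob characters
if fp.is_dir:
    # directory paths gain a trailing filesystem separator, so downstream
    # readers can tell folders from single files
    print(fp.as_dir_path())   # e.g. "/tmp/data/"
else:
    print(fp.path)
```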
fugue/dataframe/function_wrapper.py
CHANGED

@@ -91,7 +91,9 @@ class DataFrameFunctionWrapper(FunctionWrapper):
                     isinstance(p[k], DataFrame),
                     lambda: TypeError(f"{p[k]} is not a DataFrame"),
                 )
-                if v.is_per_row:
+                if v.is_per_row:  # pragma: no cover
+                    # TODO: this branch is used only if row annotations
+                    # are allowed as input
                     assert_or_throw(
                         row_param_info is None,
                         lambda: ValueError("only one row parameter is allowed"),

@@ -110,7 +112,9 @@ class DataFrameFunctionWrapper(FunctionWrapper):
             raise ValueError(f"{p} are not acceptable parameters")
         if row_param_info is None:
             return self._run_func(rargs, output, output_schema, ctx, raw=False)
-        else:  #
+        else:  # pragma: no cover
+            # input contains row parameter
+            # TODO: this branch is used only if row annotations are allowed as input

             def _dfs() -> Iterable[Any]:
                 k, v, df = row_param_info

@@ -154,6 +158,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
         annotation == Callable
         or annotation == callable  # pylint: disable=comparison-with-callable
         or str(annotation).startswith("typing.Callable")
+        or str(annotation).startswith("collections.abc.Callable")
     ),
 )
 class _CallableParam(AnnotatedParam):

@@ -168,6 +173,9 @@ class _CallableParam(AnnotatedParam):
         or annotation == Optional[callable]
         or str(annotation).startswith("typing.Union[typing.Callable")  # 3.8-
         or str(annotation).startswith("typing.Optional[typing.Callable")  # 3.9+
+        or str(annotation).startswith(
+            "typing.Optional[collections.abc.Callable]"
+        )  # 3.9+
     ),
 )
 class _OptionalCallableParam(AnnotatedParam):

@@ -233,7 +241,8 @@ class DataFrameParam(_DataFrameParamBase):


 @fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
-class RowParam(_DataFrameParamBase):
+class RowParam(_DataFrameParamBase):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
     @property
     def is_per_row(self) -> bool:
         return True

@@ -243,7 +252,8 @@ class RowParam(_DataFrameParamBase):


 @fugue_annotated_param(Dict[str, Any])
-class DictParam(RowParam):
+class DictParam(RowParam):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
     def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
         yield from df.as_dict_iterable()
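The added `startswith` checks exist because PEP 585 generics stringify with a `collections.abc.` prefix rather than `typing.`. This standard-library behavior can be verified directly (Python 3.9+):

```python
import collections.abc
import typing
from typing import Optional

print(str(typing.Callable[[int], int]))           # typing.Callable[[int], int]
print(str(collections.abc.Callable[[int], int]))  # collections.abc.Callable[[int], int]
# On 3.9+, a Union with None renders via Optional, matching the new check above
print(str(Optional[collections.abc.Callable]))    # typing.Optional[collections.abc.Callable]
```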
fugue/extensions/transformer/convert.py
CHANGED

@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[
+            func, "^[lspq][fF]?x*z?$", "^[lspqr]$"
         )
         tr._output_schema_arg = schema  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore

@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
         validation_rules.update(parse_validation_rules_from_comment(func))
         tr = _FuncAsOutputTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[
+            func, "^[lspq][fF]?x*z?$", "^[lspnqr]$"
         )
         tr._output_schema_arg = None  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
fugue/rpc/flask.py
CHANGED
@@ -1,6 +1,5 @@
-import base64
 import logging
-import
+import json
 from threading import Thread
 from typing import Any, Optional, Tuple, Dict, List

@@ -60,6 +59,7 @@ class FlaskRPCServer(RPCServer):
             -1.0 if timeout is None else to_timedelta(timeout).total_seconds()
         )
         self._server: Optional[FlaskRPCServer._Thread] = None
+        self._log = logging.getLogger()

     def make_client(self, handler: Any) -> RPCClient:
         """Add ``handler`` and correspondent :class:`~.FlaskRPCClient`

@@ -77,6 +77,14 @@ class FlaskRPCServer(RPCServer):

     def start_server(self) -> None:
         """Start Flask RPC server"""
+        msg = (
+            "Starting RPC server on %s:%s. "
+            "This server has no authentication and relies on network isolation. "
+            "Ensure proper VPC/firewall configuration in production. "
+            "See https://fugue-tutorials.readthedocs.io/tutorials/resources/"
+            "security.html"
+        )
+        self._log.warning(msg, self._host, self._port)
         app = Flask("FlaskRPCServer")
         app.route("/invoke", methods=["POST"])(self._invoke)
         self._server = FlaskRPCServer._Thread(app, self._host, self._port)

@@ -122,10 +130,10 @@ class FlaskRPCClient(RPCClient):


 def _encode(*args: Any, **kwargs: Any) -> str:
-    data =
-    return data
+    data = json.dumps(dict(args=args, kwargs=kwargs))
+    return data


 def _decode(data: str) -> Tuple[List[Any], Dict[str, Any]]:
-    data =
+    data = json.loads(data)
     return data["args"], data["kwargs"]  # type: ignore
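With this change the RPC payloads are plain JSON rather than base64-encoded binary, so only JSON-serializable arguments survive the round trip. A self-contained demo of the new `_encode`/`_decode` pair:

```python
import json
from typing import Any, Dict, List, Tuple


def _encode(*args: Any, **kwargs: Any) -> str:
    # mirrors the new helpers in fugue/rpc/flask.py
    return json.dumps(dict(args=args, kwargs=kwargs))


def _decode(data: str) -> Tuple[List[Any], Dict[str, Any]]:
    obj = json.loads(data)
    return obj["args"], obj["kwargs"]


payload = _encode(1, "x", flag=True)
print(payload)           # {"args": [1, "x"], "kwargs": {"flag": true}}
print(_decode(payload))  # ([1, 'x'], {'flag': True})
```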
{fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/METADATA
CHANGED

@@ -1,13 +1,12 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: fugue
-Version: 0.9.2.dev1
+Version: 0.9.3
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
 Author-email: hello@fugue.ai
 License: Apache-2.0
 Keywords: distributed spark dask ray sql dsl domain specific language
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules

@@ -17,67 +16,81 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-
-Requires-Dist:
-
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist: jupyterlab ; extra == 'all'
-Requires-Dist: ipython >=7.10.0 ; extra == 'all'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'all'
-Requires-Dist: pyarrow >=6.0.1 ; extra == 'all'
-Requires-Dist: pandas <2.2,>=2.0.2 ; extra == 'all'
-Requires-Dist: ibis-framework ; extra == 'all'
-Requires-Dist: polars ; extra == 'all'
-Provides-Extra: cpp_sql_parser
-Requires-Dist: fugue-sql-antlr[cpp] >=0.2.0 ; extra == 'cpp_sql_parser'
+License-File: LICENSE
+Requires-Dist: triad>=1.0.0
+Requires-Dist: adagio>=0.2.6
+Provides-Extra: sql
+Requires-Dist: qpd>=0.4.4; extra == "sql"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "sql"
+Requires-Dist: sqlglot<28; extra == "sql"
+Requires-Dist: jinja2; extra == "sql"
+Provides-Extra: cpp-sql-parser
+Requires-Dist: fugue-sql-antlr[cpp]>=0.2.0; extra == "cpp-sql-parser"
+Provides-Extra: spark
+Requires-Dist: pyspark>=3.1.1; extra == "spark"
 Provides-Extra: dask
-Requires-Dist: dask[dataframe,distributed]
-Requires-Dist: pyarrow
-Requires-Dist: pandas
-
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "dask"
+Requires-Dist: pyarrow>=7.0.0; extra == "dask"
+Requires-Dist: pandas>=2.0.2; extra == "dask"
+Provides-Extra: ray
+Requires-Dist: ray[data]>=2.30.0; extra == "ray"
+Requires-Dist: duckdb>=0.5.0; extra == "ray"
+Requires-Dist: pyarrow>=7.0.0; extra == "ray"
+Requires-Dist: pandas<2.2; extra == "ray"
 Provides-Extra: duckdb
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
-Requires-Dist: duckdb
-Requires-Dist: numpy
+Requires-Dist: qpd>=0.4.4; extra == "duckdb"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "duckdb"
+Requires-Dist: sqlglot<28; extra == "duckdb"
+Requires-Dist: jinja2; extra == "duckdb"
+Requires-Dist: duckdb>=0.5.0; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
+Provides-Extra: polars
+Requires-Dist: polars; extra == "polars"
 Provides-Extra: ibis
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
-Requires-Dist: ibis-framework
-Requires-Dist: pandas
+Requires-Dist: qpd>=0.4.4; extra == "ibis"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "ibis"
+Requires-Dist: sqlglot<28; extra == "ibis"
+Requires-Dist: jinja2; extra == "ibis"
+Requires-Dist: ibis-framework[pandas]; extra == "ibis"
+Requires-Dist: pandas<2.2; extra == "ibis"
 Provides-Extra: notebook
-Requires-Dist: notebook
-Requires-Dist: jupyterlab
-Requires-Dist: ipython
-Provides-Extra:
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-
-Requires-Dist:
-
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
-Requires-Dist:
+Requires-Dist: notebook; extra == "notebook"
+Requires-Dist: jupyterlab; extra == "notebook"
+Requires-Dist: ipython>=7.10.0; extra == "notebook"
+Provides-Extra: all
+Requires-Dist: qpd>=0.4.4; extra == "all"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "all"
+Requires-Dist: sqlglot<28; extra == "all"
+Requires-Dist: jinja2; extra == "all"
+Requires-Dist: pyspark>=3.1.1; extra == "all"
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "all"
+Requires-Dist: dask-sql; extra == "all"
+Requires-Dist: ray[data]>=2.30.0; extra == "all"
+Requires-Dist: notebook; extra == "all"
+Requires-Dist: jupyterlab; extra == "all"
+Requires-Dist: ipython>=7.10.0; extra == "all"
+Requires-Dist: duckdb>=0.5.0; extra == "all"
+Requires-Dist: pyarrow>=6.0.1; extra == "all"
+Requires-Dist: pandas<2.2,>=2.0.2; extra == "all"
+Requires-Dist: ibis-framework[duckdb,pandas]; extra == "all"
+Requires-Dist: polars; extra == "all"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

 # Fugue

@@ -355,4 +368,3 @@ View some of our latest conferences presentations and content. For a more comple
 * [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
 * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
 * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
-
{fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/RECORD
CHANGED

@@ -10,7 +10,7 @@ fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/_utils/display.py,sha256=JV8oDA7efHm1wceZulCBOY5dMvjbWHvIm6ASisKfoWY,3164
 fugue/_utils/exception.py,sha256=SFIjwjV4CIEovp3P9k7ePNOFB12A5D8hDdhtfFUeM5Y,2247
 fugue/_utils/interfaceless.py,sha256=wI0H6L4W_1uQjh9tpjgT9HzN-fbrrtXXHC1x6Q_rrPg,2203
-fugue/_utils/io.py,sha256=
+fugue/_utils/io.py,sha256=5twd99LBzHtIMT67il1qwnEUa5n13WZmVKNd1shO4No,9649
 fugue/_utils/misc.py,sha256=_huy0eylmRTEFoReGR2M4rbAI8m79hFcfY5bDceVEXU,887
 fugue/_utils/registry.py,sha256=lrbzTdUEVnW6paBGDj-Yb-aTIbP5mjCqrXuRU9_N6os,316
 fugue/bag/__init__.py,sha256=0Q0_rnrEThrTx2U-1xGNyAg95idp_xcnywymIcW4Xck,46

@@ -31,7 +31,7 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
 fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
 fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=
+fugue/dataframe/function_wrapper.py,sha256=7Sb6XrWTD_swtHJbHDWZRxHvFNWkERynnCDzLM0wSbo,18340
 fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
 fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
 fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488

@@ -61,11 +61,11 @@ fugue/extensions/processor/convert.py,sha256=zG0lMtHGwY5TsqK4eplbMdlTg7J_PD3HbI0
 fugue/extensions/processor/processor.py,sha256=czhQlQgMpAXXoLVAX9Q0TFUMYEEhsgufTammxcKSmOY,1665
 fugue/extensions/transformer/__init__.py,sha256=VD6d-8xW1Yl8fUPj43cBWNR9pCOlYD9xWyGIHAlHwvI,456
 fugue/extensions/transformer/constants.py,sha256=76DfpoTOGQ8gp5XtCs_xznfbr_H015-prXpHWSqMNDU,59
-fugue/extensions/transformer/convert.py,sha256=
+fugue/extensions/transformer/convert.py,sha256=zDDIpZawMnHFarjZNZAyiw1jfyXGuPjnvgQk9jpYLak,23384
 fugue/extensions/transformer/transformer.py,sha256=zhOUgyv5-DPxYd1CP_98WeEw-zUgwknRnPW_6di-q3g,9098
 fugue/rpc/__init__.py,sha256=3GzUl4QZQuCChjD7eaTJW8tnTwfke6ZY9r9g5nCeBZ8,167
 fugue/rpc/base.py,sha256=3Fq5SvwLZqw9NXru3r32WuJKBGFr9bl7nFgy6e9boGo,8470
-fugue/rpc/flask.py,sha256=
+fugue/rpc/flask.py,sha256=VzJEND8Pqatf6pYYT9LDXeO1JDMmYAOY0lm8XUncKbA,4807
 fugue/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/sql/_utils.py,sha256=khpjGeFCVlaqf2JIYvS4TVTJO3fe5-8bEsvy6AIP_5Q,2083
 fugue/sql/_visitors.py,sha256=2pc0J-AHJAiIexsKgNjcgrCGOyhC3_7rzonSgtjy--k,33844

@@ -82,6 +82,7 @@ fugue/workflow/api.py,sha256=uQoxPSCZ91-ST4vwuPWG7qioRGW4eo-Sgi3DdwtSL4k,12495
 fugue/workflow/input.py,sha256=V_zLDNzndmQuYJAPXtdK4n-vOp7LrimGIf_wQtwf2mc,321
 fugue/workflow/module.py,sha256=ajyqgMwX6hFMZY9xp4Bp1Q-Zdta0p5f_W_n_SNrc4LE,5547
 fugue/workflow/workflow.py,sha256=-SFCXkyxgXbS6DpQGSBox4d3Ws3psIlB6PnraJLSu9Y,88219
+fugue-0.9.3.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
 fugue_contrib/__init__.py,sha256=QJioX-r2AiU7Pvt24M-k2c4vNq29qpK-3WNUde7ucck,222
 fugue_contrib/contrib.py,sha256=3B--6oIVBMZ-GwjIOXwZqYqkloH7Cxfq1I8vkwl2yPk,267
 fugue_contrib/seaborn/__init__.py,sha256=NuVv8EI4Om4gHcHwYO8ddextLQqw24vDj8qJio3E1MU,1405

@@ -89,25 +90,26 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
+fugue_dask/_dask_sql_wrapper.py,sha256=lj38gJIOdoMV9W44gpwzLjUEtPVsQNKjRWuEkfI7-PM,2618
 fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
-fugue_dask/_utils.py,sha256=
-fugue_dask/dataframe.py,sha256=
-fugue_dask/execution_engine.py,sha256=
+fugue_dask/_utils.py,sha256=0R0pCh4B47kQsAS_o0QGaosIqVcZnSakm6pfMB7fSXs,9059
+fugue_dask/dataframe.py,sha256=4Dvckpc4mlld2WsEFTTemxoA1zYK8Cn6jMKxUxYQCEE,13491
+fugue_dask/execution_engine.py,sha256=mFN_IurhdBEu8C5OreqpGSRdTbTBqSpzJO2dMQzEF-o,21264
 fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
 fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
 fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
 fugue_duckdb/_io.py,sha256=vnd8m8C6XeMCBJBbAdA5h695NMfsduQrvONyS0HcEFA,8475
 fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
 fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
-fugue_duckdb/dataframe.py,sha256=
-fugue_duckdb/execution_engine.py,sha256=
+fugue_duckdb/dataframe.py,sha256=LAPoPOad9hgGhjyhlMGMfrnhkyBKe06Xzn6eP1hkl-w,8504
+fugue_duckdb/execution_engine.py,sha256=3f5hbWcX1y9mAtfFixrri-snYxVIQAf4HOgo9fHbDwQ,20385
 fugue_duckdb/registry.py,sha256=9_41KO42kXqcjF4yParQ5JGyg5TckcbhH-Q2IlGpSho,3987
 fugue_duckdb/tester.py,sha256=MzTkv3sdOwOjI59LRrSGGl4w59Njv3OArTU5kSRL-P0,1526
 fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
-fugue_ibis/execution_engine.py,sha256=
+fugue_ibis/execution_engine.py,sha256=jRnp1m1wuTicS29A-WA043f8QwdoK8b9rwPXvTkm8r8,18751
 fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
 fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
 fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123

@@ -127,7 +129,7 @@ fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
 fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
-fugue_ray/_utils/io.py,sha256=
+fugue_ray/_utils/io.py,sha256=JZdL7pdpk1DUIj77NJSzU_EZOW4cN7oNjwGy2w-LRTw,10142
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006

@@ -135,7 +137,7 @@ fugue_spark/execution_engine.py,sha256=YBMtNxCvpy77xICFSg9PHMa6feNoYhWEZe8MmxznX
 fugue_spark/registry.py,sha256=_NmiV2cOooYK0YmqATEnNkPEMT9suUMtuecw2NNbIIk,4530
 fugue_spark/tester.py,sha256=VX003yGNlBukaZTQSN-w7XvgSk4rqxrWQIzno0dWrXg,2481
 fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fugue_spark/_utils/convert.py,sha256=
+fugue_spark/_utils/convert.py,sha256=J3HtbuzomTYTN6A11iuvsC1h2C7o3fQBW5U360xGDhE,10234
 fugue_spark/_utils/io.py,sha256=OdUezKpB29Lx9aUS2k9x0xUAGZrmgMZyQYGPEeHk7rQ,5574
 fugue_spark/_utils/misc.py,sha256=9LsbBp6nOEhqXFLr8oWTc3VKzKk-vuVyixlRoquGnEs,858
 fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618

@@ -143,14 +145,13 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=
+fugue_test/builtin_suite.py,sha256=jP3xiq2vRZNNGzoSRjcUfrUk8NVg31SU0kpJaEvP25E,79400
 fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
-fugue_test/execution_suite.py,sha256=
+fugue_test/execution_suite.py,sha256=wUiGdb8wLRd13JXo7Lo19vPOLh7t1C-L2NPLeU0k-uE,48736
 fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
-fugue_version/__init__.py,sha256=
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.2.dev1.dist-info/RECORD,,
+fugue_version/__init__.py,sha256=xKd3pzbczuMsdB08eLAOqZDUd_q1IRxwZ_ccAFL4c4A,22
+fugue-0.9.3.dist-info/METADATA,sha256=AXA7npC7pohZQCKiAqe6M5Zoq2K--K4SisIvBi_l1Tc,18570
+fugue-0.9.3.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+fugue-0.9.3.dist-info/entry_points.txt,sha256=2Vxp1qew_tswacA8m0RzIliLlFOQMlzezvSXPugM_KA,295
+fugue-0.9.3.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.9.3.dist-info/RECORD,,
fugue-0.9.3.dist-info/entry_points.txt
ADDED

@@ -0,0 +1,11 @@
+[fugue.plugins]
+dask = fugue_dask.registry[dask]
+duckdb = fugue_duckdb.registry[duckdb]
+ibis = fugue_ibis[ibis]
+polars = fugue_polars.registry[polars]
+ray = fugue_ray.registry[ray]
+spark = fugue_spark.registry[spark]
+
+[pytest11]
+fugue_test = fugue_test
+fugue_test_fixtures = fugue_test.fixtures
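The `[fugue.plugins]` group is how Fugue discovers backend registries, and `[pytest11]` auto-loads the test suites as pytest plugins. The installed entries can be inspected with the standard library (shown for Python 3.10+, where `entry_points()` accepts a `group` keyword):

```python
from importlib.metadata import entry_points

for ep in entry_points(group="fugue.plugins"):
    print(ep.name, "->", ep.value)
# e.g. "dask -> fugue_dask.registry[dask]" when fugue[dask] is installed
```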
fugue_dask/_dask_sql_wrapper.py
ADDED

@@ -0,0 +1,76 @@
+from typing import Any, Optional
+
+import dask.dataframe as dd
+
+try:
+    from dask.dataframe.dask_expr.io.parquet import ReadParquet
+
+    HAS_DASK_EXPR = True  # newer dask
+except ImportError:  # pragma: no cover
+    HAS_DASK_EXPR = False  # older dask
+
+if not HAS_DASK_EXPR:  # pragma: no cover
+    try:
+        from dask_sql import Context as ContextWrapper  # pylint: disable-all
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+else:
+    from triad.utils.assertion import assert_or_throw
+
+    try:
+        from dask_sql import Context
+        from dask_sql.datacontainer import Statistics
+        from dask_sql.input_utils import InputUtil
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+
+    class ContextWrapper(Context):  # type: ignore
+        def create_table(
+            self,
+            table_name: str,
+            input_table: dd.DataFrame,
+            format: Optional[str] = None,  # noqa
+            persist: bool = False,
+            schema_name: Optional[str] = None,
+            statistics: Optional[Statistics] = None,
+            gpu: bool = False,
+            **kwargs: Any,
+        ) -> None:  # pragma: no cover
+            assert_or_throw(
+                isinstance(input_table, dd.DataFrame),
+                lambda: ValueError(
+                    f"input_table must be a dask dataframe, but got {type(input_table)}"
+                ),
+            )
+            assert_or_throw(
+                dd._dask_expr_enabled(), lambda: ValueError("Dask expr must be enabled")
+            )
+            schema_name = schema_name or self.schema_name
+
+            dc = InputUtil.to_dc(
+                input_table,
+                table_name=table_name,
+                format=format,
+                persist=persist,
+                gpu=gpu,
+                **kwargs,
+            )
+
+            dask_filepath = None
+            operations = input_table.find_operations(ReadParquet)
+            for op in operations:
+                dask_filepath = op._args[0]
+
+            dc.filepath = dask_filepath
+            self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
+
+            if not statistics:
+                statistics = Statistics(float("nan"))
+            dc.statistics = statistics
+
+            self.schema[schema_name].tables[table_name.lower()] = dc
+            self.schema[schema_name].statistics[table_name.lower()] = statistics
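On dask-expr-enabled Dask, the overridden `create_table` tracks the originating parquet path that plain `dask_sql.Context` would otherwise lose; on older Dask the name simply aliases `dask_sql.Context`. A hedged usage sketch (assumes `dask-sql` is installed; this mirrors how `DaskSQLEngine.select` uses the wrapper later in this diff):

```python
# Sketch only: behavior depends on the installed dask/dask-sql versions.
import dask.dataframe as dd
import pandas as pd

from fugue_dask._dask_sql_wrapper import ContextWrapper

ctx = ContextWrapper()
ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
ctx.create_table("t", ddf)
res = ctx.sql("SELECT a FROM t WHERE a > 1")  # returns a dask dataframe
print(res.compute())
```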
fugue_dask/_utils.py
CHANGED
@@ -5,7 +5,7 @@ import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from dask.dataframe
+from dask.dataframe import DataFrame
 from dask.delayed import delayed
 from dask.distributed import Client, get_client
 from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils

@@ -149,7 +149,7 @@ def _add_hash_index(
     if len(cols) == 0:
         cols = list(df.columns)

-    def _add_hash(df: pd.DataFrame) -> pd.DataFrame:
+    def _add_hash(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover
         if len(df) == 0:
             return df.assign(**{_FUGUE_DASK_TEMP_IDX_COLUMN: pd.Series(dtype=int)})
         return df.assign(

@@ -171,7 +171,7 @@ def _add_hash_index(

 def _add_random_index(
     df: dd.DataFrame, num: int, seed: Any = None
-) -> Tuple[dd.DataFrame, int]:
+) -> Tuple[dd.DataFrame, int]:  # pragma: no cover
     def _add_rand(df: pd.DataFrame) -> pd.DataFrame:
         if len(df) == 0:
             return df.assign(**{_FUGUE_DASK_TEMP_IDX_COLUMN: pd.Series(dtype=int)})

@@ -189,7 +189,9 @@ def _add_random_index(


 def _add_continuous_index(df: dd.DataFrame) -> Tuple[dd.DataFrame, int]:
-    def _get_info(
+    def _get_info(
+        df: pd.DataFrame, partition_info: Any
+    ) -> pd.DataFrame:  # pragma: no cover
         return pd.DataFrame(dict(no=[partition_info["number"]], ct=[len(df)]))

     pinfo = (

@@ -200,7 +202,9 @@ def _add_continuous_index(df: dd.DataFrame) -> Tuple[dd.DataFrame, int]:
     counts = pinfo.sort_values("no").ct.cumsum().tolist()
     starts = [0] + counts[0:-1]

-    def _add_index(
+    def _add_index(
+        df: pd.DataFrame, partition_info: Any
+    ) -> pd.DataFrame:  # pragma: no cover
         return df.assign(
             **{
                 _FUGUE_DASK_TEMP_IDX_COLUMN: np.arange(len(df))
fugue_dask/dataframe.py
CHANGED
@@ -379,7 +379,7 @@ def _to_array_chunks(
     assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
     _df = df if columns is None or len(columns) == 0 else df[columns]

-    def _to_list(pdf: pd.DataFrame) -> List[Any]:
+    def _to_list(pdf: pd.DataFrame) -> List[Any]:  # pragma: no cover
         return list(
             PD_UTILS.as_array_iterable(
                 pdf,
fugue_dask/execution_engine.py
CHANGED
@@ -9,9 +9,10 @@ from triad.collections import Schema
 from triad.collections.dict import IndexedOrderedDict, ParamDict
 from triad.utils.assertion import assert_or_throw
 from triad.utils.hash import to_uuid
+from triad.utils.io import makedirs
 from triad.utils.pandas_like import PandasUtils
 from triad.utils.threading import RunOnce
-
+
 from fugue import StructuredRawSQL
 from fugue.collections.partition import (
     PartitionCursor,

@@ -61,14 +62,9 @@ class DaskSQLEngine(SQLEngine):
         return True

     def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
-
-
-
-            raise ImportError(
-                "dask-sql is not installed. "
-                "Please install it with `pip install dask-sql`"
-            )
-        ctx = Context()
+        from ._dask_sql_wrapper import ContextWrapper
+
+        ctx = ContextWrapper()
         _dfs: Dict[str, dd.DataFrame] = {k: self._to_safe_df(v) for k, v in dfs.items()}
         sql = statement.construct(dialect=self.dialect, log=self.log)
         res = ctx.sql(

@@ -102,7 +98,8 @@ class DaskMapEngine(MapEngine):
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
         map_func_format_hint: Optional[str] = None,
-    ) -> DataFrame:
+    ) -> DataFrame:  # pragma: no cover
+        # It is well tested but not captured by coverage
         presort = partition_spec.get_sorts(
             df.schema, with_partition_keys=partition_spec.algo == "coarse"
         )

@@ -475,7 +472,7 @@ class DaskExecutionEngine(ExecutionEngine):
         # Use presort over partition_spec.presort if possible
         _presort: IndexedOrderedDict = presort or partition_spec.presort

-        def _partition_take(partition, n, presort):
+        def _partition_take(partition, n, presort):  # pragma: no cover
             assert_or_throw(
                 partition.shape[1] == len(meta),
                 FugueBug("hitting the dask bug where partition keys are lost"),
fugue_duckdb/dataframe.py
CHANGED
@@ -165,7 +165,7 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:

 @as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
 def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
-    _df = df.
+    _df = df.fetch_arrow_table()
     _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
     return _df

@@ -216,7 +216,7 @@ def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRe
 def _duck_as_array(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
 ) -> List[Any]:
-    return pa_table_as_array(df.
+    return pa_table_as_array(df.fetch_arrow_table(), columns=columns)


 @as_array_iterable.candidate(

@@ -225,14 +225,14 @@ def _duck_as_array(
 def _duck_as_array_iterable(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
 ) -> Iterable[Any]:
-    yield from pa_table_as_array_iterable(df.
+    yield from pa_table_as_array_iterable(df.fetch_arrow_table(), columns=columns)


 @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
 def _duck_as_dicts(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None
 ) -> List[Dict[str, Any]]:
-    return pa_table_as_dicts(df.
+    return pa_table_as_dicts(df.fetch_arrow_table(), columns=columns)


 @as_dict_iterable.candidate(

@@ -241,7 +241,7 @@ def _duck_as_dicts(
 def _duck_as_dict_iterable(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None
 ) -> Iterable[Dict[str, Any]]:
-    yield from pa_table_as_dict_iterable(df.
+    yield from pa_table_as_dict_iterable(df.fetch_arrow_table(), columns=columns)


 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
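All four conversion candidates now call `fetch_arrow_table()`, the public `DuckDBPyRelation` method that materializes a relation as a `pyarrow.Table`. A standalone check:

```python
import duckdb

rel = duckdb.sql("SELECT 1 AS a, 'x' AS b")  # a DuckDBPyRelation
tbl = rel.fetch_arrow_table()                # a pyarrow.Table
print(tbl.schema)
```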
fugue_duckdb/execution_engine.py
CHANGED
@@ -108,7 +108,7 @@ class DuckDBEngine(SQLEngine):
         try:
             for k, v in dfs.items():
                 duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
-            return ArrowDataFrame(_duck_as_arrow(conn.
+            return ArrowDataFrame(_duck_as_arrow(conn.sql(statement)))
         finally:
             conn.close()

fugue_ibis/execution_engine.py
CHANGED
@@ -92,7 +92,8 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
-
+        version = int(ibis.__version__.split(".")[0])
+        if version < 6:  # pragma: no cover
             suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
         else:
             # breaking change in ibis 6.0

@@ -113,7 +114,7 @@ class IbisSQLEngine(SQLEngine):
             cols.append(
                 ibis.coalesce(tb[k], tb[k + _JOIN_RIGHT_SUFFIX]).name(k)
             )
-            tb = tb
+            tb = tb.select(*cols)
         elif how.lower() in ["semi", "left_semi"]:
             tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:

@@ -153,7 +154,7 @@ class IbisSQLEngine(SQLEngine):
         self,
         df: DataFrame,
         how: str = "any",
-        thresh: int = None,
+        thresh: Optional[int] = None,
         subset: Optional[List[str]] = None,
     ) -> DataFrame:
         schema = df.schema

@@ -161,7 +162,7 @@ class IbisSQLEngine(SQLEngine):
             schema = schema.extract(subset)
         _df = self.to_df(df)
         if thresh is None:
-            tb = _df.native.
+            tb = _df.native.drop_null(subset, how=how)
             return self.to_df(tb, df.schema)
         assert_or_throw(
             how == "any", ValueError("when thresh is set, how must be 'any'")

@@ -204,7 +205,7 @@ class IbisSQLEngine(SQLEngine):
             ibis.coalesce(tb[f], ibis.literal(vd[f])).name(f) if f in names else tb[f]
             for f in df.columns
         ]
-        return self.to_df(tb
+        return self.to_df(tb.select(cols), schema=df.schema)

     def take(
         self,

@@ -241,7 +242,7 @@ class IbisSQLEngine(SQLEngine):
                 f") WHERE __fugue_take_param<={n}"
             )
             tb = self.query_to_table(sql, {tbn: idf})
-            return self.to_df(tb
+            return self.to_df(tb.select(*df.columns), schema=df.schema)

         sorts: List[str] = []
         for k, v in _presort.items():
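The new guard keys off the ibis major version because ibis 6.0 changed how join suffixes are specified, making the old `suffixes=` keyword version-dependent. The gating expression in isolation:

```python
import ibis

# "6.1.0" -> 6; anything below 6 takes the legacy suffixes branch
major = int(ibis.__version__.split(".")[0])
print(major, major < 6)
```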
fugue_ray/_utils/io.py
CHANGED
@@ -74,7 +74,7 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.path for f in fp]
+        files = [f.as_dir_path() if f.is_dir else f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)

     def save_df(

@@ -153,6 +153,10 @@ class RayIO(object):
     def _load_parquet(
         self, p: List[str], columns: Any = None, **kwargs: Any
     ) -> DataFrame:
+        # in 2.52.0 the default changes to ["parquet"]
+        if "file_extensions" not in kwargs:
+            kwargs = kwargs.copy()
+            kwargs["file_extensions"] = None
         sdf = rd.read_parquet(p, ray_remote_args=self._remote_args(), **kwargs)
         if columns is None:
             return RayDataFrame(sdf)
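Per the inline comment, Ray 2.52.0 changes the default of `file_extensions` in `ray.data.read_parquet` to `["parquet"]`; pinning it back to `None` keeps loading files without a `.parquet` suffix working. The defaulting pattern on its own:

```python
from typing import Any, Dict


def with_parquet_defaults(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    # copy before mutating so the caller's dict stays untouched
    if "file_extensions" not in kwargs:
        kwargs = kwargs.copy()
        kwargs["file_extensions"] = None
    return kwargs


print(with_parquet_defaults({}))                           # {'file_extensions': None}
print(with_parquet_defaults({"file_extensions": ["pq"]}))  # caller's value wins
```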
fugue_spark/_utils/convert.py
CHANGED
@@ -174,20 +174,26 @@ def pd_to_spark_df(


 def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
-
-
-
-
-
-
+    def _to_df() -> pd.DataFrame:
+        if version.parse(pd.__version__) < version.parse("2.0.0") or not any(
+            isinstance(x.dataType, (pt.TimestampType, TimestampNTZType))
+            for x in df.schema.fields
+        ):
+            return df.toPandas()
+        else:  # pragma: no cover
+
+            def serialize(dfs):
+                for df in dfs:
+                    data = pickle.dumps(df)
+                    yield pd.DataFrame([[data]], columns=["data"])

-
-    for
-        data = pickle.dumps(df)
-        yield pd.DataFrame([[data]], columns=["data"])
+            sdf = df.mapInPandas(serialize, schema="data binary")
+            return pd.concat(pickle.loads(x.data) for x in sdf.collect())

-
-
+    pdf = _to_df()
+    if hasattr(pdf, "attrs") and "metrics" in pdf.attrs:  # pragma: no cover
+        del pdf.attrs["metrics"]
+    return pdf


 def to_arrow(df: ps.DataFrame) -> pa.Table:
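The pandas gate above relies on a `version.parse` helper; assuming it is `packaging.version` (the import sits outside this hunk), the comparison behaves like this:

```python
import pandas as pd
from packaging import version  # assumed source of the `version` name above

# True on pandas 1.x, False on 2.x and later
print(version.parse(pd.__version__) < version.parse("2.0.0"))
```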
fugue_test/builtin_suite.py
CHANGED
@@ -486,7 +486,9 @@ class BuiltInTests(object):
             dag.df([], "a:int,b:int").assert_eq(b)
             dag.run(self.engine)

-        def test_transform_row_wise(self):
+        def _test_transform_row_wise(self):  # pragma: no cover
+            # TODO: currently disabled because we don't support Dict[str, Any]
+            # as dataframe input
             def t1(row: Dict[str, Any]) -> Dict[str, Any]:
                 row["b"] = 1
                 return row

@@ -695,11 +697,11 @@ class BuiltInTests(object):
             incr()
             yield pa.Table.from_pandas(df)

-        def t11(row:
+        def t11(row: list[dict[str, Any]]) -> dict[str, Any]:
             incr()
-            return row
+            return row[0]

-        def t12(row:
+        def t12(row: list[dict[str, Any]]) -> None:
             incr()

         with FugueWorkflow() as dag:
fugue_test/execution_suite.py
CHANGED
@@ -9,6 +9,7 @@ except ImportError:  # pragma: no cover
 import copy
 import os
 import pickle
+import sys
 from datetime import datetime

 import pandas as pd

@@ -1194,6 +1195,7 @@ class ExecutionEngineTests(object):
         )
         self.df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)

+    @pytest.mark.skipif(sys.platform == "win32", reason="skip on Windows")
     def test_load_csv_folder(self):
         native = NativeExecutionEngine()
         a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
fugue_version/__init__.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.9.2.dev1"
+__version__ = "0.9.3"
fugue-0.9.2.dev1.dist-info/entry_points.txt
REMOVED

@@ -1,12 +0,0 @@
-[fugue.plugins]
-dask = fugue_dask.registry [dask]
-duckdb = fugue_duckdb.registry [duckdb]
-ibis = fugue_ibis [ibis]
-polars = fugue_polars.registry [polars]
-ray = fugue_ray.registry [ray]
-spark = fugue_spark.registry [spark]
-
-[pytest11]
-fugue_test = fugue_test
-fugue_test_fixtures = fugue_test.fixtures
-
{fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info/licenses}/LICENSE
File without changes

{fugue-0.9.2.dev1.dist-info → fugue-0.9.3.dist-info}/top_level.txt
File without changes