fugue 0.9.1__py3-none-any.whl → 0.9.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fugue/dataframe/function_wrapper.py +93 -2
- fugue/extensions/transformer/convert.py +4 -4
- {fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/METADATA +50 -51
- {fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/RECORD +15 -14
- {fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/WHEEL +1 -1
- fugue-0.9.2.dev2.dist-info/entry_points.txt +11 -0
- fugue_dask/_dask_sql_wrapper.py +76 -0
- fugue_dask/_utils.py +1 -1
- fugue_dask/execution_engine.py +5 -9
- fugue_ibis/execution_engine.py +7 -6
- fugue_ray/_utils/io.py +22 -15
- fugue_test/builtin_suite.py +36 -1
- fugue_version/__init__.py +1 -1
- fugue-0.9.1.dist-info/entry_points.txt +0 -12
- {fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/LICENSE +0 -0
- {fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/top_level.txt +0 -0
fugue/dataframe/function_wrapper.py
CHANGED

@@ -80,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
         p.update(kwargs)
         has_kw = False
         rargs: Dict[str, Any] = {}
+        row_param_info: Any = None
         for k, v in self._params.items():
             if isinstance(v, (PositionalParam, KeywordParam)):
                 if isinstance(v, KeywordParam):
@@ -90,7 +91,14 @@ class DataFrameFunctionWrapper(FunctionWrapper):
                         isinstance(p[k], DataFrame),
                         lambda: TypeError(f"{p[k]} is not a DataFrame"),
                     )
-                    rargs[k] = v.to_input_data(p[k], ctx=ctx)
+                    if v.is_per_row:
+                        assert_or_throw(
+                            row_param_info is None,
+                            lambda: ValueError("only one row parameter is allowed"),
+                        )
+                        row_param_info = (k, v, p[k])
+                    else:
+                        rargs[k] = v.to_input_data(p[k], ctx=ctx)
                 else:
                     rargs[k] = p[k]  # TODO: should we do auto type conversion?
                 del p[k]
@@ -100,12 +108,38 @@ class DataFrameFunctionWrapper(FunctionWrapper):
             rargs.update(p)
         elif not ignore_unknown and len(p) > 0:
             raise ValueError(f"{p} are not acceptable parameters")
+        if row_param_info is None:
+            return self._run_func(rargs, output, output_schema, ctx, raw=False)
+        else:  # input contains row parameter
+
+            def _dfs() -> Iterable[Any]:
+                k, v, df = row_param_info
+                for row in v.to_input_rows(df, ctx):
+                    rargs[k] = None
+                    _rargs = rargs.copy()
+                    _rargs[k] = row
+                    yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
+
+            if not output:
+                sum(1 for _ in _dfs())
+                return
+            else:
+                return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
+
+    def _run_func(
+        self,
+        rargs: Dict[str, Any],
+        output: bool,
+        output_schema: Any,
+        ctx: Any,
+        raw: bool,
+    ) -> Any:
         rt = self._func(**rargs)
         if not output:
             if isinstance(self._rt, _DataFrameParamBase):
                 self._rt.count(rt)
             return
-        if isinstance(self._rt, _DataFrameParamBase):
+        if not raw and isinstance(self._rt, _DataFrameParamBase):
             return self._rt.to_output_df(rt, output_schema, ctx=ctx)
         return rt

@@ -120,6 +154,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
         annotation == Callable
         or annotation == callable  # pylint: disable=comparison-with-callable
         or str(annotation).startswith("typing.Callable")
+        or str(annotation).startswith("collections.abc.Callable")
     ),
 )
 class _CallableParam(AnnotatedParam):
@@ -134,6 +169,9 @@ class _CallableParam(AnnotatedParam):
         or annotation == Optional[callable]
         or str(annotation).startswith("typing.Union[typing.Callable")  # 3.8-
         or str(annotation).startswith("typing.Optional[typing.Callable")  # 3.9+
+        or str(annotation).startswith(
+            "typing.Optional[collections.abc.Callable]"
+        )  # 3.9+
     ),
 )
 class _OptionalCallableParam(AnnotatedParam):
@@ -145,14 +183,30 @@ class _DataFrameParamBase(AnnotatedParam):
         super().__init__(param)
         assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))

+    @property
+    def is_per_row(self) -> bool:
+        return False
+
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:  # pragma: no cover
         raise NotImplementedError

+    def to_input_rows(
+        self,
+        df: DataFrame,
+        ctx: Any,
+    ) -> Iterable[Any]:
+        raise NotImplementedError  # pragma: no cover
+
     def to_output_df(
         self, df: Any, schema: Any, ctx: Any
     ) -> DataFrame:  # pragma: no cover
         raise NotImplementedError

+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        raise NotImplementedError
+
     def count(self, df: Any) -> int:  # pragma: no cover
         raise NotImplementedError

@@ -182,6 +236,34 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())


+@fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
+class RowParam(_DataFrameParamBase):
+    @property
+    def is_per_row(self) -> bool:
+        return True
+
+    def count(self, df: Any) -> int:
+        return 1
+
+
+@fugue_annotated_param(Dict[str, Any])
+class DictParam(RowParam):
+    def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
+        yield from df.as_dict_iterable()
+
+    def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
+        return ArrayDataFrame([list(output.values())], schema)
+
+    def iterable_to_output_df(
+        self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        params: Dict[str, Any] = {}
+        if schema is not None:
+            params["schema"] = Schema(schema).pa_schema
+        adf = pa.Table.from_pylist(list(dfs), **params)
+        return ArrowDataFrame(adf)
+
+
 @fugue_annotated_param(AnyDataFrame)
 class _AnyDataFrameParam(DataFrameParam):
     def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
@@ -207,6 +289,15 @@ class LocalDataFrameParam(DataFrameParam):
         )
         return output

+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        def _dfs() -> Iterable[DataFrame]:
+            for df in dfs:
+                yield self.to_output_df(df, schema, ctx)
+
+        return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
+
     def count(self, df: LocalDataFrame) -> int:
         if df.is_bounded:
             return df.count()
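
Note: the net effect of the function_wrapper.py changes above is per-row transformers: a function whose annotated input and/or output is Dict[str, Any] is now invoked once per row instead of once per partition. A minimal sketch (not part of the diff, mirroring the test_transform_row_wise test added to fugue_test/builtin_suite.py further down; fa is fugue.api):

    from typing import Any, Dict

    import pandas as pd
    import fugue.api as fa

    def t1(row: Dict[str, Any]) -> Dict[str, Any]:
        # invoked once per row via the new DictParam annotation
        row["b"] = 1
        return row

    df = pd.DataFrame([[3, 4], [1, 2], [3, 5]], columns=["a", "b"])
    res = fa.transform(df, t1, schema="*")  # b becomes 1 on every row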
fugue/extensions/transformer/convert.py
CHANGED

@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspq]$"
+            func, "^[lspqr][fF]?x*z?$", "^[lspqr]$"
         )
         tr._output_schema_arg = schema  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
         validation_rules.update(parse_validation_rules_from_comment(func))
         tr = _FuncAsOutputTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
+            func, "^[lspqr][fF]?x*z?$", "^[lspnqr]$"
         )
         tr._output_schema_arg = None  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = schema  # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):

         tr = _FuncAsOutputCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = None  # type: ignore
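
Note: the second and third arguments of DataFrameFunctionWrapper are regexes over one-letter codes describing the function's parameters and return type; these hunks only append "r", the code registered above via @fugue_annotated_param(DataFrame, "r", ...), so row-typed inputs and outputs validate. A rough illustration of how the patterns read (an assumed reading, not diff content):

    import re

    assert re.match(r"^[lspqr][fF]?x*z?$", "r")  # input: one row-typed parameter
    assert re.match(r"^[lspqr]$", "r")           # output: one row, e.g. Dict[str, Any]
    assert re.match(r"^[lspnqr]$", "n")          # output transformers also allow None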
{fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/METADATA
CHANGED

@@ -1,13 +1,12 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.9.1
+Version: 0.9.2.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
 Author-email: hello@fugue.ai
 License: Apache-2.0
 Keywords: distributed spark dask ray sql dsl domain specific language
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -17,67 +16,68 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-
-Requires-Dist:
+License-File: LICENSE
+Requires-Dist: triad>=0.9.7
+Requires-Dist: adagio>=0.2.6
 Provides-Extra: all
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
-Requires-Dist: pyspark
-Requires-Dist: dask[dataframe,distributed]
-Requires-Dist: dask-sql
-Requires-Dist: ray[data]
-Requires-Dist: notebook
-Requires-Dist: jupyterlab
-Requires-Dist: ipython
-Requires-Dist: duckdb
-Requires-Dist: pyarrow
-Requires-Dist: pandas
-Requires-Dist: ibis-framework
-Requires-Dist: polars
+Requires-Dist: qpd>=0.4.4; extra == "all"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "all"
+Requires-Dist: sqlglot; extra == "all"
+Requires-Dist: jinja2; extra == "all"
+Requires-Dist: pyspark>=3.1.1; extra == "all"
+Requires-Dist: dask[dataframe,distributed]>=2023.5.0; extra == "all"
+Requires-Dist: dask-sql; extra == "all"
+Requires-Dist: ray[data]>=2.5.0; extra == "all"
+Requires-Dist: notebook; extra == "all"
+Requires-Dist: jupyterlab; extra == "all"
+Requires-Dist: ipython>=7.10.0; extra == "all"
+Requires-Dist: duckdb>=0.5.0; extra == "all"
+Requires-Dist: pyarrow>=6.0.1; extra == "all"
+Requires-Dist: pandas<2.2,>=2.0.2; extra == "all"
+Requires-Dist: ibis-framework[duckdb,pandas]; extra == "all"
+Requires-Dist: polars; extra == "all"
 Provides-Extra: cpp_sql_parser
-Requires-Dist: fugue-sql-antlr[cpp]
+Requires-Dist: fugue-sql-antlr[cpp]>=0.2.0; extra == "cpp-sql-parser"
 Provides-Extra: dask
-Requires-Dist: dask[dataframe,distributed]
-Requires-Dist: pyarrow
-Requires-Dist: pandas
-Requires-Dist: dask[dataframe,distributed] >=2024.4.0 ; (python_version >= "3.11.9") and extra == 'dask'
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "dask"
+Requires-Dist: pyarrow>=7.0.0; extra == "dask"
+Requires-Dist: pandas>=2.0.2; extra == "dask"
 Provides-Extra: duckdb
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
-Requires-Dist: duckdb
-Requires-Dist: numpy
+Requires-Dist: qpd>=0.4.4; extra == "duckdb"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "duckdb"
+Requires-Dist: sqlglot; extra == "duckdb"
+Requires-Dist: jinja2; extra == "duckdb"
+Requires-Dist: duckdb>=0.5.0; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
 Provides-Extra: ibis
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
-Requires-Dist: ibis-framework
-Requires-Dist: pandas
+Requires-Dist: qpd>=0.4.4; extra == "ibis"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "ibis"
+Requires-Dist: sqlglot; extra == "ibis"
+Requires-Dist: jinja2; extra == "ibis"
+Requires-Dist: ibis-framework[pandas]; extra == "ibis"
+Requires-Dist: pandas<2.2; extra == "ibis"
 Provides-Extra: notebook
-Requires-Dist: notebook
-Requires-Dist: jupyterlab
-Requires-Dist: ipython
+Requires-Dist: notebook; extra == "notebook"
+Requires-Dist: jupyterlab; extra == "notebook"
+Requires-Dist: ipython>=7.10.0; extra == "notebook"
 Provides-Extra: polars
-Requires-Dist: polars
+Requires-Dist: polars; extra == "polars"
 Provides-Extra: ray
-Requires-Dist: ray[data]
-Requires-Dist: duckdb
-Requires-Dist: pyarrow
-Requires-Dist: pandas
+Requires-Dist: ray[data]>=2.5.0; extra == "ray"
+Requires-Dist: duckdb>=0.5.0; extra == "ray"
+Requires-Dist: pyarrow>=7.0.0; extra == "ray"
+Requires-Dist: pandas<2.2; extra == "ray"
 Provides-Extra: spark
-Requires-Dist: pyspark
+Requires-Dist: pyspark>=3.1.1; extra == "spark"
 Provides-Extra: sql
-Requires-Dist: qpd
-Requires-Dist: fugue-sql-antlr
-Requires-Dist: sqlglot
-Requires-Dist: jinja2
+Requires-Dist: qpd>=0.4.4; extra == "sql"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "sql"
+Requires-Dist: sqlglot; extra == "sql"
+Requires-Dist: jinja2; extra == "sql"

 # Fugue

@@ -355,4 +355,3 @@ View some of our latest conferences presentations and content. For a more comple
 * [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
 * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
 * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
-
{fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/RECORD
CHANGED

@@ -31,7 +31,7 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
 fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
 fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=
+fugue/dataframe/function_wrapper.py,sha256=1CjI4UXHffomylK0_u0CGL1dPv_sSXTN22S5grD10_w,17889
 fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
 fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
 fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
@@ -61,7 +61,7 @@ fugue/extensions/processor/convert.py,sha256=zG0lMtHGwY5TsqK4eplbMdlTg7J_PD3HbI0
 fugue/extensions/processor/processor.py,sha256=czhQlQgMpAXXoLVAX9Q0TFUMYEEhsgufTammxcKSmOY,1665
 fugue/extensions/transformer/__init__.py,sha256=VD6d-8xW1Yl8fUPj43cBWNR9pCOlYD9xWyGIHAlHwvI,456
 fugue/extensions/transformer/constants.py,sha256=76DfpoTOGQ8gp5XtCs_xznfbr_H015-prXpHWSqMNDU,59
-fugue/extensions/transformer/convert.py,sha256=
+fugue/extensions/transformer/convert.py,sha256=SU_KvzZp_nV8oCxZGx7qDsdCE0CJ--8UAp5m8z4d4HY,23386
 fugue/extensions/transformer/transformer.py,sha256=zhOUgyv5-DPxYd1CP_98WeEw-zUgwknRnPW_6di-q3g,9098
 fugue/rpc/__init__.py,sha256=3GzUl4QZQuCChjD7eaTJW8tnTwfke6ZY9r9g5nCeBZ8,167
 fugue/rpc/base.py,sha256=3Fq5SvwLZqw9NXru3r32WuJKBGFr9bl7nFgy6e9boGo,8470
@@ -89,10 +89,11 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
+fugue_dask/_dask_sql_wrapper.py,sha256=lj38gJIOdoMV9W44gpwzLjUEtPVsQNKjRWuEkfI7-PM,2618
 fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
-fugue_dask/_utils.py,sha256=
+fugue_dask/_utils.py,sha256=dGUkhOoXQqgGQH_BY6aeYFo9UIWUAyo8YjwtdB7QD4s,8951
 fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
-fugue_dask/execution_engine.py,sha256=
+fugue_dask/execution_engine.py,sha256=Em9pN6cw5w5DGLcjV6oKQKQeLLblc9DZ0DkvxKVFxQQ,21167
 fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
 fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
 fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
@@ -107,7 +108,7 @@ fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
-fugue_ibis/execution_engine.py,sha256=
+fugue_ibis/execution_engine.py,sha256=jRnp1m1wuTicS29A-WA043f8QwdoK8b9rwPXvTkm8r8,18751
 fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
 fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
 fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123
@@ -127,7 +128,7 @@ fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
 fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
-fugue_ray/_utils/io.py,sha256=
+fugue_ray/_utils/io.py,sha256=Dz0WuQrh_8Ix7jU5viFIA6caJcfxV4ew0ruBZLQbD1s,9930
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
@@ -143,14 +144,14 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=
+fugue_test/builtin_suite.py,sha256=BpGwa66cAUuuc7ULOsPP3ax8IKQtNIPoSmlUFgqUKQk,79252
 fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
 fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
 fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
-fugue_version/__init__.py,sha256=
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
-fugue-0.9.
+fugue_version/__init__.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
+fugue-0.9.2.dev2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.9.2.dev2.dist-info/METADATA,sha256=eR5mL6Tf1RGa_-Do5Dmzy4ZkbcbKf-FzW4qA0cAW1Ec,18283
+fugue-0.9.2.dev2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+fugue-0.9.2.dev2.dist-info/entry_points.txt,sha256=2Vxp1qew_tswacA8m0RzIliLlFOQMlzezvSXPugM_KA,295
+fugue-0.9.2.dev2.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.9.2.dev2.dist-info/RECORD,,
fugue-0.9.2.dev2.dist-info/entry_points.txt
ADDED

@@ -0,0 +1,11 @@
+[fugue.plugins]
+dask = fugue_dask.registry[dask]
+duckdb = fugue_duckdb.registry[duckdb]
+ibis = fugue_ibis[ibis]
+polars = fugue_polars.registry[polars]
+ray = fugue_ray.registry[ray]
+spark = fugue_spark.registry[spark]
+
+[pytest11]
+fugue_test = fugue_test
+fugue_test_fixtures = fugue_test.fixtures
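
Note: the registrations are unchanged from 0.9.1 except for whitespace (newer setuptools writes "pkg[extra]" without the space; see the removed file at the end of this diff). Fugue discovers its backends through this fugue.plugins group; a sketch of inspecting it (Python 3.10+ importlib.metadata signature, assumed; 3.8 uses the dict-style API instead):

    from importlib.metadata import entry_points

    for ep in entry_points(group="fugue.plugins"):
        print(ep.name, "->", ep.value)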
fugue_dask/_dask_sql_wrapper.py
ADDED

@@ -0,0 +1,76 @@
+from typing import Any, Optional
+
+import dask.dataframe as dd
+
+try:
+    from dask.dataframe.dask_expr.io.parquet import ReadParquet
+
+    HAS_DASK_EXPR = True  # newer dask
+except ImportError:  # pragma: no cover
+    HAS_DASK_EXPR = False  # older dask
+
+if not HAS_DASK_EXPR:  # pragma: no cover
+    try:
+        from dask_sql import Context as ContextWrapper  # pylint: disable-all
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+else:
+    from triad.utils.assertion import assert_or_throw
+
+    try:
+        from dask_sql import Context
+        from dask_sql.datacontainer import Statistics
+        from dask_sql.input_utils import InputUtil
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+
+    class ContextWrapper(Context):  # type: ignore
+        def create_table(
+            self,
+            table_name: str,
+            input_table: dd.DataFrame,
+            format: Optional[str] = None,  # noqa
+            persist: bool = False,
+            schema_name: Optional[str] = None,
+            statistics: Optional[Statistics] = None,
+            gpu: bool = False,
+            **kwargs: Any,
+        ) -> None:  # pragma: no cover
+            assert_or_throw(
+                isinstance(input_table, dd.DataFrame),
+                lambda: ValueError(
+                    f"input_table must be a dask dataframe, but got {type(input_table)}"
+                ),
+            )
+            assert_or_throw(
+                dd._dask_expr_enabled(), lambda: ValueError("Dask expr must be enabled")
+            )
+            schema_name = schema_name or self.schema_name
+
+            dc = InputUtil.to_dc(
+                input_table,
+                table_name=table_name,
+                format=format,
+                persist=persist,
+                gpu=gpu,
+                **kwargs,
+            )
+
+            dask_filepath = None
+            operations = input_table.find_operations(ReadParquet)
+            for op in operations:
+                dask_filepath = op._args[0]
+
+            dc.filepath = dask_filepath
+            self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
+
+            if not statistics:
+                statistics = Statistics(float("nan"))
+            dc.statistics = statistics
+
+            self.schema[schema_name].tables[table_name.lower()] = dc
+            self.schema[schema_name].statistics[table_name.lower()] = statistics
fugue_dask/_utils.py
CHANGED

@@ -5,7 +5,7 @@ import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from dask.dataframe
+from dask.dataframe import DataFrame
 from dask.delayed import delayed
 from dask.distributed import Client, get_client
 from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
fugue_dask/execution_engine.py
CHANGED

@@ -9,9 +9,10 @@ from triad.collections import Schema
 from triad.collections.dict import IndexedOrderedDict, ParamDict
 from triad.utils.assertion import assert_or_throw
 from triad.utils.hash import to_uuid
+from triad.utils.io import makedirs
 from triad.utils.pandas_like import PandasUtils
 from triad.utils.threading import RunOnce
-
+
 from fugue import StructuredRawSQL
 from fugue.collections.partition import (
     PartitionCursor,
@@ -61,14 +62,9 @@ class DaskSQLEngine(SQLEngine):
         return True

     def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
-        try:
-            from dask_sql import Context
-        except ImportError:  # pragma: no cover
-            raise ImportError(
-                "dask-sql is not installed. "
-                "Please install it with `pip install dask-sql`"
-            )
-        ctx = Context()
+        from ._dask_sql_wrapper import ContextWrapper
+
+        ctx = ContextWrapper()
         _dfs: Dict[str, dd.DataFrame] = {k: self._to_safe_df(v) for k, v in dfs.items()}
         sql = statement.construct(dialect=self.dialect, log=self.log)
         res = ctx.sql(
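
Note: DaskSQLEngine.select now delegates to the ContextWrapper added in fugue_dask/_dask_sql_wrapper.py above, which patches dask-sql's create_table for dask-expr based dataframes (on older dask it is just an alias of dask_sql.Context). A hypothetical end-to-end use, mirroring what select does:

    import dask.dataframe as dd
    from fugue_dask._dask_sql_wrapper import ContextWrapper

    ctx = ContextWrapper()
    ddf = dd.from_dict({"a": [1, 2, 3]}, npartitions=1)
    ctx.create_table("t", ddf)  # patched path when dask-expr is enabled
    res = ctx.sql("SELECT a FROM t WHERE a > 1")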
fugue_ibis/execution_engine.py
CHANGED

@@ -92,7 +92,8 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
-
+        version = int(ibis.__version__.split(".")[0])
+        if version < 6:  # pragma: no cover
             suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
         else:
             # breaking change in ibis 6.0
@@ -113,7 +114,7 @@ class IbisSQLEngine(SQLEngine):
             cols.append(
                 ibis.coalesce(tb[k], tb[k + _JOIN_RIGHT_SUFFIX]).name(k)
             )
-            tb = tb
+            tb = tb.select(*cols)
         elif how.lower() in ["semi", "left_semi"]:
             tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
@@ -153,7 +154,7 @@ class IbisSQLEngine(SQLEngine):
         self,
         df: DataFrame,
         how: str = "any",
-        thresh: int = None,
+        thresh: Optional[int] = None,
         subset: Optional[List[str]] = None,
     ) -> DataFrame:
         schema = df.schema
@@ -161,7 +162,7 @@ class IbisSQLEngine(SQLEngine):
             schema = schema.extract(subset)
         _df = self.to_df(df)
         if thresh is None:
-            tb = _df.native.
+            tb = _df.native.drop_null(subset, how=how)
         return self.to_df(tb, df.schema)
         assert_or_throw(
             how == "any", ValueError("when thresh is set, how must be 'any'")
@@ -204,7 +205,7 @@ class IbisSQLEngine(SQLEngine):
             ibis.coalesce(tb[f], ibis.literal(vd[f])).name(f) if f in names else tb[f]
             for f in df.columns
         ]
-        return self.to_df(tb
+        return self.to_df(tb.select(cols), schema=df.schema)

     def take(
         self,
@@ -241,7 +242,7 @@ class IbisSQLEngine(SQLEngine):
             f") WHERE __fugue_take_param<={n}"
         )
         tb = self.query_to_table(sql, {tbn: idf})
-        return self.to_df(tb
+        return self.to_df(tb.select(*df.columns), schema=df.schema)

         sorts: List[str] = []
         for k, v in _presort.items():
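
Note: instead of assuming one join API, the engine now branches on the installed ibis major version (the "breaking change in ibis 6.0" flagged in the first hunk), and the select()/drop_null() calls replace older deprecated spellings. The version check it relies on is simply:

    import ibis

    version = int(ibis.__version__.split(".")[0])
    # version < 6: suffixes=("", _JOIN_RIGHT_SUFFIX); otherwise the 6.0+ join API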
fugue_ray/_utils/io.py
CHANGED

@@ -7,7 +7,7 @@ import ray.data as rd
 from packaging import version
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
-
+
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
@@ -21,6 +21,27 @@ from fugue_ray.dataframe import RayDataFrame

 from .._constants import RAY_VERSION

+try:
+    from ray.data.datasource import FileExtensionFilter
+
+    class _FileFiler(FileExtensionFilter):  # pragma: no cover
+        def __init__(
+            self, file_extensions: Union[str, List[str]], exclude: Iterable[str]
+        ):
+            super().__init__(file_extensions, allow_if_no_extension=True)
+            self._exclude = set(exclude)
+
+        def _is_valid(self, path: str) -> bool:
+            return pathlib.Path(
+                path
+            ).name not in self._exclude and self._file_has_extension(path)
+
+        def __call__(self, paths: List[str]) -> List[str]:
+            return [path for path in paths if self._is_valid(path)]
+
+except ImportError:  # pragma: no cover
+    pass  # ray >=2.10
+

 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
@@ -248,17 +269,3 @@ class RayIO(object):

     def _remote_args(self) -> Dict[str, Any]:
         return {"num_cpus": 1}
-
-
-class _FileFiler(FileExtensionFilter):  # pragma: no cover
-    def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
-        super().__init__(file_extensions, allow_if_no_extension=True)
-        self._exclude = set(exclude)
-
-    def _is_valid(self, path: str) -> bool:
-        return pathlib.Path(
-            path
-        ).name not in self._exclude and self._file_has_extension(path)
-
-    def __call__(self, paths: List[str]) -> List[str]:
-        return [path for path in paths if self._is_valid(path)]
fugue_test/builtin_suite.py
CHANGED

@@ -486,6 +486,23 @@ class BuiltInTests(object):
             dag.df([], "a:int,b:int").assert_eq(b)
             dag.run(self.engine)

+    def test_transform_row_wise(self):
+        def t1(row: Dict[str, Any]) -> Dict[str, Any]:
+            row["b"] = 1
+            return row
+
+        def t2(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
+            return rows[0]
+
+        with fa.engine_context(self.engine):
+            a = pd.DataFrame([[3, 4], [1, 2], [3, 5]], columns=["a", "b"])
+            b = fa.transform(a, t1, schema="*")
+            assert sorted(fa.as_array(b)) == [[1, 1], [3, 1], [3, 1]]
+            b = fa.transform(
+                a, t2, schema="*", partition={"by": "a", "presort": "b"}
+            )
+            assert sorted(fa.as_array(b)) == [[1, 2], [3, 4]]
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
@@ -548,6 +565,8 @@ class BuiltInTests(object):
             e = dag.df([[1, 2, 1, 10]], "a:int,ct1:int,ct2:int,x:int")
             e.assert_eq(c)

+            a.zip(b).transform(mock_co_tf1_d, params=dict(p=10)).assert_eq(e)
+
             # interfaceless
             c = dag.transform(
                 a.zip(b),
@@ -676,6 +695,13 @@ class BuiltInTests(object):
             incr()
             yield pa.Table.from_pandas(df)

+        def t11(row: Dict[str, Any]) -> Dict[str, Any]:
+            incr()
+            return row
+
+        def t12(row: Dict[str, Any]) -> None:
+            incr()
+
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:double,b:int")
             a.out_transform(t1)  # +2
@@ -688,6 +714,8 @@ class BuiltInTests(object):
             a.out_transform(t8, ignore_errors=[NotImplementedError])  # +1
             a.out_transform(t9)  # +1
             a.out_transform(t10)  # +1
+            a.out_transform(t11)  # +2
+            a.out_transform(t12)  # +2
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t2))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t3))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t4))
@@ -695,7 +723,7 @@ class BuiltInTests(object):
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(T7))
             dag.run(self.engine)

-        assert
+        assert 17 <= incr()

     def test_out_cotransform(self):  # noqa: C901
         tmpdir = str(self.tmpdir)
@@ -2001,6 +2029,13 @@ def mock_co_tf1(
     return [[df1[0]["a"], len(df1), len(df2), p]]


+@cotransformer(lambda dfs, **kwargs: "a:int,ct1:int,ct2:int,x:int")
+def mock_co_tf1_d(
+    df1: List[Dict[str, Any]], df2: List[List[Any]], p=1
+) -> Dict[str, Any]:
+    return dict(a=df1[0]["a"], ct1=len(df1), ct2=len(df2), x=p)
+
+
 def mock_co_tf2(dfs: DataFrames, p=1) -> List[List[Any]]:
     return [[dfs[0].peek_dict()["a"], dfs[0].count(), dfs[1].count(), p]]
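
Note: the new tests exercise row-wise functions through out_transform as well; returning None is allowed there (the "n" output code in the convert.py patterns). A minimal sketch (not part of the diff):

    from typing import Any, Dict

    import pandas as pd
    import fugue.api as fa

    def log_row(row: Dict[str, Any]) -> None:
        print(row)  # called once per row, no output dataframe

    fa.out_transform(pd.DataFrame({"a": [1, 2]}), log_row)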
fugue_version/__init__.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.9.1"
+__version__ = "0.9.2"
fugue-0.9.1.dist-info/entry_points.txt
REMOVED

@@ -1,12 +0,0 @@
-[fugue.plugins]
-dask = fugue_dask.registry [dask]
-duckdb = fugue_duckdb.registry [duckdb]
-ibis = fugue_ibis [ibis]
-polars = fugue_polars.registry [polars]
-ray = fugue_ray.registry [ray]
-spark = fugue_spark.registry [spark]
-
-[pytest11]
-fugue_test = fugue_test
-fugue_test_fixtures = fugue_test.fixtures
-

{fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/LICENSE
File without changes

{fugue-0.9.1.dist-info → fugue-0.9.2.dev2.dist-info}/top_level.txt
File without changes