fugue 0.9.1__py3-none-any.whl → 0.9.2.dev2__py3-none-any.whl

This diff compares the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
fugue/dataframe/function_wrapper.py CHANGED
@@ -80,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
         p.update(kwargs)
         has_kw = False
         rargs: Dict[str, Any] = {}
+        row_param_info: Any = None
         for k, v in self._params.items():
             if isinstance(v, (PositionalParam, KeywordParam)):
                 if isinstance(v, KeywordParam):
@@ -90,7 +91,14 @@ class DataFrameFunctionWrapper(FunctionWrapper):
                         isinstance(p[k], DataFrame),
                         lambda: TypeError(f"{p[k]} is not a DataFrame"),
                     )
-                    rargs[k] = v.to_input_data(p[k], ctx=ctx)
+                    if v.is_per_row:
+                        assert_or_throw(
+                            row_param_info is None,
+                            lambda: ValueError("only one row parameter is allowed"),
+                        )
+                        row_param_info = (k, v, p[k])
+                    else:
+                        rargs[k] = v.to_input_data(p[k], ctx=ctx)
                 else:
                     rargs[k] = p[k]  # TODO: should we do auto type conversion?
                 del p[k]
@@ -100,12 +108,38 @@ class DataFrameFunctionWrapper(FunctionWrapper):
             rargs.update(p)
         elif not ignore_unknown and len(p) > 0:
             raise ValueError(f"{p} are not acceptable parameters")
+        if row_param_info is None:
+            return self._run_func(rargs, output, output_schema, ctx, raw=False)
+        else:  # input contains row parameter
+
+            def _dfs() -> Iterable[Any]:
+                k, v, df = row_param_info
+                for row in v.to_input_rows(df, ctx):
+                    rargs[k] = None
+                    _rargs = rargs.copy()
+                    _rargs[k] = row
+                    yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
+
+            if not output:
+                sum(1 for _ in _dfs())
+                return
+            else:
+                return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
+
+    def _run_func(
+        self,
+        rargs: Dict[str, Any],
+        output: bool,
+        output_schema: Any,
+        ctx: Any,
+        raw: bool,
+    ) -> Any:
         rt = self._func(**rargs)
         if not output:
             if isinstance(self._rt, _DataFrameParamBase):
                 self._rt.count(rt)
             return
-        if isinstance(self._rt, _DataFrameParamBase):
+        if not raw and isinstance(self._rt, _DataFrameParamBase):
             return self._rt.to_output_df(rt, output_schema, ctx=ctx)
         return rt
 
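
The change above splits `run` in two: argument resolution stays in `run`, while the actual call moves into `_run_func`. When one argument is a per-row parameter, the `_dfs` closure fans the call out once per input row and the raw outputs are reassembled by the return-type handler. A minimal sketch of that fan-out pattern, assuming plain dict rows (`run_per_row` is illustrative, not fugue API):

```python
from typing import Any, Callable, Dict, Iterable, List

def run_per_row(
    func: Callable[..., Dict[str, Any]],
    rows: Iterable[Dict[str, Any]],
    fixed_args: Dict[str, Any],
    row_arg: str,
) -> List[Dict[str, Any]]:
    # Mirrors the _dfs closure above: call func once per row, with the
    # already-resolved arguments held fixed and the row bound to the one
    # allowed row parameter; collect raw outputs for later assembly.
    out: List[Dict[str, Any]] = []
    for row in rows:
        kwargs = dict(fixed_args)
        kwargs[row_arg] = row
        out.append(func(**kwargs))
    return out
```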
@@ -120,6 +154,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
         annotation == Callable
         or annotation == callable  # pylint: disable=comparison-with-callable
         or str(annotation).startswith("typing.Callable")
+        or str(annotation).startswith("collections.abc.Callable")
     ),
 )
 class _CallableParam(AnnotatedParam):
@@ -134,6 +169,9 @@ class _CallableParam(AnnotatedParam):
         or annotation == Optional[callable]
         or str(annotation).startswith("typing.Union[typing.Callable")  # 3.8-
         or str(annotation).startswith("typing.Optional[typing.Callable")  # 3.9+
+        or str(annotation).startswith(
+            "typing.Optional[collections.abc.Callable]"
+        )  # 3.9+
     ),
 )
 class _OptionalCallableParam(AnnotatedParam):
@@ -145,14 +183,30 @@ class _DataFrameParamBase(AnnotatedParam):
         super().__init__(param)
         assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
 
+    @property
+    def is_per_row(self) -> bool:
+        return False
+
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:  # pragma: no cover
         raise NotImplementedError
 
+    def to_input_rows(
+        self,
+        df: DataFrame,
+        ctx: Any,
+    ) -> Iterable[Any]:
+        raise NotImplementedError  # pragma: no cover
+
     def to_output_df(
         self, df: Any, schema: Any, ctx: Any
     ) -> DataFrame:  # pragma: no cover
         raise NotImplementedError
 
+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        raise NotImplementedError
+
     def count(self, df: Any) -> int:  # pragma: no cover
         raise NotImplementedError
 
@@ -182,6 +236,34 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())
 
 
+@fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
+class RowParam(_DataFrameParamBase):
+    @property
+    def is_per_row(self) -> bool:
+        return True
+
+    def count(self, df: Any) -> int:
+        return 1
+
+
+@fugue_annotated_param(Dict[str, Any])
+class DictParam(RowParam):
+    def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
+        yield from df.as_dict_iterable()
+
+    def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
+        return ArrayDataFrame([list(output.values())], schema)
+
+    def iterable_to_output_df(
+        self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        params: Dict[str, Any] = {}
+        if schema is not None:
+            params["schema"] = Schema(schema).pa_schema
+        adf = pa.Table.from_pylist(list(dfs), **params)
+        return ArrowDataFrame(adf)
+
+
 @fugue_annotated_param(AnyDataFrame)
 class _AnyDataFrameParam(DataFrameParam):
     def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
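
`RowParam` registers the new `"r"` code, and `DictParam` makes a plain `Dict[str, Any]` annotation mean "call me once per row". In user terms, mirroring `test_transform_row_wise` added later in this diff (the function name here is illustrative):

```python
from typing import Any, Dict

import pandas as pd
import fugue.api as fa

def add_flag(row: Dict[str, Any]) -> Dict[str, Any]:
    # Runs once per row because of the Dict[str, Any] annotation.
    row["b"] = 1
    return row

df = pd.DataFrame([[3, 4], [1, 2]], columns=["a", "b"])
res = fa.transform(df, add_flag, schema="*")  # every output row has b == 1
```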
@@ -207,6 +289,15 @@ class LocalDataFrameParam(DataFrameParam):
         )
         return output
 
+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        def _dfs() -> Iterable[DataFrame]:
+            for df in dfs:
+                yield self.to_output_df(df, schema, ctx)
+
+        return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
+
     def count(self, df: LocalDataFrame) -> int:
         if df.is_bounded:
             return df.count()
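
For local dataframes, each per-row output is converted individually and the results are wrapped in a `LocalDataFrameIterableDataFrame`, so nothing forces the whole result to materialize at once. A small sketch of that wrapper's behavior (assuming `ArrayDataFrame` pieces; consuming `as_array()` drains the iterable):

```python
from fugue.dataframe import ArrayDataFrame, LocalDataFrameIterableDataFrame

def dfs():
    # One small dataframe per element, as iterable_to_output_df yields them.
    for i in range(3):
        yield ArrayDataFrame([[i]], "a:int")

out = LocalDataFrameIterableDataFrame(dfs(), schema="a:int")
print(out.as_array())  # [[0], [1], [2]]
```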
fugue/extensions/transformer/convert.py CHANGED
@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspq]$"
+            func, "^[lspqr][fF]?x*z?$", "^[lspqr]$"
         )
         tr._output_schema_arg = schema  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
         validation_rules.update(parse_validation_rules_from_comment(func))
         tr = _FuncAsOutputTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
+            func, "^[lspqr][fF]?x*z?$", "^[lspnqr]$"
         )
         tr._output_schema_arg = None  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = schema  # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
 
         tr = _FuncAsOutputCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = None  # type: ignore
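
These patterns validate the single-letter codes derived from a function's annotated inputs and return type; `"r"` is the code that `RowParam` registered above. A quick check of what the widened patterns now accept:

```python
import re

new_input, new_output = r"^[lspqr][fF]?x*z?$", r"^[lspqr]$"

assert re.match(new_input, "r")         # a row-annotated input is accepted
assert re.match(new_output, "r")        # a row-annotated return is accepted
assert not re.match(r"^[lspq]$", "r")   # the 0.9.1 pattern rejected it
```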
fugue-0.9.1.dist-info/METADATA → fugue-0.9.2.dev2.dist-info/METADATA
@@ -1,13 +1,12 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.9.1
+Version: 0.9.2.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
 Author-email: hello@fugue.ai
 License: Apache-2.0
 Keywords: distributed spark dask ray sql dsl domain specific language
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -17,67 +16,68 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: triad >=0.9.7
-Requires-Dist: adagio >=0.2.4
+License-File: LICENSE
+Requires-Dist: triad>=0.9.7
+Requires-Dist: adagio>=0.2.6
 Provides-Extra: all
-Requires-Dist: qpd >=0.4.4 ; extra == 'all'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'all'
-Requires-Dist: sqlglot ; extra == 'all'
-Requires-Dist: jinja2 ; extra == 'all'
-Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
-Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
-Requires-Dist: dask-sql ; extra == 'all'
-Requires-Dist: ray[data] >=2.5.0 ; extra == 'all'
-Requires-Dist: notebook ; extra == 'all'
-Requires-Dist: jupyterlab ; extra == 'all'
-Requires-Dist: ipython >=7.10.0 ; extra == 'all'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'all'
-Requires-Dist: pyarrow >=6.0.1 ; extra == 'all'
-Requires-Dist: pandas <2.2,>=2.0.2 ; extra == 'all'
-Requires-Dist: ibis-framework ; extra == 'all'
-Requires-Dist: polars ; extra == 'all'
+Requires-Dist: qpd>=0.4.4; extra == "all"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "all"
+Requires-Dist: sqlglot; extra == "all"
+Requires-Dist: jinja2; extra == "all"
+Requires-Dist: pyspark>=3.1.1; extra == "all"
+Requires-Dist: dask[dataframe,distributed]>=2023.5.0; extra == "all"
+Requires-Dist: dask-sql; extra == "all"
+Requires-Dist: ray[data]>=2.5.0; extra == "all"
+Requires-Dist: notebook; extra == "all"
+Requires-Dist: jupyterlab; extra == "all"
+Requires-Dist: ipython>=7.10.0; extra == "all"
+Requires-Dist: duckdb>=0.5.0; extra == "all"
+Requires-Dist: pyarrow>=6.0.1; extra == "all"
+Requires-Dist: pandas<2.2,>=2.0.2; extra == "all"
+Requires-Dist: ibis-framework[duckdb,pandas]; extra == "all"
+Requires-Dist: polars; extra == "all"
 Provides-Extra: cpp_sql_parser
-Requires-Dist: fugue-sql-antlr[cpp] >=0.2.0 ; extra == 'cpp_sql_parser'
+Requires-Dist: fugue-sql-antlr[cpp]>=0.2.0; extra == "cpp-sql-parser"
 Provides-Extra: dask
-Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'dask'
-Requires-Dist: pyarrow >=7.0.0 ; extra == 'dask'
-Requires-Dist: pandas >=2.0.2 ; extra == 'dask'
-Requires-Dist: dask[dataframe,distributed] >=2024.4.0 ; (python_version >= "3.11.9") and extra == 'dask'
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "dask"
+Requires-Dist: pyarrow>=7.0.0; extra == "dask"
+Requires-Dist: pandas>=2.0.2; extra == "dask"
 Provides-Extra: duckdb
-Requires-Dist: qpd >=0.4.4 ; extra == 'duckdb'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'duckdb'
-Requires-Dist: sqlglot ; extra == 'duckdb'
-Requires-Dist: jinja2 ; extra == 'duckdb'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'duckdb'
-Requires-Dist: numpy ; extra == 'duckdb'
+Requires-Dist: qpd>=0.4.4; extra == "duckdb"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "duckdb"
+Requires-Dist: sqlglot; extra == "duckdb"
+Requires-Dist: jinja2; extra == "duckdb"
+Requires-Dist: duckdb>=0.5.0; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
 Provides-Extra: ibis
-Requires-Dist: qpd >=0.4.4 ; extra == 'ibis'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'ibis'
-Requires-Dist: sqlglot ; extra == 'ibis'
-Requires-Dist: jinja2 ; extra == 'ibis'
-Requires-Dist: ibis-framework ; extra == 'ibis'
-Requires-Dist: pandas <2.2 ; extra == 'ibis'
+Requires-Dist: qpd>=0.4.4; extra == "ibis"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "ibis"
+Requires-Dist: sqlglot; extra == "ibis"
+Requires-Dist: jinja2; extra == "ibis"
+Requires-Dist: ibis-framework[pandas]; extra == "ibis"
+Requires-Dist: pandas<2.2; extra == "ibis"
 Provides-Extra: notebook
-Requires-Dist: notebook ; extra == 'notebook'
-Requires-Dist: jupyterlab ; extra == 'notebook'
-Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
+Requires-Dist: notebook; extra == "notebook"
+Requires-Dist: jupyterlab; extra == "notebook"
+Requires-Dist: ipython>=7.10.0; extra == "notebook"
 Provides-Extra: polars
-Requires-Dist: polars ; extra == 'polars'
+Requires-Dist: polars; extra == "polars"
 Provides-Extra: ray
-Requires-Dist: ray[data] >=2.5.0 ; extra == 'ray'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
-Requires-Dist: pyarrow >=7.0.0 ; extra == 'ray'
-Requires-Dist: pandas <2.2 ; extra == 'ray'
+Requires-Dist: ray[data]>=2.5.0; extra == "ray"
+Requires-Dist: duckdb>=0.5.0; extra == "ray"
+Requires-Dist: pyarrow>=7.0.0; extra == "ray"
+Requires-Dist: pandas<2.2; extra == "ray"
 Provides-Extra: spark
-Requires-Dist: pyspark >=3.1.1 ; extra == 'spark'
+Requires-Dist: pyspark>=3.1.1; extra == "spark"
 Provides-Extra: sql
-Requires-Dist: qpd >=0.4.4 ; extra == 'sql'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'sql'
-Requires-Dist: sqlglot ; extra == 'sql'
-Requires-Dist: jinja2 ; extra == 'sql'
+Requires-Dist: qpd>=0.4.4; extra == "sql"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "sql"
+Requires-Dist: sqlglot; extra == "sql"
+Requires-Dist: jinja2; extra == "sql"
 
 # Fugue
 
@@ -355,4 +355,3 @@ View some of our latest conferences presentations and content. For a more comple
 * [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
 * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
 * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
-
fugue-0.9.1.dist-info/RECORD → fugue-0.9.2.dev2.dist-info/RECORD
@@ -31,7 +31,7 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
 fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
 fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=hOZF3GmwpxqwqKi9-pEOAPZSW1ZFyB47hLxRrGyOiuM,14855
+fugue/dataframe/function_wrapper.py,sha256=1CjI4UXHffomylK0_u0CGL1dPv_sSXTN22S5grD10_w,17889
 fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
 fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
 fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
@@ -61,7 +61,7 @@ fugue/extensions/processor/convert.py,sha256=zG0lMtHGwY5TsqK4eplbMdlTg7J_PD3HbI0
 fugue/extensions/processor/processor.py,sha256=czhQlQgMpAXXoLVAX9Q0TFUMYEEhsgufTammxcKSmOY,1665
 fugue/extensions/transformer/__init__.py,sha256=VD6d-8xW1Yl8fUPj43cBWNR9pCOlYD9xWyGIHAlHwvI,456
 fugue/extensions/transformer/constants.py,sha256=76DfpoTOGQ8gp5XtCs_xznfbr_H015-prXpHWSqMNDU,59
-fugue/extensions/transformer/convert.py,sha256=5fhktR2s13ZOpUihpy-gy7Xn2BRN6UoA5uwOzJ6YNOU,23380
+fugue/extensions/transformer/convert.py,sha256=SU_KvzZp_nV8oCxZGx7qDsdCE0CJ--8UAp5m8z4d4HY,23386
 fugue/extensions/transformer/transformer.py,sha256=zhOUgyv5-DPxYd1CP_98WeEw-zUgwknRnPW_6di-q3g,9098
 fugue/rpc/__init__.py,sha256=3GzUl4QZQuCChjD7eaTJW8tnTwfke6ZY9r9g5nCeBZ8,167
 fugue/rpc/base.py,sha256=3Fq5SvwLZqw9NXru3r32WuJKBGFr9bl7nFgy6e9boGo,8470
@@ -89,10 +89,11 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
+fugue_dask/_dask_sql_wrapper.py,sha256=lj38gJIOdoMV9W44gpwzLjUEtPVsQNKjRWuEkfI7-PM,2618
 fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
-fugue_dask/_utils.py,sha256=1uplEqvpCDZDp2YdwJxa6cuGScpgG9VvN3057J02bys,8956
+fugue_dask/_utils.py,sha256=dGUkhOoXQqgGQH_BY6aeYFo9UIWUAyo8YjwtdB7QD4s,8951
 fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
-fugue_dask/execution_engine.py,sha256=60IiwYRBVhN-pX3v6i9BZ8Pa4bcSh5UoklvCScM_XAM,21361
+fugue_dask/execution_engine.py,sha256=Em9pN6cw5w5DGLcjV6oKQKQeLLblc9DZ0DkvxKVFxQQ,21167
 fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
 fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
 fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
@@ -107,7 +108,7 @@ fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
-fugue_ibis/execution_engine.py,sha256=5I-ou5xPdomVu-srdvidvP8f7wDYbGrCV_lGffZa_ac,18679
+fugue_ibis/execution_engine.py,sha256=jRnp1m1wuTicS29A-WA043f8QwdoK8b9rwPXvTkm8r8,18751
 fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
 fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
 fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123
@@ -127,7 +128,7 @@ fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
 fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
-fugue_ray/_utils/io.py,sha256=3hFNDeBuh4bfCud40ZsGrGZLSvCSuxL_1VlqCTnn6RA,9794
+fugue_ray/_utils/io.py,sha256=Dz0WuQrh_8Ix7jU5viFIA6caJcfxV4ew0ruBZLQbD1s,9930
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
@@ -143,14 +144,14 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=cOkZG6w1RHhWWxtjQhZClZQaGT6haNd576BoUmNC_cA,77960
+fugue_test/builtin_suite.py,sha256=BpGwa66cAUuuc7ULOsPP3ax8IKQtNIPoSmlUFgqUKQk,79252
 fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
 fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
 fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
-fugue_version/__init__.py,sha256=UwJXM8JY2T3tE2id0K2k_lEaVThbRTrGO1mNibyzIz8,22
-fugue-0.9.1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-fugue-0.9.1.dist-info/METADATA,sha256=zu44QGPIwk28QyKe9H4Si2ANByy1sJ9cmauNrhCg4bc,18380
-fugue-0.9.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-fugue-0.9.1.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
-fugue-0.9.1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
-fugue-0.9.1.dist-info/RECORD,,
+fugue_version/__init__.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
+fugue-0.9.2.dev2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.9.2.dev2.dist-info/METADATA,sha256=eR5mL6Tf1RGa_-Do5Dmzy4ZkbcbKf-FzW4qA0cAW1Ec,18283
+fugue-0.9.2.dev2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+fugue-0.9.2.dev2.dist-info/entry_points.txt,sha256=2Vxp1qew_tswacA8m0RzIliLlFOQMlzezvSXPugM_KA,295
+fugue-0.9.2.dev2.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.9.2.dev2.dist-info/RECORD,,
fugue-0.9.1.dist-info/WHEEL → fugue-0.9.2.dev2.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: bdist_wheel (0.45.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
fugue-0.9.2.dev2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,11 @@
+[fugue.plugins]
+dask = fugue_dask.registry[dask]
+duckdb = fugue_duckdb.registry[duckdb]
+ibis = fugue_ibis[ibis]
+polars = fugue_polars.registry[polars]
+ray = fugue_ray.registry[ray]
+spark = fugue_spark.registry[spark]
+
+[pytest11]
+fugue_test = fugue_test
+fugue_test_fixtures = fugue_test.fixtures
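
The new wheel writes entry points in the normalized format (no space before `[extra]`); the old file, removed at the end of this diff, used the spaced form. As a sketch, the `[fugue.plugins]` group above can be discovered with the standard `importlib.metadata` API (Python 3.10+ keyword form shown):

```python
from importlib.metadata import entry_points

for ep in entry_points(group="fugue.plugins"):
    print(ep.name, "->", ep.value)  # e.g. dask -> fugue_dask.registry[dask]
    # ep.load() would import the plugin module when its extra is installed
```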
fugue_dask/_dask_sql_wrapper.py ADDED
@@ -0,0 +1,76 @@
+from typing import Any, Optional
+
+import dask.dataframe as dd
+
+try:
+    from dask.dataframe.dask_expr.io.parquet import ReadParquet
+
+    HAS_DASK_EXPR = True  # newer dask
+except ImportError:  # pragma: no cover
+    HAS_DASK_EXPR = False  # older dask
+
+if not HAS_DASK_EXPR:  # pragma: no cover
+    try:
+        from dask_sql import Context as ContextWrapper  # pylint: disable-all
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+else:
+    from triad.utils.assertion import assert_or_throw
+
+    try:
+        from dask_sql import Context
+        from dask_sql.datacontainer import Statistics
+        from dask_sql.input_utils import InputUtil
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+
+    class ContextWrapper(Context):  # type: ignore
+        def create_table(
+            self,
+            table_name: str,
+            input_table: dd.DataFrame,
+            format: Optional[str] = None,  # noqa
+            persist: bool = False,
+            schema_name: Optional[str] = None,
+            statistics: Optional[Statistics] = None,
+            gpu: bool = False,
+            **kwargs: Any,
+        ) -> None:  # pragma: no cover
+            assert_or_throw(
+                isinstance(input_table, dd.DataFrame),
+                lambda: ValueError(
+                    f"input_table must be a dask dataframe, but got {type(input_table)}"
+                ),
+            )
+            assert_or_throw(
+                dd._dask_expr_enabled(), lambda: ValueError("Dask expr must be enabled")
+            )
+            schema_name = schema_name or self.schema_name
+
+            dc = InputUtil.to_dc(
+                input_table,
+                table_name=table_name,
+                format=format,
+                persist=persist,
+                gpu=gpu,
+                **kwargs,
+            )
+
+            dask_filepath = None
+            operations = input_table.find_operations(ReadParquet)
+            for op in operations:
+                dask_filepath = op._args[0]
+
+            dc.filepath = dask_filepath
+            self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
+
+            if not statistics:
+                statistics = Statistics(float("nan"))
+            dc.statistics = statistics
+
+            self.schema[schema_name].tables[table_name.lower()] = dc
+            self.schema[schema_name].statistics[table_name.lower()] = statistics
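
The wrapper exists because expression-based dask no longer registers tables the way `dask-sql` expects; on older dask it simply aliases `dask_sql.Context`. A hedged usage sketch of how `DaskSQLEngine.select` (below) drives it, assuming `dask-sql` is installed and dask-expr is enabled:

```python
import dask.dataframe as dd
import pandas as pd

from fugue_dask._dask_sql_wrapper import ContextWrapper

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)
ctx = ContextWrapper()
ctx.create_table("t", ddf)                    # the overridden method above
res = ctx.sql("SELECT a FROM t WHERE a > 1")  # a dask dataframe
```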
fugue_dask/_utils.py CHANGED
@@ -5,7 +5,7 @@ import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from dask.dataframe.core import DataFrame
+from dask.dataframe import DataFrame
 from dask.delayed import delayed
 from dask.distributed import Client, get_client
 from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
fugue_dask/execution_engine.py CHANGED
@@ -9,9 +9,10 @@ from triad.collections import Schema
 from triad.collections.dict import IndexedOrderedDict, ParamDict
 from triad.utils.assertion import assert_or_throw
 from triad.utils.hash import to_uuid
+from triad.utils.io import makedirs
 from triad.utils.pandas_like import PandasUtils
 from triad.utils.threading import RunOnce
-from triad.utils.io import makedirs
+
 from fugue import StructuredRawSQL
 from fugue.collections.partition import (
     PartitionCursor,
@@ -61,14 +62,9 @@ class DaskSQLEngine(SQLEngine):
         return True
 
     def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
-        try:
-            from dask_sql import Context
-        except ImportError:  # pragma: no cover
-            raise ImportError(
-                "dask-sql is not installed. "
-                "Please install it with `pip install dask-sql`"
-            )
-        ctx = Context()
+        from ._dask_sql_wrapper import ContextWrapper
+
+        ctx = ContextWrapper()
         _dfs: Dict[str, dd.DataFrame] = {k: self._to_safe_df(v) for k, v in dfs.items()}
         sql = statement.construct(dialect=self.dialect, log=self.log)
         res = ctx.sql(
fugue_ibis/execution_engine.py CHANGED
@@ -92,7 +92,8 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
-        if ibis.__version__ < "6":  # pragma: no cover
+        version = int(ibis.__version__.split(".")[0])
+        if version < 6:  # pragma: no cover
             suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
         else:
             # breaking change in ibis 6.0
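
The old check compared version strings lexicographically, which misorders multi-digit majors; parsing the major component as an integer avoids that:

```python
assert "10.0.0" < "6"                    # string compare: ibis 10 looks older than 6
assert int("10.0.0".split(".")[0]) >= 6  # integer major compare: correct
```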
@@ -113,7 +114,7 @@ class IbisSQLEngine(SQLEngine):
                 cols.append(
                     ibis.coalesce(tb[k], tb[k + _JOIN_RIGHT_SUFFIX]).name(k)
                 )
-            tb = tb[cols]
+            tb = tb.select(*cols)
         elif how.lower() in ["semi", "left_semi"]:
             tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
@@ -153,7 +154,7 @@ class IbisSQLEngine(SQLEngine):
         self,
         df: DataFrame,
         how: str = "any",
-        thresh: int = None,
+        thresh: Optional[int] = None,
         subset: Optional[List[str]] = None,
     ) -> DataFrame:
         schema = df.schema
@@ -161,7 +162,7 @@ class IbisSQLEngine(SQLEngine):
             schema = schema.extract(subset)
         _df = self.to_df(df)
         if thresh is None:
-            tb = _df.native.dropna(subset=subset, how=how)
+            tb = _df.native.drop_null(subset, how=how)
             return self.to_df(tb, df.schema)
         assert_or_throw(
             how == "any", ValueError("when thresh is set, how must be 'any'")
@@ -204,7 +205,7 @@ class IbisSQLEngine(SQLEngine):
             ibis.coalesce(tb[f], ibis.literal(vd[f])).name(f) if f in names else tb[f]
             for f in df.columns
         ]
-        return self.to_df(tb[cols], schema=df.schema)
+        return self.to_df(tb.select(cols), schema=df.schema)
 
     def take(
         self,
@@ -241,7 +242,7 @@ class IbisSQLEngine(SQLEngine):
             f") WHERE __fugue_take_param<={n}"
         )
             tb = self.query_to_table(sql, {tbn: idf})
-            return self.to_df(tb[df.columns], schema=df.schema)
+            return self.to_df(tb.select(*df.columns), schema=df.schema)
 
         sorts: List[str] = []
         for k, v in _presort.items():
fugue_ray/_utils/io.py CHANGED
@@ -7,7 +7,7 @@ import ray.data as rd
 from packaging import version
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
-from ray.data.datasource import FileExtensionFilter
+
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
@@ -21,6 +21,27 @@ from fugue_ray.dataframe import RayDataFrame
 
 from .._constants import RAY_VERSION
 
+try:
+    from ray.data.datasource import FileExtensionFilter
+
+    class _FileFiler(FileExtensionFilter):  # pragma: no cover
+        def __init__(
+            self, file_extensions: Union[str, List[str]], exclude: Iterable[str]
+        ):
+            super().__init__(file_extensions, allow_if_no_extension=True)
+            self._exclude = set(exclude)
+
+        def _is_valid(self, path: str) -> bool:
+            return pathlib.Path(
+                path
+            ).name not in self._exclude and self._file_has_extension(path)
+
+        def __call__(self, paths: List[str]) -> List[str]:
+            return [path for path in paths if self._is_valid(path)]
+
+except ImportError:  # pragma: no cover
+    pass  # ray >=2.10
+
 
 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
@@ -248,17 +269,3 @@ class RayIO(object):
 
     def _remote_args(self) -> Dict[str, Any]:
         return {"num_cpus": 1}
-
-
-class _FileFiler(FileExtensionFilter):  # pragma: no cover
-    def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
-        super().__init__(file_extensions, allow_if_no_extension=True)
-        self._exclude = set(exclude)
-
-    def _is_valid(self, path: str) -> bool:
-        return pathlib.Path(
-            path
-        ).name not in self._exclude and self._file_has_extension(path)
-
-    def __call__(self, paths: List[str]) -> List[str]:
-        return [path for path in paths if self._is_valid(path)]
fugue_test/builtin_suite.py CHANGED
@@ -486,6 +486,23 @@ class BuiltInTests(object):
             dag.df([], "a:int,b:int").assert_eq(b)
             dag.run(self.engine)
 
+    def test_transform_row_wise(self):
+        def t1(row: Dict[str, Any]) -> Dict[str, Any]:
+            row["b"] = 1
+            return row
+
+        def t2(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
+            return rows[0]
+
+        with fa.engine_context(self.engine):
+            a = pd.DataFrame([[3, 4], [1, 2], [3, 5]], columns=["a", "b"])
+            b = fa.transform(a, t1, schema="*")
+            assert sorted(fa.as_array(b)) == [[1, 1], [3, 1], [3, 1]]
+            b = fa.transform(
+                a, t2, schema="*", partition={"by": "a", "presort": "b"}
+            )
+            assert sorted(fa.as_array(b)) == [[1, 2], [3, 4]]
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
@@ -548,6 +565,8 @@ class BuiltInTests(object):
             e = dag.df([[1, 2, 1, 10]], "a:int,ct1:int,ct2:int,x:int")
             e.assert_eq(c)
 
+            a.zip(b).transform(mock_co_tf1_d, params=dict(p=10)).assert_eq(e)
+
             # interfaceless
             c = dag.transform(
                 a.zip(b),
@@ -676,6 +695,13 @@ class BuiltInTests(object):
             incr()
             yield pa.Table.from_pandas(df)
 
+        def t11(row: Dict[str, Any]) -> Dict[str, Any]:
+            incr()
+            return row
+
+        def t12(row: Dict[str, Any]) -> None:
+            incr()
+
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:double,b:int")
             a.out_transform(t1)  # +2
@@ -688,6 +714,8 @@ class BuiltInTests(object):
             a.out_transform(t8, ignore_errors=[NotImplementedError])  # +1
             a.out_transform(t9)  # +1
             a.out_transform(t10)  # +1
+            a.out_transform(t11)  # +2
+            a.out_transform(t12)  # +2
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t2))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t3))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t4))
@@ -695,7 +723,7 @@ class BuiltInTests(object):
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(T7))
             dag.run(self.engine)
 
-        assert 13 <= incr()
+        assert 17 <= incr()
 
     def test_out_cotransform(self):  # noqa: C901
         tmpdir = str(self.tmpdir)
@@ -2001,6 +2029,13 @@ def mock_co_tf1(
     return [[df1[0]["a"], len(df1), len(df2), p]]
 
 
+@cotransformer(lambda dfs, **kwargs: "a:int,ct1:int,ct2:int,x:int")
+def mock_co_tf1_d(
+    df1: List[Dict[str, Any]], df2: List[List[Any]], p=1
+) -> Dict[str, Any]:
+    return dict(a=df1[0]["a"], ct1=len(df1), ct2=len(df2), x=p)
+
+
 def mock_co_tf2(dfs: DataFrames, p=1) -> List[List[Any]]:
     return [[dfs[0].peek_dict()["a"], dfs[0].count(), dfs[1].count(), p]]
 
fugue_version/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.9.1"
+__version__ = "0.9.2"
fugue-0.9.1.dist-info/entry_points.txt DELETED
@@ -1,12 +0,0 @@
-[fugue.plugins]
-dask = fugue_dask.registry [dask]
-duckdb = fugue_duckdb.registry [duckdb]
-ibis = fugue_ibis [ibis]
-polars = fugue_polars.registry [polars]
-ray = fugue_ray.registry [ray]
-spark = fugue_spark.registry [spark]
-
-[pytest11]
-fugue_test = fugue_test
-fugue_test_fixtures = fugue_test.fixtures
-