fugue 0.9.0.dev4__py3-none-any.whl → 0.9.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,6 +20,7 @@ from triad.collections.function_wrapper import (
20
20
  PositionalParam,
21
21
  function_wrapper,
22
22
  )
23
+ from triad.utils.convert import compare_annotations
23
24
  from triad.utils.iter import EmptyAwareIterable, make_empty_aware
24
25
 
25
26
  from ..constants import FUGUE_ENTRYPOINT
@@ -37,6 +38,14 @@ from .iterable_dataframe import IterableDataFrame
37
38
  from .pandas_dataframe import PandasDataFrame
38
39
 
39
40
 
41
+ def _compare_iter(tp: Any) -> Any:
42
+ return lambda x: compare_annotations(
43
+ x, Iterable[tp] # type:ignore
44
+ ) or compare_annotations(
45
+ x, Iterator[tp] # type:ignore
46
+ )
47
+
48
+
40
49
  @function_wrapper(FUGUE_ENTRYPOINT)
41
50
  class DataFrameFunctionWrapper(FunctionWrapper):
42
51
  @property
@@ -71,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
71
80
  p.update(kwargs)
72
81
  has_kw = False
73
82
  rargs: Dict[str, Any] = {}
83
+ row_param_info: Any = None
74
84
  for k, v in self._params.items():
75
85
  if isinstance(v, (PositionalParam, KeywordParam)):
76
86
  if isinstance(v, KeywordParam):
@@ -81,7 +91,14 @@ class DataFrameFunctionWrapper(FunctionWrapper):
81
91
  isinstance(p[k], DataFrame),
82
92
  lambda: TypeError(f"{p[k]} is not a DataFrame"),
83
93
  )
84
- rargs[k] = v.to_input_data(p[k], ctx=ctx)
94
+ if v.is_per_row:
95
+ assert_or_throw(
96
+ row_param_info is None,
97
+ lambda: ValueError("only one row parameter is allowed"),
98
+ )
99
+ row_param_info = (k, v, p[k])
100
+ else:
101
+ rargs[k] = v.to_input_data(p[k], ctx=ctx)
85
102
  else:
86
103
  rargs[k] = p[k] # TODO: should we do auto type conversion?
87
104
  del p[k]
@@ -91,12 +108,38 @@ class DataFrameFunctionWrapper(FunctionWrapper):
91
108
  rargs.update(p)
92
109
  elif not ignore_unknown and len(p) > 0:
93
110
  raise ValueError(f"{p} are not acceptable parameters")
111
+ if row_param_info is None:
112
+ return self._run_func(rargs, output, output_schema, ctx, raw=False)
113
+ else: # input contains row parameter
114
+
115
+ def _dfs() -> Iterable[Any]:
116
+ k, v, df = row_param_info
117
+ for row in v.to_input_rows(df, ctx):
118
+ rargs[k] = None
119
+ _rargs = rargs.copy()
120
+ _rargs[k] = row
121
+ yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
122
+
123
+ if not output:
124
+ sum(1 for _ in _dfs())
125
+ return
126
+ else:
127
+ return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
128
+
129
+ def _run_func(
130
+ self,
131
+ rargs: Dict[str, Any],
132
+ output: bool,
133
+ output_schema: Any,
134
+ ctx: Any,
135
+ raw: bool,
136
+ ) -> Any:
94
137
  rt = self._func(**rargs)
95
138
  if not output:
96
139
  if isinstance(self._rt, _DataFrameParamBase):
97
140
  self._rt.count(rt)
98
141
  return
99
- if isinstance(self._rt, _DataFrameParamBase):
142
+ if not raw and isinstance(self._rt, _DataFrameParamBase):
100
143
  return self._rt.to_output_df(rt, output_schema, ctx=ctx)
101
144
  return rt
102
145
 
@@ -136,14 +179,30 @@ class _DataFrameParamBase(AnnotatedParam):
136
179
  super().__init__(param)
137
180
  assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
138
181
 
182
+ @property
183
+ def is_per_row(self) -> bool:
184
+ return False
185
+
139
186
  def to_input_data(self, df: DataFrame, ctx: Any) -> Any: # pragma: no cover
140
187
  raise NotImplementedError
141
188
 
189
+ def to_input_rows(
190
+ self,
191
+ df: DataFrame,
192
+ ctx: Any,
193
+ ) -> Iterable[Any]:
194
+ raise NotImplementedError # pragma: no cover
195
+
142
196
  def to_output_df(
143
197
  self, df: Any, schema: Any, ctx: Any
144
198
  ) -> DataFrame: # pragma: no cover
145
199
  raise NotImplementedError
146
200
 
201
+ def iterable_to_output_df(
202
+ self, dfs: Iterable[Any], schema: Any, ctx: Any
203
+ ) -> DataFrame: # pragma: no cover
204
+ raise NotImplementedError
205
+
147
206
  def count(self, df: Any) -> int: # pragma: no cover
148
207
  raise NotImplementedError
149
208
 
@@ -173,6 +232,34 @@ class DataFrameParam(_DataFrameParamBase):
173
232
  return sum(1 for _ in df.as_array_iterable())
174
233
 
175
234
 
235
+ @fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
236
+ class RowParam(_DataFrameParamBase):
237
+ @property
238
+ def is_per_row(self) -> bool:
239
+ return True
240
+
241
+ def count(self, df: Any) -> int:
242
+ return 1
243
+
244
+
245
+ @fugue_annotated_param(Dict[str, Any])
246
+ class DictParam(RowParam):
247
+ def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
248
+ yield from df.as_dict_iterable()
249
+
250
+ def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
251
+ return ArrayDataFrame([list(output.values())], schema)
252
+
253
+ def iterable_to_output_df(
254
+ self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
255
+ ) -> DataFrame: # pragma: no cover
256
+ params: Dict[str, Any] = {}
257
+ if schema is not None:
258
+ params["schema"] = Schema(schema).pa_schema
259
+ adf = pa.Table.from_pylist(list(dfs), **params)
260
+ return ArrowDataFrame(adf)
261
+
262
+
176
263
  @fugue_annotated_param(AnyDataFrame)
177
264
  class _AnyDataFrameParam(DataFrameParam):
178
265
  def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
@@ -198,6 +285,15 @@ class LocalDataFrameParam(DataFrameParam):
198
285
  )
199
286
  return output
200
287
 
288
+ def iterable_to_output_df(
289
+ self, dfs: Iterable[Any], schema: Any, ctx: Any
290
+ ) -> DataFrame: # pragma: no cover
291
+ def _dfs() -> Iterable[DataFrame]:
292
+ for df in dfs:
293
+ yield self.to_output_df(df, schema, ctx)
294
+
295
+ return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
296
+
201
297
  def count(self, df: LocalDataFrame) -> int:
202
298
  if df.is_bounded:
203
299
  return df.count()
@@ -228,10 +324,7 @@ class _ListListParam(_LocalNoSchemaDataFrameParam):
228
324
  return len(df)
229
325
 
230
326
 
231
- @fugue_annotated_param(
232
- Iterable[List[Any]],
233
- matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]],
234
- )
327
+ @fugue_annotated_param(Iterable[List[Any]], matcher=_compare_iter(List[Any]))
235
328
  class _IterableListParam(_LocalNoSchemaDataFrameParam):
236
329
  @no_type_check
237
330
  def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]:
@@ -288,10 +381,7 @@ class _ListDictParam(_LocalNoSchemaDataFrameParam):
288
381
  return len(df)
289
382
 
290
383
 
291
- @fugue_annotated_param(
292
- Iterable[Dict[str, Any]],
293
- matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]],
294
- )
384
+ @fugue_annotated_param(Iterable[Dict[str, Any]], matcher=_compare_iter(Dict[str, Any]))
295
385
  class _IterableDictParam(_LocalNoSchemaDataFrameParam):
296
386
  @no_type_check
297
387
  def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]:
@@ -360,10 +450,7 @@ class _PandasParam(LocalDataFrameParam):
360
450
  return "pandas"
361
451
 
362
452
 
363
- @fugue_annotated_param(
364
- Iterable[pd.DataFrame],
365
- matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame],
366
- )
453
+ @fugue_annotated_param(Iterable[pd.DataFrame], matcher=_compare_iter(pd.DataFrame))
367
454
  class _IterablePandasParam(LocalDataFrameParam):
368
455
  @no_type_check
369
456
  def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]:
@@ -419,10 +506,7 @@ class _PyArrowTableParam(LocalDataFrameParam):
419
506
  return "pyarrow"
420
507
 
421
508
 
422
- @fugue_annotated_param(
423
- Iterable[pa.Table],
424
- matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table],
425
- )
509
+ @fugue_annotated_param(Iterable[pa.Table], matcher=_compare_iter(pa.Table))
426
510
  class _IterableArrowParam(LocalDataFrameParam):
427
511
  @no_type_check
428
512
  def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
375
375
  assert_arg_not_none(schema, "schema")
376
376
  tr = _FuncAsTransformer()
377
377
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
378
- func, "^[lspq][fF]?x*z?$", "^[lspq]$"
378
+ func, "^[lspqr][fF]?x*z?$", "^[lspqr]$"
379
379
  )
380
380
  tr._output_schema_arg = schema # type: ignore
381
381
  tr._validation_rules = validation_rules # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
410
410
  validation_rules.update(parse_validation_rules_from_comment(func))
411
411
  tr = _FuncAsOutputTransformer()
412
412
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
413
- func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
413
+ func, "^[lspqr][fF]?x*z?$", "^[lspnqr]$"
414
414
  )
415
415
  tr._output_schema_arg = None # type: ignore
416
416
  tr._validation_rules = validation_rules # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
503
503
  assert_arg_not_none(schema, "schema")
504
504
  tr = _FuncAsCoTransformer()
505
505
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
506
- func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
506
+ func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
507
507
  )
508
508
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
509
509
  tr._output_schema_arg = schema # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
562
562
 
563
563
  tr = _FuncAsOutputCoTransformer()
564
564
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
565
- func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
565
+ func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
566
566
  )
567
567
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
568
568
  tr._output_schema_arg = None # type: ignore
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: fugue
3
- Version: 0.9.0.dev4
3
+ Version: 0.9.2.dev1
4
4
  Summary: An abstraction layer for distributed computation
5
5
  Home-page: http://github.com/fugue-project/fugue
6
6
  Author: The Fugue Development Team
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: >=3.8
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: triad >=0.9.6
23
+ Requires-Dist: triad >=0.9.7
24
24
  Requires-Dist: adagio >=0.2.4
25
25
  Provides-Extra: all
26
26
  Requires-Dist: qpd >=0.4.4 ; extra == 'all'
@@ -31,7 +31,7 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
31
31
  fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
32
32
  fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
33
33
  fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
34
- fugue/dataframe/function_wrapper.py,sha256=V1eQMOn27UroEYT7_YiwoEF0RjZYIM0zkD3vfaMAQFs,14813
34
+ fugue/dataframe/function_wrapper.py,sha256=cG-0ICf-WxgxmEJIdU2jx4GzYhVfP69AyzjTjNDpXGE,17710
35
35
  fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
36
36
  fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
37
37
  fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
@@ -61,7 +61,7 @@ fugue/extensions/processor/convert.py,sha256=zG0lMtHGwY5TsqK4eplbMdlTg7J_PD3HbI0
61
61
  fugue/extensions/processor/processor.py,sha256=czhQlQgMpAXXoLVAX9Q0TFUMYEEhsgufTammxcKSmOY,1665
62
62
  fugue/extensions/transformer/__init__.py,sha256=VD6d-8xW1Yl8fUPj43cBWNR9pCOlYD9xWyGIHAlHwvI,456
63
63
  fugue/extensions/transformer/constants.py,sha256=76DfpoTOGQ8gp5XtCs_xznfbr_H015-prXpHWSqMNDU,59
64
- fugue/extensions/transformer/convert.py,sha256=5fhktR2s13ZOpUihpy-gy7Xn2BRN6UoA5uwOzJ6YNOU,23380
64
+ fugue/extensions/transformer/convert.py,sha256=SU_KvzZp_nV8oCxZGx7qDsdCE0CJ--8UAp5m8z4d4HY,23386
65
65
  fugue/extensions/transformer/transformer.py,sha256=zhOUgyv5-DPxYd1CP_98WeEw-zUgwknRnPW_6di-q3g,9098
66
66
  fugue/rpc/__init__.py,sha256=3GzUl4QZQuCChjD7eaTJW8tnTwfke6ZY9r9g5nCeBZ8,167
67
67
  fugue/rpc/base.py,sha256=3Fq5SvwLZqw9NXru3r32WuJKBGFr9bl7nFgy6e9boGo,8470
@@ -127,7 +127,7 @@ fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
127
127
  fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
128
  fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
129
129
  fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
130
- fugue_ray/_utils/io.py,sha256=3hFNDeBuh4bfCud40ZsGrGZLSvCSuxL_1VlqCTnn6RA,9794
130
+ fugue_ray/_utils/io.py,sha256=Dz0WuQrh_8Ix7jU5viFIA6caJcfxV4ew0ruBZLQbD1s,9930
131
131
  fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
132
132
  fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
133
133
  fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
@@ -137,20 +137,20 @@ fugue_spark/tester.py,sha256=VX003yGNlBukaZTQSN-w7XvgSk4rqxrWQIzno0dWrXg,2481
137
137
  fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
138
138
  fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
139
139
  fugue_spark/_utils/io.py,sha256=OdUezKpB29Lx9aUS2k9x0xUAGZrmgMZyQYGPEeHk7rQ,5574
140
- fugue_spark/_utils/misc.py,sha256=o8dZmXOHnA7D_ps37vgGXTPTiSEG9LQzPKq7l-MG-qM,860
140
+ fugue_spark/_utils/misc.py,sha256=9LsbBp6nOEhqXFLr8oWTc3VKzKk-vuVyixlRoquGnEs,858
141
141
  fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
142
142
  fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
143
143
  fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
144
144
  fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
145
145
  fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
146
- fugue_test/builtin_suite.py,sha256=cOkZG6w1RHhWWxtjQhZClZQaGT6haNd576BoUmNC_cA,77960
146
+ fugue_test/builtin_suite.py,sha256=BpGwa66cAUuuc7ULOsPP3ax8IKQtNIPoSmlUFgqUKQk,79252
147
147
  fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
148
148
  fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
149
149
  fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
150
- fugue_version/__init__.py,sha256=H9NWRZb7NbeRRPLP_V1fARmLNXranorVM-OOY-8_2ug,22
151
- fugue-0.9.0.dev4.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
152
- fugue-0.9.0.dev4.dist-info/METADATA,sha256=smbI6QuuMajmoMhJ14Y4MUs2mGpb4onc6kImR83D9DQ,18385
153
- fugue-0.9.0.dev4.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
154
- fugue-0.9.0.dev4.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
155
- fugue-0.9.0.dev4.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
156
- fugue-0.9.0.dev4.dist-info/RECORD,,
150
+ fugue_version/__init__.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
151
+ fugue-0.9.2.dev1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
152
+ fugue-0.9.2.dev1.dist-info/METADATA,sha256=0ML_xHPma0CaGuGtZn45doFbLu2UtzB-VlPX4koFLYg,18385
153
+ fugue-0.9.2.dev1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
154
+ fugue-0.9.2.dev1.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
155
+ fugue-0.9.2.dev1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
156
+ fugue-0.9.2.dev1.dist-info/RECORD,,
fugue_ray/_utils/io.py CHANGED
@@ -7,7 +7,7 @@ import ray.data as rd
7
7
  from packaging import version
8
8
  from pyarrow import csv as pacsv
9
9
  from pyarrow import json as pajson
10
- from ray.data.datasource import FileExtensionFilter
10
+
11
11
  from triad.collections import Schema
12
12
  from triad.collections.dict import ParamDict
13
13
  from triad.utils.assertion import assert_or_throw
@@ -21,6 +21,27 @@ from fugue_ray.dataframe import RayDataFrame
21
21
 
22
22
  from .._constants import RAY_VERSION
23
23
 
24
+ try:
25
+ from ray.data.datasource import FileExtensionFilter
26
+
27
+ class _FileFiler(FileExtensionFilter): # pragma: no cover
28
+ def __init__(
29
+ self, file_extensions: Union[str, List[str]], exclude: Iterable[str]
30
+ ):
31
+ super().__init__(file_extensions, allow_if_no_extension=True)
32
+ self._exclude = set(exclude)
33
+
34
+ def _is_valid(self, path: str) -> bool:
35
+ return pathlib.Path(
36
+ path
37
+ ).name not in self._exclude and self._file_has_extension(path)
38
+
39
+ def __call__(self, paths: List[str]) -> List[str]:
40
+ return [path for path in paths if self._is_valid(path)]
41
+
42
+ except ImportError: # pragma: no cover
43
+ pass # ray >=2.10
44
+
24
45
 
25
46
  class RayIO(object):
26
47
  def __init__(self, engine: ExecutionEngine):
@@ -248,17 +269,3 @@ class RayIO(object):
248
269
 
249
270
  def _remote_args(self) -> Dict[str, Any]:
250
271
  return {"num_cpus": 1}
251
-
252
-
253
- class _FileFiler(FileExtensionFilter): # pragma: no cover
254
- def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
255
- super().__init__(file_extensions, allow_if_no_extension=True)
256
- self._exclude = set(exclude)
257
-
258
- def _is_valid(self, path: str) -> bool:
259
- return pathlib.Path(
260
- path
261
- ).name not in self._exclude and self._file_has_extension(path)
262
-
263
- def __call__(self, paths: List[str]) -> List[str]:
264
- return [path for path in paths if self._is_valid(path)]
@@ -3,7 +3,7 @@ from typing import Any
3
3
  try:
4
4
  from pyspark.sql.connect.session import SparkSession as SparkConnectSession
5
5
  from pyspark.sql.connect.dataframe import DataFrame as SparkConnectDataFrame
6
- except ImportError: # pragma: no cover
6
+ except Exception: # pragma: no cover
7
7
  SparkConnectSession = None
8
8
  SparkConnectDataFrame = None
9
9
  import pyspark.sql as ps
@@ -486,6 +486,23 @@ class BuiltInTests(object):
486
486
  dag.df([], "a:int,b:int").assert_eq(b)
487
487
  dag.run(self.engine)
488
488
 
489
+ def test_transform_row_wise(self):
490
+ def t1(row: Dict[str, Any]) -> Dict[str, Any]:
491
+ row["b"] = 1
492
+ return row
493
+
494
+ def t2(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
495
+ return rows[0]
496
+
497
+ with fa.engine_context(self.engine):
498
+ a = pd.DataFrame([[3, 4], [1, 2], [3, 5]], columns=["a", "b"])
499
+ b = fa.transform(a, t1, schema="*")
500
+ assert sorted(fa.as_array(b)) == [[1, 1], [3, 1], [3, 1]]
501
+ b = fa.transform(
502
+ a, t2, schema="*", partition={"by": "a", "presort": "b"}
503
+ )
504
+ assert sorted(fa.as_array(b)) == [[1, 2], [3, 4]]
505
+
489
506
  def test_transform_binary(self):
490
507
  with FugueWorkflow() as dag:
491
508
  a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
@@ -548,6 +565,8 @@ class BuiltInTests(object):
548
565
  e = dag.df([[1, 2, 1, 10]], "a:int,ct1:int,ct2:int,x:int")
549
566
  e.assert_eq(c)
550
567
 
568
+ a.zip(b).transform(mock_co_tf1_d, params=dict(p=10)).assert_eq(e)
569
+
551
570
  # interfaceless
552
571
  c = dag.transform(
553
572
  a.zip(b),
@@ -676,6 +695,13 @@ class BuiltInTests(object):
676
695
  incr()
677
696
  yield pa.Table.from_pandas(df)
678
697
 
698
+ def t11(row: Dict[str, Any]) -> Dict[str, Any]:
699
+ incr()
700
+ return row
701
+
702
+ def t12(row: Dict[str, Any]) -> None:
703
+ incr()
704
+
679
705
  with FugueWorkflow() as dag:
680
706
  a = dag.df([[1, 2], [3, 4]], "a:double,b:int")
681
707
  a.out_transform(t1) # +2
@@ -688,6 +714,8 @@ class BuiltInTests(object):
688
714
  a.out_transform(t8, ignore_errors=[NotImplementedError]) # +1
689
715
  a.out_transform(t9) # +1
690
716
  a.out_transform(t10) # +1
717
+ a.out_transform(t11) # +2
718
+ a.out_transform(t12) # +2
691
719
  raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t2))
692
720
  raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t3))
693
721
  raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t4))
@@ -695,7 +723,7 @@ class BuiltInTests(object):
695
723
  raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(T7))
696
724
  dag.run(self.engine)
697
725
 
698
- assert 13 <= incr()
726
+ assert 17 <= incr()
699
727
 
700
728
  def test_out_cotransform(self): # noqa: C901
701
729
  tmpdir = str(self.tmpdir)
@@ -2001,6 +2029,13 @@ def mock_co_tf1(
2001
2029
  return [[df1[0]["a"], len(df1), len(df2), p]]
2002
2030
 
2003
2031
 
2032
+ @cotransformer(lambda dfs, **kwargs: "a:int,ct1:int,ct2:int,x:int")
2033
+ def mock_co_tf1_d(
2034
+ df1: List[Dict[str, Any]], df2: List[List[Any]], p=1
2035
+ ) -> Dict[str, Any]:
2036
+ return dict(a=df1[0]["a"], ct1=len(df1), ct2=len(df2), x=p)
2037
+
2038
+
2004
2039
  def mock_co_tf2(dfs: DataFrames, p=1) -> List[List[Any]]:
2005
2040
  return [[dfs[0].peek_dict()["a"], dfs[0].count(), dfs[1].count(), p]]
2006
2041
 
fugue_version/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.9.0"
1
+ __version__ = "0.9.2"