fugue 0.9.1-py3-none-any.whl → 0.9.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fugue/_utils/io.py CHANGED
@@ -20,6 +20,10 @@ class FileParser(object):
         self._has_glob = "*" in path or "?" in path
         self._raw_path = path
         self._fs, self._fs_path = url_to_fs(path)
+        if not self._has_glob and self._fs.isdir(self._fs_path):
+            self._is_dir = True
+        else:
+            self._is_dir = False
         if not self.is_local:
             self._path = self._fs.unstrip_protocol(self._fs_path)
         else:
@@ -43,11 +47,15 @@ class FileParser(object):
         return self

     @property
-    def has_glob(self):
+    def is_dir(self) -> bool:
+        return self._is_dir
+
+    @property
+    def has_glob(self) -> bool:
         return self._has_glob

     @property
-    def is_local(self):
+    def is_local(self) -> bool:
         return isinstance(self._fs, LocalFileSystem)

     def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":
@@ -65,6 +73,10 @@ class FileParser(object):
     def path(self) -> str:
         return self._path

+    def as_dir_path(self) -> str:
+        assert_or_throw(self.is_dir, f"{self.raw_path} is not a directory")
+        return self.path + self._fs.sep
+
     @property
     def raw_path(self) -> str:
         return self._raw_path
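
Note: the new is_dir flag is computed once in the constructor (one extra isdir call for every non-glob path), and as_dir_path appends the filesystem separator so callers can hand a whole directory to a loader. A minimal sketch of the intended behavior of this internal helper (assuming /tmp/data exists as a local directory):

    from fugue._utils.io import FileParser

    fp = FileParser("/tmp/data")          # no glob characters, existing directory
    assert fp.is_dir and not fp.has_glob
    print(fp.as_dir_path())               # "/tmp/data/" -- trailing separator added
    assert FileParser("/tmp/data/*.parquet").has_glob  # glob paths are never is_dir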
fugue/dataframe/function_wrapper.py CHANGED
@@ -80,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
         p.update(kwargs)
         has_kw = False
         rargs: Dict[str, Any] = {}
+        row_param_info: Any = None
         for k, v in self._params.items():
             if isinstance(v, (PositionalParam, KeywordParam)):
                 if isinstance(v, KeywordParam):
@@ -90,7 +91,16 @@ class DataFrameFunctionWrapper(FunctionWrapper):
                     isinstance(p[k], DataFrame),
                     lambda: TypeError(f"{p[k]} is not a DataFrame"),
                 )
-                rargs[k] = v.to_input_data(p[k], ctx=ctx)
+                if v.is_per_row:  # pragma: no cover
+                    # TODO: this branch is used only if row annotations
+                    # are allowed as input
+                    assert_or_throw(
+                        row_param_info is None,
+                        lambda: ValueError("only one row parameter is allowed"),
+                    )
+                    row_param_info = (k, v, p[k])
+                else:
+                    rargs[k] = v.to_input_data(p[k], ctx=ctx)
             else:
                 rargs[k] = p[k]  # TODO: should we do auto type conversion?
             del p[k]
@@ -100,12 +110,40 @@ class DataFrameFunctionWrapper(FunctionWrapper):
             rargs.update(p)
         elif not ignore_unknown and len(p) > 0:
             raise ValueError(f"{p} are not acceptable parameters")
+        if row_param_info is None:
+            return self._run_func(rargs, output, output_schema, ctx, raw=False)
+        else:  # pragma: no cover
+            # input contains row parameter
+            # TODO: this branch is used only if row annotations are allowed as input
+
+            def _dfs() -> Iterable[Any]:
+                k, v, df = row_param_info
+                for row in v.to_input_rows(df, ctx):
+                    rargs[k] = None
+                    _rargs = rargs.copy()
+                    _rargs[k] = row
+                    yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
+
+            if not output:
+                sum(1 for _ in _dfs())
+                return
+            else:
+                return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
+
+    def _run_func(
+        self,
+        rargs: Dict[str, Any],
+        output: bool,
+        output_schema: Any,
+        ctx: Any,
+        raw: bool,
+    ) -> Any:
         rt = self._func(**rargs)
         if not output:
             if isinstance(self._rt, _DataFrameParamBase):
                 self._rt.count(rt)
             return
-        if isinstance(self._rt, _DataFrameParamBase):
+        if not raw and isinstance(self._rt, _DataFrameParamBase):
             return self._rt.to_output_df(rt, output_schema, ctx=ctx)
         return rt

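Note: run is now split into an argument-dispatch step and _run_func. When a parameter is marked is_per_row, the wrapped function is called once per input row and the per-call results are reassembled through iterable_to_output_df; raw=True skips the usual single-shot output conversion because that reassembly happens at the end. The branch stays under pragma: no cover because row annotations are not yet accepted as input. If they were enabled, a transformer body would look roughly like this (hypothetical, mirroring the disabled _test_transform_row_wise test later in this diff):

    from typing import Any, Dict

    # hypothetical: Dict[str, Any] as a per-row input is not enabled in 0.9.2
    def add_flag(row: Dict[str, Any]) -> Dict[str, Any]:
        row["b"] = 1  # invoked once per row; outputs are stitched back into a dataframe
        return row
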
@@ -120,6 +158,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
         annotation == Callable
         or annotation == callable  # pylint: disable=comparison-with-callable
         or str(annotation).startswith("typing.Callable")
+        or str(annotation).startswith("collections.abc.Callable")
     ),
 )
 class _CallableParam(AnnotatedParam):
@@ -134,6 +173,9 @@ class _CallableParam(AnnotatedParam):
         or annotation == Optional[callable]
         or str(annotation).startswith("typing.Union[typing.Callable")  # 3.8-
         or str(annotation).startswith("typing.Optional[typing.Callable")  # 3.9+
+        or str(annotation).startswith(
+            "typing.Optional[collections.abc.Callable]"
+        )  # 3.9+
     ),
 )
 class _OptionalCallableParam(AnnotatedParam):
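
Note: on Python 3.9+ (PEP 585), collections.abc.Callable is subscriptable, so annotations written without typing stringify with a different prefix; the added startswith checks cover that spelling:

    import collections.abc
    import typing

    str(typing.Callable[[int], int])            # 'typing.Callable[[int], int]'
    str(collections.abc.Callable[[int], int])   # 'collections.abc.Callable[[int], int]'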
@@ -145,14 +187,30 @@ class _DataFrameParamBase(AnnotatedParam):
         super().__init__(param)
         assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))

+    @property
+    def is_per_row(self) -> bool:
+        return False
+
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:  # pragma: no cover
         raise NotImplementedError

+    def to_input_rows(
+        self,
+        df: DataFrame,
+        ctx: Any,
+    ) -> Iterable[Any]:
+        raise NotImplementedError  # pragma: no cover
+
     def to_output_df(
         self, df: Any, schema: Any, ctx: Any
     ) -> DataFrame:  # pragma: no cover
         raise NotImplementedError

+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        raise NotImplementedError
+
     def count(self, df: Any) -> int:  # pragma: no cover
         raise NotImplementedError

@@ -182,6 +240,36 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())


+@fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
+class RowParam(_DataFrameParamBase):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
+    @property
+    def is_per_row(self) -> bool:
+        return True
+
+    def count(self, df: Any) -> int:
+        return 1
+
+
+@fugue_annotated_param(Dict[str, Any])
+class DictParam(RowParam):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
+    def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
+        yield from df.as_dict_iterable()
+
+    def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
+        return ArrayDataFrame([list(output.values())], schema)
+
+    def iterable_to_output_df(
+        self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        params: Dict[str, Any] = {}
+        if schema is not None:
+            params["schema"] = Schema(schema).pa_schema
+        adf = pa.Table.from_pylist(list(dfs), **params)
+        return ArrowDataFrame(adf)
+
+
 @fugue_annotated_param(AnyDataFrame)
 class _AnyDataFrameParam(DataFrameParam):
     def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
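
Note: DictParam.iterable_to_output_df relies on pyarrow.Table.from_pylist, which builds a table directly from a list of dicts, inferring the schema when none is given. For reference:

    import pyarrow as pa

    rows = [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]
    tbl = pa.Table.from_pylist(rows)  # schema inferred as a: int64, b: string
    tbl = pa.Table.from_pylist(
        rows, schema=pa.schema([("a", pa.int64()), ("b", pa.string())])
    )  # or with an explicit schema, as DictParam does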
@@ -207,6 +295,15 @@ class LocalDataFrameParam(DataFrameParam):
         )
         return output

+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        def _dfs() -> Iterable[DataFrame]:
+            for df in dfs:
+                yield self.to_output_df(df, schema, ctx)
+
+        return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
+
     def count(self, df: LocalDataFrame) -> int:
         if df.is_bounded:
             return df.count()
fugue/extensions/transformer/convert.py CHANGED
@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspq]$"
+            func, "^[lspq][fF]?x*z?$", "^[lspqr]$"
         )
         tr._output_schema_arg = schema  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
         validation_rules.update(parse_validation_rules_from_comment(func))
         tr = _FuncAsOutputTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
+            func, "^[lspq][fF]?x*z?$", "^[lspnqr]$"
         )
         tr._output_schema_arg = None  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = schema  # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):

         tr = _FuncAsOutputCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = None  # type: ignore
fugue-0.9.1.dist-info/METADATA → fugue-0.9.2.dist-info/METADATA RENAMED
@@ -1,13 +1,12 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: fugue
-Version: 0.9.1
+Version: 0.9.2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
 Author-email: hello@fugue.ai
 License: Apache-2.0
 Keywords: distributed spark dask ray sql dsl domain specific language
-Platform: UNKNOWN
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
@@ -17,67 +16,81 @@ Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-Requires-Dist: triad >=0.9.7
-Requires-Dist: adagio >=0.2.4
-Provides-Extra: all
-Requires-Dist: qpd >=0.4.4 ; extra == 'all'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'all'
-Requires-Dist: sqlglot ; extra == 'all'
-Requires-Dist: jinja2 ; extra == 'all'
-Requires-Dist: pyspark >=3.1.1 ; extra == 'all'
-Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'all'
-Requires-Dist: dask-sql ; extra == 'all'
-Requires-Dist: ray[data] >=2.5.0 ; extra == 'all'
-Requires-Dist: notebook ; extra == 'all'
-Requires-Dist: jupyterlab ; extra == 'all'
-Requires-Dist: ipython >=7.10.0 ; extra == 'all'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'all'
-Requires-Dist: pyarrow >=6.0.1 ; extra == 'all'
-Requires-Dist: pandas <2.2,>=2.0.2 ; extra == 'all'
-Requires-Dist: ibis-framework ; extra == 'all'
-Requires-Dist: polars ; extra == 'all'
-Provides-Extra: cpp_sql_parser
-Requires-Dist: fugue-sql-antlr[cpp] >=0.2.0 ; extra == 'cpp_sql_parser'
+License-File: LICENSE
+Requires-Dist: triad>=1.0.0
+Requires-Dist: adagio>=0.2.6
+Provides-Extra: sql
+Requires-Dist: qpd>=0.4.4; extra == "sql"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "sql"
+Requires-Dist: sqlglot; extra == "sql"
+Requires-Dist: jinja2; extra == "sql"
+Provides-Extra: cpp-sql-parser
+Requires-Dist: fugue-sql-antlr[cpp]>=0.2.0; extra == "cpp-sql-parser"
+Provides-Extra: spark
+Requires-Dist: pyspark>=3.1.1; extra == "spark"
 Provides-Extra: dask
-Requires-Dist: dask[dataframe,distributed] >=2023.5.0 ; extra == 'dask'
-Requires-Dist: pyarrow >=7.0.0 ; extra == 'dask'
-Requires-Dist: pandas >=2.0.2 ; extra == 'dask'
-Requires-Dist: dask[dataframe,distributed] >=2024.4.0 ; (python_version >= "3.11.9") and extra == 'dask'
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "dask"
+Requires-Dist: pyarrow>=7.0.0; extra == "dask"
+Requires-Dist: pandas>=2.0.2; extra == "dask"
+Provides-Extra: ray
+Requires-Dist: ray[data]>=2.30.0; extra == "ray"
+Requires-Dist: duckdb>=0.5.0; extra == "ray"
+Requires-Dist: pyarrow>=7.0.0; extra == "ray"
+Requires-Dist: pandas<2.2; extra == "ray"
 Provides-Extra: duckdb
-Requires-Dist: qpd >=0.4.4 ; extra == 'duckdb'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'duckdb'
-Requires-Dist: sqlglot ; extra == 'duckdb'
-Requires-Dist: jinja2 ; extra == 'duckdb'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'duckdb'
-Requires-Dist: numpy ; extra == 'duckdb'
+Requires-Dist: qpd>=0.4.4; extra == "duckdb"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "duckdb"
+Requires-Dist: sqlglot; extra == "duckdb"
+Requires-Dist: jinja2; extra == "duckdb"
+Requires-Dist: duckdb>=0.5.0; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
+Provides-Extra: polars
+Requires-Dist: polars; extra == "polars"
 Provides-Extra: ibis
-Requires-Dist: qpd >=0.4.4 ; extra == 'ibis'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'ibis'
-Requires-Dist: sqlglot ; extra == 'ibis'
-Requires-Dist: jinja2 ; extra == 'ibis'
-Requires-Dist: ibis-framework ; extra == 'ibis'
-Requires-Dist: pandas <2.2 ; extra == 'ibis'
+Requires-Dist: qpd>=0.4.4; extra == "ibis"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "ibis"
+Requires-Dist: sqlglot; extra == "ibis"
+Requires-Dist: jinja2; extra == "ibis"
+Requires-Dist: ibis-framework[pandas]; extra == "ibis"
+Requires-Dist: pandas<2.2; extra == "ibis"
 Provides-Extra: notebook
-Requires-Dist: notebook ; extra == 'notebook'
-Requires-Dist: jupyterlab ; extra == 'notebook'
-Requires-Dist: ipython >=7.10.0 ; extra == 'notebook'
-Provides-Extra: polars
-Requires-Dist: polars ; extra == 'polars'
-Provides-Extra: ray
-Requires-Dist: ray[data] >=2.5.0 ; extra == 'ray'
-Requires-Dist: duckdb >=0.5.0 ; extra == 'ray'
-Requires-Dist: pyarrow >=7.0.0 ; extra == 'ray'
-Requires-Dist: pandas <2.2 ; extra == 'ray'
-Provides-Extra: spark
-Requires-Dist: pyspark >=3.1.1 ; extra == 'spark'
-Provides-Extra: sql
-Requires-Dist: qpd >=0.4.4 ; extra == 'sql'
-Requires-Dist: fugue-sql-antlr >=0.2.0 ; extra == 'sql'
-Requires-Dist: sqlglot ; extra == 'sql'
-Requires-Dist: jinja2 ; extra == 'sql'
+Requires-Dist: notebook; extra == "notebook"
+Requires-Dist: jupyterlab; extra == "notebook"
+Requires-Dist: ipython>=7.10.0; extra == "notebook"
+Provides-Extra: all
+Requires-Dist: qpd>=0.4.4; extra == "all"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "all"
+Requires-Dist: sqlglot; extra == "all"
+Requires-Dist: jinja2; extra == "all"
+Requires-Dist: pyspark>=3.1.1; extra == "all"
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "all"
+Requires-Dist: dask-sql; extra == "all"
+Requires-Dist: ray[data]>=2.30.0; extra == "all"
+Requires-Dist: notebook; extra == "all"
+Requires-Dist: jupyterlab; extra == "all"
+Requires-Dist: ipython>=7.10.0; extra == "all"
+Requires-Dist: duckdb>=0.5.0; extra == "all"
+Requires-Dist: pyarrow>=6.0.1; extra == "all"
+Requires-Dist: pandas<2.2,>=2.0.2; extra == "all"
+Requires-Dist: ibis-framework[duckdb,pandas]; extra == "all"
+Requires-Dist: polars; extra == "all"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary

 # Fugue

@@ -355,4 +368,3 @@ View some of our latest conferences presentations and content. For a more comple
 * [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
 * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
 * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
-
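
Note: beyond the version bump, the metadata is regenerated by a newer setuptools as Metadata-Version 2.4 (hence the License-File and Dynamic fields and the reordered extras). The substantive dependency changes: triad is raised to >=1.0.0 and adagio to >=0.2.6, the dask floor becomes an unconditional >=2024.4.0, ray[data] moves from >=2.5.0 to >=2.30.0, the ibis extras now pull ibis-framework[pandas] (and [duckdb,pandas] under all), Python 3.12 is declared supported, and the cpp_sql_parser extra is renamed to the PEP 685-normalized cpp-sql-parser (installers normalize requested extras, so pip install "fugue[cpp_sql_parser]" should still resolve).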
fugue-0.9.1.dist-info/RECORD → fugue-0.9.2.dist-info/RECORD RENAMED
@@ -10,7 +10,7 @@ fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/_utils/display.py,sha256=JV8oDA7efHm1wceZulCBOY5dMvjbWHvIm6ASisKfoWY,3164
 fugue/_utils/exception.py,sha256=SFIjwjV4CIEovp3P9k7ePNOFB12A5D8hDdhtfFUeM5Y,2247
 fugue/_utils/interfaceless.py,sha256=wI0H6L4W_1uQjh9tpjgT9HzN-fbrrtXXHC1x6Q_rrPg,2203
-fugue/_utils/io.py,sha256=adrtj6Dq0ti426DNlkliApbTkp8b3bfBysAiE5MVQVc,9265
+fugue/_utils/io.py,sha256=5twd99LBzHtIMT67il1qwnEUa5n13WZmVKNd1shO4No,9649
 fugue/_utils/misc.py,sha256=_huy0eylmRTEFoReGR2M4rbAI8m79hFcfY5bDceVEXU,887
 fugue/_utils/registry.py,sha256=lrbzTdUEVnW6paBGDj-Yb-aTIbP5mjCqrXuRU9_N6os,316
 fugue/bag/__init__.py,sha256=0Q0_rnrEThrTx2U-1xGNyAg95idp_xcnywymIcW4Xck,46
@@ -31,7 +31,7 @@ fugue/dataframe/arrow_dataframe.py,sha256=r5zcZBX_N6XO5dmixBkTCPgLcMmgDF022piZvr
 fugue/dataframe/dataframe.py,sha256=xmyG85i14A6LDRkNmPt29oYq7PJsq668s1QvFHK8PV4,16964
 fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=hOZF3GmwpxqwqKi9-pEOAPZSW1ZFyB47hLxRrGyOiuM,14855
+fugue/dataframe/function_wrapper.py,sha256=7Sb6XrWTD_swtHJbHDWZRxHvFNWkERynnCDzLM0wSbo,18340
 fugue/dataframe/iterable_dataframe.py,sha256=TcOoNKa4jNbHbvAZ0XAhtMmGcioygIHPxI9budDtenQ,4758
 fugue/dataframe/pandas_dataframe.py,sha256=0L0wYCGhD2BpQbruoT07Ox9iQM5YLHLNrcgzudc-yKs,11633
 fugue/dataframe/utils.py,sha256=bA_otOJt9oju1yq5gtn21L_GDT_pUgNc6luYuBIhbUQ,10488
@@ -61,7 +61,7 @@ fugue/extensions/processor/convert.py,sha256=zG0lMtHGwY5TsqK4eplbMdlTg7J_PD3HbI0
 fugue/extensions/processor/processor.py,sha256=czhQlQgMpAXXoLVAX9Q0TFUMYEEhsgufTammxcKSmOY,1665
 fugue/extensions/transformer/__init__.py,sha256=VD6d-8xW1Yl8fUPj43cBWNR9pCOlYD9xWyGIHAlHwvI,456
 fugue/extensions/transformer/constants.py,sha256=76DfpoTOGQ8gp5XtCs_xznfbr_H015-prXpHWSqMNDU,59
-fugue/extensions/transformer/convert.py,sha256=5fhktR2s13ZOpUihpy-gy7Xn2BRN6UoA5uwOzJ6YNOU,23380
+fugue/extensions/transformer/convert.py,sha256=zDDIpZawMnHFarjZNZAyiw1jfyXGuPjnvgQk9jpYLak,23384
 fugue/extensions/transformer/transformer.py,sha256=zhOUgyv5-DPxYd1CP_98WeEw-zUgwknRnPW_6di-q3g,9098
 fugue/rpc/__init__.py,sha256=3GzUl4QZQuCChjD7eaTJW8tnTwfke6ZY9r9g5nCeBZ8,167
 fugue/rpc/base.py,sha256=3Fq5SvwLZqw9NXru3r32WuJKBGFr9bl7nFgy6e9boGo,8470
@@ -82,6 +82,7 @@ fugue/workflow/api.py,sha256=uQoxPSCZ91-ST4vwuPWG7qioRGW4eo-Sgi3DdwtSL4k,12495
 fugue/workflow/input.py,sha256=V_zLDNzndmQuYJAPXtdK4n-vOp7LrimGIf_wQtwf2mc,321
 fugue/workflow/module.py,sha256=ajyqgMwX6hFMZY9xp4Bp1Q-Zdta0p5f_W_n_SNrc4LE,5547
 fugue/workflow/workflow.py,sha256=-SFCXkyxgXbS6DpQGSBox4d3Ws3psIlB6PnraJLSu9Y,88219
+fugue-0.9.2.dist-info/licenses/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
 fugue_contrib/__init__.py,sha256=QJioX-r2AiU7Pvt24M-k2c4vNq29qpK-3WNUde7ucck,222
 fugue_contrib/contrib.py,sha256=3B--6oIVBMZ-GwjIOXwZqYqkloH7Cxfq1I8vkwl2yPk,267
 fugue_contrib/seaborn/__init__.py,sha256=NuVv8EI4Om4gHcHwYO8ddextLQqw24vDj8qJio3E1MU,1405
@@ -89,25 +90,26 @@ fugue_contrib/viz/__init__.py,sha256=osgZx63Br-yMZImyEfYf9MVzJNM2Cqqke_-WsuDmG5M
 fugue_contrib/viz/_ext.py,sha256=Lu_DlS5DcmrFz27fHcKTCkhKyknVWcfS5kzZVVuO9xM,1345
 fugue_dask/__init__.py,sha256=2CcJ0AsN-k_f7dZ-yAyYpaICfUMPfH3l0FvUJSBzTr0,161
 fugue_dask/_constants.py,sha256=35UmTVITk21GhRyRlbJOwPPdQsytM_p_2NytOXEay18,510
+fugue_dask/_dask_sql_wrapper.py,sha256=lj38gJIOdoMV9W44gpwzLjUEtPVsQNKjRWuEkfI7-PM,2618
 fugue_dask/_io.py,sha256=pl4F7mbVgP7Rwh1FFG7xfOz2TBZRUj1l3lLvDY4jOf4,6020
-fugue_dask/_utils.py,sha256=1uplEqvpCDZDp2YdwJxa6cuGScpgG9VvN3057J02bys,8956
-fugue_dask/dataframe.py,sha256=MuG9TqCND7qI66lPvxzuomfE7yA4sW7DjrvbyvE6XEU,13471
-fugue_dask/execution_engine.py,sha256=60IiwYRBVhN-pX3v6i9BZ8Pa4bcSh5UoklvCScM_XAM,21361
+fugue_dask/_utils.py,sha256=0R0pCh4B47kQsAS_o0QGaosIqVcZnSakm6pfMB7fSXs,9059
+fugue_dask/dataframe.py,sha256=4Dvckpc4mlld2WsEFTTemxoA1zYK8Cn6jMKxUxYQCEE,13491
+fugue_dask/execution_engine.py,sha256=mFN_IurhdBEu8C5OreqpGSRdTbTBqSpzJO2dMQzEF-o,21264
 fugue_dask/registry.py,sha256=jepWKH55VWNIWV3pOF5vpCl2OpO0rI1IULx5GM2Gk6w,2274
 fugue_dask/tester.py,sha256=E7BZjgFpJgrHsLMKzvSO5im5OwocYcratjzulJSQZl0,718
 fugue_duckdb/__init__.py,sha256=ZzhmAWbROR1YL9Kmlt7OlwkgPZzFhsSdwLV2pFmAqGI,268
 fugue_duckdb/_io.py,sha256=vnd8m8C6XeMCBJBbAdA5h695NMfsduQrvONyS0HcEFA,8475
 fugue_duckdb/_utils.py,sha256=ElKbHUyn5fWSPGXsK57iqMzcqKtCf0c8pBVBYGe5Ql4,5020
 fugue_duckdb/dask.py,sha256=agoLzeB7Swxj2kVWfmXFbWD1NS2lbbTlnrjSkR8kKWY,5014
-fugue_duckdb/dataframe.py,sha256=LRfTv7Y46wMM_IDYSP1R-5OXuHuBg8GHjPGFFt8u7l0,8444
-fugue_duckdb/execution_engine.py,sha256=IZDmSAtOMJGvulTStxjTmsqJyI5QRNyxBgSMlFMSrBI,20389
+fugue_duckdb/dataframe.py,sha256=LAPoPOad9hgGhjyhlMGMfrnhkyBKe06Xzn6eP1hkl-w,8504
+fugue_duckdb/execution_engine.py,sha256=3f5hbWcX1y9mAtfFixrri-snYxVIQAf4HOgo9fHbDwQ,20385
 fugue_duckdb/registry.py,sha256=9_41KO42kXqcjF4yParQ5JGyg5TckcbhH-Q2IlGpSho,3987
 fugue_duckdb/tester.py,sha256=MzTkv3sdOwOjI59LRrSGGl4w59Njv3OArTU5kSRL-P0,1526
 fugue_ibis/__init__.py,sha256=z7TkK7M2_0p9XO6jQATNDgT0aHXn5k69Ttz2ga-eQG8,190
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=k4Q6qBLBIADF5YhbvaDplXO7OkMZSHuf_Wg5o-AusEI,7796
-fugue_ibis/execution_engine.py,sha256=5I-ou5xPdomVu-srdvidvP8f7wDYbGrCV_lGffZa_ac,18679
+fugue_ibis/execution_engine.py,sha256=jRnp1m1wuTicS29A-WA043f8QwdoK8b9rwPXvTkm8r8,18751
 fugue_notebook/__init__.py,sha256=9r_-2uxu1lBeZ8GgpYCKom_OZy2soIOYZajg7JDO-HY,4326
 fugue_notebook/env.py,sha256=TYiTxYPFi-BVJJY49jDsvw9mddhK8WrifeRxBke30I8,4773
 fugue_notebook/nbextension/README.md,sha256=QLnr957YeGfwzy2r4c4qbZPaXyCbyGrKPvcqSBQYSnU,123
@@ -127,7 +129,7 @@ fugue_ray/tester.py,sha256=oTA_xOzvQhJU3ohc4hsVpZc0zv4bwJn1c8a9u8kcuIs,537
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
 fugue_ray/_utils/dataframe.py,sha256=5c4duGV--mdLkKrbJRgjDWvVcp9BegA3yX16pmYDYLE,3954
-fugue_ray/_utils/io.py,sha256=3hFNDeBuh4bfCud40ZsGrGZLSvCSuxL_1VlqCTnn6RA,9794
+fugue_ray/_utils/io.py,sha256=y7TFtdKcqDtMw2e1u012rT8Ay0ChvAT2uJL4pCypABM,9963
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=lYa8FizM3p_lsKYFR49FazkVZMJKyi2LABKTpP5YBLo,12006
@@ -135,7 +137,7 @@ fugue_spark/execution_engine.py,sha256=YBMtNxCvpy77xICFSg9PHMa6feNoYhWEZe8MmxznX
 fugue_spark/registry.py,sha256=_NmiV2cOooYK0YmqATEnNkPEMT9suUMtuecw2NNbIIk,4530
 fugue_spark/tester.py,sha256=VX003yGNlBukaZTQSN-w7XvgSk4rqxrWQIzno0dWrXg,2481
 fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fugue_spark/_utils/convert.py,sha256=eRWkDYA4UO-FQu-2y4O80WEdawx7X_rIrWg55AlOiRc,10007
+fugue_spark/_utils/convert.py,sha256=J3HtbuzomTYTN6A11iuvsC1h2C7o3fQBW5U360xGDhE,10234
 fugue_spark/_utils/io.py,sha256=OdUezKpB29Lx9aUS2k9x0xUAGZrmgMZyQYGPEeHk7rQ,5574
 fugue_spark/_utils/misc.py,sha256=9LsbBp6nOEhqXFLr8oWTc3VKzKk-vuVyixlRoquGnEs,858
 fugue_spark/_utils/partition.py,sha256=iaesyO5f4uXhj1W-p91cD5ecPiGlu0bzh8gl2ce2Uvg,3618
@@ -143,14 +145,13 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=xoQuVobhU64uyODRdnzf6MSWe9lw5khkhpJ2atvADoc,2315
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=cOkZG6w1RHhWWxtjQhZClZQaGT6haNd576BoUmNC_cA,77960
+fugue_test/builtin_suite.py,sha256=jP3xiq2vRZNNGzoSRjcUfrUk8NVg31SU0kpJaEvP25E,79400
 fugue_test/dataframe_suite.py,sha256=7ym4sshDUly6004cq1UlppqDVtbwxD6CKxR4Lu70i0s,18994
-fugue_test/execution_suite.py,sha256=jcSSoKqTGbeWzTxkyYU-8i2zJAjzuXn7BqE8ul-JjIc,48646
+fugue_test/execution_suite.py,sha256=wUiGdb8wLRd13JXo7Lo19vPOLh7t1C-L2NPLeU0k-uE,48736
 fugue_test/fixtures.py,sha256=8Pev-mxRZOWwTFlsGjcSZ0iIs78zyWbp5tq4KG1wyvk,1432
-fugue_version/__init__.py,sha256=UwJXM8JY2T3tE2id0K2k_lEaVThbRTrGO1mNibyzIz8,22
-fugue-0.9.1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-fugue-0.9.1.dist-info/METADATA,sha256=zu44QGPIwk28QyKe9H4Si2ANByy1sJ9cmauNrhCg4bc,18380
-fugue-0.9.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-fugue-0.9.1.dist-info/entry_points.txt,sha256=kiRuUkKOnnHFvlWpYSfVUZiXJW3hOez6gjYoOhGht3Q,302
-fugue-0.9.1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
-fugue-0.9.1.dist-info/RECORD,,
+fugue_version/__init__.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
+fugue-0.9.2.dist-info/METADATA,sha256=zmzlL5Fw-t0qTfcRb5jCeJb_IUJ7HVYRT0SpgcN3ncI,18558
+fugue-0.9.2.dist-info/WHEEL,sha256=SmOxYU7pzNKBqASvQJ7DjX3XGUF92lrGhMb3R6_iiqI,91
+fugue-0.9.2.dist-info/entry_points.txt,sha256=2Vxp1qew_tswacA8m0RzIliLlFOQMlzezvSXPugM_KA,295
+fugue-0.9.2.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.9.2.dist-info/RECORD,,
fugue-0.9.1.dist-info/WHEEL → fugue-0.9.2.dist-info/WHEEL RENAMED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.43.0)
+Generator: setuptools (79.0.1)
 Root-Is-Purelib: true
 Tag: py3-none-any

fugue-0.9.2.dist-info/entry_points.txt ADDED
@@ -0,0 +1,11 @@
+[fugue.plugins]
+dask = fugue_dask.registry[dask]
+duckdb = fugue_duckdb.registry[duckdb]
+ibis = fugue_ibis[ibis]
+polars = fugue_polars.registry[polars]
+ray = fugue_ray.registry[ray]
+spark = fugue_spark.registry[spark]
+
+[pytest11]
+fugue_test = fugue_test
+fugue_test_fixtures = fugue_test.fixtures
fugue_dask/_dask_sql_wrapper.py ADDED
@@ -0,0 +1,76 @@
+from typing import Any, Optional
+
+import dask.dataframe as dd
+
+try:
+    from dask.dataframe.dask_expr.io.parquet import ReadParquet
+
+    HAS_DASK_EXPR = True  # newer dask
+except ImportError:  # pragma: no cover
+    HAS_DASK_EXPR = False  # older dask
+
+if not HAS_DASK_EXPR:  # pragma: no cover
+    try:
+        from dask_sql import Context as ContextWrapper  # pylint: disable-all
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+else:
+    from triad.utils.assertion import assert_or_throw
+
+    try:
+        from dask_sql import Context
+        from dask_sql.datacontainer import Statistics
+        from dask_sql.input_utils import InputUtil
+    except ImportError:  # pragma: no cover
+        raise ImportError(
+            "dask-sql is not installed. Please install it with `pip install dask-sql`"
+        )
+
+    class ContextWrapper(Context):  # type: ignore
+        def create_table(
+            self,
+            table_name: str,
+            input_table: dd.DataFrame,
+            format: Optional[str] = None,  # noqa
+            persist: bool = False,
+            schema_name: Optional[str] = None,
+            statistics: Optional[Statistics] = None,
+            gpu: bool = False,
+            **kwargs: Any,
+        ) -> None:  # pragma: no cover
+            assert_or_throw(
+                isinstance(input_table, dd.DataFrame),
+                lambda: ValueError(
+                    f"input_table must be a dask dataframe, but got {type(input_table)}"
+                ),
+            )
+            assert_or_throw(
+                dd._dask_expr_enabled(), lambda: ValueError("Dask expr must be enabled")
+            )
+            schema_name = schema_name or self.schema_name
+
+            dc = InputUtil.to_dc(
+                input_table,
+                table_name=table_name,
+                format=format,
+                persist=persist,
+                gpu=gpu,
+                **kwargs,
+            )
+
+            dask_filepath = None
+            operations = input_table.find_operations(ReadParquet)
+            for op in operations:
+                dask_filepath = op._args[0]
+
+            dc.filepath = dask_filepath
+            self.schema[schema_name].filepaths[table_name.lower()] = dask_filepath
+
+            if not statistics:
+                statistics = Statistics(float("nan"))
+            dc.statistics = statistics
+
+            self.schema[schema_name].tables[table_name.lower()] = dc
+            self.schema[schema_name].statistics[table_name.lower()] = statistics
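
Note: this new module exists because dask-sql's stock Context.create_table predates query-planning (dask-expr) dataframes. On newer dask, the wrapper re-implements table registration on top of InputUtil.to_dc and walks the expression graph for a ReadParquet filepath; on older dask it simply aliases the stock Context. Usage is unchanged from dask-sql's own API (a sketch, assuming dask-sql and a dask-expr-enabled dask are installed):

    import dask.dataframe as dd
    import pandas as pd
    from fugue_dask._dask_sql_wrapper import ContextWrapper

    ctx = ContextWrapper()
    ctx.create_table("t", dd.from_pandas(pd.DataFrame({"a": [1, 2]}), npartitions=1))
    res = ctx.sql("SELECT a FROM t")  # a dask dataframe, as with dask_sql.Context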
fugue_dask/_utils.py CHANGED
@@ -5,7 +5,7 @@ import dask.dataframe as dd
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from dask.dataframe.core import DataFrame
+from dask.dataframe import DataFrame
 from dask.delayed import delayed
 from dask.distributed import Client, get_client
 from triad.utils.pandas_like import PD_UTILS, PandasLikeUtils
@@ -149,7 +149,7 @@ def _add_hash_index(
     if len(cols) == 0:
         cols = list(df.columns)

-    def _add_hash(df: pd.DataFrame) -> pd.DataFrame:
+    def _add_hash(df: pd.DataFrame) -> pd.DataFrame:  # pragma: no cover
         if len(df) == 0:
             return df.assign(**{_FUGUE_DASK_TEMP_IDX_COLUMN: pd.Series(dtype=int)})
         return df.assign(
@@ -171,7 +171,7 @@ def _add_hash_index(

 def _add_random_index(
     df: dd.DataFrame, num: int, seed: Any = None
-) -> Tuple[dd.DataFrame, int]:
+) -> Tuple[dd.DataFrame, int]:  # pragma: no cover
     def _add_rand(df: pd.DataFrame) -> pd.DataFrame:
         if len(df) == 0:
             return df.assign(**{_FUGUE_DASK_TEMP_IDX_COLUMN: pd.Series(dtype=int)})
@@ -189,7 +189,9 @@ def _add_random_index(


 def _add_continuous_index(df: dd.DataFrame) -> Tuple[dd.DataFrame, int]:
-    def _get_info(df: pd.DataFrame, partition_info: Any) -> pd.DataFrame:
+    def _get_info(
+        df: pd.DataFrame, partition_info: Any
+    ) -> pd.DataFrame:  # pragma: no cover
         return pd.DataFrame(dict(no=[partition_info["number"]], ct=[len(df)]))

     pinfo = (
@@ -200,7 +202,9 @@ def _add_continuous_index(df: dd.DataFrame) -> Tuple[dd.DataFrame, int]:
     counts = pinfo.sort_values("no").ct.cumsum().tolist()
     starts = [0] + counts[0:-1]

-    def _add_index(df: pd.DataFrame, partition_info: Any) -> pd.DataFrame:
+    def _add_index(
+        df: pd.DataFrame, partition_info: Any
+    ) -> pd.DataFrame:  # pragma: no cover
         return df.assign(
             **{
                 _FUGUE_DASK_TEMP_IDX_COLUMN: np.arange(len(df))
fugue_dask/dataframe.py CHANGED
@@ -379,7 +379,7 @@ def _to_array_chunks(
     assert_or_throw(columns is None or len(columns) > 0, ValueError("empty columns"))
     _df = df if columns is None or len(columns) == 0 else df[columns]

-    def _to_list(pdf: pd.DataFrame) -> List[Any]:
+    def _to_list(pdf: pd.DataFrame) -> List[Any]:  # pragma: no cover
         return list(
             PD_UTILS.as_array_iterable(
                 pdf,
fugue_dask/execution_engine.py CHANGED
@@ -9,9 +9,10 @@ from triad.collections import Schema
 from triad.collections.dict import IndexedOrderedDict, ParamDict
 from triad.utils.assertion import assert_or_throw
 from triad.utils.hash import to_uuid
+from triad.utils.io import makedirs
 from triad.utils.pandas_like import PandasUtils
 from triad.utils.threading import RunOnce
-from triad.utils.io import makedirs
+
 from fugue import StructuredRawSQL
 from fugue.collections.partition import (
     PartitionCursor,
@@ -61,14 +62,9 @@ class DaskSQLEngine(SQLEngine):
         return True

     def select(self, dfs: DataFrames, statement: StructuredRawSQL) -> DataFrame:
-        try:
-            from dask_sql import Context
-        except ImportError:  # pragma: no cover
-            raise ImportError(
-                "dask-sql is not installed. "
-                "Please install it with `pip install dask-sql`"
-            )
-        ctx = Context()
+        from ._dask_sql_wrapper import ContextWrapper
+
+        ctx = ContextWrapper()
         _dfs: Dict[str, dd.DataFrame] = {k: self._to_safe_df(v) for k, v in dfs.items()}
         sql = statement.construct(dialect=self.dialect, log=self.log)
         res = ctx.sql(
@@ -102,7 +98,8 @@ class DaskMapEngine(MapEngine):
         partition_spec: PartitionSpec,
         on_init: Optional[Callable[[int, DataFrame], Any]] = None,
         map_func_format_hint: Optional[str] = None,
-    ) -> DataFrame:
+    ) -> DataFrame:  # pragma: no cover
+        # It is well tested but not captured by coverage
         presort = partition_spec.get_sorts(
             df.schema, with_partition_keys=partition_spec.algo == "coarse"
         )
@@ -475,7 +472,7 @@ class DaskExecutionEngine(ExecutionEngine):
         # Use presort over partition_spec.presort if possible
         _presort: IndexedOrderedDict = presort or partition_spec.presort

-        def _partition_take(partition, n, presort):
+        def _partition_take(partition, n, presort):  # pragma: no cover
             assert_or_throw(
                 partition.shape[1] == len(meta),
                 FugueBug("hitting the dask bug where partition keys are lost"),
fugue_duckdb/dataframe.py CHANGED
@@ -165,7 +165,7 @@ def _duck_as_local(df: DuckDBPyRelation) -> DuckDBPyRelation:

 @as_arrow.candidate(lambda df: isinstance(df, DuckDBPyRelation))
 def _duck_as_arrow(df: DuckDBPyRelation) -> pa.Table:
-    _df = df.arrow()
+    _df = df.fetch_arrow_table()
     _df = replace_types_in_table(_df, LARGE_TYPES_REPLACEMENT, recursive=True)
     return _df

@@ -216,7 +216,7 @@ def _drop_duckdb_columns(df: DuckDBPyRelation, columns: List[str]) -> DuckDBPyRe
 def _duck_as_array(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
 ) -> List[Any]:
-    return pa_table_as_array(df.arrow(), columns=columns)
+    return pa_table_as_array(df.fetch_arrow_table(), columns=columns)


 @as_array_iterable.candidate(
@@ -225,14 +225,14 @@ def _duck_as_array(
 def _duck_as_array_iterable(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None, type_safe: bool = False
 ) -> Iterable[Any]:
-    yield from pa_table_as_array_iterable(df.arrow(), columns=columns)
+    yield from pa_table_as_array_iterable(df.fetch_arrow_table(), columns=columns)


 @as_dicts.candidate(lambda df, *args, **kwargs: isinstance(df, DuckDBPyRelation))
 def _duck_as_dicts(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None
 ) -> List[Dict[str, Any]]:
-    return pa_table_as_dicts(df.arrow(), columns=columns)
+    return pa_table_as_dicts(df.fetch_arrow_table(), columns=columns)


 @as_dict_iterable.candidate(
@@ -241,7 +241,7 @@ def _duck_as_dicts(
 def _duck_as_dict_iterable(
     df: DuckDBPyRelation, columns: Optional[List[str]] = None
 ) -> Iterable[Dict[str, Any]]:
-    yield from pa_table_as_dict_iterable(df.arrow(), columns=columns)
+    yield from pa_table_as_dict_iterable(df.fetch_arrow_table(), columns=columns)


 def _assert_no_missing(df: DuckDBPyRelation, columns: Iterable[Any]) -> None:
fugue_duckdb/execution_engine.py CHANGED
@@ -108,7 +108,7 @@ class DuckDBEngine(SQLEngine):
         try:
             for k, v in dfs.items():
                 duckdb.from_arrow(v.as_arrow(), connection=conn).create_view(k)
-            return ArrowDataFrame(_duck_as_arrow(conn.execute(statement)))
+            return ArrowDataFrame(_duck_as_arrow(conn.sql(statement)))
         finally:
             conn.close()

fugue_ibis/execution_engine.py CHANGED
@@ -92,7 +92,8 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
-        if ibis.__version__ < "6":  # pragma: no cover
+        version = int(ibis.__version__.split(".")[0])
+        if version < 6:  # pragma: no cover
             suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
         else:
             # breaking change in ibis 6.0
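
Note: the old check compared version strings lexicographically, which silently breaks once ibis reaches 10.x ("10..." sorts before "6", so ibis 10 would have taken the pre-6 branch). Comparing the parsed major version as an integer fixes it:

    "10.0.0" < "6"                    # True -- lexicographic comparison, wrong
    int("10.0.0".split(".")[0]) < 6   # False -- correct
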
@@ -113,7 +114,7 @@ class IbisSQLEngine(SQLEngine):
                 cols.append(
                     ibis.coalesce(tb[k], tb[k + _JOIN_RIGHT_SUFFIX]).name(k)
                 )
-            tb = tb[cols]
+            tb = tb.select(*cols)
         elif how.lower() in ["semi", "left_semi"]:
             tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
@@ -153,7 +154,7 @@ class IbisSQLEngine(SQLEngine):
         self,
         df: DataFrame,
         how: str = "any",
-        thresh: int = None,
+        thresh: Optional[int] = None,
         subset: Optional[List[str]] = None,
     ) -> DataFrame:
         schema = df.schema
@@ -161,7 +162,7 @@ class IbisSQLEngine(SQLEngine):
             schema = schema.extract(subset)
         _df = self.to_df(df)
         if thresh is None:
-            tb = _df.native.dropna(subset=subset, how=how)
+            tb = _df.native.drop_null(subset, how=how)
             return self.to_df(tb, df.schema)
         assert_or_throw(
             how == "any", ValueError("when thresh is set, how must be 'any'")
@@ -204,7 +205,7 @@ class IbisSQLEngine(SQLEngine):
             ibis.coalesce(tb[f], ibis.literal(vd[f])).name(f) if f in names else tb[f]
             for f in df.columns
         ]
-        return self.to_df(tb[cols], schema=df.schema)
+        return self.to_df(tb.select(cols), schema=df.schema)

     def take(
         self,
@@ -241,7 +242,7 @@ class IbisSQLEngine(SQLEngine):
                 f") WHERE __fugue_take_param<={n}"
             )
             tb = self.query_to_table(sql, {tbn: idf})
-            return self.to_df(tb[df.columns], schema=df.schema)
+            return self.to_df(tb.select(*df.columns), schema=df.schema)

         sorts: List[str] = []
         for k, v in _presort.items():
fugue_ray/_utils/io.py CHANGED
@@ -7,7 +7,7 @@ import ray.data as rd
 from packaging import version
 from pyarrow import csv as pacsv
 from pyarrow import json as pajson
-from ray.data.datasource import FileExtensionFilter
+
 from triad.collections import Schema
 from triad.collections.dict import ParamDict
 from triad.utils.assertion import assert_or_throw
@@ -21,6 +21,27 @@ from fugue_ray.dataframe import RayDataFrame

 from .._constants import RAY_VERSION

+try:
+    from ray.data.datasource import FileExtensionFilter
+
+    class _FileFiler(FileExtensionFilter):  # pragma: no cover
+        def __init__(
+            self, file_extensions: Union[str, List[str]], exclude: Iterable[str]
+        ):
+            super().__init__(file_extensions, allow_if_no_extension=True)
+            self._exclude = set(exclude)
+
+        def _is_valid(self, path: str) -> bool:
+            return pathlib.Path(
+                path
+            ).name not in self._exclude and self._file_has_extension(path)
+
+        def __call__(self, paths: List[str]) -> List[str]:
+            return [path for path in paths if self._is_valid(path)]
+
+except ImportError:  # pragma: no cover
+    pass  # ray >=2.10
+

 class RayIO(object):
     def __init__(self, engine: ExecutionEngine):
@@ -53,7 +74,7 @@ class RayIO(object):
             len(fmts) == 1, NotImplementedError("can't support multiple formats")
         )
         fmt = fmts[0]
-        files = [f.path for f in fp]
+        files = [f.as_dir_path() if f.is_dir else f.path for f in fp]
         return self._loads[fmt](files, columns, **kwargs)

     def save_df(
@@ -248,17 +269,3 @@ class RayIO(object):

     def _remote_args(self) -> Dict[str, Any]:
         return {"num_cpus": 1}
-
-
-class _FileFiler(FileExtensionFilter):  # pragma: no cover
-    def __init__(self, file_extensions: Union[str, List[str]], exclude: Iterable[str]):
-        super().__init__(file_extensions, allow_if_no_extension=True)
-        self._exclude = set(exclude)
-
-    def _is_valid(self, path: str) -> bool:
-        return pathlib.Path(
-            path
-        ).name not in self._exclude and self._file_has_extension(path)
-
-    def __call__(self, paths: List[str]) -> List[str]:
-        return [path for path in paths if self._is_valid(path)]
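
Note: FileExtensionFilter no longer exists in Ray >=2.10 (as the added comment says), so the _FileFiler subclass moves from module scope into a try/except ImportError guard: on newer Ray the class is simply never defined, and load_df instead hands whole directories to the loaders via the new FileParser.as_dir_path, seen in the files = [...] change above.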
fugue_spark/_utils/convert.py CHANGED
@@ -174,20 +174,26 @@ def pd_to_spark_df(


 def to_pandas(df: ps.DataFrame) -> pd.DataFrame:
-    if version.parse(pd.__version__) < version.parse("2.0.0") or not any(
-        isinstance(x.dataType, (pt.TimestampType, TimestampNTZType))
-        for x in df.schema.fields
-    ):
-        return df.toPandas()
-    else:  # pragma: no cover
+    def _to_df() -> pd.DataFrame:
+        if version.parse(pd.__version__) < version.parse("2.0.0") or not any(
+            isinstance(x.dataType, (pt.TimestampType, TimestampNTZType))
+            for x in df.schema.fields
+        ):
+            return df.toPandas()
+        else:  # pragma: no cover
+
+            def serialize(dfs):
+                for df in dfs:
+                    data = pickle.dumps(df)
+                    yield pd.DataFrame([[data]], columns=["data"])

-        def serialize(dfs):
-            for df in dfs:
-                data = pickle.dumps(df)
-                yield pd.DataFrame([[data]], columns=["data"])
+            sdf = df.mapInPandas(serialize, schema="data binary")
+            return pd.concat(pickle.loads(x.data) for x in sdf.collect())

-        sdf = df.mapInPandas(serialize, schema="data binary")
-        return pd.concat(pickle.loads(x.data) for x in sdf.collect())
+    pdf = _to_df()
+    if hasattr(pdf, "attrs") and "metrics" in pdf.attrs:  # pragma: no cover
+        del pdf.attrs["metrics"]
+    return pdf


 def to_arrow(df: ps.DataFrame) -> pa.Table:
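
Note: the conversion logic itself is unchanged, only wrapped in the _to_df closure; the new tail strips a "metrics" entry that some newer PySpark versions appear to attach to DataFrame.attrs during toPandas() (e.g. under Spark Connect), so fugue's pandas outputs compare equal across backends.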
fugue_test/builtin_suite.py CHANGED
@@ -486,6 +486,25 @@ class BuiltInTests(object):
             dag.df([], "a:int,b:int").assert_eq(b)
             dag.run(self.engine)

+    def _test_transform_row_wise(self):  # pragma: no cover
+        # TODO: currently disabled because we don't support Dict[str, Any]
+        # as dataframe input
+        def t1(row: Dict[str, Any]) -> Dict[str, Any]:
+            row["b"] = 1
+            return row
+
+        def t2(rows: List[Dict[str, Any]]) -> Dict[str, Any]:
+            return rows[0]
+
+        with fa.engine_context(self.engine):
+            a = pd.DataFrame([[3, 4], [1, 2], [3, 5]], columns=["a", "b"])
+            b = fa.transform(a, t1, schema="*")
+            assert sorted(fa.as_array(b)) == [[1, 1], [3, 1], [3, 1]]
+            b = fa.transform(
+                a, t2, schema="*", partition={"by": "a", "presort": "b"}
+            )
+            assert sorted(fa.as_array(b)) == [[1, 2], [3, 4]]
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
@@ -548,6 +567,8 @@ class BuiltInTests(object):
             e = dag.df([[1, 2, 1, 10]], "a:int,ct1:int,ct2:int,x:int")
             e.assert_eq(c)

+            a.zip(b).transform(mock_co_tf1_d, params=dict(p=10)).assert_eq(e)
+
             # interfaceless
             c = dag.transform(
                 a.zip(b),
@@ -676,6 +697,13 @@ class BuiltInTests(object):
             incr()
             yield pa.Table.from_pandas(df)

+        def t11(row: list[dict[str, Any]]) -> dict[str, Any]:
+            incr()
+            return row[0]
+
+        def t12(row: list[dict[str, Any]]) -> None:
+            incr()
+
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:double,b:int")
             a.out_transform(t1)  # +2
@@ -688,6 +716,8 @@ class BuiltInTests(object):
             a.out_transform(t8, ignore_errors=[NotImplementedError])  # +1
             a.out_transform(t9)  # +1
             a.out_transform(t10)  # +1
+            a.out_transform(t11)  # +2
+            a.out_transform(t12)  # +2
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t2))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t3))
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(t4))
@@ -695,7 +725,7 @@ class BuiltInTests(object):
             raises(FugueWorkflowCompileValidationError, lambda: a.out_transform(T7))
             dag.run(self.engine)

-        assert 13 <= incr()
+        assert 17 <= incr()

     def test_out_cotransform(self):  # noqa: C901
         tmpdir = str(self.tmpdir)
@@ -2001,6 +2031,13 @@ def mock_co_tf1(
     return [[df1[0]["a"], len(df1), len(df2), p]]


+@cotransformer(lambda dfs, **kwargs: "a:int,ct1:int,ct2:int,x:int")
+def mock_co_tf1_d(
+    df1: List[Dict[str, Any]], df2: List[List[Any]], p=1
+) -> Dict[str, Any]:
+    return dict(a=df1[0]["a"], ct1=len(df1), ct2=len(df2), x=p)
+
+
 def mock_co_tf2(dfs: DataFrames, p=1) -> List[List[Any]]:
     return [[dfs[0].peek_dict()["a"], dfs[0].count(), dfs[1].count(), p]]

fugue_test/execution_suite.py CHANGED
@@ -9,6 +9,7 @@ except ImportError:  # pragma: no cover
 import copy
 import os
 import pickle
+import sys
 from datetime import datetime

 import pandas as pd
@@ -1194,6 +1195,7 @@ class ExecutionEngineTests(object):
         )
         self.df_eq(c, [[1.1, 6.1], [7.1, 2.1]], "a:double,c:double", throw=True)

+    @pytest.mark.skipif(sys.platform == "win32", reason="skip on Windows")
     def test_load_csv_folder(self):
         native = NativeExecutionEngine()
         a = ArrayDataFrame([[6.1, 1.1]], "c:double,a:double")
fugue_version/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.9.1"
+__version__ = "0.9.2"
fugue-0.9.1.dist-info/entry_points.txt DELETED
@@ -1,12 +0,0 @@
-[fugue.plugins]
-dask = fugue_dask.registry [dask]
-duckdb = fugue_duckdb.registry [duckdb]
-ibis = fugue_ibis [ibis]
-polars = fugue_polars.registry [polars]
-ray = fugue_ray.registry [ray]
-spark = fugue_spark.registry [spark]
-
-[pytest11]
-fugue_test = fugue_test
-fugue_test_fixtures = fugue_test.fixtures
-