fugue-0.8.5.dev1-py3-none-any.whl → fugue-0.8.6.dev2-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
fugue/dataframe/dataframe.py CHANGED
@@ -113,6 +113,13 @@ class DataFrame(Dataset):
     def as_pandas(self) -> pd.DataFrame:
         """Convert to pandas DataFrame"""
         pdf = pd.DataFrame(self.as_array(), columns=self.columns)
+        if len(pdf) == 0:  # TODO: move to triad
+            return pd.DataFrame(
+                {
+                    k: pd.Series(dtype=v.type.to_pandas_dtype())
+                    for k, v in self.schema.items()
+                }
+            )
         return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
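
The new zero-row branch exists because `pd.DataFrame(self.as_array(), columns=...)` on an empty array yields all-object columns. A minimal sketch of the same construction, using a plain pyarrow schema as a stand-in for Fugue's `Schema` wrapper (whose values expose pyarrow types the same way):

```python
import pandas as pd
import pyarrow as pa

# Build a zero-row frame whose dtypes come from the schema instead of
# defaulting to object; this mirrors the dict comprehension above.
schema = pa.schema([("a", pa.int64()), ("b", pa.string())])
empty = pd.DataFrame(
    {f.name: pd.Series(dtype=f.type.to_pandas_dtype()) for f in schema}
)
print(empty.dtypes)  # typed columns, not all-object
```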
fugue/dataframe/dataframe_iterable_dataframe.py CHANGED
@@ -165,7 +165,7 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
 
     def as_pandas(self) -> pd.DataFrame:
         if self.empty:
-            return ArrayDataFrame([], self.schema).as_pandas()
+            return PandasDataFrame(schema=self.schema).as_pandas()
 
         return pd.concat(df.as_pandas() for df in self.native)
 
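
The replacement builds the empty frame from `PandasDataFrame` directly instead of round-tripping an empty `ArrayDataFrame`. A sketch of why that works, assuming the 0.8.6 constructor behavior:

```python
from fugue import PandasDataFrame

# PandasDataFrame accepts a schema alone and yields a zero-row frame
# whose pandas dtypes already match the schema, so no ArrayDataFrame
# round trip is needed for the empty case.
empty = PandasDataFrame(schema="a:int,b:str")
print(empty.as_pandas().dtypes)  # typed columns, zero rows
```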
fugue/dataframe/function_wrapper.py CHANGED
@@ -23,9 +23,10 @@ from triad.collections.function_wrapper import (
 from triad.utils.iter import EmptyAwareIterable, make_empty_aware
 
 from ..constants import FUGUE_ENTRYPOINT
+from ..dataset.api import count as df_count
 from .array_dataframe import ArrayDataFrame
 from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalDataFrame
+from .dataframe import AnyDataFrame, DataFrame, LocalDataFrame, as_fugue_df
 from .dataframe_iterable_dataframe import (
     IterableArrowDataFrame,
     IterablePandasDataFrame,
@@ -172,6 +173,19 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())
 
 
+@fugue_annotated_param(AnyDataFrame)
+class _AnyDataFrameParam(DataFrameParam):
+    def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
+        return (
+            as_fugue_df(output)
+            if schema is None
+            else as_fugue_df(output, schema=schema)
+        )
+
+    def count(self, df: Any) -> int:
+        return df_count(df)
+
+
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:
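
The new `_AnyDataFrameParam.to_output_df` defers everything to `as_fugue_df`, which is part of the public API; a sketch of the two branches, assuming fugue 0.8.6+:

```python
import pandas as pd
import fugue.api as fa

# as_fugue_df coerces an AnyDataFrame (pandas, arrow, list of lists, ...)
# into a Fugue DataFrame; passing schema=... additionally enforces it,
# mirroring the schema-is-None branch above.
pdf = pd.DataFrame({"a": [1, 2]})
print(fa.as_fugue_df(pdf).schema)                         # inferred
print(fa.as_fugue_df([[1], [2]], schema="a:int").schema)  # enforced
```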
@@ -333,6 +347,9 @@ class _PandasParam(LocalDataFrameParam):
 
     @no_type_check
     def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
+        _schema: Optional[Schema] = None if schema is None else Schema(schema)
+        if _schema is not None and _schema.names != list(output.columns):
+            output = output[_schema.names]
         return PandasDataFrame(output, schema)
 
     @no_type_check
@@ -361,8 +378,15 @@ class _IterablePandasParam(LocalDataFrameParam):
         self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
     ) -> DataFrame:
         def dfs():
+            _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                yield PandasDataFrame(df, schema)
+                if _schema is not None and _schema.names != list(df.columns):
+                    df = df[_schema.names]
+                yield PandasDataFrame(df, _schema)
+                has_return = True
+            if not has_return and _schema is not None:
+                yield PandasDataFrame(schema=_schema)
 
         return IterablePandasDataFrame(dfs())
 
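
The `has_return` flag handles generators that yield nothing: without it, the iterable DataFrame would carry no frames and hence no usable schema. The same guard pattern in isolation, with a hypothetical `aligned` helper and plain pandas:

```python
from typing import Iterable, Iterator, List
import pandas as pd

def aligned(dfs: Iterable[pd.DataFrame], cols: List[str]) -> Iterator[pd.DataFrame]:
    got_any = False
    for df in dfs:
        yield df[cols]  # align column order to the declared schema
        got_any = True
    if not got_any:
        # fall back to one empty, correctly-shaped frame
        yield pd.DataFrame(columns=cols)

out = pd.concat(aligned(iter([]), ["a", "b"]))
print(list(out.columns))  # ['a', 'b'] even though nothing was yielded
```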
@@ -381,7 +405,12 @@ class _PyArrowTableParam(LocalDataFrameParam):
 
     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
         assert isinstance(output, pa.Table)
-        return ArrowDataFrame(output, schema=schema)
+        adf: DataFrame = ArrowDataFrame(output)
+        if schema is not None:
+            _schema = Schema(schema)
+            if adf.schema != _schema:
+                adf = adf[_schema.names].alter_columns(_schema)
+        return adf
 
     def count(self, df: Any) -> int:  # pragma: no cover
         return df.count()
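
In plain pyarrow terms, `adf[_schema.names].alter_columns(_schema)` is a column reorder followed by a type cast; a sketch with pyarrow calls standing in for the Fugue methods:

```python
import pyarrow as pa

# Reorder to the requested column order, then cast to the requested
# types -- the arrow-side analogue of the Fugue calls above.
tbl = pa.table({"b": [1.0, 2.0], "a": [1, 2]})
want = pa.schema([("a", pa.int32()), ("b", pa.float32())])
fixed = tbl.select(want.names).cast(want)
print(fixed.schema)
```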
@@ -409,13 +438,15 @@ class _IterableArrowParam(LocalDataFrameParam):
     ) -> DataFrame:
         def dfs():
             _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                adf = ArrowDataFrame(df)
-                if _schema is not None and not (  # pylint: disable-all
-                    adf.schema == schema
-                ):
+                adf: DataFrame = ArrowDataFrame(df)
+                if _schema is not None and adf.schema != _schema:
                     adf = adf[_schema.names].alter_columns(_schema)
                 yield adf
+                has_return = True
+            if not has_return and _schema is not None:
+                yield ArrowDataFrame(schema=_schema)
 
         return IterableArrowDataFrame(dfs())
 
fugue/py.typed ADDED
(empty file — the PEP 561 marker that exposes the package's inline type hints to type checkers)
fugue-0.8.6.dev2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.5.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.7
 Description-Content-Type: text/markdown
-Requires-Dist: triad (>=0.9.0)
+Requires-Dist: triad (>=0.9.1)
 Requires-Dist: adagio (>=0.2.4)
 Requires-Dist: pyarrow (>=0.15.1)
 Requires-Dist: pandas (>=1.2.0)
-Requires-Dist: qpd (>=0.4.3)
+Requires-Dist: qpd (>=0.4.4)
 Requires-Dist: fugue-sql-antlr (>=0.1.6)
 Requires-Dist: sqlglot
 Requires-Dist: jinja2
@@ -33,8 +33,8 @@ Requires-Dist: sqlglot ; extra == 'all'
 Requires-Dist: jinja2 ; extra == 'all'
 Requires-Dist: fugue-sql-antlr[cpp] (>=0.1.6) ; extra == 'all'
 Requires-Dist: pyspark (>=3.1.1) ; extra == 'all'
-Requires-Dist: ray[data] (>=2.0.0) ; extra == 'all'
-Requires-Dist: qpd[dask] (>=0.4.3) ; extra == 'all'
+Requires-Dist: ray[data] (>=2.1.0) ; extra == 'all'
+Requires-Dist: qpd[dask] (>=0.4.4) ; extra == 'all'
 Requires-Dist: notebook ; extra == 'all'
 Requires-Dist: jupyterlab ; extra == 'all'
 Requires-Dist: ipython (>=7.10.0) ; extra == 'all'
@@ -43,21 +43,21 @@ Requires-Dist: pyarrow (>=6.0.1) ; extra == 'all'
 Requires-Dist: polars ; extra == 'all'
 Requires-Dist: dask[dataframe,distributed] ; (python_version < "3.8") and extra == 'all'
 Requires-Dist: ibis-framework (>=2.1.1) ; (python_version < "3.8") and extra == 'all'
-Requires-Dist: dask[dataframe,distributed] (>=2022.9.0) ; (python_version >= "3.8") and extra == 'all'
-Requires-Dist: ibis-framework (>=3.2.0) ; (python_version >= "3.8") and extra == 'all'
+Requires-Dist: dask[dataframe,distributed] (<2023.7.1,>=2022.9.0) ; (python_version >= "3.8") and extra == 'all'
+Requires-Dist: ibis-framework (<6,>=3.2.0) ; (python_version >= "3.8") and extra == 'all'
 Provides-Extra: cpp_sql_parser
 Requires-Dist: fugue-sql-antlr[cpp] (>=0.1.6) ; extra == 'cpp_sql_parser'
 Provides-Extra: dask
-Requires-Dist: qpd[dask] (>=0.4.3) ; extra == 'dask'
+Requires-Dist: qpd[dask] (>=0.4.4) ; extra == 'dask'
 Requires-Dist: dask[dataframe,distributed] ; (python_version < "3.8") and extra == 'dask'
-Requires-Dist: dask[dataframe,distributed] (>=2022.9.0) ; (python_version >= "3.8") and extra == 'dask'
+Requires-Dist: dask[dataframe,distributed] (<2023.7.1,>=2022.9.0) ; (python_version >= "3.8") and extra == 'dask'
 Provides-Extra: duckdb
 Requires-Dist: duckdb (>=0.5.0) ; extra == 'duckdb'
 Requires-Dist: pyarrow (>=6.0.1) ; extra == 'duckdb'
 Requires-Dist: numpy ; extra == 'duckdb'
 Provides-Extra: ibis
 Requires-Dist: ibis-framework (>=2.1.1) ; (python_version < "3.8") and extra == 'ibis'
-Requires-Dist: ibis-framework (>=3.2.0) ; (python_version >= "3.8") and extra == 'ibis'
+Requires-Dist: ibis-framework (<6,>=3.2.0) ; (python_version >= "3.8") and extra == 'ibis'
 Provides-Extra: notebook
 Requires-Dist: notebook ; extra == 'notebook'
 Requires-Dist: jupyterlab ; extra == 'notebook'
@@ -65,13 +65,13 @@ Requires-Dist: ipython (>=7.10.0) ; extra == 'notebook'
 Provides-Extra: polars
 Requires-Dist: polars ; extra == 'polars'
 Provides-Extra: ray
-Requires-Dist: ray[data] (>=2.0.0) ; extra == 'ray'
+Requires-Dist: ray[data] (>=2.1.0) ; extra == 'ray'
 Requires-Dist: duckdb (>=0.5.0) ; extra == 'ray'
 Requires-Dist: pyarrow (>=6.0.1) ; extra == 'ray'
 Provides-Extra: spark
 Requires-Dist: pyspark (>=3.1.1) ; extra == 'spark'
 Provides-Extra: sql
-Requires-Dist: qpd (>=0.4.3) ; extra == 'sql'
+Requires-Dist: qpd (>=0.4.4) ; extra == 'sql'
 Requires-Dist: fugue-sql-antlr (>=0.1.6) ; extra == 'sql'
 Requires-Dist: sqlglot ; extra == 'sql'
 Requires-Dist: jinja2 ; extra == 'sql'
@@ -222,9 +222,16 @@ Fugue can be installed through pip or conda. For example:
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html)
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependencies, so you don't need to install it explicitly. **But for 0.9.0+, it becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
fugue-0.8.6.dev2.dist-info/RECORD CHANGED
@@ -4,6 +4,7 @@ fugue/constants.py,sha256=crd0VqX8WtBcjSUNwZDi2LDIEkhUMWOlSn73H8JI9ds,3385
 fugue/dev.py,sha256=GQCkezBBl4V0lVDWhGtUQKqomiCxgR9dMhfqj9C8cS8,1369
 fugue/exceptions.py,sha256=ylP8gkZL8ao_ZLinNYKv16FPyO_n7c29dN-4QChUxi0,1544
 fugue/plugins.py,sha256=SJ-jqs04StHIHJ65lgdGP0IDopVIGBDpmzHHllNK8p0,998
+fugue/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/registry.py,sha256=SNULGv08f37fRO-cIxFDmnVcod7ref2fNLSK6G7nVnI,868
 fugue/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue/_utils/display.py,sha256=JV8oDA7efHm1wceZulCBOY5dMvjbWHvIm6ASisKfoWY,3164
@@ -27,10 +28,10 @@ fugue/dataframe/__init__.py,sha256=zm7TbsaJLIvfm7zymWm2LGcuJd3nxfGsFnQiyrSnenM,6
 fugue/dataframe/api.py,sha256=c5Err3c-ayl-k28IUi6kV_ClDWX30NpVNkv97hQKDac,9862
 fugue/dataframe/array_dataframe.py,sha256=oBfN545NTGdYJ5zPIRv7hXRR-R_OW1JieyOfnl296oU,4447
 fugue/dataframe/arrow_dataframe.py,sha256=h0DJH8G0MEgfkyvX4U957iqDXIgvTtrP7YED5iEjizI,12098
-fugue/dataframe/dataframe.py,sha256=rIjaOplyRDt_BfpFwZWMjZALDqa03NePs9tFiLSW6Jg,17247
-fugue/dataframe/dataframe_iterable_dataframe.py,sha256=0gvb12D0s4VntNZ-M8J_Pic2XlFpw7upXf4hfMn2ufY,7255
+fugue/dataframe/dataframe.py,sha256=XOn9x_aKWOLdpach7RHxg2PoP-hpfsyjoRFvLx6xKV0,17496
+fugue/dataframe/dataframe_iterable_dataframe.py,sha256=lx71KfaI4lsVKI-79buc-idaeT20JEMBOq21SQcAiY8,7259
 fugue/dataframe/dataframes.py,sha256=tBSpHsENgbcdOJ0Jgst6PTKbjG7_uoFJch96oTlaQIs,4160
-fugue/dataframe/function_wrapper.py,sha256=DjyIrNBj5Bv9AwIl2I2fG5ClcKe3OoW_eBkyEabYY5Y,13505
+fugue/dataframe/function_wrapper.py,sha256=r6H1SQWaag2eSbJ50327t_bt7MZunbOMOl9OcOcQW2E,14827
 fugue/dataframe/iterable_dataframe.py,sha256=Kn5HZnVU4o1nn9mbbQxaV8rGG869wImZcOCK3AdlA-M,4627
 fugue/dataframe/pandas_dataframe.py,sha256=ZWqI-ZUFiSP7giJ3siRlrZcMedI_fyuoLn227H0YRvw,10453
 fugue/dataframe/utils.py,sha256=nQVU01jspB1NSeRiagE71uzRibDqvyGwi94ZfHwNHD0,10508
@@ -103,7 +104,7 @@ fugue_ibis/__init__.py,sha256=PcUt66KlLyGGicad7asq5j2U567_fhR0HzvWQBhV1VM,362
 fugue_ibis/_compat.py,sha256=zKdTaTfuC02eUIzZPkcd7oObnVBi_X5mQjQf7SDme3Y,246
 fugue_ibis/_utils.py,sha256=BUL5swA5FE4eQu0t5Z17hZVu9a2MFfxlFH6Ymy9xifg,6607
 fugue_ibis/dataframe.py,sha256=Y4Wn--oAlCvmqStY92AgUaAKqr9l6jSfJ2EXOhDFk9M,7302
-fugue_ibis/execution_engine.py,sha256=igZ_rHflORPKp2h8HMSlzuv50P3ZaaijsvWyc8gilow,18540
+fugue_ibis/execution_engine.py,sha256=p5zy0IBXiJgLi67RBHCRcHgZsaJMANdNSpUxz0k_6C0,18453
 fugue_ibis/extensions.py,sha256=H8l-SPfoqLuUoILtOuL2nccOpoL83zHeSoIhoqjtWQM,6905
 fugue_ibis/execution/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ibis/execution/ibis_engine.py,sha256=-HdPnIFWD83n5WITdzJiu4attH7GOcO041wkT5Y5ChA,1499
@@ -120,17 +121,17 @@ fugue_polars/polars_dataframe.py,sha256=Ll4ZUuRhAETWtmSf87KsdUCqZPiexFqy4FiPkvWQ
 fugue_polars/registry.py,sha256=gd6qQ-OxYtTAQFyvYbLDPXmSvCR-LW6n5K5ylgMY_7A,2950
 fugue_ray/__init__.py,sha256=HzEHfG2mpc0ugf3nf1Pdy15Bhg35K6maZpYejn1aoyI,119
 fugue_ray/_constants.py,sha256=vu5l1w-Wi-2V_nm0HLXKOYhh5HdWRCc5yQktO2XzhOg,569
-fugue_ray/dataframe.py,sha256=shUtnQbAquN2s5bR3Rx1QUGxxLz_g-Az9O0QDcXaCD0,10377
+fugue_ray/dataframe.py,sha256=u6X9OvCGZta6cjHvQ-CcK4xckXNvKHK3EEMHn8dFrHM,10577
 fugue_ray/execution_engine.py,sha256=PUj1Fgqsg-6DDFG9KNip7NenTX_bfYvBjcFjmXp8LNo,12596
 fugue_ray/registry.py,sha256=xJRAhbwNrg695EwghQDnVtTKi4YkqZ0_61BD4OAblSA,1685
 fugue_ray/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_ray/_utils/cluster.py,sha256=3T3Gyra6lAHlzktta-Ro35j6YZQfH6fNrj2hC5ATF9k,621
-fugue_ray/_utils/dataframe.py,sha256=xaw9Pbfjsnd_0vmbGRJJbAPUPWMZaYptw4LfDdIgvHo,3145
+fugue_ray/_utils/dataframe.py,sha256=gUgdmDws8gFm7YjYyISRDgRx_-ksxtQu6oSpwIGkxfM,4457
 fugue_ray/_utils/io.py,sha256=gHfx70tdXPHmVL6nHxVhmCO5KpWjjyDG8qKT1Lbpav4,8737
 fugue_spark/__init__.py,sha256=rvrMpFs9socMgyH_58gLbnAqmirBf5oidXoO4cekW6U,165
 fugue_spark/_constants.py,sha256=K2uLQfjvMxXk75K-7_Wn47Alpwq5rW57BtECAUrOeqA,177
 fugue_spark/dataframe.py,sha256=xoM2-SwVRFfSyfEEnx4g4b0GO6XgN-DQLvXtUrAMq1Q,9510
-fugue_spark/execution_engine.py,sha256=n_ZdQI4ysbgdDsjyWwbwhyBsgcg_icc9MAmVyWEn2RQ,32029
+fugue_spark/execution_engine.py,sha256=nssgfqt2h1OjGlK5iuGEyF-lt2pEokmsjrqo6K4C1Kg,32440
 fugue_spark/ibis_engine.py,sha256=Yl5xxwROo1idcD2hFaylaI1IpmBUgbvOZRWtcrE0Zjo,1697
 fugue_spark/registry.py,sha256=kyIMk6dAiKRSKCHawQKyXu9DhZ24T6j3gL57TiOAZ8c,4162
 fugue_spark/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -142,14 +143,14 @@ fugue_sql/__init__.py,sha256=Cmr7w0Efr7PzoXdQzdJfc4Dgqd69qKqcHZZodENq7EU,287
 fugue_sql/exceptions.py,sha256=ltS0MC8gMnVVrJbQiOZ0kRUWvVQ2LTx33dCW3ugqtb0,260
 fugue_test/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fugue_test/bag_suite.py,sha256=WbDCFjuAHYoJh4GXSPiSJxOoOwE1VMtYpJ3lQrsUK-Y,2483
-fugue_test/builtin_suite.py,sha256=piY_bVtLylQWrZbencrQwT89IXgvgBmNscddJ6vZieY,76338
+fugue_test/builtin_suite.py,sha256=sT_Btm7cpSsuLnt_PpNFeLZVc2WPmF5NQy4GKVri_-c,78396
 fugue_test/dataframe_suite.py,sha256=6uM7_et2Y55-ePIssG9G_K9mXBYNjPXnpunuCh0xKhw,19082
 fugue_test/execution_suite.py,sha256=HzM_7jUhnt1kD2jGaaPhyr-Q_vtRV4nCQTaLtMnzTuU,50948
 fugue_test/ibis_suite.py,sha256=Dk4AHVD00RcFsNm9VvJ4_4LOyFdGX30OnAtpO2SPruE,3529
-fugue_version/__init__.py,sha256=K0kGrhh1kzVisZcoSkeuJdC06rTwxufV05Vy2hOVGoo,22
-fugue-0.8.5.dev1.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-fugue-0.8.5.dev1.dist-info/METADATA,sha256=DA5pNJvD69RYFX-B8waTg-SCXrwzAVXj0zKPTUzajps,17870
-fugue-0.8.5.dev1.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
-fugue-0.8.5.dev1.dist-info/entry_points.txt,sha256=By9wWIZDuUtDISL9QiORS0_BzCDtYK7s_MZLtFuloFA,374
-fugue-0.8.5.dev1.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
-fugue-0.8.5.dev1.dist-info/RECORD,,
+fugue_version/__init__.py,sha256=VpASnrti7EGWxUfSWGgERUfe7NLJltfVXYosOzHbpPg,22
+fugue-0.8.6.dev2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+fugue-0.8.6.dev2.dist-info/METADATA,sha256=pGSp1pOq2BHvHd3KiRL3YzFHbNKzmKDMUPd8Hcs__xc,18284
+fugue-0.8.6.dev2.dist-info/WHEEL,sha256=AtBG6SXL3KF_v0NxLf0ehyVOh0cold-JbJYXNGorC6Q,92
+fugue-0.8.6.dev2.dist-info/entry_points.txt,sha256=Ta1DD9RIgS_YfhieUvM6PgAzuOYmhlnKe9fWbns9sLc,374
+fugue-0.8.6.dev2.dist-info/top_level.txt,sha256=y1eCfzGdQ1_RkgcShcfbvXs-bopD3DwJcIOxP9EFXno,140
+fugue-0.8.6.dev2.dist-info/RECORD,,
fugue-0.8.6.dev2.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: bdist_wheel (0.40.0)
+Generator: bdist_wheel (0.41.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
fugue-0.8.6.dev2.dist-info/entry_points.txt CHANGED
@@ -1,11 +1,11 @@
 [fugue.plugins]
 dask = fugue_dask.registry [dask]
-dask_ibis = fugue_dask.ibis_engine [ibis,dask]
+dask_ibis = fugue_dask.ibis_engine [dask,ibis]
 duckdb = fugue_duckdb.registry [duckdb]
-duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
+duckdb_ibis = fugue_duckdb.ibis_engine [duckdb,ibis]
 ibis = fugue_ibis [ibis]
 polars = fugue_polars.registry [polars]
 ray = fugue_ray.registry [ray]
 spark = fugue_spark.registry [spark]
-spark_ibis = fugue_spark.ibis_engine [spark,ibis]
+spark_ibis = fugue_spark.ibis_engine [ibis,spark]
 
fugue_ibis/execution_engine.py CHANGED
@@ -92,20 +92,19 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
+        if ibis.__version__ < "6":
+            suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
+        else:  # pragma: no cover
+            # breaking change in ibis 6.0
+            suffixes = dict(lname="", rname=_JOIN_RIGHT_SUFFIX)
         if how.lower() == "cross":
-            tb = _df1.native.cross_join(_df2.native, suffixes=("", _JOIN_RIGHT_SUFFIX))
+            tb = _df1.native.cross_join(_df2.native, **suffixes)
         elif how.lower() == "right_outer":
-            tb = _df2.native.left_join(
-                _df1.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df2.native.left_join(_df1.native, on_fields, **suffixes)
         elif how.lower() == "left_outer":
-            tb = _df1.native.left_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.left_join(_df2.native, on_fields, **suffixes)
         elif how.lower() == "full_outer":
-            tb = _df1.native.outer_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.outer_join(_df2.native, on_fields, **suffixes)
             cols: List[Any] = []
             for k in end_schema.names:
                 if k not in key_schema:
@@ -116,17 +115,11 @@ class IbisSQLEngine(SQLEngine):
                 )
             tb = tb[cols]
         elif how.lower() in ["semi", "left_semi"]:
-            tb = _df1.native.semi_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
-            tb = _df1.native.anti_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.anti_join(_df2.native, on_fields, **suffixes)
         else:
-            tb = _df1.native.inner_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.inner_join(_df2.native, on_fields, **suffixes)
         return self.to_df(tb[end_schema.names], schema=end_schema)
 
     def union(self, df1: DataFrame, df2: DataFrame, distinct: bool = True) -> DataFrame:
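
The version switch works around a breaking change in ibis 6.0, where the join-suffix keywords changed from a `suffixes` tuple to `lname`/`rname`. A standalone sketch, assuming an ibis version that provides `memtable`, with `"_r"` standing in for `_JOIN_RIGHT_SUFFIX`:

```python
import ibis

# Pick the right keyword set for the installed ibis version, using the
# same string comparison as the engine code above.
if ibis.__version__ < "6":
    join_kwargs = dict(suffixes=("", "_r"))
else:
    join_kwargs = dict(lname="", rname="_r")

t1 = ibis.memtable({"k": [1, 2], "v": [10, 20]})
t2 = ibis.memtable({"k": [1, 2], "v": [30, 40]})
joined = t1.inner_join(t2, t1.k == t2.k, **join_kwargs)
```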
fugue_ray/_utils/dataframe.py CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
 import pyarrow as pa
+import ray
 import ray.data as rd
 from triad import Schema
 
@@ -13,15 +14,51 @@ from .._constants import _ZERO_COPY
 _RAY_NULL_REPR = "__RAY_NULL__"
 
 
-def get_dataset_format(df: rd.Dataset) -> Optional[str]:
-    df.fully_executed()
+def is_materialized(df: rd.Dataset) -> bool:
+    if hasattr(rd.dataset, "MaterializedDataset"):
+        return isinstance(df, rd.dataset.MaterializedDataset)
+    return df.is_fully_executed()  # pragma: no cover
+
+
+def materialize(df: rd.Dataset) -> rd.Dataset:
+    if not is_materialized(df):
+        if hasattr(df, "materialize"):
+            df = df.materialize()
+        else:  # pragma: no cover
+            df = df.fully_executed()
+    return df
+
+
+def get_dataset_format(df: rd.Dataset) -> Tuple[Optional[str], rd.Dataset]:
+    df = materialize(df)
     if df.count() == 0:
-        return None
-    if hasattr(df, "_dataset_format"):  # pragma: no cover
-        return df._dataset_format()  # ray<2.2
-    ctx = rd.context.DatasetContext.get_current()
-    ctx.use_streaming_executor = False
-    return df.dataset_format()  # ray>=2.2
+        return None, df
+    if ray.__version__ < "2.5.0":  # pragma: no cover
+        if hasattr(df, "_dataset_format"):  # pragma: no cover
+            return df._dataset_format(), df  # ray<2.2
+        ctx = rd.context.DatasetContext.get_current()
+        ctx.use_streaming_executor = False
+        return df.dataset_format(), df  # ray>=2.2
+    else:
+        schema = df.schema(fetch_if_missing=True)
+        if schema is None:  # pragma: no cover
+            return None, df
+        if isinstance(schema.base_schema, pa.Schema):
+            return "arrow", df
+        return "pandas", df
+
+
+def to_schema(schema: Any) -> Schema:  # pragma: no cover
+    if isinstance(schema, pa.Schema):
+        return Schema(schema)
+    if ray.__version__ >= "2.5.0":
+        if isinstance(schema, rd.Schema):
+            if hasattr(schema, "base_schema") and isinstance(
+                schema.base_schema, pa.Schema
+            ):
+                return Schema(schema.base_schema)
+            return Schema(list(zip(schema.names, schema.types)))
+    raise ValueError(f"{schema} is not supported")
 
 
 def build_empty(schema: Schema) -> rd.Dataset:
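
`materialize` probes for the newer Ray API (`Dataset.materialize`, introduced alongside `MaterializedDataset`) and falls back to the deprecated `fully_executed`. A quick sketch of the same probe:

```python
import ray.data as rd

# Build a tiny dataset and materialize it with whichever API the
# installed Ray provides -- the same hasattr probe the helper uses.
ds = rd.from_items([{"a": 1}, {"a": 2}])
mds = ds.materialize() if hasattr(ds, "materialize") else ds.fully_executed()
print(type(mds).__name__)  # MaterializedDataset on ray>=2.5
```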
fugue_ray/dataframe.py CHANGED
@@ -18,7 +18,7 @@ from fugue.plugins import (
 )
 
 from ._constants import _ZERO_COPY
-from ._utils.dataframe import build_empty, get_dataset_format
+from ._utils.dataframe import build_empty, get_dataset_format, materialize, to_schema
 
 
 class RayDataFrame(DataFrame):
@@ -52,7 +52,7 @@ class RayDataFrame(DataFrame):
             self._native = build_empty(schema)
             return
         if isinstance(df, rd.Dataset):
-            fmt = get_dataset_format(df)
+            fmt, df = get_dataset_format(df)
             if fmt is None:  # empty
                 schema = _input_schema(schema).assert_not_empty()
                 super().__init__(schema)
@@ -62,7 +62,7 @@ class RayDataFrame(DataFrame):
                 rdf = rd.from_arrow_refs(df.to_arrow_refs())
             elif fmt == "arrow":
                 rdf = df
-            else:
+            else:  # pragma: no cover
                 raise NotImplementedError(
                     f"Ray Dataset in {fmt} format is not supported"
                 )
@@ -156,8 +156,7 @@ class RayDataFrame(DataFrame):
 
     def persist(self, **kwargs: Any) -> "RayDataFrame":
         # TODO: it mutates the dataframe, is this a good behavior
-        if not self.native.is_fully_executed():  # pragma: no cover
-            self.native.fully_executed()
+        self._native = materialize(self._native)
         return self
 
     def count(self) -> int:
@@ -226,11 +225,12 @@ class RayDataFrame(DataFrame):
     ) -> Tuple[rd.Dataset, Schema]:
         if internal_schema:
             return rdf, schema
-        if get_dataset_format(rdf) is None:  # empty
+        fmt, rdf = get_dataset_format(rdf)
+        if fmt is None:  # empty
             schema = _input_schema(schema).assert_not_empty()
             return build_empty(schema), schema
-        if schema is None or schema == rdf.schema(fetch_if_missing=True):
-            return rdf, rdf.schema(fetch_if_missing=True)
+        if schema is None or schema == to_schema(rdf.schema(fetch_if_missing=True)):
+            return rdf, to_schema(rdf.schema(fetch_if_missing=True))
 
         def _alter(table: pa.Table) -> pa.Table:  # pragma: no cover
             return ArrowDataFrame(table).alter_columns(schema).native  # type: ignore
@@ -263,12 +263,15 @@ def _rd_as_local(df: rd.Dataset) -> bool:
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
 def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]:
-    fmt = get_dataset_format(df)
-    if fmt == "pandas":
-        return list(df.schema(True).names)
-    elif fmt == "arrow":
-        return [f.name for f in df.schema(True)]
-    raise NotImplementedError(f"{fmt} is not supported")  # pragma: no cover
+    if hasattr(df, "columns"):  # higher version of ray
+        return df.columns(fetch_if_missing=True)
+    else:  # pragma: no cover
+        fmt, _ = get_dataset_format(df)
+        if fmt == "pandas":
+            return list(df.schema(True).names)
+        elif fmt == "arrow":
+            return df.schema(fetch_if_missing=True).names
+        raise NotImplementedError(f"{fmt} is not supported")  # pragma: no cover
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
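
The new fast path for column names relies on `Dataset.columns`, which newer Ray releases provide; a sketch of the probe, assuming such a release:

```python
import ray.data as rd

# On newer Ray, Dataset.columns(fetch_if_missing=True) returns the
# column names directly, avoiding a full format inspection.
ds = rd.from_items([{"a": 1, "b": "x"}])
if hasattr(ds, "columns"):
    print(ds.columns(fetch_if_missing=True))  # ['a', 'b']
```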
fugue_spark/execution_engine.py CHANGED
@@ -134,9 +134,8 @@ class SparkMapEngine(MapEngine):
     ) -> DataFrame:
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
-            # pandas udf can only be used for pyspark > 3
             if len(partition_spec.partition_by) > 0:
-                if partition_spec.algo == "coarse":
+                if partition_spec.algo in ["coarse", "even"]:
                     return self._map_by_pandas_udf(
                         df,
                         map_func=map_func,
@@ -145,7 +144,18 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-                elif partition_spec.algo != "even" or self.is_spark_connect:
+                else:
+                    if (  # not simple partitioning
+                        partition_spec.algo != "hash"
+                        or partition_spec.num_partitions != "0"
+                    ):
+                        # TODO: not sure if presort should be done
+                        # on physical partition level
+                        df = self.to_df(
+                            self.execution_engine.repartition(
+                                df, PartitionSpec(partition_spec, presort=[])
+                            )
+                        )
                     return self._group_map_by_pandas_udf(
                         df,
                         map_func=map_func,
@@ -154,7 +164,7 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-            elif len(partition_spec.partition_by) == 0:
+            else:
                 return self._map_by_pandas_udf(
                     df,
                     map_func=map_func,
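
The repartition branch derives a copy of the spec with its presort stripped. `PartitionSpec` accepts an existing spec plus keyword overrides, which is what makes the one-liner above possible; a small sketch, assuming fugue's public constructor semantics:

```python
from fugue import PartitionSpec

# Start from a spec that carries a presort, then derive one without
# it -- the same override idiom used before the physical repartition.
spec = PartitionSpec(by=["a"], presort="b desc", num=4)
no_presort = PartitionSpec(spec, presort=[])
print(no_presort.partition_by, no_presort.num_partitions)
```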
fugue_test/builtin_suite.py CHANGED
@@ -22,6 +22,7 @@ from triad import SerializableRLock
 
 import fugue.api as fa
 from fugue import (
+    AnyDataFrame,
     ArrayDataFrame,
     CoTransformer,
     DataFrame,
@@ -365,6 +366,12 @@ class BuiltInTests(object):
             dag.output(dict(df=a), using=mock_outputter2)
             a.partition(num=3).output(MockOutputter3)
             dag.output(dict(aa=a, bb=b), using=MockOutputter4)
+
+            a = dag.create(mock_creator2, params=dict(p=2))
+            b = dag.create(mock_creator2, params=dict(p=2))
+            c = dag.process(a, b, using=mock_processor4)
+            c.assert_eq(ArrayDataFrame([[2]], "a:int"))
+            dag.output(a, b, using=mock_outputter4)
             dag.run(self.engine)
 
     def test_zip(self):
@@ -435,9 +442,14 @@ class BuiltInTests(object):
         # this test is important for using mapInPandas in spark
 
         # schema: *,c:int
-        def mt_pandas(dfs: Iterable[pd.DataFrame]) -> Iterator[pd.DataFrame]:
+        def mt_pandas(
+            dfs: Iterable[pd.DataFrame], empty: bool = False
+        ) -> Iterator[pd.DataFrame]:
             for df in dfs:
-                yield df.assign(c=2)
+                if not empty:
+                    df = df.assign(c=2)
+                    df = df[reversed(list(df.columns))]
+                    yield df
 
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
@@ -445,10 +457,25 @@ class BuiltInTests(object):
             dag.df([[1, 2, 2], [3, 4, 2]], "a:int,b:int,c:int").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            dag.run(self.engine)
+
         # schema: *
-        def mt_arrow(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:
+        def mt_arrow(
+            dfs: Iterable[pa.Table], empty: bool = False
+        ) -> Iterator[pa.Table]:
             for df in dfs:
-                yield df
+                if not empty:
+                    df = df.select(reversed(df.schema.names))
+                    yield df
 
         # schema: a:long
         def mt_arrow_2(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:
@@ -463,6 +490,17 @@ class BuiltInTests(object):
             dag.df([[1], [3]], "a:long").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            dag.run(self.engine)
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")
@@ -1829,6 +1867,10 @@ def mock_creator(p: int) -> DataFrame:
     return ArrayDataFrame([[p]], "a:int")
 
 
+def mock_creator2(p: int) -> AnyDataFrame:
+    return fa.as_fugue_df([[p]], schema="a:int")
+
+
 def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:
     return ArrayDataFrame([[len(df1) + len(df2)]], "a:int")
 
@@ -1844,6 +1886,10 @@ class MockProcessor3(Processor):
         return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
 
 
+def mock_processor4(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
+    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")
+
+
 def mock_outputter(df1: List[List[Any]], df2: List[List[Any]]) -> None:
     assert len(df1) == len(df2)
 
@@ -1857,6 +1903,10 @@ class MockOutputter3(Outputter):
         assert "3" == self.partition_spec.num_partitions
 
 
+def mock_outputter4(df1: AnyDataFrame, df2: AnyDataFrame) -> None:
+    assert fa.count(df1) == fa.count(df2)
+
+
 class MockOutputter4(Outputter):
     def process(self, dfs):
         for k, v in dfs.items():
@@ -1895,8 +1945,8 @@ def mock_tf0(df: pd.DataFrame, p=1, col="p") -> pd.DataFrame:
 
 # schema: *,ct:int,p:int
 def mock_tf1(df: pd.DataFrame, p=1) -> pd.DataFrame:
-    df["ct"] = df.shape[0]
     df["p"] = p
+    df["ct"] = df.shape[0]
     return df
 
 
fugue_version/__init__.py CHANGED
@@ -1 +1 @@
-__version__ = "0.8.5"
+__version__ = "0.8.6"