fugue 0.8.5.dev1__tar.gz → 0.8.6.dev2__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/PKG-INFO +9 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/README.md +8 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe.py +7 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py +1 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/function_wrapper.py +38 -7
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/PKG-INFO +9 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/SOURCES.txt +1 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/entry_points.txt +2 -2
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/requires.txt +11 -11
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution_engine.py +12 -19
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/dataframe.py +45 -8
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/dataframe.py +17 -14
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/execution_engine.py +14 -4
- fugue-0.8.6.dev2/fugue_test/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/builtin_suite.py +55 -5
- fugue-0.8.6.dev2/fugue_version/__init__.py +1 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/setup.py +12 -12
- fugue-0.8.5.dev1/fugue_version/__init__.py +0 -1
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/LICENSE +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/display.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/exception.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/interfaceless.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/misc.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/_utils/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/array_bag.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/bag/bag.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/partition.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/sql.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/collections/yielded.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/expressions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/functions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/column/sql.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/array_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/arrow_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframes.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/iterable_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/pandas_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataset/dataset.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dev.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/exceptions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/factory.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/execution/native_execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/creators.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/outputters.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_builtins/processors.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/context.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/creator/creator.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/outputter/outputter.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/processor/processor.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/extensions/transformer/transformer.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/plugins.py +0 -0
- /fugue-0.8.5.dev1/fugue/sql/__init__.py → /fugue-0.8.6.dev2/fugue/py.typed +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/base.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/rpc/flask.py +0 -0
- {fugue-0.8.5.dev1/fugue_ibis/execution → fugue-0.8.6.dev2/fugue/sql}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/_visitors.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/sql/workflow.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_checkpoint.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_tasks.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/_workflow_context.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/api.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/input.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/module.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/workflow/workflow.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/dependency_links.txt +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/top_level.txt +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/contrib.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/seaborn/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_contrib/viz/_ext.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_constants.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_dask/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dask.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_duckdb/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_compat.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/dataframe.py +0 -0
- {fugue-0.8.5.dev1/fugue_notebook/nbextension → fugue-0.8.6.dev2/fugue_ibis/execution}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution/pandas_backend.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/extensions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/env.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/README.md +0 -0
- {fugue-0.8.5.dev1/fugue_ray/_utils → fugue-0.8.6.dev2/fugue_notebook/nbextension}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/description.yaml +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_notebook/nbextension/main.js +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/_utils.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/polars_dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_polars/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_constants.py +0 -0
- {fugue-0.8.5.dev1/fugue_spark → fugue-0.8.6.dev2/fugue_ray}/_utils/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/cluster.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/execution_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_constants.py +0 -0
- {fugue-0.8.5.dev1/fugue_test → fugue-0.8.6.dev2/fugue_spark/_utils}/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/convert.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/io.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/misc.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/_utils/partition.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/dataframe.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/ibis_engine.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/registry.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_sql/__init__.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_sql/exceptions.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/bag_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/dataframe_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/execution_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/ibis_suite.py +0 -0
- {fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/setup.cfg +0 -0
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.5.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team

@@ -152,9 +152,16 @@ Description: # Fugue
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/README.md

@@ -144,9 +144,16 @@ Fugue can be installed through pip or conda. For example:
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
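For context (not part of the diff): the snippet below sketches the kind of code that starts requiring the `sql` extra, assuming fugue 0.8.x's `fugue.api.fugue_sql`. Treat it as an illustrative sketch, not package documentation.

```python
# Illustrative sketch only: assumes `pip install "fugue[sql]"` so that the
# qpd / fugue-sql-antlr dependencies pulled in by the `sql` extra are present.
import pandas as pd
import fugue.api as fa

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Fugue SQL runs against named dataframes; without the `sql` extra this
# call would fail on the missing SQL parser dependencies.
res = fa.fugue_sql("SELECT a, b FROM df WHERE a > 1", df=df)
print(res)
```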
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe.py

@@ -113,6 +113,13 @@ class DataFrame(Dataset):
     def as_pandas(self) -> pd.DataFrame:
         """Convert to pandas DataFrame"""
         pdf = pd.DataFrame(self.as_array(), columns=self.columns)
+        if len(pdf) == 0:  # TODO: move to triad
+            return pd.DataFrame(
+                {
+                    k: pd.Series(dtype=v.type.to_pandas_dtype())
+                    for k, v in self.schema.items()
+                }
+            )
         return PD_UTILS.enforce_type(pdf, self.schema.pa_schema, null_safe=True)
 
     def as_arrow(self, type_safe: bool = False) -> pa.Table:
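The added branch changes what an empty DataFrame converts to: instead of building pandas frames from an empty array (which loses column types), it constructs typed empty columns from the Fugue schema. A hedged sketch of the effect, assuming fugue 0.8.6:

```python
# Sketch of the behavior the new branch provides (assumes fugue 0.8.6).
import fugue.api as fa

empty = fa.as_fugue_df([], schema="a:int,b:str")
pdf = empty.as_pandas()
# With the fix, the dtypes follow the schema instead of every column
# collapsing to object on the empty-array path.
print(pdf.dtypes)
```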
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py

@@ -165,7 +165,7 @@ class LocalDataFrameIterableDataFrame(LocalUnboundedDataFrame):
 
     def as_pandas(self) -> pd.DataFrame:
         if self.empty:
-            return
+            return PandasDataFrame(schema=self.schema).as_pandas()
 
         return pd.concat(df.as_pandas() for df in self.native)
 
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue/dataframe/function_wrapper.py

@@ -23,9 +23,10 @@ from triad.collections.function_wrapper import (
 from triad.utils.iter import EmptyAwareIterable, make_empty_aware
 
 from ..constants import FUGUE_ENTRYPOINT
+from ..dataset.api import count as df_count
 from .array_dataframe import ArrayDataFrame
 from .arrow_dataframe import ArrowDataFrame
-from .dataframe import DataFrame, LocalDataFrame
+from .dataframe import AnyDataFrame, DataFrame, LocalDataFrame, as_fugue_df
 from .dataframe_iterable_dataframe import (
     IterableArrowDataFrame,
     IterablePandasDataFrame,

@@ -172,6 +173,19 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())
 
 
+@fugue_annotated_param(AnyDataFrame)
+class _AnyDataFrameParam(DataFrameParam):
+    def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
+        return (
+            as_fugue_df(output)
+            if schema is None
+            else as_fugue_df(output, schema=schema)
+        )
+
+    def count(self, df: Any) -> int:
+        return df_count(df)
+
+
 @fugue_annotated_param(LocalDataFrame, "l", child_can_reuse_code=True)
 class LocalDataFrameParam(DataFrameParam):
     def to_input_data(self, df: DataFrame, ctx: Any) -> LocalDataFrame:

@@ -333,6 +347,9 @@ class _PandasParam(LocalDataFrameParam):
 
     @no_type_check
     def to_output_df(self, output: pd.DataFrame, schema: Any, ctx: Any) -> DataFrame:
+        _schema: Optional[Schema] = None if schema is None else Schema(schema)
+        if _schema is not None and _schema.names != list(output.columns):
+            output = output[_schema.names]
         return PandasDataFrame(output, schema)
 
     @no_type_check

@@ -361,8 +378,15 @@ class _IterablePandasParam(LocalDataFrameParam):
         self, output: Iterable[pd.DataFrame], schema: Any, ctx: Any
     ) -> DataFrame:
         def dfs():
+            _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-
+                if _schema is not None and _schema.names != list(df.columns):
+                    df = df[_schema.names]
+                yield PandasDataFrame(df, _schema)
+                has_return = True
+            if not has_return and _schema is not None:
+                yield PandasDataFrame(schema=_schema)
 
         return IterablePandasDataFrame(dfs())

@@ -381,7 +405,12 @@ class _PyArrowTableParam(LocalDataFrameParam):
 
     def to_output_df(self, output: Any, schema: Any, ctx: Any) -> DataFrame:
         assert isinstance(output, pa.Table)
-
+        adf: DataFrame = ArrowDataFrame(output)
+        if schema is not None:
+            _schema = Schema(schema)
+            if adf.schema != _schema:
+                adf = adf[_schema.names].alter_columns(_schema)
+        return adf
 
     def count(self, df: Any) -> int:  # pragma: no cover
         return df.count()

@@ -409,13 +438,15 @@ class _IterableArrowParam(LocalDataFrameParam):
     ) -> DataFrame:
         def dfs():
             _schema: Optional[Schema] = None if schema is None else Schema(schema)
+            has_return = False
             for df in output:
-                adf = ArrowDataFrame(df)
-                if _schema is not None and
-                    adf.schema == schema
-                ):
+                adf: DataFrame = ArrowDataFrame(df)
+                if _schema is not None and adf.schema != _schema:
                     adf = adf[_schema.names].alter_columns(_schema)
                 yield adf
+                has_return = True
+            if not has_return and _schema is not None:
+                yield ArrowDataFrame(schema=_schema)
 
         return IterableArrowDataFrame(dfs())
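The new `_AnyDataFrameParam` registration means extension functions can annotate inputs and outputs as `AnyDataFrame` and let Fugue convert whatever dataframe object flows through. A minimal sketch, mirroring the `mock_creator2`/`mock_processor4` test helpers added later in this diff (assumes fugue 0.8.6):

```python
# Minimal sketch of an AnyDataFrame-annotated extension (assumes fugue 0.8.6).
import fugue.api as fa
from fugue import AnyDataFrame, ArrayDataFrame, FugueWorkflow

def merge_counts(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
    # inputs arrive as dataframe objects that fa.count understands
    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")

with FugueWorkflow() as dag:
    a = dag.df([[0]], "a:int")
    b = dag.df([[1], [2]], "a:int")
    dag.process(a, b, using=merge_counts).show()
    dag.run()  # defaults to the native execution engine
```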
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fugue
-Version: 0.8.5.dev1
+Version: 0.8.6.dev2
 Summary: An abstraction layer for distributed computation
 Home-page: http://github.com/fugue-project/fugue
 Author: The Fugue Development Team

@@ -152,9 +152,16 @@ Description: # Fugue
 pip install fugue
 ```
 
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
 It also has the following installation extras:
 
-* **
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra is included in Fugue's core dependency so you don't need to install explicitly. **But for 0,9.0+, this becomes required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
 * **dask**: to support Dask as the ExecutionEngine.
 * **ray**: to support Ray as the ExecutionEngine.
 * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/entry_points.txt

@@ -2,10 +2,10 @@
 dask = fugue_dask.registry [dask]
 dask_ibis = fugue_dask.ibis_engine [dask,ibis]
 duckdb = fugue_duckdb.registry [duckdb]
-duckdb_ibis = fugue_duckdb.ibis_engine [duckdb
+duckdb_ibis = fugue_duckdb.ibis_engine [ibis,duckdb]
 ibis = fugue_ibis [ibis]
 polars = fugue_polars.registry [polars]
 ray = fugue_ray.registry [ray]
 spark = fugue_spark.registry [spark]
-spark_ibis = fugue_spark.ibis_engine [spark
+spark_ibis = fugue_spark.ibis_engine [ibis,spark]
 
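Aside (not from the diff): these are setuptools entry points with extras, and the fix makes `duckdb_ibis`/`spark_ibis` declare both of their required extras. A hypothetical way to inspect them at runtime; the group name (line 1 of the file) is elided from this hunk, so `fugue.plugins` below is an assumption:

```python
# Hypothetical inspection sketch; assumes Python 3.10+ importlib.metadata
# and that these entries live under the "fugue.plugins" group (the group
# header is not shown in the hunk above).
from importlib.metadata import entry_points

for ep in entry_points().select(group="fugue.plugins"):
    print(ep.name, "->", ep.value, ep.extras)
```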
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue.egg-info/requires.txt

@@ -1,8 +1,8 @@
-triad>=0.9.
+triad>=0.9.1
 adagio>=0.2.4
 pyarrow>=0.15.1
 pandas>=1.2.0
-qpd>=0.4.
+qpd>=0.4.4
 fugue-sql-antlr>=0.1.6
 sqlglot
 jinja2

@@ -12,8 +12,8 @@ sqlglot
 jinja2
 fugue-sql-antlr[cpp]>=0.1.6
 pyspark>=3.1.1
-ray[data]>=2.
-qpd[dask]>=0.4.
+ray[data]>=2.1.0
+qpd[dask]>=0.4.4
 notebook
 jupyterlab
 ipython>=7.10.0

@@ -26,20 +26,20 @@ dask[dataframe,distributed]
 ibis-framework>=2.1.1
 
 [all:python_version >= "3.8"]
-dask[dataframe,distributed]
-ibis-framework
+dask[dataframe,distributed]<2023.7.1,>=2022.9.0
+ibis-framework<6,>=3.2.0
 
 [cpp_sql_parser]
 fugue-sql-antlr[cpp]>=0.1.6
 
 [dask]
-qpd[dask]>=0.4.
+qpd[dask]>=0.4.4
 
 [dask:python_version < "3.8"]
 dask[dataframe,distributed]
 
 [dask:python_version >= "3.8"]
-dask[dataframe,distributed]
+dask[dataframe,distributed]<2023.7.1,>=2022.9.0
 
 [duckdb]
 duckdb>=0.5.0

@@ -52,7 +52,7 @@ numpy
 ibis-framework>=2.1.1
 
 [ibis:python_version >= "3.8"]
-ibis-framework
+ibis-framework<6,>=3.2.0
 
 [notebook]
 notebook

@@ -63,7 +63,7 @@ ipython>=7.10.0
 polars
 
 [ray]
-ray[data]>=2.
+ray[data]>=2.1.0
 duckdb>=0.5.0
 pyarrow>=6.0.1
 

@@ -71,7 +71,7 @@ pyarrow>=6.0.1
 pyspark>=3.1.1
 
 [sql]
-qpd>=0.4.
+qpd>=0.4.4
 fugue-sql-antlr>=0.1.6
 sqlglot
 jinja2
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ibis/execution_engine.py

@@ -92,20 +92,19 @@ class IbisSQLEngine(SQLEngine):
         _df2 = self.to_df(df2)
         key_schema, end_schema = get_join_schemas(_df1, _df2, how=how, on=on)
         on_fields = [_df1.native[k] == _df2.native[k] for k in key_schema]
+        if ibis.__version__ < "6":
+            suffixes: Dict[str, Any] = dict(suffixes=("", _JOIN_RIGHT_SUFFIX))
+        else:  # pragma: no cover
+            # breaking change in ibis 6.0
+            suffixes = dict(lname="", rname=_JOIN_RIGHT_SUFFIX)
         if how.lower() == "cross":
-            tb = _df1.native.cross_join(_df2.native, suffixes
+            tb = _df1.native.cross_join(_df2.native, **suffixes)
         elif how.lower() == "right_outer":
-            tb = _df2.native.left_join(
-                _df1.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df2.native.left_join(_df1.native, on_fields, **suffixes)
         elif how.lower() == "left_outer":
-            tb = _df1.native.left_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.left_join(_df2.native, on_fields, **suffixes)
         elif how.lower() == "full_outer":
-            tb = _df1.native.outer_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.outer_join(_df2.native, on_fields, **suffixes)
         cols: List[Any] = []
         for k in end_schema.names:
             if k not in key_schema:

@@ -116,17 +115,11 @@ class IbisSQLEngine(SQLEngine):
                 )
             tb = tb[cols]
         elif how.lower() in ["semi", "left_semi"]:
-            tb = _df1.native.semi_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.semi_join(_df2.native, on_fields, **suffixes)
         elif how.lower() in ["anti", "left_anti"]:
-            tb = _df1.native.anti_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.anti_join(_df2.native, on_fields, **suffixes)
         else:
-            tb = _df1.native.inner_join(
-                _df2.native, on_fields, suffixes=("", _JOIN_RIGHT_SUFFIX)
-            )
+            tb = _df1.native.inner_join(_df2.native, on_fields, **suffixes)
         return self.to_df(tb[end_schema.names], schema=end_schema)
 
     def union(self, df1: DataFrame, df2: DataFrame, distinct: bool = True) -> DataFrame:
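The version gate reflects a breaking change in ibis 6.0, where join methods replaced the `suffixes=` tuple with `lname=`/`rname=` name templates; building the kwargs dict once keeps each join call on one line. A hedged sketch of the two call styles (simplified; `"_right"` stands in for fugue's `_JOIN_RIGHT_SUFFIX`):

```python
# Sketch of the ibis API difference handled above; ibis.memtable needs a
# reasonably recent ibis, and the suffix string here is illustrative.
import ibis

t1 = ibis.memtable({"a": [1, 2], "b": ["x", "y"]})
t2 = ibis.memtable({"a": [1, 2], "b": ["u", "v"]})

if ibis.__version__ < "6":
    # pre-6.0: overlapping columns are disambiguated via a suffixes tuple
    joined = t1.inner_join(t2, t1.a == t2.a, suffixes=("", "_right"))
else:
    # 6.0+: lname/rname templates replace the suffixes tuple
    joined = t1.inner_join(t2, t1.a == t2.a, lname="", rname="{name}_right")
```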
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/_utils/dataframe.py

@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Tuple
 
 import pandas as pd
 import pyarrow as pa
+import ray
 import ray.data as rd
 from triad import Schema
 

@@ -13,15 +14,51 @@ from .._constants import _ZERO_COPY
 _RAY_NULL_REPR = "__RAY_NULL__"
 
 
-def
-
+def is_materialized(df: rd.Dataset) -> bool:
+    if hasattr(rd.dataset, "MaterializedDataset"):
+        return isinstance(df, rd.dataset.MaterializedDataset)
+    return df.is_fully_executed()  # pragma: no cover
+
+
+def materialize(df: rd.Dataset) -> rd.Dataset:
+    if not is_materialized(df):
+        if hasattr(df, "materialize"):
+            df = df.materialize()
+        else:  # pragma: no cover
+            df = df.fully_executed()
+    return df
+
+
+def get_dataset_format(df: rd.Dataset) -> Tuple[Optional[str], rd.Dataset]:
+    df = materialize(df)
     if df.count() == 0:
-        return None
-    if
-
-
-
-
+        return None, df
+    if ray.__version__ < "2.5.0":  # pragma: no cover
+        if hasattr(df, "_dataset_format"):  # pragma: no cover
+            return df._dataset_format(), df  # ray<2.2
+        ctx = rd.context.DatasetContext.get_current()
+        ctx.use_streaming_executor = False
+        return df.dataset_format(), df  # ray>=2.2
+    else:
+        schema = df.schema(fetch_if_missing=True)
+        if schema is None:  # pragma: no cover
+            return None, df
+        if isinstance(schema.base_schema, pa.Schema):
+            return "arrow", df
+        return "pandas", df
+
+
+def to_schema(schema: Any) -> Schema:  # pragma: no cover
+    if isinstance(schema, pa.Schema):
+        return Schema(schema)
+    if ray.__version__ >= "2.5.0":
+        if isinstance(schema, rd.Schema):
+            if hasattr(schema, "base_schema") and isinstance(
+                schema.base_schema, pa.Schema
+            ):
+                return Schema(schema.base_schema)
+            return Schema(list(zip(schema.names, schema.types)))
+    raise ValueError(f"{schema} is not supported")
 
 
 def build_empty(schema: Schema) -> rd.Dataset:
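These helpers paper over Ray API churn: `fully_executed()`/`is_fully_executed()` were replaced by `materialize()` and `MaterializedDataset` in newer Ray releases, and the dataset-format probe differs before and after Ray 2.5 (per the version checks above), so the code feature-detects instead of pinning. A hedged usage sketch; `fugue_ray._utils` is a private module, so the import path is illustrative:

```python
# Illustrative only: the underscored module path may change between releases.
import ray.data as rd
from fugue_ray._utils.dataframe import get_dataset_format, materialize

ds = rd.from_items([{"a": 1}, {"a": 2}])
ds = materialize(ds)              # no-op when already materialized
fmt, ds = get_dataset_format(ds)  # also hands back the materialized dataset
print(fmt)                        # "arrow" or "pandas" (None when empty)
```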
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_ray/dataframe.py

@@ -18,7 +18,7 @@ from fugue.plugins import (
 )
 
 from ._constants import _ZERO_COPY
-from ._utils.dataframe import build_empty, get_dataset_format
+from ._utils.dataframe import build_empty, get_dataset_format, materialize, to_schema
 
 
 class RayDataFrame(DataFrame):

@@ -52,7 +52,7 @@ class RayDataFrame(DataFrame):
             self._native = build_empty(schema)
             return
         if isinstance(df, rd.Dataset):
-            fmt = get_dataset_format(df)
+            fmt, df = get_dataset_format(df)
             if fmt is None:  # empty:
                 schema = _input_schema(schema).assert_not_empty()
                 super().__init__(schema)

@@ -62,7 +62,7 @@ class RayDataFrame(DataFrame):
                 rdf = rd.from_arrow_refs(df.to_arrow_refs())
             elif fmt == "arrow":
                 rdf = df
-            else:
+            else:  # pragma: no cover
                 raise NotImplementedError(
                     f"Ray Dataset in {fmt} format is not supported"
                 )

@@ -156,8 +156,7 @@ class RayDataFrame(DataFrame):
 
     def persist(self, **kwargs: Any) -> "RayDataFrame":
         # TODO: it mutates the dataframe, is this a good bahavior
-
-        self.native.fully_executed()
+        self._native = materialize(self._native)
         return self
 
     def count(self) -> int:

@@ -226,11 +225,12 @@ class RayDataFrame(DataFrame):
     ) -> Tuple[rd.Dataset, Schema]:
         if internal_schema:
             return rdf, schema
-
+        fmt, rdf = get_dataset_format(rdf)
+        if fmt is None:  # empty
             schema = _input_schema(schema).assert_not_empty()
             return build_empty(schema), schema
-        if schema is None or schema == rdf.schema(fetch_if_missing=True):
-            return rdf, rdf.schema(fetch_if_missing=True)
+        if schema is None or schema == to_schema(rdf.schema(fetch_if_missing=True)):
+            return rdf, to_schema(rdf.schema(fetch_if_missing=True))
 
         def _alter(table: pa.Table) -> pa.Table:  # pragma: no cover
             return ArrowDataFrame(table).alter_columns(schema).native  # type: ignore

@@ -263,12 +263,15 @@ def _rd_as_local(df: rd.Dataset) -> bool:
 
 @get_column_names.candidate(lambda df: isinstance(df, rd.Dataset))
 def _get_ray_dataframe_columns(df: rd.Dataset) -> List[Any]:
-
-
-
-
-
-
+    if hasattr(df, "columns"):  # higher version of ray
+        return df.columns(fetch_if_missing=True)
+    else:  # pragma: no cover
+        fmt, _ = get_dataset_format(df)
+        if fmt == "pandas":
+            return list(df.schema(True).names)
+        elif fmt == "arrow":
+            return df.schema(fetch_if_missing=True).names
+        raise NotImplementedError(f"{fmt} is not supported")  # pragma: no cover
 
 
 @rename.candidate(lambda df, *args, **kwargs: isinstance(df, rd.Dataset))
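Downstream effect: `get_dataset_format` now hands back the materialized dataset, so `RayDataFrame` reuses it instead of re-executing, and `persist()` rebinds `_native` through `materialize`. A sketch under the assumption that `fugue[ray]` 0.8.6 is installed:

```python
# Sketch only; assumes ray and fugue[ray] 0.8.6.
import ray.data as rd
from fugue_ray import RayDataFrame

ds = rd.from_items([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
fdf = RayDataFrame(ds)  # format detection materializes once and reuses it
print(fdf.schema)       # derived via to_schema from the Ray schema object
fdf = fdf.persist()     # backed by materialize() instead of fully_executed()
```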
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_spark/execution_engine.py

@@ -134,9 +134,8 @@ class SparkMapEngine(MapEngine):
     ) -> DataFrame:
         output_schema = Schema(output_schema)
         if self._should_use_pandas_udf(output_schema):
-            # pandas udf can only be used for pyspark > 3
             if len(partition_spec.partition_by) > 0:
-                if partition_spec.algo
+                if partition_spec.algo in ["coarse", "even"]:
                     return self._map_by_pandas_udf(
                         df,
                         map_func=map_func,

@@ -145,7 +144,18 @@ class SparkMapEngine(MapEngine):
                         on_init=on_init,
                         map_func_format_hint=map_func_format_hint,
                     )
-
+                else:
+                    if (  # not simple partitioning
+                        partition_spec.algo != "hash"
+                        or partition_spec.num_partitions != "0"
+                    ):
+                        # TODO: not sure if presort should be done
+                        # on physical partition level
+                        df = self.to_df(
+                            self.execution_engine.repartition(
+                                df, PartitionSpec(partition_spec, presort=[])
+                            )
+                        )
                 return self._group_map_by_pandas_udf(
                     df,
                     map_func=map_func,

@@ -154,7 +164,7 @@ class SparkMapEngine(MapEngine):
                     on_init=on_init,
                     map_func_format_hint=map_func_format_hint,
                 )
-
+        else:
             return self._map_by_pandas_udf(
                 df,
                 map_func=map_func,
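In practice the branch choice is driven by the partition spec: `coarse`/`even` algos take the `mapInPandas`-style route (`_map_by_pandas_udf`), anything else goes through group mapping (`_group_map_by_pandas_udf`), with an explicit repartition first unless the spec is plain hash with the default partition count. A hedged sketch of specs that exercise each path:

```python
# Sketch of partition specs hitting each path (assumes fugue[spark]).
import pandas as pd
import fugue.api as fa

def add_ct(df: pd.DataFrame) -> pd.DataFrame:
    return df.assign(ct=df.shape[0])

df = pd.DataFrame({"a": [0, 0, 1], "b": [1, 2, 3]})

# "coarse" (or "even") algo -> the mapInPandas path
fa.transform(df, add_ct, schema="*,ct:int",
             partition={"by": ["a"], "algo": "coarse"}, engine="spark")

# explicit partition count -> repartition + group mapping path
fa.transform(df, add_ct, schema="*,ct:int",
             partition={"by": ["a"], "num": 16}, engine="spark")
```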
fugue-0.8.6.dev2/fugue_test/__init__.py (new empty file) — File without changes
{fugue-0.8.5.dev1 → fugue-0.8.6.dev2}/fugue_test/builtin_suite.py

@@ -22,6 +22,7 @@ from triad import SerializableRLock
 
 import fugue.api as fa
 from fugue import (
+    AnyDataFrame,
     ArrayDataFrame,
     CoTransformer,
     DataFrame,

@@ -365,6 +366,12 @@ class BuiltInTests(object):
             dag.output(dict(df=a), using=mock_outputter2)
             a.partition(num=3).output(MockOutputter3)
             dag.output(dict(aa=a, bb=b), using=MockOutputter4)
+
+            a = dag.create(mock_creator2, params=dict(p=2))
+            b = dag.create(mock_creator2, params=dict(p=2))
+            c = dag.process(a, b, using=mock_processor4)
+            c.assert_eq(ArrayDataFrame([[2]], "a:int"))
+            dag.output(a, b, using=mock_outputter4)
             dag.run(self.engine)
 
     def test_zip(self):

@@ -435,9 +442,14 @@ class BuiltInTests(object):
         # this test is important for using mapInPandas in spark
 
         # schema: *,c:int
-        def mt_pandas(
+        def mt_pandas(
+            dfs: Iterable[pd.DataFrame], empty: bool = False
+        ) -> Iterator[pd.DataFrame]:
             for df in dfs:
-
+                if not empty:
+                    df = df.assign(c=2)
+                    df = df[reversed(list(df.columns))]
+                    yield df
 
         with FugueWorkflow() as dag:
             a = dag.df([[1, 2], [3, 4]], "a:int,b:int")

@@ -445,10 +457,25 @@ class BuiltInTests(object):
             dag.df([[1, 2, 2], [3, 4, 2]], "a:int,b:int,c:int").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_pandas, params=dict(empty=True))
+            dag.df([], "a:int,b:int,c:int").assert_eq(b)
+            dag.run(self.engine)
+
         # schema: *
-        def mt_arrow(
+        def mt_arrow(
+            dfs: Iterable[pa.Table], empty: bool = False
+        ) -> Iterator[pa.Table]:
             for df in dfs:
-
+                if not empty:
+                    df = df.select(reversed(df.schema.names))
+                    yield df
 
         # schema: a:long
         def mt_arrow_2(dfs: Iterable[pa.Table]) -> Iterator[pa.Table]:

@@ -463,6 +490,17 @@ class BuiltInTests(object):
             dag.df([[1], [3]], "a:long").assert_eq(b)
             dag.run(self.engine)
 
+        # when iterable returns nothing
+        with FugueWorkflow() as dag:
+            a = dag.df([[1, 2], [3, 4]], "a:int,b:int")
+            # without partitioning
+            b = a.transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            # with partitioning
+            b = a.partition_by("a").transform(mt_arrow, params=dict(empty=True))
+            dag.df([], "a:int,b:int").assert_eq(b)
+            dag.run(self.engine)
+
     def test_transform_binary(self):
         with FugueWorkflow() as dag:
             a = dag.df([[1, pickle.dumps([0, "a"])]], "a:int,b:bytes")

@@ -1829,6 +1867,10 @@ def mock_creator(p: int) -> DataFrame:
     return ArrayDataFrame([[p]], "a:int")
 
 
+def mock_creator2(p: int) -> AnyDataFrame:
+    return fa.as_fugue_df([[p]], schema="a:int")
+
+
 def mock_processor(df1: List[List[Any]], df2: List[List[Any]]) -> DataFrame:
     return ArrayDataFrame([[len(df1) + len(df2)]], "a:int")
 

@@ -1844,6 +1886,10 @@ class MockProcessor3(Processor):
         return ArrayDataFrame([[sum(s.count() for s in dfs.values())]], "a:int")
 
 
+def mock_processor4(df1: AnyDataFrame, df2: AnyDataFrame) -> AnyDataFrame:
+    return ArrayDataFrame([[fa.count(df1) + fa.count(df2)]], "a:int")
+
+
 def mock_outputter(df1: List[List[Any]], df2: List[List[Any]]) -> None:
     assert len(df1) == len(df2)
 

@@ -1857,6 +1903,10 @@ class MockOutputter3(Outputter):
         assert "3" == self.partition_spec.num_partitions
 
 
+def mock_outputter4(df1: AnyDataFrame, df2: AnyDataFrame) -> None:
+    assert fa.count(df1) == fa.count(df2)
+
+
 class MockOutputter4(Outputter):
     def process(self, dfs):
         for k, v in dfs.items():

@@ -1895,8 +1945,8 @@ def mock_tf0(df: pd.DataFrame, p=1, col="p") -> pd.DataFrame:
 
 # schema: *,ct:int,p:int
 def mock_tf1(df: pd.DataFrame, p=1) -> pd.DataFrame:
-    df["ct"] = df.shape[0]
     df["p"] = p
+    df["ct"] = df.shape[0]
    return df
 
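The new tests pin down a subtle contract: an `Iterable` → `Iterator` transformer may yield nothing, and the declared output schema is what lets the engine still produce a correctly typed empty result (via the `has_return` fallbacks added to `function_wrapper.py` above). A hedged standalone sketch of the same behavior, assuming fugue 0.8.6:

```python
# Sketch of the empty-iterable contract (assumes fugue 0.8.6).
from typing import Iterable, Iterator
import pandas as pd
import fugue.api as fa

# schema: *,c:int
def keep_large(dfs: Iterable[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    for df in dfs:
        sub = df[df["a"] > 100]  # may filter out every row
        if len(sub) > 0:
            yield sub.assign(c=2)

df = pd.DataFrame({"a": [1, 3], "b": [2, 4]})
out = fa.transform(df, keep_large)  # the generator yields nothing here
print(out)  # empty, but with columns a, b, c typed per the schema hint
```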
fugue-0.8.6.dev2/fugue_version/__init__.py (new file)

@@ -0,0 +1 @@
+__version__ = "0.8.6"