fugue 0.9.0.dev4.tar.gz → 0.9.2.tar.gz
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- fugue-0.9.2/PKG-INFO +370 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/io.py +14 -2
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/function_wrapper.py +112 -18
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/transformer/convert.py +4 -4
- fugue-0.9.2/fugue.egg-info/PKG-INFO +370 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue.egg-info/SOURCES.txt +1 -0
- fugue-0.9.2/fugue.egg-info/entry_points.txt +11 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue.egg-info/requires.txt +8 -11
- fugue-0.9.2/fugue_dask/_dask_sql_wrapper.py +76 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/_utils.py +9 -5
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/dataframe.py +1 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/execution_engine.py +8 -11
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/dataframe.py +5 -5
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/execution_engine.py +1 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ibis/execution_engine.py +7 -6
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/_utils/io.py +23 -16
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_utils/convert.py +18 -12
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_utils/misc.py +1 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/builtin_suite.py +38 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/execution_suite.py +2 -0
- fugue-0.9.2/fugue_version/__init__.py +1 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/setup.cfg +1 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/setup.py +10 -10
- fugue-0.9.0.dev4/PKG-INFO +0 -308
- fugue-0.9.0.dev4/fugue.egg-info/PKG-INFO +0 -308
- fugue-0.9.0.dev4/fugue.egg-info/entry_points.txt +0 -12
- fugue-0.9.0.dev4/fugue_version/__init__.py +0 -1
- {fugue-0.9.0.dev4 → fugue-0.9.2}/LICENSE +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/README.md +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/display.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/exception.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/interfaceless.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/misc.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/bag/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/bag/array_bag.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/bag/bag.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/collections/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/collections/partition.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/collections/sql.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/collections/yielded.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/column/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/column/expressions.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/column/functions.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/column/sql.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/constants.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/array_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/arrow_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/dataframe_iterable_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/dataframes.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/iterable_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/pandas_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataset/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataset/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataset/dataset.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dev.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/exceptions.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/execution/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/execution/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/execution/execution_engine.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/execution/factory.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/execution/native_execution_engine.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/_builtins/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/_builtins/creators.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/_builtins/outputters.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/_builtins/processors.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/_utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/context.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/creator/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/creator/convert.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/creator/creator.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/outputter/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/outputter/convert.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/outputter/outputter.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/processor/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/processor/convert.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/processor/processor.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/transformer/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/transformer/constants.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/transformer/transformer.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/plugins.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/py.typed +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/rpc/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/rpc/base.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/rpc/flask.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/sql/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/sql/_utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/sql/_visitors.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/sql/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/sql/workflow.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/test/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/test/pandas_tester.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/test/plugins.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/_checkpoint.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/_tasks.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/_workflow_context.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/api.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/input.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/module.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/workflow/workflow.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue.egg-info/dependency_links.txt +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue.egg-info/top_level.txt +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_contrib/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_contrib/contrib.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_contrib/seaborn/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_contrib/viz/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_contrib/viz/_ext.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/_constants.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/_io.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_dask/tester.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/_io.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/_utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/dask.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_duckdb/tester.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ibis/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ibis/_compat.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ibis/_utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ibis/dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/env.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/nbextension/README.md +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/nbextension/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/nbextension/description.yaml +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_notebook/nbextension/main.js +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_polars/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_polars/_utils.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_polars/polars_dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_polars/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/_constants.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/_utils/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/_utils/cluster.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/_utils/dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/execution_engine.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_ray/tester.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_constants.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_utils/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_utils/io.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/_utils/partition.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/dataframe.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/execution_engine.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/registry.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_spark/tester.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_sql/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_sql/exceptions.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/__init__.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/bag_suite.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/dataframe_suite.py +0 -0
- {fugue-0.9.0.dev4 → fugue-0.9.2}/fugue_test/fixtures.py +0 -0
fugue-0.9.2/PKG-INFO ADDED
@@ -0,0 +1,370 @@
+Metadata-Version: 2.4
+Name: fugue
+Version: 0.9.2
+Summary: An abstraction layer for distributed computation
+Home-page: http://github.com/fugue-project/fugue
+Author: The Fugue Development Team
+Author-email: hello@fugue.ai
+License: Apache-2.0
+Keywords: distributed spark dask ray sql dsl domain specific language
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: triad>=1.0.0
+Requires-Dist: adagio>=0.2.6
+Provides-Extra: sql
+Requires-Dist: qpd>=0.4.4; extra == "sql"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "sql"
+Requires-Dist: sqlglot; extra == "sql"
+Requires-Dist: jinja2; extra == "sql"
+Provides-Extra: cpp-sql-parser
+Requires-Dist: fugue-sql-antlr[cpp]>=0.2.0; extra == "cpp-sql-parser"
+Provides-Extra: spark
+Requires-Dist: pyspark>=3.1.1; extra == "spark"
+Provides-Extra: dask
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "dask"
+Requires-Dist: pyarrow>=7.0.0; extra == "dask"
+Requires-Dist: pandas>=2.0.2; extra == "dask"
+Provides-Extra: ray
+Requires-Dist: ray[data]>=2.30.0; extra == "ray"
+Requires-Dist: duckdb>=0.5.0; extra == "ray"
+Requires-Dist: pyarrow>=7.0.0; extra == "ray"
+Requires-Dist: pandas<2.2; extra == "ray"
+Provides-Extra: duckdb
+Requires-Dist: qpd>=0.4.4; extra == "duckdb"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "duckdb"
+Requires-Dist: sqlglot; extra == "duckdb"
+Requires-Dist: jinja2; extra == "duckdb"
+Requires-Dist: duckdb>=0.5.0; extra == "duckdb"
+Requires-Dist: numpy; extra == "duckdb"
+Provides-Extra: polars
+Requires-Dist: polars; extra == "polars"
+Provides-Extra: ibis
+Requires-Dist: qpd>=0.4.4; extra == "ibis"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "ibis"
+Requires-Dist: sqlglot; extra == "ibis"
+Requires-Dist: jinja2; extra == "ibis"
+Requires-Dist: ibis-framework[pandas]; extra == "ibis"
+Requires-Dist: pandas<2.2; extra == "ibis"
+Provides-Extra: notebook
+Requires-Dist: notebook; extra == "notebook"
+Requires-Dist: jupyterlab; extra == "notebook"
+Requires-Dist: ipython>=7.10.0; extra == "notebook"
+Provides-Extra: all
+Requires-Dist: qpd>=0.4.4; extra == "all"
+Requires-Dist: fugue-sql-antlr>=0.2.0; extra == "all"
+Requires-Dist: sqlglot; extra == "all"
+Requires-Dist: jinja2; extra == "all"
+Requires-Dist: pyspark>=3.1.1; extra == "all"
+Requires-Dist: dask[dataframe,distributed]>=2024.4.0; extra == "all"
+Requires-Dist: dask-sql; extra == "all"
+Requires-Dist: ray[data]>=2.30.0; extra == "all"
+Requires-Dist: notebook; extra == "all"
+Requires-Dist: jupyterlab; extra == "all"
+Requires-Dist: ipython>=7.10.0; extra == "all"
+Requires-Dist: duckdb>=0.5.0; extra == "all"
+Requires-Dist: pyarrow>=6.0.1; extra == "all"
+Requires-Dist: pandas<2.2,>=2.0.2; extra == "all"
+Requires-Dist: ibis-framework[duckdb,pandas]; extra == "all"
+Requires-Dist: polars; extra == "all"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: keywords
+Dynamic: license
+Dynamic: license-file
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Fugue
+
+[](https://pypi.python.org/pypi/fugue/)
+[](https://pypi.python.org/pypi/fugue/)
+[](https://pypi.python.org/pypi/fugue/)
+[](https://codecov.io/gh/fugue-project/fugue)
+[](https://www.codacy.com/gh/fugue-project/fugue/dashboard?utm_source=github.com&utm_medium=referral&utm_content=fugue-project/fugue&utm_campaign=Badge_Grade)
+[](https://pepy.tech/project/fugue)
+
+| Tutorials | API Documentation | Chat with us on slack! |
+| --- | --- | --- |
+| [](https://fugue-tutorials.readthedocs.io/) | [](https://fugue.readthedocs.org) | [](http://slack.fugue.ai) |
+
+
+**Fugue is a unified interface for distributed computing that lets users execute Python, Pandas, and SQL code on Spark, Dask, and Ray with minimal rewrites**.
+
+Fugue is most commonly used for:
+
+* **Parallelizing or scaling existing Python and Pandas code** by bringing it to Spark, Dask, or Ray with minimal rewrites.
+* Using [FugueSQL](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes_sql.html) to **define end-to-end workflows** on top of Pandas, Spark, and Dask DataFrames. FugueSQL is an enhanced SQL interface that can invoke Python code.
+
+To see how Fugue compares to other frameworks like dbt, Arrow, Ibis, and PySpark Pandas, see the [comparisons](https://fugue-tutorials.readthedocs.io/#how-does-fugue-compare-to).
+
+## [Fugue API](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html)
+
+The Fugue API is a collection of functions that are capable of running on Pandas, Spark, Dask, and Ray. The simplest way to use Fugue is the [`transform()` function](https://fugue-tutorials.readthedocs.io/tutorials/beginner/transform.html). This lets users parallelize the execution of a single function by bringing it to Spark, Dask, or Ray. In the example below, the `map_letter_to_food()` function takes in a mapping and applies it on a column. This is just Pandas and Python so far (without Fugue).
+
+```python
+import pandas as pd
+from typing import Dict
+
+input_df = pd.DataFrame({"id":[0,1,2], "value": (["A", "B", "C"])})
+map_dict = {"A": "Apple", "B": "Banana", "C": "Carrot"}
+
+def map_letter_to_food(df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
+    df["value"] = df["value"].map(mapping)
+    return df
+```
+
+Now, the `map_letter_to_food()` function is brought to the Spark execution engine by invoking the `transform()` function of Fugue. The output `schema` and `params` are passed to the `transform()` call. The `schema` is needed because it's a requirement for distributed frameworks. A schema of `"*"` below means all input columns are in the output.
+
+```python
+from pyspark.sql import SparkSession
+from fugue import transform
+
+spark = SparkSession.builder.getOrCreate()
+sdf = spark.createDataFrame(input_df)
+
+out = transform(sdf,
+                map_letter_to_food,
+                schema="*",
+                params=dict(mapping=map_dict),
+                )
+# out is a Spark DataFrame
+out.show()
+```
+```rst
++---+------+
+| id| value|
++---+------+
+|  0| Apple|
+|  1|Banana|
+|  2|Carrot|
++---+------+
+```
+
+<details>
+<summary>PySpark equivalent of Fugue transform()</summary>
+
+```python
+from typing import Iterator, Union
+from pyspark.sql.types import StructType
+from pyspark.sql import DataFrame, SparkSession
+
+spark_session = SparkSession.builder.getOrCreate()
+
+def mapping_wrapper(dfs: Iterator[pd.DataFrame], mapping):
+    for df in dfs:
+        yield map_letter_to_food(df, mapping)
+
+def run_map_letter_to_food(input_df: Union[DataFrame, pd.DataFrame], mapping):
+    # conversion
+    if isinstance(input_df, pd.DataFrame):
+        sdf = spark_session.createDataFrame(input_df.copy())
+    else:
+        sdf = input_df.copy()
+
+    schema = StructType(list(sdf.schema.fields))
+    return sdf.mapInPandas(lambda dfs: mapping_wrapper(dfs, mapping),
+                           schema=schema)
+
+result = run_map_letter_to_food(input_df, map_dict)
+result.show()
+```
+</details>
+
+This syntax is simpler, cleaner, and more maintainable than the PySpark equivalent. At the same time, no edits were made to the original Pandas-based function to bring it to Spark. It is still usable on Pandas DataFrames. Fugue `transform()` also supports Dask and Ray as execution engines alongside the default Pandas-based engine.
+
+The Fugue API has a broader collection of functions that are also compatible with Spark, Dask, and Ray. For example, we can use `load()` and `save()` to create an end-to-end workflow compatible with Spark, Dask, and Ray. For the full list of functions, see the [Top Level API](https://fugue.readthedocs.io/en/latest/top_api.html).
+
+```python
+import fugue.api as fa
+
+def run(engine=None):
+    with fa.engine_context(engine):
+        df = fa.load("/path/to/file.parquet")
+        out = fa.transform(df, map_letter_to_food, schema="*")
+        fa.save(out, "/path/to/output_file.parquet")
+
+run()                 # runs on Pandas
+run(engine="spark")   # runs on Spark
+run(engine="dask")    # runs on Dask
+```
+
+All functions underneath the context will run on the specified backend. This makes it easy to toggle between local and distributed execution.
+
+## [FugueSQL](https://fugue-tutorials.readthedocs.io/tutorials/fugue_sql/index.html)
+
+FugueSQL is a SQL-based language capable of expressing end-to-end data workflows on top of Pandas, Spark, and Dask. The `map_letter_to_food()` function above is used in the SQL expression below. This is how to use a Python-defined function along with the standard SQL `SELECT` statement.
+
+```python
+from fugue.api import fugue_sql
+import json
+
+query = """
+SELECT id, value
+FROM input_df
+TRANSFORM USING map_letter_to_food(mapping={{mapping}}) SCHEMA *
+"""
+map_dict_str = json.dumps(map_dict)
+
+# returns Pandas DataFrame
+fugue_sql(query, mapping=map_dict_str)
+
+# returns Spark DataFrame
+fugue_sql(query, mapping=map_dict_str, engine="spark")
+```
+
+## Installation
+
+Fugue can be installed through pip or conda. For example:
+
+```bash
+pip install fugue
+```
+
+In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
+
+```bash
+pip install fugue[sql]
+```
+
+It also has the following installation extras:
+
+* **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra was included in Fugue's core dependencies, so you didn't need to install it explicitly. **For 0.9.0+, it is required if you want to use Fugue SQL.**
+* **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
+* **dask**: to support Dask as the ExecutionEngine.
+* **ray**: to support Ray as the ExecutionEngine.
+* **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
+* **polars**: to support Polars DataFrames and extensions using Polars.
+* **ibis**: to enable Ibis for Fugue workflows, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/ibis.html).
+* **cpp_sql_parser**: to enable the C++ antlr parser for Fugue SQL, which can be 50+ times faster than the pure Python parser. Pre-built binaries exist for the main Python versions and platforms; on other platforms it needs a C++ compiler to build on the fly.
+
+For example, a common use case is:
+
+```bash
+pip install "fugue[duckdb,spark]"
+```
+
+Note that if you already installed Spark or DuckDB independently, Fugue is able to use them automatically without installing the extras.
+
+## [Getting Started](https://fugue-tutorials.readthedocs.io/)
+
+The best way to get started with Fugue is to work through the 10 minute tutorials:
+
+* [Fugue API in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html)
+* [FugueSQL in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes_sql.html)
+
+For the top level API, see:
+
+* [Fugue Top Level API](https://fugue.readthedocs.io/en/latest/top_api.html)
+
+The [tutorials](https://fugue-tutorials.readthedocs.io/) can also be run in an interactive notebook environment through binder or Docker:
+
+### Using binder
+
+[](https://mybinder.org/v2/gh/fugue-project/tutorials/master)
+
+**Note it runs slow on binder** because the machine on binder isn't powerful enough for a distributed framework such as Spark. Parallel executions can become sequential, so some of the performance comparison examples will not give you the correct numbers.
+
+### Using Docker
+
+Alternatively, you should get decent performance by running this Docker image on your own machine:
+
+```bash
+docker run -p 8888:8888 fugueproject/tutorials:latest
+```
+
+
+## Jupyter Notebook Extension
+
+There is an accompanying [notebook extension](https://pypi.org/project/fugue-jupyter/) for FugueSQL that lets users use the `%%fsql` cell magic. The extension also provides syntax highlighting for FugueSQL cells. It works for both the classic notebook and Jupyter Lab. More details can be found in the [installation instructions](https://github.com/fugue-project/fugue-jupyter#install).
+
+(animated demo of the %%fsql cell magic omitted)
+
+
+## Ecosystem
+
+By being an abstraction layer, Fugue can be used with a lot of other open-source projects seamlessly.
+
+Python backends:
+
+* [Pandas](https://github.com/pandas-dev/pandas)
+* [Polars](https://www.pola.rs) (DataFrames only)
+* [Spark](https://github.com/apache/spark)
+* [Dask](https://github.com/dask/dask)
+* [Ray](http://github.com/ray-project/ray)
+* [Ibis](https://github.com/ibis-project/ibis/)
+
+FugueSQL backends:
+
+* Pandas - FugueSQL can run on Pandas
+* [Duckdb](https://github.com/duckdb/duckdb) - in-process SQL OLAP database management
+* [dask-sql](https://github.com/dask-contrib/dask-sql) - SQL interface for Dask
+* SparkSQL
+* [BigQuery](https://fugue-tutorials.readthedocs.io/tutorials/integrations/warehouses/bigquery.html)
+* Trino
+
+
+Fugue is available as a backend or can integrate with the following projects:
+
+* [WhyLogs](https://whylogs.readthedocs.io/en/latest/examples/integrations/Fugue_Profiling.html?highlight=fugue) - data profiling
+* [PyCaret](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/pycaret.html) - low code machine learning
+* [Nixtla](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/nixtla.html) - timeseries modelling
+* [Prefect](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/prefect.html) - workflow orchestration
+* [Pandera](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/pandera.html) - data validation
+* [Datacompy (by Capital One)](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/datacompy.html) - comparing DataFrames
+
+Registered 3rd party extensions (mainly for Fugue SQL) include:
+
+* [Pandas plot](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) - visualize data using matplotlib or plotly
+* [Seaborn](https://seaborn.pydata.org/api.html) - visualize data using seaborn
+* [WhyLogs](https://whylogs.readthedocs.io/en/latest/examples/integrations/Fugue_Profiling.html?highlight=fugue) - visualize data profiling
+* [Vizzu](https://github.com/vizzuhq/ipyvizzu) - visualize data using ipyvizzu
+
+## Community and Contributing
+
+Feel free to message us on [Slack](http://slack.fugue.ai). We also have [contributing instructions](CONTRIBUTING.md).
+
+### Case Studies
+
+* [How LyftLearn Democratizes Distributed Compute through Kubernetes Spark and Fugue](https://eng.lyft.com/how-lyftlearn-democratizes-distributed-compute-through-kubernetes-spark-and-fugue-c0875b97c3d9)
+* [Clobotics - Large Scale Image Processing with Spark through Fugue](https://medium.com/fugue-project/large-scale-image-processing-with-spark-through-fugue-e510b9813da8)
+* [Architecture for a data lake REST API using Delta Lake, Fugue & Spark (article by bitsofinfo)](https://bitsofinfo.wordpress.com/2023/08/14/data-lake-rest-api-delta-lake-fugue-spark)
+
+### Mentioned Uses
+
+* [Productionizing Data Science at Interos, Inc. (LinkedIn post by Anthony Holten)](https://www.linkedin.com/posts/anthony-holten_pandas-spark-dask-activity-7022628193983459328-QvcF)
+* [Multiple Time Series Forecasting with Fugue & Nixtla at Bain & Company (LinkedIn post by Fahad Akbar)](https://www.linkedin.com/posts/fahadakbar_fugue-datascience-forecasting-activity-7041119034813124608-u08q?utm_source=share&utm_medium=member_desktop)
+
+## Further Resources
+
+View some of our latest conference presentations and content. For a more complete list, check the [Content](https://fugue-tutorials.readthedocs.io/tutorials/resources/content.html) page in the tutorials.
+
+### Blogs
+
+* [Why Pandas-like Interfaces are Sub-optimal for Distributed Computing](https://towardsdatascience.com/why-pandas-like-interfaces-are-sub-optimal-for-distributed-computing-322dacbce43)
+* [Introducing FugueSQL — SQL for Pandas, Spark, and Dask DataFrames (Towards Data Science by Khuyen Tran)](https://towardsdatascience.com/introducing-fuguesql-sql-for-pandas-spark-and-dask-dataframes-63d461a16b27)
+
+### Conferences
+
+* [Distributed Machine Learning at Lyft](https://www.youtube.com/watch?v=_IVyIOV0LgY)
+* [Comparing the Different Ways to Scale Python and Pandas Code](https://www.youtube.com/watch?v=b3ae0m_XTys)
+* [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
+* [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
+* [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
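The dependency matrix above is easiest to sanity-check programmatically. A minimal sketch, assuming fugue 0.9.2 is installed in the current environment; `importlib.metadata` is standard library and nothing below is Fugue API:

```python
# Read back the installed package's PKG-INFO fields shown in the diff above.
from importlib.metadata import metadata

md = metadata("fugue")
print(md["Version"])                 # expected: 0.9.2
print(md["Metadata-Version"])        # expected: 2.4
print(md.get_all("Provides-Extra"))  # ['sql', 'cpp-sql-parser', 'spark', ...]
```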
{fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/_utils/io.py CHANGED
@@ -20,6 +20,10 @@ class FileParser(object):
         self._has_glob = "*" in path or "?" in path
         self._raw_path = path
         self._fs, self._fs_path = url_to_fs(path)
+        if not self._has_glob and self._fs.isdir(self._fs_path):
+            self._is_dir = True
+        else:
+            self._is_dir = False
         if not self.is_local:
             self._path = self._fs.unstrip_protocol(self._fs_path)
         else:
@@ -43,11 +47,15 @@ class FileParser(object):
         return self
 
     @property
-    def has_glob(self):
+    def is_dir(self) -> bool:
+        return self._is_dir
+
+    @property
+    def has_glob(self) -> bool:
         return self._has_glob
 
     @property
-    def is_local(self):
+    def is_local(self) -> bool:
         return isinstance(self._fs, LocalFileSystem)
 
     def join(self, path: str, format_hint: Optional[str] = None) -> "FileParser":
@@ -65,6 +73,10 @@ class FileParser(object):
     def path(self) -> str:
         return self._path
 
+    def as_dir_path(self) -> str:
+        assert_or_throw(self.is_dir, f"{self.raw_path} is not a directory")
+        return self.path + self._fs.sep
+
     @property
     def raw_path(self) -> str:
         return self._raw_path
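The `io.py` change above makes `FileParser` classify its path once, at construction: a path counts as a directory only if it contains no glob characters and the filesystem reports it as a directory. A minimal sketch of that check, assuming only `fsspec` (which provides the `url_to_fs` used above); `is_dir_path` is a hypothetical helper, not Fugue API:

```python
from fsspec.core import url_to_fs

def is_dir_path(path: str) -> bool:
    # mirrors the added constructor logic: glob patterns are never directories
    has_glob = "*" in path or "?" in path
    fs, fs_path = url_to_fs(path)
    return (not has_glob) and fs.isdir(fs_path)

print(is_dir_path("/tmp"))        # True on typical systems
print(is_dir_path("/tmp/*.csv"))  # False: contains a glob
```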
{fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/dataframe/function_wrapper.py CHANGED
@@ -20,6 +20,7 @@ from triad.collections.function_wrapper import (
     PositionalParam,
     function_wrapper,
 )
+from triad.utils.convert import compare_annotations
 from triad.utils.iter import EmptyAwareIterable, make_empty_aware
 
 from ..constants import FUGUE_ENTRYPOINT
@@ -37,6 +38,14 @@ from .iterable_dataframe import IterableDataFrame
 from .pandas_dataframe import PandasDataFrame
 
 
+def _compare_iter(tp: Any) -> Any:
+    return lambda x: compare_annotations(
+        x, Iterable[tp]  # type:ignore
+    ) or compare_annotations(
+        x, Iterator[tp]  # type:ignore
+    )
+
+
 @function_wrapper(FUGUE_ENTRYPOINT)
 class DataFrameFunctionWrapper(FunctionWrapper):
     @property
@@ -71,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
         p.update(kwargs)
         has_kw = False
         rargs: Dict[str, Any] = {}
+        row_param_info: Any = None
         for k, v in self._params.items():
             if isinstance(v, (PositionalParam, KeywordParam)):
                 if isinstance(v, KeywordParam):
@@ -81,7 +91,16 @@
                         isinstance(p[k], DataFrame),
                         lambda: TypeError(f"{p[k]} is not a DataFrame"),
                     )
-                    rargs[k] = v.to_input_data(p[k], ctx=ctx)
+                    if v.is_per_row:  # pragma: no cover
+                        # TODO: this branch is used only if row annotations
+                        # are allowed as input
+                        assert_or_throw(
+                            row_param_info is None,
+                            lambda: ValueError("only one row parameter is allowed"),
+                        )
+                        row_param_info = (k, v, p[k])
+                    else:
+                        rargs[k] = v.to_input_data(p[k], ctx=ctx)
                 else:
                     rargs[k] = p[k]  # TODO: should we do auto type conversion?
                 del p[k]
@@ -91,12 +110,40 @@
             rargs.update(p)
         elif not ignore_unknown and len(p) > 0:
             raise ValueError(f"{p} are not acceptable parameters")
+        if row_param_info is None:
+            return self._run_func(rargs, output, output_schema, ctx, raw=False)
+        else:  # pragma: no cover
+            # input contains row parameter
+            # TODO: this branch is used only if row annotations are allowed as input
+
+            def _dfs() -> Iterable[Any]:
+                k, v, df = row_param_info
+                for row in v.to_input_rows(df, ctx):
+                    rargs[k] = None
+                    _rargs = rargs.copy()
+                    _rargs[k] = row
+                    yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
+
+            if not output:
+                sum(1 for _ in _dfs())
+                return
+            else:
+                return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
+
+    def _run_func(
+        self,
+        rargs: Dict[str, Any],
+        output: bool,
+        output_schema: Any,
+        ctx: Any,
+        raw: bool,
+    ) -> Any:
         rt = self._func(**rargs)
         if not output:
             if isinstance(self._rt, _DataFrameParamBase):
                 self._rt.count(rt)
             return
-        if isinstance(self._rt, _DataFrameParamBase):
+        if not raw and isinstance(self._rt, _DataFrameParamBase):
             return self._rt.to_output_df(rt, output_schema, ctx=ctx)
         return rt
 
@@ -111,6 +158,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
         annotation == Callable
         or annotation == callable  # pylint: disable=comparison-with-callable
         or str(annotation).startswith("typing.Callable")
+        or str(annotation).startswith("collections.abc.Callable")
     ),
 )
 class _CallableParam(AnnotatedParam):
@@ -125,6 +173,9 @@ class _CallableParam(AnnotatedParam):
         or annotation == Optional[callable]
         or str(annotation).startswith("typing.Union[typing.Callable")  # 3.8-
         or str(annotation).startswith("typing.Optional[typing.Callable")  # 3.9+
+        or str(annotation).startswith(
+            "typing.Optional[collections.abc.Callable]"
+        )  # 3.9+
     ),
 )
 class _OptionalCallableParam(AnnotatedParam):
@@ -136,14 +187,30 @@ class _DataFrameParamBase(AnnotatedParam):
         super().__init__(param)
         assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
 
+    @property
+    def is_per_row(self) -> bool:
+        return False
+
     def to_input_data(self, df: DataFrame, ctx: Any) -> Any:  # pragma: no cover
         raise NotImplementedError
 
+    def to_input_rows(
+        self,
+        df: DataFrame,
+        ctx: Any,
+    ) -> Iterable[Any]:
+        raise NotImplementedError  # pragma: no cover
+
     def to_output_df(
         self, df: Any, schema: Any, ctx: Any
     ) -> DataFrame:  # pragma: no cover
         raise NotImplementedError
 
+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        raise NotImplementedError
+
     def count(self, df: Any) -> int:  # pragma: no cover
         raise NotImplementedError
 
@@ -173,6 +240,36 @@ class DataFrameParam(_DataFrameParamBase):
         return sum(1 for _ in df.as_array_iterable())
 
 
+@fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
+class RowParam(_DataFrameParamBase):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
+    @property
+    def is_per_row(self) -> bool:
+        return True
+
+    def count(self, df: Any) -> int:
+        return 1
+
+
+@fugue_annotated_param(Dict[str, Any])
+class DictParam(RowParam):  # pragma: no cover
+    # TODO: this class is used only if row annotations are allowed as input
+    def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
+        yield from df.as_dict_iterable()
+
+    def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
+        return ArrayDataFrame([list(output.values())], schema)
+
+    def iterable_to_output_df(
+        self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        params: Dict[str, Any] = {}
+        if schema is not None:
+            params["schema"] = Schema(schema).pa_schema
+        adf = pa.Table.from_pylist(list(dfs), **params)
+        return ArrowDataFrame(adf)
+
+
 @fugue_annotated_param(AnyDataFrame)
 class _AnyDataFrameParam(DataFrameParam):
     def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
@@ -198,6 +295,15 @@ class LocalDataFrameParam(DataFrameParam):
         )
         return output
 
+    def iterable_to_output_df(
+        self, dfs: Iterable[Any], schema: Any, ctx: Any
+    ) -> DataFrame:  # pragma: no cover
+        def _dfs() -> Iterable[DataFrame]:
+            for df in dfs:
+                yield self.to_output_df(df, schema, ctx)
+
+        return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
+
     def count(self, df: LocalDataFrame) -> int:
         if df.is_bounded:
             return df.count()
@@ -228,10 +334,7 @@ class _ListListParam(_LocalNoSchemaDataFrameParam):
         return len(df)
 
 
-@fugue_annotated_param(
-    Iterable[List[Any]],
-    matcher=lambda x: x == Iterable[List[Any]] or x == Iterator[List[Any]],
-)
+@fugue_annotated_param(Iterable[List[Any]], matcher=_compare_iter(List[Any]))
 class _IterableListParam(_LocalNoSchemaDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[List[Any]]:
@@ -288,10 +391,7 @@ class _ListDictParam(_LocalNoSchemaDataFrameParam):
         return len(df)
 
 
-@fugue_annotated_param(
-    Iterable[Dict[str, Any]],
-    matcher=lambda x: x == Iterable[Dict[str, Any]] or x == Iterator[Dict[str, Any]],
-)
+@fugue_annotated_param(Iterable[Dict[str, Any]], matcher=_compare_iter(Dict[str, Any]))
 class _IterableDictParam(_LocalNoSchemaDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[Dict[str, Any]]:
@@ -360,10 +460,7 @@ class _PandasParam(LocalDataFrameParam):
         return "pandas"
 
 
-@fugue_annotated_param(
-    Iterable[pd.DataFrame],
-    matcher=lambda x: x == Iterable[pd.DataFrame] or x == Iterator[pd.DataFrame],
-)
+@fugue_annotated_param(Iterable[pd.DataFrame], matcher=_compare_iter(pd.DataFrame))
 class _IterablePandasParam(LocalDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pd.DataFrame]:
@@ -419,10 +516,7 @@ class _PyArrowTableParam(LocalDataFrameParam):
         return "pyarrow"
 
 
-@fugue_annotated_param(
-    Iterable[pa.Table],
-    matcher=lambda x: x == Iterable[pa.Table] or x == Iterator[pa.Table],
-)
+@fugue_annotated_param(Iterable[pa.Table], matcher=_compare_iter(pa.Table))
 class _IterableArrowParam(LocalDataFrameParam):
     @no_type_check
     def to_input_data(self, df: DataFrame, ctx: Any) -> Iterable[pa.Table]:
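A recurring change in `function_wrapper.py` above replaces the exact-equality matchers (`x == Iterable[T] or x == Iterator[T]`) with `_compare_iter`, which delegates to triad's `compare_annotations`. The sketch below only illustrates why exact equality is brittle; it assumes nothing about triad beyond what the diff shows:

```python
from typing import Any, Iterable, Iterator, List

def old_style_matcher(x: Any) -> bool:
    # the pre-0.9.2 pattern: match exactly two typing spellings
    return x == Iterable[List[Any]] or x == Iterator[List[Any]]

print(old_style_matcher(Iterable[List[Any]]))  # True
print(old_style_matcher(Iterator[List[Any]]))  # True
print(old_style_matcher(Iterable[list]))       # False: same intent, different spelling
```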
{fugue-0.9.0.dev4 → fugue-0.9.2}/fugue/extensions/transformer/convert.py CHANGED
@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspq]$"
+            func, "^[lspq][fF]?x*z?$", "^[lspqr]$"
         )
         tr._output_schema_arg = schema  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
         validation_rules.update(parse_validation_rules_from_comment(func))
         tr = _FuncAsOutputTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
+            func, "^[lspq][fF]?x*z?$", "^[lspnqr]$"
         )
         tr._output_schema_arg = None  # type: ignore
         tr._validation_rules = validation_rules  # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
         assert_arg_not_none(schema, "schema")
         tr = _FuncAsCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
         )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = schema  # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
 
         tr = _FuncAsOutputCoTransformer()
         tr._wrapper = DataFrameFunctionWrapper(  # type: ignore
-            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
+            func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
        )
         tr._dfs_input = tr._wrapper.input_code[0] == "c"  # type: ignore
         tr._output_schema_arg = None  # type: ignore