fugue 0.9.1__tar.gz → 0.9.2.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166) hide show
  1. fugue-0.9.2.dev2/PKG-INFO +310 -0
  2. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/function_wrapper.py +93 -2
  3. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/transformer/convert.py +4 -4
  4. fugue-0.9.2.dev2/fugue.egg-info/PKG-INFO +310 -0
  5. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue.egg-info/SOURCES.txt +1 -0
  6. fugue-0.9.2.dev2/fugue.egg-info/entry_points.txt +11 -0
  7. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue.egg-info/requires.txt +4 -7
  8. fugue-0.9.2.dev2/fugue_dask/_dask_sql_wrapper.py +76 -0
  9. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/_utils.py +1 -1
  10. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/execution_engine.py +5 -9
  11. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ibis/execution_engine.py +7 -6
  12. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/_utils/io.py +22 -15
  13. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/builtin_suite.py +36 -1
  14. fugue-0.9.2.dev2/fugue_version/__init__.py +1 -0
  15. {fugue-0.9.1 → fugue-0.9.2.dev2}/setup.cfg +1 -1
  16. {fugue-0.9.1 → fugue-0.9.2.dev2}/setup.py +5 -5
  17. fugue-0.9.1/PKG-INFO +0 -308
  18. fugue-0.9.1/fugue.egg-info/PKG-INFO +0 -308
  19. fugue-0.9.1/fugue.egg-info/entry_points.txt +0 -12
  20. fugue-0.9.1/fugue_version/__init__.py +0 -1
  21. {fugue-0.9.1 → fugue-0.9.2.dev2}/LICENSE +0 -0
  22. {fugue-0.9.1 → fugue-0.9.2.dev2}/README.md +0 -0
  23. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/__init__.py +0 -0
  24. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/__init__.py +0 -0
  25. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/display.py +0 -0
  26. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/exception.py +0 -0
  27. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/interfaceless.py +0 -0
  28. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/io.py +0 -0
  29. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/misc.py +0 -0
  30. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/_utils/registry.py +0 -0
  31. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/api.py +0 -0
  32. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/bag/__init__.py +0 -0
  33. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/bag/array_bag.py +0 -0
  34. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/bag/bag.py +0 -0
  35. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/collections/__init__.py +0 -0
  36. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/collections/partition.py +0 -0
  37. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/collections/sql.py +0 -0
  38. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/collections/yielded.py +0 -0
  39. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/column/__init__.py +0 -0
  40. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/column/expressions.py +0 -0
  41. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/column/functions.py +0 -0
  42. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/column/sql.py +0 -0
  43. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/constants.py +0 -0
  44. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/__init__.py +0 -0
  45. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/api.py +0 -0
  46. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/array_dataframe.py +0 -0
  47. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/arrow_dataframe.py +0 -0
  48. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/dataframe.py +0 -0
  49. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/dataframe_iterable_dataframe.py +0 -0
  50. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/dataframes.py +0 -0
  51. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/iterable_dataframe.py +0 -0
  52. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/pandas_dataframe.py +0 -0
  53. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataframe/utils.py +0 -0
  54. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataset/__init__.py +0 -0
  55. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataset/api.py +0 -0
  56. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dataset/dataset.py +0 -0
  57. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/dev.py +0 -0
  58. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/exceptions.py +0 -0
  59. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/execution/__init__.py +0 -0
  60. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/execution/api.py +0 -0
  61. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/execution/execution_engine.py +0 -0
  62. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/execution/factory.py +0 -0
  63. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/execution/native_execution_engine.py +0 -0
  64. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/__init__.py +0 -0
  65. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/_builtins/__init__.py +0 -0
  66. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/_builtins/creators.py +0 -0
  67. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/_builtins/outputters.py +0 -0
  68. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/_builtins/processors.py +0 -0
  69. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/_utils.py +0 -0
  70. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/context.py +0 -0
  71. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/creator/__init__.py +0 -0
  72. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/creator/convert.py +0 -0
  73. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/creator/creator.py +0 -0
  74. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/outputter/__init__.py +0 -0
  75. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/outputter/convert.py +0 -0
  76. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/outputter/outputter.py +0 -0
  77. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/processor/__init__.py +0 -0
  78. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/processor/convert.py +0 -0
  79. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/processor/processor.py +0 -0
  80. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/transformer/__init__.py +0 -0
  81. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/transformer/constants.py +0 -0
  82. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/extensions/transformer/transformer.py +0 -0
  83. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/plugins.py +0 -0
  84. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/py.typed +0 -0
  85. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/registry.py +0 -0
  86. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/rpc/__init__.py +0 -0
  87. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/rpc/base.py +0 -0
  88. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/rpc/flask.py +0 -0
  89. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/sql/__init__.py +0 -0
  90. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/sql/_utils.py +0 -0
  91. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/sql/_visitors.py +0 -0
  92. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/sql/api.py +0 -0
  93. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/sql/workflow.py +0 -0
  94. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/test/__init__.py +0 -0
  95. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/test/pandas_tester.py +0 -0
  96. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/test/plugins.py +0 -0
  97. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/__init__.py +0 -0
  98. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/_checkpoint.py +0 -0
  99. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/_tasks.py +0 -0
  100. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/_workflow_context.py +0 -0
  101. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/api.py +0 -0
  102. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/input.py +0 -0
  103. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/module.py +0 -0
  104. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue/workflow/workflow.py +0 -0
  105. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue.egg-info/dependency_links.txt +0 -0
  106. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue.egg-info/top_level.txt +0 -0
  107. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_contrib/__init__.py +0 -0
  108. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_contrib/contrib.py +0 -0
  109. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_contrib/seaborn/__init__.py +0 -0
  110. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_contrib/viz/__init__.py +0 -0
  111. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_contrib/viz/_ext.py +0 -0
  112. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/__init__.py +0 -0
  113. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/_constants.py +0 -0
  114. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/_io.py +0 -0
  115. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/dataframe.py +0 -0
  116. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/registry.py +0 -0
  117. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_dask/tester.py +0 -0
  118. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/__init__.py +0 -0
  119. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/_io.py +0 -0
  120. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/_utils.py +0 -0
  121. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/dask.py +0 -0
  122. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/dataframe.py +0 -0
  123. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/execution_engine.py +0 -0
  124. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/registry.py +0 -0
  125. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_duckdb/tester.py +0 -0
  126. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ibis/__init__.py +0 -0
  127. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ibis/_compat.py +0 -0
  128. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ibis/_utils.py +0 -0
  129. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ibis/dataframe.py +0 -0
  130. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/__init__.py +0 -0
  131. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/env.py +0 -0
  132. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/nbextension/README.md +0 -0
  133. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/nbextension/__init__.py +0 -0
  134. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/nbextension/description.yaml +0 -0
  135. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_notebook/nbextension/main.js +0 -0
  136. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_polars/__init__.py +0 -0
  137. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_polars/_utils.py +0 -0
  138. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_polars/polars_dataframe.py +0 -0
  139. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_polars/registry.py +0 -0
  140. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/__init__.py +0 -0
  141. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/_constants.py +0 -0
  142. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/_utils/__init__.py +0 -0
  143. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/_utils/cluster.py +0 -0
  144. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/_utils/dataframe.py +0 -0
  145. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/dataframe.py +0 -0
  146. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/execution_engine.py +0 -0
  147. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/registry.py +0 -0
  148. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_ray/tester.py +0 -0
  149. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/__init__.py +0 -0
  150. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_constants.py +0 -0
  151. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_utils/__init__.py +0 -0
  152. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_utils/convert.py +0 -0
  153. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_utils/io.py +0 -0
  154. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_utils/misc.py +0 -0
  155. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/_utils/partition.py +0 -0
  156. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/dataframe.py +0 -0
  157. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/execution_engine.py +0 -0
  158. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/registry.py +0 -0
  159. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_spark/tester.py +0 -0
  160. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_sql/__init__.py +0 -0
  161. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_sql/exceptions.py +0 -0
  162. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/__init__.py +0 -0
  163. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/bag_suite.py +0 -0
  164. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/dataframe_suite.py +0 -0
  165. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/execution_suite.py +0 -0
  166. {fugue-0.9.1 → fugue-0.9.2.dev2}/fugue_test/fixtures.py +0 -0
@@ -0,0 +1,310 @@
1
+ Metadata-Version: 2.1
2
+ Name: fugue
3
+ Version: 0.9.2.dev2
4
+ Summary: An abstraction layer for distributed computation
5
+ Home-page: http://github.com/fugue-project/fugue
6
+ Author: The Fugue Development Team
7
+ Author-email: hello@fugue.ai
8
+ License: Apache-2.0
9
+ Keywords: distributed spark dask ray sql dsl domain specific language
10
+ Classifier: Development Status :: 5 - Production/Stable
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ Provides-Extra: sql
24
+ Provides-Extra: cpp_sql_parser
25
+ Provides-Extra: spark
26
+ Provides-Extra: dask
27
+ Provides-Extra: ray
28
+ Provides-Extra: duckdb
29
+ Provides-Extra: polars
30
+ Provides-Extra: ibis
31
+ Provides-Extra: notebook
32
+ Provides-Extra: all
33
+ License-File: LICENSE
34
+
35
+ # Fugue
36
+
37
+ [![PyPI version](https://badge.fury.io/py/fugue.svg)](https://pypi.python.org/pypi/fugue/)
38
+ [![PyPI pyversions](https://img.shields.io/pypi/pyversions/fugue.svg)](https://pypi.python.org/pypi/fugue/)
39
+ [![PyPI license](https://img.shields.io/pypi/l/fugue.svg)](https://pypi.python.org/pypi/fugue/)
40
+ [![codecov](https://codecov.io/gh/fugue-project/fugue/branch/master/graph/badge.svg?token=ZO9YD5N3IA)](https://codecov.io/gh/fugue-project/fugue)
41
+ [![Codacy Badge](https://app.codacy.com/project/badge/Grade/4fa5f2f53e6f48aaa1218a89f4808b91)](https://www.codacy.com/gh/fugue-project/fugue/dashboard?utm_source=github.com&utm_medium=referral&utm_content=fugue-project/fugue&utm_campaign=Badge_Grade)
42
+ [![Downloads](https://static.pepy.tech/badge/fugue)](https://pepy.tech/project/fugue)
43
+
44
+ | Tutorials | API Documentation | Chat with us on slack! |
45
+ | --------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
46
+ | [![Jupyter Book Badge](https://jupyterbook.org/badge.svg)](https://fugue-tutorials.readthedocs.io/) | [![Doc](https://readthedocs.org/projects/fugue/badge)](https://fugue.readthedocs.org) | [![Slack Status](https://img.shields.io/badge/slack-join_chat-white.svg?logo=slack&style=social)](http://slack.fugue.ai) |
47
+
48
+
49
+ **Fugue is a unified interface for distributed computing that lets users execute Python, Pandas, and SQL code on Spark, Dask, and Ray with minimal rewrites**.
50
+
51
+ Fugue is most commonly used for:
52
+
53
+ * **Parallelizing or scaling existing Python and Pandas code** by bringing it to Spark, Dask, or Ray with minimal rewrites.
54
+ * Using [FugueSQL](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes_sql.html) to **define end-to-end workflows** on top of Pandas, Spark, and Dask DataFrames. FugueSQL is an enhanced SQL interface that can invoke Python code.
55
+
56
+ To see how Fugue compares to other frameworks like dbt, Arrow, Ibis, PySpark Pandas, see the [comparisons](https://fugue-tutorials.readthedocs.io/#how-does-fugue-compare-to)
57
+
58
+ ## [Fugue API](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html)
59
+
60
+ The Fugue API is a collection of functions that are capable of running on Pandas, Spark, Dask, and Ray. The simplest way to use Fugue is the [`transform()` function](https://fugue-tutorials.readthedocs.io/tutorials/beginner/transform.html). This lets users parallelize the execution of a single function by bringing it to Spark, Dask, or Ray. In the example below, the `map_letter_to_food()` function takes in a mapping and applies it on a column. This is just Pandas and Python so far (without Fugue).
61
+
62
+ ```python
63
+ import pandas as pd
64
+ from typing import Dict
65
+
66
+ input_df = pd.DataFrame({"id":[0,1,2], "value": (["A", "B", "C"])})
67
+ map_dict = {"A": "Apple", "B": "Banana", "C": "Carrot"}
68
+
69
+ def map_letter_to_food(df: pd.DataFrame, mapping: Dict[str, str]) -> pd.DataFrame:
70
+ df["value"] = df["value"].map(mapping)
71
+ return df
72
+ ```
73
+
74
+ Now, the `map_letter_to_food()` function is brought to the Spark execution engine by invoking the `transform()` function of Fugue. The output `schema` and `params` are passed to the `transform()` call. The `schema` is needed because it's a requirement for distributed frameworks. A schema of `"*"` below means all input columns are in the output.
75
+
76
+ ```python
77
+ from pyspark.sql import SparkSession
78
+ from fugue import transform
79
+
80
+ spark = SparkSession.builder.getOrCreate()
81
+ sdf = spark.createDataFrame(input_df)
82
+
83
+ out = transform(sdf,
84
+ map_letter_to_food,
85
+ schema="*",
86
+ params=dict(mapping=map_dict),
87
+ )
88
+ # out is a Spark DataFrame
89
+ out.show()
90
+ ```
91
+ ```rst
92
+ +---+------+
93
+ | id| value|
94
+ +---+------+
95
+ | 0| Apple|
96
+ | 1|Banana|
97
+ | 2|Carrot|
98
+ +---+------+
99
+ ```
100
+
101
+ <details>
102
+ <summary>PySpark equivalent of Fugue transform()</summary>
103
+
104
+ ```python
105
+ from typing import Iterator, Union
106
+ from pyspark.sql.types import StructType
107
+ from pyspark.sql import DataFrame, SparkSession
108
+
109
+ spark_session = SparkSession.builder.getOrCreate()
110
+
111
+ def mapping_wrapper(dfs: Iterator[pd.DataFrame], mapping):
112
+ for df in dfs:
113
+ yield map_letter_to_food(df, mapping)
114
+
115
+ def run_map_letter_to_food(input_df: Union[DataFrame, pd.DataFrame], mapping):
116
+ # conversion
117
+ if isinstance(input_df, pd.DataFrame):
118
+ sdf = spark_session.createDataFrame(input_df.copy())
119
+ else:
120
+ sdf = input_df.copy()
121
+
122
+ schema = StructType(list(sdf.schema.fields))
123
+ return sdf.mapInPandas(lambda dfs: mapping_wrapper(dfs, mapping),
124
+ schema=schema)
125
+
126
+ result = run_map_letter_to_food(input_df, map_dict)
127
+ result.show()
128
+ ```
129
+ </details>
130
+
131
+ This syntax is simpler, cleaner, and more maintainable than the PySpark equivalent. At the same time, no edits were made to the original Pandas-based function to bring it to Spark. It is still usable on Pandas DataFrames. Fugue `transform()` also supports Dask and Ray as execution engines alongside the default Pandas-based engine.
132
+
133
+ The Fugue API has a broader collection of functions that are also compatible with Spark, Dask, and Ray. For example, we can use `load()` and `save()` to create an end-to-end workflow compatible with Spark, Dask, and Ray. For the full list of functions, see the [Top Level API](https://fugue.readthedocs.io/en/latest/top_api.html)
134
+
135
+ ```python
136
+ import fugue.api as fa
137
+
138
+ def run(engine=None):
139
+ with fa.engine_context(engine):
140
+ df = fa.load("/path/to/file.parquet")
141
+ out = fa.transform(df, map_letter_to_food, schema="*")
142
+ fa.save(out, "/path/to/output_file.parquet")
143
+
144
+ run() # runs on Pandas
145
+ run(engine="spark") # runs on Spark
146
+ run(engine="dask") # runs on Dask
147
+ ```
148
+
149
+ All functions underneath the context will run on the specified backend. This makes it easy to toggle between local execution, and distributed execution.
150
+
151
+ ## [FugueSQL](https://fugue-tutorials.readthedocs.io/tutorials/fugue_sql/index.html)
152
+
153
+ FugueSQL is a SQL-based language capable of expressing end-to-end data workflows on top of Pandas, Spark, and Dask. The `map_letter_to_food()` function above is used in the SQL expression below. This is how to use a Python-defined function along with the standard SQL `SELECT` statement.
154
+
155
+ ```python
156
+ from fugue.api import fugue_sql
157
+ import json
158
+
159
+ query = """
160
+ SELECT id, value
161
+ FROM input_df
162
+ TRANSFORM USING map_letter_to_food(mapping={{mapping}}) SCHEMA *
163
+ """
164
+ map_dict_str = json.dumps(map_dict)
165
+
166
+ # returns Pandas DataFrame
167
+ fugue_sql(query,mapping=map_dict_str)
168
+
169
+ # returns Spark DataFrame
170
+ fugue_sql(query, mapping=map_dict_str, engine="spark")
171
+ ```
172
+
173
+ ## Installation
174
+
175
+ Fugue can be installed through pip or conda. For example:
176
+
177
+ ```bash
178
+ pip install fugue
179
+ ```
180
+
181
+ In order to use Fugue SQL, it is strongly recommended to install the `sql` extra:
182
+
183
+ ```bash
184
+ pip install fugue[sql]
185
+ ```
186
+
187
+ It also has the following installation extras:
188
+
189
+ * **sql**: to support Fugue SQL. Without this extra, the non-SQL part still works. Before Fugue 0.9.0, this extra was included in Fugue's core dependency, so you didn't need to install it explicitly. **But for 0.9.0+, it becomes required if you want to use Fugue SQL.**
190
+ * **spark**: to support Spark as the [ExecutionEngine](https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html).
191
+ * **dask**: to support Dask as the ExecutionEngine.
192
+ * **ray**: to support Ray as the ExecutionEngine.
193
+ * **duckdb**: to support DuckDB as the ExecutionEngine, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/duckdb.html).
194
+ * **polars**: to support Polars DataFrames and extensions using Polars.
195
+ * **ibis**: to enable Ibis for Fugue workflows, read [details](https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/ibis.html).
196
+ * **cpp_sql_parser**: to enable the CPP antlr parser for Fugue SQL. It can be 50+ times faster than the pure Python parser. For the main Python versions and platforms, there are already pre-built binaries, but for the remaining ones a C++ compiler is needed to build on the fly.
197
+
198
+ For example a common use case is:
199
+
200
+ ```bash
201
+ pip install "fugue[duckdb,spark]"
202
+ ```
203
+
204
+ Note if you already installed Spark or DuckDB independently, Fugue is able to automatically use them without installing the extras.
205
+
206
+ ## [Getting Started](https://fugue-tutorials.readthedocs.io/)
207
+
208
+ The best way to get started with Fugue is to work through the 10 minute tutorials:
209
+
210
+ * [Fugue API in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html)
211
+ * [FugueSQL in 10 minutes](https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes_sql.html)
212
+
213
+ For the top level API, see:
214
+
215
+ * [Fugue Top Level API](https://fugue.readthedocs.io/en/latest/top_api.html)
216
+
217
+ The [tutorials](https://fugue-tutorials.readthedocs.io/) can also be run in an interactive notebook environment through binder or Docker:
218
+
219
+ ### Using binder
220
+
221
+ [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/fugue-project/tutorials/master)
222
+
223
+ **Note it runs slow on binder** because the machine on binder isn't powerful enough for a distributed framework such as Spark. Parallel executions can become sequential, so some of the performance comparison examples will not give you the correct numbers.
224
+
225
+ ### Using Docker
226
+
227
+ Alternatively, you should get decent performance by running this Docker image on your own machine:
228
+
229
+ ```bash
230
+ docker run -p 8888:8888 fugueproject/tutorials:latest
231
+ ```
232
+
233
+
234
+ ## Jupyter Notebook Extension
235
+
236
+ There is an accompanying [notebook extension](https://pypi.org/project/fugue-jupyter/) for FugueSQL that lets users use the `%%fsql` cell magic. The extension also provides syntax highlighting for FugueSQL cells. It works for both classic notebook and Jupyter Lab. More details can be found in the [installation instructions](https://github.com/fugue-project/fugue-jupyter#install).
237
+
238
+ ![FugueSQL gif](https://miro.medium.com/max/700/1*6091-RcrOPyifJTLjo0anA.gif)
239
+
240
+
241
+ ## Ecosystem
242
+
243
+ By being an abstraction layer, Fugue can be used with a lot of other open-source projects seamlessly.
244
+
245
+ Python backends:
246
+
247
+ * [Pandas](https://github.com/pandas-dev/pandas)
248
+ * [Polars](https://www.pola.rs) (DataFrames only)
249
+ * [Spark](https://github.com/apache/spark)
250
+ * [Dask](https://github.com/dask/dask)
251
+ * [Ray](http://github.com/ray-project/ray)
252
+ * [Ibis](https://github.com/ibis-project/ibis/)
253
+
254
+ FugueSQL backends:
255
+
256
+ * Pandas - FugueSQL can run on Pandas
257
+ * [Duckdb](https://github.com/duckdb/duckdb) - in-process SQL OLAP database management
258
+ * [dask-sql](https://github.com/dask-contrib/dask-sql) - SQL interface for Dask
259
+ * SparkSQL
260
+ * [BigQuery](https://fugue-tutorials.readthedocs.io/tutorials/integrations/warehouses/bigquery.html)
261
+ * Trino
262
+
263
+
264
+ Fugue is available as a backend or can integrate with the following projects:
265
+
266
+ * [WhyLogs](https://whylogs.readthedocs.io/en/latest/examples/integrations/Fugue_Profiling.html?highlight=fugue) - data profiling
267
+ * [PyCaret](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/pycaret.html) - low code machine learning
268
+ * [Nixtla](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/nixtla.html) - timeseries modelling
269
+ * [Prefect](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/prefect.html) - workflow orchestration
270
+ * [Pandera](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/pandera.html) - data validation
271
+ * [Datacompy (by Capital One)](https://fugue-tutorials.readthedocs.io/tutorials/integrations/ecosystem/datacompy.html) - comparing DataFrames
272
+
273
+ Registered 3rd party extensions (majorly for Fugue SQL) include:
274
+
275
+ * [Pandas plot](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html) - visualize data using matplotlib or plotly
276
+ * [Seaborn](https://seaborn.pydata.org/api.html) - visualize data using seaborn
277
+ * [WhyLogs](https://whylogs.readthedocs.io/en/latest/examples/integrations/Fugue_Profiling.html?highlight=fugue) - visualize data profiling
278
+ * [Vizzu](https://github.com/vizzuhq/ipyvizzu) - visualize data using ipyvizzu
279
+
280
+ ## Community and Contributing
281
+
282
+ Feel free to message us on [Slack](http://slack.fugue.ai). We also have [contributing instructions](CONTRIBUTING.md).
283
+
284
+ ### Case Studies
285
+
286
+ * [How LyftLearn Democratizes Distributed Compute through Kubernetes Spark and Fugue](https://eng.lyft.com/how-lyftlearn-democratizes-distributed-compute-through-kubernetes-spark-and-fugue-c0875b97c3d9)
287
+ * [Clobotics - Large Scale Image Processing with Spark through Fugue](https://medium.com/fugue-project/large-scale-image-processing-with-spark-through-fugue-e510b9813da8)
288
+ * [Architecture for a data lake REST API using Delta Lake, Fugue & Spark (article by bitsofinfo)](https://bitsofinfo.wordpress.com/2023/08/14/data-lake-rest-api-delta-lake-fugue-spark)
289
+
290
+ ### Mentioned Uses
291
+
292
+ * [Productionizing Data Science at Interos, Inc. (LinkedIn post by Anthony Holten)](https://www.linkedin.com/posts/anthony-holten_pandas-spark-dask-activity-7022628193983459328-QvcF)
293
+ * [Multiple Time Series Forecasting with Fugue & Nixtla at Bain & Company (LinkedIn post by Fahad Akbar)](https://www.linkedin.com/posts/fahadakbar_fugue-datascience-forecasting-activity-7041119034813124608-u08q?utm_source=share&utm_medium=member_desktop)
294
+
295
+ ## Further Resources
296
+
297
+ View some of our latest conferences presentations and content. For a more complete list, check the [Content](https://fugue-tutorials.readthedocs.io/tutorials/resources/content.html) page in the tutorials.
298
+
299
+ ### Blogs
300
+
301
+ * [Why Pandas-like Interfaces are Sub-optimal for Distributed Computing](https://towardsdatascience.com/why-pandas-like-interfaces-are-sub-optimal-for-distributed-computing-322dacbce43)
302
+ * [Introducing FugueSQL — SQL for Pandas, Spark, and Dask DataFrames (Towards Data Science by Khuyen Tran)](https://towardsdatascience.com/introducing-fuguesql-sql-for-pandas-spark-and-dask-dataframes-63d461a16b27)
303
+
304
+ ### Conferences
305
+
306
+ * [Distributed Machine Learning at Lyft](https://www.youtube.com/watch?v=_IVyIOV0LgY)
307
+ * [Comparing the Different Ways to Scale Python and Pandas Code](https://www.youtube.com/watch?v=b3ae0m_XTys)
308
+ * [Large Scale Data Validation with Spark and Dask (PyCon US)](https://www.youtube.com/watch?v=2AdvBgjO_3Q)
309
+ * [FugueSQL - The Enhanced SQL Interface for Pandas, Spark, and Dask DataFrames (PyData Global)](https://www.youtube.com/watch?v=OBpnGYjNBBI)
310
+ * [Distributed Hybrid Parameter Tuning](https://www.youtube.com/watch?v=_GBjqskD8Qk)
@@ -80,6 +80,7 @@ class DataFrameFunctionWrapper(FunctionWrapper):
80
80
  p.update(kwargs)
81
81
  has_kw = False
82
82
  rargs: Dict[str, Any] = {}
83
+ row_param_info: Any = None
83
84
  for k, v in self._params.items():
84
85
  if isinstance(v, (PositionalParam, KeywordParam)):
85
86
  if isinstance(v, KeywordParam):
@@ -90,7 +91,14 @@ class DataFrameFunctionWrapper(FunctionWrapper):
90
91
  isinstance(p[k], DataFrame),
91
92
  lambda: TypeError(f"{p[k]} is not a DataFrame"),
92
93
  )
93
- rargs[k] = v.to_input_data(p[k], ctx=ctx)
94
+ if v.is_per_row:
95
+ assert_or_throw(
96
+ row_param_info is None,
97
+ lambda: ValueError("only one row parameter is allowed"),
98
+ )
99
+ row_param_info = (k, v, p[k])
100
+ else:
101
+ rargs[k] = v.to_input_data(p[k], ctx=ctx)
94
102
  else:
95
103
  rargs[k] = p[k] # TODO: should we do auto type conversion?
96
104
  del p[k]
@@ -100,12 +108,38 @@ class DataFrameFunctionWrapper(FunctionWrapper):
100
108
  rargs.update(p)
101
109
  elif not ignore_unknown and len(p) > 0:
102
110
  raise ValueError(f"{p} are not acceptable parameters")
111
+ if row_param_info is None:
112
+ return self._run_func(rargs, output, output_schema, ctx, raw=False)
113
+ else: # input contains row parameter
114
+
115
+ def _dfs() -> Iterable[Any]:
116
+ k, v, df = row_param_info
117
+ for row in v.to_input_rows(df, ctx):
118
+ rargs[k] = None
119
+ _rargs = rargs.copy()
120
+ _rargs[k] = row
121
+ yield self._run_func(_rargs, output, output_schema, ctx, raw=True)
122
+
123
+ if not output:
124
+ sum(1 for _ in _dfs())
125
+ return
126
+ else:
127
+ return self._rt.iterable_to_output_df(_dfs(), output_schema, ctx)
128
+
129
+ def _run_func(
130
+ self,
131
+ rargs: Dict[str, Any],
132
+ output: bool,
133
+ output_schema: Any,
134
+ ctx: Any,
135
+ raw: bool,
136
+ ) -> Any:
103
137
  rt = self._func(**rargs)
104
138
  if not output:
105
139
  if isinstance(self._rt, _DataFrameParamBase):
106
140
  self._rt.count(rt)
107
141
  return
108
- if isinstance(self._rt, _DataFrameParamBase):
142
+ if not raw and isinstance(self._rt, _DataFrameParamBase):
109
143
  return self._rt.to_output_df(rt, output_schema, ctx=ctx)
110
144
  return rt
111
145
 
@@ -120,6 +154,7 @@ fugue_annotated_param = DataFrameFunctionWrapper.annotated_param
120
154
  annotation == Callable
121
155
  or annotation == callable # pylint: disable=comparison-with-callable
122
156
  or str(annotation).startswith("typing.Callable")
157
+ or str(annotation).startswith("collections.abc.Callable")
123
158
  ),
124
159
  )
125
160
  class _CallableParam(AnnotatedParam):
@@ -134,6 +169,9 @@ class _CallableParam(AnnotatedParam):
134
169
  or annotation == Optional[callable]
135
170
  or str(annotation).startswith("typing.Union[typing.Callable") # 3.8-
136
171
  or str(annotation).startswith("typing.Optional[typing.Callable") # 3.9+
172
+ or str(annotation).startswith(
173
+ "typing.Optional[collections.abc.Callable]"
174
+ ) # 3.9+
137
175
  ),
138
176
  )
139
177
  class _OptionalCallableParam(AnnotatedParam):
@@ -145,14 +183,30 @@ class _DataFrameParamBase(AnnotatedParam):
145
183
  super().__init__(param)
146
184
  assert_or_throw(self.required, lambda: TypeError(f"{self} must be required"))
147
185
 
186
+ @property
187
+ def is_per_row(self) -> bool:
188
+ return False
189
+
148
190
  def to_input_data(self, df: DataFrame, ctx: Any) -> Any: # pragma: no cover
149
191
  raise NotImplementedError
150
192
 
193
+ def to_input_rows(
194
+ self,
195
+ df: DataFrame,
196
+ ctx: Any,
197
+ ) -> Iterable[Any]:
198
+ raise NotImplementedError # pragma: no cover
199
+
151
200
  def to_output_df(
152
201
  self, df: Any, schema: Any, ctx: Any
153
202
  ) -> DataFrame: # pragma: no cover
154
203
  raise NotImplementedError
155
204
 
205
+ def iterable_to_output_df(
206
+ self, dfs: Iterable[Any], schema: Any, ctx: Any
207
+ ) -> DataFrame: # pragma: no cover
208
+ raise NotImplementedError
209
+
156
210
  def count(self, df: Any) -> int: # pragma: no cover
157
211
  raise NotImplementedError
158
212
 
@@ -182,6 +236,34 @@ class DataFrameParam(_DataFrameParamBase):
182
236
  return sum(1 for _ in df.as_array_iterable())
183
237
 
184
238
 
239
+ @fugue_annotated_param(DataFrame, "r", child_can_reuse_code=True)
240
+ class RowParam(_DataFrameParamBase):
241
+ @property
242
+ def is_per_row(self) -> bool:
243
+ return True
244
+
245
+ def count(self, df: Any) -> int:
246
+ return 1
247
+
248
+
249
+ @fugue_annotated_param(Dict[str, Any])
250
+ class DictParam(RowParam):
251
+ def to_input_rows(self, df: DataFrame, ctx: Any) -> Iterable[Any]:
252
+ yield from df.as_dict_iterable()
253
+
254
+ def to_output_df(self, output: Dict[str, Any], schema: Any, ctx: Any) -> DataFrame:
255
+ return ArrayDataFrame([list(output.values())], schema)
256
+
257
+ def iterable_to_output_df(
258
+ self, dfs: Iterable[Dict[str, Any]], schema: Any, ctx: Any
259
+ ) -> DataFrame: # pragma: no cover
260
+ params: Dict[str, Any] = {}
261
+ if schema is not None:
262
+ params["schema"] = Schema(schema).pa_schema
263
+ adf = pa.Table.from_pylist(list(dfs), **params)
264
+ return ArrowDataFrame(adf)
265
+
266
+
185
267
  @fugue_annotated_param(AnyDataFrame)
186
268
  class _AnyDataFrameParam(DataFrameParam):
187
269
  def to_output_df(self, output: AnyDataFrame, schema: Any, ctx: Any) -> DataFrame:
@@ -207,6 +289,15 @@ class LocalDataFrameParam(DataFrameParam):
207
289
  )
208
290
  return output
209
291
 
292
+ def iterable_to_output_df(
293
+ self, dfs: Iterable[Any], schema: Any, ctx: Any
294
+ ) -> DataFrame: # pragma: no cover
295
+ def _dfs() -> Iterable[DataFrame]:
296
+ for df in dfs:
297
+ yield self.to_output_df(df, schema, ctx)
298
+
299
+ return LocalDataFrameIterableDataFrame(_dfs(), schema=schema)
300
+
210
301
  def count(self, df: LocalDataFrame) -> int:
211
302
  if df.is_bounded:
212
303
  return df.count()
@@ -375,7 +375,7 @@ class _FuncAsTransformer(Transformer):
375
375
  assert_arg_not_none(schema, "schema")
376
376
  tr = _FuncAsTransformer()
377
377
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
378
- func, "^[lspq][fF]?x*z?$", "^[lspq]$"
378
+ func, "^[lspqr][fF]?x*z?$", "^[lspqr]$"
379
379
  )
380
380
  tr._output_schema_arg = schema # type: ignore
381
381
  tr._validation_rules = validation_rules # type: ignore
@@ -410,7 +410,7 @@ class _FuncAsOutputTransformer(_FuncAsTransformer):
410
410
  validation_rules.update(parse_validation_rules_from_comment(func))
411
411
  tr = _FuncAsOutputTransformer()
412
412
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
413
- func, "^[lspq][fF]?x*z?$", "^[lspnq]$"
413
+ func, "^[lspqr][fF]?x*z?$", "^[lspnqr]$"
414
414
  )
415
415
  tr._output_schema_arg = None # type: ignore
416
416
  tr._validation_rules = validation_rules # type: ignore
@@ -503,7 +503,7 @@ class _FuncAsCoTransformer(CoTransformer):
503
503
  assert_arg_not_none(schema, "schema")
504
504
  tr = _FuncAsCoTransformer()
505
505
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
506
- func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspq]$"
506
+ func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspqr]$"
507
507
  )
508
508
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
509
509
  tr._output_schema_arg = schema # type: ignore
@@ -562,7 +562,7 @@ class _FuncAsOutputCoTransformer(_FuncAsCoTransformer):
562
562
 
563
563
  tr = _FuncAsOutputCoTransformer()
564
564
  tr._wrapper = DataFrameFunctionWrapper( # type: ignore
565
- func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnq]$"
565
+ func, "^(c|[lspq]+)[fF]?x*z?$", "^[lspnqr]$"
566
566
  )
567
567
  tr._dfs_input = tr._wrapper.input_code[0] == "c" # type: ignore
568
568
  tr._output_schema_arg = None # type: ignore