sqlframe 0.1.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. sqlframe-0.1.dev3/.gitignore +145 -0
  2. sqlframe-0.1.dev3/.pre-commit-config.yaml +33 -0
  3. sqlframe-0.1.dev3/LICENSE +21 -0
  4. sqlframe-0.1.dev3/Makefile +37 -0
  5. sqlframe-0.1.dev3/PKG-INFO +170 -0
  6. sqlframe-0.1.dev3/README.md +116 -0
  7. sqlframe-0.1.dev3/blogs/images/but_wait_theres_more.gif +0 -0
  8. sqlframe-0.1.dev3/blogs/images/cake.gif +0 -0
  9. sqlframe-0.1.dev3/blogs/images/you_get_pyspark_api.gif +0 -0
  10. sqlframe-0.1.dev3/blogs/sqlframe_universal_dataframe_api.md +139 -0
  11. sqlframe-0.1.dev3/docs/bigquery.md +479 -0
  12. sqlframe-0.1.dev3/docs/docs/bigquery.md +479 -0
  13. sqlframe-0.1.dev3/docs/docs/duckdb.md +466 -0
  14. sqlframe-0.1.dev3/docs/docs/images/SF.png +0 -0
  15. sqlframe-0.1.dev3/docs/docs/images/favicon.png +0 -0
  16. sqlframe-0.1.dev3/docs/docs/images/favicon_old.png +0 -0
  17. sqlframe-0.1.dev3/docs/docs/images/sqlframe_diagram.png +0 -0
  18. sqlframe-0.1.dev3/docs/docs/images/sqlframe_logo.png +0 -0
  19. sqlframe-0.1.dev3/docs/docs/postgres.md +430 -0
  20. sqlframe-0.1.dev3/docs/duckdb.md +452 -0
  21. sqlframe-0.1.dev3/docs/images/SF.png +0 -0
  22. sqlframe-0.1.dev3/docs/images/favicon.png +0 -0
  23. sqlframe-0.1.dev3/docs/images/favicon_old.png +0 -0
  24. sqlframe-0.1.dev3/docs/images/sqlframe_diagram.png +0 -0
  25. sqlframe-0.1.dev3/docs/images/sqlframe_logo.png +0 -0
  26. sqlframe-0.1.dev3/docs/index.md +1 -0
  27. sqlframe-0.1.dev3/docs/postgres.md +430 -0
  28. sqlframe-0.1.dev3/docs/standalone.md +450 -0
  29. sqlframe-0.1.dev3/docs/stylesheets/extra.css +17 -0
  30. sqlframe-0.1.dev3/mkdocs.yml +50 -0
  31. sqlframe-0.1.dev3/pytest.ini +4 -0
  32. sqlframe-0.1.dev3/setup.cfg +7 -0
  33. sqlframe-0.1.dev3/setup.py +76 -0
  34. sqlframe-0.1.dev3/sqlframe/LICENSE +260 -0
  35. sqlframe-0.1.dev3/sqlframe/__init__.py +0 -0
  36. sqlframe-0.1.dev3/sqlframe/_version.py +16 -0
  37. sqlframe-0.1.dev3/sqlframe/base/__init__.py +0 -0
  38. sqlframe-0.1.dev3/sqlframe/base/_typing.py +39 -0
  39. sqlframe-0.1.dev3/sqlframe/base/catalog.py +1162 -0
  40. sqlframe-0.1.dev3/sqlframe/base/column.py +388 -0
  41. sqlframe-0.1.dev3/sqlframe/base/dataframe.py +1513 -0
  42. sqlframe-0.1.dev3/sqlframe/base/decorators.py +51 -0
  43. sqlframe-0.1.dev3/sqlframe/base/exceptions.py +14 -0
  44. sqlframe-0.1.dev3/sqlframe/base/function_alternatives.py +1055 -0
  45. sqlframe-0.1.dev3/sqlframe/base/functions.py +1678 -0
  46. sqlframe-0.1.dev3/sqlframe/base/group.py +102 -0
  47. sqlframe-0.1.dev3/sqlframe/base/mixins/__init__.py +0 -0
  48. sqlframe-0.1.dev3/sqlframe/base/mixins/catalog_mixins.py +419 -0
  49. sqlframe-0.1.dev3/sqlframe/base/mixins/readwriter_mixins.py +115 -0
  50. sqlframe-0.1.dev3/sqlframe/base/normalize.py +85 -0
  51. sqlframe-0.1.dev3/sqlframe/base/operations.py +87 -0
  52. sqlframe-0.1.dev3/sqlframe/base/readerwriter.py +679 -0
  53. sqlframe-0.1.dev3/sqlframe/base/session.py +593 -0
  54. sqlframe-0.1.dev3/sqlframe/base/transforms.py +11 -0
  55. sqlframe-0.1.dev3/sqlframe/base/types.py +418 -0
  56. sqlframe-0.1.dev3/sqlframe/base/util.py +239 -0
  57. sqlframe-0.1.dev3/sqlframe/base/window.py +139 -0
  58. sqlframe-0.1.dev3/sqlframe/bigquery/__init__.py +23 -0
  59. sqlframe-0.1.dev3/sqlframe/bigquery/catalog.py +254 -0
  60. sqlframe-0.1.dev3/sqlframe/bigquery/column.py +1 -0
  61. sqlframe-0.1.dev3/sqlframe/bigquery/dataframe.py +54 -0
  62. sqlframe-0.1.dev3/sqlframe/bigquery/functions.py +378 -0
  63. sqlframe-0.1.dev3/sqlframe/bigquery/functions.pyi +269 -0
  64. sqlframe-0.1.dev3/sqlframe/bigquery/group.py +14 -0
  65. sqlframe-0.1.dev3/sqlframe/bigquery/readwriter.py +29 -0
  66. sqlframe-0.1.dev3/sqlframe/bigquery/session.py +88 -0
  67. sqlframe-0.1.dev3/sqlframe/bigquery/types.py +1 -0
  68. sqlframe-0.1.dev3/sqlframe/bigquery/window.py +1 -0
  69. sqlframe-0.1.dev3/sqlframe/duckdb/__init__.py +20 -0
  70. sqlframe-0.1.dev3/sqlframe/duckdb/catalog.py +108 -0
  71. sqlframe-0.1.dev3/sqlframe/duckdb/column.py +1 -0
  72. sqlframe-0.1.dev3/sqlframe/duckdb/dataframe.py +55 -0
  73. sqlframe-0.1.dev3/sqlframe/duckdb/functions.py +47 -0
  74. sqlframe-0.1.dev3/sqlframe/duckdb/functions.pyi +183 -0
  75. sqlframe-0.1.dev3/sqlframe/duckdb/group.py +14 -0
  76. sqlframe-0.1.dev3/sqlframe/duckdb/readwriter.py +96 -0
  77. sqlframe-0.1.dev3/sqlframe/duckdb/session.py +65 -0
  78. sqlframe-0.1.dev3/sqlframe/duckdb/types.py +1 -0
  79. sqlframe-0.1.dev3/sqlframe/duckdb/window.py +1 -0
  80. sqlframe-0.1.dev3/sqlframe/postgres/__init__.py +23 -0
  81. sqlframe-0.1.dev3/sqlframe/postgres/catalog.py +106 -0
  82. sqlframe-0.1.dev3/sqlframe/postgres/column.py +1 -0
  83. sqlframe-0.1.dev3/sqlframe/postgres/dataframe.py +54 -0
  84. sqlframe-0.1.dev3/sqlframe/postgres/functions.py +61 -0
  85. sqlframe-0.1.dev3/sqlframe/postgres/functions.pyi +167 -0
  86. sqlframe-0.1.dev3/sqlframe/postgres/group.py +14 -0
  87. sqlframe-0.1.dev3/sqlframe/postgres/readwriter.py +29 -0
  88. sqlframe-0.1.dev3/sqlframe/postgres/session.py +68 -0
  89. sqlframe-0.1.dev3/sqlframe/postgres/types.py +1 -0
  90. sqlframe-0.1.dev3/sqlframe/postgres/window.py +1 -0
  91. sqlframe-0.1.dev3/sqlframe/redshift/__init__.py +23 -0
  92. sqlframe-0.1.dev3/sqlframe/redshift/catalog.py +127 -0
  93. sqlframe-0.1.dev3/sqlframe/redshift/column.py +1 -0
  94. sqlframe-0.1.dev3/sqlframe/redshift/dataframe.py +54 -0
  95. sqlframe-0.1.dev3/sqlframe/redshift/functions.py +18 -0
  96. sqlframe-0.1.dev3/sqlframe/redshift/group.py +14 -0
  97. sqlframe-0.1.dev3/sqlframe/redshift/readwriter.py +29 -0
  98. sqlframe-0.1.dev3/sqlframe/redshift/session.py +53 -0
  99. sqlframe-0.1.dev3/sqlframe/redshift/types.py +1 -0
  100. sqlframe-0.1.dev3/sqlframe/redshift/window.py +1 -0
  101. sqlframe-0.1.dev3/sqlframe/snowflake/__init__.py +26 -0
  102. sqlframe-0.1.dev3/sqlframe/snowflake/catalog.py +134 -0
  103. sqlframe-0.1.dev3/sqlframe/snowflake/column.py +1 -0
  104. sqlframe-0.1.dev3/sqlframe/snowflake/dataframe.py +54 -0
  105. sqlframe-0.1.dev3/sqlframe/snowflake/functions.py +18 -0
  106. sqlframe-0.1.dev3/sqlframe/snowflake/group.py +14 -0
  107. sqlframe-0.1.dev3/sqlframe/snowflake/readwriter.py +29 -0
  108. sqlframe-0.1.dev3/sqlframe/snowflake/session.py +53 -0
  109. sqlframe-0.1.dev3/sqlframe/snowflake/types.py +1 -0
  110. sqlframe-0.1.dev3/sqlframe/snowflake/window.py +1 -0
  111. sqlframe-0.1.dev3/sqlframe/spark/__init__.py +23 -0
  112. sqlframe-0.1.dev3/sqlframe/spark/catalog.py +1028 -0
  113. sqlframe-0.1.dev3/sqlframe/spark/column.py +1 -0
  114. sqlframe-0.1.dev3/sqlframe/spark/dataframe.py +54 -0
  115. sqlframe-0.1.dev3/sqlframe/spark/functions.py +22 -0
  116. sqlframe-0.1.dev3/sqlframe/spark/group.py +14 -0
  117. sqlframe-0.1.dev3/sqlframe/spark/readwriter.py +29 -0
  118. sqlframe-0.1.dev3/sqlframe/spark/session.py +90 -0
  119. sqlframe-0.1.dev3/sqlframe/spark/types.py +1 -0
  120. sqlframe-0.1.dev3/sqlframe/spark/window.py +1 -0
  121. sqlframe-0.1.dev3/sqlframe/standalone/__init__.py +26 -0
  122. sqlframe-0.1.dev3/sqlframe/standalone/catalog.py +13 -0
  123. sqlframe-0.1.dev3/sqlframe/standalone/column.py +1 -0
  124. sqlframe-0.1.dev3/sqlframe/standalone/dataframe.py +36 -0
  125. sqlframe-0.1.dev3/sqlframe/standalone/functions.py +1 -0
  126. sqlframe-0.1.dev3/sqlframe/standalone/group.py +14 -0
  127. sqlframe-0.1.dev3/sqlframe/standalone/readwriter.py +19 -0
  128. sqlframe-0.1.dev3/sqlframe/standalone/session.py +40 -0
  129. sqlframe-0.1.dev3/sqlframe/standalone/types.py +1 -0
  130. sqlframe-0.1.dev3/sqlframe/standalone/window.py +1 -0
  131. sqlframe-0.1.dev3/sqlframe.egg-info/PKG-INFO +170 -0
  132. sqlframe-0.1.dev3/sqlframe.egg-info/SOURCES.txt +181 -0
  133. sqlframe-0.1.dev3/sqlframe.egg-info/dependency_links.txt +1 -0
  134. sqlframe-0.1.dev3/sqlframe.egg-info/requires.txt +42 -0
  135. sqlframe-0.1.dev3/sqlframe.egg-info/top_level.txt +1 -0
  136. sqlframe-0.1.dev3/tests/__init__.py +0 -0
  137. sqlframe-0.1.dev3/tests/common_fixtures.py +205 -0
  138. sqlframe-0.1.dev3/tests/conftest.py +11 -0
  139. sqlframe-0.1.dev3/tests/fixtures/employee.csv +6 -0
  140. sqlframe-0.1.dev3/tests/fixtures/employee.json +5 -0
  141. sqlframe-0.1.dev3/tests/fixtures/employee.parquet +0 -0
  142. sqlframe-0.1.dev3/tests/integration/__init__.py +0 -0
  143. sqlframe-0.1.dev3/tests/integration/engines/__init__.py +0 -0
  144. sqlframe-0.1.dev3/tests/integration/engines/bigquery/__init__.py +0 -0
  145. sqlframe-0.1.dev3/tests/integration/engines/bigquery/test_bigquery_catalog.py +343 -0
  146. sqlframe-0.1.dev3/tests/integration/engines/bigquery/test_bigquery_session.py +20 -0
  147. sqlframe-0.1.dev3/tests/integration/engines/duckdb/__init__.py +0 -0
  148. sqlframe-0.1.dev3/tests/integration/engines/duckdb/test_duckdb_catalog.py +356 -0
  149. sqlframe-0.1.dev3/tests/integration/engines/duckdb/test_duckdb_session.py +13 -0
  150. sqlframe-0.1.dev3/tests/integration/engines/postgres/__init__.py +0 -0
  151. sqlframe-0.1.dev3/tests/integration/engines/postgres/test_postgres_catalog.py +317 -0
  152. sqlframe-0.1.dev3/tests/integration/engines/postgres/test_postgres_session.py +19 -0
  153. sqlframe-0.1.dev3/tests/integration/engines/redshift/__init__.py +0 -0
  154. sqlframe-0.1.dev3/tests/integration/engines/redshift/test_redshift_catalog.py +306 -0
  155. sqlframe-0.1.dev3/tests/integration/engines/redshift/test_redshift_session.py +47 -0
  156. sqlframe-0.1.dev3/tests/integration/engines/snowflake/__init__.py +0 -0
  157. sqlframe-0.1.dev3/tests/integration/engines/snowflake/test_snowflake_catalog.py +333 -0
  158. sqlframe-0.1.dev3/tests/integration/engines/snowflake/test_snowflake_session.py +47 -0
  159. sqlframe-0.1.dev3/tests/integration/engines/spark/__init__.py +0 -0
  160. sqlframe-0.1.dev3/tests/integration/engines/spark/test_spark_catalog.py +244 -0
  161. sqlframe-0.1.dev3/tests/integration/engines/test_engine_dataframe.py +87 -0
  162. sqlframe-0.1.dev3/tests/integration/engines/test_engine_reader.py +131 -0
  163. sqlframe-0.1.dev3/tests/integration/engines/test_engine_session.py +40 -0
  164. sqlframe-0.1.dev3/tests/integration/engines/test_engine_writer.py +176 -0
  165. sqlframe-0.1.dev3/tests/integration/engines/test_int_functions.py +2688 -0
  166. sqlframe-0.1.dev3/tests/integration/fixtures.py +712 -0
  167. sqlframe-0.1.dev3/tests/integration/test_int_dataframe.py +1969 -0
  168. sqlframe-0.1.dev3/tests/integration/test_int_dataframe_stats.py +28 -0
  169. sqlframe-0.1.dev3/tests/integration/test_int_grouped_data.py +165 -0
  170. sqlframe-0.1.dev3/tests/integration/test_int_session.py +70 -0
  171. sqlframe-0.1.dev3/tests/types.py +6 -0
  172. sqlframe-0.1.dev3/tests/unit/__init__.py +0 -0
  173. sqlframe-0.1.dev3/tests/unit/standalone/__init__.py +0 -0
  174. sqlframe-0.1.dev3/tests/unit/standalone/fixtures.py +71 -0
  175. sqlframe-0.1.dev3/tests/unit/standalone/test_column.py +218 -0
  176. sqlframe-0.1.dev3/tests/unit/standalone/test_dataframe.py +46 -0
  177. sqlframe-0.1.dev3/tests/unit/standalone/test_dataframe_writer.py +107 -0
  178. sqlframe-0.1.dev3/tests/unit/standalone/test_functions.py +2792 -0
  179. sqlframe-0.1.dev3/tests/unit/standalone/test_session.py +138 -0
  180. sqlframe-0.1.dev3/tests/unit/standalone/test_session_case_sensitivity.py +110 -0
  181. sqlframe-0.1.dev3/tests/unit/standalone/test_types.py +38 -0
  182. sqlframe-0.1.dev3/tests/unit/standalone/test_window.py +45 -0
@@ -0,0 +1,145 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # PyCharm
+ .idea/
+
+ # Visual Studio Code
+ .vscode
+
+ .DS_STORE
+ metastore_db
+ spark_warehouse
+
+ # Version file
+ sqlframe/_version.py
+
+ # Emacs files
+ *~
@@ -0,0 +1,33 @@
+ repos:
+   - repo: local
+     hooks:
+       - id: ruff
+         name: ruff
+         description: "Run 'ruff' for extremely fast Python linting"
+         entry: ruff check
+           --force-exclude
+           --fix
+           --select I
+           --ignore E721
+           --ignore E741
+         language: python
+         types_or: [python, pyi]
+         require_serial: true
+         additional_dependencies: []
+         files: ^(sqlframe/|tests/|setup.py)
+       - id: ruff-format
+         name: ruff-format
+         description: "Run 'ruff format' for extremely fast Python formatting"
+         entry: ruff format
+           --force-exclude
+           --line-length 100
+         language: python
+         types_or: [python, pyi]
+         require_serial: true
+       - id: mypy
+         name: mypy
+         entry: mypy sqlframe tests
+         language: system
+         types: [ python ]
+         files: ^(sqlframe/|tests/)
+         pass_filenames: false
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Ryan Eakman
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,37 @@
+ install-dev:
+ 	pip install -e ".[dev]"
+
+ install-pre-commit:
+ 	pre-commit install
+
+ slow-test:
+ 	pytest -n auto tests
+
+ fast-test:
+ 	pytest -n auto tests/unit
+
+ local-test:
+ 	pytest -n auto -m "local"
+
+ bigquery-test:
+ 	pytest -n auto -m "bigquery"
+
+ duckdb-test:
+ 	pytest -n auto -m "duckdb"
+
+ style:
+ 	pre-commit run --all-files
+
+ docs-serve:
+ 	mkdocs serve
+
+ stubs:
+ 	stubgen sqlframe/bigquery/functions.py --output ./ --inspect-mode
+ 	stubgen sqlframe/duckdb/functions.py --output ./ --inspect-mode
+ 	stubgen sqlframe/postgres/functions.py --output ./ --inspect-mode
+
+ package:
+ 	pip3 install wheel && python3 setup.py sdist bdist_wheel
+
+ publish: package
+ 	pip3 install twine && python3 -m twine upload dist/*
@@ -0,0 +1,170 @@
+ Metadata-Version: 2.1
+ Name: sqlframe
+ Version: 0.1.dev3
+ Summary: PySpark Dataframe API Compatible SQL Generator
+ Home-page: https://github.com/eakmanrq/sqlframe
+ Author: Ryan Eakman
+ Author-email: eakmanrq@gmail.com
+ License: MIT
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: SQL
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: prettytable
+ Requires-Dist: sqlglot
+ Provides-Extra: bigquery
+ Requires-Dist: google-cloud-bigquery[pandas]; extra == "bigquery"
+ Requires-Dist: google-cloud-bigquery-storage; extra == "bigquery"
+ Provides-Extra: dev
+ Requires-Dist: duckdb; extra == "dev"
+ Requires-Dist: mkdocs==1.4.2; extra == "dev"
+ Requires-Dist: mkdocs-include-markdown-plugin==4.0.3; extra == "dev"
+ Requires-Dist: mkdocs-material==9.0.5; extra == "dev"
+ Requires-Dist: mkdocs-material-extensions==1.1.1; extra == "dev"
+ Requires-Dist: mypy; extra == "dev"
+ Requires-Dist: pandas; extra == "dev"
+ Requires-Dist: pymdown-extensions; extra == "dev"
+ Requires-Dist: psycopg; extra == "dev"
+ Requires-Dist: pyarrow; extra == "dev"
+ Requires-Dist: pyspark; extra == "dev"
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: pytest-postgresql; extra == "dev"
+ Requires-Dist: pytest-xdist; extra == "dev"
+ Requires-Dist: pre-commit; extra == "dev"
+ Requires-Dist: ruff; extra == "dev"
+ Requires-Dist: typing_extensions; extra == "dev"
+ Requires-Dist: types-psycopg2; extra == "dev"
+ Provides-Extra: duckdb
+ Requires-Dist: duckdb; extra == "duckdb"
+ Requires-Dist: pandas; extra == "duckdb"
+ Provides-Extra: postgres
+ Requires-Dist: psycopg2; extra == "postgres"
+ Provides-Extra: redshift
+ Requires-Dist: redshift_connector; extra == "redshift"
+ Provides-Extra: snowflake
+ Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]; extra == "snowflake"
+ Provides-Extra: spark
+ Requires-Dist: pyspark; extra == "spark"
+
+ <div align="center">
+   <img src="docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+ </div>
+
+ ![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)
+
+ SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+ ![SQLFrame Diagram](docs/images/sqlframe_diagram.png)
+
+ SQLFrame currently supports the following engines:
+
+ * [BigQuery](docs/bigquery.md)
+ * [DuckDB](docs/duckdb.md)
+ * [Postgres](docs/postgres.md)
+
+ SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+ * [Standalone](docs/standalone.md)
+
+ SQLFrame is great for:
+
+ * Users who want to run PySpark DataFrame code without having to use a Spark cluster
+ * Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+ * Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+ ## Installation
+
+ ```bash
+ # BigQuery
+ pip install "sqlframe[bigquery]"
+ # DuckDB
+ pip install "sqlframe[duckdb]"
+ # Postgres
+ pip install "sqlframe[postgres]"
+ # Standalone
+ pip install sqlframe
+ ```
+
+ See specific engine documentation for additional setup instructions.
+
+ ## Example Usage
+
+ ```python
+ from sqlframe.bigquery import BigQuerySession
+ from sqlframe.bigquery import functions as F
+ from sqlframe.bigquery import Window
+
+ session = BigQuerySession()
+ table_path = "bigquery-public-data.samples.natality"
+ # Get the top 5 years with the greatest year-over-year % change in new families with a single child
+ df = (
+     session.table(table_path)
+     .where(F.col("ever_born") == 1)
+     .groupBy("year")
+     .agg(F.count("*").alias("num_single_child_families"))
+     .withColumn(
+         "last_year_num_single_child_families",
+         F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+     )
+     .withColumn(
+         "percent_change",
+         (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+         / F.col("last_year_num_single_child_families")
+     )
+     .orderBy(F.abs(F.col("percent_change")).desc())
+     .select(
+         F.col("year").alias("Year"),
+         F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+         F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+     )
+     .limit(5)
+ )
+ ```
+ ```python
+ df.sql()
+ ```
+ ```sql
+ WITH `t94228` AS (
+   SELECT
+     `natality`.`year` AS `year`,
+     COUNT(*) AS `num_single_child_families`
+   FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+   WHERE
+     `natality`.`ever_born` = 1
+   GROUP BY
+     `natality`.`year`
+ ), `t39093` AS (
+   SELECT
+     `t94228`.`year` AS `year`,
+     `t94228`.`num_single_child_families` AS `num_single_child_families`,
+     LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+   FROM `t94228` AS `t94228`
+ )
+ SELECT
+   `t39093`.`year` AS `year`,
+   FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+   FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+ FROM `t39093` AS `t39093`
+ ORDER BY
+   ABS(`percent_change`) DESC
+ LIMIT 5
+ ```
+ ```python
+ df.show()
+ ```
+ ```
+ +------+-------------------------------------+----------------+
+ | year | number of new families single child | percent change |
+ +------+-------------------------------------+----------------+
+ | 1989 |              1,650,246              |      25.02      |
+ | 1974 |               783,448               |      14.49      |
+ | 1977 |              1,057,379              |      11.38      |
+ | 1985 |              1,308,476              |      11.15      |
+ | 1975 |               868,985               |      10.92      |
+ +------+-------------------------------------+----------------+
+ ```
@@ -0,0 +1,116 @@
+ <div align="center">
+   <img src="docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+ </div>
+
+ ![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)
+
+ SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+ ![SQLFrame Diagram](docs/images/sqlframe_diagram.png)
+
+ SQLFrame currently supports the following engines:
+
+ * [BigQuery](docs/bigquery.md)
+ * [DuckDB](docs/duckdb.md)
+ * [Postgres](docs/postgres.md)
+
+ SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+ * [Standalone](docs/standalone.md)
+
+ SQLFrame is great for:
+
+ * Users who want to run PySpark DataFrame code without having to use a Spark cluster
+ * Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+ * Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+ ## Installation
+
+ ```bash
+ # BigQuery
+ pip install "sqlframe[bigquery]"
+ # DuckDB
+ pip install "sqlframe[duckdb]"
+ # Postgres
+ pip install "sqlframe[postgres]"
+ # Standalone
+ pip install sqlframe
+ ```
+
+ See specific engine documentation for additional setup instructions.
+
+ ## Example Usage
+
+ ```python
+ from sqlframe.bigquery import BigQuerySession
+ from sqlframe.bigquery import functions as F
+ from sqlframe.bigquery import Window
+
+ session = BigQuerySession()
+ table_path = "bigquery-public-data.samples.natality"
+ # Get the top 5 years with the greatest year-over-year % change in new families with a single child
+ df = (
+     session.table(table_path)
+     .where(F.col("ever_born") == 1)
+     .groupBy("year")
+     .agg(F.count("*").alias("num_single_child_families"))
+     .withColumn(
+         "last_year_num_single_child_families",
+         F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+     )
+     .withColumn(
+         "percent_change",
+         (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+         / F.col("last_year_num_single_child_families")
+     )
+     .orderBy(F.abs(F.col("percent_change")).desc())
+     .select(
+         F.col("year").alias("Year"),
+         F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+         F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+     )
+     .limit(5)
+ )
+ ```
+ ```python
+ df.sql()
+ ```
+ ```sql
+ WITH `t94228` AS (
+   SELECT
+     `natality`.`year` AS `year`,
+     COUNT(*) AS `num_single_child_families`
+   FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+   WHERE
+     `natality`.`ever_born` = 1
+   GROUP BY
+     `natality`.`year`
+ ), `t39093` AS (
+   SELECT
+     `t94228`.`year` AS `year`,
+     `t94228`.`num_single_child_families` AS `num_single_child_families`,
+     LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+   FROM `t94228` AS `t94228`
+ )
+ SELECT
+   `t39093`.`year` AS `year`,
+   FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+   FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+ FROM `t39093` AS `t39093`
+ ORDER BY
+   ABS(`percent_change`) DESC
+ LIMIT 5
+ ```
+ ```python
+ df.show()
+ ```
+ ```
+ +------+-------------------------------------+----------------+
+ | year | number of new families single child | percent change |
+ +------+-------------------------------------+----------------+
+ | 1989 |              1,650,246              |      25.02      |
+ | 1974 |               783,448               |      14.49      |
+ | 1977 |              1,057,379              |      11.38      |
+ | 1985 |              1,308,476              |      11.15      |
+ | 1975 |               868,985               |      10.92      |
+ +------+-------------------------------------+----------------+
+ ```
@@ -0,0 +1,139 @@
+ # SQLFrame: Turning PySpark into a Universal DataFrame API
+ ## Have your SQL/Python Cake and Eat it Too
+
+ After 13 years as a data engineer, I've grown accustomed to constant change - whether significant shifts like the move to the cloud or smaller trends like the rise of notebooks.
+ Amid all this change, one thing has remained constant: SQL.
+ Every job I've had, from startups to FAANG, has leveraged the fact that I both understand and write SQL.
+ SQL is the universal language that unites all data professionals, and it enables performant pipelines by allowing query planners and optimizers to handle the complex details of distributed processing.
+
+ Despite its strengths, SQL often seems ill-suited for maintaining data pipelines.
+ The language lacks support for abstracting common operations or unit testing specific segments of code, leading many to use Jinja as a makeshift solution.
+ Jinja SQL is to SQL what Pig Latin is to English - it can be fun in small doses but is impossible to understand at scale.
+ Moreover, SQL's repetitive nature, requiring columns to be repeated across operations, leads to fatigue, with many data practitioners answering the siren song of `SELECT *` and later being found drowning in a sea of non-determinism.
+
+ This has put data professionals in a tough spot: do you write your pipelines in SQL to favor accessibility, or in Python to favor maintainability?
+ Well, starting today, you no longer have to choose.
+ You can finally have your cake and eat it too.
+
+ <div align="center">
+   <img src="images/cake.gif" alt="Bruce Bogtrotter" width="800"/>
+ </div>
+
+ ### Introducing open-source SQLFrame!
+
+ <div align="center">
+   <img src="../docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="800"/>
+ </div>
+
+ SQLFrame revolutionizes how data professionals interact with SQL and PySpark DataFrames.
+ Unlike traditional PySpark, SQLFrame converts DataFrame operations directly into SQL, enabling real-time SQL script generation during development.
+ Here's how it works:
+
+ Consider a scenario where we analyze the number of new families per year that chose to have a single child, based on publicly accessible natality data.
+
+ ```python
+ from sqlframe.bigquery import BigQuerySession
+ from sqlframe.bigquery import functions as F
+ from sqlframe.bigquery import Window
+
+ # Unique to SQLFrame: Ability to connect directly to BigQuery
+ session = BigQuerySession()
+ table_path = "bigquery-public-data.samples.natality"
+ # Get the top 5 years with the greatest year-over-year % change in new families with a single child
+ df = (
+     session.table(table_path)
+     .where(F.col("ever_born") == 1)
+     .groupBy("year")
+     .agg(F.count("*").alias("num_single_child_families"))
+     .withColumn("last_year_num_single_child_families", F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year")))
+     .withColumn("percent_change", (F.col("num_single_child_families") - F.col("last_year_num_single_child_families")) / F.col("last_year_num_single_child_families"))
+     .orderBy(F.abs(F.col("percent_change")).desc())
+     .select(
+         F.col("year").alias("Year"),
+         F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+         F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+     )
+     .limit(5)
+ )
+ # Unique to SQLFrame: Ability to see the SQL for your DataFrame
+ df.sql()
+ ```
+
+ Using SQLFrame, you can use the BigQuery-specific session class to integrate seamlessly with your BigQuery environment.
+ The DataFrame operations mimic those you'd perform in PySpark, but with SQLFrame you can also generate and review the corresponding SQL queries in real time using the `df.sql()` method.
+
+ ```sql
+ WITH `t94228` AS (
+   SELECT
+     `natality`.`year` AS `year`,
+     COUNT(*) AS `num_single_child_families`
+   FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+   WHERE
+     `natality`.`ever_born` = 1
+   GROUP BY
+     `natality`.`year`
+ ), `t39093` AS (
+   SELECT
+     `t94228`.`year` AS `year`,
+     `t94228`.`num_single_child_families` AS `num_single_child_families`,
+     LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+   FROM `t94228` AS `t94228`
+ )
+ SELECT
+   `t39093`.`year` AS `year`,
+   FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+   FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+ FROM `t39093` AS `t39093`
+ ORDER BY
+   ABS(`percent_change`) DESC
+ LIMIT 5
+ ```
+
+ This feature not only enhances understanding but also ensures that the SQL output is deterministic, making it suitable for version control.
+ That way you can version both the Python and SQL representations of your pipelines and let your co-workers pick which format best suits them!
+
+ <div align="center">
+   <img src="images/but_wait_theres_more.gif" alt="There is more" width="800"/>
+ </div>
+
+ SQLFrame, though, is much more than a SQL generator: the goal is to make the PySpark DataFrame API feel like a native DataFrame API on all major data warehouses.
+ It therefore enables users to execute DataFrame API pipelines directly on their data warehouse without any Spark clusters or libraries!
+
+ For instance, replacing `.sql()` with `.show()` in your pipeline displays the results directly from BigQuery, just as it would in PySpark.
+
+ ```text
+ >>> df.show()
+ +------+-------------------------------------+----------------+
+ | year | number of new families single child | percent change |
+ +------+-------------------------------------+----------------+
+ | 1989 |              1,650,246              |      25.02      |
+ | 1974 |               783,448               |      14.49      |
+ | 1977 |              1,057,379              |      11.38      |
+ | 1985 |              1,308,476              |      11.15      |
+ | 1975 |               868,985               |      10.92      |
+ +------+-------------------------------------+----------------+
+ ```
+
+ Even many of the Catalog operations are supported, like `listColumns`:
+
+ ```text
+ >>> columns = session.catalog.listColumns(table_path)
+ >>> print("\n".join([f"Name: {x.name}, Data Type: {x.dataType}, Desc: {x.description}" for x in columns]))
+ Name: source_year, Data Type: INT64, Desc: Four-digit year of the birth. Example: 1975.
+ Name: year, Data Type: INT64, Desc: Four-digit year of the birth. Example: 1975.
+ Name: month, Data Type: INT64, Desc: Month index of the date of birth, where 1=January.
+ Name: day, Data Type: INT64, Desc: Day of birth, starting from 1.
+ Name: wday, Data Type: INT64, Desc: Day of the week, where 1 is Sunday and 7 is Saturday.
+ Name: state, Data Type: STRING, Desc: The two character postal code for the state. Entries after 2004 do not include this value.
+ ```
+
+ So not only does SQLFrame make your DataFrame pipelines more accessible, it also turns the PySpark DataFrame API into a universal DataFrame API that any data practitioner can enjoy!
+
+ <div align="center">
+   <img src="images/you_get_pyspark_api.gif" alt="You get the PySpark API" width="800"/>
+ </div>
+
+ SQLFrame currently supports BigQuery, DuckDB, and PostgreSQL, with Redshift, Snowflake, Spark, and Trino in development.
+ For those interested in experimenting with SQL generation for other engines, the "StandaloneSession" provides a flexible testing ground.
+
+ Follow the simple setup guide to begin integrating SQLFrame into your projects today!
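The blog post closes by pointing at the "StandaloneSession" as a testing ground without showing it, so here is a minimal sketch (editorial, not from the package). It assumes `StandaloneSession` is importable from `sqlframe.standalone` (consistent with `sqlframe/standalone/__init__.py` in the file listing) and that `df.sql()` accepts a `dialect` argument to choose the target engine; both are assumptions to verify against the standalone docs.

```python
# A minimal sketch, assuming StandaloneSession generates SQL without any database
# connection and that df.sql() takes a `dialect` argument (unverified assumption).
from sqlframe.standalone import StandaloneSession
from sqlframe.standalone import functions as F

session = StandaloneSession()

df = (
    session.createDataFrame([(2019, 1), (2020, 1)], schema=["year", "ever_born"])
    .where(F.col("ever_born") == 1)
    .groupBy("year")
    .agg(F.count("*").alias("num_single_child_families"))
)

# Generate dialect-specific SQL strings without connecting to any engine.
print(df.sql(dialect="bigquery"))
print(df.sql(dialect="duckdb"))
```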