sqlframe-0.1.dev3.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe-0.1.dev3/.gitignore +145 -0
- sqlframe-0.1.dev3/.pre-commit-config.yaml +33 -0
- sqlframe-0.1.dev3/LICENSE +21 -0
- sqlframe-0.1.dev3/Makefile +37 -0
- sqlframe-0.1.dev3/PKG-INFO +170 -0
- sqlframe-0.1.dev3/README.md +116 -0
- sqlframe-0.1.dev3/blogs/images/but_wait_theres_more.gif +0 -0
- sqlframe-0.1.dev3/blogs/images/cake.gif +0 -0
- sqlframe-0.1.dev3/blogs/images/you_get_pyspark_api.gif +0 -0
- sqlframe-0.1.dev3/blogs/sqlframe_universal_dataframe_api.md +139 -0
- sqlframe-0.1.dev3/docs/bigquery.md +479 -0
- sqlframe-0.1.dev3/docs/docs/bigquery.md +479 -0
- sqlframe-0.1.dev3/docs/docs/duckdb.md +466 -0
- sqlframe-0.1.dev3/docs/docs/images/SF.png +0 -0
- sqlframe-0.1.dev3/docs/docs/images/favicon.png +0 -0
- sqlframe-0.1.dev3/docs/docs/images/favicon_old.png +0 -0
- sqlframe-0.1.dev3/docs/docs/images/sqlframe_diagram.png +0 -0
- sqlframe-0.1.dev3/docs/docs/images/sqlframe_logo.png +0 -0
- sqlframe-0.1.dev3/docs/docs/postgres.md +430 -0
- sqlframe-0.1.dev3/docs/duckdb.md +452 -0
- sqlframe-0.1.dev3/docs/images/SF.png +0 -0
- sqlframe-0.1.dev3/docs/images/favicon.png +0 -0
- sqlframe-0.1.dev3/docs/images/favicon_old.png +0 -0
- sqlframe-0.1.dev3/docs/images/sqlframe_diagram.png +0 -0
- sqlframe-0.1.dev3/docs/images/sqlframe_logo.png +0 -0
- sqlframe-0.1.dev3/docs/index.md +1 -0
- sqlframe-0.1.dev3/docs/postgres.md +430 -0
- sqlframe-0.1.dev3/docs/standalone.md +450 -0
- sqlframe-0.1.dev3/docs/stylesheets/extra.css +17 -0
- sqlframe-0.1.dev3/mkdocs.yml +50 -0
- sqlframe-0.1.dev3/pytest.ini +4 -0
- sqlframe-0.1.dev3/setup.cfg +7 -0
- sqlframe-0.1.dev3/setup.py +76 -0
- sqlframe-0.1.dev3/sqlframe/LICENSE +260 -0
- sqlframe-0.1.dev3/sqlframe/__init__.py +0 -0
- sqlframe-0.1.dev3/sqlframe/_version.py +16 -0
- sqlframe-0.1.dev3/sqlframe/base/__init__.py +0 -0
- sqlframe-0.1.dev3/sqlframe/base/_typing.py +39 -0
- sqlframe-0.1.dev3/sqlframe/base/catalog.py +1162 -0
- sqlframe-0.1.dev3/sqlframe/base/column.py +388 -0
- sqlframe-0.1.dev3/sqlframe/base/dataframe.py +1513 -0
- sqlframe-0.1.dev3/sqlframe/base/decorators.py +51 -0
- sqlframe-0.1.dev3/sqlframe/base/exceptions.py +14 -0
- sqlframe-0.1.dev3/sqlframe/base/function_alternatives.py +1055 -0
- sqlframe-0.1.dev3/sqlframe/base/functions.py +1678 -0
- sqlframe-0.1.dev3/sqlframe/base/group.py +102 -0
- sqlframe-0.1.dev3/sqlframe/base/mixins/__init__.py +0 -0
- sqlframe-0.1.dev3/sqlframe/base/mixins/catalog_mixins.py +419 -0
- sqlframe-0.1.dev3/sqlframe/base/mixins/readwriter_mixins.py +115 -0
- sqlframe-0.1.dev3/sqlframe/base/normalize.py +85 -0
- sqlframe-0.1.dev3/sqlframe/base/operations.py +87 -0
- sqlframe-0.1.dev3/sqlframe/base/readerwriter.py +679 -0
- sqlframe-0.1.dev3/sqlframe/base/session.py +593 -0
- sqlframe-0.1.dev3/sqlframe/base/transforms.py +11 -0
- sqlframe-0.1.dev3/sqlframe/base/types.py +418 -0
- sqlframe-0.1.dev3/sqlframe/base/util.py +239 -0
- sqlframe-0.1.dev3/sqlframe/base/window.py +139 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/__init__.py +23 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/catalog.py +254 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/dataframe.py +54 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/functions.py +378 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/functions.pyi +269 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/readwriter.py +29 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/session.py +88 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/bigquery/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/__init__.py +20 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/catalog.py +108 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/dataframe.py +55 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/functions.py +47 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/functions.pyi +183 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/readwriter.py +96 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/session.py +65 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/duckdb/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/postgres/__init__.py +23 -0
- sqlframe-0.1.dev3/sqlframe/postgres/catalog.py +106 -0
- sqlframe-0.1.dev3/sqlframe/postgres/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/postgres/dataframe.py +54 -0
- sqlframe-0.1.dev3/sqlframe/postgres/functions.py +61 -0
- sqlframe-0.1.dev3/sqlframe/postgres/functions.pyi +167 -0
- sqlframe-0.1.dev3/sqlframe/postgres/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/postgres/readwriter.py +29 -0
- sqlframe-0.1.dev3/sqlframe/postgres/session.py +68 -0
- sqlframe-0.1.dev3/sqlframe/postgres/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/postgres/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/redshift/__init__.py +23 -0
- sqlframe-0.1.dev3/sqlframe/redshift/catalog.py +127 -0
- sqlframe-0.1.dev3/sqlframe/redshift/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/redshift/dataframe.py +54 -0
- sqlframe-0.1.dev3/sqlframe/redshift/functions.py +18 -0
- sqlframe-0.1.dev3/sqlframe/redshift/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/redshift/readwriter.py +29 -0
- sqlframe-0.1.dev3/sqlframe/redshift/session.py +53 -0
- sqlframe-0.1.dev3/sqlframe/redshift/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/redshift/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/__init__.py +26 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/catalog.py +134 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/dataframe.py +54 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/functions.py +18 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/readwriter.py +29 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/session.py +53 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/snowflake/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/spark/__init__.py +23 -0
- sqlframe-0.1.dev3/sqlframe/spark/catalog.py +1028 -0
- sqlframe-0.1.dev3/sqlframe/spark/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/spark/dataframe.py +54 -0
- sqlframe-0.1.dev3/sqlframe/spark/functions.py +22 -0
- sqlframe-0.1.dev3/sqlframe/spark/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/spark/readwriter.py +29 -0
- sqlframe-0.1.dev3/sqlframe/spark/session.py +90 -0
- sqlframe-0.1.dev3/sqlframe/spark/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/spark/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe/standalone/__init__.py +26 -0
- sqlframe-0.1.dev3/sqlframe/standalone/catalog.py +13 -0
- sqlframe-0.1.dev3/sqlframe/standalone/column.py +1 -0
- sqlframe-0.1.dev3/sqlframe/standalone/dataframe.py +36 -0
- sqlframe-0.1.dev3/sqlframe/standalone/functions.py +1 -0
- sqlframe-0.1.dev3/sqlframe/standalone/group.py +14 -0
- sqlframe-0.1.dev3/sqlframe/standalone/readwriter.py +19 -0
- sqlframe-0.1.dev3/sqlframe/standalone/session.py +40 -0
- sqlframe-0.1.dev3/sqlframe/standalone/types.py +1 -0
- sqlframe-0.1.dev3/sqlframe/standalone/window.py +1 -0
- sqlframe-0.1.dev3/sqlframe.egg-info/PKG-INFO +170 -0
- sqlframe-0.1.dev3/sqlframe.egg-info/SOURCES.txt +181 -0
- sqlframe-0.1.dev3/sqlframe.egg-info/dependency_links.txt +1 -0
- sqlframe-0.1.dev3/sqlframe.egg-info/requires.txt +42 -0
- sqlframe-0.1.dev3/sqlframe.egg-info/top_level.txt +1 -0
- sqlframe-0.1.dev3/tests/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/common_fixtures.py +205 -0
- sqlframe-0.1.dev3/tests/conftest.py +11 -0
- sqlframe-0.1.dev3/tests/fixtures/employee.csv +6 -0
- sqlframe-0.1.dev3/tests/fixtures/employee.json +5 -0
- sqlframe-0.1.dev3/tests/fixtures/employee.parquet +0 -0
- sqlframe-0.1.dev3/tests/integration/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/bigquery/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/bigquery/test_bigquery_catalog.py +343 -0
- sqlframe-0.1.dev3/tests/integration/engines/bigquery/test_bigquery_session.py +20 -0
- sqlframe-0.1.dev3/tests/integration/engines/duckdb/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/duckdb/test_duckdb_catalog.py +356 -0
- sqlframe-0.1.dev3/tests/integration/engines/duckdb/test_duckdb_session.py +13 -0
- sqlframe-0.1.dev3/tests/integration/engines/postgres/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/postgres/test_postgres_catalog.py +317 -0
- sqlframe-0.1.dev3/tests/integration/engines/postgres/test_postgres_session.py +19 -0
- sqlframe-0.1.dev3/tests/integration/engines/redshift/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/redshift/test_redshift_catalog.py +306 -0
- sqlframe-0.1.dev3/tests/integration/engines/redshift/test_redshift_session.py +47 -0
- sqlframe-0.1.dev3/tests/integration/engines/snowflake/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/snowflake/test_snowflake_catalog.py +333 -0
- sqlframe-0.1.dev3/tests/integration/engines/snowflake/test_snowflake_session.py +47 -0
- sqlframe-0.1.dev3/tests/integration/engines/spark/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/integration/engines/spark/test_spark_catalog.py +244 -0
- sqlframe-0.1.dev3/tests/integration/engines/test_engine_dataframe.py +87 -0
- sqlframe-0.1.dev3/tests/integration/engines/test_engine_reader.py +131 -0
- sqlframe-0.1.dev3/tests/integration/engines/test_engine_session.py +40 -0
- sqlframe-0.1.dev3/tests/integration/engines/test_engine_writer.py +176 -0
- sqlframe-0.1.dev3/tests/integration/engines/test_int_functions.py +2688 -0
- sqlframe-0.1.dev3/tests/integration/fixtures.py +712 -0
- sqlframe-0.1.dev3/tests/integration/test_int_dataframe.py +1969 -0
- sqlframe-0.1.dev3/tests/integration/test_int_dataframe_stats.py +28 -0
- sqlframe-0.1.dev3/tests/integration/test_int_grouped_data.py +165 -0
- sqlframe-0.1.dev3/tests/integration/test_int_session.py +70 -0
- sqlframe-0.1.dev3/tests/types.py +6 -0
- sqlframe-0.1.dev3/tests/unit/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/unit/standalone/__init__.py +0 -0
- sqlframe-0.1.dev3/tests/unit/standalone/fixtures.py +71 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_column.py +218 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_dataframe.py +46 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_dataframe_writer.py +107 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_functions.py +2792 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_session.py +138 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_session_case_sensitivity.py +110 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_types.py +38 -0
- sqlframe-0.1.dev3/tests/unit/standalone/test_window.py +45 -0
@@ -0,0 +1,145 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# PyCharm
+.idea/
+
+# Visual Studio Code
+.vscode
+
+.DS_STORE
+metastore_db
+spark_warehouse
+
+# Version file
+sqlframe/_version.py
+
+# Emacs files
+*~
@@ -0,0 +1,33 @@
+repos:
+  - repo: local
+    hooks:
+      - id: ruff
+        name: ruff
+        description: "Run 'ruff' for extremely fast Python linting"
+        entry: ruff check
+          --force-exclude
+          --fix
+          --select I
+          --ignore E721
+          --ignore E741
+        language: python
+        types_or: [python, pyi]
+        require_serial: true
+        additional_dependencies: []
+        files: ^(sqlframe/|tests/|setup.py)
+      - id: ruff-format
+        name: ruff-format
+        description: "Run 'ruff format' for extremely fast Python formatting"
+        entry: ruff format
+          --force-exclude
+          --line-length 100
+        language: python
+        types_or: [python, pyi]
+        require_serial: true
+      - id: mypy
+        name: mypy
+        entry: mypy sqlframe tests
+        language: system
+        types: [ python ]
+        files: ^(sqlframe/|tests/)
+        pass_filenames: false
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Ryan Eakman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,37 @@
+install-dev:
+	pip install -e ".[dev]"
+
+install-pre-commit:
+	pre-commit install
+
+slow-test:
+	pytest -n auto tests
+
+fast-test:
+	pytest -n auto tests/unit
+
+local-test:
+	pytest -n auto -m "local"
+
+bigquery-test:
+	pytest -n auto -m "bigquery"
+
+duckdb-test:
+	pytest -n auto -m "duckdb"
+
+style:
+	pre-commit run --all-files
+
+docs-serve:
+	mkdocs serve
+
+stubs:
+	stubgen sqlframe/bigquery/functions.py --output ./ --inspect-mode
+	stubgen sqlframe/duckdb/functions.py --output ./ --inspect-mode
+	stubgen sqlframe/postgres/functions.py --output ./ --inspect-mode
+
+package:
+	pip3 install wheel && python3 setup.py sdist bdist_wheel
+
+publish: package
+	pip3 install twine && python3 -m twine upload dist/*
@@ -0,0 +1,170 @@
+Metadata-Version: 2.1
+Name: sqlframe
+Version: 0.1.dev3
+Summary: PySpark Dataframe API Compatible SQL Generator
+Home-page: https://github.com/eakmanrq/sqlframe
+Author: Ryan Eakman
+Author-email: eakmanrq@gmail.com
+License: MIT
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: SQL
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: prettytable
+Requires-Dist: sqlglot
+Provides-Extra: bigquery
+Requires-Dist: google-cloud-bigquery[pandas]; extra == "bigquery"
+Requires-Dist: google-cloud-bigquery-storage; extra == "bigquery"
+Provides-Extra: dev
+Requires-Dist: duckdb; extra == "dev"
+Requires-Dist: mkdocs==1.4.2; extra == "dev"
+Requires-Dist: mkdocs-include-markdown-plugin==4.0.3; extra == "dev"
+Requires-Dist: mkdocs-material==9.0.5; extra == "dev"
+Requires-Dist: mkdocs-material-extensions==1.1.1; extra == "dev"
+Requires-Dist: mypy; extra == "dev"
+Requires-Dist: pandas; extra == "dev"
+Requires-Dist: pymdown-extensions; extra == "dev"
+Requires-Dist: psycopg; extra == "dev"
+Requires-Dist: pyarrow; extra == "dev"
+Requires-Dist: pyspark; extra == "dev"
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: pytest-postgresql; extra == "dev"
+Requires-Dist: pytest-xdist; extra == "dev"
+Requires-Dist: pre-commit; extra == "dev"
+Requires-Dist: ruff; extra == "dev"
+Requires-Dist: typing_extensions; extra == "dev"
+Requires-Dist: types-psycopg2; extra == "dev"
+Provides-Extra: duckdb
+Requires-Dist: duckdb; extra == "duckdb"
+Requires-Dist: pandas; extra == "duckdb"
+Provides-Extra: postgres
+Requires-Dist: psycopg2; extra == "postgres"
+Provides-Extra: redshift
+Requires-Dist: redshift_connector; extra == "redshift"
+Provides-Extra: snowflake
+Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]; extra == "snowflake"
+Provides-Extra: spark
+Requires-Dist: pyspark; extra == "spark"
+
+<div align="center">
+  <img src="docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+</div>
+
+
+
+SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+
+
+SQLFrame currently supports the following engines:
+
+* [BigQuery](docs/bigquery.md)
+* [DuckDB](docs/duckdb.md)
+* [Postgres](docs/postgres.md)
+
+SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+* [Standalone](docs/standalone.md)
+
+SQLFrame is great for:
+
+* Users who want to run PySpark DataFrame code without having to use a Spark cluster
+* Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+* Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+## Installation
+
+```bash
+# BigQuery
+pip install "sqlframe[bigquery]"
+# DuckDB
+pip install "sqlframe[duckdb]"
+# Postgres
+pip install "sqlframe[postgres]"
+# Standalone
+pip install sqlframe
+```
+
+See specific engine documentation for additional setup instructions.
+
+## Example Usage
+
+```python
+from sqlframe.bigquery import BigQuerySession
+from sqlframe.bigquery import functions as F
+from sqlframe.bigquery import Window
+
+session = BigQuerySession()
+table_path = "bigquery-public-data.samples.natality"
+# Get the top 5 years with the greatest year-over-year % change in new families with a single child
+df = (
+    session.table(table_path)
+    .where(F.col("ever_born") == 1)
+    .groupBy("year")
+    .agg(F.count("*").alias("num_single_child_families"))
+    .withColumn(
+        "last_year_num_single_child_families",
+        F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+    )
+    .withColumn(
+        "percent_change",
+        (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+        / F.col("last_year_num_single_child_families")
+    )
+    .orderBy(F.abs(F.col("percent_change")).desc())
+    .select(
+        F.col("year").alias("Year"),
+        F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+        F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+    )
+    .limit(5)
+)
+```
+```python
+df.sql()
+```
+```sql
+WITH `t94228` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `t39093` AS (
+  SELECT
+    `t94228`.`year` AS `year`,
+    `t94228`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+  FROM `t94228` AS `t94228`
+)
+SELECT
+  `t39093`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+  FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+FROM `t39093` AS `t39093`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+```python
+df.show()
+```
+```
++------+-------------------------------------+----------------+
+| year | number of new families single child | percent change |
++------+-------------------------------------+----------------+
+| 1989 |              1,650,246              |     25.02      |
+| 1974 |               783,448               |     14.49      |
+| 1977 |              1,057,379              |     11.38      |
+| 1985 |              1,308,476              |     11.15      |
+| 1975 |               868,985               |     10.92      |
++------+-------------------------------------+----------------+
+```
@@ -0,0 +1,116 @@
+<div align="center">
+  <img src="docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+</div>
+
+
+
+SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+
+
+SQLFrame currently supports the following engines:
+
+* [BigQuery](docs/bigquery.md)
+* [DuckDB](docs/duckdb.md)
+* [Postgres](docs/postgres.md)
+
+SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+* [Standalone](docs/standalone.md)
+
+SQLFrame is great for:
+
+* Users who want to run PySpark DataFrame code without having to use a Spark cluster
+* Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+* Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+## Installation
+
+```bash
+# BigQuery
+pip install "sqlframe[bigquery]"
+# DuckDB
+pip install "sqlframe[duckdb]"
+# Postgres
+pip install "sqlframe[postgres]"
+# Standalone
+pip install sqlframe
+```
+
+See specific engine documentation for additional setup instructions.
+
+## Example Usage
+
+```python
+from sqlframe.bigquery import BigQuerySession
+from sqlframe.bigquery import functions as F
+from sqlframe.bigquery import Window
+
+session = BigQuerySession()
+table_path = "bigquery-public-data.samples.natality"
+# Get the top 5 years with the greatest year-over-year % change in new families with a single child
+df = (
+    session.table(table_path)
+    .where(F.col("ever_born") == 1)
+    .groupBy("year")
+    .agg(F.count("*").alias("num_single_child_families"))
+    .withColumn(
+        "last_year_num_single_child_families",
+        F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+    )
+    .withColumn(
+        "percent_change",
+        (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+        / F.col("last_year_num_single_child_families")
+    )
+    .orderBy(F.abs(F.col("percent_change")).desc())
+    .select(
+        F.col("year").alias("Year"),
+        F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+        F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+    )
+    .limit(5)
+)
+```
+```python
+df.sql()
+```
+```sql
+WITH `t94228` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `t39093` AS (
+  SELECT
+    `t94228`.`year` AS `year`,
+    `t94228`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+  FROM `t94228` AS `t94228`
+)
+SELECT
+  `t39093`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+  FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+FROM `t39093` AS `t39093`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+```python
+df.show()
+```
+```
++------+-------------------------------------+----------------+
+| year | number of new families single child | percent change |
++------+-------------------------------------+----------------+
+| 1989 |              1,650,246              |     25.02      |
+| 1974 |               783,448               |     14.49      |
+| 1977 |              1,057,379              |     11.38      |
+| 1985 |              1,308,476              |     11.15      |
+| 1975 |               868,985               |     10.92      |
++------+-------------------------------------+----------------+
+```
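The "Standalone" session mentioned in the README above is the connectionless mode: it only generates SQL. A minimal sketch of how that might look, assuming the standalone module follows the same import pattern as the engine sessions and that `createDataFrame`/`sql()` mirror their PySpark counterparts (this sketch is not taken from docs/standalone.md):

```python
# Hypothetical sketch of the Standalone session; import path and method
# behavior are assumed to match the per-engine layout shown in the file
# listing above (sqlframe/standalone/session.py) and the PySpark API.
from sqlframe.standalone import StandaloneSession
from sqlframe.standalone import functions as F

session = StandaloneSession()  # no database connection is created

df = (
    session.createDataFrame([(1, "Jack"), (2, "Jill")], schema=["id", "name"])
    .where(F.col("id") == 1)
    .select("name")
)

# Nothing is executed; the Standalone session's output is the SQL text itself
print(df.sql())
```

Because there is no engine connection, execution-style calls such as `show()` are out of scope for this mode; the value is the generated SQL string.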
Binary file
Binary file
Binary file
@@ -0,0 +1,139 @@
+# SQLFrame: Turning PySpark into a Universal DataFrame API
+## Have your SQL/Python Cake and Eat it Too
+
+After 13 years as a data engineer, I've grown accustomed to constant change - whether significant shifts like the move to the cloud or smaller trends like the rise of notebooks.
+Amid all this change, one thing has remained constant: SQL.
+Every job I've had, from startups to FAANG, has leveraged the fact that I both understand and write SQL.
+SQL is the universal language that unites all data professionals, and it enables performant pipelines by allowing query planners and optimizers to handle the complex details of distributed processing.
+
+Despite its strengths, SQL often seems ill-suited for maintaining data pipelines.
+The language lacks support for abstracting common operations or unit testing specific segments of code, leading many to use Jinja as a makeshift solution.
+Jinja SQL is to SQL what Pig Latin is to English - it can be fun in small doses but impossible to understand at scale.
+Moreover, SQL's repetitive nature, requiring columns to be repeated across operations, leads to fatigue; many data practitioners answer the siren song of `SELECT *` and are later found drowning in a sea of non-determinism.
+
+This has put data professionals in a tough spot: Do you write your pipelines in SQL to favor accessibility or Python to favor maintainability?
+Well, starting today, you no longer have to choose.
+You can finally have your cake and eat it too.
+
+<div align="center">
+  <img src="images/cake.gif" alt="Bruce Bogtrotter" width="800"/>
+</div>
+
+### Introducing open-source SQLFrame!
+
+<div align="center">
+  <img src="../docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="800"/>
+</div>
+
+SQLFrame revolutionizes how data professionals interact with SQL and PySpark DataFrames.
+Unlike traditional PySpark, SQLFrame converts DataFrame operations directly into SQL, enabling real-time SQL script generation during development.
+Here's how it works:
+
+Consider a scenario where we analyze the number of new families per year that chose to have a single child, based on publicly accessible natality data.
+
+```python
+from sqlframe.bigquery import BigQuerySession
+from sqlframe.bigquery import functions as F
+from sqlframe.bigquery import Window
+
+# Unique to SQLFrame: Ability to connect directly to BigQuery
+session = BigQuerySession()
+table_path = "bigquery-public-data.samples.natality"
+# Get the top 5 years with the greatest year-over-year % change in new families with a single child
+df = (
+    session.table(table_path)
+    .where(F.col("ever_born") == 1)
+    .groupBy("year")
+    .agg(F.count("*").alias("num_single_child_families"))
+    .withColumn("last_year_num_single_child_families", F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year")))
+    .withColumn("percent_change", (F.col("num_single_child_families") - F.col("last_year_num_single_child_families")) / F.col("last_year_num_single_child_families"))
+    .orderBy(F.abs(F.col("percent_change")).desc())
+    .select(
+        F.col("year").alias("Year"),
+        F.format_number("num_single_child_families", 0).alias("number of new families single child"),
+        F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+    )
+    .limit(5)
+)
+# Unique to SQLFrame: Ability to see the SQL for your DataFrame
+df.sql()
+```
+
+Using SQLFrame, you work with the BigQuery-specific session class to integrate directly with your BigQuery environment.
+The DataFrame operations mimic those you'd perform in PySpark, but with SQLFrame, you can also generate and review the corresponding SQL queries in real time using the `df.sql()` method.
+
+```sql
+WITH `t94228` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `t39093` AS (
+  SELECT
+    `t94228`.`year` AS `year`,
+    `t94228`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+  FROM `t94228` AS `t94228`
+)
+SELECT
+  `t39093`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `number of new families single child`,
+  FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+FROM `t39093` AS `t39093`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+
+This feature not only enhances understanding but also ensures that the SQL output is deterministic, making it suitable for version control.
+That way you can version both the Python and SQL representations of your pipelines and let your co-workers pick which format best suits them!
+
+<div align="center">
+  <img src="images/but_wait_theres_more.gif" alt="There is more" width="800"/>
+</div>
+
+SQLFrame, though, is much more than a SQL generator: the goal is to make the PySpark DataFrame API feel like a native DataFrame API on all major data warehouses.
+It therefore lets users execute DataFrame API pipelines directly on their data warehouse without any Spark clusters or libraries!
+
+For instance, replacing `.sql()` with `.show()` in your pipeline displays the results directly from BigQuery, just as it would in PySpark.
+
+```text
+>>> df.show()
++------+-------------------------------------+----------------+
+| year | number of new families single child | percent change |
++------+-------------------------------------+----------------+
+| 1989 |              1,650,246              |     25.02      |
+| 1974 |               783,448               |     14.49      |
+| 1977 |              1,057,379              |     11.38      |
+| 1985 |              1,308,476              |     11.15      |
+| 1975 |               868,985               |     10.92      |
++------+-------------------------------------+----------------+
+```
+
+Many of the Catalog operations are supported as well, such as `listColumns`:
+
+```text
+>>> columns = session.catalog.listColumns(table_path)
+>>> print("\n".join([f"Name: {x.name}, Data Type: {x.dataType}, Desc: {x.description}" for x in columns]))
+Name: source_year, Data Type: INT64, Desc: Four-digit year of the birth. Example: 1975.
+Name: year, Data Type: INT64, Desc: Four-digit year of the birth. Example: 1975.
+Name: month, Data Type: INT64, Desc: Month index of the date of birth, where 1=January.
+Name: day, Data Type: INT64, Desc: Day of birth, starting from 1.
+Name: wday, Data Type: INT64, Desc: Day of the week, where 1 is Sunday and 7 is Saturday.
+Name: state, Data Type: STRING, Desc: The two character postal code for the state. Entries after 2004 do not include this value.
+```
+
+Therefore not only does SQLFrame make your DataFrame pipeline more accessible, it also turns the PySpark DataFrame API into a universal DataFrame API that any data practitioner can enjoy!
+
+<div align="center">
+  <img src="images/you_get_pyspark_api.gif" alt="There is more" width="800"/>
+</div>
+
+SQLFrame currently supports BigQuery, DuckDB, and PostgreSQL, with Redshift, Snowflake, Spark, and Trino in development.
+For those interested in experimenting with SQL generation for other engines, the "StandaloneSession" provides a flexible testing ground.
+
+Follow the simple setup guide to begin integrating SQLFrame into your projects today!
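The blog post notes that the same pipelines execute directly on engines such as DuckDB. A rough local sketch, patterned after the BigQuery example above and assuming that `DuckDBSession()` defaults to an in-memory DuckDB connection and that `createDataFrame` behaves like its PySpark counterpart (not verified against docs/duckdb.md):

```python
# Hypothetical local run; the in-memory default and createDataFrame behavior
# are assumptions based on the engine layout in the file listing above.
from sqlframe.duckdb import DuckDBSession
from sqlframe.duckdb import functions as F

session = DuckDBSession()  # assumed to connect to an in-memory DuckDB database

df = (
    session.createDataFrame(
        [("a", 1), ("a", 2), ("b", 3)],
        schema=["grp", "val"],
    )
    .groupBy("grp")
    .agg(F.sum("val").alias("total"))
    .orderBy("grp")
)

print(df.sql())  # inspect the DuckDB SQL that will be run
df.show()        # execute it on DuckDB and print the result
```

The same DataFrame code can target another engine by swapping the session import (plus whatever connection setup that engine's documentation requires), which is the "universal DataFrame API" idea the post describes.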