sqlframe 1.2.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlframe-1.2.0 → sqlframe-1.4.0}/Makefile +2 -2
- {sqlframe-1.2.0 → sqlframe-1.4.0}/PKG-INFO +14 -6
- {sqlframe-1.2.0 → sqlframe-1.4.0}/README.md +11 -5
- sqlframe-1.4.0/docs/configuration.md +242 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/mkdocs.yml +1 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/setup.py +8 -5
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/_version.py +2 -2
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/column.py +7 -3
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/dataframe.py +94 -7
- sqlframe-1.4.0/sqlframe/base/decorators.py +53 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/mixins/catalog_mixins.py +1 -1
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/mixins/readwriter_mixins.py +4 -3
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/readerwriter.py +3 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/session.py +6 -9
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/util.py +38 -1
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/catalog.py +3 -1
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/session.py +31 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/session.py +3 -1
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe.egg-info/PKG-INFO +14 -6
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe.egg-info/SOURCES.txt +2 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe.egg-info/requires.txt +8 -5
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/snowflake/test_snowflake_session.py +2 -2
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/test_int_functions.py +4 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_session.py +1 -1
- sqlframe-1.4.0/tests/unit/test_util.py +26 -0
- sqlframe-1.2.0/sqlframe/base/decorators.py +0 -51
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.github/CODEOWNERS +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.github/workflows/main.workflow.yaml +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.github/workflows/publish.workflow.yaml +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.gitignore +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.pre-commit-config.yaml +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/.readthedocs.yaml +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/LICENSE +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/blogs/images/but_wait_theres_more.gif +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/blogs/images/cake.gif +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/blogs/images/you_get_pyspark_api.gif +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/blogs/sqlframe_universal_dataframe_api.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/bigquery.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/bigquery.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/duckdb.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/images/SF.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/images/favicon.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/images/favicon_old.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/images/sqlframe_diagram.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/images/sqlframe_logo.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/docs/postgres.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/duckdb.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/images/SF.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/images/favicon.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/images/favicon_old.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/images/sqlframe_diagram.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/images/sqlframe_logo.png +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/index.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/postgres.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/requirements.txt +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/standalone.md +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/docs/stylesheets/extra.css +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/pytest.ini +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/renovate.json +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/setup.cfg +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/LICENSE +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/_typing.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/exceptions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/function_alternatives.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/mixins/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/mixins/dataframe_mixins.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/normalize.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/operations.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/transforms.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/functions.pyi +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/bigquery/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/functions.pyi +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/duckdb/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/functions.pyi +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/postgres/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/redshift/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/snowflake/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/spark/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/group.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/readwriter.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/standalone/window.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe.egg-info/dependency_links.txt +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe.egg-info/top_level.txt +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/common_fixtures.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/conftest.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/fixtures/employee.csv +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/fixtures/employee.json +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/fixtures/employee.parquet +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/fixtures/employee_extra_line.csv +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/bigquery/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/bigquery/test_bigquery_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/bigquery/test_bigquery_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/duck/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/duck/test_duckdb_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/duck/test_duckdb_dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/duck/test_duckdb_reader.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/duck/test_duckdb_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/postgres/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/postgres/test_postgres_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/postgres/test_postgres_dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/postgres/test_postgres_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/redshift/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/redshift/test_redshift_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/redshift/test_redshift_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/snowflake/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/snowflake/test_snowflake_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/spark/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/spark/test_spark_catalog.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/test_engine_dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/test_engine_reader.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/test_engine_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/engines/test_engine_writer.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/fixtures.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/test_int_dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/test_int_dataframe_stats.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/test_int_grouped_data.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/integration/test_int_session.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/__init__.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/fixtures.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_column.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_dataframe.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_dataframe_writer.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_functions.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_session_case_sensitivity.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_types.py +0 -0
- {sqlframe-1.2.0 → sqlframe-1.4.0}/tests/unit/standalone/test_window.py +0 -0
{sqlframe-1.2.0 → sqlframe-1.4.0}/Makefile

@@ -1,5 +1,5 @@
 install-dev:
-	pip install -e ".[dev,docs,duckdb,postgres,redshift,
+	pip install -e ".[bigquery,dev,docs,duckdb,pandas,postgres,redshift,snowflake,spark]"
 
 install-pre-commit:
 	pre-commit install
@@ -8,7 +8,7 @@ slow-test:
 	pytest -n auto tests
 
 fast-test:
-	pytest -n auto
+	pytest -n auto tests/unit
 
 local-test:
 	pytest -n auto -m "fast or local"
{sqlframe-1.2.0 → sqlframe-1.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sqlframe
-Version: 1.2.0
+Version: 1.4.0
 Summary: Taking the Spark out of PySpark by converting to SQL
 Home-page: https://github.com/eakmanrq/sqlframe
 Author: Ryan Eakman
@@ -20,6 +20,8 @@ Provides-Extra: bigquery
 Provides-Extra: dev
 Provides-Extra: docs
 Provides-Extra: duckdb
+Provides-Extra: openai
+Provides-Extra: pandas
 Provides-Extra: postgres
 Provides-Extra: redshift
 Provides-Extra: snowflake
@@ -27,19 +29,19 @@ Provides-Extra: spark
 License-File: LICENSE
 
 <div align="center">
-  <img src="https://sqlframe.readthedocs.io/en/
+  <img src="https://sqlframe.readthedocs.io/en/stable/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
 </div>
 
 SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
 
 SQLFrame currently supports the following engines (many more in development):
 
-* [BigQuery](https://sqlframe.readthedocs.io/en/
-* [DuckDB](https://sqlframe.readthedocs.io/en/
-* [Postgres](https://sqlframe.readthedocs.io/en/
+* [BigQuery](https://sqlframe.readthedocs.io/en/stable/bigquery/)
+* [DuckDB](https://sqlframe.readthedocs.io/en/stable/duckdb)
+* [Postgres](https://sqlframe.readthedocs.io/en/stable/postgres)
 
 SQLFrame also has a "Standalone" session that be used to generate SQL without any connection to a database engine.
-* [Standalone](https://sqlframe.readthedocs.io/en/
+* [Standalone](https://sqlframe.readthedocs.io/en/stable/standalone)
 
 SQLFrame is great for:
 
@@ -62,6 +64,12 @@ pip install sqlframe
 
 See specific engine documentation for additional setup instructions.
 
+## Configuration
+
+SQLFrame generates consistently accurate yet complex SQL for engine execution.
+However, when using df.sql(), it produces more human-readable SQL.
+For details on how to configure this output and leverage OpenAI to enhance the SQL, see [Generated SQL Configuration](https://sqlframe.readthedocs.io/en/stable/configuration/#generated-sql).
+
 ## Example Usage
 
 ```python
{sqlframe-1.2.0 → sqlframe-1.4.0}/README.md

@@ -1,17 +1,17 @@
 <div align="center">
-  <img src="https://sqlframe.readthedocs.io/en/
+  <img src="https://sqlframe.readthedocs.io/en/stable/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
 </div>
 
 SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
 
 SQLFrame currently supports the following engines (many more in development):
 
-* [BigQuery](https://sqlframe.readthedocs.io/en/
-* [DuckDB](https://sqlframe.readthedocs.io/en/
-* [Postgres](https://sqlframe.readthedocs.io/en/
+* [BigQuery](https://sqlframe.readthedocs.io/en/stable/bigquery/)
+* [DuckDB](https://sqlframe.readthedocs.io/en/stable/duckdb)
+* [Postgres](https://sqlframe.readthedocs.io/en/stable/postgres)
 
 SQLFrame also has a "Standalone" session that be used to generate SQL without any connection to a database engine.
-* [Standalone](https://sqlframe.readthedocs.io/en/
+* [Standalone](https://sqlframe.readthedocs.io/en/stable/standalone)
 
 SQLFrame is great for:
 
@@ -34,6 +34,12 @@ pip install sqlframe
 
 See specific engine documentation for additional setup instructions.
 
+## Configuration
+
+SQLFrame generates consistently accurate yet complex SQL for engine execution.
+However, when using df.sql(), it produces more human-readable SQL.
+For details on how to configure this output and leverage OpenAI to enhance the SQL, see [Generated SQL Configuration](https://sqlframe.readthedocs.io/en/stable/configuration/#generated-sql).
+
 ## Example Usage
 
 ```python
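Note: the Configuration section added above refers to `df.sql()`; for orientation, a minimal sketch of that call (mirroring the examples in the new `docs/configuration.md` shown next):

```python
from sqlframe.standalone import StandaloneSession

session = StandaloneSession()
df = session.createDataFrame([{"a": 1, "b": 2}])

# The SQL sent to the engine stays exact but verbose; df.sql() is the
# human-readable rendering, pretty-printed and optimized by default.
print(df.sql())
print(df.sql(pretty=False))  # same query on a single line
```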
sqlframe-1.4.0/docs/configuration.md (new file)

@@ -0,0 +1,242 @@
+# General Configuration
+
+## Generated SQL
+
+### Pretty
+
+If the SQL should be returned in a "pretty" format meaning it has newlines and indentation. Defaults to `True`.
+
+```python
+from sqlframe.standalone import StandaloneSession
+
+session = StandaloneSession()
+
+df = session.createDataFrame([{'a': 1, 'b': 2}])
+```
+```python
+>>> print(df.sql())
+SELECT
+  CAST(`a1`.`a` AS BIGINT) AS `a`,
+  CAST(`a1`.`b` AS BIGINT) AS `b`
+FROM VALUES
+  (1, 2) AS `a1`(`a`, `b`)
+```
+```python
+>>> print(df.sql(pretty=False))
+SELECT CAST(`a3`.`a` AS BIGINT) AS `a`, CAST(`a3`.`b` AS BIGINT) AS `b` FROM VALUES (1, 2) AS `a3`(`a`, `b`)
+```
+
+### Optimized
+
+Optimized SQL is SQL that has been processed by SQLGlot's optimizer. For complex queries this will significantly reduce the number of CTEs produced and remove extra unused columns. Defaults to `True`.
+
+```python
+from sqlframe.bigquery import BigQuerySession
+from sqlframe.bigquery import functions as F
+from sqlframe.bigquery import Window
+
+session = BigQuerySession()
+table_path = "bigquery-public-data.samples.natality"
+# Top 5 years with the greatest year-over-year % change in new families with single child
+df = (
+    session.table(table_path)
+    .where(F.col("ever_born") == 1)
+    .groupBy("year")
+    .agg(F.count("*").alias("num_single_child_families"))
+    .withColumn(
+        "last_year_num_single_child_families",
+        F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+    )
+    .withColumn(
+        "percent_change",
+        (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+        / F.col("last_year_num_single_child_families")
+    )
+    .orderBy(F.abs(F.col("percent_change")).desc())
+    .select(
+        F.col("year").alias("year"),
+        F.format_number("num_single_child_families", 0).alias("new families single child"),
+        F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+    )
+    .limit(5)
+)
+```
+```python
+>>> print(df.sql(optimize=True))
+WITH `t94228042` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `t30206548` AS (
+  SELECT
+    `t94228042`.`year` AS `year`,
+    `t94228042`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`t94228042`.`num_single_child_families`, 1) OVER (ORDER BY `t94228042`.`year`) AS `last_year_num_single_child_families`
+  FROM `t94228042` AS `t94228042`
+)
+SELECT
+  `t30206548`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`t30206548`.`num_single_child_families` AS FLOAT64), 0)) AS `new families single child`,
+  FORMAT(
+    '%\'.2f',
+    ROUND(
+      CAST((
+        (
+          (
+            `t30206548`.`num_single_child_families` - `t30206548`.`last_year_num_single_child_families`
+          ) / `t30206548`.`last_year_num_single_child_families`
+        ) * 100
+      ) AS FLOAT64),
+      2
+    )
+  ) AS `percent change`
+FROM `t30206548` AS `t30206548`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+```python
+>>> print(df.sql(optimize=False))
+WITH t14183493 AS (
+  SELECT
+    `source_year`,
+    `year`,
+    `month`,
+    `day`,
+    `wday`,
+    `state`,
+    `is_male`,
+    `child_race`,
+    `weight_pounds`,
+    `plurality`,
+    `apgar_1min`,
+    `apgar_5min`,
+    `mother_residence_state`,
+    `mother_race`,
+    `mother_age`,
+    `gestation_weeks`,
+    `lmp`,
+    `mother_married`,
+    `mother_birth_state`,
+    `cigarette_use`,
+    `cigarettes_per_day`,
+    `alcohol_use`,
+    `drinks_per_week`,
+    `weight_gain_pounds`,
+    `born_alive_alive`,
+    `born_alive_dead`,
+    `born_dead`,
+    `ever_born`,
+    `father_race`,
+    `father_age`,
+    `record_weight`
+  FROM bigquery-public-data.samples.natality
+), t17633417 AS (
+  SELECT
+    year,
+    COUNT(*) AS num_single_child_families
+  FROM t14183493
+  WHERE
+    ever_born = 1
+  GROUP BY
+    year
+), t32066970 AS (
+  SELECT
+    year,
+    num_single_child_families,
+    LAG(num_single_child_families, 1) OVER (ORDER BY year) AS last_year_num_single_child_families
+  FROM t17633417
+), t21362690 AS (
+  SELECT
+    year,
+    num_single_child_families,
+    last_year_num_single_child_families,
+    (
+      (
+        num_single_child_families - last_year_num_single_child_families
+      ) / last_year_num_single_child_families
+    ) AS percent_change
+  FROM t32066970
+  ORDER BY
+    ABS(percent_change) DESC
+)
+SELECT
+  year AS year,
+  FORMAT('%\'.0f', ROUND(CAST(num_single_child_families AS FLOAT64), 0)) AS `new families single child`,
+  FORMAT('%\'.2f', ROUND(CAST((
+    percent_change * 100
+  ) AS FLOAT64), 2)) AS `percent change`
+FROM t21362690
+LIMIT 5
+```
+
+### Override Dialect
+
+The dialect of the generated SQL will be based on the session's dialect. However, you can override the dialect by passing a string to the `dialect` parameter. This is useful when you want to generate SQL for a different database.
+
+```python
+# create session and `df` like normal
+df.sql(dialect="bigquery")
+```
+
+### OpenAI Enrichment
+
+OpenAI's models can be used to enrich the generated SQL to make it more human-like.
+You can have it just provide more readable CTE names or you can have it try to make the whole SQL statement more readable.
+
+#### Example
+
+```python
+# create session and `df` like normal
+# The model to use defaults to `gpt-4o` but can be changed by passing a string to the `openai_model` parameter.
+>>> df.sql(openai_config={"mode": "cte_only", "model": "gpt-3.5-turbo"})
+WITH `single_child_families_by_year` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `families_with_percent_change` AS (
+  SELECT
+    `single_child_families_by_year`.`year` AS `year`,
+    `single_child_families_by_year`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`single_child_families_by_year`.`num_single_child_families`, 1) OVER (ORDER BY `single_child_families_by_year`.`year`) AS `last_year_num_single_child_families`
+  FROM `single_child_families_by_year` AS `single_child_families_by_year`
+)
+SELECT
+  `families_with_percent_change`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`families_with_percent_change`.`num_single_child_families` AS FLOAT64), 0)) AS `new families single child`,
+  FORMAT(
+    '%\'.2f',
+    ROUND(
+      CAST((
+        (
+          (
+            `families_with_percent_change`.`num_single_child_families` - `families_with_percent_change`.`last_year_num_single_child_families`
+          ) / `families_with_percent_change`.`last_year_num_single_child_families`
+        ) * 100
+      ) AS FLOAT64),
+      2
+    )
+  ) AS `percent change`
+FROM `families_with_percent_change` AS `families_with_percent_change`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+
+#### Parameters
+
+| Parameter         | Description                                                            | Default    |
+|-------------------|------------------------------------------------------------------------|------------|
+| `mode`            | The mode to use. Can be `cte_only` or `full`.                          | `cte_only` |
+| `model`           | The OpenAI model to use. Note: The default may change in new releases. | `gpt-4o`   |
+| `prompt_override` | A string to use to override the default prompt.                        | None       |
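Note: besides the dict form used in the example above, `openai_config` also accepts the `OpenAIConfig` dataclass this release adds to `sqlframe/base/dataframe.py` (see its diff below); a sketch:

```python
from sqlframe.base.dataframe import OpenAIConfig, OpenAIMode

# Equivalent to openai_config={"mode": "full"}; dicts are coerced through
# OpenAIConfig.from_dict() inside df.sql().
config = OpenAIConfig(mode=OpenAIMode.FULL, model="gpt-4o")

# prompt_override replaces the built-in system prompt entirely.
custom = OpenAIConfig(prompt_override="Rename the CTE aliases to be human readable.")

# df.sql(openai_config=config) requires the `openai` extra and an
# OPENAI_API_KEY in the environment (read by the OpenAI client).
```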
{sqlframe-1.2.0 → sqlframe-1.4.0}/setup.py

@@ -26,11 +26,11 @@ setup(
         "bigquery": [
             "google-cloud-bigquery[pandas]>=3,<4",
             "google-cloud-bigquery-storage>=2,<3",
-            "pandas>=2,<3",
         ],
         "dev": [
             "duckdb>=0.9,<0.11",
             "mypy>=1.10.0,<1.11",
+            "openai>=1.30,<1.31",
             "pandas>=2,<3",
             "pandas-stubs>=2,<3",
             "psycopg>=3.1,<4",
@@ -56,17 +56,20 @@ setup(
             "duckdb>=0.9,<0.11",
             "pandas>=2,<3",
         ],
-        "
+        "openai": [
+            "openai>=1.30,<1.31",
+        ],
+        "pandas": [
             "pandas>=2,<3",
+        ],
+        "postgres": [
             "psycopg2>=2.8,<3",
         ],
         "redshift": [
-            "pandas>=2,<3",
             "redshift_connector>=2.1.1,<2.2.0",
         ],
         "snowflake": [
-            "
-            "snowflake-connector-python[pandas,secure-local-storage]>=3.10.0,<3.11",
+            "snowflake-connector-python[secure-local-storage]>=3.10.0,<3.11",
         ],
         "spark": [
             "pyspark>=2,<3.6",
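Note: the new `openai` extra pairs with the `verify_openai_installed` helper that `df.sql()` imports from `sqlframe/base/util.py` (that hunk is not expanded in this diff). A plausible sketch of such a guard, as an assumption rather than the shipped code:

```python
def verify_openai_installed() -> None:
    """Fail fast when the optional OpenAI dependency is missing (hypothetical body)."""
    try:
        import openai  # noqa: F401  -- provided by: pip install "sqlframe[openai]"
    except ImportError as e:
        raise ImportError(
            'OpenAI SQL enrichment requires the "openai" extra: pip install "sqlframe[openai]"'
        ) from e
```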
{sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/column.py

@@ -9,9 +9,11 @@ import typing as t
 import sqlglot
 from sqlglot import expressions as exp
 from sqlglot.helper import flatten, is_iterable
+from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
 
+from sqlframe.base.decorators import normalize
 from sqlframe.base.types import DataType
-from sqlframe.base.util import get_func_from_session
+from sqlframe.base.util import get_func_from_session, quote_preserving_alias_or_name
 
 if t.TYPE_CHECKING:
     from sqlframe.base._typing import ColumnOrLiteral, ColumnOrName
@@ -237,7 +239,7 @@ class Column:
 
     @property
     def alias_or_name(self) -> str:
-        return self.expression
+        return quote_preserving_alias_or_name(self.expression)  # type: ignore
 
     @classmethod
     def ensure_literal(cls, value) -> Column:
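Note: the `util.py` hunk defining `quote_preserving_alias_or_name` is not expanded here, so the following is an illustration of the intent rather than the shipped helper: sqlglot's stock `alias_or_name` returns a bare string and drops the identifier's quote flag, which loses case-sensitivity information.

```python
from sqlglot import parse_one

projection = parse_one('SELECT "MixedCase" AS "MyAlias" FROM t').expressions[0]
print(projection.alias_or_name)        # MyAlias  -- quote flag discarded
print(projection.args["alias"].sql())  # "MyAlias" -- quoting (case sensitivity) kept
```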
@@ -266,7 +268,9 @@ class Column:
         from sqlframe.base.session import _BaseSession
 
         dialect = _BaseSession().input_dialect
-        alias: exp.Expression =
+        alias: exp.Expression = normalize_identifiers(
+            exp.parse_identifier(name, dialect=dialect), dialect=dialect
+        )
         new_expression = exp.Alias(
             this=self.column_expression,
             alias=alias.this if isinstance(alias, exp.Column) else alias,
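Note: the same normalization step in isolation (a sketch that mirrors the replacement lines above):

```python
from sqlglot import exp
from sqlglot.optimizer.normalize_identifiers import normalize_identifiers

# Unquoted names fold to the dialect's default case; quoted names keep theirs.
alias = normalize_identifiers(
    exp.parse_identifier("MyAlias", dialect="snowflake"), dialect="snowflake"
)
print(alias.sql(dialect="snowflake"))  # MYALIAS -- Snowflake upcases unquoted identifiers
```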
{sqlframe-1.2.0 → sqlframe-1.4.0}/sqlframe/base/dataframe.py

@@ -2,26 +2,34 @@
 
 from __future__ import annotations
 
+import enum
 import functools
 import itertools
+import json
 import logging
 import sys
 import typing as t
 import zlib
 from copy import copy
+from dataclasses import dataclass
 
 import sqlglot
 from prettytable import PrettyTable
 from sqlglot import Dialect
 from sqlglot import expressions as exp
 from sqlglot.helper import ensure_list, object_to_dict, seq_get
+from sqlglot.optimizer.pushdown_projections import pushdown_projections
+from sqlglot.optimizer.qualify import qualify
 from sqlglot.optimizer.qualify_columns import quote_identifiers
 
+from sqlframe.base.decorators import normalize
 from sqlframe.base.operations import Operation, operation
 from sqlframe.base.transforms import replace_id_value
 from sqlframe.base.util import (
     get_func_from_session,
     get_tables_from_expression_with_join,
+    quote_preserving_alias_or_name,
+    verify_openai_installed,
 )
 
 if sys.version_info >= (3, 11):
@@ -70,6 +78,46 @@ JOIN_HINTS = {
 DF = t.TypeVar("DF", bound="_BaseDataFrame")
 
 
+class OpenAIMode(enum.Enum):
+    CTE_ONLY = "cte_only"
+    FULL = "full"
+
+    @property
+    def is_cte_only(self) -> bool:
+        return self == OpenAIMode.CTE_ONLY
+
+    @property
+    def is_full(self) -> bool:
+        return self == OpenAIMode.FULL
+
+
+@dataclass
+class OpenAIConfig:
+    mode: OpenAIMode = OpenAIMode.CTE_ONLY
+    model: str = "gpt-4o"
+    prompt_override: t.Optional[str] = None
+
+    @classmethod
+    def from_dict(cls, config: t.Dict[str, t.Any]) -> OpenAIConfig:
+        if "mode" in config:
+            config["mode"] = OpenAIMode(config["mode"].lower())
+        return cls(**config)
+
+    def get_prompt(self, dialect: Dialect) -> str:
+        if self.prompt_override:
+            return self.prompt_override
+        if self.mode.is_cte_only:
+            return f"You are a backend tool that creates unique CTE alias names match what a human would write and in snake case. You respond without code blocks and only a json payload with the key being the CTE name that is being replaced and the value being the new CTE human readable name."
+        return f"""
+You are a backend tool that converts correct {dialect} SQL to simplified and more human readable version.
+You respond without code block with rewritten {dialect} SQL.
+You don't change any column names in the final select because the user expects those to remain the same.
+You make unique CTE alias names match what a human would write and in snake case.
+You improve formatting with spacing and line-breaks.
+You remove redundant parenthesis and aliases.
+When remove extra quotes, make sure to keep quotes around words that could be reserved words"""
+
+
 class _BaseDataFrameNaFunctions(t.Generic[DF]):
     def __init__(self, df: DF):
         self.df = df
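Note: a short sketch of how the new config class behaves (derived from the class body above; passing a plain string where `Dialect` is annotated works at runtime because it is only interpolated into the prompt):

```python
from sqlframe.base.dataframe import OpenAIConfig, OpenAIMode

cte_only = OpenAIConfig()  # defaults: mode=CTE_ONLY, model="gpt-4o"
assert cte_only.mode.is_cte_only  # prompt asks for a JSON {old_cte: new_name} map

full = OpenAIConfig.from_dict({"mode": "FULL"})  # from_dict lower-cases the mode value
assert full.mode is OpenAIMode.FULL
print(full.get_prompt("bigquery"))  # dialect-aware full-rewrite system prompt
```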
@@ -410,7 +458,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
 
         outer_select = item.find(exp.Select)
         if outer_select:
-            return [col(x
+            return [col(quote_preserving_alias_or_name(x)) for x in outer_select.expressions]
         return []
 
     def _create_hash_from_expression(self, expression: exp.Expression) -> str:
@@ -471,6 +519,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         dialect: DialectType = None,
         optimize: bool = True,
         pretty: bool = True,
+        openai_config: t.Optional[t.Union[t.Dict[str, t.Any], OpenAIConfig]] = None,
         as_list: bool = False,
         **kwargs,
     ) -> t.Union[str, t.List[str]]:
@@ -480,6 +529,11 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         select_expressions = df._get_select_expressions()
         output_expressions: t.List[t.Union[exp.Select, exp.Cache, exp.Drop]] = []
         replacement_mapping: t.Dict[exp.Identifier, exp.Identifier] = {}
+        openai_config = (
+            OpenAIConfig.from_dict(openai_config)
+            if openai_config is not None and isinstance(openai_config, dict)
+            else openai_config
+        )
 
         for expression_type, select_expression in select_expressions:
             select_expression = select_expression.transform(
@@ -490,6 +544,9 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
                 select_expression = t.cast(
                     exp.Select, self.session._optimize(select_expression, dialect=dialect)
                 )
+            elif openai_config:
+                qualify(select_expression, dialect=dialect, schema=self.session.catalog._schema)
+                pushdown_projections(select_expression, schema=self.session.catalog._schema)
 
             select_expression = df._replace_cte_names_with_hashes(select_expression)
 
@@ -505,7 +562,9 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
                 self.session.catalog.add_table(
                     cache_table_name,
                     {
-                        expression
+                        quote_preserving_alias_or_name(expression): expression.type.sql(
+                            dialect=dialect
+                        )
                         if expression.type
                         else "UNKNOWN"
                         for expression in select_expression.expressions
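Note: what the rewritten dict comprehension records, shown standalone with sqlglot's type annotation pass (which is what populates `expression.type`):

```python
from sqlglot import parse_one
from sqlglot.optimizer.annotate_types import annotate_types

select = annotate_types(parse_one("SELECT CAST(1 AS BIGINT) AS a"))
projection = select.expressions[0]
# Cache-table schema entry: quote-preserving name -> dialect-rendered type.
print(projection.alias_or_name, projection.type.sql(dialect="duckdb"))  # a BIGINT
```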
@@ -541,10 +600,37 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
 
                 output_expressions.append(expression)
 
-        results = [
-
-
-
+        results = []
+        for expression in output_expressions:
+            sql = expression.sql(dialect=dialect, pretty=pretty, **kwargs)
+            if openai_config:
+                assert isinstance(openai_config, OpenAIConfig)
+                verify_openai_installed()
+                from openai import OpenAI
+
+                client = OpenAI()
+                chat_completed = client.chat.completions.create(
+                    messages=[
+                        {  # type: ignore
+                            "role": "system",
+                            "content": openai_config.get_prompt(dialect),
+                        },
+                        {
+                            "role": "user",
+                            "content": sql,
+                        },
+                    ],
+                    model=openai_config.model,
+                )
+                assert chat_completed.choices[0].message.content is not None
+                if openai_config.mode.is_cte_only:
+                    cte_replacement_mapping = json.loads(chat_completed.choices[0].message.content)
+                    for old_name, new_name in cte_replacement_mapping.items():
+                        sql = sql.replace(old_name, new_name)
+                else:
+                    sql = chat_completed.choices[0].message.content
+            results.append(sql)
+
         if as_list:
             return results
         return ";\n".join(results)
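Note: the `cte_only` branch above expects the model to answer with a JSON object mapping hashed CTE names to readable ones, applied via plain string replacement; standalone, with the CTE names from the configuration.md example:

```python
import json

reply = '{"t94228042": "single_child_families_by_year", "t30206548": "families_with_percent_change"}'
sql = "WITH `t94228042` AS (...), `t30206548` AS (...) SELECT * FROM `t30206548`"
for old_name, new_name in json.loads(reply).items():
    sql = sql.replace(old_name, new_name)
print(sql)  # hashed CTE aliases replaced by the readable names
```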
@@ -688,7 +774,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         join_expression = self._add_ctes_to_expression(join_expression, other_df.expression.ctes)
         self_columns = self._get_outer_select_columns(join_expression)
         other_columns = self._get_outer_select_columns(other_df.expression)
-        join_columns = self.
+        join_columns = self._ensure_and_normalize_cols(on)
         # Determines the join clause and select columns to be used passed on what type of columns were provided for
         # the join. The columns returned changes based on how the on expression is provided.
         if how != "cross":
@@ -1324,6 +1410,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
         assert sqls[-1] is not None
         return self.session._fetchdf(sqls[-1])
 
+    @normalize("name")
     def createOrReplaceTempView(self, name: str) -> None:
         self.session.temp_views[name] = self.copy()._convert_leaf_to_cte()
 
sqlframe-1.4.0/sqlframe/base/decorators.py (new file)

@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import functools
+import typing as t
+
+from sqlglot import parse_one
+from sqlglot.helper import ensure_list
+from sqlglot.optimizer.normalize_identifiers import normalize_identifiers
+
+if t.TYPE_CHECKING:
+    from sqlframe.base.catalog import _BaseCatalog
+
+CALLING_CLASS = t.TypeVar("CALLING_CLASS")
+
+
+def normalize(normalize_kwargs: t.Union[str, t.List[str]]) -> t.Callable[[t.Callable], t.Callable]:
+    """
+    Decorator used to normalize identifiers in the kwargs of a method.
+    """
+
+    def decorator(func: t.Callable) -> t.Callable:
+        @functools.wraps(func)
+        def wrapper(self: CALLING_CLASS, *args, **kwargs) -> CALLING_CLASS:
+            from sqlframe.base.session import _BaseSession
+
+            input_dialect = _BaseSession().input_dialect
+            kwargs.update(dict(zip(func.__code__.co_varnames[1:], args)))
+            for kwarg in ensure_list(normalize_kwargs):
+                if kwarg in kwargs:
+                    value = kwargs.get(kwarg)
+                    if value:
+                        expression = (
+                            parse_one(value, dialect=input_dialect)
+                            if isinstance(value, str)
+                            else value
+                        )
+                        kwargs[kwarg] = normalize_identifiers(expression, input_dialect).sql(
+                            dialect=input_dialect
+                        )
+            return func(self, **kwargs)
+
+        wrapper.__wrapped__ = func  # type: ignore
+        return wrapper
+
+    return decorator
+
+
+def func_metadata(unsupported_engines: t.Optional[t.Union[str, t.List[str]]] = None) -> t.Callable:
+    def _metadata(func: t.Callable) -> t.Callable:
+        func.unsupported_engines = ensure_list(unsupported_engines) if unsupported_engines else []  # type: ignore
+        return func
+
+    return _metadata
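Note: a usage sketch for the new `normalize` decorator (the class below is a hypothetical stand-in; real call sites include `createOrReplaceTempView` above, and the wrapper needs an active sqlframe session to resolve the input dialect):

```python
from sqlframe.base.decorators import normalize

class ExampleCatalog:  # hypothetical stand-in class
    @normalize("dbName")
    def setCurrentDatabase(self, dbName: str) -> None:
        # By the time the body runs, dbName has been parsed and normalized to
        # the session's input dialect (e.g. "MyDB" -> "mydb" for dialects that
        # lower-case unquoted identifiers).
        self.current_database = dbName
```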