sqlframe-1.1.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sqlframe/__init__.py +0 -0
  2. sqlframe/_version.py +16 -0
  3. sqlframe/base/__init__.py +0 -0
  4. sqlframe/base/_typing.py +39 -0
  5. sqlframe/base/catalog.py +1163 -0
  6. sqlframe/base/column.py +388 -0
  7. sqlframe/base/dataframe.py +1519 -0
  8. sqlframe/base/decorators.py +51 -0
  9. sqlframe/base/exceptions.py +14 -0
  10. sqlframe/base/function_alternatives.py +1055 -0
  11. sqlframe/base/functions.py +1678 -0
  12. sqlframe/base/group.py +102 -0
  13. sqlframe/base/mixins/__init__.py +0 -0
  14. sqlframe/base/mixins/catalog_mixins.py +419 -0
  15. sqlframe/base/mixins/readwriter_mixins.py +118 -0
  16. sqlframe/base/normalize.py +84 -0
  17. sqlframe/base/operations.py +87 -0
  18. sqlframe/base/readerwriter.py +679 -0
  19. sqlframe/base/session.py +585 -0
  20. sqlframe/base/transforms.py +13 -0
  21. sqlframe/base/types.py +418 -0
  22. sqlframe/base/util.py +242 -0
  23. sqlframe/base/window.py +139 -0
  24. sqlframe/bigquery/__init__.py +23 -0
  25. sqlframe/bigquery/catalog.py +255 -0
  26. sqlframe/bigquery/column.py +1 -0
  27. sqlframe/bigquery/dataframe.py +54 -0
  28. sqlframe/bigquery/functions.py +378 -0
  29. sqlframe/bigquery/group.py +14 -0
  30. sqlframe/bigquery/readwriter.py +29 -0
  31. sqlframe/bigquery/session.py +89 -0
  32. sqlframe/bigquery/types.py +1 -0
  33. sqlframe/bigquery/window.py +1 -0
  34. sqlframe/duckdb/__init__.py +20 -0
  35. sqlframe/duckdb/catalog.py +108 -0
  36. sqlframe/duckdb/column.py +1 -0
  37. sqlframe/duckdb/dataframe.py +55 -0
  38. sqlframe/duckdb/functions.py +47 -0
  39. sqlframe/duckdb/group.py +14 -0
  40. sqlframe/duckdb/readwriter.py +111 -0
  41. sqlframe/duckdb/session.py +65 -0
  42. sqlframe/duckdb/types.py +1 -0
  43. sqlframe/duckdb/window.py +1 -0
  44. sqlframe/postgres/__init__.py +23 -0
  45. sqlframe/postgres/catalog.py +106 -0
  46. sqlframe/postgres/column.py +1 -0
  47. sqlframe/postgres/dataframe.py +54 -0
  48. sqlframe/postgres/functions.py +61 -0
  49. sqlframe/postgres/group.py +14 -0
  50. sqlframe/postgres/readwriter.py +29 -0
  51. sqlframe/postgres/session.py +68 -0
  52. sqlframe/postgres/types.py +1 -0
  53. sqlframe/postgres/window.py +1 -0
  54. sqlframe/redshift/__init__.py +23 -0
  55. sqlframe/redshift/catalog.py +127 -0
  56. sqlframe/redshift/column.py +1 -0
  57. sqlframe/redshift/dataframe.py +54 -0
  58. sqlframe/redshift/functions.py +18 -0
  59. sqlframe/redshift/group.py +14 -0
  60. sqlframe/redshift/readwriter.py +29 -0
  61. sqlframe/redshift/session.py +53 -0
  62. sqlframe/redshift/types.py +1 -0
  63. sqlframe/redshift/window.py +1 -0
  64. sqlframe/snowflake/__init__.py +26 -0
  65. sqlframe/snowflake/catalog.py +134 -0
  66. sqlframe/snowflake/column.py +1 -0
  67. sqlframe/snowflake/dataframe.py +54 -0
  68. sqlframe/snowflake/functions.py +18 -0
  69. sqlframe/snowflake/group.py +14 -0
  70. sqlframe/snowflake/readwriter.py +29 -0
  71. sqlframe/snowflake/session.py +53 -0
  72. sqlframe/snowflake/types.py +1 -0
  73. sqlframe/snowflake/window.py +1 -0
  74. sqlframe/spark/__init__.py +23 -0
  75. sqlframe/spark/catalog.py +1028 -0
  76. sqlframe/spark/column.py +1 -0
  77. sqlframe/spark/dataframe.py +54 -0
  78. sqlframe/spark/functions.py +22 -0
  79. sqlframe/spark/group.py +14 -0
  80. sqlframe/spark/readwriter.py +29 -0
  81. sqlframe/spark/session.py +90 -0
  82. sqlframe/spark/types.py +1 -0
  83. sqlframe/spark/window.py +1 -0
  84. sqlframe/standalone/__init__.py +26 -0
  85. sqlframe/standalone/catalog.py +13 -0
  86. sqlframe/standalone/column.py +1 -0
  87. sqlframe/standalone/dataframe.py +36 -0
  88. sqlframe/standalone/functions.py +1 -0
  89. sqlframe/standalone/group.py +14 -0
  90. sqlframe/standalone/readwriter.py +19 -0
  91. sqlframe/standalone/session.py +40 -0
  92. sqlframe/standalone/types.py +1 -0
  93. sqlframe/standalone/window.py +1 -0
  94. sqlframe-1.1.3.dist-info/LICENSE +21 -0
  95. sqlframe-1.1.3.dist-info/METADATA +172 -0
  96. sqlframe-1.1.3.dist-info/RECORD +98 -0
  97. sqlframe-1.1.3.dist-info/WHEEL +5 -0
  98. sqlframe-1.1.3.dist-info/top_level.txt +1 -0
sqlframe/spark/column.py
@@ -0,0 +1 @@
+ from sqlframe.base.column import Column
sqlframe/spark/dataframe.py
@@ -0,0 +1,54 @@
+ from __future__ import annotations
+
+ import logging
+ import sys
+ import typing as t
+
+ from sqlframe.base.dataframe import (
+     _BaseDataFrame,
+     _BaseDataFrameNaFunctions,
+     _BaseDataFrameStatFunctions,
+ )
+ from sqlframe.spark.group import SparkGroupedData
+
+ if sys.version_info >= (3, 11):
+     from typing import Self
+ else:
+     from typing_extensions import Self
+
+ if t.TYPE_CHECKING:
+     from sqlframe.spark.readwriter import SparkDataFrameWriter
+     from sqlframe.spark.session import SparkSession
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class SparkDataFrameNaFunctions(_BaseDataFrameNaFunctions["SparkDataFrame"]):
+     pass
+
+
+ class SparkDataFrameStatFunctions(_BaseDataFrameStatFunctions["SparkDataFrame"]):
+     pass
+
+
+ class SparkDataFrame(
+     _BaseDataFrame[
+         "SparkSession",
+         "SparkDataFrameWriter",
+         "SparkDataFrameNaFunctions",
+         "SparkDataFrameStatFunctions",
+         "SparkGroupedData",
+     ]
+ ):
+     _na = SparkDataFrameNaFunctions
+     _stat = SparkDataFrameStatFunctions
+     _group_data = SparkGroupedData
+
+     def cache(self) -> Self:
+         logger.warning("Spark does not support caching. Ignoring cache() call.")
+         return self
+
+     def persist(self) -> Self:
+         logger.warning("Spark does not support persist. Ignoring persist() call.")
+         return self
sqlframe/spark/functions.py
@@ -0,0 +1,22 @@
+ import inspect
+ import sys
+
+ import sqlframe.base.functions  # noqa
+
+ module = sys.modules["sqlframe.base.functions"]
+ globals().update(
+     {
+         name: func
+         for name, func in inspect.getmembers(module, inspect.isfunction)
+         if hasattr(func, "unsupported_engines")
+         and "spark" not in func.unsupported_engines
+         and "*" not in func.unsupported_engines
+     }
+ )
+
+
+ from sqlframe.base.function_alternatives import (  # noqa
+     percentile_without_disc as percentile,
+     add_months_by_multiplication as add_months,
+     arrays_overlap_renamed as arrays_overlap,
+ )
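The hunk above copies into the Spark module only those base functions that are usable on the target engine, by filtering on an `unsupported_engines` attribute attached to each function. The snippet below is an illustrative sketch of that tagging-and-filtering pattern under stated assumptions: the real decorator lives in `sqlframe.base.decorators`, and the name `unsupported` and its signature here are assumptions, not the package's API.

```python
# Illustrative sketch only: a stand-in for the tagging/filtering pattern shown above.
import inspect
import sys
import typing as t


def unsupported(*engines: str) -> t.Callable:
    """Hypothetical helper that records which engines cannot run a function."""

    def tag(func: t.Callable) -> t.Callable:
        func.unsupported_engines = engines  # type: ignore[attr-defined]
        return func

    return tag


@unsupported("snowflake")
def lit(value: object) -> str:  # toy function, unsupported only on Snowflake
    return f"LIT({value!r})"


@unsupported("*")
def spark_only(value: object) -> str:  # toy function, unsupported everywhere else
    return f"SPARK_ONLY({value!r})"


# An engine module (pretending to be the BigQuery one) re-exports only what it supports.
module = sys.modules[__name__]
supported = {
    name: func
    for name, func in inspect.getmembers(module, inspect.isfunction)
    if hasattr(func, "unsupported_engines")
    and "bigquery" not in func.unsupported_engines
    and "*" not in func.unsupported_engines
}
print(sorted(supported))  # ['lit']
```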
sqlframe/spark/group.py
@@ -0,0 +1,14 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ import typing as t
+
+ from sqlframe.base.group import _BaseGroupedData
+
+ if t.TYPE_CHECKING:
+     from sqlframe.spark.dataframe import SparkDataFrame
+
+
+ class SparkGroupedData(_BaseGroupedData["SparkDataFrame"]):
+     pass
sqlframe/spark/readwriter.py
@@ -0,0 +1,29 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ import typing as t
+
+ from sqlframe.base.mixins.readwriter_mixins import PandasLoaderMixin, PandasWriterMixin
+ from sqlframe.base.readerwriter import (
+     _BaseDataFrameReader,
+     _BaseDataFrameWriter,
+ )
+
+ if t.TYPE_CHECKING:
+     from sqlframe.spark.dataframe import SparkDataFrame
+     from sqlframe.spark.session import SparkSession
+
+
+ class SparkDataFrameReader(
+     PandasLoaderMixin["SparkSession", "SparkDataFrame"],
+     _BaseDataFrameReader["SparkSession", "SparkDataFrame"],
+ ):
+     pass
+
+
+ class SparkDataFrameWriter(
+     PandasWriterMixin["SparkSession", "SparkDataFrame"],
+     _BaseDataFrameWriter["SparkSession", "SparkDataFrame"],
+ ):
+     pass
sqlframe/spark/session.py
@@ -0,0 +1,90 @@
+ from __future__ import annotations
+
+ import typing as t
+ import warnings
+
+ import pandas as pd
+ from sqlglot import exp
+
+ from sqlframe.base.session import _BaseSession
+ from sqlframe.spark.catalog import SparkCatalog
+ from sqlframe.spark.dataframe import SparkDataFrame
+ from sqlframe.spark.readwriter import (
+     SparkDataFrameReader,
+     SparkDataFrameWriter,
+ )
+ from sqlframe.spark.types import Row
+
+
+ class SparkSession(
+     _BaseSession[  # type: ignore
+         SparkCatalog,
+         SparkDataFrameReader,
+         SparkDataFrameWriter,
+         SparkDataFrame,
+         object,
+     ],
+ ):
+     _catalog = SparkCatalog
+     _reader = SparkDataFrameReader
+     _writer = SparkDataFrameWriter
+     _df = SparkDataFrame
+
+     def __init__(self, conn: t.Optional[t.Any] = None):
+         warnings.warn(
+             "SparkSession is still in active development. Functions may not work as expected."
+         )
+
+         from pyspark.sql.session import DataFrame, SparkSession
+
+         if not hasattr(self, "spark_session"):
+             super().__init__(conn)
+             self.spark_session = SparkSession.builder.getOrCreate()
+             self._last_df: t.Optional[DataFrame] = None
+
+     @property
+     def _conn(self) -> t.Any:
+         raise NotImplementedError()
+
+     @property
+     def _cur(self) -> t.Any:
+         raise NotImplementedError()
+
+     def _fetch_rows(
+         self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+     ) -> t.List[Row]:
+         self._execute(sql, quote_identifiers=quote_identifiers)
+         assert self._last_df is not None
+         return [Row(**row.asDict()) for row in self._last_df.collect()]
+
+     def _execute(
+         self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+     ) -> None:
+         self._last_df = self.spark_session.sql(
+             self._to_sql(sql, quote_identifiers=quote_identifiers)
+         )
+
+     def _fetchdf(
+         self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+     ) -> pd.DataFrame:
+         self._execute(sql, quote_identifiers=quote_identifiers)
+         assert self._last_df is not None
+         return self._last_df.toPandas()
+
+     @property
+     def _has_connection(self) -> bool:
+         return True
+
+     class Builder(_BaseSession.Builder):
+         DEFAULT_INPUT_DIALECT = "spark"
+         DEFAULT_OUTPUT_DIALECT = "spark"
+
+         @property
+         def session(self) -> SparkSession:
+             return SparkSession(**self._session_kwargs)
+
+         def getOrCreate(self) -> SparkSession:
+             self._set_session_properties()
+             return self.session
+
+     builder = Builder()
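For context, a hypothetical usage sketch of the session above: SQL generated by SQLFrame is handed to a real PySpark session via `spark_session.sql(...)`, so the DataFrame API reads like plain PySpark. The import paths and table name below are assumptions (the spark `__init__` module is assumed to mirror the standalone one shown later); `session.table(...)`, `.groupBy`, and `.show()` follow the usage shown in the README example at the end of this diff.

```python
# Hypothetical usage sketch (not from the package).
from sqlframe.spark import SparkSession          # assumed export, mirroring sqlframe.standalone
from sqlframe.spark import functions as F        # assumed, mirroring sqlframe.bigquery in the README

session = SparkSession()  # wraps pyspark's SparkSession.builder.getOrCreate()

df = (
    session.table("my_catalog.my_db.events")     # placeholder table name
    .groupBy("event_type")
    .agg(F.count("*").alias("n"))
)

# The generated SQL is executed by the wrapped PySpark session, so standard actions work:
df.show()
```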
sqlframe/spark/types.py
@@ -0,0 +1 @@
+ from sqlframe.base.types import *
sqlframe/spark/window.py
@@ -0,0 +1 @@
+ from sqlframe.base.window import *
sqlframe/standalone/__init__.py
@@ -0,0 +1,26 @@
+ from sqlframe.standalone.catalog import StandaloneCatalog
+ from sqlframe.standalone.column import Column
+ from sqlframe.standalone.dataframe import (
+     StandaloneDataFrame,
+     StandaloneDataFrameNaFunctions,
+ )
+ from sqlframe.standalone.group import StandaloneGroupedData
+ from sqlframe.standalone.readwriter import (
+     StandaloneDataFrameReader,
+     StandaloneDataFrameWriter,
+ )
+ from sqlframe.standalone.session import StandaloneSession
+ from sqlframe.standalone.window import Window, WindowSpec
+
+ __all__ = [
+     "StandaloneCatalog",
+     "Column",
+     "StandaloneDataFrame",
+     "StandaloneDataFrameNaFunctions",
+     "StandaloneGroupedData",
+     "StandaloneDataFrameReader",
+     "StandaloneDataFrameWriter",
+     "StandaloneSession",
+     "Window",
+     "WindowSpec",
+ ]
sqlframe/standalone/catalog.py
@@ -0,0 +1,13 @@
+ import typing as t
+
+ from sqlframe.base.catalog import _BaseCatalog
+
+ if t.TYPE_CHECKING:
+     from sqlframe.standalone.dataframe import StandaloneDataFrame
+     from sqlframe.standalone.session import StandaloneSession
+
+
+ class StandaloneCatalog(_BaseCatalog["StandaloneSession", "StandaloneDataFrame"]):
+     """User-facing catalog API, accessible through `SparkSession.catalog`."""
+
+     pass
sqlframe/standalone/column.py
@@ -0,0 +1 @@
+ from sqlframe.base.column import Column
sqlframe/standalone/dataframe.py
@@ -0,0 +1,36 @@
+ from __future__ import annotations
+
+ import typing as t
+
+ from sqlframe.base.dataframe import (
+     _BaseDataFrame,
+     _BaseDataFrameNaFunctions,
+     _BaseDataFrameStatFunctions,
+ )
+ from sqlframe.standalone.group import StandaloneGroupedData
+
+ if t.TYPE_CHECKING:
+     from sqlframe.standalone.readwriter import StandaloneDataFrameWriter
+     from sqlframe.standalone.session import StandaloneSession
+
+
+ class StandaloneDataFrameNaFunctions(_BaseDataFrameNaFunctions["StandaloneDataFrame"]):
+     pass
+
+
+ class StandaloneDataFrameStatFunctions(_BaseDataFrameStatFunctions["StandaloneDataFrame"]):
+     pass
+
+
+ class StandaloneDataFrame(
+     _BaseDataFrame[
+         "StandaloneSession",
+         "StandaloneDataFrameWriter",
+         "StandaloneDataFrameNaFunctions",
+         "StandaloneDataFrameStatFunctions",
+         "StandaloneGroupedData",
+     ]
+ ):
+     _na = StandaloneDataFrameNaFunctions
+     _stat = StandaloneDataFrameStatFunctions
+     _group_data = StandaloneGroupedData
sqlframe/standalone/functions.py
@@ -0,0 +1 @@
+ from sqlframe.base.functions import *
sqlframe/standalone/group.py
@@ -0,0 +1,14 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ import typing as t
+
+ from sqlframe.base.group import _BaseGroupedData
+
+ if t.TYPE_CHECKING:
+     from sqlframe.standalone.dataframe import StandaloneDataFrame
+
+
+ class StandaloneGroupedData(_BaseGroupedData["StandaloneDataFrame"]):
+     pass
sqlframe/standalone/readwriter.py
@@ -0,0 +1,19 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ import typing as t
+
+ from sqlframe.base.readerwriter import _BaseDataFrameReader, _BaseDataFrameWriter
+
+ if t.TYPE_CHECKING:
+     from sqlframe.standalone.dataframe import StandaloneDataFrame
+     from sqlframe.standalone.session import StandaloneSession
+
+
+ class StandaloneDataFrameReader(_BaseDataFrameReader["StandaloneSession", "StandaloneDataFrame"]):
+     pass
+
+
+ class StandaloneDataFrameWriter(_BaseDataFrameWriter["StandaloneSession", "StandaloneDataFrame"]):
+     pass
sqlframe/standalone/session.py
@@ -0,0 +1,40 @@
+ # This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+ from __future__ import annotations
+
+ from sqlframe.base.session import _BaseSession
+ from sqlframe.standalone.catalog import StandaloneCatalog
+ from sqlframe.standalone.dataframe import StandaloneDataFrame
+ from sqlframe.standalone.readwriter import (
+     StandaloneDataFrameReader,
+     StandaloneDataFrameWriter,
+ )
+
+
+ class StandaloneSession(
+     _BaseSession[  # type: ignore
+         StandaloneCatalog,
+         StandaloneDataFrameReader,
+         StandaloneDataFrameWriter,
+         StandaloneDataFrame,
+         object,
+     ]
+ ):  # type: ignore
+     _catalog = StandaloneCatalog
+     _reader = StandaloneDataFrameReader
+     _writer = StandaloneDataFrameWriter
+     _df = StandaloneDataFrame
+
+     class Builder(_BaseSession.Builder):
+         DEFAULT_INPUT_DIALECT = "spark"
+         DEFAULT_OUTPUT_DIALECT = "spark"
+
+         @property
+         def session(self) -> StandaloneSession:
+             return StandaloneSession()
+
+         def getOrCreate(self) -> StandaloneSession:
+             self._set_session_properties()
+             return self.session
+
+     builder = Builder()
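A hypothetical sketch of what the Standalone session above is for: building a DataFrame plan with no engine connection and emitting SQL as text. `createDataFrame` is the PySpark-style constructor SQLFrame targets, but its exact accepted shapes here are an assumption; `df.sql()` is the method shown in the README example at the end of this diff.

```python
# Hypothetical usage sketch (not from the package): generate SQL without any engine connection.
from sqlframe.standalone import StandaloneSession
from sqlframe.standalone import functions as F

session = StandaloneSession()

# PySpark-style createDataFrame; the signature used here is an assumption.
df = (
    session.createDataFrame([(1, "a"), (2, "b")], schema=["id", "name"])
    .where(F.col("id") > 1)
    .select("name")
)

print(df.sql())  # emits the generated SQL string instead of executing anything
```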
sqlframe/standalone/types.py
@@ -0,0 +1 @@
+ from sqlframe.base.types import *
sqlframe/standalone/window.py
@@ -0,0 +1 @@
+ from sqlframe.base.window import *
sqlframe-1.1.3.dist-info/LICENSE
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Ryan Eakman
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
sqlframe-1.1.3.dist-info/METADATA
@@ -0,0 +1,172 @@
+ Metadata-Version: 2.1
+ Name: sqlframe
+ Version: 1.1.3
+ Summary: Taking the Spark out of PySpark by converting to SQL
+ Home-page: https://github.com/eakmanrq/sqlframe
+ Author: Ryan Eakman
+ Author-email: eakmanrq@gmail.com
+ License: MIT
+ Platform: UNKNOWN
+ Classifier: Development Status :: 5 - Production/Stable
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: SQL
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: prettytable (<3.11.0)
+ Requires-Dist: sqlglot (<24.1,>=24.0.0)
+ Provides-Extra: bigquery
+ Requires-Dist: google-cloud-bigquery-storage (<3,>=2) ; extra == 'bigquery'
+ Requires-Dist: google-cloud-bigquery[pandas] (<4,>=3) ; extra == 'bigquery'
+ Requires-Dist: pandas (<3,>=2) ; extra == 'bigquery'
+ Provides-Extra: dev
+ Requires-Dist: duckdb (<0.11,>=0.9) ; extra == 'dev'
+ Requires-Dist: mypy (<1.11,>=1.10.0) ; extra == 'dev'
+ Requires-Dist: pandas-stubs (<3,>=2) ; extra == 'dev'
+ Requires-Dist: pandas (<3,>=2) ; extra == 'dev'
+ Requires-Dist: psycopg (<4,>=3.1) ; extra == 'dev'
+ Requires-Dist: pyarrow (<17,>=10) ; extra == 'dev'
+ Requires-Dist: pyspark (<3.6,>=2) ; extra == 'dev'
+ Requires-Dist: pytest-postgresql (<7,>=6) ; extra == 'dev'
+ Requires-Dist: pytest-xdist (<3.7,>=3.6) ; extra == 'dev'
+ Requires-Dist: pytest (<8.3,>=8.2.0) ; extra == 'dev'
+ Requires-Dist: ruff (<0.5,>=0.4.4) ; extra == 'dev'
+ Requires-Dist: types-psycopg2 (<3,>=2.9) ; extra == 'dev'
+ Requires-Dist: typing-extensions (<5,>=4.11) ; extra == 'dev'
+ Requires-Dist: pre-commit (>=3.5) ; (python_version == "3.8") and extra == 'dev'
+ Requires-Dist: pre-commit (<3.8,>=3.7) ; (python_version >= "3.9") and extra == 'dev'
+ Provides-Extra: docs
+ Requires-Dist: mkdocs-include-markdown-plugin (==6.0.6) ; extra == 'docs'
+ Requires-Dist: mkdocs-material-extensions (==1.1.1) ; extra == 'docs'
+ Requires-Dist: mkdocs-material (==9.0.5) ; extra == 'docs'
+ Requires-Dist: mkdocs (==1.4.2) ; extra == 'docs'
+ Requires-Dist: pymdown-extensions ; extra == 'docs'
+ Provides-Extra: duckdb
+ Requires-Dist: duckdb (<0.11,>=0.9) ; extra == 'duckdb'
+ Requires-Dist: pandas (<3,>=2) ; extra == 'duckdb'
+ Provides-Extra: postgres
+ Requires-Dist: pandas (<3,>=2) ; extra == 'postgres'
+ Requires-Dist: psycopg2 (<3,>=2.8) ; extra == 'postgres'
+ Provides-Extra: redshift
+ Requires-Dist: pandas (<3,>=2) ; extra == 'redshift'
+ Requires-Dist: redshift-connector (<2.2.0,>=2.1.1) ; extra == 'redshift'
+ Provides-Extra: snowflake
+ Requires-Dist: pandas (<3,>=2) ; extra == 'snowflake'
+ Requires-Dist: snowflake-connector-python[pandas,secure-local-storage] (<3.11,>=3.10.0) ; extra == 'snowflake'
+ Provides-Extra: spark
+ Requires-Dist: pyspark (<3.6,>=2) ; extra == 'spark'
+
+ <div align="center">
+     <img src="https://sqlframe.readthedocs.io/en/latest/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+ </div>
+
+ SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+ SQLFrame currently supports the following engines (many more in development):
+
+ * [BigQuery](https://sqlframe.readthedocs.io/en/latest/bigquery/)
+ * [DuckDB](https://sqlframe.readthedocs.io/en/latest/duckdb)
+ * [Postgres](https://sqlframe.readthedocs.io/en/latest/postgres)
+
+ SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+ * [Standalone](https://sqlframe.readthedocs.io/en/latest/standalone)
+
+ SQLFrame is great for:
+
+ * Users who want to run PySpark DataFrame code without having to use a Spark cluster
+ * Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+ * Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+ ## Installation
+
+ ```bash
+ # BigQuery
+ pip install "sqlframe[bigquery]"
+ # DuckDB
+ pip install "sqlframe[duckdb]"
+ # Postgres
+ pip install "sqlframe[postgres]"
+ # Standalone
+ pip install sqlframe
+ ```
+
+ See specific engine documentation for additional setup instructions.
+
+ ## Example Usage
+
+ ```python
+ from sqlframe.bigquery import BigQuerySession
+ from sqlframe.bigquery import functions as F
+ from sqlframe.bigquery import Window
+
+ session = BigQuerySession()
+ table_path = "bigquery-public-data.samples.natality"
+ # Top 5 years with the greatest year-over-year % change in new families with single child
+ df = (
+     session.table(table_path)
+     .where(F.col("ever_born") == 1)
+     .groupBy("year")
+     .agg(F.count("*").alias("num_single_child_families"))
+     .withColumn(
+         "last_year_num_single_child_families",
+         F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+     )
+     .withColumn(
+         "percent_change",
+         (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+         / F.col("last_year_num_single_child_families")
+     )
+     .orderBy(F.abs(F.col("percent_change")).desc())
+     .select(
+         F.col("year").alias("year"),
+         F.format_number("num_single_child_families", 0).alias("new families single child"),
+         F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+     )
+     .limit(5)
+ )
+ ```
+ ```python
+ >>> df.sql()
+ WITH `t94228` AS (
+   SELECT
+     `natality`.`year` AS `year`,
+     COUNT(*) AS `num_single_child_families`
+   FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+   WHERE
+     `natality`.`ever_born` = 1
+   GROUP BY
+     `natality`.`year`
+ ), `t39093` AS (
+   SELECT
+     `t94228`.`year` AS `year`,
+     `t94228`.`num_single_child_families` AS `num_single_child_families`,
+     LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+   FROM `t94228` AS `t94228`
+ )
+ SELECT
+   `t39093`.`year` AS `year`,
+   FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `new families single child`,
+   FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+ FROM `t39093` AS `t39093`
+ ORDER BY
+   ABS(`percent_change`) DESC
+ LIMIT 5
+ ```
+ ```python
+ >>> df.show()
+ +------+---------------------------+----------------+
+ | year | new families single child | percent change |
+ +------+---------------------------+----------------+
+ | 1989 |         1,650,246         |     25.02      |
+ | 1974 |          783,448          |     14.49      |
+ | 1977 |         1,057,379         |     11.38      |
+ | 1985 |         1,308,476         |     11.15      |
+ | 1975 |          868,985          |     10.92      |
+ +------+---------------------------+----------------+
+ ```
+
+