sqlframe-1.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe/__init__.py +0 -0
- sqlframe/_version.py +16 -0
- sqlframe/base/__init__.py +0 -0
- sqlframe/base/_typing.py +39 -0
- sqlframe/base/catalog.py +1163 -0
- sqlframe/base/column.py +388 -0
- sqlframe/base/dataframe.py +1519 -0
- sqlframe/base/decorators.py +51 -0
- sqlframe/base/exceptions.py +14 -0
- sqlframe/base/function_alternatives.py +1055 -0
- sqlframe/base/functions.py +1678 -0
- sqlframe/base/group.py +102 -0
- sqlframe/base/mixins/__init__.py +0 -0
- sqlframe/base/mixins/catalog_mixins.py +419 -0
- sqlframe/base/mixins/readwriter_mixins.py +118 -0
- sqlframe/base/normalize.py +84 -0
- sqlframe/base/operations.py +87 -0
- sqlframe/base/readerwriter.py +679 -0
- sqlframe/base/session.py +585 -0
- sqlframe/base/transforms.py +13 -0
- sqlframe/base/types.py +418 -0
- sqlframe/base/util.py +242 -0
- sqlframe/base/window.py +139 -0
- sqlframe/bigquery/__init__.py +23 -0
- sqlframe/bigquery/catalog.py +255 -0
- sqlframe/bigquery/column.py +1 -0
- sqlframe/bigquery/dataframe.py +54 -0
- sqlframe/bigquery/functions.py +378 -0
- sqlframe/bigquery/group.py +14 -0
- sqlframe/bigquery/readwriter.py +29 -0
- sqlframe/bigquery/session.py +89 -0
- sqlframe/bigquery/types.py +1 -0
- sqlframe/bigquery/window.py +1 -0
- sqlframe/duckdb/__init__.py +20 -0
- sqlframe/duckdb/catalog.py +108 -0
- sqlframe/duckdb/column.py +1 -0
- sqlframe/duckdb/dataframe.py +55 -0
- sqlframe/duckdb/functions.py +47 -0
- sqlframe/duckdb/group.py +14 -0
- sqlframe/duckdb/readwriter.py +111 -0
- sqlframe/duckdb/session.py +65 -0
- sqlframe/duckdb/types.py +1 -0
- sqlframe/duckdb/window.py +1 -0
- sqlframe/postgres/__init__.py +23 -0
- sqlframe/postgres/catalog.py +106 -0
- sqlframe/postgres/column.py +1 -0
- sqlframe/postgres/dataframe.py +54 -0
- sqlframe/postgres/functions.py +61 -0
- sqlframe/postgres/group.py +14 -0
- sqlframe/postgres/readwriter.py +29 -0
- sqlframe/postgres/session.py +68 -0
- sqlframe/postgres/types.py +1 -0
- sqlframe/postgres/window.py +1 -0
- sqlframe/redshift/__init__.py +23 -0
- sqlframe/redshift/catalog.py +127 -0
- sqlframe/redshift/column.py +1 -0
- sqlframe/redshift/dataframe.py +54 -0
- sqlframe/redshift/functions.py +18 -0
- sqlframe/redshift/group.py +14 -0
- sqlframe/redshift/readwriter.py +29 -0
- sqlframe/redshift/session.py +53 -0
- sqlframe/redshift/types.py +1 -0
- sqlframe/redshift/window.py +1 -0
- sqlframe/snowflake/__init__.py +26 -0
- sqlframe/snowflake/catalog.py +134 -0
- sqlframe/snowflake/column.py +1 -0
- sqlframe/snowflake/dataframe.py +54 -0
- sqlframe/snowflake/functions.py +18 -0
- sqlframe/snowflake/group.py +14 -0
- sqlframe/snowflake/readwriter.py +29 -0
- sqlframe/snowflake/session.py +53 -0
- sqlframe/snowflake/types.py +1 -0
- sqlframe/snowflake/window.py +1 -0
- sqlframe/spark/__init__.py +23 -0
- sqlframe/spark/catalog.py +1028 -0
- sqlframe/spark/column.py +1 -0
- sqlframe/spark/dataframe.py +54 -0
- sqlframe/spark/functions.py +22 -0
- sqlframe/spark/group.py +14 -0
- sqlframe/spark/readwriter.py +29 -0
- sqlframe/spark/session.py +90 -0
- sqlframe/spark/types.py +1 -0
- sqlframe/spark/window.py +1 -0
- sqlframe/standalone/__init__.py +26 -0
- sqlframe/standalone/catalog.py +13 -0
- sqlframe/standalone/column.py +1 -0
- sqlframe/standalone/dataframe.py +36 -0
- sqlframe/standalone/functions.py +1 -0
- sqlframe/standalone/group.py +14 -0
- sqlframe/standalone/readwriter.py +19 -0
- sqlframe/standalone/session.py +40 -0
- sqlframe/standalone/types.py +1 -0
- sqlframe/standalone/window.py +1 -0
- sqlframe-1.1.3.dist-info/LICENSE +21 -0
- sqlframe-1.1.3.dist-info/METADATA +172 -0
- sqlframe-1.1.3.dist-info/RECORD +98 -0
- sqlframe-1.1.3.dist-info/WHEEL +5 -0
- sqlframe-1.1.3.dist-info/top_level.txt +1 -0
sqlframe/spark/column.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.column import Column
sqlframe/spark/dataframe.py
ADDED
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import logging
+import sys
+import typing as t
+
+from sqlframe.base.dataframe import (
+    _BaseDataFrame,
+    _BaseDataFrameNaFunctions,
+    _BaseDataFrameStatFunctions,
+)
+from sqlframe.spark.group import SparkGroupedData
+
+if sys.version_info >= (3, 11):
+    from typing import Self
+else:
+    from typing_extensions import Self
+
+if t.TYPE_CHECKING:
+    from sqlframe.spark.readwriter import SparkDataFrameWriter
+    from sqlframe.spark.session import SparkSession
+
+
+logger = logging.getLogger(__name__)
+
+
+class SparkDataFrameNaFunctions(_BaseDataFrameNaFunctions["SparkDataFrame"]):
+    pass
+
+
+class SparkDataFrameStatFunctions(_BaseDataFrameStatFunctions["SparkDataFrame"]):
+    pass
+
+
+class SparkDataFrame(
+    _BaseDataFrame[
+        "SparkSession",
+        "SparkDataFrameWriter",
+        "SparkDataFrameNaFunctions",
+        "SparkDataFrameStatFunctions",
+        "SparkGroupedData",
+    ]
+):
+    _na = SparkDataFrameNaFunctions
+    _stat = SparkDataFrameStatFunctions
+    _group_data = SparkGroupedData
+
+    def cache(self) -> Self:
+        logger.warning("Spark does not support caching. Ignoring cache() call.")
+        return self
+
+    def persist(self) -> Self:
+        logger.warning("Spark does not support persist. Ignoring persist() call.")
+        return self
sqlframe/spark/functions.py
ADDED
@@ -0,0 +1,22 @@
+import inspect
+import sys
+
+import sqlframe.base.functions  # noqa
+
+module = sys.modules["sqlframe.base.functions"]
+globals().update(
+    {
+        name: func
+        for name, func in inspect.getmembers(module, inspect.isfunction)
+        if hasattr(func, "unsupported_engines")
+        and "spark" not in func.unsupported_engines
+        and "*" not in func.unsupported_engines
+    }
+)
+
+
+from sqlframe.base.function_alternatives import (  # noqa
+    percentile_without_disc as percentile,
+    add_months_by_multiplication as add_months,
+    arrays_overlap_renamed as arrays_overlap,
+)
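The functions module above builds its public namespace dynamically: it copies every function from `sqlframe.base.functions` whose `unsupported_engines` metadata does not include `"spark"` (or the wildcard `"*"`), then overrides a few names with Spark-friendly alternatives. Below is a simplified, self-contained sketch of that filtering idea; the `unsupported` decorator and the toy module are purely illustrative stand-ins, not the library's actual internals.

```python
import inspect
import types

def unsupported(*engines):
    # Illustrative stand-in for how a function might carry engine metadata.
    def mark(func):
        func.unsupported_engines = engines
        return func
    return mark

# Toy "base functions" module with two tagged functions.
toy_base = types.ModuleType("toy_base_functions")

@unsupported("postgres")
def col(name):
    return f"col({name})"

@unsupported("spark", "postgres")
def percentile(name, p):
    return f"percentile({name}, {p})"

toy_base.col, toy_base.percentile = col, percentile

# Same selection logic as the real module: keep only functions usable on Spark.
spark_ready = {
    name: fn
    for name, fn in inspect.getmembers(toy_base, inspect.isfunction)
    if hasattr(fn, "unsupported_engines")
    and "spark" not in fn.unsupported_engines
    and "*" not in fn.unsupported_engines
}
print(sorted(spark_ready))  # ['col'] -- percentile is excluded and comes from function_alternatives instead
```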
sqlframe/spark/group.py
ADDED
@@ -0,0 +1,14 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlframe.base.group import _BaseGroupedData
+
+if t.TYPE_CHECKING:
+    from sqlframe.spark.dataframe import SparkDataFrame
+
+
+class SparkGroupedData(_BaseGroupedData["SparkDataFrame"]):
+    pass
sqlframe/spark/readwriter.py
ADDED
@@ -0,0 +1,29 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlframe.base.mixins.readwriter_mixins import PandasLoaderMixin, PandasWriterMixin
+from sqlframe.base.readerwriter import (
+    _BaseDataFrameReader,
+    _BaseDataFrameWriter,
+)
+
+if t.TYPE_CHECKING:
+    from sqlframe.spark.dataframe import SparkDataFrame
+    from sqlframe.spark.session import SparkSession
+
+
+class SparkDataFrameReader(
+    PandasLoaderMixin["SparkSession", "SparkDataFrame"],
+    _BaseDataFrameReader["SparkSession", "SparkDataFrame"],
+):
+    pass
+
+
+class SparkDataFrameWriter(
+    PandasWriterMixin["SparkSession", "SparkDataFrame"],
+    _BaseDataFrameWriter["SparkSession", "SparkDataFrame"],
+):
+    pass
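The Spark reader and writer above are thin compositions of the shared pandas-based mixins with the generic base classes; nothing here is Spark-specific. A rough, hedged sketch of what reading a local file might look like (it assumes `SparkSession` is re-exported from `sqlframe.spark` as the other engine packages do, that the reader exposes the familiar PySpark `DataFrameReader` methods, and that pyspark is installed; the file path is hypothetical):

```python
from sqlframe.spark import SparkSession  # assumed re-export

session = SparkSession.builder.getOrCreate()
# Hypothetical local file; the pandas-backed loader mixin is what handles file formats here.
df = session.read.parquet("data/events.parquet")
df.show()
```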
sqlframe/spark/session.py
ADDED
@@ -0,0 +1,90 @@
+from __future__ import annotations
+
+import typing as t
+import warnings
+
+import pandas as pd
+from sqlglot import exp
+
+from sqlframe.base.session import _BaseSession
+from sqlframe.spark.catalog import SparkCatalog
+from sqlframe.spark.dataframe import SparkDataFrame
+from sqlframe.spark.readwriter import (
+    SparkDataFrameReader,
+    SparkDataFrameWriter,
+)
+from sqlframe.spark.types import Row
+
+
+class SparkSession(
+    _BaseSession[  # type: ignore
+        SparkCatalog,
+        SparkDataFrameReader,
+        SparkDataFrameWriter,
+        SparkDataFrame,
+        object,
+    ],
+):
+    _catalog = SparkCatalog
+    _reader = SparkDataFrameReader
+    _writer = SparkDataFrameWriter
+    _df = SparkDataFrame
+
+    def __init__(self, conn: t.Optional[t.Any] = None):
+        warnings.warn(
+            "SparkSession is still in active development. Functions may not work as expected."
+        )
+
+        from pyspark.sql.session import DataFrame, SparkSession
+
+        if not hasattr(self, "spark_session"):
+            super().__init__(conn)
+            self.spark_session = SparkSession.builder.getOrCreate()
+            self._last_df: t.Optional[DataFrame] = None
+
+    @property
+    def _conn(self) -> t.Any:
+        raise NotImplementedError()
+
+    @property
+    def _cur(self) -> t.Any:
+        raise NotImplementedError()
+
+    def _fetch_rows(
+        self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+    ) -> t.List[Row]:
+        self._execute(sql, quote_identifiers=quote_identifiers)
+        assert self._last_df is not None
+        return [Row(**row.asDict()) for row in self._last_df.collect()]
+
+    def _execute(
+        self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+    ) -> None:
+        self._last_df = self.spark_session.sql(
+            self._to_sql(sql, quote_identifiers=quote_identifiers)
+        )
+
+    def _fetchdf(
+        self, sql: t.Union[str, exp.Expression], *, quote_identifiers: bool = True
+    ) -> pd.DataFrame:
+        self._execute(sql, quote_identifiers=quote_identifiers)
+        assert self._last_df is not None
+        return self._last_df.toPandas()
+
+    @property
+    def _has_connection(self) -> bool:
+        return True
+
+    class Builder(_BaseSession.Builder):
+        DEFAULT_INPUT_DIALECT = "spark"
+        DEFAULT_OUTPUT_DIALECT = "spark"
+
+        @property
+        def session(self) -> SparkSession:
+            return SparkSession(**self._session_kwargs)
+
+        def getOrCreate(self) -> SparkSession:
+            self._set_session_properties()
+            return self.session
+
+    builder = Builder()
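Taken together, this session translates DataFrame operations into SQL via sqlglot and hands the text to a real pyspark session (`spark_session.sql`) for execution, collecting results back through `_fetch_rows`/`_fetchdf`. A hedged usage sketch follows; it assumes pyspark is installed, that `SparkSession` is re-exported from `sqlframe.spark` as the other engine packages do for their sessions, and uses a hypothetical table name.

```python
from sqlframe.spark import SparkSession  # assumed re-export
from sqlframe.spark import functions as F

session = SparkSession.builder.getOrCreate()  # wraps pyspark's own builder.getOrCreate()
df = (
    session.table("some_db.events")           # hypothetical table
    .groupBy("user_id")
    .agg(F.count("*").alias("n_events"))
)
print(df.sql())  # inspect the generated Spark SQL
df.show()        # executes via spark_session.sql(...) under the hood
```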
sqlframe/spark/types.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.types import *
sqlframe/spark/window.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.window import *
sqlframe/standalone/__init__.py
ADDED
@@ -0,0 +1,26 @@
+from sqlframe.standalone.catalog import StandaloneCatalog
+from sqlframe.standalone.column import Column
+from sqlframe.standalone.dataframe import (
+    StandaloneDataFrame,
+    StandaloneDataFrameNaFunctions,
+)
+from sqlframe.standalone.group import StandaloneGroupedData
+from sqlframe.standalone.readwriter import (
+    StandaloneDataFrameReader,
+    StandaloneDataFrameWriter,
+)
+from sqlframe.standalone.session import StandaloneSession
+from sqlframe.standalone.window import Window, WindowSpec
+
+__all__ = [
+    "StandaloneCatalog",
+    "Column",
+    "StandaloneDataFrame",
+    "StandaloneDataFrameNaFunctions",
+    "StandaloneGroupedData",
+    "StandaloneDataFrameReader",
+    "StandaloneDataFrameWriter",
+    "StandaloneSession",
+    "Window",
+    "WindowSpec",
+]
sqlframe/standalone/catalog.py
ADDED
@@ -0,0 +1,13 @@
+import typing as t
+
+from sqlframe.base.catalog import _BaseCatalog
+
+if t.TYPE_CHECKING:
+    from sqlframe.standalone.dataframe import StandaloneDataFrame
+    from sqlframe.standalone.session import StandaloneSession
+
+
+class StandaloneCatalog(_BaseCatalog["StandaloneSession", "StandaloneDataFrame"]):
+    """User-facing catalog API, accessible through `SparkSession.catalog`."""
+
+    pass
sqlframe/standalone/column.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.column import Column
sqlframe/standalone/dataframe.py
ADDED
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+import typing as t
+
+from sqlframe.base.dataframe import (
+    _BaseDataFrame,
+    _BaseDataFrameNaFunctions,
+    _BaseDataFrameStatFunctions,
+)
+from sqlframe.standalone.group import StandaloneGroupedData
+
+if t.TYPE_CHECKING:
+    from sqlframe.standalone.readwriter import StandaloneDataFrameWriter
+    from sqlframe.standalone.session import StandaloneSession
+
+
+class StandaloneDataFrameNaFunctions(_BaseDataFrameNaFunctions["StandaloneDataFrame"]):
+    pass
+
+
+class StandaloneDataFrameStatFunctions(_BaseDataFrameStatFunctions["StandaloneDataFrame"]):
+    pass
+
+
+class StandaloneDataFrame(
+    _BaseDataFrame[
+        "StandaloneSession",
+        "StandaloneDataFrameWriter",
+        "StandaloneDataFrameNaFunctions",
+        "StandaloneDataFrameStatFunctions",
+        "StandaloneGroupedData",
+    ]
+):
+    _na = StandaloneDataFrameNaFunctions
+    _stat = StandaloneDataFrameStatFunctions
+    _group_data = StandaloneGroupedData
sqlframe/standalone/functions.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.functions import *
sqlframe/standalone/group.py
ADDED
@@ -0,0 +1,14 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlframe.base.group import _BaseGroupedData
+
+if t.TYPE_CHECKING:
+    from sqlframe.standalone.dataframe import StandaloneDataFrame
+
+
+class StandaloneGroupedData(_BaseGroupedData["StandaloneDataFrame"]):
+    pass
sqlframe/standalone/readwriter.py
ADDED
@@ -0,0 +1,19 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+import typing as t
+
+from sqlframe.base.readerwriter import _BaseDataFrameReader, _BaseDataFrameWriter
+
+if t.TYPE_CHECKING:
+    from sqlframe.standalone.dataframe import StandaloneDataFrame
+    from sqlframe.standalone.session import StandaloneSession
+
+
+class StandaloneDataFrameReader(_BaseDataFrameReader["StandaloneSession", "StandaloneDataFrame"]):
+    pass
+
+
+class StandaloneDataFrameWriter(_BaseDataFrameWriter["StandaloneSession", "StandaloneDataFrame"]):
+    pass
sqlframe/standalone/session.py
ADDED
@@ -0,0 +1,40 @@
+# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
+
+from __future__ import annotations
+
+from sqlframe.base.session import _BaseSession
+from sqlframe.standalone.catalog import StandaloneCatalog
+from sqlframe.standalone.dataframe import StandaloneDataFrame
+from sqlframe.standalone.readwriter import (
+    StandaloneDataFrameReader,
+    StandaloneDataFrameWriter,
+)
+
+
+class StandaloneSession(
+    _BaseSession[  # type: ignore
+        StandaloneCatalog,
+        StandaloneDataFrameReader,
+        StandaloneDataFrameWriter,
+        StandaloneDataFrame,
+        object,
+    ]
+):  # type: ignore
+    _catalog = StandaloneCatalog
+    _reader = StandaloneDataFrameReader
+    _writer = StandaloneDataFrameWriter
+    _df = StandaloneDataFrame
+
+    class Builder(_BaseSession.Builder):
+        DEFAULT_INPUT_DIALECT = "spark"
+        DEFAULT_OUTPUT_DIALECT = "spark"
+
+        @property
+        def session(self) -> StandaloneSession:
+            return StandaloneSession()
+
+        def getOrCreate(self) -> StandaloneSession:
+            self._set_session_properties()
+            return self.session
+
+    builder = Builder()
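Because the Standalone session defines no connection or execution hooks, it is only useful for generating SQL text. A rough sketch of that workflow (the table name is hypothetical, and depending on the release the Standalone catalog may also need table schemas registered before columns can be resolved):

```python
from sqlframe.standalone import StandaloneSession
from sqlframe.standalone import functions as F

session = StandaloneSession.builder.getOrCreate()
df = session.table("db.events").where(F.col("event_type") == "signup")  # hypothetical table
print(df.sql())  # returns SQL text; nothing is executed against a database
```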
sqlframe/standalone/types.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.types import *
sqlframe/standalone/window.py
ADDED
@@ -0,0 +1 @@
+from sqlframe.base.window import *
sqlframe-1.1.3.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Ryan Eakman
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
sqlframe-1.1.3.dist-info/METADATA
ADDED
@@ -0,0 +1,172 @@
+Metadata-Version: 2.1
+Name: sqlframe
+Version: 1.1.3
+Summary: Taking the Spark out of PySpark by converting to SQL
+Home-page: https://github.com/eakmanrq/sqlframe
+Author: Ryan Eakman
+Author-email: eakmanrq@gmail.com
+License: MIT
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: SQL
+Classifier: Programming Language :: Python :: 3 :: Only
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: prettytable (<3.11.0)
+Requires-Dist: sqlglot (<24.1,>=24.0.0)
+Provides-Extra: bigquery
+Requires-Dist: google-cloud-bigquery-storage (<3,>=2) ; extra == 'bigquery'
+Requires-Dist: google-cloud-bigquery[pandas] (<4,>=3) ; extra == 'bigquery'
+Requires-Dist: pandas (<3,>=2) ; extra == 'bigquery'
+Provides-Extra: dev
+Requires-Dist: duckdb (<0.11,>=0.9) ; extra == 'dev'
+Requires-Dist: mypy (<1.11,>=1.10.0) ; extra == 'dev'
+Requires-Dist: pandas-stubs (<3,>=2) ; extra == 'dev'
+Requires-Dist: pandas (<3,>=2) ; extra == 'dev'
+Requires-Dist: psycopg (<4,>=3.1) ; extra == 'dev'
+Requires-Dist: pyarrow (<17,>=10) ; extra == 'dev'
+Requires-Dist: pyspark (<3.6,>=2) ; extra == 'dev'
+Requires-Dist: pytest-postgresql (<7,>=6) ; extra == 'dev'
+Requires-Dist: pytest-xdist (<3.7,>=3.6) ; extra == 'dev'
+Requires-Dist: pytest (<8.3,>=8.2.0) ; extra == 'dev'
+Requires-Dist: ruff (<0.5,>=0.4.4) ; extra == 'dev'
+Requires-Dist: types-psycopg2 (<3,>=2.9) ; extra == 'dev'
+Requires-Dist: typing-extensions (<5,>=4.11) ; extra == 'dev'
+Requires-Dist: pre-commit (>=3.5) ; (python_version == "3.8") and extra == 'dev'
+Requires-Dist: pre-commit (<3.8,>=3.7) ; (python_version >= "3.9") and extra == 'dev'
+Provides-Extra: docs
+Requires-Dist: mkdocs-include-markdown-plugin (==6.0.6) ; extra == 'docs'
+Requires-Dist: mkdocs-material-extensions (==1.1.1) ; extra == 'docs'
+Requires-Dist: mkdocs-material (==9.0.5) ; extra == 'docs'
+Requires-Dist: mkdocs (==1.4.2) ; extra == 'docs'
+Requires-Dist: pymdown-extensions ; extra == 'docs'
+Provides-Extra: duckdb
+Requires-Dist: duckdb (<0.11,>=0.9) ; extra == 'duckdb'
+Requires-Dist: pandas (<3,>=2) ; extra == 'duckdb'
+Provides-Extra: postgres
+Requires-Dist: pandas (<3,>=2) ; extra == 'postgres'
+Requires-Dist: psycopg2 (<3,>=2.8) ; extra == 'postgres'
+Provides-Extra: redshift
+Requires-Dist: pandas (<3,>=2) ; extra == 'redshift'
+Requires-Dist: redshift-connector (<2.2.0,>=2.1.1) ; extra == 'redshift'
+Provides-Extra: snowflake
+Requires-Dist: pandas (<3,>=2) ; extra == 'snowflake'
+Requires-Dist: snowflake-connector-python[pandas,secure-local-storage] (<3.11,>=3.10.0) ; extra == 'snowflake'
+Provides-Extra: spark
+Requires-Dist: pyspark (<3.6,>=2) ; extra == 'spark'
+
+<div align="center">
+  <img src="https://sqlframe.readthedocs.io/en/latest/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
+</div>
+
+SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
+
+SQLFrame currently supports the following engines (many more in development):
+
+* [BigQuery](https://sqlframe.readthedocs.io/en/latest/bigquery/)
+* [DuckDB](https://sqlframe.readthedocs.io/en/latest/duckdb)
+* [Postgres](https://sqlframe.readthedocs.io/en/latest/postgres)
+
+SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
+* [Standalone](https://sqlframe.readthedocs.io/en/latest/standalone)
+
+SQLFrame is great for:
+
+* Users who want to run PySpark DataFrame code without having to use a Spark cluster
+* Users who want a SQL representation of their DataFrame code for debugging or sharing with others
+* Users who want a DataFrame API that leverages the full power of their engine to do the processing
+
+## Installation
+
+```bash
+# BigQuery
+pip install "sqlframe[bigquery]"
+# DuckDB
+pip install "sqlframe[duckdb]"
+# Postgres
+pip install "sqlframe[postgres]"
+# Standalone
+pip install sqlframe
+```
+
+See specific engine documentation for additional setup instructions.
+
+## Example Usage
+
+```python
+from sqlframe.bigquery import BigQuerySession
+from sqlframe.bigquery import functions as F
+from sqlframe.bigquery import Window
+
+session = BigQuerySession()
+table_path = "bigquery-public-data.samples.natality"
+# Top 5 years with the greatest year-over-year % change in new families with single child
+df = (
+    session.table(table_path)
+    .where(F.col("ever_born") == 1)
+    .groupBy("year")
+    .agg(F.count("*").alias("num_single_child_families"))
+    .withColumn(
+        "last_year_num_single_child_families",
+        F.lag(F.col("num_single_child_families"), 1).over(Window.orderBy("year"))
+    )
+    .withColumn(
+        "percent_change",
+        (F.col("num_single_child_families") - F.col("last_year_num_single_child_families"))
+        / F.col("last_year_num_single_child_families")
+    )
+    .orderBy(F.abs(F.col("percent_change")).desc())
+    .select(
+        F.col("year").alias("year"),
+        F.format_number("num_single_child_families", 0).alias("new families single child"),
+        F.format_number(F.col("percent_change") * 100, 2).alias("percent change"),
+    )
+    .limit(5)
+)
+```
+```python
+>>> df.sql()
+WITH `t94228` AS (
+  SELECT
+    `natality`.`year` AS `year`,
+    COUNT(*) AS `num_single_child_families`
+  FROM `bigquery-public-data`.`samples`.`natality` AS `natality`
+  WHERE
+    `natality`.`ever_born` = 1
+  GROUP BY
+    `natality`.`year`
+), `t39093` AS (
+  SELECT
+    `t94228`.`year` AS `year`,
+    `t94228`.`num_single_child_families` AS `num_single_child_families`,
+    LAG(`t94228`.`num_single_child_families`, 1) OVER (ORDER BY `t94228`.`year`) AS `last_year_num_single_child_families`
+  FROM `t94228` AS `t94228`
+)
+SELECT
+  `t39093`.`year` AS `year`,
+  FORMAT('%\'.0f', ROUND(CAST(`t39093`.`num_single_child_families` AS FLOAT64), 0)) AS `new families single child`,
+  FORMAT('%\'.2f', ROUND(CAST((((`t39093`.`num_single_child_families` - `t39093`.`last_year_num_single_child_families`) / `t39093`.`last_year_num_single_child_families`) * 100) AS FLOAT64), 2)) AS `percent change`
+FROM `t39093` AS `t39093`
+ORDER BY
+  ABS(`percent_change`) DESC
+LIMIT 5
+```
+```python
+>>> df.show()
++------+---------------------------+----------------+
+| year | new families single child | percent change |
++------+---------------------------+----------------+
+| 1989 | 1,650,246 | 25.02 |
+| 1974 | 783,448 | 14.49 |
+| 1977 | 1,057,379 | 11.38 |
+| 1985 | 1,308,476 | 11.15 |
+| 1975 | 868,985 | 10.92 |
++------+---------------------------+----------------+
+```
+
+