sqlframe 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlframe/__init__.py +0 -0
- sqlframe/_version.py +16 -0
- sqlframe/base/__init__.py +0 -0
- sqlframe/base/_typing.py +39 -0
- sqlframe/base/catalog.py +1163 -0
- sqlframe/base/column.py +388 -0
- sqlframe/base/dataframe.py +1519 -0
- sqlframe/base/decorators.py +51 -0
- sqlframe/base/exceptions.py +14 -0
- sqlframe/base/function_alternatives.py +1055 -0
- sqlframe/base/functions.py +1678 -0
- sqlframe/base/group.py +102 -0
- sqlframe/base/mixins/__init__.py +0 -0
- sqlframe/base/mixins/catalog_mixins.py +419 -0
- sqlframe/base/mixins/readwriter_mixins.py +118 -0
- sqlframe/base/normalize.py +84 -0
- sqlframe/base/operations.py +87 -0
- sqlframe/base/readerwriter.py +679 -0
- sqlframe/base/session.py +585 -0
- sqlframe/base/transforms.py +13 -0
- sqlframe/base/types.py +418 -0
- sqlframe/base/util.py +242 -0
- sqlframe/base/window.py +139 -0
- sqlframe/bigquery/__init__.py +23 -0
- sqlframe/bigquery/catalog.py +255 -0
- sqlframe/bigquery/column.py +1 -0
- sqlframe/bigquery/dataframe.py +54 -0
- sqlframe/bigquery/functions.py +378 -0
- sqlframe/bigquery/group.py +14 -0
- sqlframe/bigquery/readwriter.py +29 -0
- sqlframe/bigquery/session.py +89 -0
- sqlframe/bigquery/types.py +1 -0
- sqlframe/bigquery/window.py +1 -0
- sqlframe/duckdb/__init__.py +20 -0
- sqlframe/duckdb/catalog.py +108 -0
- sqlframe/duckdb/column.py +1 -0
- sqlframe/duckdb/dataframe.py +55 -0
- sqlframe/duckdb/functions.py +47 -0
- sqlframe/duckdb/group.py +14 -0
- sqlframe/duckdb/readwriter.py +111 -0
- sqlframe/duckdb/session.py +65 -0
- sqlframe/duckdb/types.py +1 -0
- sqlframe/duckdb/window.py +1 -0
- sqlframe/postgres/__init__.py +23 -0
- sqlframe/postgres/catalog.py +106 -0
- sqlframe/postgres/column.py +1 -0
- sqlframe/postgres/dataframe.py +54 -0
- sqlframe/postgres/functions.py +61 -0
- sqlframe/postgres/group.py +14 -0
- sqlframe/postgres/readwriter.py +29 -0
- sqlframe/postgres/session.py +68 -0
- sqlframe/postgres/types.py +1 -0
- sqlframe/postgres/window.py +1 -0
- sqlframe/redshift/__init__.py +23 -0
- sqlframe/redshift/catalog.py +127 -0
- sqlframe/redshift/column.py +1 -0
- sqlframe/redshift/dataframe.py +54 -0
- sqlframe/redshift/functions.py +18 -0
- sqlframe/redshift/group.py +14 -0
- sqlframe/redshift/readwriter.py +29 -0
- sqlframe/redshift/session.py +53 -0
- sqlframe/redshift/types.py +1 -0
- sqlframe/redshift/window.py +1 -0
- sqlframe/snowflake/__init__.py +26 -0
- sqlframe/snowflake/catalog.py +134 -0
- sqlframe/snowflake/column.py +1 -0
- sqlframe/snowflake/dataframe.py +54 -0
- sqlframe/snowflake/functions.py +18 -0
- sqlframe/snowflake/group.py +14 -0
- sqlframe/snowflake/readwriter.py +29 -0
- sqlframe/snowflake/session.py +53 -0
- sqlframe/snowflake/types.py +1 -0
- sqlframe/snowflake/window.py +1 -0
- sqlframe/spark/__init__.py +23 -0
- sqlframe/spark/catalog.py +1028 -0
- sqlframe/spark/column.py +1 -0
- sqlframe/spark/dataframe.py +54 -0
- sqlframe/spark/functions.py +22 -0
- sqlframe/spark/group.py +14 -0
- sqlframe/spark/readwriter.py +29 -0
- sqlframe/spark/session.py +90 -0
- sqlframe/spark/types.py +1 -0
- sqlframe/spark/window.py +1 -0
- sqlframe/standalone/__init__.py +26 -0
- sqlframe/standalone/catalog.py +13 -0
- sqlframe/standalone/column.py +1 -0
- sqlframe/standalone/dataframe.py +36 -0
- sqlframe/standalone/functions.py +1 -0
- sqlframe/standalone/group.py +14 -0
- sqlframe/standalone/readwriter.py +19 -0
- sqlframe/standalone/session.py +40 -0
- sqlframe/standalone/types.py +1 -0
- sqlframe/standalone/window.py +1 -0
- sqlframe-1.1.3.dist-info/LICENSE +21 -0
- sqlframe-1.1.3.dist-info/METADATA +172 -0
- sqlframe-1.1.3.dist-info/RECORD +98 -0
- sqlframe-1.1.3.dist-info/WHEEL +5 -0
- sqlframe-1.1.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import typing as t
|
|
6
|
+
|
|
7
|
+
from sqlglot import expressions as exp
|
|
8
|
+
from sqlglot.helper import ensure_list
|
|
9
|
+
|
|
10
|
+
from sqlframe.base.column import Column
|
|
11
|
+
from sqlframe.base.util import get_tables_from_expression_with_join
|
|
12
|
+
|
|
13
|
+
if t.TYPE_CHECKING:
|
|
14
|
+
from sqlframe.base.dataframe import SESSION
|
|
15
|
+
|
|
16
|
+
# Any per-item input `normalize` accepts: a raw column-name string, a sqlglot
# expression, or a sqlframe Column (coerced by `_ensure_expressions`).
NORMALIZE_INPUT = t.TypeVar("NORMALIZE_INPUT", bound=t.Union[str, exp.Expression, Column])
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def normalize(session: SESSION, expression_context: exp.Select, expr: t.List[NORMALIZE_INPUT]):
    """Normalize every identifier referenced by *expr* against *expression_context*.

    Each input is coerced to a sqlglot expression, then every identifier in it
    is run through the session's input-dialect identifier normalization and
    rewritten so that references to DataFrame aliases, branch ids, or sequence
    ids point at the owning CTE's name instead.
    """
    for expression in _ensure_expressions(ensure_list(expr)):
        for identifier in expression.find_all(exp.Identifier):
            identifier.transform(session.input_dialect.normalize_identifier)
            replace_alias_name_with_cte_name(session, expression_context, identifier)
            replace_branch_and_sequence_ids_with_cte_name(session, expression_context, identifier)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def replace_alias_name_with_cte_name(
    session: SESSION, expression_context: exp.Select, id: exp.Identifier
):
    """Rewrite *id* to the CTE name backing a registered DataFrame alias.

    If the normalized identifier is a key of
    ``session.name_to_sequence_id_mapping``, the CTE list is walked
    newest-first and the first CTE whose ``sequence_id`` belongs to that
    alias wins.
    """
    name = session._normalize_string(id.alias_or_name)
    mapping = session.name_to_sequence_id_mapping
    if name not in mapping:
        return
    # Newest CTEs are at the end; prefer the most recent match.
    for cte in reversed(expression_context.ctes):
        if cte.args["sequence_id"] in mapping[name]:
            _set_alias_name(id, cte.alias_or_name)
            break
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def replace_branch_and_sequence_ids_with_cte_name(
    session: SESSION, expression_context: exp.Select, id: exp.Identifier
):
    """Rewrite *id* to the name of the CTE that owns its branch or sequence id.

    Only identifiers the session already knows about (``session.known_ids``)
    are touched; anything else is left alone.
    """
    normalized_id = session._normalize_string(id.alias_or_name)
    if normalized_id not in session.known_ids:
        return
    # Check if we have a join and if both the tables in that join share a common branch id
    # If so we need to have this reference the left table by default unless the id is a sequence
    # id then it keeps that reference. This handles the weird edge case in spark that shouldn't
    # be common in practice
    if expression_context.args.get("joins") and normalized_id in session.known_branch_ids:
        join_table_aliases = [
            x.alias_or_name for x in get_tables_from_expression_with_join(expression_context)
        ]
        ctes_in_join = [
            cte for cte in expression_context.ctes if cte.alias_or_name in join_table_aliases
        ]
        # Bug fix: check the length BEFORE indexing. The original compared
        # ctes_in_join[0]/[1] first, raising IndexError whenever fewer than
        # two join tables resolved to CTEs; in that case we should simply
        # fall through to the branch/sequence lookup below.
        if (
            len(ctes_in_join) >= 2
            and ctes_in_join[0].args["branch_id"] == ctes_in_join[1].args["branch_id"]
        ):
            assert len(ctes_in_join) == 2
            _set_alias_name(id, ctes_in_join[0].alias_or_name)
            return

    # Newest CTEs are at the end; prefer the most recent match.
    for cte in reversed(expression_context.ctes):
        if normalized_id in (cte.args["branch_id"], cte.args["sequence_id"]):
            _set_alias_name(id, cte.alias_or_name)
            return
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _set_alias_name(id: exp.Identifier, name: str):
|
|
69
|
+
id.set("this", name)
|
|
70
|
+
id.set("quoted", False)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _ensure_expressions(values: t.List[NORMALIZE_INPUT]) -> t.List[exp.Expression]:
    """Coerce each input to a sqlglot expression.

    Strings are resolved through ``Column.ensure_col``, Columns contribute
    their wrapped expression, and sqlglot expressions pass through untouched.

    Raises:
        ValueError: if an item is none of the supported types.
    """
    return [_coerce_to_expression(value) for value in values]


def _coerce_to_expression(value: NORMALIZE_INPUT) -> exp.Expression:
    """Convert a single supported input to its sqlglot expression."""
    # Order matters: keep the original str -> Column -> Expression precedence.
    if isinstance(value, str):
        return Column.ensure_col(value).expression
    if isinstance(value, Column):
        return value.expression
    if isinstance(value, exp.Expression):
        return value
    raise ValueError(f"Got an invalid type to normalize: {type(value)}")
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# This code is based on code from Apache Spark under the license found in the LICENSE file located in the 'sqlframe' folder.
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import functools
|
|
6
|
+
import typing as t
|
|
7
|
+
from enum import IntEnum
|
|
8
|
+
|
|
9
|
+
if t.TYPE_CHECKING:
|
|
10
|
+
from sqlframe.base.dataframe import _BaseDataFrame
|
|
11
|
+
from sqlframe.base.group import _BaseGroupedData
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Operation(IntEnum):
    """Ordered stages of building a SELECT statement.

    The integer values follow the logical clause order, so the ``operation``
    and ``group_operation`` decorators can compare the previous stage with
    the next one and decide when the current state must first be frozen into
    a CTE.
    """

    INIT = -1  # DataFrame just created; no operation recorded yet.
    NO_OP = 0  # Operation that does not advance the clause ordering.
    FROM = 1
    WHERE = 2
    GROUP_BY = 3
    HAVING = 4
    SELECT = 5
    ORDER_BY = 6
    LIMIT = 7
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def operation(op: Operation) -> t.Callable[[t.Callable], t.Callable]:
    """Tag a DataFrame method with the SELECT-clause stage it performs.

    Because ``Operation`` values are ordered by clause position, the wrapper
    detects when the next call would move backwards relative to the previous
    one (or repeat a SELECT) and first converts the current state into a CTE
    so the new operation composes on top of it.

    Ex: After a user does a join we want to allow them to select which columns
    for the different tables that they want to carry through to the following
    operation. If we put that join in a CTE preemptively then the user would
    not have a chance to select which column they want in cases where there is
    overlap in names.
    """

    def decorator(func: t.Callable) -> t.Callable:
        @functools.wraps(func)
        def wrapper(self: _BaseDataFrame, *args, **kwargs) -> _BaseDataFrame:
            if self.last_op == Operation.INIT:
                # First operation on a fresh DataFrame: snapshot it as a CTE.
                self = self._convert_leaf_to_cte()
                self.last_op = Operation.NO_OP
            prev_op = self.last_op
            next_op = prev_op if op == Operation.NO_OP else op
            repeated_select = prev_op == next_op == Operation.SELECT
            if next_op < prev_op or repeated_select:
                self = self._convert_leaf_to_cte()
            result: t.Union[_BaseDataFrame, _BaseGroupedData] = func(self, *args, **kwargs)
            result.last_op = next_op  # type: ignore
            return result  # type: ignore

        wrapper.__wrapped__ = func  # type: ignore
        return wrapper

    return decorator
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def group_operation(op: Operation) -> t.Callable[[t.Callable], t.Callable]:
    """``operation`` variant for grouped-data methods.

    Same staging logic as ``operation``, except the tracked state lives on
    the grouped data's underlying DataFrame (``self._df``) rather than on
    ``self``.

    Ex: After a user does a join we want to allow them to select which columns
    for the different tables that they want to carry through to the following
    operation. If we put that join in a CTE preemptively then the user would
    not have a chance to select which column they want in cases where there is
    overlap in names.
    """

    def decorator(func: t.Callable) -> t.Callable:
        @functools.wraps(func)
        def wrapper(self: _BaseGroupedData, *args, **kwargs) -> _BaseDataFrame:
            if self._df.last_op == Operation.INIT:
                # First operation on a fresh DataFrame: snapshot it as a CTE.
                self._df = self._df._convert_leaf_to_cte()
                self._df.last_op = Operation.NO_OP
            prev_op = self._df.last_op
            next_op = prev_op if op == Operation.NO_OP else op
            repeated_select = prev_op == next_op == Operation.SELECT
            if next_op < prev_op or repeated_select:
                self._df = self._df._convert_leaf_to_cte()
            result: _BaseDataFrame = func(self, *args, **kwargs)
            result.last_op = next_op  # type: ignore
            return result

        wrapper.__wrapped__ = func  # type: ignore
        return wrapper

    return decorator
|