fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/cdc/base/configurator.py
@@ -0,0 +1,223 @@

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import List, Optional, Union

from pyspark.sql import DataFrame, SparkSession

from fabricks.cdc.base._types import AllowedSources
from fabricks.context import SPARK
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.metastore.database import Database
from fabricks.metastore.table import Table
from fabricks.utils._types import DataFrameLike


class Configurator(ABC):
    def __init__(
        self,
        database: str,
        *levels: str,
        change_data_capture: str,
        spark: Optional[SparkSession] = None,
    ):
        if spark is None:
            spark = SPARK
        assert spark is not None
        self.spark: SparkSession = spark

        self.database = Database(database)
        self.levels = levels
        self.change_data_capture = change_data_capture
        self.table = Table(self.database.name, *self.levels, spark=self.spark)

    @property
    def is_view(self):
        return self.table.is_view

    @property
    def registered(self):
        return self.table.registered

    @property
    def qualified_name(self):
        return f"{self.database}_{'_'.join(self.levels)}"

    @abstractmethod
    def get_query(self, src: AllowedSources, **kwargs) -> str: ...

    @abstractmethod
    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame: ...

    @abstractmethod
    def create_table(
        self,
        src: AllowedSources,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
        **kwargs,
    ): ...

    @abstractmethod
    def drop(self): ...

    @abstractmethod
    def create_or_replace_view(self, src: Union[Table, str], **kwargs): ...

    @property
    def allowed_input__columns(self) -> List[str]:
        cols = self.__columns

        if self.slowly_changing_dimension:
            if "__valid_from" in cols:
                cols.remove("__valid_from")
            if "__valid_to" in cols:
                cols.remove("__valid_to")
            if "__is_current" in cols:
                cols.remove("__is_current")
            if "__is_deleted" in cols:
                cols.remove("__is_deleted")

        return cols

    @property
    def allowed_ouput_leading__columns(self) -> List[str]:
        cols = [
            "__identity",
            "__source",
            "__key",
            "__timestamp",
            "__valid_from",
            "__valid_to",
            "__is_current",
            "__is_deleted",
        ]

        if self.change_data_capture == "scd1":
            cols.remove("__valid_from")
            cols.remove("__valid_to")
        elif self.change_data_capture == "scd2":
            cols.remove("__timestamp")

        return cols

    @property
    def allowed_output_trailing__columns(self) -> List[str]:
        cols = [
            "__operation",
            "__metadata",
            "__hash",
            "__rescued_data",
        ]

        if self.slowly_changing_dimension:
            cols.remove("__operation")

        return cols

    @property
    def __columns(self) -> List[str]:
        return [
            # Leading
            "__identity",
            "__source",
            "__key",
            "__timestamp",
            "__valid_from",
            "__valid_to",
            "__is_current",
            "__is_deleted",
            # Trailing
            "__operation",
            "__metadata",
            "__hash",
            "__rescued_data",
        ]

    @property
    def slowly_changing_dimension(self) -> bool:
        return self.change_data_capture in ["scd1", "scd2"]

    def get_src(self, src: AllowedSources) -> DataFrame:
        if isinstance(src, DataFrameLike):
            df = src
        elif isinstance(src, Table):
            df = self.table.dataframe
        elif isinstance(src, str):
            df = self.spark.sql(src)
        else:
            raise ValueError(f"{src} not allowed")

        return df

    def has_data(self, src: AllowedSources, **kwargs) -> bool:
        DEFAULT_LOGGER.debug("check if has data", extra={"label": self})
        df = self.get_src(src=src)
        return not df.isEmpty()

    def get_columns(
        self,
        src: AllowedSources,
        backtick: Optional[bool] = True,
        sort: Optional[bool] = True,
        check: Optional[bool] = True,
    ) -> List[str]:
        if backtick:
            backtick = True

        df = self.get_src(src=src)
        columns = df.columns

        if check:
            for c in columns:
                # avoid duplicate column issue in merge
                if c.startswith("__") and c in self.__columns:
                    assert c in self.allowed_input__columns, f"{c} is not allowed"

        if sort:
            columns = self.sort_columns(columns)

        if backtick:
            return [f"`{c}`" for c in columns]
        else:
            return columns

    def sort_columns(self, columns: List[str]) -> List[str]:
        fields = [c for c in columns if not c.startswith("__")]

        leading = self.allowed_ouput_leading__columns
        trailing = self.allowed_output_trailing__columns

        # move __hash to the front of the table to ensure statistics are present
        if "__key" not in columns and "__hash" in columns:
            leading = ["__hash" if c == "__key" else c for c in leading]
            trailing = [c for c in trailing if c != "__hash"]

        __leading = [c for c in leading if c in columns]
        __trailing = [c for c in trailing if c in columns]

        return __leading + fields + __trailing

    def reorder_dataframe(self, df: DataFrame) -> DataFrame:
        columns = self.sort_columns(df.columns)
        columns = [f"`{c}`" for c in columns]
        return df.select(columns)

    @abstractmethod
    def optimize_table(self): ...

    @abstractmethod
    def update_schema(self, src: AllowedSources, **kwargs): ...

    @abstractmethod
    def get_differences_with_deltatable(self, src: AllowedSources, **kwargs): ...

    @abstractmethod
    def overwrite_schema(self, src: AllowedSources): ...

    def __str__(self):
        return f"{self.table.qualified_name}"
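The column-ordering contract above is easiest to see in isolation: business fields keep their incoming order while the `__`-prefixed system columns are pinned to the front and back of the table. The sketch below is a simplified, standalone stand-in for `sort_columns` (it ignores the scd1/scd2 pruning and the `__hash` promotion rule, and the sample column names are made up), not the fabricks API itself.

from typing import List

LEADING = ["__identity", "__source", "__key", "__timestamp",
           "__valid_from", "__valid_to", "__is_current", "__is_deleted"]
TRAILING = ["__operation", "__metadata", "__hash", "__rescued_data"]


def sort_columns(columns: List[str]) -> List[str]:
    # business fields keep their incoming order; system columns are pinned
    fields = [c for c in columns if not c.startswith("__")]
    leading = [c for c in LEADING if c in columns]
    trailing = [c for c in TRAILING if c in columns]
    return leading + fields + trailing


# hypothetical scd2-style input: business columns mixed with system columns
print(sort_columns(["name", "__hash", "__key", "amount", "__valid_from", "__is_current"]))
# ['__key', '__valid_from', '__is_current', 'name', 'amount', '__hash']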
fabricks/cdc/base/generator.py
@@ -0,0 +1,177 @@

from __future__ import annotations

from typing import Any, List, Optional, Sequence, Union, cast

from py4j.protocol import Py4JJavaError
from pyspark.sql import DataFrame

from fabricks.cdc.base._types import AllowedSources
from fabricks.cdc.base.configurator import Configurator
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.metastore.table import SchemaDiff, Table
from fabricks.utils._types import DataFrameLike
from fabricks.utils.sqlglot import fix as fix_sql


class Generator(Configurator):
    def drop(self):
        self.table.drop()

    def create_table(
        self,
        src: AllowedSources,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
        masks: Optional[dict[str, str]] = None,
        primary_key: Optional[dict[str, Any]] = None,
        foreign_keys: Optional[dict[str, Any]] = None,
        comments: Optional[dict[str, str]] = None,
        **kwargs,
    ):
        kwargs["mode"] = "complete"
        kwargs["slice"] = False
        kwargs["rectify"] = False
        kwargs["deduplicate"] = False

        df = self.get_data(src, **kwargs)

        if partitioning is True:
            assert partition_by, "partitioning column(s) not found"

        df = self.reorder_dataframe(df)

        identity = False if identity is None else identity
        liquid_clustering = False if liquid_clustering is None else liquid_clustering

        self.table.create(
            df=df,
            partitioning=partitioning,
            partition_by=partition_by,
            identity=identity,
            liquid_clustering=liquid_clustering,
            cluster_by=cluster_by,
            properties=properties,
            masks=masks,
            primary_key=primary_key,
            foreign_keys=foreign_keys,
            comments=comments,
        )

    def create_or_replace_view(self, src: Union[Table, str], schema_evolution: bool = True, **kwargs):
        assert not isinstance(src, DataFrameLike), "dataframe not allowed"

        assert kwargs["mode"] == "complete", f"{kwargs['mode']} not allowed"
        sql = self.get_query(src, **kwargs)

        df = self.spark.sql(sql)
        df = self.reorder_dataframe(df)
        columns = [f"`{c}`" for c in df.columns]

        sql = f"""
        create or replace view {self}
        {"with schema evolution" if schema_evolution else "-- no schema evolution"}
        as
        with __view as (
            {sql}
        )
        select
            {",".join(columns)}
        from __view
        """
        sql = fix_sql(sql)
        DEFAULT_LOGGER.debug("create or replace view", extra={"label": self, "sql": sql})

        try:
            self.spark.sql(sql)
        except Py4JJavaError as e:
            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "sql": sql}, exc_info=e)

    def optimize_table(self):
        columns = None

        if self.change_data_capture == "scd1":
            columns = ["__key"]
        elif self.change_data_capture == "scd2":
            columns = ["__key", "__valid_from"]

        self.table.optimize(columns=columns)

    def get_differences_with_deltatable(self, src: AllowedSources, **kwargs) -> DataFrame:
        from pyspark.sql.types import StringType, StructField, StructType

        schema = StructType(
            [
                StructField("column", StringType(), False),
                StructField("data_type", StringType(), True),
                StructField("new_column", StringType(), True),
                StructField("new_data_type", StringType(), True),
                StructField("status", StringType(), True),
            ]
        )

        if self.is_view:
            return self.spark.createDataFrame([], schema=schema)

        else:
            kwargs["mode"] = "complete"
            if "slice" in kwargs:
                del kwargs["slice"]

            df = self.get_data(src, **kwargs)
            df = self.reorder_dataframe(df)

            diffs = self.table.get_schema_differences(df)
            return self.spark.createDataFrame([cast(Any, d.model_dump()) for d in diffs], schema=schema)

    def get_schema_differences(self, src: AllowedSources, **kwargs) -> Optional[Sequence[SchemaDiff]]:
        if self.is_view:
            return None

        else:
            kwargs["mode"] = "complete"
            if "slice" in kwargs:
                del kwargs["slice"]

            df = self.get_data(src, **kwargs)
            df = self.reorder_dataframe(df)

            return self.table.get_schema_differences(df)

    def schema_drifted(self, src: AllowedSources, **kwargs) -> Optional[bool]:
        d = self.get_schema_differences(src, **kwargs)
        if d is None:
            return None
        return len(d) > 0

    def _update_schema(
        self,
        src: AllowedSources,
        overwrite: bool = False,
        widen_types: bool = False,
        **kwargs,
    ):
        if self.is_view:
            assert not isinstance(src, DataFrameLike), "dataframe not allowed"
            self.create_or_replace_view(src=src)

        else:
            kwargs["mode"] = "complete"
            if "slice" in kwargs:
                del kwargs["slice"]

            df = self.get_data(src, **kwargs)
            df = self.reorder_dataframe(df)
            if overwrite:
                self.table.overwrite_schema(df)
            else:
                self.table.update_schema(df, widen_types=widen_types)

    def update_schema(self, src: AllowedSources, **kwargs):
        self._update_schema(src=src, **kwargs)

    def overwrite_schema(self, src: AllowedSources, **kwargs):
        self._update_schema(src=src, overwrite=True, **kwargs)
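`create_or_replace_view` wraps the generated CDC query in a CTE and re-selects an explicit, backticked column list, optionally adding `with schema evolution`. As a rough standalone illustration of that string construction only (the Spark execution, `fix_sql` cleanup, and fabricks Table plumbing are omitted; the view name, inner query, and columns below are hypothetical):

def build_view_sql(name: str, inner_sql: str, columns: list[str], schema_evolution: bool = True) -> str:
    # wrap the inner query in a CTE and re-select an explicit, backticked column list
    cols = ", ".join(f"`{c}`" for c in columns)
    evolution = "with schema evolution" if schema_evolution else "-- no schema evolution"
    return (
        f"create or replace view {name}\n"
        f"{evolution}\n"
        "as\n"
        "with __view as (\n"
        f"  {inner_sql}\n"
        ")\n"
        f"select {cols}\n"
        "from __view"
    )


# hypothetical usage
print(build_view_sql("gold.fact_orders__current", "select * from silver.orders", ["__key", "order_id", "amount"]))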
fabricks/cdc/base/merger.py
@@ -0,0 +1,110 @@

from __future__ import annotations

from typing import Optional, Union

from jinja2 import Environment, PackageLoader
from pyspark.sql import DataFrame

from fabricks.cdc.base._types import AllowedSources
from fabricks.cdc.base.processor import Processor
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.metastore.view import create_or_replace_global_temp_view
from fabricks.utils._types import DataFrameLike
from fabricks.utils.sqlglot import fix as fix_sql


class Merger(Processor):
    def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
        if isinstance(src, DataFrameLike):
            format = "dataframe"
            columns = self.get_columns(src, backtick=False, sort=False, check=False)  # already done in processor
        elif isinstance(src, str):
            format = "view"
            columns = self.get_columns(
                f"select * from {src}", backtick=False, sort=False, check=False
            )  # already done in processor
        else:
            raise ValueError(f"{src} not allowed")

        assert "__merge_key" in columns, "__merge_key not found"
        assert "__merge_condition" in columns, "__merge_condition not found"

        keys = kwargs.get("keys")
        if isinstance(keys, str):
            keys = [keys]

        columns = [c for c in columns if c not in ["__merge_condition", "__merge_key"]]
        fields = [c for c in columns if not c.startswith("__")]
        where = kwargs.get("update_where") if self.table.rows > 0 else None
        soft_delete = "__is_deleted" in columns

        has_source = "__source" in columns
        has_key = "__key" in columns
        has_metadata = "__metadata" in columns
        has_hash = "__hash" in columns
        has_timestamp = "__timestamp" in columns
        has_identity = "__identity" in columns

        # 'NoneType' object is not iterable
        if keys:
            keys = [f"`{k}`" for k in keys]
        if columns:
            columns = [f"`{c}`" for c in columns]
        if fields:
            fields = [f"`{c}`" for c in fields]

        assert "__key" or keys, f"{self} - __key or keys not found"

        return {
            "src": src,
            "format": format,
            "tgt": self.table,
            "cdc": self.change_data_capture,
            "columns": columns,
            "fields": fields,
            "soft_delete": soft_delete,
            "has_source": has_source,
            "has_identity": has_identity,
            "has_key": has_key,
            "has_hash": has_hash,
            "keys": keys,
            "has_metadata": has_metadata,
            "has_timestamp": has_timestamp,
            "where": where,
        }

    def get_merge_query(self, src: Union[DataFrame, str], fix: Optional[bool] = True, **kwargs) -> str:
        context = self.get_merge_context(src=src, **kwargs)
        environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
        merge = environment.get_template("merge.sql.jinja")

        try:
            sql = merge.render(**context)
        except Exception as e:
            DEFAULT_LOGGER.debug("context", extra={"label": self, "content": context})
            raise e

        if fix:
            try:
                sql = sql.replace("{src}", "src")
                sql = fix_sql(sql)
                sql = sql.replace("`src`", "{src}")
                DEFAULT_LOGGER.debug("merge", extra={"label": self, "sql": sql})

            except Exception as e:
                DEFAULT_LOGGER.exception("fail to clean sql query", extra={"label": self, "sql": sql})
                raise e

        return sql

    def merge(self, src: AllowedSources, **kwargs):
        if not self.table.exists():
            self.create_table(src, **kwargs)

        df = self.get_data(src, **kwargs)
        global_temp_view = f"{self.qualified_name}__merge"
        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False), job=self)

        merge = self.get_merge_query(view, **kwargs)
        DEFAULT_LOGGER.debug("exec merge", extra={"label": self, "sql": merge})
        self.spark.sql(merge, src=view)
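The merge itself is template-driven: `get_merge_context` collects column lists and feature flags, and the packaged `merge.sql.jinja` turns them into a MERGE statement in which `{src}` is left as a placeholder to be bound when the query is executed against the staged global temp view. A minimal, self-contained sketch of that pattern is below; it uses an inline toy template rather than the real `merge.sql.jinja`, and the target table and columns are hypothetical.

from jinja2 import Template

# toy stand-in for fabricks/cdc/templates/merge.sql.jinja
TEMPLATE = Template(
    """
merge into {{ tgt }} as t
using {src} as s
on t.__key = s.__key
when matched and s.__merge_condition = 'update' then
  update set {% for c in fields %}t.{{ c }} = s.{{ c }}{% if not loop.last %}, {% endif %}{% endfor %}
when not matched and s.__merge_condition = 'insert' then
  insert ({{ columns | join(", ") }})
  values ({% for c in columns %}s.{{ c }}{% if not loop.last %}, {% endif %}{% endfor %})
""".strip()
)

context = {
    "tgt": "gold.fact_orders",  # hypothetical target table
    "columns": ["`__key`", "`order_id`", "`amount`"],
    "fields": ["`order_id`", "`amount`"],
}

# {src} survives rendering untouched, mirroring get_merge_query, and would be
# bound to the staged temp view at execution time
print(TEMPLATE.render(**context))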