fabricks-3.0.11-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/gold.py
@@ -0,0 +1,415 @@
import re
from collections.abc import Sequence
from typing import List, Optional, Union, cast

from pyspark.sql import DataFrame
from pyspark.sql.types import Row
from typing_extensions import deprecated

from fabricks.cdc.nocdc import NoCDC
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base._types import JobDependency, TGold
from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.udfs import is_registered, register_udf
from fabricks.metastore.view import create_or_replace_global_temp_view
from fabricks.utils.path import Path
from fabricks.utils.sqlglot import fix, get_tables


class Gold(BaseJob):
    def __init__(
        self,
        step: TGold,
        topic: Optional[str] = None,
        item: Optional[str] = None,
        job_id: Optional[str] = None,
        conf: Optional[Union[dict, Row]] = None,
    ):  # type: ignore
        super().__init__(
            "gold",
            step=step,
            topic=topic,
            item=item,
            job_id=job_id,
            conf=conf,
        )

    _sql: Optional[str] = None
    _sql_path: Optional[Path] = None
    _schema_drift: Optional[bool] = None

    @classmethod
    def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TGold, step), job_id=job_id)

    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TGold, step), topic=topic, item=item)

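Both classmethods accept a conf keyword but do not forward it to __init__, so configuration is resolved from the runtime either way. A minimal usage sketch, assuming a configured fabricks runtime (the step/topic/item values below are made up):

# hypothetical coordinates; requires an active fabricks runtime and Spark session
job = Gold.from_step_topic_item(step="gold", topic="sales", item="orders")
same_job = Gold.from_job_id(step="gold", job_id=job.job_id)
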
    @property
    def stream(self) -> bool:
        return False

    @property
    def schema_drift(self) -> bool:
        if not self._schema_drift:
            _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
            assert _schema_drift is not None
            self._schema_drift = cast(bool, _schema_drift)
        return self._schema_drift

    @property
    def persist(self) -> bool:
        return self.mode in ["update", "append", "complete"]

    @property
    def virtual(self) -> bool:
        return self.mode in ["memory"]

    @property
    def sql(self) -> str:
        sql = self.paths.runtime.get_sql()
        return fix(sql, keep_comments=False)

    @deprecated("use sql instead")
    def get_sql(self) -> str:
        return self.sql

    def get_udfs(self) -> List[str]:
        # udf not allowed in invoke
        if self.mode == "invoke":
            return []

        # udf not allowed in notebook
        elif self.options.job.get("notebook"):
            return []

        # udf not allowed in table
        elif self.options.job.get("table"):
            return []

        else:
            matches = []
            if "udf_" in self.sql:
                r = re.compile(r"(?<=udf_)\w*(?=\()")
                matches = re.findall(r, self.sql)
                matches = set(matches)
                matches = list(matches)
            return matches

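The lookbehind/lookahead pair pulls bare UDF names out of the job's SQL. The same pattern checked in isolation (the sample query is invented):

import re

pattern = re.compile(r"(?<=udf_)\w*(?=\()")
sample = "select udf_clean(name), udf_to_date(ts) from gold.sales_orders"
print(sorted(set(pattern.findall(sample))))  # ['clean', 'to_date']
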
    def register_udfs(self):
        for u in self.get_udfs():
            if not is_registered(u):
                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"label": self})
                register_udf(udf=u, spark=self.spark)

    def base_transform(self, df: DataFrame) -> DataFrame:
        df = df.transform(self.extend)
        return df

    def get_data(
        self,
        stream: bool = False,
        transform: Optional[bool] = False,
        schema_only: Optional[bool] = False,
        **kwargs,
    ) -> DataFrame:
        if self.options.job.get_boolean("requirements"):
            import sys

            sys.path.append("/dbfs/mnt/fabricks/site-packages")

        if self.mode == "invoke":
            df = self.spark.createDataFrame([{}])  # type: ignore

        elif self.options.job.get("notebook"):
            invokers = self.options.invokers.get_list("run")
            assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"

            global_temp_view = self.invoke(path=self.paths.runtime, schema_only=schema_only, **kwargs)
            assert global_temp_view is not None, "global_temp_view not found"

            df = self.spark.sql(f"select * from global_temp.{global_temp_view}")

        elif self.options.job.get("table"):
            table = self.options.job.get("table")
            df = self.spark.read.table(table)  # type: ignore

        else:
            assert self.sql, "sql not found"
            self.register_udfs()
            df = self.spark.sql(self.sql)

        if transform:
            df = self.base_transform(df)

        if schema_only:
            df = df.where("1 == 2")

        return df

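The schema_only branch keeps the query's schema while guaranteeing an empty result by filtering on a predicate that is never true. The same trick in isolation, assuming an active SparkSession named spark:

# a false predicate preserves the schema but drops every row
df = spark.createDataFrame([(1, "a")], "id int, name string")
empty = df.where("1 == 2")
empty.printSchema()   # id and name still present
print(empty.count())  # 0
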
    def create_or_replace_view(self):
        assert self.mode == "memory", f"{self.mode} not allowed"

        df = self.spark.sql(self.sql)
        cdc_options = self.get_cdc_context(df)
        self.cdc.create_or_replace_view(self.sql, **cdc_options)

    def get_dependencies(self) -> Sequence[JobDependency]:
        data = []
        parents = self.options.job.get_list("parents") or []

        if self.mode == "invoke":
            dependencies = []
        elif self.options.job.get("notebook"):
            dependencies = self._get_notebook_dependencies()
        else:
            dependencies = self._get_sql_dependencies()

        dependencies = [d for d in dependencies if d not in parents]
        dependencies = [d.replace("__current", "") for d in dependencies]
        dependencies = list(set(dependencies))

        for d in dependencies:
            data.append(JobDependency.from_parts(self.job_id, d, "parser"))

        for p in parents:
            data.append(JobDependency.from_parts(self.job_id, p, "job"))
        return data

    def _get_sql_dependencies(self) -> List[str]:
        from fabricks.core.jobs.base._types import Steps

        steps = [str(s) for s in Steps]
        return get_tables(self.sql, allowed_databases=steps)

    def _get_notebook_dependencies(self) -> List[str]:
        import re

        from fabricks.context import CATALOG

        dependencies = []
        df = self.get_data(stream=self.stream)

        if df is not None:
            explain_plan = self.spark.sql("explain extended select * from {df}", df=df).collect()[0][0]

            if CATALOG is None:
                r = re.compile(r"(?<=SubqueryAlias spark_catalog\.)[^.]*\.[^.\n]*")
            else:
                r = re.compile(rf"(?:(?<=SubqueryAlias spark_catalog\.)|(?<=SubqueryAlias {CATALOG}\.))[^.]*\.[^.\n]*")

            matches = re.findall(r, explain_plan)
            dependencies = list(set(matches))

        return dependencies

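Notebook jobs have no SQL text to parse, so dependencies are scraped from the analyzed plan instead. The catalog-less pattern applied to a fabricated plan fragment:

import re

plan = "SubqueryAlias spark_catalog.silver.sales__orders__current"
r = re.compile(r"(?<=SubqueryAlias spark_catalog\.)[^.]*\.[^.\n]*")
print(re.findall(r, plan))  # ['silver.sales__orders__current']
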
    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
        # assume no duplicate in gold (to improve performance)
        deduplicate = self.options.job.get_boolean("deduplicate", None)
        # assume no reload in gold (to improve performance)
        rectify = self.options.job.get_boolean("rectify_as_upserts", None)

        add_metadata = self.options.job.get_boolean("metadata", None)
        if add_metadata is None:
            add_metadata = self.step_conf.get("options", {}).get("metadata", False)

        context = {
            "add_metadata": add_metadata,
            "soft_delete": True if self.slowly_changing_dimension else None,
            "deduplicate_key": None,
            "deduplicate_hash": True if self.slowly_changing_dimension else None,
            "deduplicate": False,
            "rectify": False,
        }

        # force deduplicate
        if deduplicate is not None:
            context["deduplicate"] = deduplicate
            context["deduplicate_key"] = deduplicate
            context["deduplicate_hash"] = deduplicate

        # force rectify
        if rectify is not None:
            context["rectify"] = rectify

        # add key and hash when needed
        if self.mode == "update" and self.change_data_capture == "nocdc":
            if "__key" not in df.columns:
                context["add_key"] = True
            if "__hash" not in df.columns:
                context["add_hash"] = True

        # add key and hash when needed
        if self.slowly_changing_dimension:
            if "__key" not in df.columns:
                context["add_key"] = True
            if "__hash" not in df.columns:
                context["add_hash"] = True

        if self.slowly_changing_dimension:
            if "__operation" not in df.columns:
                # assume no duplicate hash
                if deduplicate is None:
                    context["deduplicate_hash"] = None

                if self.mode == "update":
                    context["add_operation"] = "reload"
                    if rectify is None:
                        context["rectify"] = True

                else:
                    context["add_operation"] = "upsert"

        # filter to get latest data
        if not reload:
            if self.mode == "update" and self.change_data_capture == "scd2":
                context["slice"] = "update"

            if self.mode == "update" and self.change_data_capture == "nocdc" and "__timestamp" in df.columns:
                context["slice"] = "update"

            if self.mode == "append" and "__timestamp" in df.columns:
                context["slice"] = "update"

        if self.mode == "memory":
            context["mode"] = "complete"

        # correct __valid_from
        if self.change_data_capture == "scd2":
            context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)

        # add __timestamp
        if self.options.job.get_boolean("persist_last_timestamp"):
            if self.change_data_capture == "scd1":
                if "__timestamp" not in df.columns:
                    context["add_timestamp"] = True
            if self.change_data_capture == "scd2":
                if "__valid_from" not in df.columns:
                    context["add_timestamp"] = True

        if "__order_duplicate_by_asc" in df.columns:
            context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
        elif "__order_duplicate_by_desc" in df.columns:
            context["order_duplicate_by"] = {"__order_duplicate_by_desc": "desc"}

        return context

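For orientation, a rough trace of the defaults, assuming slowly_changing_dimension is truthy for scd1/scd2 jobs and the step options leave metadata unset: an scd2 job in update mode whose DataFrame lacks __key, __hash and __operation, with no deduplicate/rectify overrides and no forced reload, would come out roughly as:

# hypothetical trace, not produced by the library itself
context = {
    "add_metadata": False,     # falls back to the step-level default
    "soft_delete": True,
    "deduplicate_key": None,
    "deduplicate_hash": None,  # reset: no __operation column, no override
    "deduplicate": False,
    "rectify": True,           # missing __operation in update mode
    "add_key": True,
    "add_hash": True,
    "add_operation": "reload",
    "slice": "update",         # incremental read since reload is not forced
    "correct_valid_from": True,
}
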
    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
        assert self.persist, f"{self.mode} not allowed"

        reload = kwargs.get("reload")
        context = self.get_cdc_context(df=df, reload=reload)

        # if dataframe, reference is passed (BUG)
        name = f"{self.step}_{self.topic}_{self.item}"
        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
        sql = f"select * from {global_temp_view}"

        check_df = self.spark.sql(sql)
        if check_df.isEmpty():
            DEFAULT_LOGGER.warning("no data", extra={"label": self})
            return

        if reload:
            DEFAULT_LOGGER.warning("force reload", extra={"label": self})
            self.cdc.complete(sql, **context)

        elif self.mode == "update":
            self.cdc.update(sql, **context)

        elif self.mode == "append":
            assert isinstance(self.cdc, NoCDC), f"{self.change_data_capture} append not allowed"
            self.cdc.append(sql, **context)

        elif self.mode == "complete":
            self.cdc.complete(sql, **context)

        else:
            raise ValueError(f"{self.mode} - not allowed")

        self.check_duplicate_key()
        self.check_duplicate_hash()
        self.check_duplicate_identity()

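Per the (BUG) note above, the batch is first materialized behind a global temp view and re-read through SQL, seemingly so the CDC merge works from a stable reference rather than the incoming DataFrame object. A bare-Spark sketch of the same indirection (view name made up):

df.createOrReplaceGlobalTempView("gold_sales_orders")
stable = spark.sql("select * from global_temp.gold_sales_orders")
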
    def for_each_run(self, **kwargs):
        last_version = None
        if self.options.job.get_boolean("persist_last_timestamp"):
            last_version = self.table.get_last_version()

        if self.mode == "invoke":
            schedule = kwargs.get("schedule", None)
            self.invoke(schedule=schedule)
        else:
            super().for_each_run(**kwargs)

        if self.options.job.get_boolean("persist_last_timestamp"):
            self._update_last_timestamp(last_version=last_version)

    def create(self):
        if self.mode == "invoke":
            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
        else:
            self.register_udfs()
            super().create()
            if self.options.job.get_boolean("persist_last_timestamp"):
                self._update_last_timestamp(create=True)

    def register(self):
        if self.options.job.get_boolean("persist_last_timestamp"):
            self.cdc_last_timestamp.table.register()

        if self.mode == "invoke":
            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
        else:
            super().register()

    def drop(self):
        if self.options.job.get_boolean("persist_last_timestamp"):
            self.cdc_last_timestamp.drop()

        super().drop()

    @property
    def cdc_last_timestamp(self) -> NoCDC:
        assert self.mode == "update", "persist_last_timestamp only allowed in update"
        assert self.change_data_capture in ["scd1", "scd2"], "persist_last_timestamp only allowed in scd1 or scd2"

        cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
        return cdc

    def _update_last_timestamp(self, last_version: Optional[int] = None, create: bool = False):
        df = self.spark.sql(f"select * from {self} limit 1")

        fields = []
        if self.change_data_capture == "scd1":
            fields.append("max(__timestamp) :: timestamp as __timestamp")
        elif self.change_data_capture == "scd2":
            fields.append("max(__valid_from) :: timestamp as __timestamp")
        if "__source" in df.columns:
            fields.append("__source")

        asof = None
        if last_version is not None:
            asof = f"version as of {last_version}"

        sql = f"select {', '.join(fields)} from {self} {asof} group by all"
        df = self.spark.sql(sql)

        if create:
            self.cdc_last_timestamp.table.create(df)
        else:
            self.cdc_last_timestamp.overwrite(df)

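The helper pins the source read to the Delta version captured before the run, via Delta time travel. A hypothetical rendering for last_version=42 on an scd2 job whose table carries __source, assuming str(self) resolves to the qualified table name:

# what the f-string would produce under those assumptions
sql = "select max(__valid_from) :: timestamp as __timestamp, __source from gold.sales_orders version as of 42 group by all"

Worth noting: when last_version is None, the f-string interpolates the literal text None into the statement, so the non-time-travel path looks fragile as written.
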
    def overwrite(self, schedule: Optional[str] = None):
        if self.mode == "invoke":
            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
            return

        elif self.mode == "memory":
            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"label": self})
            self.create_or_replace_view()
            return

        self.overwrite_schema()
        self.run(reload=True, schedule=schedule)
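
Taken together, a plausible lifecycle for a persisted gold job (names hypothetical; assumes a configured fabricks runtime and Spark session):

job = Gold.from_step_topic_item(step="gold", topic="sales", item="orders")
job.create()                       # table, plus the optional __last_timestamp companion
job.run()                          # incremental update through the CDC merge
job.overwrite(schedule="nightly")  # full reload: overwrite_schema() then run(reload=True)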