fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
@@ -0,0 +1,415 @@
+import re
+from collections.abc import Sequence
+from typing import List, Optional, Union, cast
+
+from pyspark.sql import DataFrame
+from pyspark.sql.types import Row
+from typing_extensions import deprecated
+
+from fabricks.cdc.nocdc import NoCDC
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.jobs.base._types import JobDependency, TGold
+from fabricks.core.jobs.base.job import BaseJob
+from fabricks.core.udfs import is_registered, register_udf
+from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils.path import Path
+from fabricks.utils.sqlglot import fix, get_tables
+
+
+class Gold(BaseJob):
+    def __init__(
+        self,
+        step: TGold,
+        topic: Optional[str] = None,
+        item: Optional[str] = None,
+        job_id: Optional[str] = None,
+        conf: Optional[Union[dict, Row]] = None,
+    ):  # type: ignore
+        super().__init__(
+            "gold",
+            step=step,
+            topic=topic,
+            item=item,
+            job_id=job_id,
+            conf=conf,
+        )
+
+    _sql: Optional[str] = None
+    _sql_path: Optional[Path] = None
+    _schema_drift: Optional[bool] = None
+
+    @classmethod
+    def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
+        return cls(step=cast(TGold, step), job_id=job_id)
+
+    @classmethod
+    def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
+        return cls(step=cast(TGold, step), topic=topic, item=item)
+
+    @property
+    def stream(self) -> bool:
+        return False
+
+    @property
+    def schema_drift(self) -> bool:
+        if not self._schema_drift:
+            _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
+            assert _schema_drift is not None
+            self._schema_drift = cast(bool, _schema_drift)
+        return self._schema_drift
+
+    @property
+    def persist(self) -> bool:
+        return self.mode in ["update", "append", "complete"]
+
+    @property
+    def virtual(self) -> bool:
+        return self.mode in ["memory"]
+
+    @property
+    def sql(self) -> str:
+        sql = self.paths.runtime.get_sql()
+        return fix(sql, keep_comments=False)
+
+    @deprecated("use sql instead")
+    def get_sql(self) -> str:
+        return self.sql
+
+    def get_udfs(self) -> List[str]:
+        # udf not allowed in invoke
+        if self.mode == "invoke":
+            return []
+
+        # udf not allowed in notebook
+        elif self.options.job.get("notebook"):
+            return []
+
+        # udf not allowed in table
+        elif self.options.job.get("table"):
+            return []
+
+        else:
+            matches = []
+            if "udf_" in self.sql:
+                r = re.compile(r"(?<=udf_)\w*(?=\()")
+                matches = re.findall(r, self.sql)
+                matches = set(matches)
+                matches = list(matches)
+            return matches
+
+    def register_udfs(self):
+        for u in self.get_udfs():
+            if not is_registered(u):
+                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"label": self})
+                register_udf(udf=u, spark=self.spark)
+
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extend)
+        return df
+
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> DataFrame:
+        if self.options.job.get_boolean("requirements"):
+            import sys
+
+            sys.path.append("/dbfs/mnt/fabricks/site-packages")
+
+        if self.mode == "invoke":
+            df = self.spark.createDataFrame([{}])  # type: ignore
+
+        elif self.options.job.get("notebook"):
+            invokers = self.options.invokers.get_list("run")
+            assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"
+
+            global_temp_view = self.invoke(path=self.paths.runtime, schema_only=schema_only, **kwargs)
+            assert global_temp_view is not None, "global_temp_view not found"
+
+            df = self.spark.sql(f"select * from global_temp.{global_temp_view}")
+
+        elif self.options.job.get("table"):
+            table = self.options.job.get("table")
+            df = self.spark.read.table(table)  # type: ignore
+
+        else:
+            assert self.sql, "sql not found"
+            self.register_udfs()
+            df = self.spark.sql(self.sql)
+
+        if transform:
+            df = self.base_transform(df)
+
+        if schema_only:
+            df = df.where("1 == 2")
+
+        return df
+
+    def create_or_replace_view(self):
+        assert self.mode == "memory", f"{self.mode} not allowed"
+
+        df = self.spark.sql(self.sql)
+        cdc_options = self.get_cdc_context(df)
+        self.cdc.create_or_replace_view(self.sql, **cdc_options)
+
+    def get_dependencies(self) -> Sequence[JobDependency]:
+        data = []
+        parents = self.options.job.get_list("parents") or []
+
+        if self.mode == "invoke":
+            dependencies = []
+        elif self.options.job.get("notebook"):
+            dependencies = self._get_notebook_dependencies()
+        else:
+            dependencies = self._get_sql_dependencies()
+
+        dependencies = [d for d in dependencies if d not in parents]
+        dependencies = [d.replace("__current", "") for d in dependencies]
+        dependencies = list(set(dependencies))
+
+        for d in dependencies:
+            data.append(JobDependency.from_parts(self.job_id, d, "parser"))
+
+        for p in parents:
+            data.append(JobDependency.from_parts(self.job_id, p, "job"))
+        return data
+
+    def _get_sql_dependencies(self) -> List[str]:
+        from fabricks.core.jobs.base._types import Steps
+
+        steps = [str(s) for s in Steps]
+        return get_tables(self.sql, allowed_databases=steps)
+
+    def _get_notebook_dependencies(self) -> List[str]:
+        import re
+
+        from fabricks.context import CATALOG
+
+        dependencies = []
+        df = self.get_data(stream=self.stream)
+
+        if df is not None:
+            explain_plan = self.spark.sql("explain extended select * from {df}", df=df).collect()[0][0]
+
+            if CATALOG is None:
+                r = re.compile(r"(?<=SubqueryAlias spark_catalog\.)[^.]*\.[^.\n]*")
+            else:
+                r = re.compile(rf"(?:(?<=SubqueryAlias spark_catalog\.)|(?<=SubqueryAlias {CATALOG}\.))[^.]*\.[^.\n]*")
+
+            matches = re.findall(r, explain_plan)
+            dependencies = list(set(matches))
+
+        return dependencies
+
+    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
+        # assume no duplicate in gold (to improve performance)
+        deduplicate = self.options.job.get_boolean("deduplicate", None)
+        # assume no reload in gold (to improve performance)
+        rectify = self.options.job.get_boolean("rectify_as_upserts", None)
+
+        add_metadata = self.options.job.get_boolean("metadata", None)
+        if add_metadata is None:
+            add_metadata = self.step_conf.get("options", {}).get("metadata", False)
+
+        context = {
+            "add_metadata": add_metadata,
+            "soft_delete": True if self.slowly_changing_dimension else None,
+            "deduplicate_key": None,
+            "deduplicate_hash": True if self.slowly_changing_dimension else None,
+            "deduplicate": False,
+            "rectify": False,
+        }
+
+        # force deduplicate
+        if deduplicate is not None:
+            context["deduplicate"] = deduplicate
+            context["deduplicate_key"] = deduplicate
+            context["deduplicate_hash"] = deduplicate
+
+        # force rectify
+        if rectify is not None:
+            context["rectify"] = rectify
+
+        # add key and hash when needed
+        if self.mode == "update" and self.change_data_capture == "nocdc":
+            if "__key" not in df.columns:
+                context["add_key"] = True
+            if "__hash" not in df.columns:
+                context["add_hash"] = True
+
+        # add key and hash when needed
+        if self.slowly_changing_dimension:
+            if "__key" not in df.columns:
+                context["add_key"] = True
+            if "__hash" not in df.columns:
+                context["add_hash"] = True
+
+        if self.slowly_changing_dimension:
+            if "__operation" not in df.columns:
+                # assume no duplicate hash
+                if deduplicate is None:
+                    context["deduplicate_hash"] = None
+
+                if self.mode == "update":
+                    context["add_operation"] = "reload"
+                    if rectify is None:
+                        context["rectify"] = True
+
+                else:
+                    context["add_operation"] = "upsert"
+
+        # filter to get latest data
+        if not reload:
+            if self.mode == "update" and self.change_data_capture == "scd2":
+                context["slice"] = "update"
+
+            if self.mode == "update" and self.change_data_capture == "nocdc" and "__timestamp" in df.columns:
+                context["slice"] = "update"
+
+            if self.mode == "append" and "__timestamp" in df.columns:
+                context["slice"] = "update"
+
+        if self.mode == "memory":
+            context["mode"] = "complete"
+
+        # correct __valid_from
+        if self.change_data_capture == "scd2":
+            context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)
+
+        # add __timestamp
+        if self.options.job.get_boolean("persist_last_timestamp"):
+            if self.change_data_capture == "scd1":
+                if "__timestamp" not in df.columns:
+                    context["add_timestamp"] = True
+            if self.change_data_capture == "scd2":
+                if "__valid_from" not in df.columns:
+                    context["add_timestamp"] = True
+
+        if "__order_duplicate_by_asc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
+        elif "__order_duplicate_by_desc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_desc": "desc"}
+
+        return context
+
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
+        assert self.persist, f"{self.mode} not allowed"
+
+        reload = kwargs.get("reload")
+        context = self.get_cdc_context(df=df, reload=reload)
+
+        # if dataframe, reference is passed (BUG)
+        name = f"{self.step}_{self.topic}_{self.item}"
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
+        sql = f"select * from {global_temp_view}"
+
+        check_df = self.spark.sql(sql)
+        if check_df.isEmpty():
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
+            return
+
+        if reload:
+            DEFAULT_LOGGER.warning("force reload", extra={"label": self})
+            self.cdc.complete(sql, **context)
+
+        elif self.mode == "update":
+            self.cdc.update(sql, **context)
+
+        elif self.mode == "append":
+            assert isinstance(self.cdc, NoCDC), f"{self.change_data_capture} append not allowed"
+            self.cdc.append(sql, **context)
+
+        elif self.mode == "complete":
+            self.cdc.complete(sql, **context)
+
+        else:
+            raise ValueError(f"{self.mode} - not allowed")
+
+        self.check_duplicate_key()
+        self.check_duplicate_hash()
+        self.check_duplicate_identity()
+
+    def for_each_run(self, **kwargs):
+        last_version = None
+        if self.options.job.get_boolean("persist_last_timestamp"):
+            last_version = self.table.get_last_version()
+
+        if self.mode == "invoke":
+            schedule = kwargs.get("schedule", None)
+            self.invoke(schedule=schedule)
+        else:
+            super().for_each_run(**kwargs)
+
+        if self.options.job.get_boolean("persist_last_timestamp"):
+            self._update_last_timestamp(last_version=last_version)
+
+    def create(self):
+        if self.mode == "invoke":
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
+        else:
+            self.register_udfs()
+            super().create()
+            if self.options.job.get_boolean("persist_last_timestamp"):
+                self._update_last_timestamp(create=True)
+
+    def register(self):
+        if self.options.job.get_boolean("persist_last_timestamp"):
+            self.cdc_last_timestamp.table.register()
+
+        if self.mode == "invoke":
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
+        else:
+            super().register()
+
+    def drop(self):
+        if self.options.job.get_boolean("persist_last_timestamp"):
+            self.cdc_last_timestamp.drop()
+
+        super().drop()
+
+    @property
+    def cdc_last_timestamp(self) -> NoCDC:
+        assert self.mode == "update", "persist_last_timestamp only allowed in update"
+        assert self.change_data_capture in ["scd1", "scd2"], "persist_last_timestamp only allowed in scd1 or scd2"
+
+        cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
+        return cdc
+
+    def _update_last_timestamp(self, last_version: Optional[int] = None, create: bool = False):
+        df = self.spark.sql(f"select * from {self} limit 1")
+
+        fields = []
+        if self.change_data_capture == "scd1":
+            fields.append("max(__timestamp) :: timestamp as __timestamp")
+        elif self.change_data_capture == "scd2":
+            fields.append("max(__valid_from) :: timestamp as __timestamp")
+        if "__source" in df.columns:
+            fields.append("__source")
+
+        asof = None
+        if last_version is not None:
+            asof = f"version as of {last_version}"
+
+        sql = f"select {', '.join(fields)} from {self} {asof} group by all"
+        df = self.spark.sql(sql)
+
+        if create:
+            self.cdc_last_timestamp.table.create(df)
+        else:
+            self.cdc_last_timestamp.overwrite(df)
+
+    def overwrite(self, schedule: Optional[str] = None):
+        if self.mode == "invoke":
+            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
+            return
+
+        elif self.mode == "memory":
+            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"label": self})
+            self.create_or_replace_view()
+            return
+
+        self.overwrite_schema()
+        self.run(reload=True, schedule=schedule)
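For orientation, a minimal usage sketch of the Gold job shown above, assuming a configured Fabricks runtime on Databricks; the step, topic and item values below are hypothetical placeholders, not values taken from the package.

    from fabricks.core.jobs.gold import Gold

    # Build the job from its step/topic/item coordinates, as the constructor above accepts
    # ("gold", "sales" and "orders" are hypothetical names).
    job = Gold(step="gold", topic="sales", item="orders")

    # Inspect the normalized SQL and the udf_* functions it references.
    print(job.sql)
    print(job.get_udfs())

    # Create the backing table (or view for memory mode), then process one run.
    job.create()
    job.for_each_run()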