fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/cdc/base/processor.py ADDED
@@ -0,0 +1,471 @@
+ from __future__ import annotations
+
+ from typing import Optional
+
+ from jinja2 import Environment, PackageLoader
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base._types import AllowedSources
+ from fabricks.cdc.base.generator import Generator
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore.table import Table
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils._types import DataFrameLike
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Processor(Generator):
+     def get_data(self, src: AllowedSources, **kwargs) -> DataFrame:
+         if isinstance(src, DataFrameLike):
+             name = f"{self.qualified_name}__data"
+             global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False), job=self)
+             src = f"select * from {global_temp_view}"
+
+         sql = self.get_query(src, fix=True, **kwargs)
+         DEFAULT_LOGGER.debug("exec query", extra={"label": self, "sql": sql})
+         return self.spark.sql(sql)
+
+     def get_query_context(self, src: AllowedSources, **kwargs) -> dict:
+         DEFAULT_LOGGER.debug("deduce query context", extra={"label": self})
+
+         if isinstance(src, DataFrameLike):
+             format = "dataframe"
+         elif isinstance(src, Table):
+             format = "table"
+         elif isinstance(src, str):
+             format = "query"
+         else:
+             raise ValueError(f"{src} not allowed")
+
+         inputs = self.get_columns(src, backtick=False, sort=False)
+         fields = [c for c in inputs if not c.startswith("__")]
+         keys = kwargs.get("keys", None)
+
+         mode = kwargs.get("mode", "complete")
+         if mode == "update":
+             tgt = str(self.table)
+         elif mode == "append" and "__timestamp" in inputs:
+             tgt = str(self.table)
+         else:
+             tgt = None
+
+         overwrite = []
+         exclude = kwargs.get("exclude", [])  # used by silver to exclude __operation from output if not update
+
+         order_duplicate_by = kwargs.get("order_duplicate_by", None)
+         if order_duplicate_by:
+             order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]
+
+         add_source = kwargs.get("add_source", None)
+         add_calculated_columns = kwargs.get("add_calculated_columns", [])
+         if add_calculated_columns:
+             raise ValueError("add_calculated_columns is not yet supported")
+         add_operation = kwargs.get("add_operation", None)
+         add_key = kwargs.get("add_key", None)
+         add_hash = kwargs.get("add_hash", None)
+         add_timestamp = kwargs.get("add_timestamp", None)
+         add_metadata = kwargs.get("add_metadata", None)
+
+         has_order_by = None if not order_duplicate_by else True
+
+         # determine which special columns are present or need to be added to the output
+         has_operation = add_operation or "__operation" in inputs
+         has_metadata = add_metadata or "__metadata" in inputs
+         has_source = add_source or "__source" in inputs
+         has_timestamp = add_timestamp or "__timestamp" in inputs
+         has_key = add_key or "__key" in inputs
+         has_hash = add_hash or "__hash" in inputs
+         has_identity = "__identity" in inputs
+         has_rescued_data = "__rescued_data" in inputs
+
+         soft_delete = kwargs.get("soft_delete", None)
+         delete_missing = kwargs.get("delete_missing", None)
+         slice = kwargs.get("slice", None)
+         rectify = kwargs.get("rectify", None)
+         deduplicate = kwargs.get("deduplicate", None)
+         deduplicate_key = kwargs.get("deduplicate_key", None)
+         deduplicate_hash = kwargs.get("deduplicate_hash", None)
+         correct_valid_from = kwargs.get("correct_valid_from", None)
+
+         try:
+             has_rows = self.table.rows > 0
+         except Exception:
+             has_rows = None
+
+         # only needed when comparing to current
+         # delete all records in current if there is no new data
+         if mode == "update" and delete_missing and self.change_data_capture in ["scd1", "scd2"]:
+             has_no_data = not self.has_data(src)
+         else:
+             has_no_data = None
+
+         # always deduplicate if not set for slowly changing dimensions
+         if self.slowly_changing_dimension:
+             if deduplicate is None:
+                 deduplicate = True
+
+         # order duplicates by implies key deduplication
+         if order_duplicate_by:
+             deduplicate_key = True
+
+         if deduplicate:
+             deduplicate_key = True
+             deduplicate_hash = True
+
+         # if any deduplication is requested, deduplicate all
+         deduplicate = deduplicate or deduplicate_key or deduplicate_hash
+
+         # always rectify if not set
+         if self.slowly_changing_dimension:
+             if rectify is None:
+                 rectify = True
+
+         # only correct valid_from on first load
+         if self.slowly_changing_dimension and mode == "update":
+             correct_valid_from = correct_valid_from and self.table.rows == 0
+
+         # override slice for incremental load if timestamp and rows are present
+         if slice is None:
+             if mode == "update" and has_timestamp and has_rows:
+                 slice = "update"
+
+         # override slice for full load if update and table is empty
+         if slice == "update" and not has_rows:
+             slice = None
+
+         # override operation if added and found in df
+         if add_operation and "__operation" in inputs:
+             overwrite.append("__operation")
+
+         # override timestamp if added and found in df
+         if add_timestamp and "__timestamp" in inputs:
+             overwrite.append("__timestamp")
+
+         # override key if added and found in df (key needed for merge)
+         if add_key and "__key" in inputs:
+             overwrite.append("__key")
+
+         # override hash if added and found in df (hash needed to identify fake updates)
+         if add_hash and "__hash" in inputs:
+             overwrite.append("__hash")
+
+         # override metadata if added and found in df
+         if add_metadata and "__metadata" in inputs:
+             overwrite.append("__metadata")
+
+         advanced_ctes = ((rectify or deduplicate) and self.slowly_changing_dimension) or self.slowly_changing_dimension
+         advanced_deduplication = advanced_ctes and deduplicate
+
+         # add key and hash if not added nor found in df but exclude from output
+         # needed for merge
+         if mode == "update" or advanced_ctes or deduplicate:
+             if not add_key and "__key" not in inputs:
+                 add_key = True
+                 exclude.append("__key")
+
+             if not add_hash and "__hash" not in inputs:
+                 add_hash = True
+                 exclude.append("__hash")
+
+         # add operation and timestamp if not added nor found in df but exclude from output
+         # needed for deduplication and/or rectification
+         if advanced_ctes:
+             if not add_operation and "__operation" not in inputs:
+                 add_operation = "upsert"
+                 exclude.append("__operation")
+
+             if not add_timestamp and "__timestamp" not in inputs:
+                 add_timestamp = True
+                 exclude.append("__timestamp")
+
+         if add_key:
+             keys = keys if keys is not None else [f for f in fields]
+             if isinstance(keys, str):
+                 keys = [keys]
+             if has_source:
+                 keys.append("__source")
+
+         hashes = None
+         if add_hash:
+             hashes = [f for f in fields]
+             if "__operation" in inputs or add_operation:
+                 hashes.append("__operation")
+
+         if self.change_data_capture == "nocdc":
+             intermediates = [i for i in inputs]
+             outputs = [i for i in inputs]
+         else:
+             intermediates = [f for f in fields]
+             outputs = [f for f in fields]
+
+         if has_operation:
+             if "__operation" not in outputs:
+                 outputs.append("__operation")
+         if has_timestamp:
+             if "__timestamp" not in outputs:
+                 outputs.append("__timestamp")
+         if has_key:
+             if "__key" not in outputs:
+                 outputs.append("__key")
+         if has_hash:
+             if "__hash" not in outputs:
+                 outputs.append("__hash")
+
+         if has_metadata:
+             if "__metadata" not in outputs:
+                 outputs.append("__metadata")
+             if "__metadata" not in intermediates:
+                 intermediates.append("__metadata")
+         if has_source:
+             if "__source" not in outputs:
+                 outputs.append("__source")
+             if "__source" not in intermediates:
+                 intermediates.append("__source")
+         if has_identity:
+             if "__identity" not in outputs:
+                 outputs.append("__identity")
+             if "__identity" not in intermediates:
+                 intermediates.append("__identity")
+         if has_rescued_data:
+             if "__rescued_data" not in outputs:
+                 outputs.append("__rescued_data")
+             if "__rescued_data" not in intermediates:
+                 intermediates.append("__rescued_data")
+
+         if soft_delete:
+             if "__is_deleted" not in outputs:
+                 outputs.append("__is_deleted")
+             if "__is_current" not in outputs:
+                 outputs.append("__is_current")
+
+         if self.change_data_capture == "scd2":
+             if "__valid_from" not in outputs:
+                 outputs.append("__valid_from")
+             if "__valid_to" not in outputs:
+                 outputs.append("__valid_to")
+             if "__is_current" not in outputs:
+                 outputs.append("__is_current")
+
+         if advanced_ctes:
+             if "__operation" not in intermediates:
+                 intermediates.append("__operation")
+             if "__timestamp" not in intermediates:
+                 intermediates.append("__timestamp")
+
+             # needed for deduplication and/or rectification
+             # might need __operation or __source
+             if "__key" not in intermediates:
+                 intermediates.append("__key")
+             if "__hash" not in intermediates:
+                 intermediates.append("__hash")
+
+         outputs = [o for o in outputs if o not in exclude]
+         outputs = self.sort_columns(outputs)
+
+         parent_slice = None
+         if slice:
+             parent_slice = "__base"
+
+         parent_deduplicate_key = None
+         if deduplicate_key:
+             if slice:
+                 parent_deduplicate_key = "__sliced"
+             else:
+                 parent_deduplicate_key = "__base"
+
+         parent_rectify = None
+         if rectify:
+             if deduplicate_key:
+                 parent_rectify = "__deduplicated_key"
+             elif slice:
+                 parent_rectify = "__sliced"
+             else:
+                 parent_rectify = "__base"
+
+         parent_deduplicate_hash = None
+         if deduplicate_hash:
+             if rectify:
+                 parent_deduplicate_hash = "__rectified"
+             elif deduplicate_key:
+                 parent_deduplicate_hash = "__deduplicated_key"
+             elif slice:
+                 parent_deduplicate_hash = "__sliced"
+             else:
+                 parent_deduplicate_hash = "__base"
+
+         parent_cdc = None
+         if deduplicate_hash:
+             parent_cdc = "__deduplicated_hash"
+         elif rectify:
+             parent_cdc = "__rectified"
+         elif deduplicate_key:
+             parent_cdc = "__deduplicated_key"
+         elif slice:
+             parent_cdc = "__sliced"
+         else:
+             parent_cdc = "__base"
+
+         parent_final = "__final"
+
+         return {
+             "src": src,
+             "format": format,
+             "tgt": tgt,
+             "cdc": self.change_data_capture,
+             "mode": mode,
+             # fields
+             "inputs": inputs,
+             "intermediates": intermediates,
+             "outputs": outputs,
+             "fields": fields,
+             "keys": keys,
+             "hashes": hashes,
+             # options
+             "delete_missing": delete_missing,
+             "advanced_deduplication": advanced_deduplication,
+             # cte's
+             "slice": slice,
+             "rectify": rectify,
+             "deduplicate": deduplicate,
+             "deduplicate_key": deduplicate_key,
+             "deduplicate_hash": deduplicate_hash,
+             # has
+             "has_no_data": has_no_data,
+             "has_rows": has_rows,
+             "has_source": has_source,
+             "has_metadata": has_metadata,
+             "has_timestamp": has_timestamp,
+             "has_operation": has_operation,
+             "has_identity": has_identity,
+             "has_key": has_key,
+             "has_hash": has_hash,
+             "has_order_by": has_order_by,
+             "has_rescued_data": has_rescued_data,
+             # default add
+             "add_metadata": add_metadata,
+             "add_timestamp": add_timestamp,
+             "add_key": add_key,
+             "add_hash": add_hash,
+             # value add
+             "add_operation": add_operation,
+             "add_source": add_source,
+             "add_calculated_columns": add_calculated_columns,
+             # extra
+             "order_duplicate_by": order_duplicate_by,
+             "soft_delete": soft_delete,
+             "correct_valid_from": correct_valid_from,
+             # overwrite
+             "overwrite": overwrite,
+             # filter
+             "slices": None,
+             "sources": None,
+             "filter_where": kwargs.get("filter_where"),
+             "update_where": kwargs.get("update_where"),
+             # parents
+             "parent_slice": parent_slice,
+             "parent_rectify": parent_rectify,
+             "parent_deduplicate_key": parent_deduplicate_key,
+             "parent_deduplicate_hash": parent_deduplicate_hash,
+             "parent_cdc": parent_cdc,
+             "parent_final": parent_final,
+         }
+
+     def fix_sql(self, sql: str) -> str:
+         try:
+             sql = sql.replace("{src}", "src")
+             sql = fix_sql(sql)
+             sql = sql.replace("`src`", "{src}")
+
+             DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql, "target": "buffer"})
+             return sql
+
+         except Exception as e:
+             DEFAULT_LOGGER.exception("fail to fix sql query", extra={"label": self, "sql": sql})
+             raise e
+
+     def fix_context(self, context: dict, fix: Optional[bool] = True, **kwargs) -> dict:
+         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+         template = environment.get_template("filter.sql.jinja")
+
+         try:
+             sql = template.render(**context)
+             if fix:
+                 DEFAULT_LOGGER.debug("fix context", extra={"label": self, "sql": sql})
+                 sql = self.fix_sql(sql)
+
+         except (Exception, TypeError) as e:
+             DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "context": context})
+             raise e
+
+         row = self.spark.sql(sql).collect()[0]
+         assert row.slices, "no slices found"
+
+         context["slices"] = row.slices
+         if context.get("has_source"):
+             assert row.sources, "no sources found"
+             context["sources"] = row.sources
+
+         return context
+
+     def get_query(self, src: AllowedSources, fix: Optional[bool] = True, **kwargs) -> str:
+         context = self.get_query_context(src=src, **kwargs)
+         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+
+         try:
+             if context.get("slice"):
+                 context = self.fix_context(context, fix=fix, **kwargs)
+
+             template = environment.get_template("query.sql.jinja")
+
+             sql = template.render(**context)
+             if fix:
+                 sql = self.fix_sql(sql)
+             else:
+                 DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql})
+
+         except (Exception, TypeError) as e:
+             DEFAULT_LOGGER.debug("context", extra={"label": self, "context": context})
+             DEFAULT_LOGGER.exception("fail to generate sql query", extra={"label": self, "context": context})
+             raise e
+
+         return sql
+
+     def append(self, src: AllowedSources, **kwargs):
+         if not self.table.registered:
+             self.create_table(src, **kwargs)
+
+         df = self.get_data(src, **kwargs)
+         df = self.reorder_dataframe(df)
+
+         name = f"{self.qualified_name}__append"
+         create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+         append = f"insert into table {self.table} by name select * from global_temp.{name}"
+
+         DEFAULT_LOGGER.debug("exec append", extra={"label": self, "sql": append})
+         self.spark.sql(append)
+
+     def overwrite(
+         self,
+         src: AllowedSources,
+         dynamic: Optional[bool] = False,
+         **kwargs,
+     ):
+         if not self.table.registered:
+             self.create_table(src, **kwargs)
+
+         df = self.get_data(src, **kwargs)
+         df = self.reorder_dataframe(df)
+
+         if not dynamic:
+             if kwargs.get("update_where"):
+                 dynamic = True
+
+         if dynamic:
+             self.spark.sql("set spark.sql.sources.partitionOverwriteMode = dynamic")
+
+         name = f"{self.qualified_name}__overwrite"
+         create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+         overwrite = f"insert overwrite table {self.table} by name select * from global_temp.{name}"
+
+         DEFAULT_LOGGER.debug("exec overwrite", extra={"label": self, "sql": overwrite})
+         self.spark.sql(overwrite)
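
The Processor above is the heart of query generation: get_query_context deduces one context dict (the source format, the technical columns to add or pass through, which optional CTEs to enable, and the parent CTE feeding each stage), get_query renders that context through query.sql.jinja and round-trips the SQL through sqlglot, and append/overwrite execute the result against the target table. A minimal usage sketch, assuming a configured Fabricks runtime and that the concrete classes defined below are exported from fabricks.cdc; database, table, and key names are illustrative:

from fabricks.cdc import SCD2  # assumed export; the class itself is defined in fabricks/cdc/scd2.py below

cdc = SCD2("silver", "customers")

# Render (without running) the generated CDC query for a SQL source;
# fix=True round-trips the statement through sqlglot via fix_sql.
sql = cdc.get_query("select * from bronze.customers", keys=["customer_id"])
print(sql)

# update() pins mode="update" and merges the same query into the target.
cdc.update("select * from bronze.customers", keys=["customer_id"])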
fabricks/cdc/cdc.py ADDED
@@ -0,0 +1,5 @@
+ from fabricks.cdc.base import BaseCDC
+
+
+ class CDC(BaseCDC):
+     pass
fabricks/cdc/nocdc.py ADDED
@@ -0,0 +1,20 @@
+ from typing import Optional
+
+ from pyspark.sql import SparkSession
+
+ from fabricks.cdc.scd import SCD
+
+
+ class NoCDC(SCD):
+     def __init__(
+         self,
+         database: str,
+         *levels: str,
+         spark: Optional[SparkSession] = None,
+     ):
+         super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)
+
+     def delete_missing(self, src, **kwargs):
+         kwargs["delete_missing"] = True
+         kwargs["mode"] = "update"
+         self.merge(src, **kwargs)
fabricks/cdc/scd.py ADDED
@@ -0,0 +1,22 @@
+ from typing import Union
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base import BaseCDC
+ from fabricks.metastore.table import Table
+
+
+ class SCD(BaseCDC):
+     def delete_missing(self, src: Union[DataFrame, Table, str], **kwargs):
+         kwargs["add_operation"] = "reload"
+         kwargs["delete_missing"] = True
+         kwargs["mode"] = "update"
+         self.merge(src, **kwargs)
+
+     def complete(self, src: Union[DataFrame, Table, str], **kwargs):
+         kwargs["mode"] = "complete"
+         self.overwrite(src, **kwargs)
+
+     def update(self, src: Union[DataFrame, Table, str], **kwargs):
+         kwargs["mode"] = "update"
+         self.merge(src, **kwargs)
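
Each SCD verb is a thin wrapper that pins kwargs before delegating to the Processor: complete() overwrites the whole target, update() merges incrementally, and delete_missing() merges with a synthetic "reload" operation so rows absent from the source can be closed out. A hedged sketch of the mapping, using the SCD1 subclass below (SCD itself expects a change_data_capture value); table and source names are illustrative:

scd = SCD1("gold", "orders")  # hypothetical target table gold.orders

scd.complete("select * from silver.orders")        # -> overwrite(..., mode="complete")
scd.update("select * from silver.orders")          # -> merge(..., mode="update")
scd.delete_missing("select * from silver.orders")  # -> merge(..., mode="update",
                                                   #          add_operation="reload",
                                                   #          delete_missing=True)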
fabricks/cdc/scd1.py ADDED
@@ -0,0 +1,15 @@
+ from typing import Optional
+
+ from pyspark.sql import SparkSession
+
+ from fabricks.cdc.scd import SCD
+
+
+ class SCD1(SCD):
+     def __init__(
+         self,
+         database: str,
+         *levels: str,
+         spark: Optional[SparkSession] = None,
+     ):
+         super().__init__(database, *levels, change_data_capture="scd1", spark=spark)
fabricks/cdc/scd2.py ADDED
@@ -0,0 +1,15 @@
+ from typing import Optional
+
+ from pyspark.sql import SparkSession
+
+ from fabricks.cdc.scd import SCD
+
+
+ class SCD2(SCD):
+     def __init__(
+         self,
+         database: str,
+         *levels: str,
+         spark: Optional[SparkSession] = None,
+     ):
+         super().__init__(database, *levels, change_data_capture="scd2", spark=spark)
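
SCD1 and SCD2 differ only in the change_data_capture value they pin; query generation, merging, and table handling all come from SCD and the Processor. For example (database and level names invented):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

scd1 = SCD1("silver", "customers", spark=spark)  # pins change_data_capture="scd1"
scd2 = SCD2("silver", "customers", spark=spark)  # pins change_data_capture="scd2"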
fabricks/cdc/templates/__init__.py (file without changes)
fabricks/cdc/templates/ctes/base.sql.jinja ADDED
@@ -0,0 +1,35 @@
+ {% import 'macros/hash.sql.jinja' as h -%}
+
+ with
+     {% if format == "query" %} __query as ({{ src }}), {% endif %}
+     __base as (
+         select
+             *
+             {% if overwrite %}
+             -- will be overwritten below
+             except ({% for o in overwrite %}{{ o }}, {% endfor %})
+             {% endif %},
+             {% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
+             {% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
+             {% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
+             {% if add_source %} cast('{{ add_source }}' as string) as __source, {% endif %}
+             {% if add_hash %} {{ h.add_hash(fields=hashes) }} as __hash, {% endif %}
+             {% if add_key %} {{ h.add_hash(fields=keys) }} as __key, {% endif %}
+             {% if add_metadata %}
+             struct(
+                 {% if cdc == "nocdc" %}current_timestamp() as inserted,
+                 {% else %}current_timestamp() as inserted, current_timestamp() as updated,
+                 {% endif %}
+             ) as __metadata,
+             {% endif %}
+         {% if format == "query" %} from __query
+         {% else %}
+             {% if format == "table" %} from {{ src }}
+             {% endif %}
+             {% if format == "global_temp_view" %} from {{ src }}
+             {% endif %}
+             {% if format == "dataframe" %} from {{ "{src}" }}
+             {% endif %}
+         {% endif %}
+         {% if filter_where %} where {{ filter_where }} {% endif %}
+     ),
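
__base is the first CTE of every generated query: it normalizes the source (inline query, table, global temp view, or DataFrame placeholder) and bolts on the requested technical columns. A hedged sketch of rendering it in isolation with the same loader the Processor uses; the context keys mirror get_query_context, and the raw render may carry trailing commas that fix_sql later cleans via sqlglot:

from jinja2 import Environment, PackageLoader

env = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
template = env.get_template("ctes/base.sql.jinja")

sql = template.render(
    format="query",
    src="select * from bronze.customers",  # illustrative source
    overwrite=[],
    add_calculated_columns=[],
    add_timestamp=True,
    add_operation="upsert",
    add_source=None,
    add_hash=False,
    add_key=False,
    add_metadata=False,
    cdc="scd1",
    filter_where=None,
)
print(sql)  # emits the `with __query as (...), __base as (...)` prefix of the full query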
fabricks/cdc/templates/ctes/current.sql.jinja ADDED
@@ -0,0 +1,28 @@
+ {% import 'macros/hash.sql.jinja' as h -%}
+
+ __current as (
+     select
+         {% for i in intermediates %}
+         {% if i == "__timestamp" %}
+             {% if add_timestamp %} cast('0001-01-01' as timestamp) as __timestamp,
+             {% elif cdc == "nocdc" %} __timestamp,
+             {% elif cdc == "scd1" %} __timestamp,
+             {% elif cdc == "scd2" %} __valid_from as __timestamp,
+             {% endif %}
+         {% elif i == "__operation" %}
+             {% if has_no_data %} 'delete' as __operation, {% else %} 'current' as __operation, {% endif %}
+         {% elif i == "__hash" %}
+             {% if add_hash %} {{ h.add_hash(fields=hashes) }} as __hash, {% else %} __hash, {% endif %}
+         {% elif i == "__key" %}
+             {% if add_key %} {{ h.add_key(fields=keys) }} as __key, {% else %} __key, {% endif %}
+         {% else %} `{{ i }}`,
+         {% endif %}
+         {% endfor %}
+     from {{ tgt }} t
+     where
+         true
+         {% if cdc == "scd2" %} and __is_current {% endif %}
+         {% if cdc == "scd1" %} {% if soft_delete %} and __is_current {% endif %} {% endif %}
+         {% if sources %} and ({{ sources }}) {% endif %}
+         {% if update_where %} and {{ update_where }} {% endif %}
+ ),
fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja ADDED
@@ -0,0 +1,32 @@
+ {% if advanced_deduplication %}
+ __deduplicate_hash as (
+     select
+         *,
+         lag(__hash) over (
+             partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
+         ) as __deduplicate_hash_previous__hash,
+         lag(__operation) over (
+             partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
+         ) as __deduplicate_hash_previous_operation
+     from {{ parent_deduplicate_hash }}
+     where true
+ ),
+ __deduplicated_hash as (
+     select *
+     from __deduplicate_hash
+     where
+         true
+         and not (
+             __hash <=> __deduplicate_hash_previous__hash and __operation <=> __deduplicate_hash_previous_operation
+         )
+ ),
+ {% else %}
+ __deduplicated_hash as (
+     select *
+     from {{ parent_deduplicate_hash }}
+     where true
+     qualify
+         not lag(__hash) over (partition by {% if has_source %} __source, {% endif %} __key order by null)
+         <=> __hash
+ ),
+ {% endif %}
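
The advanced branch of this template removes fake updates: a row is dropped when both its __hash and __operation null-safely match (`<=>`) the previous row for the same key, ordered by __timestamp. A hedged PySpark equivalent of that logic (input and column names are illustrative):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.table("bronze.customers")  # assumed to carry __key, __hash, __operation, __timestamp

w = Window.partitionBy("__key").orderBy("__timestamp")
deduplicated = (
    df.withColumn("__prev_hash", F.lag("__hash").over(w))
    .withColumn("__prev_op", F.lag("__operation").over(w))
    # eqNullSafe mirrors SQL's null-safe equality operator <=>
    .where(~(F.col("__hash").eqNullSafe(F.col("__prev_hash")) & F.col("__operation").eqNullSafe(F.col("__prev_op"))))
    .drop("__prev_hash", "__prev_op")
)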
fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja ADDED
@@ -0,0 +1,31 @@
+ {% if advanced_deduplication %}
+ __deduplicate_key as (
+     select
+         *,
+         row_number() over (
+             partition by {% if has_source %} __source, {% endif %} __key, __timestamp
+             order by
+                 /* prioritize delete over upsert */
+                 __operation asc,
+                 {% if has_order_by %} {% for o in order_duplicate_by %} {{ o }}, {% endfor %} {% endif %}
+         ) as __deduplicate_key_rn
+     from {{ parent_deduplicate_key }}
+     where true
+ ),
+ __deduplicated_key as (select *, from __deduplicate_key where __deduplicate_key_rn == 1),
+ {% else %}
+ __deduplicated_key as (
+     select *
+     from {{ parent_deduplicate_key }}
+     where true
+     qualify
+         row_number() over (
+             partition by {% if has_source %} __source, {% endif %} __key
+             order by
+                 {% if has_order_by %} {% for o in order_duplicate_by %} {{ o }}, {% endfor %}
+                 {% else %} null
+                 {% endif %}
+         )
+         == 1
+ ),
+ {% endif %}
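
Both branches keep exactly one row per key: the advanced form ranks within (__key, __timestamp) and puts delete before upsert (ascending order on __operation), while the simple form keeps one row per __key overall, ordered by order_duplicate_by when given. A hedged PySpark equivalent of the simple branch (names are illustrative):

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.table("bronze.customers")  # assumed to carry __key plus an ordering column

# order_duplicate_by = {"updated_at": "desc"} would translate to this ordering
w = Window.partitionBy("__key").orderBy(F.col("updated_at").desc())
deduplicated = (
    df.withColumn("__rn", F.row_number().over(w))
    .where(F.col("__rn") == 1)
    .drop("__rn")
)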