fabricks 2024.7.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,338 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, Union
4
+
5
+ from jinja2 import Environment, PackageLoader
6
+ from pyspark.sql import DataFrame
7
+
8
+ from fabricks.cdc.base.generator import Generator
9
+ from fabricks.context.log import Logger
10
+ from fabricks.metastore.table import Table
11
+ from fabricks.metastore.view import create_or_replace_global_temp_view
12
+ from fabricks.utils.sqlglot import fix as fix_sql
13
+
14
+
15
+ class Processor(Generator):
16
def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[DataFrame]:
    """Render the CDC query for ``src`` and execute it with ``self.spark``.

    A DataFrame source is first registered as a global temp view (named
    after the database and levels, suffixed ``__data``) and replaced by a
    ``select *`` over that view. Any other source is passed to
    ``get_query`` unchanged.

    Args:
        src: DataFrame, Table or SQL string to read from.
        **kwargs: Forwarded to ``get_query``; ``uuid`` is also forwarded to
            the temp-view helper (default ``False``).

    Returns:
        The result of ``self.spark.sql`` on the rendered query.
    """
    if isinstance(src, DataFrame):
        view_name = f"{self.database}_{'_'.join(self.levels)}__data"
        use_uuid = kwargs.get("uuid", False)
        view = create_or_replace_global_temp_view(view_name, src, uuid=use_uuid)
        src = f"select * from {view}"

    return self.spark.sql(self.get_query(src, fix=True, **kwargs))
24
+
25
def get_query_context(self, src: Union[DataFrame, Table, str], **kwargs) -> dict:
    """Build the variable dictionary rendered into ``query.sql.jinja``.

    Args:
        src: DataFrame, Table or SQL string to read from.
        **kwargs: Options read here: ``keys``, ``mode``, ``order_duplicate_by``,
            ``add_source``, ``add_calculated_columns``, ``add_operation``,
            ``add_key``, ``add_hash``, ``add_timestamp``, ``add_metadata``,
            ``filter``, ``rectify``, ``deduplicate``, ``deduplicate_key``,
            ``deduplicate_hash``, ``soft_delete``, ``fix_valid_from``,
            ``except``, ``filter_where`` and ``update_where``.

    Returns:
        The template context. Lists received through ``keys`` and ``except``
        are copied before being extended, so the caller's objects are never
        modified and repeated calls with the same options give the same result.

    Raises:
        ValueError: If ``src`` is not a DataFrame, Table or str.
    """
    # The three accepted source types are disjoint, so the order of the checks
    # does not change the outcome; the plain-string case is tested first.
    if isinstance(src, str):
        source_format = "query"
    elif isinstance(src, DataFrame):
        source_format = "dataframe"
    elif isinstance(src, Table):
        source_format = "table"
    else:
        raise ValueError(f"{src} not allowed")

    columns = self.get_columns(src, backtick=False)
    # business columns = everything that is not a "__" technical column
    fields = [c for c in columns if not c.startswith("__")]

    keys = kwargs.get("keys", None)
    mode = kwargs.get("mode", "complete")
    tgt = str(self.table) if mode == "update" else None

    order_duplicate_by = kwargs.get("order_duplicate_by", None)
    if order_duplicate_by:
        order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]

    add_source = kwargs.get("add_source", None)
    add_calculated_columns = kwargs.get("add_calculated_columns", [])
    add_operation = kwargs.get("add_operation", None)
    add_key = kwargs.get("add_key", None)
    add_hash = kwargs.get("add_hash", None)
    add_timestamp = kwargs.get("add_timestamp", None)
    add_metadata = kwargs.get("add_metadata", None)

    has_metadata = add_metadata or "__metadata" in columns
    has_source = add_source or "__source" in columns
    has_timestamp = add_timestamp or "__timestamp" in columns
    has_key = add_key or "__key" in columns
    has_hash = add_hash or "__hash" in columns
    has_identity = "__identity" in columns
    has_rescued_data = "__rescued_data" in columns
    has_order_by = None if not order_duplicate_by else True
    try:
        has_rows = self.table.rows > 0
    except Exception:
        # best effort: any failure while reading the row count means "unknown"
        has_rows = None

    filter_ = kwargs.get("filter", None)
    rectify = kwargs.get("rectify", None)
    deduplicate = kwargs.get("deduplicate", None)
    deduplicate_key = kwargs.get("deduplicate_key", None)
    deduplicate_hash = kwargs.get("deduplicate_hash", None)
    soft_delete = kwargs.get("soft_delete", None)
    fix_valid_from = kwargs.get("fix_valid_from", None)

    # default the filter only when the caller did not choose one
    if filter_ is None and mode == "update" and has_timestamp and has_rows:
        filter_ = "update"

    if self.slowly_changing_dimension:
        if deduplicate is None:
            deduplicate = True
        if rectify is None:
            rectify = True

    if order_duplicate_by:
        deduplicate_key = True

    if self.slowly_changing_dimension and mode == "update":
        fix_valid_from = fix_valid_from and self.table.rows == 0

    # computed BEFORE `deduplicate` fans out into the key/hash flags below
    transformed = filter_ or rectify or deduplicate or deduplicate_key or deduplicate_hash

    if deduplicate:
        deduplicate_key = True
        deduplicate_hash = True

    # copy: the list is extended below and must not leak back into the caller's kwargs
    all_except = list(kwargs.get("except", []) or [])
    all_overwrite = []

    needs_technical_columns = transformed or self.slowly_changing_dimension

    # override operation if provided and found in df
    if add_operation and "__operation" in columns:
        all_overwrite.append("__operation")
    # add operation if not provided and not found in df BUT remove from output
    elif needs_technical_columns and not add_operation and "__operation" not in columns:
        add_operation = "upsert"
        if self.change_data_capture == "nocdc":
            all_except.append("__operation")

    # override key if provided and found in df
    if add_key and "__key" in columns:
        all_overwrite.append("__key")
    # add key if not provided and not found in df BUT remove from output
    elif (transformed or keys or self.slowly_changing_dimension) and not add_key and "__key" not in columns:
        add_key = True
        all_except.append("__key")

    # override hash if provided and found in df
    if add_hash and "__hash" in columns:
        all_overwrite.append("__hash")
    # add hash if not provided and not found in df BUT remove from output
    elif needs_technical_columns and not add_hash and "__hash" not in columns:
        add_hash = True
        all_except.append("__hash")

    # override timestamp if provided and found in df
    if add_timestamp and "__timestamp" in columns:
        all_overwrite.append("__timestamp")
    # add timestamp if not provided and not found in df BUT remove from output
    elif needs_technical_columns and not add_timestamp and "__timestamp" not in columns:
        add_timestamp = True
        all_except.append("__timestamp")

    # override metadata if provided and found in df
    if add_metadata and "__metadata" in columns:
        all_overwrite.append("__metadata")

    # Each enabled stage reads from the most recent enabled stage before it
    # (or from "__base"); the cdc step reads from whatever came last.
    previous = "__base"
    parent_filter = None
    parent_deduplicate_key = None
    parent_rectify = None
    parent_deduplicate_hash = None
    if filter_:
        parent_filter = previous
        previous = "__filtered"
    if deduplicate_key:
        parent_deduplicate_key = previous
        previous = "__deduplicated_key"
    if rectify:
        parent_rectify = previous
        previous = "__rectified"
    if deduplicate_hash:
        parent_deduplicate_hash = previous
        previous = "__deduplicated_hash"
    parent_cdc = previous

    parent_final = "__final"

    if add_key:
        if keys is None:
            # NOTE(review): with no explicit keys, `keys` aliases `fields`, so the
            # "__source" appended below also lands in `fields` and `hashes`. Kept
            # as-is because changing it would alter generated hashes - confirm intent.
            keys = fields
        elif isinstance(keys, str):
            keys = [keys]
        else:
            # copy: appending "__source" must not modify the caller's list
            keys = list(keys)
        if has_source:
            keys.append("__source")
        keys = [f"cast(`{k}` as string)" for k in keys]

    hashes = None
    if add_hash:
        hashes = [f"cast(`{f}` as string)" for f in fields]
        if "__operation" in columns or add_operation:
            hashes.append("cast(`__operation` <=> 'delete' as string)")

    if fields:
        if has_order_by:
            if "__order_duplicate_by_desc desc" in order_duplicate_by:
                fields.append("__order_duplicate_by_desc")
            elif "__order_duplicate_by_asc asc" in order_duplicate_by:
                fields.append("__order_duplicate_by_asc")
        fields = [f"`{f}`" for f in fields]

    if self.change_data_capture == "nocdc":
        # technical columns that are neither allowed leading nor trailing are dropped
        not_allowed_columns = [
            c
            for c in columns
            if c.startswith("__")
            and c not in self.allowed_leading_columns
            and c not in self.allowed_trailing_columns
        ]
        all_except = all_except + not_allowed_columns

    return {
        "src": src,
        "format": source_format,
        "tgt": tgt,
        "cdc": self.change_data_capture,
        "mode": mode,
        # fields
        "fields": fields,
        "keys": keys,
        "hashes": hashes,
        # options
        "filter": filter_,
        "rectify": rectify,
        "deduplicate": deduplicate,
        # extra
        "deduplicate_key": deduplicate_key,
        "deduplicate_hash": deduplicate_hash,
        # has
        "has_rows": has_rows,
        "has_source": has_source,
        "has_metadata": has_metadata,
        "has_timestamp": has_timestamp,
        "has_identity": has_identity,
        "has_key": has_key,
        "has_hash": has_hash,
        "has_order_by": has_order_by,
        "has_rescued_data": has_rescued_data,
        # default add
        "add_metadata": add_metadata,
        "add_timestamp": add_timestamp,
        "add_key": add_key,
        "add_hash": add_hash,
        # value add
        "add_operation": add_operation,
        "add_source": add_source,
        "add_calculated_columns": add_calculated_columns,
        # extra
        "order_duplicate_by": order_duplicate_by,
        "soft_delete": soft_delete,
        "fix_valid_from": fix_valid_from,
        # except
        "all_except": all_except,
        "all_overwrite": all_overwrite,
        # filter
        "filter_where": kwargs.get("filter_where"),
        "update_where": kwargs.get("update_where"),
        # parents
        "parent_filter": parent_filter,
        "parent_rectify": parent_rectify,
        "parent_deduplicate_key": parent_deduplicate_key,
        "parent_deduplicate_hash": parent_deduplicate_hash,
        "parent_cdc": parent_cdc,
        "parent_final": parent_final,
    }
268
+
269
def get_query(self, src: Union[DataFrame, Table, str], fix: Optional[bool] = True, **kwargs) -> str:
    """Render ``query.sql.jinja`` for ``src`` and optionally run it through ``fix_sql``.

    Args:
        src: DataFrame, Table or SQL string to read from.
        fix: When truthy, the rendered SQL is passed through ``fix_sql``.
        **kwargs: Forwarded to ``get_query_context``.

    Returns:
        The rendered (and, if requested, fixed) SQL text.

    Raises:
        Exception: Whatever rendering or fixing raises, after it is logged.
    """
    context = self.get_query_context(src=src, **kwargs)
    loader = PackageLoader("fabricks.cdc", "templates")
    template = Environment(loader=loader).get_template("query.sql.jinja")

    try:
        sql = template.render(**context)
    except Exception:
        Logger.exception("🙈", extra={"job": self, "context": context})
        raise

    if not fix:
        Logger.debug("query", extra={"job": self, "sql": sql})
        return sql

    try:
        # The "{src}" placeholder is swapped for a bare identifier while the
        # fixer runs and restored afterwards (the fixer returns it back-quoted).
        sql = sql.replace("{src}", "src")
        sql = fix_sql(sql)
        sql = sql.replace("`src`", "{src}")
        Logger.debug("query", extra={"job": self, "sql": sql, "target": "buffer"})
    except Exception:
        Logger.exception("🙈", extra={"job": self, "sql": sql})
        raise

    return sql
293
+
294
def append(self, src: Union[DataFrame, Table, str], **kwargs):
    """Append the CDC result of ``src`` to the Delta table at ``self.table.deltapath``.

    The table is created first when ``self.table.exists()`` is false. If
    ``get_data`` yields nothing, the method returns without writing.

    Args:
        src: DataFrame, Table or SQL string to read from.
        **kwargs: Forwarded to ``create_table`` and ``get_data``; ``uuid`` is
            also forwarded to the temp-view helper (default ``False``).
    """
    if not self.table.exists():
        self.create_table(src, **kwargs)

    df = self.get_data(src, **kwargs)
    if not df:
        return

    df = self.reorder_columns(df)

    view_name = f"{self.database}_{'_'.join(self.levels)}__append"
    create_or_replace_global_temp_view(view_name, df, uuid=kwargs.get("uuid", False))

    Logger.debug("append", extra={"job": self})
    writer = df.write.format("delta").mode("append")
    writer.save(self.table.deltapath.string)
307
+
308
def overwrite(
    self,
    src: Union[DataFrame, Table, str],
    dynamic: Optional[bool] = False,
    **kwargs,
):
    """Overwrite the Delta table at ``self.table.deltapath`` with the CDC result of ``src``.

    The table is created first when ``self.table.exists()`` is false. If
    ``get_data`` yields nothing, the method returns without writing.

    Args:
        src: DataFrame, Table or SQL string to read from.
        dynamic: Write with ``partitionOverwriteMode=dynamic``. Also switched
            on when a truthy ``update_where`` option is passed.
        **kwargs: Forwarded to ``create_table`` and ``get_data``; ``uuid`` is
            also forwarded to the temp-view helper (default ``False``).
    """
    if not self.table.exists():
        self.create_table(src, **kwargs)

    df = self.get_data(src, **kwargs)
    if not df:
        return

    df = self.reorder_columns(df)

    view_name = f"{self.database}_{'_'.join(self.levels)}__overwrite"
    create_or_replace_global_temp_view(view_name, df, uuid=kwargs.get("uuid", False))

    use_dynamic = bool(dynamic) or bool(kwargs.get("update_where"))

    Logger.debug("dynamic overwrite" if use_dynamic else "overwrite", extra={"job": self})
    writer = df.write.format("delta").mode("overwrite")
    if use_dynamic:
        writer = writer.option("partitionOverwriteMode", "dynamic")
    writer.save(self.table.deltapath.string)
@@ -0,0 +1,3 @@
1
+ from typing import Literal
2
+
3
+ ChangeDataCaptures = Literal["nocdc", "scd1", "scd2"]
fabricks/cdc/cdc.py ADDED
@@ -0,0 +1,5 @@
1
+ from fabricks.cdc.base import BaseCDC
2
+
3
+
4
class CDC(BaseCDC):
    """Public alias of ``BaseCDC``; adds no behaviour of its own."""
fabricks/cdc/nocdc.py ADDED
@@ -0,0 +1,19 @@
1
+ from typing import Optional, Union
2
+
3
+ from pyspark.sql import DataFrame, SparkSession
4
+
5
+ from fabricks.cdc.base import BaseCDC
6
+ from fabricks.metastore.table import Table
7
+
8
+
9
class NoCDC(BaseCDC):
    """CDC handler configured with ``change_data_capture="nocdc"``."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Initialise the base handler for ``database``/``levels`` in "nocdc" mode."""
        super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)

    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
        """Run a complete load by delegating to ``overwrite`` with the same options."""
        self.overwrite(src=src, **kwargs)
fabricks/cdc/scd.py ADDED
@@ -0,0 +1,21 @@
1
+ from typing import Union
2
+
3
+ from pyspark.sql import DataFrame
4
+
5
+ from fabricks.cdc.base import BaseCDC
6
+ from fabricks.metastore.table import Table
7
+
8
+
9
class SCD(BaseCDC):
    """Shared entry points for the slowly-changing-dimension handlers.

    Each method pins the ``mode`` option (overriding any value supplied by
    the caller) and delegates to ``merge`` or ``overwrite``.
    """

    def delete_missing(self, src: Union[DataFrame, Table, str], **kwargs):
        """Merge ``src`` in "update" mode with the operation forced to "reload"."""
        kwargs.update(add_operation="reload", mode="update")
        self.merge(src, **kwargs)

    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
        """Overwrite the target from ``src`` in "complete" mode."""
        kwargs.update(mode="complete")
        self.overwrite(src, **kwargs)

    def update(self, src: Union[DataFrame, Table, str], **kwargs):
        """Merge ``src`` into the target in "update" mode."""
        kwargs.update(mode="update")
        self.merge(src, **kwargs)
fabricks/cdc/scd1.py ADDED
@@ -0,0 +1,15 @@
1
+ from typing import Optional
2
+
3
+ from pyspark.sql import SparkSession
4
+
5
+ from fabricks.cdc.scd import SCD
6
+
7
+
8
class SCD1(SCD):
    """SCD handler configured with ``change_data_capture="scd1"``."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Initialise the base handler for ``database``/``levels`` in "scd1" mode."""
        super().__init__(database, *levels, change_data_capture="scd1", spark=spark)
fabricks/cdc/scd2.py ADDED
@@ -0,0 +1,15 @@
1
+ from typing import Optional
2
+
3
+ from pyspark.sql import SparkSession
4
+
5
+ from fabricks.cdc.scd import SCD
6
+
7
+
8
class SCD2(SCD):
    """SCD handler configured with ``change_data_capture="scd2"``."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Initialise the base handler for ``database``/``levels`` in "scd2" mode."""
        super().__init__(database, *levels, change_data_capture="scd2", spark=spark)
File without changes
@@ -0,0 +1,72 @@
1
{# SCD1 merge: update matched rows flagged 'upsert', delete (or soft-delete) matched rows flagged 'delete', insert unmatched 'upsert' rows. #}
{# NOTE(review): trailing commas in the set/insert lists are emitted as-is; presumably normalised by the caller before execution - confirm. #}
{% if format == "dataframe" %}
merge into {{ tgt }} t using {{ "{src}" }} s
{% endif %}
{% if format == "view" %}
merge into {{ tgt }} t using {{ src }} s
{% endif %}
{% if has_key %}
on t.__key == s.__merge_key
{% else %}
on
{# predicates are joined with "and" so that more than one key still yields valid SQL #}
{% for k in keys %}
{% if not loop.first %}and {% endif %}t.{{ k }} <=> s.{{ k }}
{% endfor %}
{% endif %}
{% if has_source %}
and t.__source == s.__source
{% endif %}
{# NOTE(review): update_where is inserted verbatim; presumably it carries its own leading "and" - confirm. #}
{% if update_where %}
{{ update_where }}
{% endif %}
when matched
and __merge_condition == 'upsert' then
update
set
{% for f in fields %}
{{ f }} = s.{{ f }},
{% endfor %}
{% if has_timestamp %}
__timestamp = s.__timestamp,
{% endif %}
{% if has_metadata %}
__metadata.updated = cast(current_date() as timestamp),
{% endif %}
{% if has_hash %}
__hash = s.__hash,
{% endif %}
{% if has_rescued_data %}
__rescued_data = s.__rescued_data,
{% endif %}
{% if soft_delete %}
__is_current = s.__is_current,
__is_deleted = s.__is_deleted,
{% endif %}
{% if soft_delete %}
-- soft delete
when matched
and __merge_condition == 'delete' then
update
set
__is_current = False,
__is_deleted = True,
{% if has_metadata %}
__metadata.updated = cast(current_date() as timestamp),
{% endif %}
{% else %}
-- delete
when matched
and __merge_condition == 'delete' then
delete
{% endif %}
when not matched
and __merge_condition == 'upsert' then
insert (
{% for c in columns %}
{{ c }},
{% endfor %}
)
values (
{% for c in columns %}
s.{{ c }},
{% endfor %}
)
@@ -0,0 +1,54 @@
1
{# SCD2 merge: close the current version of matched rows flagged 'update' or 'delete', insert unmatched rows flagged 'insert'. #}
{# NOTE(review): trailing commas in the set/insert lists are emitted as-is; presumably normalised by the caller before execution - confirm. #}
{% if format == "dataframe" %}
merge into {{ tgt }} t using {{ "{src}" }} s
{% endif %}
{% if format == "view" %}
merge into {{ tgt }} t using {{ src }} s
{% endif %}
{% if has_key %}
on t.__key == s.__merge_key
{% else %}
on
{# no "and" after the last predicate: the next clause already starts with "and" #}
{% for k in keys %}
t.{{ k }} <=> s.{{ k }}{% if not loop.last %} and{% endif %}

{% endfor %}
{% endif %}
and t.__is_current
{% if has_source %}
and t.__source == s.__source
{% endif %}
when matched
and __merge_condition == 'update' then
update
set
__valid_to = s.__valid_from - interval 1 seconds,
__is_current = False,
{% if soft_delete %}
__is_deleted = False,
{% endif %}
{% if has_metadata %}
__metadata.updated = cast(current_date() as timestamp),
{% endif %}
when matched
and __merge_condition == 'delete' then
update
set
__valid_to = s.__valid_from - interval 1 seconds,
__is_current = False,
{% if soft_delete %}
__is_deleted = True,
{% endif %}
{% if has_metadata %}
__metadata.updated = cast(current_date() as timestamp),
{% endif %}
when not matched
and __merge_condition == 'insert' then
insert (
{% for c in columns %}
{{ c }},
{% endfor %}
)
values (
{% for c in columns %}
s.{{ c }},
{% endfor %}
)
@@ -0,0 +1,2 @@
1
+ {% if cdc == "scd1" %} {% include 'merge/scd1.sql.jinja' %} {% endif %}
2
+ {% if cdc == "scd2" %} {% include 'merge/scd2.sql.jinja' %} {% endif %}
File without changes
@@ -0,0 +1,34 @@
1
+ {% import 'query/hash.sql.jinja' as h -%}
2
+
3
+ with
4
+ {% if format == "query" %} __query as ({{ src }}), {% endif %}
5
+ __base as (
6
+ select
7
+ *
8
+ {% if all_overwrite %} except ({% for o in all_overwrite %}{{ o }}, {% endfor %}),
9
+ {% else %},
10
+ {% endif %}
11
+ {% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
12
+ {% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
13
+ {% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
14
+ {% if add_source %} cast('{{ add_source }}' as string) as __source, {% endif %}
15
+ {% if add_hash %} {{ h.hash(fields=hashes) }} as __hash, {% endif %}
16
+ {% if add_key %} {{ h.hash(fields=keys) }} as __key, {% endif %}
17
+ {% if add_metadata %}
18
+ struct(
19
+ {% if cdc == "nocdc" %}current_timestamp() as inserted,
20
+ {% else %}current_timestamp() as inserted, current_timestamp() as updated,
21
+ {% endif %}
22
+ ) as __metadata,
23
+ {% endif %}
24
+ {% if format == "query" %} from __query
25
+ {% else %}
26
+ {% if format == "table" %} from {{ src }}
27
+ {% endif %}
28
+ {% if format == "global_temp_view" %} from {{ src }}
29
+ {% endif %}
30
+ {% if format == "dataframe" %} from {{ "{src}" }}
31
+ {% endif %}
32
+ {% endif %}
33
+ {% if filter_where %} where {{ filter_where }} {% endif %}
34
+ ),
@@ -0,0 +1,95 @@
1
+ /*
2
+ ⛷️🧀🍫🏔️
3
+
4
+ 👀🏁
5
+ {%- if format %}
6
+ 🗹 format: {{format}}{% endif %}
7
+ {%- if tgt %}
8
+ 🗹 tgt: {{tgt}}{% endif %}
9
+ {%- if cdc %}
10
+ 🗹 cdc: {{cdc}}{% endif %}
11
+ {%- if mode %}
12
+ 🗹 mode: {{mode}}{% endif %}
13
+ {%- if filter %}
14
+ 🗹 filter: {{filter}}{% endif %}
15
+ {%- if rectify %}
16
+ 🗹 rectify: {{rectify}}{% endif %}
17
+ {%- if deduplicate %}
18
+ 🗹 deduplicate: {{deduplicate}}{% endif %}
19
+ {%- if deduplicate_key %}
20
+ 🗹 deduplicate_key: {{deduplicate_key}}{% endif %}
21
+ {%- if deduplicate_hash %}
22
+ 🗹 deduplicate_hash: {{deduplicate_hash}}{% endif %}
23
+ {%- if soft_delete %}
24
+ 🗹 soft_delete: {{soft_delete}}{% endif %}
25
+ {%- if fix_valid_from %}
26
+ 🗹 fix_valid_from: {{fix_valid_from}}{% endif %}
27
+ {%- if has_rows %}
28
+ 🗹 has_rows: {{has_rows}}{% endif %}
29
+ {%- if has_source %}
30
+ 🗹 has_source: {{has_source}}{% endif %}
31
+ {%- if has_metadata %}
32
+ 🗹 has_metadata: {{has_metadata}}{% endif %}
33
+ {%- if has_timestamp %}
34
+ 🗹 has_timestamp: {{has_timestamp}}{% endif %}
35
+ {%- if has_identity %}
36
+ 🗹 has_identity: {{has_identity}}{% endif %}
37
+ {%- if has_key %}
38
+ 🗹 has_key: {{has_key}}{% endif %}
39
+ {%- if has_hash %}
40
+ 🗹 has_hash: {{has_hash}}{% endif %}
41
+ {%- if has_order_by %}
42
+ 🗹 has_order_by: {{has_order_by}}{% endif %}
43
+ {%- if has_rescued_data %}
44
+ 🗹 has_rescued_data: {{has_rescued_data}}{% endif %}
45
+ {%- if add_metadata %}
46
+ 🗹 add_metadata: {{add_metadata}}{% endif %}
47
+ {%- if add_timestamp %}
48
+ 🗹 add_timestamp: {{add_timestamp}}{% endif %}
49
+ {%- if add_key %}
50
+ 🗹 add_key: {{add_key}}{% endif %}
51
+ {%- if add_hash %}
52
+ 🗹 add_hash: {{add_hash}}{% endif %}
53
+ {%- if add_operation %}
54
+ 🗹 add_operation: {{add_operation}}{% endif %}
55
+ {%- if add_source %}
56
+ 🗹 add_source: {{add_source}}{% endif %}
57
+ {%- if add_calculated_columns %}
58
+ 🗹 add_calculated_columns: {{add_calculated_columns}}{% endif %}
59
+ {%- if order_duplicate_by %}
60
+ 🗹 order_duplicate_by: {{order_duplicate_by}}{% endif %}
61
+ {%- if all_except %}
62
+ 🗹 all_except: {{all_except}}{% endif %}
63
+ {%- if all_overwrite %}
64
+ 🗹 all_overwrite: {{all_overwrite}}{% endif %}
65
+ {%- if filter_where %}
66
+ 🗹 filter_where: {{filter_where}}{% endif %}
67
+ {%- if update_where %}
68
+ 🗹 update_where: {{update_where}}{% endif %}
69
+ {%- if parent_filter %}
70
+ 🗹 parent_filter: {{parent_filter}}{% endif %}
71
+ {%- if parent_rectify %}
72
+ 🗹 parent_rectify: {{parent_rectify}}{% endif %}
73
+ {%- if parent_deduplicate_key %}
74
+ 🗹 parent_deduplicate_key: {{parent_deduplicate_key}}{% endif %}
75
+ {%- if parent_deduplicate_hash %}
76
+ 🗹 parent_deduplicate_hash: {{parent_deduplicate_hash}}{% endif %}
77
+ {%- if parent_cdc %}
78
+ 🗹 parent_cdc: {{parent_cdc}}{% endif %}
79
+ {%- if parent_final %}
80
+ 🗹 parent_final: {{parent_final}}{% endif %}
81
+ 👀🏳️
82
+
83
+ 👁️🏁
84
+ {%- if src %}
85
+ 🗸 src: {{src}}{% endif %}
86
+ {%- if fields %}
87
+ 🗸 fields: {{fields}}{% endif %}
88
+ {%- if keys %}
89
+ 🗸 keys: {{keys}}{% endif %}
90
+ {%- if hashes %}
91
+ 🗸 hashes: {{hashes}}{% endif %}
92
+ 👁️🏳️
93
+
94
+ */
95
+