fabricks 2024.7.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional, Union
|
|
4
|
+
|
|
5
|
+
from jinja2 import Environment, PackageLoader
|
|
6
|
+
from pyspark.sql import DataFrame
|
|
7
|
+
|
|
8
|
+
from fabricks.cdc.base.generator import Generator
|
|
9
|
+
from fabricks.context.log import Logger
|
|
10
|
+
from fabricks.metastore.table import Table
|
|
11
|
+
from fabricks.metastore.view import create_or_replace_global_temp_view
|
|
12
|
+
from fabricks.utils.sqlglot import fix as fix_sql
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Processor(Generator):
    """Turn a source (DataFrame, Table or SQL string) into CDC-ready data.

    Builds the jinja SQL query that applies filtering, deduplication,
    rectification and the change-data-capture technical columns, then
    materializes the result into the target delta table via append or
    overwrite.
    """

    def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[DataFrame]:
        """Run the generated CDC query against `src` and return the result.

        A DataFrame source is first published as a global temp view so the
        generated SQL can select from it.
        """
        if isinstance(src, DataFrame):
            name = f"{self.database}_{'_'.join(self.levels)}__data"
            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False))
            src = f"select * from {global_temp_view}"

        sql = self.get_query(src, fix=True, **kwargs)
        return self.spark.sql(sql)

    def get_query_context(self, src: Union[DataFrame, Table, str], **kwargs) -> dict:
        """Build the jinja rendering context for query.sql.jinja.

        Derives the source format, the business fields, the key/hash
        expressions, the chain of CTE parent names and every add_*/has_* flag
        the templates rely on.

        Raises:
            ValueError: if `src` is not a DataFrame, Table or str.
        """
        if isinstance(src, DataFrame):
            format = "dataframe"
        elif isinstance(src, Table):
            format = "table"
        elif isinstance(src, str):
            format = "query"
        else:
            raise ValueError(f"{src} not allowed")

        columns = self.get_columns(src, backtick=False)
        # business columns only; dunder (__) columns are technical
        fields = [c for c in columns if not c.startswith("__")]

        keys = kwargs.get("keys", None)
        mode = kwargs.get("mode", "complete")
        # a target is only needed when merging into an existing table
        tgt = str(self.table) if mode == "update" else None

        order_duplicate_by = kwargs.get("order_duplicate_by", None)
        if order_duplicate_by:
            # {"col": "asc"} -> ["col asc"]
            order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]

        add_source = kwargs.get("add_source", None)
        add_calculated_columns = kwargs.get("add_calculated_columns", [])
        add_operation = kwargs.get("add_operation", None)
        add_key = kwargs.get("add_key", None)
        add_hash = kwargs.get("add_hash", None)
        add_timestamp = kwargs.get("add_timestamp", None)
        add_metadata = kwargs.get("add_metadata", None)

        # a technical column is "present" if it is requested or already in the source
        has_metadata = add_metadata or "__metadata" in columns
        has_source = add_source or "__source" in columns
        has_timestamp = add_timestamp or "__timestamp" in columns
        has_key = add_key or "__key" in columns
        has_hash = add_hash or "__hash" in columns
        has_identity = "__identity" in columns
        has_rescued_data = "__rescued_data" in columns
        has_order_by = None if not order_duplicate_by else True
        try:
            has_rows = self.table.rows > 0
        except Exception:
            # table may not exist yet; None means "unknown"
            has_rows = None

        filter = kwargs.get("filter", None)
        rectify = kwargs.get("rectify", None)
        deduplicate = kwargs.get("deduplicate", None)
        deduplicate_key = kwargs.get("deduplicate_key", None)
        deduplicate_hash = kwargs.get("deduplicate_hash", None)
        soft_delete = kwargs.get("soft_delete", None)
        fix_valid_from = kwargs.get("fix_valid_from", None)

        # incremental loads against a non-empty, timestamped target only
        # process new data unless the caller decided otherwise
        if filter is None:
            if mode == "update" and has_timestamp and has_rows:
                filter = "update"

        # SCD requires clean, rectified input by default
        if self.slowly_changing_dimension:
            if deduplicate is None:
                deduplicate = True
            if rectify is None:
                rectify = True

        if order_duplicate_by:
            deduplicate_key = True

        # only relevant on the very first incremental load
        if self.slowly_changing_dimension and mode == "update":
            fix_valid_from = fix_valid_from and self.table.rows == 0

        transformed = filter or rectify or deduplicate or deduplicate_key or deduplicate_hash

        if deduplicate:
            deduplicate_key = True
            deduplicate_hash = True

        # BUGFIX: copy so the appends below never mutate the caller's list
        all_except = list(kwargs.get("except", []) or [])
        all_overwrite = []

        # override operation if provided and found in df
        if add_operation and "__operation" in columns:
            all_overwrite.append("__operation")
        # add operation if not provided and not found in df BUT remove from output
        elif (transformed or self.slowly_changing_dimension) and not add_operation and "__operation" not in columns:
            add_operation = "upsert"
            if self.change_data_capture == "nocdc":
                all_except.append("__operation")

        # override key if provided and found in df
        if add_key and "__key" in columns:
            all_overwrite.append("__key")
        # add key if not provided and not found in df BUT remove from output
        elif (transformed or keys or self.slowly_changing_dimension) and not add_key and "__key" not in columns:
            add_key = True
            all_except.append("__key")

        # override hash if provided and found in df
        if add_hash and "__hash" in columns:
            all_overwrite.append("__hash")
        # add hash if not provided and not found in df BUT remove from output
        elif (transformed or self.slowly_changing_dimension) and not add_hash and "__hash" not in columns:
            add_hash = True
            all_except.append("__hash")

        # override timestamp if provided and found in df
        if add_timestamp and "__timestamp" in columns:
            all_overwrite.append("__timestamp")
        # add timestamp if not provided and not found in df BUT remove from output
        elif (transformed or self.slowly_changing_dimension) and not add_timestamp and "__timestamp" not in columns:
            add_timestamp = True
            all_except.append("__timestamp")

        # override metadata if provided and found in df
        if add_metadata and "__metadata" in columns:
            all_overwrite.append("__metadata")

        # each optional CTE selects from the last CTE that was enabled before
        # it, in the fixed order: base -> filter -> deduplicate_key -> rectify
        # -> deduplicate_hash -> cdc -> final
        parent_filter = None
        if filter:
            parent_filter = "__base"

        parent_deduplicate_key = None
        if deduplicate_key:
            if filter:
                parent_deduplicate_key = "__filtered"
            else:
                parent_deduplicate_key = "__base"

        parent_rectify = None
        if rectify:
            if deduplicate_key:
                parent_rectify = "__deduplicated_key"
            elif filter:
                parent_rectify = "__filtered"
            else:
                parent_rectify = "__base"

        parent_deduplicate_hash = None
        if deduplicate_hash:
            if rectify:
                parent_deduplicate_hash = "__rectified"
            elif deduplicate_key:
                parent_deduplicate_hash = "__deduplicated_key"
            elif filter:
                parent_deduplicate_hash = "__filtered"
            else:
                parent_deduplicate_hash = "__base"

        parent_cdc = None
        if deduplicate_hash:
            parent_cdc = "__deduplicated_hash"
        elif rectify:
            parent_cdc = "__rectified"
        elif deduplicate_key:
            parent_cdc = "__deduplicated_key"
        elif filter:
            parent_cdc = "__filtered"
        else:
            parent_cdc = "__base"

        parent_final = "__final"

        if add_key:
            # BUGFIX: always work on a copy. Previously `keys` defaulted to
            # the *same list object* as `fields`, so appending "__source"
            # below leaked __source into fields (and into hashes), and a
            # caller-supplied keys list was mutated in place.
            if keys is None:
                keys = list(fields)
            elif isinstance(keys, str):
                keys = [keys]
            else:
                keys = list(keys)
            if has_source:
                keys.append("__source")
            keys = [f"cast(`{k}` as string)" for k in keys]

        hashes = None
        if add_hash:
            hashes = [f"cast(`{f}` as string)" for f in fields]
            # a delete must change the hash even if the payload is unchanged
            if "__operation" in columns or add_operation:
                hashes.append("cast(`__operation` <=> 'delete' as string)")

        if fields:
            if has_order_by:
                if "__order_duplicate_by_desc desc" in order_duplicate_by:
                    fields.append("__order_duplicate_by_desc")
                elif "__order_duplicate_by_asc asc" in order_duplicate_by:
                    fields.append("__order_duplicate_by_asc")
            fields = [f"`{f}`" for f in fields]

        if self.change_data_capture == "nocdc":
            # nocdc keeps only the whitelisted technical columns
            __not_allowed_columns = [
                c
                for c in columns
                if c.startswith("__")
                and c not in self.allowed_leading_columns
                and c not in self.allowed_trailing_columns
            ]
            all_except = all_except + __not_allowed_columns

        return {
            "src": src,
            "format": format,
            "tgt": tgt,
            "cdc": self.change_data_capture,
            "mode": mode,
            # fields
            "fields": fields,
            "keys": keys,
            "hashes": hashes,
            # options
            "filter": filter,
            "rectify": rectify,
            "deduplicate": deduplicate,
            # extra
            "deduplicate_key": deduplicate_key,
            "deduplicate_hash": deduplicate_hash,
            # has
            "has_rows": has_rows,
            "has_source": has_source,
            "has_metadata": has_metadata,
            "has_timestamp": has_timestamp,
            "has_identity": has_identity,
            "has_key": has_key,
            "has_hash": has_hash,
            "has_order_by": has_order_by,
            "has_rescued_data": has_rescued_data,
            # default add
            "add_metadata": add_metadata,
            "add_timestamp": add_timestamp,
            "add_key": add_key,
            "add_hash": add_hash,
            # value add
            "add_operation": add_operation,
            "add_source": add_source,
            "add_calculated_columns": add_calculated_columns,
            # extra
            "order_duplicate_by": order_duplicate_by,
            "soft_delete": soft_delete,
            "fix_valid_from": fix_valid_from,
            # except
            "all_except": all_except,
            "all_overwrite": all_overwrite,
            # filter
            "filter_where": kwargs.get("filter_where"),
            "update_where": kwargs.get("update_where"),
            # parents
            "parent_filter": parent_filter,
            "parent_rectify": parent_rectify,
            "parent_deduplicate_key": parent_deduplicate_key,
            "parent_deduplicate_hash": parent_deduplicate_hash,
            "parent_cdc": parent_cdc,
            "parent_final": parent_final,
        }

    def get_query(self, src: Union[DataFrame, Table, str], fix: Optional[bool] = True, **kwargs) -> str:
        """Render query.sql.jinja with the context built from `src`.

        When `fix` is True the SQL is normalized through sqlglot; the literal
        "{src}" placeholder is protected around the fix because sqlglot would
        otherwise mangle it.
        """
        context = self.get_query_context(src=src, **kwargs)
        environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
        query = environment.get_template("query.sql.jinja")

        try:
            sql = query.render(**context)
        except Exception as e:
            Logger.exception("🙈", extra={"job": self, "context": context})
            raise e

        if fix:
            try:
                sql = sql.replace("{src}", "src")
                sql = fix_sql(sql)
                sql = sql.replace("`src`", "{src}")
                Logger.debug("query", extra={"job": self, "sql": sql, "target": "buffer"})
            except Exception as e:
                Logger.exception("🙈", extra={"job": self, "sql": sql})
                raise e
        else:
            Logger.debug("query", extra={"job": self, "sql": sql})

        return sql

    def append(self, src: Union[DataFrame, Table, str], **kwargs):
        """Process `src` and append the result to the target delta table.

        Creates the table first when it does not exist yet.
        """
        if not self.table.exists():
            self.create_table(src, **kwargs)

        df = self.get_data(src, **kwargs)
        if df:
            df = self.reorder_columns(df)

            name = f"{self.database}_{'_'.join(self.levels)}__append"
            create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))

            Logger.debug("append", extra={"job": self})
            df.write.format("delta").mode("append").save(self.table.deltapath.string)

    def overwrite(
        self,
        src: Union[DataFrame, Table, str],
        dynamic: Optional[bool] = False,
        **kwargs,
    ):
        """Process `src` and overwrite the target delta table.

        `dynamic` (forced on when `update_where` is given) switches to
        dynamic partition overwrite so only touched partitions are replaced.
        """
        if not self.table.exists():
            self.create_table(src, **kwargs)

        df = self.get_data(src, **kwargs)
        if df:
            df = self.reorder_columns(df)

            name = f"{self.database}_{'_'.join(self.levels)}__overwrite"
            create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))

            if not dynamic:
                if kwargs.get("update_where"):
                    dynamic = True

            if dynamic:
                Logger.debug("dynamic overwrite", extra={"job": self})
                (
                    df.write.format("delta")
                    .mode("overwrite")
                    .option("partitionOverwriteMode", "dynamic")
                    .save(self.table.deltapath.string)
                )
            else:
                Logger.debug("overwrite", extra={"job": self})
                df.write.format("delta").mode("overwrite").save(self.table.deltapath.string)
|
fabricks/cdc/cdc.py
ADDED
fabricks/cdc/nocdc.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
|
|
3
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
4
|
+
|
|
5
|
+
from fabricks.cdc.base import BaseCDC
|
|
6
|
+
from fabricks.metastore.table import Table
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NoCDC(BaseCDC):
    """Change-data-capture strategy that keeps no history: the target is
    simply replaced on each run."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Bind the strategy to `database`/`levels` with cdc mode "nocdc"."""
        super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)

    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
        """Fully reload the target from `src` (plain overwrite)."""
        self.overwrite(src=src, **kwargs)
|
fabricks/cdc/scd.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
from pyspark.sql import DataFrame
|
|
4
|
+
|
|
5
|
+
from fabricks.cdc.base import BaseCDC
|
|
6
|
+
from fabricks.metastore.table import Table
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class SCD(BaseCDC):
    """Shared behaviour for the slowly-changing-dimension strategies
    (SCD1/SCD2): complete reload, incremental update, delete-missing."""

    def delete_missing(self, src: Union[DataFrame, Table, str], **kwargs):
        """Merge `src` with reload semantics so rows absent from it are
        flagged as deleted in the target."""
        self.merge(src, **{**kwargs, "add_operation": "reload", "mode": "update"})

    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
        """Rebuild the whole target from `src`."""
        self.overwrite(src, **{**kwargs, "mode": "complete"})

    def update(self, src: Union[DataFrame, Table, str], **kwargs):
        """Incrementally merge `src` into the target."""
        self.merge(src, **{**kwargs, "mode": "update"})
|
fabricks/cdc/scd1.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pyspark.sql import SparkSession
|
|
4
|
+
|
|
5
|
+
from fabricks.cdc.scd import SCD
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SCD1(SCD):
    """Slowly-changing-dimension type 1: updates overwrite in place, no
    history rows are kept."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Bind the strategy to `database`/`levels` with cdc mode "scd1"."""
        super().__init__(database, *levels, change_data_capture="scd1", spark=spark)
|
fabricks/cdc/scd2.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from pyspark.sql import SparkSession
|
|
4
|
+
|
|
5
|
+
from fabricks.cdc.scd import SCD
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SCD2(SCD):
    """Slowly-changing-dimension type 2: every change closes the current row
    and inserts a new versioned one."""

    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        """Bind the strategy to `database`/`levels` with cdc mode "scd2"."""
        super().__init__(database, *levels, change_data_capture="scd2", spark=spark)
|
|
File without changes
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
{#-
  Delta MERGE statement for SCD1.
  Matched rows with __merge_condition 'upsert' are updated in place; rows
  flagged 'delete' are soft-deleted (__is_current/__is_deleted) or hard
  deleted depending on `soft_delete`; unmatched 'upsert' rows are inserted.
  NOTE(review): only "dataframe" and "view" formats emit the merge header,
  while the processor context appears to produce dataframe/table/query —
  confirm which formats reach this template.
-#}
{% if format == "dataframe" %}
merge into {{ tgt }} t using {{ "{src}" }} s
{% endif %}
{% if format == "view" %}
merge into {{ tgt }} t using {{ src }} s
{% endif %}
{% if has_key %}
on t.__key == s.__merge_key
{% else %}
on
{#- NOTE(review): no "and" is emitted between the key predicates below
    (the scd2 template appends one) — presumably repaired by the sqlglot
    fix step; confirm. -#}
{% for k in keys %}
t.{{ k }} <=> s.{{ k }}
{% endfor %}
{% endif %}
{% if has_source %}
and t.__source == s.__source
{% endif %}
{% if update_where %}
{{ update_where }}
{% endif %}
when matched
and __merge_condition == 'upsert' then
update
set
{% for f in fields %}
{{ f }} = s.{{f}},
{% endfor %}
{% if has_timestamp %}
__timestamp = s.__timestamp,
{% endif %}
{%if has_metadata%}
__metadata.updated = cast(current_date() as timestamp),
{%endif%}
{% if has_hash %}
__hash = s.__hash,
{% endif %}
{% if has_rescued_data %}
__rescued_data = s.__rescued_data,
{% endif %}
{%if soft_delete %}
__is_current = s.__is_current,
__is_deleted = s.__is_deleted,
{% endif %}
{%if soft_delete %}
-- soft delete
when matched
and __merge_condition == 'delete' then
update
set
__is_current = False,
__is_deleted = True,
{%if has_metadata%}
__metadata.updated = cast(current_date() as timestamp),
{%endif%}
{%else%}
-- delete
when matched
and __merge_condition == 'delete' then
delete
{% endif %}
when not matched
and __merge_condition == 'upsert' then
insert (
{% for c in columns %}
{{ c }},
{% endfor %}
)
values (
{% for c in columns %}
s.{{ c }},
{% endfor %}
)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{#-
  Delta MERGE statement for SCD2.
  Matched current rows are closed (__valid_to set one second before the
  incoming __valid_from, __is_current cleared) on 'update' or 'delete';
  unmatched 'insert' rows open a new version.
  NOTE(review): only "dataframe" and "view" formats emit the merge header —
  confirm against the formats the processor context produces.
-#}
{% if format == "dataframe" %}
merge into {{ tgt }} t using {{ "{src}" }} s
{% endif %}
{% if format == "view" %}
merge into {{ tgt }} t using {{ src }} s
{% endif %}
{% if has_key %}
on t.__key == s.__merge_key
{% else %}
on
{#- NOTE(review): each loop iteration ends with "and" and the next line
    also starts with "and t.__is_current", which looks like a doubled
    "and" in the non-has_key branch — presumably repaired by the sqlglot
    fix step; confirm. -#}
{% for k in keys %}
t.{{ k }} <=> s.{{ k }} and
{% endfor %}
{% endif %}
and t.__is_current
{% if has_source %}
and t.__source == s.__source
{% endif %}
when matched
and __merge_condition == 'update' then
update
set
__valid_to = s.__valid_from - interval 1 seconds,
__is_current = False,
{%if soft_delete %}
__is_deleted = False,
{% endif %}
{%if has_metadata%}
__metadata.updated = cast(current_date() as timestamp),
{%endif%}
when matched
and __merge_condition == 'delete' then
update
set
__valid_to = s.__valid_from - interval 1 seconds,
__is_current = False,
{%if soft_delete %}
__is_deleted = True,
{% endif %}
{%if has_metadata%}
__metadata.updated = cast(current_date() as timestamp),
{%endif%}
when not matched
and __merge_condition == 'insert' then
insert (
{% for c in columns %}
{{ c }},
{% endfor %}
)
values (
{% for c in columns %}
s.{{ c }},
{% endfor %}
)
|
|
File without changes
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
{#-
  __base CTE: the first stage of the generated query.
  Selects everything from the source (query/table/global temp view/
  dataframe placeholder), drops columns listed in `all_overwrite` so they
  can be recomputed, then adds the requested technical columns
  (__timestamp, __operation, __source, __hash, __key, __metadata) and
  applies the optional `filter_where` predicate.
-#}
{% import 'query/hash.sql.jinja' as h -%}

with
{% if format == "query" %} __query as ({{ src }}), {% endif %}
__base as (
select
*
{#- "except (...)" removes provided technical columns that are being
    overwritten by the add_* expressions below -#}
{% if all_overwrite %} except ({% for o in all_overwrite %}{{ o }}, {% endfor %}),
{% else %},
{% endif %}
{% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
{% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
{% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
{% if add_source %} cast('{{ add_source }}' as string) as __source, {% endif %}
{% if add_hash %} {{ h.hash(fields=hashes) }} as __hash, {% endif %}
{% if add_key %} {{ h.hash(fields=keys) }} as __key, {% endif %}
{% if add_metadata %}
struct(
{#- nocdc rows are never updated, so no `updated` member is emitted -#}
{% if cdc == "nocdc" %}current_timestamp() as inserted,
{% else %}current_timestamp() as inserted, current_timestamp() as updated,
{% endif %}
) as __metadata,
{% endif %}
{% if format == "query" %} from __query
{% else %}
{% if format == "table" %} from {{ src }}
{% endif %}
{% if format == "global_temp_view" %} from {{ src }}
{% endif %}
{% if format == "dataframe" %} from {{ "{src}" }}
{% endif %}
{% endif %}
{% if filter_where %} where {{ filter_where }} {% endif %}
),
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
{#-
  Debug banner: renders the full query-generation context as a SQL block
  comment so every generated query is self-describing. 🗹 lines are option
  flags (only emitted when set); 🗸 lines are derived values (src, fields,
  keys, hashes). Produces no executable SQL.
-#}
/*
⛷️🧀🍫🏔️

👀🏁
{%- if format %}
🗹 format: {{format}}{% endif %}
{%- if tgt %}
🗹 tgt: {{tgt}}{% endif %}
{%- if cdc %}
🗹 cdc: {{cdc}}{% endif %}
{%- if mode %}
🗹 mode: {{mode}}{% endif %}
{%- if filter %}
🗹 filter: {{filter}}{% endif %}
{%- if rectify %}
🗹 rectify: {{rectify}}{% endif %}
{%- if deduplicate %}
🗹 deduplicate: {{deduplicate}}{% endif %}
{%- if deduplicate_key %}
🗹 deduplicate_key: {{deduplicate_key}}{% endif %}
{%- if deduplicate_hash %}
🗹 deduplicate_hash: {{deduplicate_hash}}{% endif %}
{%- if soft_delete %}
🗹 soft_delete: {{soft_delete}}{% endif %}
{%- if fix_valid_from %}
🗹 fix_valid_from: {{fix_valid_from}}{% endif %}
{%- if has_rows %}
🗹 has_rows: {{has_rows}}{% endif %}
{%- if has_source %}
🗹 has_source: {{has_source}}{% endif %}
{%- if has_metadata %}
🗹 has_metadata: {{has_metadata}}{% endif %}
{%- if has_timestamp %}
🗹 has_timestamp: {{has_timestamp}}{% endif %}
{%- if has_identity %}
🗹 has_identity: {{has_identity}}{% endif %}
{%- if has_key %}
🗹 has_key: {{has_key}}{% endif %}
{%- if has_hash %}
🗹 has_hash: {{has_hash}}{% endif %}
{%- if has_order_by %}
🗹 has_order_by: {{has_order_by}}{% endif %}
{%- if has_rescued_data %}
🗹 has_rescued_data: {{has_rescued_data}}{% endif %}
{%- if add_metadata %}
🗹 add_metadata: {{add_metadata}}{% endif %}
{%- if add_timestamp %}
🗹 add_timestamp: {{add_timestamp}}{% endif %}
{%- if add_key %}
🗹 add_key: {{add_key}}{% endif %}
{%- if add_hash %}
🗹 add_hash: {{add_hash}}{% endif %}
{%- if add_operation %}
🗹 add_operation: {{add_operation}}{% endif %}
{%- if add_source %}
🗹 add_source: {{add_source}}{% endif %}
{%- if add_calculated_columns %}
🗹 add_calculated_columns: {{add_calculated_columns}}{% endif %}
{%- if order_duplicate_by %}
🗹 order_duplicate_by: {{order_duplicate_by}}{% endif %}
{%- if all_except %}
🗹 all_except: {{all_except}}{% endif %}
{%- if all_overwrite %}
🗹 all_overwrite: {{all_overwrite}}{% endif %}
{%- if filter_where %}
🗹 filter_where: {{filter_where}}{% endif %}
{%- if update_where %}
🗹 update_where: {{update_where}}{% endif %}
{%- if parent_filter %}
🗹 parent_filter: {{parent_filter}}{% endif %}
{%- if parent_rectify %}
🗹 parent_rectify: {{parent_rectify}}{% endif %}
{%- if parent_deduplicate_key %}
🗹 parent_deduplicate_key: {{parent_deduplicate_key}}{% endif %}
{%- if parent_deduplicate_hash %}
🗹 parent_deduplicate_hash: {{parent_deduplicate_hash}}{% endif %}
{%- if parent_cdc %}
🗹 parent_cdc: {{parent_cdc}}{% endif %}
{%- if parent_final %}
🗹 parent_final: {{parent_final}}{% endif %}
👀🏳️

👁️🏁
{%- if src %}
🗸 src: {{src}}{% endif %}
{%- if fields %}
🗸 fields: {{fields}}{% endif %}
{%- if keys %}
🗸 keys: {{keys}}{% endif %}
{%- if hashes %}
🗸 hashes: {{hashes}}{% endif %}
👁️🏳️

*/
