fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/cdc/base/processor.py
ADDED
@@ -0,0 +1,471 @@
+from __future__ import annotations
+
+from typing import Optional
+
+from jinja2 import Environment, PackageLoader
+from pyspark.sql import DataFrame
+
+from fabricks.cdc.base._types import AllowedSources
+from fabricks.cdc.base.generator import Generator
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.metastore.table import Table
+from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+class Processor(Generator):
+    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame:
+        if isinstance(src, DataFrameLike):
+            name = f"{self.qualified_name}__data"
+            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False), job=self)
+            src = f"select * from {global_temp_view}"
+
+        sql = self.get_query(src, fix=True, **kwargs)
+        DEFAULT_LOGGER.debug("exec query", extra={"label": self, "sql": sql})
+        return self.spark.sql(sql)
+
+    def get_query_context(self, src: AllowedSources, **kwargs) -> dict:
+        DEFAULT_LOGGER.debug("deduce query context", extra={"label": self})
+
+        if isinstance(src, DataFrameLike):
+            format = "dataframe"
+        elif isinstance(src, Table):
+            format = "table"
+        elif isinstance(src, str):
+            format = "query"
+        else:
+            raise ValueError(f"{src} not allowed")
+
+        inputs = self.get_columns(src, backtick=False, sort=False)
+        fields = [c for c in inputs if not c.startswith("__")]
+        keys = kwargs.get("keys", None)
+
+        mode = kwargs.get("mode", "complete")
+        if mode == "update":
+            tgt = str(self.table)
+        elif mode == "append" and "__timestamp" in inputs:
+            tgt = str(self.table)
+        else:
+            tgt = None
+
+        overwrite = []
+        exclude = kwargs.get("exclude", [])  # used by silver to exclude __operation from output if not update
+
+        order_duplicate_by = kwargs.get("order_duplicate_by", None)
+        if order_duplicate_by:
+            order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]
+
+        add_source = kwargs.get("add_source", None)
+        add_calculated_columns = kwargs.get("add_calculated_columns", [])
+        if add_calculated_columns:
+            raise ValueError("add_calculated_columns is not yet supported")
+        add_operation = kwargs.get("add_operation", None)
+        add_key = kwargs.get("add_key", None)
+        add_hash = kwargs.get("add_hash", None)
+        add_timestamp = kwargs.get("add_timestamp", None)
+        add_metadata = kwargs.get("add_metadata", None)
+
+        has_order_by = None if not order_duplicate_by else True
+
+        # determine which special columns are present or need to be added to the output
+        has_operation = add_operation or "__operation" in inputs
+        has_metadata = add_metadata or "__metadata" in inputs
+        has_source = add_source or "__source" in inputs
+        has_timestamp = add_timestamp or "__timestamp" in inputs
+        has_key = add_key or "__key" in inputs
+        has_hash = add_hash or "__hash" in inputs
+        has_identity = "__identity" in inputs
+        has_rescued_data = "__rescued_data" in inputs
+
+        soft_delete = kwargs.get("soft_delete", None)
+        delete_missing = kwargs.get("delete_missing", None)
+        slice = kwargs.get("slice", None)
+        rectify = kwargs.get("rectify", None)
+        deduplicate = kwargs.get("deduplicate", None)
+        deduplicate_key = kwargs.get("deduplicate_key", None)
+        deduplicate_hash = kwargs.get("deduplicate_hash", None)
+        correct_valid_from = kwargs.get("correct_valid_from", None)
+
+        try:
+            has_rows = self.table.rows > 0
+        except Exception:
+            has_rows = None
+
+        # only needed when comparing to current
+        # delete all records in current if there is no new data
+        if mode == "update" and delete_missing and self.change_data_capture in ["scd1", "scd2"]:
+            has_no_data = not self.has_data(src)
+        else:
+            has_no_data = None
+
+        # always deduplicate if not set for slowly changing dimensions
+        if self.slowly_changing_dimension:
+            if deduplicate is None:
+                deduplicate = True
+
+        # order duplicates by implies key deduplication
+        if order_duplicate_by:
+            deduplicate_key = True
+
+        if deduplicate:
+            deduplicate_key = True
+            deduplicate_hash = True
+
+        # if any deduplication is requested, deduplicate all
+        deduplicate = deduplicate or deduplicate_key or deduplicate_hash
+
+        # always rectify if not set
+        if self.slowly_changing_dimension:
+            if rectify is None:
+                rectify = True
+
+        # only correct valid_from on first load
+        if self.slowly_changing_dimension and mode == "update":
+            correct_valid_from = correct_valid_from and self.table.rows == 0
+
+        # override slice for incremental load if timestamp and rows are present
+        if slice is None:
+            if mode == "update" and has_timestamp and has_rows:
+                slice = "update"
+
+        # override slice for full load if update and table is empty
+        if slice == "update" and not has_rows:
+            slice = None
+
+        # override operation if added and found in df
+        if add_operation and "__operation" in inputs:
+            overwrite.append("__operation")
+
+        # override timestamp if added and found in df
+        if add_timestamp and "__timestamp" in inputs:
+            overwrite.append("__timestamp")
+
+        # override key if added and found in df (key needed for merge)
+        if add_key and "__key" in inputs:
+            overwrite.append("__key")
+
+        # override hash if added and found in df (hash needed to identify fake updates)
+        if add_hash and "__hash" in inputs:
+            overwrite.append("__hash")
+
+        # override metadata if added and found in df
+        if add_metadata and "__metadata" in inputs:
+            overwrite.append("__metadata")
+
+        advanced_ctes = ((rectify or deduplicate) and self.slowly_changing_dimension) or self.slowly_changing_dimension
+        advanced_deduplication = advanced_ctes and deduplicate
+
+        # add key and hash if not added nor found in df but exclude from output
+        # needed for merge
+        if mode == "update" or advanced_ctes or deduplicate:
+            if not add_key and "__key" not in inputs:
+                add_key = True
+                exclude.append("__key")
+
+            if not add_hash and "__hash" not in inputs:
+                add_hash = True
+                exclude.append("__hash")
+
+        # add operation and timestamp if not added nor found in df but exclude from output
+        # needed for deduplication and/or rectification
+        if advanced_ctes:
+            if not add_operation and "__operation" not in inputs:
+                add_operation = "upsert"
+                exclude.append("__operation")
+
+            if not add_timestamp and "__timestamp" not in inputs:
+                add_timestamp = True
+                exclude.append("__timestamp")
+
+        if add_key:
+            keys = keys if keys is not None else [f for f in fields]
+            if isinstance(keys, str):
+                keys = [keys]
+            if has_source:
+                keys.append("__source")
+
+        hashes = None
+        if add_hash:
+            hashes = [f for f in fields]
+            if "__operation" in inputs or add_operation:
+                hashes.append("__operation")
+
+        if self.change_data_capture == "nocdc":
+            intermediates = [i for i in inputs]
+            outputs = [i for i in inputs]
+        else:
+            intermediates = [f for f in fields]
+            outputs = [f for f in fields]
+
+        if has_operation:
+            if "__operation" not in outputs:
+                outputs.append("__operation")
+        if has_timestamp:
+            if "__timestamp" not in outputs:
+                outputs.append("__timestamp")
+        if has_key:
+            if "__key" not in outputs:
+                outputs.append("__key")
+        if has_hash:
+            if "__hash" not in outputs:
+                outputs.append("__hash")
+
+        if has_metadata:
+            if "__metadata" not in outputs:
+                outputs.append("__metadata")
+            if "__metadata" not in intermediates:
+                intermediates.append("__metadata")
+        if has_source:
+            if "__source" not in outputs:
+                outputs.append("__source")
+            if "__source" not in intermediates:
+                intermediates.append("__source")
+        if has_identity:
+            if "__identity" not in outputs:
+                outputs.append("__identity")
+            if "__identity" not in intermediates:
+                intermediates.append("__identity")
+        if has_rescued_data:
+            if "__rescued_data" not in outputs:
+                outputs.append("__rescued_data")
+            if "__rescued_data" not in intermediates:
+                intermediates.append("__rescued_data")
+
+        if soft_delete:
+            if "__is_deleted" not in outputs:
+                outputs.append("__is_deleted")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if self.change_data_capture == "scd2":
+            if "__valid_from" not in outputs:
+                outputs.append("__valid_from")
+            if "__valid_to" not in outputs:
+                outputs.append("__valid_to")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if advanced_ctes:
+            if "__operation" not in intermediates:
+                intermediates.append("__operation")
+            if "__timestamp" not in intermediates:
+                intermediates.append("__timestamp")
+
+            # needed for deduplication and/or rectification
+            # might need __operation or __source
+            if "__key" not in intermediates:
+                intermediates.append("__key")
+            if "__hash" not in intermediates:
+                intermediates.append("__hash")
+
+        outputs = [o for o in outputs if o not in exclude]
+        outputs = self.sort_columns(outputs)
+
+        parent_slice = None
+        if slice:
+            parent_slice = "__base"
+
+        parent_deduplicate_key = None
+        if deduplicate_key:
+            if slice:
+                parent_deduplicate_key = "__sliced"
+            else:
+                parent_deduplicate_key = "__base"
+
+        parent_rectify = None
+        if rectify:
+            if deduplicate_key:
+                parent_rectify = "__deduplicated_key"
+            elif slice:
+                parent_rectify = "__sliced"
+            else:
+                parent_rectify = "__base"
+
+        parent_deduplicate_hash = None
+        if deduplicate_hash:
+            if rectify:
+                parent_deduplicate_hash = "__rectified"
+            elif deduplicate_key:
+                parent_deduplicate_hash = "__deduplicated_key"
+            elif slice:
+                parent_deduplicate_hash = "__sliced"
+            else:
+                parent_deduplicate_hash = "__base"
+
+        parent_cdc = None
+        if deduplicate_hash:
+            parent_cdc = "__deduplicated_hash"
+        elif rectify:
+            parent_cdc = "__rectified"
+        elif deduplicate_key:
+            parent_cdc = "__deduplicated_key"
+        elif slice:
+            parent_cdc = "__sliced"
+        else:
+            parent_cdc = "__base"
+
+        parent_final = "__final"
+
+        return {
+            "src": src,
+            "format": format,
+            "tgt": tgt,
+            "cdc": self.change_data_capture,
+            "mode": mode,
+            # fields
+            "inputs": inputs,
+            "intermediates": intermediates,
+            "outputs": outputs,
+            "fields": fields,
+            "keys": keys,
+            "hashes": hashes,
+            # options
+            "delete_missing": delete_missing,
+            "advanced_deduplication": advanced_deduplication,
+            # cte's
+            "slice": slice,
+            "rectify": rectify,
+            "deduplicate": deduplicate,
+            "deduplicate_key": deduplicate_key,
+            "deduplicate_hash": deduplicate_hash,
+            # has
+            "has_no_data": has_no_data,
+            "has_rows": has_rows,
+            "has_source": has_source,
+            "has_metadata": has_metadata,
+            "has_timestamp": has_timestamp,
+            "has_operation": has_operation,
+            "has_identity": has_identity,
+            "has_key": has_key,
+            "has_hash": has_hash,
+            "has_order_by": has_order_by,
+            "has_rescued_data": has_rescued_data,
+            # default add
+            "add_metadata": add_metadata,
+            "add_timestamp": add_timestamp,
+            "add_key": add_key,
+            "add_hash": add_hash,
+            # value add
+            "add_operation": add_operation,
+            "add_source": add_source,
+            "add_calculated_columns": add_calculated_columns,
+            # extra
+            "order_duplicate_by": order_duplicate_by,
+            "soft_delete": soft_delete,
+            "correct_valid_from": correct_valid_from,
+            # overwrite
+            "overwrite": overwrite,
+            # filter
+            "slices": None,
+            "sources": None,
+            "filter_where": kwargs.get("filter_where"),
+            "update_where": kwargs.get("update_where"),
+            # parents
+            "parent_slice": parent_slice,
+            "parent_rectify": parent_rectify,
+            "parent_deduplicate_key": parent_deduplicate_key,
+            "parent_deduplicate_hash": parent_deduplicate_hash,
+            "parent_cdc": parent_cdc,
+            "parent_final": parent_final,
+        }
+
+    def fix_sql(self, sql: str) -> str:
+        try:
+            sql = sql.replace("{src}", "src")
+            sql = fix_sql(sql)
+            sql = sql.replace("`src`", "{src}")
+
+            DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql, "target": "buffer"})
+            return sql
+
+        except Exception as e:
+            DEFAULT_LOGGER.exception("fail to fix sql query", extra={"label": self, "sql": sql})
+            raise e
+
+    def fix_context(self, context: dict, fix: Optional[bool] = True, **kwargs) -> dict:
+        environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+        template = environment.get_template("filter.sql.jinja")
+
+        try:
+            sql = template.render(**context)
+            if fix:
+                DEFAULT_LOGGER.debug("fix context", extra={"label": self, "sql": sql})
+                sql = self.fix_sql(sql)
+
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "context": context})
+            raise e
+
+        row = self.spark.sql(sql).collect()[0]
+        assert row.slices, "no slices found"
+
+        context["slices"] = row.slices
+        if context.get("has_source"):
+            assert row.sources, "no sources found"
+            context["sources"] = row.sources
+
+        return context
+
+    def get_query(self, src: AllowedSources, fix: Optional[bool] = True, **kwargs) -> str:
+        context = self.get_query_context(src=src, **kwargs)
+        environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+
+        try:
+            if context.get("slice"):
+                context = self.fix_context(context, fix=fix, **kwargs)
+
+            template = environment.get_template("query.sql.jinja")
+
+            sql = template.render(**context)
+            if fix:
+                sql = self.fix_sql(sql)
+            else:
+                DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql})
+
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "context": context})
+            DEFAULT_LOGGER.exception("fail to generate sql query", extra={"label": self, "context": context})
+            raise e
+
+        return sql
+
+    def append(self, src: AllowedSources, **kwargs):
+        if not self.table.registered:
+            self.create_table(src, **kwargs)
+
+        df = self.get_data(src, **kwargs)
+        df = self.reorder_dataframe(df)
+
+        name = f"{self.qualified_name}__append"
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        append = f"insert into table {self.table} by name select * from global_temp.{name}"
+
+        DEFAULT_LOGGER.debug("exec append", extra={"label": self, "sql": append})
+        self.spark.sql(append)
+
+    def overwrite(
+        self,
+        src: AllowedSources,
+        dynamic: Optional[bool] = False,
+        **kwargs,
+    ):
+        if not self.table.registered:
+            self.create_table(src, **kwargs)
+
+        df = self.get_data(src, **kwargs)
+        df = self.reorder_dataframe(df)
+
+        if not dynamic:
+            if kwargs.get("update_where"):
+                dynamic = True
+
+        if dynamic:
+            self.spark.sql("set spark.sql.sources.partitionOverwriteMode = dynamic")
+
+        name = f"{self.qualified_name}__overwrite"
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        overwrite = f"insert overwrite table {self.table} by name select * from global_temp.{name}"
+
+        DEFAULT_LOGGER.debug("excec overwrite", extra={"label": self, "sql": overwrite})
+        self.spark.sql(overwrite)
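The kwargs consumed by `get_query_context` above are the knobs that callers pass through to `merge`, `append` and `overwrite`. A minimal sketch of how they might be passed, assuming `cdc` is one of the concrete CDC objects defined further down in this diff (e.g. `SCD1`); the source query, table and column names are invented placeholders, not names from the package:

    # Sketch only: `cdc`, the source query and the column names are hypothetical.
    sql = cdc.get_query(
        "select * from bronze.customers",          # str source -> format = "query"
        keys=["customer_id"],                      # hashed into __key (plus __source when present)
        mode="update",                             # "complete" (default), "update" or "append"
        deduplicate=True,                          # implies both deduplicate_key and deduplicate_hash
        order_duplicate_by={"load_time": "desc"},  # rendered as "load_time desc" in the key-dedup CTE
        soft_delete=True,                          # adds __is_deleted / __is_current to the outputs
        filter_where="country = 'BE'",             # appended to the where clause of the __base CTE
    )
    df = cdc.get_data("select * from bronze.customers", keys=["customer_id"], mode="update")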
fabricks/cdc/cdc.py
ADDED
fabricks/cdc/nocdc.py
ADDED
@@ -0,0 +1,20 @@
+from typing import Optional
+
+from pyspark.sql import SparkSession
+
+from fabricks.cdc.scd import SCD
+
+
+class NoCDC(SCD):
+    def __init__(
+        self,
+        database: str,
+        *levels: str,
+        spark: Optional[SparkSession] = None,
+    ):
+        super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)
+
+    def delete_missing(self, src, **kwargs):
+        kwargs["delete_missing"] = True
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)
fabricks/cdc/scd.py
ADDED
@@ -0,0 +1,22 @@
+from typing import Union
+
+from pyspark.sql import DataFrame
+
+from fabricks.cdc.base import BaseCDC
+from fabricks.metastore.table import Table
+
+
+class SCD(BaseCDC):
+    def delete_missing(self, src: Union[DataFrame, Table, str], **kwargs):
+        kwargs["add_operation"] = "reload"
+        kwargs["delete_missing"] = True
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)
+
+    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
+        kwargs["mode"] = "complete"
+        self.overwrite(src, **kwargs)
+
+    def update(self, src: Union[DataFrame, Table, str], **kwargs):
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)
fabricks/cdc/scd1.py
ADDED
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from pyspark.sql import SparkSession
+
+from fabricks.cdc.scd import SCD
+
+
+class SCD1(SCD):
+    def __init__(
+        self,
+        database: str,
+        *levels: str,
+        spark: Optional[SparkSession] = None,
+    ):
+        super().__init__(database, *levels, change_data_capture="scd1", spark=spark)
fabricks/cdc/scd2.py
ADDED
@@ -0,0 +1,15 @@
+from typing import Optional
+
+from pyspark.sql import SparkSession
+
+from fabricks.cdc.scd import SCD
+
+
+class SCD2(SCD):
+    def __init__(
+        self,
+        database: str,
+        *levels: str,
+        spark: Optional[SparkSession] = None,
+    ):
+        super().__init__(database, *levels, change_data_capture="scd2", spark=spark)
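The three subclasses above only pin down `change_data_capture`; the actual entry points (`complete`, `update`, `delete_missing`) come from `SCD`, which delegates to the merge/overwrite machinery in processor.py. A hypothetical usage sketch; the database, levels and source queries are placeholders, not names from the package:

    from fabricks.cdc.nocdc import NoCDC
    from fabricks.cdc.scd2 import SCD2

    dim = SCD2("gold", "dim_customer")                         # database plus levels, per the constructors above
    dim.complete("select * from silver.customer")              # mode="complete" -> overwrite
    dim.update("select * from silver.customer_changes", keys=["customer_id"])  # mode="update" -> merge
    dim.delete_missing("select * from silver.customer")        # injects __operation = 'reload' before merging

    flat = NoCDC("gold", "fact_sales")
    flat.delete_missing("select * from silver.sales")           # NoCDC variant skips the 'reload' operation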
fabricks/cdc/templates/__init__.py
File without changes
fabricks/cdc/templates/ctes/base.sql.jinja
ADDED
@@ -0,0 +1,35 @@
+{% import 'macros/hash.sql.jinja' as h -%}
+
+with
+{% if format == "query" %} __query as ({{ src }}), {% endif %}
+__base as (
+    select
+        *
+        {% if overwrite %}
+        -- will be overwritten below
+        except ({% for o in overwrite %}{{ o }}, {% endfor %})
+        {% endif %},
+        {% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
+        {% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
+        {% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
+        {% if add_source %} cast('{{ add_source }}' as string) as __source, {% endif %}
+        {% if add_hash %} {{ h.add_hash(fields=hashes) }} as __hash, {% endif %}
+        {% if add_key %} {{ h.add_hash(fields=keys) }} as __key, {% endif %}
+        {% if add_metadata %}
+        struct(
+            {% if cdc == "nocdc" %}current_timestamp() as inserted,
+            {% else %}current_timestamp() as inserted, current_timestamp() as updated,
+            {% endif %}
+        ) as __metadata,
+        {% endif %}
+    {% if format == "query" %} from __query
+    {% else %}
+        {% if format == "table" %} from {{ src }}
+        {% endif %}
+        {% if format == "global_temp_view" %} from {{ src }}
+        {% endif %}
+        {% if format == "dataframe" %} from {{ "{src}" }}
+        {% endif %}
+    {% endif %}
+    {% if filter_where %} where {{ filter_where }} {% endif %}
+),
fabricks/cdc/templates/ctes/current.sql.jinja
ADDED
@@ -0,0 +1,28 @@
+{% import 'macros/hash.sql.jinja' as h -%}
+
+__current as (
+    select
+        {% for i in intermediates %}
+        {% if i == "__timestamp" %}
+            {% if add_timestamp %} cast('0001-01-01' as timestamp) as __timestamp,
+            {% elif cdc == "nocdc" %} __timestamp,
+            {% elif cdc == "scd1" %} __timestamp,
+            {% elif cdc == "scd2" %} __valid_from as __timestamp,
+            {% endif %}
+        {% elif i == "__operation" %}
+            {% if has_no_data %} 'delete' as __operation, {% else %} 'current' as __operation, {% endif %}
+        {% elif i == "__hash" %}
+            {% if add_hash %} {{ h.add_hash(fields=hashes) }} as __hash, {% else %} __hash, {% endif %}
+        {% elif i == "__key" %}
+            {% if add_key %} {{ h.add_key(fields=keys) }} as __key, {% else %} __key, {% endif %}
+        {% else %} `{{ i }}`,
+        {% endif %}
+        {% endfor %}
+    from {{ tgt }} t
+    where
+        true
+        {% if cdc == "scd2" %} and __is_current {% endif %}
+        {% if cdc == "scd1" %} {% if soft_delete %} and __is_current {% endif %} {% endif %}
+        {% if sources %} and ({{ sources }}) {% endif %}
+        {% if update_where %} and {{ update_where }} {% endif %}
+),
fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja
ADDED
@@ -0,0 +1,32 @@
+{% if advanced_deduplication %}
+__deduplicate_hash as (
+    select
+        *,
+        lag(__hash) over (
+            partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
+        ) as __deduplicate_hash_previous__hash,
+        lag(__operation) over (
+            partition by {% if has_source %} __source, {% endif %} __key order by __timestamp asc
+        ) as __deduplicate_hash_previous_operation
+    from {{ parent_deduplicate_hash }}
+    where true
+),
+__deduplicated_hash as (
+    select *
+    from __deduplicate_hash
+    where
+        true
+        and not (
+            __hash <=> __deduplicate_hash_previous__hash and __operation <=> __deduplicate_hash_previous_operation
+        )
+),
+{% else %}
+__deduplicated_hash as (
+    select *
+    from {{ parent_deduplicate_hash }}
+    where true
+    qualify
+        not lag(__hash) over (partition by {% if has_source %} __source, {% endif %} __key order by null)
+        <=> __hash
+),
+{% endif %}
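Outside of Jinja, the null-safe `lag` comparison in the advanced branch can be read as the following PySpark sketch. It is purely illustrative (the package renders and executes the SQL above rather than building DataFrames like this), and it ignores the optional `__source` partition column:

    from pyspark.sql import SparkSession, Window
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    rows = [
        (1, "h1", "upsert", 1),
        (1, "h1", "upsert", 2),  # same __hash and __operation as the previous row -> dropped
        (1, "h2", "upsert", 3),
    ]
    df = spark.createDataFrame(rows, ["__key", "__hash", "__operation", "__timestamp"])

    w = Window.partitionBy("__key").orderBy(F.col("__timestamp").asc())
    deduplicated = (
        df.withColumn("__prev_hash", F.lag("__hash").over(w))
        .withColumn("__prev_operation", F.lag("__operation").over(w))
        .where(
            ~(
                F.col("__hash").eqNullSafe(F.col("__prev_hash"))
                & F.col("__operation").eqNullSafe(F.col("__prev_operation"))
            )
        )
        .drop("__prev_hash", "__prev_operation")
    )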
fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja
ADDED
@@ -0,0 +1,31 @@
+{% if advanced_deduplication %}
+__deduplicate_key as (
+    select
+        *,
+        row_number() over (
+            partition by {% if has_source %} __source, {% endif %} __key, __timestamp
+            order by
+                /* prioritize delete over upsert */
+                __operation asc,
+                {% if has_order_by %} {% for o in order_duplicate_by %} {{ o }}, {% endfor %} {% endif %}
+        ) as __deduplicate_key_rn
+    from {{ parent_deduplicate_key }}
+    where true
+),
+__deduplicated_key as (select *, from __deduplicate_key where __deduplicate_key_rn == 1),
+{% else %}
+__deduplicated_key as (
+    select *
+    from {{ parent_deduplicate_key }}
+    where true
+    qualify
+        row_number() over (
+            partition by {% if has_source %} __source, {% endif %} __key
+            order by
+                {% if has_order_by %} {% for o in order_duplicate_by %} {{ o }}, {% endfor %}
+                {% else %} null
+                {% endif %}
+        )
+        == 1
+),
+{% endif %}
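Because the templates ship inside the wheel, a single fragment can be rendered in isolation for inspection. A sketch, assuming the package is installed; the loader call mirrors `Processor.get_query`, but the context values here are minimal placeholders rather than the full dictionary produced by `get_query_context`:

    from jinja2 import Environment, PackageLoader

    # Same loader as Processor.get_query; the fragment path points at the template above.
    env = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
    fragment = env.get_template("ctes/deduplicate_key.sql.jinja")

    print(
        fragment.render(
            advanced_deduplication=False,  # take the simple qualify/row_number branch
            has_source=False,
            has_order_by=None,             # falls back to "order by null"
            parent_deduplicate_key="__base",
        )
    )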