fabricks-3.0.11-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry; since every file in this release is new, the listing below amounts to the full contents of the wheel.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/processor.py
@@ -0,0 +1,249 @@
from abc import abstractmethod
from functools import partial
from typing import Optional

from pyspark.sql import DataFrame
from pyspark.sql.functions import expr

from fabricks.context import IS_TYPE_WIDENING, IS_UNITY_CATALOG, SECRET_SCOPE
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base.exception import (
    PostRunCheckException,
    PostRunCheckWarning,
    PostRunInvokeException,
    PreRunCheckException,
    PreRunCheckWarning,
    PreRunInvokeException,
    SchemaDriftException,
    SkipRunCheckWarning,
)
from fabricks.core.jobs.base.invoker import Invoker
from fabricks.utils.write import write_stream


class Processor(Invoker):
    def filter_where(self, df: DataFrame) -> DataFrame:
        f = self.options.job.get("filter_where")

        if f:
            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
            df = df.where(f"{f}")

        return df

    def encrypt(self, df: DataFrame) -> DataFrame:
        encrypted_columns = self.options.job.get_list("encrypted_columns")
        if encrypted_columns:
            if not IS_UNITY_CATALOG:
                from databricks.sdk.runtime import dbutils

                key = dbutils.secrets.get(scope=SECRET_SCOPE, key="encryption-key")
            else:
                import os

                key = os.environ["FABRICKS_ENCRYPTION_KEY"]

            assert key, "key not found"

            for col in encrypted_columns:
                DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
                df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))

        return df

    def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
        """
        Restores the processor to a specific version and batch.

        Args:
            last_version (Optional[str]): The last version to restore to. If None, no version restore will be performed.
            last_batch (Optional[str]): The last batch to restore to. If None, no batch restore will be performed.
        """
        if self.persist:
            if last_version is not None:
                _last_version = int(last_version)
                if self.table.get_last_version() > _last_version:
                    self.table.restore_to_version(_last_version)

            if last_batch is not None:
                current_batch = int(last_batch) + 1
                self.rm_commit(current_batch)

                assert last_batch == self.table.get_property("fabricks.last_batch")
                assert self.paths.commits.joinpath(last_batch).exists()

    def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
        DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
        if batch is not None:
            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"label": self})

        df = self.base_transform(df)

        diffs = self.get_schema_differences(df)
        if diffs:
            if self.schema_drift or kwargs.get("reload", False):
                DEFAULT_LOGGER.warning("schema drifted", extra={"label": self, "diffs": diffs})
                self.update_schema(df=df)

            else:
                only_type_widening_compatible = all(d.type_widening_compatible for d in diffs if d.status == "changed")
                if only_type_widening_compatible and self.table.type_widening_enabled and IS_TYPE_WIDENING:
                    self.update_schema(df=df, widen_types=True)
                else:
                    raise SchemaDriftException.from_diffs(str(self), diffs)

        self.for_each_batch(df, batch, **kwargs)

        if batch is not None:
            self.table.set_property("fabricks.last_batch", batch)

        self.table.create_restore_point()
        DEFAULT_LOGGER.debug("end (for each batch)", extra={"label": self})

    def for_each_run(self, **kwargs):
        DEFAULT_LOGGER.debug("start (for each run)", extra={"label": self})

        if self.virtual:
            self.create_or_replace_view()

        elif self.persist:
            assert self.table.registered, f"{self} is not registered"

            df = self.get_data(stream=self.stream, **kwargs)
            assert df is not None, "no data"

            partial(self._for_each_batch, **kwargs)

            if self.stream:
                DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
                write_stream(
                    df,
                    checkpoints_path=self.paths.checkpoints,
                    func=self._for_each_batch,
                    timeout=self.timeout,
                )
            else:
                self._for_each_batch(df, **kwargs)

        else:
            raise ValueError(f"{self.mode} - not allowed")

        DEFAULT_LOGGER.debug("end (for each run)", extra={"label": self})

    def run(
        self,
        retry: Optional[bool] = True,
        schedule: Optional[str] = None,
        schedule_id: Optional[str] = None,
        invoke: Optional[bool] = True,
        reload: Optional[bool] = None,
        vacuum: Optional[bool] = None,
        optimize: Optional[bool] = None,
        compute_statistics: Optional[bool] = None,
    ):
        """
        Run the processor.

        Args:
            retry (bool, optional): Whether to retry the execution in case of failure. Defaults to True.
            schedule (str, optional): The schedule to run the processor on. Defaults to None.
            schedule_id (str, optional): The ID of the schedule. Defaults to None.
            invoke (bool, optional): Whether to invoke pre-run and post-run methods. Defaults to True.
        """
        last_version = None
        last_batch = None
        exception = None

        if self.persist:
            last_version = self.table.get_property("fabricks.last_version")
            if last_version is not None:
                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"label": self})
            else:
                last_version = str(self.table.last_version)

            last_batch = self.table.get_property("fabricks.last_batch")
            if last_batch is not None:
                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"label": self})

        try:
            DEFAULT_LOGGER.info("start (run)", extra={"label": self})

            if reload:
                DEFAULT_LOGGER.debug("force reload", extra={"label": self})

            if invoke:
                self.invoke_pre_run(schedule=schedule)

            if not reload:
                self.check_skip_run()

            try:
                self.check_pre_run()
            except PreRunCheckWarning as e:
                exception = e

            self.for_each_run(schedule=schedule, reload=reload)

            try:
                self.check_post_run()
            except PostRunCheckWarning as e:
                exception = e

            self.check_post_run_extra()

            if invoke:
                self.invoke_post_run(schedule=schedule)

            if exception:
                raise exception

            if vacuum is None:
                vacuum = self.options.job.get("vacuum", False)
            if optimize is None:
                optimize = self.options.job.get("optimize", False)
            if compute_statistics is None:
                compute_statistics = self.options.job.get("compute_statistics", False)

            if vacuum or optimize or compute_statistics:
                self.maintain(
                    compute_statistics=compute_statistics,
                    optimize=optimize,
                    vacuum=vacuum,
                )

            DEFAULT_LOGGER.info("end (run)", extra={"label": self})

        except SkipRunCheckWarning as e:
            DEFAULT_LOGGER.warning("skip run", extra={"label": self})
            raise e

        except (PreRunCheckWarning, PostRunCheckWarning) as e:
            DEFAULT_LOGGER.warning("fail to pass warning check", extra={"label": self})
            raise e

        except (PreRunInvokeException, PostRunInvokeException) as e:
            DEFAULT_LOGGER.exception("fail to run invoker", extra={"label": self})
            raise e

        except (PreRunCheckException, PostRunCheckException) as e:
            DEFAULT_LOGGER.exception("fail to pass check", extra={"label": self})
            self.restore(last_version, last_batch)
            raise e

        except AssertionError as e:
            DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
            self.restore(last_version, last_batch)
            raise e

        except Exception as e:
            if not self.stream or not retry:
                DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
                self.restore(last_version, last_batch)
                raise e

            else:
                DEFAULT_LOGGER.warning("retry to run", extra={"label": self})
                self.run(retry=False, schedule_id=schedule_id, schedule=schedule)

    @abstractmethod
    def overwrite(self) -> None: ...
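Processor.run() snapshots the table's last Delta version and the fabricks.last_batch property before doing any work, and when a check or the run itself fails it calls restore() so the target table is left as it was. Below is a minimal, self-contained sketch of that snapshot-then-restore-on-failure pattern; TinyTable and its toy version list are illustrative stand-ins, not part of fabricks.

from typing import List


class TinyTable:
    """Toy stand-in for a versioned Delta table (illustrative only)."""

    def __init__(self) -> None:
        self.versions: List[List[str]] = [[]]  # versions[i] = rows as of version i

    @property
    def last_version(self) -> int:
        return len(self.versions) - 1

    def append(self, batch: List[str]) -> None:
        # every write creates a new version, like a Delta commit
        self.versions.append(self.versions[-1] + batch)

    def restore_to_version(self, version: int) -> None:
        # drop every version written after the snapshot
        del self.versions[version + 1 :]


def run(table: TinyTable, batch: List[str], fail: bool = False) -> None:
    last_version = table.last_version  # snapshot before the run, as Processor.run does
    try:
        table.append(batch)
        if fail:
            raise RuntimeError("post-run check failed")
    except Exception:
        if table.last_version > last_version:
            table.restore_to_version(last_version)  # mirrors Processor.restore
        raise


table = TinyTable()
run(table, ["a", "b"])
try:
    run(table, ["c"], fail=True)
except RuntimeError:
    pass
assert table.last_version == 1 and table.versions[-1] == ["a", "b"]

In the real class the snapshot comes from the Delta table history and table properties, and for streaming jobs a generic failure is retried exactly once (the recursive call passes retry=False).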
fabricks/core/jobs/bronze.py
@@ -0,0 +1,395 @@
from typing import Optional, Sequence, Union, cast

from pyspark.sql import DataFrame
from pyspark.sql.functions import expr, lit, md5
from pyspark.sql.types import Row

from fabricks.cdc.nocdc import NoCDC
from fabricks.context import VARIABLES
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base._types import JobDependency, TBronze
from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.parsers import BaseParser
from fabricks.core.parsers.get_parser import get_parser
from fabricks.core.parsers.utils import clean
from fabricks.metastore.view import create_or_replace_global_temp_view
from fabricks.utils.helpers import concat_ws
from fabricks.utils.path import Path
from fabricks.utils.read import read


class Bronze(BaseJob):
    def __init__(
        self,
        step: TBronze,
        topic: Optional[str] = None,
        item: Optional[str] = None,
        job_id: Optional[str] = None,
        conf: Optional[Union[dict, Row]] = None,
    ):  # type: ignore
        super().__init__(
            "bronze",
            step=step,
            topic=topic,
            item=item,
            job_id=job_id,
            conf=conf,
        )

    _parser: Optional[BaseParser] = None

    @property
    def stream(self) -> bool:
        return self.mode not in ["register"]

    @property
    def schema_drift(self) -> bool:
        return True

    @property
    def persist(self) -> bool:
        return self.mode in ["append", "register"]

    @property
    def virtual(self) -> bool:
        return False

    @classmethod
    def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TBronze, step), job_id=job_id, conf=conf)

    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TBronze, step), topic=topic, item=item, conf=conf)

    @property
    def data_path(self) -> Path:
        uri = self.options.job.get("uri")
        assert uri is not None, "no uri provided in options"
        path = Path.from_uri(uri, regex=VARIABLES)
        return path

    def get_dependencies(self, *s) -> Sequence[JobDependency]:
        dependencies = []

        parents = self.options.job.get_list("parents")
        if parents:
            for p in parents:
                dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))

        return dependencies

    def register_external_table(self):
        options = self.conf.parser_options  # type: ignore
        if options:
            file_format = options.get("file_format")
        else:
            file_format = "delta"

        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"label": self})

        try:
            df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
            assert len(df.columns) > 1, "external table must have at least one column"
        except Exception as e:
            DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
            raise e

        self.spark.sql(
            f"create table if not exists {self.qualified_name} using {file_format} location '{self.data_path}'"
        )

    def drop_external_table(self):
        DEFAULT_LOGGER.warning("remove external table from metastore", extra={"label": self})
        self.spark.sql(f"drop table if exists {self.qualified_name}")

    def compute_statistics_external_table(self):
        DEFAULT_LOGGER.debug("compute statistics (external table)", extra={"label": self})
        self.spark.sql(f"analyze table {self.qualified_name} compute statistics")

    def vacuum_external_table(self, retention_hours: Optional[int] = 168):
        from delta import DeltaTable

        DEFAULT_LOGGER.debug("vacuum (external table)", extra={"label": self})
        try:
            dt = DeltaTable.forPath(self.spark, self.data_path.string)
            self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
            dt.vacuum(retention_hours)
        finally:
            self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")

    def maintain_external_table(
        self,
        vacuum: Optional[bool] = True,
        compute_statistics: Optional[bool] = True,
    ):
        DEFAULT_LOGGER.debug("maintain (external table)", extra={"label": self})
        if vacuum:
            self.vacuum_external_table()

        if compute_statistics:
            self.compute_statistics_external_table()

    @property
    def parser(self) -> BaseParser:
        if not self._parser:
            assert self.mode not in ["register"], f"{self.mode} not allowed"

            name = self.options.job.get("parser")
            assert name is not None, "parser not found"

            options = self.conf.parser_options or None  # type: ignore
            p = get_parser(name, options)

            self._parser = p

        return self._parser

    def parse(self, stream: bool = False) -> DataFrame:
        """
        Parses the data based on the specified mode and returns a DataFrame.

        Args:
            stream (bool, optional): Indicates whether the data should be read as a stream. Defaults to False.

        Returns:
            DataFrame: The parsed data as a DataFrame.
        """
        if self.mode == "register":
            if stream:
                df = read(
                    stream=stream,
                    path=self.data_path,
                    file_format="delta",
                    # spark=self.spark, (BUG)
                )
            else:
                df = self.spark.sql(f"select * from {self}")

            # cleaning should done by parser
            df = clean(df)

        else:
            df = self.parser.get_data(
                stream=stream,
                data_path=self.data_path,
                schema_path=self.paths.schema,
                spark=self.spark,
            )

        return df

    def get_data(
        self,
        stream: bool = False,
        transform: Optional[bool] = False,
        schema_only: Optional[bool] = False,
        **kwargs,
    ) -> Optional[DataFrame]:
        df = self.parse(stream)
        df = self.filter_where(df)
        df = self.encrypt(df)

        if transform:
            df = self.base_transform(df)

        if schema_only:
            df = df.where("1 == 2")

        return df

    def add_calculated_columns(self, df: DataFrame) -> DataFrame:
        calculated_columns = self.options.job.get_dict("calculated_columns")

        if calculated_columns:
            for key, value in calculated_columns.items():
                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"label": self})
                df = df.withColumn(key, expr(f"{value}"))

        return df

    def add_hash(self, df: DataFrame) -> DataFrame:
        if "__hash" not in df.columns:
            fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
            DEFAULT_LOGGER.debug("add hash", extra={"label": self})

            if "__operation" in df.columns:
                fields += ["__operation == 'delete'"]

            if "__source" in df.columns:
                fields += ["__source"]

            df = df.withColumn("__hash", md5(expr(f"{concat_ws(fields)}")))

        return df

    def add_key(self, df: DataFrame) -> DataFrame:
        if "__key" not in df.columns:
            fields = self.options.job.get_list("keys")
            if fields:
                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})

                if "__source" in df.columns:
                    fields = fields + ["__source"]

                fields = [f"`{f}`" for f in fields]
                df = df.withColumn("__key", md5(expr(f"{concat_ws(fields)}")))

        return df

    def add_source(self, df: DataFrame) -> DataFrame:
        if "__source" not in df.columns:
            source = self.options.job.get("source")
            if source:
                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                df = df.withColumn("__source", lit(source))

        return df

    def add_operation(self, df: DataFrame) -> DataFrame:
        if "__operation" not in df.columns:
            operation = self.options.job.get("operation")
            if operation:
                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                df = df.withColumn("__operation", lit(operation))

            else:
                df = df.withColumn("__operation", lit("upsert"))

        return df

    def base_transform(self, df: DataFrame) -> DataFrame:
        df = df.transform(self.extend)
        df = df.transform(self.add_calculated_columns)
        df = df.transform(self.add_hash)
        df = df.transform(self.add_operation)
        df = df.transform(self.add_source)
        df = df.transform(self.add_key)

        if "__metadata" in df.columns:
            if self.mode == "register":
                # https://github.com/delta-io/delta/issues/2014 (BUG)
                df = df.withColumn(
                    "__metadata",
                    expr(
                        f"""
                        struct(
                            concat_ws('/', '{self.data_path}', __timestamp, __operation) as file_path,
                            __metadata.file_name as file_name,
                            __metadata.file_size as file_size,
                            __metadata.file_modification_time as file_modification_time,
                            cast(current_date() as timestamp) as inserted
                        )
                        """
                    ),
                )

            else:
                df = df.withColumn(
                    "__metadata",
                    expr(
                        """
                        struct(
                            __metadata.file_path as file_path,
                            __metadata.file_name as file_name,
                            __metadata.file_size as file_size,
                            __metadata.file_modification_time as file_modification_time,
                            cast(current_date() as timestamp) as inserted
                        )
                        """
                    ),
                )

        return df

    def create_or_replace_view(self):
        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})

    def overwrite_schema(self, df: Optional[DataFrame] = None):
        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"label": self})

    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
        return {}

    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
        assert self.persist, f"{self.mode} not allowed"

        context = self.get_cdc_context(df)

        # if dataframe, reference is passed (BUG)
        name = f"{self.step}_{self.topic}_{self.item}__{batch}"
        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
        sql = f"select * from {global_temp_view}"

        check_df = self.spark.sql(sql)
        if check_df.isEmpty():
            DEFAULT_LOGGER.warning("no data", extra={"label": self})
            return

        assert isinstance(self.cdc, NoCDC)
        if self.mode == "append":
            self.cdc.append(sql, **context)

    def for_each_run(self, **kwargs):
        if self.mode == "register":
            DEFAULT_LOGGER.debug("register (no run)", extra={"label": self})
        elif self.mode == "memory":
            DEFAULT_LOGGER.debug("memory (no run)", extra={"label": self})
        else:
            super().for_each_run(**kwargs)

    def create(self):
        if self.mode == "register":
            self.register_external_table()
        elif self.mode == "memory":
            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
        else:
            super().create()

    def register(self):
        if self.mode == "register":
            self.register_external_table()
        elif self.mode == "memory":
            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
        else:
            super().register()

    def truncate(self):
        if self.mode == "register":
            DEFAULT_LOGGER.info("register (no truncate)", extra={"label": self})
        else:
            super().truncate()

    def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
        if self.mode == "register":
            DEFAULT_LOGGER.info("register (no restore)", extra={"label": self})
        else:
            super().restore()

    def drop(self):
        if self.mode == "register":
            self.drop_external_table()
        super().drop()

    def maintain(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        compute_statistics: Optional[bool] = True,
    ):
        if self.mode == "register":
            self.maintain_external_table(vacuum=vacuum, compute_statistics=compute_statistics)
        else:
            super().maintain(vacuum=vacuum, optimize=optimize, compute_statistics=compute_statistics)

    def vacuum(self):
        if self.mode == "memory":
            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"label": self})
        elif self.mode == "register":
            self.vacuum_external_table()
        else:
            super().vacuum()

    def overwrite(self, schedule: Optional[str] = None):
        self.truncate()
        self.run(schedule=schedule)
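Bronze.add_key and add_hash build the technical columns __key (md5 over the configured business keys, plus __source when present) and __hash (md5 over every non-technical column, plus the delete flag and __source). A changed row therefore keeps its __key but gets a new __hash, which is presumably what downstream CDC keys on. The pure-Python sketch below illustrates that split; hashlib, the '*' separator and the sample rows are assumptions, not the fabricks implementation (which uses md5(concat_ws(...)) in Spark SQL).

import hashlib
from typing import Dict, List, Tuple


def md5_concat(values: List[str], sep: str = "*") -> str:
    # stand-in for md5(concat_ws(...)) in Spark SQL; the separator is an assumption
    return hashlib.md5(sep.join(values).encode("utf-8")).hexdigest()


def key_and_hash(row: Dict[str, str], keys: List[str]) -> Tuple[str, str]:
    key = md5_concat([row[k] for k in keys])              # like __key: business keys only
    row_hash = md5_concat([row[c] for c in sorted(row)])  # like __hash: all payload columns
    return key, row_hash


v1 = {"order_id": "42", "status": "open", "amount": "10.00"}
v2 = {"order_id": "42", "status": "closed", "amount": "10.00"}

key1, hash1 = key_and_hash(v1, keys=["order_id"])
key2, hash2 = key_and_hash(v2, keys=["order_id"])

assert key1 == key2    # same business key, so the record keeps its identity
assert hash1 != hash2  # payload changed, so the row shows up as an update

Under these assumptions, the scd1/scd2 templates shipped under fabricks/cdc can match records on __key and compare __hash to decide whether anything changed.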