fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/configurator.py

@@ -0,0 +1,306 @@
from abc import ABC, abstractmethod
from functools import lru_cache
from typing import Optional, Union, cast

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import Row
from typing_extensions import deprecated

from fabricks.cdc import SCD1, SCD2, AllowedChangeDataCaptures, NoCDC
from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.context.spark_session import build_spark_session
from fabricks.core.jobs.base._types import AllowedModes, Options, Paths, TStep
from fabricks.core.jobs.get_job_conf import get_job_conf
from fabricks.core.jobs.get_job_id import get_job_id
from fabricks.metastore.table import Table
from fabricks.utils.fdict import FDict
from fabricks.utils.path import Path


class Configurator(ABC):
    def __init__(
        self,
        expand: str,
        step: TStep,
        topic: Optional[str] = None,
        item: Optional[str] = None,
        job_id: Optional[str] = None,
        conf: Optional[Union[dict, Row]] = None,
    ):
        self.expand = expand
        self.step: TStep = step

        if job_id is not None:
            self.job_id = job_id
            self.conf = get_job_conf(step=self.step, job_id=self.job_id, row=conf)
            self.topic = self.conf.topic
            self.item = self.conf.item
        else:
            assert topic
            assert item
            self.topic = topic
            self.item = item
            self.conf = get_job_conf(step=self.step, topic=self.topic, item=self.item, row=conf)
            self.job_id = get_job_id(step=self.step, topic=self.topic, item=self.item)

    _step_conf: Optional[dict[str, str]] = None
    _spark: Optional[SparkSession] = None
    _timeout: Optional[int] = None
    _options: Optional[Options] = None
    _paths: Optional[Paths] = None
    _table: Optional[Table] = None
    _root: Optional[Path] = None

    _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
    _change_data_capture: Optional[AllowedChangeDataCaptures] = None
    _mode: Optional[AllowedModes] = None

    @property
    @abstractmethod
    def stream(self) -> bool: ...

    @property
    @abstractmethod
    def schema_drift(self) -> bool: ...

    @property
    @abstractmethod
    def persist(self) -> bool: ...

    @property
    @abstractmethod
    def virtual(self) -> bool: ...

    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str): ...

    @classmethod
    def from_job_id(cls, step: str, job_id: str): ...

    @property
    def spark(self) -> SparkSession:
        if not self._spark:
            spark = build_spark_session(app_name=str(self))

            step_options = self.step_conf.get("spark_options", {})
            step_sql_options = step_options.get("sql", {})
            step_conf_options = step_options.get("conf", {})
            if step_sql_options:
                for key, value in step_sql_options.items():
                    DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                    spark.sql(f"set {key} = {value}")
            if step_conf_options:
                for key, value in step_conf_options.items():
                    DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                    spark.conf.set(f"{key}", f"{value}")

            job_sql_options = self.options.spark.get_dict("sql")
            job_conf_options = self.options.spark.get_dict("conf")
            if job_sql_options:
                for key, value in job_sql_options.items():
                    DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                    spark.sql(f"set {key} = {value}")
            if job_conf_options:
                for key, value in job_conf_options.items():
                    DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                    spark.conf.set(f"{key}", f"{value}")

            self._spark = spark
        return self._spark

    @property
    def step_conf(self) -> dict:
        if not self._step_conf:
            _conf = [s for s in STEPS if s.get("name") == self.step][0]
            assert _conf is not None
            self._step_conf = cast(dict[str, str], _conf)
        return self._step_conf

    @property
    def qualified_name(self) -> str:
        return f"{self.step}.{self.topic}_{self.item}"

    def _get_timeout(self, what: str) -> int:
        t = self.step_conf.get("options", {}).get("timeouts", {}).get(what, None)
        if t is None:
            t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
        assert t is not None
        return t

    @property
    def timeout(self) -> int:
        if not self._timeout:
            t = self.options.job.get("timeout")

            if t is None:
                t = self._get_timeout("job")

            assert t is not None
            self._timeout = int(t)

        return self._timeout

    def pip(self):
        pass

    @property
    def table(self) -> Table:
        return self.cdc.table

    @property
    def paths(self) -> Paths:
        if not self._paths:
            storage = PATHS_STORAGE.get(self.step)
            assert storage

            runtime_root = PATHS_RUNTIME.get(self.step)
            assert runtime_root

            self._paths = Paths(
                storage=storage,
                tmp=storage.joinpath("tmp", self.topic, self.item),
                checkpoints=storage.joinpath("checkpoints", self.topic, self.item),
                commits=storage.joinpath("checkpoints", self.topic, self.item, "commits"),
                schema=storage.joinpath("schema", self.topic, self.item),
                runtime=runtime_root.joinpath(self.topic, self.item),
            )

        return self._paths

    @property
    @lru_cache(maxsize=None)
    def options(self) -> Options:
        if not self._options:
            job = self.conf.options or {}
            table = self.conf.table_options or {}
            check = self.conf.check_options or {}
            spark = self.conf.spark_options or {}
            invokers = self.conf.invoker_options or {}
            extenders = self.conf.extender_options or []

            self._options = Options(
                job=FDict(job),
                table=FDict(table),
                check=FDict(check),
                spark=FDict(spark),
                invokers=FDict(invokers),
                extenders=extenders,
            )
        return self._options

    @property
    def change_data_capture(self) -> AllowedChangeDataCaptures:
        if not self._change_data_capture:
            cdc: AllowedChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
            self._change_data_capture = cdc
        return self._change_data_capture

    @property
    def cdc(self) -> Union[NoCDC, SCD1, SCD2]:
        if not self._cdc:
            if self.change_data_capture == "nocdc":
                cdc = NoCDC(self.step, self.topic, self.item, spark=self.spark)
            elif self.change_data_capture == "scd1":
                cdc = SCD1(self.step, self.topic, self.item, spark=self.spark)
            elif self.change_data_capture == "scd2":
                cdc = SCD2(self.step, self.topic, self.item, spark=self.spark)
            else:
                raise ValueError(f"{self.change_data_capture} not allowed")
            self._cdc = cdc
        return self._cdc

    @property
    def slowly_changing_dimension(self) -> bool:
        return self.change_data_capture in ["scd1", "scd2"]

    @abstractmethod
    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict: ...

    def get_cdc_data(self, stream: bool = False) -> Optional[DataFrame]:
        df = self.get_data(stream=stream)
        if df:
            cdc_context = self.get_cdc_context(df)
            cdc_df = self.cdc.get_data(src=df, **cdc_context)
            return cdc_df

    @property
    def mode(self) -> AllowedModes:
        if not self._mode:
            _mode = self.options.job.get("mode")
            assert _mode is not None
            self._mode = cast(AllowedModes, _mode)
        return self._mode

    @abstractmethod
    def get_data(self, stream: bool = False, transform: Optional[bool] = None, **kwargs) -> Optional[DataFrame]: ...

    @abstractmethod
    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs): ...

    @abstractmethod
    def for_each_run(self, **kwargs): ...

    @abstractmethod
    def base_transform(self, df: DataFrame) -> DataFrame: ...

    @abstractmethod
    def run(
        self,
        retry: Optional[bool] = True,
        schedule: Optional[str] = None,
        schedule_id: Optional[str] = None,
        invoke: Optional[bool] = True,
    ): ...

    @deprecated("use maintain instead")
    def optimize(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        analyze: Optional[bool] = True,
    ):
        return self.maintain(
            vacuum=vacuum,
            optimize=optimize,
            compute_statistics=analyze,
        )

    def maintain(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        compute_statistics: Optional[bool] = True,
    ):
        if self.mode == "memory":
            DEFAULT_LOGGER.debug("could not maintain (memory)", extra={"label": self})

        else:
            if vacuum:
                self.vacuum()
            if optimize:
                self.cdc.optimize_table()
            if compute_statistics:
                self.table.compute_statistics()

    def vacuum(self):
        if self.mode == "memory":
            DEFAULT_LOGGER.debug("could not vacuum (memory)", extra={"label": self})

        else:
            job = self.options.table.get("retention_days")
            step = self.step_conf.get("table_options", {}).get("retention_days", None)
            runtime = CONF_RUNTIME.get("options", {}).get("retention_days")

            if job is not None:
                retention_days = job
            elif step:
                retention_days = step
            else:
                assert runtime
                retention_days = runtime

            self.table.vacuum(retention_days=retention_days)

    def __str__(self):
        return f"{self.step}.{self.topic}_{self.item}"
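Two conventions recur throughout Configurator: every expensive attribute is resolved lazily into a leading-underscore backing field, and option values cascade from job options to step configuration to runtime defaults (see timeout and vacuum above). The following standalone sketch illustrates that resolution pattern; DemoJob and its fields are hypothetical names for illustration, not fabricks API.

from typing import Optional


class DemoJob:
    # backing field: None means "not resolved yet", as in Configurator
    _retention_days: Optional[int] = None

    def __init__(self, job_options: dict, runtime_options: dict):
        self.job_options = job_options
        self.runtime_options = runtime_options

    @property
    def retention_days(self) -> int:
        if self._retention_days is None:
            # the job-level option wins; fall back to the runtime default,
            # mirroring how vacuum() resolves retention_days above
            days = self.job_options.get("retention_days")
            if days is None:
                days = self.runtime_options.get("retention_days")
            assert days is not None
            self._retention_days = int(days)
        return self._retention_days


job = DemoJob({"retention_days": 7}, {"retention_days": 30})
assert job.retention_days == 7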
fabricks/core/jobs/base/exception.py

@@ -0,0 +1,85 @@
from typing import Sequence

from pyspark.sql import DataFrame

from fabricks.metastore.table import SchemaDiff


class CustomException(Exception):
    pass


class CheckException(Exception):
    def __init__(self, message: str, dataframe: DataFrame):
        self.message = message
        self.dataframe = dataframe

        super().__init__(self.message)


class CheckWarning(CheckException):
    pass


class PreRunCheckException(CheckException):
    pass


class PostRunCheckException(CheckException):
    pass


class PreRunCheckWarning(CheckWarning):
    pass


class PostRunCheckWarning(CheckWarning):
    pass


class PreRunInvokeException(CustomException):
    pass


class PostRunInvokeException(CustomException):
    pass


class SkipRunCheckWarning(CheckWarning):
    pass


class SchemaDriftException(Exception):
    @staticmethod
    def from_diffs(table: str, diffs: Sequence[SchemaDiff]):
        out = []
        type_widening_compatible = True

        added = [d.new_column or d.column for d in diffs if d.status == "added"]
        if added:
            type_widening_compatible = False
            out.append("added columns:\n" + "\n".join(f"\t- {col}" for col in added))

        removed = [d.column for d in diffs if d.status == "dropped"]
        if removed:
            type_widening_compatible = False
            out.append("removed columns:\n" + "\n".join(f"\t- {col}" for col in removed))

        changed = [f"{d.column} ({d.data_type} -> {d.new_data_type})" for d in diffs if d.status == "changed"]
        if changed:
            if False in [d.type_widening_compatible for d in diffs if d.status == "changed"]:
                type_widening_compatible = False

            out.append("changed columns:\n" + "\n".join(f"\t- {col}" for col in changed))

        out = "\n".join(out)

        if type_widening_compatible:
            return SchemaDriftException(f"type widening detected:\n {out}", diffs, type_widening_compatible)
        else:
            return SchemaDriftException(f"schema drift detected:\n {out}", diffs, type_widening_compatible)

    def __init__(self, message: str, diffs: Sequence[SchemaDiff], type_widening_compatible: bool = False):
        super().__init__(message)
        self.diffs = diffs
        self.type_widening_compatible = type_widening_compatible