fabricks-3.0.5.2-py3-none-any.whl → fabricks-3.0.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0

fabricks/core/jobs/base/configurator.py

@@ -4,12 +4,13 @@ from typing import Optional, Union, cast
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import Row
+from typing_extensions import deprecated
 
-from fabricks.cdc import SCD1, SCD2,
+from fabricks.cdc import SCD1, SCD2, AllowedChangeDataCaptures, NoCDC
 from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.context.spark_session import build_spark_session
-from fabricks.core.jobs.base._types import
+from fabricks.core.jobs.base._types import AllowedModes, Options, Paths, TStep
 from fabricks.core.jobs.get_job_conf import get_job_conf
 from fabricks.core.jobs.get_job_id import get_job_id
 from fabricks.metastore.table import Table

@@ -52,36 +53,30 @@ class Configurator(ABC):
     _root: Optional[Path] = None
 
     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
-    _change_data_capture: Optional[
-    _mode: Optional[
+    _change_data_capture: Optional[AllowedChangeDataCaptures] = None
+    _mode: Optional[AllowedModes] = None
 
     @property
     @abstractmethod
-    def stream(self) -> bool:
-        raise NotImplementedError()
+    def stream(self) -> bool: ...
 
     @property
     @abstractmethod
-    def schema_drift(self) -> bool:
-        raise NotImplementedError()
+    def schema_drift(self) -> bool: ...
 
     @property
     @abstractmethod
-    def persist(self) -> bool:
-        raise NotImplementedError()
+    def persist(self) -> bool: ...
 
     @property
     @abstractmethod
-    def virtual(self) -> bool:
-        raise NotImplementedError()
+    def virtual(self) -> bool: ...
 
     @classmethod
-    def from_step_topic_item(cls, step: str, topic: str, item: str):
-        raise NotImplementedError()
+    def from_step_topic_item(cls, step: str, topic: str, item: str): ...
 
     @classmethod
-    def from_job_id(cls, step: str, job_id: str):
-        raise NotImplementedError()
+    def from_job_id(cls, step: str, job_id: str): ...
 
     @property
     def spark(self) -> SparkSession:

@@ -93,22 +88,22 @@ class Configurator(ABC):
         step_conf_options = step_options.get("conf", {})
         if step_sql_options:
             for key, value in step_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.sql(f"set {key} = {value}")
         if step_conf_options:
             for key, value in step_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.conf.set(f"{key}", f"{value}")
 
         job_sql_options = self.options.spark.get_dict("sql")
         job_conf_options = self.options.spark.get_dict("conf")
         if job_sql_options:
             for key, value in job_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.sql(f"set {key} = {value}")
         if job_conf_options:
             for key, value in job_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.conf.set(f"{key}", f"{value}")
 
         self._spark = spark

@@ -195,9 +190,9 @@ class Configurator(ABC):
         return self._options
 
     @property
-    def change_data_capture(self) ->
+    def change_data_capture(self) -> AllowedChangeDataCaptures:
         if not self._change_data_capture:
-            cdc:
+            cdc: AllowedChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
             self._change_data_capture = cdc
         return self._change_data_capture
 

@@ -220,49 +215,34 @@ class Configurator(ABC):
         return self.change_data_capture in ["scd1", "scd2"]
 
     @abstractmethod
-    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict:
-        raise NotImplementedError()
+    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict: ...
 
     def get_cdc_data(self, stream: bool = False) -> Optional[DataFrame]:
-        df = self.get_data(stream)
+        df = self.get_data(stream=stream)
         if df:
             cdc_context = self.get_cdc_context(df)
             cdc_df = self.cdc.get_data(src=df, **cdc_context)
             return cdc_df
 
     @property
-    def mode(self) ->
+    def mode(self) -> AllowedModes:
         if not self._mode:
             _mode = self.options.job.get("mode")
             assert _mode is not None
-            self._mode = cast(
+            self._mode = cast(AllowedModes, _mode)
         return self._mode
 
     @abstractmethod
-    def get_data(self, stream: bool = False, transform: Optional[bool] =
-        """
-        Retrieves the data for the job.
-
-        Args:
-            stream (bool, optional): If True, the data will be streamed. Defaults to False.
-            transform (bool, optional): If True, the data will be transformed. Defaults to False.
-
-        Returns:
-            DataFrame or None: The retrieved data as a DataFrame, or None if the data is not available.
-        """
-        raise NotImplementedError()
+    def get_data(self, stream: bool = False, transform: Optional[bool] = None, **kwargs) -> Optional[DataFrame]: ...
 
     @abstractmethod
-    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        raise NotImplementedError()
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs): ...
 
     @abstractmethod
-    def for_each_run(self, **kwargs):
-        raise NotImplementedError()
+    def for_each_run(self, **kwargs): ...
 
     @abstractmethod
-    def base_transform(self, df: DataFrame) -> DataFrame:
-        raise NotImplementedError()
+    def base_transform(self, df: DataFrame) -> DataFrame: ...
 
     @abstractmethod
     def run(

@@ -271,47 +251,41 @@ class Configurator(ABC):
         schedule: Optional[str] = None,
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
-    ):
-        raise NotImplementedError()
+    ): ...
 
+    @deprecated("use maintain instead")
     def optimize(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
         analyze: Optional[bool] = True,
     ):
-
-
-
-
-
-        The retention days for optimization are determined in the following order:
-        1. If 'retention_days' is specified in the job options table, it is used.
-        2. If 'retention_days' is specified in the step configuration table options, it is used.
-        3. If 'retention_days' is specified in the CONF_RUNTIME options, it is used.
-
-        After determining the retention days, the table is vacuumed with the specified retention days,
-        CDC is optimized for the table, and the table is analyzed.
+        return self.maintain(
+            vacuum=vacuum,
+            optimize=optimize,
+            compute_statistics=analyze,
+        )
 
-
-
-
-
-
+    def maintain(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        compute_statistics: Optional[bool] = True,
+    ):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("
+            DEFAULT_LOGGER.debug("could not maintain (memory)", extra={"label": self})
 
         else:
             if vacuum:
                 self.vacuum()
             if optimize:
                 self.cdc.optimize_table()
-            if
+            if compute_statistics:
                 self.table.compute_statistics()
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("
+            DEFAULT_LOGGER.debug("could not vacuum (memory)", extra={"label": self})
 
         else:
            job = self.options.table.get("retention_days")
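
The optimize/maintain hunk above uses a forwarding deprecation: the old method keeps its signature and delegates to the new one. A minimal, standalone sketch of that pattern (hypothetical Maintainer class, not fabricks code; it only assumes typing_extensions.deprecated, the same import the diff adds):

# Sketch only (not fabricks code): forwarding deprecation, as in Configurator.optimize -> maintain.
from typing import Optional

from typing_extensions import deprecated  # PEP 702 decorator, same import the diff adds


class Maintainer:
    @deprecated("use maintain instead")
    def optimize(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        analyze: Optional[bool] = True,
    ):
        # old entry point keeps its signature; `analyze` maps to `compute_statistics`
        return self.maintain(vacuum=vacuum, optimize=optimize, compute_statistics=analyze)

    def maintain(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        compute_statistics: Optional[bool] = True,
    ):
        return {"vacuum": vacuum, "optimize": optimize, "compute_statistics": compute_statistics}


# Calling the old name still works, but emits a DeprecationWarning at runtime
# and is flagged by PEP 702-aware type checkers.
print(Maintainer().optimize(analyze=False))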

fabricks/core/jobs/base/generator.py

@@ -4,7 +4,7 @@ from typing import Optional, Sequence, Union, cast
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
-from fabricks.cdc import
+from fabricks.cdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base._types import JobDependency
 from fabricks.core.jobs.base.configurator import Configurator

@@ -14,17 +14,16 @@ from fabricks.metastore.view import create_or_replace_global_temp_view
 
 class Generator(Configurator):
     def update_dependencies(self):
-        DEFAULT_LOGGER.info("update dependencies", extra={"
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
         deps = self.get_dependencies()
         if deps:
             df = self.spark.createDataFrame([d.model_dump() for d in deps]) # type: ignore
-
-
+            cdc = NoCDC("fabricks", self.step, "dependencies")
+            cdc.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
 
     @abstractmethod
-    def get_dependencies(self) -> Sequence[JobDependency]:
-        raise NotImplementedError()
+    def get_dependencies(self) -> Sequence[JobDependency]: ...
 
     def rm(self):
         """

@@ -33,7 +32,7 @@ class Generator(Configurator):
         If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
         """
         if self.paths.schema.exists():
-            DEFAULT_LOGGER.info("delete schema folder", extra={"
+            DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
             self.paths.schema.rm()
         self.rm_checkpoints()
 

@@ -44,7 +43,7 @@ class Generator(Configurator):
         This method checks if the checkpoints folder exists and deletes it if it does.
         """
         if self.paths.checkpoints.exists():
-            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"
+            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
             self.paths.checkpoints.rm()
 
     def rm_commit(self, id: Union[str, int]):

@@ -59,7 +58,7 @@ class Generator(Configurator):
         """
         path = self.paths.commits.joinpath(str(id))
         if path.exists():
-            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"
+            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
             path.rm()
 
     def truncate(self):

@@ -72,7 +71,7 @@ class Generator(Configurator):
         Returns:
             None
         """
-        DEFAULT_LOGGER.warning("truncate", extra={"
+        DEFAULT_LOGGER.warning("truncate", extra={"label": self})
         self.rm()
         if self.persist:
             self.table.truncate()

@@ -92,6 +91,9 @@ class Generator(Configurator):
         Returns:
             None
         """
+        if self.options.job.get("no_drop"):
+            raise ValueError("no_drop is set, cannot drop the job")
+
         try:
             row = self.spark.sql(
                 f"""

@@ -106,7 +108,7 @@ class Generator(Configurator):
                 """
             ).collect()[0]
             if cast(int, row.count) > 0:
-                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"
+                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"label": self, "content": row.children})
 
         except Exception:
             pass

@@ -162,7 +164,7 @@ class Generator(Configurator):
         Raises:
             NotImplementedError: This method is meant to be overridden by subclasses.
         """
-
+        ...
 
     def create_table(self):
         def _create_table(df: DataFrame, batch: Optional[int] = 0):

@@ -185,12 +187,29 @@ class Generator(Configurator):
             elif step_powerbi is not None:
                 powerbi = step_powerbi
 
-
+            # first take from job options, then from step options
+            job_masks = self.options.table.get("masks", None)
+            step_masks = self.step_conf.get("table_options", {}).get("masks", None)
+            if job_masks is not None:
+                masks = job_masks
+            elif step_masks is not None:
+                masks = step_masks
+            else:
+                masks = None
+
+            maximum_compatibility = self.options.table.get_boolean("maximum_compatibility", False)
+
+            if maximum_compatibility:
+                default_properties = {
+                    "delta.minReaderVersion": "1",
+                    "delta.minWriterVersion": "7",
+                    "delta.columnMapping.mode": "none",
+                }
+            elif powerbi:
                 default_properties = {
                     "delta.columnMapping.mode": "name",
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
-                    "fabricks.last_version": "0",
                 }
             else:
                 default_properties = {

@@ -200,9 +219,10 @@ class Generator(Configurator):
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
                     "delta.feature.timestampNtz": "supported",
-                    "fabricks.last_version": "0",
                 }
 
+            default_properties["fabricks.last_version"] = "0"
+
             if "__identity" in df.columns:
                 identity = False
             else:

@@ -234,9 +254,7 @@ class Generator(Configurator):
                 cluster_by.append("__hash")
 
             if not cluster_by:
-                DEFAULT_LOGGER.
-                    "liquid clustering disabled (no clustering columns found)", extra={"job": self}
-                )
+                DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
                 liquid_clustering = False
                 cluster_by = None
 

@@ -257,9 +275,13 @@ class Generator(Configurator):
             if properties is None:
                 properties = default_properties
 
+            primary_key = self.options.table.get_dict("primary_key")
+            foreign_keys = self.options.table.get_dict("foreign_keys")
+            comments = self.options.table.get_dict("comments")
+
             # if dataframe, reference is passed (BUG)
             name = f"{self.step}_{self.topic}_{self.item}__init"
-            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"))
+            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"), job=self)
             sql = f"select * from {global_temp_view}"
 
             self.cdc.create_table(

@@ -270,11 +292,17 @@ class Generator(Configurator):
                 partitioning=partitioning,
                 partition_by=partition_by,
                 properties=properties,
+                masks=masks,
+                primary_key=primary_key,
+                foreign_keys=foreign_keys,
+                comments=comments,
                 **cdc_options,
             )
 
         if not self.table.exists():
-
+            DEFAULT_LOGGER.debug("create table", extra={"label": self})
+
+            df = self.get_data(stream=self.stream, schema_only=True)
            if df:
                 if self.stream:
                     # add dummy stream to be sure that the writeStream will start

@@ -310,6 +338,9 @@ class Generator(Configurator):
             if comment:
                 self.table.add_comment(comment=comment)
 
+        else:
+            DEFAULT_LOGGER.debug("table exists, skip creation", extra={"label": self})
+
     def _update_schema(
         self,
         df: Optional[DataFrame] = None,

@@ -328,7 +359,7 @@ class Generator(Configurator):
             _update_schema(df)
 
         else:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, schema_only=True)
             assert df is not None
             df = self.base_transform(df)
 

@@ -360,7 +391,7 @@ class Generator(Configurator):
 
     def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 

@@ -370,7 +401,7 @@ class Generator(Configurator):
 
     def get_schema_differences(self, df: Optional[DataFrame] = None) -> Optional[Sequence[SchemaDiff]]:
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 

@@ -413,4 +444,4 @@ class Generator(Configurator):
             else:
                 self.table.enable_liquid_clustering(auto=True)
         else:
-            DEFAULT_LOGGER.debug("liquid clustering
+            DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
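
The create_table hunk resolves masks by taking the job-level table option first and falling back to the step-level option. A small standalone sketch of that precedence rule with hypothetical values (not fabricks code):

# Sketch only (not fabricks code): job-level table options win over step-level options,
# as in the masks resolution added to create_table above.
from typing import Any, Optional


def resolve_option(job_value: Optional[Any], step_value: Optional[Any]) -> Optional[Any]:
    # return the job-level value when set, otherwise fall back to the step-level value
    if job_value is not None:
        return job_value
    if step_value is not None:
        return step_value
    return None


# hypothetical values: step defines a mask, job does not -> fall back to the step value
assert resolve_option(None, {"email": "mask_pii"}) == {"email": "mask_pii"}
# both defined -> job value wins
assert resolve_option({"email": "sha256"}, {"email": "mask_pii"}) == {"email": "sha256"}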
fabricks/core/jobs/base/invoker.py

@@ -7,13 +7,17 @@ from fabricks.context import PATH_RUNTIME
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base.checker import Checker
 from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
-from fabricks.core.
+from fabricks.core.jobs.get_schedule import get_schedule
 from fabricks.utils.path import Path
 
 
 class Invoker(Checker):
-    def invoke(self, schedule: Optional[str] = None):
-        self._invoke_job(
+    def invoke(self, schedule: Optional[str] = None, **kwargs):
+        return self._invoke_job(
+            position="run",
+            schedule=schedule,
+            **kwargs,
+        ) # kwargs and return needed for get_data in gold
 
     def invoke_pre_run(self, schedule: Optional[str] = None):
         self._invoke_job(position="pre_run", schedule=schedule)

@@ -23,30 +27,50 @@ class Invoker(Checker):
         self._invoke_job(position="post_run", schedule=schedule)
         self._invoke_step(position="post_run", schedule=schedule)
 
-    def _invoke_job(self, position: str, schedule: Optional[str] = None):
+    def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
         invokers = self.options.invokers.get_list(position)
+        if position == "run":
+            invokers = invokers if len(invokers) > 0 else [{}] # run must work even without run invoker options
 
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    path = kwargs.get("path")
+                    if path is None:
+                        notebook = invoker.get("notebook")
+                        assert notebook, "notebook mandatory"
+                        path = PATH_RUNTIME.joinpath(notebook)
+
+                    assert path is not None, "path mandatory"
+
+                    arguments = invoker.get("arguments") or {}
+                    timeout = invoker.get("timeout")
+
+                    schema_only = kwargs.get("schema_only")
+                    if schema_only is not None:
+                        arguments["schema_only"] = schema_only
+
+                    if len(invokers) == 1 and position == "run":
+                        return self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
+                    else:
+                        self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":

@@ -63,15 +87,15 @@ class Invoker(Checker):
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
                 try:
-                    notebook =
+                    notebook = invoker.get("notebook")
                     assert notebook, "notebook mandatory"
                     path = PATH_RUNTIME.joinpath(notebook)
 
-                    arguments =
-                    timeout =
+                    arguments = invoker.get("arguments", {})
+                    timeout = invoker.get("timeout")
 
                     self._run_notebook(
                         path=path,

@@ -81,6 +105,8 @@ class Invoker(Checker):
                     )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":

@@ -125,9 +151,7 @@ class Invoker(Checker):
 
         variables = None
         if schedule is not None:
-            variables = (
-                next(s for s in get_schedules() if s.get("name") == schedule).get("options", {}).get("variables", {})
-            )
+            variables = get_schedule(name=schedule).get("options", {}).get("variables", {})
 
         if variables is None:
             variables = {}

@@ -135,7 +159,7 @@ class Invoker(Checker):
         if arguments is None:
             arguments = {}
 
-        dbutils.notebook.run(
+        return dbutils.notebook.run(
             path=path.get_notebook_path(), # type: ignore
             timeout_seconds=timeout, # type: ignore
             arguments={ # type: ignore

@@ -154,7 +178,7 @@ class Invoker(Checker):
         extenders = self.options.extenders
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.
+            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
             arguments = e.get("arguments") or {}
 
             extender = get_extender(name)

@@ -168,7 +192,7 @@ class Invoker(Checker):
         extenders = self.step_conf.get("extender_options", {})
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.
+            DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
             arguments = e.get("arguments", {})
 
             extender = get_extender(name)