fabricks 3.0.19-py3-none-any.whl → 4.0.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +8 -7
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
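The headline change in 4.0.0 is already visible in the file list: the ad-hoc typing layer (`fabricks/core/jobs/base/_types.py`, `fabricks/context/_types.py`, `fabricks/utils/fdict.py`, `fabricks/utils/pydantic.py`, `fabricks/utils/schema/*`) is deleted and replaced by a new `fabricks/models` package, and the per-file diffs below consistently swap dict lookups for attribute access on Pydantic v2 models (`model_dump()`, `model_validate()`). A minimal sketch of the pattern, with illustrative field names inferred from the code below rather than copied from the real package:

```python
# Hypothetical sketch (not the actual fabricks source): typed option models
# like those the diffs below access via invoker.notebook, e.extender, etc.
from typing import Optional

from pydantic import BaseModel


class BaseInvokerOptions(BaseModel):
    notebook: Optional[str] = None
    arguments: Optional[dict] = None
    timeout: Optional[int] = None


class ExtenderOptions(BaseModel):
    extender: str
    arguments: Optional[dict] = None


# A raw YAML/Row dict is validated in one call instead of being copied
# field by field into a constructor:
opts = BaseInvokerOptions.model_validate({"notebook": "etl/run", "timeout": 600})
print(opts.notebook, opts.timeout)  # etl/run 600
```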
fabricks/core/jobs/base/invoker.py
CHANGED
@@ -5,10 +5,12 @@ from pyspark.sql import DataFrame
 
 from fabricks.context import PATH_RUNTIME
 from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.extenders import get_extender
 from fabricks.core.jobs.base.checker import Checker
 from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
 from fabricks.core.jobs.get_schedule import get_schedule
-from fabricks.
+from fabricks.models.common import BaseInvokerOptions, ExtenderOptions
+from fabricks.utils.path import GitPath
 
 
 class Invoker(Checker):
@@ -27,8 +29,37 @@ class Invoker(Checker):
         self._invoke_job(position="post_run", schedule=schedule)
         self._invoke_step(position="post_run", schedule=schedule)
 
+    def _invoke_notebook(
+        self,
+        invoker: dict | BaseInvokerOptions,
+        schedule: Optional[str] = None,
+        **kwargs,
+    ):
+        path = kwargs.get("path")
+        if path is None:
+            notebook = invoker.get("notebook") if isinstance(invoker, dict) else invoker.notebook
+            assert notebook, "notebook mandatory"
+            path = PATH_RUNTIME.joinpath(notebook)
+
+        assert path is not None, "path could not be resolved"
+
+        timeout = invoker.get("timeout") if isinstance(invoker, dict) else invoker.timeout
+        arguments = invoker.get("arguments") if isinstance(invoker, dict) else invoker.arguments
+        arguments = arguments or {}
+
+        schema_only = kwargs.get("schema_only")
+        if schema_only is not None:
+            arguments["schema_only"] = schema_only
+
+        return self._run_notebook(
+            path=path,
+            arguments=arguments,
+            schedule=schedule,
+            timeout=timeout,
+        )
+
     def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
-        invokers = self.
+        invokers = getattr(self.invoker_options, position, None) or [] if self.invoker_options else []
         if position == "run":
             invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options
 
@@ -38,35 +69,10 @@ class Invoker(Checker):
         for i, invoker in enumerate(invokers):
             DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
             try:
-                path = kwargs.get("path")
-                if path is None:
-                    notebook = invoker.get("notebook")
-                    assert notebook, "notebook mandatory"
-                    path = PATH_RUNTIME.joinpath(notebook)
-
-                assert path is not None, "path mandatory"
-
-                arguments = invoker.get("arguments") or {}
-                timeout = invoker.get("timeout")
-
-                schema_only = kwargs.get("schema_only")
-                if schema_only is not None:
-                    arguments["schema_only"] = schema_only
-
                 if len(invokers) == 1 and position == "run":
-                    return self.
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    return self._invoke_notebook(invoker, schedule=schedule, **kwargs)
                 else:
-                    self.
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    self._invoke_notebook(invoker=invoker, schedule=schedule, **kwargs)
 
             except Exception as e:
                 DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
@@ -82,7 +88,7 @@ class Invoker(Checker):
             raise Exception(errors)
 
     def _invoke_step(self, position: str, schedule: Optional[str] = None):
-        invokers = self.step_conf.
+        invokers = getattr(self.step_conf.invoker_options, position, []) if self.step_conf.invoker_options else []
 
         errors = []
 
@@ -90,19 +96,7 @@ class Invoker(Checker):
         for i, invoker in enumerate(invokers):
             DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
             try:
-
-                assert notebook, "notebook mandatory"
-                path = PATH_RUNTIME.joinpath(notebook)
-
-                arguments = invoker.get("arguments", {})
-                timeout = invoker.get("timeout")
-
-                self._run_notebook(
-                    path=path,
-                    arguments=arguments,
-                    timeout=timeout,
-                    schedule=schedule,
-                )
+                self._invoke_notebook(invoker=invoker, schedule=schedule)
 
             except Exception as e:
                 DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
@@ -119,7 +113,7 @@ class Invoker(Checker):
 
     def _run_notebook(
         self,
-        path:
+        path: GitPath,
         arguments: Optional[dict] = None,
         timeout: Optional[int] = None,
        schedule: Optional[str] = None,
@@ -128,7 +122,7 @@ class Invoker(Checker):
         Invokes a notebook job.
 
         Args:
-            path (Optional[
+            path (Optional[GitPath]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
             arguments (Optional[dict]): Additional arguments to pass to the notebook job. If not provided, it will be retrieved from the invoker options.
             schedule (Optional[str]): The schedule for the job. If provided, schedule variables will be retrieved.
 
@@ -167,33 +161,24 @@ class Invoker(Checker):
                 "topic": self.topic,
                 "item": self.item,
                 **arguments,
-                "job_options": json.dumps(self.options.
+                "job_options": json.dumps(self.options.model_dump()),
                 "schedule_variables": json.dumps(variables),
             },
         )
 
     def extend_job(self, df: DataFrame) -> DataFrame:
-
-
-        extenders = self.options.extenders
-        for e in extenders:
-            name = e.get("extender")
-            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
-            arguments = e.get("arguments") or {}
-
-            extender = get_extender(name)
-            df = extender(df, **arguments)
-
-        return df
+        extenders = self.extender_options or []
+        return self._extend(df, extenders, extended="job")
 
     def extend_step(self, df: DataFrame) -> DataFrame:
-
+        extenders = self.step_conf.extender_options or []
+        return self._extend(df, extenders, extended="step")
 
-
+    def _extend(self, df: DataFrame, extenders: list[ExtenderOptions], extended: str) -> DataFrame:
         for e in extenders:
-            name = e.
-            DEFAULT_LOGGER.debug(f"extend
-            arguments = e.
+            name = e.extender
+            DEFAULT_LOGGER.debug(f"extend {extended} ({name})", extra={"label": self})
+            arguments = e.arguments or {}
 
             extender = get_extender(name)
             df = extender(df, **arguments)
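The net effect of this file: three near-identical inline blocks (job run, job pre/post run, step invokers) collapse into a single `_invoke_notebook` helper that accepts either a raw dict or a typed `BaseInvokerOptions` (the `[{}]` fallback for `position == "run"` keeps plain dicts flowing through). A self-contained sketch of the dict-or-model branch, using a stand-in model rather than the real `fabricks.models.common` class:

```python
# Sketch of the dict-or-model access pattern in _invoke_notebook; the
# BaseInvokerOptions here is a stand-in, not the real fabricks class.
from typing import Optional, Union

from pydantic import BaseModel


class BaseInvokerOptions(BaseModel):
    notebook: Optional[str] = None
    arguments: Optional[dict] = None
    timeout: Optional[int] = None


def resolve(invoker: Union[dict, BaseInvokerOptions]):
    # Same branching as the diff: dicts use .get(), models use attributes.
    notebook = invoker.get("notebook") if isinstance(invoker, dict) else invoker.notebook
    arguments = invoker.get("arguments") if isinstance(invoker, dict) else invoker.arguments
    timeout = invoker.get("timeout") if isinstance(invoker, dict) else invoker.timeout
    return notebook, arguments or {}, timeout


print(resolve({"notebook": "etl/run"}))  # from a bare dict
print(resolve(BaseInvokerOptions(notebook="etl/run", timeout=300)))  # from a model
```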
fabricks/core/jobs/base/processor.py
CHANGED
@@ -3,9 +3,8 @@ from functools import partial
 from typing import Optional
 
 from pyspark.sql import DataFrame
-from pyspark.sql.functions import expr
 
-from fabricks.context import IS_TYPE_WIDENING
+from fabricks.context import IS_TYPE_WIDENING
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base.exception import (
     PostRunCheckException,
@@ -18,39 +17,21 @@ from fabricks.core.jobs.base.exception import (
     SkipRunCheckWarning,
 )
 from fabricks.core.jobs.base.invoker import Invoker
+from fabricks.models import JobBronzeOptions, JobSilverOptions
 from fabricks.utils.write import write_stream
 
 
 class Processor(Invoker):
     def filter_where(self, df: DataFrame) -> DataFrame:
-
+        assert isinstance(self.options, (JobBronzeOptions, JobSilverOptions))
 
+        f = self.options.filter_where
         if f:
             DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
             df = df.where(f"{f}")
 
         return df
 
-    def encrypt(self, df: DataFrame) -> DataFrame:
-        encrypted_columns = self.options.job.get_list("encrypted_columns")
-        if encrypted_columns:
-            if not IS_UNITY_CATALOG:
-                from databricks.sdk.runtime import dbutils
-
-                key = dbutils.secrets.get(scope=SECRET_SCOPE, key="encryption-key")
-            else:
-                import os
-
-                key = os.environ["FABRICKS_ENCRYPTION_KEY"]
-
-            assert key, "key not found"
-
-            for col in encrypted_columns:
-                DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
-                df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
-
-            return df
-
     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
         """
         Restores the processor to a specific version and batch.
@@ -70,7 +51,7 @@ class Processor(Invoker):
         self.rm_commit(current_batch)
 
         assert last_batch == self.table.get_property("fabricks.last_batch")
-        assert self.paths.
+        assert self.paths.to_commits.joinpath(last_batch).exists()
 
     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
         DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
@@ -118,7 +99,7 @@ class Processor(Invoker):
             DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
             write_stream(
                 df,
-                checkpoints_path=self.paths.
+                checkpoints_path=self.paths.to_checkpoints,
                 func=self._for_each_batch,
                 timeout=self.timeout,
             )
@@ -198,11 +179,15 @@ class Processor(Invoker):
             raise exception
 
         if vacuum is None:
-            vacuum = self.options.
+            vacuum = self.options.vacuum if self.options and self.options.vacuum is not None else False
         if optimize is None:
-            optimize = self.options.
+            optimize = self.options.optimize if self.options and self.options.optimize is not None else False
         if compute_statistics is None:
-            compute_statistics =
+            compute_statistics = (
+                self.options.compute_statistics
+                if self.options and self.options.compute_statistics is not None
+                else False
+            )
 
         if vacuum or optimize or compute_statistics:
             self.maintain(
fabricks/core/jobs/bronze.py
CHANGED
@@ -7,21 +7,20 @@ from pyspark.sql.types import Row, TimestampType
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context import VARIABLES
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TBronze
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.core.parsers import BaseParser
 from fabricks.core.parsers.get_parser import get_parser
 from fabricks.core.parsers.utils import clean
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.models import JobBronzeOptions, JobDependency, StepBronzeConf, StepBronzeOptions
 from fabricks.utils.helpers import concat_ws
-from fabricks.utils.path import
+from fabricks.utils.path import FileSharePath
 from fabricks.utils.read import read
 
 
 class Bronze(BaseJob):
     def __init__(
         self,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -36,7 +35,7 @@ class Bronze(BaseJob):
             conf=conf,
         )
 
-    _parser: Optional[
+    _parser: Optional[str] = None
 
     @property
     def stream(self) -> bool:
@@ -54,25 +53,40 @@ class Bronze(BaseJob):
     def virtual(self) -> bool:
         return False
 
+    @property
+    def options(self) -> JobBronzeOptions:
+        """Direct access to typed bronze job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepBronzeConf:
+        """Direct access to typed bronze step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepBronzeOptions:
+        """Direct access to typed bronze step options."""
+        return self.base_step_conf.options  # type: ignore
+
     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, job_id=job_id, conf=conf)
 
     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, topic=topic, item=item, conf=conf)
 
     @property
-    def data_path(self) ->
-        uri = self.options.
+    def data_path(self) -> FileSharePath:
+        uri = self.options.uri
         assert uri is not None, "no uri provided in options"
-        path =
+        path = FileSharePath.from_uri(uri, regex=VARIABLES)
         return path
 
     def get_dependencies(self, *s) -> Sequence[JobDependency]:
         dependencies = []
 
-        parents = self.options.
+        parents = self.options.parents or []
         if parents:
             for p in parents:
                 dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -81,8 +95,8 @@ class Bronze(BaseJob):
 
     def register_external_table(self):
         options = self.conf.parser_options  # type: ignore
-        if options:
-            file_format = options.
+        if options and options.file_format:
+            file_format = options.file_format
         else:
             file_format = "delta"
 
@@ -136,17 +150,14 @@ class Bronze(BaseJob):
         self.compute_statistics_external_table()
 
     @property
-    def parser(self) ->
+    def parser(self) -> str:
         if not self._parser:
             assert self.mode not in ["register"], f"{self.mode} not allowed"
 
-
-            assert
-
-            options = self.conf.parser_options or None  # type: ignore
-            p = get_parser(name, options)
+            parser = self.options.parser
+            assert parser is not None, "parser not found"
 
-            self._parser =
+            self._parser = cast(str, parser)
 
         return self._parser
 
@@ -171,19 +182,52 @@ class Bronze(BaseJob):
         else:
             df = self.spark.sql(f"select * from {self}")
 
-
-
+            if self.step_options.clean is not False:
+                # cleaning should done by parser but for delta we do it here
+                df = clean(df)
 
         else:
-
+            options = self.conf.parser_options or None  # type: ignore
+            parse = get_parser(self.parser, options)
+
+            df = parse(
                 stream=stream,
                 data_path=self.data_path,
-                schema_path=self.paths.
+                schema_path=self.paths.to_schema,
                 spark=self.spark,
             )
 
         return df
 
+    def encrypt(self, df: DataFrame) -> DataFrame:
+        encrypted_columns = self.options.encrypted_columns or []
+        if encrypted_columns:
+            if self.runtime_options.encryption_key is not None:
+                from databricks.sdk.runtime import dbutils
+
+                key = dbutils.secrets.get(
+                    scope=self.runtime_options.secret_scope,
+                    key=self.runtime_options.encryption_key,
+                )
+                if self.runtime_options.unity_catalog:
+                    DEFAULT_LOGGER.warning(
+                        "Unity Catalog enabled, use FABRICKS_ENCRYPTION_KEY instead",
+                        extra={"label": self},
+                    )
+
+            else:
+                import os
+
+                key = os.environ.get("FABRICKS_ENCRYPTION_KEY")
+
+            assert key, "encryption key not found in secrets nor in environment"
+
+            for col in encrypted_columns:
+                DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
+                df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
+
+        return df
+
     def get_data(
         self,
         stream: bool = False,
@@ -204,7 +248,7 @@ class Bronze(BaseJob):
         return df
 
     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
-        calculated_columns = self.options.
+        calculated_columns = self.options.calculated_columns or {}
 
         if calculated_columns:
             for key, value in calculated_columns.items():
@@ -230,7 +274,7 @@ class Bronze(BaseJob):
 
     def add_key(self, df: DataFrame) -> DataFrame:
         if "__key" not in df.columns:
-            fields = self.options.
+            fields = self.options.keys or []
             if fields:
                 DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})
 
@@ -244,7 +288,7 @@ class Bronze(BaseJob):
 
     def add_source(self, df: DataFrame) -> DataFrame:
         if "__source" not in df.columns:
-            source = self.options.
+            source = self.options.source
             if source:
                 DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                 df = df.withColumn("__source", lit(source))
@@ -253,7 +297,7 @@ class Bronze(BaseJob):
 
     def add_operation(self, df: DataFrame) -> DataFrame:
         if "__operation" not in df.columns:
-            operation = self.options.
+            operation = self.options.operation
             if operation:
                 DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                 df = df.withColumn("__operation", lit(operation))
@@ -263,15 +307,10 @@ class Bronze(BaseJob):
 
         return df
 
-    def
-        df = df.transform(self.extend)
-        df = df.transform(self.add_calculated_columns)
-        df = df.transform(self.add_hash)
-        df = df.transform(self.add_operation)
-        df = df.transform(self.add_source)
-        df = df.transform(self.add_key)
-
+    def add_metadata(self, df: DataFrame) -> DataFrame:
         if "__metadata" in df.columns:
+            DEFAULT_LOGGER.debug("add metadata", extra={"label": self})
+
             if self.mode == "register":
                 # https://github.com/delta-io/delta/issues/2014 (BUG)
                 df = df.withColumn(
@@ -307,6 +346,17 @@ class Bronze(BaseJob):
 
         return df
 
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extend)
+        df = df.transform(self.add_calculated_columns)
+        df = df.transform(self.add_hash)
+        df = df.transform(self.add_operation)
+        df = df.transform(self.add_source)
+        df = df.transform(self.add_key)
+        df = df.transform(self.add_metadata)
+
+        return df
+
     def create_or_replace_view(self):
         DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})
 
@@ -395,6 +445,6 @@ class Bronze(BaseJob):
         else:
             super().vacuum()
 
-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
         self.truncate()
-        self.run(schedule=schedule)
+        self.run(schedule=schedule, invoke=invoke)
fabricks/core/jobs/get_job.py
CHANGED
@@ -1,10 +1,10 @@
-from typing import Optional, Union,
+from typing import Optional, Union, overload
 
 from pyspark.sql.types import Row
 
-from fabricks.
+from fabricks.context import Bronzes, Golds, Silvers
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.
+from fabricks.models import get_job_id
 
 
 @overload
@@ -91,7 +91,6 @@ def get_job_internal(
     if step in Bronzes:
         from fabricks.core.jobs.bronze import Bronze
 
-        step = cast(TBronze, step)
         if job_id is not None:
             job = Bronze.from_job_id(step=step, job_id=job_id, conf=conf)
         else:
@@ -102,7 +101,6 @@ def get_job_internal(
     elif step in Silvers:
         from fabricks.core.jobs.silver import Silver
 
-        step = cast(TSilver, step)
         if job_id is not None:
             job = Silver.from_job_id(step=step, job_id=job_id, conf=conf)
         else:
@@ -113,7 +111,6 @@ def get_job_internal(
     elif step in Golds:
         from fabricks.core.jobs.gold import Gold
 
-        step = cast(TGold, step)
         if job_id is not None:
             job = Gold.from_job_id(step=step, job_id=job_id, conf=conf)
         else:
fabricks/core/jobs/get_job_conf.py
CHANGED
@@ -1,97 +1,48 @@
-from typing import Optional, Union,
+from typing import Optional, Union, overload
 
 from pyspark.sql.types import Row
 
-from fabricks.context import IS_JOB_CONFIG_FROM_YAML, SPARK
-from fabricks.
-from fabricks.core.jobs.get_job_id import get_job_id
+from fabricks.context import IS_JOB_CONFIG_FROM_YAML, SPARK, Bronzes, Golds, Silvers
+from fabricks.models import JobConf, get_job_id
 
 
-def get_job_conf_internal(step:
+def get_job_conf_internal(step: str, row: Union[Row, dict]) -> JobConf:
     if isinstance(row, Row):
         row = row.asDict(recursive=True)
 
-
-
-    check_options = row.get("check_options")
-    spark_options = row.get("spark_options")
-    invoker_options = row.get("invoker_options")
-    extender_options = row.get("extender_options")
-
-    job_id = row.get("job_id", get_job_id(step=step, topic=row["topic"], item=row["item"]))
+    # Add step to row data (job_id will be computed automatically)
+    row["step"] = step
 
+    # Use Pydantic validation - handles nested models and validation automatically
     if step in Bronzes:
-        from fabricks.
-
-
-        step = cast(TBronze, step)
-        return JobConfBronze(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            parser_options=row.get("parser_options"),
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfBronze
+
+        return JobConfBronze.model_validate(row)
 
     elif step in Silvers:
-        from fabricks.
-
-
-        step = cast(TSilver, step)
-        return JobConfSilver(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfSilver
+
+        return JobConfSilver.model_validate(row)
 
     elif step in Golds:
-        from fabricks.
-
-
-        step = cast(TGold, step)
-        return JobConfGold(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfGold
+
+        return JobConfGold.model_validate(row)
 
     else:
         raise ValueError(f"{step} not found")
 
 
 @overload
-def get_job_conf(step:
+def get_job_conf(step: str, *, job_id: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...
 
 
 @overload
-def get_job_conf(step:
+def get_job_conf(step: str, *, topic: str, item: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...
 
 
 def get_job_conf(
-    step:
+    step: str,
     job_id: Optional[str] = None,
     topic: Optional[str] = None,
     item: Optional[str] = None,