fabricks 3.0.19__py3-none-any.whl → 4.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +4 -4
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
fabricks/core/jobs/get_jobs.py
CHANGED

@@ -1,27 +1,26 @@
-from
-from typing import List, Literal, Optional, TypedDict, Union, overload
+from typing import List, Literal, Optional, Union, overload
 
+from pydantic import BaseModel
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr
 from pyspark.sql.types import Row
+from sparkdantic import create_spark_schema
 
 from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
-from fabricks.core.jobs.base._types import AllowedModes, TStep
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.get_job import get_job, get_job_internal
+from fabricks.models import AllowedModes
 from fabricks.utils.helpers import concat_dfs, run_in_parallel
-from fabricks.utils.path import
+from fabricks.utils.path import GitPath
 from fabricks.utils.read import read_yaml
-from fabricks.utils.schema import get_schema_for_type
 
 
-class GenericOptions(
+class GenericOptions(BaseModel):
     mode: AllowedModes
 
 
-
-
-    step: TStep
+class JobConfGeneric(BaseModel):
+    step: str
     job_id: str
     topic: str
     item: str
@@ -39,9 +38,9 @@ def get_jobs_internal():
 
 def get_jobs_internal_df() -> DataFrame:
     if IS_JOB_CONFIG_FROM_YAML:
-        schema =
+        schema = create_spark_schema(JobConfGeneric)
 
-        def _read_yaml(path:
+        def _read_yaml(path: GitPath):
            df = SPARK.createDataFrame(read_yaml(path, root="job"), schema=schema)  # type: ignore
            if df:
                df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
fabricks/core/jobs/get_schedules.py
CHANGED

@@ -1,23 +1,9 @@
-from typing import List, Optional, TypedDict
-
 from pyspark.sql import DataFrame
+from sparkdantic import create_spark_schema
 
 from fabricks.context import PATH_SCHEDULES, SPARK
-from fabricks.
+from fabricks.models.schedule import Schedule
 from fabricks.utils.read.read_yaml import read_yaml
-from fabricks.utils.schema import get_schema_for_type
-
-
-class Options(TypedDict):
-    steps: Optional[List[TStep]]
-    tag: Optional[str]
-    view: Optional[str]
-    variables: Optional[dict[str, str]]
-
-
-class Schedule(TypedDict):
-    name: str
-    options: Options
 
 
 def get_schedules():
@@ -25,7 +11,7 @@ def get_schedules():
 
 
 def get_schedules_df() -> DataFrame:
-    schema =
+    schema = create_spark_schema(Schedule)
    df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore
 
    assert df, "no schedules found"
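The recurring pattern in both files above is the 4.x move from `TypedDict` plus `get_schema_for_type` to Pydantic models plus sparkdantic. A minimal sketch of the schema derivation, using only the `JobConfGeneric` fields visible in the diff (the printed field list is indicative):

```python
# Minimal sketch of the 4.x schema derivation shown above, assuming only
# the fields visible in the diff; create_spark_schema comes from sparkdantic.
from pydantic import BaseModel
from sparkdantic import create_spark_schema


class JobConfGeneric(BaseModel):
    step: str
    job_id: str
    topic: str
    item: str


# Yields a pyspark StructType with one string field per model attribute,
# ready to pass to SPARK.createDataFrame(rows, schema=schema).
schema = create_spark_schema(JobConfGeneric)
print(schema.fieldNames())  # ['step', 'job_id', 'topic', 'item']
```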
fabricks/core/jobs/gold.py
CHANGED

@@ -1,6 +1,6 @@
 import re
 from collections.abc import Sequence
-from typing import List, Optional, Union, cast
+from typing import List, Literal, Optional, Union, cast
 
 from pyspark.sql import DataFrame
 from pyspark.sql.types import Row
@@ -8,18 +8,18 @@ from typing_extensions import deprecated
 
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TGold
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.core.udfs import is_registered, register_udf
+from fabricks.core.udfs import UDF_PREFIX, is_registered, register_udf
 from fabricks.metastore.view import create_or_replace_global_temp_view
-from fabricks.
+from fabricks.models import JobDependency, JobGoldOptions, StepGoldConf, StepGoldOptions
+from fabricks.utils.path import GitPath
 from fabricks.utils.sqlglot import fix, get_tables
 
 
 class Gold(BaseJob):
     def __init__(
         self,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -35,16 +35,31 @@ class Gold(BaseJob):
     )
 
     _sql: Optional[str] = None
-    _sql_path: Optional[
+    _sql_path: Optional[GitPath] = None
     _schema_drift: Optional[bool] = None
 
     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, job_id=job_id)
 
     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, topic=topic, item=item)
+
+    @property
+    def options(self) -> JobGoldOptions:
+        """Direct access to typed gold job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepGoldConf:
+        """Direct access to typed gold step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepGoldOptions:
+        """Direct access to typed gold step options."""
+        return self.base_step_conf.options  # type: ignore
 
     @property
     def stream(self) -> bool:
@@ -53,7 +68,7 @@ class Gold(BaseJob):
     @property
     def schema_drift(self) -> bool:
         if not self._schema_drift:
-            _schema_drift = self.step_conf.
+            _schema_drift = self.step_conf.options.schema_drift or False
            assert _schema_drift is not None
            self._schema_drift = cast(bool, _schema_drift)
        return self._schema_drift
@@ -68,7 +83,7 @@ class Gold(BaseJob):
 
     @property
     def sql(self) -> str:
-        sql = self.paths.
+        sql = self.paths.to_runtime.get_sql()
        return fix(sql, keep_comments=False)
 
     @deprecated("use sql instead")
@@ -81,17 +96,17 @@ class Gold(BaseJob):
             return []
 
         # udf not allowed in notebook
-        elif self.options.
+        elif self.options.notebook:
            return []
 
         # udf not allowed in table
-        elif self.options.
+        elif self.options.table:
            return []
 
         else:
            matches = []
-            if f"{
-                r = re.compile(rf"(?<={
+            if f"{UDF_PREFIX}" in self.sql:
+                r = re.compile(rf"(?<={UDF_PREFIX})\w*(?=\()")
                matches = re.findall(r, self.sql)
                matches = set(matches)
                matches = list(matches)
@@ -114,7 +129,7 @@ class Gold(BaseJob):
         schema_only: Optional[bool] = False,
         **kwargs,
     ) -> DataFrame:
-        if self.options.
+        if self.options.requirements:
            import sys
 
            sys.path.append("/dbfs/mnt/fabricks/site-packages")
@@ -122,28 +137,28 @@ class Gold(BaseJob):
         if self.mode == "invoke":
            df = self.spark.createDataFrame([{}])  # type: ignore
 
-        elif self.options.
-            invokers = self.
+        elif self.options.notebook:
+            invokers = self.invoker_options.run or [] if self.invoker_options else []
            assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"
 
            path = None
            if invokers:
-
-                if notebook:
-                from fabricks.context import PATH_RUNTIME
+                from fabricks.context import PATH_RUNTIME
 
-
+                path = PATH_RUNTIME.joinpath(invokers[0].notebook) if invokers[0].notebook else None
 
            if path is None:
-                path = self.paths.
+                path = self.paths.to_runtime
+
+            assert path is not None, "path could not be resolved"
 
            global_temp_view = self.invoke(path=path, schema_only=schema_only, **kwargs)
            assert global_temp_view is not None, "global_temp_view not found"
 
            df = self.spark.sql(f"select * from global_temp.{global_temp_view}")
 
-        elif self.options.
-            table = self.options.
+        elif self.options.table:
+            table = self.options.table
            df = self.spark.read.table(table)  # type: ignore
 
        else:
@@ -168,11 +183,11 @@ class Gold(BaseJob):
 
     def get_dependencies(self) -> Sequence[JobDependency]:
         data = []
-        parents = self.options.
+        parents = self.options.parents or []
 
        if self.mode == "invoke":
            dependencies = []
-        elif self.options.
+        elif self.options.notebook:
            dependencies = self._get_notebook_dependencies()
        else:
            dependencies = self._get_sql_dependencies()
@@ -189,7 +204,7 @@ class Gold(BaseJob):
        return data
 
     def _get_sql_dependencies(self) -> List[str]:
-        from fabricks.
+        from fabricks.context import Steps
 
        steps = [str(s) for s in Steps]
        return get_tables(self.sql, allowed_databases=steps)
@@ -217,13 +232,13 @@ class Gold(BaseJob):
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         # assume no duplicate in gold (to improve performance)
-        deduplicate = self.options.
+        deduplicate = self.options.deduplicate
         # assume no reload in gold (to improve performance)
-        rectify = self.options.
+        rectify = self.options.rectify_as_upserts
 
-        add_metadata = self.options.
+        add_metadata = self.options.metadata
        if add_metadata is None:
-            add_metadata = self.step_conf.
+            add_metadata = self.step_conf.options.metadata or False
 
        context = {
            "add_metadata": add_metadata,
@@ -288,10 +303,12 @@ class Gold(BaseJob):
 
         # correct __valid_from
         if self.change_data_capture == "scd2":
-            context["correct_valid_from"] =
+            context["correct_valid_from"] = (
+                self.options.correct_valid_from if self.options.correct_valid_from is not None else True
+            )
 
         # add __timestamp
-        if self.options.
+        if self.options.persist_last_timestamp:
            if self.change_data_capture == "scd1":
                if "__timestamp" not in df.columns:
                    context["add_timestamp"] = True
@@ -299,6 +316,14 @@ class Gold(BaseJob):
                if "__valid_from" not in df.columns:
                    context["add_timestamp"] = True
 
+        # add __updated
+        if self.options.persist_last_updated_timestamp:
+            if "__last_updated" not in df.columns:
+                context["add_last_updated"] = True
+        if self.options.last_updated:
+            if "__last_updated" not in df.columns:
+                context["add_last_updated"] = True
+
        if "__order_duplicate_by_asc" in df.columns:
            context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
        elif "__order_duplicate_by_desc" in df.columns:
@@ -345,7 +370,10 @@ class Gold(BaseJob):
 
     def for_each_run(self, **kwargs):
         last_version = None
-
+
+        if self.options.persist_last_timestamp:
+            last_version = self.table.get_last_version()
+        if self.options.persist_last_updated_timestamp:
            last_version = self.table.get_last_version()
 
        if self.mode == "invoke":
@@ -354,8 +382,11 @@ class Gold(BaseJob):
         else:
            super().for_each_run(**kwargs)
 
-        if self.options.
-            self.
+        if self.options.persist_last_timestamp:
+            self._persist_timestamp(field="__timestamp", last_version=last_version)
+
+        if self.options.persist_last_updated_timestamp:
+            self._persist_timestamp(field="__last_updated", last_version=last_version)
 
     def create(self):
         if self.mode == "invoke":
@@ -363,11 +394,11 @@ class Gold(BaseJob):
        else:
            self.register_udfs()
            super().create()
-            if self.options.
-                self.
+            if self.options.persist_last_timestamp:
+                self._persist_timestamp(create=True)
 
     def register(self):
-        if self.options.
+        if self.options.persist_last_timestamp:
            self.cdc_last_timestamp.table.register()
 
        if self.mode == "invoke":
@@ -376,7 +407,7 @@ class Gold(BaseJob):
            super().register()
 
     def drop(self):
-        if self.options.
+        if self.options.persist_last_timestamp:
            self.cdc_last_timestamp.drop()
 
        super().drop()
@@ -389,14 +420,25 @@ class Gold(BaseJob):
        cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
        return cdc
 
-    def
+    def _persist_timestamp(
+        self,
+        field: Literal["__timestamp", "__last_updated"] = "__timestamp",
+        last_version: Optional[int] = None,
+        create: bool = False,
+    ):
        df = self.spark.sql(f"select * from {self} limit 1")
 
        fields = []
-
-
-
-
+
+        if field == "__last_updated":
+            fields.append("max(__last_updated) :: timestamp as __last_updated")
+
+        elif field == "__timestamp":
+            if self.change_data_capture == "scd1":
+                fields.append("max(__timestamp) :: timestamp as __timestamp")
+            elif self.change_data_capture == "scd2":
+                fields.append("max(__valid_from) :: timestamp as __timestamp")
+
        if "__source" in df.columns:
            fields.append("__source")
 
@@ -412,7 +454,7 @@ class Gold(BaseJob):
        else:
            self.cdc_last_timestamp.overwrite(df)
 
-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
        if self.mode == "invoke":
            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
            return
@@ -423,4 +465,4 @@ class Gold(BaseJob):
            return
 
        self.overwrite_schema()
-        self.run(reload=True, schedule=schedule)
+        self.run(reload=True, schedule=schedule, invoke=invoke)
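The new `options`, `step_conf`, and `step_options` properties replace the attribute chains that 3.x resolved against untyped conf objects. A pared-down sketch of why this pattern type-checks; `JobGoldOptions` is limited here to fields this diff references, and the `Conf` wrapper stands in for `BaseJob` internals not shown in the diff:

```python
# Pared-down illustration of the typed-options pattern above. The real
# JobGoldOptions model in fabricks.models has more fields; Conf is a
# stand-in for BaseJob internals and is an assumption.
from typing import List, Optional

from pydantic import BaseModel


class JobGoldOptions(BaseModel):
    notebook: bool = False
    table: Optional[str] = None
    parents: Optional[List[str]] = None
    persist_last_timestamp: bool = False


class Conf(BaseModel):
    options: JobGoldOptions = JobGoldOptions()


class Gold:
    def __init__(self, conf: Conf):
        self.conf = conf

    @property
    def options(self) -> JobGoldOptions:
        """Direct access to typed gold job options."""
        return self.conf.options


job = Gold(Conf(options=JobGoldOptions(table="gold.fact_sales")))
if job.options.table:  # plain attribute access, verified by the type checker
    print(job.options.table)
```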
fabricks/core/jobs/silver.py
CHANGED

@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence, Union
 
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr
@@ -6,10 +6,10 @@ from pyspark.sql.types import Row
 
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.bronze import Bronze
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.models import JobDependency, JobSilverOptions, StepSilverConf, StepSilverOptions
 from fabricks.utils.helpers import concat_dfs
 from fabricks.utils.read.read import read
 from fabricks.utils.sqlglot import fix as fix_sql
@@ -18,7 +18,7 @@ from fabricks.utils.sqlglot import fix as fix_sql
 class Silver(BaseJob):
     def __init__(
         self,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -33,23 +33,38 @@ class Silver(BaseJob):
            conf=conf,
        )
 
-    _parent_step: Optional[
+    _parent_step: Optional[str] = None
     _stream: Optional[bool] = None
 
     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, job_id=job_id, conf=conf)
 
     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, topic=topic, item=item, conf=conf)
+
+    @property
+    def options(self) -> JobSilverOptions:
+        """Direct access to typed silver job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepSilverConf:
+        """Direct access to typed silver step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepSilverOptions:
+        """Direct access to typed silver step options."""
+        return self.base_step_conf.options  # type: ignore
 
     @property
     def stream(self) -> bool:
        if not self._stream:
-            _stream = self.options.
+            _stream = self.options.stream
            if _stream is None:
-                _stream = self.step_conf.
+                _stream = self.step_conf.options.stream
            self._stream = _stream if _stream is not None else True
        return self._stream  # type: ignore
 
@@ -66,18 +81,17 @@ class Silver(BaseJob):
        return self.mode in ["combine", "memory"]
 
     @property
-    def parent_step(self) ->
+    def parent_step(self) -> str:
        if not self._parent_step:
-            _parent_step = self.step_conf.
-            _parent_step = cast(TBronze, _parent_step)
+            _parent_step = self.step_conf.options.parent
            assert _parent_step is not None
-            self._parent_step = _parent_step
+            self._parent_step = str(_parent_step)
        return self._parent_step
 
-    def
-        df = df.transform(self.extend)
-
+    def update_metadata(self, df: DataFrame) -> DataFrame:
        if "__metadata" in df.columns:
+            DEFAULT_LOGGER.debug("update metadata", extra={"label": self})
+
            df = df.withColumn(
                "__metadata",
                expr(
@@ -88,11 +102,18 @@ class Silver(BaseJob):
                    __metadata.file_size as file_size,
                    __metadata.file_modification_time as file_modification_time,
                    __metadata.inserted as inserted,
-
+                    cast(current_timestamp() as timestamp) as updated
                    )
                    """
                ),
            )
+
+        return df
+
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extend)
+        df = self.update_metadata(df)
+
        return df
 
     def get_data(
@@ -153,7 +174,6 @@ class Silver(BaseJob):
 
        # transforms
        df = self.filter_where(df)
-        df = self.encrypt(df)
        if transform:
            df = self.base_transform(df)
 
@@ -165,7 +185,7 @@ class Silver(BaseJob):
     def get_dependencies(self) -> Sequence[JobDependency]:
        dependencies = []
 
-        parents = self.options.
+        parents = self.options.parents or []
        if parents:
            for p in parents:
                dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -237,9 +257,9 @@ class Silver(BaseJob):
        except Py4JJavaError as e:
            DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)
 
-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
        self.truncate()
-        self.run(schedule=schedule)
+        self.run(schedule=schedule, invoke=invoke)
 
     def overwrite_schema(self, df: Optional[DataFrame] = None):
        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
@@ -251,7 +271,7 @@ class Silver(BaseJob):
 
        not_append = not self.mode == "append"
        nocdc = self.change_data_capture == "nocdc"
-        order_duplicate_by = self.options.
+        order_duplicate_by = self.options.order_duplicate_by or {}
 
        rectify = False
        if not_append and not nocdc:
@@ -283,7 +303,7 @@ class Silver(BaseJob):
 
        context = {
            "soft_delete": self.slowly_changing_dimension,
-            "deduplicate": self.options.
+            "deduplicate": self.options.deduplicate if self.options.deduplicate is not None else not_append,
            "rectify": rectify,
            "order_duplicate_by": order_duplicate_by,
        }
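The `update_metadata` refactor above rebuilds the `__metadata` struct field by field and stamps a new `updated` timestamp. A self-contained sketch of the same `withColumn` plus named-struct rewrite, reproducing only the struct fields visible in the diff (the surrounding job machinery is omitted):

```python
# Standalone sketch of the __metadata rewrite in Silver.update_metadata above;
# the input row and its values are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()

df = spark.sql(
    """
    select
      1 as id,
      named_struct(
        'file_size', 1024L,
        'file_modification_time', current_timestamp(),
        'inserted', current_timestamp()
      ) as __metadata
    """
)

# Rebuild the struct and append the new `updated` member, mirroring the
# expr() block in the diff.
df = df.withColumn(
    "__metadata",
    expr(
        """
        struct(
          __metadata.file_size as file_size,
          __metadata.file_modification_time as file_modification_time,
          __metadata.inserted as inserted,
          cast(current_timestamp() as timestamp) as updated
        )
        """
    ),
)
df.printSchema()  # __metadata now carries the extra `updated` timestamp field
```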
fabricks/core/masks.py
CHANGED

@@ -3,22 +3,25 @@ from typing import List, Optional
 
 from pyspark.sql import SparkSession
 
-from fabricks.context import CATALOG, PATH_MASKS, SPARK
+from fabricks.context import CATALOG, CONF_RUNTIME, PATH_MASKS, SPARK
 from fabricks.context.log import DEFAULT_LOGGER
 
+MASK_SCHEMA = CONF_RUNTIME.mask_options.schema_name or "default" if CONF_RUNTIME.mask_options else "default"
+MASK_PREFIX = CONF_RUNTIME.mask_options.prefix or "mask_" if CONF_RUNTIME.mask_options else "mask_"
+
 
 def register_all_masks(override: bool = False):
     """
     Register all masks.
     """
 
-    DEFAULT_LOGGER.info("register masks")
+    DEFAULT_LOGGER.info("register masks", extra={"label": "fabricks"})
    for mask in get_masks():
        split = mask.split(".")
        try:
            register_mask(mask=split[0], override=override)
        except Exception as e:
-            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e, extra={"label": "fabricks"})
 
 
 def get_masks() -> List[str]:
@@ -30,12 +33,12 @@ def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
        spark = SPARK
    assert spark is not None
 
-    df = spark.sql("show user functions in
+    df = spark.sql(f"show user functions in {MASK_SCHEMA}")
 
    if CATALOG:
-        df = df.where(f"function == '{CATALOG}.
+        df = df.where(f"function == '{CATALOG}.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
    else:
-        df = df.where(f"function == 'spark_catalog.
+        df = df.where(f"function == 'spark_catalog.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
 
    return not df.isEmpty()
 
@@ -47,9 +50,9 @@ def register_mask(mask: str, override: Optional[bool] = False, spark: Optional[S
 
    if not is_registered(mask, spark) or override:
        if override:
-            DEFAULT_LOGGER.debug(f"
+            DEFAULT_LOGGER.debug(f"drop mask {mask}", extra={"label": "fabricks"})
        else:
-            DEFAULT_LOGGER.debug(f"register mask {mask}")
+            DEFAULT_LOGGER.debug(f"register mask {mask}", extra={"label": "fabricks"})
 
        path = PATH_MASKS.joinpath(f"{mask}.sql")
        spark.sql(path.get_sql())
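The new `MASK_SCHEMA` and `MASK_PREFIX` module constants make the fully qualified mask function name configurable via `CONF_RUNTIME.mask_options`, with `default` and `mask_` as fallbacks. A sketch of the name resolution that `is_registered` now filters on; the helper and example values are hypothetical:

```python
# Hypothetical helper mirroring the f-strings in is_registered above:
# a mask function is looked up as <catalog>.<MASK_SCHEMA>.<MASK_PREFIX><mask>.
from typing import Optional


def qualified_mask_name(
    mask: str,
    catalog: Optional[str] = None,
    schema: str = "default",  # MASK_SCHEMA fallback
    prefix: str = "mask_",    # MASK_PREFIX fallback
) -> str:
    # Without a catalog name, Spark reports functions under spark_catalog.
    return f"{catalog or 'spark_catalog'}.{schema}.{prefix}{mask}"


assert qualified_mask_name("email") == "spark_catalog.default.mask_email"
assert qualified_mask_name("email", catalog="prod") == "prod.default.mask_email"
```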
fabricks/core/parsers/__init__.py
CHANGED

@@ -1,4 +1,3 @@
-from fabricks.core.parsers._types import ParserOptions
 from fabricks.core.parsers.base import PARSERS, BaseParser
 from fabricks.core.parsers.decorator import parser
 from fabricks.core.parsers.get_parser import get_parser
@@ -7,6 +6,5 @@ __all__ = [
     "BaseParser",
     "get_parser",
     "parser",
-    "ParserOptions",
     "PARSERS",
 ]