fabricks-3.0.18-py3-none-any.whl → fabricks-4.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +8 -7
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +96 -43
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +9 -8
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +269 -102
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -137
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/jobs/get_jobs.py
CHANGED
@@ -1,27 +1,26 @@
-from
-from typing import List, Literal, Optional, TypedDict, Union, overload
+from typing import List, Literal, Optional, Union, overload

+from pydantic import BaseModel
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr
 from pyspark.sql.types import Row
+from sparkdantic import create_spark_schema

 from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
-from fabricks.core.jobs.base._types import AllowedModes, TStep
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.get_job import get_job, get_job_internal
+from fabricks.models import AllowedModes
 from fabricks.utils.helpers import concat_dfs, run_in_parallel
-from fabricks.utils.path import
+from fabricks.utils.path import GitPath
 from fabricks.utils.read import read_yaml
-from fabricks.utils.schema import get_schema_for_type


-class GenericOptions(
+class GenericOptions(BaseModel):
     mode: AllowedModes


-
-
-    step: TStep
+class JobConfGeneric(BaseModel):
+    step: str
     job_id: str
     topic: str
     item: str
@@ -39,9 +38,9 @@ def get_jobs_internal():

 def get_jobs_internal_df() -> DataFrame:
     if IS_JOB_CONFIG_FROM_YAML:
-        schema =
+        schema = create_spark_schema(JobConfGeneric)

-        def _read_yaml(path:
+        def _read_yaml(path: GitPath):
             df = SPARK.createDataFrame(read_yaml(path, root="job"), schema=schema)  # type: ignore
             if df:
                 df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))

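The change above swaps the hand-written TypedDicts and the `get_schema_for_type` helper for Pydantic models whose Spark schema is derived by sparkdantic. A minimal sketch of the same pattern outside fabricks (the model fields and sample rows are illustrative stand-ins, not the real `JobConfGeneric`):

from pydantic import BaseModel
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from sparkdantic import create_spark_schema


class JobRow(BaseModel):  # stand-in for a job conf model, not the real JobConfGeneric
    step: str
    topic: str
    item: str


spark = SparkSession.builder.getOrCreate()

# derive the Spark schema from the Pydantic model (replaces get_schema_for_type)
schema = create_spark_schema(JobRow)

rows = [{"step": "gold", "topic": "sales", "item": "orders"}]
df = spark.createDataFrame(rows, schema=schema)

# same deterministic job_id derivation as in get_jobs_internal_df above
df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
df.show(truncate=False)

Because the `md5(concat(step,'.',topic,'_',item))` expression is unchanged in the hunk above, the derived job_id values stay stable across the 3.x to 4.0 migration.
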
fabricks/core/jobs/get_schedules.py
CHANGED

@@ -1,23 +1,9 @@
-from typing import List, Optional, TypedDict
-
 from pyspark.sql import DataFrame
+from sparkdantic import create_spark_schema

 from fabricks.context import PATH_SCHEDULES, SPARK
-from fabricks.
+from fabricks.models.schedule import Schedule
 from fabricks.utils.read.read_yaml import read_yaml
-from fabricks.utils.schema import get_schema_for_type
-
-
-class Options(TypedDict):
-    steps: Optional[List[TStep]]
-    tag: Optional[str]
-    view: Optional[str]
-    variables: Optional[dict[str, str]]
-
-
-class Schedule(TypedDict):
-    name: str
-    options: Options


 def get_schedules():
@@ -25,7 +11,7 @@ def get_schedules():


 def get_schedules_df() -> DataFrame:
-    schema =
+    schema = create_spark_schema(Schedule)
     df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore

     assert df, "no schedules found"

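The removed `Options`/`Schedule` TypedDicts now come from `fabricks.models.schedule`. A rough sketch of how such nested Pydantic models translate to a Spark schema via sparkdantic; the field names are copied from the removed TypedDicts (`TStep` narrowed to `str`), and the real `Schedule` model may differ:

from typing import Dict, List, Optional

from pydantic import BaseModel
from sparkdantic import create_spark_schema


class ScheduleOptions(BaseModel):  # mirrors the removed Options TypedDict
    steps: Optional[List[str]] = None
    tag: Optional[str] = None
    view: Optional[str] = None
    variables: Optional[Dict[str, str]] = None


class ScheduleRow(BaseModel):  # mirrors the removed Schedule TypedDict
    name: str
    options: ScheduleOptions


# nested models become nested StructTypes; Optional fields become nullable columns
print(create_spark_schema(ScheduleRow))
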
fabricks/core/jobs/gold.py
CHANGED
@@ -1,6 +1,6 @@
 import re
 from collections.abc import Sequence
-from typing import List, Optional, Union, cast
+from typing import List, Literal, Optional, Union, cast

 from pyspark.sql import DataFrame
 from pyspark.sql.types import Row
@@ -8,18 +8,18 @@ from typing_extensions import deprecated

 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TGold
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.core.udfs import is_registered, register_udf
+from fabricks.core.udfs import UDF_PREFIX, is_registered, register_udf
 from fabricks.metastore.view import create_or_replace_global_temp_view
-from fabricks.
+from fabricks.models import JobDependency, JobGoldOptions, StepGoldConf, StepGoldOptions
+from fabricks.utils.path import GitPath
 from fabricks.utils.sqlglot import fix, get_tables


 class Gold(BaseJob):
     def __init__(
         self,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -35,16 +35,31 @@ class Gold(BaseJob):
         )

     _sql: Optional[str] = None
-    _sql_path: Optional[
+    _sql_path: Optional[GitPath] = None
     _schema_drift: Optional[bool] = None

     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, job_id=job_id)

     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, topic=topic, item=item)
+
+    @property
+    def options(self) -> JobGoldOptions:
+        """Direct access to typed gold job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepGoldConf:
+        """Direct access to typed gold step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepGoldOptions:
+        """Direct access to typed gold step options."""
+        return self.base_step_conf.options  # type: ignore

     @property
     def stream(self) -> bool:
@@ -53,7 +68,7 @@ class Gold(BaseJob):
     @property
     def schema_drift(self) -> bool:
         if not self._schema_drift:
-            _schema_drift = self.step_conf.
+            _schema_drift = self.step_conf.options.schema_drift or False
             assert _schema_drift is not None
             self._schema_drift = cast(bool, _schema_drift)
         return self._schema_drift
@@ -68,7 +83,7 @@ class Gold(BaseJob):

     @property
     def sql(self) -> str:
-        sql = self.paths.
+        sql = self.paths.to_runtime.get_sql()
         return fix(sql, keep_comments=False)

     @deprecated("use sql instead")
@@ -81,17 +96,17 @@ class Gold(BaseJob):
             return []

         # udf not allowed in notebook
-        elif self.options.
+        elif self.options.notebook:
             return []

         # udf not allowed in table
-        elif self.options.
+        elif self.options.table:
             return []

         else:
            matches = []
-            if f"{
-                r = re.compile(rf"(?<={
+            if f"{UDF_PREFIX}" in self.sql:
+                r = re.compile(rf"(?<={UDF_PREFIX})\w*(?=\()")
                matches = re.findall(r, self.sql)
                matches = set(matches)
                matches = list(matches)
@@ -114,7 +129,7 @@ class Gold(BaseJob):
         schema_only: Optional[bool] = False,
         **kwargs,
     ) -> DataFrame:
-        if self.options.
+        if self.options.requirements:
             import sys

             sys.path.append("/dbfs/mnt/fabricks/site-packages")
@@ -122,17 +137,28 @@ class Gold(BaseJob):
         if self.mode == "invoke":
             df = self.spark.createDataFrame([{}])  # type: ignore

-        elif self.options.
-            invokers = self.
+        elif self.options.notebook:
+            invokers = self.invoker_options.run or [] if self.invoker_options else []
             assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"

-
+            path = None
+            if invokers:
+                from fabricks.context import PATH_RUNTIME
+
+                path = PATH_RUNTIME.joinpath(invokers[0].notebook) if invokers[0].notebook else None
+
+            if path is None:
+                path = self.paths.to_runtime
+
+            assert path is not None, "path could not be resolved"
+
+            global_temp_view = self.invoke(path=path, schema_only=schema_only, **kwargs)
             assert global_temp_view is not None, "global_temp_view not found"

             df = self.spark.sql(f"select * from global_temp.{global_temp_view}")

-        elif self.options.
-            table = self.options.
+        elif self.options.table:
+            table = self.options.table
             df = self.spark.read.table(table)  # type: ignore

         else:
@@ -157,11 +183,11 @@ class Gold(BaseJob):

     def get_dependencies(self) -> Sequence[JobDependency]:
         data = []
-        parents = self.options.
+        parents = self.options.parents or []

         if self.mode == "invoke":
             dependencies = []
-        elif self.options.
+        elif self.options.notebook:
             dependencies = self._get_notebook_dependencies()
         else:
             dependencies = self._get_sql_dependencies()
@@ -178,7 +204,7 @@ class Gold(BaseJob):
         return data

     def _get_sql_dependencies(self) -> List[str]:
-        from fabricks.
+        from fabricks.context import Steps

         steps = [str(s) for s in Steps]
         return get_tables(self.sql, allowed_databases=steps)
@@ -206,13 +232,13 @@ class Gold(BaseJob):

     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         # assume no duplicate in gold (to improve performance)
-        deduplicate = self.options.
+        deduplicate = self.options.deduplicate
         # assume no reload in gold (to improve performance)
-        rectify = self.options.
+        rectify = self.options.rectify_as_upserts

-        add_metadata = self.options.
+        add_metadata = self.options.metadata
         if add_metadata is None:
-            add_metadata = self.step_conf.
+            add_metadata = self.step_conf.options.metadata or False

         context = {
             "add_metadata": add_metadata,
@@ -277,10 +303,12 @@ class Gold(BaseJob):

         # correct __valid_from
         if self.change_data_capture == "scd2":
-            context["correct_valid_from"] =
+            context["correct_valid_from"] = (
+                self.options.correct_valid_from if self.options.correct_valid_from is not None else True
+            )

         # add __timestamp
-        if self.options.
+        if self.options.persist_last_timestamp:
             if self.change_data_capture == "scd1":
                 if "__timestamp" not in df.columns:
                     context["add_timestamp"] = True
@@ -288,6 +316,14 @@ class Gold(BaseJob):
                 if "__valid_from" not in df.columns:
                     context["add_timestamp"] = True

+        # add __updated
+        if self.options.persist_last_updated_timestamp:
+            if "__last_updated" not in df.columns:
+                context["add_last_updated"] = True
+        if self.options.last_updated:
+            if "__last_updated" not in df.columns:
+                context["add_last_updated"] = True
+
         if "__order_duplicate_by_asc" in df.columns:
             context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
         elif "__order_duplicate_by_desc" in df.columns:
@@ -334,7 +370,10 @@ class Gold(BaseJob):

     def for_each_run(self, **kwargs):
         last_version = None
-
+
+        if self.options.persist_last_timestamp:
+            last_version = self.table.get_last_version()
+        if self.options.persist_last_updated_timestamp:
             last_version = self.table.get_last_version()

         if self.mode == "invoke":
@@ -343,8 +382,11 @@ class Gold(BaseJob):
         else:
             super().for_each_run(**kwargs)

-        if self.options.
-            self.
+        if self.options.persist_last_timestamp:
+            self._persist_timestamp(field="__timestamp", last_version=last_version)
+
+        if self.options.persist_last_updated_timestamp:
+            self._persist_timestamp(field="__last_updated", last_version=last_version)

     def create(self):
         if self.mode == "invoke":
@@ -352,11 +394,11 @@ class Gold(BaseJob):
         else:
             self.register_udfs()
             super().create()
-        if self.options.
-            self.
+        if self.options.persist_last_timestamp:
+            self._persist_timestamp(create=True)

     def register(self):
-        if self.options.
+        if self.options.persist_last_timestamp:
             self.cdc_last_timestamp.table.register()

         if self.mode == "invoke":
@@ -365,7 +407,7 @@ class Gold(BaseJob):
             super().register()

     def drop(self):
-        if self.options.
+        if self.options.persist_last_timestamp:
             self.cdc_last_timestamp.drop()

         super().drop()
@@ -378,14 +420,25 @@ class Gold(BaseJob):
         cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
         return cdc

-    def
+    def _persist_timestamp(
+        self,
+        field: Literal["__timestamp", "__last_updated"] = "__timestamp",
+        last_version: Optional[int] = None,
+        create: bool = False,
+    ):
         df = self.spark.sql(f"select * from {self} limit 1")

         fields = []
-
-
-
-
+
+        if field == "__last_updated":
+            fields.append("max(__last_updated) :: timestamp as __last_updated")
+
+        elif field == "__timestamp":
+            if self.change_data_capture == "scd1":
+                fields.append("max(__timestamp) :: timestamp as __timestamp")
+            elif self.change_data_capture == "scd2":
+                fields.append("max(__valid_from) :: timestamp as __timestamp")
+
         if "__source" in df.columns:
             fields.append("__source")

@@ -401,7 +454,7 @@ class Gold(BaseJob):
         else:
             self.cdc_last_timestamp.overwrite(df)

-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
         if self.mode == "invoke":
             DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
             return
@@ -412,4 +465,4 @@ class Gold(BaseJob):
             return

         self.overwrite_schema()
-        self.run(reload=True, schedule=schedule)
+        self.run(reload=True, schedule=schedule, invoke=invoke)

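Gold now reads its configuration through typed properties (`options`, `step_conf`, `step_options`) instead of untyped lookups; Silver below gets the same treatment. A minimal sketch of that property-based access, with stand-in models rather than the real `fabricks.models` types:

from typing import List, Optional

from pydantic import BaseModel


class GoldOptions(BaseModel):  # stand-in for JobGoldOptions, not the real model
    notebook: bool = False
    table: Optional[str] = None
    parents: Optional[List[str]] = None
    persist_last_timestamp: bool = False


class JobConf(BaseModel):
    options: GoldOptions


class Job:  # stand-in for the BaseJob plumbing
    def __init__(self, conf: JobConf):
        self.conf = conf

    @property
    def options(self) -> GoldOptions:
        """Typed access: attribute completion and validation instead of dict-style lookups."""
        return self.conf.options


job = Job(JobConf(options=GoldOptions(table="gold.fact_orders")))
parents = job.options.parents or []   # None-safe default, as in get_dependencies above
if job.options.table:                 # truthy check, as in get_data above
    print(f"reading from {job.options.table}")

Pydantic validates the option block when the conf is parsed, so the call sites only keep the small None-safe defaults visible in the hunks above.
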
fabricks/core/jobs/silver.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Optional, Sequence, Union
+from typing import Optional, Sequence, Union

 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr
@@ -6,10 +6,10 @@ from pyspark.sql.types import Row

 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.bronze import Bronze
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.models import JobDependency, JobSilverOptions, StepSilverConf, StepSilverOptions
 from fabricks.utils.helpers import concat_dfs
 from fabricks.utils.read.read import read
 from fabricks.utils.sqlglot import fix as fix_sql
@@ -18,7 +18,7 @@ from fabricks.utils.sqlglot import fix as fix_sql
 class Silver(BaseJob):
     def __init__(
         self,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -33,23 +33,38 @@ class Silver(BaseJob):
             conf=conf,
         )

-    _parent_step: Optional[
+    _parent_step: Optional[str] = None
     _stream: Optional[bool] = None

     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, job_id=job_id, conf=conf)

     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=
+        return cls(step=step, topic=topic, item=item, conf=conf)
+
+    @property
+    def options(self) -> JobSilverOptions:
+        """Direct access to typed silver job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepSilverConf:
+        """Direct access to typed silver step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepSilverOptions:
+        """Direct access to typed silver step options."""
+        return self.base_step_conf.options  # type: ignore

     @property
     def stream(self) -> bool:
         if not self._stream:
-            _stream = self.options.
+            _stream = self.options.stream
             if _stream is None:
-                _stream = self.step_conf.
+                _stream = self.step_conf.options.stream
             self._stream = _stream if _stream is not None else True
         return self._stream  # type: ignore

@@ -66,18 +81,17 @@ class Silver(BaseJob):
         return self.mode in ["combine", "memory"]

     @property
-    def parent_step(self) ->
+    def parent_step(self) -> str:
         if not self._parent_step:
-            _parent_step = self.step_conf.
-            _parent_step = cast(TBronze, _parent_step)
+            _parent_step = self.step_conf.options.parent
             assert _parent_step is not None
-            self._parent_step = _parent_step
+            self._parent_step = str(_parent_step)
         return self._parent_step

-    def
-        df = df.transform(self.extend)
-
+    def update_metadata(self, df: DataFrame) -> DataFrame:
         if "__metadata" in df.columns:
+            DEFAULT_LOGGER.debug("update metadata", extra={"label": self})
+
             df = df.withColumn(
                 "__metadata",
                 expr(
@@ -88,11 +102,18 @@ class Silver(BaseJob):
                     __metadata.file_size as file_size,
                     __metadata.file_modification_time as file_modification_time,
                     __metadata.inserted as inserted,
-
+                    cast(current_timestamp() as timestamp) as updated
                     )
                     """
                 ),
             )
+
+        return df
+
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extend)
+        df = self.update_metadata(df)
+
         return df

     def get_data(
@@ -153,7 +174,6 @@ class Silver(BaseJob):

         # transforms
         df = self.filter_where(df)
-        df = self.encrypt(df)
         if transform:
             df = self.base_transform(df)

@@ -165,7 +185,7 @@ class Silver(BaseJob):
     def get_dependencies(self) -> Sequence[JobDependency]:
         dependencies = []

-        parents = self.options.
+        parents = self.options.parents or []
         if parents:
             for p in parents:
                 dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -237,9 +257,9 @@ class Silver(BaseJob):
         except Py4JJavaError as e:
             DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)

-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
         self.truncate()
-        self.run(schedule=schedule)
+        self.run(schedule=schedule, invoke=invoke)

     def overwrite_schema(self, df: Optional[DataFrame] = None):
         DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
@@ -251,7 +271,7 @@ class Silver(BaseJob):

         not_append = not self.mode == "append"
         nocdc = self.change_data_capture == "nocdc"
-        order_duplicate_by = self.options.
+        order_duplicate_by = self.options.order_duplicate_by or {}

         rectify = False
         if not_append and not nocdc:
@@ -283,7 +303,7 @@ class Silver(BaseJob):

         context = {
             "soft_delete": self.slowly_changing_dimension,
-            "deduplicate": self.options.
+            "deduplicate": self.options.deduplicate if self.options.deduplicate is not None else not_append,
             "rectify": rectify,
             "order_duplicate_by": order_duplicate_by,
         }

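The metadata refresh moves out of `base_transform` into a dedicated `update_metadata` step, composed through `DataFrame.transform`. A small standalone sketch of that composition style (column and function names simplified, not the fabricks implementation):

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import current_timestamp


def extend(df: DataFrame) -> DataFrame:
    # placeholder for job-specific extenders
    return df


def update_metadata(df: DataFrame) -> DataFrame:
    # only touch the column when it is present, mirroring the __metadata guard above
    if "updated" in df.columns:
        df = df.withColumn("updated", current_timestamp())
    return df


def base_transform(df: DataFrame) -> DataFrame:
    df = df.transform(extend)
    df = update_metadata(df)
    return df


spark = SparkSession.builder.getOrCreate()
df = base_transform(spark.createDataFrame([("a", None)], "id string, updated timestamp"))
df.show()
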
fabricks/core/masks.py
CHANGED
@@ -3,22 +3,25 @@ from typing import List, Optional

 from pyspark.sql import SparkSession

-from fabricks.context import CATALOG, PATH_MASKS, SPARK
+from fabricks.context import CATALOG, CONF_RUNTIME, PATH_MASKS, SPARK
 from fabricks.context.log import DEFAULT_LOGGER

+MASK_SCHEMA = CONF_RUNTIME.mask_options.schema_name or "default" if CONF_RUNTIME.mask_options else "default"
+MASK_PREFIX = CONF_RUNTIME.mask_options.prefix or "mask_" if CONF_RUNTIME.mask_options else "mask_"
+

 def register_all_masks(override: bool = False):
     """
     Register all masks.
     """

-    DEFAULT_LOGGER.info("register masks")
+    DEFAULT_LOGGER.info("register masks", extra={"label": "fabricks"})
     for mask in get_masks():
         split = mask.split(".")
         try:
             register_mask(mask=split[0], override=override)
         except Exception as e:
-            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e, extra={"label": "fabricks"})


 def get_masks() -> List[str]:
@@ -30,12 +33,12 @@ def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
         spark = SPARK
     assert spark is not None

-    df = spark.sql("show user functions in
+    df = spark.sql(f"show user functions in {MASK_SCHEMA}")

     if CATALOG:
-        df = df.where(f"function == '{CATALOG}.
+        df = df.where(f"function == '{CATALOG}.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
     else:
-        df = df.where(f"function == 'spark_catalog.
+        df = df.where(f"function == 'spark_catalog.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")

     return not df.isEmpty()

@@ -47,9 +50,9 @@ def register_mask(mask: str, override: Optional[bool] = False, spark: Optional[S

     if not is_registered(mask, spark) or override:
         if override:
-            DEFAULT_LOGGER.debug(f"
+            DEFAULT_LOGGER.debug(f"drop mask {mask}", extra={"label": "fabricks"})
         else:
-            DEFAULT_LOGGER.debug(f"register mask {mask}")
+            DEFAULT_LOGGER.debug(f"register mask {mask}", extra={"label": "fabricks"})

         path = PATH_MASKS.joinpath(f"{mask}.sql")
         spark.sql(path.get_sql())

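The new module-level `MASK_SCHEMA` and `MASK_PREFIX` constants fall back to defaults when the mask options block or its fields are missing. Note the precedence: `x.y or "default" if x else "default"` parses as `(x.y or "default") if x else "default"`, so the `or` binds before the conditional. A tiny illustration with a stand-in options model (not the real runtime config type):

from typing import Optional, Tuple

from pydantic import BaseModel


class MaskOptions(BaseModel):  # stand-in for the runtime mask options block
    schema_name: Optional[str] = None
    prefix: Optional[str] = None


def resolve(mask_options: Optional[MaskOptions]) -> Tuple[str, str]:
    # same expression shape as MASK_SCHEMA / MASK_PREFIX above
    schema = mask_options.schema_name or "default" if mask_options else "default"
    prefix = mask_options.prefix or "mask_" if mask_options else "mask_"
    return schema, prefix


print(resolve(None))                                 # ('default', 'mask_')
print(resolve(MaskOptions()))                        # ('default', 'mask_')
print(resolve(MaskOptions(schema_name="security")))  # ('security', 'mask_')
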
fabricks/core/parsers/__init__.py
CHANGED

@@ -1,4 +1,3 @@
-from fabricks.core.parsers._types import ParserOptions
 from fabricks.core.parsers.base import PARSERS, BaseParser
 from fabricks.core.parsers.decorator import parser
 from fabricks.core.parsers.get_parser import get_parser
@@ -7,6 +6,5 @@ __all__ = [
     "BaseParser",
     "get_parser",
     "parser",
-    "ParserOptions",
     "PARSERS",
 ]

fabricks/core/parsers/base.py
CHANGED
@@ -5,15 +5,15 @@ from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.functions import col, expr, from_json, lit
 from pyspark.sql.types import MapType, StringType

-from fabricks.core.parsers._types import ParserOptions
 from fabricks.core.parsers.utils import clean
-from fabricks.
+from fabricks.models import ParserOptions
+from fabricks.utils.path import FileSharePath
 from fabricks.utils.read.read import read


 class BaseParser(ABC):
     def __init__(self, options: Optional[ParserOptions], file_format: str):
-        self.options = options
+        self.options = options
         self.file_format = file_format

     def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
@@ -33,8 +33,8 @@ class BaseParser(ABC):

     def parse(
         self,
-        data_path:
-        schema_path:
+        data_path: FileSharePath,
+        schema_path: FileSharePath,
         spark: SparkSession,
         stream: bool,
     ) -> DataFrame:
@@ -43,7 +43,7 @@ class BaseParser(ABC):
             path=data_path,
             file_format=self.file_format,
             schema_path=schema_path,
-            options=self.options.
+            options=self.options.read_options if self.options else {},
             spark=spark,
         )

@@ -55,8 +55,8 @@ class BaseParser(ABC):
     @final
     def get_data(
         self,
-        data_path:
-        schema_path:
+        data_path: FileSharePath,
+        schema_path: FileSharePath,
         spark: SparkSession,
         stream: bool,
     ) -> DataFrame:
@@ -64,8 +64,8 @@ class BaseParser(ABC):
         Retrieves and processes data from the specified data path using the provided schema.

         Args:
-            data_path (
-            schema_path (
+            data_path (FileSharePath): The path to the data file.
+            schema_path (FileSharePath): The path to the schema file.
             spark (SparkSession): The SparkSession object.
             stream (bool): Indicates whether the data should be processed as a stream.
