fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +4 -4
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +89 -47
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +7 -7
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +265 -108
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -139
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
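
Note: the recurring pattern in the hunks below is the replacement of TypedDict/dataclass config types and the custom `fabricks.utils.schema` helpers with pydantic models whose Spark schemas are derived by sparkdantic. A minimal sketch of the new pattern, assuming only the field names visible in the `JobConfGeneric` hunk below:

```python
from pydantic import BaseModel
from sparkdantic import create_spark_schema


class JobConfGeneric(BaseModel):
    # field subset taken from the diff below
    step: str
    job_id: str
    topic: str
    item: str


# sparkdantic derives a pyspark StructType from the pydantic model,
# replacing the removed get_schema_for_type helper.
schema = create_spark_schema(JobConfGeneric)
```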
fabricks/core/jobs/get_jobs.py CHANGED
@@ -1,27 +1,26 @@
- from dataclasses import dataclass
- from typing import List, Literal, Optional, TypedDict, Union, overload
+ from typing import List, Literal, Optional, Union, overload

+ from pydantic import BaseModel
  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr
  from pyspark.sql.types import Row
+ from sparkdantic import create_spark_schema

  from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
- from fabricks.core.jobs.base._types import AllowedModes, TStep
  from fabricks.core.jobs.base.job import BaseJob
  from fabricks.core.jobs.get_job import get_job, get_job_internal
+ from fabricks.models import AllowedModes
  from fabricks.utils.helpers import concat_dfs, run_in_parallel
- from fabricks.utils.path import Path
+ from fabricks.utils.path import GitPath
  from fabricks.utils.read import read_yaml
- from fabricks.utils.schema import get_schema_for_type


- class GenericOptions(TypedDict):
+ class GenericOptions(BaseModel):
      mode: AllowedModes


- @dataclass
- class JobConfGeneric:
-     step: TStep
+ class JobConfGeneric(BaseModel):
+     step: str
      job_id: str
      topic: str
      item: str
@@ -39,9 +38,9 @@ def get_jobs_internal():

  def get_jobs_internal_df() -> DataFrame:
      if IS_JOB_CONFIG_FROM_YAML:
-         schema = get_schema_for_type(JobConfGeneric)
+         schema = create_spark_schema(JobConfGeneric)

-         def _read_yaml(path: Path):
+         def _read_yaml(path: GitPath):
              df = SPARK.createDataFrame(read_yaml(path, root="job"), schema=schema)  # type: ignore
              if df:
                  df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
fabricks/core/jobs/get_schedules.py CHANGED
@@ -1,23 +1,9 @@
- from typing import List, Optional, TypedDict
-
  from pyspark.sql import DataFrame
+ from sparkdantic import create_spark_schema

  from fabricks.context import PATH_SCHEDULES, SPARK
- from fabricks.core.jobs.base._types import TStep
+ from fabricks.models.schedule import Schedule
  from fabricks.utils.read.read_yaml import read_yaml
- from fabricks.utils.schema import get_schema_for_type
-
-
- class Options(TypedDict):
-     steps: Optional[List[TStep]]
-     tag: Optional[str]
-     view: Optional[str]
-     variables: Optional[dict[str, str]]
-
-
- class Schedule(TypedDict):
-     name: str
-     options: Options


  def get_schedules():
@@ -25,7 +11,7 @@ def get_schedules():


  def get_schedules_df() -> DataFrame:
-     schema = get_schema_for_type(Schedule)
+     schema = create_spark_schema(Schedule)
      df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore

      assert df, "no schedules found"
fabricks/core/jobs/gold.py CHANGED
@@ -1,6 +1,6 @@
  import re
  from collections.abc import Sequence
- from typing import List, Optional, Union, cast
+ from typing import List, Literal, Optional, Union, cast

  from pyspark.sql import DataFrame
  from pyspark.sql.types import Row
@@ -8,18 +8,18 @@ from typing_extensions import deprecated

  from fabricks.cdc.nocdc import NoCDC
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import JobDependency, TGold
  from fabricks.core.jobs.base.job import BaseJob
- from fabricks.core.udfs import is_registered, register_udf, udf_prefix
+ from fabricks.core.udfs import UDF_PREFIX, is_registered, register_udf
  from fabricks.metastore.view import create_or_replace_global_temp_view
- from fabricks.utils.path import Path
+ from fabricks.models import JobDependency, JobGoldOptions, StepGoldConf, StepGoldOptions
+ from fabricks.utils.path import GitPath
  from fabricks.utils.sqlglot import fix, get_tables


  class Gold(BaseJob):
      def __init__(
          self,
-         step: TGold,
+         step: str,
          topic: Optional[str] = None,
          item: Optional[str] = None,
          job_id: Optional[str] = None,
@@ -35,16 +35,31 @@ class Gold(BaseJob):
          )

      _sql: Optional[str] = None
-     _sql_path: Optional[Path] = None
+     _sql_path: Optional[GitPath] = None
      _schema_drift: Optional[bool] = None

      @classmethod
      def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-         return cls(step=cast(TGold, step), job_id=job_id)
+         return cls(step=step, job_id=job_id)

      @classmethod
      def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-         return cls(step=cast(TGold, step), topic=topic, item=item)
+         return cls(step=step, topic=topic, item=item)
+
+     @property
+     def options(self) -> JobGoldOptions:
+         """Direct access to typed gold job options."""
+         return self.conf.options  # type: ignore
+
+     @property
+     def step_conf(self) -> StepGoldConf:
+         """Direct access to typed gold step conf."""
+         return self.base_step_conf  # type: ignore
+
+     @property
+     def step_options(self) -> StepGoldOptions:
+         """Direct access to typed gold step options."""
+         return self.base_step_conf.options  # type: ignore

      @property
      def stream(self) -> bool:
@@ -53,7 +68,7 @@ class Gold(BaseJob):
      @property
      def schema_drift(self) -> bool:
          if not self._schema_drift:
-             _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
+             _schema_drift = self.step_conf.options.schema_drift or False
              assert _schema_drift is not None
              self._schema_drift = cast(bool, _schema_drift)
          return self._schema_drift
@@ -68,7 +83,7 @@ class Gold(BaseJob):

      @property
      def sql(self) -> str:
-         sql = self.paths.runtime.get_sql()
+         sql = self.paths.to_runtime.get_sql()
          return fix(sql, keep_comments=False)

      @deprecated("use sql instead")
@@ -81,17 +96,17 @@
              return []

          # udf not allowed in notebook
-         elif self.options.job.get("notebook"):
+         elif self.options.notebook:
              return []

          # udf not allowed in table
-         elif self.options.job.get("table"):
+         elif self.options.table:
              return []

          else:
              matches = []
-             if f"{udf_prefix}" in self.sql:
-                 r = re.compile(rf"(?<={udf_prefix})\w*(?=\()")
+             if f"{UDF_PREFIX}" in self.sql:
+                 r = re.compile(rf"(?<={UDF_PREFIX})\w*(?=\()")
                  matches = re.findall(r, self.sql)
              matches = set(matches)
              matches = list(matches)
@@ -114,7 +129,7 @@
          schema_only: Optional[bool] = False,
          **kwargs,
      ) -> DataFrame:
-         if self.options.job.get_boolean("requirements"):
+         if self.options.requirements:
              import sys

              sys.path.append("/dbfs/mnt/fabricks/site-packages")
@@ -122,28 +137,28 @@
          if self.mode == "invoke":
              df = self.spark.createDataFrame([{}])  # type: ignore

-         elif self.options.job.get("notebook"):
-             invokers = self.options.invokers.get_list("run")
+         elif self.options.notebook:
+             invokers = self.invoker_options.run or [] if self.invoker_options else []
              assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"

              path = None
              if invokers:
-                 notebook = invokers[0].get("notebook")
-                 if notebook:
-                     from fabricks.context import PATH_RUNTIME
+                 from fabricks.context import PATH_RUNTIME

-                     path = PATH_RUNTIME.joinpath(notebook)
+                 path = PATH_RUNTIME.joinpath(invokers[0].notebook) if invokers[0].notebook else None

              if path is None:
-                 path = self.paths.runtime
+                 path = self.paths.to_runtime
+
+             assert path is not None, "path could not be resolved"

              global_temp_view = self.invoke(path=path, schema_only=schema_only, **kwargs)
              assert global_temp_view is not None, "global_temp_view not found"

              df = self.spark.sql(f"select * from global_temp.{global_temp_view}")

-         elif self.options.job.get("table"):
-             table = self.options.job.get("table")
+         elif self.options.table:
+             table = self.options.table
              df = self.spark.read.table(table)  # type: ignore

          else:
@@ -168,11 +183,11 @@

      def get_dependencies(self) -> Sequence[JobDependency]:
          data = []
-         parents = self.options.job.get_list("parents") or []
+         parents = self.options.parents or []

          if self.mode == "invoke":
              dependencies = []
-         elif self.options.job.get("notebook"):
+         elif self.options.notebook:
              dependencies = self._get_notebook_dependencies()
          else:
              dependencies = self._get_sql_dependencies()
@@ -189,7 +204,7 @@
          return data

      def _get_sql_dependencies(self) -> List[str]:
-         from fabricks.core.jobs.base._types import Steps
+         from fabricks.context import Steps

          steps = [str(s) for s in Steps]
          return get_tables(self.sql, allowed_databases=steps)
@@ -217,13 +232,13 @@

      def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
          # assume no duplicate in gold (to improve performance)
-         deduplicate = self.options.job.get_boolean("deduplicate", None)
+         deduplicate = self.options.deduplicate
          # assume no reload in gold (to improve performance)
-         rectify = self.options.job.get_boolean("rectify_as_upserts", None)
+         rectify = self.options.rectify_as_upserts

-         add_metadata = self.options.job.get_boolean("metadata", None)
+         add_metadata = self.options.metadata
          if add_metadata is None:
-             add_metadata = self.step_conf.get("options", {}).get("metadata", False)
+             add_metadata = self.step_conf.options.metadata or False

          context = {
              "add_metadata": add_metadata,
@@ -288,10 +303,12 @@

          # correct __valid_from
          if self.change_data_capture == "scd2":
-             context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)
+             context["correct_valid_from"] = (
+                 self.options.correct_valid_from if self.options.correct_valid_from is not None else True
+             )

          # add __timestamp
-         if self.options.job.get_boolean("persist_last_timestamp"):
+         if self.options.persist_last_timestamp:
              if self.change_data_capture == "scd1":
                  if "__timestamp" not in df.columns:
                      context["add_timestamp"] = True
@@ -299,6 +316,14 @@
              if "__valid_from" not in df.columns:
                  context["add_timestamp"] = True

+         # add __updated
+         if self.options.persist_last_updated_timestamp:
+             if "__last_updated" not in df.columns:
+                 context["add_last_updated"] = True
+         if self.options.last_updated:
+             if "__last_updated" not in df.columns:
+                 context["add_last_updated"] = True
+
          if "__order_duplicate_by_asc" in df.columns:
              context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
          elif "__order_duplicate_by_desc" in df.columns:
@@ -345,7 +370,10 @@

      def for_each_run(self, **kwargs):
          last_version = None
-         if self.options.job.get_boolean("persist_last_timestamp"):
+
+         if self.options.persist_last_timestamp:
+             last_version = self.table.get_last_version()
+         if self.options.persist_last_updated_timestamp:
              last_version = self.table.get_last_version()

          if self.mode == "invoke":
@@ -354,8 +382,11 @@
          else:
              super().for_each_run(**kwargs)

-         if self.options.job.get_boolean("persist_last_timestamp"):
-             self._update_last_timestamp(last_version=last_version)
+         if self.options.persist_last_timestamp:
+             self._persist_timestamp(field="__timestamp", last_version=last_version)
+
+         if self.options.persist_last_updated_timestamp:
+             self._persist_timestamp(field="__last_updated", last_version=last_version)

      def create(self):
          if self.mode == "invoke":
@@ -363,11 +394,11 @@
          else:
              self.register_udfs()
              super().create()
-             if self.options.job.get_boolean("persist_last_timestamp"):
-                 self._update_last_timestamp(create=True)
+             if self.options.persist_last_timestamp:
+                 self._persist_timestamp(create=True)

      def register(self):
-         if self.options.job.get_boolean("persist_last_timestamp"):
+         if self.options.persist_last_timestamp:
              self.cdc_last_timestamp.table.register()

          if self.mode == "invoke":
@@ -376,7 +407,7 @@
              super().register()

      def drop(self):
-         if self.options.job.get_boolean("persist_last_timestamp"):
+         if self.options.persist_last_timestamp:
              self.cdc_last_timestamp.drop()

          super().drop()
@@ -389,14 +420,25 @@
          cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
          return cdc

-     def _update_last_timestamp(self, last_version: Optional[int] = None, create: bool = False):
+     def _persist_timestamp(
+         self,
+         field: Literal["__timestamp", "__last_updated"] = "__timestamp",
+         last_version: Optional[int] = None,
+         create: bool = False,
+     ):
          df = self.spark.sql(f"select * from {self} limit 1")

          fields = []
-         if self.change_data_capture == "scd1":
-             fields.append("max(__timestamp) :: timestamp as __timestamp")
-         elif self.change_data_capture == "scd2":
-             fields.append("max(__valid_from) :: timestamp as __timestamp")
+
+         if field == "__last_updated":
+             fields.append("max(__last_updated) :: timestamp as __last_updated")
+
+         elif field == "__timestamp":
+             if self.change_data_capture == "scd1":
+                 fields.append("max(__timestamp) :: timestamp as __timestamp")
+             elif self.change_data_capture == "scd2":
+                 fields.append("max(__valid_from) :: timestamp as __timestamp")

          if "__source" in df.columns:
              fields.append("__source")
@@ -412,7 +454,7 @@
          else:
              self.cdc_last_timestamp.overwrite(df)

-     def overwrite(self, schedule: Optional[str] = None):
+     def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
          if self.mode == "invoke":
              DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
              return
@@ -423,4 +465,4 @@
              return

          self.overwrite_schema()
-         self.run(reload=True, schedule=schedule)
+         self.run(reload=True, schedule=schedule, invoke=invoke)
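
The `_update_last_timestamp` → `_persist_timestamp` refactor folds both watermarks into one parameterized method, which `for_each_run` then calls once per enabled option. A reduced sketch of the field dispatch, written as a free function standing in for the method:

```python
from typing import Literal

WatermarkField = Literal["__timestamp", "__last_updated"]


def watermark_expression(field: WatermarkField, change_data_capture: str) -> str:
    """Return the aggregation persisted for the requested watermark."""
    if field == "__last_updated":
        return "max(__last_updated) :: timestamp as __last_updated"
    # for __timestamp, scd2 tables derive the watermark from __valid_from
    if change_data_capture == "scd2":
        return "max(__valid_from) :: timestamp as __timestamp"
    return "max(__timestamp) :: timestamp as __timestamp"
```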
fabricks/core/jobs/silver.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Optional, Sequence, Union, cast
+ from typing import Optional, Sequence, Union

  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr
@@ -6,10 +6,10 @@ from pyspark.sql.types import Row

  from fabricks.cdc.nocdc import NoCDC
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
  from fabricks.core.jobs.base.job import BaseJob
  from fabricks.core.jobs.bronze import Bronze
  from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.models import JobDependency, JobSilverOptions, StepSilverConf, StepSilverOptions
  from fabricks.utils.helpers import concat_dfs
  from fabricks.utils.read.read import read
  from fabricks.utils.sqlglot import fix as fix_sql
@@ -18,7 +18,7 @@ from fabricks.utils.sqlglot import fix as fix_sql
  class Silver(BaseJob):
      def __init__(
          self,
-         step: TSilver,
+         step: str,
          topic: Optional[str] = None,
          item: Optional[str] = None,
          job_id: Optional[str] = None,
@@ -33,23 +33,38 @@ class Silver(BaseJob):
              conf=conf,
          )

-     _parent_step: Optional[TBronze] = None
+     _parent_step: Optional[str] = None
      _stream: Optional[bool] = None

      @classmethod
      def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-         return cls(step=cast(TSilver, step), job_id=job_id, conf=conf)
+         return cls(step=step, job_id=job_id, conf=conf)

      @classmethod
      def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-         return cls(step=cast(TSilver, step), topic=topic, item=item, conf=conf)
+         return cls(step=step, topic=topic, item=item, conf=conf)
+
+     @property
+     def options(self) -> JobSilverOptions:
+         """Direct access to typed silver job options."""
+         return self.conf.options  # type: ignore
+
+     @property
+     def step_conf(self) -> StepSilverConf:
+         """Direct access to typed silver step conf."""
+         return self.base_step_conf  # type: ignore
+
+     @property
+     def step_options(self) -> StepSilverOptions:
+         """Direct access to typed silver step options."""
+         return self.base_step_conf.options  # type: ignore

      @property
      def stream(self) -> bool:
          if not self._stream:
-             _stream = self.options.job.get("stream")
+             _stream = self.options.stream
              if _stream is None:
-                 _stream = self.step_conf.get("options", {}).get("stream")
+                 _stream = self.step_conf.options.stream
              self._stream = _stream if _stream is not None else True
          return self._stream  # type: ignore

@@ -66,18 +81,17 @@
          return self.mode in ["combine", "memory"]

      @property
-     def parent_step(self) -> TBronze:
+     def parent_step(self) -> str:
          if not self._parent_step:
-             _parent_step = self.step_conf.get("options", {}).get("parent")
-             _parent_step = cast(TBronze, _parent_step)
+             _parent_step = self.step_conf.options.parent
              assert _parent_step is not None
-             self._parent_step = _parent_step
+             self._parent_step = str(_parent_step)
          return self._parent_step

-     def base_transform(self, df: DataFrame) -> DataFrame:
-         df = df.transform(self.extend)
-
+     def update_metadata(self, df: DataFrame) -> DataFrame:
          if "__metadata" in df.columns:
+             DEFAULT_LOGGER.debug("update metadata", extra={"label": self})
+
              df = df.withColumn(
                  "__metadata",
                  expr(
@@ -88,11 +102,18 @@
                          __metadata.file_size as file_size,
                          __metadata.file_modification_time as file_modification_time,
                          __metadata.inserted as inserted,
-                        cast(current_timestamp() as timestamp) as updated
+                         cast(current_timestamp() as timestamp) as updated
                      )
                      """
                  ),
              )
+
+         return df
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extend)
+         df = self.update_metadata(df)
+
          return df

      def get_data(
@@ -153,7 +174,6 @@

          # transforms
          df = self.filter_where(df)
-         df = self.encrypt(df)
          if transform:
              df = self.base_transform(df)

@@ -165,7 +185,7 @@
      def get_dependencies(self) -> Sequence[JobDependency]:
          dependencies = []

-         parents = self.options.job.get_list("parents") or []
+         parents = self.options.parents or []
          if parents:
              for p in parents:
                  dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -237,9 +257,9 @@
          except Py4JJavaError as e:
              DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)

-     def overwrite(self, schedule: Optional[str] = None):
+     def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
          self.truncate()
-         self.run(schedule=schedule)
+         self.run(schedule=schedule, invoke=invoke)

      def overwrite_schema(self, df: Optional[DataFrame] = None):
          DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
@@ -251,7 +271,7 @@

          not_append = not self.mode == "append"
          nocdc = self.change_data_capture == "nocdc"
-         order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}
+         order_duplicate_by = self.options.order_duplicate_by or {}

          rectify = False
          if not_append and not nocdc:
@@ -283,7 +303,7 @@

          context = {
              "soft_delete": self.slowly_changing_dimension,
-             "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
+             "deduplicate": self.options.deduplicate if self.options.deduplicate is not None else not_append,
              "rectify": rectify,
              "order_duplicate_by": order_duplicate_by,
          }
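
A detail worth noting in the `deduplicate` port above: the old `get_boolean("deduplicate", not_append)` applied the default only when the key was absent. With an `Optional[bool]` pydantic field, `self.options.deduplicate or not_append` would silently discard an explicit `False`, which is why the diff spells out the `is not None` ternary. A minimal sketch:

```python
from typing import Optional


def resolve(configured: Optional[bool], default: bool) -> bool:
    # mirrors the ternary used in the diff
    return configured if configured is not None else default


assert resolve(None, True) is True     # unset -> fall back to the default
assert resolve(False, True) is False   # explicit False must survive
assert (False or True) is True         # `or` would lose the explicit False
```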
fabricks/core/masks.py CHANGED
@@ -3,22 +3,25 @@ from typing import List, Optional

  from pyspark.sql import SparkSession

- from fabricks.context import CATALOG, PATH_MASKS, SPARK
+ from fabricks.context import CATALOG, CONF_RUNTIME, PATH_MASKS, SPARK
  from fabricks.context.log import DEFAULT_LOGGER

+ MASK_SCHEMA = CONF_RUNTIME.mask_options.schema_name or "default" if CONF_RUNTIME.mask_options else "default"
+ MASK_PREFIX = CONF_RUNTIME.mask_options.prefix or "mask_" if CONF_RUNTIME.mask_options else "mask_"
+

  def register_all_masks(override: bool = False):
      """
      Register all masks.
      """

-     DEFAULT_LOGGER.info("register masks")
+     DEFAULT_LOGGER.info("register masks", extra={"label": "fabricks"})
      for mask in get_masks():
          split = mask.split(".")
          try:
              register_mask(mask=split[0], override=override)
          except Exception as e:
-             DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+             DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e, extra={"label": "fabricks"})


  def get_masks() -> List[str]:
@@ -30,12 +33,12 @@ def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
          spark = SPARK
      assert spark is not None

-     df = spark.sql("show user functions in default")
+     df = spark.sql(f"show user functions in {MASK_SCHEMA}")

      if CATALOG:
-         df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
+         df = df.where(f"function == '{CATALOG}.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
      else:
-         df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")
+         df = df.where(f"function == 'spark_catalog.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")

      return not df.isEmpty()

@@ -47,9 +50,9 @@ def register_mask(mask: str, override: Optional[bool] = False, spark: Optional[S

      if not is_registered(mask, spark) or override:
          if override:
-             DEFAULT_LOGGER.debug(f"override mask {mask}")
+             DEFAULT_LOGGER.debug(f"drop mask {mask}", extra={"label": "fabricks"})
          else:
-             DEFAULT_LOGGER.debug(f"register mask {mask}")
+             DEFAULT_LOGGER.debug(f"register mask {mask}", extra={"label": "fabricks"})

          path = PATH_MASKS.joinpath(f"{mask}.sql")
          spark.sql(path.get_sql())
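
The new module-level constants make the mask schema and function prefix configurable, with `default` / `mask_` fallbacks. Because Python's conditional expression binds looser than `or`, the one-liners parse as shown below; `CONF_RUNTIME` here is a stand-in object, not the real fabricks runtime config:

```python
class _MaskOptions:
    schema_name = None   # not configured in this example
    prefix = "pii_"


class _Conf:
    mask_options = _MaskOptions()


CONF_RUNTIME = _Conf()

# equivalent, with the implied parentheses written out
MASK_SCHEMA = (CONF_RUNTIME.mask_options.schema_name or "default") if CONF_RUNTIME.mask_options else "default"
MASK_PREFIX = (CONF_RUNTIME.mask_options.prefix or "mask_") if CONF_RUNTIME.mask_options else "mask_"

assert MASK_SCHEMA == "default"  # unset schema_name falls back
assert MASK_PREFIX == "pii_"     # configured prefix wins
```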
fabricks/core/parsers/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from fabricks.core.parsers._types import ParserOptions
  from fabricks.core.parsers.base import PARSERS, BaseParser
  from fabricks.core.parsers.decorator import parser
  from fabricks.core.parsers.get_parser import get_parser
@@ -7,6 +6,5 @@ __all__ = [
      "BaseParser",
      "get_parser",
      "parser",
-     "ParserOptions",
      "PARSERS",
  ]