fabricks 3.0.18__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +8 -7
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +96 -43
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +9 -8
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +269 -102
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -137
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.18.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/jobs/get_jobs.py CHANGED
@@ -1,27 +1,26 @@
- from dataclasses import dataclass
- from typing import List, Literal, Optional, TypedDict, Union, overload
+ from typing import List, Literal, Optional, Union, overload

+ from pydantic import BaseModel
  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr
  from pyspark.sql.types import Row
+ from sparkdantic import create_spark_schema

  from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
- from fabricks.core.jobs.base._types import AllowedModes, TStep
  from fabricks.core.jobs.base.job import BaseJob
  from fabricks.core.jobs.get_job import get_job, get_job_internal
+ from fabricks.models import AllowedModes
  from fabricks.utils.helpers import concat_dfs, run_in_parallel
- from fabricks.utils.path import Path
+ from fabricks.utils.path import GitPath
  from fabricks.utils.read import read_yaml
- from fabricks.utils.schema import get_schema_for_type


- class GenericOptions(TypedDict):
+ class GenericOptions(BaseModel):
  mode: AllowedModes


- @dataclass
- class JobConfGeneric:
- step: TStep
+ class JobConfGeneric(BaseModel):
+ step: str
  job_id: str
  topic: str
  item: str
@@ -39,9 +38,9 @@ def get_jobs_internal():

  def get_jobs_internal_df() -> DataFrame:
  if IS_JOB_CONFIG_FROM_YAML:
- schema = get_schema_for_type(JobConfGeneric)
+ schema = create_spark_schema(JobConfGeneric)

- def _read_yaml(path: Path):
+ def _read_yaml(path: GitPath):
  df = SPARK.createDataFrame(read_yaml(path, root="job"), schema=schema) # type: ignore
  if df:
  df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
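The change above replaces TypedDict/dataclass schema inference via get_schema_for_type with pydantic models converted by sparkdantic. A minimal, self-contained sketch of the new pattern, assuming create_spark_schema accepts a plain pydantic BaseModel as the new code does (the sample row is made up for illustration):

from pydantic import BaseModel
from pyspark.sql import SparkSession
from sparkdantic import create_spark_schema  # same helper imported in the diff


class JobConfGeneric(BaseModel):
    # fields copied from the diff above
    step: str
    job_id: str
    topic: str
    item: str


spark = SparkSession.builder.getOrCreate()

# create_spark_schema turns the pydantic model into a pyspark StructType,
# which is then used to build a typed DataFrame from plain dicts
schema = create_spark_schema(JobConfGeneric)
rows = [{"step": "gold", "job_id": "", "topic": "sales", "item": "orders"}]  # illustrative values
df = spark.createDataFrame(rows, schema=schema)
df.printSchema()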
fabricks/core/jobs/get_schedules.py CHANGED
@@ -1,23 +1,9 @@
- from typing import List, Optional, TypedDict
-
  from pyspark.sql import DataFrame
+ from sparkdantic import create_spark_schema

  from fabricks.context import PATH_SCHEDULES, SPARK
- from fabricks.core.jobs.base._types import TStep
+ from fabricks.models.schedule import Schedule
  from fabricks.utils.read.read_yaml import read_yaml
- from fabricks.utils.schema import get_schema_for_type
-
-
- class Options(TypedDict):
- steps: Optional[List[TStep]]
- tag: Optional[str]
- view: Optional[str]
- variables: Optional[dict[str, str]]
-
-
- class Schedule(TypedDict):
- name: str
- options: Options


  def get_schedules():
@@ -25,7 +11,7 @@ def get_schedules():


  def get_schedules_df() -> DataFrame:
- schema = get_schema_for_type(Schedule)
+ schema = create_spark_schema(Schedule)
  df = SPARK.createDataFrame(list(get_schedules()), schema=schema) # type: ignore

  assert df, "no schedules found"
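The Options and Schedule TypedDicts removed above now live in the new fabricks/models/schedule.py (see the file list, +21 lines). Their exact definition is not part of this diff; a hedged sketch of an equivalent pydantic model, reconstructed only from the removed fields:

from typing import Dict, List, Optional

from pydantic import BaseModel


class ScheduleOptions(BaseModel):
    # field names taken from the removed Options TypedDict; the real model in
    # fabricks.models.schedule may differ
    steps: Optional[List[str]] = None
    tag: Optional[str] = None
    view: Optional[str] = None
    variables: Optional[Dict[str, str]] = None


class Schedule(BaseModel):
    name: str
    options: ScheduleOptions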
fabricks/core/jobs/gold.py CHANGED
@@ -1,6 +1,6 @@
  import re
  from collections.abc import Sequence
- from typing import List, Optional, Union, cast
+ from typing import List, Literal, Optional, Union, cast

  from pyspark.sql import DataFrame
  from pyspark.sql.types import Row
@@ -8,18 +8,18 @@ from typing_extensions import deprecated

  from fabricks.cdc.nocdc import NoCDC
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import JobDependency, TGold
  from fabricks.core.jobs.base.job import BaseJob
- from fabricks.core.udfs import is_registered, register_udf, udf_prefix
+ from fabricks.core.udfs import UDF_PREFIX, is_registered, register_udf
  from fabricks.metastore.view import create_or_replace_global_temp_view
- from fabricks.utils.path import Path
+ from fabricks.models import JobDependency, JobGoldOptions, StepGoldConf, StepGoldOptions
+ from fabricks.utils.path import GitPath
  from fabricks.utils.sqlglot import fix, get_tables


  class Gold(BaseJob):
  def __init__(
  self,
- step: TGold,
+ step: str,
  topic: Optional[str] = None,
  item: Optional[str] = None,
  job_id: Optional[str] = None,
@@ -35,16 +35,31 @@ class Gold(BaseJob):
  )

  _sql: Optional[str] = None
- _sql_path: Optional[Path] = None
+ _sql_path: Optional[GitPath] = None
  _schema_drift: Optional[bool] = None

  @classmethod
  def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
- return cls(step=cast(TGold, step), job_id=job_id)
+ return cls(step=step, job_id=job_id)

  @classmethod
  def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
- return cls(step=cast(TGold, step), topic=topic, item=item)
+ return cls(step=step, topic=topic, item=item)
+
+ @property
+ def options(self) -> JobGoldOptions:
+ """Direct access to typed gold job options."""
+ return self.conf.options # type: ignore
+
+ @property
+ def step_conf(self) -> StepGoldConf:
+ """Direct access to typed gold step conf."""
+ return self.base_step_conf # type: ignore
+
+ @property
+ def step_options(self) -> StepGoldOptions:
+ """Direct access to typed gold step options."""
+ return self.base_step_conf.options # type: ignore

  @property
  def stream(self) -> bool:
@@ -53,7 +68,7 @@ class Gold(BaseJob):
  @property
  def schema_drift(self) -> bool:
  if not self._schema_drift:
- _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
+ _schema_drift = self.step_conf.options.schema_drift or False
  assert _schema_drift is not None
  self._schema_drift = cast(bool, _schema_drift)
  return self._schema_drift
@@ -68,7 +83,7 @@ class Gold(BaseJob):

  @property
  def sql(self) -> str:
- sql = self.paths.runtime.get_sql()
+ sql = self.paths.to_runtime.get_sql()
  return fix(sql, keep_comments=False)

  @deprecated("use sql instead")
@@ -81,17 +96,17 @@ class Gold(BaseJob):
  return []

  # udf not allowed in notebook
- elif self.options.job.get("notebook"):
+ elif self.options.notebook:
  return []

  # udf not allowed in table
- elif self.options.job.get("table"):
+ elif self.options.table:
  return []

  else:
  matches = []
- if f"{udf_prefix}" in self.sql:
- r = re.compile(rf"(?<={udf_prefix})\w*(?=\()")
+ if f"{UDF_PREFIX}" in self.sql:
+ r = re.compile(rf"(?<={UDF_PREFIX})\w*(?=\()")
  matches = re.findall(r, self.sql)
  matches = set(matches)
  matches = list(matches)
@@ -114,7 +129,7 @@ class Gold(BaseJob):
  schema_only: Optional[bool] = False,
  **kwargs,
  ) -> DataFrame:
- if self.options.job.get_boolean("requirements"):
+ if self.options.requirements:
  import sys

  sys.path.append("/dbfs/mnt/fabricks/site-packages")
@@ -122,17 +137,28 @@ class Gold(BaseJob):
  if self.mode == "invoke":
  df = self.spark.createDataFrame([{}]) # type: ignore

- elif self.options.job.get("notebook"):
- invokers = self.options.invokers.get_list("run")
+ elif self.options.notebook:
+ invokers = self.invoker_options.run or [] if self.invoker_options else []
  assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"

- global_temp_view = self.invoke(path=self.paths.runtime, schema_only=schema_only, **kwargs)
+ path = None
+ if invokers:
+ from fabricks.context import PATH_RUNTIME
+
+ path = PATH_RUNTIME.joinpath(invokers[0].notebook) if invokers[0].notebook else None
+
+ if path is None:
+ path = self.paths.to_runtime
+
+ assert path is not None, "path could not be resolved"
+
+ global_temp_view = self.invoke(path=path, schema_only=schema_only, **kwargs)
  assert global_temp_view is not None, "global_temp_view not found"

  df = self.spark.sql(f"select * from global_temp.{global_temp_view}")

- elif self.options.job.get("table"):
- table = self.options.job.get("table")
+ elif self.options.table:
+ table = self.options.table
  df = self.spark.read.table(table) # type: ignore

  else:
@@ -157,11 +183,11 @@ class Gold(BaseJob):

  def get_dependencies(self) -> Sequence[JobDependency]:
  data = []
- parents = self.options.job.get_list("parents") or []
+ parents = self.options.parents or []

  if self.mode == "invoke":
  dependencies = []
- elif self.options.job.get("notebook"):
+ elif self.options.notebook:
  dependencies = self._get_notebook_dependencies()
  else:
  dependencies = self._get_sql_dependencies()
@@ -178,7 +204,7 @@ class Gold(BaseJob):
  return data

  def _get_sql_dependencies(self) -> List[str]:
- from fabricks.core.jobs.base._types import Steps
+ from fabricks.context import Steps

  steps = [str(s) for s in Steps]
  return get_tables(self.sql, allowed_databases=steps)
@@ -206,13 +232,13 @@ class Gold(BaseJob):

  def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
  # assume no duplicate in gold (to improve performance)
- deduplicate = self.options.job.get_boolean("deduplicate", None)
+ deduplicate = self.options.deduplicate
  # assume no reload in gold (to improve performance)
- rectify = self.options.job.get_boolean("rectify_as_upserts", None)
+ rectify = self.options.rectify_as_upserts

- add_metadata = self.options.job.get_boolean("metadata", None)
+ add_metadata = self.options.metadata
  if add_metadata is None:
- add_metadata = self.step_conf.get("options", {}).get("metadata", False)
+ add_metadata = self.step_conf.options.metadata or False

  context = {
  "add_metadata": add_metadata,
@@ -277,10 +303,12 @@ class Gold(BaseJob):

  # correct __valid_from
  if self.change_data_capture == "scd2":
- context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)
+ context["correct_valid_from"] = (
+ self.options.correct_valid_from if self.options.correct_valid_from is not None else True
+ )

  # add __timestamp
- if self.options.job.get_boolean("persist_last_timestamp"):
+ if self.options.persist_last_timestamp:
  if self.change_data_capture == "scd1":
  if "__timestamp" not in df.columns:
  context["add_timestamp"] = True
@@ -288,6 +316,14 @@ class Gold(BaseJob):
  if "__valid_from" not in df.columns:
  context["add_timestamp"] = True

+ # add __updated
+ if self.options.persist_last_updated_timestamp:
+ if "__last_updated" not in df.columns:
+ context["add_last_updated"] = True
+ if self.options.last_updated:
+ if "__last_updated" not in df.columns:
+ context["add_last_updated"] = True
+
  if "__order_duplicate_by_asc" in df.columns:
  context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
  elif "__order_duplicate_by_desc" in df.columns:
@@ -334,7 +370,10 @@ class Gold(BaseJob):

  def for_each_run(self, **kwargs):
  last_version = None
- if self.options.job.get_boolean("persist_last_timestamp"):
+
+ if self.options.persist_last_timestamp:
+ last_version = self.table.get_last_version()
+ if self.options.persist_last_updated_timestamp:
  last_version = self.table.get_last_version()

  if self.mode == "invoke":
@@ -343,8 +382,11 @@ class Gold(BaseJob):
  else:
  super().for_each_run(**kwargs)

- if self.options.job.get_boolean("persist_last_timestamp"):
- self._update_last_timestamp(last_version=last_version)
+ if self.options.persist_last_timestamp:
+ self._persist_timestamp(field="__timestamp", last_version=last_version)
+
+ if self.options.persist_last_updated_timestamp:
+ self._persist_timestamp(field="__last_updated", last_version=last_version)

  def create(self):
  if self.mode == "invoke":
@@ -352,11 +394,11 @@ class Gold(BaseJob):
  else:
  self.register_udfs()
  super().create()
- if self.options.job.get_boolean("persist_last_timestamp"):
- self._update_last_timestamp(create=True)
+ if self.options.persist_last_timestamp:
+ self._persist_timestamp(create=True)

  def register(self):
- if self.options.job.get_boolean("persist_last_timestamp"):
+ if self.options.persist_last_timestamp:
  self.cdc_last_timestamp.table.register()

  if self.mode == "invoke":
@@ -365,7 +407,7 @@ class Gold(BaseJob):
  super().register()

  def drop(self):
- if self.options.job.get_boolean("persist_last_timestamp"):
+ if self.options.persist_last_timestamp:
  self.cdc_last_timestamp.drop()

  super().drop()
@@ -378,14 +420,25 @@ class Gold(BaseJob):
  cdc = NoCDC(self.step, self.topic, f"{self.item}__last_timestamp")
  return cdc

- def _update_last_timestamp(self, last_version: Optional[int] = None, create: bool = False):
+ def _persist_timestamp(
+ self,
+ field: Literal["__timestamp", "__last_updated"] = "__timestamp",
+ last_version: Optional[int] = None,
+ create: bool = False,
+ ):
  df = self.spark.sql(f"select * from {self} limit 1")

  fields = []
- if self.change_data_capture == "scd1":
- fields.append("max(__timestamp) :: timestamp as __timestamp")
- elif self.change_data_capture == "scd2":
- fields.append("max(__valid_from) :: timestamp as __timestamp")
+
+ if field == "__last_updated":
+ fields.append("max(__last_updated) :: timestamp as __last_updated")
+
+ elif field == "__timestamp":
+ if self.change_data_capture == "scd1":
+ fields.append("max(__timestamp) :: timestamp as __timestamp")
+ elif self.change_data_capture == "scd2":
+ fields.append("max(__valid_from) :: timestamp as __timestamp")
+
  if "__source" in df.columns:
  fields.append("__source")
@@ -401,7 +454,7 @@ class Gold(BaseJob):
  else:
  self.cdc_last_timestamp.overwrite(df)

- def overwrite(self, schedule: Optional[str] = None):
+ def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
  if self.mode == "invoke":
  DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
  return
@@ -412,4 +465,4 @@ class Gold(BaseJob):
  return

  self.overwrite_schema()
- self.run(reload=True, schedule=schedule)
+ self.run(reload=True, schedule=schedule, invoke=invoke)
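Throughout gold.py, untyped FDict-style lookups (self.options.job.get_boolean(...), .get_list(...)) are replaced by attribute access on the typed options model returned by the new options property. A small self-contained sketch of that access pattern; GoldOptionsSketch is a stand-in for fabricks.models.JobGoldOptions and only covers fields visible in this diff:

from typing import List, Optional

from pydantic import BaseModel


class GoldOptionsSketch(BaseModel):
    # stand-in for fabricks.models.JobGoldOptions; real model has more fields
    parents: Optional[List[str]] = None
    persist_last_timestamp: Optional[bool] = None
    correct_valid_from: Optional[bool] = None


opts = GoldOptionsSketch(parents=["silver.sales_orders"])  # illustrative value

# 3.0.18 used FDict lookups with inline defaults, e.g.
#   self.options.job.get_list("parents") or []
#   self.options.job.get_boolean("correct_valid_from", True)
# 4.0.0 reads attributes and handles None explicitly where a default is needed:
parents = opts.parents or []
correct_valid_from = opts.correct_valid_from if opts.correct_valid_from is not None else True
print(parents, correct_valid_from)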
fabricks/core/jobs/silver.py CHANGED
@@ -1,4 +1,4 @@
- from typing import Optional, Sequence, Union, cast
+ from typing import Optional, Sequence, Union

  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr
@@ -6,10 +6,10 @@ from pyspark.sql.types import Row

  from fabricks.cdc.nocdc import NoCDC
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
  from fabricks.core.jobs.base.job import BaseJob
  from fabricks.core.jobs.bronze import Bronze
  from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.models import JobDependency, JobSilverOptions, StepSilverConf, StepSilverOptions
  from fabricks.utils.helpers import concat_dfs
  from fabricks.utils.read.read import read
  from fabricks.utils.sqlglot import fix as fix_sql
@@ -18,7 +18,7 @@ from fabricks.utils.sqlglot import fix as fix_sql
  class Silver(BaseJob):
  def __init__(
  self,
- step: TSilver,
+ step: str,
  topic: Optional[str] = None,
  item: Optional[str] = None,
  job_id: Optional[str] = None,
@@ -33,23 +33,38 @@ class Silver(BaseJob):
  conf=conf,
  )

- _parent_step: Optional[TBronze] = None
+ _parent_step: Optional[str] = None
  _stream: Optional[bool] = None

  @classmethod
  def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
- return cls(step=cast(TSilver, step), job_id=job_id, conf=conf)
+ return cls(step=step, job_id=job_id, conf=conf)

  @classmethod
  def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
- return cls(step=cast(TSilver, step), topic=topic, item=item, conf=conf)
+ return cls(step=step, topic=topic, item=item, conf=conf)
+
+ @property
+ def options(self) -> JobSilverOptions:
+ """Direct access to typed silver job options."""
+ return self.conf.options # type: ignore
+
+ @property
+ def step_conf(self) -> StepSilverConf:
+ """Direct access to typed silver step conf."""
+ return self.base_step_conf # type: ignore
+
+ @property
+ def step_options(self) -> StepSilverOptions:
+ """Direct access to typed silver step options."""
+ return self.base_step_conf.options # type: ignore

  @property
  def stream(self) -> bool:
  if not self._stream:
- _stream = self.options.job.get("stream")
+ _stream = self.options.stream
  if _stream is None:
- _stream = self.step_conf.get("options", {}).get("stream")
+ _stream = self.step_conf.options.stream
  self._stream = _stream if _stream is not None else True
  return self._stream # type: ignore

@@ -66,18 +81,17 @@ class Silver(BaseJob):
  return self.mode in ["combine", "memory"]

  @property
- def parent_step(self) -> TBronze:
+ def parent_step(self) -> str:
  if not self._parent_step:
- _parent_step = self.step_conf.get("options", {}).get("parent")
- _parent_step = cast(TBronze, _parent_step)
+ _parent_step = self.step_conf.options.parent
  assert _parent_step is not None
- self._parent_step = _parent_step
+ self._parent_step = str(_parent_step)
  return self._parent_step

- def base_transform(self, df: DataFrame) -> DataFrame:
- df = df.transform(self.extend)
-
+ def update_metadata(self, df: DataFrame) -> DataFrame:
  if "__metadata" in df.columns:
+ DEFAULT_LOGGER.debug("update metadata", extra={"label": self})
+
  df = df.withColumn(
  "__metadata",
  expr(
@@ -88,11 +102,18 @@ class Silver(BaseJob):

  __metadata.file_size as file_size,
  __metadata.file_modification_time as file_modification_time,
  __metadata.inserted as inserted,
- cast(current_timestamp() as timestamp) as updated
+ cast(current_timestamp() as timestamp) as updated
  )
  """
  ),
+
+ return df
+
+ def base_transform(self, df: DataFrame) -> DataFrame:
+ df = df.transform(self.extend)
+ df = self.update_metadata(df)
+
  return df

  def get_data(
@@ -153,7 +174,6 @@

  # transforms
  df = self.filter_where(df)
- df = self.encrypt(df)
  if transform:
  df = self.base_transform(df)

@@ -165,7 +185,7 @@
  def get_dependencies(self) -> Sequence[JobDependency]:
  dependencies = []

- parents = self.options.job.get_list("parents") or []
+ parents = self.options.parents or []
  if parents:
  for p in parents:
  dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -237,9 +257,9 @@
  except Py4JJavaError as e:
  DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)

- def overwrite(self, schedule: Optional[str] = None):
+ def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
  self.truncate()
- self.run(schedule=schedule)
+ self.run(schedule=schedule, invoke=invoke)

  def overwrite_schema(self, df: Optional[DataFrame] = None):
  DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
@@ -251,7 +271,7 @@

  not_append = not self.mode == "append"
  nocdc = self.change_data_capture == "nocdc"
- order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}
+ order_duplicate_by = self.options.order_duplicate_by or {}

  rectify = False
  if not_append and not nocdc:
@@ -283,7 +303,7 @@

  context = {
  "soft_delete": self.slowly_changing_dimension,
- "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
+ "deduplicate": self.options.deduplicate if self.options.deduplicate is not None else not_append,
  "rectify": rectify,
  "order_duplicate_by": order_duplicate_by,
  }
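As in gold.py, the silver job now reads typed options. One behaviour worth noting is the stream property above: the job-level option wins, then the step-level option, then a default of True. A self-contained sketch of that precedence using stand-in models (the real JobSilverOptions/StepSilverOptions in fabricks.models have more fields than shown):

from typing import Optional

from pydantic import BaseModel


class JobSilverOptionsSketch(BaseModel):
    # stand-in for fabricks.models.JobSilverOptions (only the field used here)
    stream: Optional[bool] = None


class StepSilverOptionsSketch(BaseModel):
    # stand-in for fabricks.models.StepSilverOptions
    stream: Optional[bool] = None


def resolve_stream(job: JobSilverOptionsSketch, step: StepSilverOptionsSketch) -> bool:
    # mirrors Silver.stream: job option first, then step option, then True
    value = job.stream
    if value is None:
        value = step.stream
    return value if value is not None else True


print(resolve_stream(JobSilverOptionsSketch(), StepSilverOptionsSketch(stream=False)))  # False
print(resolve_stream(JobSilverOptionsSketch(), StepSilverOptionsSketch()))              # True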
fabricks/core/masks.py CHANGED
@@ -3,22 +3,25 @@ from typing import List, Optional

  from pyspark.sql import SparkSession

- from fabricks.context import CATALOG, PATH_MASKS, SPARK
+ from fabricks.context import CATALOG, CONF_RUNTIME, PATH_MASKS, SPARK
  from fabricks.context.log import DEFAULT_LOGGER

+ MASK_SCHEMA = CONF_RUNTIME.mask_options.schema_name or "default" if CONF_RUNTIME.mask_options else "default"
+ MASK_PREFIX = CONF_RUNTIME.mask_options.prefix or "mask_" if CONF_RUNTIME.mask_options else "mask_"
+

  def register_all_masks(override: bool = False):
  """
  Register all masks.
  """

- DEFAULT_LOGGER.info("register masks")
+ DEFAULT_LOGGER.info("register masks", extra={"label": "fabricks"})
  for mask in get_masks():
  split = mask.split(".")
  try:
  register_mask(mask=split[0], override=override)
  except Exception as e:
- DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+ DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e, extra={"label": "fabricks"})


  def get_masks() -> List[str]:
@@ -30,12 +33,12 @@ def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
  spark = SPARK
  assert spark is not None

- df = spark.sql("show user functions in default")
+ df = spark.sql(f"show user functions in {MASK_SCHEMA}")

  if CATALOG:
- df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
+ df = df.where(f"function == '{CATALOG}.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
  else:
- df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")
+ df = df.where(f"function == 'spark_catalog.{MASK_SCHEMA}.{MASK_PREFIX}{mask}'")
  return not df.isEmpty()


@@ -47,9 +50,9 @@ def register_mask(mask: str, override: Optional[bool] = False, spark: Optional[S

  if not is_registered(mask, spark) or override:
  if override:
- DEFAULT_LOGGER.debug(f"override mask {mask}")
+ DEFAULT_LOGGER.debug(f"drop mask {mask}", extra={"label": "fabricks"})
  else:
- DEFAULT_LOGGER.debug(f"register mask {mask}")
+ DEFAULT_LOGGER.debug(f"register mask {mask}", extra={"label": "fabricks"})

  path = PATH_MASKS.joinpath(f"{mask}.sql")
  spark.sql(path.get_sql())
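Mask functions are no longer hard-coded under default.mask_*; the schema and prefix now come from the runtime configuration's mask_options (read into MASK_SCHEMA and MASK_PREFIX above). A self-contained sketch of how the fully qualified function name is resolved; MaskOptionsSketch and the sample catalog name are stand-ins for illustration, not Fabricks APIs:

from typing import Optional

from pydantic import BaseModel


class MaskOptionsSketch(BaseModel):
    # stand-in for the runtime mask_options object; schema_name and prefix are
    # the two fields read in the diff above
    schema_name: Optional[str] = None
    prefix: Optional[str] = None


def mask_function_name(mask: str, catalog: Optional[str] = None, options: Optional[MaskOptionsSketch] = None) -> str:
    # mirrors is_registered: fall back to "default"/"mask_" when mask_options
    # is absent or a field is unset
    schema = (options.schema_name or "default") if options else "default"
    prefix = (options.prefix or "mask_") if options else "mask_"
    return f"{catalog or 'spark_catalog'}.{schema}.{prefix}{mask}"


print(mask_function_name("email"))  # spark_catalog.default.mask_email
print(mask_function_name("email", "my_catalog", MaskOptionsSketch(schema_name="security", prefix="fn_")))
# my_catalog.security.fn_email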
fabricks/core/parsers/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from fabricks.core.parsers._types import ParserOptions
  from fabricks.core.parsers.base import PARSERS, BaseParser
  from fabricks.core.parsers.decorator import parser
  from fabricks.core.parsers.get_parser import get_parser
@@ -7,6 +6,5 @@ __all__ = [
  "BaseParser",
  "get_parser",
  "parser",
- "ParserOptions",
  "PARSERS",
  ]
fabricks/core/parsers/base.py CHANGED
@@ -5,15 +5,15 @@ from pyspark.sql import DataFrame, SparkSession
  from pyspark.sql.functions import col, expr, from_json, lit
  from pyspark.sql.types import MapType, StringType

- from fabricks.core.parsers._types import ParserOptions
  from fabricks.core.parsers.utils import clean
- from fabricks.utils.path import Path
+ from fabricks.models import ParserOptions
+ from fabricks.utils.path import FileSharePath
  from fabricks.utils.read.read import read


  class BaseParser(ABC):
  def __init__(self, options: Optional[ParserOptions], file_format: str):
- self.options = options or {}
+ self.options = options
  self.file_format = file_format

  def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
@@ -33,8 +33,8 @@ class BaseParser(ABC):

  def parse(
  self,
- data_path: Path,
- schema_path: Path,
+ data_path: FileSharePath,
+ schema_path: FileSharePath,
  spark: SparkSession,
  stream: bool,
  ) -> DataFrame:
@@ -43,7 +43,7 @@ class BaseParser(ABC):
  path=data_path,
  file_format=self.file_format,
  schema_path=schema_path,
- options=self.options.get("read_options"),
+ options=self.options.read_options if self.options else {},
  spark=spark,
  )

@@ -55,8 +55,8 @@ class BaseParser(ABC):
  @final
  def get_data(
  self,
- data_path: Path,
- schema_path: Path,
+ data_path: FileSharePath,
+ schema_path: FileSharePath,
  spark: SparkSession,
  stream: bool,
  ) -> DataFrame:
@@ -64,8 +64,8 @@ class BaseParser(ABC):
  Retrieves and processes data from the specified data path using the provided schema.

  Args:
- data_path (Path): The path to the data file.
- schema_path (Path): The path to the schema file.
+ data_path (FileSharePath): The path to the data file.
+ schema_path (FileSharePath): The path to the schema file.
  spark (SparkSession): The SparkSession object.
  stream (bool): Indicates whether the data should be processed as a stream.