fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. fabricks/api/context.py +15 -3
  2. fabricks/api/notebooks/schedule.py +2 -3
  3. fabricks/api/parsers.py +2 -1
  4. fabricks/api/utils.py +3 -1
  5. fabricks/cdc/__init__.py +1 -2
  6. fabricks/cdc/base/__init__.py +1 -2
  7. fabricks/cdc/base/_types.py +5 -3
  8. fabricks/cdc/base/configurator.py +5 -0
  9. fabricks/cdc/base/generator.py +7 -3
  10. fabricks/cdc/base/merger.py +2 -0
  11. fabricks/cdc/base/processor.py +15 -0
  12. fabricks/cdc/templates/README.md +490 -0
  13. fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
  14. fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
  15. fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
  16. fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
  17. fabricks/cdc/templates/queries/context.sql.jinja +104 -96
  18. fabricks/cdc/templates/query.sql.jinja +1 -1
  19. fabricks/context/__init__.py +13 -1
  20. fabricks/context/config.py +13 -122
  21. fabricks/context/log.py +92 -1
  22. fabricks/context/runtime.py +35 -69
  23. fabricks/context/spark_session.py +4 -4
  24. fabricks/context/utils.py +26 -39
  25. fabricks/core/__init__.py +2 -2
  26. fabricks/core/dags/base.py +5 -5
  27. fabricks/core/dags/processor.py +2 -3
  28. fabricks/core/extenders.py +1 -1
  29. fabricks/core/job_schema.py +26 -16
  30. fabricks/core/jobs/__init__.py +1 -7
  31. fabricks/core/jobs/base/README.md +1545 -0
  32. fabricks/core/jobs/base/__init__.py +1 -8
  33. fabricks/core/jobs/base/checker.py +7 -7
  34. fabricks/core/jobs/base/configurator.py +142 -63
  35. fabricks/core/jobs/base/generator.py +38 -34
  36. fabricks/core/jobs/base/invoker.py +48 -63
  37. fabricks/core/jobs/base/processor.py +13 -28
  38. fabricks/core/jobs/bronze.py +88 -38
  39. fabricks/core/jobs/get_job.py +3 -6
  40. fabricks/core/jobs/get_job_conf.py +19 -68
  41. fabricks/core/jobs/get_jobs.py +10 -11
  42. fabricks/core/jobs/get_schedules.py +3 -17
  43. fabricks/core/jobs/gold.py +89 -47
  44. fabricks/core/jobs/silver.py +42 -22
  45. fabricks/core/masks.py +11 -8
  46. fabricks/core/parsers/__init__.py +0 -2
  47. fabricks/core/parsers/base.py +10 -10
  48. fabricks/core/parsers/decorator.py +1 -1
  49. fabricks/core/parsers/get_parser.py +4 -5
  50. fabricks/core/schedules/process.py +1 -4
  51. fabricks/core/steps/base.py +27 -17
  52. fabricks/core/steps/get_step.py +2 -4
  53. fabricks/core/steps/get_step_conf.py +3 -7
  54. fabricks/core/udfs.py +7 -7
  55. fabricks/core/views.py +2 -2
  56. fabricks/deploy/__init__.py +27 -16
  57. fabricks/deploy/masks.py +1 -1
  58. fabricks/deploy/notebooks.py +19 -16
  59. fabricks/deploy/schedules.py +1 -1
  60. fabricks/deploy/tables.py +66 -49
  61. fabricks/deploy/udfs.py +2 -2
  62. fabricks/deploy/views.py +15 -16
  63. fabricks/metastore/database.py +3 -3
  64. fabricks/metastore/table.py +103 -68
  65. fabricks/models/__init__.py +125 -0
  66. fabricks/models/common.py +79 -0
  67. fabricks/models/config.py +225 -0
  68. fabricks/models/dependency.py +50 -0
  69. fabricks/models/job.py +157 -0
  70. fabricks/models/path.py +17 -0
  71. fabricks/models/runtime.py +182 -0
  72. fabricks/models/schedule.py +21 -0
  73. fabricks/models/step.py +103 -0
  74. fabricks/models/table.py +77 -0
  75. fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
  76. fabricks/utils/helpers.py +6 -5
  77. fabricks/utils/log.py +25 -6
  78. fabricks/utils/path.py +265 -108
  79. fabricks/utils/pip.py +7 -7
  80. fabricks/utils/read/read.py +23 -22
  81. fabricks/utils/read/read_yaml.py +2 -2
  82. fabricks/utils/write/delta.py +4 -4
  83. fabricks/utils/write/stream.py +2 -2
  84. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/METADATA +9 -4
  85. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/RECORD +86 -83
  86. fabricks/context/_types.py +0 -139
  87. fabricks/context/helpers.py +0 -63
  88. fabricks/core/jobs/base/_types.py +0 -284
  89. fabricks/core/parsers/_types.py +0 -6
  90. fabricks/utils/fdict.py +0 -240
  91. fabricks/utils/pydantic.py +0 -94
  92. fabricks/utils/schema/__init__.py +0 -7
  93. fabricks/utils/schema/get_json_schema_for_type.py +0 -161
  94. fabricks/utils/schema/get_schema_for_type.py +0 -99
  95. {fabricks-3.0.19.dist-info → fabricks-4.0.1.dist-info}/WHEEL +0 -0
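
The diffs below all follow one theme: the dict-like FDict option bags (fabricks/utils/fdict.py, removed with +0 -240 above) give way to typed Pydantic models in the new fabricks/models package. A minimal sketch of the access-pattern change, using a toy model in place of the real JobBronzeOptions (the field names here are illustrative, not the full schema), assuming pydantic v2 as implied by the model_validate/model_dump calls in the diffs:

```python
from typing import Optional

from pydantic import BaseModel


# Toy stand-in for fabricks.models.JobBronzeOptions; the real model
# defines many more fields (uri, parser, keys, source, operation, ...).
class JobOptionsSketch(BaseModel):
    filter_where: Optional[str] = None
    vacuum: Optional[bool] = None


options = JobOptionsSketch.model_validate({"filter_where": "__operation <> 'delete'"})

# 3.x (FDict, removed): options.job.get("filter_where")
# 4.x (Pydantic):       plain attribute access on a validated model
print(options.filter_where)  # __operation <> 'delete'
print(options.model_dump())  # dict round-trip, as json.dumps(self.options.model_dump()) does in invoker.py below
```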
fabricks/core/jobs/base/invoker.py

@@ -5,10 +5,12 @@ from pyspark.sql import DataFrame

 from fabricks.context import PATH_RUNTIME
 from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.extenders import get_extender
 from fabricks.core.jobs.base.checker import Checker
 from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
 from fabricks.core.jobs.get_schedule import get_schedule
-from fabricks.utils.path import Path
+from fabricks.models.common import BaseInvokerOptions, ExtenderOptions
+from fabricks.utils.path import GitPath


 class Invoker(Checker):
@@ -27,8 +29,37 @@ class Invoker(Checker):
         self._invoke_job(position="post_run", schedule=schedule)
         self._invoke_step(position="post_run", schedule=schedule)

+    def _invoke_notebook(
+        self,
+        invoker: dict | BaseInvokerOptions,
+        schedule: Optional[str] = None,
+        **kwargs,
+    ):
+        path = kwargs.get("path")
+        if path is None:
+            notebook = invoker.get("notebook") if isinstance(invoker, dict) else invoker.notebook
+            assert notebook, "notebook mandatory"
+            path = PATH_RUNTIME.joinpath(notebook)
+
+        assert path is not None, "path could not be resolved"
+
+        timeout = invoker.get("timeout") if isinstance(invoker, dict) else invoker.timeout
+        arguments = invoker.get("arguments") if isinstance(invoker, dict) else invoker.arguments
+        arguments = arguments or {}
+
+        schema_only = kwargs.get("schema_only")
+        if schema_only is not None:
+            arguments["schema_only"] = schema_only
+
+        return self._run_notebook(
+            path=path,
+            arguments=arguments,
+            schedule=schedule,
+            timeout=timeout,
+        )
+
     def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
-        invokers = self.options.invokers.get_list(position)
+        invokers = getattr(self.invoker_options, position, None) or [] if self.invoker_options else []
         if position == "run":
             invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options

@@ -38,35 +69,10 @@
         for i, invoker in enumerate(invokers):
             DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
             try:
-                path = kwargs.get("path")
-                if path is None:
-                    notebook = invoker.get("notebook")
-                    assert notebook, "notebook mandatory"
-                    path = PATH_RUNTIME.joinpath(notebook)
-
-                assert path is not None, "path mandatory"
-
-                arguments = invoker.get("arguments") or {}
-                timeout = invoker.get("timeout")
-
-                schema_only = kwargs.get("schema_only")
-                if schema_only is not None:
-                    arguments["schema_only"] = schema_only
-
                 if len(invokers) == 1 and position == "run":
-                    return self._run_notebook(
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    return self._invoke_notebook(invoker, schedule=schedule, **kwargs)
                 else:
-                    self._run_notebook(
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    self._invoke_notebook(invoker=invoker, schedule=schedule, **kwargs)

             except Exception as e:
                 DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
@@ -82,7 +88,7 @@
            raise Exception(errors)

    def _invoke_step(self, position: str, schedule: Optional[str] = None):
-        invokers = self.step_conf.get("invoker_options", {}).get(position, [])
+        invokers = getattr(self.step_conf.invoker_options, position, []) if self.step_conf.invoker_options else []

        errors = []

@@ -90,19 +96,7 @@
         for i, invoker in enumerate(invokers):
             DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
             try:
-                notebook = invoker.get("notebook")
-                assert notebook, "notebook mandatory"
-                path = PATH_RUNTIME.joinpath(notebook)
-
-                arguments = invoker.get("arguments", {})
-                timeout = invoker.get("timeout")
-
-                self._run_notebook(
-                    path=path,
-                    arguments=arguments,
-                    timeout=timeout,
-                    schedule=schedule,
-                )
+                self._invoke_notebook(invoker=invoker, schedule=schedule)

             except Exception as e:
                 DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
@@ -119,7 +113,7 @@

    def _run_notebook(
        self,
-        path: Path,
+        path: GitPath,
        arguments: Optional[dict] = None,
        timeout: Optional[int] = None,
        schedule: Optional[str] = None,
@@ -128,7 +122,7 @@
        Invokes a notebook job.

        Args:
-            path (Optional[Path]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
+            path (Optional[GitPath]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
            arguments (Optional[dict]): Additional arguments to pass to the notebook job. If not provided, it will be retrieved from the invoker options.
            schedule (Optional[str]): The schedule for the job. If provided, schedule variables will be retrieved.

@@ -167,33 +161,24 @@
                "topic": self.topic,
                "item": self.item,
                **arguments,
-                "job_options": json.dumps(self.options.job.options),
+                "job_options": json.dumps(self.options.model_dump()),
                "schedule_variables": json.dumps(variables),
            },
        )

    def extend_job(self, df: DataFrame) -> DataFrame:
-        from fabricks.core.extenders import get_extender
-
-        extenders = self.options.extenders
-        for e in extenders:
-            name = e.get("extender")
-            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
-            arguments = e.get("arguments") or {}
-
-            extender = get_extender(name)
-            df = extender(df, **arguments)
-
-        return df
+        extenders = self.extender_options or []
+        return self._extend(df, extenders, extended="job")

    def extend_step(self, df: DataFrame) -> DataFrame:
-        from fabricks.core.extenders import get_extender
+        extenders = self.step_conf.extender_options or []
+        return self._extend(df, extenders, extended="step")

-        extenders = self.step_conf.get("extender_options", {})
+    def _extend(self, df: DataFrame, extenders: list[ExtenderOptions], extended: str) -> DataFrame:
        for e in extenders:
-            name = e.get("extender")
-            DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
-            arguments = e.get("arguments", {})
+            name = e.extender
+            DEFAULT_LOGGER.debug(f"extend {extended} ({name})", extra={"label": self})
+            arguments = e.arguments or {}

            extender = get_extender(name)
            df = extender(df, **arguments)
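
Note the dict-or-model branching in _invoke_notebook above: _invoke_job still substitutes [{}] when no run invoker is configured, so plain dicts must keep working alongside BaseInvokerOptions models. A runnable sketch of that pattern, with a hypothetical stand-in for the real options model:

```python
from typing import Optional, Tuple, Union

from pydantic import BaseModel


# Hypothetical stand-in for fabricks.models.common.BaseInvokerOptions.
class InvokerOptionsSketch(BaseModel):
    notebook: Optional[str] = None
    arguments: Optional[dict] = None
    timeout: Optional[int] = None


def resolve(invoker: Union[dict, InvokerOptionsSketch]) -> Tuple[Optional[str], Optional[int]]:
    # same dict-or-model branching _invoke_notebook uses
    notebook = invoker.get("notebook") if isinstance(invoker, dict) else invoker.notebook
    timeout = invoker.get("timeout") if isinstance(invoker, dict) else invoker.timeout
    return notebook, timeout


assert resolve({"notebook": "utils/send_mail", "timeout": 60}) == ("utils/send_mail", 60)
assert resolve(InvokerOptionsSketch(notebook="utils/send_mail", timeout=60)) == ("utils/send_mail", 60)
```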
fabricks/core/jobs/base/processor.py

@@ -3,9 +3,8 @@ from functools import partial
 from typing import Optional

 from pyspark.sql import DataFrame
-from pyspark.sql.functions import expr

-from fabricks.context import IS_TYPE_WIDENING, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.context import IS_TYPE_WIDENING
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base.exception import (
     PostRunCheckException,
@@ -18,39 +17,21 @@ from fabricks.core.jobs.base.exception import (
     SkipRunCheckWarning,
 )
 from fabricks.core.jobs.base.invoker import Invoker
+from fabricks.models import JobBronzeOptions, JobSilverOptions
 from fabricks.utils.write import write_stream


 class Processor(Invoker):
     def filter_where(self, df: DataFrame) -> DataFrame:
-        f = self.options.job.get("filter_where")
+        assert isinstance(self.options, (JobBronzeOptions, JobSilverOptions))

+        f = self.options.filter_where
         if f:
             DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
             df = df.where(f"{f}")

         return df

-    def encrypt(self, df: DataFrame) -> DataFrame:
-        encrypted_columns = self.options.job.get_list("encrypted_columns")
-        if encrypted_columns:
-            if not IS_UNITY_CATALOG:
-                from databricks.sdk.runtime import dbutils
-
-                key = dbutils.secrets.get(scope=SECRET_SCOPE, key="encryption-key")
-            else:
-                import os
-
-                key = os.environ["FABRICKS_ENCRYPTION_KEY"]
-
-            assert key, "key not found"
-
-            for col in encrypted_columns:
-                DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
-                df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
-
-        return df
-
     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
         """
         Restores the processor to a specific version and batch.
@@ -70,7 +51,7 @@
             self.rm_commit(current_batch)

         assert last_batch == self.table.get_property("fabricks.last_batch")
-        assert self.paths.commits.joinpath(last_batch).exists()
+        assert self.paths.to_commits.joinpath(last_batch).exists()

    def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
        DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
@@ -118,7 +99,7 @@
            DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
            write_stream(
                df,
-                checkpoints_path=self.paths.checkpoints,
+                checkpoints_path=self.paths.to_checkpoints,
                func=self._for_each_batch,
                timeout=self.timeout,
            )
@@ -198,11 +179,15 @@
            raise exception

        if vacuum is None:
-            vacuum = self.options.job.get("vacuum", False)
+            vacuum = self.options.vacuum if self.options and self.options.vacuum is not None else False
        if optimize is None:
-            optimize = self.options.job.get("optimize", False)
+            optimize = self.options.optimize if self.options and self.options.optimize is not None else False
        if compute_statistics is None:
-            compute_statistics = self.options.job.get("compute_statistics", False)
+            compute_statistics = (
+                self.options.compute_statistics
+                if self.options and self.options.compute_statistics is not None
+                else False
+            )

        if vacuum or optimize or compute_statistics:
            self.maintain(
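
The maintenance flags above now resolve in three steps: an explicit argument wins, then the typed option value, then False. A standalone sketch of that fallback (the function name is illustrative):

```python
from typing import Optional


def resolve_flag(configured: Optional[bool], override: Optional[bool] = None) -> bool:
    # caller override wins; otherwise the configured option value, defaulting to False
    if override is not None:
        return override
    return configured if configured is not None else False


assert resolve_flag(None) is False                  # nothing set anywhere
assert resolve_flag(True) is True                   # set in job options
assert resolve_flag(True, override=False) is False  # explicit argument wins
```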
fabricks/core/jobs/bronze.py

@@ -7,21 +7,20 @@ from pyspark.sql.types import Row, TimestampType
 from fabricks.cdc.nocdc import NoCDC
 from fabricks.context import VARIABLES
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency, TBronze
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.core.parsers import BaseParser
 from fabricks.core.parsers.get_parser import get_parser
 from fabricks.core.parsers.utils import clean
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.models import JobBronzeOptions, JobDependency, StepBronzeConf, StepBronzeOptions
 from fabricks.utils.helpers import concat_ws
-from fabricks.utils.path import Path
+from fabricks.utils.path import FileSharePath
 from fabricks.utils.read import read


 class Bronze(BaseJob):
     def __init__(
         self,
-        step: TBronze,
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
@@ -36,7 +35,7 @@
            conf=conf,
        )

-    _parser: Optional[BaseParser] = None
+    _parser: Optional[str] = None

    @property
    def stream(self) -> bool:
@@ -54,25 +53,40 @@
     def virtual(self) -> bool:
         return False

+    @property
+    def options(self) -> JobBronzeOptions:
+        """Direct access to typed bronze job options."""
+        return self.conf.options  # type: ignore
+
+    @property
+    def step_conf(self) -> StepBronzeConf:
+        """Direct access to typed bronze step conf."""
+        return self.base_step_conf  # type: ignore
+
+    @property
+    def step_options(self) -> StepBronzeOptions:
+        """Direct access to typed bronze step options."""
+        return self.base_step_conf.options  # type: ignore
+
     @classmethod
     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=cast(TBronze, step), job_id=job_id, conf=conf)
+        return cls(step=step, job_id=job_id, conf=conf)

     @classmethod
     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
-        return cls(step=cast(TBronze, step), topic=topic, item=item, conf=conf)
+        return cls(step=step, topic=topic, item=item, conf=conf)

     @property
-    def data_path(self) -> Path:
-        uri = self.options.job.get("uri")
+    def data_path(self) -> FileSharePath:
+        uri = self.options.uri
         assert uri is not None, "no uri provided in options"
-        path = Path.from_uri(uri, regex=VARIABLES)
+        path = FileSharePath.from_uri(uri, regex=VARIABLES)
         return path

     def get_dependencies(self, *s) -> Sequence[JobDependency]:
         dependencies = []

-        parents = self.options.job.get_list("parents")
+        parents = self.options.parents or []
         if parents:
             for p in parents:
                 dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
@@ -81,8 +95,8 @@

    def register_external_table(self):
        options = self.conf.parser_options  # type: ignore
-        if options:
-            file_format = options.get("file_format")
+        if options and options.file_format:
+            file_format = options.file_format
        else:
            file_format = "delta"

@@ -136,17 +150,14 @@
            self.compute_statistics_external_table()

    @property
-    def parser(self) -> BaseParser:
+    def parser(self) -> str:
        if not self._parser:
            assert self.mode not in ["register"], f"{self.mode} not allowed"

-            name = self.options.job.get("parser")
-            assert name is not None, "parser not found"
-
-            options = self.conf.parser_options or None  # type: ignore
-            p = get_parser(name, options)
+            parser = self.options.parser
+            assert parser is not None, "parser not found"

-            self._parser = p
+            self._parser = cast(str, parser)

        return self._parser

@@ -171,19 +182,52 @@
            else:
                df = self.spark.sql(f"select * from {self}")

-            # cleaning should done by parser
-            df = clean(df)
+            if self.step_options.clean is not False:
+                # cleaning should done by parser but for delta we do it here
+                df = clean(df)

        else:
-            df = self.parser.get_data(
+            options = self.conf.parser_options or None  # type: ignore
+            parse = get_parser(self.parser, options)
+
+            df = parse(
                stream=stream,
                data_path=self.data_path,
-                schema_path=self.paths.schema,
+                schema_path=self.paths.to_schema,
                spark=self.spark,
            )

        return df

+    def encrypt(self, df: DataFrame) -> DataFrame:
+        encrypted_columns = self.options.encrypted_columns or []
+        if encrypted_columns:
+            if self.runtime_options.encryption_key is not None:
+                from databricks.sdk.runtime import dbutils
+
+                key = dbutils.secrets.get(
+                    scope=self.runtime_options.secret_scope,
+                    key=self.runtime_options.encryption_key,
+                )
+                if self.runtime_options.unity_catalog:
+                    DEFAULT_LOGGER.warning(
+                        "Unity Catalog enabled, use FABRICKS_ENCRYPTION_KEY instead",
+                        extra={"label": self},
+                    )
+
+            else:
+                import os
+
+                key = os.environ.get("FABRICKS_ENCRYPTION_KEY")
+
+            assert key, "encryption key not found in secrets nor in environment"
+
+            for col in encrypted_columns:
+                DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
+                df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
+
+        return df
+
    def get_data(
        self,
        stream: bool = False,
@@ -204,7 +248,7 @@
        return df

    def add_calculated_columns(self, df: DataFrame) -> DataFrame:
-        calculated_columns = self.options.job.get_dict("calculated_columns")
+        calculated_columns = self.options.calculated_columns or {}

        if calculated_columns:
            for key, value in calculated_columns.items():
@@ -230,7 +274,7 @@

    def add_key(self, df: DataFrame) -> DataFrame:
        if "__key" not in df.columns:
-            fields = self.options.job.get_list("keys")
+            fields = self.options.keys or []
            if fields:
                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})

@@ -244,7 +288,7 @@

    def add_source(self, df: DataFrame) -> DataFrame:
        if "__source" not in df.columns:
-            source = self.options.job.get("source")
+            source = self.options.source
            if source:
                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                df = df.withColumn("__source", lit(source))
@@ -253,7 +297,7 @@

    def add_operation(self, df: DataFrame) -> DataFrame:
        if "__operation" not in df.columns:
-            operation = self.options.job.get("operation")
+            operation = self.options.operation
            if operation:
                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                df = df.withColumn("__operation", lit(operation))
@@ -263,15 +307,10 @@

        return df

-    def base_transform(self, df: DataFrame) -> DataFrame:
-        df = df.transform(self.extend)
-        df = df.transform(self.add_calculated_columns)
-        df = df.transform(self.add_hash)
-        df = df.transform(self.add_operation)
-        df = df.transform(self.add_source)
-        df = df.transform(self.add_key)
-
+    def add_metadata(self, df: DataFrame) -> DataFrame:
        if "__metadata" in df.columns:
+            DEFAULT_LOGGER.debug("add metadata", extra={"label": self})
+
            if self.mode == "register":
                # https://github.com/delta-io/delta/issues/2014 (BUG)
                df = df.withColumn(
@@ -307,6 +346,17 @@

        return df

+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extend)
+        df = df.transform(self.add_calculated_columns)
+        df = df.transform(self.add_hash)
+        df = df.transform(self.add_operation)
+        df = df.transform(self.add_source)
+        df = df.transform(self.add_key)
+        df = df.transform(self.add_metadata)
+
+        return df
+
    def create_or_replace_view(self):
        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})

@@ -395,6 +445,6 @@
        else:
            super().vacuum()

-    def overwrite(self, schedule: Optional[str] = None):
+    def overwrite(self, schedule: Optional[str] = None, invoke: Optional[bool] = False):
        self.truncate()
-        self.run(schedule=schedule)
+        self.run(schedule=schedule, invoke=invoke)
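
encrypt has moved from Processor into Bronze and now prefers a configured secret (runtime_options.encryption_key) over the FABRICKS_ENCRYPTION_KEY environment variable. A minimal sketch of the per-column aes_encrypt expression it builds, assuming a local Spark 3.3+ session; the column name and key are illustrative only:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.master("local[1]").getOrCreate()
key = "0123456789abcdef"  # 16-byte demo key; real jobs read it from secrets or env

df = spark.createDataFrame([("alice@example.com",)], ["email"])
df = df.withColumn("email", expr(f"aes_encrypt(email, '{key}')"))  # column becomes binary ciphertext
df.show(truncate=False)
```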
fabricks/core/jobs/get_job.py

@@ -1,10 +1,10 @@
-from typing import Optional, Union, cast, overload
+from typing import Optional, Union, overload

 from pyspark.sql.types import Row

-from fabricks.core.jobs.base._types import Bronzes, Golds, Silvers, TBronze, TGold, TSilver
+from fabricks.context import Bronzes, Golds, Silvers
 from fabricks.core.jobs.base.job import BaseJob
-from fabricks.core.jobs.get_job_id import get_job_id
+from fabricks.models import get_job_id


 @overload
@@ -91,7 +91,6 @@ def get_job_internal(
     if step in Bronzes:
         from fabricks.core.jobs.bronze import Bronze

-        step = cast(TBronze, step)
         if job_id is not None:
             job = Bronze.from_job_id(step=step, job_id=job_id, conf=conf)
         else:
@@ -102,7 +101,6 @@
    elif step in Silvers:
        from fabricks.core.jobs.silver import Silver

-        step = cast(TSilver, step)
        if job_id is not None:
            job = Silver.from_job_id(step=step, job_id=job_id, conf=conf)
        else:
@@ -113,7 +111,6 @@
    elif step in Golds:
        from fabricks.core.jobs.gold import Gold

-        step = cast(TGold, step)
        if job_id is not None:
            job = Gold.from_job_id(step=step, job_id=job_id, conf=conf)
        else:
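
With the TBronze/TSilver/TGold literal types gone, dispatch is plain membership against the step collections now exported from fabricks.context. A toy illustration, with sets standing in for the real Bronzes/Silvers/Golds:

```python
# Sets standing in for the real step collections from fabricks.context.
Bronzes, Silvers, Golds = {"bronze"}, {"silver"}, {"gold"}


def dispatch(step: str) -> str:
    if step in Bronzes:
        return "Bronze"
    elif step in Silvers:
        return "Silver"
    elif step in Golds:
        return "Gold"
    raise ValueError(f"{step} not found")


assert dispatch("silver") == "Silver"
```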
fabricks/core/jobs/get_job_conf.py

@@ -1,97 +1,48 @@
-from typing import Optional, Union, cast, overload
+from typing import Optional, Union, overload

 from pyspark.sql.types import Row

-from fabricks.context import IS_JOB_CONFIG_FROM_YAML, SPARK
-from fabricks.core.jobs.base._types import Bronzes, Golds, JobConf, Silvers, TBronze, TGold, TSilver, TStep
-from fabricks.core.jobs.get_job_id import get_job_id
+from fabricks.context import IS_JOB_CONFIG_FROM_YAML, SPARK, Bronzes, Golds, Silvers
+from fabricks.models import JobConf, get_job_id


-def get_job_conf_internal(step: TStep, row: Union[Row, dict]) -> JobConf:
+def get_job_conf_internal(step: str, row: Union[Row, dict]) -> JobConf:
     if isinstance(row, Row):
         row = row.asDict(recursive=True)

-    options = row.get("options")
-    table_options = row.get("table_options")
-    check_options = row.get("check_options")
-    spark_options = row.get("spark_options")
-    invoker_options = row.get("invoker_options")
-    extender_options = row.get("extender_options")
-
-    job_id = row.get("job_id", get_job_id(step=step, topic=row["topic"], item=row["item"]))
+    # Add step to row data (job_id will be computed automatically)
+    row["step"] = step

+    # Use Pydantic validation - handles nested models and validation automatically
     if step in Bronzes:
-        from fabricks.core.jobs.base._types import JobConfBronze
-
-        assert options is not None, "no option"
-        step = cast(TBronze, step)
-        return JobConfBronze(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            parser_options=row.get("parser_options"),
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfBronze
+
+        return JobConfBronze.model_validate(row)

     elif step in Silvers:
-        from fabricks.core.jobs.base._types import JobConfSilver
-
-        assert options is not None, "no option"
-        step = cast(TSilver, step)
-        return JobConfSilver(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfSilver
+
+        return JobConfSilver.model_validate(row)

     elif step in Golds:
-        from fabricks.core.jobs.base._types import JobConfGold
-
-        assert options is not None, "no option"
-        step = cast(TGold, step)
-        return JobConfGold(
-            job_id=job_id,
-            topic=row["topic"],
-            item=row["item"],
-            step=step,
-            options=options,
-            table_options=table_options,
-            check_options=check_options,
-            invoker_options=invoker_options,
-            extender_options=extender_options,
-            spark_options=spark_options,
-            tags=row.get("tags"),
-        )
+        from fabricks.models import JobConfGold
+
+        return JobConfGold.model_validate(row)

     else:
         raise ValueError(f"{step} not found")


 @overload
-def get_job_conf(step: TStep, *, job_id: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...
+def get_job_conf(step: str, *, job_id: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...


 @overload
-def get_job_conf(step: TStep, *, topic: str, item: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...
+def get_job_conf(step: str, *, topic: str, item: str, row: Optional[Union[Row, dict]] = None) -> JobConf: ...


 def get_job_conf(
-    step: TStep,
+    step: str,
     job_id: Optional[str] = None,
     topic: Optional[str] = None,
     item: Optional[str] = None,
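
get_job_conf_internal now injects step into the raw row and lets Pydantic handle extraction, defaults, and nested option models. A sketch of that flow, with a toy model in place of fabricks.models.JobConfBronze (whose real schema also nests options, table_options, check_options, and so on):

```python
from pydantic import BaseModel


# Toy model standing in for JobConfBronze.
class JobConfSketch(BaseModel):
    step: str
    topic: str
    item: str


row = {"topic": "sales", "item": "orders"}
row["step"] = "bronze"  # injected by get_job_conf_internal before validation
conf = JobConfSketch.model_validate(row)  # replaces ~20 lines of manual kwargs per step
print(conf.step, conf.topic, conf.item)   # bronze sales orders
```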