fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +80 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
  94. fabricks-3.0.7.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/configurator.py
@@ -4,12 +4,13 @@ from typing import Optional, Union, cast
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import Row
+from typing_extensions import deprecated
 
-from fabricks.cdc import SCD1, SCD2, ChangeDataCaptures, NoCDC
+from fabricks.cdc import SCD1, SCD2, AllowedChangeDataCaptures, NoCDC
 from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.context.spark_session import build_spark_session
-from fabricks.core.jobs.base._types import Modes, Options, Paths, TStep
+from fabricks.core.jobs.base._types import AllowedModes, Options, Paths, TStep
 from fabricks.core.jobs.get_job_conf import get_job_conf
 from fabricks.core.jobs.get_job_id import get_job_id
 from fabricks.metastore.table import Table
@@ -52,36 +53,30 @@ class Configurator(ABC):
     _root: Optional[Path] = None
 
     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
-    _change_data_capture: Optional[ChangeDataCaptures] = None
-    _mode: Optional[Modes] = None
+    _change_data_capture: Optional[AllowedChangeDataCaptures] = None
+    _mode: Optional[AllowedModes] = None
 
     @property
     @abstractmethod
-    def stream(self) -> bool:
-        raise NotImplementedError()
+    def stream(self) -> bool: ...
 
     @property
     @abstractmethod
-    def schema_drift(self) -> bool:
-        raise NotImplementedError()
+    def schema_drift(self) -> bool: ...
 
     @property
     @abstractmethod
-    def persist(self) -> bool:
-        raise NotImplementedError()
+    def persist(self) -> bool: ...
 
     @property
     @abstractmethod
-    def virtual(self) -> bool:
-        raise NotImplementedError()
+    def virtual(self) -> bool: ...
 
     @classmethod
-    def from_step_topic_item(cls, step: str, topic: str, item: str):
-        raise NotImplementedError()
+    def from_step_topic_item(cls, step: str, topic: str, item: str): ...
 
     @classmethod
-    def from_job_id(cls, step: str, job_id: str):
-        raise NotImplementedError()
+    def from_job_id(cls, step: str, job_id: str): ...
 
     @property
     def spark(self) -> SparkSession:
@@ -93,22 +88,22 @@ class Configurator(ABC):
         step_conf_options = step_options.get("conf", {})
         if step_sql_options:
             for key, value in step_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"step": self.step})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.sql(f"set {key} = {value}")
         if step_conf_options:
             for key, value in step_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"step": self.step})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                 spark.conf.set(f"{key}", f"{value}")
 
         job_sql_options = self.options.spark.get_dict("sql")
         job_conf_options = self.options.spark.get_dict("conf")
         if job_sql_options:
             for key, value in job_sql_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.sql(f"set {key} = {value}")
         if job_conf_options:
             for key, value in job_conf_options.items():
-                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                 spark.conf.set(f"{key}", f"{value}")
 
         self._spark = spark
@@ -195,9 +190,9 @@ class Configurator(ABC):
         return self._options
 
     @property
-    def change_data_capture(self) -> ChangeDataCaptures:
+    def change_data_capture(self) -> AllowedChangeDataCaptures:
         if not self._change_data_capture:
-            cdc: ChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
+            cdc: AllowedChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
             self._change_data_capture = cdc
         return self._change_data_capture
 
@@ -220,49 +215,34 @@ class Configurator(ABC):
         return self.change_data_capture in ["scd1", "scd2"]
 
     @abstractmethod
-    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict:
-        raise NotImplementedError()
+    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = False) -> dict: ...
 
     def get_cdc_data(self, stream: bool = False) -> Optional[DataFrame]:
-        df = self.get_data(stream)
+        df = self.get_data(stream=stream)
         if df:
             cdc_context = self.get_cdc_context(df)
             cdc_df = self.cdc.get_data(src=df, **cdc_context)
             return cdc_df
 
     @property
-    def mode(self) -> Modes:
+    def mode(self) -> AllowedModes:
         if not self._mode:
             _mode = self.options.job.get("mode")
             assert _mode is not None
-            self._mode = cast(Modes, _mode)
+            self._mode = cast(AllowedModes, _mode)
         return self._mode
 
     @abstractmethod
-    def get_data(self, stream: bool = False, transform: Optional[bool] = False) -> Optional[DataFrame]:
-        """
-        Retrieves the data for the job.
-
-        Args:
-            stream (bool, optional): If True, the data will be streamed. Defaults to False.
-            transform (bool, optional): If True, the data will be transformed. Defaults to False.
-
-        Returns:
-            DataFrame or None: The retrieved data as a DataFrame, or None if the data is not available.
-        """
-        raise NotImplementedError()
+    def get_data(self, stream: bool = False, transform: Optional[bool] = None, **kwargs) -> Optional[DataFrame]: ...
 
     @abstractmethod
-    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        raise NotImplementedError()
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs): ...
 
     @abstractmethod
-    def for_each_run(self, **kwargs):
-        raise NotImplementedError()
+    def for_each_run(self, **kwargs): ...
 
     @abstractmethod
-    def base_transform(self, df: DataFrame) -> DataFrame:
-        raise NotImplementedError()
+    def base_transform(self, df: DataFrame) -> DataFrame: ...
 
     @abstractmethod
     def run(
@@ -271,47 +251,41 @@
         schedule: Optional[str] = None,
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
-    ):
-        raise NotImplementedError()
+    ): ...
 
+    @deprecated("use maintain instead")
     def optimize(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
         analyze: Optional[bool] = True,
     ):
-        """
-        Optimize the table by performing vacuum, optimizing CDC, and analyzing the table.
-
-        If the mode is set to 'memory', no optimization is performed.
-
-        The retention days for optimization are determined in the following order:
-        1. If 'retention_days' is specified in the job options table, it is used.
-        2. If 'retention_days' is specified in the step configuration table options, it is used.
-        3. If 'retention_days' is specified in the CONF_RUNTIME options, it is used.
-
-        After determining the retention days, the table is vacuumed with the specified retention days,
-        CDC is optimized for the table, and the table is analyzed.
+        return self.maintain(
+            vacuum=vacuum,
+            optimize=optimize,
+            compute_statistics=analyze,
+        )
 
-        Note: This method assumes that either 'runtime' or 'step' or 'job' is specified.
-
-        Returns:
-            None
-        """
+    def maintain(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        compute_statistics: Optional[bool] = True,
+    ):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not maintain (memory)", extra={"label": self})
 
         else:
            if vacuum:
                self.vacuum()
            if optimize:
                self.cdc.optimize_table()
-            if analyze:
+            if compute_statistics:
                self.table.compute_statistics()
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no vacuum)", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not vacuum (memory)", extra={"label": self})
 
         else:
             job = self.options.table.get("retention_days")
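For readers tracking the API change above: optimize() is now only a deprecation shim over maintain(), with the analyze flag renamed to compute_statistics. A minimal migration sketch, assuming job is any concrete job object exposing these methods (the variable and function names below are illustrative, not from the package):

    def refresh(job) -> None:
        # pre-3.0.7 spelling, still accepted but flagged by @deprecated("use maintain instead"):
        # job.optimize(vacuum=True, optimize=True, analyze=True)

        # 3.0.7 spelling; analyze= maps to compute_statistics=
        job.maintain(vacuum=True, optimize=True, compute_statistics=True)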
fabricks/core/jobs/base/generator.py
@@ -4,7 +4,7 @@ from typing import Optional, Sequence, Union, cast
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import lit
 
-from fabricks.cdc import SCD1
+from fabricks.cdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base._types import JobDependency
 from fabricks.core.jobs.base.configurator import Configurator
@@ -14,17 +14,16 @@ from fabricks.metastore.view import create_or_replace_global_temp_view
 
 class Generator(Configurator):
     def update_dependencies(self):
-        DEFAULT_LOGGER.info("update dependencies", extra={"job": self})
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
         deps = self.get_dependencies()
         if deps:
             df = self.spark.createDataFrame([d.model_dump() for d in deps])  # type: ignore
-            scd1 = SCD1("fabricks", self.step, "dependencies")
-            scd1.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
+            cdc = NoCDC("fabricks", self.step, "dependencies")
+            cdc.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
 
     @abstractmethod
-    def get_dependencies(self) -> Sequence[JobDependency]:
-        raise NotImplementedError()
+    def get_dependencies(self) -> Sequence[JobDependency]: ...
 
     def rm(self):
         """
@@ -33,7 +32,7 @@ class Generator(Configurator):
         If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
         """
         if self.paths.schema.exists():
-            DEFAULT_LOGGER.info("delete schema folder", extra={"job": self})
+            DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
             self.paths.schema.rm()
         self.rm_checkpoints()
 
@@ -44,7 +43,7 @@
         This method checks if the checkpoints folder exists and deletes it if it does.
         """
         if self.paths.checkpoints.exists():
-            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"job": self})
+            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
             self.paths.checkpoints.rm()
 
     def rm_commit(self, id: Union[str, int]):
@@ -59,7 +58,7 @@
         """
         path = self.paths.commits.joinpath(str(id))
         if path.exists():
-            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"job": self})
+            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
             path.rm()
 
     def truncate(self):
@@ -72,7 +71,7 @@
         Returns:
             None
         """
-        DEFAULT_LOGGER.warning("truncate", extra={"job": self})
+        DEFAULT_LOGGER.warning("truncate", extra={"label": self})
         self.rm()
         if self.persist:
             self.table.truncate()
@@ -92,6 +91,9 @@
         Returns:
             None
         """
+        if self.options.job.get("no_drop"):
+            raise ValueError("no_drop is set, cannot drop the job")
+
         try:
             row = self.spark.sql(
                 f"""
@@ -106,7 +108,7 @@
                 """
             ).collect()[0]
             if cast(int, row.count) > 0:
-                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"job": self, "content": row.children})
+                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"label": self, "content": row.children})
 
         except Exception:
             pass
@@ -162,7 +164,7 @@
         Raises:
             NotImplementedError: This method is meant to be overridden by subclasses.
         """
-        raise NotImplementedError()
+        ...
 
     def create_table(self):
         def _create_table(df: DataFrame, batch: Optional[int] = 0):
@@ -185,12 +187,29 @@
             elif step_powerbi is not None:
                 powerbi = step_powerbi
 
-            if powerbi:
+            # first take from job options, then from step options
+            job_masks = self.options.table.get("masks", None)
+            step_masks = self.step_conf.get("table_options", {}).get("masks", None)
+            if job_masks is not None:
+                masks = job_masks
+            elif step_masks is not None:
+                masks = step_masks
+            else:
+                masks = None
+
+            maximum_compatibility = self.options.table.get_boolean("maximum_compatibility", False)
+
+            if maximum_compatibility:
+                default_properties = {
+                    "delta.minReaderVersion": "1",
+                    "delta.minWriterVersion": "7",
+                    "delta.columnMapping.mode": "none",
+                }
+            elif powerbi:
                 default_properties = {
                     "delta.columnMapping.mode": "name",
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
-                    "fabricks.last_version": "0",
                 }
             else:
                 default_properties = {
@@ -200,9 +219,10 @@
                     "delta.minReaderVersion": "2",
                     "delta.minWriterVersion": "5",
                     "delta.feature.timestampNtz": "supported",
-                    "fabricks.last_version": "0",
                 }
 
+            default_properties["fabricks.last_version"] = "0"
+
             if "__identity" in df.columns:
                 identity = False
             else:
@@ -234,9 +254,7 @@
                 cluster_by.append("__hash")
 
             if not cluster_by:
-                DEFAULT_LOGGER.warning(
-                    "liquid clustering disabled (no clustering columns found)", extra={"job": self}
-                )
+                DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
                 liquid_clustering = False
                 cluster_by = None
 
@@ -257,9 +275,13 @@
             if properties is None:
                 properties = default_properties
 
+            primary_key = self.options.table.get_dict("primary_key")
+            foreign_keys = self.options.table.get_dict("foreign_keys")
+            comments = self.options.table.get_dict("comments")
+
             # if dataframe, reference is passed (BUG)
             name = f"{self.step}_{self.topic}_{self.item}__init"
-            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"))
+            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"), job=self)
             sql = f"select * from {global_temp_view}"
 
             self.cdc.create_table(
@@ -270,11 +292,17 @@
                 partitioning=partitioning,
                 partition_by=partition_by,
                 properties=properties,
+                masks=masks,
+                primary_key=primary_key,
+                foreign_keys=foreign_keys,
+                comments=comments,
                 **cdc_options,
             )
 
         if not self.table.exists():
-            df = self.get_data(self.stream)
+            DEFAULT_LOGGER.debug("create table", extra={"label": self})
+
+            df = self.get_data(stream=self.stream, schema_only=True)
             if df:
                 if self.stream:
                     # add dummy stream to be sure that the writeStream will start
@@ -310,6 +338,9 @@
             if comment:
                 self.table.add_comment(comment=comment)
 
+        else:
+            DEFAULT_LOGGER.debug("table exists, skip creation", extra={"label": self})
+
     def _update_schema(
         self,
         df: Optional[DataFrame] = None,
@@ -328,7 +359,7 @@
             _update_schema(df)
 
         else:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, schema_only=True)
             assert df is not None
             df = self.base_transform(df)
 
@@ -360,7 +391,7 @@
 
     def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 
@@ -370,7 +401,7 @@
 
     def get_schema_differences(self, df: Optional[DataFrame] = None) -> Optional[Sequence[SchemaDiff]]:
         if df is None:
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream)
         assert df is not None
         df = self.base_transform(df)
 
@@ -413,4 +444,4 @@
             else:
                 self.table.enable_liquid_clustering(auto=True)
         else:
-            DEFAULT_LOGGER.debug("liquid clustering not enabled", extra={"job": self})
+            DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
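The table-property branches added to create_table above resolve in a fixed order: maximum_compatibility wins over powerbi, and fabricks.last_version is now stamped once after the branch instead of inside each dictionary. A standalone restatement of that precedence, offered as a sketch only (the helper name is invented, and the non-PowerBI default block is left empty here because the hunk does not show it in full):

    def resolve_default_properties(maximum_compatibility: bool, powerbi: bool) -> dict:
        if maximum_compatibility:
            properties = {
                "delta.minReaderVersion": "1",
                "delta.minWriterVersion": "7",
                "delta.columnMapping.mode": "none",
            }
        elif powerbi:
            properties = {
                "delta.columnMapping.mode": "name",
                "delta.minReaderVersion": "2",
                "delta.minWriterVersion": "5",
            }
        else:
            properties = {}  # richer default block from the surrounding code, not shown in the hunk

        # always appended, whichever branch was taken
        properties["fabricks.last_version"] = "0"
        return properties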
fabricks/core/jobs/base/invoker.py
@@ -7,13 +7,17 @@ from fabricks.context import PATH_RUNTIME
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.core.jobs.base.checker import Checker
 from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
-from fabricks.core.schedules import get_schedules
+from fabricks.core.jobs.get_schedule import get_schedule
 from fabricks.utils.path import Path
 
 
 class Invoker(Checker):
-    def invoke(self, schedule: Optional[str] = None):
-        self._invoke_job(position="run", schedule=schedule)
+    def invoke(self, schedule: Optional[str] = None, **kwargs):
+        return self._invoke_job(
+            position="run",
+            schedule=schedule,
+            **kwargs,
+        )  # kwargs and return needed for get_data in gold
 
     def invoke_pre_run(self, schedule: Optional[str] = None):
         self._invoke_job(position="pre_run", schedule=schedule)
@@ -23,30 +27,50 @@ class Invoker(Checker):
         self._invoke_job(position="post_run", schedule=schedule)
         self._invoke_step(position="post_run", schedule=schedule)
 
-    def _invoke_job(self, position: str, schedule: Optional[str] = None):
+    def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
         invokers = self.options.invokers.get_list(position)
+        if position == "run":
+            invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options
 
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.info(f"{position}-invoke", extra={"job": self})
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
                 try:
-                    notebook = i.get("notebook")
-                    assert notebook, "notebook mandatory"
-                    path = PATH_RUNTIME.joinpath(notebook)
-
-                    arguments = i.get("arguments") or {}
-                    timeout = i.get("timeout")
-
-                    self._run_notebook(
-                        path=path,
-                        arguments=arguments,
-                        timeout=timeout,
-                        schedule=schedule,
-                    )
+                    path = kwargs.get("path")
+                    if path is None:
+                        notebook = invoker.get("notebook")
+                        assert notebook, "notebook mandatory"
+                        path = PATH_RUNTIME.joinpath(notebook)
+
+                    assert path is not None, "path mandatory"
+
+                    arguments = invoker.get("arguments") or {}
+                    timeout = invoker.get("timeout")
+
+                    schema_only = kwargs.get("schema_only")
+                    if schema_only is not None:
+                        arguments["schema_only"] = schema_only
+
+                    if len(invokers) == 1 and position == "run":
+                        return self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
+                    else:
+                        self._run_notebook(
+                            path=path,
+                            arguments=arguments,
+                            timeout=timeout,
+                            schedule=schedule,
+                        )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":
@@ -63,15 +87,15 @@
         errors = []
 
         if invokers:
-            for i in invokers:
-                DEFAULT_LOGGER.info(f"{position}-invoke", extra={"step": self.step})
+            for i, invoker in enumerate(invokers):
+                DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
                 try:
-                    notebook = i.get("notebook")
+                    notebook = invoker.get("notebook")
                     assert notebook, "notebook mandatory"
                     path = PATH_RUNTIME.joinpath(notebook)
 
-                    arguments = i.get("arguments", {})
-                    timeout = i.get("timeout")
+                    arguments = invoker.get("arguments", {})
+                    timeout = invoker.get("timeout")
 
                     self._run_notebook(
                         path=path,
@@ -81,6 +105,8 @@
                     )
 
                 except Exception as e:
+                    DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
+
                     if position == "pre_run":
                         errors.append(PreRunInvokeException(e))
                     elif position == "post_run":
@@ -125,9 +151,7 @@
 
         variables = None
         if schedule is not None:
-            variables = (
-                next(s for s in get_schedules() if s.get("name") == schedule).get("options", {}).get("variables", {})
-            )
+            variables = get_schedule(name=schedule).get("options", {}).get("variables", {})
 
         if variables is None:
             variables = {}
@@ -135,7 +159,7 @@
         if arguments is None:
             arguments = {}
 
-        dbutils.notebook.run(
+        return dbutils.notebook.run(
             path=path.get_notebook_path(),  # type: ignore
             timeout_seconds=timeout,  # type: ignore
             arguments={  # type: ignore
@@ -154,7 +178,7 @@
         extenders = self.options.extenders
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.info(f"calling {name}", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
             arguments = e.get("arguments") or {}
 
             extender = get_extender(name)
@@ -168,7 +192,7 @@
         extenders = self.step_conf.get("extender_options", {})
         for e in extenders:
             name = e.get("extender")
-            DEFAULT_LOGGER.info(f"calling {name}", extra={"step": self.step})
+            DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
             arguments = e.get("arguments", {})
 
             extender = get_extender(name)
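A hedged usage sketch of the reworked run invoker (the function and variable names below are placeholders, not from the package): for position "run", _invoke_job now falls back to a single empty invoker when none is configured, forwards path and schema_only from keyword arguments, and, when exactly one run invoker exists, returns whatever dbutils.notebook.run produced.

    def run_job_notebook(job, notebook_path, schema_only: bool = False):
        # path overrides the notebook declared in the invoker options;
        # schema_only is injected into the notebook arguments before the run
        return job.invoke(path=notebook_path, schema_only=schema_only)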