fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
@@ -26,7 +26,7 @@ class Processor(Invoker):
         f = self.options.job.get("filter_where")
 
         if f:
-            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
             df = df.where(f"{f}")
 
         return df
@@ -46,7 +46,7 @@ class Processor(Invoker):
         assert key, "key not found"
 
         for col in encrypted_columns:
-            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
             df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
 
         return df
@@ -73,16 +73,16 @@ class Processor(Invoker):
         assert self.paths.commits.joinpath(last_batch).exists()
 
     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        DEFAULT_LOGGER.debug("for each batch starts", extra={"job": self})
+        DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
         if batch is not None:
-            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"job": self})
+            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"label": self})
 
         df = self.base_transform(df)
 
         diffs = self.get_schema_differences(df)
         if diffs:
             if self.schema_drift or kwargs.get("reload", False):
-                DEFAULT_LOGGER.warning("schema drifted", extra={"job": self, "diffs": diffs})
+                DEFAULT_LOGGER.warning("schema drifted", extra={"label": self, "diffs": diffs})
                 self.update_schema(df=df)
 
             else:
@@ -98,24 +98,24 @@ class Processor(Invoker):
             self.table.set_property("fabricks.last_batch", batch)
 
         self.table.create_restore_point()
-        DEFAULT_LOGGER.debug("for each batch ends", extra={"job": self})
+        DEFAULT_LOGGER.debug("end (for each batch)", extra={"label": self})
 
     def for_each_run(self, **kwargs):
-        DEFAULT_LOGGER.debug("for each run starts", extra={"job": self})
+        DEFAULT_LOGGER.debug("start (for each run)", extra={"label": self})
 
         if self.virtual:
             self.create_or_replace_view()
 
         elif self.persist:
-            assert self.table.exists(), "delta table not found"
+            assert self.table.registered, f"{self} is not registered"
 
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, **kwargs)
             assert df is not None, "no data"
 
             partial(self._for_each_batch, **kwargs)
 
             if self.stream:
-                DEFAULT_LOGGER.debug("stream enabled", extra={"job": self})
+                DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
                 write_stream(
                     df,
                     checkpoints_path=self.paths.checkpoints,
@@ -128,7 +128,7 @@ class Processor(Invoker):
         else:
             raise ValueError(f"{self.mode} - not allowed")
 
-        DEFAULT_LOGGER.debug("for each run ends", extra={"job": self})
+        DEFAULT_LOGGER.debug("end (for each run)", extra={"label": self})
 
     def run(
         self,
@@ -137,6 +137,9 @@ class Processor(Invoker):
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
         reload: Optional[bool] = None,
+        vacuum: Optional[bool] = None,
+        optimize: Optional[bool] = None,
+        compute_statistics: Optional[bool] = None,
     ):
         """
         Run the processor.
@@ -154,18 +157,19 @@ class Processor(Invoker):
         if self.persist:
             last_version = self.table.get_property("fabricks.last_version")
             if last_version is not None:
-                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"label": self})
             else:
                 last_version = str(self.table.last_version)
 
             last_batch = self.table.get_property("fabricks.last_batch")
             if last_batch is not None:
-                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"label": self})
 
         try:
-            DEFAULT_LOGGER.info("run starts", extra={"job": self})
+            DEFAULT_LOGGER.info("start (run)", extra={"label": self})
+
             if reload:
-                DEFAULT_LOGGER.debug("force reload", extra={"job": self})
+                DEFAULT_LOGGER.debug("force reload", extra={"label": self})
 
             if invoke:
                 self.invoke_pre_run(schedule=schedule)
@@ -193,40 +197,53 @@ class Processor(Invoker):
             if exception:
                 raise exception
 
-            DEFAULT_LOGGER.info("run ends", extra={"job": self})
+            if vacuum is None:
+                vacuum = self.options.job.get("vacuum", False)
+            if optimize is None:
+                optimize = self.options.job.get("optimize", False)
+            if compute_statistics is None:
+                compute_statistics = self.options.job.get("compute_statistics", False)
+
+            if vacuum or optimize or compute_statistics:
+                self.maintain(
+                    compute_statistics=compute_statistics,
+                    optimize=optimize,
+                    vacuum=vacuum,
+                )
+
+            DEFAULT_LOGGER.info("end (run)", extra={"label": self})
 
         except SkipRunCheckWarning as e:
-            DEFAULT_LOGGER.warning("skip run", extra={"job": self})
+            DEFAULT_LOGGER.warning("skip run", extra={"label": self})
             raise e
 
         except (PreRunCheckWarning, PostRunCheckWarning) as e:
-            DEFAULT_LOGGER.warning("could not pass warning check", extra={"job": self})
+            DEFAULT_LOGGER.warning("fail to pass warning check", extra={"label": self})
             raise e
 
         except (PreRunInvokeException, PostRunInvokeException) as e:
-            DEFAULT_LOGGER.exception("could not run invoker", extra={"job": self})
+            DEFAULT_LOGGER.exception("fail to run invoker", extra={"label": self})
             raise e
 
         except (PreRunCheckException, PostRunCheckException) as e:
-            DEFAULT_LOGGER.exception("could not pass check", extra={"job": self})
+            DEFAULT_LOGGER.exception("fail to pass check", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except AssertionError as e:
-            DEFAULT_LOGGER.exception("could not run", extra={"job": self})
+            DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except Exception as e:
             if not self.stream or not retry:
-                DEFAULT_LOGGER.exception("could not run", extra={"job": self})
+                DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
                 self.restore(last_version, last_batch)
                 raise e
 
             else:
-                DEFAULT_LOGGER.warning("retry to run", extra={"job": self})
-                self.run(retry=False, schedule_id=schedule_id)
+                DEFAULT_LOGGER.warning("retry to run", extra={"label": self})
+                self.run(retry=False, schedule_id=schedule_id, schedule=schedule)
 
     @abstractmethod
-    def overwrite(self):
-        raise NotImplementedError()
+    def overwrite(self) -> None: ...
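
For context on the hunks above: `run` now accepts `vacuum`, `optimize`, and `compute_statistics`, falls back to the job options when a flag is left as `None`, and only then calls `self.maintain(...)`. A minimal, hypothetical usage sketch follows; the `get_job` import path appears elsewhere in this diff, but its signature and the job coordinates used here are assumptions.

```python
# Hypothetical sketch of the new maintenance flags on run(); the get_job
# signature and the step/topic/item values are assumptions, not from this diff.
from fabricks.core.jobs.get_job import get_job

job = get_job(step="bronze", topic="sales", item="orders")  # assumed signature

# Explicit flags win; a flag left as None falls back to the job options
# ("vacuum", "optimize", "compute_statistics"), and maintain() runs only if
# at least one of the three resolves to True.
job.run(vacuum=True, optimize=True, compute_statistics=False)
```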
@@ -11,7 +11,7 @@ from fabricks.core.jobs.base._types import JobDependency, TBronze
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.parsers import BaseParser
 from fabricks.core.parsers.get_parser import get_parser
-from fabricks.core.utils import clean
+from fabricks.core.parsers.utils import clean
 from fabricks.metastore.view import create_or_replace_global_temp_view
 from fabricks.utils.helpers import concat_ws
 from fabricks.utils.path import Path
@@ -86,13 +86,13 @@ class Bronze(BaseJob):
         else:
             file_format = "delta"
 
-        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"job": self})
+        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"label": self})
 
         try:
             df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
             assert len(df.columns) > 1, "external table must have at least one column"
         except Exception as e:
-            DEFAULT_LOGGER.exception("read external table failed", extra={"job": self})
+            DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
             raise e
 
         self.spark.sql(
@@ -100,17 +100,17 @@ class Bronze(BaseJob):
         )
 
     def drop_external_table(self):
-        DEFAULT_LOGGER.debug("drop external table", extra={"job": self})
+        DEFAULT_LOGGER.warning("remove external table from metastore", extra={"label": self})
         self.spark.sql(f"drop table if exists {self.qualified_name}")
 
-    def analyze_external_table(self):
-        DEFAULT_LOGGER.debug("analyze external table", extra={"job": self})
+    def compute_statistics_external_table(self):
+        DEFAULT_LOGGER.debug("compute statistics (external table)", extra={"label": self})
         self.spark.sql(f"analyze table {self.qualified_name} compute statistics")
 
     def vacuum_external_table(self, retention_hours: Optional[int] = 168):
         from delta import DeltaTable
 
-        DEFAULT_LOGGER.debug("vacuum external table", extra={"job": self})
+        DEFAULT_LOGGER.debug("vacuum (external table)", extra={"label": self})
         try:
             dt = DeltaTable.forPath(self.spark, self.data_path.string)
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
@@ -118,17 +118,17 @@ class Bronze(BaseJob):
         finally:
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
 
-    def optimize_external_table(
+    def maintain_external_table(
         self,
         vacuum: Optional[bool] = True,
-        analyze: Optional[bool] = True,
+        compute_statistics: Optional[bool] = True,
     ):
-        DEFAULT_LOGGER.debug("optimize external table", extra={"job": self})
+        DEFAULT_LOGGER.debug("maintain (external table)", extra={"label": self})
         if vacuum:
             self.vacuum_external_table()
 
-        if analyze:
-            self.analyze_external_table()
+        if compute_statistics:
+            self.compute_statistics_external_table()
 
     @property
     def parser(self) -> BaseParser:
@@ -179,7 +179,13 @@ class Bronze(BaseJob):
 
         return df
 
-    def get_data(self, stream: bool = False, transform: Optional[bool] = False) -> Optional[DataFrame]:
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> Optional[DataFrame]:
         df = self.parse(stream)
         df = self.filter_where(df)
         df = self.encrypt(df)
@@ -187,6 +193,9 @@ class Bronze(BaseJob):
         if transform:
             df = self.base_transform(df)
 
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df
 
     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
@@ -194,7 +203,7 @@ class Bronze(BaseJob):
 
         if calculated_columns:
             for key, value in calculated_columns.items():
-                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"label": self})
                 df = df.withColumn(key, expr(f"{value}"))
 
         return df
@@ -202,7 +211,7 @@ class Bronze(BaseJob):
     def add_hash(self, df: DataFrame) -> DataFrame:
         if "__hash" not in df.columns:
             fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
-            DEFAULT_LOGGER.debug("add hash", extra={"job": self})
+            DEFAULT_LOGGER.debug("add hash", extra={"label": self})
 
             if "__operation" in df.columns:
                 fields += ["__operation == 'delete'"]
@@ -218,7 +227,7 @@ class Bronze(BaseJob):
         if "__key" not in df.columns:
             fields = self.options.job.get_list("keys")
             if fields:
-                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})
 
                 if "__source" in df.columns:
                     fields = fields + ["__source"]
@@ -232,7 +241,7 @@ class Bronze(BaseJob):
         if "__source" not in df.columns:
             source = self.options.job.get("source")
             if source:
-                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                 df = df.withColumn("__source", lit(source))
 
         return df
@@ -241,7 +250,7 @@ class Bronze(BaseJob):
         if "__operation" not in df.columns:
             operation = self.options.job.get("operation")
             if operation:
-                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                 df = df.withColumn("__operation", lit(operation))
 
         else:
@@ -294,10 +303,10 @@ class Bronze(BaseJob):
         return df
 
     def create_or_replace_view(self):
-        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"job": self})
+        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})
 
     def overwrite_schema(self, df: Optional[DataFrame] = None):
-        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"job": self})
+        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"label": self})
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         return {}
@@ -309,12 +318,12 @@ class Bronze(BaseJob):
 
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__{batch}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"
 
         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"job": self})
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return
 
         assert isinstance(self.cdc, NoCDC)
@@ -323,9 +332,9 @@ class Bronze(BaseJob):
 
     def for_each_run(self, **kwargs):
         if self.mode == "register":
-            DEFAULT_LOGGER.debug("register (no run)", extra={"job": self})
+            DEFAULT_LOGGER.debug("register (no run)", extra={"label": self})
         elif self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no run)", extra={"job": self})
+            DEFAULT_LOGGER.debug("memory (no run)", extra={"label": self})
         else:
             super().for_each_run(**kwargs)
 
@@ -333,7 +342,7 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"job": self})
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().create()
 
@@ -341,19 +350,19 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"job": self})
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().register()
 
     def truncate(self):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no truncate)", extra={"job": self})
+            DEFAULT_LOGGER.info("register (no truncate)", extra={"label": self})
         else:
             super().truncate()
 
     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no restore)", extra={"job": self})
+            DEFAULT_LOGGER.info("register (no restore)", extra={"label": self})
         else:
             super().restore()
 
@@ -362,27 +371,25 @@ class Bronze(BaseJob):
         self.drop_external_table()
         super().drop()
 
-    def optimize(
+    def maintain(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
-        analyze: Optional[bool] = True,
+        compute_statistics: Optional[bool] = True,
     ):
-        if self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no optimize)", extra={"job": self})
-        elif self.mode == "register":
-            self.optimize_external_table(vacuum, analyze)
+        if self.mode == "register":
+            self.maintain_external_table(vacuum=vacuum, compute_statistics=compute_statistics)
         else:
-            super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
+            super().maintain(vacuum=vacuum, optimize=optimize, compute_statistics=compute_statistics)
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"job": self})
+            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"label": self})
         elif self.mode == "register":
             self.vacuum_external_table()
         else:
             super().vacuum()
 
-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         self.truncate()
-        self.run()
+        self.run(schedule=schedule)
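
The new `schema_only` flag on `Bronze.get_data` above relies on an always-false predicate to return an empty DataFrame that still carries the source schema. A self-contained PySpark sketch of the same trick (the sample data is illustrative only):

```python
# Standalone illustration of the schema_only trick: an always-false predicate
# drops every row but keeps the schema intact.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
empty = df.where("1 == 2")  # same predicate as get_data(schema_only=True)

assert empty.count() == 0
assert empty.schema == df.schema
```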
@@ -6,7 +6,7 @@ from pyspark.sql.functions import expr
 from pyspark.sql.types import Row
 
 from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
-from fabricks.core.jobs.base._types import Modes, TStep
+from fabricks.core.jobs.base._types import AllowedModes, TStep
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.get_job import get_job, get_job_internal
 from fabricks.utils.helpers import concat_dfs, run_in_parallel
@@ -16,7 +16,7 @@ from fabricks.utils.schema import get_schema_for_type
 
 
 class GenericOptions(TypedDict):
-    mode: Modes
+    mode: AllowedModes
 
 
 @dataclass
@@ -0,0 +1,10 @@
+from typing import Dict
+
+from fabricks.core.jobs.get_schedules import get_schedules
+
+
+def get_schedule(name: str) -> Dict:
+    schedule = next(s for s in get_schedules() if s.get("name") == name)
+
+    assert schedule, "schedule not found"
+    return schedule
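
A minimal usage sketch for the new `get_schedule` helper added above; the schedule name is a placeholder and assumes a matching entry exists in the schedules YAML.

```python
# Hypothetical usage of the new helper; "daily" is a placeholder schedule name.
from fabricks.core.jobs.get_schedule import get_schedule

schedule = get_schedule("daily")
print(schedule["name"], schedule.get("options", {}))
```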
@@ -0,0 +1,32 @@
+from typing import List, Optional, TypedDict
+
+from pyspark.sql import DataFrame
+
+from fabricks.context import PATH_SCHEDULES, SPARK
+from fabricks.core.jobs.base._types import TStep
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class Options(TypedDict):
+    steps: Optional[List[TStep]]
+    tag: Optional[str]
+    view: Optional[str]
+    variables: Optional[dict[str, str]]
+
+
+class Schedule(TypedDict):
+    name: str
+    options: Options
+
+
+def get_schedules():
+    return read_yaml(PATH_SCHEDULES, root="schedule")
+
+
+def get_schedules_df() -> DataFrame:
+    schema = get_schema_for_type(Schedule)
+    df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore
+
+    assert df, "no schedules found"
+    return df
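
Likewise, a sketch of how the companion module added above might be used to inspect the configured schedules, either as raw dicts or as a Spark DataFrame whose columns follow the `Schedule` TypedDict.

```python
# Hypothetical usage: list configured schedules as dicts or as a DataFrame.
from fabricks.core.jobs.get_schedules import get_schedules, get_schedules_df

for s in get_schedules():
    print(s.get("name"))

df = get_schedules_df()
df.select("name", "options").show(truncate=False)
```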