fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +80 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
  94. fabricks-3.0.7.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
@@ -99,14 +99,20 @@ class Gold(BaseJob):
     def register_udfs(self):
         for u in self.get_udfs():
             if not is_registered(u):
-                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"job": self})
+                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"label": self})
                 register_udf(udf=u, spark=self.spark)
 
     def base_transform(self, df: DataFrame) -> DataFrame:
         df = df.transform(self.extend)
         return df
 
-    def get_data(self, stream=False, transform: Optional[bool] = False) -> DataFrame:
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> DataFrame:
         if self.options.job.get_boolean("requirements"):
             import sys
 
@@ -116,12 +122,12 @@ class Gold(BaseJob):
             df = self.spark.createDataFrame([{}]) # type: ignore
 
         elif self.options.job.get("notebook"):
-            from databricks.sdk.runtime import dbutils
+            invokers = self.options.invokers.get_list("run")
+            assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"
 
-            DEFAULT_LOGGER.debug("run notebook", extra={"job": self})
-            path = self.paths.runtime.get_notebook_path()
+            global_temp_view = self.invoke(path=self.paths.runtime, schema_only=schema_only, **kwargs)
+            assert global_temp_view is not None, "global_temp_view not found"
 
-            global_temp_view = dbutils.notebook.run(path, self.timeout, arguments={}) # type: ignore
             df = self.spark.sql(f"select * from global_temp.{global_temp_view}")
 
         elif self.options.job.get("table"):
@@ -135,6 +141,10 @@ class Gold(BaseJob):
 
         if transform:
             df = self.base_transform(df)
+
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df
 
     def create_or_replace_view(self):
@@ -178,7 +188,7 @@ class Gold(BaseJob):
         from fabricks.context import CATALOG
 
         dependencies = []
-        df = self.get_data(self.stream)
+        df = self.get_data(stream=self.stream)
 
         if df is not None:
             explain_plan = self.spark.sql("explain extended select * from {df}", df=df).collect()[0][0]
@@ -194,23 +204,14 @@ class Gold(BaseJob):
         return dependencies
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
-        if "__order_duplicate_by_asc" in df.columns:
-            order_duplicate_by = {"__order_duplicate_by_asc": "asc"}
-        elif "__order_duplicate_by_desc" in df.columns:
-            order_duplicate_by = {"__order_duplicate_by_desc": "desc"}
-        else:
-            order_duplicate_by = None
-
-        deduplicate = self.options.job.get_boolean(
-            "deduplicate",
-            None,
-        ) # assume no duplicate in gold (to improve performance)
-        rectify = self.options.job.get_boolean(
-            "rectify_as_upserts",
-            None,
-        ) # assume no reload in gold (to improve performance)
-        correct_valid_from = self.options.job.get_boolean("correct_valid_from", True)
-        add_metadata = self.step_conf.get("options", {}).get("metadata", False)
+        # assume no duplicate in gold (to improve performance)
+        deduplicate = self.options.job.get_boolean("deduplicate", None)
+        # assume no reload in gold (to improve performance)
+        rectify = self.options.job.get_boolean("rectify_as_upserts", None)
+
+        add_metadata = self.options.job.get_boolean("metadata", None)
+        if add_metadata is None:
+            add_metadata = self.step_conf.get("options", {}).get("metadata", False)
 
         context = {
             "add_metadata": add_metadata,
@@ -219,27 +220,37 @@ class Gold(BaseJob):
             "deduplicate_hash": True if self.slowly_changing_dimension else None,
             "deduplicate": False,
             "rectify": False,
-            "order_duplicate_by": order_duplicate_by,
-            "correct_valid_from": correct_valid_from,
         }
 
+        # force deduplicate
         if deduplicate is not None:
             context["deduplicate"] = deduplicate
             context["deduplicate_key"] = deduplicate
             context["deduplicate_hash"] = deduplicate
 
+        # force rectify
         if rectify is not None:
             context["rectify"] = rectify
 
+        # add key and hash when needed
+        if self.mode == "update" and self.change_data_capture == "nocdc":
+            if "__key" not in df.columns:
+                context["add_key"] = True
+            if "__hash" not in df.columns:
+                context["add_hash"] = True
+
+        # add key and hash when needed
         if self.slowly_changing_dimension:
             if "__key" not in df.columns:
                 context["add_key"] = True
             if "__hash" not in df.columns:
                 context["add_hash"] = True
 
+        if self.slowly_changing_dimension:
            if "__operation" not in df.columns:
+                # assume no duplicate hash
                if deduplicate is None:
-                    context["deduplicate_hash"] = None # assume no duplicate hash
+                    context["deduplicate_hash"] = None
 
            if self.mode == "update":
                context["add_operation"] = "reload"
@@ -249,16 +260,25 @@ class Gold(BaseJob):
            else:
                context["add_operation"] = "upsert"
 
+        # filter to get latest data
         if not reload:
             if self.mode == "update" and self.change_data_capture == "scd2":
                 context["slice"] = "update"
 
+            if self.mode == "update" and self.change_data_capture == "nocdc" and "__timestamp" in df.columns:
+                context["slice"] = "update"
+
             if self.mode == "append" and "__timestamp" in df.columns:
                 context["slice"] = "update"
 
         if self.mode == "memory":
             context["mode"] = "complete"
 
+        # correct __valid_from
+        if self.change_data_capture == "scd2":
+            context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)
+
+        # add __timestamp
         if self.options.job.get_boolean("persist_last_timestamp"):
             if self.change_data_capture == "scd1":
                 if "__timestamp" not in df.columns:
@@ -267,6 +287,11 @@ class Gold(BaseJob):
                 if "__valid_from" not in df.columns:
                     context["add_timestamp"] = True
 
+        if "__order_duplicate_by_asc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
+        elif "__order_duplicate_by_desc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_desc": "desc"}
+
         return context
 
     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
@@ -277,20 +302,19 @@ class Gold(BaseJob):
 
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"
 
         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"job": self})
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return
 
         if reload:
-            DEFAULT_LOGGER.warning("force reload", extra={"job": self})
+            DEFAULT_LOGGER.warning("force reload", extra={"label": self})
             self.cdc.complete(sql, **context)
 
         elif self.mode == "update":
-            assert not isinstance(self.cdc, NoCDC), "nocdc update not allowed"
             self.cdc.update(sql, **context)
 
         elif self.mode == "append":
@@ -323,7 +347,7 @@ class Gold(BaseJob):
 
     def create(self):
         if self.mode == "invoke":
-            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"job": self})
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
         else:
             self.register_udfs()
             super().create()
@@ -335,7 +359,7 @@ class Gold(BaseJob):
             self.cdc_last_timestamp.table.register()
 
         if self.mode == "invoke":
-            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"job": self})
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
         else:
             super().register()
 
@@ -345,17 +369,6 @@ class Gold(BaseJob):
 
         super().drop()
 
-    def optimize(
-        self,
-        vacuum: Optional[bool] = True,
-        optimize: Optional[bool] = True,
-        analyze: Optional[bool] = True,
-    ):
-        if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
-        else:
-            super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
-
     @property
     def cdc_last_timestamp(self) -> NoCDC:
         assert self.mode == "update", "persist_last_timestamp only allowed in update"
@@ -387,15 +400,15 @@ class Gold(BaseJob):
         else:
             self.cdc_last_timestamp.overwrite(df)
 
-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         if self.mode == "invoke":
-            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"job": self})
+            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
             return
 
         elif self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"job": self})
+            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"label": self})
             self.create_or_replace_view()
             return
 
         self.overwrite_schema()
-        self.run(reload=True)
+        self.run(reload=True, schedule=schedule)
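The schema_only flag added to Gold.get_data above short-circuits the read by filtering with "1 == 2", so callers can resolve a job's output schema without materializing rows. A minimal usage sketch, assuming `job` is a Gold job instance obtained through the fabricks API (the variable name is illustrative):

    df = job.get_data(stream=False, transform=True, schema_only=True)
    print(df.schema.simpleString())  # column names and types are available
    assert df.isEmpty()              # but no rows are returned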
fabricks/core/jobs/silver.py CHANGED
@@ -95,7 +95,13 @@ class Silver(BaseJob):
         )
         return df
 
-    def get_data(self, stream: bool = False, transform: Optional[bool] = False) -> DataFrame:
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> DataFrame:
         deps = self.get_dependencies()
         assert deps, "not dependency found"
 
@@ -139,7 +145,7 @@ class Silver(BaseJob):
                 dfs.append(df)
 
             except Exception as e:
-                DEFAULT_LOGGER.exception("could not get dependencies", extra={"job": self})
+                DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": self})
                 raise e
 
         df = concat_dfs(dfs)
@@ -151,6 +157,9 @@ class Silver(BaseJob):
         if transform:
             df = self.base_transform(df)
 
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df
 
     def get_dependencies(self) -> Sequence[JobDependency]:
@@ -186,7 +195,7 @@ class Silver(BaseJob):
 
             sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("view", extra={"job": self, "sql": sql})
+            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
             self.spark.sql(sql)
 
         else:
@@ -195,7 +204,7 @@ class Silver(BaseJob):
             parent = deps[0].parent
             sql = f"select * from {parent}"
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("view", extra={"job": self, "sql": sql})
+            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
 
             df = self.spark.sql(sql)
             cdc_options = self.get_cdc_context(df)
@@ -205,7 +214,7 @@ class Silver(BaseJob):
         from py4j.protocol import Py4JJavaError
 
         try:
-            DEFAULT_LOGGER.debug("create or replace current view", extra={"job": self})
+            DEFAULT_LOGGER.debug("create or replace current view", extra={"label": self})
 
             df = self.spark.sql(f"select * from {self.qualified_name}")
 
@@ -222,23 +231,23 @@ class Silver(BaseJob):
                 {where_clause}
             """
             # sql = fix_sql(sql)
-            # DEFAULT_LOGGER.debug("current view", extra={"job": self, "sql": sql})
+            # DEFAULT_LOGGER.debug("current view", extra={"label": self, "sql": sql})
             self.spark.sql(sql)
 
-        except Py4JJavaError:
-            DEFAULT_LOGGER.exception("could not create or replace view", extra={"job": self})
+        except Py4JJavaError as e:
+            DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)
 
-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         self.truncate()
-        self.run()
+        self.run(schedule=schedule)
 
     def overwrite_schema(self, df: Optional[DataFrame] = None):
-        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"job": self})
+        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__check"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
 
         not_append = not self.mode == "append"
         nocdc = self.change_data_capture == "nocdc"
@@ -265,12 +274,12 @@ class Silver(BaseJob):
                 1
             """
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("check", extra={"job": self, "sql": sql})
+            DEFAULT_LOGGER.debug("check", extra={"label": self, "sql": sql})
 
             check_df = self.spark.sql(sql)
             if not check_df.isEmpty():
                 rectify = True
-                DEFAULT_LOGGER.debug("rectify enabled", extra={"job": self})
+                DEFAULT_LOGGER.debug("rectify enabled", extra={"label": self})
 
         context = {
             "soft_delete": self.slowly_changing_dimension,
@@ -279,29 +288,30 @@ class Silver(BaseJob):
             "order_duplicate_by": order_duplicate_by,
         }
 
-        if self.slowly_changing_dimension:
-            if "__key" not in df.columns:
-                context["add_key"] = True
-
         if self.mode == "memory":
             context["mode"] = "complete"
-        if self.mode == "latest":
-            context["slice"] = "latest"
 
-        if self.change_data_capture == "scd2":
-            context["correct_valid_from"] = True
+        if self.slowly_changing_dimension:
+            if "__key" not in df.columns:
+                context["add_key"] = True
 
-        if nocdc:
-            if "__operation" in df.columns:
-                context["except"] = ["__operation"]
         if nocdc and self.mode == "memory":
             if "__operation" not in df.columns:
                 context["add_operation"] = "upsert"
-                context["except"] = ["__operation"]
 
+        if self.mode == "latest":
+            context["slice"] = "latest"
         if not self.stream and self.mode == "update":
             context["slice"] = "update"
 
+        if self.change_data_capture == "scd2":
+            context["correct_valid_from"] = True
+
+        if "__operation" in df.columns:
+            context["exclude"] = ["__operation"]
+        if nocdc: # operation is passed from the bronze layer
+            context["exclude"] = ["__operation"]
+
         return context
 
     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
@@ -313,12 +323,12 @@ class Silver(BaseJob):
         name = f"{self.step}_{self.topic}_{self.item}"
         if batch is not None:
             name = f"{name}__{batch}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"
 
         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"job": self})
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return
 
         if self.mode == "update":
@@ -359,16 +369,5 @@ class Silver(BaseJob):
 
     def drop(self):
         super().drop()
-        DEFAULT_LOGGER.debug("drop current view", extra={"job": self})
+        DEFAULT_LOGGER.debug("drop current view", extra={"label": self})
         self.spark.sql(f"drop view if exists {self.qualified_name}__current")
-
-    def optimize(
-        self,
-        vacuum: Optional[bool] = True,
-        optimize: Optional[bool] = True,
-        analyze: Optional[bool] = True,
-    ):
-        if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
-        else:
-            super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
fabricks/core/masks.py ADDED
@@ -0,0 +1,52 @@
+import os
+from typing import List, Optional
+
+from pyspark.sql import SparkSession
+
+from fabricks.context import CATALOG, PATH_MASKS, SPARK
+from fabricks.context.log import DEFAULT_LOGGER
+
+
+def register_all_masks():
+    """
+    Register all masks.
+    """
+
+    DEFAULT_LOGGER.info("register masks")
+    for mask in get_masks():
+        split = mask.split(".")
+        try:
+            register_mask(mask=split[0])
+        except Exception as e:
+            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+
+
+def get_masks() -> List[str]:
+    return [os.path.basename(f) for f in PATH_MASKS.walk()]
+
+
+def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
+    if spark is None:
+        spark = SPARK
+    assert spark is not None
+
+    df = spark.sql("show user functions in default")
+
+    if CATALOG:
+        df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
+    else:
+        df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")
+
+    return not df.isEmpty()
+
+
+def register_mask(mask: str, spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = SPARK
+    assert spark is not None
+
+    if not is_registered(mask, spark):
+        DEFAULT_LOGGER.debug(f"register mask {mask}")
+
+        path = PATH_MASKS.joinpath(f"{mask}.sql")
+        spark.sql(path.get_sql())
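The new masks module resolves mask definitions as SQL files under PATH_MASKS and registers them as mask_<name> functions in the default schema. A hedged usage sketch (the mask name "email" is illustrative, not taken from the package):

    from fabricks.core.masks import is_registered, register_all_masks, register_mask

    register_all_masks()           # register every mask file found under PATH_MASKS
    register_mask("email")         # or register a single mask; skipped if already registered
    print(is_registered("email"))  # True once default.mask_email exists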
fabricks/core/parsers/base.py CHANGED
@@ -6,7 +6,7 @@ from pyspark.sql.functions import col, expr, from_json, lit
 from pyspark.sql.types import MapType, StringType
 
 from fabricks.core.parsers._types import ParserOptions
-from fabricks.core.utils import clean
+from fabricks.core.parsers.utils import clean
 from fabricks.utils.path import Path
 from fabricks.utils.read.read import read
 
@@ -26,7 +26,7 @@ class BaseParser(ABC):
             "__timestamp",
             expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
         )
-        df = df.withColumn("__timestamp", expr("to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
+        df = df.withColumn("__timestamp", expr("try_to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
         df = df.drop("__split", "__split_size")
 
         return df
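Switching from to_timestamp to try_to_timestamp makes the parser tolerant of malformed timestamp fragments: try_to_timestamp returns NULL for values it cannot parse instead of failing the query when ANSI mode is enabled. A small illustration with made-up inputs:

    spark.sql("select try_to_timestamp('20240131120000', 'yyyyMMddHHmmss')").show()  # 2024-01-31 12:00:00
    spark.sql("select try_to_timestamp('not-a-date', 'yyyyMMddHHmmss')").show()      # NULL, no error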
fabricks/core/schedules/__init__.py ADDED
@@ -0,0 +1,14 @@
+from fabricks.core.schedules.generate import generate
+from fabricks.core.schedules.process import process
+from fabricks.core.schedules.run import run
+from fabricks.core.schedules.terminate import terminate
+from fabricks.core.schedules.views import create_or_replace_view, create_or_replace_views
+
+__all__ = [
+    "process",
+    "generate",
+    "terminate",
+    "run",
+    "create_or_replace_view",
+    "create_or_replace_views",
+]
fabricks/core/schedules/diagrams.py ADDED
@@ -0,0 +1,46 @@
+from pyspark.sql import DataFrame
+
+
+def get_dependencies(name: str) -> DataFrame:
+    from fabricks.core.dags import DagGenerator
+
+    g = DagGenerator(schedule=name)
+    return g.get_dependencies()
+
+
+def get_mermaid_diagram(name: str) -> str:
+    df = get_dependencies(name)
+
+    df = df.withColumnRenamed("ParentId", "parent_id")
+    df = df.withColumnRenamed("Parent", "parent")
+    df = df.withColumnRenamed("JobId", "job_id")
+    df = df.withColumnRenamed("Job", "job")
+
+    dependencies = df.select("parent_id", "parent", "job_id", "job").collect()
+
+    out = "flowchart TD\n"
+
+    unique_nodes = set()
+
+    for row in dependencies:
+        parent_id = str(row["parent_id"])
+        parent_name = str(row["parent"])
+        child_id = str(row["job_id"])
+        child_name = str(row["job"])
+
+        if parent_id != "0" and parent_id is not None:
+            if parent_id not in unique_nodes:
+                out += f" {parent_id}[{parent_name}]\n"
+                unique_nodes.add(parent_id)
+
+            if child_id not in unique_nodes:
+                out += f" {child_id}[{child_name}]\n"
+                unique_nodes.add(child_id)
+
+            out += f" {parent_id} --> {child_id}\n"
+        else:
+            if child_id not in unique_nodes:
+                out += f" {child_id}[{child_name}]\n"
+                unique_nodes.add(child_id)
+
+    return out
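get_mermaid_diagram renders a schedule's job dependencies as a Mermaid flowchart, one node per job and one edge per parent/child pair. A hedged usage sketch (the schedule name "daily" is illustrative):

    from fabricks.core.schedules.diagrams import get_mermaid_diagram

    diagram = get_mermaid_diagram("daily")
    print(diagram)
    # flowchart TD
    #  <parent_job_id>[bronze_topic_item]
    #  <job_id>[silver_topic_item]
    #  <parent_job_id> --> <job_id>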
fabricks/core/schedules/get_schedule.py ADDED
@@ -0,0 +1,5 @@
+from fabricks.core.jobs.get_schedule import get_schedule # void circular import
+
+__all__ = [
+    "get_schedule",
+]
fabricks/core/schedules/get_schedules.py ADDED
@@ -0,0 +1,9 @@
+from fabricks.core.jobs.get_schedules import ( # void circular import
+    get_schedules,
+    get_schedules_df,
+)
+
+__all__ = [
+    "get_schedules",
+    "get_schedules_df",
+]
fabricks/core/schedules/run.py ADDED
@@ -0,0 +1,3 @@
+from fabricks.core.dags.run import run
+
+__all__ = ["run"]
fabricks/core/schedules/views.py ADDED
@@ -0,0 +1,61 @@
+from fabricks.context import SPARK
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.schedules.get_schedule import get_schedule
+from fabricks.core.schedules.get_schedules import get_schedules_df
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+def create_or_replace_view_internal(name: str, options: dict):
+    step = "-- no step provided"
+    tag = "-- no tag provided"
+    view = "-- no view provided"
+
+    assert isinstance(options, dict), "options must be a dict"
+
+    if options.get("steps") is not None:
+        steps = [f"'{s}'" for s in options.get("steps")] # type: ignore
+        step = f"and j.step in ({', '.join(steps)})"
+
+    if options.get("tag") is not None:
+        tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""
+
+    if options.get("view") is not None:
+        view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""
+
+    sql = f"""
+    create or replace view fabricks.{name}_schedule
+    as
+    select
+      j.*
+    from
+      fabricks.jobs j
+      {view}
+    where
+      true
+      {step}
+      {tag}
+      and j.type not in ('manual')
+    """
+    sql = fix_sql(sql)
+    DEFAULT_LOGGER.debug("create or replace (schedule) view", extra={"label": f"fabricks.{name}_schedule", "sql": sql})
+
+    SPARK.sql(sql)
+
+
+def create_or_replace_view(name: str):
+    sc = get_schedule(name=name)
+    try:
+        create_or_replace_view_internal(sc["name"], sc["options"])
+    except Exception as e:
+        DEFAULT_LOGGER.exception(f"could not create nor replace view {sc['name']}", exc_info=e)
+
+
+def create_or_replace_views():
+    DEFAULT_LOGGER.info("create or replace (schedule) views")
+
+    df = get_schedules_df()
+    for row in df.collect():
+        try:
+            create_or_replace_view_internal(row.name, row.options.asDict())
+        except Exception as e:
+            DEFAULT_LOGGER.exception(f"could not create nor replace view {row.name}", exc_info=e)