fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/core/steps/base.py CHANGED
@@ -1,15 +1,15 @@
  import logging
- from typing import Iterable, List, Literal, Optional, Tuple, Union, cast
+ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
  from pyspark.sql import DataFrame
  from pyspark.sql.functions import expr, md5
  from pyspark.sql.types import Row
  from typing_extensions import deprecated
 
- from fabricks.cdc import SCD1
+ from fabricks.cdc import NoCDC
  from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
  from fabricks.context.log import DEFAULT_LOGGER
- from fabricks.core.jobs.base._types import Bronzes, Golds, JobDependency, SchemaDependencies, Silvers, TStep
+ from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
  from fabricks.core.jobs.get_job import get_job
  from fabricks.core.steps._types import Timeouts
  from fabricks.core.steps.get_step_conf import get_step_conf
@@ -98,53 +98,66 @@ class BaseStep:
  return self._options
 
  def drop(self):
- DEFAULT_LOGGER.warning("💣 (drop)", extra={"step": self})
+ DEFAULT_LOGGER.warning("drop", extra={"label": self})
 
  fs = self.database.storage
  assert fs
 
  tmp = fs.joinpath("tmp")
  if tmp.exists():
+ DEFAULT_LOGGER.debug("clean tmp folder", extra={"label": self})
  tmp.rm()
 
  checkpoint = fs.joinpath("checkpoints")
  if checkpoint.exists():
+ DEFAULT_LOGGER.debug("clean checkpoint folder", extra={"label": self})
  checkpoint.rm()
 
  schema = fs.joinpath("schemas")
  if schema.exists():
+ DEFAULT_LOGGER.debug("clean schema folder", extra={"label": self})
  schema.rm()
 
+ DEFAULT_LOGGER.debug("clean fabricks", extra={"label": self})
  for t in ["jobs", "tables", "dependencies", "views"]:
  tbl = Table("fabricks", self.name, t)
  tbl.drop()
 
+ try:
+ SPARK.sql(f"delete from fabricks.steps where step = '{self}'")
+ except Exception:
+ pass
+
  self.database.drop()
 
  def create(self):
- DEFAULT_LOGGER.info("🌟 (create)", extra={"step": self})
+ DEFAULT_LOGGER.info("create", extra={"label": self})
 
  if not self.runtime.exists():
- DEFAULT_LOGGER.warning(f"{self.name} not found in runtime ({self.runtime})")
+ DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
  else:
  self.update()
 
  def update(self, update_dependencies: Optional[bool] = True, progress_bar: Optional[bool] = False):
  if not self.runtime.exists():
- DEFAULT_LOGGER.warning(f"{self.name} not found in runtime ({self.runtime})")
+ DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
 
  else:
  if not self.database.exists():
  self.database.create()
 
- self.update_jobs()
- self.create_db_objects()
+ self.update_configurations()
+ errors = self.create_db_objects()
+
+ for e in errors:
+ DEFAULT_LOGGER.exception("fail to create db object", extra={"label": e["job"]}, exc_info=e["error"])
 
  if update_dependencies:
  self.update_dependencies(progress_bar=progress_bar)
 
  self.update_tables_list()
  self.update_views_list()
+ self.update_steps_list()
 
  def get_dependencies(
  self,
@@ -152,19 +165,8 @@
  topic: Optional[Union[str, List[str]]] = None,
  include_manual: Optional[bool] = False,
  loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
- ) -> Tuple[DataFrame, List[str]]:
- DEFAULT_LOGGER.debug("get dependencies", extra={"step": self})
-
- errors = []
- dependencies: list[JobDependency] = []
-
- def _get_dependencies(row: Row):
- job = get_job(step=self.name, job_id=row["job_id"])
- try:
- dependencies.extend(job.get_dependencies())
- except Exception as e:
- DEFAULT_LOGGER.exception("failed to get dependencies", extra={"job": job})
- errors.append((job, e))
+ ) -> Tuple[DataFrame, List[Dict]]:
+ DEFAULT_LOGGER.debug("get dependencies", extra={"label": self})
 
  df = self.get_jobs()
 
@@ -176,18 +178,25 @@
  topic = [topic]
 
  where = ", ".join([f"'{t}'" for t in topic])
- DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"label": self})
  df = df.where(f"topic in ({where})")
 
  if not df:
  raise ValueError("no jobs found")
 
- DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_get_dependencies, df, workers=16, progress_bar=progress_bar)
- if loglevel:
- DEFAULT_LOGGER.setLevel(loglevel)
- else:
- DEFAULT_LOGGER.setLevel(LOGLEVEL)
+ results = run_in_parallel(
+ _get_dependencies,
+ df,
+ workers=16,
+ progress_bar=progress_bar,
+ logger=DEFAULT_LOGGER,
+ loglevel=logging.CRITICAL,
+ )
+
+ errors = [res for res in results if res.get("error")]
+ dependencies = []
+ for res in [res for res in results if res.get("dependencies")]:
+ dependencies.extend(res.get("dependencies"))
 
  df = self.spark.createDataFrame([d.model_dump() for d in dependencies], SchemaDependencies) # type: ignore
  return df, errors
@@ -196,7 +205,7 @@
  return read_yaml(self.runtime, root="job", preferred_file_name=topic)
 
  def get_jobs(self, topic: Optional[str] = None) -> DataFrame:
- DEFAULT_LOGGER.debug("get jobs", extra={"step": self})
+ DEFAULT_LOGGER.debug("get jobs", extra={"label": self})
 
  try:
  conf = get_step_conf(self.name)
@@ -216,21 +225,11 @@
  return df
 
  except AssertionError as e:
- DEFAULT_LOGGER.exception("failed to get jobs", extra={"step": self})
+ DEFAULT_LOGGER.exception("fail to get jobs", extra={"label": self})
  raise e
 
- def create_db_objects(self, retry: Optional[bool] = True) -> List[str]:
- DEFAULT_LOGGER.info("create db objects", extra={"step": self})
-
- errors = []
-
- def _create_db_object(row: Row):
- job = get_job(step=self.name, job_id=row["job_id"])
- try:
- job.create()
- except: # noqa E722
- DEFAULT_LOGGER.exception("not created", extra={"job": self})
- errors.append(job)
+ def create_db_objects(self, retry: Optional[bool] = True) -> List[Dict]:
+ DEFAULT_LOGGER.info("create db objects", extra={"label": self})
 
  df = self.get_jobs()
  table_df = self.database.get_tables()
@@ -240,22 +239,29 @@
  df = df.join(view_df, "job_id", how="left_anti")
 
  if df:
- DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_create_db_object, df, workers=16, progress_bar=True)
- DEFAULT_LOGGER.setLevel(LOGLEVEL)
+ results = run_in_parallel(
+ _create_db_object,
+ df,
+ workers=16,
+ progress_bar=True,
+ logger=DEFAULT_LOGGER,
+ loglevel=logging.CRITICAL,
+ )
 
  self.update_tables_list()
  self.update_views_list()
 
+ errors = [res for res in results if res.get("error")]
+
  if errors:
  if retry:
- DEFAULT_LOGGER.warning("retry create jobs", extra={"step": self})
+ DEFAULT_LOGGER.warning("retry to create jobs", extra={"label": self})
  return self.create_db_objects(retry=False)
 
  return errors
 
  @deprecated("use create_db_objects instead")
- def create_jobs(self, retry: Optional[bool] = True) -> List[str]:
+ def create_jobs(self, retry: Optional[bool] = True) -> List[Dict]:
  return self.create_db_objects(retry=retry)
 
  @deprecated("use update_configurations instead")
@@ -265,19 +271,19 @@
  def update_configurations(self, drop: Optional[bool] = False):
  df = self.get_jobs()
 
- DEFAULT_LOGGER.info("update configurations", extra={"step": self})
+ DEFAULT_LOGGER.info("update configurations", extra={"label": self})
 
- scd1 = SCD1("fabricks", self.name, "jobs")
+ cdc = NoCDC("fabricks", self.name, "jobs")
 
  if drop:
- scd1.table.drop()
- elif scd1.table.exists():
- diffs = scd1.get_differences_with_deltatable(df)
- if diffs:
- DEFAULT_LOGGER.warning("schema drift detected", extra={"step": self})
- scd1.table.overwrite_schema(df=df)
+ cdc.table.drop()
+ elif cdc.table.exists():
+ df_diffs = cdc.get_differences_with_deltatable(df)
+ if not df_diffs.isEmpty():
+ DEFAULT_LOGGER.warning("schema drift detected", extra={"label": self})
+ cdc.table.overwrite_schema(df=df)
 
- scd1.delete_missing(df, keys=["job_id"])
+ cdc.delete_missing(df, keys=["job_id"])
 
  @deprecated("use update_tables_list instead")
  def update_tables(self):
@@ -287,8 +293,8 @@
  df = self.database.get_tables()
  df = df.withColumn("job_id", expr("md5(table)"))
 
- DEFAULT_LOGGER.info("update tables list", extra={"step": self})
- SCD1("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+ DEFAULT_LOGGER.info("update tables list", extra={"label": self})
+ NoCDC("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
 
  @deprecated("use update_views_list instead")
  def update_views(self):
@@ -298,8 +304,8 @@
  df = self.database.get_views()
  df = df.withColumn("job_id", expr("md5(view)"))
 
- DEFAULT_LOGGER.info("update views list", extra={"step": self})
- SCD1("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+ DEFAULT_LOGGER.info("update views list", extra={"label": self})
+ NoCDC("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
 
  def update_dependencies(
  self,
@@ -307,7 +313,7 @@
  topic: Optional[Union[str, List[str]]] = None,
  include_manual: Optional[bool] = False,
  loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
- ) -> List[str]:
+ ) -> List[Dict]:
  df, errors = self.get_dependencies(
  progress_bar=progress_bar,
  topic=topic,
@@ -316,7 +322,7 @@
  )
  df.cache()
 
- DEFAULT_LOGGER.info("update dependencies", extra={"step": self})
+ DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
  update_where = None
 
@@ -327,9 +333,9 @@
  )
 
  if update_where:
- DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
- SCD1("fabricks", self.name, "dependencies").delete_missing(
+ NoCDC("fabricks", self.name, "dependencies").delete_missing(
  df,
  keys=["dependency_id"],
  update_where=update_where,
@@ -347,9 +353,9 @@
  update_where = (
  f"""job_id in (select job_id from fabricks.{self.name}_jobs where {where_topic} {where_not_manual})"""
  )
- DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"step": self})
+ DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
- SCD1("fabricks", self.name, "dependencies").delete_missing(
+ NoCDC("fabricks", self.name, "dependencies").delete_missing(
  df,
  keys=["dependency_id"],
  update_where=update_where,
@@ -359,10 +365,6 @@
  return errors
 
  def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
- def _register(row: Row):
- job = get_job(step=self.name, topic=row["topic"], item=row["item"])
- job.register()
-
  if drop:
  SPARK.sql(f"drop database if exists {self.name} cascade ")
  SPARK.sql(f"create database {self.name}")
@@ -378,8 +380,44 @@
 
  if df:
  DEFAULT_LOGGER.setLevel(logging.CRITICAL)
- run_in_parallel(_register, df, workers=16, progress_bar=True)
+ run_in_parallel(_register, df, workers=16, progress_bar=True, run_as="Pool")
  DEFAULT_LOGGER.setLevel(LOGLEVEL)
 
+ def update_steps_list(self):
+ order = self.options.get("order", 0)
+ df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
+
+ NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
+
  def __str__(self):
  return self.name
+
+
+ # to avoid AttributeError: can't pickle local object
+ def _get_dependencies(row: Row):
+ job = get_job(step=row["step"], job_id=row["job_id"])
+ try:
+ return {"job": str(job), "dependencies": job.get_dependencies()}
+ except Exception as e:
+ DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+ return {"job": str(job), "error": e}
+
+
+ def _create_db_object(row: Row):
+ job = get_job(step=row["step"], job_id=row["job_id"])
+ try:
+ job.create()
+ return {"job": str(job)}
+ except Exception as e: # noqa E722
+ DEFAULT_LOGGER.exception("fail to create db object", extra={"label": job})
+ return {"job": str(job), "error": e}
+
+
+ def _register(row: Row):
+ job = get_job(step=row["step"], topic=row["topic"], item=row["item"])
+ try:
+ job.register()
+ return {"job": str(job)}
+ except Exception as e:
+ DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+ return {"job": str(job), "error": e}
fabricks/core/udfs.py CHANGED
@@ -11,29 +11,25 @@ from fabricks.context.log import DEFAULT_LOGGER
  UDFS: dict[str, Callable] = {}
 
 
- def register_all_udfs():
+ def register_all_udfs(extension: Optional[str] = None):
  """
  Register all user-defined functions (UDFs).
-
- This function iterates over all UDFs returned by the `get_udfs` function,
- splits the UDF name into the function name and extension, and attempts to
- register the UDF using the `register_udf` function. If an exception occurs
- during registration, an error message is logged.
-
- Returns:
- None
  """
- for udf in get_udfs():
+ DEFAULT_LOGGER.info("register udfs")
+
+ for udf in get_udfs(extension=extension):
  split = udf.split(".")
  try:
  register_udf(udf=split[0], extension=split[1])
- except Exception:
- DEFAULT_LOGGER.exception(f"udf {udf} not registered")
+ except Exception as e:
+ DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e)
 
 
- def get_udfs() -> List[str]:
+ def get_udfs(extension: Optional[str] = None) -> List[str]:
  files = [os.path.basename(f) for f in PATH_UDFS.walk()]
  udfs = [f for f in files if not str(f).endswith("__init__.py") and not str(f).endswith(".requirements.txt")]
+ if extension:
+ udfs = [f for f in udfs if f.endswith(f".{extension}")]
  return udfs
 
 
@@ -63,22 +59,15 @@ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
 
  def register_udf(udf: str, extension: Optional[str] = None, spark: Optional[SparkSession] = None):
  """
- Register a user-defined function (UDF) in Spark.
-
- Args:
- udf (str): The name of the UDF to register.
- extension (Optional[str]): The file extension of the UDF implementation file. If not provided, it will be inferred from the UDF name.
- spark (Optional[SparkSession]): The SparkSession object. If not provided, a new SparkSession will be created.
-
- Raises:
- ValueError: If the UDF implementation file is not found or if the UDF name is not found.
-
+ Register a user-defined function (UDF).
  """
  if spark is None:
  spark = SPARK
  assert spark is not None
 
  if not is_registered(udf, spark):
+ DEFAULT_LOGGER.debug(f"register udf {udf}")
+
  if extension is None:
  extension = get_extension(udf)
 
fabricks/core/views.py CHANGED
@@ -7,28 +7,35 @@ from fabricks.utils.sqlglot import fix as fix_sql
  def create_or_replace_view_internal(path: Path):
  sql = path.get_sql()
  file_name = path.get_file_name().split(".")[0]
- sql = f"""
- create or replace view fabricks.{file_name}
- as
- {sql}
- """
- sql = fix_sql(sql)
- DEFAULT_LOGGER.debug(f"schedule - %sql\n---\n{sql}\n---")
 
- SPARK.sql(sql)
+ try:
+ sql = f"""
+ create or replace view fabricks.{file_name}
+ as
+ {sql}
+ """
+ sql = fix_sql(sql)
+ DEFAULT_LOGGER.debug("create or replace (custom) view", extra={"label": f"fabricks.{file_name}", "sql": sql})
+
+ SPARK.sql(sql)
+
+ except Exception as e:
+ DEFAULT_LOGGER.exception(
+ "could not create nor replace (custom) view", extra={"label": f"fabricks.{file_name}", "exc_info": e}
+ )
+ raise e
 
 
  def create_or_replace_view(name: str):
  p = PATH_VIEWS.joinpath(f"{name}.sql")
- try:
- create_or_replace_view_internal(p)
- except Exception:
- DEFAULT_LOGGER.warning(f"schedule - {name} not created nor replace")
+ create_or_replace_view_internal(p)
 
 
  def create_or_replace_views():
+ DEFAULT_LOGGER.info("create or replace (custom) views")
+
  for p in PATH_VIEWS.walk(file_format="sql", convert=True):
  try:
  create_or_replace_view_internal(p)
  except Exception:
- DEFAULT_LOGGER.warning(f"schedule - {p.get_file_name()} not created nor replace")
+ pass
fabricks/deploy/__init__.py ADDED
@@ -0,0 +1,97 @@
+ import logging
+ from typing import List, Optional, Union, cast
+
+ from fabricks.context import FABRICKS_STORAGE
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import Steps, TStep
+ from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
+ from fabricks.core.steps.base import BaseStep
+ from fabricks.core.views import create_or_replace_views as create_or_replace_custom_views
+ from fabricks.deploy.masks import deploy_masks
+ from fabricks.deploy.notebooks import deploy_notebooks
+ from fabricks.deploy.schedules import deploy_schedules
+ from fabricks.deploy.tables import deploy_tables
+ from fabricks.deploy.udfs import deploy_udfs
+ from fabricks.deploy.utils import print_atomic_bomb
+ from fabricks.deploy.views import deploy_views
+ from fabricks.metastore.database import Database
+
+
+ class Deploy:
+ @staticmethod
+ def tables(drop: bool = False):
+ deploy_tables(drop=drop)
+
+ @staticmethod
+ def views():
+ deploy_views()
+
+ create_or_replace_custom_views()
+ create_or_replace_schedules_views()
+
+ @staticmethod
+ def udfs():
+ deploy_udfs()
+
+ @staticmethod
+ def masks():
+ deploy_masks()
+
+ @staticmethod
+ def notebooks():
+ deploy_notebooks()
+
+ @staticmethod
+ def schedules():
+ deploy_schedules()
+
+ @staticmethod
+ def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]], nowait: bool = False):
+ DEFAULT_LOGGER.warning("!💥 armageddon 💥!")
+ print_atomic_bomb(nowait=nowait)
+
+ DEFAULT_LOGGER.setLevel(logging.INFO)
+
+ if steps is None:
+ steps = Steps
+ assert steps is not None
+
+ if isinstance(steps, str):
+ steps = [cast(TStep, steps)]
+ elif isinstance(steps, List):
+ steps = [cast(TStep, s) for s in steps]
+ elif isinstance(steps, TStep):
+ steps = [steps]
+
+ fabricks = Database("fabricks")
+ fabricks.drop()
+
+ for s in steps:
+ step = BaseStep(s)
+ step.drop()
+
+ tmp = FABRICKS_STORAGE.joinpath("tmp")
+ tmp.rm()
+
+ checkpoint = FABRICKS_STORAGE.joinpath("checkpoints")
+ checkpoint.rm()
+
+ schema = FABRICKS_STORAGE.joinpath("schemas")
+ schema.rm()
+
+ schedule = FABRICKS_STORAGE.joinpath("schedules")
+ schedule.rm()
+
+ fabricks.create()
+
+ Deploy.tables(drop=True)
+ Deploy.udfs()
+ Deploy.masks()
+ Deploy.notebooks()
+
+ for s in steps:
+ step = BaseStep(s)
+ step.create()
+
+ Deploy.views()
+ Deploy.schedules()
fabricks/deploy/masks.py ADDED
@@ -0,0 +1,8 @@
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.masks import register_all_masks
+
+
+ def deploy_masks():
+ DEFAULT_LOGGER.info("create or replace masks")
+
+ register_all_masks()
fabricks/deploy/notebooks.py ADDED
@@ -0,0 +1,71 @@
+ import base64
+ import io
+ import os
+ from importlib import resources
+
+ from databricks.sdk import WorkspaceClient
+ from databricks.sdk.service import workspace
+
+ from fabricks.context import PATH_NOTEBOOKS
+ from fabricks.context.log import DEFAULT_LOGGER
+
+
+ def deploy_notebook(notebook: str):
+ from fabricks.api import notebooks
+
+ DEFAULT_LOGGER.debug(f"overwrite {notebook}")
+
+ w = WorkspaceClient()
+
+ target = f"{PATH_NOTEBOOKS}/{notebook}.py"
+ src = resources.files(notebooks) / f"{notebook}.py"
+
+ with io.open(src, "rb") as file: # type: ignore
+ content = file.read()
+
+ encoded = base64.b64encode(content).decode("utf-8")
+
+ w.workspace.import_(
+ path=target,
+ content=encoded,
+ format=workspace.ImportFormat.AUTO,
+ language=workspace.Language.PYTHON,
+ overwrite=True,
+ )
+
+
+ def deploy_notebooks():
+ DEFAULT_LOGGER.info("overwrite notebooks")
+
+ _create_dir_if_not_exists()
+ _clean_dir()
+
+ for n in [
+ "cluster",
+ "initialize",
+ "process",
+ "schedule",
+ "run",
+ "terminate",
+ ]:
+ deploy_notebook(notebook=n)
+
+
+ def _create_dir_if_not_exists():
+ dir = str(PATH_NOTEBOOKS)
+ os.makedirs(dir, exist_ok=True)
+
+
+ def _clean_dir():
+ dir = str(PATH_NOTEBOOKS)
+ for n in [
+ "cluster",
+ "initialize",
+ "process",
+ "schedule",
+ "run",
+ "terminate",
+ ]:
+ file_path = os.path.join(dir, f"{n}.py")
+ if os.path.isfile(file_path):
+ os.remove(file_path)
fabricks/deploy/schedules.py ADDED
@@ -0,0 +1,8 @@
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.schedules import create_or_replace_views
+
+
+ def deploy_schedules():
+ DEFAULT_LOGGER.info("create or replace schedules")
+
+ create_or_replace_views()