fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
@@ -0,0 +1,509 @@
+ from fabricks.context import SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import Steps
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ def deploy_views():
+     DEFAULT_LOGGER.info("create or replace fabricks (default) views")
+
+     create_or_replace_jobs_view()
+     create_or_replace_tables_view()
+     create_or_replace_views_view()
+     create_or_replace_logs_pivot_view()
+     create_or_replace_last_schedule_view()
+     create_or_replace_last_status_view()
+     create_or_replace_previous_schedule_view()
+     create_or_replace_schedules_view()
+     create_or_replace_dependencies_view()
+     create_or_replace_dependencies_flat_view()
+     create_or_replace_dependencies_unpivot_view()
+     create_or_replace_dependencies_circular_view()
+     create_or_replace_jobs_to_be_updated_view()
+
+
+ def create_or_replace_jobs_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_jobs"
+         try:
+             try:
+                 SPARK.sql(f"select options.change_data_capture from fabricks.{table}")
+                 change_data_capture = "coalesce(options.change_data_capture, 'nocdc') as change_data_capture"
+             except Exception:
+                 change_data_capture = "'nocdc' as change_data_capture"
+
+             dml = f"""
+             select
+               j.step,
+               s.expand,
+               j.job_id,
+               j.topic,
+               j.item,
+               concat(j.step, '.', j.topic, '_', j.item) as job,
+               j.options.mode,
+               {change_data_capture},
+               coalesce(j.options.type, 'default') as type,
+               tags,
+               case
+                 when s.expand == "bronze" then if(j.options.mode in ("append", "register"), "table", null)
+                 when
+                   s.expand == "silver"
+                 then
+                   if(
+                     j.options.mode in ("update", "append", "latest"),
+                     "table",
+                     if(j.options.mode in ("combine", "memory"), "view", null)
+                   )
+                 when
+                   s.expand == "gold"
+                 then
+                   if(j.options.mode in ("update", "append", "complete"), "table", if(j.options.mode in ("memory"), "view", null))
+               end as object_type
+             from
+               fabricks.{table} j
+               left join fabricks.steps s on s.step = j.step
+             """
+             SPARK.sql(dml)  # Check if the table exists
+             dmls.append(dml)
+
+         except Exception:
+             DEFAULT_LOGGER.debug(f"could not find fabricks.{table}")
+
+     sql = f"""create or replace view fabricks.jobs with schema evolution as {" union all ".join(dmls)}"""
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.jobs", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_tables_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_tables"
+         try:
+             dml = f"""
+             select
+               '{step}' as step,
+               job_id,
+               table
+             from
+               fabricks.{table}
+             """
+             SPARK.sql(dml)  # Check if the table exists
+             dmls.append(dml)
+
+         except Exception:
+             DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_tables")
+
+     sql = f"""create or replace view fabricks.tables with schema evolution as {" union all ".join(dmls)}"""
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.tables", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_views_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_views"
+         try:
+             dml = f"""
+             select
+               '{step}' as step,
+               job_id,
+               view
+             from
+               fabricks.{table}
+             """
+             SPARK.sql(dml)  # Check if the table exists
+             dmls.append(dml)
+
+         except Exception:
+             DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_views")
+
+     sql = f"""create or replace view fabricks.views with schema evolution as {" union all ".join(dmls)}"""
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.views", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_dependencies_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_dependencies"
+         try:
+             dml = f"""
+             select
+               '{step}' as step,
+               dependency_id,
+               job_id,
+               parent_id,
+               parent,
+               origin
+             from
+               fabricks.{table} d
+             """
+             SPARK.sql(dml)  # Check if the table exists
+             dmls.append(dml)
+
+         except Exception:
+             DEFAULT_LOGGER.debug(f"could not find fabricks.{table}")
+
+     sql = f"""create or replace view fabricks.dependencies with schema evolution as {" union all ".join(dmls)}"""
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.dependencies", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_dependencies_flat_view():
+     parent = ",\n ".join([f"d{i + 1}.parent_id as parent_{i + 1}" for i in range(10)])
+     join = "\n ".join(
+         [f"left join fabricks.dependencies d{i + 1} on d{i}.parent_id = d{i + 1}.job_id" for i in range(10)]
+     )
+
+     sql = f"""
+     create or replace view fabricks.dependencies_flat with schema evolution as
+     select
+       d0.job_id,
+       d0.parent_id as parent_0,
+       {parent}
+     from
+       fabricks.dependencies d0
+       {join}
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_dependencies_unpivot_view():
+     sql = """
+     create or replace view fabricks.dependencies_unpivot with schema evolution as
+     with unpvt as (
+       select
+         *
+       from
+         fabricks.dependencies_flat unpivot (
+           (parent_id) for depth in (
+             (parent_0) as depth_00,
+             (parent_1) as depth_01,
+             (parent_2) as depth_02,
+             (parent_3) as depth_03,
+             (parent_4) as depth_04,
+             (parent_5) as depth_05,
+             (parent_6) as depth_06,
+             (parent_7) as depth_07,
+             (parent_8) as depth_08,
+             (parent_9) as depth_09,
+             (parent_10) as depth_10
+           )
+         ) p
+     )
+     select
+       job_id,
+       cast(replace(depth, 'depth_', '') as int) as depth,
+       parent_id
+     from
+       unpvt qualify row_number() over (
+         partition by job_id,
+         parent_id
+         order by
+           depth asc
+       ) = 1
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_dependencies_circular_view():
+     sql = """
+     create or replace view fabricks.dependencies_circular with schema evolution as
+     with d as (
+       select
+         d1.job_id,
+         j1.job,
+         p.job_id as parent_id,
+         p.job as parent
+       from
+         fabricks.dependencies d1
+         left join fabricks.dependencies_unpivot d2 on d2.parent_id = d1.job_id
+         left join fabricks.jobs j1 on d1.job_id = j1.job_id
+         left join fabricks.jobs p on d1.parent_id = p.job_id
+       where
+         true
+         and d1.job_id = d2.job_id
+       group by
+         all
+     )
+     select
+       *
+     from
+       d
+     where
+       true
+       and exists (
+         select
+           1
+         from
+           d d1
+         where
+           d1.job_id = d.parent_id
+       )
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_logs_pivot_view():
+     sql = """
+     create or replace view fabricks.logs_pivot with schema evolution as
+     with groupby as (
+       select
+         l.schedule,
+         l.schedule_id,
+         l.step,
+         l.job,
+         l.job_id,
+         collect_set(l.status) as statuses,
+         array_contains(statuses, 'skipped') as skipped,
+         array_contains(statuses, 'warned') as warned,
+         array_contains(statuses, 'done') or warned as done,
+         array_contains(statuses, 'failed') or (not done and not skipped) as failed,
+         not done and not failed and not skipped and array_contains(statuses, 'running') as timed_out,
+         not array_contains(statuses, 'running') as cancelled,
+         max(l.notebook_id) as notebook_id,
+         max(l.timestamp) filter (where l.status = 'scheduled') as scheduled_time,
+         max(l.timestamp) filter (where l.status = 'waiting') as waiting_time,
+         max(l.timestamp) filter (where l.status = 'running') as start_time,
+         max(l.timestamp) filter (where l.status = 'running') as running_time,
+         max(l.timestamp) filter (where l.status = 'done') as done_time,
+         max(l.timestamp) filter (where l.status = 'failed') as failed_time,
+         max(l.timestamp) filter (where l.status = 'ok') as end_time,
+         max(l.timestamp) filter (where l.status = 'ok') as ok_time,
+         max(l.exception) as exception
+       from
+         fabricks.logs l
+       group by
+         l.schedule, l.schedule_id, l.step, l.job, l.job_id
+     )
+     select
+       g.schedule,
+       g.schedule_id,
+       g.job,
+       g.step,
+       j.topic,
+       j.item,
+       g.job_id,
+       g.done,
+       g.failed,
+       g.timed_out,
+       g.cancelled,
+       g.skipped,
+       g.warned,
+       g.notebook_id,
+       g.start_time,
+       g.end_time,
+       g.scheduled_time,
+       g.waiting_time,
+       g.running_time,
+       g.done_time,
+       g.failed_time,
+       g.ok_time,
+       if(g.timed_out, null, date_diff(SECOND, start_time, end_time)) as duration,
+       g.exception
+     from
+       groupby g
+       left join fabricks.jobs j on g.job_id = j.job_id
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.logs_pivot", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_last_schedule_view():
+     sql = """
+     create or replace view fabricks.last_schedule with schema evolution as
+     with lst as (
+       select
+         schedule_id as last_schedule_id
+       from
+         fabricks.logs_pivot
+       where
+         schedule_id is not null
+       order by
+         start_time desc
+       limit
+         1
+     )
+     select
+       l.*
+     from
+       fabricks.logs_pivot l
+       inner join lst on schedule_id = last_schedule_id
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.last_schedule", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_last_status_view():
+     sql = """
+     create or replace view fabricks.last_status with schema evolution as
+     select
+       job_id,
+       job,
+       step,
+       start_time as time,
+       done,
+       failed,
+       cancelled,
+       timed_out,
+       exception
+     from
+       fabricks.logs_pivot
+     qualify row_number() over (
+       partition by job_id
+       order by
+         start_time desc
+     ) = 1
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.last_status", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_previous_schedule_view():
+     sql = """
+     create or replace view fabricks.previous_schedule with schema evolution as
+     with lst_2 as (
+       select
+         schedule_id as last_schedule_id,
+         max(start_time) as start_time
+       from
+         fabricks.logs_pivot
+       where
+         schedule_id is not null
+       group by
+         all
+       order by
+         start_time desc
+       limit
+         2
+     ), lst as (
+       select
+         last_schedule_id
+       from
+         lst_2
+       order by
+         start_time asc
+       limit
+         1
+     )
+     select
+       l.*
+     from
+       fabricks.logs_pivot l
+       inner join lst on schedule_id = last_schedule_id
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.previous_schedule", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_schedules_view():
+     sql = """
+     create or replace view fabricks.schedules with schema evolution as
+     select
+       schedule,
+       schedule_id,
+       min(start_time) as start_time,
+       max(end_time) as end_time,
+       max(start_time)::date as date,
+       sum(duration) as duration,
+       count(*) as logs,
+       count_if(failed) as failed,
+       count_if(done) as done,
+       count_if(timed_out) as timed_out
+     from
+       fabricks.logs_pivot
+     group by
+       all
+     order by
+       date desc, start_time desc
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.schedules", extra={"sql": sql})
+     SPARK.sql(sql)
+
+
+ def create_or_replace_jobs_to_be_updated_view():
+     sql = """
+     create or replace view fabricks.jobs_to_be_updated with schema evolution as
+     with base as (
+       select
+         j.job,
+         j.job_id,
+         j.step,
+         j.topic,
+         j.item,
+         s.expand,
+         j.mode as mode,
+         j.object_type as object_type
+       from
+         fabricks.jobs j
+         inner join fabricks.steps s on j.step = s.step
+     ),
+     objects as (
+       select
+         `table` as job,
+         job_id,
+         'table' as object_type
+       from
+         fabricks.tables
+       union
+       select
+         `view` as job,
+         job_id,
+         'view' as object_type
+       from
+         fabricks.views
+     )
+     select
+       b.job,
+       b.job_id,
+       b.step,
+       b.topic,
+       b.item,
+       b.expand,
+       b.mode,
+       o.object_type as old_object_type,
+       b.object_type as new_object_type,
+       array(old_object_type, new_object_type) as object_types,
+       (old_object_type is not null and new_object_type is null) or (not old_object_type <=> new_object_type and old_object_type is not null) as is_to_drop,
+       (is_to_drop and new_object_type is not null) or (old_object_type is null and new_object_type is not null) as is_to_register
+     from
+       base b
+       left join objects o on b.job_id = o.job_id
+     """
+     sql = fix_sql(sql)
+
+     DEFAULT_LOGGER.debug("create or replace fabricks.jobs_to_be_updated", extra={"sql": sql})
+     SPARK.sql(sql)
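For orientation, deploy_views() above simply chains the create_or_replace_* helpers: each one builds a "create or replace view ... with schema evolution" statement over the step tables, cleans it with fix_sql, and executes it through SPARK.sql. A minimal, illustrative call from a deploy notebook (assuming a configured Fabricks runtime in which the fabricks database and the per-step tables already exist):

from fabricks.deploy.views import deploy_views

# re-creates the default monitoring views: fabricks.jobs, tables, views,
# logs_pivot, last_schedule, last_status, previous_schedule, schedules,
# dependencies (plus flat/unpivot/circular) and jobs_to_be_updated
deploy_views()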
@@ -0,0 +1,3 @@
+ # BMS DNA Fabricks Metastore
+
+ Metastore - Fabricks
@@ -0,0 +1,5 @@
+ from fabricks.metastore.database import Database
+ from fabricks.metastore.table import Table
+ from fabricks.metastore.view import View
+
+ __all__ = ["Database", "Table", "View"]
@@ -0,0 +1,65 @@
+ from typing import Literal, Optional
+
+ from pydantic import BaseModel
+
+
+ class SchemaDiff(BaseModel):
+     column: str
+     data_type: Optional[str] = None
+     new_column: Optional[str] = None
+     new_data_type: Optional[str] = None
+     status: Literal["added", "changed", "dropped"]
+
+     @property
+     def type_widening_compatible(self) -> bool:
+         if self.status != "changed":
+             return False
+
+         assert self.new_data_type
+         assert self.data_type
+         map = {
+             "byte": {"short", "int", "long", "decimal", "double"},
+             "short": {"int", "long", "decimal", "double"},
+             "int": {"long", "decimal", "double"},
+             "long": {"decimal"},
+             "float": {"double"},
+         }
+         return self.new_data_type.lower() in map.get(self.data_type.lower(), set())
+
+
+ class DroppedColumn(SchemaDiff):
+     def __init__(self, column: str, data_type: Optional[str] = None):
+         super().__init__(
+             column=column,
+             data_type=data_type,
+             status="dropped",
+         )
+
+     def __str__(self):
+         return f"dropped {self.column}"
+
+
+ class AddedColumn(SchemaDiff):
+     def __init__(self, new_column: str, new_data_type: str):
+         super().__init__(
+             column=new_column,
+             new_column=new_column,
+             new_data_type=new_data_type,
+             status="added",
+         )
+
+     def __str__(self):
+         return f"added {self.new_column} with type {self.new_data_type}"
+
+
+ class ChangedColumn(SchemaDiff):
+     def __init__(self, column: str, data_type: str, new_data_type: str):
+         super().__init__(
+             column=column,
+             data_type=data_type,
+             new_data_type=new_data_type,
+             status="changed",
+         )
+
+     def __str__(self):
+         return f"changed {self.column} from {self.data_type} to {self.new_data_type} (widening compatible: {self.type_widening_compatible})"
@@ -0,0 +1,65 @@
+ from typing import Optional
+
+ from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.sql import DataFrame, SparkSession
+ from typing_extensions import deprecated
+
+ from fabricks.context import PATHS_STORAGE, SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore.utils import get_tables, get_views
+ from fabricks.utils.path import Path
+
+
+ class Database:
+     def __init__(self, name: str, spark: Optional[SparkSession] = None):
+         self.name = name
+
+         storage = PATHS_STORAGE.get(self.name)
+         assert storage is not None
+         self.storage = storage
+
+         if spark is None:
+             spark = SPARK
+         assert spark is not None
+         self.spark = spark
+
+     @property
+     @deprecated("use delta_path instead")
+     def deltapath(self) -> Path:
+         return self.storage.joinpath("delta")
+
+     @property
+     def delta_path(self) -> Path:
+         return self.storage.joinpath("delta")
+
+     def create(self):
+         DEFAULT_LOGGER.info("create database", extra={"label": self})
+         self.spark.sql(f"create database if not exists {self.name};")
+
+     def drop(self, rm: Optional[bool] = True):
+         if self.exists():
+             DEFAULT_LOGGER.warning("drop database", extra={"label": self})
+             self.spark.sql(f"drop database if exists {self.name} cascade;")
+
+         if rm:
+             if self.delta_path.exists():
+                 DEFAULT_LOGGER.debug("remove delta files", extra={"label": self})
+                 self.delta_path.rm()
+
+     def exists(self) -> bool:
+         try:
+             self.spark.sql(f"show tables in {self.name}")
+         # database not found
+         except AnalysisException:
+             return False
+
+         return True
+
+     def __str__(self):
+         return self.name
+
+     def get_tables(self) -> DataFrame:
+         return get_tables(self.name)
+
+     def get_views(self) -> DataFrame:
+         return get_views(self.name)
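A short usage sketch for the Database class (illustrative; it assumes PATHS_STORAGE in the runtime configuration has an entry for the database name, since __init__ asserts this):

from fabricks.metastore import Database

db = Database("bronze")  # hypothetical name; must have a storage path configured
if not db.exists():
    db.create()

print(db.delta_path)     # <storage root>/delta
db.get_tables().show()   # DataFrame of the tables registered in the database
# db.drop(rm=True)       # would drop the database and remove its delta files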