fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +80 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
- fabricks-3.0.7.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/core/jobs/gold.py
CHANGED

@@ -99,14 +99,20 @@ class Gold(BaseJob):
     def register_udfs(self):
         for u in self.get_udfs():
             if not is_registered(u):
-                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"
+                DEFAULT_LOGGER.debug(f"register udf ({u})", extra={"label": self})
                 register_udf(udf=u, spark=self.spark)

     def base_transform(self, df: DataFrame) -> DataFrame:
         df = df.transform(self.extend)
         return df

-    def get_data(
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> DataFrame:
         if self.options.job.get_boolean("requirements"):
             import sys

@@ -116,12 +122,12 @@ class Gold(BaseJob):
             df = self.spark.createDataFrame([{}])  # type: ignore

         elif self.options.job.get("notebook"):
-
+            invokers = self.options.invokers.get_list("run")
+            assert len(invokers) <= 1, "at most one invoker allowed when notebook is true"

-
-
+            global_temp_view = self.invoke(path=self.paths.runtime, schema_only=schema_only, **kwargs)
+            assert global_temp_view is not None, "global_temp_view not found"

-            global_temp_view = dbutils.notebook.run(path, self.timeout, arguments={})  # type: ignore
             df = self.spark.sql(f"select * from global_temp.{global_temp_view}")

         elif self.options.job.get("table"):

@@ -135,6 +141,10 @@ class Gold(BaseJob):

         if transform:
             df = self.base_transform(df)
+
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df

     def create_or_replace_view(self):

@@ -178,7 +188,7 @@ class Gold(BaseJob):
         from fabricks.context import CATALOG

         dependencies = []
-        df = self.get_data(self.stream)
+        df = self.get_data(stream=self.stream)

         if df is not None:
             explain_plan = self.spark.sql("explain extended select * from {df}", df=df).collect()[0][0]

@@ -194,23 +204,14 @@ class Gold(BaseJob):
             return dependencies

     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
-
-
-
-
-
-
-
-
-            "deduplicate",
-            None,
-        )  # assume no duplicate in gold (to improve performance)
-        rectify = self.options.job.get_boolean(
-            "rectify_as_upserts",
-            None,
-        )  # assume no reload in gold (to improve performance)
-        correct_valid_from = self.options.job.get_boolean("correct_valid_from", True)
-        add_metadata = self.step_conf.get("options", {}).get("metadata", False)
+        # assume no duplicate in gold (to improve performance)
+        deduplicate = self.options.job.get_boolean("deduplicate", None)
+        # assume no reload in gold (to improve performance)
+        rectify = self.options.job.get_boolean("rectify_as_upserts", None)
+
+        add_metadata = self.options.job.get_boolean("metadata", None)
+        if add_metadata is None:
+            add_metadata = self.step_conf.get("options", {}).get("metadata", False)

         context = {
             "add_metadata": add_metadata,

@@ -219,27 +220,37 @@ class Gold(BaseJob):
             "deduplicate_hash": True if self.slowly_changing_dimension else None,
             "deduplicate": False,
             "rectify": False,
-            "order_duplicate_by": order_duplicate_by,
-            "correct_valid_from": correct_valid_from,
         }

+        # force deduplicate
         if deduplicate is not None:
             context["deduplicate"] = deduplicate
             context["deduplicate_key"] = deduplicate
             context["deduplicate_hash"] = deduplicate

+        # force rectify
         if rectify is not None:
             context["rectify"] = rectify

+        # add key and hash when needed
+        if self.mode == "update" and self.change_data_capture == "nocdc":
+            if "__key" not in df.columns:
+                context["add_key"] = True
+            if "__hash" not in df.columns:
+                context["add_hash"] = True
+
+        # add key and hash when needed
         if self.slowly_changing_dimension:
             if "__key" not in df.columns:
                 context["add_key"] = True
             if "__hash" not in df.columns:
                 context["add_hash"] = True

+        if self.slowly_changing_dimension:
             if "__operation" not in df.columns:
+                # assume no duplicate hash
                 if deduplicate is None:
-                    context["deduplicate_hash"] = None
+                    context["deduplicate_hash"] = None

         if self.mode == "update":
             context["add_operation"] = "reload"

@@ -249,16 +260,25 @@ class Gold(BaseJob):
         else:
             context["add_operation"] = "upsert"

+        # filter to get latest data
         if not reload:
             if self.mode == "update" and self.change_data_capture == "scd2":
                 context["slice"] = "update"

+            if self.mode == "update" and self.change_data_capture == "nocdc" and "__timestamp" in df.columns:
+                context["slice"] = "update"
+
             if self.mode == "append" and "__timestamp" in df.columns:
                 context["slice"] = "update"

         if self.mode == "memory":
             context["mode"] = "complete"

+        # correct __valid_from
+        if self.change_data_capture == "scd2":
+            context["correct_valid_from"] = self.options.job.get_boolean("correct_valid_from", True)
+
+        # add __timestamp
         if self.options.job.get_boolean("persist_last_timestamp"):
             if self.change_data_capture == "scd1":
                 if "__timestamp" not in df.columns:

@@ -267,6 +287,11 @@ class Gold(BaseJob):
                 if "__valid_from" not in df.columns:
                     context["add_timestamp"] = True

+        if "__order_duplicate_by_asc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_asc": "asc"}
+        elif "__order_duplicate_by_desc" in df.columns:
+            context["order_duplicate_by"] = {"__order_duplicate_by_desc": "desc"}
+
         return context

     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):

@@ -277,20 +302,19 @@ class Gold(BaseJob):

         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"

         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return

         if reload:
-            DEFAULT_LOGGER.warning("force reload", extra={"
+            DEFAULT_LOGGER.warning("force reload", extra={"label": self})
             self.cdc.complete(sql, **context)

         elif self.mode == "update":
-            assert not isinstance(self.cdc, NoCDC), "nocdc update not allowed"
             self.cdc.update(sql, **context)

         elif self.mode == "append":

@@ -323,7 +347,7 @@ class Gold(BaseJob):

     def create(self):
         if self.mode == "invoke":
-            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
         else:
             self.register_udfs()
             super().create()

@@ -335,7 +359,7 @@ class Gold(BaseJob):
             self.cdc_last_timestamp.table.register()

         if self.mode == "invoke":
-            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("invoke (no table nor view)", extra={"label": self})
         else:
             super().register()

@@ -345,17 +369,6 @@ class Gold(BaseJob):

         super().drop()

-    def optimize(
-        self,
-        vacuum: Optional[bool] = True,
-        optimize: Optional[bool] = True,
-        analyze: Optional[bool] = True,
-    ):
-        if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
-        else:
-            super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
-
     @property
     def cdc_last_timestamp(self) -> NoCDC:
         assert self.mode == "update", "persist_last_timestamp only allowed in update"

@@ -387,15 +400,15 @@ class Gold(BaseJob):
         else:
             self.cdc_last_timestamp.overwrite(df)

-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         if self.mode == "invoke":
-            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"
+            DEFAULT_LOGGER.debug("invoke (no overwrite)", extra={"label": self})
             return

         elif self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"
+            DEFAULT_LOGGER.debug("memory (no overwrite)", extra={"label": self})
             self.create_or_replace_view()
             return

         self.overwrite_schema()
-        self.run(reload=True)
+        self.run(reload=True, schedule=schedule)
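Note that get_data now gains a schema_only flag in both Gold and Silver; when set, the result is filtered with where("1 == 2") so the schema survives but no rows are returned. A minimal PySpark sketch of that trick, using a local session and made-up columns rather than the fabricks API:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame([(1, "a")], ["id", "value"])
schema_only = df.where("1 == 2")   # keeps the schema, drops every row

assert schema_only.schema == df.schema
assert schema_only.count() == 0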
fabricks/core/jobs/silver.py
CHANGED

@@ -95,7 +95,13 @@ class Silver(BaseJob):
         )
         return df

-    def get_data(
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> DataFrame:
         deps = self.get_dependencies()
         assert deps, "not dependency found"

@@ -139,7 +145,7 @@ class Silver(BaseJob):
                 dfs.append(df)

             except Exception as e:
-                DEFAULT_LOGGER.exception("
+                DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": self})
                 raise e

         df = concat_dfs(dfs)

@@ -151,6 +157,9 @@ class Silver(BaseJob):
         if transform:
             df = self.base_transform(df)

+        if schema_only:
+            df = df.where("1 == 2")
+
         return df

     def get_dependencies(self) -> Sequence[JobDependency]:

@@ -186,7 +195,7 @@ class Silver(BaseJob):

             sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("view", extra={"
+            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
             self.spark.sql(sql)

         else:

@@ -195,7 +204,7 @@ class Silver(BaseJob):
             parent = deps[0].parent
             sql = f"select * from {parent}"
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("view", extra={"
+            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})

             df = self.spark.sql(sql)
             cdc_options = self.get_cdc_context(df)

@@ -205,7 +214,7 @@ class Silver(BaseJob):
         from py4j.protocol import Py4JJavaError

         try:
-            DEFAULT_LOGGER.debug("create or replace current view", extra={"
+            DEFAULT_LOGGER.debug("create or replace current view", extra={"label": self})

             df = self.spark.sql(f"select * from {self.qualified_name}")

@@ -222,23 +231,23 @@ class Silver(BaseJob):
             {where_clause}
             """
             # sql = fix_sql(sql)
-            # DEFAULT_LOGGER.debug("current view", extra={"
+            # DEFAULT_LOGGER.debug("current view", extra={"label": self, "sql": sql})
             self.spark.sql(sql)

-        except Py4JJavaError:
-            DEFAULT_LOGGER.exception("
+        except Py4JJavaError as e:
+            DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)

-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         self.truncate()
-        self.run()
+        self.run(schedule=schedule)

     def overwrite_schema(self, df: Optional[DataFrame] = None):
-        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"
+        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})

     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__check"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)

         not_append = not self.mode == "append"
         nocdc = self.change_data_capture == "nocdc"

@@ -265,12 +274,12 @@ class Silver(BaseJob):
                 1
             """
             sql = fix_sql(sql)
-            DEFAULT_LOGGER.debug("check", extra={"
+            DEFAULT_LOGGER.debug("check", extra={"label": self, "sql": sql})

             check_df = self.spark.sql(sql)
             if not check_df.isEmpty():
                 rectify = True
-                DEFAULT_LOGGER.debug("rectify enabled", extra={"
+                DEFAULT_LOGGER.debug("rectify enabled", extra={"label": self})

         context = {
             "soft_delete": self.slowly_changing_dimension,

@@ -279,29 +288,30 @@ class Silver(BaseJob):
             "order_duplicate_by": order_duplicate_by,
         }

-        if self.slowly_changing_dimension:
-            if "__key" not in df.columns:
-                context["add_key"] = True
-
         if self.mode == "memory":
             context["mode"] = "complete"
-        if self.mode == "latest":
-            context["slice"] = "latest"

-        if self.
-
+        if self.slowly_changing_dimension:
+            if "__key" not in df.columns:
+                context["add_key"] = True

-        if nocdc:
-            if "__operation" in df.columns:
-                context["except"] = ["__operation"]
         if nocdc and self.mode == "memory":
             if "__operation" not in df.columns:
                 context["add_operation"] = "upsert"
-                context["except"] = ["__operation"]

+        if self.mode == "latest":
+            context["slice"] = "latest"
         if not self.stream and self.mode == "update":
             context["slice"] = "update"

+        if self.change_data_capture == "scd2":
+            context["correct_valid_from"] = True
+
+        if "__operation" in df.columns:
+            context["exclude"] = ["__operation"]
+        if nocdc:  # operation is passed from the bronze layer
+            context["exclude"] = ["__operation"]
+
         return context

     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):

@@ -313,12 +323,12 @@ class Silver(BaseJob):
         name = f"{self.step}_{self.topic}_{self.item}"
         if batch is not None:
             name = f"{name}__{batch}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"

         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return

         if self.mode == "update":

@@ -359,16 +369,5 @@ class Silver(BaseJob):

     def drop(self):
         super().drop()
-        DEFAULT_LOGGER.debug("drop current view", extra={"
+        DEFAULT_LOGGER.debug("drop current view", extra={"label": self})
         self.spark.sql(f"drop view if exists {self.qualified_name}__current")
-
-    def optimize(
-        self,
-        vacuum: Optional[bool] = True,
-        optimize: Optional[bool] = True,
-        analyze: Optional[bool] = True,
-    ):
-        if self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
-        else:
-            super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
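For orientation, a sketch of the context dict Silver.get_cdc_context might now return for an scd2 update job; every value below is illustrative (the real ones depend on the job and step options), but note the exclude key that replaces the former except key:

# Illustrative only; real values come from the job and step options.
context = {
    "soft_delete": True,                              # slowly changing dimension
    "order_duplicate_by": {"__timestamp": "desc"},    # hypothetical ordering
    "add_key": True,                                  # __key missing from the source
    "slice": "update",                                # non-streaming update mode
    "correct_valid_from": True,                       # scd2 only
    "exclude": ["__operation"],                       # renamed from "except" in 3.0.5.2
}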
fabricks/core/masks.py
ADDED

@@ -0,0 +1,52 @@
import os
from typing import List, Optional

from pyspark.sql import SparkSession

from fabricks.context import CATALOG, PATH_MASKS, SPARK
from fabricks.context.log import DEFAULT_LOGGER


def register_all_masks():
    """
    Register all masks.
    """

    DEFAULT_LOGGER.info("register masks")
    for mask in get_masks():
        split = mask.split(".")
        try:
            register_mask(mask=split[0])
        except Exception as e:
            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)


def get_masks() -> List[str]:
    return [os.path.basename(f) for f in PATH_MASKS.walk()]


def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
    if spark is None:
        spark = SPARK
    assert spark is not None

    df = spark.sql("show user functions in default")

    if CATALOG:
        df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
    else:
        df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")

    return not df.isEmpty()


def register_mask(mask: str, spark: Optional[SparkSession] = None):
    if spark is None:
        spark = SPARK
    assert spark is not None

    if not is_registered(mask, spark):
        DEFAULT_LOGGER.debug(f"register mask {mask}")

        path = PATH_MASKS.joinpath(f"{mask}.sql")
        spark.sql(path.get_sql())
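A minimal usage sketch for the new masks module, assuming a configured fabricks runtime where PATH_MASKS points at a folder of *.sql mask definitions; the mask name "email" is made up:

from fabricks.core.masks import get_masks, is_registered, register_all_masks, register_mask

print(get_masks())            # e.g. ["email.sql"], the file names found under PATH_MASKS

register_all_masks()          # register every mask, logging failures instead of raising
if not is_registered("email"):
    register_mask("email")    # or register a single mask from <PATH_MASKS>/email.sql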
fabricks/core/parsers/base.py
CHANGED

@@ -6,7 +6,7 @@ from pyspark.sql.functions import col, expr, from_json, lit
 from pyspark.sql.types import MapType, StringType

 from fabricks.core.parsers._types import ParserOptions
-from fabricks.core.utils import clean
+from fabricks.core.parsers.utils import clean
 from fabricks.utils.path import Path
 from fabricks.utils.read.read import read

@@ -26,7 +26,7 @@ class BaseParser(ABC):
             "__timestamp",
             expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
         )
-        df = df.withColumn("__timestamp", expr("
+        df = df.withColumn("__timestamp", expr("try_to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
         df = df.drop("__split", "__split_size")

         return df
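BaseParser now derives __timestamp with try_to_timestamp. A standalone sketch of the behaviour, assuming Spark 3.5 or later where try_to_timestamp is available and returns NULL for values that do not match the pattern instead of failing:

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame([("20240131120000",), ("not-a-date",)], ["__timestamp"])
df = df.withColumn("__timestamp", expr("try_to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
df.show()   # the malformed row yields NULL rather than raising an error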
fabricks/core/schedules/__init__.py
ADDED

@@ -0,0 +1,14 @@
from fabricks.core.schedules.generate import generate
from fabricks.core.schedules.process import process
from fabricks.core.schedules.run import run
from fabricks.core.schedules.terminate import terminate
from fabricks.core.schedules.views import create_or_replace_view, create_or_replace_views

__all__ = [
    "process",
    "generate",
    "terminate",
    "run",
    "create_or_replace_view",
    "create_or_replace_views",
]
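The former fabricks/core/schedules.py module becomes a package whose helpers are re-exported at the root, so flat imports keep working. A small sketch, with a made-up schedule name:

from fabricks.core.schedules import create_or_replace_view, create_or_replace_views

create_or_replace_views()             # rebuild every fabricks.<name>_schedule view
create_or_replace_view(name="daily")  # or just the view for one schedule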
fabricks/core/schedules/diagrams.py
ADDED

@@ -0,0 +1,46 @@
from pyspark.sql import DataFrame


def get_dependencies(name: str) -> DataFrame:
    from fabricks.core.dags import DagGenerator

    g = DagGenerator(schedule=name)
    return g.get_dependencies()


def get_mermaid_diagram(name: str) -> str:
    df = get_dependencies(name)

    df = df.withColumnRenamed("ParentId", "parent_id")
    df = df.withColumnRenamed("Parent", "parent")
    df = df.withColumnRenamed("JobId", "job_id")
    df = df.withColumnRenamed("Job", "job")

    dependencies = df.select("parent_id", "parent", "job_id", "job").collect()

    out = "flowchart TD\n"

    unique_nodes = set()

    for row in dependencies:
        parent_id = str(row["parent_id"])
        parent_name = str(row["parent"])
        child_id = str(row["job_id"])
        child_name = str(row["job"])

        if parent_id != "0" and parent_id is not None:
            if parent_id not in unique_nodes:
                out += f" {parent_id}[{parent_name}]\n"
                unique_nodes.add(parent_id)

            if child_id not in unique_nodes:
                out += f" {child_id}[{child_name}]\n"
                unique_nodes.add(child_id)

            out += f" {parent_id} --> {child_id}\n"
        else:
            if child_id not in unique_nodes:
                out += f" {child_id}[{child_name}]\n"
                unique_nodes.add(child_id)

    return out
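To show what get_mermaid_diagram emits, a self-contained sketch of the same string building over hard-coded rows; the job names and ids are invented, the real rows come from DagGenerator.get_dependencies:

rows = [
    {"parent_id": "0", "parent": None, "job_id": "1", "job": "bronze_sales_orders"},
    {"parent_id": "1", "parent": "bronze_sales_orders", "job_id": "2", "job": "silver_sales_orders"},
]

out = "flowchart TD\n"
seen = set()
for row in rows:
    if row["parent_id"] != "0":
        for node_id, name in ((row["parent_id"], row["parent"]), (row["job_id"], row["job"])):
            if node_id not in seen:
                out += f" {node_id}[{name}]\n"
                seen.add(node_id)
        out += f" {row['parent_id']} --> {row['job_id']}\n"
    elif row["job_id"] not in seen:
        out += f" {row['job_id']}[{row['job']}]\n"
        seen.add(row["job_id"])

print(out)
# flowchart TD
#  1[bronze_sales_orders]
#  2[silver_sales_orders]
#  1 --> 2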
fabricks/core/schedules/views.py
ADDED

@@ -0,0 +1,61 @@
from fabricks.context import SPARK
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.schedules.get_schedule import get_schedule
from fabricks.core.schedules.get_schedules import get_schedules_df
from fabricks.utils.sqlglot import fix as fix_sql


def create_or_replace_view_internal(name: str, options: dict):
    step = "-- no step provided"
    tag = "-- no tag provided"
    view = "-- no view provided"

    assert isinstance(options, dict), "options must be a dict"

    if options.get("steps") is not None:
        steps = [f"'{s}'" for s in options.get("steps")]  # type: ignore
        step = f"and j.step in ({', '.join(steps)})"

    if options.get("tag") is not None:
        tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""

    if options.get("view") is not None:
        view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""

    sql = f"""
    create or replace view fabricks.{name}_schedule
    as
    select
      j.*
    from
      fabricks.jobs j
      {view}
    where
      true
      {step}
      {tag}
      and j.type not in ('manual')
    """
    sql = fix_sql(sql)
    DEFAULT_LOGGER.debug("create or replace (schedule) view", extra={"label": f"fabricks.{name}_schedule", "sql": sql})

    SPARK.sql(sql)


def create_or_replace_view(name: str):
    sc = get_schedule(name=name)
    try:
        create_or_replace_view_internal(sc["name"], sc["options"])
    except Exception as e:
        DEFAULT_LOGGER.exception(f"could not create nor replace view {sc['name']}", exc_info=e)


def create_or_replace_views():
    DEFAULT_LOGGER.info("create or replace (schedule) views")

    df = get_schedules_df()
    for row in df.collect():
        try:
            create_or_replace_view_internal(row.name, row.options.asDict())
        except Exception as e:
            DEFAULT_LOGGER.exception(f"could not create nor replace view {row.name}", exc_info=e)
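Finally, a sketch of the SQL that create_or_replace_view_internal assembles before fix_sql tidies it up, for a made-up schedule named daily that filters on steps and a tag:

options = {"steps": ["bronze", "silver"], "tag": "nightly"}

steps = ", ".join(f"'{s}'" for s in options["steps"])
print(f"""
create or replace view fabricks.daily_schedule
as
select
  j.*
from
  fabricks.jobs j
  -- no view provided
where
  true
  and j.step in ({steps})
  and array_contains(j.tags, '{options["tag"]}')
  and j.type not in ('manual')
""")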