fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as published to a supported public registry, and is provided for informational purposes only.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +80 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
- fabricks-3.0.7.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/cdc/base/merger.py
CHANGED
@@ -4,28 +4,30 @@ from typing import Optional, Union

 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.processor import Processor
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql


 class Merger(Processor):
     def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
-        if isinstance(src,
+        if isinstance(src, DataFrameLike):
             format = "dataframe"
-            columns = self.get_columns(src, backtick=False)
+            columns = self.get_columns(src, backtick=False, sort=False, check=False)  # already done in processor
         elif isinstance(src, str):
             format = "view"
-            columns = self.get_columns(
+            columns = self.get_columns(
+                f"select * from {src}", backtick=False, sort=False, check=False
+            )  # already done in processor
         else:
             raise ValueError(f"{src} not allowed")

-        assert "__merge_key" in columns
-        assert "__merge_condition" in columns
+        assert "__merge_key" in columns, "__merge_key not found"
+        assert "__merge_condition" in columns, "__merge_condition not found"

         keys = kwargs.get("keys")
         if isinstance(keys, str):

@@ -35,6 +37,7 @@ class Merger(Processor):
         fields = [c for c in columns if not c.startswith("__")]
         where = kwargs.get("update_where") if self.table.rows > 0 else None
         soft_delete = "__is_deleted" in columns
+
         has_source = "__source" in columns
         has_key = "__key" in columns
         has_metadata = "__metadata" in columns

@@ -78,7 +81,7 @@
         try:
             sql = merge.render(**context)
         except Exception as e:
-            DEFAULT_LOGGER.debug("context", extra={"
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "content": context})
             raise e

         if fix:

@@ -86,23 +89,22 @@
                 sql = sql.replace("{src}", "src")
                 sql = fix_sql(sql)
                 sql = sql.replace("`src`", "{src}")
-                DEFAULT_LOGGER.debug("merge", extra={"
+                DEFAULT_LOGGER.debug("merge", extra={"label": self, "sql": sql})

             except Exception as e:
-                DEFAULT_LOGGER.exception("
+                DEFAULT_LOGGER.exception("fail to clean sql query", extra={"label": self, "sql": sql})
                 raise e
-            else:
-                DEFAULT_LOGGER.debug("merge", extra={"job": self, "sql": sql})

         return sql

-    def merge(self, src:
+    def merge(self, src: AllowedSources, **kwargs):
         if not self.table.exists():
             self.create_table(src, **kwargs)

         df = self.get_data(src, **kwargs)
         global_temp_view = f"{self.qualified_name}__merge"
-        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False))
+        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False), job=self)

         merge = self.get_merge_query(view, **kwargs)
+        DEFAULT_LOGGER.debug("exec merge", extra={"label": self, "sql": merge})
         self.spark.sql(merge, src=view)
fabricks/cdc/base/processor.py
CHANGED
@@ -1,30 +1,34 @@
 from __future__ import annotations

-from typing import Optional
+from typing import Optional

 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.generator import Generator
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql


 class Processor(Generator):
-    def get_data(self, src:
-        if isinstance(src,
+    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame:
+        if isinstance(src, DataFrameLike):
             name = f"{self.qualified_name}__data"
-            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False))
+            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False), job=self)
             src = f"select * from {global_temp_view}"

         sql = self.get_query(src, fix=True, **kwargs)
+        DEFAULT_LOGGER.debug("exec query", extra={"label": self, "sql": sql})
         return self.spark.sql(sql)

-    def get_query_context(self, src:
- …
+    def get_query_context(self, src: AllowedSources, **kwargs) -> dict:
+        DEFAULT_LOGGER.debug("deduce query context", extra={"label": self})
+
+        if isinstance(src, DataFrameLike):
             format = "dataframe"
         elif isinstance(src, Table):
             format = "table"

@@ -33,123 +37,230 @@ class Processor(Generator):
         else:
             raise ValueError(f"{src} not allowed")

- …
-        fields = [c for c in
- …
+        inputs = self.get_columns(src, backtick=False, sort=False)
+        fields = [c for c in inputs if not c.startswith("__")]
         keys = kwargs.get("keys", None)
-        mode = kwargs.get("mode", "complete")

+        mode = kwargs.get("mode", "complete")
         if mode == "update":
             tgt = str(self.table)
-        elif mode == "append" and "__timestamp" in
+        elif mode == "append" and "__timestamp" in inputs:
             tgt = str(self.table)
         else:
             tgt = None

+        overwrite = []
+        exclude = kwargs.get("exclude", [])  # used by silver to exclude __operation from output if not update
+
         order_duplicate_by = kwargs.get("order_duplicate_by", None)
         if order_duplicate_by:
             order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]

         add_source = kwargs.get("add_source", None)
         add_calculated_columns = kwargs.get("add_calculated_columns", [])
+        if add_calculated_columns:
+            raise ValueError("add_calculated_columns is not yet supported")
         add_operation = kwargs.get("add_operation", None)
         add_key = kwargs.get("add_key", None)
         add_hash = kwargs.get("add_hash", None)
         add_timestamp = kwargs.get("add_timestamp", None)
         add_metadata = kwargs.get("add_metadata", None)

-        has_metadata = add_metadata or "__metadata" in columns
-        has_source = add_source or "__source" in columns
-        has_timestamp = add_timestamp or "__timestamp" in columns
-        has_key = add_key or "__key" in columns
-        has_hash = add_hash or "__hash" in columns
-        has_identity = "__identity" in columns
-        has_rescued_data = "__rescued_data" in columns
         has_order_by = None if not order_duplicate_by else True
-        try:
-            has_rows = self.table.rows > 0
-        except Exception:
-            has_rows = None

+        # determine which special columns are present or need to be added to the output
+        has_operation = add_operation or "__operation" in inputs
+        has_metadata = add_metadata or "__metadata" in inputs
+        has_source = add_source or "__source" in inputs
+        has_timestamp = add_timestamp or "__timestamp" in inputs
+        has_key = add_key or "__key" in inputs
+        has_hash = add_hash or "__hash" in inputs
+        has_identity = "__identity" in inputs
+        has_rescued_data = "__rescued_data" in inputs
+
+        soft_delete = kwargs.get("soft_delete", None)
+        delete_missing = kwargs.get("delete_missing", None)
         slice = kwargs.get("slice", None)
         rectify = kwargs.get("rectify", None)
         deduplicate = kwargs.get("deduplicate", None)
         deduplicate_key = kwargs.get("deduplicate_key", None)
         deduplicate_hash = kwargs.get("deduplicate_hash", None)
-        soft_delete = kwargs.get("soft_delete", None)
         correct_valid_from = kwargs.get("correct_valid_from", None)
-        delete_missing = kwargs.get("delete_missing", None)

- …
-        if slice is None:
-            if mode == "update" and has_timestamp and has_rows:
-                slice = "update"
+        try:
+            has_rows = self.table.rows > 0
+        except Exception:
+            has_rows = None

-        #
-        if
- …
+        # only needed when comparing to current
+        # delete all records in current if there is no new data
+        if mode == "update" and delete_missing and self.change_data_capture in ["scd1", "scd2"]:
+            has_no_data = not self.has_data(src)
+        else:
+            has_no_data = None

+        # always deduplicate if not set for slowly changing dimensions
         if self.slowly_changing_dimension:
             if deduplicate is None:
                 deduplicate = True
-            if rectify is None:
-                rectify = True

+        # order duplicates by implies key deduplication
         if order_duplicate_by:
             deduplicate_key = True

+        if deduplicate:
+            deduplicate_key = True
+            deduplicate_hash = True
+
+        # if any deduplication is requested, deduplicate all
+        deduplicate = deduplicate or deduplicate_key or deduplicate_hash
+
+        # always rectify if not set
+        if self.slowly_changing_dimension:
+            if rectify is None:
+                rectify = True
+
+        # only correct valid_from on first load
         if self.slowly_changing_dimension and mode == "update":
             correct_valid_from = correct_valid_from and self.table.rows == 0

- …
+        # override slice for incremental load if timestamp and rows are present
+        if slice is None:
+            if mode == "update" and has_timestamp and has_rows:
+                slice = "update"

-        if
- …
+        # override slice for full load if update and table is empty
+        if slice == "update" and not has_rows:
+            slice = None
+
+        # override operation if added and found in df
+        if add_operation and "__operation" in inputs:
+            overwrite.append("__operation")
+
+        # override timestamp if added and found in df
+        if add_timestamp and "__timestamp" in inputs:
+            overwrite.append("__timestamp")
+
+        # override key if added and found in df (key needed for merge)
+        if add_key and "__key" in inputs:
+            overwrite.append("__key")
+
+        # override hash if added and found in df (hash needed to identify fake updates)
+        if add_hash and "__hash" in inputs:
+            overwrite.append("__hash")
+
+        # override metadata if added and found in df
+        if add_metadata and "__metadata" in inputs:
+            overwrite.append("__metadata")
+
+        advanced_ctes = ((rectify or deduplicate) and self.slowly_changing_dimension) or self.slowly_changing_dimension
+        advanced_deduplication = advanced_ctes and deduplicate
+
+        # add key and hash if not added nor found in df but exclude from output
+        # needed for merge
+        if mode == "update" or advanced_ctes or deduplicate:
+            if not add_key and "__key" not in inputs:
+                add_key = True
+                exclude.append("__key")
+
+            if not add_hash and "__hash" not in inputs:
+                add_hash = True
+                exclude.append("__hash")
+
+        # add operation and timestamp if not added nor found in df but exclude from output
+        # needed for deduplication and/or rectification
+        if advanced_ctes:
+            if not add_operation and "__operation" not in inputs:
+                add_operation = "upsert"
+                exclude.append("__operation")
+
+            if not add_timestamp and "__timestamp" not in inputs:
+                add_timestamp = True
+                exclude.append("__timestamp")
+
+        if add_key:
+            keys = keys if keys is not None else [f for f in fields]
+            if isinstance(keys, str):
+                keys = [keys]
+            if has_source:
+                keys.append("__source")
+
+        hashes = None
+        if add_hash:
+            hashes = [f for f in fields]
+            if "__operation" in inputs or add_operation:
+                hashes.append("__operation")

- …
+        if self.change_data_capture == "nocdc":
+            intermediates = [i for i in inputs]
+            outputs = [i for i in inputs]
+        else:
+            intermediates = [f for f in fields]
+            outputs = [f for f in fields]
+
+        if has_operation:
+            if "__operation" not in outputs:
+                outputs.append("__operation")
+        if has_timestamp:
+            if "__timestamp" not in outputs:
+                outputs.append("__timestamp")
+        if has_key:
+            if "__key" not in outputs:
+                outputs.append("__key")
+        if has_hash:
+            if "__hash" not in outputs:
+                outputs.append("__hash")
+
+        if has_metadata:
+            if "__metadata" not in outputs:
+                outputs.append("__metadata")
+            if "__metadata" not in intermediates:
+                intermediates.append("__metadata")
+        if has_source:
+            if "__source" not in outputs:
+                outputs.append("__source")
+            if "__source" not in intermediates:
+                intermediates.append("__source")
+        if has_identity:
+            if "__identity" not in outputs:
+                outputs.append("__identity")
+            if "__identity" not in intermediates:
+                intermediates.append("__identity")
+        if has_rescued_data:
+            if "__rescued_data" not in outputs:
+                outputs.append("__rescued_data")
+            if "__rescued_data" not in intermediates:
+                intermediates.append("__rescued_data")
+
+        if soft_delete:
+            if "__is_deleted" not in outputs:
+                outputs.append("__is_deleted")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if self.change_data_capture == "scd2":
+            if "__valid_from" not in outputs:
+                outputs.append("__valid_from")
+            if "__valid_to" not in outputs:
+                outputs.append("__valid_to")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if advanced_ctes:
+            if "__operation" not in intermediates:
+                intermediates.append("__operation")
+            if "__timestamp" not in intermediates:
+                intermediates.append("__timestamp")
+
+            # needed for deduplication and/or rectification
+            # might need __operation or __source
+            if "__key" not in intermediates:
+                intermediates.append("__key")
+            if "__hash" not in intermediates:
+                intermediates.append("__hash")
+
+        outputs = [o for o in outputs if o not in exclude]
+        outputs = self.sort_columns(outputs)

         parent_slice = None
         if slice:

@@ -196,38 +307,6 @@ class Processor(Generator):

         parent_final = "__final"

-        if add_key:
-            keys = keys if keys is not None else fields
-            if isinstance(keys, str):
-                keys = [keys]
-            if has_source:
-                keys.append("__source")
-            keys = [f"cast(`{k}` as string)" for k in keys]
-
-        hashes = None
-        if add_hash:
-            hashes = [f"cast(`{f}` as string)" for f in fields]
-            if "__operation" in columns or add_operation:
-                hashes.append("cast(`__operation` <=> 'delete' as string)")
-
-        if fields:
-            if has_order_by:
-                if "__order_duplicate_by_desc desc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_desc")
-                elif "__order_duplicate_by_asc asc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_asc")
-            fields = [f"`{f}`" for f in fields]
-
-        if self.change_data_capture == "nocdc":
-            __not_allowed_columns = [
-                c
-                for c in columns
-                if c.startswith("__")
-                and c not in self.allowed_leading_columns
-                and c not in self.allowed_trailing_columns
-            ]
-            all_except = all_except + __not_allowed_columns
-
         return {
             "src": src,
             "format": format,

@@ -235,22 +314,28 @@
             "cdc": self.change_data_capture,
             "mode": mode,
             # fields
+            "inputs": inputs,
+            "intermediates": intermediates,
+            "outputs": outputs,
             "fields": fields,
             "keys": keys,
             "hashes": hashes,
             # options
+            "delete_missing": delete_missing,
+            "advanced_deduplication": advanced_deduplication,
+            # cte's
             "slice": slice,
             "rectify": rectify,
             "deduplicate": deduplicate,
-            # extra
             "deduplicate_key": deduplicate_key,
             "deduplicate_hash": deduplicate_hash,
             # has
-            "
+            "has_no_data": has_no_data,
             "has_rows": has_rows,
             "has_source": has_source,
             "has_metadata": has_metadata,
             "has_timestamp": has_timestamp,
+            "has_operation": has_operation,
             "has_identity": has_identity,
             "has_key": has_key,
             "has_hash": has_hash,

@@ -269,9 +354,8 @@
             "order_duplicate_by": order_duplicate_by,
             "soft_delete": soft_delete,
             "correct_valid_from": correct_valid_from,
-            #
-            "
-            "all_overwrite": all_overwrite,
+            # overwrite
+            "overwrite": overwrite,
             # filter
             "slices": None,
             "sources": None,

@@ -291,11 +375,12 @@
             sql = sql.replace("{src}", "src")
             sql = fix_sql(sql)
             sql = sql.replace("`src`", "{src}")
- …
+
+            DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql, "target": "buffer"})
             return sql

         except Exception as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to fix sql query", extra={"label": self, "sql": sql})
             raise e

     def fix_context(self, context: dict, fix: Optional[bool] = True, **kwargs) -> dict:

@@ -305,12 +390,11 @@
         try:
             sql = template.render(**context)
             if fix:
+                DEFAULT_LOGGER.debug("fix context", extra={"label": self, "sql": sql})
                 sql = self.fix_sql(sql)
-            else:
-                DEFAULT_LOGGER.debug("fix context", extra={"job": self, "sql": sql})

-        except Exception as e:
-            DEFAULT_LOGGER.exception("
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "context": context})
             raise e

         row = self.spark.sql(sql).collect()[0]

@@ -323,51 +407,54 @@

         return context

-    def get_query(self, src:
+    def get_query(self, src: AllowedSources, fix: Optional[bool] = True, **kwargs) -> str:
         context = self.get_query_context(src=src, **kwargs)
         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))

-        if context.get("slice"):
-            context = self.fix_context(context, fix=fix, **kwargs)
-
-        template = environment.get_template("query.sql.jinja")
         try:
+            if context.get("slice"):
+                context = self.fix_context(context, fix=fix, **kwargs)
+
+            template = environment.get_template("query.sql.jinja")
+
             sql = template.render(**context)
             if fix:
                 sql = self.fix_sql(sql)
             else:
-                DEFAULT_LOGGER.debug("query", extra={"
+                DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql})

-        except Exception as e:
-            DEFAULT_LOGGER.
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "context": context})
+            DEFAULT_LOGGER.exception("fail to generate sql query", extra={"label": self, "context": context})
             raise e

         return sql

-    def append(self, src:
-        if not self.table.
+    def append(self, src: AllowedSources, **kwargs):
+        if not self.table.registered:
             self.create_table(src, **kwargs)

         df = self.get_data(src, **kwargs)
-        df = self.
+        df = self.reorder_dataframe(df)

         name = f"{self.qualified_name}__append"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        append = f"insert into table {self.table} by name select * from global_temp.{name}"

-        DEFAULT_LOGGER.debug("append", extra={"
-        self.spark.sql(
+        DEFAULT_LOGGER.debug("exec append", extra={"label": self, "sql": append})
+        self.spark.sql(append)

     def overwrite(
         self,
-        src:
+        src: AllowedSources,
         dynamic: Optional[bool] = False,
         **kwargs,
     ):
-        if not self.table.
+        if not self.table.registered:
             self.create_table(src, **kwargs)

         df = self.get_data(src, **kwargs)
-        df = self.
+        df = self.reorder_dataframe(df)

         if not dynamic:
             if kwargs.get("update_where"):

@@ -377,7 +464,8 @@ class Processor(Generator):
             self.spark.sql("set spark.sql.sources.partitionOverwriteMode = dynamic")

         name = f"{self.qualified_name}__overwrite"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        overwrite = f"insert overwrite table {self.table} by name select * from global_temp.{name}"

-        DEFAULT_LOGGER.debug("overwrite", extra={"
-        self.spark.sql(
+        DEFAULT_LOGGER.debug("excec overwrite", extra={"label": self, "sql": overwrite})
+        self.spark.sql(overwrite)
fabricks/cdc/nocdc.py
CHANGED
@@ -1,12 +1,11 @@
-from typing import Optional
+from typing import Optional

-from pyspark.sql import
+from pyspark.sql import SparkSession

-from fabricks.cdc.
-from fabricks.metastore.table import Table
+from fabricks.cdc.scd import SCD


-class NoCDC(
+class NoCDC(SCD):
     def __init__(
         self,
         database: str,

@@ -15,5 +14,7 @@ class NoCDC(BaseCDC):
     ):
         super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)

-    def
- …
+    def delete_missing(self, src, **kwargs):
+        kwargs["delete_missing"] = True
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)