fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. fabricks/api/__init__.py +2 -0
  2. fabricks/api/context.py +1 -2
  3. fabricks/api/deploy.py +3 -0
  4. fabricks/api/job_schema.py +2 -2
  5. fabricks/api/masks.py +3 -0
  6. fabricks/api/notebooks/initialize.py +2 -2
  7. fabricks/api/notebooks/process.py +2 -2
  8. fabricks/api/notebooks/run.py +2 -2
  9. fabricks/api/notebooks/schedule.py +75 -0
  10. fabricks/api/notebooks/terminate.py +2 -2
  11. fabricks/api/schedules.py +2 -16
  12. fabricks/cdc/__init__.py +2 -2
  13. fabricks/cdc/base/__init__.py +2 -2
  14. fabricks/cdc/base/_types.py +9 -2
  15. fabricks/cdc/base/configurator.py +86 -41
  16. fabricks/cdc/base/generator.py +44 -35
  17. fabricks/cdc/base/merger.py +16 -14
  18. fabricks/cdc/base/processor.py +232 -144
  19. fabricks/cdc/nocdc.py +8 -7
  20. fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
  21. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  22. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  23. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  24. fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
  25. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  26. fabricks/cdc/templates/filter.sql.jinja +4 -4
  27. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  28. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  29. fabricks/cdc/templates/merge.sql.jinja +3 -2
  30. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  31. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  32. fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
  33. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
  34. fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
  35. fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
  36. fabricks/cdc/templates/query.sql.jinja +15 -11
  37. fabricks/context/__init__.py +18 -4
  38. fabricks/context/_types.py +2 -0
  39. fabricks/context/config/__init__.py +92 -0
  40. fabricks/context/config/utils.py +53 -0
  41. fabricks/context/log.py +8 -2
  42. fabricks/context/runtime.py +87 -263
  43. fabricks/context/secret.py +1 -1
  44. fabricks/context/spark_session.py +1 -1
  45. fabricks/context/utils.py +76 -0
  46. fabricks/core/dags/generator.py +6 -7
  47. fabricks/core/dags/log.py +2 -15
  48. fabricks/core/dags/processor.py +11 -11
  49. fabricks/core/dags/utils.py +15 -1
  50. fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
  51. fabricks/core/jobs/base/_types.py +64 -22
  52. fabricks/core/jobs/base/checker.py +13 -12
  53. fabricks/core/jobs/base/configurator.py +41 -67
  54. fabricks/core/jobs/base/generator.py +55 -24
  55. fabricks/core/jobs/base/invoker.py +54 -30
  56. fabricks/core/jobs/base/processor.py +43 -26
  57. fabricks/core/jobs/bronze.py +45 -38
  58. fabricks/core/jobs/get_jobs.py +2 -2
  59. fabricks/core/jobs/get_schedule.py +10 -0
  60. fabricks/core/jobs/get_schedules.py +32 -0
  61. fabricks/core/jobs/gold.py +61 -48
  62. fabricks/core/jobs/silver.py +39 -40
  63. fabricks/core/masks.py +52 -0
  64. fabricks/core/parsers/base.py +2 -2
  65. fabricks/core/schedules/__init__.py +14 -0
  66. fabricks/core/schedules/diagrams.py +46 -0
  67. fabricks/core/schedules/get_schedule.py +5 -0
  68. fabricks/core/schedules/get_schedules.py +9 -0
  69. fabricks/core/schedules/run.py +3 -0
  70. fabricks/core/schedules/views.py +61 -0
  71. fabricks/core/steps/base.py +110 -72
  72. fabricks/core/udfs.py +12 -23
  73. fabricks/core/views.py +20 -13
  74. fabricks/deploy/__init__.py +97 -0
  75. fabricks/deploy/masks.py +8 -0
  76. fabricks/deploy/notebooks.py +71 -0
  77. fabricks/deploy/schedules.py +8 -0
  78. fabricks/{core/deploy → deploy}/tables.py +16 -13
  79. fabricks/{core/deploy → deploy}/udfs.py +3 -1
  80. fabricks/deploy/utils.py +36 -0
  81. fabricks/{core/deploy → deploy}/views.py +5 -9
  82. fabricks/metastore/database.py +3 -3
  83. fabricks/metastore/dbobject.py +4 -4
  84. fabricks/metastore/table.py +157 -88
  85. fabricks/metastore/view.py +13 -6
  86. fabricks/utils/_types.py +6 -0
  87. fabricks/utils/azure_table.py +4 -3
  88. fabricks/utils/helpers.py +141 -11
  89. fabricks/utils/log.py +29 -18
  90. fabricks/utils/read/_types.py +1 -1
  91. fabricks/utils/schema/get_schema_for_type.py +6 -0
  92. fabricks/utils/write/delta.py +3 -3
  93. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
  94. fabricks-3.0.6.dist-info/RECORD +175 -0
  95. fabricks/api/notebooks/add_fabricks.py +0 -13
  96. fabricks/api/notebooks/optimize.py +0 -29
  97. fabricks/api/notebooks/vacuum.py +0 -29
  98. fabricks/cdc/templates/query/context.sql.jinja +0 -101
  99. fabricks/cdc/templates/query/current.sql.jinja +0 -32
  100. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
  101. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
  102. fabricks/cdc/templates/query/hash.sql.jinja +0 -1
  103. fabricks/cdc/templates/query/slice.sql.jinja +0 -14
  104. fabricks/config/__init__.py +0 -0
  105. fabricks/config/base.py +0 -8
  106. fabricks/config/fabricks/__init__.py +0 -26
  107. fabricks/config/fabricks/base.py +0 -90
  108. fabricks/config/fabricks/environment.py +0 -9
  109. fabricks/config/fabricks/pyproject.py +0 -47
  110. fabricks/config/jobs/__init__.py +0 -6
  111. fabricks/config/jobs/base.py +0 -101
  112. fabricks/config/jobs/bronze.py +0 -38
  113. fabricks/config/jobs/gold.py +0 -27
  114. fabricks/config/jobs/silver.py +0 -22
  115. fabricks/config/runtime.py +0 -67
  116. fabricks/config/steps/__init__.py +0 -6
  117. fabricks/config/steps/base.py +0 -50
  118. fabricks/config/steps/bronze.py +0 -7
  119. fabricks/config/steps/gold.py +0 -14
  120. fabricks/config/steps/silver.py +0 -15
  121. fabricks/core/deploy/__init__.py +0 -17
  122. fabricks/core/schedules.py +0 -142
  123. fabricks/core/scripts/__init__.py +0 -9
  124. fabricks/core/scripts/armageddon.py +0 -87
  125. fabricks/core/scripts/stats.py +0 -51
  126. fabricks/core/scripts/steps.py +0 -26
  127. fabricks-3.0.5.2.dist-info/RECORD +0 -177
  128. /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
  129. /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
  130. /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
  131. /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
  132. /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
  133. /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
  134. /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
  135. /fabricks/core/{utils.py → parsers/utils.py} +0 -0
  136. /fabricks/core/{scripts → schedules}/generate.py +0 -0
  137. /fabricks/core/{scripts → schedules}/process.py +0 -0
  138. /fabricks/core/{scripts → schedules}/terminate.py +0 -0
  139. {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
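Beyond the CDC changes shown below, the list above documents a broader reorganization: fabricks/core/deploy moved to fabricks/deploy, fabricks/core/schedules.py was split into a fabricks/core/schedules package, and the old fabricks/config package was dropped in favour of fabricks/context/config. Downstream imports of the moved modules will likely need updating; a rough before/after sketch, assuming only the package path changes and the module names stay the same:

# Hypothetical import update implied by the renames above; only module paths
# are taken from the file list, no functions or classes are assumed.

# fabricks 3.0.5.2
# from fabricks.core.deploy import tables, udfs, views

# fabricks 3.0.6
from fabricks.deploy import tables, udfs, views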
fabricks/cdc/base/merger.py CHANGED
@@ -4,28 +4,30 @@ from typing import Optional, Union
 
 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.processor import Processor
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql
 
 
 class Merger(Processor):
     def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
-        if isinstance(src, (DataFrame, CDataFrame)):
+        if isinstance(src, DataFrameLike):
             format = "dataframe"
-            columns = self.get_columns(src, backtick=False)
+            columns = self.get_columns(src, backtick=False, sort=False, check=False)  # already done in processor
         elif isinstance(src, str):
             format = "view"
-            columns = self.get_columns(f"select * from {src}", backtick=False)
+            columns = self.get_columns(
+                f"select * from {src}", backtick=False, sort=False, check=False
+            )  # already done in processor
         else:
             raise ValueError(f"{src} not allowed")
 
-        assert "__merge_key" in columns
-        assert "__merge_condition" in columns
+        assert "__merge_key" in columns, "__merge_key not found"
+        assert "__merge_condition" in columns, "__merge_condition not found"
 
         keys = kwargs.get("keys")
         if isinstance(keys, str):
@@ -35,6 +37,7 @@ class Merger(Processor):
         fields = [c for c in columns if not c.startswith("__")]
         where = kwargs.get("update_where") if self.table.rows > 0 else None
         soft_delete = "__is_deleted" in columns
+
         has_source = "__source" in columns
         has_key = "__key" in columns
         has_metadata = "__metadata" in columns
@@ -78,7 +81,7 @@ class Merger(Processor):
         try:
             sql = merge.render(**context)
         except Exception as e:
-            DEFAULT_LOGGER.debug("context", extra={"job": self, "content": context})
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "content": context})
             raise e
 
         if fix:
@@ -86,23 +89,22 @@
                 sql = sql.replace("{src}", "src")
                 sql = fix_sql(sql)
                 sql = sql.replace("`src`", "{src}")
-                DEFAULT_LOGGER.debug("merge", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.debug("merge", extra={"label": self, "sql": sql})
 
             except Exception as e:
-                DEFAULT_LOGGER.exception("could not clean sql query", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.exception("fail to clean sql query", extra={"label": self, "sql": sql})
                 raise e
-            else:
-                DEFAULT_LOGGER.debug("merge", extra={"job": self, "sql": sql})
 
         return sql
 
-    def merge(self, src: Union[DataFrame, Table, str], **kwargs):
+    def merge(self, src: AllowedSources, **kwargs):
        if not self.table.exists():
            self.create_table(src, **kwargs)
 
        df = self.get_data(src, **kwargs)
        global_temp_view = f"{self.qualified_name}__merge"
-        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False))
+        view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False), job=self)
 
        merge = self.get_merge_query(view, **kwargs)
+        DEFAULT_LOGGER.debug("exec merge", extra={"label": self, "sql": merge})
        self.spark.sql(merge, src=view)
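The hunks above, and the processor.py changes that follow, replace the explicit (DataFrame, CDataFrame) isinstance checks with a DataFrameLike alias from fabricks.utils._types and widen src to AllowedSources from fabricks.cdc.base._types. Neither module's diff is expanded here (fabricks/utils/_types.py +6 and fabricks/cdc/base/_types.py +9 -2 in the list above), so the sketch below is only a plausible reconstruction of what those aliases could look like:

# Hypothetical reconstruction of the type aliases used above; the real
# definitions live in fabricks/utils/_types.py and fabricks/cdc/base/_types.py,
# which change in 3.0.6 but are not shown in this diff.
from typing import Union

from pyspark.sql import DataFrame
from pyspark.sql.connect.dataframe import DataFrame as CDataFrame

from fabricks.metastore.table import Table

# a tuple of classes works directly with isinstance(src, DataFrameLike)
DataFrameLike = (DataFrame, CDataFrame)

# sources accepted by Processor.get_data / Merger.merge
AllowedSources = Union[DataFrame, CDataFrame, Table, str]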
fabricks/cdc/base/processor.py CHANGED
@@ -1,30 +1,34 @@
 from __future__ import annotations
 
-from typing import Optional, Union
+from typing import Optional
 
 from jinja2 import Environment, PackageLoader
 from pyspark.sql import DataFrame
-from pyspark.sql.connect.dataframe import DataFrame as CDataFrame
 
+from fabricks.cdc.base._types import AllowedSources
 from fabricks.cdc.base.generator import Generator
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.metastore.table import Table
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils._types import DataFrameLike
 from fabricks.utils.sqlglot import fix as fix_sql
 
 
 class Processor(Generator):
-    def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> DataFrame:
-        if isinstance(src, (DataFrame, CDataFrame)):
+    def get_data(self, src: AllowedSources, **kwargs) -> DataFrame:
+        if isinstance(src, DataFrameLike):
            name = f"{self.qualified_name}__data"
-            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False))
+            global_temp_view = create_or_replace_global_temp_view(name, src, uuid=kwargs.get("uuid", False), job=self)
            src = f"select * from {global_temp_view}"
 
        sql = self.get_query(src, fix=True, **kwargs)
+        DEFAULT_LOGGER.debug("exec query", extra={"label": self, "sql": sql})
        return self.spark.sql(sql)
 
-    def get_query_context(self, src: Union[DataFrame, Table, str], **kwargs) -> dict:
-        if isinstance(src, (DataFrame, CDataFrame)):
+    def get_query_context(self, src: AllowedSources, **kwargs) -> dict:
+        DEFAULT_LOGGER.debug("deduce query context", extra={"label": self})
+
+        if isinstance(src, DataFrameLike):
            format = "dataframe"
        elif isinstance(src, Table):
            format = "table"
@@ -33,123 +37,230 @@ class Processor(Generator):
         else:
             raise ValueError(f"{src} not allowed")
 
-        columns = self.get_columns(src, backtick=False)
-        fields = [c for c in columns if not c.startswith("__")]
-
+        inputs = self.get_columns(src, backtick=False, sort=False)
+        fields = [c for c in inputs if not c.startswith("__")]
         keys = kwargs.get("keys", None)
-        mode = kwargs.get("mode", "complete")
 
+        mode = kwargs.get("mode", "complete")
         if mode == "update":
             tgt = str(self.table)
-        elif mode == "append" and "__timestamp" in columns:
+        elif mode == "append" and "__timestamp" in inputs:
             tgt = str(self.table)
         else:
             tgt = None
 
+        overwrite = []
+        exclude = kwargs.get("exclude", [])  # used by silver to exclude __operation from output if not update
+
         order_duplicate_by = kwargs.get("order_duplicate_by", None)
         if order_duplicate_by:
             order_duplicate_by = [f"{key} {value}" for key, value in order_duplicate_by.items()]
 
         add_source = kwargs.get("add_source", None)
         add_calculated_columns = kwargs.get("add_calculated_columns", [])
+        if add_calculated_columns:
+            raise ValueError("add_calculated_columns is not yet supported")
         add_operation = kwargs.get("add_operation", None)
         add_key = kwargs.get("add_key", None)
         add_hash = kwargs.get("add_hash", None)
         add_timestamp = kwargs.get("add_timestamp", None)
         add_metadata = kwargs.get("add_metadata", None)
 
-        has_metadata = add_metadata or "__metadata" in columns
-        has_source = add_source or "__source" in columns
-        has_timestamp = add_timestamp or "__timestamp" in columns
-        has_key = add_key or "__key" in columns
-        has_hash = add_hash or "__hash" in columns
-        has_identity = "__identity" in columns
-        has_rescued_data = "__rescued_data" in columns
         has_order_by = None if not order_duplicate_by else True
-        try:
-            has_rows = self.table.rows > 0
-        except Exception:
-            has_rows = None
 
+        # determine which special columns are present or need to be added to the output
+        has_operation = add_operation or "__operation" in inputs
+        has_metadata = add_metadata or "__metadata" in inputs
+        has_source = add_source or "__source" in inputs
+        has_timestamp = add_timestamp or "__timestamp" in inputs
+        has_key = add_key or "__key" in inputs
+        has_hash = add_hash or "__hash" in inputs
+        has_identity = "__identity" in inputs
+        has_rescued_data = "__rescued_data" in inputs
+
+        soft_delete = kwargs.get("soft_delete", None)
+        delete_missing = kwargs.get("delete_missing", None)
         slice = kwargs.get("slice", None)
         rectify = kwargs.get("rectify", None)
         deduplicate = kwargs.get("deduplicate", None)
         deduplicate_key = kwargs.get("deduplicate_key", None)
         deduplicate_hash = kwargs.get("deduplicate_hash", None)
-        soft_delete = kwargs.get("soft_delete", None)
         correct_valid_from = kwargs.get("correct_valid_from", None)
-        delete_missing = kwargs.get("delete_missing", None)
 
-        if mode == "update" and delete_missing:
-            has_data = self.has_data(src)
-        else:
-            has_data = True
-
-        if slice is None:
-            if mode == "update" and has_timestamp and has_rows:
-                slice = "update"
+        try:
+            has_rows = self.table.rows > 0
+        except Exception:
+            has_rows = None
 
-        # override slice if update and table is empty
-        if slice == "update" and not has_rows:
-            slice = None
+        # only needed when comparing to current
+        # delete all records in current if there is no new data
+        if mode == "update" and delete_missing and self.change_data_capture in ["scd1", "scd2"]:
+            has_no_data = not self.has_data(src)
+        else:
+            has_no_data = None
 
+        # always deduplicate if not set for slowly changing dimensions
         if self.slowly_changing_dimension:
             if deduplicate is None:
                 deduplicate = True
-            if rectify is None:
-                rectify = True
 
+        # order duplicates by implies key deduplication
         if order_duplicate_by:
             deduplicate_key = True
 
+        if deduplicate:
+            deduplicate_key = True
+            deduplicate_hash = True
+
+        # if any deduplication is requested, deduplicate all
+        deduplicate = deduplicate or deduplicate_key or deduplicate_hash
+
+        # always rectify if not set
+        if self.slowly_changing_dimension:
+            if rectify is None:
+                rectify = True
+
+        # only correct valid_from on first load
         if self.slowly_changing_dimension and mode == "update":
             correct_valid_from = correct_valid_from and self.table.rows == 0
 
-        transformed = slice or rectify or deduplicate or deduplicate_key or deduplicate_hash
+        # override slice for incremental load if timestamp and rows are present
+        if slice is None:
+            if mode == "update" and has_timestamp and has_rows:
+                slice = "update"
 
-        if deduplicate:
-            deduplicate_key = True
-            deduplicate_hash = True
+        # override slice for full load if update and table is empty
+        if slice == "update" and not has_rows:
+            slice = None
+
+        # override operation if added and found in df
+        if add_operation and "__operation" in inputs:
+            overwrite.append("__operation")
+
+        # override timestamp if added and found in df
+        if add_timestamp and "__timestamp" in inputs:
+            overwrite.append("__timestamp")
+
+        # override key if added and found in df (key needed for merge)
+        if add_key and "__key" in inputs:
+            overwrite.append("__key")
+
+        # override hash if added and found in df (hash needed to identify fake updates)
+        if add_hash and "__hash" in inputs:
+            overwrite.append("__hash")
+
+        # override metadata if added and found in df
+        if add_metadata and "__metadata" in inputs:
+            overwrite.append("__metadata")
+
+        advanced_ctes = ((rectify or deduplicate) and self.slowly_changing_dimension) or self.slowly_changing_dimension
+        advanced_deduplication = advanced_ctes and deduplicate
+
+        # add key and hash if not added nor found in df but exclude from output
+        # needed for merge
+        if mode == "update" or advanced_ctes or deduplicate:
+            if not add_key and "__key" not in inputs:
+                add_key = True
+                exclude.append("__key")
+
+            if not add_hash and "__hash" not in inputs:
+                add_hash = True
+                exclude.append("__hash")
+
+        # add operation and timestamp if not added nor found in df but exclude from output
+        # needed for deduplication and/or rectification
+        if advanced_ctes:
+            if not add_operation and "__operation" not in inputs:
+                add_operation = "upsert"
+                exclude.append("__operation")
+
+            if not add_timestamp and "__timestamp" not in inputs:
+                add_timestamp = True
+                exclude.append("__timestamp")
+
+        if add_key:
+            keys = keys if keys is not None else [f for f in fields]
+            if isinstance(keys, str):
+                keys = [keys]
+            if has_source:
+                keys.append("__source")
+
+        hashes = None
+        if add_hash:
+            hashes = [f for f in fields]
+            if "__operation" in inputs or add_operation:
+                hashes.append("__operation")
 
-        all_except = kwargs.get("except", []) or []
-        all_overwrite = []
-
-        # override operation if provided and found in df
-        if add_operation and "__operation" in columns:
-            all_overwrite.append("__operation")
-        # add operation if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_operation and "__operation" not in columns:
-            add_operation = "upsert"
-            if self.change_data_capture == "nocdc":
-                all_except.append("__operation")
-
-        # override key if provided and found in df
-        if add_key and "__key" in columns:
-            all_overwrite.append("__key")
-        # add key if not provided and not found in df BUT remove from output
-        elif (transformed or keys or self.slowly_changing_dimension) and not add_key and "__key" not in columns:
-            add_key = True
-            all_except.append("__key")
-
-        # override hash if provided and found in df
-        if add_hash and "__hash" in columns:
-            all_overwrite.append("__hash")
-        # add hash if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_hash and "__hash" not in columns:
-            add_hash = True
-            all_except.append("__hash")
-
-        # override timestamp if provided and found in df
-        if add_timestamp and "__timestamp" in columns:
-            all_overwrite.append("__timestamp")
-        # add timestamp if not provided and not found in df BUT remove from output
-        elif (transformed or self.slowly_changing_dimension) and not add_timestamp and "__timestamp" not in columns:
-            add_timestamp = True
-            all_except.append("__timestamp")
-
-        # override metadata if provided and found in df
-        if add_metadata and "__metadata" in columns:
-            all_overwrite.append("__metadata")
+        if self.change_data_capture == "nocdc":
+            intermediates = [i for i in inputs]
+            outputs = [i for i in inputs]
+        else:
+            intermediates = [f for f in fields]
+            outputs = [f for f in fields]
+
+        if has_operation:
+            if "__operation" not in outputs:
+                outputs.append("__operation")
+        if has_timestamp:
+            if "__timestamp" not in outputs:
+                outputs.append("__timestamp")
+        if has_key:
+            if "__key" not in outputs:
+                outputs.append("__key")
+        if has_hash:
+            if "__hash" not in outputs:
+                outputs.append("__hash")
+
+        if has_metadata:
+            if "__metadata" not in outputs:
+                outputs.append("__metadata")
+            if "__metadata" not in intermediates:
+                intermediates.append("__metadata")
+        if has_source:
+            if "__source" not in outputs:
+                outputs.append("__source")
+            if "__source" not in intermediates:
+                intermediates.append("__source")
+        if has_identity:
+            if "__identity" not in outputs:
+                outputs.append("__identity")
+            if "__identity" not in intermediates:
+                intermediates.append("__identity")
+        if has_rescued_data:
+            if "__rescued_data" not in outputs:
+                outputs.append("__rescued_data")
+            if "__rescued_data" not in intermediates:
+                intermediates.append("__rescued_data")
+
+        if soft_delete:
+            if "__is_deleted" not in outputs:
+                outputs.append("__is_deleted")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if self.change_data_capture == "scd2":
+            if "__valid_from" not in outputs:
+                outputs.append("__valid_from")
+            if "__valid_to" not in outputs:
+                outputs.append("__valid_to")
+            if "__is_current" not in outputs:
+                outputs.append("__is_current")
+
+        if advanced_ctes:
+            if "__operation" not in intermediates:
+                intermediates.append("__operation")
+            if "__timestamp" not in intermediates:
+                intermediates.append("__timestamp")
+
+            # needed for deduplication and/or rectification
+            # might need __operation or __source
+            if "__key" not in intermediates:
+                intermediates.append("__key")
+            if "__hash" not in intermediates:
+                intermediates.append("__hash")
+
+        outputs = [o for o in outputs if o not in exclude]
+        outputs = self.sort_columns(outputs)
 
         parent_slice = None
         if slice:
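Most of the new get_query_context code above is bookkeeping: it decides which technical columns (__key, __hash, __operation, __timestamp) must be injected for the merge or the deduplication CTEs, records them in exclude so they are dropped from the output again, and tracks overwrite for columns that were both requested and already present. A small, self-contained sketch of just the __key/__hash part of that decision (simplified from the hunk above; it is an illustration, not the fabricks implementation):

# Simplified re-derivation of the __key/__hash bookkeeping shown in the hunk above.
def plan_key_and_hash(inputs, mode, scd, deduplicate, add_key=None, add_hash=None):
    exclude = []
    advanced_ctes = scd  # in the diff this expression reduces to "is a slowly changing dimension"

    # __key and __hash are required for the merge and the deduplication CTEs,
    # but excluded from the output again when the source did not provide them
    if mode == "update" or advanced_ctes or deduplicate:
        if not add_key and "__key" not in inputs:
            add_key = True
            exclude.append("__key")
        if not add_hash and "__hash" not in inputs:
            add_hash = True
            exclude.append("__hash")

    return add_key, add_hash, exclude

# a bronze-like source without technical columns, loaded incrementally into an SCD table
print(plan_key_and_hash(["id", "name"], mode="update", scd=True, deduplicate=True))
# (True, True, ['__key', '__hash'])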
@@ -196,38 +307,6 @@
 
         parent_final = "__final"
 
-        if add_key:
-            keys = keys if keys is not None else fields
-            if isinstance(keys, str):
-                keys = [keys]
-            if has_source:
-                keys.append("__source")
-            keys = [f"cast(`{k}` as string)" for k in keys]
-
-        hashes = None
-        if add_hash:
-            hashes = [f"cast(`{f}` as string)" for f in fields]
-            if "__operation" in columns or add_operation:
-                hashes.append("cast(`__operation` <=> 'delete' as string)")
-
-        if fields:
-            if has_order_by:
-                if "__order_duplicate_by_desc desc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_desc")
-                elif "__order_duplicate_by_asc asc" in order_duplicate_by:
-                    fields.append("__order_duplicate_by_asc")
-            fields = [f"`{f}`" for f in fields]
-
-        if self.change_data_capture == "nocdc":
-            __not_allowed_columns = [
-                c
-                for c in columns
-                if c.startswith("__")
-                and c not in self.allowed_leading_columns
-                and c not in self.allowed_trailing_columns
-            ]
-            all_except = all_except + __not_allowed_columns
-
         return {
             "src": src,
             "format": format,
@@ -235,22 +314,28 @@
             "cdc": self.change_data_capture,
             "mode": mode,
             # fields
+            "inputs": inputs,
+            "intermediates": intermediates,
+            "outputs": outputs,
             "fields": fields,
             "keys": keys,
             "hashes": hashes,
             # options
+            "delete_missing": delete_missing,
+            "advanced_deduplication": advanced_deduplication,
+            # cte's
             "slice": slice,
             "rectify": rectify,
             "deduplicate": deduplicate,
-            # extra
             "deduplicate_key": deduplicate_key,
             "deduplicate_hash": deduplicate_hash,
             # has
-            "has_data": has_data,
+            "has_no_data": has_no_data,
             "has_rows": has_rows,
             "has_source": has_source,
             "has_metadata": has_metadata,
             "has_timestamp": has_timestamp,
+            "has_operation": has_operation,
             "has_identity": has_identity,
             "has_key": has_key,
             "has_hash": has_hash,
@@ -269,9 +354,8 @@
             "order_duplicate_by": order_duplicate_by,
             "soft_delete": soft_delete,
             "correct_valid_from": correct_valid_from,
-            # except
-            "all_except": all_except,
-            "all_overwrite": all_overwrite,
+            # overwrite
+            "overwrite": overwrite,
             # filter
             "slices": None,
             "sources": None,
@@ -291,11 +375,12 @@
             sql = sql.replace("{src}", "src")
             sql = fix_sql(sql)
             sql = sql.replace("`src`", "{src}")
-            DEFAULT_LOGGER.debug("query", extra={"job": self, "sql": sql, "target": "buffer"})
+
+            DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql, "target": "buffer"})
             return sql
 
         except Exception as e:
-            DEFAULT_LOGGER.exception("could not fix sql query", extra={"job": self, "sql": sql})
+            DEFAULT_LOGGER.exception("fail to fix sql query", extra={"label": self, "sql": sql})
             raise e
 
     def fix_context(self, context: dict, fix: Optional[bool] = True, **kwargs) -> dict:
@@ -305,12 +390,11 @@
         try:
             sql = template.render(**context)
             if fix:
+                DEFAULT_LOGGER.debug("fix context", extra={"label": self, "sql": sql})
                 sql = self.fix_sql(sql)
-            else:
-                DEFAULT_LOGGER.debug("fix context", extra={"job": self, "sql": sql})
 
-        except Exception as e:
-            DEFAULT_LOGGER.exception("could not execute sql query", extra={"job": self, "context": context})
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "context": context})
             raise e
 
         row = self.spark.sql(sql).collect()[0]
@@ -323,51 +407,54 @@
 
         return context
 
-    def get_query(self, src: Union[DataFrame, Table, str], fix: Optional[bool] = True, **kwargs) -> str:
+    def get_query(self, src: AllowedSources, fix: Optional[bool] = True, **kwargs) -> str:
         context = self.get_query_context(src=src, **kwargs)
         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
 
-        if context.get("slice"):
-            context = self.fix_context(context, fix=fix, **kwargs)
-
-        template = environment.get_template("query.sql.jinja")
         try:
+            if context.get("slice"):
+                context = self.fix_context(context, fix=fix, **kwargs)
+
+            template = environment.get_template("query.sql.jinja")
+
             sql = template.render(**context)
             if fix:
                 sql = self.fix_sql(sql)
             else:
-                DEFAULT_LOGGER.debug("query", extra={"job": self, "sql": sql})
+                DEFAULT_LOGGER.debug("print query", extra={"label": self, "sql": sql})
 
-        except Exception as e:
-            DEFAULT_LOGGER.exception("could not generate sql query", extra={"job": self, "context": context})
+        except (Exception, TypeError) as e:
+            DEFAULT_LOGGER.debug("context", extra={"label": self, "context": context})
+            DEFAULT_LOGGER.exception("fail to generate sql query", extra={"label": self, "context": context})
             raise e
 
         return sql
 
-    def append(self, src: Union[DataFrame, Table, str], **kwargs):
-        if not self.table.exists():
+    def append(self, src: AllowedSources, **kwargs):
+        if not self.table.registered:
             self.create_table(src, **kwargs)
 
         df = self.get_data(src, **kwargs)
-        df = self.reorder_columns(df)
+        df = self.reorder_dataframe(df)
 
         name = f"{self.qualified_name}__append"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        append = f"insert into table {self.table} by name select * from global_temp.{name}"
 
-        DEFAULT_LOGGER.debug("append", extra={"job": self})
-        self.spark.sql(f"insert into table {self.table} by name select * from global_temp.{name}")
+        DEFAULT_LOGGER.debug("exec append", extra={"label": self, "sql": append})
+        self.spark.sql(append)
 
     def overwrite(
         self,
-        src: Union[DataFrame, Table, str],
+        src: AllowedSources,
         dynamic: Optional[bool] = False,
         **kwargs,
     ):
-        if not self.table.exists():
+        if not self.table.registered:
             self.create_table(src, **kwargs)
 
         df = self.get_data(src, **kwargs)
-        df = self.reorder_columns(df)
+        df = self.reorder_dataframe(df)
 
         if not dynamic:
             if kwargs.get("update_where"):
@@ -377,7 +464,8 @@
             self.spark.sql("set spark.sql.sources.partitionOverwriteMode = dynamic")
 
         name = f"{self.qualified_name}__overwrite"
-        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False))
+        create_or_replace_global_temp_view(name, df, uuid=kwargs.get("uuid", False), job=self)
+        overwrite = f"insert overwrite table {self.table} by name select * from global_temp.{name}"
 
-        DEFAULT_LOGGER.debug("overwrite", extra={"job": self})
-        self.spark.sql(f"insert overwrite table {self.table} by name select * from global_temp.{name}")
+        DEFAULT_LOGGER.debug("excec overwrite", extra={"label": self, "sql": overwrite})
+        self.spark.sql(overwrite)
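append and overwrite above now build the insert statement as a string so it can be logged before execution. Both rely on Spark SQL's INSERT ... BY NAME, which matches the source columns to the target table by name rather than by position; a minimal illustration, assuming a Spark runtime that supports the BY NAME clause (the table and view names below are made up):

# Minimal sketch of INSERT ... BY NAME as used by append/overwrite above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.sql("create table if not exists demo_target (id int, name string)")

# columns deliberately in a different order than the target table
spark.createDataFrame([("a", 1)], "name string, id int").createOrReplaceTempView("demo_src")

# BY NAME resolves columns by name, so the reversed column order in demo_src is fine
spark.sql("insert into table demo_target by name select * from demo_src")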
fabricks/cdc/nocdc.py CHANGED
@@ -1,12 +1,11 @@
-from typing import Optional, Union
+from typing import Optional
 
-from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql import SparkSession
 
-from fabricks.cdc.base import BaseCDC
-from fabricks.metastore.table import Table
+from fabricks.cdc.scd import SCD
 
 
-class NoCDC(BaseCDC):
+class NoCDC(SCD):
     def __init__(
         self,
         database: str,
@@ -15,5 +14,7 @@ class NoCDC(BaseCDC):
     ):
         super().__init__(database, *levels, change_data_capture="nocdc", spark=spark)
 
-    def complete(self, src: Union[DataFrame, Table, str], **kwargs):
-        self.overwrite(src=src, **kwargs)
+    def delete_missing(self, src, **kwargs):
+        kwargs["delete_missing"] = True
+        kwargs["mode"] = "update"
+        self.merge(src, **kwargs)
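NoCDC now inherits the SCD machinery and exposes delete_missing, which simply forces delete_missing=True and mode="update" before delegating to merge. A hedged usage sketch; the database/level names and the keys option below are assumptions, only the delete_missing behaviour comes from the diff above:

# Hedged usage sketch of NoCDC.delete_missing; constructor arguments are illustrative.
from pyspark.sql import SparkSession

from fabricks.cdc.nocdc import NoCDC

spark = SparkSession.builder.getOrCreate()

cdc = NoCDC("silver", "monitor", "log", spark=spark)  # database plus levels, as in __init__ above

df = spark.createDataFrame([(1, "ok")], "id int, status string")

# rows present in the target table but absent from df are treated as deletions
# by the merge generated with delete_missing=True and mode="update"
cdc.delete_missing(df, keys=["id"])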