fabricks-3.0.11-py3-none-any.whl

This diff shows the content of publicly available package versions as published to their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/cdc/base/configurator.py
@@ -0,0 +1,223 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import List, Optional, Union
+
+ from pyspark.sql import DataFrame, SparkSession
+
+ from fabricks.cdc.base._types import AllowedSources
+ from fabricks.context import SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore.database import Database
+ from fabricks.metastore.table import Table
+ from fabricks.utils._types import DataFrameLike
+
+
+ class Configurator(ABC):
+     def __init__(
+         self,
+         database: str,
+         *levels: str,
+         change_data_capture: str,
+         spark: Optional[SparkSession] = None,
+     ):
+         if spark is None:
+             spark = SPARK
+         assert spark is not None
+         self.spark: SparkSession = spark
+
+         self.database = Database(database)
+         self.levels = levels
+         self.change_data_capture = change_data_capture
+         self.table = Table(self.database.name, *self.levels, spark=self.spark)
+
+     @property
+     def is_view(self):
+         return self.table.is_view
+
+     @property
+     def registered(self):
+         return self.table.registered
+
+     @property
+     def qualified_name(self):
+         return f"{self.database}_{'_'.join(self.levels)}"
+
+     @abstractmethod
+     def get_query(self, src: AllowedSources, **kwargs) -> str: ...
+
+     @abstractmethod
+     def get_data(self, src: AllowedSources, **kwargs) -> DataFrame: ...
+
+     @abstractmethod
+     def create_table(
+         self,
+         src: AllowedSources,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         **kwargs,
+     ): ...
+
+     @abstractmethod
+     def drop(self): ...
+
+     @abstractmethod
+     def create_or_replace_view(self, src: Union[Table, str], **kwargs): ...
+
+     @property
+     def allowed_input__columns(self) -> List[str]:
+         cols = self.__columns
+
+         if self.slowly_changing_dimension:
+             if "__valid_from" in cols:
+                 cols.remove("__valid_from")
+             if "__valid_to" in cols:
+                 cols.remove("__valid_to")
+             if "__is_current" in cols:
+                 cols.remove("__is_current")
+             if "__is_deleted" in cols:
+                 cols.remove("__is_deleted")
+
+         return cols
+
+     @property
+     def allowed_ouput_leading__columns(self) -> List[str]:
+         cols = [
+             "__identity",
+             "__source",
+             "__key",
+             "__timestamp",
+             "__valid_from",
+             "__valid_to",
+             "__is_current",
+             "__is_deleted",
+         ]
+
+         if self.change_data_capture == "scd1":
+             cols.remove("__valid_from")
+             cols.remove("__valid_to")
+         elif self.change_data_capture == "scd2":
+             cols.remove("__timestamp")
+
+         return cols
+
+     @property
+     def allowed_output_trailing__columns(self) -> List[str]:
+         cols = [
+             "__operation",
+             "__metadata",
+             "__hash",
+             "__rescued_data",
+         ]
+
+         if self.slowly_changing_dimension:
+             cols.remove("__operation")
+
+         return cols
+
+     @property
+     def __columns(self) -> List[str]:
+         return [
+             # Leading
+             "__identity",
+             "__source",
+             "__key",
+             "__timestamp",
+             "__valid_from",
+             "__valid_to",
+             "__is_current",
+             "__is_deleted",
+             # Trailing
+             "__operation",
+             "__metadata",
+             "__hash",
+             "__rescued_data",
+         ]
+
+     @property
+     def slowly_changing_dimension(self) -> bool:
+         return self.change_data_capture in ["scd1", "scd2"]
+
+     def get_src(self, src: AllowedSources) -> DataFrame:
+         if isinstance(src, DataFrameLike):
+             df = src
+         elif isinstance(src, Table):
+             df = self.table.dataframe
+         elif isinstance(src, str):
+             df = self.spark.sql(src)
+         else:
+             raise ValueError(f"{src} not allowed")
+
+         return df
+
+     def has_data(self, src: AllowedSources, **kwargs) -> bool:
+         DEFAULT_LOGGER.debug("check if has data", extra={"label": self})
+         df = self.get_src(src=src)
+         return not df.isEmpty()
+
+     def get_columns(
+         self,
+         src: AllowedSources,
+         backtick: Optional[bool] = True,
+         sort: Optional[bool] = True,
+         check: Optional[bool] = True,
+     ) -> List[str]:
+         if backtick:
+             backtick = True
+
+         df = self.get_src(src=src)
+         columns = df.columns
+
+         if check:
+             for c in columns:
+                 # avoid duplicate column issue in merge
+                 if c.startswith("__") and c in self.__columns:
+                     assert c in self.allowed_input__columns, f"{c} is not allowed"
+
+         if sort:
+             columns = self.sort_columns(columns)
+
+         if backtick:
+             return [f"`{c}`" for c in columns]
+         else:
+             return columns
+
+     def sort_columns(self, columns: List[str]) -> List[str]:
+         fields = [c for c in columns if not c.startswith("__")]
+
+         leading = self.allowed_ouput_leading__columns
+         trailing = self.allowed_output_trailing__columns
+
+         # move __hash to the front of the table to ensure statistics are present
+         if "__key" not in columns and "__hash" in columns:
+             leading = ["__hash" if c == "__key" else c for c in leading]
+             trailing = [c for c in trailing if c != "__hash"]
+
+         __leading = [c for c in leading if c in columns]
+         __trailing = [c for c in trailing if c in columns]
+
+         return __leading + fields + __trailing
+
+     def reorder_dataframe(self, df: DataFrame) -> DataFrame:
+         columns = self.sort_columns(df.columns)
+         columns = [f"`{c}`" for c in columns]
+         return df.select(columns)
+
+     @abstractmethod
+     def optimize_table(self): ...
+
+     @abstractmethod
+     def update_schema(self, src: AllowedSources, **kwargs): ...
+
+     @abstractmethod
+     def get_differences_with_deltatable(self, src: AllowedSources, **kwargs): ...
+
+     @abstractmethod
+     def overwrite_schema(self, src: AllowedSources): ...
+
+     def __str__(self):
+         return f"{self.table.qualified_name}"
fabricks/cdc/base/generator.py
@@ -0,0 +1,177 @@
+ from __future__ import annotations
+
+ from typing import Any, List, Optional, Sequence, Union, cast
+
+ from py4j.protocol import Py4JJavaError
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base._types import AllowedSources
+ from fabricks.cdc.base.configurator import Configurator
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore.table import SchemaDiff, Table
+ from fabricks.utils._types import DataFrameLike
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Generator(Configurator):
+     def drop(self):
+         self.table.drop()
+
+     def create_table(
+         self,
+         src: AllowedSources,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
+         **kwargs,
+     ):
+         kwargs["mode"] = "complete"
+         kwargs["slice"] = False
+         kwargs["rectify"] = False
+         kwargs["deduplicate"] = False
+
+         df = self.get_data(src, **kwargs)
+
+         if partitioning is True:
+             assert partition_by, "partitioning column(s) not found"
+
+         df = self.reorder_dataframe(df)
+
+         identity = False if identity is None else identity
+         liquid_clustering = False if liquid_clustering is None else liquid_clustering
+
+         self.table.create(
+             df=df,
+             partitioning=partitioning,
+             partition_by=partition_by,
+             identity=identity,
+             liquid_clustering=liquid_clustering,
+             cluster_by=cluster_by,
+             properties=properties,
+             masks=masks,
+             primary_key=primary_key,
+             foreign_keys=foreign_keys,
+             comments=comments,
+         )
+
+     def create_or_replace_view(self, src: Union[Table, str], schema_evolution: bool = True, **kwargs):
+         assert not isinstance(src, DataFrameLike), "dataframe not allowed"
+
+         assert kwargs["mode"] == "complete", f"{kwargs['mode']} not allowed"
+         sql = self.get_query(src, **kwargs)
+
+         df = self.spark.sql(sql)
+         df = self.reorder_dataframe(df)
+         columns = [f"`{c}`" for c in df.columns]
+
+         sql = f"""
+         create or replace view {self}
+         {"with schema evolution" if schema_evolution else "-- no schema evolution"}
+         as
+         with __view as (
+             {sql}
+         )
+         select
+             {",".join(columns)}
+         from __view
+         """
+         sql = fix_sql(sql)
+         DEFAULT_LOGGER.debug("create or replace view", extra={"label": self, "sql": sql})
+
+         try:
+             self.spark.sql(sql)
+         except Py4JJavaError as e:
+             DEFAULT_LOGGER.exception("fail to execute sql query", extra={"label": self, "sql": sql}, exc_info=e)
+
+     def optimize_table(self):
+         columns = None
+
+         if self.change_data_capture == "scd1":
+             columns = ["__key"]
+         elif self.change_data_capture == "scd2":
+             columns = ["__key", "__valid_from"]
+
+         self.table.optimize(columns=columns)
+
+     def get_differences_with_deltatable(self, src: AllowedSources, **kwargs) -> DataFrame:
+         from pyspark.sql.types import StringType, StructField, StructType
+
+         schema = StructType(
+             [
+                 StructField("column", StringType(), False),
+                 StructField("data_type", StringType(), True),
+                 StructField("new_column", StringType(), True),
+                 StructField("new_data_type", StringType(), True),
+                 StructField("status", StringType(), True),
+             ]
+         )
+
+         if self.is_view:
+             return self.spark.createDataFrame([], schema=schema)
+
+         else:
+             kwargs["mode"] = "complete"
+             if "slice" in kwargs:
+                 del kwargs["slice"]
+
+             df = self.get_data(src, **kwargs)
+             df = self.reorder_dataframe(df)
+
+             diffs = self.table.get_schema_differences(df)
+             return self.spark.createDataFrame([cast(Any, d.model_dump()) for d in diffs], schema=schema)
+
+     def get_schema_differences(self, src: AllowedSources, **kwargs) -> Optional[Sequence[SchemaDiff]]:
+         if self.is_view:
+             return None
+
+         else:
+             kwargs["mode"] = "complete"
+             if "slice" in kwargs:
+                 del kwargs["slice"]
+
+             df = self.get_data(src, **kwargs)
+             df = self.reorder_dataframe(df)
+
+             return self.table.get_schema_differences(df)
+
+     def schema_drifted(self, src: AllowedSources, **kwargs) -> Optional[bool]:
+         d = self.get_schema_differences(src, **kwargs)
+         if d is None:
+             return None
+         return len(d) > 0
+
+     def _update_schema(
+         self,
+         src: AllowedSources,
+         overwrite: bool = False,
+         widen_types: bool = False,
+         **kwargs,
+     ):
+         if self.is_view:
+             assert not isinstance(src, DataFrameLike), "dataframe not allowed"
+             self.create_or_replace_view(src=src)
+
+         else:
+             kwargs["mode"] = "complete"
+             if "slice" in kwargs:
+                 del kwargs["slice"]
+
+             df = self.get_data(src, **kwargs)
+             df = self.reorder_dataframe(df)
+             if overwrite:
+                 self.table.overwrite_schema(df)
+             else:
+                 self.table.update_schema(df, widen_types=widen_types)
+
+     def update_schema(self, src: AllowedSources, **kwargs):
+         self._update_schema(src=src, **kwargs)
+
+     def overwrite_schema(self, src: AllowedSources, **kwargs):
+         self._update_schema(src=src, overwrite=True, **kwargs)
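A rough usage sketch of the schema-drift helpers above: sync_schema is a hypothetical wrapper, not part of the published API, and cdc stands for any concrete Generator subclass (for example the scd1/scd2 implementations listed elsewhere in this wheel).

from typing import Optional

def sync_schema(cdc, src, overwrite: bool = False) -> Optional[bool]:
    """Hypothetical helper: reconcile the target with `src` using only the
    Generator methods shown above."""
    drifted = cdc.schema_drifted(src)  # None when the target is a view
    if drifted:
        if overwrite:
            cdc.overwrite_schema(src)  # replace the Delta schema outright
        else:
            cdc.update_schema(src)     # evolve the schema in place
    return drifted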
fabricks/cdc/base/merger.py
@@ -0,0 +1,110 @@
+ from __future__ import annotations
+
+ from typing import Optional, Union
+
+ from jinja2 import Environment, PackageLoader
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base._types import AllowedSources
+ from fabricks.cdc.base.processor import Processor
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils._types import DataFrameLike
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Merger(Processor):
+     def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
+         if isinstance(src, DataFrameLike):
+             format = "dataframe"
+             columns = self.get_columns(src, backtick=False, sort=False, check=False)  # already done in processor
+         elif isinstance(src, str):
+             format = "view"
+             columns = self.get_columns(
+                 f"select * from {src}", backtick=False, sort=False, check=False
+             )  # already done in processor
+         else:
+             raise ValueError(f"{src} not allowed")
+
+         assert "__merge_key" in columns, "__merge_key not found"
+         assert "__merge_condition" in columns, "__merge_condition not found"
+
+         keys = kwargs.get("keys")
+         if isinstance(keys, str):
+             keys = [keys]
+
+         columns = [c for c in columns if c not in ["__merge_condition", "__merge_key"]]
+         fields = [c for c in columns if not c.startswith("__")]
+         where = kwargs.get("update_where") if self.table.rows > 0 else None
+         soft_delete = "__is_deleted" in columns
+
+         has_source = "__source" in columns
+         has_key = "__key" in columns
+         has_metadata = "__metadata" in columns
+         has_hash = "__hash" in columns
+         has_timestamp = "__timestamp" in columns
+         has_identity = "__identity" in columns
+
+         # 'NoneType' object is not iterable
+         if keys:
+             keys = [f"`{k}`" for k in keys]
+         if columns:
+             columns = [f"`{c}`" for c in columns]
+         if fields:
+             fields = [f"`{c}`" for c in fields]
+
+         assert "__key" or keys, f"{self} - __key or keys not found"
+
+         return {
+             "src": src,
+             "format": format,
+             "tgt": self.table,
+             "cdc": self.change_data_capture,
+             "columns": columns,
+             "fields": fields,
+             "soft_delete": soft_delete,
+             "has_source": has_source,
+             "has_identity": has_identity,
+             "has_key": has_key,
+             "has_hash": has_hash,
+             "keys": keys,
+             "has_metadata": has_metadata,
+             "has_timestamp": has_timestamp,
+             "where": where,
+         }
+
+     def get_merge_query(self, src: Union[DataFrame, str], fix: Optional[bool] = True, **kwargs) -> str:
+         context = self.get_merge_context(src=src, **kwargs)
+         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+         merge = environment.get_template("merge.sql.jinja")
+
+         try:
+             sql = merge.render(**context)
+         except Exception as e:
+             DEFAULT_LOGGER.debug("context", extra={"label": self, "content": context})
+             raise e
+
+         if fix:
+             try:
+                 sql = sql.replace("{src}", "src")
+                 sql = fix_sql(sql)
+                 sql = sql.replace("`src`", "{src}")
+                 DEFAULT_LOGGER.debug("merge", extra={"label": self, "sql": sql})
+
+             except Exception as e:
+                 DEFAULT_LOGGER.exception("fail to clean sql query", extra={"label": self, "sql": sql})
+                 raise e
+
+         return sql
+
+     def merge(self, src: AllowedSources, **kwargs):
+         if not self.table.exists():
+             self.create_table(src, **kwargs)
+
+         df = self.get_data(src, **kwargs)
+         global_temp_view = f"{self.qualified_name}__merge"
+         view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False), job=self)
+
+         merge = self.get_merge_query(view, **kwargs)
+         DEFAULT_LOGGER.debug("exec merge", extra={"label": self, "sql": merge})
+         self.spark.sql(merge, src=view)
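Note on the {src} placeholder: get_merge_query deliberately restores it after the sqlglot clean-up because merge() executes the statement with self.spark.sql(merge, src=view), relying on SparkSession.sql string-formatting extra keyword arguments into {...} placeholders (available in recent PySpark releases). A minimal standalone sketch of that mechanism, with made-up names and assuming an active session:

from pyspark.sql import SparkSession

# Standalone illustration only; "demo__merge" and the query below are made up.
spark = SparkSession.builder.getOrCreate()
spark.range(3).createOrReplaceGlobalTempView("demo__merge")

# The {src} placeholder is filled in at execution time by the keyword argument,
# mirroring self.spark.sql(merge, src=view) in Merger.merge above.
query = "select count(*) as n from {src}"
spark.sql(query, src="global_temp.demo__merge").show()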