fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/metastore/table.py
@@ -0,0 +1,768 @@
+ import re
+ from typing import Any, List, Optional, Sequence, Union, overload
+
+ from delta import DeltaTable
+ from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import max
+ from pyspark.sql.types import StructType
+
+ from fabricks.context import SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.metastore._types import AddedColumn, ChangedColumn, DroppedColumn, SchemaDiff
+ from fabricks.metastore.dbobject import DbObject
+ from fabricks.utils.path import Path
+ from fabricks.utils.sqlglot import fix
+
+
+ class Table(DbObject):
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str, spark: Optional[SparkSession] = SPARK):
+         return cls(step, topic, item, spark=spark)
+
+     @property
+     def deltapath(self) -> Path:
+         return self.database.delta_path.joinpath("/".join(self.levels))
+
+     @property
+     def delta_path(self) -> Path:
+         return self.database.delta_path.joinpath("/".join(self.levels))
+
+     @property
+     def deltatable(self) -> DeltaTable:
+         return DeltaTable.forPath(self.spark, self.delta_path.string)
+
+     @property
+     def delta_table(self) -> DeltaTable:
+         return DeltaTable.forPath(self.spark, self.delta_path.string)
+
+     @property
+     def dataframe(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"select * from {self}")
+
+     @property
+     def columns(self) -> List[str]:
+         assert self.registered, f"{self} not registered"
+
+         return self.dataframe.columns
+
+     @property
+     def rows(self) -> int:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"select count(*) from {self}").collect()[0][0]
+
+     @property
+     def last_version(self) -> int:
+         assert self.registered, f"{self} not registered"
+
+         df = self.describe_history()
+         version = df.select(max("version")).collect()[0][0]
+         return version
+
+     @property
+     def identity_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+         return self.get_property("delta.feature.identityColumns") == "supported"
+
+     @property
+     def type_widening_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+         return self.get_property("delta.enableTypeWidening") == "true"
+
+     @property
+     def liquid_clustering_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+         return self.get_property("delta.feature.clustering") == "supported"
+
+     @property
+     def auto_liquid_clustering_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+         return self.get_property("delta.clusterByAuto") == "true"
+
+     @property
+     def vorder_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+         return self.get_property("delta.parquet.vorder.enabled") == "true"
+
+     def drop(self):
+         super().drop()
+         if self.delta_path.exists():
+             DEFAULT_LOGGER.debug("delete delta folder", extra={"label": self})
+             self.delta_path.rm()
+
+     @overload
+     def create(
+         self,
+         df: DataFrame,
+         *,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
+     ): ...
+
+     @overload
+     def create(
+         self,
+         *,
+         schema: StructType,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
+     ): ...
+
+     def create(
+         self,
+         df: Optional[DataFrame] = None,
+         schema: Optional[StructType] = None,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
+     ):
+         self._create(
+             df=df,
+             schema=schema,
+             partitioning=partitioning,
+             partition_by=partition_by,
+             identity=identity,
+             liquid_clustering=liquid_clustering,
+             cluster_by=cluster_by,
+             properties=properties,
+             masks=masks,
+             primary_key=primary_key,
+             foreign_keys=foreign_keys,
+             comments=comments,
+         )
+
+     def _get_ddl_columns(
+         self, df: DataFrame, masks: Optional[dict[str, str]], comments: Optional[dict[str, str]]
+     ) -> List[str]:
+         def _backtick(name: str, dtype: str) -> str:
+             j = df.schema[name].jsonValue()
+             r = re.compile(r"(?<='name': ')[^']+(?=',)")
+
+             names = re.findall(r, str(j))
+             for n in names:
+                 escaped = re.escape(n)
+                 dtype = re.sub(f"(?<=,){escaped}(?=:)|(?<=<){escaped}(?=:)", f"`{n}`", dtype)
+
+             return dtype
+
+         out = []
+
+         for name, dtype in df.dtypes:
+             col = [f"`{name}`", _backtick(name, dtype)]
+
+             if comments and name in comments:
+                 col.append(f"comment '{comments[name]}'")
+
+             if masks and name in masks:
+                 col.append(f"mask {masks[name]}")
+
+             out.append(" ".join(col))
+
+         return out
+
+     def _create(
+         self,
+         df: Optional[DataFrame] = None,
+         schema: Optional[StructType] = None,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         masks: Optional[dict[str, str]] = None,
+         primary_key: Optional[dict[str, Any]] = None,
+         foreign_keys: Optional[dict[str, Any]] = None,
+         comments: Optional[dict[str, str]] = None,
+     ):
+         DEFAULT_LOGGER.info("create table", extra={"label": self})
+         if not df:
+             assert schema is not None
+             df = self.spark.createDataFrame([], schema)
+
+         ddl_columns = ",\n\t".join(self._get_ddl_columns(df, masks=masks, comments=comments))
+         ddl_identity = "-- no identity" if "__identity" not in df.columns else ""
+         ddl_cluster_by = "-- no cluster by"
+         ddl_partition_by = "-- no partitioned by"
+         ddl_tblproperties = "-- no tblproperties"
+         ddl_primary_key = "-- no primary key"
+         ddl_foreign_keys = "-- no foreign keys"
+
+         if liquid_clustering:
+             if cluster_by:
+                 if isinstance(cluster_by, str):
+                     cluster_by = [cluster_by]
+                 cluster_by = [f"`{c}`" for c in cluster_by]
+                 ddl_cluster_by = "cluster by (" + ", ".join(cluster_by) + ")"
+
+             else:
+                 ddl_cluster_by = "cluster by auto"
+
+         if partitioning:
+             assert partition_by
+             if isinstance(partition_by, str):
+                 partition_by = [partition_by]
+             partition_by = [f"`{p}`" for p in partition_by]
+             ddl_partition_by = "partitioned by (" + ", ".join(partition_by) + ")"
+
+         if identity:
+             ddl_identity = "__identity bigint generated by default as identity (start with 1 increment by 1), "
+
+         if primary_key:
+             assert len(primary_key) == 1, "only one primary key allowed"
+
+             for key, value in primary_key.items():
+                 keys = value["keys"]
+                 if isinstance(keys, str):
+                     keys = [keys]
+                 ddl_primary_key = f", constraint {key} primary key (" + ", ".join(keys) + ")"
+
+         if foreign_keys:
+             fks = []
+
+             for key, value in foreign_keys.items():
+                 reference = value["reference"]
+                 keys = value["keys"]
+                 if isinstance(keys, str):
+                     keys = [keys]
+                 keys = ", ".join([f"`{k}`" for k in keys])
+                 fk = f"constraint {key} foreign key ({keys}) references {reference}"
+                 fks.append(fk)
+
+             ddl_foreign_keys = "," + ", ".join(fks)
+
+         if not properties:
+             special_char = False
+
+             for c in df.columns:
+                 match = re.search(r"[^a-zA-Z0-9_]", c)
+                 if match:
+                     special_char = True
+                     break
+
+             if special_char:
+                 properties = {
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                 }
+
+         if properties:
+             ddl_tblproperties = (
+                 "tblproperties (" + ",".join(f"'{key}' = '{value}'" for key, value in properties.items()) + ")"
+             )
+
+         sql = f"""
+         create table if not exists {self.qualified_name}
+         (
+             {ddl_identity}
+             {ddl_columns}
+             {ddl_foreign_keys}
+             {ddl_primary_key}
+         )
+         {ddl_tblproperties}
+         {ddl_partition_by}
+         {ddl_cluster_by}
+         location '{self.delta_path}'
+         """
+         try:
+             sql = fix(sql)
+         except Exception:
+             pass
+
+         DEFAULT_LOGGER.debug("ddl", extra={"label": self, "sql": sql})
+         self.spark.sql(sql)
+
+     @property
+     def is_deltatable(self) -> bool:
+         return DeltaTable.isDeltaTable(self.spark, str(self.delta_path))
+
+     @property
+     def column_mapping_enabled(self) -> bool:
+         assert self.registered, f"{self} not registered"
+
+         return self.get_property("delta.columnMapping.mode") == "name"
+
+     def exists(self) -> bool:
+         return self.is_deltatable and self.registered
+
+     def register(self):
+         DEFAULT_LOGGER.debug("register table", extra={"label": self})
+         self.spark.sql(f"create table if not exists {self.qualified_name} using delta location '{self.delta_path}'")
+
+     def restore_to_version(self, version: int):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.info(f"restore table to version {version}", extra={"label": self})
+         self.spark.sql(f"restore table {self.qualified_name} to version as of {version}")
+
+     def truncate(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.warning("truncate table", extra={"label": self})
+         self.create_restore_point()
+         self.spark.sql(f"truncate table {self.qualified_name}")
+
+     def schema_drifted(self, df: DataFrame, exclude_columns_with_prefix: Optional[str] = None) -> bool:
+         assert self.registered, f"{self} not registered"
+
+         diffs = self.get_schema_differences(df)
+         return len(diffs) > 0
+
+     def get_schema_differences(self, df: DataFrame) -> Sequence[SchemaDiff]:
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("get schema differences", extra={"label": self, "df": df})
+
+         df1 = self.dataframe
+         if self.identity_enabled:
+             if "__identity" in df1.columns:
+                 df1 = df1.drop("__identity")
+
+         all_columns = set(df1.columns).union(set(df.columns))
+
+         df1_dict = {name: dtype for name, dtype in df1.dtypes}
+         df2_dict = {name: dtype for name, dtype in df.dtypes}
+
+         diffs: list[SchemaDiff] = []
+
+         for c in all_columns:
+             old_datatype = df1_dict.get(c)
+             new_datatype = df2_dict.get(c)
+
+             if old_datatype is None and new_datatype is not None:
+                 diffs.append(AddedColumn(new_column=c, new_data_type=new_datatype))
+
+             elif old_datatype is not None and new_datatype is None:
+                 diffs.append(DroppedColumn(column=c, data_type=old_datatype))
+
+             elif old_datatype != new_datatype:
+                 assert old_datatype is not None
+                 assert new_datatype is not None
+                 diffs.append(
+                     ChangedColumn(
+                         column=c,
+                         data_type=old_datatype,
+                         new_data_type=new_datatype,
+                     )
+                 )
+
+         if diffs:
+             DEFAULT_LOGGER.warning("difference(s) with delta table", extra={"label": self, "df": df})
+
+         return diffs
+
+     def update_schema(self, df: DataFrame, widen_types: bool = False):
+         assert self.registered, f"{self} not registered"
+         if not self.column_mapping_enabled:
+             self.enable_column_mapping()
+
+         diffs = self.get_schema_differences(df)
+         if widen_types:
+             diffs = [d for d in diffs if d.type_widening_compatible]
+             msg = "update schema (type widening only)"
+         else:
+             diffs = [d for d in diffs if d.status in ("added", "changed")]
+             msg = "update schema"
+
+         if diffs:
+             DEFAULT_LOGGER.info(msg, extra={"label": self, "df": diffs})
+
+         for row in diffs:
+             if row.status == "changed":
+                 data_type = f"{row.data_type} -> {row.new_data_type}"
+             else:
+                 data_type = f"{row.new_data_type}"
+
+             DEFAULT_LOGGER.debug(
+                 f"{row.status.replace('ed', 'ing')} {row.column} ({data_type})",
+                 extra={"label": self},
+             )
+
+             try:
+                 # https://docs.databricks.com/aws/en/delta/type-widening#widen-types-with-automatic-schema-evolution
+                 # The type change is not one of byte, short, int, or long to decimal or double.
+                 # These type changes can only be applied manually using ALTER TABLE to avoid accidental promotion of integers to decimals.
+                 if row.data_type in ["byte", "short", "int", "long"] and row.new_data_type in [
+                     "decimal",
+                     "double",
+                 ]:
+                     self.change_column(row.column, row.new_data_type)
+
+                 else:
+                     update_df = df.select(row.column).where("1 == 2")
+                     (
+                         self.deltatable.alias("dt")
+                         .merge(update_df.alias("df"), "1 == 2")
+                         .withSchemaEvolution()  # type: ignore
+                         .whenMatchedUpdateAll()
+                         .whenNotMatchedInsertAll()
+                         .execute()
+                     )
+             except Exception:
+                 pass
+
+     def overwrite_schema(self, df: DataFrame):
+         assert self.registered, f"{self} not registered"
+         if not self.column_mapping_enabled:
+             self.enable_column_mapping()
+
+         diffs = self.get_schema_differences(df)
+
+         if diffs:
+             self.update_schema(df)
+
+         diffs = self.get_schema_differences(df)
+         if diffs:
+             DEFAULT_LOGGER.warning("overwrite schema", extra={"label": self, "df": diffs})
+
+         for row in diffs:
+             if row.status == "added":
+                 assert row.new_data_type is not None, "new_data_type must be defined for added columns"
+                 self.add_column(row.column, row.new_data_type)
+
+             elif row.status == "dropped":
+                 self.drop_column(row.column)
+
+             elif row.status == "changed":
+                 assert row.new_data_type is not None, "new_data_type must be defined for changed columns"
+
+                 try:
+                     self.change_column(row.column, row.new_data_type)
+                 except AnalysisException:
+                     self.drop_column(row.column)
+                     self.add_column(row.column, row.new_data_type)
+
+     def vacuum(self, retention_days: int = 7):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug(f"vacuum table (removing files older than {retention_days} days)", extra={"label": self})
+         self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = False")
+         try:
+             self.create_restore_point()
+             retention_hours = retention_days * 24
+             self.deltatable.vacuum(retention_hours)
+         finally:
+             # always re-enable the retention duration check
+             self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = True")
+
+     def optimize(self, columns: Optional[Union[str, List[str]]] = None):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.info("optimize", extra={"label": self})
+
+         if self.liquid_clustering_enabled:
+             self.spark.sql(f"optimize {self.qualified_name}")
+
+         elif self.auto_liquid_clustering_enabled:
+             self.spark.sql(f"optimize {self.qualified_name}")
+
+         elif columns is None:
+             if self.vorder_enabled:
+                 DEFAULT_LOGGER.debug("vorder", extra={"label": self})
+                 self.spark.sql(f"optimize {self.qualified_name} vorder")
+             else:
+                 self.spark.sql(f"optimize {self.qualified_name}")
+
+         else:
+             if isinstance(columns, str):
+                 columns = [columns]
+             columns = [f"`{c}`" for c in columns]
+             cols = ", ".join(columns)
+
+             if self.vorder_enabled:
+                 DEFAULT_LOGGER.debug(f"zorder by {cols} vorder", extra={"label": self})
+                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols}) vorder")
+
+             else:
+                 DEFAULT_LOGGER.debug(f"zorder by {cols}", extra={"label": self})
+                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols})")
+
+     def analyze(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("analyze", extra={"label": self})
+         self.compute_statistics()
+         self.compute_delta_statistics()
+
+     def compute_statistics(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("compute statistics", extra={"label": self})
+         cols = [
+             f"`{name}`"
+             for name, dtype in self.dataframe.dtypes
+             if not dtype.startswith("struct")
+             and not dtype.startswith("array")
+             and not dtype.startswith("variant")
+             and not dtype.startswith("map")
+             and name not in ["__metadata"]
+         ]
+         cols = ", ".join(sorted(cols))
+         self.spark.sql(f"analyze table {self.qualified_name} compute statistics for columns {cols}")
+
+     def compute_delta_statistics(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("compute delta statistics", extra={"label": self})
+         self.spark.sql(f"analyze table {self.qualified_name} compute delta statistics")
+
+     def drop_column(self, name: str):
+         assert self.registered, f"{self} not registered"
+         assert self.column_mapping_enabled, "column mapping not enabled"
+
+         DEFAULT_LOGGER.warning(f"drop column {name}", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             drop column `{name}`
+             """
+         )
+
+     def change_column(self, name: str, type: str):
+         assert self.registered, f"{self} not registered"
+         assert self.column_mapping_enabled, "column mapping not enabled"
+
+         DEFAULT_LOGGER.info(f"change column {name} ({type})", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             change column `{name}` `{name}` {type}
+             """
+         )
+
+     def rename_column(self, old: str, new: str):
+         assert self.registered, f"{self} not registered"
+         assert self.column_mapping_enabled, "column mapping not enabled"
+
+         DEFAULT_LOGGER.info(f"rename column {old} -> {new}", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             rename column `{old}` to `{new}`
+             """
+         )
+
+     def get_column_data_type(self, name: str) -> str:
+         data_type = self.get_description().where(f"col_name == '{name}'").select("data_type").collect()[0][0]
+         return data_type
+
+     def get_details(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"describe detail {self.qualified_name}")
+
+     def get_properties(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"show tblproperties {self.qualified_name}")
+
+     def get_description(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"describe extended {self.qualified_name}")
+
+     def get_history(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         df = self.spark.sql(f"describe history {self.qualified_name}")
+         return df
+
+     def get_last_version(self) -> int:
+         assert self.registered, f"{self} not registered"
+
+         df = self.get_history()
+         version = df.select(max("version")).collect()[0][0]
+         return version
+
+     def get_property(self, key: str) -> Optional[str]:
+         assert self.registered, f"{self} not registered"
+
+         try:
+             value = self.get_properties().where(f"key == '{key}'").select("value").collect()[0][0]
+             return value
+
+         except IndexError:
+             return None
+
+     def enable_change_data_feed(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("enable change data feed", extra={"label": self})
+         self.set_property("delta.enableChangeDataFeed", "true")
+
+     def enable_column_mapping(self):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug("enable column mapping", extra={"label": self})
+
+         try:
+             self.spark.sql(
+                 f"""
+                 alter table {self.qualified_name}
+                 set tblproperties ('delta.columnMapping.mode' = 'name')
+                 """
+             )
+
+         except Exception:
+             DEFAULT_LOGGER.debug("update reader and writer version", extra={"label": self})
+             self.spark.sql(
+                 f"""
+                 alter table {self.qualified_name}
+                 set tblproperties (
+                     'delta.columnMapping.mode' = 'name',
+                     'delta.minReaderVersion' = '2',
+                     'delta.minWriterVersion' = '5'
+                 )
+                 """
+             )
+
+     def set_property(self, key: Union[str, int], value: Union[str, int]):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug(f"set property {key} = {value}", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             set tblproperties ({key} = '{value}')
+             """
+         )
+
+     def add_constraint(self, name: str, expr: str):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug(f"add constraint ({name} check ({expr}))", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add constraint {name} check ({expr});
+             """
+         )
+
+     def add_comment(self, comment: str):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.debug(f"add comment '{comment}'", extra={"label": self})
+         self.spark.sql(
+             f"""
+             comment on table {self.qualified_name}
+             is '{comment}';
+             """
+         )
+
+     def add_materialized_column(self, name: str, expr: str, type: str):
+         assert self.registered, f"{self} not registered"
+         assert self.column_mapping_enabled, "column mapping not enabled"
+
+         DEFAULT_LOGGER.info(f"add materialized column ({name} {type})", extra={"label": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add columns (`{name}` {type} materialized {expr})
+             """
+         )
+
+     def add_column(self, name: str, type: str, after: Optional[str] = None):
+         assert self.registered, f"{self} not registered"
+
+         DEFAULT_LOGGER.info(f"add column {name} ({type})", extra={"label": self})
+         ddl_after = "" if not after else f"after {after}"
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add columns (`{name}` {type} {ddl_after})
+             """
+         )
+
+     def create_bloomfilter_index(self, columns: Union[str, List[str]]):
+         assert self.registered, f"{self} not registered"
+
+         if isinstance(columns, str):
+             columns = [columns]
+         columns = [f"`{c}`" for c in columns]
+         cols = ", ".join(columns)
+
+         DEFAULT_LOGGER.info(f"bloomfilter by {cols}", extra={"label": self})
+         self.spark.sql(
+             f"""
+             create bloomfilter index on table {self.qualified_name}
+             for columns ({cols})
+             """
+         )
+
+     def create_restore_point(self):
+         assert self.registered, f"{self} not registered"
+
+         last_version = self.get_last_version() + 1
+         self.set_property("fabricks.last_version", last_version)
+
+     def show_properties(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"show tblproperties {self.qualified_name}")
+
+     def describe_detail(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"describe detail {self.qualified_name}")
+
+     def describe_extended(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         return self.spark.sql(f"describe extended {self.qualified_name}")
+
+     def describe_history(self) -> DataFrame:
+         assert self.registered, f"{self} not registered"
+
+         df = self.spark.sql(f"describe history {self.qualified_name}")
+         return df
+
+     def enable_liquid_clustering(self, columns: Optional[Union[str, List[str]]] = None, auto: Optional[bool] = False):
+         assert self.registered, f"{self} not registered"
+
+         if auto:
+             DEFAULT_LOGGER.info("cluster by auto", extra={"label": self})
+             self.spark.sql(f"alter table {self.qualified_name} cluster by automatic")
+
+         else:
+             assert columns, "at least one clustering column must be specified"
+
+             if isinstance(columns, str):
+                 columns = [columns]
+             columns = [f"`{c}`" for c in columns]
+             cols = ", ".join(columns)
+
+             DEFAULT_LOGGER.info(f"cluster by {cols}", extra={"label": self})
+             self.spark.sql(
+                 f"""
+                 alter table {self.qualified_name}
+                 cluster by ({cols})
+                 """
+             )
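
Below is a minimal usage sketch of the Table class added in this file. It is illustrative only: the step/topic/item names and the schema are invented, and a Databricks runtime with a Fabricks-configured Spark session and Delta storage path is assumed.

# Hypothetical example; "silver"/"sales"/"orders" and the schema are assumptions.
from pyspark.sql.types import LongType, StringType, StructField, StructType

from fabricks.metastore.table import Table

table = Table.from_step_topic_item("silver", "sales", "orders")

# Create the Delta table from an explicit schema if it does not exist yet.
if not table.exists():
    schema = StructType(
        [
            StructField("order_id", LongType()),
            StructField("status", StringType()),
        ]
    )
    table.create(schema=schema, identity=True, liquid_clustering=True, cluster_by="order_id")

# Routine maintenance helpers exposed by the class.
table.optimize()
table.vacuum(retention_days=7)
print(table.rows, table.get_last_version())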