fabricks 2024.7.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/metastore/relational.py
@@ -0,0 +1,61 @@
+ from typing import Optional
+
+ from databricks.sdk.runtime import dbutils as _dbutils
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.sql import SparkSession
+
+ from fabricks.context.log import Logger
+ from fabricks.metastore.database import Database
+
+
+ class Relational:
+     def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
+         self.database = Database(database)
+         self.levels = levels
+         if spark is None:
+             spark = _spark
+         assert spark is not None
+         self.spark: SparkSession = spark
+         self.dbutils = _dbutils
+
+     @property
+     def name(self) -> str:
+         return "_".join(self.levels)
+
+     @property
+     def qualified_name(self) -> str:
+         return f"{self.database.name}.{self.name}"
+
+     def registered(self):
+         try:
+             df = self.spark.sql(f"show tables in {self.database}").where(f"tableName == '{self.name}'")
+             return not df.isEmpty()
+         # not found
+         except AnalysisException:
+             return False
+
+     def is_view(self):
+         try:
+             df = self.spark.sql(f"show views in {self.database}").where(f"viewName == '{self.name}'")
+             return not df.isEmpty()
+         # not found
+         except AnalysisException:
+             return False
+
+     def is_table(self):
+         if self.is_view():
+             return False
+         else:
+             return self.registered()
+
+     def drop(self):
+         if self.is_view():
+             Logger.warning("drop view from metastore", extra={"job": self})
+             self.spark.sql(f"drop view if exists {self}")
+         elif self.is_table():
+             Logger.warning("drop table from metastore", extra={"job": self})
+             self.spark.sql(f"drop table if exists {self}")
+
+     def __str__(self):
+         return self.qualified_name
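
For orientation, here is a minimal usage sketch of the `Relational` base class above. It assumes a Databricks runtime where the fabricks context is configured; the database and level names (`bronze`, `sales`, `orders`) are hypothetical.

```python
# Sketch only: hypothetical names; requires a Databricks runtime with the fabricks context configured.
from fabricks.metastore.relational import Relational

rel = Relational("bronze", "sales", "orders")

print(rel.name)            # "sales_orders"        -> levels joined with "_"
print(rel.qualified_name)  # "bronze.sales_orders" -> database name + "." + name
print(str(rel))            # __str__ returns the qualified name

if rel.registered():  # backed by "show tables in bronze" filtered on tableName
    rel.drop()        # drops the view or the table, whichever is registered
```

Note that `drop()` dispatches on `is_view()` / `is_table()`, so the same call works for either object type.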
fabricks/metastore/table.py
@@ -0,0 +1,529 @@
+ import re
+ from typing import List, Optional, Union, overload
+
+ from databricks.sdk.runtime import spark
+ from delta import DeltaTable
+ from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import expr, max
+ from pyspark.sql.types import StructType
+ from typing_extensions import deprecated
+
+ from fabricks.context.log import Logger
+ from fabricks.metastore.relational import Relational
+ from fabricks.utils.path import Path
+ from fabricks.utils.sqlglot import fix
+
+
+ class Table(Relational):
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str, spark: Optional[SparkSession] = spark):
+         return cls(step, topic, item, spark=spark)
+
+     @property
+     @deprecated("use delta_path instead")
+     def deltapath(self) -> Path:
+         return self.database.deltapath.join("/".join(self.levels))
+
+     @property
+     def delta_path(self) -> Path:
+         return self.database.deltapath.join("/".join(self.levels))
+
+     @property
+     def deltatable(self) -> DeltaTable:
+         return DeltaTable.forPath(self.spark, self.deltapath.string)
+
+     @property
+     def delta_table(self) -> DeltaTable:
+         return DeltaTable.forPath(self.spark, self.deltapath.string)
+
+     @property
+     def dataframe(self) -> DataFrame:
+         return self.spark.sql(f"select * from {self}")
+
+     @property
+     def columns(self) -> List[str]:
+         return self.dataframe.columns
+
+     @property
+     def rows(self) -> int:
+         return self.spark.sql(f"select count(*) from {self}").collect()[0][0]
+
+     @property
+     def last_version(self) -> int:
+         df = self.describe_history()
+         version = df.select(max("version")).collect()[0][0]
+         return version
+
+     def drop(self):
+         super().drop()
+         if self.deltapath.exists():
+             Logger.debug("delete delta folder", extra={"job": self})
+             self.deltapath.rm()
+
+     @overload
+     def create(
+         self,
+         df: DataFrame,
+         *,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+     ): ...
+
+     @overload
+     def create(
+         self,
+         *,
+         schema: StructType,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+     ): ...
+
+     def create(
+         self,
+         df: Optional[DataFrame] = None,
+         schema: Optional[StructType] = None,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+     ):
+         self._create(
+             df=df,
+             schema=schema,
+             partitioning=partitioning,
+             partition_by=partition_by,
+             identity=identity,
+             liquid_clustering=liquid_clustering,
+             cluster_by=cluster_by,
+             properties=properties,
+         )
+
+     def _create(
+         self,
+         df: Optional[DataFrame] = None,
+         schema: Optional[StructType] = None,
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+     ):
+         Logger.info("create table", extra={"job": self})
+         if not df:
+             assert schema is not None
+             df = self.spark.createDataFrame([], schema)
+
+         def _backtick(name: str, dtype: str) -> str:
+             j = df.schema[name].jsonValue()
+             r = re.compile(r"(?<='name': ')[^']+(?=',)")
+             names = re.findall(r, str(j))
+             for n in names:
+                 escaped = re.escape(n)
+                 dtype = re.sub(f"(?<=,){escaped}(?=:)|(?<=<){escaped}(?=:)", f"`{n}`", dtype)
+             return dtype
+
+         ddl_columns = ",\n\t".join([f"`{name}` {_backtick(name, dtype)}" for name, dtype in df.dtypes])
+         ddl_identity = "-- no identity" if "__identity" not in df.columns else ""
+         ddl_cluster_by = "-- no cluster by"
+         ddl_partition_by = "-- no partitioned by"
+         ddl_tblproperties = "-- no tblproperties"
+
+         if liquid_clustering:
+             assert cluster_by
+             if isinstance(cluster_by, str):
+                 cluster_by = [cluster_by]
+             cluster_by = [f"`{c}`" for c in cluster_by]
+             ddl_cluster_by = "cluster by (" + ", ".join(cluster_by) + ")"
+         if partitioning:
+             assert partition_by
+             if isinstance(partition_by, str):
+                 partition_by = [partition_by]
+             partition_by = [f"`{p}`" for p in partition_by]
+             ddl_partition_by = "partitioned by (" + ", ".join(partition_by) + ")"
+
+         if identity:
+             ddl_identity = "__identity bigint generated by default as identity (start with 1 increment by 1), "
+
+         if not properties:
+             special_char = False
+             for c in df.columns:
+                 match = re.search(r"[^a-zA-Z0-9_]", c)
+                 if match:
+                     special_char = True
+                     break
+             if special_char:
+                 properties = {
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                 }
+         if properties:
+             ddl_tblproperties = (
+                 "tblproperties (" + ",".join(f"'{key}' = '{value}'" for key, value in properties.items()) + ")"
+             )
+
+         sql = f"""
+         create table if not exists {self.qualified_name}
+         (
+             {ddl_identity}
+             {ddl_columns}
+         )
+         {ddl_tblproperties}
+         {ddl_partition_by}
+         {ddl_cluster_by}
+         location '{self.deltapath}'
+         """
+         try:
+             sql = fix(sql)
+         except Exception:
+             pass
+         Logger.debug("ddl", extra={"job": self, "sql": sql})
+         self.spark.sql(sql)
+
+     def is_deltatable(self) -> bool:
+         return DeltaTable.isDeltaTable(self.spark, str(self.deltapath))
+
+     def column_mapping_enabled(self) -> bool:
+         return self.get_property("delta.columnMapping.mode") == "name"
+
+     def exists(self) -> bool:
+         return self.is_deltatable() and self.registered()
+
+     def register(self):
+         Logger.debug("register table", extra={"job": self})
+         self.spark.sql(f"create table if not exists {self.qualified_name} using delta location '{self.deltapath}'")
+
+     def restore_to_version(self, version: int):
+         Logger.info(f"restore table to version {version}", extra={"job": self})
+         self.spark.sql(f"restore table {self.qualified_name} to version as of {version}")
+
+     def truncate(self):
+         Logger.warning("truncate table", extra={"job": self})
+         self.create_restore_point()
+         self.spark.sql(f"truncate table {self.qualified_name}")
+
+     def schema_drifted(self, df: DataFrame) -> bool:
+         return not self._check_schema_drift(df).isEmpty()
+
+     def _check_schema_drift(self, df: DataFrame) -> DataFrame:
+         Logger.debug("check schema drift", extra={"job": self})
+
+         new_df = self.spark.createDataFrame(df.dtypes, ["new_name", "new_type"])  # type: ignore
+         new_df = new_df.filter(~new_df.new_name.startswith("__"))
+
+         old_df = self.spark.createDataFrame(self.dataframe.dtypes, ["old_name", "old_type"])  # type: ignore
+         old_df = old_df.filter(~old_df.old_name.startswith("__"))
+
+         cond = [new_df["new_name"] == old_df["old_name"]]
+         df_diff = (
+             new_df.join(old_df, on=cond, how="outer")
+             .where(
+                 """
+                 coalesce(old_name, -1) <> coalesce(new_name, -1)
+                 or
+                 coalesce(old_type, -1) <> coalesce(new_type, -1)
+                 """
+             )
+             .withColumn(
+                 "operation",
+                 expr("if(new_name is null, 'drop', if(old_name is null, 'add', 'update'))"),
+             )
+             .withColumn("column", expr("coalesce(new_name, old_name)"))
+         )
+         return df_diff
+
+     def _fix_schema(self, df: DataFrame, overwrite: bool = False):
+         drift_df = self._check_schema_drift(df)
+
+         if not drift_df.isEmpty():
+             Logger.info("update table", extra={"job": self})
+             todo_df = drift_df.where("operation in ('add', 'update')")
+             if not todo_df.isEmpty():
+                 for row in todo_df.collect():
+                     if row.operation == "add":
+                         Logger.debug(f"add column {row.column}", extra={"job": self})
+                     else:
+                         Logger.debug(
+                             f"update column {row.column} ({row.old_type} -> {row.new_type})",
+                             extra={"job": self},
+                         )
+
+                     try:
+                         col_df = df.select(row.column).where("1 == 2")
+                         (
+                             self.deltatable.alias("dt")
+                             .merge(col_df.alias("df"), "1 == 2")
+                             .whenNotMatchedInsertAll()
+                             .execute()
+                         )
+                     except Exception:
+                         pass
+
+             if overwrite:
+                 drift_df = self._check_schema_drift(df)
+                 Logger.warning("overwrite table", extra={"job": self})
+                 for row in drift_df.collect():
+                     if row.operation == "add":
+                         self.add_column(row.column, row.new_type)
+                     elif row.operation == "drop":
+                         self.drop_column(row.column)
+                     elif row.operation == "update":
+                         try:
+                             self.change_column(row.column, row.new_type)
+                         except AnalysisException:
+                             self.drop_column(row.column)
+                             self.add_column(row.column, row.new_type)
+                     else:
+                         raise ValueError(f"{row.operation} not allowed")
+
+     def update_schema(self, df: DataFrame):
+         self._fix_schema(df, overwrite=False)
+
+     def overwrite_schema(self, df: DataFrame):
+         self._fix_schema(df, overwrite=True)
+
+     def vacuum(self, retention_days: int = 7):
+         Logger.debug(f"vacuum table (removing files older than {retention_days} days)", extra={"job": self})
+         self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = False")
+         try:
+             self.create_restore_point()
+             retention_hours = retention_days * 24
+             self.deltatable.vacuum(retention_hours)
+         finally:
+             # always re-enable the retention duration check
+             self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = True")
+
+     def optimize(
+         self,
+         columns: Optional[Union[str, List[str]]] = None,
+         vorder: Optional[bool] = False,
+     ):
+         Logger.info("optimize", extra={"job": self})
+
+         zorder_by = columns is not None
+         if zorder_by:
+             if isinstance(columns, str):
+                 columns = [columns]
+             columns = [f"`{c}`" for c in columns]
+             cols = ", ".join(columns)
+
+             if vorder:
+                 Logger.debug(f"zorder by {cols} vorder", extra={"job": self})
+                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols}) vorder")
+             else:
+                 Logger.debug(f"zorder by {cols}", extra={"job": self})
+                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols})")
+
+         elif vorder:
+             Logger.debug("vorder", extra={"job": self})
+             self.spark.sql(f"optimize {self.qualified_name} vorder")
+
+         else:
+             Logger.debug("optimize", extra={"job": self})
+             self.spark.sql(f"optimize {self.qualified_name}")
+
+     def analyze(self):
+         Logger.debug("analyze", extra={"job": self})
+         self.compute_statistics()
+         self.compute_delta_statistics()
+
+     def compute_statistics(self):
+         Logger.debug("compute statistics", extra={"job": self})
+         cols = [
+             f"`{name}`"
+             for name, dtype in self.dataframe.dtypes
+             if not dtype.startswith("struct") and not dtype.startswith("array") and name not in ["__metadata"]
+         ]
+         cols = ", ".join(sorted(cols))
+         self.spark.sql(f"analyze table delta.`{self.deltapath}` compute statistics for columns {cols}")
+
+     def compute_delta_statistics(self):
+         Logger.debug("compute delta statistics", extra={"job": self})
+         self.spark.sql(f"analyze table delta.`{self.deltapath}` compute delta statistics")
+
+     def drop_column(self, name: str):
+         assert self.column_mapping_enabled(), "column mapping not enabled"
+         Logger.warning(f"drop column {name}", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             drop column `{name}`
+             """
+         )
+
+     def change_column(self, name: str, type: str):
+         assert self.column_mapping_enabled(), "column mapping not enabled"
+         Logger.info(f"change column {name} ({type})", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             change column `{name}` `{name}` {type}
+             """
+         )
+
+     def rename_column(self, old: str, new: str):
+         assert self.column_mapping_enabled(), "column mapping not enabled"
+         Logger.info(f"rename column {old} -> {new}", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             rename column `{old}` to `{new}`
+             """
+         )
+
+     def get_details(self) -> DataFrame:
+         return self.spark.sql(f"describe detail {self.qualified_name}")
+
+     def get_properties(self) -> DataFrame:
+         return self.spark.sql(f"show tblproperties {self.qualified_name}")
+
+     def get_description(self) -> DataFrame:
+         return self.spark.sql(f"describe extended {self.qualified_name}")
+
+     def get_history(self) -> DataFrame:
+         df = self.spark.sql(f"describe history {self.qualified_name}")
+         return df
+
+     def get_last_version(self) -> int:
+         df = self.get_history()
+         version = df.select(max("version")).collect()[0][0]
+         return version
+
+     def get_property(self, key: str) -> Optional[str]:
+         try:
+             value = self.get_properties().where(f"key == '{key}'").select("value").collect()[0][0]
+             return value
+
+         except IndexError:
+             return None
+
+     def enable_change_data_feed(self):
+         Logger.debug("enable change data feed", extra={"job": self})
+         self.set_property("delta.enableChangeDataFeed", "true")
+
+     def enable_column_mapping(self):
+         Logger.debug("enable column mapping", extra={"job": self})
+         try:
+             self.spark.sql(
+                 f"""
+                 alter table {self.qualified_name}
+                 set tblproperties ('delta.columnMapping.mode' = 'name')
+                 """
+             )
+         except Exception:
+             Logger.debug("update reader and writer version", extra={"job": self})
+             self.spark.sql(
+                 f"""
+                 alter table {self.qualified_name}
+                 set tblproperties (
+                     'delta.columnMapping.mode' = 'name',
+                     'delta.minReaderVersion' = '2',
+                     'delta.minWriterVersion' = '5'
+                 )
+                 """
+             )
+
+     def set_property(self, key: Union[str, int], value: Union[str, int]):
+         Logger.debug(f"set property {key} = {value}", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             set tblproperties ({key} = '{value}')
+             """
+         )
+
+     def add_constraint(self, name: str, expr: str):
+         Logger.debug(f"add constraint ({name} check ({expr}))", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add constraint {name} check ({expr});
+             """
+         )
+
+     def add_comment(self, comment: str):
+         Logger.debug(f"add comment '{comment}'", extra={"job": self})
+         self.spark.sql(
+             f"""
+             comment on table {self.qualified_name}
+             is '{comment}';
+             """
+         )
+
+     def add_materialized_column(self, name: str, expr: str, type: str):
+         assert self.column_mapping_enabled(), "column mapping not enabled"
+         Logger.info(f"add materialized column ({name} {type})", extra={"job": self})
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add columns (`{name}` {type} materialized {expr})
+             """
+         )
+
+     def add_column(self, name: str, type: str, after: Optional[str] = None):
+         Logger.info(f"add column {name} ({type})", extra={"job": self})
+         ddl_after = "" if not after else f"after {after}"
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             add columns (`{name}` {type} {ddl_after})
+             """
+         )
+
+     def create_bloomfilter_index(self, columns: Union[str, List[str]]):
+         if isinstance(columns, str):
+             columns = [columns]
+         columns = [f"`{c}`" for c in columns]
+         cols = ", ".join(columns)
+
+         Logger.info(f"bloomfilter by {cols}", extra={"job": self})
+         self.spark.sql(
+             f"""
+             create bloomfilter index on table {self.qualified_name}
+             for columns ({cols})
+             """
+         )
+
+     def create_restore_point(self):
+         last_version = self.get_last_version() + 1
+         self.set_property("fabricks.last_version", last_version)
+
+     def show_properties(self) -> DataFrame:
+         return self.spark.sql(f"show tblproperties {self.qualified_name}")
+
+     def describe_detail(self) -> DataFrame:
+         return self.spark.sql(f"describe detail {self.qualified_name}")
+
+     def describe_extended(self) -> DataFrame:
+         return self.spark.sql(f"describe extended {self.qualified_name}")
+
+     def describe_history(self) -> DataFrame:
+         df = self.spark.sql(f"describe history {self.qualified_name}")
+         return df
+
+     def enable_liquid_clustering(self, columns: Union[str, List[str]]):
+         if isinstance(columns, str):
+             columns = [columns]
+         columns = [f"`{c}`" for c in columns]
+         cols = ", ".join(columns)
+         Logger.info(f"cluster by {cols}", extra={"job": self})
+
+         self.spark.sql(
+             f"""
+             alter table {self.qualified_name}
+             cluster by ({cols})
+             """
+         )
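
A hedged sketch of the `Table` life cycle implied by the methods above (create, drift check, optimize, vacuum). The database/step names and columns are hypothetical; a Databricks runtime with Delta Lake and write access to the database's delta path is assumed.

```python
# Sketch only: hypothetical names; assumes a Databricks runtime with Delta Lake available.
from databricks.sdk.runtime import spark
from pyspark.sql.types import StringType, StructField, StructType

from fabricks.metastore.table import Table

table = Table("silver", "sales", "orders")  # or Table.from_step_topic_item("silver", "sales", "orders")

# create an empty Delta table from an explicit schema, clustered by `id`
schema = StructType([StructField("id", StringType()), StructField("amount", StringType())])
table.create(schema=schema, liquid_clustering=True, cluster_by="id")

# compare an incoming dataframe against the registered schema and reconcile it
incoming = spark.createDataFrame([], schema)
if table.schema_drifted(incoming):
    table.update_schema(incoming)   # adds missing columns; drops/type changes need overwrite_schema

table.optimize(columns="id")        # optimize ... zorder by (`id`)
table.vacuum(retention_days=7)      # restore point, then vacuum with 168h retention
```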
fabricks/metastore/utils.py
@@ -0,0 +1,35 @@
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame
+
+
+ def get_tables(schema: str) -> DataFrame:
+     table_df = spark.sql(f"show tables in {schema}")
+     view_df = spark.sql(f"show views in {schema}")
+     df = spark.sql(
+         """
+         select
+             database,
+             concat_ws('.', database, tableName) as table
+         from
+             {tables}
+             left anti join {views} on tableName = viewName
+         """,
+         tables=table_df,
+         views=view_df,
+     )
+     return df
+
+
+ def get_views(schema: str) -> DataFrame:
+     view_df = spark.sql(f"show views in {schema}")
+     df = spark.sql(
+         """
+         select
+             namespace as database,
+             concat_ws('.', namespace, viewName) as view
+         from
+             {views}
+         """,
+         views=view_df,
+     )
+     return df
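
The two helpers above rely on parameterized `spark.sql` with DataFrame arguments (available on Spark 3.4+ / recent Databricks runtimes). A minimal usage sketch, with a hypothetical schema name:

```python
# Sketch only: "bronze" is a hypothetical schema name.
from fabricks.metastore.utils import get_tables, get_views

tables_df = get_tables("bronze")  # columns: database, table -- views excluded via the left anti join
views_df = get_views("bronze")    # columns: database, view

tables_df.show(truncate=False)
views_df.show(truncate=False)
```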
fabricks/metastore/view.py
@@ -0,0 +1,40 @@
+ from typing import Optional, Union
+ from uuid import uuid4
+
+ import pandas as pd
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.sql import DataFrame, SparkSession
+
+ from fabricks.context.log import Logger
+ from fabricks.metastore.relational import Relational
+
+
+ class View(Relational):
+     @staticmethod
+     def create_or_replace(
+         df: Union[DataFrame, pd.DataFrame],
+         *dependencies,
+         spark: Optional[SparkSession] = None,
+     ) -> str:
+         if spark is None:
+             spark = _spark
+         assert spark is not None
+
+         uuid = str(uuid4().hex)
+         df = spark.createDataFrame(df) if isinstance(df, pd.DataFrame) else df
+         if dependencies:
+             for d in dependencies:
+                 df = df.join(d.where("1 == 2"), how="leftanti")
+
+         df.createOrReplaceGlobalTempView(uuid)
+         return uuid
+
+
+ def create_or_replace_global_temp_view(name: str, df: DataFrame, uuid: Optional[bool] = False) -> str:
+     if uuid:
+         name = f"{name}__{str(uuid4().hex)}"
+
+     job = name.split("__")[0]
+     Logger.debug("create global temp view", extra={"job": job})
+     df.createOrReplaceGlobalTempView(name)
+     return f"global_temp.{name}"
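
A short sketch of the two entry points above, assuming a Databricks runtime where `spark` is importable; the job name `my_job` is hypothetical.

```python
# Sketch only: assumes a Databricks runtime; "my_job" is a hypothetical name.
from databricks.sdk.runtime import spark

from fabricks.metastore.view import View, create_or_replace_global_temp_view

df = spark.range(10)

# anonymous global temp view keyed by a uuid (the uuid is returned to the caller)
uuid = View.create_or_replace(df)
spark.sql(f"select * from global_temp.{uuid}").show()

# named global temp view; returns the fully qualified name
name = create_or_replace_global_temp_view("my_job", df)  # -> "global_temp.my_job"
spark.sql(f"select * from {name}").show()
```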
@@ -0,0 +1,3 @@
+ # BMS DNA Fabricks Databricks
+
+ Databricks - Fabricks