fabricks 2024.7.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/metastore/relational.py
ADDED
@@ -0,0 +1,61 @@
from typing import Optional

from databricks.sdk.runtime import dbutils as _dbutils
from databricks.sdk.runtime import spark as _spark
from pyspark.errors.exceptions.base import AnalysisException
from pyspark.sql import SparkSession

from fabricks.context.log import Logger
from fabricks.metastore.database import Database


class Relational:
    def __init__(self, database: str, *levels: str, spark: Optional[SparkSession] = None):
        self.database = Database(database)
        self.levels = levels
        if spark is None:
            spark = _spark
        assert spark is not None
        self.spark: SparkSession = spark
        self.dbutils = _dbutils

    @property
    def name(self) -> str:
        return "_".join(self.levels)

    @property
    def qualified_name(self) -> str:
        return f"{self.database.name}.{self.name}"

    def registered(self):
        try:
            df = self.spark.sql(f"show tables in {self.database}").where(f"tableName == '{self.name}'")
            return not df.isEmpty()
        # not found
        except AnalysisException:
            return False

    def is_view(self):
        try:
            df = self.spark.sql(f"show views in {self.database}").where(f"viewName == '{self.name}'")
            return not df.isEmpty()
        # not found
        except AnalysisException:
            return False

    def is_table(self):
        if self.is_view():
            return False
        else:
            return self.registered()

    def drop(self):
        if self.is_view():
            Logger.warning("drop view from metastore", extra={"job": self})
            self.spark.sql(f"drop view if exists {self}")
        elif self.is_table():
            Logger.warning("drop table from metastore", extra={"job": self})
            self.spark.sql(f"drop table if exists {self}")

    def __str__(self):
        return self.qualified_name
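
A minimal usage sketch (not part of the wheel) of how Relational composes and resolves names; the "silver" database and the "sales"/"orders" levels are made-up values and assume the database already exists in the metastore:

from fabricks.metastore.relational import Relational

rel = Relational("silver", "sales", "orders")  # levels are joined with "_"
print(rel.name)             # sales_orders
print(rel.qualified_name)   # silver.sales_orders
print(str(rel))             # silver.sales_orders, the form used in the SQL above
if rel.registered():
    rel.drop()              # drops the view or table from the metastore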
fabricks/metastore/table.py
ADDED
@@ -0,0 +1,529 @@
import re
from typing import List, Optional, Union, overload

from databricks.sdk.runtime import spark
from delta import DeltaTable
from pyspark.errors.exceptions.base import AnalysisException
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import expr, max
from pyspark.sql.types import StructType
from typing_extensions import deprecated

from fabricks.context.log import Logger
from fabricks.metastore.relational import Relational
from fabricks.utils.path import Path
from fabricks.utils.sqlglot import fix


class Table(Relational):
    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str, spark: Optional[SparkSession] = spark):
        return cls(step, topic, item, spark=spark)

    @property
    @deprecated("use delta_path instead")
    def deltapath(self) -> Path:
        return self.database.deltapath.join("/".join(self.levels))

    @property
    def delta_path(self) -> Path:
        return self.database.deltapath.join("/".join(self.levels))

    @property
    def deltatable(self) -> DeltaTable:
        return DeltaTable.forPath(self.spark, self.deltapath.string)

    @property
    def delta_table(self) -> DeltaTable:
        return DeltaTable.forPath(self.spark, self.deltapath.string)

    @property
    def dataframe(self) -> DataFrame:
        return self.spark.sql(f"select * from {self}")

    @property
    def columns(self) -> List[str]:
        return self.dataframe.columns

    @property
    def rows(self) -> int:
        return self.spark.sql(f"select count(*) from {self}").collect()[0][0]

    @property
    def last_version(self) -> int:
        df = self.describe_history()
        version = df.select(max("version")).collect()[0][0]
        return version

    def drop(self):
        super().drop()
        if self.deltapath.exists():
            Logger.debug("delete delta folder", extra={"job": self})
            self.deltapath.rm()

    @overload
    def create(
        self,
        df: DataFrame,
        *,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
    ): ...

    @overload
    def create(
        self,
        *,
        schema: StructType,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
    ): ...

    def create(
        self,
        df: Optional[DataFrame] = None,
        schema: Optional[StructType] = None,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
    ):
        self._create(
            df=df,
            schema=schema,
            partitioning=partitioning,
            partition_by=partition_by,
            identity=identity,
            liquid_clustering=liquid_clustering,
            cluster_by=cluster_by,
            properties=properties,
        )

    def _create(
        self,
        df: Optional[DataFrame] = None,
        schema: Optional[StructType] = None,
        partitioning: Optional[bool] = False,
        partition_by: Optional[Union[List[str], str]] = None,
        identity: Optional[bool] = False,
        liquid_clustering: Optional[bool] = False,
        cluster_by: Optional[Union[List[str], str]] = None,
        properties: Optional[dict[str, str]] = None,
    ):
        Logger.info("create table", extra={"job": self})
        if not df:
            assert schema is not None
            df = self.spark.createDataFrame([], schema)

        def _backtick(name: str, dtype: str) -> str:
            j = df.schema[name].jsonValue()
            r = re.compile(r"(?<='name': ')[^']+(?=',)")
            names = re.findall(r, str(j))
            for n in names:
                escaped = re.escape(n)
                dtype = re.sub(f"(?<=,){escaped}(?=:)|(?<=<){escaped}(?=:)", f"`{n}`", dtype)
            return dtype

        ddl_columns = ",\n\t".join([f"`{name}` {_backtick(name, dtype)}" for name, dtype in df.dtypes])
        ddl_identity = "-- no identity" if "__identity" not in df.columns else ""
        ddl_cluster_by = "-- no cluster by"
        ddl_partition_by = "-- no partitioned by"
        ddl_tblproperties = "-- no tblproperties"

        if liquid_clustering:
            assert cluster_by
            if isinstance(cluster_by, str):
                cluster_by = [cluster_by]
            cluster_by = [f"`{c}`" for c in cluster_by]
            ddl_cluster_by = "cluster by (" + ", ".join(cluster_by) + ")"
        if partitioning:
            assert partition_by
            if isinstance(partition_by, str):
                partition_by = [partition_by]
            partition_by = [f"`{p}`" for p in partition_by]
            ddl_partition_by = "partitioned by (" + ", ".join(partition_by) + ")"

        if identity:
            ddl_identity = "__identity bigint generated by default as identity (start with 1 increment by 1), "

        if not properties:
            special_char = False
            for c in df.columns:
                match = re.search(r"[^a-zA-Z0-9_]", c)
                if match:
                    special_char = True
                    break
            if special_char:
                properties = {
                    "delta.columnMapping.mode": "name",
                    "delta.minReaderVersion": "2",
                    "delta.minWriterVersion": "5",
                }
        if properties:
            ddl_tblproperties = (
                "tblproperties (" + ",".join(f"'{key}' = '{value}'" for key, value in properties.items()) + ")"
            )

        sql = f"""
        create table if not exists {self.qualified_name}
        (
            {ddl_identity}
            {ddl_columns}
        )
        {ddl_tblproperties}
        {ddl_partition_by}
        {ddl_cluster_by}
        location '{self.deltapath}'
        """
        try:
            sql = fix(sql)
        except Exception:
            pass
        Logger.debug("ddl", extra={"job": self, "sql": sql})
        self.spark.sql(sql)

    def is_deltatable(self) -> bool:
        return DeltaTable.isDeltaTable(self.spark, str(self.deltapath))

    def column_mapping_enabled(self) -> bool:
        return self.get_property("delta.columnMapping.mode") == "name"

    def exists(self) -> bool:
        return self.is_deltatable() and self.registered()

    def register(self):
        Logger.debug("register table", extra={"job": self})
        self.spark.sql(f"create table if not exists {self.qualified_name} using delta location '{self.deltapath}'")

    def restore_to_version(self, version: int):
        Logger.info(f"restore table to version {version}", extra={"job": self})
        self.spark.sql(f"restore table {self.qualified_name} to version as of {version}")

    def truncate(self):
        Logger.warning("truncate table", extra={"job": self})
        self.create_restore_point()
        self.spark.sql(f"truncate table {self.qualified_name}")

    def schema_drifted(self, df: DataFrame) -> bool:
        return not self._check_schema_drift(df).isEmpty()

    def _check_schema_drift(self, df: DataFrame) -> DataFrame:
        Logger.debug("check schema drift", extra={"job": self})

        new_df = self.spark.createDataFrame(df.dtypes, ["new_name", "new_type"])  # type: ignore
        new_df = new_df.filter(~new_df.new_name.startswith("__"))

        old_df = self.spark.createDataFrame(self.dataframe.dtypes, ["old_name", "old_type"])  # type: ignore
        old_df = old_df.filter(~old_df.old_name.startswith("__"))

        cond = [new_df["new_name"] == old_df["old_name"]]
        df_diff = (
            new_df.join(old_df, on=cond, how="outer")
            .where(
                """
                coalesce(old_name, -1) <> coalesce(new_name, -1)
                or
                coalesce(old_type, -1) <> coalesce(new_type, -1)
                """
            )
            .withColumn(
                "operation",
                expr("if(new_name is null, 'drop', if(old_name is null, 'add', 'update'))"),
            )
            .withColumn("column", expr("coalesce(new_name, old_name)"))
        )
        return df_diff

    def _fix_schema(self, df: DataFrame, overwrite: bool = False):
        drift_df = self._check_schema_drift(df)

        if not drift_df.isEmpty():
            Logger.info("update table", extra={"job": self})
            todo_df = drift_df.where("operation in ('add', 'update')")
            if not todo_df.isEmpty():
                for row in todo_df.collect():
                    if row.operation == "add":
                        Logger.debug(f"add column {row.column}", extra={"job": self})
                    else:
                        Logger.debug(
                            f"update column {row.column} ({row.old_type} -> {row.new_type})",
                            extra={"job": self},
                        )

                    try:
                        col_df = df.select(row.column).where("1 == 2")
                        (
                            self.deltatable.alias("dt")
                            .merge(col_df.alias("df"), "1 == 2")
                            .whenNotMatchedInsertAll()
                            .execute()
                        )
                    except Exception:
                        pass

            if overwrite:
                drift_df = self._check_schema_drift(df)
                Logger.warning("overwrite table", extra={"job": self})
                for row in drift_df.collect():
                    if row.operation == "add":
                        self.add_column(row.column, row.new_type)
                    elif row.operation == "drop":
                        self.drop_column(row.column)
                    elif row.operation == "update":
                        try:
                            self.change_column(row.column, row.new_type)
                        except AnalysisException:
                            self.drop_column(row.column)
                            self.add_column(row.column, row.new_type)
                    else:
                        raise ValueError(f"{row.operation} not allowed")

    def update_schema(self, df: DataFrame):
        self._fix_schema(df, overwrite=False)

    def overwrite_schema(self, df: DataFrame):
        self._fix_schema(df, overwrite=True)

    def vacuum(self, retention_days: int = 7):
        Logger.debug(f"vacuum table (removing files older than {retention_days} days)", extra={"job": self})
        self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = False")
        try:
            self.create_restore_point()
            retention_hours = retention_days * 24
            self.deltatable.vacuum(retention_hours)
        finally:
            self.spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = True")

    def optimize(
        self,
        columns: Optional[Union[str, List[str]]] = None,
        vorder: Optional[bool] = False,
    ):
        Logger.info("optimize", extra={"job": self})

        zorder_by = columns is not None
        if zorder_by:
            if isinstance(columns, str):
                columns = [columns]
            columns = [f"`{c}`" for c in columns]
            cols = ", ".join(columns)

            if vorder:
                Logger.debug(f"zorder by {cols} vorder", extra={"job": self})
                self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols}) vorder")
            else:
                Logger.debug(f"zorder by {cols}", extra={"job": self})
                self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols})")

        elif vorder:
            Logger.debug("vorder", extra={"job": self})
            self.spark.sql(f"optimize {self.qualified_name} vorder")

        else:
            Logger.debug("optimize", extra={"job": self})
            self.spark.sql(f"optimize {self.qualified_name}")

    def analyze(self):
        Logger.debug("analyze", extra={"job": self})
        self.compute_statistics()
        self.compute_delta_statistics()

    def compute_statistics(self):
        Logger.debug("compute statistics", extra={"job": self})
        cols = [
            f"`{name}`"
            for name, dtype in self.dataframe.dtypes
            if not dtype.startswith("struct") and not dtype.startswith("array") and name not in ["__metadata"]
        ]
        cols = ", ".join(sorted(cols))
        self.spark.sql(f"analyze table delta.`{self.deltapath}` compute statistics for columns {cols}")

    def compute_delta_statistics(self):
        Logger.debug("compute delta statistics", extra={"job": self})
        self.spark.sql(f"analyze table delta.`{self.deltapath}` compute delta statistics")

    def drop_column(self, name: str):
        assert self.column_mapping_enabled(), "column mapping not enabled"
        Logger.warning(f"drop column {name}", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            drop column `{name}`
            """
        )

    def change_column(self, name: str, type: str):
        assert self.column_mapping_enabled(), "column mapping not enabled"
        Logger.info(f"change column {name} ({type})", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            change column `{name}` `{name}` {type}
            """
        )

    def rename_column(self, old: str, new: str):
        assert self.column_mapping_enabled(), "column mapping not enabled"
        Logger.info(f"rename column {old} -> {new}", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            rename column `{old}` to `{new}`
            """
        )

    def get_details(self) -> DataFrame:
        return self.spark.sql(f"describe detail {self.qualified_name}")

    def get_properties(self) -> DataFrame:
        return self.spark.sql(f"show tblproperties {self.qualified_name}")

    def get_description(self) -> DataFrame:
        return self.spark.sql(f"describe extended {self.qualified_name}")

    def get_history(self) -> DataFrame:
        df = self.spark.sql(f"describe history {self.qualified_name}")
        return df

    def get_last_version(self) -> int:
        df = self.get_history()
        version = df.select(max("version")).collect()[0][0]
        return version

    def get_property(self, key: str) -> Optional[str]:
        try:
            value = self.get_properties().where(f"key == '{key}'").select("value").collect()[0][0]
            return value

        except IndexError:
            return None

    def enable_change_data_feed(self):
        Logger.debug("enable change data feed", extra={"job": self})
        self.set_property("delta.enableChangeDataFeed", "true")

    def enable_column_mapping(self):
        Logger.debug("enable column mapping", extra={"job": self})
        try:
            self.spark.sql(
                f"""
                alter table {self.qualified_name}
                set tblproperties ('delta.columnMapping.mode' = 'name')
                """
            )
        except Exception:
            Logger.debug("update reader and writer version", extra={"job": self})
            self.spark.sql(
                f"""
                alter table {self.qualified_name}
                set tblproperties (
                    'delta.columnMapping.mode' = 'name',
                    'delta.minReaderVersion' = '2',
                    'delta.minWriterVersion' = '5'
                )
                """
            )

    def set_property(self, key: Union[str, int], value: Union[str, int]):
        Logger.debug(f"set property {key} = {value}", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            set tblproperties ({key} = '{value}')
            """
        )

    def add_constraint(self, name: str, expr: str):
        Logger.debug(f"add constraint ({name} check ({expr}))", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            add constraint {name} check ({expr});
            """
        )

    def add_comment(self, comment: str):
        Logger.debug(f"add comment '{comment}'", extra={"job": self})
        self.spark.sql(
            f"""
            comment on table {self.qualified_name}
            is '{comment}';
            """
        )

    def add_materialized_column(self, name: str, expr: str, type: str):
        assert self.column_mapping_enabled(), "column mapping not enabled"
        Logger.info(f"add materialized column ({name} {type})", extra={"job": self})
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            add columns (`{name}` {type} materialized {expr})
            """
        )

    def add_column(self, name: str, type: str, after: Optional[str] = None):
        Logger.info(f"add column {name} ({type})", extra={"job": self})
        ddl_after = "" if not after else f"after {after}"
        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            add columns (`{name}` {type} {ddl_after})
            """
        )

    def create_bloomfilter_index(self, columns: Union[str, List[str]]):
        if isinstance(columns, str):
            columns = [columns]
        columns = [f"`{c}`" for c in columns]
        cols = ", ".join(columns)

        Logger.info(f"bloomfilter by {cols}", extra={"job": self})
        self.spark.sql(
            f"""
            create bloomfilter index on table {self.qualified_name}
            for columns ({cols})
            """
        )

    def create_restore_point(self):
        last_version = self.get_last_version() + 1
        self.set_property("fabricks.last_version", last_version)

    def show_properties(self) -> DataFrame:
        return self.spark.sql(f"show tblproperties {self.qualified_name}")

    def describe_detail(self) -> DataFrame:
        return self.spark.sql(f"describe detail {self.qualified_name}")

    def describe_extended(self) -> DataFrame:
        return self.spark.sql(f"describe extended {self.qualified_name}")

    def describe_history(self) -> DataFrame:
        df = self.spark.sql(f"describe history {self.qualified_name}")
        return df

    def enable_liquid_clustering(self, columns: Union[str, List[str]]):
        if isinstance(columns, str):
            columns = [columns]
        columns = [f"`{c}`" for c in columns]
        cols = ", ".join(columns)
        Logger.info(f"cluster by {cols}", extra={"job": self})

        self.spark.sql(
            f"""
            alter table {self.qualified_name}
            cluster by ({cols})
            """
        )
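
A hypothetical end-to-end sketch of the Table API above, assuming a Databricks runtime with Delta Lake; the "bronze"/"sales"/"orders" names and the incoming dataframe are illustrative only:

from pyspark.sql.types import StringType, StructField, StructType
from fabricks.metastore.table import Table

table = Table("bronze", "sales", "orders")   # qualified name bronze.sales_orders

# create an empty Delta table from a schema, with an __identity column
schema = StructType([StructField("id", StringType()), StructField("amount", StringType())])
table.create(schema=schema, identity=True)

# react to schema drift in a later batch (df would be any incoming Spark DataFrame)
# if table.schema_drifted(df):
#     table.update_schema(df)       # add/update columns via a no-op merge
#     # table.overwrite_schema(df)  # also drops columns; requires column mapping

table.optimize(columns="id")        # optimize bronze.sales_orders zorder by (`id`)
table.vacuum(retention_days=7)      # restore point first, then vacuum 168 hours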
fabricks/metastore/utils.py
ADDED
@@ -0,0 +1,35 @@
from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame


def get_tables(schema: str) -> DataFrame:
    table_df = spark.sql(f"show tables in {schema}")
    view_df = spark.sql(f"show views in {schema}")
    df = spark.sql(
        """
        select
            database,
            concat_ws('.', database, tableName) as table
        from
            {tables}
            left anti join {views} on tableName = viewName
        """,
        tables=table_df,
        views=view_df,
    )
    return df


def get_views(schema: str) -> DataFrame:
    view_df = spark.sql(f"show views in {schema}")
    df = spark.sql(
        """
        select
            namespace as database,
            concat_ws('.', namespace, viewName) as view
        from
            {views}
        """,
        views=view_df,
    )
    return df
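
The two helpers above appear to rely on the parameterized form of spark.sql, where the dataframes passed as keyword arguments are substituted for the {tables} and {views} markers (available on recent Spark/Databricks runtimes). A small hypothetical call, with "bronze" as a made-up schema name:

from fabricks.metastore.utils import get_tables, get_views

get_tables("bronze").show()   # tables only, views are anti-joined away
get_views("bronze").show()    # database plus fully qualified view name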
fabricks/metastore/view.py
ADDED
@@ -0,0 +1,40 @@
from typing import Optional, Union
from uuid import uuid4

import pandas as pd
from databricks.sdk.runtime import spark as _spark
from pyspark.sql import DataFrame, SparkSession

from fabricks.context.log import Logger
from fabricks.metastore.relational import Relational


class View(Relational):
    @staticmethod
    def create_or_replace(
        df: Union[DataFrame, pd.DataFrame],
        *dependencies,
        spark: Optional[SparkSession] = None,
    ) -> str:
        if spark is None:
            spark = _spark
        assert spark is not None

        uuid = str(uuid4().hex)
        df = spark.createDataFrame(df) if isinstance(df, pd.DataFrame) else df
        if dependencies:
            for d in dependencies:
                df = df.join(d.where("1 == 2"), how="leftanti")

        df.createOrReplaceGlobalTempView(uuid)
        return uuid


def create_or_replace_global_temp_view(name: str, df: DataFrame, uuid: Optional[bool] = False) -> str:
    if uuid:
        name = f"{name}__{str(uuid4().hex)}"

    job = name.split("__")[0]
    Logger.debug("create global temp view", extra={"job": job})
    df.createOrReplaceGlobalTempView(name)
    return f"global_temp.{name}"
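
A short hypothetical sketch of the view helpers above; the dataframe and the "sales__orders" name are illustrative and assume a Databricks session:

from databricks.sdk.runtime import spark
from fabricks.metastore.view import View, create_or_replace_global_temp_view

df = spark.range(3)   # stand-in for any Spark DataFrame

# anonymous global temp view keyed by a uuid
uuid = View.create_or_replace(df)
spark.sql(f"select * from global_temp.{uuid}").show()

# named global temp view; the part before '__' is logged as the job name
fqn = create_or_replace_global_temp_view("sales__orders", df)
spark.sql(f"select * from {fqn}").show()   # global_temp.sales__orders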

fabricks/utils/README.md
ADDED
File without changes