fabricks 3.0.5.2__py3-none-any.whl → 3.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +80 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/METADATA +2 -1
- fabricks-3.0.7.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.7.dist-info}/WHEEL +0 -0
fabricks/metastore/table.py
CHANGED
@@ -1,5 +1,5 @@
 import re
-from typing import List, Optional, Sequence, Union, overload
+from typing import Any, List, Optional, Sequence, Union, overload
 
 from delta import DeltaTable
 from pyspark.errors.exceptions.base import AnalysisException
@@ -38,25 +38,25 @@ class Table(DbObject):
 
     @property
     def dataframe(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"select * from {self}")
 
     @property
     def columns(self) -> List[str]:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.dataframe.columns
 
     @property
     def rows(self) -> int:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"select count(*) from {self}").collect()[0][0]
 
     @property
     def last_version(self) -> int:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         df = self.describe_history()
         version = df.select(max("version")).collect()[0][0]
@@ -64,33 +64,33 @@ class Table(DbObject):
 
     @property
     def identity_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
         return self.get_property("delta.feature.identityColumns") == "supported"
 
     @property
     def type_widening_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
         return self.get_property("delta.enableTypeWidening") == "true"
 
     @property
     def liquid_clustering_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
         return self.get_property("delta.feature.clustering") == "supported"
 
     @property
     def auto_liquid_clustering_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
         return self.get_property("delta.clusterByAuto") == "true"
 
     @property
     def vorder_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
        return self.get_property("delta.parquet.vorder.enabled") == "true"
 
     def drop(self):
         super().drop()
         if self.delta_path.exists():
-            DEFAULT_LOGGER.debug("delete delta folder", extra={"
+            DEFAULT_LOGGER.debug("delete delta folder", extra={"label": self})
             self.delta_path.rm()
 
     @overload
@@ -104,6 +104,10 @@ class Table(DbObject):
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
+        masks: Optional[dict[str, str]] = None,
+        primary_key: Optional[dict[str, Any]] = None,
+        foreign_keys: Optional[dict[str, Any]] = None,
+        comments: Optional[dict[str, str]] = None,
     ): ...
 
     @overload
@@ -117,6 +121,10 @@ class Table(DbObject):
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
+        masks: Optional[dict[str, str]] = None,
+        primary_key: Optional[dict[str, Any]] = None,
+        foreign_keys: Optional[dict[str, Any]] = None,
+        comments: Optional[dict[str, str]] = None,
     ): ...
 
     def create(
@@ -129,6 +137,10 @@ class Table(DbObject):
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
+        masks: Optional[dict[str, str]] = None,
+        primary_key: Optional[dict[str, Any]] = None,
+        foreign_keys: Optional[dict[str, Any]] = None,
+        comments: Optional[dict[str, str]] = None,
     ):
         self._create(
             df=df,
@@ -139,8 +151,41 @@ class Table(DbObject):
             liquid_clustering=liquid_clustering,
             cluster_by=cluster_by,
             properties=properties,
+            masks=masks,
+            primary_key=primary_key,
+            foreign_keys=foreign_keys,
+            comments=comments,
         )
 
+    def _get_ddl_columns(
+        self, df: DataFrame, masks: Optional[dict[str, str]], comments: Optional[dict[str, str]]
+    ) -> List[str]:
+        def _backtick(name: str, dtype: str) -> str:
+            j = df.schema[name].jsonValue()
+            r = re.compile(r"(?<='name': ')[^']+(?=',)")
+
+            names = re.findall(r, str(j))
+            for n in names:
+                escaped = re.escape(n)
+                dtype = re.sub(f"(?<=,){escaped}(?=:)|(?<=<){escaped}(?=:)", f"`{n}`", dtype)
+
+            return dtype
+
+        out = []
+
+        for name, dtype in df.dtypes:
+            col = [f"`{name}`", _backtick(name, dtype)]
+
+            if comments and name in comments:
+                col.append(f"comment '{comments[name]}'")
+
+            if masks and name in masks:
+                col.append(f"mask {masks[name]}")
+
+            out.append(" ".join(col))
+
+        return out
+
     def _create(
         self,
         df: Optional[DataFrame] = None,
@@ -151,26 +196,23 @@ class Table(DbObject):
         liquid_clustering: Optional[bool] = False,
         cluster_by: Optional[Union[List[str], str]] = None,
         properties: Optional[dict[str, str]] = None,
+        masks: Optional[dict[str, str]] = None,
+        primary_key: Optional[dict[str, Any]] = None,
+        foreign_keys: Optional[dict[str, Any]] = None,
+        comments: Optional[dict[str, str]] = None,
     ):
-        DEFAULT_LOGGER.info("create table", extra={"
+        DEFAULT_LOGGER.info("create table", extra={"label": self})
         if not df:
             assert schema is not None
             df = self.spark.createDataFrame([], schema)
 
-
-            j = df.schema[name].jsonValue()
-            r = re.compile(r"(?<='name': ')[^']+(?=',)")
-            names = re.findall(r, str(j))
-            for n in names:
-                escaped = re.escape(n)
-                dtype = re.sub(f"(?<=,){escaped}(?=:)|(?<=<){escaped}(?=:)", f"`{n}`", dtype)
-            return dtype
-
-        ddl_columns = ",\n\t".join([f"`{name}` {_backtick(name, dtype)}" for name, dtype in df.dtypes])
+        ddl_columns = ",\n\t".join(self._get_ddl_columns(df, masks=masks, comments=comments))
         ddl_identity = "-- no identity" if "__identity" not in df.columns else ""
         ddl_cluster_by = "-- no cluster by"
         ddl_partition_by = "-- no partitioned by"
         ddl_tblproperties = "-- not tblproperties"
+        ddl_primary_key = "-- no primary key"
+        ddl_foreign_keys = "-- no foreign keys"
 
         if liquid_clustering:
             if cluster_by:
@@ -192,6 +234,29 @@ class Table(DbObject):
         if identity:
             ddl_identity = "__identity bigint generated by default as identity (start with 1 increment by 1), "
 
+        if primary_key:
+            assert len(primary_key) == 1, "only one primary key allowed"
+
+            for key, value in primary_key.items():
+                keys = value["keys"]
+                if isinstance(keys, str):
+                    keys = [keys]
+                ddl_primary_key = f", constraint {key} primary key (" + ", ".join(keys) + ")"
+
+        if foreign_keys:
+            fks = []
+
+            for key, value in foreign_keys.items():
+                reference = value["reference"]
+                keys = value["keys"]
+                if isinstance(keys, str):
+                    keys = [keys]
+                keys = ", ".join([f"`{k}`" for k in keys])
+                fk = f"constraint {key} foreign key ({keys}) references {reference}"
+                fks.append(fk)
+
+            ddl_foreign_keys = "," + ", ".join(fks)
+
         if not properties:
             special_char = False
 
@@ -218,6 +283,8 @@ class Table(DbObject):
             (
                 {ddl_identity}
                 {ddl_columns}
+                {ddl_foreign_keys}
+                {ddl_primary_key}
             )
             {ddl_tblproperties}
             {ddl_partition_by}
@@ -229,7 +296,7 @@ class Table(DbObject):
         except Exception:
             pass
 
-        DEFAULT_LOGGER.debug("ddl", extra={"
+        DEFAULT_LOGGER.debug("ddl", extra={"label": self, "sql": sql})
         self.spark.sql(sql)
 
     @property
@@ -238,38 +305,40 @@ class Table(DbObject):
 
     @property
     def column_mapping_enabled(self) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.get_property("delta.columnMapping.mode") == "name"
 
     def exists(self) -> bool:
-        return self.is_deltatable and self.
+        return self.is_deltatable and self.registered
 
     def register(self):
-        DEFAULT_LOGGER.debug("register table", extra={"
+        DEFAULT_LOGGER.debug("register table", extra={"label": self})
         self.spark.sql(f"create table if not exists {self.qualified_name} using delta location '{self.delta_path}'")
 
     def restore_to_version(self, version: int):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.info(f"restore table to version {version}", extra={"
+        DEFAULT_LOGGER.info(f"restore table to version {version}", extra={"label": self})
         self.spark.sql(f"restore table {self.qualified_name} to version as of {version}")
 
     def truncate(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.warning("truncate table", extra={"
+        DEFAULT_LOGGER.warning("truncate table", extra={"label": self})
         self.create_restore_point()
         self.spark.sql(f"truncate table {self.qualified_name}")
 
     def schema_drifted(self, df: DataFrame, exclude_columns_with_prefix: Optional[str] = None) -> bool:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         diffs = self.get_schema_differences(df)
         return len(diffs) > 0
 
     def get_schema_differences(self, df: DataFrame) -> Sequence[SchemaDiff]:
-        assert self.
+        assert self.registered, f"{self} not registered"
+
+        DEFAULT_LOGGER.debug("get schema differences", extra={"label": self, "df": df})
 
         df1 = self.dataframe
         if self.identity_enabled:
@@ -305,12 +374,12 @@ class Table(DbObject):
         )
 
         if diffs:
-            DEFAULT_LOGGER.
+            DEFAULT_LOGGER.warning("difference(s) with delta table", extra={"label": self, "df": df})
 
         return diffs
 
     def update_schema(self, df: DataFrame, widen_types: bool = False):
-        assert self.
+        assert self.registered, f"{self} not registered"
         if not self.column_mapping_enabled:
             self.enable_column_mapping()
 
@@ -323,7 +392,7 @@ class Table(DbObject):
         msg = "update schema"
 
         if diffs:
-            DEFAULT_LOGGER.info(msg, extra={"
+            DEFAULT_LOGGER.info(msg, extra={"label": self, "df": diffs})
 
             for row in diffs:
                 if row.status == "changed":
@@ -333,7 +402,7 @@ class Table(DbObject):
 
                     DEFAULT_LOGGER.debug(
                         f"{row.status.replace('ed', 'ing')} {row.column} ({data_type})",
-                        extra={"
+                        extra={"label": self},
                     )
 
                     try:
@@ -360,7 +429,7 @@ class Table(DbObject):
                         pass
 
     def overwrite_schema(self, df: DataFrame):
-        assert self.
+        assert self.registered, f"{self} not registered"
         if not self.column_mapping_enabled:
             self.enable_column_mapping()
 
@@ -371,7 +440,7 @@ class Table(DbObject):
 
         diffs = self.get_schema_differences(df)
         if diffs:
-            DEFAULT_LOGGER.warning("overwrite schema", extra={"
+            DEFAULT_LOGGER.warning("overwrite schema", extra={"label": self, "df": diffs})
 
             for row in diffs:
                 if row.status == "added":
@@ -391,9 +460,9 @@ class Table(DbObject):
                     self.add_column(row.column, row.new_data_type)
 
     def vacuum(self, retention_days: int = 7):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug(f"vacuum table (removing files older than {retention_days} days)", extra={"
+        DEFAULT_LOGGER.debug(f"vacuum table (removing files older than {retention_days} days)", extra={"label": self})
         self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
         try:
             self.create_restore_point()
@@ -405,9 +474,9 @@ class Table(DbObject):
         self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
 
     def optimize(self, columns: Optional[Union[str, List[str]]] = None):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.info("optimize", extra={"
+        DEFAULT_LOGGER.info("optimize", extra={"label": self})
 
         if self.liquid_clustering_enabled:
             self.spark.sql(f"optimize {self.qualified_name}")
@@ -417,7 +486,7 @@ class Table(DbObject):
 
         elif columns is None:
             if self.vorder_enabled:
-                DEFAULT_LOGGER.debug("vorder", extra={"
+                DEFAULT_LOGGER.debug("vorder", extra={"label": self})
                 self.spark.sql(f"optimize {self.qualified_name} vorder")
             else:
                 self.spark.sql(f"optimize {self.qualified_name}")
@@ -429,24 +498,24 @@ class Table(DbObject):
             cols = ", ".join(columns)
 
             if self.vorder_enabled:
-                DEFAULT_LOGGER.debug(f"zorder by {cols} vorder", extra={"
+                DEFAULT_LOGGER.debug(f"zorder by {cols} vorder", extra={"label": self})
                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols}) vorder")
 
             else:
-                DEFAULT_LOGGER.debug(f"zorder by {cols}", extra={"
+                DEFAULT_LOGGER.debug(f"zorder by {cols}", extra={"label": self})
                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols})")
 
     def analyze(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug("analyze", extra={"
+        DEFAULT_LOGGER.debug("analyze", extra={"label": self})
         self.compute_statistics()
         self.compute_delta_statistics()
 
     def compute_statistics(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug("compute statistics", extra={"
+        DEFAULT_LOGGER.debug("compute statistics", extra={"label": self})
         cols = [
             f"`{name}`"
             for name, dtype in self.dataframe.dtypes
@@ -460,16 +529,16 @@ class Table(DbObject):
         self.spark.sql(f"analyze table {self.qualified_name} compute statistics for columns {cols}")
 
     def compute_delta_statistics(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug("compute delta statistics", extra={"
+        DEFAULT_LOGGER.debug("compute delta statistics", extra={"label": self})
         self.spark.sql(f"analyze table {self.qualified_name} compute delta statistics")
 
     def drop_column(self, name: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
         assert self.column_mapping_enabled, "column mapping not enabled"
 
-        DEFAULT_LOGGER.warning(f"drop column {name}", extra={"
+        DEFAULT_LOGGER.warning(f"drop column {name}", extra={"label": self})
         self.spark.sql(
             f"""
             alter table {self.qualified_name}
@@ -478,10 +547,10 @@ class Table(DbObject):
         )
 
     def change_column(self, name: str, type: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
         assert self.column_mapping_enabled, "column mapping not enabled"
 
-        DEFAULT_LOGGER.info(f"change column {name} ({type})", extra={"
+        DEFAULT_LOGGER.info(f"change column {name} ({type})", extra={"label": self})
         self.spark.sql(
             f"""
             alter table {self.qualified_name}
@@ -490,10 +559,10 @@ class Table(DbObject):
         )
 
     def rename_column(self, old: str, new: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
         assert self.column_mapping_enabled, "column mapping not enabled"
 
-        DEFAULT_LOGGER.info(f"rename column {old} -> {new}", extra={"
+        DEFAULT_LOGGER.info(f"rename column {old} -> {new}", extra={"label": self})
         self.spark.sql(
             f"""
             alter table {self.qualified_name}
@@ -506,35 +575,35 @@ class Table(DbObject):
         return data_type
 
     def get_details(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"describe detail {self.qualified_name}")
 
     def get_properties(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"show tblproperties {self.qualified_name}")
 
     def get_description(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"describe extended {self.qualified_name}")
 
     def get_history(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         df = self.spark.sql(f"describe history {self.qualified_name}")
         return df
 
     def get_last_version(self) -> int:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         df = self.get_history()
         version = df.select(max("version")).collect()[0][0]
         return version
 
     def get_property(self, key: str) -> Optional[str]:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         try:
             value = self.get_properties().where(f"key == '{key}'").select("value").collect()[0][0]
@@ -544,15 +613,15 @@ class Table(DbObject):
             return None
 
     def enable_change_data_feed(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug("enable change data feed", extra={"
+        DEFAULT_LOGGER.debug("enable change data feed", extra={"label": self})
         self.set_property("delta.enableChangeDataFeed", "true")
 
     def enable_column_mapping(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug("enable column mapping", extra={"
+        DEFAULT_LOGGER.debug("enable column mapping", extra={"label": self})
 
         try:
             self.spark.sql(
@@ -563,7 +632,7 @@ class Table(DbObject):
             )
 
         except Exception:
-            DEFAULT_LOGGER.debug("update reader and writer version", extra={"
+            DEFAULT_LOGGER.debug("update reader and writer version", extra={"label": self})
             self.spark.sql(
                 f"""
                 alter table {self.qualified_name}
@@ -576,9 +645,9 @@ class Table(DbObject):
             )
 
     def set_property(self, key: Union[str, int], value: Union[str, int]):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug(f"set property {key} = {value}", extra={"
+        DEFAULT_LOGGER.debug(f"set property {key} = {value}", extra={"label": self})
         self.spark.sql(
             f"""
             alter table {self.qualified_name}
@@ -587,9 +656,9 @@ class Table(DbObject):
         )
 
     def add_constraint(self, name: str, expr: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug(f"add constraint ({name} check ({expr}))", extra={"
+        DEFAULT_LOGGER.debug(f"add constraint ({name} check ({expr}))", extra={"label": self})
         self.spark.sql(
             f"""
             alter table {self.qualified_name}
@@ -598,9 +667,9 @@ class Table(DbObject):
         )
 
     def add_comment(self, comment: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.debug(f"add comment '{comment}'", extra={"
+        DEFAULT_LOGGER.debug(f"add comment '{comment}'", extra={"label": self})
         self.spark.sql(
             f"""
             comment on table {self.qualified_name}
@@ -609,10 +678,10 @@ class Table(DbObject):
         )
 
     def add_materialized_column(self, name: str, expr: str, type: str):
-        assert self.
+        assert self.registered, f"{self} not registered"
         assert self.column_mapping_enabled, "column mapping not enabled"
 
-        DEFAULT_LOGGER.info(f"add materialized column ({name} {type})", extra={"
+        DEFAULT_LOGGER.info(f"add materialized column ({name} {type})", extra={"label": self})
         self.spark.sql(
             f""""
            alter table {self.qualified_name}
@@ -621,9 +690,9 @@ class Table(DbObject):
         )
 
     def add_column(self, name: str, type: str, after: Optional[str] = None):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
-        DEFAULT_LOGGER.info(f"add column {name} ({type})", extra={"
+        DEFAULT_LOGGER.info(f"add column {name} ({type})", extra={"label": self})
         ddl_after = "" if not after else f"after {after}"
         self.spark.sql(
             f"""
@@ -633,14 +702,14 @@ class Table(DbObject):
         )
 
     def create_bloomfilter_index(self, columns: Union[str, List[str]]):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         if isinstance(columns, str):
             columns = [columns]
         columns = [f"`{c}`" for c in columns]
         cols = ", ".join(columns)
 
-        DEFAULT_LOGGER.info(f"bloomfilter by {cols}", extra={"
+        DEFAULT_LOGGER.info(f"bloomfilter by {cols}", extra={"label": self})
         self.spark.sql(
             f"""
             create bloomfilter index on table {self.qualified_name}
@@ -649,37 +718,37 @@ class Table(DbObject):
         )
 
     def create_restore_point(self):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         last_version = self.get_last_version() + 1
         self.set_property("fabricks.last_version", last_version)
 
     def show_properties(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"show tblproperties {self.qualified_name}")
 
     def describe_detail(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"describe detail {self.qualified_name}")
 
     def describe_extended(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         return self.spark.sql(f"describe extended {self.qualified_name}")
 
     def describe_history(self) -> DataFrame:
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         df = self.spark.sql(f"describe history {self.qualified_name}")
         return df
 
     def enable_liquid_clustering(self, columns: Optional[Union[str, List[str]]] = None, auto: Optional[bool] = False):
-        assert self.
+        assert self.registered, f"{self} not registered"
 
         if auto:
-            DEFAULT_LOGGER.info("cluster by auto", extra={"
+            DEFAULT_LOGGER.info("cluster by auto", extra={"label": self})
            self.spark.sql(f"alter table {self.qualified_name} cluster by automatic")
 
         else:
@@ -690,7 +759,7 @@ class Table(DbObject):
             columns = [f"`{c}`" for c in columns]
             cols = ", ".join(columns)
 
-            DEFAULT_LOGGER.info(f"cluster by {cols}", extra={"
+            DEFAULT_LOGGER.info(f"cluster by {cols}", extra={"label": self})
             self.spark.sql(
                 f"""
                 alter table {self.qualified_name}