fabricks-3.0.4-py3-none-any.whl → fabricks-3.0.5.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
@@ -0,0 +1,3 @@
+import importlib.metadata
+
+FABRICKS_VERSION = importlib.metadata.version("fabricks")
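
The new module resolves the installed distribution's version once at import time via `importlib.metadata` (the RECORD below registers it as `fabricks/api/version.py`). A minimal sketch of the same pattern, with an added fallback for uninstalled source checkouts; the fallback value is an assumption, not part of the package:

```python
import importlib.metadata

# Resolve the version from the installed distribution's metadata.
try:
    FABRICKS_VERSION = importlib.metadata.version("fabricks")
except importlib.metadata.PackageNotFoundError:
    # Assumption: placeholder for running from a source checkout;
    # the shipped module has no fallback and lets the lookup fail loudly.
    FABRICKS_VERSION = "0.0.0"

print(FABRICKS_VERSION)
```
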
@@ -126,7 +126,7 @@ class Configurator(ABC):
 
     def has_data(self, src: Union[DataFrame, Table, str], **kwargs) -> bool:
         df = self.get_src(src=src)
-        return df.count() > 0
+        return not df.isEmpty()
 
     def get_columns(self, src: Union[DataFrame, Table, str], backtick: Optional[bool] = True) -> List[str]:
        if backtick:
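
`df.count() > 0` scans and aggregates the whole source just to test for emptiness; `DataFrame.isEmpty()` (available in PySpark since 3.3) can answer after fetching at most one row. A minimal sketch of the behavioural equivalence, assuming a local PySpark installation:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(1_000_000)

# Old check: counts every row across all partitions, then compares.
has_data_old = df.count() > 0

# New check: Spark can stop as soon as a single row is found.
has_data_new = not df.isEmpty()

assert has_data_old == has_data_new
```
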
@@ -34,10 +34,8 @@ class Generator(Configurator):
 
         df = self.get_data(src, **kwargs)
 
-        if liquid_clustering:
-            assert cluster_by, "clustering column not found"
-        elif partitioning:
-            assert partition_by, "partitioning column not found"
+        if partitioning is True:
+            assert partition_by, "partitioning column(s) not found"
 
         df = self.reorder_columns(df)
 
@@ -84,22 +82,14 @@ class Generator(Configurator):
             DEFAULT_LOGGER.exception("could not execute sql query", extra={"job": self, "sql": sql})
 
     def optimize_table(self):
-        liquid_clustering = self.table.get_property("delta.feature.liquid") == "supported"
+        columns = None
 
-        if liquid_clustering:
-            self.table.optimize()
-        else:
-            columns = None
-
-            if self.change_data_capture == "scd1":
-                columns = ["__key"]
-            elif self.change_data_capture == "scd2":
-                columns = ["__key", "__valid_from"]
-
-            vorder = self.table.get_property("delta.parquet.vorder.enabled") or "false"
-            vorder = vorder.lower() == "true"
+        if self.change_data_capture == "scd1":
+            columns = ["__key"]
+        elif self.change_data_capture == "scd2":
+            columns = ["__key", "__valid_from"]
 
-            self.table.optimize(columns=columns, vorder=vorder)
+        self.table.optimize(columns=columns)
 
     def get_differences_with_deltatable(self, src: Union[DataFrame, Table, str], **kwargs) -> Optional[DataFrame]:
         if self.is_view:
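
`optimize_table` no longer probes table properties itself: the liquid-clustering and v-order checks move into `Table.optimize` (see the metastore hunk below), leaving the job layer to pick z-order candidates from the CDC mode alone. A standalone sketch of that mapping; the function name is illustrative:

```python
from typing import List, Optional

# Z-order candidates per CDC mode, mirroring the branch above.
def zorder_columns(change_data_capture: str) -> Optional[List[str]]:
    if change_data_capture == "scd1":
        return ["__key"]
    if change_data_capture == "scd2":
        return ["__key", "__valid_from"]
    return None  # e.g. nocdc: let Table.optimize run without z-order

assert zorder_columns("scd2") == ["__key", "__valid_from"]
```
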
@@ -36,8 +36,6 @@ class Processor(Generator):
         columns = self.get_columns(src, backtick=False)
         fields = [c for c in columns if not c.startswith("__")]
 
-        has_data = self.has_data(src)
-
         keys = kwargs.get("keys", None)
         mode = kwargs.get("mode", "complete")
 
@@ -80,6 +78,12 @@ class Processor(Generator):
         deduplicate_hash = kwargs.get("deduplicate_hash", None)
         soft_delete = kwargs.get("soft_delete", None)
         correct_valid_from = kwargs.get("correct_valid_from", None)
+        delete_missing = kwargs.get("delete_missing", None)
+
+        if mode == "update" and delete_missing:
+            has_data = self.has_data(src)
+        else:
+            has_data = True
 
         if slice is None:
             if mode == "update" and has_timestamp and has_rows:
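
Together with the removal in the previous hunk, the emptiness check now runs only on the one path that consumes it: an update triggered by `delete_missing` (which the `scd.py` hunk below sets). A sketch of the deferred-evaluation pattern, with a callable standing in for `self.has_data(src)`:

```python
from typing import Callable

def resolve_has_data(mode: str, delete_missing: bool, check: Callable[[], bool]) -> bool:
    # Only the delete_missing reload needs to know whether the source is
    # empty; every other path behaves as if data were present.
    if mode == "update" and delete_missing:
        return check()
    return True

calls = []
def expensive_check() -> bool:
    calls.append(1)
    return False

assert resolve_has_data("complete", False, expensive_check) is True
assert calls == []  # the source was never scanned
```
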
fabricks/cdc/scd.py CHANGED
@@ -9,6 +9,7 @@ from fabricks.metastore.table import Table
 class SCD(BaseCDC):
     def delete_missing(self, src: Union[DataFrame, Table, str], **kwargs):
         kwargs["add_operation"] = "reload"
+        kwargs["delete_missing"] = True
         kwargs["mode"] = "update"
         self.merge(src, **kwargs)
 
@@ -54,7 +54,6 @@ class Configurator(ABC):
     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
     _change_data_capture: Optional[ChangeDataCaptures] = None
     _mode: Optional[Modes] = None
-    _liquid_clustering: Optional[bool] = False
 
     @property
     @abstractmethod
@@ -209,33 +209,38 @@ class Generator(Configurator):
         identity = self.options.table.get_boolean("identity", False)
 
         # first take from job options, then from step options
-        liquid_clustering_job = self.options.table.get_boolean("liquid_clustering", None)
+        liquid_clustering_job = self.options.table.get("liquid_clustering", None)
         liquid_clustering_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
         if liquid_clustering_job is not None:
             liquid_clustering = liquid_clustering_job
         elif liquid_clustering_step:
             liquid_clustering = liquid_clustering_step
 
-        if liquid_clustering:
-            cluster_by = self.options.table.get_list("cluster_by") or []
-            if not cluster_by:
-                if "__source" in df.columns:
-                    cluster_by.append("__source")
-                if "__is_current" in df.columns:
-                    cluster_by.append("__is_current")
-                if "__key" in df.columns:
-                    cluster_by.append("__key")
-                elif "__hash" in df.columns:
-                    cluster_by.append("__hash")
-
-            if not cluster_by:
-                DEFAULT_LOGGER.warning(
-                    "liquid clustering disabled (no clustering columns found)", extra={"job": self}
-                )
-                liquid_clustering = False
-                cluster_by = None
+        if liquid_clustering is not None:
+            if liquid_clustering == "auto":
+                liquid_clustering = True
+                cluster_by = []
 
-        if not liquid_clustering:
+            else:
+                cluster_by = self.options.table.get_list("cluster_by") or []
+                if not cluster_by:
+                    if "__source" in df.columns:
+                        cluster_by.append("__source")
+                    if "__is_current" in df.columns:
+                        cluster_by.append("__is_current")
+                    if "__key" in df.columns:
+                        cluster_by.append("__key")
+                    elif "__hash" in df.columns:
+                        cluster_by.append("__hash")
+
+                if not cluster_by:
+                    DEFAULT_LOGGER.warning(
+                        "liquid clustering disabled (no clustering columns found)", extra={"job": self}
+                    )
+                    liquid_clustering = False
+                    cluster_by = None
+
+        if liquid_clustering is None:
             cluster_by = None
         partition_by = self.options.table.get_list("partition_by")
         if partition_by:
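
Switching from `get_boolean` to `get` turns `liquid_clustering` into a tri-state option: unset (`None`) leaves clustering alone, the new `"auto"` value requests automatic clustering with an empty column list, and any other truthy value keeps the old behaviour of deriving columns from `cluster_by` or the `__`-prefixed system columns. A self-contained sketch of that resolution; the function name and return shape are illustrative:

```python
from typing import List, Optional, Tuple

def resolve_liquid_clustering(
    option, configured: List[str], df_columns: List[str]
) -> Tuple[bool, Optional[List[str]]]:
    if option is None:
        return False, None              # option absent: no clustering
    if option == "auto":
        return True, []                 # empty list -> cluster by auto
    cluster_by = list(configured)
    if not cluster_by:                  # fall back to system columns
        if "__source" in df_columns:
            cluster_by.append("__source")
        if "__is_current" in df_columns:
            cluster_by.append("__is_current")
        if "__key" in df_columns:
            cluster_by.append("__key")
        elif "__hash" in df_columns:
            cluster_by.append("__hash")
    if not cluster_by:
        return False, None              # nothing usable: clustering disabled
    return True, cluster_by

assert resolve_liquid_clustering("auto", [], []) == (True, [])
assert resolve_liquid_clustering(True, [], ["__key"]) == (True, ["__key"])
```
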
@@ -404,11 +409,8 @@ class Generator(Configurator):
                 cluster_by.append("__hash")
 
             if len(cluster_by) > 0:
-                self.table.enable_liquid_clustering(cluster_by)
+                self.table.enable_liquid_clustering(cluster_by, auto=False)
             else:
-                DEFAULT_LOGGER.warning(
-                    "liquid clustering not enabled (no clustering column found)", extra={"job": self}
-                )
-
+                self.table.enable_liquid_clustering(auto=True)
         else:
             DEFAULT_LOGGER.debug("liquid clustering not enabled", extra={"job": self})
@@ -65,15 +65,28 @@ class Table(DbObject):
     @property
     def identity_enabled(self) -> bool:
         assert self.is_registered, f"{self} not registered"
-
         return self.get_property("delta.feature.identityColumns") == "supported"
 
     @property
     def type_widening_enabled(self) -> bool:
         assert self.is_registered, f"{self} not registered"
-
         return self.get_property("delta.enableTypeWidening") == "true"
 
+    @property
+    def liquid_clustering_enabled(self) -> bool:
+        assert self.is_registered, f"{self} not registered"
+        return self.get_property("delta.feature.clustering") == "supported"
+
+    @property
+    def auto_liquid_clustering_enabled(self) -> bool:
+        assert self.is_registered, f"{self} not registered"
+        return self.get_property("delta.clusterByAuto") == "true"
+
+    @property
+    def vorder_enabled(self) -> bool:
+        assert self.is_registered, f"{self} not registered"
+        return self.get_property("delta.parquet.vorder.enabled") == "true"
+
     def drop(self):
         super().drop()
         if self.delta_path.exists():
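
The three new properties expose Delta feature flags (`delta.feature.clustering`, `delta.clusterByAuto`, `delta.parquet.vorder.enabled`) as booleans so that `optimize` can branch on them. `get_property` itself is not shown in this diff; one plausible implementation reads `SHOW TBLPROPERTIES`, sketched here under that assumption:

```python
from typing import Optional
from pyspark.sql import SparkSession

def get_property(spark: SparkSession, qualified_name: str, key: str) -> Optional[str]:
    # Assumption: the real Table.get_property may differ; SHOW TBLPROPERTIES
    # returns (key, value) rows for the table's Delta properties.
    rows = spark.sql(f"show tblproperties {qualified_name}").collect()
    return next((row["value"] for row in rows if row["key"] == key), None)

# The new flags then reduce to string comparisons on well-known keys:
#   liquid_clustering_enabled      -> delta.feature.clustering     == "supported"
#   auto_liquid_clustering_enabled -> delta.clusterByAuto          == "true"
#   vorder_enabled                 -> delta.parquet.vorder.enabled == "true"
```
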
@@ -160,11 +173,14 @@ class Table(DbObject):
             ddl_tblproperties = "-- not tblproperties"
 
         if liquid_clustering:
-            assert cluster_by
-            if isinstance(cluster_by, str):
-                cluster_by = [cluster_by]
-            cluster_by = [f"`{c}`" for c in cluster_by]
-            ddl_cluster_by = "cluster by (" + ", ".join(cluster_by) + ")"
+            if cluster_by:
+                if isinstance(cluster_by, str):
+                    cluster_by = [cluster_by]
+                cluster_by = [f"`{c}`" for c in cluster_by]
+                ddl_cluster_by = "cluster by (" + ", ".join(cluster_by) + ")"
+
+            else:
+                ddl_cluster_by = "cluster by auto"
 
         if partitioning:
             assert partition_by
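
With the hard `assert cluster_by` dropped, the CREATE TABLE template can emit either an explicit clause or the new `cluster by auto`. A runnable sketch of the branch; the helper name is illustrative:

```python
from typing import List, Optional, Union

def render_cluster_by(cluster_by: Optional[Union[str, List[str]]]) -> str:
    # Mirrors the branch above: explicit columns win, otherwise AUTO.
    if cluster_by:
        if isinstance(cluster_by, str):
            cluster_by = [cluster_by]
        return "cluster by (" + ", ".join(f"`{c}`" for c in cluster_by) + ")"
    return "cluster by auto"

assert render_cluster_by(["__key", "__valid_from"]) == "cluster by (`__key`, `__valid_from`)"
assert render_cluster_by(None) == "cluster by auto"
```
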
@@ -388,37 +404,38 @@ class Table(DbObject):
             pass
         self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
 
-    def optimize(
-        self,
-        columns: Optional[Union[str, List[str]]] = None,
-        vorder: Optional[bool] = False,
-    ):
+    def optimize(self, columns: Optional[Union[str, List[str]]] = None):
         assert self.is_registered, f"{self} not registered"
 
         DEFAULT_LOGGER.info("optimize", extra={"job": self})
 
-        zorder_by = columns is not None
-        if zorder_by:
+        if self.liquid_clustering_enabled:
+            self.spark.sql(f"optimize {self.qualified_name}")
+
+        elif self.auto_liquid_clustering_enabled:
+            self.spark.sql(f"optimize {self.qualified_name}")
+
+        elif columns is None:
+            if self.vorder_enabled:
+                DEFAULT_LOGGER.debug("vorder", extra={"job": self})
+                self.spark.sql(f"optimize {self.qualified_name} vorder")
+            else:
+                self.spark.sql(f"optimize {self.qualified_name}")
+
+        else:
             if isinstance(columns, str):
                 columns = [columns]
             columns = [f"`{c}`" for c in columns]
             cols = ", ".join(columns)
 
-            if vorder:
+            if self.vorder_enabled:
                 DEFAULT_LOGGER.debug(f"zorder by {cols} vorder", extra={"job": self})
                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols}) vorder")
+
             else:
                 DEFAULT_LOGGER.debug(f"zorder by {cols}", extra={"job": self})
                 self.spark.sql(f"optimize {self.qualified_name} zorder by ({cols})")
 
-        elif vorder:
-            DEFAULT_LOGGER.debug("vorder", extra={"job": self})
-            self.spark.sql(f"optimize {self.qualified_name} vorder")
-
-        else:
-            DEFAULT_LOGGER.debug("optimize", extra={"job": self})
-            self.spark.sql(f"optimize {self.qualified_name}")
-
     def analyze(self):
         assert self.is_registered, f"{self} not registered"
 
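
`optimize` now derives its strategy from the table's own properties rather than caller-supplied flags, checking liquid clustering first (a Delta table cannot be both clustered and z-ordered), then automatic clustering, then the plain/v-order and z-order variants. A sketch of the statement it would emit for each state; `liquid`, `auto`, and `vorder` stand in for the three new properties:

```python
from typing import List, Optional

def optimize_statement(
    name: str, liquid: bool, auto: bool, vorder: bool, columns: Optional[List[str]] = None
) -> str:
    # Precedence mirrors the new optimize(): clustering beats everything,
    # even if z-order columns were passed in.
    if liquid or auto:
        return f"optimize {name}"
    if columns is None:
        return f"optimize {name} vorder" if vorder else f"optimize {name}"
    cols = ", ".join(f"`{c}`" for c in columns)
    suffix = " vorder" if vorder else ""
    return f"optimize {name} zorder by ({cols}){suffix}"

assert optimize_statement("db.t", True, False, True, ["__key"]) == "optimize db.t"
assert optimize_statement("db.t", False, False, True, ["__key"]) == "optimize db.t zorder by (`__key`) vorder"
```
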
@@ -658,18 +675,25 @@ class Table(DbObject):
         df = self.spark.sql(f"describe history {self.qualified_name}")
         return df
 
-    def enable_liquid_clustering(self, columns: Union[str, List[str]]):
+    def enable_liquid_clustering(self, columns: Optional[Union[str, List[str]]] = None, auto: Optional[bool] = False):
         assert self.is_registered, f"{self} not registered"
 
-        if isinstance(columns, str):
-            columns = [columns]
-        columns = [f"`{c}`" for c in columns]
-        cols = ", ".join(columns)
-        DEFAULT_LOGGER.info(f"cluster by {cols}", extra={"job": self})
+        if auto:
+            DEFAULT_LOGGER.info("cluster by auto", extra={"job": self})
+            self.spark.sql(f"alter table {self.qualified_name} cluster by automatic")
 
-        self.spark.sql(
-            f"""
-            alter table {self.qualified_name}
-            cluster by ({cols})
-            """
-        )
+        else:
+            assert columns, "at least one clustering column must be specified"
+
+            if isinstance(columns, str):
+                columns = [columns]
+            columns = [f"`{c}`" for c in columns]
+            cols = ", ".join(columns)
+
+            DEFAULT_LOGGER.info(f"cluster by {cols}", extra={"job": self})
+            self.spark.sql(
+                f"""
+                alter table {self.qualified_name}
+                cluster by ({cols})
+                """
+            )
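
`enable_liquid_clustering` now covers both ALTER forms: `cluster by automatic` when `auto=True`, and an explicit backticked column list otherwise, guarded by the new assertion. A runnable sketch of the SQL each call produces; the table name is illustrative:

```python
from typing import List, Optional, Union

def clustering_ddl(
    qualified_name: str,
    columns: Optional[Union[str, List[str]]] = None,
    auto: bool = False,
) -> str:
    if auto:
        return f"alter table {qualified_name} cluster by automatic"
    assert columns, "at least one clustering column must be specified"
    if isinstance(columns, str):
        columns = [columns]
    cols = ", ".join(f"`{c}`" for c in columns)
    return f"alter table {qualified_name} cluster by ({cols})"

print(clustering_ddl("gold.fact_orders", auto=True))
print(clustering_ddl("gold.fact_orders", ["__key", "__hash"]))
```
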
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fabricks
-Version: 3.0.4
+Version: 3.0.5.1
 Author-email: BMS DWH Team <bi_support@bmsuisse.ch>
 Requires-Python: <4,>=3.9
 Requires-Dist: azure-data-tables<13,>=12.5.0
@@ -8,6 +8,7 @@ Requires-Dist: azure-identity>=1.10.0
 Requires-Dist: azure-storage-blob>=12.14.1
 Requires-Dist: azure-storage-queue<13,>=12.10.0
 Requires-Dist: databricks-sdk>=0.20.0
+Requires-Dist: importlib-metadata>=8.6.1
 Requires-Dist: jinja2>=2.11.3
 Requires-Dist: pydantic-settings
 Requires-Dist: pydantic-yaml>=1.4.0
@@ -10,6 +10,7 @@ fabricks/api/parsers.py,sha256=nPUDzQ_Hz0fVmnBfGCqqHo7X7R6M-oGsXWDYSikjB54,121
 fabricks/api/schedules.py,sha256=omxxRU5xC_ee5pA5v1ZXpz9pv0INqEdBIlhs1RYkhIk,349
 fabricks/api/udfs.py,sha256=3JTX4OWkoW7_AP9pUKHVS0C6zIBVdOJoAn8MpmB6R48,124
 fabricks/api/utils.py,sha256=a-YrCXkDFzMmcNN8QOSDs_-YQtSePaDP4C4WYMX2AEg,196
+fabricks/api/version.py,sha256=FukX94EbtmJMeajxyOwka8fMfFeaVc0cuM3I5CVIuK8,85
 fabricks/api/views.py,sha256=dPqsGgDs9QOYc-5_QG_i4F_VoaFO5hGZQnIPZ31h5Ps,156
 fabricks/api/cdc/__init__.py,sha256=Cl3LhLbQrA42IvNLqoV7CCbjQEYQMJfO6cAZv1l1aas,196
 fabricks/api/cdc/nocdc.py,sha256=3E1Cn6cPHfEszGMaHEknrLqEvVKS-5-hk8s_GRu6TYY,58
@@ -31,16 +32,16 @@ fabricks/api/notebooks/vacuum.py,sha256=F88-alJyR4rh1ZB4CbvMqyJvAC-6l73GHtq6eBkk
 fabricks/cdc/__init__.py,sha256=_ncE8b8xuT2HqWC3JiCa4JCb_na2xQnVz3M6tLkAXD8,302
 fabricks/cdc/cdc.py,sha256=2CjPUtogWjnvyLjwiyVllcyDV1gpJ0QoRP0yUsiHXuc,69
 fabricks/cdc/nocdc.py,sha256=Nwj0pE3NjSVyLxKs9PUimHzWcKN5ehHt1trrlq69qE4,518
-fabricks/cdc/scd.py,sha256=r1NVK9QAKJG4tRSpEAksvOO3nAuNwRLZoGmNG2TsypE,630
+fabricks/cdc/scd.py,sha256=HzC9ifEu45B4P2aOSgi97AGB-C56l6sKTLqdVinnHKo,670
 fabricks/cdc/scd1.py,sha256=WsOVRsp55WEw4-7nEtb3dfv310icExrj-zEJSEehyz8,334
 fabricks/cdc/scd2.py,sha256=4vZkhc8pJAUlgiBmIw9j_2RsWuAFMcgCkU3WMVt0A-A,334
 fabricks/cdc/base/__init__.py,sha256=1uec9NHg3J5TWPMR09EsCMO1g8_3Dt6ZhC_b61Sg7JY,143
 fabricks/cdc/base/_types.py,sha256=IMI5bT4IFfqSnjTVrPBHsJkRXNdaRcMVUYW8qpfsTs0,82
 fabricks/cdc/base/cdc.py,sha256=9w5BqQxSVbFVEozJWmZQThqdppkE_SYi4fHSzJ7WMvA,78
-fabricks/cdc/base/configurator.py,sha256=fbQg4C1AH0BIis_Pdrv3BLkcjGnYOaCrxbjNc-95bj8,5269
-fabricks/cdc/base/generator.py,sha256=r_6S556wuNvl4eqo1L6-AfyPNj3mHQl8lTspWiFGFYU,6161
+fabricks/cdc/base/configurator.py,sha256=lInLgLUm_h2VN43vUFzOsxwk4yOQxnDplR6F9fC0rEE,5271
+fabricks/cdc/base/generator.py,sha256=OQuNGjblFeCP0JLJpJKbaXQmzREISw7PLiN04Nt9nu4,5735
 fabricks/cdc/base/merger.py,sha256=suule_MRyI-qXwBaUpKiBLmduZpvI01nwn6MBarkI24,3991
-fabricks/cdc/base/processor.py,sha256=e0JKpz6sGGXvc6U6IFagxy8iZVtE3YAhfLP2mWXMiBs,14239
+fabricks/cdc/base/processor.py,sha256=d52pvaHI9KTvztT2NCb8Vl-tIGc66ZPyuxXR4p3Vq1U,14393
 fabricks/cdc/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fabricks/cdc/templates/filter.sql.jinja,sha256=H0-nAN7HzxDa3p_Qu2U_LeJnBCcBKR6xzIM0VnckeuQ,234
 fabricks/cdc/templates/merge.sql.jinja,sha256=iNpgqGiuI2QABmyTkHCibRr_5r7SASb3yqojhNP3e20,144
@@ -115,9 +116,9 @@ fabricks/core/jobs/silver.py,sha256=wn6c6hoeppjlWf6EutB_8qE5Sxu2PIWk5iQecRUuJ5o,
 fabricks/core/jobs/base/__init__.py,sha256=_AdWtyL7yZG2TOZ9e8WyNPrOjmm6EDkI_TNym5cLDws,208
 fabricks/core/jobs/base/_types.py,sha256=xNKHpzof_mPd97ytvk3wrPnXd1_VdLilh1yQgFF3Ois,6769
 fabricks/core/jobs/base/checker.py,sha256=LPK5f3ucT7T4Z7LjlOyHPXFfb94J_DdYVp6X85wIvDk,5324
-fabricks/core/jobs/base/configurator.py,sha256=ARj920yJJdNtD0Iz8IjbhSyJqlVrXzSz29n0mXofo-k,11569
+fabricks/core/jobs/base/configurator.py,sha256=Dwx7B09PDXefmrY5MtkD3NMuiqE66RtvNJomnzaMjfE,11522
 fabricks/core/jobs/base/exception.py,sha256=HrdxEuOfK5rY-ItZvEL3iywLgdpYUpmWFkjjjks7oYc,2318
-fabricks/core/jobs/base/generator.py,sha256=LdI3PDrwee5rjwlFlduA4_s-7rE1AsnFrYdgQJL5_tE,15527
+fabricks/core/jobs/base/generator.py,sha256=3WgL8JweaK31WSIxKGdTsc32dAHAxLOxJT5PQdp_jO4,15657
 fabricks/core/jobs/base/invoker.py,sha256=xJV9fLtY36qfnclqKqNBsjryyR8x39wfhbYJtzOPRyM,6342
 fabricks/core/jobs/base/job.py,sha256=dWmk2PpQH2NETaaDS6KoiefRnDHfDMdCyhmogkdcSFI,93
 fabricks/core/jobs/base/processor.py,sha256=QmyUM11drJ9o8vF5he4rdztcfO7HjiXNhbk_AwJakUM,8324
@@ -145,7 +146,7 @@ fabricks/metastore/_types.py,sha256=NXYxwQHP0sCllM0N6QBbaK4CdtM_m_rHFDxRNRfBcLU,
 fabricks/metastore/database.py,sha256=1EjbRh2b6xEdHJyc4C4xee6FXDiKuPgm-8Q3Gqt7eds,1942
 fabricks/metastore/dbobject.py,sha256=EdxofFMCx6XdqFkm9Z5x4ywW4sstvdpc1d_EhYsE0KY,1883
 fabricks/metastore/pyproject.toml,sha256=6RZM9RMKMDF_EAequhORZ7TD0BQNk7aBCTWAv-sRcp0,519
-fabricks/metastore/table.py,sha256=YywtGc6z_zMwvK9wORvvys50AB-gBd9ZGAdtQq614yc,24312
+fabricks/metastore/table.py,sha256=luKm_kMHBBPHn_J5Tx5Aw6k8yOttgODa4FvzpD-4on0,25453
 fabricks/metastore/utils.py,sha256=8SxhjDkz_aSH4IGUusel7hqOQxP9U8PNBCY0M7GH00Y,1355
 fabricks/metastore/view.py,sha256=Va7xdFtOW9GcDSlyoZNgcF07qty9abtex41au6OSz6c,1381
 fabricks/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -171,6 +172,6 @@ fabricks/utils/schema/get_schema_for_type.py,sha256=u9FFYvWyq9VQdNJNu79-SCN9iGUB
 fabricks/utils/write/__init__.py,sha256=i0UnZenXj9Aq0b0_aU3s6882vg-Vu_AyKfQhl_dTp-g,200
 fabricks/utils/write/delta.py,sha256=mpaSxBNcl6N0QheGLx8rjeyWUvy1Yvvj4raGRv7GL5M,1229
 fabricks/utils/write/stream.py,sha256=wQBpAnQtYA6nl79sPKhVM6u5m-66suX7B6VQ6tW4TOs,622
-fabricks-3.0.4.dist-info/METADATA,sha256=nFCQ4-fewJvmjPgINnMmf07MFDMOyMp8dVOFMd6Vbwo,682
-fabricks-3.0.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-fabricks-3.0.4.dist-info/RECORD,,
+fabricks-3.0.5.1.dist-info/METADATA,sha256=6gsR4UJcnoz4x2dPwdNT3jYIyEZlpdBqCee0-LsTJJ0,725
+fabricks-3.0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+fabricks-3.0.5.1.dist-info/RECORD,,