fabricks 3.0.16__py3-none-any.whl → 3.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -90,6 +90,7 @@ class Configurator(ABC):
90
90
  "__identity",
91
91
  "__source",
92
92
  "__key",
93
+ "__hash",
93
94
  "__timestamp",
94
95
  "__valid_from",
95
96
  "__valid_to",
@@ -110,7 +111,6 @@ class Configurator(ABC):
110
111
  cols = [
111
112
  "__operation",
112
113
  "__metadata",
113
- "__hash",
114
114
  "__rescued_data",
115
115
  ]
116
116
 
@@ -126,6 +126,7 @@ class Configurator(ABC):
126
126
  "__identity",
127
127
  "__source",
128
128
  "__key",
129
+ "__hash",
129
130
  "__timestamp",
130
131
  "__valid_from",
131
132
  "__valid_to",
@@ -134,7 +135,6 @@ class Configurator(ABC):
134
135
  # Trailing
135
136
  "__operation",
136
137
  "__metadata",
137
- "__hash",
138
138
  "__rescued_data",
139
139
  ]
140
140
 
@@ -192,11 +192,6 @@ class Configurator(ABC):
192
192
  leading = self.allowed_ouput_leading__columns
193
193
  trailing = self.allowed_output_trailing__columns
194
194
 
195
- # move __hash to the front of the table to ensure statistics are present
196
- if "__key" not in columns and "__hash" in columns:
197
- leading = ["__hash" if c == "__key" else c for c in leading]
198
- trailing = [c for c in trailing if c != "__hash"]
199
-
200
195
  __leading = [c for c in leading if c in columns]
201
196
  __trailing = [c for c in trailing if c in columns]
202
197
 
@@ -51,6 +51,7 @@ class Processor(Generator):
51
51
 
52
52
  overwrite = []
53
53
  exclude = kwargs.get("exclude", []) # used by silver to exclude __operation from output if not update
54
+ cast = kwargs.get("cast", {}) # used by silver to cast columns to target types
54
55
 
55
56
  order_duplicate_by = kwargs.get("order_duplicate_by", None)
56
57
  if order_duplicate_by:
@@ -140,6 +141,8 @@ class Processor(Generator):
140
141
  # override timestamp if added and found in df
141
142
  if add_timestamp and "__timestamp" in inputs:
142
143
  overwrite.append("__timestamp")
144
+ elif "__timestamp" in inputs:
145
+ cast["__timestamp"] = "timestamp"
143
146
 
144
147
  # override key if added and found in df (key needed for merge)
145
148
  if add_key and "__key" in inputs:
@@ -356,6 +359,8 @@ class Processor(Generator):
356
359
  "correct_valid_from": correct_valid_from,
357
360
  # overwrite
358
361
  "overwrite": overwrite,
362
+ # cast
363
+ "cast": cast,
359
364
  # filter
360
365
  "slices": None,
361
366
  "sources": None,
@@ -5,10 +5,14 @@ with
5
5
  __base as (
6
6
  select
7
7
  *
8
- {% if overwrite %}
8
+ {% if overwrite or cast %}
9
9
  -- will be overwritten below
10
- except ({% for o in overwrite %}{{ o }}, {% endfor %})
10
+ except (
11
+ {% for o in overwrite %}{{ o }}, {% endfor %}
12
+ {% for key, value in cast.items() %} {{ key }}, {% endfor %}
13
+ )
11
14
  {% endif %},
15
+ {% for key, value in cast.items() %} cast({{ key }} as {{ value }}) as {{ key }}, {% endfor %}
12
16
  {% if add_calculated_columns %} {% for c in add_calculated_columns %} {{ c }}, {% endfor %} {% endif %}
13
17
  {% if add_timestamp %} cast(current_date() as timestamp) as __timestamp, {% endif %}
14
18
  {% if add_operation %} cast('{{ add_operation }}' as string) as __operation, {% endif %}
@@ -141,6 +141,9 @@
141
141
  {%- if overwrite %}
142
142
  ☐ overwrite: {{ overwrite | join(", ") | truncate(100, killwords=True) }}
143
143
  {%- endif %}
144
+ {%- if cast %}
145
+ ☐ cast: {{ cast | join(", ") | truncate(100, killwords=True) }}
146
+ {%- endif %}
144
147
 
145
148
  👨‍👩‍👧 PARENTS
146
149
  {%- if parent_slice %}
@@ -1,5 +1,5 @@
1
1
  from abc import abstractmethod
2
- from typing import Optional, Sequence, Union, cast
2
+ from typing import List, Optional, Sequence, Union, cast
3
3
 
4
4
  from pyspark.sql import DataFrame
5
5
  from pyspark.sql.functions import lit
@@ -166,6 +166,31 @@ class Generator(Configurator):
166
166
  """
167
167
  ...
168
168
 
169
+ def _get_clustering_columns(self, df: DataFrame) -> Optional[List[str]]:
170
+ columns = self.options.table.get_list("cluster_by")
171
+ if columns:
172
+ return columns
173
+
174
+ columns = []
175
+
176
+ if "__source" in df.columns:
177
+ columns = ["__source"]
178
+ if "__is_current" in df.columns:
179
+ columns.append("__is_current")
180
+
181
+ if "__key" in df.columns:
182
+ columns.append("__key")
183
+ elif "__hash" in df.columns:
184
+ columns.append("__hash")
185
+
186
+ if columns:
187
+ DEFAULT_LOGGER.debug(f"found clustering columns ({', '.join(columns)})", extra={"label": self})
188
+ return columns
189
+
190
+ else:
191
+ DEFAULT_LOGGER.debug("could not determine any clustering column", extra={"label": self})
192
+ return None
193
+
169
194
  def create_table(self):
170
195
  def _create_table(df: DataFrame, batch: Optional[int] = 0):
171
196
  df = self.base_transform(df)
@@ -242,20 +267,13 @@ class Generator(Configurator):
242
267
  cluster_by = []
243
268
 
244
269
  else:
245
- cluster_by = self.options.table.get_list("cluster_by") or []
246
- if not cluster_by:
247
- if "__source" in df.columns:
248
- cluster_by.append("__source")
249
- if "__is_current" in df.columns:
250
- cluster_by.append("__is_current")
251
- if "__key" in df.columns:
252
- cluster_by.append("__key")
253
- elif "__hash" in df.columns:
254
- cluster_by.append("__hash")
255
-
256
- if not cluster_by:
257
- DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
258
- liquid_clustering = False
270
+ cluster_by = self._get_clustering_columns(df)
271
+
272
+ if cluster_by:
273
+ liquid_clustering = True
274
+
275
+ else:
276
+ liquid_clustering = None
259
277
  cluster_by = None
260
278
 
261
279
  if liquid_clustering is None:
@@ -446,20 +464,12 @@ class Generator(Configurator):
446
464
  enable = enable_step
447
465
 
448
466
  if enable:
449
- cluster_by = self.options.table.get_list("cluster_by") or []
450
- if not cluster_by:
451
- if "__source" in df.columns:
452
- cluster_by.append("__source")
453
- if "__is_current" in df.columns:
454
- cluster_by.append("__is_current")
455
- if "__key" in df.columns:
456
- cluster_by.append("__key")
457
- elif "__hash" in df.columns:
458
- cluster_by.append("__hash")
459
-
460
- if len(cluster_by) > 0:
461
- self.table.enable_liquid_clustering(cluster_by, auto=False)
462
- else:
463
- self.table.enable_liquid_clustering(auto=True)
467
+ cluster_by = self._get_clustering_columns(df)
468
+
469
+ if cluster_by and len(cluster_by) > 0:
470
+ self.table.enable_liquid_clustering(cluster_by, auto=False)
471
+ else:
472
+ self.table.enable_liquid_clustering(auto=True)
473
+
464
474
  else:
465
- DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
475
+ DEFAULT_LOGGER.debug("liquid clustering disabled", extra={"label": self})
@@ -2,7 +2,7 @@ from typing import Optional, Sequence, Union, cast
2
2
 
3
3
  from pyspark.sql import DataFrame
4
4
  from pyspark.sql.functions import expr, lit, md5
5
- from pyspark.sql.types import Row
5
+ from pyspark.sql.types import Row, TimestampType
6
6
 
7
7
  from fabricks.cdc.nocdc import NoCDC
8
8
  from fabricks.context import VARIABLES
@@ -91,6 +91,11 @@ class Bronze(BaseJob):
91
91
  try:
92
92
  df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
93
93
  assert len(df.columns) > 1, "external table must have at least one column"
94
+ if "__timestamp" in df.columns:
95
+ assert isinstance(df.schema["__timestamp"].dataType, TimestampType), (
96
+ "__timestamp must be of type timestamp"
97
+ )
98
+
94
99
  except Exception as e:
95
100
  DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
96
101
  raise e
@@ -579,6 +579,14 @@ class Table(DbObject):
579
579
 
580
580
  return self.spark.sql(f"describe detail {self.qualified_name}")
581
581
 
582
+ def get_partitions(self) -> List[str]:
583
+ assert self.registered, f"{self} not registered"
584
+
585
+ try:
586
+ return self.spark.sql(f"show partitions {self.qualified_name}").columns
587
+ except AnalysisException:
588
+ return []
589
+
582
590
  def get_properties(self) -> DataFrame:
583
591
  assert self.registered, f"{self} not registered"
584
592
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fabricks
3
- Version: 3.0.16
3
+ Version: 3.0.17
4
4
  Author-email: BMS DWH Team <bi_support@bmsuisse.ch>
5
5
  Requires-Python: <4,>=3.9
6
6
  Requires-Dist: azure-data-tables<13,>=12.5.0
@@ -38,15 +38,15 @@ fabricks/cdc/scd2.py,sha256=4vZkhc8pJAUlgiBmIw9j_2RsWuAFMcgCkU3WMVt0A-A,334
38
38
  fabricks/cdc/base/__init__.py,sha256=kU4LmQ7x1rekCt8T3r83MmAQac6n2ov-Gh8mBbxIC48,157
39
39
  fabricks/cdc/base/_types.py,sha256=WloCDC3ATrn0aZJ6E8BRYKZx19N3EE56r6qlBYhcuvQ,257
40
40
  fabricks/cdc/base/cdc.py,sha256=9w5BqQxSVbFVEozJWmZQThqdppkE_SYi4fHSzJ7WMvA,78
41
- fabricks/cdc/base/configurator.py,sha256=w6Ywif87iv1WG-5OM3XkzIRrsns-_QQ6XlADpk0YLlw,6434
41
+ fabricks/cdc/base/configurator.py,sha256=DzODL0wO0FqfmIMQKEofqQKYeOCR6vaO0Am92ctcAk4,6158
42
42
  fabricks/cdc/base/generator.py,sha256=pa_GJn7Pdi5vMnXN8zExmOPMpCqdZ3QoxHEB0wv0lsk,5933
43
43
  fabricks/cdc/base/merger.py,sha256=3qUUs0uqmwOMdXc50kV3Zo9omuQuUUFgtMLBrg4E-wk,4159
44
- fabricks/cdc/base/processor.py,sha256=gL3pWMaBRsc0oB93ISnH2x07WbmtM_QEIx8qrUcUoZ0,17704
44
+ fabricks/cdc/base/processor.py,sha256=hExUGGVOX1Px0IWw65MOaDeeUbSMtjRekLdqEQ0U6Mw,17921
45
45
  fabricks/cdc/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
46
46
  fabricks/cdc/templates/filter.sql.jinja,sha256=AQcOj4KRwKscVG3zepnEAN9Yxb03AM7068hqW7dtVI8,236
47
47
  fabricks/cdc/templates/merge.sql.jinja,sha256=YS9wWckCVsUI1pUYiRSFBIuQ16WU3EPWSkhZVy2niBA,221
48
48
  fabricks/cdc/templates/query.sql.jinja,sha256=Z0kSm9sRKJTLQ2Lb3NS7yu93GBxNls9HL7uAjTdirjk,868
49
- fabricks/cdc/templates/ctes/base.sql.jinja,sha256=gO0dEQ00_NXOeZKRxfWvoUjpks0nHK-Is5H7xdI_J6s,1637
49
+ fabricks/cdc/templates/ctes/base.sql.jinja,sha256=rtp5hRMxcoD3NupzlAFvH8BREKHUnLEirHvJCoOmQk8,1871
50
50
  fabricks/cdc/templates/ctes/current.sql.jinja,sha256=bb72XdaJVce5k57dx_N6T4OfyQDUrFLulr3VM6irdn0,1278
51
51
  fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja,sha256=DHcSNhAvoFZY0gSv3Zylat9ysGXKR22OGXs49KTNCjA,1095
52
52
  fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja,sha256=4-K4nk2Bfr_8c_x3YDYBP7JUx4c-b3Ef-7HFx0s2jrc,1122
@@ -61,7 +61,7 @@ fabricks/cdc/templates/merges/nocdc.sql.jinja,sha256=lA4-PXogC_u6RqummOQX_WBFCNq
61
61
  fabricks/cdc/templates/merges/scd1.sql.jinja,sha256=GimfwgEobGgCzPce_FJdvQY9jmRJXFUZ4_CVhRgTTqY,1600
62
62
  fabricks/cdc/templates/merges/scd2.sql.jinja,sha256=5qBO_1lr7xa-Ep8RqvJGCRISOv4uo_tiAtVOybp4tUU,1210
63
63
  fabricks/cdc/templates/queries/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
- fabricks/cdc/templates/queries/context.sql.jinja,sha256=19mAAAM9Gh2PAzuupCzc68zxEHH3spTpHZHGU9vke8o,3883
64
+ fabricks/cdc/templates/queries/context.sql.jinja,sha256=x-t3CzdRaSXi8Xi7aYaNdIVyRwVOxmxw8xBZyfnFE40,3977
65
65
  fabricks/cdc/templates/queries/final.sql.jinja,sha256=vxH434CO5k8Ia7tugaH8LC1co7Epaj7Z1M7Y9BdqzaI,111
66
66
  fabricks/cdc/templates/queries/scd1.sql.jinja,sha256=siHULgKE3uRBGQYZFUR_eHNqFuGgO9xUCRVV2jnAXAI,3019
67
67
  fabricks/cdc/templates/queries/scd2.sql.jinja,sha256=Nn0wUs9N-_QviZqUKRWAFdD17RR3EFBTMs9BpBu6z7E,3877
@@ -91,7 +91,7 @@ fabricks/core/dags/run.py,sha256=RIDULb9WakObSyYzmkglh8RwFRwC8-NFC-1yPDMkBC0,107
91
91
  fabricks/core/dags/terminator.py,sha256=Y6pV2UnSyrCIx2AQWJXoHk5Roq12gZqpennHx_Lbnzs,793
92
92
  fabricks/core/dags/utils.py,sha256=4kyobLGl4tO0Flo6LxNzYjCU_G42vns1LrkxTO5_KLY,1585
93
93
  fabricks/core/jobs/__init__.py,sha256=W_1m6LoGiXBml_8cucedju_hllSSnZGKsZjyFh-2oJw,531
94
- fabricks/core/jobs/bronze.py,sha256=eDH2YLHbOgBoJoGZTFLJS9igqnqkeJtM56nahQK6zJ4,13815
94
+ fabricks/core/jobs/bronze.py,sha256=mz3YDm4ft9sP51k4lUR6gmT1rkQBLvdaePAD3uo3m1c,14040
95
95
  fabricks/core/jobs/get_job.py,sha256=35zay3Z_WoJIylzEQlGle6UvrE1EClfRbFEVGvszof0,3675
96
96
  fabricks/core/jobs/get_job_conf.py,sha256=3vAj_usCbNqejMUKOF85LPaHBYAwxgrDG7LYgY-vBUw,4812
97
97
  fabricks/core/jobs/get_job_id.py,sha256=6dLyzxGHlRvJZVJSwZkCk3iXzWkIhePC_6FhoP0gEN4,607
@@ -105,7 +105,7 @@ fabricks/core/jobs/base/_types.py,sha256=y66BtJlJskq7wGzn7te5XYjO-NEqeQGUC11kkbe
105
105
  fabricks/core/jobs/base/checker.py,sha256=Cdfh8rQYy4MvMFl0HyC3alGUWm8zrFXk08m2t2JMu6Y,5477
106
106
  fabricks/core/jobs/base/configurator.py,sha256=9G5F7Qg5FWHPbHgdh8Qxc85OoSX0rnjD4c9itwU5KKc,10415
107
107
  fabricks/core/jobs/base/exception.py,sha256=HrdxEuOfK5rY-ItZvEL3iywLgdpYUpmWFkjjjks7oYc,2318
108
- fabricks/core/jobs/base/generator.py,sha256=Dk82tj21NhR9wwgXzMp8JlKQ6D9HnjVlK9fvDmoYLbk,17646
108
+ fabricks/core/jobs/base/generator.py,sha256=TWiJtQvEH0uH8YMFlOxxi6CpWnrDNua61PCbdamOtHA,17487
109
109
  fabricks/core/jobs/base/invoker.py,sha256=FvjfpNqi542slxC2yLu1BIu5EklNUWySxDF8cD_SqKQ,7602
110
110
  fabricks/core/jobs/base/job.py,sha256=dWmk2PpQH2NETaaDS6KoiefRnDHfDMdCyhmogkdcSFI,93
111
111
  fabricks/core/jobs/base/processor.py,sha256=qkNiJSSLaEnivKGBcd9UZyIVFexnv-n1p_5mCZIy1rA,9076
@@ -143,7 +143,7 @@ fabricks/metastore/_types.py,sha256=NXYxwQHP0sCllM0N6QBbaK4CdtM_m_rHFDxRNRfBcLU,
143
143
  fabricks/metastore/database.py,sha256=23VAKKzjrwlEaj28DNNmiOhcfdKRzYk8eEfq-PzINbg,1924
144
144
  fabricks/metastore/dbobject.py,sha256=ve8p48OqEpJYsqWNhgesGSE0emM--uY8QrvBRoR3j3g,1881
145
145
  fabricks/metastore/pyproject.toml,sha256=6RZM9RMKMDF_EAequhORZ7TD0BQNk7aBCTWAv-sRcp0,519
146
- fabricks/metastore/table.py,sha256=AaoNL-1mz4A0CCb3tH_0BUurYPjA1oL5pioCYlEMtu4,29113
146
+ fabricks/metastore/table.py,sha256=lAxOky3BgpPudcIEZl2UbNgFiTHYfdtvcIHy9D5D78s,29368
147
147
  fabricks/metastore/utils.py,sha256=8SxhjDkz_aSH4IGUusel7hqOQxP9U8PNBCY0M7GH00Y,1355
148
148
  fabricks/metastore/view.py,sha256=f7hKJWtnH1KmZym8dkoucKOTndntzai_f2YqferxHLs,1431
149
149
  fabricks/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -171,6 +171,6 @@ fabricks/utils/schema/get_schema_for_type.py,sha256=5k-R6zCgUAtapQgxT4turcx1IQ-b
171
171
  fabricks/utils/write/__init__.py,sha256=i0UnZenXj9Aq0b0_aU3s6882vg-Vu_AyKfQhl_dTp-g,200
172
172
  fabricks/utils/write/delta.py,sha256=lTQ0CfUhcvn3xTCcT_Ns6PMDBsO5UEfa2S9XpJiLJ9c,1250
173
173
  fabricks/utils/write/stream.py,sha256=wQBpAnQtYA6nl79sPKhVM6u5m-66suX7B6VQ6tW4TOs,622
174
- fabricks-3.0.16.dist-info/METADATA,sha256=EUzxCf5zxbG8Dmst3_r4jyjK4Xb78n3pY0gajpZwqrg,798
175
- fabricks-3.0.16.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
176
- fabricks-3.0.16.dist-info/RECORD,,
174
+ fabricks-3.0.17.dist-info/METADATA,sha256=JTkqiUkvRKtEQKIWDsQhsNcmvMJ81vCx9kQhyxfznwM,798
175
+ fabricks-3.0.17.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
176
+ fabricks-3.0.17.dist-info/RECORD,,