fabricks 3.0.1__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,7 +1,7 @@
 import re
 from typing import Optional
 
-from azure.core.exceptions import ServiceRequestError
+from azure.core.exceptions import AzureError
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
@@ -31,7 +31,7 @@ class BaseDags:
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=1, max=10),
-        retry=retry_if_exception_type((Exception, ServiceRequestError)),
+        retry=retry_if_exception_type((Exception, AzureError)),
         reraise=True,
     )
     def get_table(self) -> AzureTable:
@@ -4,7 +4,7 @@ import time
 from multiprocessing import Process
 from typing import Any, List, Union
 
-from azure.core.exceptions import ServiceRequestError
+from azure.core.exceptions import AzureError
 from databricks.sdk.runtime import dbutils, spark
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
 
@@ -49,7 +49,7 @@ class DagProcessor(BaseDags):
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=1, max=10),
-        retry=retry_if_exception_type((Exception, ServiceRequestError)),
+        retry=retry_if_exception_type((Exception, AzureError)),
         reraise=True,
     )
     def query(self, data: Any) -> List[dict]:
@@ -58,7 +58,7 @@ class DagProcessor(BaseDags):
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=1, max=10),
-        retry=retry_if_exception_type((Exception, ServiceRequestError)),
+        retry=retry_if_exception_type((Exception, AzureError)),
         reraise=True,
     )
     def upsert(self, data: Any) -> None:
@@ -67,7 +67,7 @@ class DagProcessor(BaseDags):
     @retry(
         stop=stop_after_attempt(3),
         wait=wait_exponential(multiplier=1, min=1, max=10),
-        retry=retry_if_exception_type((Exception, ServiceRequestError)),
+        retry=retry_if_exception_type((Exception, AzureError)),
        reraise=True,
     )
     def delete(self, data: Any) -> None:
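In both fabricks/core/dags/base.py and fabricks/core/dags/processor.py the retry policy now triggers on azure.core.exceptions.AzureError, the common base class of the Azure SDK exceptions, instead of only ServiceRequestError, so response and throttling errors from the Azure Tables backend are retried as well (the tuple also keeps Exception, which already subsumes AzureError). A minimal sketch of the pattern, not taken from the package:

    from azure.core.exceptions import AzureError
    from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential


    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        retry=retry_if_exception_type((Exception, AzureError)),
        reraise=True,
    )
    def fetch_rows():
        # hypothetical stand-in for BaseDags.get_table / DagProcessor.query:
        # any AzureError raised here is retried up to 3 times with exponential backoff
        ...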
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Literal, Optional, TypedDict, Union
+from typing import Any, List, Literal, Optional, TypedDict, Union
 
 from pydantic import BaseModel, ConfigDict, model_validator
 from pyspark.sql.types import StringType, StructField, StructType
@@ -33,8 +33,8 @@ Origins = Literal["parser", "job"]
 
 
 class SparkOptions(TypedDict):
-    sql: Optional[dict[str, str]]
-    conf: Optional[dict[str, str]]
+    sql: Optional[dict[Any, Any]]
+    conf: Optional[dict[Any, Any]]
 
 
 class TableOptions(TypedDict):
@@ -45,17 +45,17 @@ class TableOptions(TypedDict):
     cluster_by: Optional[List[str]]
     powerbi: Optional[bool]
     bloomfilter_by: Optional[List[str]]
-    constraints: Optional[dict[str, str]]
-    properties: Optional[dict[str, str]]
+    constraints: Optional[dict[Any, Any]]
+    properties: Optional[dict[Any, Any]]
     comment: Optional[str]
-    calculated_columns: Optional[dict[str, str]]
+    calculated_columns: Optional[dict[Any, Any]]
     retention_days: Optional[int]
 
 
 class _InvokeOptions(TypedDict):
     notebook: str
     timeout: int
-    arguments: Optional[dict[str, str]]
+    arguments: Optional[dict[Any, Any]]
 
 
 class InvokerOptions(TypedDict):
@@ -66,7 +66,7 @@ class InvokerOptions(TypedDict):
 
 class ExtenderOptions(TypedDict):
     extender: str
-    arguments: Optional[dict[str, str]]
+    arguments: Optional[dict[Any, Any]]
 
 
 class CheckOptions(TypedDict):
@@ -90,7 +90,7 @@ class BronzeOptions(TypedDict):
     filter_where: Optional[str]
     # extra
     encrypted_columns: Optional[List[str]]
-    calculated_columns: Optional[dict[str, str]]
+    calculated_columns: Optional[dict[Any, Any]]
     operation: Optional[Operations]
     timeout: Optional[int]
 
@@ -106,7 +106,7 @@ class SilverOptions(TypedDict):
     deduplicate: Optional[bool]
     stream: Optional[bool]
     # else
-    order_duplicate_by: Optional[dict[str, str]]
+    order_duplicate_by: Optional[dict[Any, Any]]
     timeout: Optional[int]
 
 
@@ -141,7 +141,7 @@ class BaseJobConf:
 
 @dataclass
 class JobConfBronze(BaseJobConf):
-    step: TBronze
+    step: str
     options: BronzeOptions
     table_options: Optional[TableOptions] = None
     parser_options: Optional[ParserOptions] = None
@@ -155,7 +155,7 @@ class JobConfBronze(BaseJobConf):
 
 @dataclass
 class JobConfSilver(BaseJobConf):
-    step: TSilver
+    step: str
     options: SilverOptions
     table_options: Optional[TableOptions] = None
     check_options: Optional[CheckOptions] = None
@@ -168,7 +168,7 @@ class JobConfSilver(BaseJobConf):
 
 @dataclass
 class JobConfGold(BaseJobConf):
-    step: TGold
+    step: str
     options: Optional[GoldOptions]
     table_options: Optional[TableOptions] = None
     check_options: Optional[CheckOptions] = None
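In fabricks/core/jobs/base/_types.py the option mappings are loosened from dict[str, str] to dict[Any, Any] (with Any added to the typing import), and the step fields of JobConfBronze, JobConfSilver and JobConfGold are relaxed from the TBronze/TSilver/TGold literals to plain str. An illustrative snippet, assuming only the SparkOptions TypedDict shown above, of a value that now passes type checking even though it mixes value types:

    spark_options: SparkOptions = {
        "sql": {"spark.sql.shuffle.partitions": 32},  # int value, accepted with dict[Any, Any]
        "conf": {"spark.databricks.delta.optimizeWrite.enabled": True},  # bool value
    }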
@@ -301,9 +301,8 @@ class Configurator(ABC):
         """
         if self.mode == "memory":
             DEFAULT_LOGGER.debug("memory (no optimize)", extra={"job": self})
-        else:
-            assert self.table.exists()
 
+        else:
             if vacuum:
                 self.vacuum()
             if optimize:
@@ -312,19 +311,23 @@ class Configurator(ABC):
                 self.table.compute_statistics()
 
     def vacuum(self):
-        job = self.options.table.get("retention_days")
-        step = self.step_conf.get("table_options", {}).get("retention_days", None)
-        runtime = CONF_RUNTIME.get("options", {}).get("retention_days")
-
-        if job is not None:
-            retention_days = job
-        elif step:
-            retention_days = step
+        if self.mode == "memory":
+            DEFAULT_LOGGER.debug("memory (no vacuum)", extra={"job": self})
+
         else:
-            assert runtime
-            retention_days = runtime
+            job = self.options.table.get("retention_days")
+            step = self.step_conf.get("table_options", {}).get("retention_days", None)
+            runtime = CONF_RUNTIME.get("options", {}).get("retention_days")
+
+            if job is not None:
+                retention_days = job
+            elif step:
+                retention_days = step
+            else:
+                assert runtime
+                retention_days = runtime
 
-        self.table.vacuum(retention_days=retention_days)
+            self.table.vacuum(retention_days=retention_days)
 
     def __str__(self):
         return f"{self.step}.{self.topic}_{self.item}"
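Configurator.vacuum in fabricks/core/jobs/base/configurator.py now skips memory-mode jobs entirely and only resolves a retention period for physical tables, with the same precedence as before: the job-level retention_days option, then the step's table_options, then the runtime default. A minimal sketch of that resolution order, using hypothetical plain-dict arguments rather than the fabricks objects:

    def resolve_retention_days(job_options: dict, step_conf: dict, runtime_conf: dict) -> int:
        # precedence: job option > step table_options > runtime default
        job = job_options.get("retention_days")
        step = step_conf.get("table_options", {}).get("retention_days")
        runtime = runtime_conf.get("options", {}).get("retention_days")

        if job is not None:
            return job
        if step:
            return step
        assert runtime, "a runtime-level retention_days is expected as the fallback"
        return runtime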
@@ -103,6 +103,21 @@ class Bronze(BaseJob):
         DEFAULT_LOGGER.debug("drop external table", extra={"job": self})
         self.spark.sql(f"drop table if exists {self.qualified_name}")
 
+    def analyze_external_table(self):
+        DEFAULT_LOGGER.debug("analyze external table", extra={"job": self})
+        self.spark.sql(f"analyze table {self.qualified_name} compute statistics")
+
+    def vacuum_external_table(self, retention_hours: Optional[int] = 168):
+        from delta import DeltaTable
+
+        DEFAULT_LOGGER.debug("vacuum external table", extra={"job": self})
+        try:
+            dt = DeltaTable.forPath(self.spark, self.data_path.string)
+            self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
+            dt.vacuum(retention_hours)
+        finally:
+            self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
+
     def optimize_external_table(
         self,
         vacuum: Optional[bool] = True,
@@ -110,20 +125,10 @@
     ):
         DEFAULT_LOGGER.debug("optimize external table", extra={"job": self})
         if vacuum:
-            from delta import DeltaTable
-
-            dt = DeltaTable.forPath(self.spark, self.data_path.string)
-            retention_days = 7
-            DEFAULT_LOGGER.debug(f"{self.data_path} - vacuum table (removing files older than {retention_days} days)")
-            try:
-                self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
-                dt.vacuum(retention_days * 24)
-            finally:
-                self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
+            self.vacuum_external_table()
 
         if analyze:
-            DEFAULT_LOGGER.debug(f"{self.data_path} - compute delta statistics")
-            self.spark.sql(f"analyze table delta.`{self.data_path}` compute delta statistics")
+            self.analyze_external_table()
 
     @property
     def parser(self) -> BaseParser:
@@ -318,9 +323,9 @@ class Bronze(BaseJob):
 
     def for_each_run(self, **kwargs):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no run)", extra={"job": self})
+            DEFAULT_LOGGER.debug("register (no run)", extra={"job": self})
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no run)", extra={"job": self})
+            DEFAULT_LOGGER.debug("memory (no run)", extra={"job": self})
         else:
             super().for_each_run(**kwargs)
 
@@ -370,6 +375,14 @@ class Bronze(BaseJob):
         else:
             super().optimize(vacuum=vacuum, optimize=optimize, analyze=analyze)
 
+    def vacuum(self):
+        if self.mode == "memory":
+            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"job": self})
+        elif self.mode == "register":
+            self.vacuum_external_table()
+        else:
+            super().vacuum()
+
     def overwrite(self):
         self.truncate()
         self.run()
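Bronze (fabricks/core/jobs/bronze.py) gains analyze_external_table and vacuum_external_table helpers, which optimize_external_table now delegates to; the 168-hour default retention matches the 7 days * 24 hours previously hard-coded inline. A new vacuum override logs and skips memory-mode jobs, vacuums the external Delta table in register mode, and otherwise falls back to the base implementation. For reference, a self-contained sketch of vacuuming a Delta table by path with a retention below the default 7-day check, assuming delta-spark, an active SparkSession named spark and a hypothetical path:

    from delta import DeltaTable

    dt = DeltaTable.forPath(spark, "/mnt/bronze/sales/orders")  # hypothetical location
    spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = false")
    try:
        dt.vacuum(24)  # retention in hours; removes files no longer referenced by the log
    finally:
        spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled = true")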
@@ -433,7 +433,11 @@ class Table(DbObject):
         cols = [
             f"`{name}`"
             for name, dtype in self.dataframe.dtypes
-            if not dtype.startswith("struct") and not dtype.startswith("array") and name not in ["__metadata"]
+            if not dtype.startswith("struct")
+            and not dtype.startswith("array")
+            and not dtype.startswith("variant")
+            and not dtype.startswith("map")
+            and name not in ["__metadata"]
         ]
         cols = ", ".join(sorted(cols))
         self.spark.sql(f"analyze table {self.qualified_name} compute statistics for columns {cols}")
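The column-statistics code in Table (fabricks/metastore/table.py) now also excludes variant and map columns from the column list, since Spark's ANALYZE TABLE ... COMPUTE STATISTICS FOR COLUMNS does not support complex column types. A standalone sketch of the same filter, assuming a DataFrame df and a hypothetical table name:

    skipped = ("struct", "array", "variant", "map")
    cols = [
        f"`{name}`"
        for name, dtype in df.dtypes  # (column name, type string) pairs
        if not dtype.startswith(skipped) and name != "__metadata"
    ]
    stmt = f"analyze table my_table compute statistics for columns {', '.join(sorted(cols))}"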
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: fabricks
-Version: 3.0.1
+Version: 3.0.3
 Author-email: BMS DWH Team <bi_support@bmsuisse.ch>
 Requires-Python: <4,>=3.9
 Requires-Dist: azure-data-tables<13,>=12.5.0
@@ -93,10 +93,10 @@ fabricks/core/udfs.py,sha256=sxahq2WZSy427i8_HLV1y4P_RjdvCRrm3muXGMufr1s,3440
 fabricks/core/utils.py,sha256=qdn2ElpqBgDsW55-tACWZaFOT0ebrBYg2fenqSgd6YI,2456
 fabricks/core/views.py,sha256=6fmQgxXIB3NDhabti0vs5DSlq1kHJ-forNHJgvBxslU,1013
 fabricks/core/dags/__init__.py,sha256=0DUKzVcXcROvxkN19P_kaOJ7da5BAM7Vt8EGQbp2KSY,240
-fabricks/core/dags/base.py,sha256=31rAck3eAYGXDPUbFGIIhnw3cAOpbZK3yjpMkRFUxro,3157
+fabricks/core/dags/base.py,sha256=tFj27SqeZUZ7pB_LOWkpdowZz5gj30JUANI4gWK3Pl8,3139
 fabricks/core/dags/generator.py,sha256=WoPsXLA89O8y75YpjvEo_NhLoXdZh3oSDrOU-lnAj94,4826
 fabricks/core/dags/log.py,sha256=pszqun3lSFzfyKbahsPWDgP0MOH58H2dpK42prc6sk0,782
-fabricks/core/dags/processor.py,sha256=NwMuZHWency81Pq7rTPI5cXb_ACQ2flHTiixZ5eTitw,7734
+fabricks/core/dags/processor.py,sha256=_8Htq7rP_6hZx0htxISrxRIA9gjar4c44meVQ5spX7A,7698
 fabricks/core/dags/run.py,sha256=RIDULb9WakObSyYzmkglh8RwFRwC8-NFC-1yPDMkBC0,1074
 fabricks/core/dags/terminator.py,sha256=Y6pV2UnSyrCIx2AQWJXoHk5Roq12gZqpennHx_Lbnzs,793
 fabricks/core/dags/utils.py,sha256=17IIza8zaaVkhqCudXUr4GSYXZZCobGB3FLgbS4eAjs,1229
@@ -105,7 +105,7 @@ fabricks/core/deploy/tables.py,sha256=HPQxdLfggSzL3PTqHnBaJnScXuROK_oko-vqjiXWrn
 fabricks/core/deploy/udfs.py,sha256=TBTwMJAuZpsShu_z-Z-LENo3P_eWX69qUkmFuRqv3kU,516
 fabricks/core/deploy/views.py,sha256=DAPCF1gijNGVioKUhRH_PS0koAYhoeDgMvBy-UJ6GQc,13756
 fabricks/core/jobs/__init__.py,sha256=W_1m6LoGiXBml_8cucedju_hllSSnZGKsZjyFh-2oJw,531
-fabricks/core/jobs/bronze.py,sha256=PRZ5aJX-XIX3hal_mgav43pcTDVaO5KeeJrUYOu0drM,13181
+fabricks/core/jobs/bronze.py,sha256=89nsucGyTGAYPd6pO-MZUmi031P62gp095PTYjSftjg,13514
 fabricks/core/jobs/get_job.py,sha256=35zay3Z_WoJIylzEQlGle6UvrE1EClfRbFEVGvszof0,3675
 fabricks/core/jobs/get_job_conf.py,sha256=3vAj_usCbNqejMUKOF85LPaHBYAwxgrDG7LYgY-vBUw,4812
 fabricks/core/jobs/get_job_id.py,sha256=6dLyzxGHlRvJZVJSwZkCk3iXzWkIhePC_6FhoP0gEN4,607
@@ -113,9 +113,9 @@ fabricks/core/jobs/get_jobs.py,sha256=5E1J95vFYDqa2n9DKpJn4ujD0MW-P38pNc6T6V8LDm
 fabricks/core/jobs/gold.py,sha256=EQ6nCNdvWTBt194tmXvMWZItw53o7x49nwJ4UiCSDH0,13996
 fabricks/core/jobs/silver.py,sha256=wn6c6hoeppjlWf6EutB_8qE5Sxu2PIWk5iQecRUuJ5o,13163
 fabricks/core/jobs/base/__init__.py,sha256=_AdWtyL7yZG2TOZ9e8WyNPrOjmm6EDkI_TNym5cLDws,208
-fabricks/core/jobs/base/_types.py,sha256=xNKHpzof_mPd97ytvk3wrPnXd1_VdLilh1yQgFF3Ois,6769
+fabricks/core/jobs/base/_types.py,sha256=r_s1mG-kHErvC9tMX5ndNEgA_3ASD2ouEkYylbhe-18,6764
 fabricks/core/jobs/base/checker.py,sha256=LPK5f3ucT7T4Z7LjlOyHPXFfb94J_DdYVp6X85wIvDk,5324
-fabricks/core/jobs/base/configurator.py,sha256=671-ksuf5FDIUtCLmt_HhLYhUl34jUdKb7S1wiotBzk,11439
+fabricks/core/jobs/base/configurator.py,sha256=ARj920yJJdNtD0Iz8IjbhSyJqlVrXzSz29n0mXofo-k,11569
 fabricks/core/jobs/base/exception.py,sha256=HrdxEuOfK5rY-ItZvEL3iywLgdpYUpmWFkjjjks7oYc,2318
 fabricks/core/jobs/base/generator.py,sha256=LdI3PDrwee5rjwlFlduA4_s-7rE1AsnFrYdgQJL5_tE,15527
 fabricks/core/jobs/base/invoker.py,sha256=xJV9fLtY36qfnclqKqNBsjryyR8x39wfhbYJtzOPRyM,6342
@@ -145,7 +145,7 @@ fabricks/metastore/_types.py,sha256=NXYxwQHP0sCllM0N6QBbaK4CdtM_m_rHFDxRNRfBcLU,
 fabricks/metastore/database.py,sha256=1EjbRh2b6xEdHJyc4C4xee6FXDiKuPgm-8Q3Gqt7eds,1942
 fabricks/metastore/dbobject.py,sha256=EdxofFMCx6XdqFkm9Z5x4ywW4sstvdpc1d_EhYsE0KY,1883
 fabricks/metastore/pyproject.toml,sha256=6RZM9RMKMDF_EAequhORZ7TD0BQNk7aBCTWAv-sRcp0,519
-fabricks/metastore/table.py,sha256=3rQZDOJZ2GchpVQ4N9Cu-SgEZI2TlUGqO9cWchDsitE,24196
+fabricks/metastore/table.py,sha256=YywtGc6z_zMwvK9wORvvys50AB-gBd9ZGAdtQq614yc,24312
 fabricks/metastore/utils.py,sha256=8SxhjDkz_aSH4IGUusel7hqOQxP9U8PNBCY0M7GH00Y,1355
 fabricks/metastore/view.py,sha256=Va7xdFtOW9GcDSlyoZNgcF07qty9abtex41au6OSz6c,1381
 fabricks/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -171,6 +171,6 @@ fabricks/utils/schema/get_schema_for_type.py,sha256=u9FFYvWyq9VQdNJNu79-SCN9iGUB
 fabricks/utils/write/__init__.py,sha256=i0UnZenXj9Aq0b0_aU3s6882vg-Vu_AyKfQhl_dTp-g,200
 fabricks/utils/write/delta.py,sha256=mpaSxBNcl6N0QheGLx8rjeyWUvy1Yvvj4raGRv7GL5M,1229
 fabricks/utils/write/stream.py,sha256=wQBpAnQtYA6nl79sPKhVM6u5m-66suX7B6VQ6tW4TOs,622
-fabricks-3.0.1.dist-info/METADATA,sha256=5jGdhyCSur762goGfzWn5wtH_emHDD2MIDTogOazLcE,682
-fabricks-3.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-fabricks-3.0.1.dist-info/RECORD,,
+fabricks-3.0.3.dist-info/METADATA,sha256=dztXMj5CrebPZZmMtr-LdpcS2Zg6sO0x0OCUzwLik3s,682
+fabricks-3.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+fabricks-3.0.3.dist-info/RECORD,,