fabricks 3.0.11 (fabricks-3.0.11-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/processor.py
@@ -0,0 +1,249 @@
+ from abc import abstractmethod
+ from functools import partial
+ from typing import Optional
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr
+
+ from fabricks.context import IS_TYPE_WIDENING, IS_UNITY_CATALOG, SECRET_SCOPE
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base.exception import (
+     PostRunCheckException,
+     PostRunCheckWarning,
+     PostRunInvokeException,
+     PreRunCheckException,
+     PreRunCheckWarning,
+     PreRunInvokeException,
+     SchemaDriftException,
+     SkipRunCheckWarning,
+ )
+ from fabricks.core.jobs.base.invoker import Invoker
+ from fabricks.utils.write import write_stream
+
+
+ class Processor(Invoker):
+     def filter_where(self, df: DataFrame) -> DataFrame:
+         f = self.options.job.get("filter_where")
+
+         if f:
+             DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
+             df = df.where(f"{f}")
+
+         return df
+
+     def encrypt(self, df: DataFrame) -> DataFrame:
+         encrypted_columns = self.options.job.get_list("encrypted_columns")
+         if encrypted_columns:
+             if not IS_UNITY_CATALOG:
+                 from databricks.sdk.runtime import dbutils
+
+                 key = dbutils.secrets.get(scope=SECRET_SCOPE, key="encryption-key")
+             else:
+                 import os
+
+                 key = os.environ["FABRICKS_ENCRYPTION_KEY"]
+
+             assert key, "key not found"
+
+             for col in encrypted_columns:
+                 DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
+                 df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
+
+         return df
+
+     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
+         """
+         Restores the processor to a specific version and batch.
+
+         Args:
+             last_version (Optional[str]): The last version to restore to. If None, no version restore will be performed.
+             last_batch (Optional[str]): The last batch to restore to. If None, no batch restore will be performed.
+         """
+         if self.persist:
+             if last_version is not None:
+                 _last_version = int(last_version)
+                 if self.table.get_last_version() > _last_version:
+                     self.table.restore_to_version(_last_version)
+
+             if last_batch is not None:
+                 current_batch = int(last_batch) + 1
+                 self.rm_commit(current_batch)
+
+                 assert last_batch == self.table.get_property("fabricks.last_batch")
+                 assert self.paths.commits.joinpath(last_batch).exists()
+
+     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
+         DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
+         if batch is not None:
+             DEFAULT_LOGGER.debug(f"batch {batch}", extra={"label": self})
+
+         df = self.base_transform(df)
+
+         diffs = self.get_schema_differences(df)
+         if diffs:
+             if self.schema_drift or kwargs.get("reload", False):
+                 DEFAULT_LOGGER.warning("schema drifted", extra={"label": self, "diffs": diffs})
+                 self.update_schema(df=df)
+
+             else:
+                 only_type_widening_compatible = all(d.type_widening_compatible for d in diffs if d.status == "changed")
+                 if only_type_widening_compatible and self.table.type_widening_enabled and IS_TYPE_WIDENING:
+                     self.update_schema(df=df, widen_types=True)
+                 else:
+                     raise SchemaDriftException.from_diffs(str(self), diffs)
+
+         self.for_each_batch(df, batch, **kwargs)
+
+         if batch is not None:
+             self.table.set_property("fabricks.last_batch", batch)
+
+         self.table.create_restore_point()
+         DEFAULT_LOGGER.debug("end (for each batch)", extra={"label": self})
+
+     def for_each_run(self, **kwargs):
+         DEFAULT_LOGGER.debug("start (for each run)", extra={"label": self})
+
+         if self.virtual:
+             self.create_or_replace_view()
+
+         elif self.persist:
+             assert self.table.registered, f"{self} is not registered"
+
+             df = self.get_data(stream=self.stream, **kwargs)
+             assert df is not None, "no data"
+
+             partial(self._for_each_batch, **kwargs)
+
+             if self.stream:
+                 DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
+                 write_stream(
+                     df,
+                     checkpoints_path=self.paths.checkpoints,
+                     func=self._for_each_batch,
+                     timeout=self.timeout,
+                 )
+             else:
+                 self._for_each_batch(df, **kwargs)
+
+         else:
+             raise ValueError(f"{self.mode} - not allowed")
+
+         DEFAULT_LOGGER.debug("end (for each run)", extra={"label": self})
+
+     def run(
+         self,
+         retry: Optional[bool] = True,
+         schedule: Optional[str] = None,
+         schedule_id: Optional[str] = None,
+         invoke: Optional[bool] = True,
+         reload: Optional[bool] = None,
+         vacuum: Optional[bool] = None,
+         optimize: Optional[bool] = None,
+         compute_statistics: Optional[bool] = None,
+     ):
+         """
+         Run the processor.
+
+         Args:
+             retry (bool, optional): Whether to retry the execution in case of failure. Defaults to True.
+             schedule (str, optional): The schedule to run the processor on. Defaults to None.
+             schedule_id (str, optional): The ID of the schedule. Defaults to None.
+             invoke (bool, optional): Whether to invoke pre-run and post-run methods. Defaults to True.
+         """
+         last_version = None
+         last_batch = None
+         exception = None
+
+         if self.persist:
+             last_version = self.table.get_property("fabricks.last_version")
+             if last_version is not None:
+                 DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"label": self})
+             else:
+                 last_version = str(self.table.last_version)
+
+             last_batch = self.table.get_property("fabricks.last_batch")
+             if last_batch is not None:
+                 DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"label": self})
+
+         try:
+             DEFAULT_LOGGER.info("start (run)", extra={"label": self})
+
+             if reload:
+                 DEFAULT_LOGGER.debug("force reload", extra={"label": self})
+
+             if invoke:
+                 self.invoke_pre_run(schedule=schedule)
+
+             if not reload:
+                 self.check_skip_run()
+
+             try:
+                 self.check_pre_run()
+             except PreRunCheckWarning as e:
+                 exception = e
+
+             self.for_each_run(schedule=schedule, reload=reload)
+
+             try:
+                 self.check_post_run()
+             except PostRunCheckWarning as e:
+                 exception = e
+
+             self.check_post_run_extra()
+
+             if invoke:
+                 self.invoke_post_run(schedule=schedule)
+
+             if exception:
+                 raise exception
+
+             if vacuum is None:
+                 vacuum = self.options.job.get("vacuum", False)
+             if optimize is None:
+                 optimize = self.options.job.get("optimize", False)
+             if compute_statistics is None:
+                 compute_statistics = self.options.job.get("compute_statistics", False)
+
+             if vacuum or optimize or compute_statistics:
+                 self.maintain(
+                     compute_statistics=compute_statistics,
+                     optimize=optimize,
+                     vacuum=vacuum,
+                 )
+
+             DEFAULT_LOGGER.info("end (run)", extra={"label": self})
+
+         except SkipRunCheckWarning as e:
+             DEFAULT_LOGGER.warning("skip run", extra={"label": self})
+             raise e
+
+         except (PreRunCheckWarning, PostRunCheckWarning) as e:
+             DEFAULT_LOGGER.warning("fail to pass warning check", extra={"label": self})
+             raise e
+
+         except (PreRunInvokeException, PostRunInvokeException) as e:
+             DEFAULT_LOGGER.exception("fail to run invoker", extra={"label": self})
+             raise e
+
+         except (PreRunCheckException, PostRunCheckException) as e:
+             DEFAULT_LOGGER.exception("fail to pass check", extra={"label": self})
+             self.restore(last_version, last_batch)
+             raise e
+
+         except AssertionError as e:
+             DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
+             self.restore(last_version, last_batch)
+             raise e
+
+         except Exception as e:
+             if not self.stream or not retry:
+                 DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
+                 self.restore(last_version, last_batch)
+                 raise e
+
+             else:
+                 DEFAULT_LOGGER.warning("retry to run", extra={"label": self})
+                 self.run(retry=False, schedule_id=schedule_id, schedule=schedule)
+
+     @abstractmethod
+     def overwrite(self) -> None: ...
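
Processor.run above ties the job lifecycle together: pre- and post-run invokers and checks, the per-batch write (streaming or not), restore of the Delta table on failure, and optional maintenance. The snippet below is a minimal usage sketch rather than part of the package: it assumes a configured Fabricks runtime with an existing bronze job definition, and the step, topic and item names are placeholders.

# Minimal usage sketch (assumptions: a configured Fabricks runtime and an existing
# bronze job; "bronze", "sales" and "orders" are placeholder names).
from fabricks.core.jobs.base.exception import PreRunCheckWarning
from fabricks.core.jobs.bronze import Bronze

job = Bronze.from_step_topic_item(step="bronze", topic="sales", item="orders")

try:
    # vacuum/optimize/compute_statistics fall back to the job options when left as None;
    # passing them explicitly overrides the options.
    job.run(reload=False, optimize=True, vacuum=False)
except PreRunCheckWarning:
    # warning checks are collected by run() and re-raised once the data run has completed
    pass
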
fabricks/core/jobs/bronze.py
@@ -0,0 +1,395 @@
+ from typing import Optional, Sequence, Union, cast
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr, lit, md5
+ from pyspark.sql.types import Row
+
+ from fabricks.cdc.nocdc import NoCDC
+ from fabricks.context import VARIABLES
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import JobDependency, TBronze
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.parsers import BaseParser
+ from fabricks.core.parsers.get_parser import get_parser
+ from fabricks.core.parsers.utils import clean
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.helpers import concat_ws
+ from fabricks.utils.path import Path
+ from fabricks.utils.read import read
+
+
+ class Bronze(BaseJob):
+     def __init__(
+         self,
+         step: TBronze,
+         topic: Optional[str] = None,
+         item: Optional[str] = None,
+         job_id: Optional[str] = None,
+         conf: Optional[Union[dict, Row]] = None,
+     ):  # type: ignore
+         super().__init__(
+             "bronze",
+             step=step,
+             topic=topic,
+             item=item,
+             job_id=job_id,
+             conf=conf,
+         )
+
+     _parser: Optional[BaseParser] = None
+
+     @property
+     def stream(self) -> bool:
+         return self.mode not in ["register"]
+
+     @property
+     def schema_drift(self) -> bool:
+         return True
+
+     @property
+     def persist(self) -> bool:
+         return self.mode in ["append", "register"]
+
+     @property
+     def virtual(self) -> bool:
+         return False
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
+         return cls(step=cast(TBronze, step), job_id=job_id, conf=conf)
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
+         return cls(step=cast(TBronze, step), topic=topic, item=item, conf=conf)
+
+     @property
+     def data_path(self) -> Path:
+         uri = self.options.job.get("uri")
+         assert uri is not None, "no uri provided in options"
+         path = Path.from_uri(uri, regex=VARIABLES)
+         return path
+
+     def get_dependencies(self, *s) -> Sequence[JobDependency]:
+         dependencies = []
+
+         parents = self.options.job.get_list("parents")
+         if parents:
+             for p in parents:
+                 dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
+
+         return dependencies
+
+     def register_external_table(self):
+         options = self.conf.parser_options  # type: ignore
+         if options:
+             file_format = options.get("file_format")
+         else:
+             file_format = "delta"
+
+         DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"label": self})
+
+         try:
+             df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
+             assert len(df.columns) > 1, "external table must have at least one column"
+         except Exception as e:
+             DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
+             raise e
+
+         self.spark.sql(
+             f"create table if not exists {self.qualified_name} using {file_format} location '{self.data_path}'"
+         )
+
+     def drop_external_table(self):
+         DEFAULT_LOGGER.warning("remove external table from metastore", extra={"label": self})
+         self.spark.sql(f"drop table if exists {self.qualified_name}")
+
+     def compute_statistics_external_table(self):
+         DEFAULT_LOGGER.debug("compute statistics (external table)", extra={"label": self})
+         self.spark.sql(f"analyze table {self.qualified_name} compute statistics")
+
+     def vacuum_external_table(self, retention_hours: Optional[int] = 168):
+         from delta import DeltaTable
+
+         DEFAULT_LOGGER.debug("vacuum (external table)", extra={"label": self})
+         try:
+             dt = DeltaTable.forPath(self.spark, self.data_path.string)
+             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
+             dt.vacuum(retention_hours)
+         finally:
+             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
+
+     def maintain_external_table(
+         self,
+         vacuum: Optional[bool] = True,
+         compute_statistics: Optional[bool] = True,
+     ):
+         DEFAULT_LOGGER.debug("maintain (external table)", extra={"label": self})
+         if vacuum:
+             self.vacuum_external_table()
+
+         if compute_statistics:
+             self.compute_statistics_external_table()
+
+     @property
+     def parser(self) -> BaseParser:
+         if not self._parser:
+             assert self.mode not in ["register"], f"{self.mode} not allowed"
+
+             name = self.options.job.get("parser")
+             assert name is not None, "parser not found"
+
+             options = self.conf.parser_options or None  # type: ignore
+             p = get_parser(name, options)
+
+             self._parser = p
+
+         return self._parser
+
+     def parse(self, stream: bool = False) -> DataFrame:
+         """
+         Parses the data based on the specified mode and returns a DataFrame.
+
+         Args:
+             stream (bool, optional): Indicates whether the data should be read as a stream. Defaults to False.
+
+         Returns:
+             DataFrame: The parsed data as a DataFrame.
+         """
+         if self.mode == "register":
+             if stream:
+                 df = read(
+                     stream=stream,
+                     path=self.data_path,
+                     file_format="delta",
+                     # spark=self.spark, (BUG)
+                 )
+             else:
+                 df = self.spark.sql(f"select * from {self}")
+
+             # cleaning should be done by the parser
+             df = clean(df)
+
+         else:
+             df = self.parser.get_data(
+                 stream=stream,
+                 data_path=self.data_path,
+                 schema_path=self.paths.schema,
+                 spark=self.spark,
+             )
+
+         return df
+
+     def get_data(
+         self,
+         stream: bool = False,
+         transform: Optional[bool] = False,
+         schema_only: Optional[bool] = False,
+         **kwargs,
+     ) -> Optional[DataFrame]:
+         df = self.parse(stream)
+         df = self.filter_where(df)
+         df = self.encrypt(df)
+
+         if transform:
+             df = self.base_transform(df)
+
+         if schema_only:
+             df = df.where("1 == 2")
+
+         return df
+
+     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
+         calculated_columns = self.options.job.get_dict("calculated_columns")
+
+         if calculated_columns:
+             for key, value in calculated_columns.items():
+                 DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"label": self})
+                 df = df.withColumn(key, expr(f"{value}"))
+
+         return df
+
+     def add_hash(self, df: DataFrame) -> DataFrame:
+         if "__hash" not in df.columns:
+             fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
+             DEFAULT_LOGGER.debug("add hash", extra={"label": self})
+
+             if "__operation" in df.columns:
+                 fields += ["__operation == 'delete'"]
+
+             if "__source" in df.columns:
+                 fields += ["__source"]
+
+             df = df.withColumn("__hash", md5(expr(f"{concat_ws(fields)}")))
+
+         return df
+
+     def add_key(self, df: DataFrame) -> DataFrame:
+         if "__key" not in df.columns:
+             fields = self.options.job.get_list("keys")
+             if fields:
+                 DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})
+
+                 if "__source" in df.columns:
+                     fields = fields + ["__source"]
+
+                 fields = [f"`{f}`" for f in fields]
+                 df = df.withColumn("__key", md5(expr(f"{concat_ws(fields)}")))
+
+         return df
+
+     def add_source(self, df: DataFrame) -> DataFrame:
+         if "__source" not in df.columns:
+             source = self.options.job.get("source")
+             if source:
+                 DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
+                 df = df.withColumn("__source", lit(source))
+
+         return df
+
+     def add_operation(self, df: DataFrame) -> DataFrame:
+         if "__operation" not in df.columns:
+             operation = self.options.job.get("operation")
+             if operation:
+                 DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
+                 df = df.withColumn("__operation", lit(operation))
+
+             else:
+                 df = df.withColumn("__operation", lit("upsert"))
+
+         return df
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extend)
+         df = df.transform(self.add_calculated_columns)
+         df = df.transform(self.add_hash)
+         df = df.transform(self.add_operation)
+         df = df.transform(self.add_source)
+         df = df.transform(self.add_key)
+
+         if "__metadata" in df.columns:
+             if self.mode == "register":
+                 # https://github.com/delta-io/delta/issues/2014 (BUG)
+                 df = df.withColumn(
+                     "__metadata",
+                     expr(
+                         f"""
+                         struct(
+                             concat_ws('/', '{self.data_path}', __timestamp, __operation) as file_path,
+                             __metadata.file_name as file_name,
+                             __metadata.file_size as file_size,
+                             __metadata.file_modification_time as file_modification_time,
+                             cast(current_date() as timestamp) as inserted
+                         )
+                         """
+                     ),
+                 )
+
+             else:
+                 df = df.withColumn(
+                     "__metadata",
+                     expr(
+                         """
+                         struct(
+                             __metadata.file_path as file_path,
+                             __metadata.file_name as file_name,
+                             __metadata.file_size as file_size,
+                             __metadata.file_modification_time as file_modification_time,
+                             cast(current_date() as timestamp) as inserted
+                         )
+                         """
+                     ),
+                 )
+
+         return df
+
+     def create_or_replace_view(self):
+         DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})
+
+     def overwrite_schema(self, df: Optional[DataFrame] = None):
+         DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"label": self})
+
+     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
+         return {}
+
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
+         assert self.persist, f"{self.mode} not allowed"
+
+         context = self.get_cdc_context(df)
+
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}__{batch}"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
+         sql = f"select * from {global_temp_view}"
+
+         check_df = self.spark.sql(sql)
+         if check_df.isEmpty():
+             DEFAULT_LOGGER.warning("no data", extra={"label": self})
+             return
+
+         assert isinstance(self.cdc, NoCDC)
+         if self.mode == "append":
+             self.cdc.append(sql, **context)
+
+     def for_each_run(self, **kwargs):
+         if self.mode == "register":
+             DEFAULT_LOGGER.debug("register (no run)", extra={"label": self})
+         elif self.mode == "memory":
+             DEFAULT_LOGGER.debug("memory (no run)", extra={"label": self})
+         else:
+             super().for_each_run(**kwargs)
+
+     def create(self):
+         if self.mode == "register":
+             self.register_external_table()
+         elif self.mode == "memory":
+             DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
+         else:
+             super().create()
+
+     def register(self):
+         if self.mode == "register":
+             self.register_external_table()
+         elif self.mode == "memory":
+             DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
+         else:
+             super().register()
+
+     def truncate(self):
+         if self.mode == "register":
+             DEFAULT_LOGGER.info("register (no truncate)", extra={"label": self})
+         else:
+             super().truncate()
+
+     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
+         if self.mode == "register":
+             DEFAULT_LOGGER.info("register (no restore)", extra={"label": self})
+         else:
+             super().restore()
+
+     def drop(self):
+         if self.mode == "register":
+             self.drop_external_table()
+         super().drop()
+
+     def maintain(
+         self,
+         vacuum: Optional[bool] = True,
+         optimize: Optional[bool] = True,
+         compute_statistics: Optional[bool] = True,
+     ):
+         if self.mode == "register":
+             self.maintain_external_table(vacuum=vacuum, compute_statistics=compute_statistics)
+         else:
+             super().maintain(vacuum=vacuum, optimize=optimize, compute_statistics=compute_statistics)
+
+     def vacuum(self):
+         if self.mode == "memory":
+             DEFAULT_LOGGER.info("memory (no vacuum)", extra={"label": self})
+         elif self.mode == "register":
+             self.vacuum_external_table()
+         else:
+             super().vacuum()
+
+     def overwrite(self, schedule: Optional[str] = None):
+         self.truncate()
+         self.run(schedule=schedule)
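
base_transform above derives the technical columns (__source, __operation, __hash, __key) from the job options before the batch is handed to the CDC writer. Below is a standalone sketch of that hashing idea in plain PySpark, not the fabricks implementation: the separator, the key column and the sample data are assumptions, fabricks builds the expressions through its own concat_ws helper, and the real add_hash also folds a delete flag derived from __operation into the hash.

# Standalone sketch (assumptions: "|" separator, "id" as the configured key column,
# invented sample data); fabricks derives these columns from the job options instead.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, lit, md5

spark = SparkSession.builder.getOrCreate()

df = (
    spark.createDataFrame([(1, "widget", 9.99)], ["id", "name", "price"])
    .withColumn("__source", lit("erp"))        # add_source: constant taken from the job options
    .withColumn("__operation", lit("upsert"))  # add_operation: defaults to upsert
)

# add_hash: md5 over every non-technical column, plus __source when present
business = [col(c).cast("string") for c in df.columns if not c.startswith("__")]
df = df.withColumn("__hash", md5(concat_ws("|", *business, col("__source"))))

# add_key: md5 over the configured key columns (assumed here to be "id"), plus __source
df = df.withColumn("__key", md5(concat_ws("|", col("id").cast("string"), col("__source"))))

df.show(truncate=False)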