fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/extenders.py
@@ -0,0 +1,29 @@
+ import sys
+ from importlib.util import spec_from_file_location
+ from typing import Callable
+
+ from fabricks.context import PATH_EXTENDERS
+
+ EXTENDERS: dict[str, Callable] = {}
+
+
+ def get_extender(name: str) -> Callable:
+     sys.path.append(PATH_EXTENDERS.string)
+
+     path = PATH_EXTENDERS.join(f"{name}.py")
+     assert path.exists(), f"no valid extender found in {path.string}"
+
+     spec = spec_from_file_location(name, path.string)
+     assert spec, f"no valid extender found in {path.string}"
+
+     spec.loader.load_module()  # type: ignore
+     e = EXTENDERS[name]
+     return e
+
+
+ def extender(name: str):
+     def decorator(fn: Callable):
+         EXTENDERS[name] = fn
+         return fn
+
+     return decorator
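The hunk above is a simple import-time plugin registry: `@extender("name")` stores the decorated function in `EXTENDERS`, and `get_extender` imports the matching `{name}.py` under `PATH_EXTENDERS` so the decorator runs as a side effect. A minimal sketch of a hypothetical extender file (the file name, function name, and DataFrame-to-DataFrame signature are illustrative assumptions, not part of the package):

    # add_ingestion_time.py -- hypothetical file placed under PATH_EXTENDERS
    from pyspark.sql import DataFrame
    from pyspark.sql.functions import current_timestamp

    from fabricks.core.extenders import extender

    @extender("add_ingestion_time")
    def add_ingestion_time(df: DataFrame) -> DataFrame:
        # importing this module registers the function in EXTENDERS["add_ingestion_time"]
        return df.withColumn("__ingestion_time", current_timestamp())

After that import, `get_extender("add_ingestion_time")` returns the registered callable.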
fabricks/core/jobs/__init__.py
@@ -0,0 +1,20 @@
+ from fabricks.core.jobs.base import Bronzes, Golds, Silvers, Steps
+ from fabricks.core.jobs.bronze import Bronze
+ from fabricks.core.jobs.get_job import get_job
+ from fabricks.core.jobs.get_job_id import get_job_id
+ from fabricks.core.jobs.get_jobs import get_jobs
+ from fabricks.core.jobs.gold import Gold
+ from fabricks.core.jobs.silver import Silver
+
+ __all__ = [
+     "Bronze",
+     "Bronzes",
+     "get_job_id",
+     "get_job",
+     "get_jobs",
+     "Gold",
+     "Golds",
+     "Silver",
+     "Silvers",
+     "Steps",
+ ]
fabricks/core/jobs/base/__init__.py
@@ -0,0 +1,10 @@
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers, Steps
+
+ __all__ = [
+     "BaseJob",
+     "Bronzes",
+     "Golds",
+     "Silvers",
+     "Steps",
+ ]
fabricks/core/jobs/base/checker.py
@@ -0,0 +1,89 @@
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.error import CheckFailedException, CheckWarningException
+ from fabricks.core.jobs.base.generator import Generator
+
+
+ class Checker(Generator):
+     def pre_run_check(self):
+         self._check("pre_run")
+
+     def post_run_check(self):
+         self._check("post_run")
+
+     def _check(self, position: str):
+         if self.options.check.get(position):
+             Logger.debug(f"{position.replace('_', ' ')} check", extra={"job": self})
+
+             p = self.paths.runtime.append(f".{position}.sql")
+             assert p.exists(), f"{position} check not found ({p})"
+
+             fail_df = self.spark.sql(p.get_sql()).where("__action == 'fail'")
+             warning_df = self.spark.sql(p.get_sql()).where("__action == 'warning'")
+
+             if not fail_df.isEmpty():
+                 for row in fail_df.collect():
+                     Logger.error(
+                         f"{position.replace('_', ' ')} check failed due to {row['__message']}",
+                         extra={"job": self},
+                     )
+                 raise CheckFailedException(row["__message"])  # type: ignore
+             elif not warning_df.isEmpty():
+                 for row in warning_df.collect():
+                     Logger.warning(
+                         f"{position.replace('_', ' ')} check failed due to {row['__message']}",
+                         extra={"job": self},
+                     )
+                 raise CheckWarningException(row["__message"])  # type: ignore
+
+     def post_run_extra_check(self):
+         min_rows = self.options.check.get("min_rows")
+         max_rows = self.options.check.get("max_rows")
+         count_must_equal = self.options.check.get("count_must_equal")
+
+         if min_rows or max_rows or count_must_equal:
+             Logger.debug("extra post run check", extra={"job": self})
+
+             rows = self.spark.sql(f"select count(*) from {self}").collect()[0][0]
+             if min_rows:
+                 if rows < min_rows:
+                     raise CheckFailedException(f"min rows check failed ({rows} < {min_rows})")
+             if max_rows:
+                 if rows > max_rows:
+                     raise CheckFailedException(f"max rows check failed ({rows} > {max_rows})")
+
+             if count_must_equal:
+                 equals_rows = self.spark.read.table(count_must_equal).count()
+                 if rows != equals_rows:
+                     raise CheckFailedException(
+                         f"count must equal check failed ({count_must_equal} - {rows} != {equals_rows})"
+                     )
+
+     def _check_duplicate(self, column: str):
+         if column in self.table.columns:
+             Logger.debug(f"duplicate {column} check", extra={"job": self})
+
+             cols = [column]
+
+             if "__source" in self.table.columns:
+                 cols.append("__source")
+
+             if self.change_data_capture == "scd2":
+                 cols.append("__valid_to")
+             elif self.change_data_capture == "nocdc":
+                 if "__valid_to" in self.table.columns:
+                     cols.append("__valid_to")
+
+             fields = ", ".join(cols)
+             df = self.spark.sql(f"select {fields} from {self} group by all having count(*) > 1 limit 5")
+
+             if not df.isEmpty():
+                 duplicates = ",".join([str(row[column]) for row in df.collect()])
+                 raise CheckFailedException(f"duplicate {column} check failed ({duplicates})")
+         else:
+             Logger.debug(f"{column} not found", extra={"job": self})
+
+     def check_duplicate_key(self):
+         self._check_duplicate("__key")
+
+     def check_duplicate_hash(self):
+         self._check_duplicate("__hash")
fabricks/core/jobs/base/configurator.py
@@ -0,0 +1,323 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional, Union, cast
+
+ from pyspark.dbutils import DBUtils
+ from pyspark.sql import DataFrame, SparkSession
+
+ from fabricks.cdc import SCD1, SCD2, ChangeDataCaptures, NoCDC
+ from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
+ from fabricks.context.log import Logger
+ from fabricks.context.spark import build_spark_session
+ from fabricks.core.jobs.base.types import Modes, Options, Paths, Timeouts, TStep
+ from fabricks.core.jobs.get_job_conf import get_job_conf
+ from fabricks.core.jobs.get_job_id import get_job_id
+ from fabricks.metastore.table import Table
+ from fabricks.utils.fdict import FDict
+ from fabricks.utils.path import Path
+
+
+ class Configurator(ABC):
+     def __init__(
+         self,
+         extend: str,
+         step: TStep,
+         topic: Optional[str] = None,
+         item: Optional[str] = None,
+         job_id: Optional[str] = None,
+     ):
+         self.extend = extend
+         self.step: TStep = step
+
+         if job_id is not None:
+             self.job_id = job_id
+             self.conf = get_job_conf(step=self.step, job_id=self.job_id)
+             self.topic = self.conf.topic
+             self.item = self.conf.item
+         else:
+             assert topic
+             assert item
+             self.topic = topic
+             self.item = item
+             self.conf = get_job_conf(step=self.step, topic=self.topic, item=self.item)
+             self.job_id = get_job_id(step=self.step, topic=self.topic, item=self.item)
+
+     _step_conf: Optional[dict[str, str]] = None
+     _spark: Optional[SparkSession] = None
+     _timeouts: Optional[Timeouts] = None
+     _options: Optional[Options] = None
+     _paths: Optional[Paths] = None
+     _table: Optional[Table] = None
+     _root: Optional[Path] = None
+
+     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
+     _change_data_capture: Optional[ChangeDataCaptures] = None
+     _mode: Optional[Modes] = None
+     _liquid_clustering: Optional[bool] = False
+
+     @property
+     @abstractmethod
+     def stream(self) -> bool:
+         raise NotImplementedError()
+
+     @property
+     @abstractmethod
+     def schema_drift(self) -> bool:
+         raise NotImplementedError()
+
+     @property
+     @abstractmethod
+     def persist(self) -> bool:
+         raise NotImplementedError()
+
+     @property
+     @abstractmethod
+     def virtual(self) -> bool:
+         raise NotImplementedError()
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str):
+         raise NotImplementedError()
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str):
+         raise NotImplementedError()
+
+     @property
+     def spark(self) -> SparkSession:
+         if not self._spark:
+             spark, _ = build_spark_session(new=True, log=False)
+
+             step_options = self.step_conf.get("spark_options", {})
+             step_sql_options = step_options.get("sql", {})
+             step_conf_options = step_options.get("conf", {})
+             if step_sql_options:
+                 for key, value in step_sql_options.items():
+                     Logger.debug(f"{self.step} - add {key} = {value}", extra={"job": self})
+                     spark.sql(f"set {key} = {value}")
+             if step_conf_options:
+                 for key, value in step_conf_options.items():
+                     Logger.debug(f"{self.step} - add {key} = {value}", extra={"job": self})
+                     spark.conf.set(f"{key}", f"{value}")
+
+             job_sql_options = self.options.spark.get_dict("sql")
+             job_conf_options = self.options.spark.get_dict("conf")
+             if job_sql_options:
+                 for key, value in job_sql_options.items():
+                     Logger.debug(f"add {key} = {value}", extra={"job": self})
+                     spark.sql(f"set {key} = {value}")
+             if job_conf_options:
+                 for key, value in job_conf_options.items():
+                     Logger.debug(f"add {key} = {value}", extra={"job": self})
+                     spark.conf.set(f"{key}", f"{value}")
+
+             self._spark = spark
+         return self._spark
+
+     @property
+     def step_conf(self) -> dict:
+         if not self._step_conf:
+             _conf = [s for s in STEPS if s.get("name") == self.step][0]
+             assert _conf is not None
+             self._step_conf = cast(dict[str, str], _conf)
+         return self._step_conf
+
+     @property
+     def dbutils(self) -> DBUtils:
+         return DBUtils(self.spark)
+
+     @property
+     def qualified_name(self) -> str:
+         return f"{self.step}.{self.topic}_{self.item}"
+
+     def _get_timeout(self, what: str) -> int:
+         t = self.step_conf.get("options", {}).get("timeouts", {}).get(what, None)
+         if t is None:
+             t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+         assert t is not None
+         return int(t)
+
+     @property
+     def timeouts(self) -> Timeouts:
+         if not self._timeouts:
+             self._timeouts = Timeouts(
+                 job=self._get_timeout("job"),
+                 pre_run=self._get_timeout("pre_run"),
+                 post_run=self._get_timeout("post_run"),
+             )
+         return self._timeouts
+
+     def pip(self):
+         pass
+
+     @property
+     def table(self) -> Table:
+         return self.cdc.table
+
+     @property
+     def paths(self) -> Paths:
+         if not self._paths:
+             storage = PATHS_STORAGE.get(self.step)
+             assert storage
+             runtime_root = PATHS_RUNTIME.get(self.step)
+             assert runtime_root
+             self._paths = Paths(
+                 storage=storage,
+                 tmp=storage.join("tmp", self.topic, self.item),
+                 checkpoints=storage.join("checkpoints", self.topic, self.item),
+                 commits=storage.join("checkpoints", self.topic, self.item, "commits"),
+                 schema=storage.join("schema", self.topic, self.item),
+                 runtime=runtime_root.join(self.topic, self.item),
+             )
+         return self._paths
+
+     @property
+     def options(self) -> Options:
+         if not self._options:
+             job = self.conf.options or {}
+             table = self.conf.table_options or {}
+             check = self.conf.check_options or {}
+             spark = self.conf.spark_options or {}
+             invoker = self.conf.invoker_options or {}
+
+             self._options = Options(
+                 job=FDict(job),
+                 table=FDict(table),
+                 check=FDict(check),
+                 spark=FDict(spark),
+                 invoker=FDict(invoker),
+             )
+         return self._options
+
+     @property
+     def change_data_capture(self) -> ChangeDataCaptures:
+         if not self._change_data_capture:
+             cdc: ChangeDataCaptures = self.options.job.get("change_data_capture") or "nocdc"
+             self._change_data_capture = cdc
+         return self._change_data_capture
+
+     @property
+     def cdc(self) -> Union[NoCDC, SCD1, SCD2]:
+         if not self._cdc:
+             if self.change_data_capture == "nocdc":
+                 cdc = NoCDC(self.step, self.topic, self.item, spark=self.spark)
+             elif self.change_data_capture == "scd1":
+                 cdc = SCD1(self.step, self.topic, self.item, spark=self.spark)
+             elif self.change_data_capture == "scd2":
+                 cdc = SCD2(self.step, self.topic, self.item, spark=self.spark)
+             else:
+                 raise ValueError(f"{self.change_data_capture} not allowed")
+             self._cdc = cdc
+         return self._cdc
+
+     @property
+     def slowly_changing_dimension(self) -> bool:
+         return self.change_data_capture in ["scd1", "scd2"]
+
+     @abstractmethod
+     def get_cdc_context(self, df: DataFrame) -> dict:
+         raise NotImplementedError()
+
+     def get_cdc_data(self, stream: Optional[bool] = False) -> Optional[DataFrame]:
+         df = self.get_data(stream)
+         if df is not None:
+             cdc_context = self.get_cdc_context(df)
+             cdc_df = self.cdc.get_data(src=df, **cdc_context)
+             return cdc_df
+
+     @property
+     def mode(self) -> Modes:
+         if not self._mode:
+             _mode = self.options.job.get("mode")
+             assert _mode is not None
+             self._mode = cast(Modes, _mode)
+         return self._mode
+
+     @abstractmethod
+     def get_data(self, stream: Optional[bool] = False, transform: Optional[bool] = False) -> Optional[DataFrame]:
+         """
+         Retrieves the data for the job.
+
+         Args:
+             stream (bool, optional): If True, the data will be streamed. Defaults to False.
+             transform (bool, optional): If True, the data will be transformed. Defaults to False.
+
+         Returns:
+             DataFrame or None: The retrieved data as a DataFrame, or None if the data is not available.
+         """
+         raise NotImplementedError()
+
+     @abstractmethod
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def for_each_run(self):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         raise NotImplementedError()
+
+     @abstractmethod
+     def run(
+         self,
+         retry: Optional[int] = 1,
+         schedule: Optional[str] = None,
+         schedule_id: Optional[str] = None,
+         invoke: Optional[bool] = None,
+     ) -> Optional[int]:
+         raise NotImplementedError()
+
+     def optimize(
+         self,
+         vacuum: Optional[bool] = True,
+         optimize: Optional[bool] = True,
+         analyze: Optional[bool] = True,
+     ):
+         """
+         Optimize the table by vacuuming it, optimizing CDC, and analyzing it.
+
+         If the mode is set to 'memory', no optimization is performed.
+
+         The retention days for the vacuum are determined in the following order:
+         1. If 'retention_days' is specified in the job's table options, it is used.
+         2. If 'retention_days' is specified in the step configuration's table options, it is used.
+         3. If 'retention_days' is specified in the CONF_RUNTIME options, it is used.
+
+         After determining the retention days, the table is vacuumed with the specified retention days,
+         CDC is optimized for the table, and the table is analyzed.
+
+         Note: this method assumes that 'retention_days' is set at the job, step, or runtime level.
+
+         Returns:
+             None
+         """
+         if self.mode == "memory":
+             Logger.debug("memory (no optimize)", extra={"job": self})
+         else:
+             assert self.table.exists()
+
+             if vacuum:
+                 self.vacuum()
+             if optimize:
+                 self.cdc.optimize_table()
+             if analyze:
+                 self.table.compute_statistics()
+
+     def vacuum(self):
+         job = self.options.table.get("retention_days")
+         step = self.step_conf.get("table_options", {}).get("retention_days", None)
+         runtime = CONF_RUNTIME.get("options", {}).get("retention_days")
+
+         if job is not None:
+             retention_days = job
+         elif step:
+             retention_days = step
+         else:
+             assert runtime
+             retention_days = runtime
+
+         self.table.vacuum(retention_days=retention_days)
+
+     def __str__(self):
+         return f"{self.step}.{self.topic}_{self.item}"
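`vacuum` resolves `retention_days` with a three-level fallback: a value in the job's table options wins, then the step's `table_options`, then the runtime-wide default in `CONF_RUNTIME`. A small worked example of the cascade (all values invented):

    # hypothetical settings: the job leaves retention_days unset,
    # the step sets 7 days, the runtime default is 30 days
    job, step, runtime = None, 7, 30

    retention_days = job if job is not None else (step or runtime)
    assert retention_days == 7  # the step value shadows the runtime default

A job-level value would take precedence over both, mirroring the `if job is not None / elif step / else` chain above.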
fabricks/core/jobs/base/error.py
@@ -0,0 +1,16 @@
+ class CheckFailedException(Exception):
+     def __init__(self, message: str):
+         self.message = message
+         super().__init__(self.message)
+
+
+ class CheckWarningException(Exception):
+     def __init__(self, message: str):
+         self.message = message
+         super().__init__(self.message)
+
+
+ class InvokerFailedException(Exception):
+     def __init__(self, message: str):
+         self.message = message
+         super().__init__(self.message)