fabricks-3.0.19-py3-none-any.whl → fabricks-4.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/context.py +15 -3
- fabricks/api/notebooks/schedule.py +2 -3
- fabricks/api/parsers.py +2 -1
- fabricks/api/utils.py +3 -1
- fabricks/cdc/__init__.py +1 -2
- fabricks/cdc/base/__init__.py +1 -2
- fabricks/cdc/base/_types.py +5 -3
- fabricks/cdc/base/configurator.py +5 -0
- fabricks/cdc/base/generator.py +7 -3
- fabricks/cdc/base/merger.py +2 -0
- fabricks/cdc/base/processor.py +15 -0
- fabricks/cdc/templates/README.md +490 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +1 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +4 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +6 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +6 -0
- fabricks/cdc/templates/queries/context.sql.jinja +104 -96
- fabricks/cdc/templates/query.sql.jinja +1 -1
- fabricks/context/__init__.py +13 -1
- fabricks/context/config.py +13 -122
- fabricks/context/log.py +92 -1
- fabricks/context/runtime.py +35 -69
- fabricks/context/spark_session.py +8 -7
- fabricks/context/utils.py +26 -39
- fabricks/core/__init__.py +2 -2
- fabricks/core/dags/base.py +5 -5
- fabricks/core/dags/processor.py +2 -3
- fabricks/core/extenders.py +1 -1
- fabricks/core/job_schema.py +26 -16
- fabricks/core/jobs/__init__.py +1 -7
- fabricks/core/jobs/base/README.md +1545 -0
- fabricks/core/jobs/base/__init__.py +1 -8
- fabricks/core/jobs/base/checker.py +7 -7
- fabricks/core/jobs/base/configurator.py +142 -63
- fabricks/core/jobs/base/generator.py +38 -34
- fabricks/core/jobs/base/invoker.py +48 -63
- fabricks/core/jobs/base/processor.py +13 -28
- fabricks/core/jobs/bronze.py +88 -38
- fabricks/core/jobs/get_job.py +3 -6
- fabricks/core/jobs/get_job_conf.py +19 -68
- fabricks/core/jobs/get_jobs.py +10 -11
- fabricks/core/jobs/get_schedules.py +3 -17
- fabricks/core/jobs/gold.py +89 -47
- fabricks/core/jobs/silver.py +42 -22
- fabricks/core/masks.py +11 -8
- fabricks/core/parsers/__init__.py +0 -2
- fabricks/core/parsers/base.py +10 -10
- fabricks/core/parsers/decorator.py +1 -1
- fabricks/core/parsers/get_parser.py +4 -5
- fabricks/core/schedules/process.py +1 -4
- fabricks/core/steps/base.py +27 -17
- fabricks/core/steps/get_step.py +2 -4
- fabricks/core/steps/get_step_conf.py +3 -7
- fabricks/core/udfs.py +7 -7
- fabricks/core/views.py +2 -2
- fabricks/deploy/__init__.py +27 -16
- fabricks/deploy/masks.py +1 -1
- fabricks/deploy/notebooks.py +19 -16
- fabricks/deploy/schedules.py +1 -1
- fabricks/deploy/tables.py +66 -49
- fabricks/deploy/udfs.py +2 -2
- fabricks/deploy/views.py +15 -16
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/table.py +103 -68
- fabricks/models/__init__.py +125 -0
- fabricks/models/common.py +79 -0
- fabricks/models/config.py +225 -0
- fabricks/models/dependency.py +50 -0
- fabricks/models/job.py +157 -0
- fabricks/models/path.py +17 -0
- fabricks/models/runtime.py +182 -0
- fabricks/models/schedule.py +21 -0
- fabricks/models/step.py +103 -0
- fabricks/models/table.py +77 -0
- fabricks/{core/jobs/get_job_id.py → models/utils.py} +2 -0
- fabricks/utils/helpers.py +6 -5
- fabricks/utils/log.py +25 -6
- fabricks/utils/path.py +265 -108
- fabricks/utils/pip.py +7 -7
- fabricks/utils/read/read.py +23 -22
- fabricks/utils/read/read_yaml.py +2 -2
- fabricks/utils/write/delta.py +4 -4
- fabricks/utils/write/stream.py +2 -2
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/METADATA +9 -4
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/RECORD +86 -83
- fabricks/context/_types.py +0 -139
- fabricks/context/helpers.py +0 -63
- fabricks/core/jobs/base/_types.py +0 -284
- fabricks/core/parsers/_types.py +0 -6
- fabricks/utils/fdict.py +0 -240
- fabricks/utils/pydantic.py +0 -94
- fabricks/utils/schema/__init__.py +0 -7
- fabricks/utils/schema/get_json_schema_for_type.py +0 -161
- fabricks/utils/schema/get_schema_for_type.py +0 -99
- {fabricks-3.0.19.dist-info → fabricks-4.0.0.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/checker.py
@@ -19,10 +19,10 @@ class Checker(Generator):
         self._check("post_run")
 
     def _check(self, position: Literal["pre_run", "post_run"]):
-        if self.
+        if self.check_options and getattr(self.check_options, position):
             DEFAULT_LOGGER.debug(f"check {position}", extra={"label": self})
 
-            p = self.paths.
+            p = self.paths.to_runtime.append(f".{position}.sql")
             assert p.exists(), f"{position} check not found ({p})"
 
             df = self.spark.sql(p.get_sql())
@@ -54,9 +54,9 @@ class Checker(Generator):
                 raise PostRunCheckWarning(row["__message"], dataframe=df)
 
     def check_post_run_extra(self):
-        min_rows = self.
-        max_rows = self.
-        count_must_equal = self.
+        min_rows = self.check_options.min_rows if self.check_options else None
+        max_rows = self.check_options.max_rows if self.check_options else None
+        count_must_equal = self.check_options.count_must_equal if self.check_options else None
 
         if min_rows or max_rows or count_must_equal:
             df = self.spark.sql(f"select count(*) from {self}")
@@ -121,10 +121,10 @@ class Checker(Generator):
         self._check_duplicate_in_column("__identity")
 
     def check_skip_run(self):
-        if self.
+        if self.check_options and self.check_options.skip:
             DEFAULT_LOGGER.debug("check if run should be skipped", extra={"label": self})
 
-            p = self.paths.
+            p = self.paths.to_runtime.append(".skip.sql")
             assert p.exists(), "skip check not found"
 
             df = self.spark.sql(p.get_sql())
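Taken together, the three checker hunks above replace implicit option lookups with a guard on an optional `check_options` object. A minimal sketch of that guard pattern, using a hypothetical dataclass in place of the real `fabricks.models.CheckOptions` (field names are inferred from the attributes the hunks read):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class CheckOptions:
    # Hypothetical stand-in; fields mirror the attributes read in the hunks.
    pre_run: bool = False
    post_run: bool = False
    min_rows: Optional[int] = None
    max_rows: Optional[int] = None
    count_must_equal: Optional[int] = None
    skip: bool = False


def should_check(check_options: Optional[CheckOptions], position: str) -> bool:
    # Two guards, as in _check: the options object may be missing entirely,
    # and the requested position may be disabled on it.
    return bool(check_options and getattr(check_options, position, False))


assert should_check(None, "pre_run") is False
assert should_check(CheckOptions(post_run=True), "post_run") is True
```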
fabricks/core/jobs/base/configurator.py
@@ -1,41 +1,58 @@
 from abc import ABC, abstractmethod
-from
-from typing import Optional, Union, cast
+from typing import List, Optional, Union, cast
 
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import Row
 from typing_extensions import deprecated
 
-from fabricks.cdc import SCD1, SCD2,
-from fabricks.context import
+from fabricks.cdc import SCD1, SCD2, NoCDC
+from fabricks.context import PATHS_RUNTIME, PATHS_STORAGE, STEPS
 from fabricks.context.log import DEFAULT_LOGGER
 from fabricks.context.spark_session import build_spark_session
-from fabricks.core.jobs.base._types import AllowedModes, Options, Paths, TStep
 from fabricks.core.jobs.get_job_conf import get_job_conf
-from fabricks.core.jobs.get_job_id import get_job_id
 from fabricks.metastore.table import Table
-from fabricks.
-
+from fabricks.models import (
+    AllowedChangeDataCaptures,
+    AllowedModes,
+    CheckOptions,
+    ExtenderOptions,
+    InvokerOptions,
+    Paths,
+    RuntimeOptions,
+    SparkOptions,
+    StepBronzeConf,
+    StepBronzeOptions,
+    StepGoldConf,
+    StepGoldOptions,
+    StepSilverConf,
+    StepSilverOptions,
+    StepTableOptions,
+    TableOptions,
+    TOptions,
+    get_job_id,
+)
+from fabricks.models.runtime import RuntimeConf
 
 
 class Configurator(ABC):
     def __init__(
         self,
         expand: str,
-        step:
+        step: str,
         topic: Optional[str] = None,
         item: Optional[str] = None,
         job_id: Optional[str] = None,
         conf: Optional[Union[dict, Row]] = None,
     ):
         self.expand = expand
-        self.step
+        self.step = step
 
         if job_id is not None:
             self.job_id = job_id
             self.conf = get_job_conf(step=self.step, job_id=self.job_id, row=conf)
             self.topic = self.conf.topic
             self.item = self.conf.item
+
         else:
             assert topic
             assert item
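For code that imports these types directly, the visible consequence of this hunk is that the 3.x `_types` modules are gone and everything now comes from the consolidated `fabricks.models` package. A migration sketch (which names 4.0.0 actually re-exports is inferred from the import block above):

```python
# fabricks 3.x (modules removed in 4.0.0):
# from fabricks.core.jobs.base._types import AllowedModes, Options, Paths
# from fabricks.core.jobs.get_job_id import get_job_id

# fabricks 4.0.0 -- the same names, now under the consolidated models package:
from fabricks.models import AllowedModes, Paths, get_job_id
```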
@@ -44,13 +61,15 @@ class Configurator(ABC):
             self.conf = get_job_conf(step=self.step, topic=self.topic, item=self.item, row=conf)
             self.job_id = get_job_id(step=self.step, topic=self.topic, item=self.item)
 
-    _step_conf: Optional[
+    _step_conf: Optional[Union[StepBronzeConf, StepSilverConf, StepGoldConf]] = None
+    _step_options: Optional[Union[StepBronzeOptions, StepSilverOptions, StepGoldOptions]] = None
+    _step_table_options: Optional[StepTableOptions] = None
+    _runtime_options: Optional[RuntimeOptions] = None
+    _runtime_conf: Optional[RuntimeConf] = None
     _spark: Optional[SparkSession] = None
     _timeout: Optional[int] = None
-    _options: Optional[Options] = None
     _paths: Optional[Paths] = None
     _table: Optional[Table] = None
-    _root: Optional[Path] = None
 
     _cdc: Optional[Union[NoCDC, SCD1, SCD2]] = None
     _change_data_capture: Optional[AllowedChangeDataCaptures] = None
@@ -83,26 +102,29 @@ class Configurator(ABC):
         if not self._spark:
             spark = build_spark_session(app_name=str(self))
 
-
-
-
-
-            for key, value in
+            # Apply step-level spark options if configured
+            step_spark = self.step_spark_options
+            if step_spark:
+                sql_options = step_spark.sql or {}
+                for key, value in sql_options.items():
                     DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                     spark.sql(f"set {key} = {value}")
-
-
+
+                conf_options = step_spark.conf or {}
+                for key, value in conf_options.items():
                     DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self.step})
                     spark.conf.set(f"{key}", f"{value}")
 
-
-
-            if
-
+            # Apply job-level spark options if configured
+            job_spark = self.spark_options
+            if job_spark:
+                sql_options = job_spark.sql or {}
+                for key, value in sql_options.items():
                     DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                     spark.sql(f"set {key} = {value}")
-
-
+
+                conf_options = job_spark.conf or {}
+                for key, value in conf_options.items():
                     DEFAULT_LOGGER.debug(f"add {key} = {value}", extra={"label": self})
                     spark.conf.set(f"{key}", f"{value}")
 
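The rewritten `spark` property applies settings in two layers, step-level first, then job-level, so job values overwrite step values on conflicts. A self-contained sketch of the same layering, with a plain dataclass standing in for the real `SparkOptions` model and an already-built `SparkSession` assumed:

```python
from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class SparkOptions:
    # Hypothetical stand-in for fabricks.models.SparkOptions.
    sql: Optional[Dict[str, str]] = None   # applied via spark.sql("set k = v")
    conf: Optional[Dict[str, str]] = None  # applied via spark.conf.set(k, v)


def apply_spark_options(spark, options: Optional[SparkOptions]) -> None:
    if not options:
        return
    for key, value in (options.sql or {}).items():
        spark.sql(f"set {key} = {value}")
    for key, value in (options.conf or {}).items():
        spark.conf.set(key, str(value))


# Step-level first, job-level second: the later writes win on conflicts.
# apply_spark_options(spark, step_spark_options)
# apply_spark_options(spark, job_spark_options)
```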
@@ -110,11 +132,11 @@ class Configurator(ABC):
         return self._spark
 
     @property
-    def
+    def base_step_conf(self) -> Union[StepBronzeConf, StepSilverConf, StepGoldConf]:
         if not self._step_conf:
-            _conf = [s for s in STEPS if s.
+            _conf = [s for s in STEPS if s.name == self.step][0]
             assert _conf is not None
-            self._step_conf =
+            self._step_conf = _conf
         return self._step_conf
 
     @property
@@ -122,16 +144,16 @@ class Configurator(ABC):
         return f"{self.step}.{self.topic}_{self.item}"
 
     def _get_timeout(self, what: str) -> int:
-        t = self.
+        t = getattr(self.step_options.timeouts, what, None)
         if t is None:
-            t =
+            t = getattr(self.runtime_options.timeouts, what)
         assert t is not None
         return t
 
     @property
     def timeout(self) -> int:
         if not self._timeout:
-            t = self.options.
+            t = self.options.timeout
 
             if t is None:
                 t = self._get_timeout("job")
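`_get_timeout` now resolves a named timeout from step-level options and falls back to the runtime-level value. The same two-level lookup in isolation; the `Timeouts` model and its fields are assumptions based on the `getattr` calls in the hunk:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Timeouts:
    # Hypothetical stand-in for the nested timeouts model.
    job: Optional[int] = None
    step: Optional[int] = None


def resolve_timeout(step_timeouts: Timeouts, runtime_timeouts: Timeouts, what: str) -> int:
    t = getattr(step_timeouts, what, None)
    if t is None:
        # The runtime level is expected to define every timeout.
        t = getattr(runtime_timeouts, what)
    assert t is not None
    return t


assert resolve_timeout(Timeouts(), Timeouts(job=3600), "job") == 3600
assert resolve_timeout(Timeouts(job=600), Timeouts(job=3600), "job") == 600
```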
@@ -158,48 +180,105 @@ class Configurator(ABC):
         assert runtime_root
 
         self._paths = Paths(
-
-
-
-
-
-
+            to_storage=storage,
+            to_tmp=storage.joinpath("tmp", self.topic, self.item),
+            to_checkpoints=storage.joinpath("checkpoints", self.topic, self.item),
+            to_commits=storage.joinpath("checkpoints", self.topic, self.item, "commits"),
+            to_schema=storage.joinpath("schema", self.topic, self.item),
+            to_runtime=runtime_root.joinpath(self.topic, self.item),
         )
 
+        assert self._paths is not None
         return self._paths
 
     @property
-    @
-    def options(self) ->
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return self.
+    @abstractmethod
+    def options(self) -> TOptions:
+        """
+        Direct access to typed job options.
+
+        Subclasses must implement this property and return their specific typed
+        options instance (e.g. JobBronzeOptions, JobSilverOptions, or JobGoldOptions)
+        corresponding to the job type.
+        """
+        raise NotImplementedError()
+
+    @property
+    def runtime_conf(self) -> RuntimeConf:
+        """Direct access to typed runtime conf."""
+        if not self._runtime_conf:
+            from fabricks.context.runtime import CONF_RUNTIME
+
+            self._runtime_conf = CONF_RUNTIME
+        return self._runtime_conf
+
+    @property
+    @abstractmethod
+    def step_conf(self) -> Union[StepBronzeConf, StepSilverConf, StepGoldConf]:
+        """Direct access to typed step conf from context configuration."""
+        raise NotImplementedError()
+
+    @property
+    def step_options(self) -> Union[StepBronzeOptions, StepSilverOptions, StepGoldOptions]:
+        """Direct access to typed step-level options from context configuration."""
+        raise NotImplementedError()
+
+    @property
+    def step_table_options(self) -> Optional[StepTableOptions]:
+        """Direct access to typed step-level table options from context configuration."""
+        if self._step_table_options is None:
+            _step = [s for s in STEPS if s.name == self.step][0]
+            assert _step is not None
+            self._step_table_options = _step.table_options
+        return self._step_table_options
+
+    @property
+    def runtime_options(self) -> RuntimeOptions:
+        """Direct access to typed runtime options from context configuration."""
+        return self.runtime_conf.options
+
+    @property
+    def step_spark_options(self) -> Optional[SparkOptions]:
+        """Direct access to typed step-level spark options from context configuration.
+        Returns None if not configured at step level."""
+        return self.step_conf.spark_options
+
+    @property
+    def table_options(self) -> Optional[TableOptions]:
+        """Direct access to typed table options."""
+        return self.conf.table_options
+
+    @property
+    def check_options(self) -> Optional[CheckOptions]:
+        """Direct access to typed check options."""
+        return self.conf.check_options
+
+    @property
+    def spark_options(self) -> Optional[SparkOptions]:
+        """Direct access to typed spark options."""
+        return self.conf.spark_options
+
+    @property
+    def invoker_options(self) -> Optional[InvokerOptions]:
+        """Direct access to typed invoker options."""
+        return self.conf.invoker_options
+
+    @property
+    def extender_options(self) -> Optional[List[ExtenderOptions]]:
+        """Direct access to typed extender options."""
+        return self.conf.extender_options
 
     @property
     def change_data_capture(self) -> AllowedChangeDataCaptures:
         if not self._change_data_capture:
-            cdc: AllowedChangeDataCaptures = self.options.
+            cdc: AllowedChangeDataCaptures = self.options.change_data_capture or "nocdc"
             self._change_data_capture = cdc
         return self._change_data_capture
 
     @property
     def cdc(self) -> Union[NoCDC, SCD1, SCD2]:
         if not self._cdc:
-            if self.change_data_capture
+            if self.change_data_capture in ["nocdc", "none"]:
                 cdc = NoCDC(self.step, self.topic, self.item, spark=self.spark)
             elif self.change_data_capture == "scd1":
                 cdc = SCD1(self.step, self.topic, self.item, spark=self.spark)
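The tail of this hunk shows `change_data_capture` defaulting to `"nocdc"` and `cdc` dispatching on it. A compact sketch of that dispatch; the `scd2` branch is outside the visible hunk but implied by the `Union[NoCDC, SCD1, SCD2]` return type, and strings stand in for the CDC classes so the example is self-contained:

```python
from typing import Optional


def pick_cdc(change_data_capture: Optional[str]) -> str:
    cdc = change_data_capture or "nocdc"  # the 4.0.0 default when unset
    if cdc in ["nocdc", "none"]:
        return "NoCDC"
    if cdc == "scd1":
        return "SCD1"
    if cdc == "scd2":  # assumed branch, not shown in the hunk
        return "SCD2"
    raise ValueError(f"unknown change data capture: {cdc}")


assert pick_cdc(None) == "NoCDC"
assert pick_cdc("scd1") == "SCD1"
```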
@@ -227,7 +306,7 @@ class Configurator(ABC):
     @property
     def mode(self) -> AllowedModes:
         if not self._mode:
-            _mode = self.options.
+            _mode = self.options.mode
             assert _mode is not None
             self._mode = cast(AllowedModes, _mode)
         return self._mode
@@ -288,9 +367,9 @@ class Configurator(ABC):
                 DEFAULT_LOGGER.debug("could not vacuum (memory)", extra={"label": self})
 
         else:
-            job = self.
-            step = self.
-            runtime =
+            job = self.table_options.retention_days if self.table_options else None
+            step = self.step_table_options.retention_days if self.step_table_options else None
+            runtime = self.runtime_options.retention_days
 
             if job is not None:
                 retention_days = job
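Retention resolution now reads three typed layers with a fixed precedence: job table options override step table options, which override the runtime default. The branches for `step` and `runtime` fall outside the visible hunk, so this sketch extrapolates the pattern that the `if job is not None` line establishes:

```python
from typing import Optional


def resolve_retention_days(job: Optional[int], step: Optional[int], runtime: int) -> int:
    # job-level wins, then step-level, then the runtime default
    if job is not None:
        return job
    if step is not None:
        return step
    return runtime


assert resolve_retention_days(None, None, 7) == 7
assert resolve_retention_days(None, 14, 7) == 14
assert resolve_retention_days(30, 14, 7) == 30
```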
fabricks/core/jobs/base/generator.py
@@ -6,10 +6,10 @@ from pyspark.sql.functions import lit
 
 from fabricks.cdc import NoCDC
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import JobDependency
 from fabricks.core.jobs.base.configurator import Configurator
 from fabricks.metastore.table import SchemaDiff
 from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.models import JobDependency
 
 
 class Generator(Configurator):
@@ -31,9 +31,9 @@ class Generator(Configurator):
 
         If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
         """
-        if self.paths.
+        if self.paths.to_schema.exists():
             DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
-            self.paths.
+            self.paths.to_schema.rm()
         self.rm_checkpoints()
 
     def rm_checkpoints(self):
@@ -42,9 +42,9 @@ class Generator(Configurator):
 
         This method checks if the checkpoints folder exists and deletes it if it does.
         """
-        if self.paths.
+        if self.paths.to_checkpoints.exists():
             DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
-            self.paths.
+            self.paths.to_checkpoints.rm()
 
     def rm_commit(self, id: Union[str, int]):
         """
@@ -56,7 +56,7 @@ class Generator(Configurator):
         Returns:
             None
         """
-        path = self.paths.
+        path = self.paths.to_commits.joinpath(str(id))
         if path.exists():
             DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
             path.rm()
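`rm_commit` resolves a commit folder through the renamed `Paths` fields. A hypothetical reconstruction of the storage layout implied by the `Paths(...)` construction in the configurator hunk, with `pathlib` standing in for fabricks' own `Path` type (which additionally carries `.append`, `.exists` and `.rm` helpers); the storage root and job coordinates are assumptions:

```python
from pathlib import PurePosixPath

storage = PurePosixPath("/mnt/storage/silver")  # assumed storage root
topic, item = "sales", "orders"                 # assumed job coordinates

to_tmp = storage / "tmp" / topic / item
to_checkpoints = storage / "checkpoints" / topic / item
to_commits = to_checkpoints / "commits"
to_schema = storage / "schema" / topic / item

# rm_commit(5) would then target:
print(to_commits / "5")  # /mnt/storage/silver/checkpoints/sales/orders/commits/5
```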
@@ -91,7 +91,7 @@ class Generator(Configurator):
         Returns:
             None
         """
-        if self.options.
+        if self.options.no_drop:
             raise ValueError("no_drop is set, cannot drop the job")
 
         try:
@@ -167,7 +167,7 @@ class Generator(Configurator):
         ...
 
     def _get_clustering_columns(self, df: DataFrame) -> Optional[List[str]]:
-        columns = self.
+        columns = self.table_options.cluster_by or [] if self.table_options else []
         if columns:
             return columns
 
@@ -205,16 +205,16 @@ class Generator(Configurator):
         identity = False
 
         # first take from job options, then from step options
-        job_powerbi = self.
-        step_powerbi = self.step_conf.
+        job_powerbi = self.table_options.powerbi if self.table_options else None
+        step_powerbi = self.step_conf.table_options.powerbi if self.step_conf.table_options else None
         if job_powerbi is not None:
             powerbi = job_powerbi
         elif step_powerbi is not None:
             powerbi = step_powerbi
 
         # first take from job options, then from step options
-        job_masks = self.
-        step_masks = self.step_conf.
+        job_masks = self.table_options.masks if self.table_options else None
+        step_masks = self.step_conf.table_options.masks if self.step_conf.table_options else None
         if job_masks is not None:
             masks = job_masks
         elif step_masks is not None:
@@ -222,7 +222,9 @@ class Generator(Configurator):
         else:
             masks = None
 
-        maximum_compatibility = self.
+        maximum_compatibility = self.table_options.maximum_compatibility if self.table_options else False
+
+        default_properties: dict[str, str | bool | int] = {}
 
         if maximum_compatibility:
             default_properties = {
@@ -251,11 +253,13 @@ class Generator(Configurator):
         if "__identity" in df.columns:
             identity = False
         else:
-            identity = self.
+            identity = self.table_options.identity if self.table_options else False
 
         # first take from job options, then from step options
-        liquid_clustering_job = self.
-        liquid_clustering_step =
+        liquid_clustering_job = self.table_options.liquid_clustering if self.table_options else None
+        liquid_clustering_step = (
+            self.step_conf.table_options.liquid_clustering if self.step_conf.table_options else None
+        )
         if liquid_clustering_job is not None:
             liquid_clustering = liquid_clustering_job
         elif liquid_clustering_step:
@@ -278,24 +282,24 @@ class Generator(Configurator):
 
         if liquid_clustering is None:
             cluster_by = None
-            partition_by = self.
+            partition_by = self.table_options.partition_by or [] if self.table_options else []
             if partition_by:
                 partitioning = True
 
         properties = None
         if not powerbi:
             # first take from job options, then from step options
-            if self.
-                properties = self.
-            elif self.step_conf.
-                properties = self.step_conf.
+            if self.table_options and self.table_options.properties:
+                properties = self.table_options.properties
+            elif self.step_conf.table_options and self.step_conf.table_options.properties:
+                properties = self.step_conf.table_options.properties
 
         if properties is None:
             properties = default_properties
 
-        primary_key = self.
-        foreign_keys = self.
-        comments = self.
+        primary_key = self.table_options.primary_key or {} if self.table_options else {}
+        foreign_keys = self.table_options.foreign_keys or {} if self.table_options else {}
+        comments = self.table_options.comments or {} if self.table_options else {}
 
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__init"
@@ -332,7 +336,7 @@ class Generator(Configurator):
             dummy_df = dummy_df.select("__metadata")
 
             df = df.unionByName(dummy_df, allowMissingColumns=True)
-        path = self.paths.
+        path = self.paths.to_checkpoints.append("__init")
         if path.exists():
             path.rm()
 
@@ -347,12 +351,12 @@ class Generator(Configurator):
         else:
             _create_table(df)
 
-        constraints = self.
+        constraints = self.table_options.constraints or {} if self.table_options else {}
         if constraints:
             for key, value in constraints.items():
-                self.table.add_constraint(name=key, expr=value)
+                self.table.add_constraint(name=key, expr=str(value))
 
-        comment = self.
+        comment = self.table_options.comment if self.table_options else None
         if comment:
             self.table.add_table_comment(comment=comment)
 
@@ -382,7 +386,7 @@ class Generator(Configurator):
         df = self.base_transform(df)
 
         if self.stream:
-            path = self.paths.
+            path = self.paths.to_checkpoints.append("__schema")
             query = (
                 df.writeStream.foreachBatch(_update_schema)
                 .option("checkpointLocation", path.string)
@@ -415,15 +419,15 @@ class Generator(Configurator):
             self.table.drop_comments()
 
         if table:
-            comment = self.
+            comment = self.table_options.comment if self.table_options else None
             if comment:
                 self.table.add_table_comment(comment=comment)
 
         if columns:
-            comments = self.
+            comments = self.table_options.comments or {} if self.table_options else {}
             if comments:
                 for col, comment in comments.items():
-                    self.table.add_column_comment(column=col, comment=comment)
+                    self.table.add_column_comment(column=col, comment=str(comment))
 
     def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
         if df is None:
@@ -456,8 +460,8 @@ class Generator(Configurator):
         enable = False
 
         # first take from job options, then from step options
-        enable_job = self.
-        enable_step = self.step_conf.
+        enable_job = self.table_options.liquid_clustering if self.table_options else None
+        enable_step = self.step_conf.table_options.liquid_clustering if self.step_conf.table_options else None
         if enable_job is not None:
             enable = enable_job
         elif enable_step: