fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/core/jobs/base/processor.py
CHANGED
@@ -26,7 +26,7 @@ class Processor(Invoker):
         f = self.options.job.get("filter_where")
 
         if f:
-            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"
+            DEFAULT_LOGGER.debug(f"filter where {f}", extra={"label": self})
             df = df.where(f"{f}")
 
         return df
@@ -46,7 +46,7 @@ class Processor(Invoker):
         assert key, "key not found"
 
         for col in encrypted_columns:
-            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"
+            DEFAULT_LOGGER.debug(f"encrypt column: {col}", extra={"label": self})
             df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
 
         return df
@@ -73,16 +73,16 @@ class Processor(Invoker):
         assert self.paths.commits.joinpath(last_batch).exists()
 
     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
-        DEFAULT_LOGGER.debug("for each batch
+        DEFAULT_LOGGER.debug("start (for each batch)", extra={"label": self})
         if batch is not None:
-            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"
+            DEFAULT_LOGGER.debug(f"batch {batch}", extra={"label": self})
 
         df = self.base_transform(df)
 
         diffs = self.get_schema_differences(df)
         if diffs:
             if self.schema_drift or kwargs.get("reload", False):
-                DEFAULT_LOGGER.warning("schema drifted", extra={"
+                DEFAULT_LOGGER.warning("schema drifted", extra={"label": self, "diffs": diffs})
                 self.update_schema(df=df)
 
             else:
@@ -98,24 +98,24 @@ class Processor(Invoker):
         self.table.set_property("fabricks.last_batch", batch)
 
         self.table.create_restore_point()
-        DEFAULT_LOGGER.debug("for each batch
+        DEFAULT_LOGGER.debug("end (for each batch)", extra={"label": self})
 
     def for_each_run(self, **kwargs):
-        DEFAULT_LOGGER.debug("for each run
+        DEFAULT_LOGGER.debug("start (for each run)", extra={"label": self})
 
         if self.virtual:
             self.create_or_replace_view()
 
         elif self.persist:
-            assert self.table.
+            assert self.table.registered, f"{self} is not registered"
 
-            df = self.get_data(self.stream)
+            df = self.get_data(stream=self.stream, **kwargs)
             assert df is not None, "no data"
 
             partial(self._for_each_batch, **kwargs)
 
             if self.stream:
-                DEFAULT_LOGGER.debug("
+                DEFAULT_LOGGER.debug("use streaming", extra={"label": self})
                 write_stream(
                     df,
                     checkpoints_path=self.paths.checkpoints,
@@ -128,7 +128,7 @@ class Processor(Invoker):
         else:
             raise ValueError(f"{self.mode} - not allowed")
 
-        DEFAULT_LOGGER.debug("for each run
+        DEFAULT_LOGGER.debug("end (for each run)", extra={"label": self})
 
     def run(
         self,
@@ -137,6 +137,9 @@ class Processor(Invoker):
         schedule_id: Optional[str] = None,
         invoke: Optional[bool] = True,
         reload: Optional[bool] = None,
+        vacuum: Optional[bool] = None,
+        optimize: Optional[bool] = None,
+        compute_statistics: Optional[bool] = None,
     ):
         """
         Run the processor.
@@ -154,18 +157,19 @@ class Processor(Invoker):
         if self.persist:
             last_version = self.table.get_property("fabricks.last_version")
             if last_version is not None:
-                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"
+                DEFAULT_LOGGER.debug(f"last version {last_version}", extra={"label": self})
             else:
                 last_version = str(self.table.last_version)
 
             last_batch = self.table.get_property("fabricks.last_batch")
             if last_batch is not None:
-                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"
+                DEFAULT_LOGGER.debug(f"last batch {last_batch}", extra={"label": self})
 
         try:
-            DEFAULT_LOGGER.info("run
+            DEFAULT_LOGGER.info("start (run)", extra={"label": self})
+
             if reload:
-                DEFAULT_LOGGER.debug("force reload", extra={"
+                DEFAULT_LOGGER.debug("force reload", extra={"label": self})
 
             if invoke:
                 self.invoke_pre_run(schedule=schedule)
@@ -193,40 +197,53 @@ class Processor(Invoker):
             if exception:
                 raise exception
 
-
+            if vacuum is None:
+                vacuum = self.options.job.get("vacuum", False)
+            if optimize is None:
+                optimize = self.options.job.get("optimize", False)
+            if compute_statistics is None:
+                compute_statistics = self.options.job.get("compute_statistics", False)
+
+            if vacuum or optimize or compute_statistics:
+                self.maintain(
+                    compute_statistics=compute_statistics,
+                    optimize=optimize,
+                    vacuum=vacuum,
+                )
+
+            DEFAULT_LOGGER.info("end (run)", extra={"label": self})
 
         except SkipRunCheckWarning as e:
-            DEFAULT_LOGGER.warning("skip run", extra={"
+            DEFAULT_LOGGER.warning("skip run", extra={"label": self})
             raise e
 
         except (PreRunCheckWarning, PostRunCheckWarning) as e:
-            DEFAULT_LOGGER.warning("
+            DEFAULT_LOGGER.warning("fail to pass warning check", extra={"label": self})
             raise e
 
         except (PreRunInvokeException, PostRunInvokeException) as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to run invoker", extra={"label": self})
             raise e
 
         except (PreRunCheckException, PostRunCheckException) as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to pass check", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except AssertionError as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
             self.restore(last_version, last_batch)
             raise e
 
         except Exception as e:
             if not self.stream or not retry:
-                DEFAULT_LOGGER.exception("
+                DEFAULT_LOGGER.exception("fail to run", extra={"label": self})
                 self.restore(last_version, last_batch)
                 raise e
 
             else:
-                DEFAULT_LOGGER.warning("retry to run", extra={"
-                self.run(retry=False, schedule_id=schedule_id)
+                DEFAULT_LOGGER.warning("retry to run", extra={"label": self})
+                self.run(retry=False, schedule_id=schedule_id, schedule=schedule)
 
     @abstractmethod
-    def overwrite(self):
-        raise NotImplementedError()
+    def overwrite(self) -> None: ...
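The last hunk above makes table maintenance part of `Processor.run()`: each flag left as `None` falls back to the matching job option, and `self.maintain(...)` only runs when at least one flag resolves to true. A minimal, standalone sketch of that defaulting pattern (the `JobOptions` stand-in and the option values are hypothetical, not the real fabricks objects):

```python
from typing import Optional


class JobOptions:
    """Hypothetical stand-in for the job options object; only get() is modelled."""

    def __init__(self, options: dict):
        self._options = options

    def get(self, key: str, default=None):
        return self._options.get(key, default)


def resolve_maintenance_flags(
    options: JobOptions,
    vacuum: Optional[bool] = None,
    optimize: Optional[bool] = None,
    compute_statistics: Optional[bool] = None,
) -> dict:
    # None means "not set by the caller": fall back to the job options,
    # mirroring the defaulting added to Processor.run() in the hunk above.
    if vacuum is None:
        vacuum = options.get("vacuum", False)
    if optimize is None:
        optimize = options.get("optimize", False)
    if compute_statistics is None:
        compute_statistics = options.get("compute_statistics", False)
    return {"vacuum": vacuum, "optimize": optimize, "compute_statistics": compute_statistics}


opts = JobOptions({"optimize": True})
# vacuum is forced by the caller, optimize comes from the job options,
# compute_statistics falls back to False.
print(resolve_maintenance_flags(opts, vacuum=True))
# {'vacuum': True, 'optimize': True, 'compute_statistics': False}
```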
fabricks/core/jobs/bronze.py
CHANGED
@@ -11,7 +11,7 @@ from fabricks.core.jobs.base._types import JobDependency, TBronze
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.parsers import BaseParser
 from fabricks.core.parsers.get_parser import get_parser
-from fabricks.core.utils import clean
+from fabricks.core.parsers.utils import clean
 from fabricks.metastore.view import create_or_replace_global_temp_view
 from fabricks.utils.helpers import concat_ws
 from fabricks.utils.path import Path
@@ -86,13 +86,13 @@ class Bronze(BaseJob):
         else:
             file_format = "delta"
 
-        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"
+        DEFAULT_LOGGER.debug(f"register external table ({self.data_path})", extra={"label": self})
 
         try:
             df = self.spark.sql(f"select * from {file_format}.`{self.data_path}`")
             assert len(df.columns) > 1, "external table must have at least one column"
         except Exception as e:
-            DEFAULT_LOGGER.exception("read external table failed", extra={"
+            DEFAULT_LOGGER.exception("read external table failed", extra={"label": self})
             raise e
 
         self.spark.sql(
@@ -100,17 +100,17 @@ class Bronze(BaseJob):
         )
 
     def drop_external_table(self):
-        DEFAULT_LOGGER.
+        DEFAULT_LOGGER.warning("remove external table from metastore", extra={"label": self})
         self.spark.sql(f"drop table if exists {self.qualified_name}")
 
-    def
-        DEFAULT_LOGGER.debug("
+    def compute_statistics_external_table(self):
+        DEFAULT_LOGGER.debug("compute statistics (external table)", extra={"label": self})
         self.spark.sql(f"analyze table {self.qualified_name} compute statistics")
 
     def vacuum_external_table(self, retention_hours: Optional[int] = 168):
         from delta import DeltaTable
 
-        DEFAULT_LOGGER.debug("vacuum external table", extra={"
+        DEFAULT_LOGGER.debug("vacuum (external table)", extra={"label": self})
         try:
             dt = DeltaTable.forPath(self.spark, self.data_path.string)
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
@@ -118,17 +118,17 @@ class Bronze(BaseJob):
         finally:
             self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
 
-    def
+    def maintain_external_table(
         self,
         vacuum: Optional[bool] = True,
-
+        compute_statistics: Optional[bool] = True,
     ):
-        DEFAULT_LOGGER.debug("
+        DEFAULT_LOGGER.debug("maintain (external table)", extra={"label": self})
         if vacuum:
             self.vacuum_external_table()
 
-        if
-            self.
+        if compute_statistics:
+            self.compute_statistics_external_table()
 
     @property
     def parser(self) -> BaseParser:
@@ -179,7 +179,13 @@ class Bronze(BaseJob):
 
         return df
 
-    def get_data(
+    def get_data(
+        self,
+        stream: bool = False,
+        transform: Optional[bool] = False,
+        schema_only: Optional[bool] = False,
+        **kwargs,
+    ) -> Optional[DataFrame]:
         df = self.parse(stream)
         df = self.filter_where(df)
         df = self.encrypt(df)
@@ -187,6 +193,9 @@ class Bronze(BaseJob):
         if transform:
             df = self.base_transform(df)
 
+        if schema_only:
+            df = df.where("1 == 2")
+
         return df
 
     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
@@ -194,7 +203,7 @@ class Bronze(BaseJob):
 
         if calculated_columns:
             for key, value in calculated_columns.items():
-                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"
+                DEFAULT_LOGGER.debug(f"add calculated column ({key} -> {value})", extra={"label": self})
                 df = df.withColumn(key, expr(f"{value}"))
 
         return df
@@ -202,7 +211,7 @@ class Bronze(BaseJob):
     def add_hash(self, df: DataFrame) -> DataFrame:
         if "__hash" not in df.columns:
             fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
-            DEFAULT_LOGGER.debug("add hash", extra={"
+            DEFAULT_LOGGER.debug("add hash", extra={"label": self})
 
             if "__operation" in df.columns:
                 fields += ["__operation == 'delete'"]
@@ -218,7 +227,7 @@ class Bronze(BaseJob):
         if "__key" not in df.columns:
             fields = self.options.job.get_list("keys")
             if fields:
-                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"
+                DEFAULT_LOGGER.debug(f"add key ({', '.join(fields)})", extra={"label": self})
 
                 if "__source" in df.columns:
                     fields = fields + ["__source"]
@@ -232,7 +241,7 @@ class Bronze(BaseJob):
         if "__source" not in df.columns:
             source = self.options.job.get("source")
             if source:
-                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"
+                DEFAULT_LOGGER.debug(f"add source ({source})", extra={"label": self})
                 df = df.withColumn("__source", lit(source))
 
         return df
@@ -241,7 +250,7 @@ class Bronze(BaseJob):
         if "__operation" not in df.columns:
             operation = self.options.job.get("operation")
             if operation:
-                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"
+                DEFAULT_LOGGER.debug(f"add operation ({operation})", extra={"label": self})
                 df = df.withColumn("__operation", lit(operation))
 
         else:
@@ -294,10 +303,10 @@ class Bronze(BaseJob):
         return df
 
     def create_or_replace_view(self):
-        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"
+        DEFAULT_LOGGER.warning("create or replace view not allowed", extra={"label": self})
 
     def overwrite_schema(self, df: Optional[DataFrame] = None):
-        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"
+        DEFAULT_LOGGER.warning("schema overwrite not allowed", extra={"label": self})
 
     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
         return {}
@@ -309,12 +318,12 @@ class Bronze(BaseJob):
 
         # if dataframe, reference is passed (BUG)
         name = f"{self.step}_{self.topic}_{self.item}__{batch}"
-        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
         sql = f"select * from {global_temp_view}"
 
         check_df = self.spark.sql(sql)
         if check_df.isEmpty():
-            DEFAULT_LOGGER.warning("no data", extra={"
+            DEFAULT_LOGGER.warning("no data", extra={"label": self})
             return
 
         assert isinstance(self.cdc, NoCDC)
@@ -323,9 +332,9 @@ class Bronze(BaseJob):
 
     def for_each_run(self, **kwargs):
         if self.mode == "register":
-            DEFAULT_LOGGER.debug("register (no run)", extra={"
+            DEFAULT_LOGGER.debug("register (no run)", extra={"label": self})
         elif self.mode == "memory":
-            DEFAULT_LOGGER.debug("memory (no run)", extra={"
+            DEFAULT_LOGGER.debug("memory (no run)", extra={"label": self})
         else:
             super().for_each_run(**kwargs)
 
@@ -333,7 +342,7 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().create()
 
@@ -341,19 +350,19 @@ class Bronze(BaseJob):
         if self.mode == "register":
             self.register_external_table()
         elif self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"
+            DEFAULT_LOGGER.info("memory (no table nor view)", extra={"label": self})
         else:
             super().register()
 
     def truncate(self):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no truncate)", extra={"
+            DEFAULT_LOGGER.info("register (no truncate)", extra={"label": self})
         else:
             super().truncate()
 
     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
         if self.mode == "register":
-            DEFAULT_LOGGER.info("register (no restore)", extra={"
+            DEFAULT_LOGGER.info("register (no restore)", extra={"label": self})
         else:
             super().restore()
 
@@ -362,27 +371,25 @@ class Bronze(BaseJob):
         self.drop_external_table()
         super().drop()
 
-    def
+    def maintain(
         self,
         vacuum: Optional[bool] = True,
         optimize: Optional[bool] = True,
-
+        compute_statistics: Optional[bool] = True,
     ):
-        if self.mode == "
-
-        elif self.mode == "register":
-            self.optimize_external_table(vacuum, analyze)
+        if self.mode == "register":
+            self.maintain_external_table(vacuum=vacuum, compute_statistics=compute_statistics)
         else:
-            super().
+            super().maintain(vacuum=vacuum, optimize=optimize, compute_statistics=compute_statistics)
 
     def vacuum(self):
         if self.mode == "memory":
-            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"
+            DEFAULT_LOGGER.info("memory (no vacuum)", extra={"label": self})
         elif self.mode == "register":
             self.vacuum_external_table()
         else:
             super().vacuum()
 
-    def overwrite(self):
+    def overwrite(self, schedule: Optional[str] = None):
         self.truncate()
-        self.run()
+        self.run(schedule=schedule)
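`Bronze.get_data()` now accepts a `schema_only` flag that filters with an always-false predicate, so the returned DataFrame is empty but keeps the source schema. A small PySpark sketch of that trick, assuming a local SparkSession rather than the real parsed bronze source:

```python
from pyspark.sql import SparkSession

# Local session only for illustration; in fabricks the DataFrame comes from the parser.
spark = SparkSession.builder.master("local[1]").appName("schema-only-sketch").getOrCreate()

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Always-false predicate: zero rows, identical schema.
schema_only_df = df.where("1 == 2")

print(schema_only_df.count())  # 0
schema_only_df.printSchema()   # id: long, value: string
```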
fabricks/core/jobs/get_jobs.py
CHANGED
@@ -6,7 +6,7 @@ from pyspark.sql.functions import expr
 from pyspark.sql.types import Row
 
 from fabricks.context import IS_JOB_CONFIG_FROM_YAML, PATHS_RUNTIME, SPARK
-from fabricks.core.jobs.base._types import
+from fabricks.core.jobs.base._types import AllowedModes, TStep
 from fabricks.core.jobs.base.job import BaseJob
 from fabricks.core.jobs.get_job import get_job, get_job_internal
 from fabricks.utils.helpers import concat_dfs, run_in_parallel
@@ -16,7 +16,7 @@ from fabricks.utils.schema import get_schema_for_type
 
 
 class GenericOptions(TypedDict):
-    mode:
+    mode: AllowedModes
 
 
 @dataclass
fabricks/core/jobs/get_schedule.py
ADDED
@@ -0,0 +1,10 @@
+from typing import Dict
+
+from fabricks.core.jobs.get_schedules import get_schedules
+
+
+def get_schedule(name: str) -> Dict:
+    schedule = next(s for s in get_schedules() if s.get("name") == name)
+
+    assert schedule, "schedule not found"
+    return schedule
fabricks/core/jobs/get_schedules.py
ADDED
@@ -0,0 +1,32 @@
+from typing import List, Optional, TypedDict
+
+from pyspark.sql import DataFrame
+
+from fabricks.context import PATH_SCHEDULES, SPARK
+from fabricks.core.jobs.base._types import TStep
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class Options(TypedDict):
+    steps: Optional[List[TStep]]
+    tag: Optional[str]
+    view: Optional[str]
+    variables: Optional[dict[str, str]]
+
+
+class Schedule(TypedDict):
+    name: str
+    options: Options
+
+
+def get_schedules():
+    return read_yaml(PATH_SCHEDULES, root="schedule")
+
+
+def get_schedules_df() -> DataFrame:
+    schema = get_schema_for_type(Schedule)
+    df = SPARK.createDataFrame(list(get_schedules()), schema=schema)  # type: ignore
+
+    assert df, "no schedules found"
+    return df
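The two new modules add a YAML-backed schedule lookup: `get_schedules()` reads the schedule definitions and `get_schedule(name)` picks the entry with a matching name. A rough sketch of that lookup pattern against hypothetical in-memory schedule data (the real helpers read from `PATH_SCHEDULES`, and the upstream `get_schedule` calls `next()` without a default):

```python
from typing import Dict, List

# Hypothetical schedule entries; in fabricks these come from the YAML files
# under PATH_SCHEDULES (root key "schedule").
SCHEDULES: List[Dict] = [
    {"name": "daily", "options": {"steps": ["bronze", "silver", "gold"], "tag": None}},
    {"name": "hourly", "options": {"steps": ["bronze"], "tag": "fast"}},
]


def get_schedule(name: str) -> Dict:
    # Same scan-by-name pattern as the new helper; a default is passed to next()
    # here so the assert can fire instead of raising StopIteration.
    schedule = next((s for s in SCHEDULES if s.get("name") == name), None)
    assert schedule, "schedule not found"
    return schedule


print(get_schedule("daily")["options"]["steps"])  # ['bronze', 'silver', 'gold']
```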