fabricks 3.0.5.2__py3-none-any.whl → 3.0.6__py3-none-any.whl
This diff covers the content of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
- fabricks/api/__init__.py +2 -0
- fabricks/api/context.py +1 -2
- fabricks/api/deploy.py +3 -0
- fabricks/api/job_schema.py +2 -2
- fabricks/api/masks.py +3 -0
- fabricks/api/notebooks/initialize.py +2 -2
- fabricks/api/notebooks/process.py +2 -2
- fabricks/api/notebooks/run.py +2 -2
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +2 -2
- fabricks/api/schedules.py +2 -16
- fabricks/cdc/__init__.py +2 -2
- fabricks/cdc/base/__init__.py +2 -2
- fabricks/cdc/base/_types.py +9 -2
- fabricks/cdc/base/configurator.py +86 -41
- fabricks/cdc/base/generator.py +44 -35
- fabricks/cdc/base/merger.py +16 -14
- fabricks/cdc/base/processor.py +232 -144
- fabricks/cdc/nocdc.py +8 -7
- fabricks/cdc/templates/{query → ctes}/base.sql.jinja +7 -6
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/{query → ctes}/rectify.sql.jinja +4 -22
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -4
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -2
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/{query/nocdc.sql.jinja → queries/nocdc/complete.sql.jinja} +1 -1
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +35 -0
- fabricks/cdc/templates/{query → queries}/scd1.sql.jinja +2 -28
- fabricks/cdc/templates/{query → queries}/scd2.sql.jinja +29 -48
- fabricks/cdc/templates/query.sql.jinja +15 -11
- fabricks/context/__init__.py +18 -4
- fabricks/context/_types.py +2 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +8 -2
- fabricks/context/runtime.py +87 -263
- fabricks/context/secret.py +1 -1
- fabricks/context/spark_session.py +1 -1
- fabricks/context/utils.py +76 -0
- fabricks/core/dags/generator.py +6 -7
- fabricks/core/dags/log.py +2 -15
- fabricks/core/dags/processor.py +11 -11
- fabricks/core/dags/utils.py +15 -1
- fabricks/core/{scripts/job_schema.py → job_schema.py} +4 -0
- fabricks/core/jobs/base/_types.py +64 -22
- fabricks/core/jobs/base/checker.py +13 -12
- fabricks/core/jobs/base/configurator.py +41 -67
- fabricks/core/jobs/base/generator.py +55 -24
- fabricks/core/jobs/base/invoker.py +54 -30
- fabricks/core/jobs/base/processor.py +43 -26
- fabricks/core/jobs/bronze.py +45 -38
- fabricks/core/jobs/get_jobs.py +2 -2
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +61 -48
- fabricks/core/jobs/silver.py +39 -40
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/base.py +2 -2
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +46 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/base.py +110 -72
- fabricks/core/udfs.py +12 -23
- fabricks/core/views.py +20 -13
- fabricks/deploy/__init__.py +97 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +8 -0
- fabricks/{core/deploy → deploy}/tables.py +16 -13
- fabricks/{core/deploy → deploy}/udfs.py +3 -1
- fabricks/deploy/utils.py +36 -0
- fabricks/{core/deploy → deploy}/views.py +5 -9
- fabricks/metastore/database.py +3 -3
- fabricks/metastore/dbobject.py +4 -4
- fabricks/metastore/table.py +157 -88
- fabricks/metastore/view.py +13 -6
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_table.py +4 -3
- fabricks/utils/helpers.py +141 -11
- fabricks/utils/log.py +29 -18
- fabricks/utils/read/_types.py +1 -1
- fabricks/utils/schema/get_schema_for_type.py +6 -0
- fabricks/utils/write/delta.py +3 -3
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/METADATA +2 -1
- fabricks-3.0.6.dist-info/RECORD +175 -0
- fabricks/api/notebooks/add_fabricks.py +0 -13
- fabricks/api/notebooks/optimize.py +0 -29
- fabricks/api/notebooks/vacuum.py +0 -29
- fabricks/cdc/templates/query/context.sql.jinja +0 -101
- fabricks/cdc/templates/query/current.sql.jinja +0 -32
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +0 -21
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +0 -14
- fabricks/cdc/templates/query/hash.sql.jinja +0 -1
- fabricks/cdc/templates/query/slice.sql.jinja +0 -14
- fabricks/config/__init__.py +0 -0
- fabricks/config/base.py +0 -8
- fabricks/config/fabricks/__init__.py +0 -26
- fabricks/config/fabricks/base.py +0 -90
- fabricks/config/fabricks/environment.py +0 -9
- fabricks/config/fabricks/pyproject.py +0 -47
- fabricks/config/jobs/__init__.py +0 -6
- fabricks/config/jobs/base.py +0 -101
- fabricks/config/jobs/bronze.py +0 -38
- fabricks/config/jobs/gold.py +0 -27
- fabricks/config/jobs/silver.py +0 -22
- fabricks/config/runtime.py +0 -67
- fabricks/config/steps/__init__.py +0 -6
- fabricks/config/steps/base.py +0 -50
- fabricks/config/steps/bronze.py +0 -7
- fabricks/config/steps/gold.py +0 -14
- fabricks/config/steps/silver.py +0 -15
- fabricks/core/deploy/__init__.py +0 -17
- fabricks/core/schedules.py +0 -142
- fabricks/core/scripts/__init__.py +0 -9
- fabricks/core/scripts/armageddon.py +0 -87
- fabricks/core/scripts/stats.py +0 -51
- fabricks/core/scripts/steps.py +0 -26
- fabricks-3.0.5.2.dist-info/RECORD +0 -177
- /fabricks/cdc/templates/{filter → filters}/final.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/latest.sql.jinja +0 -0
- /fabricks/cdc/templates/{filter → filters}/update.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd1.sql.jinja +0 -0
- /fabricks/cdc/templates/{merge → merges}/scd2.sql.jinja +0 -0
- /fabricks/cdc/templates/{query → queries}/__init__.py +0 -0
- /fabricks/cdc/templates/{query → queries}/final.sql.jinja +0 -0
- /fabricks/core/{utils.py → parsers/utils.py} +0 -0
- /fabricks/core/{scripts → schedules}/generate.py +0 -0
- /fabricks/core/{scripts → schedules}/process.py +0 -0
- /fabricks/core/{scripts → schedules}/terminate.py +0 -0
- {fabricks-3.0.5.2.dist-info → fabricks-3.0.6.dist-info}/WHEEL +0 -0
fabricks/core/steps/base.py
CHANGED
@@ -1,15 +1,15 @@
 import logging
-from typing import Iterable, List, Literal, Optional, Tuple, Union, cast
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
 
 from pyspark.sql import DataFrame
 from pyspark.sql.functions import expr, md5
 from pyspark.sql.types import Row
 from typing_extensions import deprecated
 
-from fabricks.cdc import
+from fabricks.cdc import NoCDC
 from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
 from fabricks.context.log import DEFAULT_LOGGER
-from fabricks.core.jobs.base._types import Bronzes, Golds,
+from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
 from fabricks.core.jobs.get_job import get_job
 from fabricks.core.steps._types import Timeouts
 from fabricks.core.steps.get_step_conf import get_step_conf
@@ -98,53 +98,66 @@ class BaseStep:
         return self._options
 
     def drop(self):
-        DEFAULT_LOGGER.warning("
+        DEFAULT_LOGGER.warning("drop", extra={"label": self})
 
         fs = self.database.storage
         assert fs
 
         tmp = fs.joinpath("tmp")
         if tmp.exists():
+            DEFAULT_LOGGER.debug("clean tmp folder", extra={"label": self})
             tmp.rm()
 
         checkpoint = fs.joinpath("checkpoints")
         if checkpoint.exists():
+            DEFAULT_LOGGER.debug("clean checkpoint folder", extra={"label": self})
             checkpoint.rm()
 
         schema = fs.joinpath("schemas")
         if schema.exists():
+            DEFAULT_LOGGER.debug("clean schema folder", extra={"label": self})
             schema.rm()
 
+        DEFAULT_LOGGER.debug("clean fabricks", extra={"label": self})
         for t in ["jobs", "tables", "dependencies", "views"]:
             tbl = Table("fabricks", self.name, t)
             tbl.drop()
 
+        try:
+            SPARK.sql(f"delete from fabricks.steps where step = '{self}'")
+        except Exception:
+            pass
+
         self.database.drop()
 
     def create(self):
-        DEFAULT_LOGGER.info("
+        DEFAULT_LOGGER.info("create", extra={"label": self})
 
         if not self.runtime.exists():
-            DEFAULT_LOGGER.warning(f"{self.name}
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
         else:
             self.update()
 
     def update(self, update_dependencies: Optional[bool] = True, progress_bar: Optional[bool] = False):
         if not self.runtime.exists():
-            DEFAULT_LOGGER.warning(f"{self.name}
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
 
         else:
             if not self.database.exists():
                 self.database.create()
 
-            self.
-            self.create_db_objects()
+            self.update_configurations()
+            errors = self.create_db_objects()
+
+            for e in errors:
+                DEFAULT_LOGGER.exception("fail to create db object", extra={"label": e["job"]}, exc_info=e["error"])
 
             if update_dependencies:
                 self.update_dependencies(progress_bar=progress_bar)
 
             self.update_tables_list()
             self.update_views_list()
+            self.update_steps_list()
 
     def get_dependencies(
         self,
@@ -152,19 +165,8 @@ class BaseStep:
         topic: Optional[Union[str, List[str]]] = None,
         include_manual: Optional[bool] = False,
         loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
-    ) -> Tuple[DataFrame, List[
-        DEFAULT_LOGGER.debug("get dependencies", extra={"
-
-        errors = []
-        dependencies: list[JobDependency] = []
-
-        def _get_dependencies(row: Row):
-            job = get_job(step=self.name, job_id=row["job_id"])
-            try:
-                dependencies.extend(job.get_dependencies())
-            except Exception as e:
-                DEFAULT_LOGGER.exception("failed to get dependencies", extra={"job": job})
-                errors.append((job, e))
+    ) -> Tuple[DataFrame, List[Dict]]:
+        DEFAULT_LOGGER.debug("get dependencies", extra={"label": self})
 
         df = self.get_jobs()
 
@@ -176,18 +178,25 @@ class BaseStep:
             topic = [topic]
 
             where = ", ".join([f"'{t}'" for t in topic])
-            DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"
+            DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"label": self})
             df = df.where(f"topic in ({where})")
 
         if not df:
             raise ValueError("no jobs found")
 
-
-
-
-
-
-        DEFAULT_LOGGER
+        results = run_in_parallel(
+            _get_dependencies,
+            df,
+            workers=16,
+            progress_bar=progress_bar,
+            logger=DEFAULT_LOGGER,
+            loglevel=logging.CRITICAL,
+        )
+
+        errors = [res for res in results if res.get("error")]
+        dependencies = []
+        for res in [res for res in results if res.get("dependencies")]:
+            dependencies.extend(res.get("dependencies"))
 
         df = self.spark.createDataFrame([d.model_dump() for d in dependencies], SchemaDependencies) # type: ignore
         return df, errors
@@ -196,7 +205,7 @@ class BaseStep:
         return read_yaml(self.runtime, root="job", preferred_file_name=topic)
 
     def get_jobs(self, topic: Optional[str] = None) -> DataFrame:
-        DEFAULT_LOGGER.debug("get jobs", extra={"
+        DEFAULT_LOGGER.debug("get jobs", extra={"label": self})
 
         try:
             conf = get_step_conf(self.name)
@@ -216,21 +225,11 @@ class BaseStep:
             return df
 
         except AssertionError as e:
-            DEFAULT_LOGGER.exception("
+            DEFAULT_LOGGER.exception("fail to get jobs", extra={"label": self})
             raise e
 
-    def create_db_objects(self, retry: Optional[bool] = True) -> List[
-        DEFAULT_LOGGER.info("create db objects", extra={"
-
-        errors = []
-
-        def _create_db_object(row: Row):
-            job = get_job(step=self.name, job_id=row["job_id"])
-            try:
-                job.create()
-            except: # noqa E722
-                DEFAULT_LOGGER.exception("not created", extra={"job": self})
-                errors.append(job)
+    def create_db_objects(self, retry: Optional[bool] = True) -> List[Dict]:
+        DEFAULT_LOGGER.info("create db objects", extra={"label": self})
 
         df = self.get_jobs()
         table_df = self.database.get_tables()
@@ -240,22 +239,29 @@ class BaseStep:
         df = df.join(view_df, "job_id", how="left_anti")
 
         if df:
-
-
-
+            results = run_in_parallel(
+                _create_db_object,
+                df,
+                workers=16,
+                progress_bar=True,
+                logger=DEFAULT_LOGGER,
+                loglevel=logging.CRITICAL,
+            )
 
         self.update_tables_list()
         self.update_views_list()
 
+        errors = [res for res in results if res.get("error")]
+
         if errors:
             if retry:
-                DEFAULT_LOGGER.warning("retry create jobs", extra={"
+                DEFAULT_LOGGER.warning("retry to create jobs", extra={"label": self})
                 return self.create_db_objects(retry=False)
 
         return errors
 
     @deprecated("use create_db_objects instead")
-    def create_jobs(self, retry: Optional[bool] = True) -> List[
+    def create_jobs(self, retry: Optional[bool] = True) -> List[Dict]:
        return self.create_db_objects(retry=retry)
 
     @deprecated("use update_configurations instead")
@@ -265,19 +271,19 @@ class BaseStep:
     def update_configurations(self, drop: Optional[bool] = False):
         df = self.get_jobs()
 
-        DEFAULT_LOGGER.info("update configurations", extra={"
+        DEFAULT_LOGGER.info("update configurations", extra={"label": self})
 
-
+        cdc = NoCDC("fabricks", self.name, "jobs")
 
         if drop:
-
-        elif
-
-            if
-                DEFAULT_LOGGER.warning("schema drift detected", extra={"
-
+            cdc.table.drop()
+        elif cdc.table.exists():
+            df_diffs = cdc.get_differences_with_deltatable(df)
+            if not df_diffs.isEmpty():
+                DEFAULT_LOGGER.warning("schema drift detected", extra={"label": self})
+                cdc.table.overwrite_schema(df=df)
 
-
+        cdc.delete_missing(df, keys=["job_id"])
 
     @deprecated("use update_tables_list instead")
     def update_tables(self):
@@ -287,8 +293,8 @@ class BaseStep:
         df = self.database.get_tables()
         df = df.withColumn("job_id", expr("md5(table)"))
 
-        DEFAULT_LOGGER.info("update tables list", extra={"
-
+        DEFAULT_LOGGER.info("update tables list", extra={"label": self})
+        NoCDC("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
 
     @deprecated("use update_views_list instead")
     def update_views(self):
@@ -298,8 +304,8 @@ class BaseStep:
         df = self.database.get_views()
         df = df.withColumn("job_id", expr("md5(view)"))
 
-        DEFAULT_LOGGER.info("update views list", extra={"
-
+        DEFAULT_LOGGER.info("update views list", extra={"label": self})
+        NoCDC("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
 
     def update_dependencies(
         self,
@@ -307,7 +313,7 @@ class BaseStep:
         topic: Optional[Union[str, List[str]]] = None,
         include_manual: Optional[bool] = False,
         loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
-    ) -> List[
+    ) -> List[Dict]:
         df, errors = self.get_dependencies(
             progress_bar=progress_bar,
             topic=topic,
@@ -316,7 +322,7 @@ class BaseStep:
         )
         df.cache()
 
-        DEFAULT_LOGGER.info("update dependencies", extra={"
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
 
         update_where = None
 
@@ -327,9 +333,9 @@ class BaseStep:
             )
 
             if update_where:
-                DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"
+                DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
-
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
                 df,
                 keys=["dependency_id"],
                 update_where=update_where,
@@ -347,9 +353,9 @@ class BaseStep:
             update_where = (
                 f"""job_id in (select job_id from fabricks.{self.name}_jobs where {where_topic} {where_not_manual})"""
             )
-            DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"
+            DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
 
-
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
                 df,
                 keys=["dependency_id"],
                 update_where=update_where,
@@ -359,10 +365,6 @@ class BaseStep:
         return errors
 
     def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
-        def _register(row: Row):
-            job = get_job(step=self.name, topic=row["topic"], item=row["item"])
-            job.register()
-
         if drop:
             SPARK.sql(f"drop database if exists {self.name} cascade ")
             SPARK.sql(f"create database {self.name}")
@@ -378,8 +380,44 @@ class BaseStep:
 
         if df:
             DEFAULT_LOGGER.setLevel(logging.CRITICAL)
-            run_in_parallel(_register, df, workers=16, progress_bar=True)
+            run_in_parallel(_register, df, workers=16, progress_bar=True, run_as="Pool")
             DEFAULT_LOGGER.setLevel(LOGLEVEL)
 
+    def update_steps_list(self):
+        order = self.options.get("order", 0)
+        df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
+
+        NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
+
     def __str__(self):
         return self.name
+
+
+# to avoid AttributeError: can't pickle local object
+def _get_dependencies(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        return {"job": str(job), "dependencies": job.get_dependencies()}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _create_db_object(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        job.create()
+        return {"job": str(job)}
+    except Exception as e: # noqa E722
+        DEFAULT_LOGGER.exception("fail to create db object", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _register(row: Row):
+    job = get_job(step=row["step"], topic=row["topic"], item=row["item"])
+    try:
+        job.register()
+        return {"job": str(job)}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+        return {"job": str(job), "error": e}
fabricks/core/udfs.py
CHANGED
@@ -11,29 +11,25 @@ from fabricks.context.log import DEFAULT_LOGGER
 UDFS: dict[str, Callable] = {}
 
 
-def register_all_udfs():
+def register_all_udfs(extension: Optional[str] = None):
     """
     Register all user-defined functions (UDFs).
-
-    This function iterates over all UDFs returned by the `get_udfs` function,
-    splits the UDF name into the function name and extension, and attempts to
-    register the UDF using the `register_udf` function. If an exception occurs
-    during registration, an error message is logged.
-
-    Returns:
-        None
     """
-
+    DEFAULT_LOGGER.info("register udfs")
+
+    for udf in get_udfs(extension=extension):
         split = udf.split(".")
         try:
             register_udf(udf=split[0], extension=split[1])
-        except Exception:
-            DEFAULT_LOGGER.exception(f"udf {udf}
+        except Exception as e:
+            DEFAULT_LOGGER.exception(f"could not register udf {udf}", exc_info=e)
 
 
-def get_udfs() -> List[str]:
+def get_udfs(extension: Optional[str] = None) -> List[str]:
     files = [os.path.basename(f) for f in PATH_UDFS.walk()]
     udfs = [f for f in files if not str(f).endswith("__init__.py") and not str(f).endswith(".requirements.txt")]
+    if extension:
+        udfs = [f for f in udfs if f.endswith(f".{extension}")]
     return udfs
 
 
@@ -63,22 +59,15 @@ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
 
 def register_udf(udf: str, extension: Optional[str] = None, spark: Optional[SparkSession] = None):
     """
-    Register a user-defined function (UDF)
-
-    Args:
-        udf (str): The name of the UDF to register.
-        extension (Optional[str]): The file extension of the UDF implementation file. If not provided, it will be inferred from the UDF name.
-        spark (Optional[SparkSession]): The SparkSession object. If not provided, a new SparkSession will be created.
-
-    Raises:
-        ValueError: If the UDF implementation file is not found or if the UDF name is not found.
-
+    Register a user-defined function (UDF).
     """
     if spark is None:
         spark = SPARK
     assert spark is not None
 
     if not is_registered(udf, spark):
+        DEFAULT_LOGGER.debug(f"register udf {udf}")
+
         if extension is None:
             extension = get_extension(udf)
 
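The new optional extension argument threads from register_all_udfs down to get_udfs, which now filters the discovered files by suffix. A small self-contained sketch of that filter, using made-up file names in place of PATH_UDFS.walk():

# Hypothetical directory listing standing in for PATH_UDFS.walk()
files = ["clean.py", "mask.sql", "__init__.py", "clean.requirements.txt"]

udfs = [f for f in files if not f.endswith("__init__.py") and not f.endswith(".requirements.txt")]

extension = "py"  # e.g. register_all_udfs(extension="py")
if extension:
    udfs = [f for f in udfs if f.endswith(f".{extension}")]

print(udfs)  # ['clean.py'] -- the SQL UDF is skipped when only Python UDFs are requested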
fabricks/core/views.py
CHANGED
@@ -7,28 +7,35 @@ from fabricks.utils.sqlglot import fix as fix_sql
 def create_or_replace_view_internal(path: Path):
     sql = path.get_sql()
     file_name = path.get_file_name().split(".")[0]
-    sql = f"""
-    create or replace view fabricks.{file_name}
-    as
-    {sql}
-    """
-    sql = fix_sql(sql)
-    DEFAULT_LOGGER.debug(f"schedule - %sql\n---\n{sql}\n---")
 
-
+    try:
+        sql = f"""
+        create or replace view fabricks.{file_name}
+        as
+        {sql}
+        """
+        sql = fix_sql(sql)
+        DEFAULT_LOGGER.debug("create or replace (custom) view", extra={"label": f"fabricks.{file_name}", "sql": sql})
+
+        SPARK.sql(sql)
+
+    except Exception as e:
+        DEFAULT_LOGGER.exception(
+            "could not create nor replace (custom) view", extra={"label": f"fabricks.{file_name}", "exc_info": e}
+        )
+        raise e
 
 
 def create_or_replace_view(name: str):
     p = PATH_VIEWS.joinpath(f"{name}.sql")
-
-        create_or_replace_view_internal(p)
-    except Exception:
-        DEFAULT_LOGGER.warning(f"schedule - {name} not created nor replace")
+    create_or_replace_view_internal(p)
 
 
 def create_or_replace_views():
+    DEFAULT_LOGGER.info("create or replace (custom) views")
+
     for p in PATH_VIEWS.walk(file_format="sql", convert=True):
         try:
             create_or_replace_view_internal(p)
         except Exception:
-
+            pass
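The rework above also changes how failures surface: create_or_replace_view now calls the internal helper directly and lets the exception propagate, while create_or_replace_views keeps looping over the folder and swallows per-view failures. A hedged usage sketch (the view name is hypothetical; a configured fabricks runtime with an active Spark session is assumed):

from fabricks.core.views import create_or_replace_view, create_or_replace_views

# Single view: raises if the SQL file under PATH_VIEWS does not compile.
create_or_replace_view("my_custom_view")  # hypothetical view name

# Whole folder: failing views are logged by the internal helper and skipped.
create_or_replace_views()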
fabricks/deploy/__init__.py
ADDED
@@ -0,0 +1,97 @@
+import logging
+from typing import List, Optional, Union, cast
+
+from fabricks.context import FABRICKS_STORAGE
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.jobs.base._types import Steps, TStep
+from fabricks.core.schedules import create_or_replace_views as create_or_replace_schedules_views
+from fabricks.core.steps.base import BaseStep
+from fabricks.core.views import create_or_replace_views as create_or_replace_custom_views
+from fabricks.deploy.masks import deploy_masks
+from fabricks.deploy.notebooks import deploy_notebooks
+from fabricks.deploy.schedules import deploy_schedules
+from fabricks.deploy.tables import deploy_tables
+from fabricks.deploy.udfs import deploy_udfs
+from fabricks.deploy.utils import print_atomic_bomb
+from fabricks.deploy.views import deploy_views
+from fabricks.metastore.database import Database
+
+
+class Deploy:
+    @staticmethod
+    def tables(drop: bool = False):
+        deploy_tables(drop=drop)
+
+    @staticmethod
+    def views():
+        deploy_views()
+
+        create_or_replace_custom_views()
+        create_or_replace_schedules_views()
+
+    @staticmethod
+    def udfs():
+        deploy_udfs()
+
+    @staticmethod
+    def masks():
+        deploy_masks()
+
+    @staticmethod
+    def notebooks():
+        deploy_notebooks()
+
+    @staticmethod
+    def schedules():
+        deploy_schedules()
+
+    @staticmethod
+    def armageddon(steps: Optional[Union[TStep, List[TStep], str, List[str]]], nowait: bool = False):
+        DEFAULT_LOGGER.warning("!💥 armageddon 💥!")
+        print_atomic_bomb(nowait=nowait)
+
+        DEFAULT_LOGGER.setLevel(logging.INFO)
+
+        if steps is None:
+            steps = Steps
+        assert steps is not None
+
+        if isinstance(steps, str):
+            steps = [cast(TStep, steps)]
+        elif isinstance(steps, List):
+            steps = [cast(TStep, s) for s in steps]
+        elif isinstance(steps, TStep):
+            steps = [steps]
+
+        fabricks = Database("fabricks")
+        fabricks.drop()
+
+        for s in steps:
+            step = BaseStep(s)
+            step.drop()
+
+        tmp = FABRICKS_STORAGE.joinpath("tmp")
+        tmp.rm()
+
+        checkpoint = FABRICKS_STORAGE.joinpath("checkpoints")
+        checkpoint.rm()
+
+        schema = FABRICKS_STORAGE.joinpath("schemas")
+        schema.rm()
+
+        schedule = FABRICKS_STORAGE.joinpath("schedules")
+        schedule.rm()
+
+        fabricks.create()
+
+        Deploy.tables(drop=True)
+        Deploy.udfs()
+        Deploy.masks()
+        Deploy.notebooks()
+
+        for s in steps:
+            step = BaseStep(s)
+            step.create()
+
+        Deploy.views()
+        Deploy.schedules()
fabricks/deploy/notebooks.py
ADDED
@@ -0,0 +1,71 @@
+import base64
+import io
+import os
+from importlib import resources
+
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service import workspace
+
+from fabricks.context import PATH_NOTEBOOKS
+from fabricks.context.log import DEFAULT_LOGGER
+
+
+def deploy_notebook(notebook: str):
+    from fabricks.api import notebooks
+
+    DEFAULT_LOGGER.debug(f"overwrite {notebook}")
+
+    w = WorkspaceClient()
+
+    target = f"{PATH_NOTEBOOKS}/{notebook}.py"
+    src = resources.files(notebooks) / f"{notebook}.py"
+
+    with io.open(src, "rb") as file: # type: ignore
+        content = file.read()
+
+    encoded = base64.b64encode(content).decode("utf-8")
+
+    w.workspace.import_(
+        path=target,
+        content=encoded,
+        format=workspace.ImportFormat.AUTO,
+        language=workspace.Language.PYTHON,
+        overwrite=True,
+    )
+
+
+def deploy_notebooks():
+    DEFAULT_LOGGER.info("overwrite notebooks")
+
+    _create_dir_if_not_exists()
+    _clean_dir()
+
+    for n in [
+        "cluster",
+        "initialize",
+        "process",
+        "schedule",
+        "run",
+        "terminate",
+    ]:
+        deploy_notebook(notebook=n)
+
+
+def _create_dir_if_not_exists():
+    dir = str(PATH_NOTEBOOKS)
+    os.makedirs(dir, exist_ok=True)
+
+
+def _clean_dir():
+    dir = str(PATH_NOTEBOOKS)
+    for n in [
+        "cluster",
+        "initialize",
+        "process",
+        "schedule",
+        "run",
+        "terminate",
+    ]:
+        file_path = os.path.join(dir, f"{n}.py")
+        if os.path.isfile(file_path):
+            os.remove(file_path)