fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/steps/base.py
ADDED

@@ -0,0 +1,282 @@
+from typing import Optional, Union, cast
+
+from databricks.sdk.runtime import spark
+from pyspark.sql import DataFrame, Row
+from pyspark.sql.functions import expr, md5
+
+from fabricks.cdc import SCD1
+from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
+from fabricks.context.log import Logger
+from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers, TStep
+from fabricks.core.jobs.get_job import get_job
+from fabricks.core.steps.get_step_conf import get_step_conf
+from fabricks.core.steps.types import Timeouts
+from fabricks.metastore.database import Database
+from fabricks.metastore.table import Table
+from fabricks.utils.helpers import concat_dfs, run_in_parallel
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class BaseStep:
+    def __init__(self, step: Union[TStep, str]):
+        self.name = cast(str, step)
+
+        if self.name in Bronzes:
+            self.extend = "bronze"
+        elif self.name in Silvers:
+            self.extend = "silver"
+        elif self.name in Golds:
+            self.extend = "gold"
+        else:
+            raise ValueError(self.name, "does not extend a default job")
+
+        _storage = PATHS_STORAGE.get(self.name)
+        assert _storage
+        _runtime = PATHS_RUNTIME.get(self.name)
+        assert _runtime
+
+        self.spark = spark
+        self.storage = _storage
+        self.runtime = _runtime
+        self.database = Database(self.name)
+
+    _conf: Optional[dict] = None
+    _options: Optional[dict] = None
+
+    _workers: Optional[int] = None
+    _timeouts: Optional[Timeouts] = None
+
+    @property
+    def workers(self):
+        if not self._workers:
+            w = self.options.get("workers")
+            if w is None:
+                w = CONF_RUNTIME.get("options", {}).get("workers")
+            assert w is not None
+            self._workers = cast(int, w)
+        return self._workers
+
+    def _get_timeout(self, what: str) -> int:
+        t = self.options.get("timeouts", {}).get(what, None)
+        if t is None:
+            t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+        assert t is not None
+        return int(t)
+
+    @property
+    def timeouts(self) -> Timeouts:
+        if not self._timeouts:
+            self._timeouts = Timeouts(
+                job=self._get_timeout("job"),
+                step=self._get_timeout("step"),
+            )
+        return self._timeouts
+
+    @property
+    def conf(self) -> dict:
+        if not self._conf:
+            _conf = [s for s in STEPS if s.get("name") == self.name][0]
+            assert _conf is not None
+            self._conf = cast(dict[str, str], _conf)
+        return self._conf
+
+    @property
+    def options(self) -> dict:
+        if not self._options:
+            o = self.conf.get("options")
+            assert o is not None
+            self._options = cast(dict[str, str], o)
+        return self._options
+
+    def drop(self):
+        Logger.warning("💣 (drop)", extra={"step": self})
+
+        fs = self.database.storage
+        assert fs
+
+        tmp = fs.join("tmp")
+        if tmp.exists():
+            tmp.rm()
+        checkpoint = fs.join("checkpoints")
+        if checkpoint.exists():
+            checkpoint.rm()
+        schema = fs.join("schemas")
+        if schema.exists():
+            schema.rm()
+
+        for t in ["jobs", "tables", "dependencies", "views"]:
+            tbl = Table("fabricks", self.name, t)
+            tbl.drop()
+
+        self.database.drop()
+
+    def create(self):
+        Logger.info("๐ (create)", extra={"step": self})
+
+        if not self.runtime.exists():
+            Logger.warning(f"{self.name} not found in runtime ({self.runtime})")
+        else:
+            self.update()
+
+    def update(self, update_dependencies: Optional[bool] = True):
+        if not self.runtime.exists():
+            Logger.warning(f"{self.name} not found in runtime ({self.runtime})")
+        else:
+            if not self.database.exists():
+                self.database.create()
+
+            self.update_jobs()
+            self.create_jobs()
+
+            if update_dependencies:
+                self.update_dependencies()
+
+            self.update_tables()
+            self.update_views()
+
+    def get_dependencies(self) -> Optional[DataFrame]:
+        errors = []
+
+        def _get_dependencies(row: Row):
+            job = get_job(step=self.name, job_id=row["job_id"])
+            try:
+                df = job.get_dependencies()
+            except:  # noqa E722
+                errors.append(job)
+            return df
+
+        job_df = self.get_jobs()
+        if job_df:
+            dfs = run_in_parallel(_get_dependencies, job_df, workers=32)
+            for e in errors:
+                Logger.error("failed to get dependencies", extra={"step": e})
+
+            if dfs:
+                df = concat_dfs(dfs)
+                return df if not df.isEmpty() else None
+
+    def get_jobs(self, topic: Optional[str] = None) -> Optional[DataFrame]:
+        try:
+            conf = get_step_conf(self.name)
+            schema = get_schema_for_type(conf)
+
+            df = None
+            if topic:
+                df = read_yaml(self.runtime, root="job", schema=schema, file_name=topic)  # type: ignore
+
+            if not df:
+                df = read_yaml(self.runtime, root="job", schema=schema)  # type: ignore
+            elif df.isEmpty():
+                df = read_yaml(self.runtime, root="job", schema=schema)  # type: ignore
+
+            if df:
+                df = df.withColumn("job_id", md5(expr("concat(step, '.' ,topic, '_', item)")))
+
+                duplicated_df = df.groupBy("job_id", "step", "topic", "item").count().where("count > 1")
+                duplicates = ",".join(f"{row.step}.{row.topic}_{row.item}" for row in duplicated_df.collect())
+                assert duplicated_df.isEmpty(), f"duplicated job(s) ({duplicates})"
+
+                return df if not df.isEmpty() else None
+
+        except AssertionError as e:
+            Logger.exception("๐", extra={"step": self})
+            raise e
+
+    def create_jobs(self, retry: Optional[bool] = True):
+        Logger.info("create jobs", extra={"step": self})
+
+        errors = []
+
+        def _create_job(row: Row):
+            job = get_job(step=self.name, job_id=row["job_id"])
+            try:
+                job.create()
+            except:  # noqa E722
+                errors.append(job)
+
+        df = self.get_jobs()
+        table_df = self.database.get_tables()
+        view_df = self.database.get_views()
+
+        if df:
+            if table_df:
+                table_df = table_df.withColumn("job_id", expr("md5(table)"))
+                df = df.join(table_df, "job_id", how="left_anti")
+            if view_df:
+                view_df = view_df.withColumn("job_id", expr("md5(view)"))
+                df = df.join(view_df, "job_id", how="left_anti")
+
+            run_in_parallel(_create_job, df)
+            if errors:
+                for e in errors:
+                    Logger.error("not created", extra={"job": e})
+
+                if retry:
+                    Logger.warning("retry create jobs", extra={"step": self})
+                    self.update_tables()
+                    self.update_views()
+                    self.create_jobs(retry=False)
+                else:
+                    Logger.warning("retry failed", extra={"step": self})
+        else:
+            Logger.debug("no new job", extra={"step": self})
+
+    def update_jobs(self, drop: Optional[bool] = False):
+        df = self.get_jobs()
+        if df:
+            Logger.info("update jobs", extra={"step": self})
+            if drop:
+                SCD1("fabricks", self.name, "jobs").table.drop()
+            SCD1("fabricks", self.name, "jobs").delete_missing(df, keys=["job_id"])
+        else:
+            Logger.debug("no job", extra={"step": self})
+
+    def update_tables(self):
+        df = self.database.get_tables()
+        if df:
+            Logger.debug("update tables", extra={"step": self})
+            df = df.withColumn("job_id", expr("md5(table)"))
+            SCD1("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+        else:
+            Logger.debug("no table", extra={"step": self})
+
+    def update_views(self):
+        df = self.database.get_views()
+        if df:
+            Logger.debug("update views", extra={"step": self})
+            df = df.withColumn("job_id", expr("md5(view)"))
+            SCD1("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+        else:
+            Logger.debug("no view", extra={"step": self})
+
+    def update_dependencies(self):
+        df = self.get_dependencies()
+        if df:
+            Logger.debug("update dependencies", extra={"step": self})
+            df.cache()
+            SCD1("fabricks", self.name, "dependencies").delete_missing(df, keys=["dependency_id"])
+        else:
+            Logger.debug("no dependency", extra={"step": self})
+
+    def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
+        def _register(row: Row):
+            job = get_job(step=self.name, topic=row["topic"], item=row["item"])
+            job.register()
+
+        if drop:
+            spark.sql(f"drop database if exists {self.name} cascade ")
+            spark.sql(f"create database {self.name}")
+        if update:
+            self.update_jobs()
+
+        df = self.get_jobs()
+        if df:
+            table_df = self.database.get_tables()
+            if table_df:
+                df = df.join(table_df, "job_id", how="left_anti")
+            if df:
+                run_in_parallel(_register, df, workers=16)
+
+    def __str__(self):
+        return self.name
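
The `get_jobs` method above reads the step's job definitions from YAML and derives a deterministic `job_id` as the MD5 of `step.topic_item`, then asserts that no two definitions collide. A minimal sketch of that derivation in plain PySpark (the sample rows and the running Spark session are assumptions, not part of the package):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, md5

spark = SparkSession.builder.getOrCreate()

# hypothetical job definitions, shaped like the rows read from the job YAML files
df = spark.createDataFrame(
    [("bronze", "sales", "orders"), ("bronze", "sales", "customers")],
    ["step", "topic", "item"],
)

# same derivation as BaseStep.get_jobs: job_id = md5("<step>.<topic>_<item>")
df = df.withColumn("job_id", md5(expr("concat(step, '.', topic, '_', item)")))
df.show(truncate=False)
```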

fabricks/core/steps/get_step.py
ADDED

@@ -0,0 +1,10 @@
+from typing import Union
+
+from fabricks.core.jobs.base.types import Steps, TStep
+from fabricks.core.steps.base import BaseStep
+
+
+def get_step(step: Union[TStep, str]) -> BaseStep:
+    assert step in Steps, f"{step} not found"
+    _step = BaseStep(step=step)
+    return _step
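
A rough usage sketch for `get_step`, assuming a configured Fabricks runtime in which a step named "bronze" is declared (the step name is illustrative):

```python
from fabricks.core.steps.get_step import get_step

step = get_step("bronze")  # asserts that "bronze" is a known step

# create the step's database and metadata tables, then refresh them
# after the job YAML files change
step.create()
step.update(update_dependencies=True)
```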

fabricks/core/steps/get_step_conf.py
ADDED

@@ -0,0 +1,33 @@
+from typing import Union, cast
+
+from fabricks.core.jobs.base.types import (
+    Bronzes,
+    Golds,
+    JobConfBronze,
+    JobConfGold,
+    JobConfSilver,
+    Silvers,
+    TStep,
+)
+
+
+def get_step_conf(step: Union[TStep, str]):
+    if isinstance(step, str):
+        step = cast(TStep, step)
+
+    if step in Bronzes:
+        extend = "bronze"
+    elif step in Silvers:
+        extend = "silver"
+    elif step in Golds:
+        extend = "gold"
+    else:
+        raise ValueError(f"{step} - not found")
+
+    job_conf = {
+        "bronze": JobConfBronze,
+        "silver": JobConfSilver,
+        "gold": JobConfGold,
+    }.get(extend, None)
+    assert job_conf
+    return job_conf
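
`get_step_conf` is what `BaseStep.get_jobs` uses to turn a step name into the matching job-configuration class before deriving a Spark schema from it; a minimal sketch (again with a hypothetical "bronze" step):

```python
from fabricks.core.steps.get_step_conf import get_step_conf
from fabricks.utils.schema import get_schema_for_type

conf_cls = get_step_conf("bronze")      # -> JobConfBronze
schema = get_schema_for_type(conf_cls)  # schema used when reading the job YAML files
```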

fabricks/core/udfs.py
ADDED

@@ -0,0 +1,106 @@
+import importlib.util
+import os
+import re
+from typing import Callable, List, Optional
+
+from databricks.sdk.runtime import spark as _spark
+from pyspark.sql import SparkSession
+
+from fabricks.context import PATH_UDFS
+from fabricks.context.log import Logger
+from fabricks.core.site_packages import add_site_packages_to_path
+
+UDFS: dict[str, Callable] = {}
+
+
+def register_all_udfs():
+    """
+    Register all user-defined functions (UDFs).
+
+    This function iterates over all UDFs returned by the `get_udfs` function,
+    splits the UDF name into the function name and extension, and attempts to
+    register the UDF using the `register_udf` function. If an exception occurs
+    during registration, an error message is logged.
+
+    Returns:
+        None
+    """
+    for udf in get_udfs():
+        split = udf.split(".")
+        try:
+            register_udf(udf=split[0], extension=split[1])
+        except Exception:
+            Logger.exception(f"udf {udf} not registered")
+
+
+def get_udfs() -> List[str]:
+    files = [os.path.basename(f) for f in PATH_UDFS.walk()]
+    udfs = [f for f in files if not str(f).endswith("__init__.py") and not str(f).endswith(".requirements.txt")]
+    return udfs
+
+
+def get_extension(udf: str) -> str:
+    for u in get_udfs():
+        r = re.compile(rf"{udf}(\.py|\.sql)")
+        if re.match(r, u):
+            return u.split(".")[1]
+    raise ValueError(f"{udf} not found")
+
+
+def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
+    if spark is None:
+        spark = _spark
+    assert spark is not None
+
+    df = spark.sql("show functions in default")
+    df = df.where(f"function == 'spark_catalog.default.udf_{udf}'")
+    return not df.isEmpty()
+
+
+def register_udf(udf: str, extension: Optional[str] = None, spark: Optional[SparkSession] = None):
+    """
+    Register a user-defined function (UDF) in Spark.
+
+    Args:
+        udf (str): The name of the UDF to register.
+        extension (Optional[str]): The file extension of the UDF implementation file. If not provided, it will be inferred from the UDF name.
+        spark (Optional[SparkSession]): The SparkSession object. If not provided, the Databricks runtime SparkSession is used.
+
+    Raises:
+        ValueError: If the UDF implementation file is not found or if the UDF name is not found.
+
+    """
+    if spark is None:
+        spark = _spark
+    assert spark is not None
+
+    if not is_registered(udf, spark):
+        if extension is None:
+            extension = get_extension(udf)
+
+        assert extension
+        path = PATH_UDFS.join(f"{udf}.{extension}")
+        if extension == "sql":
+            spark.sql(path.get_sql())
+
+        elif extension == "py":
+            assert path.exists(), f"udf not found ({path.string})"
+            spec = importlib.util.spec_from_file_location(udf, path.string)
+            assert spec, f"no valid udf found ({path.string})"
+            spec.loader.load_module()  # type: ignore
+
+            u = UDFS[udf]
+            u(spark)
+
+        else:
+            raise ValueError(f"{udf} not found")
+
+
+def udf(name: str):
+    add_site_packages_to_path()
+
+    def decorator(fn: Callable):
+        UDFS[name] = fn
+        return fn
+
+    return decorator
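
Based on the flow above, `register_udf` imports `<name>.py` from `PATH_UDFS` and then calls the callable stored in `UDFS[<name>]` with the SparkSession, while `is_registered` looks for a Spark function named `udf_<name>`. A UDF module is therefore expected to look roughly like the sketch below; the file name `title_case.py` and the function body are illustrative, not taken from the package:

```python
# title_case.py -- hypothetical UDF module placed under PATH_UDFS
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

from fabricks.core.udfs import udf


@udf("title_case")
def register(spark: SparkSession):
    # register a plain Python UDF under the udf_ prefix that is_registered checks for
    spark.udf.register("udf_title_case", lambda s: s.title() if s else None, StringType())
```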

fabricks/core/utils.py
ADDED

@@ -0,0 +1,69 @@
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import length, lower
+from pyspark.sql.functions import trim as _trim
+from pyspark.sql.functions import when
+from pyspark.sql.types import DoubleType, FloatType, IntegerType
+
+
+def value_to_none(df: DataFrame) -> DataFrame:
+    cols = [name for name, dtype in df.dtypes if not name.startswith("__")]
+    for c in cols:
+        df = df.withColumn(
+            c,
+            when(length(df[f"`{c}`"].cast("string")) == 0, None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "none", None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "null", None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "blank", None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "(none)", None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "(null)", None)
+            .when(lower(df[f"`{c}`"].cast("string")) == "(blank)", None)
+            .otherwise(df[f"`{c}`"]),
+        )
+    return df
+
+
+def decimal_to_float(df: DataFrame) -> DataFrame:
+    cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+    for c in cols:
+        df = df.withColumn(c, df[f"`{c}`"].cast(FloatType()))
+    return df
+
+
+def decimal_to_double(df: DataFrame) -> DataFrame:
+    cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+    for c in cols:
+        df = df.withColumn(c, df[f"`{c}`"].cast(DoubleType()))
+    return df
+
+
+def tinyint_to_int(df: DataFrame) -> DataFrame:
+    cols = [name for name, dtype in df.dtypes if dtype.startswith("tinyint") and not name.startswith("__")]
+    for c in cols:
+        df = df.withColumn(c, df[f"`{c}`"].cast(IntegerType()))
+    return df
+
+
+def trim(df: DataFrame) -> DataFrame:
+    cols = [name for name, dtype in df.dtypes if dtype.startswith("string") and not name.startswith("__")]
+    for c in cols:
+        df = df.withColumn(c, _trim(df[f"`{c}`"]))
+    return df
+
+
+def clean(df: DataFrame) -> DataFrame:
+    """
+    Cleans the given DataFrame by performing the following operations:
+    1. Trims whitespace from all string columns.
+    2. Converts empty strings (and literal "none"/"null"/"blank" markers) to None.
+    3. Converts decimal columns to double.
+
+    Args:
+        df (DataFrame): The Spark DataFrame to be cleaned.
+
+    Returns:
+        DataFrame: The cleaned DataFrame.
+    """
+    df = trim(df)
+    df = value_to_none(df)
+    df = decimal_to_double(df)
+    return df
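
A small, self-contained illustration of `clean` (it needs a running Spark session; the sample data is made up). Trimming, placeholder-to-null conversion, and decimal-to-double casting are applied in that order:

```python
from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql.types import DecimalType, StringType, StructField, StructType

from fabricks.core.utils import clean

spark = SparkSession.builder.getOrCreate()

schema = StructType(
    [
        StructField("name", StringType()),
        StructField("amount", DecimalType(10, 2)),
    ]
)
df = spark.createDataFrame([("  Alice ", Decimal("1.50")), ("null", None)], schema)

clean(df).show()  # "  Alice " -> "Alice", "null" -> NULL, amount cast to double
```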

fabricks/core/views.py
ADDED

@@ -0,0 +1,36 @@
+from databricks.sdk.runtime import spark
+
+from fabricks.context import PATH_VIEWS
+from fabricks.context.log import Logger
+from fabricks.utils.path import Path
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+def _create_or_replace_view(path: Path):
+    sql = path.get_sql()
+    file_name = path.get_file_name().split(".")[0]
+    sql = f"""
+    create or replace view fabricks.{file_name}
+    as
+    {sql}
+    """
+    sql = fix_sql(sql)
+    Logger.debug(f"schedule - %sql\n---\n{sql}\n---")
+
+    spark.sql(sql)
+
+
+def create_or_replace_view(name: str):
+    p = PATH_VIEWS.join(f"{name}.sql")
+    try:
+        _create_or_replace_view(p)
+    except Exception:
+        Logger.warning(f"schedule - {name} not created nor replaced")
+
+
+def create_or_replace_views():
+    for p in PATH_VIEWS.walk(file_format="sql", convert=True):
+        try:
+            _create_or_replace_view(p)
+        except Exception:
+            Logger.warning(f"schedule - {p.get_file_name()} not created nor replaced")
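
For context, `_create_or_replace_view` wraps the SQL found in `<name>.sql` under `PATH_VIEWS` in a `create or replace view fabricks.<name> as ...` statement. A hedged usage sketch (the view name `last_schedule` is hypothetical):

```python
from fabricks.core.views import create_or_replace_view, create_or_replace_views

# assumes a file named last_schedule.sql exists under PATH_VIEWS;
# its query is wrapped in "create or replace view fabricks.last_schedule as ..."
create_or_replace_view("last_schedule")

# or sweep every *.sql file found under PATH_VIEWS
create_or_replace_views()
```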

fabricks/metastore/database.py
ADDED

@@ -0,0 +1,71 @@
+from typing import Optional
+
+from databricks.sdk.runtime import spark as _spark
+from pyspark.errors.exceptions.base import AnalysisException
+from pyspark.sql import DataFrame, SparkSession
+from typing_extensions import deprecated
+
+from fabricks.context import PATHS_STORAGE
+from fabricks.context.log import Logger
+from fabricks.metastore.utils import get_tables, get_views
+from fabricks.utils.path import Path
+
+
+class Database:
+    def __init__(self, name: str, spark: Optional[SparkSession] = None):
+        self.name = name
+        storage = PATHS_STORAGE.get(self.name)
+        assert storage is not None
+        self.storage = storage
+        if spark is None:
+            spark = _spark
+        assert spark is not None
+        self.spark = spark
+
+    @property
+    @deprecated("use delta_path instead")
+    def deltapath(self) -> Path:
+        return self.storage.join("delta")
+
+    @property
+    def delta_path(self) -> Path:
+        return self.storage.join("delta")
+
+    def create(self):
+        Logger.info("๐ (create database)", extra={"step": self})
+        self.spark.sql(f"create database if not exists {self.name};")
+
+    def drop(self, rm: Optional[bool] = True):
+        if self.exists():
+            Logger.warning("💣 (drop database)", extra={"step": self})
+            self.spark.sql(f"drop database if exists {self.name} cascade;")
+
+        if rm:
+            if self.deltapath.exists():
+                Logger.debug("🧹 (remove delta files)", extra={"step": self})
+                self.deltapath.rm()
+
+    def exists(self) -> bool:
+        try:
+            self.spark.sql(f"show tables in {self.name}")
+        # database not found
+        except AnalysisException:
+            return False
+        return True
+
+    def __str__(self):
+        return self.name
+
+    def get_tables(self) -> Optional[DataFrame]:
+        try:
+            df = get_tables(self.name)
+            return df if not df.isEmpty() else None
+        except AnalysisException:
+            return None
+
+    def get_views(self) -> Optional[DataFrame]:
+        try:
+            df = get_views(self.name)
+            return df if not df.isEmpty() else None
+        except AnalysisException:
+            return None
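
A brief usage sketch for `Database`, assuming the name passed in (here "bronze", an illustrative value) has an entry in `PATHS_STORAGE`:

```python
from fabricks.metastore.database import Database

db = Database("bronze")

if not db.exists():
    db.create()

tables = db.get_tables()  # None when the database has no tables yet
print(db.delta_path)      # <storage root>/delta
```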

fabricks/metastore/pyproject.toml
ADDED

@@ -0,0 +1,20 @@
+[build-system]
+requires = [ "poetry_core>=1.0.0",]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+name = "fabricks-metastore"
+version = "2024.7.1.5"
+description = "Fabricks - Metastore"
+license = "MIT"
+authors = [ "BMS DWH Team <bi_support@bmsuisse.ch>",]
+readme = "README.md"
+packages = [{include="fabricks"}]
+
+[tool.black]
+line-length = 119
+
+[tool.poetry.dependencies]
+python = ">=3.9,<4"
+"fabricks.utils" = { path = "../utils", develop = true }
+"fabricks.context" = { path = "../context", develop = true }