fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/steps/base.py ADDED
@@ -0,0 +1,282 @@
+ from typing import Optional, Union, cast
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.functions import expr, md5
+
+ from fabricks.cdc import SCD1
+ from fabricks.context import CONF_RUNTIME, PATHS_RUNTIME, PATHS_STORAGE, STEPS
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers, TStep
+ from fabricks.core.jobs.get_job import get_job
+ from fabricks.core.steps.get_step_conf import get_step_conf
+ from fabricks.core.steps.types import Timeouts
+ from fabricks.metastore.database import Database
+ from fabricks.metastore.table import Table
+ from fabricks.utils.helpers import concat_dfs, run_in_parallel
+ from fabricks.utils.read.read_yaml import read_yaml
+ from fabricks.utils.schema import get_schema_for_type
+
+
+ class BaseStep:
+     def __init__(self, step: Union[TStep, str]):
+         self.name = cast(str, step)
+
+         if self.name in Bronzes:
+             self.extend = "bronze"
+         elif self.name in Silvers:
+             self.extend = "silver"
+         elif self.name in Golds:
+             self.extend = "gold"
+         else:
+             raise ValueError(self.name, "does not extend a default job")
+
+         _storage = PATHS_STORAGE.get(self.name)
+         assert _storage
+         _runtime = PATHS_RUNTIME.get(self.name)
+         assert _runtime
+
+         self.spark = spark
+         self.storage = _storage
+         self.runtime = _runtime
+         self.database = Database(self.name)
+
+     _conf: Optional[dict] = None
+     _options: Optional[dict] = None
+
+     _workers: Optional[int] = None
+     _timeouts: Optional[Timeouts] = None
+
+     @property
+     def workers(self):
+         if not self._workers:
+             w = self.options.get("workers")
+             if w is None:
+                 w = CONF_RUNTIME.get("options", {}).get("workers")
+             assert w is not None
+             self._workers = cast(int, w)
+         return self._workers
+
+     def _get_timeout(self, what: str) -> int:
+         t = self.options.get("timeouts", {}).get(what, None)
+         if t is None:
+             t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+         assert t is not None
+         return int(t)
+
+     @property
+     def timeouts(self) -> Timeouts:
+         if not self._timeouts:
+             self._timeouts = Timeouts(
+                 job=self._get_timeout("job"),
+                 step=self._get_timeout("step"),
+             )
+         return self._timeouts
+
+     @property
+     def conf(self) -> dict:
+         if not self._conf:
+             _conf = [s for s in STEPS if s.get("name") == self.name][0]
+             assert _conf is not None
+             self._conf = cast(dict[str, str], _conf)
+         return self._conf
+
+     @property
+     def options(self) -> dict:
+         if not self._options:
+             o = self.conf.get("options")
+             assert o is not None
+             self._options = cast(dict[str, str], o)
+         return self._options
+
+     def drop(self):
+         Logger.warning("💣 (drop)", extra={"step": self})
+
+         fs = self.database.storage
+         assert fs
+
+         tmp = fs.join("tmp")
+         if tmp.exists():
+             tmp.rm()
+         checkpoint = fs.join("checkpoints")
+         if checkpoint.exists():
+             checkpoint.rm()
+         schema = fs.join("schemas")
+         if schema.exists():
+             schema.rm()
+
+         for t in ["jobs", "tables", "dependencies", "views"]:
+             tbl = Table("fabricks", self.name, t)
+             tbl.drop()
+
+         self.database.drop()
+
+     def create(self):
+         Logger.info("🌟 (create)", extra={"step": self})
+
+         if not self.runtime.exists():
+             Logger.warning(f"{self.name} not found in runtime ({self.runtime})")
+         else:
+             self.update()
+
+     def update(self, update_dependencies: Optional[bool] = True):
+         if not self.runtime.exists():
+             Logger.warning(f"{self.name} not found in runtime ({self.runtime})")
+         else:
+             if not self.database.exists():
+                 self.database.create()
+
+             self.update_jobs()
+             self.create_jobs()
+
+             if update_dependencies:
+                 self.update_dependencies()
+
+             self.update_tables()
+             self.update_views()
+
+     def get_dependencies(self) -> Optional[DataFrame]:
+         errors = []
+
+         def _get_dependencies(row: Row):
+             job = get_job(step=self.name, job_id=row["job_id"])
+             try:
+                 df = job.get_dependencies()
+             except:  # noqa E722
+                 errors.append(job)
+             return df
+
+         job_df = self.get_jobs()
+         if job_df:
+             dfs = run_in_parallel(_get_dependencies, job_df, workers=32)
+             for e in errors:
+                 Logger.error("failed to get dependencies", extra={"step": e})
+
+             if dfs:
+                 df = concat_dfs(dfs)
+                 return df if not df.isEmpty() else None
+
+     def get_jobs(self, topic: Optional[str] = None) -> Optional[DataFrame]:
+         try:
+             conf = get_step_conf(self.name)
+             schema = get_schema_for_type(conf)
+
+             df = None
+             if topic:
+                 df = read_yaml(self.runtime, root="job", schema=schema, file_name=topic)  # type: ignore
+
+             if not df:
+                 df = read_yaml(self.runtime, root="job", schema=schema)  # type: ignore
+             elif df.isEmpty():
+                 df = read_yaml(self.runtime, root="job", schema=schema)  # type: ignore
+
+             if df:
+                 df = df.withColumn("job_id", md5(expr("concat(step, '.' ,topic, '_', item)")))
+
+                 duplicated_df = df.groupBy("job_id", "step", "topic", "item").count().where("count > 1")
+                 duplicates = ",".join(f"{row.step}.{row.topic}_{row.item}" for row in duplicated_df.collect())
+                 assert duplicated_df.isEmpty(), f"duplicated job(s) ({duplicates})"
+
+             return df if not df.isEmpty() else None
+
+         except AssertionError as e:
+             Logger.exception("🙈", extra={"step": self})
+             raise e
+
+     def create_jobs(self, retry: Optional[bool] = True):
+         Logger.info("create jobs", extra={"step": self})
+
+         errors = []
+
+         def _create_job(row: Row):
+             job = get_job(step=self.name, job_id=row["job_id"])
+             try:
+                 job.create()
+             except:  # noqa E722
+                 errors.append(job)
+
+         df = self.get_jobs()
+         table_df = self.database.get_tables()
+         view_df = self.database.get_views()
+
+         if df:
+             if table_df:
+                 table_df = table_df.withColumn("job_id", expr("md5(table)"))
+                 df = df.join(table_df, "job_id", how="left_anti")
+             if view_df:
+                 view_df = view_df.withColumn("job_id", expr("md5(view)"))
+                 df = df.join(view_df, "job_id", how="left_anti")
+
+             run_in_parallel(_create_job, df)
+             if errors:
+                 for e in errors:
+                     Logger.error("not created", extra={"job": e})
+
+                 if retry:
+                     Logger.warning("retry create jobs", extra={"step": self})
+                     self.update_tables()
+                     self.update_views()
+                     self.create_jobs(retry=False)
+                 else:
+                     Logger.warning("retry failed", extra={"step": self})
+         else:
+             Logger.debug("no new job", extra={"step": self})
+
+     def update_jobs(self, drop: Optional[bool] = False):
+         df = self.get_jobs()
+         if df:
+             Logger.info("update jobs", extra={"step": self})
+             if drop:
+                 SCD1("fabricks", self.name, "jobs").table.drop()
+             SCD1("fabricks", self.name, "jobs").delete_missing(df, keys=["job_id"])
+         else:
+             Logger.debug("no job", extra={"step": self})
+
+     def update_tables(self):
+         df = self.database.get_tables()
+         if df:
+             Logger.debug("update tables", extra={"step": self})
+             df = df.withColumn("job_id", expr("md5(table)"))
+             SCD1("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+         else:
+             Logger.debug("no table", extra={"step": self})
+
+     def update_views(self):
+         df = self.database.get_views()
+         if df:
+             Logger.debug("update views", extra={"step": self})
+             df = df.withColumn("job_id", expr("md5(view)"))
+             SCD1("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+         else:
+             Logger.debug("no view", extra={"step": self})
+
+     def update_dependencies(self):
+         df = self.get_dependencies()
+         if df:
+             Logger.debug("update dependencies", extra={"step": self})
+             df.cache()
+             SCD1("fabricks", self.name, "dependencies").delete_missing(df, keys=["dependency_id"])
+         else:
+             Logger.debug("no dependency", extra={"step": self})
+
+     def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
+         def _register(row: Row):
+             job = get_job(step=self.name, topic=row["topic"], item=row["item"])
+             job.register()
+
+         if drop:
+             spark.sql(f"drop database if exists {self.name} cascade ")
+             spark.sql(f"create database {self.name}")
+         if update:
+             self.update_jobs()
+
+         df = self.get_jobs()
+         if df:
+             table_df = self.database.get_tables()
+             if table_df:
+                 df = df.join(table_df, "job_id", how="left_anti")
+             if df:
+                 run_in_parallel(_register, df, workers=16)
+
+     def __str__(self):
+         return self.name
fabricks/core/steps/get_step.py ADDED
@@ -0,0 +1,10 @@
+ from typing import Union
+
+ from fabricks.core.jobs.base.types import Steps, TStep
+ from fabricks.core.steps.base import BaseStep
+
+
+ def get_step(step: Union[TStep, str]) -> BaseStep:
+     assert step in Steps, f"{step} not found"
+     _step = BaseStep(step=step)
+     return _step
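For orientation, the two modules above form the public entry point for working with a step. A minimal, hypothetical driver (assuming a configured Fabricks runtime in which a step named `bronze` is declared in STEPS, PATHS_STORAGE and PATHS_RUNTIME) might look like this sketch:

```python
# Sketch only: the step name "bronze" is an example, not part of the package.
from fabricks.core.steps.get_step import get_step

step = get_step("bronze")            # validated against Steps, returns a BaseStep
step.update()                        # refresh jobs, dependencies, tables and views
print(step.workers, step.timeouts)   # resolved from step options or CONF_RUNTIME defaults
```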
fabricks/core/steps/get_step_conf.py ADDED
@@ -0,0 +1,33 @@
+ from typing import Union, cast
+
+ from fabricks.core.jobs.base.types import (
+     Bronzes,
+     Golds,
+     JobConfBronze,
+     JobConfGold,
+     JobConfSilver,
+     Silvers,
+     TStep,
+ )
+
+
+ def get_step_conf(step: Union[TStep, str]):
+     if isinstance(step, str):
+         step = cast(TStep, step)
+
+     if step in Bronzes:
+         extend = "bronze"
+     elif step in Silvers:
+         extend = "silver"
+     elif step in Golds:
+         extend = "gold"
+     else:
+         raise ValueError(f"{step} - not found")
+
+     job_conf = {
+         "bronze": JobConfBronze,
+         "silver": JobConfSilver,
+         "gold": JobConfGold,
+     }.get(extend, None)
+     assert job_conf
+     return job_conf
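`get_step_conf` is what `BaseStep.get_jobs` uses to derive the Spark schema for the job YAML files; a short sketch of that pairing (the step name `silver` is only an example):

```python
# Mirrors the call sequence in BaseStep.get_jobs().
from fabricks.core.steps.get_step_conf import get_step_conf
from fabricks.utils.schema import get_schema_for_type

conf_cls = get_step_conf("silver")       # -> JobConfSilver
schema = get_schema_for_type(conf_cls)   # schema used by read_yaml for job definitions
```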
fabricks/core/steps/types.py ADDED
@@ -0,0 +1,7 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class Timeouts:
+     job: int
+     step: int
fabricks/core/udfs.py ADDED
@@ -0,0 +1,106 @@
+ import importlib.util
+ import os
+ import re
+ from typing import Callable, List, Optional
+
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.sql import SparkSession
+
+ from fabricks.context import PATH_UDFS
+ from fabricks.context.log import Logger
+ from fabricks.core.site_packages import add_site_packages_to_path
+
+ UDFS: dict[str, Callable] = {}
+
+
+ def register_all_udfs():
+     """
+     Register all user-defined functions (UDFs).
+
+     This function iterates over all UDFs returned by the `get_udfs` function,
+     splits the UDF name into the function name and extension, and attempts to
+     register the UDF using the `register_udf` function. If an exception occurs
+     during registration, an error message is logged.
+
+     Returns:
+         None
+     """
+     for udf in get_udfs():
+         split = udf.split(".")
+         try:
+             register_udf(udf=split[0], extension=split[1])
+         except Exception:
+             Logger.exception(f"udf {udf} not registered")
+
+
+ def get_udfs() -> List[str]:
+     files = [os.path.basename(f) for f in PATH_UDFS.walk()]
+     udfs = [f for f in files if not str(f).endswith("__init__.py") and not str(f).endswith(".requirements.txt")]
+     return udfs
+
+
+ def get_extension(udf: str) -> str:
+     for u in get_udfs():
+         r = re.compile(rf"{udf}(\.py|\.sql)")
+         if re.match(r, u):
+             return u.split(".")[1]
+     raise ValueError(f"{udf} not found")
+
+
+ def is_registered(udf: str, spark: Optional[SparkSession] = None) -> bool:
+     if spark is None:
+         spark = _spark
+     assert spark is not None
+
+     df = spark.sql("show functions in default")
+     df = df.where(f"function == 'spark_catalog.default.udf_{udf}'")
+     return not df.isEmpty()
+
+
+ def register_udf(udf: str, extension: Optional[str] = None, spark: Optional[SparkSession] = None):
+     """
+     Register a user-defined function (UDF) in Spark.
+
+     Args:
+         udf (str): The name of the UDF to register.
+         extension (Optional[str]): The file extension of the UDF implementation file. If not provided, it will be inferred from the UDF name.
+         spark (Optional[SparkSession]): The SparkSession object. If not provided, the Databricks runtime SparkSession is used.
+
+     Raises:
+         ValueError: If the UDF implementation file is not found or if the UDF name is not found.
+
+     """
+     if spark is None:
+         spark = _spark
+     assert spark is not None
+
+     if not is_registered(udf, spark):
+         if extension is None:
+             extension = get_extension(udf)
+
+         assert extension
+         path = PATH_UDFS.join(f"{udf}.{extension}")
+         if extension == "sql":
+             spark.sql(path.get_sql())
+
+         elif extension == "py":
+             assert path.exists(), f"udf not found ({path.string})"
+             spec = importlib.util.spec_from_file_location(udf, path.string)
+             assert spec, f"no valid udf found ({path.string})"
+             spec.loader.load_module()  # type: ignore
+
+             u = UDFS[udf]
+             u(spark)
+
+         else:
+             raise ValueError(f"{udf} not found")
+
+
+ def udf(name: str):
+     add_site_packages_to_path()
+
+     def decorator(fn: Callable):
+         UDFS[name] = fn
+         return fn
+
+     return decorator
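The `udf` decorator and `register_udf` imply a small plug-in contract for Python UDF files under `PATH_UDFS`: the module is imported and the decorated callable is invoked with the active SparkSession. A hypothetical `title_case.py` (file name, function name and registration logic are illustrative assumptions, not part of the package):

```python
# Hypothetical UDF file, e.g. <udfs>/title_case.py.
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

from fabricks.core.udfs import udf


@udf("title_case")
def register(spark: SparkSession):
    # register_udf() imports this module and calls the decorated function with the
    # runtime SparkSession; the udf_ prefix matches the is_registered() lookup.
    spark.udf.register("udf_title_case", lambda s: s.title() if s else None, StringType())
```

`register_all_udfs()` would then pick the file up by name and attempt the registration, logging (rather than raising) on failure.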
fabricks/core/utils.py ADDED
@@ -0,0 +1,69 @@
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import length, lower
+ from pyspark.sql.functions import trim as _trim
+ from pyspark.sql.functions import when
+ from pyspark.sql.types import DoubleType, FloatType, IntegerType
+
+
+ def value_to_none(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(
+             c,
+             when(length(df[f"`{c}`"].cast("string")) == 0, None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "none", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "null", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "blank", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(none)", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(null)", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(blank)", None)
+             .otherwise(df[f"`{c}`"]),
+         )
+     return df
+
+
+ def decimal_to_float(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(FloatType()))
+     return df
+
+
+ def decimal_to_double(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(DoubleType()))
+     return df
+
+
+ def tinyint_to_int(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("tinyint") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(IntegerType()))
+     return df
+
+
+ def trim(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("string") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, _trim(df[f"`{c}`"]))
+     return df
+
+
+ def clean(df: DataFrame) -> DataFrame:
+     """
+     Cleans the given DataFrame by performing the following operations:
+     1. Trims whitespace from all string columns.
+     2. Converts empty strings to None.
+     3. Converts decimal values to double.
+
+     Args:
+         df (DataFrame): The Spark DataFrame to be cleaned.
+
+     Returns:
+         DataFrame: The cleaned DataFrame.
+     """
+     df = trim(df)
+     df = value_to_none(df)
+     df = decimal_to_double(df)
+     return df
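A quick sketch of how these helpers compose (assuming an active Spark session; the sample data is invented):

```python
from pyspark.sql import SparkSession

from fabricks.core.utils import clean

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("  a  ", "null"), ("b", "x")], ["col1", "col2"])
# trim() strips the whitespace, value_to_none() turns the literal "null" into NULL,
# decimal_to_double() is a no-op here because there are no decimal columns.
clean(df).show()
```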
fabricks/core/views.py ADDED
@@ -0,0 +1,36 @@
+ from databricks.sdk.runtime import spark
+
+ from fabricks.context import PATH_VIEWS
+ from fabricks.context.log import Logger
+ from fabricks.utils.path import Path
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ def _create_or_replace_view(path: Path):
+     sql = path.get_sql()
+     file_name = path.get_file_name().split(".")[0]
+     sql = f"""
+     create or replace view fabricks.{file_name}
+     as
+     {sql}
+     """
+     sql = fix_sql(sql)
+     Logger.debug(f"schedule - %sql\n---\n{sql}\n---")
+
+     spark.sql(sql)
+
+
+ def create_or_replace_view(name: str):
+     p = PATH_VIEWS.join(f"{name}.sql")
+     try:
+         _create_or_replace_view(p)
+     except Exception:
+         Logger.warning(f"schedule - {name} not created nor replaced")
+
+
+ def create_or_replace_views():
+     for p in PATH_VIEWS.walk(file_format="sql", convert=True):
+         try:
+             _create_or_replace_view(p)
+         except Exception:
+             Logger.warning(f"schedule - {p.get_file_name()} not created nor replaced")
fabricks/metastore/README.md ADDED
@@ -0,0 +1,3 @@
+ # BMS DNA Fabricks Metastore
+
+ Metastore - Fabricks
fabricks/metastore/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from fabricks.metastore.database import Database
+ from fabricks.metastore.table import Table
+ from fabricks.metastore.view import View
+
+ __all__ = ["Database", "Table", "View"]
fabricks/metastore/database.py ADDED
@@ -0,0 +1,71 @@
+ from typing import Optional
+
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.sql import DataFrame, SparkSession
+ from typing_extensions import deprecated
+
+ from fabricks.context import PATHS_STORAGE
+ from fabricks.context.log import Logger
+ from fabricks.metastore.utils import get_tables, get_views
+ from fabricks.utils.path import Path
+
+
+ class Database:
+     def __init__(self, name: str, spark: Optional[SparkSession] = None):
+         self.name = name
+         storage = PATHS_STORAGE.get(self.name)
+         assert storage is not None
+         self.storage = storage
+         if spark is None:
+             spark = _spark
+         assert spark is not None
+         self.spark = spark
+
+     @property
+     @deprecated("use delta_path instead")
+     def deltapath(self) -> Path:
+         return self.storage.join("delta")
+
+     @property
+     def delta_path(self) -> Path:
+         return self.storage.join("delta")
+
+     def create(self):
+         Logger.info("🌟 (create database)", extra={"step": self})
+         self.spark.sql(f"create database if not exists {self.name};")
+
+     def drop(self, rm: Optional[bool] = True):
+         if self.exists():
+             Logger.warning("💣 (drop database)", extra={"step": self})
+             self.spark.sql(f"drop database if exists {self.name} cascade;")
+
+         if rm:
+             if self.deltapath.exists():
+                 Logger.debug("🧹 (remove delta files)", extra={"step": self})
+                 self.deltapath.rm()
+
+     def exists(self) -> bool:
+         try:
+             self.spark.sql(f"show tables in {self.name}")
+         # database not found
+         except AnalysisException:
+             return False
+         return True
+
+     def __str__(self):
+         return self.name
+
+     def get_tables(self) -> Optional[DataFrame]:
+         try:
+             df = get_tables(self.name)
+             return df if not df.isEmpty() else None
+         except AnalysisException:
+             return None
+
+     def get_views(self) -> Optional[DataFrame]:
+         try:
+             df = get_views(self.name)
+             return df if not df.isEmpty() else None
+         except AnalysisException:
+             return None
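A short usage sketch of the `Database` wrapper (assuming `bronze` is present in `PATHS_STORAGE`; the name is an example):

```python
from fabricks.metastore import Database

db = Database("bronze")      # storage path resolved from PATHS_STORAGE
if not db.exists():
    db.create()              # create database if not exists bronze
tables = db.get_tables()     # None when the database has no tables or does not exist
db.drop(rm=True)             # drop the database and remove <storage>/delta
```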
fabricks/metastore/pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [build-system]
+ requires = [ "poetry_core>=1.0.0",]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.poetry]
+ name = "fabricks-metastore"
+ version = "2024.7.1.5"
+ description = "Fabricks - Metastore"
+ license = "MIT"
+ authors = [ "BMS DWH Team <bi_support@bmsuisse.ch>",]
+ readme = "README.md"
+ packages = [{include="fabricks"}]
+
+ [tool.black]
+ line-length = 119
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<4"
+ "fabricks.utils" = { path = "../utils", develop = true }
+ "fabricks.context" = { path = "../context", develop = true }