fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/silver.py ADDED
@@ -0,0 +1,373 @@
+ from typing import Optional, Sequence, Union, cast
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr
+ from pyspark.sql.types import Row
+
+ from fabricks.cdc.nocdc import NoCDC
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.bronze import Bronze
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.helpers import concat_dfs
+ from fabricks.utils.read.read import read
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Silver(BaseJob):
+     def __init__(
+         self,
+         step: TSilver,
+         topic: Optional[str] = None,
+         item: Optional[str] = None,
+         job_id: Optional[str] = None,
+         conf: Optional[Union[dict, Row]] = None,
+     ):  # type: ignore
+         super().__init__(
+             "silver",
+             step=step,
+             topic=topic,
+             item=item,
+             job_id=job_id,
+             conf=conf,
+         )
+
+     _parent_step: Optional[TBronze] = None
+     _stream: Optional[bool] = None
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
+         return cls(step=cast(TSilver, step), job_id=job_id, conf=conf)
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
+         return cls(step=cast(TSilver, step), topic=topic, item=item, conf=conf)
+
+     @property
+     def stream(self) -> bool:
+         if not self._stream:
+             _stream = self.options.job.get("stream")
+             if _stream is None:
+                 _stream = self.step_conf.get("options", {}).get("stream")
+             self._stream = _stream if _stream is not None else True
+         return self._stream  # type: ignore
+
+     @property
+     def schema_drift(self) -> bool:
+         return True
+
+     @property
+     def persist(self) -> bool:
+         return self.mode in ["update", "append", "latest"]
+
+     @property
+     def virtual(self) -> bool:
+         return self.mode in ["combine", "memory"]
+
+     @property
+     def parent_step(self) -> TBronze:
+         if not self._parent_step:
+             _parent_step = self.step_conf.get("options", {}).get("parent")
+             _parent_step = cast(TBronze, _parent_step)
+             assert _parent_step is not None
+             self._parent_step = _parent_step
+         return self._parent_step
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extend)
+
+         if "__metadata" in df.columns:
+             df = df.withColumn(
+                 "__metadata",
+                 expr(
+                     """
+                     struct(
+                         __metadata.file_path as file_path,
+                         __metadata.file_name as file_name,
+                         __metadata.file_size as file_size,
+                         __metadata.file_modification_time as file_modification_time,
+                         __metadata.inserted as inserted,
+                         cast(current_timestamp() as timestamp) as updated
+                     )
+                     """
+                 ),
+             )
+         return df
+
+     def get_data(
+         self,
+         stream: bool = False,
+         transform: Optional[bool] = False,
+         schema_only: Optional[bool] = False,
+         **kwargs,
+     ) -> DataFrame:
+         deps = self.get_dependencies()
+         assert deps, "no dependency found"
+
+         if self.mode == "memory":
+             assert len(deps) == 1, f"more than 1 dependency not allowed ({deps})"
+
+             parent = deps[0].parent
+             df = self.spark.sql(f"select * from {parent}")
+
+         elif self.mode == "combine":
+             dfs = []
+
+             for row in sorted(deps, key=lambda x: x.parent_id):
+                 df = self.spark.sql(f"select * from {row.parent}")
+                 dfs.append(df)
+
+             df = concat_dfs(dfs)
+             assert df is not None
+
+         else:
+             dfs = []
+
+             for row in sorted(deps, key=lambda x: x.parent_id):
+                 try:
+                     bronze = Bronze.from_job_id(step=self.parent_step, job_id=row.parent_id)
+                     if bronze.mode in ["memory", "register"]:
+                         # data already transformed if bronze is persisted
+                         df = bronze.get_data(stream=stream, transform=True)
+                     else:
+                         df = read(
+                             stream=stream,
+                             path=bronze.table.deltapath,
+                             file_format="delta",
+                             metadata=False,
+                             spark=self.spark,
+                         )
+
+                     if df:
+                         if len(deps) > 1:
+                             assert "__source" in df.columns, "__source not found"
+                         dfs.append(df)
+
+                 except Exception as e:
+                     DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": self})
+                     raise e
+
+             df = concat_dfs(dfs)
+             assert df is not None
+
+         # transforms
+         df = self.filter_where(df)
+         df = self.encrypt(df)
+         if transform:
+             df = self.base_transform(df)
+
+         if schema_only:
+             df = df.where("1 == 2")
+
+         return df
+
+     def get_dependencies(self) -> Sequence[JobDependency]:
+         dependencies = []
+
+         parents = self.options.job.get_list("parents") or []
+         if parents:
+             for p in parents:
+                 dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))
+
+         else:
+             p = f"{self.parent_step}.{self.topic}_{self.item}"
+             dependencies.append(JobDependency.from_parts(self.job_id, p, "parser"))
+
+         return dependencies
+
+     def create_or_replace_view(self):
+         assert self.mode in ["memory", "combine"], f"{self.mode} not allowed"
+
+         deps = self.get_dependencies()
+         assert deps, "dependency not found"
+
+         if self.mode == "combine":
+             queries = []
+
+             for row in deps:
+                 columns = self.get_data().columns
+                 df = self.spark.sql(f"select * from {row.parent}")
+                 cols = [f"`{c}`" if c in df.columns else f"null as `{c}`" for c in columns if c not in ["__source"]]
+                 source = "__source" if "__source" in df.columns else f"'{row.parent}' as __source"
+                 query = f"select {', '.join(cols)}, {source} from {row.parent}"
+                 queries.append(query)
+
+             sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
+             sql = fix_sql(sql)
+             DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
+             self.spark.sql(sql)
+
+         else:
+             assert len(deps) == 1, "only one dependency allowed"
+
+             parent = deps[0].parent
+             sql = f"select * from {parent}"
+             sql = fix_sql(sql)
+             DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
+
+             df = self.spark.sql(sql)
+             cdc_options = self.get_cdc_context(df)
+             self.cdc.create_or_replace_view(sql, **cdc_options)
+
+     def create_or_replace_current_view(self):
+         from py4j.protocol import Py4JJavaError
+
+         try:
+             DEFAULT_LOGGER.debug("create or replace current view", extra={"label": self})
+
+             df = self.spark.sql(f"select * from {self.qualified_name}")
+
+             where_clause = "-- no where clause"
+             if "__is_current" in df.columns:
+                 where_clause = "where __is_current"
+
+             sql = f"""
+             create or replace view {self.qualified_name}__current with schema evolution as
+             select
+                 *
+             from
+                 {self.qualified_name}
+             {where_clause}
+             """
+             # sql = fix_sql(sql)
+             # DEFAULT_LOGGER.debug("current view", extra={"label": self, "sql": sql})
+             self.spark.sql(sql)
+
+         except Py4JJavaError as e:
+             DEFAULT_LOGGER.exception("fail to create or replace view", extra={"label": self}, exc_info=e)
+
+     def overwrite(self, schedule: Optional[str] = None):
+         self.truncate()
+         self.run(schedule=schedule)
+
+     def overwrite_schema(self, df: Optional[DataFrame] = None):
+         DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})
+
+     def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}__check"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
+
+         not_append = not self.mode == "append"
+         nocdc = self.change_data_capture == "nocdc"
+         order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}
+
+         rectify = False
+         if not_append and not nocdc:
+             if not self.stream and self.mode == "update" and self.table.exists():
+                 timestamp = "__valid_from" if self.change_data_capture == "scd2" else "__timestamp"
+                 extra_check = f" and __timestamp > coalesce((select max({timestamp}) from {self}), cast('0001-01-01' as timestamp))"
+             else:
+                 extra_check = "-- no extra check"
+
+             sql = f"""
+             select
+                 __operation
+             from
+                 {global_temp_view}
+             where
+                 true
+                 and __operation == 'reload'
+                 {extra_check}
+             limit
+                 1
+             """
+             sql = fix_sql(sql)
+             DEFAULT_LOGGER.debug("check", extra={"label": self, "sql": sql})
+
+             check_df = self.spark.sql(sql)
+             if not check_df.isEmpty():
+                 rectify = True
+                 DEFAULT_LOGGER.debug("rectify enabled", extra={"label": self})
+
+         context = {
+             "soft_delete": self.slowly_changing_dimension,
+             "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
+             "rectify": rectify,
+             "order_duplicate_by": order_duplicate_by,
+         }
+
+         if self.mode == "memory":
+             context["mode"] = "complete"
+
+         if self.slowly_changing_dimension:
+             if "__key" not in df.columns:
+                 context["add_key"] = True
+
+         if nocdc and self.mode == "memory":
+             if "__operation" not in df.columns:
+                 context["add_operation"] = "upsert"
+
+         if self.mode == "latest":
+             context["slice"] = "latest"
+         if not self.stream and self.mode == "update":
+             context["slice"] = "update"
+
+         if self.change_data_capture == "scd2":
+             context["correct_valid_from"] = True
+
+         if "__operation" in df.columns:
+             context["exclude"] = ["__operation"]
+         if nocdc:  # operation is passed from the bronze layer
+             context["exclude"] = ["__operation"]
+
+         return context
+
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
+         assert self.persist, f"{self.mode} not allowed"
+
+         context = self.get_cdc_context(df)
+
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}"
+         if batch is not None:
+             name = f"{name}__{batch}"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
+         sql = f"select * from {global_temp_view}"
+
+         check_df = self.spark.sql(sql)
+         if check_df.isEmpty():
+             DEFAULT_LOGGER.warning("no data", extra={"label": self})
+             return
+
+         if self.mode == "update":
+             assert not isinstance(self.cdc, NoCDC)
+             self.cdc.update(sql, **context)
+
+         elif self.mode == "append":
+             assert isinstance(self.cdc, NoCDC)
+             self.cdc.append(sql, **context)
+
+         elif self.mode == "latest":
+             assert isinstance(self.cdc, NoCDC)
+             check_df = self.spark.sql(
+                 f"""
+                 select
+                     __operation
+                 from
+                     {global_temp_view}
+                 where
+                     __operation <> 'reload'
+                 limit
+                     1
+                 """
+             )
+             assert check_df.isEmpty(), f"{check_df.collect()[0][0]} not allowed"
+             self.cdc.complete(sql, **context)
+
+         else:
+             raise ValueError(f"{self.mode} - not allowed")
+
+     def create(self):
+         super().create()
+         self.create_or_replace_current_view()
+
+     def register(self):
+         super().register()
+         self.create_or_replace_current_view()
+
+     def drop(self):
+         super().drop()
+         DEFAULT_LOGGER.debug("drop current view", extra={"label": self})
+         self.spark.sql(f"drop view if exists {self.qualified_name}__current")
fabricks/core/masks.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ from typing import List, Optional
+
+ from pyspark.sql import SparkSession
+
+ from fabricks.context import CATALOG, PATH_MASKS, SPARK
+ from fabricks.context.log import DEFAULT_LOGGER
+
+
+ def register_all_masks():
+     """
+     Register all masks.
+     """
+
+     DEFAULT_LOGGER.info("register masks")
+     for mask in get_masks():
+         split = mask.split(".")
+         try:
+             register_mask(mask=split[0])
+         except Exception as e:
+             DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)
+
+
+ def get_masks() -> List[str]:
+     return [os.path.basename(f) for f in PATH_MASKS.walk()]
+
+
+ def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
+     if spark is None:
+         spark = SPARK
+     assert spark is not None
+
+     df = spark.sql("show user functions in default")
+
+     if CATALOG:
+         df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
+     else:
+         df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")
+
+     return not df.isEmpty()
+
+
+ def register_mask(mask: str, spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = SPARK
+     assert spark is not None
+
+     if not is_registered(mask, spark):
+         DEFAULT_LOGGER.debug(f"register mask {mask}")
+
+         path = PATH_MASKS.joinpath(f"{mask}.sql")
+         spark.sql(path.get_sql())
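As a usage sketch (assuming the runtime exposes PATH_MASKS and an active SPARK session; "customer_email" is a hypothetical mask name):

from fabricks.core.masks import is_registered, register_all_masks, register_mask

# registers the UDF defined in <PATH_MASKS>/customer_email.sql, if not already present
register_mask("customer_email")
assert is_registered("customer_email")

# or register every mask file found under PATH_MASKS
register_all_masks()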
fabricks/core/parsers/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from fabricks.core.parsers._types import ParserOptions
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.core.parsers.decorator import parser
+ from fabricks.core.parsers.get_parser import get_parser
+
+ __all__ = [
+     "BaseParser",
+     "get_parser",
+     "parser",
+     "ParserOptions",
+     "PARSERS",
+ ]
fabricks/core/parsers/_types.py ADDED
@@ -0,0 +1,6 @@
+ from typing import Optional, TypedDict
+
+
+ class ParserOptions(TypedDict):
+     file_format: Optional[str]
+     read_options: Optional[dict[str, str]]
fabricks/core/parsers/base.py ADDED
@@ -0,0 +1,95 @@
+ from abc import ABC
+ from typing import Callable, Optional, final
+
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import col, expr, from_json, lit
+ from pyspark.sql.types import MapType, StringType
+
+ from fabricks.core.parsers._types import ParserOptions
+ from fabricks.core.parsers.utils import clean
+ from fabricks.utils.path import Path
+ from fabricks.utils.read.read import read
+
+
+ class BaseParser(ABC):
+     def __init__(self, options: Optional[ParserOptions], file_format: str):
+         self.options = options or {}
+         self.file_format = file_format
+
+     def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
+         df = df.withColumn(
+             "__split",
+             expr("split(replace(__metadata.file_path, __metadata.file_name), '/')"),
+         )
+         df = df.withColumn("__split_size", expr("size(__split)"))
+         df = df.withColumn(
+             "__timestamp",
+             expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
+         )
+         df = df.withColumn("__timestamp", expr("try_to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
+         df = df.drop("__split", "__split_size")
+
+         return df
+
+     def parse(
+         self,
+         data_path: Path,
+         schema_path: Path,
+         spark: SparkSession,
+         stream: bool,
+     ) -> DataFrame:
+         df = read(
+             stream=stream,
+             path=data_path,
+             file_format=self.file_format,
+             schema_path=schema_path,
+             options=self.options.get("read_options"),
+             spark=spark,
+         )
+
+         if "__timestamp" not in df.columns:
+             df = self.add_timestamp_from_file_path(df)
+
+         return df
+
+     @final
+     def get_data(
+         self,
+         data_path: Path,
+         schema_path: Path,
+         spark: SparkSession,
+         stream: bool,
+     ) -> DataFrame:
+         """
+         Retrieves and processes data from the specified data path using the provided schema.
+
+         Args:
+             data_path (Path): The path to the data file.
+             schema_path (Path): The path to the schema file.
+             spark (SparkSession): The SparkSession object.
+             stream (bool): Indicates whether the data should be processed as a stream.
+
+         Returns:
+             DataFrame: The processed data as a DataFrame.
+
+         Raises:
+             AssertionError: If the "__timestamp" column is missing in the DataFrame.
+             AssertionError: If the "__metadata.file_path" column is missing in the DataFrame.
+         """
+         df = self.parse(data_path=data_path, schema_path=schema_path, spark=spark, stream=stream)
+         df = df.transform(clean)
+
+         if "__rescued_data" not in df.columns:
+             df = df.withColumn("__rescued_data", lit(None).cast(StringType()))
+
+         df = df.withColumn("__rescued_data", from_json(col("__rescued_data"), MapType(StringType(), StringType())))  # type: ignore
+
+         assert "__timestamp" in df.columns, "__timestamp mandatory in dataframe"
+         assert df.select("__metadata.file_path"), "file_path mandatory in struct __metadata in dataframe"
+         return df
+
+     def __str__(self):
+         return f"{type(self).__name__} ({self.file_format})"
+
+
+ PARSERS: dict[str, Callable[[Optional[ParserOptions]], BaseParser]] = {}
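A custom parser would normally subclass BaseParser and override parse(); the sketch below is illustrative only (FixedWidthParser and the "fixed_width" tag are invented names):

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import lit

from fabricks.core.parsers.base import BaseParser
from fabricks.utils.path import Path


class FixedWidthParser(BaseParser):
    def parse(self, data_path: Path, schema_path: Path, spark: SparkSession, stream: bool) -> DataFrame:
        # reuse the default read and __timestamp derivation, then tag the rows
        df = super().parse(data_path=data_path, schema_path=schema_path, spark=spark, stream=stream)
        return df.withColumn("__source_format", lit("fixed_width"))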
fabricks/core/parsers/decorator.py ADDED
@@ -0,0 +1,11 @@
+ from typing import Callable, Optional
+
+ from fabricks.core.parsers._types import ParserOptions
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+
+
+ def parser(name: str):
+     def decorator(parser: Callable[[Optional[ParserOptions]], BaseParser]):
+         PARSERS[name] = parser
+
+     return decorator
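Registering such a parser amounts to storing a factory in PARSERS via the decorator; a hedged sketch, assuming it lives in a <PATH_PARSERS>/fixed_width.py module and continues the hypothetical FixedWidthParser above:

from typing import Optional

from fabricks.core.parsers._types import ParserOptions
from fabricks.core.parsers.decorator import parser


@parser("fixed_width")
def fixed_width_parser(options: Optional[ParserOptions]) -> FixedWidthParser:
    # stored in PARSERS["fixed_width"]; the factory receives the job's parser options
    return FixedWidthParser(options, "text")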
fabricks/core/parsers/get_parser.py ADDED
@@ -0,0 +1,26 @@
+ from importlib.util import module_from_spec, spec_from_file_location
+ from typing import Optional
+
+ from fabricks.context import PATH_PARSERS
+ from fabricks.core.parsers._types import ParserOptions
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+
+
+ def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
+     if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
+         path = PATH_PARSERS.joinpath(name).append(".py")
+         assert path.exists(), f"parser not found ({path})"
+
+         spec = spec_from_file_location(name, path.string)
+         assert spec, f"parser not found ({path})"
+         assert spec.loader is not None
+
+         mod = module_from_spec(spec)
+         spec.loader.exec_module(mod)
+         parser = PARSERS[name](parser_options)
+
+     else:
+         parser = BaseParser(parser_options, name)
+
+     assert parser
+     return parser
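get_parser therefore falls back to a plain BaseParser for the built-in formats and imports <PATH_PARSERS>/<name>.py for anything else; for example (the read options and the "fixed_width" name are illustrative):

from fabricks.core.parsers import get_parser

# built-in format: a plain BaseParser configured for csv
csv_parser = get_parser("csv", {"file_format": "csv", "read_options": {"header": "true"}})

# custom format: expects <PATH_PARSERS>/fixed_width.py to register "fixed_width" via @parser
fixed_width = get_parser("fixed_width")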
fabricks/core/parsers/utils.py ADDED
@@ -0,0 +1,69 @@
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import length, lower
+ from pyspark.sql.functions import trim as _trim
+ from pyspark.sql.functions import when
+ from pyspark.sql.types import DoubleType, FloatType, IntegerType
+
+
+ def value_to_none(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(
+             c,
+             when(length(df[f"`{c}`"].cast("string")) == 0, None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "none", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "null", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "blank", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(none)", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(null)", None)
+             .when(lower(df[f"`{c}`"].cast("string")) == "(blank)", None)
+             .otherwise(df[f"`{c}`"]),
+         )
+     return df
+
+
+ def decimal_to_float(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(FloatType()))
+     return df
+
+
+ def decimal_to_double(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(DoubleType()))
+     return df
+
+
+ def tinyint_to_int(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("tinyint") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, df[f"`{c}`"].cast(IntegerType()))
+     return df
+
+
+ def trim(df: DataFrame) -> DataFrame:
+     cols = [name for name, dtype in df.dtypes if dtype.startswith("string") and not name.startswith("__")]
+     for c in cols:
+         df = df.withColumn(c, _trim(df[f"`{c}`"]))
+     return df
+
+
+ def clean(df: DataFrame) -> DataFrame:
+     """
+     Cleans the given DataFrame by performing the following operations:
+     1. Trims whitespace from all string columns.
+     2. Converts empty strings and none/null/blank placeholders to None.
+     3. Converts decimal values to double.
+
+     Args:
+         df (pyspark.sql.DataFrame): The DataFrame to be cleaned.
+
+     Returns:
+         pyspark.sql.DataFrame: The cleaned DataFrame.
+     """
+     df = trim(df)
+     df = value_to_none(df)
+     df = decimal_to_double(df)
+     return df
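A small, self-contained illustration of what clean does (data and column names are made up):

from pyspark.sql import SparkSession

from fabricks.core.parsers.utils import clean

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(" 42 ", "NULL"), ("", "ok")], ["amount", "status"])

# trims strings, maps empty strings and none/null/blank placeholders to NULL,
# and casts decimal columns to double
clean(df).show()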
fabricks/core/schedules/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from fabricks.core.schedules.generate import generate
+ from fabricks.core.schedules.process import process
+ from fabricks.core.schedules.run import run
+ from fabricks.core.schedules.terminate import terminate
+ from fabricks.core.schedules.views import create_or_replace_view, create_or_replace_views
+
+ __all__ = [
+     "process",
+     "generate",
+     "terminate",
+     "run",
+     "create_or_replace_view",
+     "create_or_replace_views",
+ ]
fabricks/core/schedules/diagrams.py ADDED
@@ -0,0 +1,21 @@
+ from pyspark.sql import DataFrame
+
+
+ def get_dependencies(name: str) -> DataFrame:
+     from fabricks.core.dags import DagGenerator
+
+     g = DagGenerator(schedule=name)
+     return g.get_dependencies()
+
+
+ def get_mermaid_diagram(name: str) -> str:
+     from fabricks.utils.mermaid import get_mermaid_diagram as get_diagram
+
+     df = get_dependencies(name)
+
+     df = df.withColumnRenamed("ParentId", "parent_id")
+     df = df.withColumnRenamed("Parent", "parent")
+     df = df.withColumnRenamed("JobId", "job_id")
+     df = df.withColumnRenamed("Job", "job")
+
+     return get_diagram(df)
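Assuming a schedule exists in the runtime configuration ("daily" is a placeholder name), the diagram helper can be used as follows:

from fabricks.core.schedules.diagrams import get_mermaid_diagram

# returns a Mermaid flowchart definition of the schedule's job dependencies
print(get_mermaid_diagram("daily"))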