fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/gold.py
@@ -0,0 +1,218 @@
+ import re
+ from typing import List, Optional, cast
+
+ from databricks.sdk.runtime import dbutils
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.nocdc import NoCDC
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import TGold
+ from fabricks.core.udfs import is_registered, register_udf
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.path import Path
+
+
+ class Gold(BaseJob):
+     def __init__(
+         self, step: TGold, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
+     ):  # type: ignore
+         super().__init__(
+             "gold",
+             step=step,
+             topic=topic,
+             item=item,
+             job_id=job_id,
+         )
+
+     _sql: Optional[str] = None
+     _sql_path: Optional[Path] = None
+     _schema_drift: Optional[bool] = None
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str):
+         return cls(step=cast(TGold, step), job_id=job_id)
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str):
+         return cls(step=cast(TGold, step), topic=topic, item=item)
+
+     @property
+     def stream(self) -> bool:
+         return False
+
+     @property
+     def schema_drift(self) -> bool:
+         if not self._schema_drift:
+             _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
+             assert _schema_drift is not None
+             self._schema_drift = cast(bool, _schema_drift)
+         return self._schema_drift
+
+     @property
+     def persist(self) -> bool:
+         return self.mode in ["update", "append", "complete"]
+
+     @property
+     def virtual(self) -> bool:
+         return self.mode in ["memory"]
+
+     def get_sql(self) -> str:
+         return self.paths.runtime.get_sql()
+
+     def get_udfs(self) -> List[str]:
+         # udf not allowed in invoke
+         if self.mode == "invoke":
+             return []
+         # udf not allowed in notebook
+         elif self.options.job.get("notebook"):
+             return []
+         # udf not allowed in table
+         elif self.options.job.get("table"):
+             return []
+         else:
+             matches = []
+             if "udf_" in self.get_sql():
+                 r = re.compile(r"(?<=udf_)\w*(?=\()")
+                 matches = re.findall(r, self.get_sql())
+                 matches = set(matches)
+                 matches = list(matches)
+             return matches
+
+     def register_udfs(self):
+         for u in self.get_udfs():
+             if not is_registered(u):
+                 Logger.debug(f"register udf ({u})", extra={"job": self})
+                 register_udf(udf=u, spark=self.spark)
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extender)
+         return df
+
+     def get_data(self, stream=False, transform: Optional[bool] = False) -> DataFrame:
+         if self.options.job.get_boolean("requirements"):
+             import sys
+
+             sys.path.append("/dbfs/mnt/fabricks/site-packages")
+
+         if self.mode == "invoke":
+             df = self.spark.createDataFrame([{}])  # type: ignore
+
+         elif self.options.job.get("notebook"):
+             Logger.debug("run notebook", extra={"job": self})
+             path = self.paths.runtime.get_notebook_path()
+             global_temp_view = dbutils.notebook.run(path, self.timeouts.job, arguments={})
+             df = self.spark.sql(f"select * from global_temp.{global_temp_view}")
+
+         elif self.options.job.get("table"):
+             table = self.options.job.get("table")
+             df = self.spark.read.table(table)  # type: ignore
+
+         else:
+             assert self.get_sql(), "sql not found"
+             self.register_udfs()
+             df = self.spark.sql(self.get_sql())
+
+         if transform:
+             df = self.base_transform(df)
+         return df
+
+     def create_or_replace_view(self):
+         assert self.mode == "memory", f"{self.mode} not allowed"
+
+         df = self.spark.sql(self.get_sql())
+         cdc_options = self.get_cdc_context(df)
+         self.cdc.create_or_replace_view(self.get_sql(), **cdc_options)
+
+     def get_cdc_context(self, df: DataFrame) -> dict:
+         if "__order_duplicate_by_asc" in df.columns:
+             order_duplicate_by = {"__order_duplicate_by_asc": "asc"}
+         elif "__order_duplicate_by_desc" in df.columns:
+             order_duplicate_by = {"__order_duplicate_by_desc": "desc"}
+         else:
+             order_duplicate_by = None
+
+         context = {
+             "add_metadata": True,
+             "soft_delete": True if self.slowly_changing_dimension else None,
+             "deduplicate_key": self.options.job.get_boolean("deduplicate", None),
+             "deduplicate_hash": True if self.slowly_changing_dimension else None,
+             "deduplicate": False,  # assume no duplicate in gold
+             "rectify": False,  # assume no reload in gold
+             "order_duplicate_by": order_duplicate_by,
+         }
+
+         if self.slowly_changing_dimension:
+             if "__key" not in df.columns:
+                 context["add_key"] = True
+             if "__hash" not in df.columns:
+                 context["add_hash"] = True
+
+         if "__operation" not in df.columns:
+             context["deduplicate_hash"] = None  # assume no duplicate hash
+             if self.mode == "update":
+                 context["add_operation"] = "reload"
+                 context["rectify"] = True
+             else:
+                 context["add_operation"] = "upsert"
+
+         if self.mode == "update" and self.change_data_capture == "scd2":
+             context["filter"] = "update"
+
+         if self.mode == "memory":
+             context["mode"] = "complete"
+
+         return context
+
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+         assert self.persist, f"{self.mode} not allowed"
+
+         context = self.get_cdc_context(df=df)
+
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+         sql = f"select * from {global_temp_view}"
+
+         if self.mode == "update":
+             assert not isinstance(self.cdc, NoCDC), "nocdc update not allowed"
+             self.cdc.update(sql, **context)
+         elif self.mode == "append":
+             assert isinstance(self.cdc, NoCDC), f"{self.change_data_capture} append not allowed"
+             self.cdc.append(sql, **context)
+         elif self.mode == "complete":
+             self.cdc.complete(sql, **context)
+         else:
+             raise ValueError(f"{self.mode} - not allowed")
+
+         self.check_duplicate_key()
+         self.check_duplicate_hash()
+
+     def for_each_run(self, schedule: Optional[str] = None):
+         if self.mode == "invoke":
+             self.invoke(schedule=schedule)
+         else:
+             super().for_each_run(schedule=schedule)
+
+     def create(self):
+         if self.mode == "invoke":
+             Logger.info("invoke (no table nor view)", extra={"job": self})
+         else:
+             super().create()
+
+     def register(self):
+         if self.mode == "invoke":
+             Logger.info("invoke (no table nor view)", extra={"job": self})
+         else:
+             super().register()
+
+     def optimize(
+         self,
+         vacuum: Optional[bool] = True,
+         optimize: Optional[bool] = True,
+         analyze: Optional[bool] = True,
+     ):
+         if self.mode == "memory":
+             Logger.debug("memory (no optimize)", extra={"job": self})
+         else:
+             super().optimize()
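Gold.get_udfs above resolves UDF dependencies purely by scanning the rendered SQL for calls prefixed with udf_. A minimal standalone sketch of that extraction (the SQL text and UDF names are invented for illustration; the regex is the one used in get_udfs):

import re

# Invented SQL for illustration; the pattern is the one from Gold.get_udfs.
sql = """
select
    udf_clean_name(name) as name,
    udf_to_euro(amount, currency) as amount_eur
from some_gold_view
"""

# Capture whatever sits between the "udf_" prefix and the opening parenthesis.
pattern = re.compile(r"(?<=udf_)\w*(?=\()")
names = sorted(set(pattern.findall(sql)))
print(names)  # ['clean_name', 'to_euro']

The set/list round trip in get_udfs serves the same purpose as the set() here: each UDF is registered once even if it appears several times in the query.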
fabricks/core/jobs/silver.py
@@ -0,0 +1,354 @@
+ from typing import Optional, cast
+
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.functions import expr
+
+ from fabricks.cdc.nocdc import NoCDC
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import TBronze, TSilver
+ from fabricks.core.jobs.bronze import Bronze
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.helpers import concat_dfs
+ from fabricks.utils.read.read import read
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Silver(BaseJob):
+     def __init__(
+         self, step: TSilver, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
+     ):  # type: ignore
+         super().__init__(
+             "silver",
+             step=step,
+             topic=topic,
+             item=item,
+             job_id=job_id,
+         )
+
+     _parent_step: Optional[TBronze] = None
+     _stream: Optional[bool] = None
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str):
+         return cls(step=cast(TSilver, step), job_id=job_id)
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str):
+         return cls(step=cast(TSilver, step), topic=topic, item=item)
+
+     @property
+     def stream(self) -> bool:
+         if not self._stream:
+             _stream = self.options.job.get("stream")
+             if _stream is None:
+                 _stream = self.step_conf.get("options", {}).get("stream")
+             self._stream = _stream if _stream is not None else True
+         return self._stream  # type: ignore
+
+     @property
+     def schema_drift(self) -> bool:
+         return True
+
+     @property
+     def persist(self) -> bool:
+         return self.mode in ["update", "append", "latest"]
+
+     @property
+     def virtual(self) -> bool:
+         return self.mode in ["combine", "memory"]
+
+     @property
+     def parent_step(self) -> TBronze:
+         if not self._parent_step:
+             _parent_step = self.step_conf.get("options", {}).get("parent")
+             _parent_step = cast(TBronze, _parent_step)
+             assert _parent_step is not None
+             self._parent_step = _parent_step
+         return self._parent_step
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extender)
+         if "__metadata" in df.columns:
+             df = df.withColumn(
+                 "__metadata",
+                 expr(
+                     """
+                     struct(
+                         __metadata.file_path as file_path,
+                         __metadata.file_name as file_name,
+                         __metadata.file_size as file_size,
+                         __metadata.file_modification_time as file_modification_time,
+                         __metadata.inserted as inserted,
+                         cast(current_date() as timestamp) as updated
+                     )
+                     """
+                 ),
+             )
+         return df
+
+     def get_data(self, stream: bool = True, transform: Optional[bool] = False) -> DataFrame:
+         dep_df = self.get_dependencies()
+         assert dep_df, "not dependency found"
+         dep_df = dep_df.orderBy("parent_id")
+         dependencies = dep_df.count()
+
+         if self.mode == "memory":
+             assert dependencies == 1, f"more than 1 dependency not allowed ({dependencies})"
+
+             parent = dep_df.collect()[0].parent
+             df = self.spark.sql(f"select * from {parent}")
+
+         elif self.mode == "combine":
+             dfs = []
+             for row in dep_df.collect():
+                 df = self.spark.sql(f"select * from {row.parent}")
+                 dfs.append(df)
+             df = concat_dfs(dfs)
+
+         else:
+             dfs = []
+
+             for row in dep_df.collect():
+                 try:
+                     bronze = Bronze.from_job_id(step=self.parent_step, job_id=row["parent_id"])
+                     if bronze.mode in ["memory", "register"]:
+                         # data already transformed if bronze is persisted
+                         df = bronze.get_data(stream=stream, transform=True)
+                     else:
+                         df = read(
+                             stream=stream,
+                             path=bronze.table.deltapath,
+                             file_format="delta",
+                             metadata=False,
+                             spark=self.spark,
+                         )
+                     if dependencies > 1:
+                         assert "__source" in df.columns, "__source not found"
+                     dfs.append(df)
+                 except Exception as e:
+                     Logger.exception("🙈", extra={"job": self})
+                     raise e
+
+             df = concat_dfs(dfs)
+
+         # transforms
+         df = self.filter_where(df)
+         df = self.encrypt(df)
+         if transform:
+             df = self.base_transform(df)
+         return df
+
+     def get_dependencies(self, df: Optional[DataFrame] = None) -> Optional[DataFrame]:
+         dependencies = []
+         parents = self.options.job.get_list("parents") or []
+         if parents:
+             for p in parents:
+                 dependencies.append(Row(self.job_id, p, "job"))
+         else:
+             p = f"{self.parent_step}.{self.topic}_{self.item}"
+             dependencies.append(Row(self.job_id, p, "parser"))
+
+         if dependencies:
+             Logger.debug(f"dependencies ({', '.join([row[1] for row in dependencies])})", extra={"job": self})
+             df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
+             df = df.transform(self.add_dependency_details)
+             return df
+
+     def create_or_replace_view(self):
+         assert self.mode in ["memory", "combine"], f"{self.mode} not allowed"
+
+         dep_df = self.get_dependencies()
+         assert dep_df, "dependency not found"
+
+         if self.mode == "combine":
+             queries = []
+
+             for row in dep_df.collect():
+                 columns = self.get_data().columns
+                 df = self.spark.sql(f"select * from {row.parent}")
+                 cols = [f"`{c}`" if c in df.columns else f"null as `{c}`" for c in columns if c not in ["__source"]]
+                 source = "__source" if "__source" in df.columns else f"'{row.parent}' as __source"
+                 query = f"select {', '.join(cols)}, {source} from {row.parent}"
+                 queries.append(query)
+
+             sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
+             sql = fix_sql(sql)
+             Logger.debug("view", extra={"job": self, "sql": sql})
+             self.spark.sql(sql)
+
+         else:
+             assert dep_df.count() == 1, "only one dependency allowed"
+
+             parent = dep_df.collect()[0].parent
+             sql = f"select * from {parent}"
+             sql = fix_sql(sql)
+             Logger.debug("view", extra={"job": self, "sql": sql})
+
+             df = self.spark.sql(sql)
+             cdc_options = self.get_cdc_context(df)
+             self.cdc.create_or_replace_view(sql, **cdc_options)
+
+     def create_or_replace_current_view(self):
+         from py4j.protocol import Py4JJavaError
+
+         try:
+             Logger.debug("create or replace current view", extra={"job": self})
+
+             df = self.spark.sql(f"select * from {self.qualified_name}")
+
+             where_clause = "-- no where clause"
+             if "__is_current" in df.columns:
+                 where_clause = "where __is_current"
+
+             sql = f"""
+             create or replace view {self.qualified_name}__current with schema evolution as
+             select
+                 *
+             from
+                 {self.qualified_name}
+             {where_clause}
+             """
+             # sql = fix_sql(sql)
+             # Logger.debug("current view", extra={"job": self, "sql": sql})
+             self.spark.sql(sql)
+
+         except Py4JJavaError:
+             Logger.exception("🙈", extra={"job": self})
+
+     def overwrite(self):
+         self.truncate()
+         self.run()
+
+     def overwrite_schema(self):
+         Logger.warning("overwrite schema not allowed", extra={"job": self})
+
+     def get_cdc_context(self, df: DataFrame) -> dict:
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}__check"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+
+         not_append = not self.mode == "append"
+         nocdc = self.change_data_capture == "nocdc"
+         order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}
+
+         rectify = False
+         if not_append and not nocdc:
+             if not self.stream and self.mode == "update" and self.table.exists():
+                 timestamp = "__valid_from" if self.change_data_capture == "scd2" else "__timestamp"
+                 extra_check = f" and __timestamp > coalesce((select max({timestamp}) from {self}), cast('0001-01-01' as timestamp))"
+             else:
+                 extra_check = "-- no extra check"
+
+             sql = f"""
+             select
+                 __operation
+             from
+                 {global_temp_view}
+             where
+                 true
+                 and __operation == 'reload'
+                 {extra_check}
+             limit
+                 1
+             """
+             sql = fix_sql(sql)
+             Logger.debug("check", extra={"job": self, "sql": sql})
+
+             check_df = self.spark.sql(sql)
+             if not check_df.isEmpty():
+                 rectify = True
+                 Logger.debug("rectify enabled", extra={"job": self})
+
+         context = {
+             "soft_delete": self.slowly_changing_dimension,
+             "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
+             "rectify": rectify,
+             "order_duplicate_by": order_duplicate_by,
+         }
+
+         if self.slowly_changing_dimension:
+             if "__key" not in df.columns:
+                 context["add_key"] = True
+
+         if self.mode == "memory":
+             context["mode"] = "complete"
+         if self.mode == "latest":
+             context["filter"] = "latest"
+
+         if self.change_data_capture == "scd2":
+             context["fix_valid_from"] = True
+
+         if nocdc:
+             if "__operation" in df.columns:
+                 context["except"] = ["__operation"]
+         if nocdc and self.mode == "memory":
+             if "__operation" not in df.columns:
+                 context["add_operation"] = "upsert"
+                 context["except"] = ["__operation"]
+
+         if not self.stream and self.mode == "update":
+             context["filter"] = "update"
+
+         return context
+
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+         assert self.persist, f"{self.mode} not allowed"
+
+         context = self.get_cdc_context(df)
+
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}"
+         if batch is not None:
+             name = f"{name}__{batch}"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+         sql = f"select * from {global_temp_view}"
+
+         if self.mode == "update":
+             assert not isinstance(self.cdc, NoCDC)
+             self.cdc.update(sql, **context)
+         elif self.mode == "append":
+             assert isinstance(self.cdc, NoCDC)
+             self.cdc.append(sql, **context)
+         elif self.mode == "latest":
+             assert isinstance(self.cdc, NoCDC)
+             check_df = self.spark.sql(
+                 f"""
+                 select
+                     __operation
+                 from
+                     {global_temp_view}
+                 where
+                     __operation <> 'reload'
+                 limit
+                     1
+                 """
+             )
+             assert check_df.isEmpty(), f"{check_df.collect()[0][0]} not allowed"
+             self.cdc.complete(sql, **context)
+         else:
+             raise ValueError(f"{self.mode} - not allowed")
+
+     def create(self):
+         super().create()
+         self.create_or_replace_current_view()
+
+     def register(self):
+         super().register()
+         self.create_or_replace_current_view()
+
+     def drop(self):
+         super().drop()
+         Logger.debug("drop current view", extra={"job": self})
+         self.spark.sql(f"drop view if exists {self.qualified_name}__current")
+
+     def optimize(
+         self,
+         vacuum: Optional[bool] = True,
+         optimize: Optional[bool] = True,
+         analyze: Optional[bool] = True,
+     ):
+         if self.mode == "memory":
+             Logger.debug("memory (no optimize)", extra={"job": self})
+         else:
+             super().optimize()
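In combine mode, Silver.create_or_replace_view aligns every parent on a common column list, padding columns a parent lacks with null, tagging each row with a __source column, and then unioning the per-parent selects into one view. A small sketch of that alignment outside Spark (parent names and columns are invented; the comprehension mirrors the one above):

# Invented parents and columns, for illustration of the combine-mode SQL assembly.
target_columns = ["id", "name", "amount"]  # columns of the combined silver view
parents = {
    "bronze.sales_eu": ["id", "name", "amount", "__source"],
    "bronze.sales_us": ["id", "name"],  # no amount, no __source column
}

queries = []
for parent, parent_columns in parents.items():
    cols = [f"`{c}`" if c in parent_columns else f"null as `{c}`" for c in target_columns]
    source = "__source" if "__source" in parent_columns else f"'{parent}' as __source"
    queries.append(f"select {', '.join(cols)}, {source} from {parent}")

view_body = " union all ".join(queries)
# select `id`, `name`, `amount`, __source from bronze.sales_eu
#   union all
# select `id`, `name`, null as `amount`, 'bronze.sales_us' as __source from bronze.sales_us

Missing columns therefore surface as nulls rather than breaking the union, and every row keeps track of which parent it came from.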
fabricks/core/parsers/__init__.py
@@ -0,0 +1,12 @@
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.core.parsers.decorator import parser
+ from fabricks.core.parsers.get_parser import get_parser
+ from fabricks.core.parsers.types import ParserOptions
+
+ __all__ = [
+     "BaseParser",
+     "get_parser",
+     "parser",
+     "ParserOptions",
+     "PARSERS",
+ ]
fabricks/core/parsers/base.py
@@ -0,0 +1,91 @@
+ from abc import ABC
+ from typing import Callable, Optional, final
+
+ from pyspark.sql import DataFrame, SparkSession
+ from pyspark.sql.functions import col, expr, from_json, lit
+ from pyspark.sql.types import MapType, StringType
+
+ from fabricks.core.parsers.types import ParserOptions
+ from fabricks.core.utils import clean
+ from fabricks.utils.path import Path
+ from fabricks.utils.read.read import read
+
+
+ class BaseParser(ABC):
+     def __init__(self, options: Optional[ParserOptions], file_format: str):
+         self.options = options or {}
+         self.file_format = file_format
+
+     def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
+         df = df.withColumn(
+             "__split",
+             expr("split(replace(__metadata.file_path, __metadata.file_name), '/')"),
+         )
+         df = df.withColumn("__split_size", expr("size(__split)"))
+         df = df.withColumn(
+             "__timestamp",
+             expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
+         )
+         df = df.withColumn("__timestamp", expr("to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
+         df = df.drop("__split", "__split_size")
+         return df
+
+     def parse(
+         self,
+         data_path: Path,
+         schema_path: Path,
+         spark: SparkSession,
+         stream: bool,
+     ) -> DataFrame:
+         df = read(
+             stream=stream,
+             path=data_path,
+             file_format=self.file_format,
+             schema_path=schema_path,
+             options=self.options.get("read_options"),
+             spark=spark,
+         )
+         if "__timestamp" not in df.columns:
+             df = self.add_timestamp_from_file_path(df)
+         return df
+
+     @final
+     def get_data(
+         self,
+         data_path: Path,
+         schema_path: Path,
+         spark: SparkSession,
+         stream: bool,
+     ) -> DataFrame:
+         """
+         Retrieves and processes data from the specified data path using the provided schema.
+
+         Args:
+             data_path (Path): The path to the data file.
+             schema_path (Path): The path to the schema file.
+             spark (SparkSession): The SparkSession object.
+             stream (bool): Indicates whether the data should be processed as a stream.
+
+         Returns:
+             DataFrame: The processed data as a DataFrame.
+
+         Raises:
+             AssertionError: If the "__timestamp" column is missing in the DataFrame.
+             AssertionError: If the "__metadata.file_path" column is missing in the DataFrame.
+         """
+         df = self.parse(data_path=data_path, schema_path=schema_path, spark=spark, stream=stream)
+         df = df.transform(clean)
+
+         if "__rescued_data" not in df.columns:
+             df = df.withColumn("__rescued_data", lit(None).cast(StringType()))
+         df = df.withColumn("__rescued_data", from_json(col("__rescued_data"), MapType(StringType(), StringType())))  # type: ignore
+
+         assert "__timestamp" in df.columns, "__timestamp mandatory in dataframe"
+         assert df.select("__metadata.file_path"), "file_path mandatory in struct __metadata in dataframe"
+         return df
+
+     def __str__(self):
+         return f"{type(self).__name__} ({self.file_format})"
+
+
+ PARSERS: dict[str, Callable[[Optional[ParserOptions]], BaseParser]] = {}
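Since get_data is marked @final, a concrete parser customises behaviour by overriding parse. A hypothetical subclass for a ';'-separated CSV source; the class name and reader options are invented and only sketch how the hooks above are meant to be used:

from pyspark.sql import DataFrame, SparkSession

from fabricks.core.parsers.base import BaseParser
from fabricks.utils.path import Path
from fabricks.utils.read.read import read


class SemicolonCsvParser(BaseParser):
    """Hypothetical parser for ';'-separated CSV files."""

    def parse(self, data_path: Path, schema_path: Path, spark: SparkSession, stream: bool) -> DataFrame:
        df = read(
            stream=stream,
            path=data_path,
            file_format=self.file_format,  # "csv", passed to __init__
            schema_path=schema_path,
            options={"sep": ";", "header": "true"},  # assumed reader options
            spark=spark,
        )
        # fall back to the path-derived timestamp when the source carries none
        if "__timestamp" not in df.columns:
            df = self.add_timestamp_from_file_path(df)
        return df

get_data then applies clean, backfills __rescued_data, and enforces the __timestamp and __metadata.file_path contract regardless of which subclass produced the DataFrame.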
fabricks/core/parsers/decorator.py
@@ -0,0 +1,11 @@
+ from typing import Callable, Optional
+
+ from fabricks.core.parsers.base import PARSERS, BaseParser
+ from fabricks.core.parsers.types import ParserOptions
+
+
+ def parser(name: str):
+     def decorator(parser: Callable[[Optional[ParserOptions]], BaseParser]):
+         PARSERS[name] = parser
+
+     return decorator
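The decorator registers a factory, not an instance, under a name in PARSERS. Note that the inner wrapper returns nothing, so the decorated name is rebound to None; registered parsers are therefore looked up through the registry (or get_parser) rather than called by their decorated name. A hedged usage sketch, with an invented registry key and a trivial parser that keeps BaseParser.parse unchanged:

from typing import Optional

from fabricks.core.parsers import PARSERS, BaseParser, ParserOptions, parser


class PlainCsvParser(BaseParser):
    """Relies entirely on BaseParser.parse; exists only to be registered."""


@parser("plain_csv")  # invented registry key
def plain_csv(options: Optional[ParserOptions]) -> BaseParser:
    # the registry stores this factory so parsers can be built on demand
    return PlainCsvParser(options, file_format="csv")


# `plain_csv` is now None (the wrapper returns nothing); use the registry instead.
csv_parser = PARSERS["plain_csv"](None)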