fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/gold.py
@@ -0,0 +1,218 @@
+import re
+from typing import List, Optional, cast
+
+from databricks.sdk.runtime import dbutils
+from pyspark.sql import DataFrame
+
+from fabricks.cdc.nocdc import NoCDC
+from fabricks.context.log import Logger
+from fabricks.core.jobs.base.job import BaseJob
+from fabricks.core.jobs.base.types import TGold
+from fabricks.core.udfs import is_registered, register_udf
+from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils.path import Path
+
+
+class Gold(BaseJob):
+    def __init__(
+        self, step: TGold, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
+    ):  # type: ignore
+        super().__init__(
+            "gold",
+            step=step,
+            topic=topic,
+            item=item,
+            job_id=job_id,
+        )
+
+    _sql: Optional[str] = None
+    _sql_path: Optional[Path] = None
+    _schema_drift: Optional[bool] = None
+
+    @classmethod
+    def from_job_id(cls, step: str, job_id: str):
+        return cls(step=cast(TGold, step), job_id=job_id)
+
+    @classmethod
+    def from_step_topic_item(cls, step: str, topic: str, item: str):
+        return cls(step=cast(TGold, step), topic=topic, item=item)
+
+    @property
+    def stream(self) -> bool:
+        return False
+
+    @property
+    def schema_drift(self) -> bool:
+        if not self._schema_drift:
+            _schema_drift = self.step_conf.get("options", {}).get("schema_drift", False)
+            assert _schema_drift is not None
+            self._schema_drift = cast(bool, _schema_drift)
+        return self._schema_drift
+
+    @property
+    def persist(self) -> bool:
+        return self.mode in ["update", "append", "complete"]
+
+    @property
+    def virtual(self) -> bool:
+        return self.mode in ["memory"]
+
+    def get_sql(self) -> str:
+        return self.paths.runtime.get_sql()
+
+    def get_udfs(self) -> List[str]:
+        # udf not allowed in invoke
+        if self.mode == "invoke":
+            return []
+        # udf not allowed in notebook
+        elif self.options.job.get("notebook"):
+            return []
+        # udf not allowed in table
+        elif self.options.job.get("table"):
+            return []
+        else:
+            matches = []
+            if "udf_" in self.get_sql():
+                r = re.compile(r"(?<=udf_)\w*(?=\()")
+                matches = re.findall(r, self.get_sql())
+                matches = set(matches)
+                matches = list(matches)
+            return matches
+
+    def register_udfs(self):
+        for u in self.get_udfs():
+            if not is_registered(u):
+                Logger.debug(f"register udf ({u})", extra={"job": self})
+                register_udf(udf=u, spark=self.spark)
+
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extender)
+        return df
+
+    def get_data(self, stream=False, transform: Optional[bool] = False) -> DataFrame:
+        if self.options.job.get_boolean("requirements"):
+            import sys
+
+            sys.path.append("/dbfs/mnt/fabricks/site-packages")
+
+        if self.mode == "invoke":
+            df = self.spark.createDataFrame([{}])  # type: ignore
+
+        elif self.options.job.get("notebook"):
+            Logger.debug("run notebook", extra={"job": self})
+            path = self.paths.runtime.get_notebook_path()
+            global_temp_view = dbutils.notebook.run(path, self.timeouts.job, arguments={})
+            df = self.spark.sql(f"select * from global_temp.{global_temp_view}")
+
+        elif self.options.job.get("table"):
+            table = self.options.job.get("table")
+            df = self.spark.read.table(table)  # type: ignore
+
+        else:
+            assert self.get_sql(), "sql not found"
+            self.register_udfs()
+            df = self.spark.sql(self.get_sql())
+
+        if transform:
+            df = self.base_transform(df)
+        return df
+
+    def create_or_replace_view(self):
+        assert self.mode == "memory", f"{self.mode} not allowed"
+
+        df = self.spark.sql(self.get_sql())
+        cdc_options = self.get_cdc_context(df)
+        self.cdc.create_or_replace_view(self.get_sql(), **cdc_options)
+
+    def get_cdc_context(self, df: DataFrame) -> dict:
+        if "__order_duplicate_by_asc" in df.columns:
+            order_duplicate_by = {"__order_duplicate_by_asc": "asc"}
+        elif "__order_duplicate_by_desc" in df.columns:
+            order_duplicate_by = {"__order_duplicate_by_desc": "desc"}
+        else:
+            order_duplicate_by = None
+
+        context = {
+            "add_metadata": True,
+            "soft_delete": True if self.slowly_changing_dimension else None,
+            "deduplicate_key": self.options.job.get_boolean("deduplicate", None),
+            "deduplicate_hash": True if self.slowly_changing_dimension else None,
+            "deduplicate": False,  # assume no duplicate in gold
+            "rectify": False,  # assume no reload in gold
+            "order_duplicate_by": order_duplicate_by,
+        }
+
+        if self.slowly_changing_dimension:
+            if "__key" not in df.columns:
+                context["add_key"] = True
+            if "__hash" not in df.columns:
+                context["add_hash"] = True
+
+        if "__operation" not in df.columns:
+            context["deduplicate_hash"] = None  # assume no duplicate hash
+            if self.mode == "update":
+                context["add_operation"] = "reload"
+                context["rectify"] = True
+            else:
+                context["add_operation"] = "upsert"
+
+        if self.mode == "update" and self.change_data_capture == "scd2":
+            context["filter"] = "update"
+
+        if self.mode == "memory":
+            context["mode"] = "complete"
+
+        return context
+
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+        assert self.persist, f"{self.mode} not allowed"
+
+        context = self.get_cdc_context(df=df)
+
+        # if dataframe, reference is passed (BUG)
+        name = f"{self.step}_{self.topic}_{self.item}"
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        sql = f"select * from {global_temp_view}"
+
+        if self.mode == "update":
+            assert not isinstance(self.cdc, NoCDC), "nocdc update not allowed"
+            self.cdc.update(sql, **context)
+        elif self.mode == "append":
+            assert isinstance(self.cdc, NoCDC), f"{self.change_data_capture} append not allowed"
+            self.cdc.append(sql, **context)
+        elif self.mode == "complete":
+            self.cdc.complete(sql, **context)
+        else:
+            raise ValueError(f"{self.mode} - not allowed")
+
+        self.check_duplicate_key()
+        self.check_duplicate_hash()
+
+    def for_each_run(self, schedule: Optional[str] = None):
+        if self.mode == "invoke":
+            self.invoke(schedule=schedule)
+        else:
+            super().for_each_run(schedule=schedule)
+
+    def create(self):
+        if self.mode == "invoke":
+            Logger.info("invoke (no table nor view)", extra={"job": self})
+        else:
+            super().create()
+
+    def register(self):
+        if self.mode == "invoke":
+            Logger.info("invoke (no table nor view)", extra={"job": self})
+        else:
+            super().register()
+
+    def optimize(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        analyze: Optional[bool] = True,
+    ):
+        if self.mode == "memory":
+            Logger.debug("memory (no optimize)", extra={"job": self})
+        else:
+            super().optimize()
fabricks/core/jobs/silver.py
@@ -0,0 +1,354 @@
+from typing import Optional, cast
+
+from pyspark.sql import DataFrame, Row
+from pyspark.sql.functions import expr
+
+from fabricks.cdc.nocdc import NoCDC
+from fabricks.context.log import Logger
+from fabricks.core.jobs.base.job import BaseJob
+from fabricks.core.jobs.base.types import TBronze, TSilver
+from fabricks.core.jobs.bronze import Bronze
+from fabricks.metastore.view import create_or_replace_global_temp_view
+from fabricks.utils.helpers import concat_dfs
+from fabricks.utils.read.read import read
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+class Silver(BaseJob):
+    def __init__(
+        self, step: TSilver, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
+    ):  # type: ignore
+        super().__init__(
+            "silver",
+            step=step,
+            topic=topic,
+            item=item,
+            job_id=job_id,
+        )
+
+    _parent_step: Optional[TBronze] = None
+    _stream: Optional[bool] = None
+
+    @classmethod
+    def from_job_id(cls, step: str, job_id: str):
+        return cls(step=cast(TSilver, step), job_id=job_id)
+
+    @classmethod
+    def from_step_topic_item(cls, step: str, topic: str, item: str):
+        return cls(step=cast(TSilver, step), topic=topic, item=item)
+
+    @property
+    def stream(self) -> bool:
+        if not self._stream:
+            _stream = self.options.job.get("stream")
+            if _stream is None:
+                _stream = self.step_conf.get("options", {}).get("stream")
+            self._stream = _stream if _stream is not None else True
+        return self._stream  # type: ignore
+
+    @property
+    def schema_drift(self) -> bool:
+        return True
+
+    @property
+    def persist(self) -> bool:
+        return self.mode in ["update", "append", "latest"]
+
+    @property
+    def virtual(self) -> bool:
+        return self.mode in ["combine", "memory"]
+
+    @property
+    def parent_step(self) -> TBronze:
+        if not self._parent_step:
+            _parent_step = self.step_conf.get("options", {}).get("parent")
+            _parent_step = cast(TBronze, _parent_step)
+            assert _parent_step is not None
+            self._parent_step = _parent_step
+        return self._parent_step
+
+    def base_transform(self, df: DataFrame) -> DataFrame:
+        df = df.transform(self.extender)
+        if "__metadata" in df.columns:
+            df = df.withColumn(
+                "__metadata",
+                expr(
+                    """
+                    struct(
+                        __metadata.file_path as file_path,
+                        __metadata.file_name as file_name,
+                        __metadata.file_size as file_size,
+                        __metadata.file_modification_time as file_modification_time,
+                        __metadata.inserted as inserted,
+                        cast(current_date() as timestamp) as updated
+                    )
+                    """
+                ),
+            )
+        return df
+
+    def get_data(self, stream: bool = True, transform: Optional[bool] = False) -> DataFrame:
+        dep_df = self.get_dependencies()
+        assert dep_df, "not dependency found"
+        dep_df = dep_df.orderBy("parent_id")
+        dependencies = dep_df.count()
+
+        if self.mode == "memory":
+            assert dependencies == 1, f"more than 1 dependency not allowed ({dependencies})"
+
+            parent = dep_df.collect()[0].parent
+            df = self.spark.sql(f"select * from {parent}")
+
+        elif self.mode == "combine":
+            dfs = []
+            for row in dep_df.collect():
+                df = self.spark.sql(f"select * from {row.parent}")
+                dfs.append(df)
+            df = concat_dfs(dfs)
+
+        else:
+            dfs = []
+
+            for row in dep_df.collect():
+                try:
+                    bronze = Bronze.from_job_id(step=self.parent_step, job_id=row["parent_id"])
+                    if bronze.mode in ["memory", "register"]:
+                        # data already transformed if bronze is persisted
+                        df = bronze.get_data(stream=stream, transform=True)
+                    else:
+                        df = read(
+                            stream=stream,
+                            path=bronze.table.deltapath,
+                            file_format="delta",
+                            metadata=False,
+                            spark=self.spark,
+                        )
+                    if dependencies > 1:
+                        assert "__source" in df.columns, "__source not found"
+                    dfs.append(df)
+                except Exception as e:
+                    Logger.exception("🙈", extra={"job": self})
+                    raise e
+
+            df = concat_dfs(dfs)
+
+        # transforms
+        df = self.filter_where(df)
+        df = self.encrypt(df)
+        if transform:
+            df = self.base_transform(df)
+        return df
+
+    def get_dependencies(self, df: Optional[DataFrame] = None) -> Optional[DataFrame]:
+        dependencies = []
+        parents = self.options.job.get_list("parents") or []
+        if parents:
+            for p in parents:
+                dependencies.append(Row(self.job_id, p, "job"))
+        else:
+            p = f"{self.parent_step}.{self.topic}_{self.item}"
+            dependencies.append(Row(self.job_id, p, "parser"))
+
+        if dependencies:
+            Logger.debug(f"dependencies ({', '.join([row[1] for row in dependencies])})", extra={"job": self})
+            df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
+            df = df.transform(self.add_dependency_details)
+            return df
+
+    def create_or_replace_view(self):
+        assert self.mode in ["memory", "combine"], f"{self.mode} not allowed"
+
+        dep_df = self.get_dependencies()
+        assert dep_df, "dependency not found"
+
+        if self.mode == "combine":
+            queries = []
+
+            for row in dep_df.collect():
+                columns = self.get_data().columns
+                df = self.spark.sql(f"select * from {row.parent}")
+                cols = [f"`{c}`" if c in df.columns else f"null as `{c}`" for c in columns if c not in ["__source"]]
+                source = "__source" if "__source" in df.columns else f"'{row.parent}' as __source"
+                query = f"select {', '.join(cols)}, {source} from {row.parent}"
+                queries.append(query)
+
+            sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
+            sql = fix_sql(sql)
+            Logger.debug("view", extra={"job": self, "sql": sql})
+            self.spark.sql(sql)
+
+        else:
+            assert dep_df.count() == 1, "only one dependency allowed"
+
+            parent = dep_df.collect()[0].parent
+            sql = f"select * from {parent}"
+            sql = fix_sql(sql)
+            Logger.debug("view", extra={"job": self, "sql": sql})
+
+            df = self.spark.sql(sql)
+            cdc_options = self.get_cdc_context(df)
+            self.cdc.create_or_replace_view(sql, **cdc_options)
+
+    def create_or_replace_current_view(self):
+        from py4j.protocol import Py4JJavaError
+
+        try:
+            Logger.debug("create or replace current view", extra={"job": self})
+
+            df = self.spark.sql(f"select * from {self.qualified_name}")
+
+            where_clause = "-- no where clause"
+            if "__is_current" in df.columns:
+                where_clause = "where __is_current"
+
+            sql = f"""
+            create or replace view {self.qualified_name}__current with schema evolution as
+            select
+                *
+            from
+                {self.qualified_name}
+            {where_clause}
+            """
+            # sql = fix_sql(sql)
+            # Logger.debug("current view", extra={"job": self, "sql": sql})
+            self.spark.sql(sql)
+
+        except Py4JJavaError:
+            Logger.exception("🙈", extra={"job": self})
+
+    def overwrite(self):
+        self.truncate()
+        self.run()
+
+    def overwrite_schema(self):
+        Logger.warning("overwrite schema not allowed", extra={"job": self})
+
+    def get_cdc_context(self, df: DataFrame) -> dict:
+        # if dataframe, reference is passed (BUG)
+        name = f"{self.step}_{self.topic}_{self.item}__check"
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+
+        not_append = not self.mode == "append"
+        nocdc = self.change_data_capture == "nocdc"
+        order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}
+
+        rectify = False
+        if not_append and not nocdc:
+            if not self.stream and self.mode == "update" and self.table.exists():
+                timestamp = "__valid_from" if self.change_data_capture == "scd2" else "__timestamp"
+                extra_check = f" and __timestamp > coalesce((select max({timestamp}) from {self}), cast('0001-01-01' as timestamp))"
+            else:
+                extra_check = "-- no extra check"
+
+            sql = f"""
+            select
+                __operation
+            from
+                {global_temp_view}
+            where
+                true
+                and __operation == 'reload'
+                {extra_check}
+            limit
+                1
+            """
+            sql = fix_sql(sql)
+            Logger.debug("check", extra={"job": self, "sql": sql})
+
+            check_df = self.spark.sql(sql)
+            if not check_df.isEmpty():
+                rectify = True
+                Logger.debug("rectify enabled", extra={"job": self})
+
+        context = {
+            "soft_delete": self.slowly_changing_dimension,
+            "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
+            "rectify": rectify,
+            "order_duplicate_by": order_duplicate_by,
+        }
+
+        if self.slowly_changing_dimension:
+            if "__key" not in df.columns:
+                context["add_key"] = True
+
+        if self.mode == "memory":
+            context["mode"] = "complete"
+        if self.mode == "latest":
+            context["filter"] = "latest"
+
+        if self.change_data_capture == "scd2":
+            context["fix_valid_from"] = True
+
+        if nocdc:
+            if "__operation" in df.columns:
+                context["except"] = ["__operation"]
+        if nocdc and self.mode == "memory":
+            if "__operation" not in df.columns:
+                context["add_operation"] = "upsert"
+                context["except"] = ["__operation"]
+
+        if not self.stream and self.mode == "update":
+            context["filter"] = "update"
+
+        return context
+
+    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+        assert self.persist, f"{self.mode} not allowed"
+
+        context = self.get_cdc_context(df)
+
+        # if dataframe, reference is passed (BUG)
+        name = f"{self.step}_{self.topic}_{self.item}"
+        if batch is not None:
+            name = f"{name}__{batch}"
+        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+        sql = f"select * from {global_temp_view}"
+
+        if self.mode == "update":
+            assert not isinstance(self.cdc, NoCDC)
+            self.cdc.update(sql, **context)
+        elif self.mode == "append":
+            assert isinstance(self.cdc, NoCDC)
+            self.cdc.append(sql, **context)
+        elif self.mode == "latest":
+            assert isinstance(self.cdc, NoCDC)
+            check_df = self.spark.sql(
+                f"""
+                select
+                    __operation
+                from
+                    {global_temp_view}
+                where
+                    __operation <> 'reload'
+                limit
+                    1
+                """
+            )
+            assert check_df.isEmpty(), f"{check_df.collect()[0][0]} not allowed"
+            self.cdc.complete(sql, **context)
+        else:
+            raise ValueError(f"{self.mode} - not allowed")
+
+    def create(self):
+        super().create()
+        self.create_or_replace_current_view()
+
+    def register(self):
+        super().register()
+        self.create_or_replace_current_view()
+
+    def drop(self):
+        super().drop()
+        Logger.debug("drop current view", extra={"job": self})
+        self.spark.sql(f"drop view if exists {self.qualified_name}__current")
+
+    def optimize(
+        self,
+        vacuum: Optional[bool] = True,
+        optimize: Optional[bool] = True,
+        analyze: Optional[bool] = True,
+    ):
+        if self.mode == "memory":
+            Logger.debug("memory (no optimize)", extra={"job": self})
+        else:
+            super().optimize()
fabricks/core/parsers/__init__.py
@@ -0,0 +1,12 @@
+from fabricks.core.parsers.base import PARSERS, BaseParser
+from fabricks.core.parsers.decorator import parser
+from fabricks.core.parsers.get_parser import get_parser
+from fabricks.core.parsers.types import ParserOptions
+
+__all__ = [
+    "BaseParser",
+    "get_parser",
+    "parser",
+    "ParserOptions",
+    "PARSERS",
+]
fabricks/core/parsers/base.py
@@ -0,0 +1,91 @@
+from abc import ABC
+from typing import Callable, Optional, final
+
+from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.functions import col, expr, from_json, lit
+from pyspark.sql.types import MapType, StringType
+
+from fabricks.core.parsers.types import ParserOptions
+from fabricks.core.utils import clean
+from fabricks.utils.path import Path
+from fabricks.utils.read.read import read
+
+
+class BaseParser(ABC):
+    def __init__(self, options: Optional[ParserOptions], file_format: str):
+        self.options = options or {}
+        self.file_format = file_format
+
+    def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
+        df = df.withColumn(
+            "__split",
+            expr("split(replace(__metadata.file_path, __metadata.file_name), '/')"),
+        )
+        df = df.withColumn("__split_size", expr("size(__split)"))
+        df = df.withColumn(
+            "__timestamp",
+            expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
+        )
+        df = df.withColumn("__timestamp", expr("to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
+        df = df.drop("__split", "__split_size")
+        return df
+
+    def parse(
+        self,
+        data_path: Path,
+        schema_path: Path,
+        spark: SparkSession,
+        stream: bool,
+    ) -> DataFrame:
+        df = read(
+            stream=stream,
+            path=data_path,
+            file_format=self.file_format,
+            schema_path=schema_path,
+            options=self.options.get("read_options"),
+            spark=spark,
+        )
+        if "__timestamp" not in df.columns:
+            df = self.add_timestamp_from_file_path(df)
+        return df
+
+    @final
+    def get_data(
+        self,
+        data_path: Path,
+        schema_path: Path,
+        spark: SparkSession,
+        stream: bool,
+    ) -> DataFrame:
+        """
+        Retrieves and processes data from the specified data path using the provided schema.
+
+        Args:
+            data_path (Path): The path to the data file.
+            schema_path (Path): The path to the schema file.
+            spark (SparkSession): The SparkSession object.
+            stream (bool): Indicates whether the data should be processed as a stream.
+
+        Returns:
+            DataFrame: The processed data as a DataFrame.
+
+        Raises:
+            AssertionError: If the "__timestamp" column is missing in the DataFrame.
+            AssertionError: If the "__metadata.file_path" column is missing in the DataFrame.
+        """
+        df = self.parse(data_path=data_path, schema_path=schema_path, spark=spark, stream=stream)
+        df = df.transform(clean)
+
+        if "__rescued_data" not in df.columns:
+            df = df.withColumn("__rescued_data", lit(None).cast(StringType()))
+        df = df.withColumn("__rescued_data", from_json(col("__rescued_data"), MapType(StringType(), StringType())))  # type: ignore
+
+        assert "__timestamp" in df.columns, "__timestamp mandatory in dataframe"
+        assert df.select("__metadata.file_path"), "file_path mandatory in struct __metadata in dataframe"
+        return df
+
+    def __str__(self):
+        return f"{type(self).__name__} ({self.file_format})"
+
+
+PARSERS: dict[str, Callable[[Optional[ParserOptions]], BaseParser]] = {}
fabricks/core/parsers/decorator.py
@@ -0,0 +1,11 @@
+from typing import Callable, Optional
+
+from fabricks.core.parsers.base import PARSERS, BaseParser
+from fabricks.core.parsers.types import ParserOptions
+
+
+def parser(name: str):
+    def decorator(parser: Callable[[Optional[ParserOptions]], BaseParser]):
+        PARSERS[name] = parser
+
+    return decorator
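A hedged usage sketch of the @parser decorator above; the "my_json" name and MyJsonParser class are hypothetical. Note that the inner decorator registers the factory in PARSERS but returns None, so the decorated name itself is not reused afterwards; lookup goes through PARSERS (or get_parser):

from typing import Optional

from fabricks.core.parsers import PARSERS, BaseParser, ParserOptions, parser


class MyJsonParser(BaseParser):  # hypothetical parser
    def __init__(self, options: Optional[ParserOptions]):
        super().__init__(options=options, file_format="json")


@parser("my_json")
def my_json_factory(options: Optional[ParserOptions]) -> BaseParser:
    return MyJsonParser(options)


print(PARSERS["my_json"](None))  # MyJsonParser (json)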