fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/silver.py
ADDED
@@ -0,0 +1,373 @@
from typing import Optional, Sequence, Union, cast

from pyspark.sql import DataFrame
from pyspark.sql.functions import expr
from pyspark.sql.types import Row

from fabricks.cdc.nocdc import NoCDC
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base._types import JobDependency, TBronze, TSilver
from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.jobs.bronze import Bronze
from fabricks.metastore.view import create_or_replace_global_temp_view
from fabricks.utils.helpers import concat_dfs
from fabricks.utils.read.read import read
from fabricks.utils.sqlglot import fix as fix_sql


class Silver(BaseJob):
    def __init__(
        self,
        step: TSilver,
        topic: Optional[str] = None,
        item: Optional[str] = None,
        job_id: Optional[str] = None,
        conf: Optional[Union[dict, Row]] = None,
    ):  # type: ignore
        super().__init__(
            "silver",
            step=step,
            topic=topic,
            item=item,
            job_id=job_id,
            conf=conf,
        )

    _parent_step: Optional[TBronze] = None
    _stream: Optional[bool] = None

    @classmethod
    def from_job_id(cls, step: str, job_id: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TSilver, step), job_id=job_id, conf=conf)

    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str, *, conf: Optional[Union[dict, Row]] = None):
        return cls(step=cast(TSilver, step), topic=topic, item=item, conf=conf)

    @property
    def stream(self) -> bool:
        if not self._stream:
            _stream = self.options.job.get("stream")
            if _stream is None:
                _stream = self.step_conf.get("options", {}).get("stream")
            self._stream = _stream if _stream is not None else True
        return self._stream  # type: ignore

    @property
    def schema_drift(self) -> bool:
        return True

    @property
    def persist(self) -> bool:
        return self.mode in ["update", "append", "latest"]

    @property
    def virtual(self) -> bool:
        return self.mode in ["combine", "memory"]

    @property
    def parent_step(self) -> TBronze:
        if not self._parent_step:
            _parent_step = self.step_conf.get("options", {}).get("parent")
            _parent_step = cast(TBronze, _parent_step)
            assert _parent_step is not None
            self._parent_step = _parent_step
        return self._parent_step

    def base_transform(self, df: DataFrame) -> DataFrame:
        df = df.transform(self.extend)

        if "__metadata" in df.columns:
            df = df.withColumn(
                "__metadata",
                expr(
                    """
                    struct(
                        __metadata.file_path as file_path,
                        __metadata.file_name as file_name,
                        __metadata.file_size as file_size,
                        __metadata.file_modification_time as file_modification_time,
                        __metadata.inserted as inserted,
                        cast(current_timestamp() as timestamp) as updated
                    )
                    """
                ),
            )
        return df

    def get_data(
        self,
        stream: bool = False,
        transform: Optional[bool] = False,
        schema_only: Optional[bool] = False,
        **kwargs,
    ) -> DataFrame:
        deps = self.get_dependencies()
        assert deps, "not dependency found"

        if self.mode == "memory":
            assert len(deps) == 1, f"more than 1 dependency not allowed ({deps})"

            parent = deps[0].parent
            df = self.spark.sql(f"select * from {parent}")

        elif self.mode == "combine":
            dfs = []

            for row in sorted(deps, key=lambda x: x.parent_id):
                df = self.spark.sql(f"select * from {row.parent}")
                dfs.append(df)

            df = concat_dfs(dfs)
            assert df is not None

        else:
            dfs = []

            for row in sorted(deps, key=lambda x: x.parent_id):
                try:
                    bronze = Bronze.from_job_id(step=self.parent_step, job_id=row.parent_id)
                    if bronze.mode in ["memory", "register"]:
                        # data already transformed if bronze is persisted
                        df = bronze.get_data(stream=stream, transform=True)
                    else:
                        df = read(
                            stream=stream,
                            path=bronze.table.deltapath,
                            file_format="delta",
                            metadata=False,
                            spark=self.spark,
                        )

                    if df:
                        if len(deps) > 1:
                            assert "__source" in df.columns, "__source not found"
                        dfs.append(df)

                except Exception as e:
                    DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": self})
                    raise e

            df = concat_dfs(dfs)
            assert df is not None

        # transforms
        df = self.filter_where(df)
        df = self.encrypt(df)
        if transform:
            df = self.base_transform(df)

        if schema_only:
            df = df.where("1 == 2")

        return df

    def get_dependencies(self) -> Sequence[JobDependency]:
        dependencies = []

        parents = self.options.job.get_list("parents") or []
        if parents:
            for p in parents:
                dependencies.append(JobDependency.from_parts(self.job_id, p, "job"))

        else:
            p = f"{self.parent_step}.{self.topic}_{self.item}"
            dependencies.append(JobDependency.from_parts(self.job_id, p, "parser"))

        return dependencies

    def create_or_replace_view(self):
        assert self.mode in ["memory", "combine"], f"{self.mode} not allowed"

        deps = self.get_dependencies()
        assert deps, "dependency not found"

        if self.mode == "combine":
            queries = []

            for row in deps:
                columns = self.get_data().columns
                df = self.spark.sql(f"select * from {row.parent}")
                cols = [f"`{c}`" if c in df.columns else f"null as `{c}`" for c in columns if c not in ["__source"]]
                source = "__source" if "__source" in df.columns else f"'{row.parent}' as __source"
                query = f"select {', '.join(cols)}, {source} from {row.parent}"
                queries.append(query)

            sql = f"create or replace view {self.qualified_name} as {' union all '.join(queries)}"
            sql = fix_sql(sql)
            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})
            self.spark.sql(sql)

        else:
            assert len(deps) == 1, "only one dependency allowed"

            parent = deps[0].parent
            sql = f"select * from {parent}"
            sql = fix_sql(sql)
            DEFAULT_LOGGER.debug("view", extra={"label": self, "sql": sql})

            df = self.spark.sql(sql)
            cdc_options = self.get_cdc_context(df)
            self.cdc.create_or_replace_view(sql, **cdc_options)

    def create_or_replace_current_view(self):
        from py4j.protocol import Py4JJavaError

        try:
            DEFAULT_LOGGER.debug("create or replace current view", extra={"label": self})

            df = self.spark.sql(f"select * from {self.qualified_name}")

            where_clause = "-- no where clause"
            if "__is_current" in df.columns:
                where_clause = "where __is_current"

            sql = f"""
            create or replace view {self.qualified_name}__current with schema evolution as
            select
                *
            from
                {self.qualified_name}
            {where_clause}
            """
            # sql = fix_sql(sql)
            # DEFAULT_LOGGER.debug("current view", extra={"label": self, "sql": sql})
            self.spark.sql(sql)

        except Py4JJavaError as e:
            DEFAULT_LOGGER.exception("fail to create nor replace view", extra={"label": self}, exc_info=e)

    def overwrite(self, schedule: Optional[str] = None):
        self.truncate()
        self.run(schedule=schedule)

    def overwrite_schema(self, df: Optional[DataFrame] = None):
        DEFAULT_LOGGER.warning("overwrite schema not allowed", extra={"label": self})

    def get_cdc_context(self, df: DataFrame, reload: Optional[bool] = None) -> dict:
        # if dataframe, reference is passed (BUG)
        name = f"{self.step}_{self.topic}_{self.item}__check"
        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)

        not_append = not self.mode == "append"
        nocdc = self.change_data_capture == "nocdc"
        order_duplicate_by = self.options.job.get_dict("order_duplicate_by") or {}

        rectify = False
        if not_append and not nocdc:
            if not self.stream and self.mode == "update" and self.table.exists():
                timestamp = "__valid_from" if self.change_data_capture == "scd2" else "__timestamp"
                extra_check = f" and __timestamp > coalesce((select max({timestamp}) from {self}), cast('0001-01-01' as timestamp))"
            else:
                extra_check = "-- no extra check"

            sql = f"""
            select
                __operation
            from
                {global_temp_view}
            where
                true
                and __operation == 'reload'
                {extra_check}
            limit
                1
            """
            sql = fix_sql(sql)
            DEFAULT_LOGGER.debug("check", extra={"label": self, "sql": sql})

            check_df = self.spark.sql(sql)
            if not check_df.isEmpty():
                rectify = True
                DEFAULT_LOGGER.debug("rectify enabled", extra={"label": self})

        context = {
            "soft_delete": self.slowly_changing_dimension,
            "deduplicate": self.options.job.get_boolean("deduplicate", not_append),
            "rectify": rectify,
            "order_duplicate_by": order_duplicate_by,
        }

        if self.mode == "memory":
            context["mode"] = "complete"

        if self.slowly_changing_dimension:
            if "__key" not in df.columns:
                context["add_key"] = True

        if nocdc and self.mode == "memory":
            if "__operation" not in df.columns:
                context["add_operation"] = "upsert"

        if self.mode == "latest":
            context["slice"] = "latest"
        if not self.stream and self.mode == "update":
            context["slice"] = "update"

        if self.change_data_capture == "scd2":
            context["correct_valid_from"] = True

        if "__operation" in df.columns:
            context["exclude"] = ["__operation"]
        if nocdc:  # operation is passed from the bronze layer
            context["exclude"] = ["__operation"]

        return context

    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None, **kwargs):
        assert self.persist, f"{self.mode} not allowed"

        context = self.get_cdc_context(df)

        # if dataframe, reference is passed (BUG)
        name = f"{self.step}_{self.topic}_{self.item}"
        if batch is not None:
            name = f"{name}__{batch}"
        global_temp_view = create_or_replace_global_temp_view(name=name, df=df, job=self)
        sql = f"select * from {global_temp_view}"

        check_df = self.spark.sql(sql)
        if check_df.isEmpty():
            DEFAULT_LOGGER.warning("no data", extra={"label": self})
            return

        if self.mode == "update":
            assert not isinstance(self.cdc, NoCDC)
            self.cdc.update(sql, **context)

        elif self.mode == "append":
            assert isinstance(self.cdc, NoCDC)
            self.cdc.append(sql, **context)

        elif self.mode == "latest":
            assert isinstance(self.cdc, NoCDC)
            check_df = self.spark.sql(
                f"""
                select
                    __operation
                from
                    {global_temp_view}
                where
                    __operation <> 'reload'
                limit
                    1
                """
            )
            assert check_df.isEmpty(), f"{check_df.collect()[0][0]} not allowed"
            self.cdc.complete(sql, **context)

        else:
            raise ValueError(f"{self.mode} - not allowed")

    def create(self):
        super().create()
        self.create_or_replace_current_view()

    def register(self):
        super().register()
        self.create_or_replace_current_view()

    def drop(self):
        super().drop()
        DEFAULT_LOGGER.debug("drop current view", extra={"label": self})
        self.spark.sql(f"drop view if exists {self.qualified_name}__current")
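
For orientation, a minimal usage sketch of the Silver job API shown above; the step, topic and item values are hypothetical placeholders, and the real values depend on the runtime configuration:

```python
from fabricks.core.jobs.silver import Silver

# hypothetical step/topic/item names; real values come from the runtime configuration
job = Silver.from_step_topic_item(step="silver", topic="sales", item="orders")

print(job.stream, job.persist, job.virtual)      # behaviour flags derived from the job options and mode
df = job.get_data(stream=False, transform=True)  # resolves dependencies, then applies filter/encrypt/extend
```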
fabricks/core/masks.py
ADDED
@@ -0,0 +1,52 @@
import os
from typing import List, Optional

from pyspark.sql import SparkSession

from fabricks.context import CATALOG, PATH_MASKS, SPARK
from fabricks.context.log import DEFAULT_LOGGER


def register_all_masks():
    """
    Register all masks.
    """

    DEFAULT_LOGGER.info("register masks")
    for mask in get_masks():
        split = mask.split(".")
        try:
            register_mask(mask=split[0])
        except Exception as e:
            DEFAULT_LOGGER.exception(f"could not register mask {mask}", exc_info=e)


def get_masks() -> List[str]:
    return [os.path.basename(f) for f in PATH_MASKS.walk()]


def is_registered(mask: str, spark: Optional[SparkSession] = None) -> bool:
    if spark is None:
        spark = SPARK
    assert spark is not None

    df = spark.sql("show user functions in default")

    if CATALOG:
        df = df.where(f"function == '{CATALOG}.default.mask_{mask}'")
    else:
        df = df.where(f"function == 'spark_catalog.default.mask_{mask}'")

    return not df.isEmpty()


def register_mask(mask: str, spark: Optional[SparkSession] = None):
    if spark is None:
        spark = SPARK
    assert spark is not None

    if not is_registered(mask, spark):
        DEFAULT_LOGGER.debug(f"register mask {mask}")

        path = PATH_MASKS.joinpath(f"{mask}.sql")
        spark.sql(path.get_sql())
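
A quick sketch of how the mask helpers above might be called; the mask name "email" is a hypothetical example, since the actual masks are the *.sql files found under PATH_MASKS at runtime:

```python
from fabricks.core.masks import get_masks, is_registered, register_all_masks

register_all_masks()           # registers every *.sql mask found under PATH_MASKS
print(get_masks())             # mask file names, e.g. ["email.sql"] (hypothetical)
print(is_registered("email"))  # True once a default.mask_email function exists in the catalog
```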
fabricks/core/parsers/__init__.py
ADDED
@@ -0,0 +1,12 @@
from fabricks.core.parsers._types import ParserOptions
from fabricks.core.parsers.base import PARSERS, BaseParser
from fabricks.core.parsers.decorator import parser
from fabricks.core.parsers.get_parser import get_parser

__all__ = [
    "BaseParser",
    "get_parser",
    "parser",
    "ParserOptions",
    "PARSERS",
]
fabricks/core/parsers/base.py
ADDED
@@ -0,0 +1,95 @@
from abc import ABC
from typing import Callable, Optional, final

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, expr, from_json, lit
from pyspark.sql.types import MapType, StringType

from fabricks.core.parsers._types import ParserOptions
from fabricks.core.parsers.utils import clean
from fabricks.utils.path import Path
from fabricks.utils.read.read import read


class BaseParser(ABC):
    def __init__(self, options: Optional[ParserOptions], file_format: str):
        self.options = options or {}
        self.file_format = file_format

    def add_timestamp_from_file_path(self, df: DataFrame) -> DataFrame:
        df = df.withColumn(
            "__split",
            expr("split(replace(__metadata.file_path, __metadata.file_name), '/')"),
        )
        df = df.withColumn("__split_size", expr("size(__split)"))
        df = df.withColumn(
            "__timestamp",
            expr("left(concat_ws('', slice(__split, __split_size - 4, 4), '00'), 14)"),
        )
        df = df.withColumn("__timestamp", expr("try_to_timestamp(__timestamp, 'yyyyMMddHHmmss')"))
        df = df.drop("__split", "__split_size")

        return df

    def parse(
        self,
        data_path: Path,
        schema_path: Path,
        spark: SparkSession,
        stream: bool,
    ) -> DataFrame:
        df = read(
            stream=stream,
            path=data_path,
            file_format=self.file_format,
            schema_path=schema_path,
            options=self.options.get("read_options"),
            spark=spark,
        )

        if "__timestamp" not in df.columns:
            df = self.add_timestamp_from_file_path(df)

        return df

    @final
    def get_data(
        self,
        data_path: Path,
        schema_path: Path,
        spark: SparkSession,
        stream: bool,
    ) -> DataFrame:
        """
        Retrieves and processes data from the specified data path using the provided schema.

        Args:
            data_path (Path): The path to the data file.
            schema_path (Path): The path to the schema file.
            spark (SparkSession): The SparkSession object.
            stream (bool): Indicates whether the data should be processed as a stream.

        Returns:
            DataFrame: The processed data as a DataFrame.

        Raises:
            AssertionError: If the "__timestamp" column is missing in the DataFrame.
            AssertionError: If the "__metadata.file_path" column is missing in the DataFrame.
        """
        df = self.parse(data_path=data_path, schema_path=schema_path, spark=spark, stream=stream)
        df = df.transform(clean)

        if "__rescued_data" not in df.columns:
            df = df.withColumn("__rescued_data", lit(None).cast(StringType()))

        df = df.withColumn("__rescued_data", from_json(col("__rescued_data"), MapType(StringType(), StringType())))  # type: ignore

        assert "__timestamp" in df.columns, "__timestamp mandatory in dataframe"
        assert df.select("__metadata.file_path"), "file_path mandatory in struct __metadata in dataframe"
        return df

    def __str__(self):
        return f"{type(self).__name__} ({self.file_format})"


PARSERS: dict[str, Callable[[Optional[ParserOptions]], BaseParser]] = {}
fabricks/core/parsers/decorator.py
ADDED
@@ -0,0 +1,11 @@
from typing import Callable, Optional

from fabricks.core.parsers._types import ParserOptions
from fabricks.core.parsers.base import PARSERS, BaseParser


def parser(name: str):
    def decorator(parser: Callable[[Optional[ParserOptions]], BaseParser]):
        PARSERS[name] = parser

    return decorator
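
Given the PARSERS registry and the decorator above, a custom parser module would plausibly be written as follows; the module name my_parser, its location under PATH_PARSERS, and the choice of the json file format are assumptions for illustration (get_parser, shown next, imports such a file and calls the registered factory):

```python
# hypothetical file <PATH_PARSERS>/my_parser.py
from typing import Optional

from fabricks.core.parsers import BaseParser, ParserOptions, parser


@parser("my_parser")
def my_parser(options: Optional[ParserOptions]) -> BaseParser:
    # reuse the generic reader, forcing the json file format (illustrative choice)
    return BaseParser(options, file_format="json")
```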
fabricks/core/parsers/get_parser.py
ADDED
@@ -0,0 +1,26 @@
from importlib.util import module_from_spec, spec_from_file_location
from typing import Optional

from fabricks.context import PATH_PARSERS
from fabricks.core.parsers._types import ParserOptions
from fabricks.core.parsers.base import PARSERS, BaseParser


def get_parser(name: str, parser_options: Optional[ParserOptions] = None) -> BaseParser:
    if name not in ["json", "parquet", "avro", "csv", "tsv", "delta", "table"]:
        path = PATH_PARSERS.joinpath(name).append(".py")
        assert path.exists(), f"parser not found ({path})"

        spec = spec_from_file_location(name, path.string)
        assert spec, f"parser not found ({path})"
        assert spec.loader is not None

        mod = module_from_spec(spec)
        spec.loader.exec_module(mod)
        parser = PARSERS[name](parser_options)

    else:
        parser = BaseParser(parser_options, name)

    assert parser
    return parser
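
For the built-in formats the factory simply wraps BaseParser, so a call could look like the sketch below; read_options is the only option key used by BaseParser.parse above, and the header setting is just an illustrative Spark CSV read option:

```python
from fabricks.core.parsers import get_parser

csv_parser = get_parser("csv", {"read_options": {"header": "true"}})
print(csv_parser)  # BaseParser (csv)
```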
fabricks/core/parsers/utils.py
ADDED
@@ -0,0 +1,69 @@
from pyspark.sql import DataFrame
from pyspark.sql.functions import length, lower
from pyspark.sql.functions import trim as _trim
from pyspark.sql.functions import when
from pyspark.sql.types import DoubleType, FloatType, IntegerType


def value_to_none(df: DataFrame) -> DataFrame:
    cols = [name for name, dtype in df.dtypes if not name.startswith("__")]
    for c in cols:
        df = df.withColumn(
            c,
            when(length(df[f"`{c}`"].cast("string")) == 0, None)
            .when(lower(df[f"`{c}`"].cast("string")) == "none", None)
            .when(lower(df[f"`{c}`"].cast("string")) == "null", None)
            .when(lower(df[f"`{c}`"].cast("string")) == "blank", None)
            .when(lower(df[f"`{c}`"].cast("string")) == "(none)", None)
            .when(lower(df[f"`{c}`"].cast("string")) == "(null)", None)
            .when(lower(df[f"`{c}`"].cast("string")) == "(blank)", None)
            .otherwise(df[f"`{c}`"]),
        )
    return df


def decimal_to_float(df: DataFrame) -> DataFrame:
    cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
    for c in cols:
        df = df.withColumn(c, df[f"`{c}`"].cast(FloatType()))
    return df


def decimal_to_double(df: DataFrame) -> DataFrame:
    cols = [name for name, dtype in df.dtypes if dtype.startswith("decimal") and not name.startswith("__")]
    for c in cols:
        df = df.withColumn(c, df[f"`{c}`"].cast(DoubleType()))
    return df


def tinyint_to_int(df: DataFrame) -> DataFrame:
    cols = [name for name, dtype in df.dtypes if dtype.startswith("tinyint") and not name.startswith("__")]
    for c in cols:
        df = df.withColumn(c, df[f"`{c}`"].cast(IntegerType()))
    return df


def trim(df: DataFrame) -> DataFrame:
    cols = [name for name, dtype in df.dtypes if dtype.startswith("string") and not name.startswith("__")]
    for c in cols:
        df = df.withColumn(c, _trim(df[f"`{c}`"]))
    return df


def clean(df: DataFrame) -> DataFrame:
    """
    Cleans the given DataFrame by performing the following operations:
    1. Trims whitespace from all string columns.
    2. Converts empty strings to None.
    3. Converts decimal values to double.

    Args:
        df (pandas.DataFrame): The DataFrame to be cleaned.

    Returns:
        pandas.DataFrame: The cleaned DataFrame.
    """
    df = trim(df)
    df = value_to_none(df)
    df = decimal_to_double(df)
    return df
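
A minimal, self-contained sketch of what clean does to a DataFrame (trim strings, map null-like strings to NULL, cast decimals to double); the sample data is made up:

```python
from decimal import Decimal

from pyspark.sql import SparkSession

from fabricks.core.parsers.utils import clean

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(" a ", "null", Decimal("1.50")), ("b", "", Decimal("2.25"))],
    ["name", "comment", "amount"],
)

cleaned = clean(df)  # " a " -> "a", "null"/"" -> NULL, decimal(38,18) -> double
cleaned.show()
```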
fabricks/core/schedules/__init__.py
ADDED
@@ -0,0 +1,14 @@
from fabricks.core.schedules.generate import generate
from fabricks.core.schedules.process import process
from fabricks.core.schedules.run import run
from fabricks.core.schedules.terminate import terminate
from fabricks.core.schedules.views import create_or_replace_view, create_or_replace_views

__all__ = [
    "process",
    "generate",
    "terminate",
    "run",
    "create_or_replace_view",
    "create_or_replace_views",
]
fabricks/core/schedules/diagrams.py
ADDED
@@ -0,0 +1,21 @@
from pyspark.sql import DataFrame


def get_dependencies(name: str) -> DataFrame:
    from fabricks.core.dags import DagGenerator

    g = DagGenerator(schedule=name)
    return g.get_dependencies()


def get_mermaid_diagram(name: str) -> str:
    from fabricks.utils.mermaid import get_mermaid_diagram as get_diagram

    df = get_dependencies(name)

    df = df.withColumnRenamed("ParentId", "parent_id")
    df = df.withColumnRenamed("Parent", "parent")
    df = df.withColumnRenamed("JobId", "job_id")
    df = df.withColumnRenamed("Job", "job")

    return get_diagram(df)
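
A short usage sketch of the diagram helpers above; "daily" is a hypothetical schedule name, and the renamed columns (ParentId, Parent, JobId, Job) are what DagGenerator.get_dependencies is expected to return:

```python
from fabricks.core.schedules.diagrams import get_mermaid_diagram

print(get_mermaid_diagram("daily"))  # Mermaid source for the schedule's job dependency graph
```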