fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/generator.py
@@ -0,0 +1,391 @@
from typing import Any, Optional, Union, cast

from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import expr, lit

from fabricks.cdc import SCD1
from fabricks.context.log import Logger
from fabricks.core.jobs.base.configurator import Configurator
from fabricks.metastore.view import create_or_replace_global_temp_view


class Generator(Configurator):
    def update_dependencies(self):
        Logger.info("update dependencies", extra={"job": self})

        df = self.get_dependencies()
        if df:
            scd1 = SCD1("fabricks", self.step, "dependencies")
            scd1.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)

    def add_dependency_details(self, df: DataFrame) -> DataFrame:
        df = df.withColumn("__parent", expr("replace(parent, '__current', '')"))
        df = df.withColumn("parent_id", expr("md5(__parent)"))
        df = df.withColumn("dependency_id", expr("md5(concat_ws('*', job_id, parent))"))
        df = df.drop("__parent")
        return df

    def get_dependencies(self) -> Optional[DataFrame]:
        import re

        df = self.get_data(self.stream)
        jvm = df._sc._jvm  # type: ignore
        explain_plan = cast(Any, jvm.PythonSQLUtils).explainString(cast(Any, df._jdf).queryExecution(), "extended")  # type: ignore

        dependencies = []
        r = re.compile(r"(?<=SubqueryAlias spark_catalog\.)[^.]*\.[^.\n]*")
        matches = re.findall(r, explain_plan)
        matches = list(set(matches))
        for m in matches:
            dependencies.append(Row(self.job_id, m, "parser"))
        parents = self.options.job.get_list("parents") or []
        for p in parents:
            dependencies.append(Row(self.job_id, p, "job"))

        if dependencies:
            Logger.debug(f"dependencies ({', '.join([row[1] for row in dependencies])})", extra={"job": self})
            df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
            df = df.transform(self.add_dependency_details)
            assert df.where("job_id == parent_id").count() == 0, "circular dependency found"
            return df

    def rm(self):
        """
        Removes the schema folder and checkpoints associated with the generator.

        If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
        """
        if self.paths.schema.exists():
            Logger.info("delete schema folder", extra={"job": self})
            self.paths.schema.rm()
        self.rm_checkpoints()

    def rm_checkpoints(self):
        """
        Removes the checkpoints folder if it exists.

        This method checks if the checkpoints folder exists and deletes it if it does.
        """
        if self.paths.checkpoints.exists():
            Logger.info("delete checkpoints folder", extra={"job": self})
            self.paths.checkpoints.rm()

    def rm_commit(self, id: Union[str, int]):
        """
        Remove a commit with the given ID.

        Args:
            id (Union[str, int]): The ID of the commit to remove.

        Returns:
            None
        """
        path = self.paths.commits.join(str(id))
        if path.exists():
            Logger.warning(f"delete commit {id}", extra={"job": self})
            path.rm()

    def truncate(self):
        """
        Truncates the job by removing all data associated with it.

        This method removes the job from the system and, if the `persist` flag is set to True,
        it also truncates the associated table.

        Returns:
            None
        """
        Logger.warning("truncate", extra={"job": self})
        self.rm()
        if self.persist:
            self.table.truncate()

    def drop(self):
        """
        Drops the current job and its dependencies.

        This method drops the current job and its dependencies by performing the following steps:
        1. Queries the database to check if there are any child jobs associated with the current job.
        2. If child jobs are found, logs a warning message and prints the list of child jobs.
        3. Drops the current job's change data capture (cdc).
        4. Removes the current job.

        Note: This method handles any exceptions that occur during the process.

        Returns:
            None
        """
        try:
            row = self.spark.sql(
                f"""
                select
                  count(*) as count,
                  array_join(sort_array(collect_set(j.job)), ', \n') as children
                from
                  fabricks.dependencies d
                  inner join fabricks.jobs j on d.job_id = j.job_id
                where
                  parent like '{self}'
                """
            ).collect()[0]
            if cast(int, row.count) > 0:
                Logger.warning(f"{row.count} children found", extra={"job": self, "content": row.children})
        except Exception:
            pass
        self.cdc.drop()
        self.rm()

    def create(self):
        """
        Creates a table or view based on the specified mode.

        If `persist` is True, it creates a table by calling the `create_table` method.
        If `virtual` is True, it creates or replaces a view by calling the `create_or_replace_view` method.
        If neither `persist` nor `virtual` is True, it raises a ValueError.

        Raises:
            ValueError: If neither `persist` nor `virtual` is True.

        """
        if self.persist:
            self.create_table()
        elif self.virtual:
            self.create_or_replace_view()
        else:
            raise ValueError(f"{self.mode} not allowed")

    def register(self):
        """
        Register the job.

        If `persist` is True, the job's table is registered.
        If `virtual` is True, a view is created or replaced.
        Otherwise, a ValueError is raised.

        Raises:
            ValueError: If `persist` and `virtual` are both False.

        """
        if self.persist:
            self.table.register()
        elif self.virtual:
            self.create_or_replace_view()
        else:
            raise ValueError(f"{self.mode} not allowed")

    def create_or_replace_view(self):
        """
        Creates or replaces a view.

        This method is responsible for creating or replacing a view in the database.
        It should be implemented by subclasses to define the specific logic for creating or replacing the view.

        Raises:
            NotImplementedError: This method is meant to be overridden by subclasses.
        """
        raise NotImplementedError()

    def create_table(self):
        def _create_table(df: DataFrame, batch: Optional[int] = 0):
            df = self.base_transform(df)
            cdc_options = self.get_cdc_context(df)

            cluster_by = []
            partition_by = []

            powerbi = False
            liquid_clustering = False
            partitioning = False
            identity = False

            # first take from job options, then from step options
            job_powerbi = self.options.table.get_boolean("powerbi", None)
            step_powerbi = self.step_conf.get("table_options", {}).get("powerbi", None)
            if job_powerbi is not None:
                powerbi = job_powerbi
            elif step_powerbi is not None:
                powerbi = step_powerbi

            if powerbi:
                properties = {
                    "delta.columnMapping.mode": "name",
                    "delta.minReaderVersion": "2",
                    "delta.minWriterVersion": "5",
                    "fabricks.last_version": "0",
                }
            else:
                properties = {
                    "delta.enableDeletionVectors": "true",
                    "delta.columnMapping.mode": "name",
                    "delta.minReaderVersion": "2",
                    "delta.minWriterVersion": "5",
                    "delta.feature.timestampNtz": "supported",
                    "fabricks.last_version": "0",
                }

            if "__identity" in df.columns:
                identity = False
            else:
                identity = self.options.table.get_boolean("identity", False)

            # first take from job options, then from step options
            liquid_clustering_job = self.options.table.get_boolean("liquid_clustering", None)
            liquid_clustering_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
            if liquid_clustering_job is not None:
                liquid_clustering = liquid_clustering_job
            elif liquid_clustering_step:
                liquid_clustering = liquid_clustering_step

            if liquid_clustering:
                cluster_by = self.options.table.get_list("cluster_by") or []
                if not cluster_by:
                    if "__source" in df.columns:
                        cluster_by.append("__source")
                    if "__is_current" in df.columns:
                        cluster_by.append("__is_current")
                    if "__key" in df.columns:
                        cluster_by.append("__key")
                    elif "__hash" in df.columns:
                        cluster_by.append("__hash")

                if not cluster_by:
                    Logger.warning("liquid clustering disabled (no clustering columns found)", extra={"job": self})
                    liquid_clustering = False
                    cluster_by = None

            if not liquid_clustering:
                cluster_by = None
                partition_by = self.options.table.get_list("partition_by")
                if partition_by:
                    partitioning = True

            if not powerbi:
                # first take from job options, then from step options
                if self.options.table.get_dict("properties"):
                    properties = self.options.table.get_dict("properties")
                elif self.step_conf.get("table_options", {}).get("properties", {}):
                    properties = self.step_conf.get("table_options", {}).get("properties", {})

            # if dataframe, reference is passed (BUG)
            name = f"{self.step}_{self.topic}_{self.item}__init"
            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 ==2 "))
            sql = f"select * from {global_temp_view}"

            self.cdc.create_table(
                sql,
                identity=identity,
                liquid_clustering=liquid_clustering,
                cluster_by=cluster_by,
                partitioning=partitioning,
                partition_by=partition_by,
                properties=properties,
                **cdc_options,
            )

        if not self.table.exists():
            df = self.get_data(self.stream)
            if df:
                if self.stream:
                    # add dummy stream to be sure that the writeStream will start
                    dummy_df = self.spark.readStream.table("fabricks.dummy")
                    # __metadata is always present
                    dummy_df = dummy_df.withColumn("__metadata", lit(None))
                    dummy_df = dummy_df.select("__metadata")

                    df = df.unionByName(dummy_df, allowMissingColumns=True)
                    path = self.paths.checkpoints.append("__init")
                    if path.exists():
                        path.rm()

                    query = (
                        df.writeStream.foreachBatch(_create_table)
                        .option("checkpointLocation", path.string)
                        .trigger(once=True)
                        .start()
                    )
                    query.awaitTermination()
                    path.rm()
                else:
                    _create_table(df)

                constraints = self.options.table.get_dict("constraints")
                if constraints:
                    for key, value in constraints.items():
                        self.table.add_constraint(name=key, expr=value)

                comment = self.options.table.get("comment")
                if comment:
                    self.table.add_comment(comment=comment)

    def _update_schema(self, df: Optional[DataFrame] = None, overwrite: Optional[bool] = False):
        def _update_schema(df: DataFrame, batch: Optional[int] = None):
            if overwrite:
                self.cdc.overwrite_schema(df)
            else:
                self.cdc.update_schema(df)

        if self.persist:
            if df is not None:
                _update_schema(df)
            else:
                df = self.get_data(self.stream)
                assert df is not None
                df = self.base_transform(df)

                if self.stream:
                    path = self.paths.checkpoints.append("__schema")
                    query = (
                        df.writeStream.foreachBatch(_update_schema)
                        .option("checkpointLocation", path.string)
                        .trigger(once=True)
                        .start()
                    )
                    query.awaitTermination()
                    path.rm()
                else:
                    _update_schema(df)

        elif self.virtual:
            self.create_or_replace_view()
        else:
            raise ValueError(f"{self.mode} not allowed")

    def update_schema(self, df: Optional[DataFrame] = None):
        Logger.info("update schema", extra={"job": self})
        self._update_schema(df=df, overwrite=False)

    def overwrite_schema(self, df: Optional[DataFrame] = None):
        Logger.info("overwrite schema", extra={"job": self})
        self._update_schema(df=df, overwrite=True)

    def enable_liquid_clustering(self):
        df = self.table.dataframe
        enable = False

        # first take from job options, then from step options
        enable_job = self.options.table.get_boolean("liquid_clustering", None)
        enable_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
        if enable_job is not None:
            enable = enable_job
        elif enable_step:
            enable = enable_step

        if enable:
            cluster_by = self.options.table.get_list("cluster_by") or []
            if not cluster_by:
                if "__source" in df.columns:
                    cluster_by.append("__source")
                if "__is_current" in df.columns:
                    cluster_by.append("__is_current")
                if "__key" in df.columns:
                    cluster_by.append("__key")
                elif "__hash" in df.columns:
                    cluster_by.append("__hash")

            if len(cluster_by) > 0:
                self.table.enable_liquid_clustering(cluster_by)
            else:
                Logger.warning("liquid clustering not enabled (no clustering column found)", extra={"job": self})

        else:
            Logger.debug("liquid clustering not enabled", extra={"job": self})
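
A minimal usage sketch of the Generator lifecycle added above. It is not part of the package: a concrete job (a bronze, silver or gold job inheriting Generator) would normally be obtained through fabricks.core.jobs.get_job, whose signature is not shown in this diff, so the sketch only assumes an already-constructed job object.

from fabricks.core.jobs.base.generator import Generator


def rebuild(job: Generator) -> None:
    # hypothetical helper, not part of the wheel
    job.drop()                 # drop the cdc table/view, the schema folder and the checkpoints
    job.create()               # recreate a table (persist mode) or a view (virtual mode)
    job.update_dependencies()  # re-register parents discovered from the Spark plan and the job options
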
fabricks/core/jobs/base/invoker.py
@@ -0,0 +1,119 @@
import json
from typing import Optional, overload

from fabricks.context import PATH_RUNTIME
from fabricks.context.log import Logger
from fabricks.core.jobs.base.checker import Checker
from fabricks.core.jobs.base.error import InvokerFailedException
from fabricks.core.schedules import get_schedule
from fabricks.utils.path import Path


class Invoker(Checker):
    def pre_run_invoke(self, schedule: Optional[str] = None):
        self._job_position_invoke(position="pre_run", schedule=schedule)
        self._step_position_invoke(position="pre_run", schedule=schedule)

    def post_run_invoke(self, schedule: Optional[str] = None):
        self._job_position_invoke(position="post_run", schedule=schedule)
        self._step_position_invoke(position="post_run", schedule=schedule)

    def _job_position_invoke(self, position: str, schedule: Optional[str] = None):
        if self.options.invoker.get(position):
            Logger.info(f"{position}-invoke", extra={"job": self})
            try:
                options = self.options.invoker.get_dict(position)
                assert options

                notebook = options.notebook  # type: ignore
                assert notebook, "notebook mandatory"
                path = PATH_RUNTIME.join(notebook)

                arguments = options.arguments or {}  # type: ignore
                timeout = arguments.get("timeout")
                if timeout is None:
                    if position == "pre_run":
                        timeout = self.timeouts.pre_run
                    elif position == "post_run":
                        timeout = self.timeouts.post_run

                self.invoke(path, arguments, timeout, schedule)
            except Exception:
                raise InvokerFailedException(position)

    def _step_position_invoke(self, position: str, schedule: Optional[str] = None):
        if self.step_conf.get("options", {}).get(position, None):
            Logger.info(f"{self.step} - {position}-invoke")
            try:
                options = self.step_conf.get("options", {}).get(position, None)
                assert options

                notebook = options.get("notebook")  # type: ignore
                assert notebook, "notebook mandatory"
                path = PATH_RUNTIME.join(notebook)

                arguments = options.get("arguments", {})  # type: ignore
                timeout = arguments.get("timeout")
                if timeout is None:
                    if position == "pre_run":
                        timeout = self.timeouts.pre_run
                    elif position == "post_run":
                        timeout = self.timeouts.post_run

                self.invoke(path, arguments, timeout, schedule)
            except Exception:
                raise InvokerFailedException(position)

    @overload
    def invoke(self, path: Path, arguments: dict, timeout: Optional[int] = None, schedule: Optional[str] = None): ...

    @overload
    def invoke(self, *, schedule: Optional[str] = None): ...

    def invoke(
        self,
        path: Optional[Path] = None,
        arguments: Optional[dict] = None,
        timeout: Optional[int] = None,
        schedule: Optional[str] = None,
    ):
        """
        Invokes a notebook job.

        Args:
            path (Optional[Path]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
            arguments (Optional[dict]): Additional arguments to pass to the notebook job. If not provided, it will be retrieved from the invoker options.
            timeout (Optional[int]): Timeout for the notebook run. If not provided, the job timeout is used.
            schedule (Optional[str]): The schedule for the job. If provided, schedule variables will be retrieved.

        Raises:
            AssertionError: If the specified path does not exist.

        """
        if path is None:
            notebook = self.options.invoker.get_dict("notebook")
            path = PATH_RUNTIME.join(notebook)
        assert path.exists(), f"{path} not found"

        if arguments is None:
            arguments = self.options.invoker.get_dict("arguments") or {}

        if schedule is not None:
            variables = get_schedule(schedule).select("options.variables").collect()[0][0]
        else:
            variables = {}

        if timeout is None:
            timeout = self.timeouts.job

        self.dbutils.notebook.run(
            path.get_notebook_path(),
            timeout,
            {
                "step": self.step,
                "topic": self.topic,
                "item": self.item,
                **arguments,
                "job_options": json.dumps(self.options.job.options),
                "schedule_variables": json.dumps(variables),
            },
        )