fabricks-3.0.11-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/generator.py

@@ -0,0 +1,447 @@

from abc import abstractmethod
from typing import Optional, Sequence, Union, cast

from pyspark.sql import DataFrame
from pyspark.sql.functions import lit

from fabricks.cdc import NoCDC
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base._types import JobDependency
from fabricks.core.jobs.base.configurator import Configurator
from fabricks.metastore.table import SchemaDiff
from fabricks.metastore.view import create_or_replace_global_temp_view


class Generator(Configurator):
    def update_dependencies(self):
        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})

        deps = self.get_dependencies()
        if deps:
            df = self.spark.createDataFrame([d.model_dump() for d in deps])  # type: ignore
            cdc = NoCDC("fabricks", self.step, "dependencies")
            cdc.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)

    @abstractmethod
    def get_dependencies(self) -> Sequence[JobDependency]: ...

    def rm(self):
        """
        Removes the schema folder and checkpoints associated with the generator.

        If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
        """
        if self.paths.schema.exists():
            DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
            self.paths.schema.rm()
        self.rm_checkpoints()

    def rm_checkpoints(self):
        """
        Removes the checkpoints folder if it exists.

        This method checks if the checkpoints folder exists and deletes it if it does.
        """
        if self.paths.checkpoints.exists():
            DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
            self.paths.checkpoints.rm()

    def rm_commit(self, id: Union[str, int]):
        """
        Remove a commit with the given ID.

        Args:
            id (Union[str, int]): The ID of the commit to remove.

        Returns:
            None
        """
        path = self.paths.commits.joinpath(str(id))
        if path.exists():
            DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
            path.rm()

    def truncate(self):
        """
        Truncates the job by removing all data associated with it.

        This method removes the job from the system and, if the `persist` flag is set to True,
        it also truncates the associated table.

        Returns:
            None
        """
        DEFAULT_LOGGER.warning("truncate", extra={"label": self})
        self.rm()
        if self.persist:
            self.table.truncate()

    def drop(self):
        """
        Drops the current job and its dependencies.

        This method drops the current job and its dependencies by performing the following steps:
        1. Queries the database to check if there are any child jobs associated with the current job.
        2. If child jobs are found, logs a warning message and prints the list of child jobs.
        3. Drops the current job's change data capture (cdc).
        4. Removes the current job.

        Note: This method handles any exceptions that occur during the process.

        Returns:
            None
        """
        if self.options.job.get("no_drop"):
            raise ValueError("no_drop is set, cannot drop the job")

        try:
            row = self.spark.sql(
                f"""
                select
                  count(*) as count,
                  array_join(sort_array(collect_set(j.job)), ', \n') as children
                from
                  fabricks.dependencies d
                  inner join fabricks.jobs j on d.job_id = j.job_id
                where
                  parent like '{self}'
                """
            ).collect()[0]
            if cast(int, row.count) > 0:
                DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"label": self, "content": row.children})

        except Exception:
            pass

        self.cdc.drop()
        self.rm()

    def create(self):
        """
        Creates a table or view based on the specified mode.

        If `persist` is True, it creates a table by calling the `create_table` method.
        If `virtual` is True, it creates or replaces a view by calling the `create_or_replace_view` method.
        If neither `persist` nor `virtual` is True, it raises a ValueError.

        Raises:
            ValueError: If neither `persist` nor `virtual` is True.

        """
        if self.persist:
            self.create_table()
        elif self.virtual:
            self.create_or_replace_view()
        else:
            raise ValueError(f"{self.mode} not allowed")

    def register(self):
        """
        Register the job.

        If `persist` is True, the job's table is registered.
        If `virtual` is True, a view is created or replaced.
        Otherwise, a ValueError is raised.

        Raises:
            ValueError: If `persist` and `virtual` are both False.

        """
        if self.persist:
            self.table.register()
        elif self.virtual:
            self.create_or_replace_view()
        else:
            raise ValueError(f"{self.mode} not allowed")

    def create_or_replace_view(self):
        """
        Creates or replaces a view.

        This method is responsible for creating or replacing a view in the database.
        It should be implemented by subclasses to define the specific logic for creating or replacing the view.

        Raises:
            NotImplementedError: This method is meant to be overridden by subclasses.
        """
        ...

    def create_table(self):
        def _create_table(df: DataFrame, batch: Optional[int] = 0):
            df = self.base_transform(df)
            cdc_options = self.get_cdc_context(df)

            cluster_by = []
            partition_by = []

            powerbi = False
            liquid_clustering = False
            partitioning = False
            identity = False

            # first take from job options, then from step options
            job_powerbi = self.options.table.get_boolean("powerbi", None)
            step_powerbi = self.step_conf.get("table_options", {}).get("powerbi", None)
            if job_powerbi is not None:
                powerbi = job_powerbi
            elif step_powerbi is not None:
                powerbi = step_powerbi

            # first take from job options, then from step options
            job_masks = self.options.table.get("masks", None)
            step_masks = self.step_conf.get("table_options", {}).get("masks", None)
            if job_masks is not None:
                masks = job_masks
            elif step_masks is not None:
                masks = step_masks
            else:
                masks = None

            maximum_compatibility = self.options.table.get_boolean("maximum_compatibility", False)

            if maximum_compatibility:
                default_properties = {
                    "delta.minReaderVersion": "1",
                    "delta.minWriterVersion": "7",
                    "delta.columnMapping.mode": "none",
                }
            elif powerbi:
                default_properties = {
                    "delta.columnMapping.mode": "name",
                    "delta.minReaderVersion": "2",
                    "delta.minWriterVersion": "5",
                }
            else:
                default_properties = {
                    "delta.enableTypeWidening": "true",
                    "delta.enableDeletionVectors": "true",
                    "delta.columnMapping.mode": "name",
                    "delta.minReaderVersion": "2",
                    "delta.minWriterVersion": "5",
                    "delta.feature.timestampNtz": "supported",
                }

            default_properties["fabricks.last_version"] = "0"
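
            # note added in this diff view, not shipped in the wheel: column
            # mapping mode "name" requires Delta protocol reader 2 / writer 5,
            # which is why the powerbi and default profiles pin those versions,
            # while maximum_compatibility disables column mapping so reader-1
            # clients can still open the table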

            if "__identity" in df.columns:
                identity = False
            else:
                identity = self.options.table.get_boolean("identity", False)

            # first take from job options, then from step options
            liquid_clustering_job = self.options.table.get("liquid_clustering", None)
            liquid_clustering_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
            if liquid_clustering_job is not None:
                liquid_clustering = liquid_clustering_job
            elif liquid_clustering_step:
                liquid_clustering = liquid_clustering_step

            if liquid_clustering is not None:
                if liquid_clustering == "auto":
                    liquid_clustering = True
                    cluster_by = []

                else:
                    cluster_by = self.options.table.get_list("cluster_by") or []
                    if not cluster_by:
                        if "__source" in df.columns:
                            cluster_by.append("__source")
                        if "__is_current" in df.columns:
                            cluster_by.append("__is_current")
                        if "__key" in df.columns:
                            cluster_by.append("__key")
                        elif "__hash" in df.columns:
                            cluster_by.append("__hash")

                    if not cluster_by:
                        DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
                        liquid_clustering = False
                        cluster_by = None

            if liquid_clustering is None:
                cluster_by = None
                partition_by = self.options.table.get_list("partition_by")
                if partition_by:
                    partitioning = True

            properties = None
            if not powerbi:
                # first take from job options, then from step options
                if self.options.table.get_dict("properties"):
                    properties = self.options.table.get_dict("properties")
                elif self.step_conf.get("table_options", {}).get("properties", {}):
                    properties = self.step_conf.get("table_options", {}).get("properties", {})

            if properties is None:
                properties = default_properties

            primary_key = self.options.table.get_dict("primary_key")
            foreign_keys = self.options.table.get_dict("foreign_keys")
            comments = self.options.table.get_dict("comments")

            # if dataframe, reference is passed (BUG)
            name = f"{self.step}_{self.topic}_{self.item}__init"
            global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"), job=self)
            sql = f"select * from {global_temp_view}"

            self.cdc.create_table(
                sql,
                identity=identity,
                liquid_clustering=liquid_clustering,
                cluster_by=cluster_by,
                partitioning=partitioning,
                partition_by=partition_by,
                properties=properties,
                masks=masks,
                primary_key=primary_key,
                foreign_keys=foreign_keys,
                comments=comments,
                **cdc_options,
            )

        if not self.table.exists():
            DEFAULT_LOGGER.debug("create table", extra={"label": self})

            df = self.get_data(stream=self.stream, schema_only=True)
            if df:
                if self.stream:
                    # add dummy stream to be sure that the writeStream will start
                    spark = df.sparkSession

                    dummy_df = spark.readStream.table("fabricks.dummy")
                    # __metadata is always present
                    dummy_df = dummy_df.withColumn("__metadata", lit(None))
                    dummy_df = dummy_df.select("__metadata")

                    df = df.unionByName(dummy_df, allowMissingColumns=True)
                    path = self.paths.checkpoints.append("__init")
                    if path.exists():
                        path.rm()

                    query = (
                        df.writeStream.foreachBatch(_create_table)
                        .option("checkpointLocation", path.string)
                        .trigger(once=True)
                        .start()
                    )
                    query.awaitTermination()
                    path.rm()
                else:
                    _create_table(df)

            constraints = self.options.table.get_dict("constraints")
            if constraints:
                for key, value in constraints.items():
                    self.table.add_constraint(name=key, expr=value)

            comment = self.options.table.get("comment")
            if comment:
                self.table.add_comment(comment=comment)

        else:
            DEFAULT_LOGGER.debug("table exists, skip creation", extra={"label": self})

    def _update_schema(
        self,
        df: Optional[DataFrame] = None,
        overwrite: Optional[bool] = False,
        widen_types: Optional[bool] = False,
    ):
        def _update_schema(df: DataFrame, batch: Optional[int] = None):
            context = self.get_cdc_context(df, reload=True)
            if overwrite:
                self.cdc.overwrite_schema(df, **context)
            else:
                self.cdc.update_schema(df, widen_types=widen_types, **context)

        if self.persist:
            if df is not None:
                _update_schema(df)

            else:
                df = self.get_data(stream=self.stream, schema_only=True)
                assert df is not None
                df = self.base_transform(df)

                if self.stream:
                    path = self.paths.checkpoints.append("__schema")
                    query = (
                        df.writeStream.foreachBatch(_update_schema)
                        .option("checkpointLocation", path.string)
                        .trigger(once=True)
                        .start()
                    )
                    query.awaitTermination()
                    path.rm()

                else:
                    _update_schema(df)

        elif self.virtual:
            self.create_or_replace_view()

        else:
            raise ValueError(f"{self.mode} not allowed")

    def update_schema(self, df: Optional[DataFrame] = None, widen_types: Optional[bool] = False):
        self._update_schema(df=df, overwrite=False, widen_types=widen_types)

    def overwrite_schema(self, df: Optional[DataFrame] = None):
        self._update_schema(df=df, overwrite=True)

    def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
        if df is None:
            df = self.get_data(stream=self.stream)
            assert df is not None
            df = self.base_transform(df)

        context = self.get_cdc_context(df, reload=True)

        return self.cdc.get_differences_with_deltatable(df, **context)

    def get_schema_differences(self, df: Optional[DataFrame] = None) -> Optional[Sequence[SchemaDiff]]:
        if df is None:
            df = self.get_data(stream=self.stream)
            assert df is not None
            df = self.base_transform(df)

        context = self.get_cdc_context(df, reload=True)

        return self.cdc.get_schema_differences(df, **context)

    def schema_drifted(self, df: Optional[DataFrame] = None) -> Optional[bool]:
        d = self.get_schema_differences(df)
        if d is None:
            return None
        return len(d) > 0

    def enable_liquid_clustering(self):
        df = self.table.dataframe
        enable = False

        # first take from job options, then from step options
        enable_job = self.options.table.get_boolean("liquid_clustering", None)
        enable_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
        if enable_job is not None:
            enable = enable_job
        elif enable_step:
            enable = enable_step

        if enable:
            cluster_by = self.options.table.get_list("cluster_by") or []
            if not cluster_by:
                if "__source" in df.columns:
                    cluster_by.append("__source")
                if "__is_current" in df.columns:
                    cluster_by.append("__is_current")
                if "__key" in df.columns:
                    cluster_by.append("__key")
                elif "__hash" in df.columns:
                    cluster_by.append("__hash")

            if len(cluster_by) > 0:
                self.table.enable_liquid_clustering(cluster_by, auto=False)
            else:
                self.table.enable_liquid_clustering(auto=True)
        else:
            DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
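
Taken together, create, update_schema/overwrite_schema, schema_drifted, and enable_liquid_clustering form the table lifecycle of a persisted job. A minimal usage sketch follows: the get_job module does ship in the wheel (fabricks/core/jobs/get_job.py), but its exact signature and the step/topic/item values below are assumptions for illustration, not taken from this diff.

# Hypothetical sketch, not part of the wheel.
from fabricks.core.jobs.get_job import get_job  # module exists in the wheel; signature assumed

job = get_job(step="silver", topic="sales", item="orders")  # assumed arguments

job.create()  # persist -> create_table(), virtual -> create_or_replace_view()

if job.schema_drifted():  # any difference between the source schema and the delta table
    job.update_schema(widen_types=True)

job.enable_liquid_clustering()  # cluster_by option, else __source/__is_current/__key/__hash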
fabricks/core/jobs/base/invoker.py

@@ -0,0 +1,206 @@

import json
from typing import Optional

from pyspark.sql import DataFrame

from fabricks.context import PATH_RUNTIME
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base.checker import Checker
from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
from fabricks.core.jobs.get_schedule import get_schedule
from fabricks.utils.path import Path


class Invoker(Checker):
    def invoke(self, schedule: Optional[str] = None, **kwargs):
        return self._invoke_job(
            position="run",
            schedule=schedule,
            **kwargs,
        )  # kwargs and return needed for get_data in gold

    def invoke_pre_run(self, schedule: Optional[str] = None):
        self._invoke_job(position="pre_run", schedule=schedule)
        self._invoke_step(position="pre_run", schedule=schedule)

    def invoke_post_run(self, schedule: Optional[str] = None):
        self._invoke_job(position="post_run", schedule=schedule)
        self._invoke_step(position="post_run", schedule=schedule)

    def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
        invokers = self.options.invokers.get_list(position)
        if position == "run":
            invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options

        errors = []

        if invokers:
            for i, invoker in enumerate(invokers):
                DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
                try:
                    path = kwargs.get("path")
                    if path is None:
                        notebook = invoker.get("notebook")
                        assert notebook, "notebook mandatory"
                        path = PATH_RUNTIME.joinpath(notebook)

                    assert path is not None, "path mandatory"

                    arguments = invoker.get("arguments") or {}
                    timeout = invoker.get("timeout")

                    schema_only = kwargs.get("schema_only")
                    if schema_only is not None:
                        arguments["schema_only"] = schema_only

                    if len(invokers) == 1 and position == "run":
                        return self._run_notebook(
                            path=path,
                            arguments=arguments,
                            timeout=timeout,
                            schedule=schedule,
                        )
                    else:
                        self._run_notebook(
                            path=path,
                            arguments=arguments,
                            timeout=timeout,
                            schedule=schedule,
                        )

                except Exception as e:
                    DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})

                    if position == "pre_run":
                        errors.append(PreRunInvokeException(e))
                    elif position == "post_run":
                        errors.append(PostRunInvokeException(e))
                    else:
                        errors.append(e)

        if errors:
            raise Exception(errors)

    def _invoke_step(self, position: str, schedule: Optional[str] = None):
        invokers = self.step_conf.get("invoker_options", {}).get(position, [])

        errors = []

        if invokers:
            for i, invoker in enumerate(invokers):
                DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
                try:
                    notebook = invoker.get("notebook")
                    assert notebook, "notebook mandatory"
                    path = PATH_RUNTIME.joinpath(notebook)

                    arguments = invoker.get("arguments", {})
                    timeout = invoker.get("timeout")

                    self._run_notebook(
                        path=path,
                        arguments=arguments,
                        timeout=timeout,
                        schedule=schedule,
                    )

                except Exception as e:
                    DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})

                    if position == "pre_run":
                        errors.append(PreRunInvokeException(e))
                    elif position == "post_run":
                        errors.append(PostRunInvokeException(e))
                    else:
                        errors.append(e)

        if errors:
            raise Exception(errors)

    def _run_notebook(
        self,
        path: Path,
        arguments: Optional[dict] = None,
        timeout: Optional[int] = None,
        schedule: Optional[str] = None,
    ):
        """
        Invokes a notebook job.

        Args:
            path (Optional[Path]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
            arguments (Optional[dict]): Additional arguments to pass to the notebook job. If not provided, it will be retrieved from the invoker options.
            schedule (Optional[str]): The schedule for the job. If provided, schedule variables will be retrieved.

        Raises:
            AssertionError: If the specified path does not exist.

        """
        from databricks.sdk.runtime import dbutils

        for file_format in [None, ".py", ".ipynb"]:
            path_with_file_format = path.append(file_format) if file_format else path
            if path_with_file_format.exists():
                path = path_with_file_format
                break

        if timeout is None:
            timeout = self.timeout

        assert timeout is not None

        variables = None
        if schedule is not None:
            variables = get_schedule(name=schedule).get("options", {}).get("variables", {})

        if variables is None:
            variables = {}

        if arguments is None:
            arguments = {}

        return dbutils.notebook.run(
            path=path.get_notebook_path(),  # type: ignore
            timeout_seconds=timeout,  # type: ignore
            arguments={  # type: ignore
                "step": self.step,
                "topic": self.topic,
                "item": self.item,
                **arguments,
                "job_options": json.dumps(self.options.job.options),
                "schedule_variables": json.dumps(variables),
            },
        )

    def extend_job(self, df: DataFrame) -> DataFrame:
        from fabricks.core.extenders import get_extender

        extenders = self.options.extenders
        for e in extenders:
            name = e.get("extender")
            DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
            arguments = e.get("arguments") or {}

            extender = get_extender(name)
            df = extender(df, **arguments)

        return df

    def extend_step(self, df: DataFrame) -> DataFrame:
        from fabricks.core.extenders import get_extender

        extenders = self.step_conf.get("extender_options", {})
        for e in extenders:
            name = e.get("extender")
            DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
            arguments = e.get("arguments", {})

            extender = get_extender(name)
            df = extender(df, **arguments)

        return df

    def extend(self, df: DataFrame) -> DataFrame:
        df = self.extend_job(df)
        df = self.extend_step(df)
        return df
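
Both _invoke_job and _invoke_step iterate invoker entries per position (pre_run, run, post_run), and _run_notebook always injects the job coordinates plus JSON-encoded job options and schedule variables into the notebook arguments. Below is a sketch of the entry shape those methods read; the keys are the ones the code above actually uses, while the notebook path and values are invented for illustration.

# Hypothetical invoker entry, not taken from the wheel.
invoker = {
    "notebook": "invokers/refresh_source",  # resolved against PATH_RUNTIME; .py/.ipynb suffixes probed
    "arguments": {"full_refresh": "false"},  # merged with step/topic/item before dbutils.notebook.run
    "timeout": 3600,  # seconds; falls back to self.timeout when omitted
}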