fabricks 2024.7.1.5__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/bronze.py
@@ -0,0 +1,333 @@
from typing import Optional, cast

from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import expr, lit, md5

from fabricks.cdc.nocdc import NoCDC
from fabricks.context import VARIABLES
from fabricks.context.log import Logger
from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.jobs.base.types import TBronze
from fabricks.core.parsers import BaseParser
from fabricks.core.parsers.get_parser import get_parser
from fabricks.core.utils import clean
from fabricks.metastore.view import create_or_replace_global_temp_view
from fabricks.utils.helpers import concat_ws
from fabricks.utils.path import Path
from fabricks.utils.read import read


class Bronze(BaseJob):
    def __init__(
        self, step: TBronze, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
    ):  # type: ignore
        super().__init__(
            "bronze",
            step=step,
            topic=topic,
            item=item,
            job_id=job_id,
        )

    _parser: Optional[BaseParser] = None

    @property
    def stream(self) -> bool:
        return self.mode not in ["register"]

    @property
    def schema_drift(self) -> bool:
        return True

    @property
    def persist(self) -> bool:
        return self.mode in ["append", "register"]

    @property
    def virtual(self) -> bool:
        return False

    @classmethod
    def from_job_id(cls, step: str, job_id: str):
        return cls(step=cast(TBronze, step), job_id=job_id)

    @classmethod
    def from_step_topic_item(cls, step: str, topic: str, item: str):
        return cls(step=cast(TBronze, step), topic=topic, item=item)

    @property
    def data_path(self) -> Path:
        uri = self.options.job.get("uri")
        assert uri is not None, "no uri provided in options"
        path = Path.from_uri(uri, regex=VARIABLES)
        return path

    def get_dependencies(self, df: Optional[DataFrame] = None) -> Optional[DataFrame]:
        dependencies = []
        parents = self.options.job.get_list("parents")
        if parents:
            for p in parents:
                dependencies.append(Row(self.job_id, p, "job"))
        if dependencies:
            df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
            df = df.transform(self.add_dependency_details)
            return df

    def register_external_table(self):
        options = self.conf.parser_options  # type: ignore
        if options:
            file_format = options.get("file_format")
        else:
            file_format = "delta"

        Logger.debug(f"register external table ({self.data_path})", extra={"job": self})
        self.spark.sql(
            f"create table if not exists {self.qualified_name} using {file_format} location '{self.data_path}'"
        )

    def drop_external_table(self):
        Logger.debug("drop external table", extra={"job": self})
        self.spark.sql(f"drop table if exists {self.qualified_name}")

    def optimize_external_table(
        self,
        vacuum: Optional[bool] = True,
        analyze: Optional[bool] = True,
    ):
        Logger.debug("optimize external table", extra={"job": self})
        if vacuum:
            from delta import DeltaTable

            dt = DeltaTable.forPath(self.spark, self.data_path.string)
            retention_days = 7
            Logger.debug(f"{self.data_path} - vacuum table (removing files older than {retention_days} days)")
            try:
                self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
                dt.vacuum(retention_days * 24)
            finally:
                self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")

        if analyze:
            Logger.debug(f"{self.data_path} - compute delta statistics")
            self.spark.sql(f"analyze table delta.`{self.data_path}` compute delta statistics")

    @property
    def parser(self) -> BaseParser:
        if not self._parser:
            assert self.mode not in ["register"], f"{self.mode} not allowed"
            name = self.options.job.get("parser")
            assert name is not None, "parser not found"
            options = self.conf.parser_options or None  # type: ignore
            p = get_parser(name, options)
            self._parser = p
        return self._parser

    def parse(self, stream: bool = False) -> DataFrame:
        """
        Parses the data based on the specified mode and returns a DataFrame.

        Args:
            stream (bool, optional): Indicates whether the data should be read as a stream. Defaults to False.

        Returns:
            DataFrame: The parsed data as a DataFrame.
        """
        if self.mode == "register":
            if stream:
                df = read(
                    stream=stream,
                    path=self.data_path,
                    file_format="delta",
                    # spark=self.spark, (BUG)
                )
            else:
                df = self.spark.sql(f"select * from {self}")
            # cleaning done in parser
            df = clean(df)
        else:
            df = self.parser.get_data(
                stream=stream,
                data_path=self.data_path,
                schema_path=self.paths.schema,
                spark=self.spark,
            )
        return df

    def get_data(self, stream: bool = False, transform: bool = False) -> DataFrame:
        df = self.parse(stream)
        df = self.filter_where(df)
        df = self.encrypt(df)
        if transform:
            df = self.base_transform(df)
        return df

    def add_calculated_columns(self, df: DataFrame) -> DataFrame:
        calculated_columns = self.options.job.get_dict("calculated_columns")
        if calculated_columns:
            for key, value in calculated_columns.items():
                Logger.debug(f"add calculated column ({key} -> {value})", extra={"job": self})
                df = df.withColumn(key, expr(f"{value}"))
        return df

    def add_hash(self, df: DataFrame) -> DataFrame:
        if "__hash" not in df.columns:
            fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
            Logger.debug("add hash", extra={"job": self})
            if "__operation" in df.columns:
                fields += ["__operation == 'delete'"]
            if "__source" in df.columns:
                fields += ["__source"]
            df = df.withColumn("__hash", md5(expr(f"{concat_ws(fields)}")))
        return df

    def add_key(self, df: DataFrame) -> DataFrame:
        if "__key" not in df.columns:
            fields = self.options.job.get_list("keys")
            if fields:
                Logger.debug(f"add key ({', '.join(fields)})", extra={"job": self})
                if "__source" in df.columns:
                    fields = fields + ["__source"]
                fields = [f"`{f}`" for f in fields]
                df = df.withColumn("__key", md5(expr(f"{concat_ws(fields)}")))
        return df

    def add_source(self, df: DataFrame) -> DataFrame:
        if "__source" not in df.columns:
            source = self.options.job.get("source")
            if source:
                Logger.debug(f"add source ({source})", extra={"job": self})
                df = df.withColumn("__source", lit(source))
        return df

    def add_operation(self, df: DataFrame) -> DataFrame:
        if "__operation" not in df.columns:
            operation = self.options.job.get("operation")
            if operation:
                Logger.debug(f"add operation ({operation})", extra={"job": self})
                df = df.withColumn("__operation", lit(operation))
            else:
                df = df.withColumn("__operation", lit("upsert"))
        return df

    def base_transform(self, df: DataFrame) -> DataFrame:
        df = df.transform(self.extender)
        df = df.transform(self.add_calculated_columns)
        df = df.transform(self.add_hash)
        df = df.transform(self.add_operation)
        df = df.transform(self.add_source)
        df = df.transform(self.add_key)

        if "__metadata" in df.columns:
            if self.mode == "register":
                # https://github.com/delta-io/delta/issues/2014 (BUG)
                df = df.withColumn(
                    "__metadata",
                    expr(
                        f"""
                        struct(
                            concat_ws('/', '{self.data_path}', __timestamp, __operation) as file_path,
                            __metadata.file_name as file_name,
                            __metadata.file_size as file_size,
                            __metadata.file_modification_time as file_modification_time,
                            cast(current_date() as timestamp) as inserted
                        )
                        """
                    ),
                )
            else:
                df = df.withColumn(
                    "__metadata",
                    expr(
                        """
                        struct(
                            __metadata.file_path as file_path,
                            __metadata.file_name as file_name,
                            __metadata.file_size as file_size,
                            __metadata.file_modification_time as file_modification_time,
                            cast(current_date() as timestamp) as inserted
                        )
                        """
                    ),
                )
        return df

    def create_or_replace_view(self):
        Logger.warning("create or replace view not allowed", extra={"job": self})

    def overwrite_schema(self):
        Logger.warning("schema overwrite not allowed", extra={"job": self})

    def get_cdc_context(self, df: DataFrame) -> dict:
        return {}

    def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
        assert self.persist, f"{self.mode} not allowed"

        context = self.get_cdc_context(df)

        # if dataframe, reference is passed (BUG)
        name = f"{self.step}_{self.topic}_{self.item}__{batch}"
        global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
        sql = f"select * from {global_temp_view}"

        assert isinstance(self.cdc, NoCDC)
        if self.mode == "append":
            self.cdc.append(sql, **context)

    def for_each_run(self, schedule: Optional[str] = None):
        if self.mode == "register":
            Logger.info("register (no run)", extra={"job": self})
        elif self.mode == "memory":
            Logger.info("memory (no run)", extra={"job": self})
        else:
            super().for_each_run(schedule=schedule)

    def create(self):
        if self.mode == "register":
            self.register_external_table()
        elif self.mode == "memory":
            Logger.info("memory (no table nor view)", extra={"job": self})
        else:
            super().create()

    def register(self):
        if self.mode == "register":
            self.register_external_table()
        elif self.mode == "memory":
            Logger.info("memory (no table nor view)", extra={"job": self})
        else:
            super().register()

    def truncate(self):
        if self.mode == "register":
            Logger.info("register (no truncate)", extra={"job": self})
        else:
            super().truncate()

    def restore(self):
        if self.mode == "register":
            Logger.info("register (no restore)", extra={"job": self})
        else:
            super().restore()

    def drop(self):
        if self.mode == "register":
            self.drop_external_table()
        super().drop()

    def optimize(
        self,
        vacuum: Optional[bool] = True,
        optimize: Optional[bool] = True,
        analyze: Optional[bool] = True,
    ):
        if self.mode == "memory":
            Logger.info("memory (no optimize)", extra={"job": self})
        elif self.mode == "register":
            self.optimize_external_table(vacuum, analyze)
        else:
            super().optimize()

    def overwrite(self):
        self.truncate()
        self.run()
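The Bronze job above resolves its raw data location from the job options, parses it with the configured parser, and appends the framework's audit columns (__hash, __key, __source, __operation, __metadata) in base_transform. A minimal usage sketch, assuming a deployed Fabricks runtime with a bronze job declared in the runtime YAML; the step, topic and item names below are illustrative, not part of the package:

    from fabricks.core.jobs.bronze import Bronze

    # hypothetical job declared as topic "sales", item "orders" in the "bronze" step
    job = Bronze.from_step_topic_item(step="bronze", topic="sales", item="orders")
    df = job.get_data(stream=False, transform=True)  # parsed data plus __hash, __key, __source, __operation
    df.printSchema()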
fabricks/core/jobs/get_job.py
@@ -0,0 +1,126 @@
from typing import Optional, cast, overload

from pyspark.sql import Row

from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers, TBronze, TGold, TSilver
from fabricks.core.jobs.get_job_id import get_job_id


@overload
def get_job(step: str, *, job_id: str) -> BaseJob: ...


@overload
def get_job(step: str, *, topic: str, item: str) -> BaseJob: ...


@overload
def get_job(*, row: Row) -> BaseJob: ...


@overload
def get_job(*, job: str) -> BaseJob: ...


def get_job(
    step: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job_id: Optional[str] = None,
    job: Optional[str] = None,
    row: Optional[Row] = None,
) -> BaseJob:
    """
    Retrieve a job based on the provided parameters.

    Args:
        step (Optional[str]): The step of the job.
        topic (Optional[str]): The topic of the job.
        item (Optional[str]): The item of the job.
        job_id (Optional[str]): The ID of the job.
        job (Optional[str]): The job string.
        row (Optional[Row]): The row object containing job information.

    Returns:
        BaseJob: The retrieved job.

    Raises:
        ValueError: If the required parameters are not provided.

    """
    if row:
        if "step" in row and "topic" in row and "item" in row:
            j = _get_job(step=row.step, topic=row.topic, item=row.item)
        elif "step" in row and "job_id" in row:
            j = get_job(step=row.step, job_id=row.job_id)
        elif "job" in row:
            parts = row.job.split(".")
            s = parts[0]
            job_id = get_job_id(job=row.job)
            j = _get_job(step=s, job_id=job_id)
        else:
            raise ValueError("step, topic, item or step, job_id or job mandatory")

    elif job:
        parts = job.split(".")
        s = parts[0]
        job_id = get_job_id(job=job)
        j = _get_job(step=s, job_id=job_id)

    elif job_id:
        assert step, "step mandatory"
        j = _get_job(step=step, job_id=job_id)

    else:
        assert step, "step mandatory"
        assert topic, "topic mandatory"
        assert item, "item mandatory"
        j = _get_job(step=step, topic=topic, item=item)

    return j


def _get_job(
    step: str,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job_id: Optional[str] = None,
):
    if step in Bronzes:
        from fabricks.core.jobs.bronze import Bronze

        step = cast(TBronze, step)
        if job_id is not None:
            job = Bronze.from_job_id(step=step, job_id=job_id)
        else:
            assert topic
            assert item
            job = Bronze.from_step_topic_item(step=step, topic=topic, item=item)

    elif step in Silvers:
        from fabricks.core.jobs.silver import Silver

        step = cast(TSilver, step)
        if job_id is not None:
            job = Silver.from_job_id(step=step, job_id=job_id)
        else:
            assert topic
            assert item
            job = Silver.from_step_topic_item(step=step, topic=topic, item=item)

    elif step in Golds:
        from fabricks.core.jobs.gold import Gold

        step = cast(TGold, step)
        if job_id is not None:
            job = Gold.from_job_id(step=step, job_id=job_id)
        else:
            assert topic
            assert item
            job = Gold.from_step_topic_item(step=step, topic=topic, item=item)

    else:
        raise ValueError(f"{step} not found")

    return job
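get_job dispatches on whichever identifier is supplied: (step, topic, item), (step, job_id), a dotted job string, or a Row exposing those columns, and then delegates to the step-specific class (Bronze, Silver or Gold). A sketch of the equivalent call forms, assuming a silver job named sales/orders exists in the runtime (illustrative names):

    from fabricks.core.jobs.get_job import get_job
    from fabricks.core.jobs.get_job_id import get_job_id

    j1 = get_job(step="silver", topic="sales", item="orders")
    j2 = get_job(job="silver.sales_orders")  # step parsed from the prefix, job_id derived via get_job_id
    j3 = get_job(step="silver", job_id=get_job_id(job="silver.sales_orders"))
    # all three resolve to the same Silver job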
fabricks/core/jobs/get_job_conf.py
@@ -0,0 +1,115 @@
from typing import Optional, cast, overload

from databricks.sdk.runtime import spark
from pyspark.sql import Row

from fabricks.context import IS_LIVE
from fabricks.core.jobs.base.types import Bronzes, Golds, JobConf, Silvers, TBronze, TGold, TSilver, TStep


@overload
def get_job_conf(step: TStep, *, job_id: str) -> JobConf: ...


@overload
def get_job_conf(step: TStep, *, topic: str, item: str) -> JobConf: ...


def _get_job_conf(step: TStep, row: Row) -> JobConf:
    options = row["options"].asDict() if row["options"] else None
    table_options = row["table_options"].asDict() if row["table_options"] else None
    check_options = row["check_options"].asDict() if row["check_options"] else None
    spark_options = row["spark_options"].asDict() if row["spark_options"] else None
    invoker_options = row["invoker_options"].asDict() if row["invoker_options"] else None

    if step in Bronzes:
        from fabricks.core.jobs.base.types import JobConfBronze

        assert options is not None, "no option"
        parser_options = row["parser_options"].asDict() if row["parser_options"] else None
        step = cast(TBronze, step)
        return JobConfBronze(
            job_id=row["job_id"],
            topic=row["topic"],
            item=row["item"],
            step=step,
            options=options,
            parser_options=parser_options,
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            spark_options=spark_options,
            tags=row["tags"],
        )

    elif step in Silvers:
        from fabricks.core.jobs.base.types import JobConfSilver

        assert options is not None, "no option"
        step = cast(TSilver, step)
        return JobConfSilver(
            job_id=row["job_id"],
            topic=row["topic"],
            item=row["item"],
            step=step,
            options=options,
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            spark_options=spark_options,
            tags=row["tags"],
        )

    elif step in Golds:
        from fabricks.core.jobs.base.types import JobConfGold

        assert options is not None, "no option"
        step = cast(TGold, step)
        return JobConfGold(
            job_id=row["job_id"],
            topic=row["topic"],
            item=row["item"],
            step=step,
            options=options,
            table_options=table_options,
            check_options=check_options,
            invoker_options=invoker_options,
            spark_options=spark_options,
            tags=row["tags"],
        )

    else:
        raise ValueError(f"{step} not found")


def get_job_conf(
    step: TStep,
    job_id: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
) -> JobConf:
    if IS_LIVE:
        from fabricks.core.steps import get_step

        s = get_step(step=step)
        if topic:
            df = s.get_jobs(topic=topic)
        else:
            df = s.get_jobs()
    else:
        df = spark.sql(f"select * from fabricks.{step}_jobs")

    assert df, f"{step} not found"

    if job_id:
        try:
            row = df.where(f"job_id == '{job_id}'").collect()[0]
        except IndexError:
            raise ValueError(f"job not found ({step}, {job_id})")
    else:
        try:
            row = df.where(f"topic == '{topic}' and item == '{item}'").collect()[0]
        except IndexError:
            raise ValueError(f"job not found ({step}, {topic}, {item})")

    return _get_job_conf(step=step, row=row)
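get_job_conf reads the job row either from the step's YAML definitions (when IS_LIVE) or from the fabricks.{step}_jobs table, then wraps it in the matching JobConf dataclass (JobConfBronze, JobConfSilver or JobConfGold). A hedged sketch, with placeholder names for step, topic and item:

    from fabricks.core.jobs.get_job_conf import get_job_conf

    conf = get_job_conf("bronze", topic="sales", item="orders")  # returns a JobConfBronze
    print(conf.job_id, conf.options)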
fabricks/core/jobs/get_job_id.py
@@ -0,0 +1,26 @@
from typing import Optional, overload

from fabricks.utils.helpers import md5


@overload
def get_job_id(step: str, topic: str, item: str) -> str: ...


@overload
def get_job_id(*, job: str) -> str: ...


def get_job_id(
    step: Optional[str] = None,
    topic: Optional[str] = None,
    item: Optional[str] = None,
    job: Optional[str] = None,
) -> str:
    if not job:
        assert step
        assert topic
        assert item
        job = f"{step}.{topic}_{item}"

    return md5(job)
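Both get_job_id call forms normalise to the same dotted string before hashing, so a job keeps one stable identifier whether it is referenced by name or by its parts. A quick illustration (the topic and item names are made up):

    from fabricks.core.jobs.get_job_id import get_job_id

    a = get_job_id(step="bronze", topic="sales", item="orders")
    b = get_job_id(job="bronze.sales_orders")
    assert a == b  # both hash the string "bronze.sales_orders"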
fabricks/core/jobs/get_jobs.py
@@ -0,0 +1,89 @@
from dataclasses import dataclass
from typing import List, Optional, TypedDict, Union

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import expr

from fabricks.context import IS_LIVE, PATHS_RUNTIME
from fabricks.core.jobs.base.job import BaseJob
from fabricks.core.jobs.base.types import Modes, TStep
from fabricks.core.jobs.get_job import get_job
from fabricks.utils.helpers import concat_dfs, run_in_parallel
from fabricks.utils.path import Path
from fabricks.utils.read import read_yaml
from fabricks.utils.schema import get_schema_for_type


class GenericOptions(TypedDict):
    mode: Modes


@dataclass
class JobConfGeneric:
    step: TStep
    job_id: str
    topic: str
    item: str
    options: GenericOptions


def _get_job(row: Row):
    return get_job(row=row)


def _get_jobs() -> DataFrame:
    if IS_LIVE:
        schema = get_schema_for_type(JobConfGeneric)

        def _read_yaml(path: Path):
            df = read_yaml(path, root="job", schema=schema)
            if df:
                df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
            return df

        dfs = run_in_parallel(_read_yaml, list(PATHS_RUNTIME.values()))
        df = concat_dfs(dfs)

    else:
        df = spark.sql("select * from fabricks.jobs")

    return df


def get_jobs(df: Optional[DataFrame] = None, convert: Optional[bool] = False) -> Union[List[BaseJob], DataFrame]:
    """
    Retrieves a list of jobs or a DataFrame containing job information.

    Args:
        df (Optional[DataFrame]): Optional DataFrame containing job information.
        convert (Optional[bool]): Flag indicating whether to convert the DataFrame to a list of jobs.

    Returns:
        Union[List[BaseJob], DataFrame]: If `convert` is False, returns a DataFrame with the job information.
            If `convert` is True, returns a list of BaseJob objects built from the selected columns.

    Raises:
        ValueError: If the DataFrame does not contain the required columns.

    """
    if not convert:
        return _get_jobs()

    else:
        if df is None:
            df = _get_jobs()
        else:
            if "step" in df.columns and "topic" in df.columns and "item" in df.columns:
                df = df.select("step", "topic", "item")
            elif "step" in df.columns and "job_id" in df.columns:
                df = df.select("step", "job_id")
            elif "job" in df.columns:
                df = df.select("job")
            else:
                raise ValueError("step, topic, item or step, job_id or job mandatory")

        assert df

        jobs = run_in_parallel(_get_job, df)
        return jobs
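get_jobs returns the raw jobs DataFrame by default and only materialises BaseJob instances when convert=True; the optional df argument lets a caller pre-filter which jobs get instantiated. A sketch, assuming a deployed runtime (the "gold" filter is illustrative):

    from fabricks.core.jobs.get_jobs import get_jobs

    df = get_jobs()  # DataFrame of every declared job
    gold_jobs = get_jobs(df=df.where("step == 'gold'"), convert=True)  # list of BaseJob for the gold step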