fabricks 3.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/deploy/views.py
ADDED
@@ -0,0 +1,509 @@
from fabricks.context import SPARK
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.core.jobs.base._types import Steps
from fabricks.utils.sqlglot import fix as fix_sql


def deploy_views():
    DEFAULT_LOGGER.info("create or replace fabricks (default) views")

    create_or_replace_jobs_view()
    create_or_replace_tables_view()
    create_or_replace_views_view()
    create_or_replace_logs_pivot_view()
    create_or_replace_last_schedule_view()
    create_or_replace_last_status_view()
    create_or_replace_previous_schedule_view()
    create_or_replace_schedules_view()
    create_or_replace_dependencies_view()
    create_or_replace_dependencies_flat_view()
    create_or_replace_dependencies_unpivot_view()
    create_or_replace_dependencies_circular_view()
    create_or_replace_jobs_to_be_updated_view()


def create_or_replace_jobs_view():
    dmls = []

    for step in Steps:
        table = f"{step}_jobs"
        try:
            try:
                # probe for the optional change_data_capture column; fall back to a constant when it is missing
                SPARK.sql(f"select options.change_data_capture from fabricks.{table}")
                change_data_capture = "coalesce(options.change_data_capture, 'nocdc') as change_data_capture"
            except Exception:
                change_data_capture = "'nocdc' as change_data_capture"

            dml = f"""
            select
              j.step,
              s.expand,
              j.job_id,
              j.topic,
              j.item,
              concat(j.step, '.', j.topic, '_', j.item) as job,
              j.options.mode,
              {change_data_capture},
              coalesce(j.options.type, 'default') as type,
              tags,
              case
                when s.expand == "bronze" then if(j.options.mode in ("append", "register"), "table", null)
                when s.expand == "silver" then
                  if(
                    j.options.mode in ("update", "append", "latest"),
                    "table",
                    if(j.options.mode in ("combine", "memory"), "view", null)
                  )
                when s.expand == "gold" then
                  if(j.options.mode in ("update", "append", "complete"), "table", if(j.options.mode in ("memory"), "view", null))
              end as object_type
            from
              fabricks.{table} j
              left join fabricks.steps s on s.step = j.step
            """
            SPARK.sql(dml)  # Check if the table exists
            dmls.append(dml)

        except Exception:
            DEFAULT_LOGGER.debug(f"could not find fabricks.{table}")

    sql = f"""create or replace view fabricks.jobs with schema evolution as {" union all ".join(dmls)}"""
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.jobs", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_tables_view():
    dmls = []

    for step in Steps:
        table = f"{step}_tables"
        try:
            dml = f"""
            select
              '{step}' as step,
              job_id,
              table
            from
              fabricks.{table}
            """
            SPARK.sql(dml)  # Check if the table exists
            dmls.append(dml)

        except Exception:
            DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_tables")

    sql = f"""create or replace view fabricks.tables with schema evolution as {" union all ".join(dmls)}"""
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.tables", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_views_view():
    dmls = []

    for step in Steps:
        table = f"{step}_views"
        try:
            dml = f"""
            select
              '{step}' as step,
              job_id,
              view
            from
              fabricks.{table}
            """
            SPARK.sql(dml)  # Check if the table exists
            dmls.append(dml)

        except Exception:
            DEFAULT_LOGGER.debug(f"could not find fabricks.{step}_views")

    sql = f"""create or replace view fabricks.views with schema evolution as {" union all ".join(dmls)}"""
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.views", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_dependencies_view():
    dmls = []

    for step in Steps:
        table = f"{step}_dependencies"
        try:
            dml = f"""
            select
              '{step}' as step,
              dependency_id,
              job_id,
              parent_id,
              parent,
              origin
            from
              fabricks.{table} d
            """
            SPARK.sql(dml)  # Check if the table exists
            dmls.append(dml)

        except Exception:
            DEFAULT_LOGGER.debug(f"could not find fabricks.{table}")

    sql = f"""create or replace view fabricks.dependencies with schema evolution as {" union all ".join(dmls)}"""
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.dependencies", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_dependencies_flat_view():
    parent = ",\n ".join([f"d{i + 1}.parent_id as parent_{i + 1}" for i in range(10)])
    join = "\n ".join(
        [f"left join fabricks.dependencies d{i + 1} on d{i}.parent_id = d{i + 1}.job_id" for i in range(10)]
    )

    sql = f"""
    create or replace view fabricks.dependencies_flat with schema evolution as
    select
      d0.job_id,
      d0.parent_id as parent_0,
      {parent}
    from
      fabricks.dependencies d0
      {join}
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_dependencies_unpivot_view():
    sql = """
    create or replace view fabricks.dependencies_unpivot with schema evolution as
    with unpvt as (
      select
        *
      from
        fabricks.dependencies_flat unpivot (
          (parent_id) for depth in (
            (parent_0) as depth_00,
            (parent_1) as depth_01,
            (parent_2) as depth_02,
            (parent_3) as depth_03,
            (parent_4) as depth_04,
            (parent_5) as depth_05,
            (parent_6) as depth_06,
            (parent_7) as depth_07,
            (parent_8) as depth_08,
            (parent_9) as depth_09,
            (parent_10) as depth_10
          )
        ) p
    )
    select
      job_id,
      cast(replace(depth, 'depth_', '') as int) as depth,
      parent_id
    from
      unpvt
    qualify
      row_number() over (partition by job_id, parent_id order by depth asc) = 1
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_dependencies_circular_view():
    sql = """
    create or replace view fabricks.dependencies_circular with schema evolution as
    with d as (
      select
        d1.job_id,
        j1.job,
        p.job_id as parent_id,
        p.job as parent
      from
        fabricks.dependencies d1
        left join fabricks.dependencies_unpivot d2 on d2.parent_id = d1.job_id
        left join fabricks.jobs j1 on d1.job_id = j1.job_id
        left join fabricks.jobs p on d1.parent_id = p.job_id
      where
        true
        and d1.job_id = d2.job_id
      group by
        all
    )
    select
      *
    from
      d
    where
      true
      and exists (
        select
          1
        from
          d d1
        where
          d1.job_id = d.parent_id
      )
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_logs_pivot_view():
    sql = """
    create or replace view fabricks.logs_pivot with schema evolution as
    with groupby as (
      select
        l.schedule,
        l.schedule_id,
        l.step,
        l.job,
        l.job_id,
        collect_set(l.status) as statuses,
        array_contains(statuses, 'skipped') as skipped,
        array_contains(statuses, 'warned') as warned,
        array_contains(statuses, 'done') or warned as done,
        array_contains(statuses, 'failed') or (not done and not skipped) as failed,
        not done and not failed and not skipped and array_contains(statuses, 'running') as timed_out,
        not array_contains(statuses, 'running') as cancelled,
        max(l.notebook_id) as notebook_id,
        max(l.timestamp) filter (where l.status = 'scheduled') as scheduled_time,
        max(l.timestamp) filter (where l.status = 'waiting') as waiting_time,
        max(l.timestamp) filter (where l.status = 'running') as start_time,
        max(l.timestamp) filter (where l.status = 'running') as running_time,
        max(l.timestamp) filter (where l.status = 'done') as done_time,
        max(l.timestamp) filter (where l.status = 'failed') as failed_time,
        max(l.timestamp) filter (where l.status = 'ok') as end_time,
        max(l.timestamp) filter (where l.status = 'ok') as ok_time,
        max(l.exception) as exception
      from
        fabricks.logs l
      group by
        l.schedule, l.schedule_id, l.step, l.job, l.job_id
    )
    select
      g.schedule,
      g.schedule_id,
      g.job,
      g.step,
      j.topic,
      j.item,
      g.job_id,
      g.done,
      g.failed,
      g.timed_out,
      g.cancelled,
      g.skipped,
      g.warned,
      g.notebook_id,
      g.start_time,
      g.end_time,
      g.scheduled_time,
      g.waiting_time,
      g.running_time,
      g.done_time,
      g.failed_time,
      g.ok_time,
      if(g.timed_out, null, date_diff(SECOND, start_time, end_time)) as duration,
      g.exception
    from
      groupby g
      left join fabricks.jobs j on g.job_id = j.job_id
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.logs_pivot", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_last_schedule_view():
    sql = """
    create or replace view fabricks.last_schedule with schema evolution as
    with lst as (
      select
        schedule_id as last_schedule_id
      from
        fabricks.logs_pivot
      where
        schedule_id is not null
      order by
        start_time desc
      limit
        1
    )
    select
      l.*
    from
      fabricks.logs_pivot l
      inner join lst on schedule_id = last_schedule_id
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.last_schedule", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_last_status_view():
    sql = """
    create or replace view fabricks.last_status with schema evolution as
    select
      job_id,
      job,
      step,
      start_time as time,
      done,
      failed,
      cancelled,
      timed_out,
      exception
    from
      fabricks.logs_pivot
    qualify row_number() over (
      partition by job_id
      order by
        start_time desc
    ) = 1
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.last_status", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_previous_schedule_view():
    sql = """
    create or replace view fabricks.previous_schedule with schema evolution as
    with lst_2 as (
      select
        schedule_id as last_schedule_id,
        max(start_time) as start_time
      from
        fabricks.logs_pivot
      where
        schedule_id is not null
      group by
        all
      order by
        start_time desc
      limit
        2
    ), lst as (
      select
        last_schedule_id
      from
        lst_2
      order by
        start_time asc
      limit
        1
    )
    select
      l.*
    from
      fabricks.logs_pivot l
      inner join lst on schedule_id = last_schedule_id
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.previous_schedule", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_schedules_view():
    sql = """
    create or replace view fabricks.schedules with schema evolution as
    select
      schedule,
      schedule_id,
      min(start_time) as start_time,
      max(end_time) as end_time,
      max(start_time) :: date as date,
      sum(duration) as duration,
      count(*) as logs,
      count_if(failed) as failed,
      count_if(done) as done,
      count_if(timed_out) as timed_out
    from
      fabricks.logs_pivot
    group by
      all
    order by date desc, start_time desc
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.schedules", extra={"sql": sql})
    SPARK.sql(sql)


def create_or_replace_jobs_to_be_updated_view():
    sql = """
    create or replace view fabricks.jobs_to_be_updated with schema evolution as
    with base as (
      select
        j.job,
        j.job_id,
        j.step,
        j.topic,
        j.item,
        s.expand,
        j.mode as mode,
        j.object_type as object_type
      from
        fabricks.jobs j
        inner join fabricks.steps s on j.step = s.step
    ),
    objects as (
      select
        `table` as job,
        job_id,
        'table' as object_type
      from
        fabricks.tables
      union
      select
        `view` as job,
        job_id,
        'view' as object_type
      from
        fabricks.views
    )
    select
      b.job,
      b.job_id,
      b.step,
      b.topic,
      b.item,
      b.expand,
      b.mode,
      o.object_type as old_object_type,
      b.object_type as new_object_type,
      array(old_object_type, new_object_type) as object_types,
      (old_object_type is not null and new_object_type is null) or (not old_object_type <=> new_object_type and old_object_type is not null) as is_to_drop,
      (is_to_drop and new_object_type is not null) or (old_object_type is null and new_object_type is not null) as is_to_register
    from
      base b
      left join objects o on b.job_id = o.job_id
    """
    sql = fix_sql(sql)

    DEFAULT_LOGGER.debug("create or replace fabricks.jobs_to_be_updated", extra={"sql": sql})
    SPARK.sql(sql)
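
The per-step view builders above all follow the same probe-then-union pattern: each candidate table is validated with a throwaway SPARK.sql call, only the select statements that parse end up in the final union all, and the log-based views are layered on top of fabricks.logs_pivot. A minimal sketch of how the deployed views might be exercised, assuming a fully configured Fabricks runtime (metastore tables deployed and an active Spark session):

from fabricks.context import SPARK
from fabricks.deploy.views import deploy_views

deploy_views()  # (re)creates fabricks.jobs, fabricks.tables, fabricks.logs_pivot, ...

# every job with the object type derived in create_or_replace_jobs_view
SPARK.sql("select job, step, mode, object_type from fabricks.jobs").show(truncate=False)

# jobs whose registered metastore object no longer matches their configuration
SPARK.sql(
    "select job, old_object_type, new_object_type from fabricks.jobs_to_be_updated "
    "where is_to_drop or is_to_register"
).show()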
fabricks/metastore/_types.py
ADDED
@@ -0,0 +1,65 @@
from typing import Literal, Optional

from pydantic import BaseModel


class SchemaDiff(BaseModel):
    column: str
    data_type: Optional[str] = None
    new_column: Optional[str] = None
    new_data_type: Optional[str] = None
    status: Literal["added", "changed", "dropped"]

    @property
    def type_widening_compatible(self) -> bool:
        if self.status != "changed":
            return False

        assert self.new_data_type
        assert self.data_type
        map = {
            "byte": {"short", "int", "long", "decimal", "double"},
            "short": {"int", "long", "decimal", "double"},
            "int": {"long", "decimal", "double"},
            "long": {"decimal"},
            "float": {"double"},
        }
        return self.new_data_type.lower() in map.get(self.data_type.lower(), set())


class DroppedColumn(SchemaDiff):
    def __init__(self, column: str, data_type: Optional[str] = None):
        super().__init__(
            column=column,
            data_type=data_type,
            status="dropped",
        )

    def __str__(self):
        return f"dropped {self.column}"


class AddedColumn(SchemaDiff):
    def __init__(self, new_column: str, new_data_type: str):
        super().__init__(
            column=new_column,
            new_column=new_column,
            new_data_type=new_data_type,
            status="added",
        )

    def __str__(self):
        return f"added {self.new_column} with type {self.new_data_type}"


class ChangedColumn(SchemaDiff):
    def __init__(self, column: str, data_type: str, new_data_type: str):
        super().__init__(
            column=column,
            data_type=data_type,
            new_data_type=new_data_type,
            status="changed",
        )

    def __str__(self):
        return f"changed {self.column} from {self.data_type} to {self.new_data_type} (widening compatible: {self.type_widening_compatible})"
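
These diff types carry a widening-compatibility check used when a column's type changes. A quick illustration of their behavior, assuming the module path above (inferred from the package RECORD) and invented column values:

from fabricks.metastore._types import AddedColumn, ChangedColumn, DroppedColumn

print(AddedColumn(new_column="amount", new_data_type="decimal(18,2)"))
# added amount with type decimal(18,2)

print(DroppedColumn(column="legacy_id", data_type="string"))
# dropped legacy_id

widening = ChangedColumn(column="qty", data_type="int", new_data_type="long")
print(widening.type_widening_compatible)   # True: int widens to long per the map above

narrowing = ChangedColumn(column="qty", data_type="long", new_data_type="int")
print(narrowing.type_widening_compatible)  # False: long only widens to decimal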
fabricks/metastore/database.py
ADDED
@@ -0,0 +1,65 @@
from typing import Optional

from pyspark.errors.exceptions.base import AnalysisException
from pyspark.sql import DataFrame, SparkSession
from typing_extensions import deprecated

from fabricks.context import PATHS_STORAGE, SPARK
from fabricks.context.log import DEFAULT_LOGGER
from fabricks.metastore.utils import get_tables, get_views
from fabricks.utils.path import Path


class Database:
    def __init__(self, name: str, spark: Optional[SparkSession] = None):
        self.name = name

        storage = PATHS_STORAGE.get(self.name)
        assert storage is not None
        self.storage = storage

        if spark is None:
            spark = SPARK
        assert spark is not None
        self.spark = spark

    @property
    @deprecated("use delta_path instead")
    def deltapath(self) -> Path:
        return self.storage.joinpath("delta")

    @property
    def delta_path(self) -> Path:
        return self.storage.joinpath("delta")

    def create(self):
        DEFAULT_LOGGER.info("create database", extra={"label": self})
        self.spark.sql(f"create database if not exists {self.name};")

    def drop(self, rm: Optional[bool] = True):
        if self.exists():
            DEFAULT_LOGGER.warning("drop database", extra={"label": self})
            self.spark.sql(f"drop database if exists {self.name} cascade;")

        if rm:
            if self.delta_path.exists():
                DEFAULT_LOGGER.debug("remove delta files", extra={"label": self})
                self.delta_path.rm()

    def exists(self) -> bool:
        try:
            self.spark.sql(f"show tables in {self.name}")
        # database not found
        except AnalysisException:
            return False

        return True

    def __str__(self):
        return self.name

    def get_tables(self) -> DataFrame:
        return get_tables(self.name)

    def get_views(self) -> DataFrame:
        return get_views(self.name)
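
A minimal usage sketch for Database, assuming a Fabricks runtime where PATHS_STORAGE contains an entry for the chosen name ("bronze" here is invented for the example):

from fabricks.metastore.database import Database

db = Database("bronze")  # storage root resolved from PATHS_STORAGE["bronze"]

if not db.exists():
    db.create()

db.get_tables().show()

# drop the database and, because rm defaults to True, remove its delta files from storage
db.drop()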