fabricks-2024.7.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/dags/processor.py
@@ -0,0 +1,163 @@
+import json
+import threading
+import time
+from multiprocessing import Process
+from typing import List, Union
+
+from databricks.sdk.runtime import dbutils, spark
+
+from fabricks.context.runtime import PATH_NOTEBOOKS
+from fabricks.core.dags.base import BaseDags
+from fabricks.core.dags.log import DagsLogger
+from fabricks.core.jobs.base.types import TStep
+from fabricks.core.steps.get_step import get_step
+from fabricks.utils.azure_queue import AzureQueue
+from fabricks.utils.azure_table import AzureTable
+
+
+class DagProcessor(BaseDags):
+    def __init__(self, schedule_id: str, schedule: str, step: Union[TStep, str]):
+        self.step = get_step(step=step)
+        self.schedule = schedule
+
+        super().__init__(schedule_id=schedule_id)
+
+    @property
+    def queue(self) -> AzureQueue:
+        step = self.remove_invalid_characters(str(self.step))
+        return AzureQueue(f"q{step}{self.schedule_id}", connection_string=self.get_connection_string())
+
+    @property
+    def table(self) -> AzureTable:
+        return AzureTable(f"t{self.schedule_id}", connection_string=self.get_connection_string())
+
+    def extra(self, d: dict) -> dict:
+        return {
+            "partition_key": self.schedule_id,
+            "schedule": self.schedule,
+            "schedule_id": self.schedule_id,
+            "step": str(self.step),
+            "job": d.get("Job"),
+            "target": "table",
+        }
+
+    def send(self):
+        while True:
+            scheduled = self.get_scheduled()
+            assert isinstance(scheduled, List)
+            if len(scheduled) == 0:
+                for _ in range(self.step.workers):
+                    self.queue.send_sentinel()
+                DagsLogger.info("🎉 (no more job to schedule)")
+                break
+
+            else:
+                sorted_scheduled = sorted(scheduled, key=lambda x: x.get("Rank"))
+                for s in sorted_scheduled:
+                    dependencies = self.table.query(f"PartitionKey eq 'dependencies' and JobId eq '{s.get('JobId')}'")
+                    if len(dependencies) == 0:
+                        s["Status"] = "waiting"
+                        DagsLogger.info("waiting", extra=self.extra(s))
+                        self.table.upsert(s)
+                        self.queue.send(s)
+
+            time.sleep(5)
+
+    def receive(self):
+        while True:
+            response = self.queue.receive()
+            if response == self.queue.sentinel:
+                DagsLogger.info("💤 (no more job available)")
+                break
+            elif response:
+                j = json.loads(response)
+
+                j["Status"] = "starting"
+                self.table.upsert(j)
+                DagsLogger.info("starting", extra=self.extra(j))
+
+                try:
+                    dbutils.notebook.run(
+                        PATH_NOTEBOOKS.join("run").get_notebook_path(),
+                        self.step.timeouts.job,
+                        {
+                            "schedule_id": self.schedule_id,
+                            "schedule": self.schedule,  # needed to pass schedule variables to the job
+                            "step": str(self.step),
+                            "job_id": j.get("JobId"),
+                            "job": j.get("Job"),
+                        },
+                    )
+
+                except Exception:
+                    DagsLogger.warning("🤯 (failed)", extra={"step": str(self.step), "job": j.get("Job")})
+
+                finally:
+                    j["Status"] = "ok"
+                    self.table.upsert(j)
+                    DagsLogger.info("ok", extra=self.extra(j))
+
+                    dependencies = self.table.query(f"PartitionKey eq 'dependencies' and ParentId eq '{j.get('JobId')}'")
+                    self.table.delete(dependencies)
+
+    def get_scheduled(self, convert: bool = False):
+        scheduled = self.table.query(f"PartitionKey eq 'statuses' and Status eq 'scheduled' and Step eq '{self.step}'")
+        if convert:
+            return spark.createDataFrame(scheduled)
+        else:
+            return scheduled
+
+    def _process(self):
+        scheduled = self.get_scheduled()
+        assert isinstance(scheduled, List)
+        if len(scheduled) > 0:
+            sender = threading.Thread(
+                target=self.send,
+                name=f"{str(self.step).capitalize()}Sender",
+                args=(),
+            )
+            sender.start()
+
+            receivers = []
+            for i in range(self.step.workers):
+                receiver = threading.Thread(
+                    target=self.receive,
+                    name=f"{str(self.step).capitalize()}Receiver{i}",
+                    args=(),
+                )
+                receiver.start()
+                receivers.append(receiver)
+
+            sender.join()
+            for receiver in receivers:
+                receiver.join()
+
+    def process(self):
+        scheduled = self.get_scheduled()
+        assert isinstance(scheduled, List)
+
+        if len(scheduled) > 0:
+            DagsLogger.info("🏎️ (start)")
+
+            p = Process(target=self._process())
+            p.start()
+            p.join(timeout=self.step.timeouts.step)
+            p.terminate()
+
+            self.queue.delete()
+
+            if p.exitcode is None:
+                DagsLogger.critical("💥 (timeout)")
+                raise ValueError(f"{self.step} timed out")
+
+            else:
+                df = self.get_logs(str(self.step))
+                self.write_logs(df)
+
+                DagsLogger.info("🏁 (end)")
+
+        else:
+            DagsLogger.info("no job to schedule (🏖️)")
+
+    def __str__(self) -> str:
+        return f"{str(self.step)} ({self.schedule_id})"
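For orientation, a minimal usage sketch (not part of the wheel) of the processor above. It assumes the class lives in fabricks/core/dags/processor.py as the RECORD suggests and that the DAG generator has already written the schedule rows to the Azure table; the schedule, step and id values are illustrative.

# Hypothetical driver cell in a Databricks notebook.
from fabricks.core.dags.processor import DagProcessor  # assumed module path, see RECORD above

processor = DagProcessor(schedule_id="20240701-060000", schedule="daily", step="silver")
processor.process()  # spawns one sender plus step.workers receivers, then cleans up the queue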
fabricks/core/dags/terminator.py
@@ -0,0 +1,26 @@
+from databricks.sdk.runtime import spark
+
+from fabricks.core.dags.base import BaseDags
+from fabricks.core.dags.log import DagsLogger, DagsTableLogger
+
+
+class DagTerminator(BaseDags):
+    def __init__(self, schedule_id: str):
+        self.schedule_id = schedule_id
+        super().__init__(schedule_id=schedule_id)
+
+    def terminate(self):
+        df = self.get_logs()
+        self.write_logs(df)
+
+        error_df = spark.sql("select * from {df} where status = 'failed'", df=df)
+        for row in error_df.collect():
+            DagsLogger.error(f"{row['job']} failed (🔥)")
+
+        DagsTableLogger.table.truncate_partition(self.schedule_id)
+
+        table = self.get_table()
+        table.drop()
+
+        if not error_df.isEmpty():
+            raise ValueError(f"{error_df.count()} job(s) failed")
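A similar sketch for the terminator, again with an assumed module path and an illustrative schedule id; it would typically run once every step processor has finished.

# Hypothetical clean-up cell: persist the run's logs, report failed jobs,
# then drop the per-schedule Azure table.
from fabricks.core.dags.terminator import DagTerminator  # assumed module path, see RECORD above

DagTerminator(schedule_id="20240701-060000").terminate()  # raises ValueError if any job failed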
fabricks/core/deploy/__init__.py
@@ -0,0 +1,12 @@
+from fabricks.core.deploy.tables import deploy_tables
+from fabricks.core.deploy.views import deploy_views
+
+
+class deploy:
+    @staticmethod
+    def tables(drop: bool = False):
+        deploy_tables(drop=drop)
+
+    @staticmethod
+    def views():
+        deploy_views()
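The deploy facade only forwards to the two modules shown next; a hedged sketch of the intended call pattern, under the same module-path assumption:

# Hypothetical bootstrap cell.
from fabricks.core.deploy import deploy  # assumed module path, see RECORD above

deploy.tables(drop=False)  # create fabricks.steps, fabricks.logs and fabricks.dummy if missing
deploy.views()             # create or replace the fabricks.* monitoring views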
fabricks/core/deploy/tables.py
@@ -0,0 +1,76 @@
+from databricks.sdk.runtime import spark
+from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType
+
+from fabricks.cdc import NoCDC
+from fabricks.context.log import Logger
+from fabricks.metastore.table import Table
+
+
+def deploy_tables(drop: bool = False):
+    Logger.info("🌟 (create or replace tables)")
+
+    create_table_log(drop)
+    create_table_dummy(drop)
+    create_table_step(drop)
+
+
+def create_table_step(drop: bool = False):
+    table = Table("fabricks", "steps")
+    if drop:
+        table.drop()
+    if not table.exists():
+        schema = StructType(
+            [
+                StructField("step", StringType(), True),
+                StructField("extend", StringType(), True),
+                StructField("order", LongType(), True),
+            ]
+        )
+        table.create(schema=schema, partitioning=True, partition_by=["extend"])
+
+
+def create_table_log(drop: bool = False):
+    table = Table("fabricks", "logs")
+    if drop:
+        table.drop()
+    if not table.exists():
+        schema = StructType(
+            [
+                StructField("schedule_id", StringType(), True),
+                StructField("schedule", StringType(), True),
+                StructField("step", StringType(), True),
+                StructField("job_id", StringType(), True),
+                StructField("job", StringType(), True),
+                StructField("notebook_id", StringType(), True),
+                StructField("level", StringType(), True),
+                StructField("status", StringType(), True),
+                StructField("timestamp", TimestampType(), True),
+                StructField(
+                    "exception",
+                    StructType(
+                        [
+                            StructField("type", StringType(), True),
+                            StructField("message", StringType(), True),
+                            StructField("traceback", StringType(), True),
+                        ]
+                    ),
+                ),
+            ]
+        )
+        table.create(schema=schema, partitioning=True, partition_by=["schedule_id", "step"])
+
+
+def create_table_dummy(drop: bool = False):
+    table = NoCDC("fabricks", "dummy")
+    df = spark.sql(
+        """
+        select
+          1 as __key,
+          md5('1') as __hash,
+          cast('1900-01-01' as timestamp) as __valid_from,
+          cast('9999-12-31' as timestamp) as __valid_to
+        """
+    )
+    if drop:
+        table.drop()
+    table.overwrite(df)
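A small, assumed smoke test for the table deployment above: after deploy_tables runs, the three control tables should show up in the fabricks database.

# Hypothetical check in an interactive Databricks session.
from databricks.sdk.runtime import spark

from fabricks.core.deploy.tables import deploy_tables  # assumed module path, see RECORD above

deploy_tables(drop=False)
spark.sql("show tables in fabricks").where("tableName in ('logs', 'dummy', 'steps')").show()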
fabricks/core/deploy/views.py
@@ -0,0 +1,417 @@
+from databricks.sdk.runtime import spark
+
+from fabricks.context.log import Logger
+from fabricks.core.jobs.base.types import Steps
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+def deploy_views():
+    Logger.info("🌟 (create or replace views)")
+
+    create_or_replace_jobs_view()
+
+    create_or_replace_logs_pivot_view()
+    create_or_replace_last_schedule_view()
+    create_or_replace_last_status_view()
+    create_or_replace_previous_schedule_view()
+
+    create_or_replace_schedules_view()
+
+    create_or_replace_dependencies_view()
+    create_or_replace_dependencies_flat_view()
+    create_or_replace_dependencies_unpivot_view()
+    create_or_replace_dependencies_circular_view()
+
+    create_or_replace_tables_view()
+    create_or_replace_views_view()
+
+
+def create_or_replace_jobs_view():
+    dmls = []
+
+    for step in Steps:
+        table = f"{step}_jobs"
+
+        df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+        if not df.isEmpty():
+            try:
+                spark.sql(f"select options.change_data_capture from fabricks.{table}")
+                change_data_capture = "coalesce(options.change_data_capture, 'nocdc') as change_data_capture"
+            except Exception:
+                change_data_capture = "'nocdc' as change_data_capture"
+
+            dmls.append(
+                f"""
+                select
+                    step,
+                    job_id,
+                    topic,
+                    item,
+                    concat(step, '.', topic, '_', item) as job,
+                    options.mode,
+                    {change_data_capture},
+                    coalesce(options.type, 'default') as type,
+                    tags
+                from
+                    fabricks.{table}
+                """
+            )
+
+    sql = f"""create or replace view fabricks.jobs as {' union all '.join(dmls)}"""
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.jobs", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_tables_view():
+    dmls = []
+
+    for step in Steps:
+        table = f"{step}_tables"
+
+        df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+        if not df.isEmpty():
+            dmls.append(
+                f"""
+                select
+                    '{step}' as step,
+                    job_id,
+                    table
+                from
+                    fabricks.{table}
+                """
+            )
+
+    sql = f"""create or replace view fabricks.tables as {' union all '.join(dmls)}"""
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.tables", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_views_view():
+    dmls = []
+
+    for step in Steps:
+        table = f"{step}_views"
+
+        df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+        if not df.isEmpty():
+            dmls.append(
+                f"""
+                select
+                    '{step}' as step,
+                    job_id,
+                    view
+                from
+                    fabricks.{table}
+                """
+            )
+
+    sql = f"""create or replace view fabricks.views as {' union all '.join(dmls)}"""
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.views", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_dependencies_view():
+    dmls = []
+
+    for step in Steps:
+        table = f"{step}_dependencies"
+
+        df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+        if not df.isEmpty():
+            dmls.append(
+                f"""
+                select
+                    '{step}' as step,
+                    dependency_id,
+                    job_id,
+                    parent_id,
+                    parent,
+                    origin
+                from
+                    fabricks.{step}_dependencies d
+                """
+            )
+
+    sql = f"""create or replace view fabricks.dependencies as {' union all '.join(dmls)}"""
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.dependencies", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_dependencies_flat_view():
+    parent = ",\n ".join([f"d{i+1}.parent_id as parent_{i+1}" for i in range(10)])
+    join = "\n ".join(
+        [f"left join fabricks.dependencies d{i+1} on d{i}.parent_id = d{i+1}.job_id" for i in range(10)]
+    )
+
+    sql = f"""
+    create or replace view fabricks.dependencies_flat as
+    select
+        d0.job_id,
+        d0.parent_id as parent_0,
+        {parent}
+    from
+        fabricks.dependencies d0
+        {join}
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_dependencies_unpivot_view():
+    sql = """
+    create or replace view fabricks.dependencies_unpivot as
+    with unpvt as (
+        select
+            *
+        from
+            fabricks.dependencies_flat unpivot (
+                (parent_id) for depth in (
+                    (parent_0) as depth_00,
+                    (parent_1) as depth_01,
+                    (parent_2) as depth_02,
+                    (parent_3) as depth_03,
+                    (parent_4) as depth_04,
+                    (parent_5) as depth_05,
+                    (parent_6) as depth_06,
+                    (parent_7) as depth_07,
+                    (parent_8) as depth_08,
+                    (parent_9) as depth_09,
+                    (parent_10) as depth_10
+                )
+            ) p
+    )
+    select
+        job_id,
+        cast(replace(depth, 'depth_', '') as int) as depth,
+        parent_id
+    from
+        unpvt qualify row_number() over (
+            partition by job_id,
+            parent_id
+            order by
+                depth asc
+        ) = 1
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_dependencies_circular_view():
+    sql = """
+    create or replace view fabricks.dependencies_circular as
+    with d as (
+        select
+            d1.job_id,
+            j1.job,
+            p.job_id as parent_id,
+            p.job as parent
+        from
+            fabricks.dependencies d1
+            left join fabricks.dependencies_unpivot d2 on d2.parent_id = d1.job_id
+            left join fabricks.jobs j1 on d1.job_id = j1.job_id
+            left join fabricks.jobs p on d1.parent_id = p.job_id
+        where
+            true
+            and d1.job_id = d2.job_id
+        group by
+            all
+    )
+    select
+        *
+    from
+        d
+    where
+        true
+        and exists (
+            select
+                1
+            from
+                d d1
+            where
+                d1.job_id = d.parent_id
+        )
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_logs_pivot_view():
+    sql = """
+    create or replace view fabricks.logs_pivot as
+    with groupby as (
+        select
+            l.schedule,
+            l.schedule_id,
+            l.step,
+            l.job,
+            l.job_id,
+            collect_set(l.status) as statuses,
+            array_contains(statuses, 'done') as done,
+            array_contains(statuses, 'failed') or not done as failed,
+            not array_contains(statuses, 'failed') and not array_contains(statuses, 'done') and array_contains(statuses, 'running') as timed_out,
+            not array_contains(statuses, 'running') as cancelled,
+            max(l.notebook_id) as notebook_id,
+            max(l.timestamp) filter(where l.status = 'scheduled') as scheduled_time,
+            max(l.timestamp) filter(where l.status = 'waiting') as waiting_time,
+            max(l.timestamp) filter(where l.status = 'running') as running_time,
+            max(l.timestamp) filter(where l.status = 'done') as done_time,
+            max(l.timestamp) filter(where l.status = 'failed') as failed_time,
+            max(l.timestamp) filter(where l.status = 'ok') as ok_time,
+            max(l.exception) as exception
+        from
+            fabricks.logs l
+        group by
+            l.schedule, l.schedule_id, l.step, l.job, l.job_id
+    )
+    select
+        g.schedule,
+        g.schedule_id,
+        g.job,
+        g.step,
+        j.topic,
+        j.item,
+        g.job_id,
+        g.done,
+        g.failed,
+        g.timed_out,
+        g.cancelled,
+        g.notebook_id,
+        g.running_time as start_time,
+        g.ok_time as end_time,
+        g.scheduled_time,
+        g.waiting_time,
+        g.running_time,
+        g.done_time,
+        g.failed_time,
+        g.ok_time,
+        if(g.timed_out, null, date_diff(SECOND, start_time, end_time)) as duration,
+        g.exception
+    from
+        groupby g
+        left join fabricks.jobs j on g.job_id = j.job_id
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.logs_pivot", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_last_schedule_view():
+    sql = """
+    create or replace view fabricks.last_schedule as
+    with lst as (
+        select
+            schedule_id as last_schedule_id
+        from
+            fabricks.logs_pivot
+        where
+            schedule_id is not null
+        order by
+            start_time desc
+        limit
+            1
+    )
+    select
+        l.*
+    from
+        fabricks.logs_pivot l
+        inner join lst on schedule_id = last_schedule_id
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.last_schedule", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_last_status_view():
+    sql = """
+    create or replace view fabricks.last_status as
+    select
+        job_id,
+        job,
+        step,
+        start_time as time,
+        done,
+        failed,
+        cancelled,
+        timed_out,
+        exception
+    from
+        fabricks.logs_pivot
+    qualify row_number() over (
+        partition by job_id
+        order by
+            start_time desc
+    ) = 1
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.last_status", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_previous_schedule_view():
+    sql = """
+    create or replace view fabricks.previous_schedule as
+    with lst_2 as (
+        select
+            schedule_id as last_schedule_id,
+            max(start_time) as start_time
+        from
+            fabricks.logs_pivot
+        where
+            schedule_id is not null
+        group by
+            all
+        order by
+            start_time desc
+        limit
+            2
+    ), lst as (
+        select
+            last_schedule_id
+        from
+            lst_2
+        order by
+            start_time asc
+        limit
+            1
+    )
+    select
+        l.*
+    from
+        fabricks.logs_pivot l
+        inner join lst on schedule_id = last_schedule_id
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.previous_schedule", extra={"sql": sql})
+    spark.sql(sql)
+
+
+def create_or_replace_schedules_view():
+    sql = """
+    create or replace view fabricks.schedules as
+    select
+        schedule,
+        schedule_id,
+        min(start_time) as start_time,
+        max(end_time) as end_time,
+        max(start_time) :: date as date,
+        sum(duration) as duration,
+        count(*) as logs,
+        count_if(failed) as failed,
+        count_if(done) as done,
+        count_if(timed_out) as timed_out
+    from
+        fabricks.logs_pivot
+    group by
+        all
+    order by date desc, start_time desc
+    """
+    sql = fix_sql(sql)
+    Logger.debug("create or replace fabricks.schedules", extra={"sql": sql})
+    spark.sql(sql)
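To close, a hedged example of how the monitoring views built above could be queried after a run; the column names come from the view definitions, the filters and display options are illustrative.

# Hypothetical monitoring queries against the deployed views.
from databricks.sdk.runtime import spark

# Jobs whose most recent run failed, with the captured exception.
spark.sql("select job, time, exception from fabricks.last_status where failed").show(truncate=False)

# One row per schedule run, most recent first.
spark.sql("select schedule, schedule_id, start_time, duration, done, failed from fabricks.schedules").show()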