fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/dags/processor.py
@@ -0,0 +1,163 @@
+ import json
+ import threading
+ import time
+ from multiprocessing import Process
+ from typing import List, Union
+
+ from databricks.sdk.runtime import dbutils, spark
+
+ from fabricks.context.runtime import PATH_NOTEBOOKS
+ from fabricks.core.dags.base import BaseDags
+ from fabricks.core.dags.log import DagsLogger
+ from fabricks.core.jobs.base.types import TStep
+ from fabricks.core.steps.get_step import get_step
+ from fabricks.utils.azure_queue import AzureQueue
+ from fabricks.utils.azure_table import AzureTable
+
+
+ class DagProcessor(BaseDags):
+     def __init__(self, schedule_id: str, schedule: str, step: Union[TStep, str]):
+         self.step = get_step(step=step)
+         self.schedule = schedule
+
+         super().__init__(schedule_id=schedule_id)
+
+     @property
+     def queue(self) -> AzureQueue:
+         step = self.remove_invalid_characters(str(self.step))
+         return AzureQueue(f"q{step}{self.schedule_id}", connection_string=self.get_connection_string())
+
+     @property
+     def table(self) -> AzureTable:
+         return AzureTable(f"t{self.schedule_id}", connection_string=self.get_connection_string())
+
+     def extra(self, d: dict) -> dict:
+         return {
+             "partition_key": self.schedule_id,
+             "schedule": self.schedule,
+             "schedule_id": self.schedule_id,
+             "step": str(self.step),
+             "job": d.get("Job"),
+             "target": "table",
+         }
+
+     def send(self):
+         while True:
+             scheduled = self.get_scheduled()
+             assert isinstance(scheduled, List)
+             if len(scheduled) == 0:
+                 for _ in range(self.step.workers):
+                     self.queue.send_sentinel()
+                 DagsLogger.info("🎉 (no more job to schedule)")
+                 break
+
+             else:
+                 sorted_scheduled = sorted(scheduled, key=lambda x: x.get("Rank"))
+                 for s in sorted_scheduled:
+                     dependencies = self.table.query(f"PartitionKey eq 'dependencies' and JobId eq '{s.get('JobId')}'")
+                     if len(dependencies) == 0:
+                         s["Status"] = "waiting"
+                         DagsLogger.info("waiting", extra=self.extra(s))
+                         self.table.upsert(s)
+                         self.queue.send(s)
+
+             time.sleep(5)
+
+     def receive(self):
+         while True:
+             response = self.queue.receive()
+             if response == self.queue.sentinel:
+                 DagsLogger.info("💤 (no more job available)")
+                 break
+             elif response:
+                 j = json.loads(response)
+
+                 j["Status"] = "starting"
+                 self.table.upsert(j)
+                 DagsLogger.info("starting", extra=self.extra(j))
+
+                 try:
+                     dbutils.notebook.run(
+                         PATH_NOTEBOOKS.join("run").get_notebook_path(),
+                         self.step.timeouts.job,
+                         {
+                             "schedule_id": self.schedule_id,
+                             "schedule": self.schedule,  # needed to pass schedule variables to the job
+                             "step": str(self.step),
+                             "job_id": j.get("JobId"),
+                             "job": j.get("Job"),
+                         },
+                     )
+
+                 except Exception:
+                     DagsLogger.warning("🤯 (failed)", extra={"step": str(self.step), "job": j.get("Job")})
+
+                 finally:
+                     j["Status"] = "ok"
+                     self.table.upsert(j)
+                     DagsLogger.info("ok", extra=self.extra(j))
+
+                     dependencies = self.table.query(f"PartitionKey eq 'dependencies' and ParentId eq '{j.get('JobId')}'")
+                     self.table.delete(dependencies)
+
+     def get_scheduled(self, convert: bool = False):
+         scheduled = self.table.query(f"PartitionKey eq 'statuses' and Status eq 'scheduled' and Step eq '{self.step}'")
+         if convert:
+             return spark.createDataFrame(scheduled)
+         else:
+             return scheduled
+
+     def _process(self):
+         scheduled = self.get_scheduled()
+         assert isinstance(scheduled, List)
+         if len(scheduled) > 0:
+             sender = threading.Thread(
+                 target=self.send,
+                 name=f"{str(self.step).capitalize()}Sender",
+                 args=(),
+             )
+             sender.start()
+
+             receivers = []
+             for i in range(self.step.workers):
+                 receiver = threading.Thread(
+                     target=self.receive,
+                     name=f"{str(self.step).capitalize()}Receiver{i}",
+                     args=(),
+                 )
+                 receiver.start()
+                 receivers.append(receiver)
+
+             sender.join()
+             for receiver in receivers:
+                 receiver.join()
+
+     def process(self):
+         scheduled = self.get_scheduled()
+         assert isinstance(scheduled, List)
+
+         if len(scheduled) > 0:
+             DagsLogger.info("🏎️ (start)")
+
+             p = Process(target=self._process)  # pass the callable, not the result of calling it
+             p.start()
+             p.join(timeout=self.step.timeouts.step)
+             p.terminate()
+
+             self.queue.delete()
+
+             if p.exitcode is None:
+                 DagsLogger.critical("💥 (timeout)")
+                 raise ValueError(f"{self.step} timed out")
+
+             else:
+                 df = self.get_logs(str(self.step))
+                 self.write_logs(df)
+
+                 DagsLogger.info("🏁 (end)")
+
+         else:
+             DagsLogger.info("no job to schedule (🏖️)")
+
+     def __str__(self) -> str:
+         return f"{str(self.step)} ({self.schedule_id})"
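A minimal usage sketch for the processor above, assuming the schedule has already been written to the Azure table; the identifier values are placeholders:

    from fabricks.core.dags.processor import DagProcessor

    # hypothetical schedule values, for illustration only
    processor = DagProcessor(schedule_id="20240701120000", schedule="nightly", step="bronze")
    processor.process()  # runs the sender/receiver threads in a subprocess, then writes the collected logs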
fabricks/core/dags/terminator.py
@@ -0,0 +1,26 @@
+ from databricks.sdk.runtime import spark
+
+ from fabricks.core.dags.base import BaseDags
+ from fabricks.core.dags.log import DagsLogger, DagsTableLogger
+
+
+ class DagTerminator(BaseDags):
+     def __init__(self, schedule_id: str):
+         self.schedule_id = schedule_id
+         super().__init__(schedule_id=schedule_id)
+
+     def terminate(self):
+         df = self.get_logs()
+         self.write_logs(df)
+
+         error_df = spark.sql("select * from {df} where status = 'failed'", df=df)
+         for row in error_df.collect():
+             DagsLogger.error(f"{row['job']} failed (🔥)")
+
+         DagsTableLogger.table.truncate_partition(self.schedule_id)
+
+         table = self.get_table()
+         table.drop()
+
+         if not error_df.isEmpty():
+             raise ValueError(f"{error_df.count()} job(s) failed")
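For completeness, a sketch of how the terminator above might be invoked at the end of a schedule; the schedule_id is a placeholder:

    from fabricks.core.dags.terminator import DagTerminator

    # hypothetical schedule identifier
    DagTerminator(schedule_id="20240701120000").terminate()  # writes logs, reports failed jobs, drops the scratch table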
fabricks/core/deploy/__init__.py
@@ -0,0 +1,12 @@
+ from fabricks.core.deploy.tables import deploy_tables
+ from fabricks.core.deploy.views import deploy_views
+
+
+ class deploy:
+     @staticmethod
+     def tables(drop: bool = False):
+         deploy_tables(drop=drop)
+
+     @staticmethod
+     def views():
+         deploy_views()
+
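The deploy facade above can be exercised as follows (a sketch; drop=True recreates the fabricks tables from scratch):

    from fabricks.core.deploy import deploy

    deploy.tables(drop=False)  # create fabricks.logs, fabricks.steps and the dummy table if missing
    deploy.views()             # create or replace the fabricks.* monitoring views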
fabricks/core/deploy/tables.py
@@ -0,0 +1,76 @@
+ from databricks.sdk.runtime import spark
+ from pyspark.sql.types import LongType, StringType, StructField, StructType, TimestampType
+
+ from fabricks.cdc import NoCDC
+ from fabricks.context.log import Logger
+ from fabricks.metastore.table import Table
+
+
+ def deploy_tables(drop: bool = False):
+     Logger.info("🌟 (create or replace tables)")
+
+     create_table_log(drop)
+     create_table_dummy(drop)
+     create_table_step(drop)
+
+
+ def create_table_step(drop: bool = False):
+     table = Table("fabricks", "steps")
+     if drop:
+         table.drop()
+     if not table.exists():
+         schema = StructType(
+             [
+                 StructField("step", StringType(), True),
+                 StructField("extend", StringType(), True),
+                 StructField("order", LongType(), True),
+             ]
+         )
+         table.create(schema=schema, partitioning=True, partition_by=["extend"])
+
+
+ def create_table_log(drop: bool = False):
+     table = Table("fabricks", "logs")
+     if drop:
+         table.drop()
+     if not table.exists():
+         schema = StructType(
+             [
+                 StructField("schedule_id", StringType(), True),
+                 StructField("schedule", StringType(), True),
+                 StructField("step", StringType(), True),
+                 StructField("job_id", StringType(), True),
+                 StructField("job", StringType(), True),
+                 StructField("notebook_id", StringType(), True),
+                 StructField("level", StringType(), True),
+                 StructField("status", StringType(), True),
+                 StructField("timestamp", TimestampType(), True),
+                 StructField(
+                     "exception",
+                     StructType(
+                         [
+                             StructField("type", StringType(), True),
+                             StructField("message", StringType(), True),
+                             StructField("traceback", StringType(), True),
+                         ]
+                     ),
+                 ),
+             ]
+         )
+         table.create(schema=schema, partitioning=True, partition_by=["schedule_id", "step"])
+
+
+ def create_table_dummy(drop: bool = False):
+     table = NoCDC("fabricks", "dummy")
+     df = spark.sql(
+         """
+         select
+           1 as __key,
+           md5('1') as __hash,
+           cast('1900-01-01' as timestamp) as __valid_from,
+           cast('9999-12-31' as timestamp) as __valid_to
+         """
+     )
+     if drop:
+         table.drop()
+     table.overwrite(df)
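As a quick check after running deploy_tables, the created objects can be inspected from Spark (a sketch; the exact table names are assumed from the code above):

    from databricks.sdk.runtime import spark

    spark.sql("show tables in fabricks").show()       # expect steps, logs and dummy
    spark.sql("select * from fabricks.dummy").show()  # single row with __key, __hash, __valid_from, __valid_to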
fabricks/core/deploy/views.py
@@ -0,0 +1,417 @@
+ from databricks.sdk.runtime import spark
+
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.types import Steps
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ def deploy_views():
+     Logger.info("🌟 (create or replace views)")
+
+     create_or_replace_jobs_view()
+
+     create_or_replace_logs_pivot_view()
+     create_or_replace_last_schedule_view()
+     create_or_replace_last_status_view()
+     create_or_replace_previous_schedule_view()
+
+     create_or_replace_schedules_view()
+
+     create_or_replace_dependencies_view()
+     create_or_replace_dependencies_flat_view()
+     create_or_replace_dependencies_unpivot_view()
+     create_or_replace_dependencies_circular_view()
+
+     create_or_replace_tables_view()
+     create_or_replace_views_view()
+
+
+ def create_or_replace_jobs_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_jobs"
+
+         df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+         if not df.isEmpty():
+             try:
+                 spark.sql(f"select options.change_data_capture from fabricks.{table}")
+                 change_data_capture = "coalesce(options.change_data_capture, 'nocdc') as change_data_capture"
+             except Exception:
+                 change_data_capture = "'nocdc' as change_data_capture"
+
+             dmls.append(
+                 f"""
+                 select
+                   step,
+                   job_id,
+                   topic,
+                   item,
+                   concat(step, '.', topic, '_', item) as job,
+                   options.mode,
+                   {change_data_capture},
+                   coalesce(options.type, 'default') as type,
+                   tags
+                 from
+                   fabricks.{table}
+                 """
+             )
+
+     sql = f"""create or replace view fabricks.jobs as {' union all '.join(dmls)}"""
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.jobs", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_tables_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_tables"
+
+         df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+         if not df.isEmpty():
+             dmls.append(
+                 f"""
+                 select
+                   '{step}' as step,
+                   job_id,
+                   table
+                 from
+                   fabricks.{table}
+                 """
+             )
+
+     sql = f"""create or replace view fabricks.tables as {' union all '.join(dmls)}"""
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.tables", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_views_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_views"
+
+         df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+         if not df.isEmpty():
+             dmls.append(
+                 f"""
+                 select
+                   '{step}' as step,
+                   job_id,
+                   view
+                 from
+                   fabricks.{table}
+                 """
+             )
+
+     sql = f"""create or replace view fabricks.views as {' union all '.join(dmls)}"""
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.views", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_dependencies_view():
+     dmls = []
+
+     for step in Steps:
+         table = f"{step}_dependencies"
+
+         df = spark.sql("show tables in fabricks").where(f"tableName like '{table}'")
+         if not df.isEmpty():
+             dmls.append(
+                 f"""
+                 select
+                   '{step}' as step,
+                   dependency_id,
+                   job_id,
+                   parent_id,
+                   parent,
+                   origin
+                 from
+                   fabricks.{step}_dependencies d
+                 """
+             )
+
+     sql = f"""create or replace view fabricks.dependencies as {' union all '.join(dmls)}"""
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.dependencies", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_dependencies_flat_view():
+     parent = ",\n ".join([f"d{i+1}.parent_id as parent_{i+1}" for i in range(10)])
+     join = "\n ".join(
+         [f"left join fabricks.dependencies d{i+1} on d{i}.parent_id = d{i+1}.job_id" for i in range(10)]
+     )
+
+     sql = f"""
+     create or replace view fabricks.dependencies_flat as
+     select
+       d0.job_id,
+       d0.parent_id as parent_0,
+       {parent}
+     from
+       fabricks.dependencies d0
+       {join}
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.dependencies_flat", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_dependencies_unpivot_view():
+     sql = """
+     create or replace view fabricks.dependencies_unpivot as
+     with unpvt as (
+       select
+         *
+       from
+         fabricks.dependencies_flat unpivot (
+           (parent_id) for depth in (
+             (parent_0) as depth_00,
+             (parent_1) as depth_01,
+             (parent_2) as depth_02,
+             (parent_3) as depth_03,
+             (parent_4) as depth_04,
+             (parent_5) as depth_05,
+             (parent_6) as depth_06,
+             (parent_7) as depth_07,
+             (parent_8) as depth_08,
+             (parent_9) as depth_09,
+             (parent_10) as depth_10
+           )
+         ) p
+     )
+     select
+       job_id,
+       cast(replace(depth, 'depth_', '') as int) as depth,
+       parent_id
+     from
+       unpvt qualify row_number() over (
+         partition by job_id,
+         parent_id
+         order by
+           depth asc
+       ) = 1
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.dependencies_unpivot", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_dependencies_circular_view():
+     sql = """
+     create or replace view fabricks.dependencies_circular as
+     with d as (
+       select
+         d1.job_id,
+         j1.job,
+         p.job_id as parent_id,
+         p.job as parent
+       from
+         fabricks.dependencies d1
+         left join fabricks.dependencies_unpivot d2 on d2.parent_id = d1.job_id
+         left join fabricks.jobs j1 on d1.job_id = j1.job_id
+         left join fabricks.jobs p on d1.parent_id = p.job_id
+       where
+         true
+         and d1.job_id = d2.job_id
+       group by
+         all
+     )
+     select
+       *
+     from
+       d
+     where
+       true
+       and exists (
+         select
+           1
+         from
+           d d1
+         where
+           d1.job_id = d.parent_id
+       )
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.dependencies_circular", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_logs_pivot_view():
+     sql = """
+     create or replace view fabricks.logs_pivot as
+     with groupby as (
+       select
+         l.schedule,
+         l.schedule_id,
+         l.step,
+         l.job,
+         l.job_id,
+         collect_set(l.status) as statuses,
+         array_contains(statuses, 'done') as done,
+         array_contains(statuses, 'failed') or not done as failed,
+         not array_contains(statuses, 'failed') and not array_contains(statuses, 'done') and array_contains(statuses, 'running') as timed_out,
+         not array_contains(statuses, 'running') as cancelled,
+         max(l.notebook_id) as notebook_id,
+         max(l.timestamp) filter(where l.status = 'scheduled') as scheduled_time,
+         max(l.timestamp) filter(where l.status = 'waiting') as waiting_time,
+         max(l.timestamp) filter(where l.status = 'running') as running_time,
+         max(l.timestamp) filter(where l.status = 'done') as done_time,
+         max(l.timestamp) filter(where l.status = 'failed') as failed_time,
+         max(l.timestamp) filter(where l.status = 'ok') as ok_time,
+         max(l.exception) as exception
+       from
+         fabricks.logs l
+       group by
+         l.schedule, l.schedule_id, l.step, l.job, l.job_id
+     )
+     select
+       g.schedule,
+       g.schedule_id,
+       g.job,
+       g.step,
+       j.topic,
+       j.item,
+       g.job_id,
+       g.done,
+       g.failed,
+       g.timed_out,
+       g.cancelled,
+       g.notebook_id,
+       g.running_time as start_time,
+       g.ok_time as end_time,
+       g.scheduled_time,
+       g.waiting_time,
+       g.running_time,
+       g.done_time,
+       g.failed_time,
+       g.ok_time,
+       if(g.timed_out, null, date_diff(SECOND, start_time, end_time)) as duration,
+       g.exception
+     from
+       groupby g
+       left join fabricks.jobs j on g.job_id = j.job_id
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.logs_pivot", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_last_schedule_view():
+     sql = """
+     create or replace view fabricks.last_schedule as
+     with lst as (
+       select
+         schedule_id as last_schedule_id
+       from
+         fabricks.logs_pivot
+       where
+         schedule_id is not null
+       order by
+         start_time desc
+       limit
+         1
+     )
+     select
+       l.*
+     from
+       fabricks.logs_pivot l
+       inner join lst on schedule_id = last_schedule_id
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.last_schedule", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_last_status_view():
+     sql = """
+     create or replace view fabricks.last_status as
+     select
+       job_id,
+       job,
+       step,
+       start_time as time,
+       done,
+       failed,
+       cancelled,
+       timed_out,
+       exception
+     from
+       fabricks.logs_pivot
+     qualify row_number() over (
+       partition by job_id
+       order by
+         start_time desc
+     ) = 1
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.last_status", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_previous_schedule_view():
+     sql = """
+     create or replace view fabricks.previous_schedule as
+     with lst_2 as (
+       select
+         schedule_id as last_schedule_id,
+         max(start_time) as start_time
+       from
+         fabricks.logs_pivot
+       where
+         schedule_id is not null
+       group by
+         all
+       order by
+         start_time desc
+       limit
+         2
+     ), lst as (
+       select
+         last_schedule_id
+       from
+         lst_2
+       order by
+         start_time asc
+       limit
+         1
+     )
+     select
+       l.*
+     from
+       fabricks.logs_pivot l
+       inner join lst on schedule_id = last_schedule_id
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.previous_schedule", extra={"sql": sql})
+     spark.sql(sql)
+
+
+ def create_or_replace_schedules_view():
+     sql = """
+     create or replace view fabricks.schedules as
+     select
+       schedule,
+       schedule_id,
+       min(start_time) as start_time,
+       max(end_time) as end_time,
+       max(start_time) :: date as date,
+       sum(duration) as duration,
+       count(*) as logs,
+       count_if(failed) as failed,
+       count_if(done) as done,
+       count_if(timed_out) as timed_out
+     from
+       fabricks.logs_pivot
+     group by
+       all
+     order by date desc, start_time desc
+     """
+     sql = fix_sql(sql)
+     Logger.debug("create or replace fabricks.schedules", extra={"sql": sql})
+     spark.sql(sql)
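Once deploy_views has run, the monitoring views defined above can be queried directly; a small sketch, assuming a Databricks session:

    from databricks.sdk.runtime import spark

    spark.sql("select * from fabricks.last_schedule").show()              # jobs of the most recent schedule
    spark.sql("select * from fabricks.last_status where failed").show()   # latest failure per job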