fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/schedules/generate.py
@@ -0,0 +1,20 @@
+from typing import Tuple
+
+from pyspark.sql import DataFrame
+
+from fabricks.core.dags.generator import DagGenerator
+
+
+def generate(schedule: str) -> Tuple[str, DataFrame, DataFrame]:
+    """
+    Generate a schedule, job dataframe, and dependency dataframe based on the given schedule.
+
+    Args:
+        schedule (str): The schedule to generate from.
+
+    Returns:
+        Tuple[str, DataFrame, DataFrame]: A tuple containing the schedule ID, job dataframe, and dependency dataframe.
+    """
+    with DagGenerator(schedule) as g:
+        schedule_id, job_df, dep_df = g.generate()
+    return schedule_id, job_df, dep_df
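
A minimal usage sketch for `generate` (the schedule name "daily" is hypothetical; any schedule defined in the runtime configuration would work):

```python
from fabricks.core.schedules.generate import generate

# "daily" is an illustrative schedule name, not part of the package
schedule_id, job_df, dep_df = generate("daily")

print(schedule_id)  # identifier for this run of the schedule
job_df.show()       # jobs selected by the schedule
dep_df.show()       # dependencies between those jobs
```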
fabricks/core/schedules/get_schedule.py
@@ -0,0 +1,5 @@
+from fabricks.core.jobs.get_schedule import get_schedule  # avoid circular import
+
+__all__ = [
+    "get_schedule",
+]
fabricks/core/schedules/get_schedules.py
@@ -0,0 +1,9 @@
+from fabricks.core.jobs.get_schedules import (  # avoid circular import
+    get_schedules,
+    get_schedules_df,
+)
+
+__all__ = [
+    "get_schedules",
+    "get_schedules_df",
+]
fabricks/core/schedules/process.py
@@ -0,0 +1,9 @@
+from typing import Union
+
+from fabricks.core.dags.processor import DagProcessor
+from fabricks.core.jobs.base._types import TStep
+
+
+def process(schedule_id: str, schedule: str, step: Union[TStep, str]):
+    with DagProcessor(schedule_id=schedule_id, schedule=schedule, step=step) as p:
+        p.process()
fabricks/core/schedules/run.py
@@ -0,0 +1,3 @@
+from fabricks.core.dags.run import run
+
+__all__ = ["run"]
fabricks/core/schedules/terminate.py
@@ -0,0 +1,6 @@
+from fabricks.core.dags.terminator import DagTerminator
+
+
+def terminate(schedule_id: str):
+    with DagTerminator(schedule_id=schedule_id) as t:
+        t.terminate()
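
Together, `generate`, `process`, and `terminate` cover the lifecycle of a scheduled DAG run. A sketch of how the calls might be chained, assuming a schedule named "daily" and a step named "bronze" (both illustrative):

```python
from fabricks.core.schedules.generate import generate
from fabricks.core.schedules.process import process
from fabricks.core.schedules.terminate import terminate

schedule_id, job_df, dep_df = generate("daily")
try:
    # presumably one process call per step covered by the schedule
    process(schedule_id=schedule_id, schedule="daily", step="bronze")
finally:
    # clean up the run regardless of the outcome
    terminate(schedule_id=schedule_id)
```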
fabricks/core/schedules/views.py
@@ -0,0 +1,61 @@
+from fabricks.context import SPARK
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.schedules.get_schedule import get_schedule
+from fabricks.core.schedules.get_schedules import get_schedules_df
+from fabricks.utils.sqlglot import fix as fix_sql
+
+
+def create_or_replace_view_internal(name: str, options: dict):
+    step = "-- no step provided"
+    tag = "-- no tag provided"
+    view = "-- no view provided"
+
+    assert isinstance(options, dict), "options must be a dict"
+
+    if options.get("steps") is not None:
+        steps = [f"'{s}'" for s in options.get("steps")]  # type: ignore
+        step = f"and j.step in ({', '.join(steps)})"
+
+    if options.get("tag") is not None:
+        tag = f"""and array_contains(j.tags, '{options.get("tag")}')"""
+
+    if options.get("view") is not None:
+        view = f"""inner join fabricks.{options.get("view")} v on j.job_id = v.job_id"""
+
+    sql = f"""
+    create or replace view fabricks.{name}_schedule
+    as
+    select
+        j.*
+    from
+        fabricks.jobs j
+        {view}
+    where
+        true
+        {step}
+        {tag}
+        and j.type not in ('manual')
+    """
+    sql = fix_sql(sql)
+    DEFAULT_LOGGER.debug("create or replace (schedule) view", extra={"label": f"fabricks.{name}_schedule", "sql": sql})
+
+    SPARK.sql(sql)
+
+
+def create_or_replace_view(name: str):
+    sc = get_schedule(name=name)
+    try:
+        create_or_replace_view_internal(sc["name"], sc["options"])
+    except Exception as e:
+        DEFAULT_LOGGER.exception(f"could not create or replace view {sc['name']}", exc_info=e)
+
+
+def create_or_replace_views():
+    DEFAULT_LOGGER.info("create or replace (schedule) views")
+
+    df = get_schedules_df()
+    for row in df.collect():
+        try:
+            create_or_replace_view_internal(row.name, row.options.asDict())
+        except Exception as e:
+            DEFAULT_LOGGER.exception(f"could not create or replace view {row.name}", exc_info=e)
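
To illustrate the string building in `create_or_replace_view_internal`, a hypothetical schedule named "daily" restricted to two steps and one tag would produce roughly the following view (before `fix_sql` normalises the statement):

```python
from fabricks.core.schedules.views import create_or_replace_view_internal

create_or_replace_view_internal(
    "daily",
    {"steps": ["bronze", "silver"], "tag": "nightly"},  # illustrative options
)

# SQL built above, roughly:
#   create or replace view fabricks.daily_schedule as
#   select j.*
#   from fabricks.jobs j
#   where true
#     and j.step in ('bronze', 'silver')
#     and array_contains(j.tags, 'nightly')
#     and j.type not in ('manual')
```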
fabricks/core/steps/__init__.py
@@ -0,0 +1,4 @@
+from fabricks.core.steps.base import BaseStep
+from fabricks.core.steps.get_step import get_step
+
+__all__ = ["BaseStep", "get_step"]
fabricks/core/steps/_types.py
@@ -0,0 +1,7 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class Timeouts:
+    job: int
+    step: int
fabricks/core/steps/base.py
@@ -0,0 +1,423 @@
+import logging
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union, cast
+
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import expr, md5
+from pyspark.sql.types import Row
+from typing_extensions import deprecated
+
+from fabricks.cdc import NoCDC
+from fabricks.context import CONF_RUNTIME, LOGLEVEL, PATHS_RUNTIME, PATHS_STORAGE, SPARK, STEPS
+from fabricks.context.log import DEFAULT_LOGGER
+from fabricks.core.jobs.base._types import Bronzes, Golds, SchemaDependencies, Silvers, TStep
+from fabricks.core.jobs.get_job import get_job
+from fabricks.core.steps._types import Timeouts
+from fabricks.core.steps.get_step_conf import get_step_conf
+from fabricks.metastore.database import Database
+from fabricks.metastore.table import Table
+from fabricks.utils.helpers import run_in_parallel
+from fabricks.utils.read.read_yaml import read_yaml
+from fabricks.utils.schema import get_schema_for_type
+
+
+class BaseStep:
+    def __init__(self, step: Union[TStep, str]):
+        self.name = cast(str, step)
+
+        if self.name in Bronzes:
+            self.expand = "bronze"
+        elif self.name in Silvers:
+            self.expand = "silver"
+        elif self.name in Golds:
+            self.expand = "gold"
+
+        else:
+            raise ValueError(f"{self.name} does not expand a default step")
+
+        _storage = PATHS_STORAGE.get(self.name)
+        assert _storage
+        _runtime = PATHS_RUNTIME.get(self.name)
+        assert _runtime
+
+        self.spark = SPARK
+        self.storage = _storage
+        self.runtime = _runtime
+        self.database = Database(self.name)
+
+    _conf: Optional[dict] = None
+    _options: Optional[dict] = None
+
+    _workers: Optional[int] = None
+    _timeouts: Optional[Timeouts] = None
+
+    @property
+    def workers(self):
+        if not self._workers:
+            w = self.options.get("workers")
+            if w is None:
+                w = CONF_RUNTIME.get("options", {}).get("workers")
+                assert w is not None
+            self._workers = cast(int, w)
+
+        return self._workers
+
+    def _get_timeout(self, what: str) -> int:
+        t = self.options.get("timeouts", {}).get(what, None)
+        if t is None:
+            t = CONF_RUNTIME.get("options", {}).get("timeouts", {}).get(what)
+            assert t is not None
+
+        return int(t)
+
+    @property
+    def timeouts(self) -> Timeouts:
+        if not self._timeouts:
+            self._timeouts = Timeouts(
+                job=self._get_timeout("job"),
+                step=self._get_timeout("step"),
+            )
+
+        return self._timeouts
+
+    @property
+    def conf(self) -> dict:
+        if not self._conf:
+            _conf = [s for s in STEPS if s.get("name") == self.name][0]
+            assert _conf is not None
+            self._conf = cast(dict[str, str], _conf)
+
+        return self._conf
+
+    @property
+    def options(self) -> dict:
+        if not self._options:
+            o = self.conf.get("options")
+            assert o is not None
+            self._options = cast(dict[str, str], o)
+
+        return self._options
+
+    def drop(self):
+        DEFAULT_LOGGER.warning("drop", extra={"label": self})
+
+        fs = self.database.storage
+        assert fs
+
+        tmp = fs.joinpath("tmp")
+        if tmp.exists():
+            DEFAULT_LOGGER.debug("clean tmp folder", extra={"label": self})
+            tmp.rm()
+
+        checkpoint = fs.joinpath("checkpoints")
+        if checkpoint.exists():
+            DEFAULT_LOGGER.debug("clean checkpoint folder", extra={"label": self})
+            checkpoint.rm()
+
+        schema = fs.joinpath("schemas")
+        if schema.exists():
+            DEFAULT_LOGGER.debug("clean schema folder", extra={"label": self})
+            schema.rm()
+
+        DEFAULT_LOGGER.debug("clean fabricks", extra={"label": self})
+        for t in ["jobs", "tables", "dependencies", "views"]:
+            tbl = Table("fabricks", self.name, t)
+            tbl.drop()
+
+        try:
+            SPARK.sql(f"delete from fabricks.steps where step = '{self}'")
+        except Exception:
+            pass
+
+        self.database.drop()
+
+    def create(self):
+        DEFAULT_LOGGER.info("create", extra={"label": self})
+
+        if not self.runtime.exists():
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
+        else:
+            self.update()
+
+    def update(self, update_dependencies: Optional[bool] = True, progress_bar: Optional[bool] = False):
+        if not self.runtime.exists():
+            DEFAULT_LOGGER.warning(f"could not find {self.name} in runtime")
+
+        else:
+            if not self.database.exists():
+                self.database.create()
+
+            self.update_configurations()
+            errors = self.create_db_objects()
+
+            for e in errors:
+                DEFAULT_LOGGER.exception("fail to create db object", extra={"label": e["job"]}, exc_info=e["error"])
+
+            if update_dependencies:
+                self.update_dependencies(progress_bar=progress_bar)
+
+            self.update_tables_list()
+            self.update_views_list()
+            self.update_steps_list()
+
+    def get_dependencies(
+        self,
+        progress_bar: Optional[bool] = False,
+        topic: Optional[Union[str, List[str]]] = None,
+        include_manual: Optional[bool] = False,
+        loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
+    ) -> Tuple[DataFrame, List[Dict]]:
+        DEFAULT_LOGGER.debug("get dependencies", extra={"label": self})
+
+        df = self.get_jobs()
+
+        if not include_manual:
+            df = df.where("not options.type <=> 'manual'")
+
+        if topic:
+            if isinstance(topic, str):
+                topic = [topic]
+
+            where = ", ".join([f"'{t}'" for t in topic])
+            DEFAULT_LOGGER.debug(f"where topic in {where}", extra={"label": self})
+            df = df.where(f"topic in ({where})")
+
+        if not df:
+            raise ValueError("no jobs found")
+
+        results = run_in_parallel(
+            _get_dependencies,
+            df,
+            workers=16,
+            progress_bar=progress_bar,
+            logger=DEFAULT_LOGGER,
+            loglevel=logging.CRITICAL,
+        )
+
+        errors = [res for res in results if res.get("error")]
+        dependencies = []
+        for res in [res for res in results if res.get("dependencies")]:
+            dependencies.extend(res.get("dependencies"))
+
+        df = self.spark.createDataFrame([d.model_dump() for d in dependencies], SchemaDependencies)  # type: ignore
+        return df, errors
+
+    def get_jobs_iter(self, topic: Optional[str] = None) -> Iterable[dict]:
+        return read_yaml(self.runtime, root="job", preferred_file_name=topic)
+
+    def get_jobs(self, topic: Optional[str] = None) -> DataFrame:
+        DEFAULT_LOGGER.debug("get jobs", extra={"label": self})
+
+        try:
+            conf = get_step_conf(self.name)
+            schema = get_schema_for_type(conf)
+            jobs = self.get_jobs_iter(topic=topic)
+
+            df = SPARK.createDataFrame(jobs, schema=schema)  # type: ignore
+            df = df.withColumn("job_id", md5(expr("concat(step, '.', topic, '_', item)")))
+
+            duplicated_df = df.groupBy("job_id", "step", "topic", "item").count().where("count > 1")
+            duplicates = ",".join(f"{row.step}.{row.topic}_{row.item}" for row in duplicated_df.collect())
+            assert duplicated_df.isEmpty(), f"duplicated job(s) ({duplicates})"
+
+            if not df:
+                raise ValueError("no jobs found")
+
+            return df
+
+        except AssertionError as e:
+            DEFAULT_LOGGER.exception("fail to get jobs", extra={"label": self})
+            raise e
+
+    def create_db_objects(self, retry: Optional[bool] = True) -> List[Dict]:
+        DEFAULT_LOGGER.info("create db objects", extra={"label": self})
+
+        df = self.get_jobs()
+        table_df = self.database.get_tables()
+        view_df = self.database.get_views()
+
+        df = df.join(table_df, "job_id", how="left_anti")
+        df = df.join(view_df, "job_id", how="left_anti")
+
+        if df:
+            results = run_in_parallel(
+                _create_db_object,
+                df,
+                workers=16,
+                progress_bar=True,
+                logger=DEFAULT_LOGGER,
+                loglevel=logging.CRITICAL,
+            )
+
+            self.update_tables_list()
+            self.update_views_list()
+
+            errors = [res for res in results if res.get("error")]
+
+            if errors:
+                if retry:
+                    DEFAULT_LOGGER.warning("retry to create jobs", extra={"label": self})
+                    return self.create_db_objects(retry=False)
+
+            return errors
+
+    @deprecated("use create_db_objects instead")
+    def create_jobs(self, retry: Optional[bool] = True) -> List[Dict]:
+        return self.create_db_objects(retry=retry)
+
+    @deprecated("use update_configurations instead")
+    def update_jobs(self, drop: Optional[bool] = False):
+        return self.update_configurations(drop=drop)
+
+    def update_configurations(self, drop: Optional[bool] = False):
+        df = self.get_jobs()
+
+        DEFAULT_LOGGER.info("update configurations", extra={"label": self})
+
+        cdc = NoCDC("fabricks", self.name, "jobs")
+
+        if drop:
+            cdc.table.drop()
+        elif cdc.table.exists():
+            df_diffs = cdc.get_differences_with_deltatable(df)
+            if not df_diffs.isEmpty():
+                DEFAULT_LOGGER.warning("schema drift detected", extra={"label": self})
+                cdc.table.overwrite_schema(df=df)
+
+        cdc.delete_missing(df, keys=["job_id"])
+
+    @deprecated("use update_tables_list instead")
+    def update_tables(self):
+        return self.update_tables_list()
+
+    def update_tables_list(self):
+        df = self.database.get_tables()
+        df = df.withColumn("job_id", expr("md5(table)"))
+
+        DEFAULT_LOGGER.info("update tables list", extra={"label": self})
+        NoCDC("fabricks", self.name, "tables").delete_missing(df, keys=["job_id"])
+
+    @deprecated("use update_views_list instead")
+    def update_views(self):
+        return self.update_views_list()
+
+    def update_views_list(self):
+        df = self.database.get_views()
+        df = df.withColumn("job_id", expr("md5(view)"))
+
+        DEFAULT_LOGGER.info("update views list", extra={"label": self})
+        NoCDC("fabricks", self.name, "views").delete_missing(df, keys=["job_id"])
+
+    def update_dependencies(
+        self,
+        progress_bar: Optional[bool] = False,
+        topic: Optional[Union[str, List[str]]] = None,
+        include_manual: Optional[bool] = False,
+        loglevel: Optional[Literal[10, 20, 30, 40, 50]] = None,
+    ) -> List[Dict]:
+        df, errors = self.get_dependencies(
+            progress_bar=progress_bar,
+            topic=topic,
+            include_manual=include_manual,
+            loglevel=loglevel,
+        )
+        df.cache()
+
+        DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
+
+        update_where = None
+
+        if topic is None:
+            if not include_manual:
+                update_where = (
+                    f"job_id not in (select job_id from fabricks.{self.name}_jobs where not options.type <=> 'manual')"
+                )
+
+            if update_where:
+                DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
+
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
+                df,
+                keys=["dependency_id"],
+                update_where=update_where,
+            )
+
+        else:
+            if isinstance(topic, str):
+                topic = [topic]
+
+            where_topic = f"""topic in ('{"', '".join(topic)}')"""
+            where_not_manual = "-- manual job(s) included"
+            if not include_manual:
+                where_not_manual = "and not options.type <=> 'manual'"
+
+            update_where = (
+                f"""job_id in (select job_id from fabricks.{self.name}_jobs where {where_topic} {where_not_manual})"""
+            )
+            DEFAULT_LOGGER.debug(f"update where {update_where}", extra={"label": self})
+
+            NoCDC("fabricks", self.name, "dependencies").delete_missing(
+                df,
+                keys=["dependency_id"],
+                update_where=update_where,
+                uuid=True,
+            )
+
+        return errors
+
+    def register(self, update: Optional[bool] = False, drop: Optional[bool] = False):
+        if drop:
+            SPARK.sql(f"drop database if exists {self.name} cascade")
+            SPARK.sql(f"create database {self.name}")
+
+        if update:
+            self.update_configurations()
+
+        df = self.get_jobs()
+        if df:
+            table_df = self.database.get_tables()
+            if table_df:
+                df = df.join(table_df, "job_id", how="left_anti")
+
+        if df:
+            DEFAULT_LOGGER.setLevel(logging.CRITICAL)
+            run_in_parallel(_register, df, workers=16, progress_bar=True, run_as="Pool")
+            DEFAULT_LOGGER.setLevel(LOGLEVEL)
+
+    def update_steps_list(self):
+        order = self.options.get("order", 0)
+        df = SPARK.sql(f"select '{self.expand}' as expand, '{self.name}' as step, '{order}' :: int as `order`")
+
+        NoCDC("fabricks", "steps").delete_missing(df, keys=["step"], update_where=f"step = '{self.name}'")
+
+    def __str__(self):
+        return self.name
+
+
+# to avoid AttributeError: can't pickle local object
+def _get_dependencies(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        return {"job": str(job), "dependencies": job.get_dependencies()}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to get dependencies", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _create_db_object(row: Row):
+    job = get_job(step=row["step"], job_id=row["job_id"])
+    try:
+        job.create()
+        return {"job": str(job)}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to create db object", extra={"label": job})
+        return {"job": str(job), "error": e}
+
+
+def _register(row: Row):
+    job = get_job(step=row["step"], topic=row["topic"], item=row["item"])
+    try:
+        job.register()
+        return {"job": str(job)}
+    except Exception as e:
+        DEFAULT_LOGGER.exception("fail to register", extra={"label": job})
+        return {"job": str(job), "error": e}
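
A sketch of driving a step end to end with `BaseStep`, assuming a step named "bronze" is declared in the runtime configuration:

```python
from fabricks.core.steps.base import BaseStep

step = BaseStep("bronze")  # must match a configured bronze/silver/gold step

# creates the step database if missing, then syncs configurations,
# db objects, dependencies, and the jobs/tables/views/steps bookkeeping
step.update(update_dependencies=True, progress_bar=True)

print(step.workers)       # worker count, falling back to the runtime default
print(step.timeouts.job)  # per-job timeout from the step or runtime options
```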
fabricks/core/steps/get_step.py
@@ -0,0 +1,10 @@
+from typing import Union
+
+from fabricks.core.jobs.base._types import Steps, TStep
+from fabricks.core.steps.base import BaseStep
+
+
+def get_step(step: Union[TStep, str]) -> BaseStep:
+    assert step in Steps, f"{step} not found"
+    base_step = BaseStep(step=step)
+    return base_step
fabricks/core/steps/get_step_conf.py
@@ -0,0 +1,26 @@
+from typing import Union, cast
+
+from fabricks.core.jobs.base._types import Bronzes, Golds, JobConfBronze, JobConfGold, JobConfSilver, Silvers, TStep
+
+
+def get_step_conf(step: Union[TStep, str]):
+    if isinstance(step, str):
+        step = cast(TStep, step)
+
+    if step in Bronzes:
+        expand = "bronze"
+    elif step in Silvers:
+        expand = "silver"
+    elif step in Golds:
+        expand = "gold"
+    else:
+        raise ValueError(f"{step} - not found")
+
+    conf = {
+        "bronze": JobConfBronze,
+        "silver": JobConfSilver,
+        "gold": JobConfGold,
+    }.get(expand, None)
+
+    assert conf
+    return conf
+ return conf