fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/bronze.py
@@ -0,0 +1,333 @@
+ from typing import Optional, cast
+
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.functions import expr, lit, md5
+
+ from fabricks.cdc.nocdc import NoCDC
+ from fabricks.context import VARIABLES
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import TBronze
+ from fabricks.core.parsers import BaseParser
+ from fabricks.core.parsers.get_parser import get_parser
+ from fabricks.core.utils import clean
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.helpers import concat_ws
+ from fabricks.utils.path import Path
+ from fabricks.utils.read import read
+
+
+ class Bronze(BaseJob):
+     def __init__(
+         self, step: TBronze, topic: Optional[str] = None, item: Optional[str] = None, job_id: Optional[str] = None
+     ):  # type: ignore
+         super().__init__(
+             "bronze",
+             step=step,
+             topic=topic,
+             item=item,
+             job_id=job_id,
+         )
+
+     _parser: Optional[BaseParser] = None
+
+     @property
+     def stream(self) -> bool:
+         return self.mode not in ["register"]
+
+     @property
+     def schema_drift(self) -> bool:
+         return True
+
+     @property
+     def persist(self) -> bool:
+         return self.mode in ["append", "register"]
+
+     @property
+     def virtual(self) -> bool:
+         return False
+
+     @classmethod
+     def from_job_id(cls, step: str, job_id: str):
+         return cls(step=cast(TBronze, step), job_id=job_id)
+
+     @classmethod
+     def from_step_topic_item(cls, step: str, topic: str, item: str):
+         return cls(step=cast(TBronze, step), topic=topic, item=item)
+
+     @property
+     def data_path(self) -> Path:
+         uri = self.options.job.get("uri")
+         assert uri is not None, "no uri provided in options"
+         path = Path.from_uri(uri, regex=VARIABLES)
+         return path
+
+     def get_dependencies(self, df: Optional[DataFrame] = None) -> Optional[DataFrame]:
+         dependencies = []
+         parents = self.options.job.get_list("parents")
+         if parents:
+             for p in parents:
+                 dependencies.append(Row(self.job_id, p, "job"))
+         if dependencies:
+             df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
+             df = df.transform(self.add_dependency_details)
+             return df
+
+     def register_external_table(self):
+         options = self.conf.parser_options  # type: ignore
+         if options:
+             file_format = options.get("file_format")
+         else:
+             file_format = "delta"
+
+         Logger.debug(f"register external table ({self.data_path})", extra={"job": self})
+         self.spark.sql(
+             f"create table if not exists {self.qualified_name} using {file_format} location '{self.data_path}'"
+         )
+
+     def drop_external_table(self):
+         Logger.debug("drop external table", extra={"job": self})
+         self.spark.sql(f"drop table if exists {self.qualified_name}")
+
+     def optimize_external_table(
+         self,
+         vacuum: Optional[bool] = True,
+         analyze: Optional[bool] = True,
+     ):
+         Logger.debug("optimize external table", extra={"job": self})
+         if vacuum:
+             from delta import DeltaTable
+
+             dt = DeltaTable.forPath(self.spark, self.data_path.string)
+             retention_days = 7
+             Logger.debug(f"{self.data_path} - vacuum table (removing files older than {retention_days} days)")
+             try:
+                 self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = False")
+                 dt.vacuum(retention_days * 24)
+             finally:
+                 self.spark.sql("SET self.spark.databricks.delta.retentionDurationCheck.enabled = True")
+
+         if analyze:
+             Logger.debug(f"{self.data_path} - compute delta statistics")
+             self.spark.sql(f"analyze table delta.`{self.data_path}` compute delta statistics")
+
+     @property
+     def parser(self) -> BaseParser:
+         if not self._parser:
+             assert self.mode not in ["register"], f"{self.mode} not allowed"
+             name = self.options.job.get("parser")
+             assert name is not None, "parser not found"
+             options = self.conf.parser_options or None  # type: ignore
+             p = get_parser(name, options)
+             self._parser = p
+         return self._parser
+
+     def parse(self, stream: bool = False) -> DataFrame:
+         """
+         Parses the data based on the specified mode and returns a DataFrame.
+
+         Args:
+             stream (bool, optional): Indicates whether the data should be read as a stream. Defaults to False.
+
+         Returns:
+             DataFrame: The parsed data as a DataFrame.
+         """
+         if self.mode == "register":
+             if stream:
+                 df = read(
+                     stream=stream,
+                     path=self.data_path,
+                     file_format="delta",
+                     # spark=self.spark, (BUG)
+                 )
+             else:
+                 df = self.spark.sql(f"select * from {self}")
+             # cleaning done in parser
+             df = clean(df)
+         else:
+             df = self.parser.get_data(
+                 stream=stream,
+                 data_path=self.data_path,
+                 schema_path=self.paths.schema,
+                 spark=self.spark,
+             )
+         return df
+
+     def get_data(self, stream: bool = False, transform: bool = False) -> DataFrame:
+         df = self.parse(stream)
+         df = self.filter_where(df)
+         df = self.encrypt(df)
+         if transform:
+             df = self.base_transform(df)
+         return df
+
+     def add_calculated_columns(self, df: DataFrame) -> DataFrame:
+         calculated_columns = self.options.job.get_dict("calculated_columns")
+         if calculated_columns:
+             for key, value in calculated_columns.items():
+                 Logger.debug(f"add calculated column ({key} -> {value})", extra={"job": self})
+                 df = df.withColumn(key, expr(f"{value}"))
+         return df
+
+     def add_hash(self, df: DataFrame) -> DataFrame:
+         if "__hash" not in df.columns:
+             fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
+             Logger.debug("add hash", extra={"job": self})
+             if "__operation" in df.columns:
+                 fields += ["__operation == 'delete'"]
+             if "__source" in df.columns:
+                 fields += ["__source"]
+             df = df.withColumn("__hash", md5(expr(f"{concat_ws(fields)}")))
+         return df
+
+     def add_key(self, df: DataFrame) -> DataFrame:
+         if "__key" not in df.columns:
+             fields = self.options.job.get_list("keys")
+             if fields:
+                 Logger.debug(f"add key ({', '.join(fields)})", extra={"job": self})
+                 if "__source" in df.columns:
+                     fields = fields + ["__source"]
+                 fields = [f"`{f}`" for f in fields]
+                 df = df.withColumn("__key", md5(expr(f"{concat_ws(fields)}")))
+         return df
+
+     def add_source(self, df: DataFrame) -> DataFrame:
+         if "__source" not in df.columns:
+             source = self.options.job.get("source")
+             if source:
+                 Logger.debug(f"add source ({source})", extra={"job": self})
+                 df = df.withColumn("__source", lit(source))
+         return df
+
+     def add_operation(self, df: DataFrame) -> DataFrame:
+         if "__operation" not in df.columns:
+             operation = self.options.job.get("operation")
+             if operation:
+                 Logger.debug(f"add operation ({operation})", extra={"job": self})
+                 df = df.withColumn("__operation", lit(operation))
+             else:
+                 df = df.withColumn("__operation", lit("upsert"))
+         return df
+
+     def base_transform(self, df: DataFrame) -> DataFrame:
+         df = df.transform(self.extender)
+         df = df.transform(self.add_calculated_columns)
+         df = df.transform(self.add_hash)
+         df = df.transform(self.add_operation)
+         df = df.transform(self.add_source)
+         df = df.transform(self.add_key)
+
+         if "__metadata" in df.columns:
+             if self.mode == "register":
+                 # https://github.com/delta-io/delta/issues/2014 (BUG)
+                 df = df.withColumn(
+                     "__metadata",
+                     expr(
+                         f"""
+                         struct(
+                             concat_ws('/', '{self.data_path}', __timestamp, __operation) as file_path,
+                             __metadata.file_name as file_name,
+                             __metadata.file_size as file_size,
+                             __metadata.file_modification_time as file_modification_time,
+                             cast(current_date() as timestamp) as inserted
+                         )
+                         """
+                     ),
+                 )
+             else:
+                 df = df.withColumn(
+                     "__metadata",
+                     expr(
+                         """
+                         struct(
+                             __metadata.file_path as file_path,
+                             __metadata.file_name as file_name,
+                             __metadata.file_size as file_size,
+                             __metadata.file_modification_time as file_modification_time,
+                             cast(current_date() as timestamp) as inserted
+                         )
+                         """
+                     ),
+                 )
+         return df
+
+     def create_or_replace_view(self):
+         Logger.warning("create or replace view not allowed", extra={"job": self})
+
+     def overwrite_schema(self):
+         Logger.warning("schema overwrite not allowed", extra={"job": self})
+
+     def get_cdc_context(self, df: DataFrame) -> dict:
+         return {}
+
+     def for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+         assert self.persist, f"{self.mode} not allowed"
+
+         context = self.get_cdc_context(df)
+
+         # if dataframe, reference is passed (BUG)
+         name = f"{self.step}_{self.topic}_{self.item}__{batch}"
+         global_temp_view = create_or_replace_global_temp_view(name=name, df=df)
+         sql = f"select * from {global_temp_view}"
+
+         assert isinstance(self.cdc, NoCDC)
+         if self.mode == "append":
+             self.cdc.append(sql, **context)
+
+     def for_each_run(self, schedule: Optional[str] = None):
+         if self.mode == "register":
+             Logger.info("register (no run)", extra={"job": self})
+         elif self.mode == "memory":
+             Logger.info("memory (no run)", extra={"job": self})
+         else:
+             super().for_each_run(schedule=schedule)
+
+     def create(self):
+         if self.mode == "register":
+             self.register_external_table()
+         elif self.mode == "memory":
+             Logger.info("memory (no table nor view)", extra={"job": self})
+         else:
+             super().create()
+
+     def register(self):
+         if self.mode == "register":
+             self.register_external_table()
+         elif self.mode == "memory":
+             Logger.info("memory (no table nor view)", extra={"job": self})
+         else:
+             super().register()
+
+     def truncate(self):
+         if self.mode == "register":
+             Logger.info("register (no truncate)", extra={"job": self})
+         else:
+             super().truncate()
+
+     def restore(self):
+         if self.mode == "register":
+             Logger.info("register (no restore)", extra={"job": self})
+         else:
+             super().restore()
+
+     def drop(self):
+         if self.mode == "register":
+             self.drop_external_table()
+         super().drop()
+
+     def optimize(
+         self,
+         vacuum: Optional[bool] = True,
+         optimize: Optional[bool] = True,
+         analyze: Optional[bool] = True,
+     ):
+         if self.mode == "memory":
+             Logger.info("memory (no optimize)", extra={"job": self})
+         elif self.mode == "register":
+             self.optimize_external_table(vacuum, analyze)
+         else:
+             super().optimize()
+
+     def overwrite(self):
+         self.truncate()
+         self.run()
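
The Bronze job above keys its behaviour off self.mode: "register" only points the metastore at an existing Delta location, "memory" skips persistence entirely, and the remaining modes parse the source, decorate it (__hash, __key, __source, __operation, __metadata) and append it. A minimal usage sketch, assuming a configured bronze step; the "sales"/"orders" topic and item are hypothetical names that would have to exist in the runtime configuration:

# illustrative only; step, topic and item must match a declared job
from fabricks.core.jobs.bronze import Bronze

job = Bronze.from_step_topic_item(step="bronze", topic="sales", item="orders")

job.create()    # "register" mode creates an external table, otherwise delegates to BaseJob
job.run()       # skipped for "register"/"memory" modes (see for_each_run above)
job.optimize(vacuum=True, optimize=True, analyze=True)  # vacuum + statistics for external tables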
fabricks/core/jobs/get_job.py
@@ -0,0 +1,126 @@
+ from typing import Optional, cast, overload
+
+ from pyspark.sql import Row
+
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import Bronzes, Golds, Silvers, TBronze, TGold, TSilver
+ from fabricks.core.jobs.get_job_id import get_job_id
+
+
+ @overload
+ def get_job(step: str, *, job_id: str) -> BaseJob: ...
+
+
+ @overload
+ def get_job(step: str, *, topic: str, item: str) -> BaseJob: ...
+
+
+ @overload
+ def get_job(*, row: Row) -> BaseJob: ...
+
+
+ @overload
+ def get_job(*, job: str) -> BaseJob: ...
+
+
+ def get_job(
+     step: Optional[str] = None,
+     topic: Optional[str] = None,
+     item: Optional[str] = None,
+     job_id: Optional[str] = None,
+     job: Optional[str] = None,
+     row: Optional[Row] = None,
+ ) -> BaseJob:
+     """
+     Retrieve a job based on the provided parameters.
+
+     Args:
+         step (Optional[str]): The step of the job.
+         topic (Optional[str]): The topic of the job.
+         item (Optional[str]): The item of the job.
+         job_id (Optional[str]): The ID of the job.
+         job (Optional[str]): The job string.
+         row (Optional[Row]): The row object containing job information.
+
+     Returns:
+         BaseJob: The retrieved job.
+
+     Raises:
+         ValueError: If the required parameters are not provided.
+
+     """
+     if row:
+         if "step" in row and "topic" in row and "item" in row:
+             j = _get_job(step=row.step, topic=row.topic, item=row.item)
+         elif "step" in row and "job_id" in row:
+             j = get_job(step=row.step, job_id=row.job_id)
+         elif "job" in row:
+             parts = row.job.split(".")
+             s = parts[0]
+             job_id = get_job_id(job=row.job)
+             j = _get_job(step=s, job_id=job_id)
+         else:
+             raise ValueError("step, topic, item or step, job_id or job mandatory")
+
+     elif job:
+         parts = job.split(".")
+         s = parts[0]
+         job_id = get_job_id(job=job)
+         j = _get_job(step=s, job_id=job_id)
+
+     elif job_id:
+         assert step, "step mandatory"
+         j = _get_job(step=step, job_id=job_id)
+
+     else:
+         assert step, "step mandatory"
+         assert topic, "topic mandatory"
+         assert item, "item mandatory"
+         j = _get_job(step=step, topic=topic, item=item)
+
+     return j
+
+
+ def _get_job(
+     step: str,
+     topic: Optional[str] = None,
+     item: Optional[str] = None,
+     job_id: Optional[str] = None,
+ ):
+     if step in Bronzes:
+         from fabricks.core.jobs.bronze import Bronze
+
+         step = cast(TBronze, step)
+         if job_id is not None:
+             job = Bronze.from_job_id(step=step, job_id=job_id)
+         else:
+             assert topic
+             assert item
+             job = Bronze.from_step_topic_item(step=step, topic=topic, item=item)
+
+     elif step in Silvers:
+         from fabricks.core.jobs.silver import Silver
+
+         step = cast(TSilver, step)
+         if job_id is not None:
+             job = Silver.from_job_id(step=step, job_id=job_id)
+         else:
+             assert topic
+             assert item
+             job = Silver.from_step_topic_item(step=step, topic=topic, item=item)
+
+     elif step in Golds:
+         from fabricks.core.jobs.gold import Gold
+
+         step = cast(TGold, step)
+         if job_id is not None:
+             job = Gold.from_job_id(step=step, job_id=job_id)
+         else:
+             assert topic
+             assert item
+             job = Gold.from_step_topic_item(step=step, topic=topic, item=item)
+
+     else:
+         raise ValueError(f"{step} not found")
+
+     return job
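
get_job accepts several mutually exclusive ways of naming a job (step/topic/item, step plus job_id, a dotted "step.topic_item" string, or a Spark Row) and dispatches to the Bronze, Silver or Gold class owning that step. A hedged sketch with illustrative names that would have to exist in the runtime configuration:

from fabricks.core.jobs.get_job import get_job

j1 = get_job(step="bronze", topic="sales", item="orders")  # by step, topic and item
j2 = get_job(job="bronze.sales_orders")                    # dotted "step.topic_item" string
j3 = get_job(step="bronze", job_id=j1.job_id)              # by step and job_id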
fabricks/core/jobs/get_job_conf.py
@@ -0,0 +1,115 @@
+ from typing import Optional, cast, overload
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import Row
+
+ from fabricks.context import IS_LIVE
+ from fabricks.core.jobs.base.types import Bronzes, Golds, JobConf, Silvers, TBronze, TGold, TSilver, TStep
+
+
+ @overload
+ def get_job_conf(step: TStep, *, job_id: str) -> JobConf: ...
+
+
+ @overload
+ def get_job_conf(step: TStep, *, topic: str, item: str) -> JobConf: ...
+
+
+ def _get_job_conf(step: TStep, row: Row) -> JobConf:
+     options = row["options"].asDict() if row["options"] else None
+     table_options = row["table_options"].asDict() if row["table_options"] else None
+     check_options = row["check_options"].asDict() if row["check_options"] else None
+     spark_options = row["spark_options"].asDict() if row["spark_options"] else None
+     invoker_options = row["invoker_options"].asDict() if row["invoker_options"] else None
+
+     if step in Bronzes:
+         from fabricks.core.jobs.base.types import JobConfBronze
+
+         assert options is not None, "no option"
+         parser_options = row["parser_options"].asDict() if row["parser_options"] else None
+         step = cast(TBronze, step)
+         return JobConfBronze(
+             job_id=row["job_id"],
+             topic=row["topic"],
+             item=row["item"],
+             step=step,
+             options=options,
+             parser_options=parser_options,
+             table_options=table_options,
+             check_options=check_options,
+             invoker_options=invoker_options,
+             spark_options=spark_options,
+             tags=row["tags"],
+         )
+
+     elif step in Silvers:
+         from fabricks.core.jobs.base.types import JobConfSilver
+
+         assert options is not None, "no option"
+         step = cast(TSilver, step)
+         return JobConfSilver(
+             job_id=row["job_id"],
+             topic=row["topic"],
+             item=row["item"],
+             step=step,
+             options=options,
+             table_options=table_options,
+             check_options=check_options,
+             invoker_options=invoker_options,
+             spark_options=spark_options,
+             tags=row["tags"],
+         )
+
+     elif step in Golds:
+         from fabricks.core.jobs.base.types import JobConfGold
+
+         assert options is not None, "no option"
+         step = cast(TGold, step)
+         return JobConfGold(
+             job_id=row["job_id"],
+             topic=row["topic"],
+             item=row["item"],
+             step=step,
+             options=options,
+             table_options=table_options,
+             check_options=check_options,
+             invoker_options=invoker_options,
+             spark_options=spark_options,
+             tags=row["tags"],
+         )
+
+     else:
+         raise ValueError(f"{step} not found")
+
+
+ def get_job_conf(
+     step: TStep,
+     job_id: Optional[str] = None,
+     topic: Optional[str] = None,
+     item: Optional[str] = None,
+ ) -> JobConf:
+     if IS_LIVE:
+         from fabricks.core.steps import get_step
+
+         s = get_step(step=step)
+         if topic:
+             df = s.get_jobs(topic=topic)
+         else:
+             df = s.get_jobs()
+     else:
+         df = spark.sql(f"select * from fabricks.{step}_jobs")
+
+     assert df, f"{step} not found"
+
+     if job_id:
+         try:
+             row = df.where(f"job_id == '{job_id}'").collect()[0]
+         except IndexError:
+             raise ValueError(f"job not found ({step}, {job_id})")
+     else:
+         try:
+             row = df.where(f"topic == '{topic}' and item == '{item}'").collect()[0]
+         except IndexError:
+             raise ValueError(f"job not found ({step}, {topic}, {item})")
+
+     return _get_job_conf(step=step, row=row)
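
get_job_conf resolves the job's configuration row either live from the step's YAML definitions or from the fabricks.{step}_jobs view, then wraps it in the step-specific dataclass (JobConfBronze, JobConfSilver or JobConfGold). A minimal sketch with illustrative identifiers:

# assumes a "bronze" step and an existing sales/orders job in the jobs table or YAML
from fabricks.core.jobs.get_job_conf import get_job_conf

conf = get_job_conf(step="bronze", topic="sales", item="orders")
print(conf.job_id, conf.options)  # fields of the step-specific JobConf dataclass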
fabricks/core/jobs/get_job_id.py
@@ -0,0 +1,26 @@
+ from typing import Optional, overload
+
+ from fabricks.utils.helpers import md5
+
+
+ @overload
+ def get_job_id(step: str, topic: str, item: str) -> str: ...
+
+
+ @overload
+ def get_job_id(*, job: str) -> str: ...
+
+
+ def get_job_id(
+     step: Optional[str] = None,
+     topic: Optional[str] = None,
+     item: Optional[str] = None,
+     job: Optional[str] = None,
+ ) -> str:
+     if not job:
+         assert step
+         assert topic
+         assert item
+         job = f"{step}.{topic}_{item}"
+
+     return md5(job)
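
The job id is simply an MD5 over the canonical "step.topic_item" string, so both call styles resolve to the same value. A sketch of that equivalence, assuming (not confirmed by this diff) that fabricks.utils.helpers.md5 returns the hex digest of the UTF-8 encoded string; the names are illustrative:

import hashlib

from fabricks.core.jobs.get_job_id import get_job_id

a = get_job_id(step="bronze", topic="sales", item="orders")
b = get_job_id(job="bronze.sales_orders")
assert a == b  # both hash the same "bronze.sales_orders" string
# under the stated assumption, this matches a plain hashlib digest of that string
hashlib.md5("bronze.sales_orders".encode("utf-8")).hexdigest()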
fabricks/core/jobs/get_jobs.py
@@ -0,0 +1,89 @@
+ from dataclasses import dataclass
+ from typing import List, Optional, TypedDict, Union
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.functions import expr
+
+ from fabricks.context import IS_LIVE, PATHS_RUNTIME
+ from fabricks.core.jobs.base.job import BaseJob
+ from fabricks.core.jobs.base.types import Modes, TStep
+ from fabricks.core.jobs.get_job import get_job
+ from fabricks.utils.helpers import concat_dfs, run_in_parallel
+ from fabricks.utils.path import Path
+ from fabricks.utils.read import read_yaml
+ from fabricks.utils.schema import get_schema_for_type
+
+
+ class GenericOptions(TypedDict):
+     mode: Modes
+
+
+ @dataclass
+ class JobConfGeneric:
+     step: TStep
+     job_id: str
+     topic: str
+     item: str
+     options: GenericOptions
+
+
+ def _get_job(row: Row):
+     return get_job(row=row)
+
+
+ def _get_jobs() -> DataFrame:
+     if IS_LIVE:
+         schema = get_schema_for_type(JobConfGeneric)
+
+         def _read_yaml(path: Path):
+             df = read_yaml(path, root="job", schema=schema)
+             if df:
+                 df = df.withColumn("job_id", expr("md5(concat(step,'.',topic,'_',item))"))
+             return df
+
+         dfs = run_in_parallel(_read_yaml, list(PATHS_RUNTIME.values()))
+         df = concat_dfs(dfs)
+
+     else:
+         df = spark.sql("select * from fabricks.jobs")
+
+     return df
+
+
+ def get_jobs(df: Optional[DataFrame] = None, convert: Optional[bool] = False) -> Union[List[BaseJob], DataFrame]:
+     """
+     Retrieves a list of jobs or a DataFrame containing job information.
+
+     Args:
+         df (Optional[DataFrame]): Optional DataFrame containing job information.
+         convert (Optional[bool]): Flag indicating whether to convert the DataFrame to a list of jobs.
+
+     Returns:
+         Union[List[BaseJob], DataFrame]: If `convert` is False, returns the jobs DataFrame.
+         If `convert` is True, returns a list of BaseJob objects built from the rows.
+
+     Raises:
+         ValueError: If the DataFrame does not contain the required columns.
+
+     """
+     if not convert:
+         return _get_jobs()
+
+     else:
+         if df is None:
+             df = _get_jobs()
+         else:
+             if "step" in df.columns and "topic" in df.columns and "item" in df.columns:
+                 df = df.select("step", "topic", "item")
+             elif "step" in df.columns and "job_id" in df.columns:
+                 df = df.select("step", "job_id")
+             elif "job" in df.columns:
+                 df = df.select("job")
+             else:
+                 raise ValueError("step, topic, item or step, job_id or job mandatory")
+
+         assert df
+
+         jobs = run_in_parallel(_get_job, df)
+         return jobs
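
get_jobs either hands back the raw jobs DataFrame (read from the runtime YAML when live, otherwise from fabricks.jobs) or, with convert=True, instantiates a job object per row in parallel. A minimal usage sketch, assuming a deployed Fabricks runtime:

from fabricks.core.jobs.get_jobs import get_jobs

df = get_jobs()                       # DataFrame of job definitions
jobs = get_jobs(df=df, convert=True)  # list of BaseJob instances, one per row
for job in jobs:
    print(job)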