fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
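
Only three of the files listed above have their content reproduced in the hunks below, all from fabricks/core/jobs/base/. For orientation, the direct inheritance relationships that can be read from the imports and class statements in those hunks are sketched here as empty stubs (the parent classes are stubbed only; their real implementations live in the modules listed above and are not shown):

# Empty stubs only; read directly from the three hunks that follow.
class Configurator: ...             # configurator.py
class Generator(Configurator): ...  # generator.py (first hunk below)

class Checker: ...                  # checker.py
class Invoker(Checker): ...         # invoker.py (second hunk below)

class Processor: ...                # processor.py
class BaseJob(Processor): ...       # job.py (third hunk below)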
fabricks/core/jobs/base/generator.py
@@ -0,0 +1,391 @@
+ from typing import Any, Optional, Union, cast
+
+ from pyspark.sql import DataFrame, Row
+ from pyspark.sql.functions import expr, lit
+
+ from fabricks.cdc import SCD1
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.configurator import Configurator
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+
+
+ class Generator(Configurator):
+     def update_dependencies(self):
+         Logger.info("update dependencies", extra={"job": self})
+
+         df = self.get_dependencies()
+         if df:
+             scd1 = SCD1("fabricks", self.step, "dependencies")
+             scd1.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
+
+     def add_dependency_details(self, df: DataFrame) -> DataFrame:
+         df = df.withColumn("__parent", expr("replace(parent, '__current', '')"))
+         df = df.withColumn("parent_id", expr("md5(__parent)"))
+         df = df.withColumn("dependency_id", expr("md5(concat_ws('*', job_id, parent))"))
+         df = df.drop("__parent")
+         return df
+
+     def get_dependencies(self) -> Optional[DataFrame]:
+         import re
+
+         df = self.get_data(self.stream)
+         jvm = df._sc._jvm  # type: ignore
+         explain_plan = cast(Any, jvm.PythonSQLUtils).explainString(cast(Any, df._jdf).queryExecution(), "extended")  # type: ignore
+
+         dependencies = []
+         r = re.compile(r"(?<=SubqueryAlias spark_catalog\.)[^.]*\.[^.\n]*")
+         matches = re.findall(r, explain_plan)
+         matches = list(set(matches))
+         for m in matches:
+             dependencies.append(Row(self.job_id, m, "parser"))
+         parents = self.options.job.get_list("parents") or []
+         for p in parents:
+             dependencies.append(Row(self.job_id, p, "job"))
+
+         if dependencies:
+             Logger.debug(f"dependencies ({', '.join([row[1] for row in dependencies])})", extra={"job": self})
+             df = self.spark.createDataFrame(dependencies, schema=["job_id", "parent", "origin"])
+             df = df.transform(self.add_dependency_details)
+             assert df.where("job_id == parent_id").count() == 0, "circular dependency found"
+             return df
+
+     def rm(self):
+         """
+         Removes the schema folder and checkpoints associated with the generator.
+
+         If the schema folder exists, it will be deleted. The method also calls the `rm_checkpoints` method to remove any checkpoints associated with the generator.
+         """
+         if self.paths.schema.exists():
+             Logger.info("delete schema folder", extra={"job": self})
+             self.paths.schema.rm()
+         self.rm_checkpoints()
+
+     def rm_checkpoints(self):
+         """
+         Removes the checkpoints folder if it exists.
+
+         This method checks if the checkpoints folder exists and deletes it if it does.
+         """
+         if self.paths.checkpoints.exists():
+             Logger.info("delete checkpoints folder", extra={"job": self})
+             self.paths.checkpoints.rm()
+
+     def rm_commit(self, id: Union[str, int]):
+         """
+         Remove a commit with the given ID.
+
+         Args:
+             id (Union[str, int]): The ID of the commit to remove.
+
+         Returns:
+             None
+         """
+         path = self.paths.commits.join(str(id))
+         if path.exists():
+             Logger.warning(f"delete commit {id}", extra={"job": self})
+             path.rm()
+
+     def truncate(self):
+         """
+         Truncates the job by removing all data associated with it.
+
+         This method removes the job from the system and, if the `persist` flag is set to True,
+         it also truncates the associated table.
+
+         Returns:
+             None
+         """
+         Logger.warning("truncate", extra={"job": self})
+         self.rm()
+         if self.persist:
+             self.table.truncate()
+
+     def drop(self):
+         """
+         Drops the current job and its dependencies.
+
+         This method drops the current job and its dependencies by performing the following steps:
+         1. Queries the database to check if there are any child jobs associated with the current job.
+         2. If child jobs are found, logs a warning message and prints the list of child jobs.
+         3. Drops the current job's change data capture (cdc).
+         4. Removes the current job.
+
+         Note: This method handles any exceptions that occur during the process.
+
+         Returns:
+             None
+         """
+         try:
+             row = self.spark.sql(
+                 f"""
+                 select
+                     count(*) as count,
+                     array_join(sort_array(collect_set(j.job)), ', \n') as children
+                 from
+                     fabricks.dependencies d
+                     inner join fabricks.jobs j on d.job_id = j.job_id
+                 where
+                     parent like '{self}'
+                 """
+             ).collect()[0]
+             if cast(int, row.count) > 0:
+                 Logger.warning(f"{row.count} children found", extra={"job": self, "content": row.children})
+         except Exception:
+             pass
+         self.cdc.drop()
+         self.rm()
+
+     def create(self):
+         """
+         Creates a table or view based on the specified mode.
+
+         If `persist` is True, it creates a table by calling the `create_table` method.
+         If `virtual` is True, it creates or replaces a view by calling the `create_or_replace_view` method.
+         If neither `persist` nor `virtual` is True, it raises a ValueError.
+
+         Raises:
+             ValueError: If neither `persist` nor `virtual` is True.
+
+         """
+         if self.persist:
+             self.create_table()
+         elif self.virtual:
+             self.create_or_replace_view()
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def register(self):
+         """
+         Register the job.
+
+         If `persist` is True, the job's table is registered.
+         If `virtual` is True, a view is created or replaced.
+         Otherwise, a ValueError is raised.
+
+         Raises:
+             ValueError: If `persist` and `virtual` are both False.
+
+         """
+         if self.persist:
+             self.table.register()
+         elif self.virtual:
+             self.create_or_replace_view()
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def create_or_replace_view(self):
+         """
+         Creates or replaces a view.
+
+         This method is responsible for creating or replacing a view in the database.
+         It should be implemented by subclasses to define the specific logic for creating or replacing the view.
+
+         Raises:
+             NotImplementedError: This method is meant to be overridden by subclasses.
+         """
+         raise NotImplementedError()
+
+     def create_table(self):
+         def _create_table(df: DataFrame, batch: Optional[int] = 0):
+             df = self.base_transform(df)
+             cdc_options = self.get_cdc_context(df)
+
+             cluster_by = []
+             partition_by = []
+
+             powerbi = False
+             liquid_clustering = False
+             partitioning = False
+             identity = False
+
+             # first take from job options, then from step options
+             job_powerbi = self.options.table.get_boolean("powerbi", None)
+             step_powerbi = self.step_conf.get("table_options", {}).get("powerbi", None)
+             if job_powerbi is not None:
+                 powerbi = job_powerbi
+             elif step_powerbi is not None:
+                 powerbi = step_powerbi
+
+             if powerbi:
+                 properties = {
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                     "fabricks.last_version": "0",
+                 }
+             else:
+                 properties = {
+                     "delta.enableDeletionVectors": "true",
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                     "delta.feature.timestampNtz": "supported",
+                     "fabricks.last_version": "0",
+                 }
+
+             if "__identity" in df.columns:
+                 identity = False
+             else:
+                 identity = self.options.table.get_boolean("identity", False)
+
+             # first take from job options, then from step options
+             liquid_clustering_job = self.options.table.get_boolean("liquid_clustering", None)
+             liquid_clustering_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
+             if liquid_clustering_job is not None:
+                 liquid_clustering = liquid_clustering_job
+             elif liquid_clustering_step:
+                 liquid_clustering = liquid_clustering_step
+
+             if liquid_clustering:
+                 cluster_by = self.options.table.get_list("cluster_by") or []
+                 if not cluster_by:
+                     if "__source" in df.columns:
+                         cluster_by.append("__source")
+                     if "__is_current" in df.columns:
+                         cluster_by.append("__is_current")
+                     if "__key" in df.columns:
+                         cluster_by.append("__key")
+                     elif "__hash" in df.columns:
+                         cluster_by.append("__hash")
+
+                 if not cluster_by:
+                     Logger.warning("liquid clustering disabled (no clustering columns found)", extra={"job": self})
+                     liquid_clustering = False
+                     cluster_by = None
+
+             if not liquid_clustering:
+                 cluster_by = None
+                 partition_by = self.options.table.get_list("partition_by")
+                 if partition_by:
+                     partitioning = True
+
+             if not powerbi:
+                 # first take from job options, then from step options
+                 if self.options.table.get_dict("properties"):
+                     properties = self.options.table.get_dict("properties")
+                 elif self.step_conf.get("table_options", {}).get("properties", {}):
+                     properties = self.step_conf.get("table_options", {}).get("properties", {})
+
+             # if dataframe, reference is passed (BUG)
+             name = f"{self.step}_{self.topic}_{self.item}__init"
+             global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 ==2 "))
+             sql = f"select * from {global_temp_view}"
+
+             self.cdc.create_table(
+                 sql,
+                 identity=identity,
+                 liquid_clustering=liquid_clustering,
+                 cluster_by=cluster_by,
+                 partitioning=partitioning,
+                 partition_by=partition_by,
+                 properties=properties,
+                 **cdc_options,
+             )
+
+         if not self.table.exists():
+             df = self.get_data(self.stream)
+             if df:
+                 if self.stream:
+                     # add dummy stream to be sure that the writeStream will start
+                     dummy_df = self.spark.readStream.table("fabricks.dummy")
+                     # __metadata is always present
+                     dummy_df = dummy_df.withColumn("__metadata", lit(None))
+                     dummy_df = dummy_df.select("__metadata")
+
+                     df = df.unionByName(dummy_df, allowMissingColumns=True)
+                     path = self.paths.checkpoints.append("__init")
+                     if path.exists():
+                         path.rm()
+
+                     query = (
+                         df.writeStream.foreachBatch(_create_table)
+                         .option("checkpointLocation", path.string)
+                         .trigger(once=True)
+                         .start()
+                     )
+                     query.awaitTermination()
+                     path.rm()
+                 else:
+                     _create_table(df)
+
+                 constraints = self.options.table.get_dict("constraints")
+                 if constraints:
+                     for key, value in constraints.items():
+                         self.table.add_constraint(name=key, expr=value)
+
+                 comment = self.options.table.get("comment")
+                 if comment:
+                     self.table.add_comment(comment=comment)
+
+     def _update_schema(self, df: Optional[DataFrame] = None, overwrite: Optional[bool] = False):
+         def _update_schema(df: DataFrame, batch: Optional[int] = None):
+             if overwrite:
+                 self.cdc.overwrite_schema(df)
+             else:
+                 self.cdc.update_schema(df)
+
+         if self.persist:
+             if df is not None:
+                 _update_schema(df)
+             else:
+                 df = self.get_data(self.stream)
+                 assert df is not None
+                 df = self.base_transform(df)
+
+                 if self.stream:
+                     path = self.paths.checkpoints.append("__schema")
+                     query = (
+                         df.writeStream.foreachBatch(_update_schema)
+                         .option("checkpointLocation", path.string)
+                         .trigger(once=True)
+                         .start()
+                     )
+                     query.awaitTermination()
+                     path.rm()
+                 else:
+                     _update_schema(df)
+
+         elif self.virtual:
+             self.create_or_replace_view()
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def update_schema(self, df: Optional[DataFrame] = None):
+         Logger.info("update schema", extra={"job": self})
+         self._update_schema(df=df, overwrite=False)
+
+     def overwrite_schema(self, df: Optional[DataFrame] = None):
+         Logger.info("overwrite schema", extra={"job": self})
+         self._update_schema(df=df, overwrite=True)
+
+     def enable_liquid_clustering(self):
+         df = self.table.dataframe
+         enable = False
+
+         # first take from job options, then from step options
+         enable_job = self.options.table.get_boolean("liquid_clustering", None)
+         enable_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
+         if enable_job is not None:
+             enable = enable_job
+         elif enable_step:
+             enable = enable_step
+
+         if enable:
+             cluster_by = self.options.table.get_list("cluster_by") or []
+             if not cluster_by:
+                 if "__source" in df.columns:
+                     cluster_by.append("__source")
+                 if "__is_current" in df.columns:
+                     cluster_by.append("__is_current")
+                 if "__key" in df.columns:
+                     cluster_by.append("__key")
+                 elif "__hash" in df.columns:
+                     cluster_by.append("__hash")
+
+             if len(cluster_by) > 0:
+                 self.table.enable_liquid_clustering(cluster_by)
+             else:
+                 Logger.warning("liquid clustering not enabled (no clustering column found)", extra={"job": self})
+
+         else:
+             Logger.debug("liquid clustering not enabled", extra={"job": self})
fabricks/core/jobs/base/invoker.py
@@ -0,0 +1,119 @@
+ import json
+ from typing import Optional, overload
+
+ from fabricks.context import PATH_RUNTIME
+ from fabricks.context.log import Logger
+ from fabricks.core.jobs.base.checker import Checker
+ from fabricks.core.jobs.base.error import InvokerFailedException
+ from fabricks.core.schedules import get_schedule
+ from fabricks.utils.path import Path
+
+
+ class Invoker(Checker):
+     def pre_run_invoke(self, schedule: Optional[str] = None):
+         self._job_position_invoke(position="pre_run", schedule=schedule)
+         self._step_position_invoke(position="pre_run", schedule=schedule)
+
+     def post_run_invoke(self, schedule: Optional[str] = None):
+         self._job_position_invoke(position="post_run", schedule=schedule)
+         self._step_position_invoke(position="post_run", schedule=schedule)
+
+     def _job_position_invoke(self, position: str, schedule: Optional[str] = None):
+         if self.options.invoker.get(position):
+             Logger.info(f"{position}-invoke", extra={"job": self})
+             try:
+                 options = self.options.invoker.get_dict(position)
+                 assert options
+
+                 notebook = options.notebook  # type: ignore
+                 assert notebook, "notebook mandatory"
+                 path = PATH_RUNTIME.join(notebook)
+
+                 arguments = options.arguments or {}  # type: ignore
+                 timeout = arguments.get("timeout")
+                 if timeout is None:
+                     if position == "pre_run":
+                         timeout = self.timeouts.pre_run
+                     elif position == "post_run":
+                         timeout = self.timeouts.post_run
+
+                 self.invoke(path, arguments, timeout, schedule)
+             except Exception:
+                 raise InvokerFailedException(position)
+
+     def _step_position_invoke(self, position: str, schedule: Optional[str] = None):
+         if self.step_conf.get("options", {}).get(position, None):
+             Logger.info(f"{self.step} - {position}-invoke")
+             try:
+                 options = self.step_conf.get("options", {}).get(position, None)
+                 assert options
+
+                 notebook = options.get("notebook")  # type: ignore
+                 assert notebook, "notebook mandatory"
+                 path = PATH_RUNTIME.join(notebook)
+
+                 arguments = options.get("arguments", {})  # type: ignore
+                 timeout = arguments.get("timeout")
+                 if timeout is None:
+                     if position == "pre_run":
+                         timeout = self.timeouts.pre_run
+                     elif position == "post_run":
+                         timeout = self.timeouts.post_run
+
+                 self.invoke(path, arguments, timeout, schedule)
+             except Exception:
+                 raise InvokerFailedException(position)
+
+     @overload
+     def invoke(self, path: Path, arguments: dict, timeout: Optional[int] = None, schedule: Optional[str] = None): ...
+
+     @overload
+     def invoke(self, *, schedule: Optional[str] = None): ...
+
+     def invoke(
+         self,
+         path: Optional[Path] = None,
+         arguments: Optional[dict] = None,
+         timeout: Optional[int] = None,
+         schedule: Optional[str] = None,
+     ):
+         """
+         Invokes a notebook job.
+
+         Args:
+             path (Optional[Path]): The path to the notebook file. If not provided, it will be retrieved from the invoker options.
+             arguments (Optional[dict]): Additional arguments to pass to the notebook job. If not provided, it will be retrieved from the invoker options.
+             schedule (Optional[str]): The schedule for the job. If provided, schedule variables will be retrieved.
+
+         Raises:
+             AssertionError: If the specified path does not exist.
+
+         """
+         if path is None:
+             notebook = self.options.invoker.get_dict("notebook")
+             path = PATH_RUNTIME.join(notebook)
+         assert path.exists(), f"{path} not found"
+
+         if arguments is None:
+             arguments = self.options.invoker.get_dict("arguments") or {}
+
+         if schedule is not None:
+             variables = get_schedule(schedule).select("options.variables").collect()[0][0]
+         else:
+             variables = {}
+
+         if timeout is None:
+             timeout = self.timeouts.job
+
+         self.dbutils.notebook.run(
+             path.get_notebook_path(),
+             timeout,
+             {
+                 "step": self.step,
+                 "topic": self.topic,
+                 "item": self.item,
+                 **arguments,
+                 "job_options": json.dumps(self.options.job.options),
+                 "schedule_variables": json.dumps(variables),
+             },
+         )
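
Invoker.invoke above hands the target notebook a flat widget dictionary: the job coordinates (step, topic, item), any invoker arguments, and the job options and schedule variables serialized as JSON strings. A small self-contained sketch of how that payload is assembled; all values below are made up, and the shapes of job_options and schedule_variables are assumptions, not taken from the package.

import json

# Illustrative inputs (assumed values, not real configuration).
step, topic, item = "silver", "sales", "orders"
arguments = {"reload": "True"}           # invoker arguments (assumed)
job_options = {"type": "default"}        # assumed shape of job options
schedule_variables = {"env": "test"}     # assumed shape of schedule variables

# Same assembly as in Invoker.invoke: fixed keys first, then the arguments,
# then the JSON-encoded option blobs.
payload = {
    "step": step,
    "topic": topic,
    "item": item,
    **arguments,
    "job_options": json.dumps(job_options),
    "schedule_variables": json.dumps(schedule_variables),
}
print(payload)

Encoding the nested option dictionaries as JSON keeps every widget value a plain string, which is what a notebook-run call expects for its parameters.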
fabricks/core/jobs/base/job.py
@@ -0,0 +1,5 @@
+ from fabricks.core.jobs.base.processor import Processor
+
+
+ class BaseJob(Processor):
+     pass