fabricks-3.0.11-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/generator.py
@@ -0,0 +1,447 @@
+ from abc import abstractmethod
+ from typing import Optional, Sequence, Union, cast
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import lit
+
+ from fabricks.cdc import NoCDC
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base._types import JobDependency
+ from fabricks.core.jobs.base.configurator import Configurator
+ from fabricks.metastore.table import SchemaDiff
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+
+
+ class Generator(Configurator):
+     def update_dependencies(self):
+         DEFAULT_LOGGER.info("update dependencies", extra={"label": self})
+
+         deps = self.get_dependencies()
+         if deps:
+             df = self.spark.createDataFrame([d.model_dump() for d in deps])  # type: ignore
+             cdc = NoCDC("fabricks", self.step, "dependencies")
+             cdc.delete_missing(df, keys=["dependency_id"], update_where=f"job_id = '{self.job_id}'", uuid=True)
+
+     @abstractmethod
+     def get_dependencies(self) -> Sequence[JobDependency]: ...
+
+     def rm(self):
+         """
+         Removes the schema folder and the checkpoints associated with the job.
+
+         If the schema folder exists, it is deleted. The method then calls `rm_checkpoints` to remove any checkpoints associated with the job.
+         """
+         if self.paths.schema.exists():
+             DEFAULT_LOGGER.info("delete schema folder", extra={"label": self})
+             self.paths.schema.rm()
+         self.rm_checkpoints()
+
+     def rm_checkpoints(self):
+         """
+         Removes the checkpoints folder if it exists.
+         """
+         if self.paths.checkpoints.exists():
+             DEFAULT_LOGGER.info("delete checkpoints folder", extra={"label": self})
+             self.paths.checkpoints.rm()
+
+     def rm_commit(self, id: Union[str, int]):
+         """
+         Removes the commit with the given ID.
+
+         Args:
+             id (Union[str, int]): The ID of the commit to remove.
+         """
+         path = self.paths.commits.joinpath(str(id))
+         if path.exists():
+             DEFAULT_LOGGER.warning(f"delete commit {id}", extra={"label": self})
+             path.rm()
+
+     def truncate(self):
+         """
+         Truncates the job by removing all data associated with it.
+
+         The schema folder and checkpoints are removed and, if the `persist` flag is set to True,
+         the associated table is truncated as well.
+         """
+         DEFAULT_LOGGER.warning("truncate", extra={"label": self})
+         self.rm()
+         if self.persist:
+             self.table.truncate()
+
+     def drop(self):
+         """
+         Drops the current job.
+
+         This method performs the following steps:
+         1. Queries the metastore to check whether any child jobs depend on the current job.
+         2. If child jobs are found, logs a warning listing them.
+         3. Drops the job's change data capture (cdc) objects.
+         4. Removes the schema folder and checkpoints.
+
+         Note: exceptions raised while looking up child jobs are swallowed.
+         """
+         if self.options.job.get("no_drop"):
+             raise ValueError("no_drop is set, cannot drop the job")
+
+         try:
+             row = self.spark.sql(
+                 f"""
+                 select
+                     count(*) as count,
+                     array_join(sort_array(collect_set(j.job)), ', \n') as children
+                 from
+                     fabricks.dependencies d
+                     inner join fabricks.jobs j on d.job_id = j.job_id
+                 where
+                     parent like '{self}'
+                 """
+             ).collect()[0]
+             if cast(int, row.count) > 0:
+                 DEFAULT_LOGGER.warning(f"{row.count} children found", extra={"label": self, "content": row.children})
+
+         except Exception:
+             pass
+
+         self.cdc.drop()
+         self.rm()
+
+     def create(self):
+         """
+         Creates a table or a view based on the job mode.
+
+         If `persist` is True, a table is created via `create_table`.
+         If `virtual` is True, a view is created or replaced via `create_or_replace_view`.
+
+         Raises:
+             ValueError: If neither `persist` nor `virtual` is True.
+         """
+         if self.persist:
+             self.create_table()
+         elif self.virtual:
+             self.create_or_replace_view()
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def register(self):
+         """
+         Registers the job in the metastore.
+
+         If `persist` is True, the job's table is registered.
+         If `virtual` is True, a view is created or replaced.
+
+         Raises:
+             ValueError: If `persist` and `virtual` are both False.
+         """
+         if self.persist:
+             self.table.register()
+         elif self.virtual:
+             self.create_or_replace_view()
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def create_or_replace_view(self):
+         """
+         Creates or replaces the view backing a virtual job.
+
+         The base implementation is a no-op; subclasses override it to provide the view definition.
+         """
+         ...
+
+     def create_table(self):
+         def _create_table(df: DataFrame, batch: Optional[int] = 0):
+             df = self.base_transform(df)
+             cdc_options = self.get_cdc_context(df)
+
+             cluster_by = []
+             partition_by = []
+
+             powerbi = False
+             liquid_clustering = False
+             partitioning = False
+             identity = False
+
+             # first take from job options, then from step options
+             job_powerbi = self.options.table.get_boolean("powerbi", None)
+             step_powerbi = self.step_conf.get("table_options", {}).get("powerbi", None)
+             if job_powerbi is not None:
+                 powerbi = job_powerbi
+             elif step_powerbi is not None:
+                 powerbi = step_powerbi
+
+             # first take from job options, then from step options
+             job_masks = self.options.table.get("masks", None)
+             step_masks = self.step_conf.get("table_options", {}).get("masks", None)
+             if job_masks is not None:
+                 masks = job_masks
+             elif step_masks is not None:
+                 masks = step_masks
+             else:
+                 masks = None
+
+             maximum_compatibility = self.options.table.get_boolean("maximum_compatibility", False)
+
+             if maximum_compatibility:
+                 default_properties = {
+                     "delta.minReaderVersion": "1",
+                     "delta.minWriterVersion": "7",
+                     "delta.columnMapping.mode": "none",
+                 }
+             elif powerbi:
+                 default_properties = {
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                 }
+             else:
+                 default_properties = {
+                     "delta.enableTypeWidening": "true",
+                     "delta.enableDeletionVectors": "true",
+                     "delta.columnMapping.mode": "name",
+                     "delta.minReaderVersion": "2",
+                     "delta.minWriterVersion": "5",
+                     "delta.feature.timestampNtz": "supported",
+                 }
+
+             default_properties["fabricks.last_version"] = "0"
+
+             if "__identity" in df.columns:
+                 identity = False
+             else:
+                 identity = self.options.table.get_boolean("identity", False)
+
+             # first take from job options, then from step options
+             liquid_clustering_job = self.options.table.get("liquid_clustering", None)
+             liquid_clustering_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
+             if liquid_clustering_job is not None:
+                 liquid_clustering = liquid_clustering_job
+             elif liquid_clustering_step:
+                 liquid_clustering = liquid_clustering_step
+
+             if liquid_clustering is not None:
+                 if liquid_clustering == "auto":
+                     liquid_clustering = True
+                     cluster_by = []
+
+                 else:
+                     cluster_by = self.options.table.get_list("cluster_by") or []
+                     if not cluster_by:
+                         if "__source" in df.columns:
+                             cluster_by.append("__source")
+                         if "__is_current" in df.columns:
+                             cluster_by.append("__is_current")
+                         if "__key" in df.columns:
+                             cluster_by.append("__key")
+                         elif "__hash" in df.columns:
+                             cluster_by.append("__hash")
+
+                     if not cluster_by:
+                         DEFAULT_LOGGER.debug("could not determine clustering column", extra={"label": self})
+                         liquid_clustering = False
+                         cluster_by = None
+
+             if liquid_clustering is None:
+                 cluster_by = None
+                 partition_by = self.options.table.get_list("partition_by")
+                 if partition_by:
+                     partitioning = True
+
+             properties = None
+             if not powerbi:
+                 # first take from job options, then from step options
+                 if self.options.table.get_dict("properties"):
+                     properties = self.options.table.get_dict("properties")
+                 elif self.step_conf.get("table_options", {}).get("properties", {}):
+                     properties = self.step_conf.get("table_options", {}).get("properties", {})
+
+             if properties is None:
+                 properties = default_properties
+
+             primary_key = self.options.table.get_dict("primary_key")
+             foreign_keys = self.options.table.get_dict("foreign_keys")
+             comments = self.options.table.get_dict("comments")
+
+             # if dataframe, reference is passed (BUG)
+             name = f"{self.step}_{self.topic}_{self.item}__init"
+             global_temp_view = create_or_replace_global_temp_view(name=name, df=df.where("1 == 2"), job=self)
+             sql = f"select * from {global_temp_view}"
+
+             self.cdc.create_table(
+                 sql,
+                 identity=identity,
+                 liquid_clustering=liquid_clustering,
+                 cluster_by=cluster_by,
+                 partitioning=partitioning,
+                 partition_by=partition_by,
+                 properties=properties,
+                 masks=masks,
+                 primary_key=primary_key,
+                 foreign_keys=foreign_keys,
+                 comments=comments,
+                 **cdc_options,
+             )
+
+         if not self.table.exists():
+             DEFAULT_LOGGER.debug("create table", extra={"label": self})
+
+             df = self.get_data(stream=self.stream, schema_only=True)
+             if df:
+                 if self.stream:
+                     # add dummy stream to be sure that the writeStream will start
+                     spark = df.sparkSession
+
+                     dummy_df = spark.readStream.table("fabricks.dummy")
+                     # __metadata is always present
+                     dummy_df = dummy_df.withColumn("__metadata", lit(None))
+                     dummy_df = dummy_df.select("__metadata")
+
+                     df = df.unionByName(dummy_df, allowMissingColumns=True)
+                     path = self.paths.checkpoints.append("__init")
+                     if path.exists():
+                         path.rm()
+
+                     query = (
+                         df.writeStream.foreachBatch(_create_table)
+                         .option("checkpointLocation", path.string)
+                         .trigger(once=True)
+                         .start()
+                     )
+                     query.awaitTermination()
+                     path.rm()
+                 else:
+                     _create_table(df)
+
+                 constraints = self.options.table.get_dict("constraints")
+                 if constraints:
+                     for key, value in constraints.items():
+                         self.table.add_constraint(name=key, expr=value)
+
+                 comment = self.options.table.get("comment")
+                 if comment:
+                     self.table.add_comment(comment=comment)
+
+         else:
+             DEFAULT_LOGGER.debug("table exists, skip creation", extra={"label": self})
+
+     def _update_schema(
+         self,
+         df: Optional[DataFrame] = None,
+         overwrite: Optional[bool] = False,
+         widen_types: Optional[bool] = False,
+     ):
+         def _update_schema(df: DataFrame, batch: Optional[int] = None):
+             context = self.get_cdc_context(df, reload=True)
+             if overwrite:
+                 self.cdc.overwrite_schema(df, **context)
+             else:
+                 self.cdc.update_schema(df, widen_types=widen_types, **context)
+
+         if self.persist:
+             if df is not None:
+                 _update_schema(df)
+
+             else:
+                 df = self.get_data(stream=self.stream, schema_only=True)
+                 assert df is not None
+                 df = self.base_transform(df)
+
+                 if self.stream:
+                     path = self.paths.checkpoints.append("__schema")
+                     query = (
+                         df.writeStream.foreachBatch(_update_schema)
+                         .option("checkpointLocation", path.string)
+                         .trigger(once=True)
+                         .start()
+                     )
+                     query.awaitTermination()
+                     path.rm()
+
+                 else:
+                     _update_schema(df)
+
+         elif self.virtual:
+             self.create_or_replace_view()
+
+         else:
+             raise ValueError(f"{self.mode} not allowed")
+
+     def update_schema(self, df: Optional[DataFrame] = None, widen_types: Optional[bool] = False):
+         self._update_schema(df=df, overwrite=False, widen_types=widen_types)
+
+     def overwrite_schema(self, df: Optional[DataFrame] = None):
+         self._update_schema(df=df, overwrite=True)
+
+     def get_differences_with_deltatable(self, df: Optional[DataFrame] = None):
+         if df is None:
+             df = self.get_data(stream=self.stream)
+             assert df is not None
+             df = self.base_transform(df)
+
+         context = self.get_cdc_context(df, reload=True)
+
+         return self.cdc.get_differences_with_deltatable(df, **context)
+
+     def get_schema_differences(self, df: Optional[DataFrame] = None) -> Optional[Sequence[SchemaDiff]]:
+         if df is None:
+             df = self.get_data(stream=self.stream)
+             assert df is not None
+             df = self.base_transform(df)
+
+         context = self.get_cdc_context(df, reload=True)
+
+         return self.cdc.get_schema_differences(df, **context)
+
+     def schema_drifted(self, df: Optional[DataFrame] = None) -> Optional[bool]:
+         d = self.get_schema_differences(df)
+         if d is None:
+             return None
+         return len(d) > 0
+
+     def enable_liquid_clustering(self):
+         df = self.table.dataframe
+         enable = False
+
+         # first take from job options, then from step options
+         enable_job = self.options.table.get_boolean("liquid_clustering", None)
+         enable_step = self.step_conf.get("table_options", {}).get("liquid_clustering", None)
+         if enable_job is not None:
+             enable = enable_job
+         elif enable_step:
+             enable = enable_step
+
+         if enable:
+             cluster_by = self.options.table.get_list("cluster_by") or []
+             if not cluster_by:
+                 if "__source" in df.columns:
+                     cluster_by.append("__source")
+                 if "__is_current" in df.columns:
+                     cluster_by.append("__is_current")
+                 if "__key" in df.columns:
+                     cluster_by.append("__key")
+                 elif "__hash" in df.columns:
+                     cluster_by.append("__hash")
+
+             if len(cluster_by) > 0:
+                 self.table.enable_liquid_clustering(cluster_by, auto=False)
+             else:
+                 self.table.enable_liquid_clustering(auto=True)
+         else:
+             DEFAULT_LOGGER.debug("could not enable liquid clustering", extra={"label": self})
fabricks/core/jobs/base/invoker.py
@@ -0,0 +1,206 @@
+ import json
+ from typing import Optional
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.context import PATH_RUNTIME
+ from fabricks.context.log import DEFAULT_LOGGER
+ from fabricks.core.jobs.base.checker import Checker
+ from fabricks.core.jobs.base.exception import PostRunInvokeException, PreRunInvokeException
+ from fabricks.core.jobs.get_schedule import get_schedule
+ from fabricks.utils.path import Path
+
+
+ class Invoker(Checker):
+     def invoke(self, schedule: Optional[str] = None, **kwargs):
+         return self._invoke_job(
+             position="run",
+             schedule=schedule,
+             **kwargs,
+         )  # kwargs and return value needed for get_data in gold
+
+     def invoke_pre_run(self, schedule: Optional[str] = None):
+         self._invoke_job(position="pre_run", schedule=schedule)
+         self._invoke_step(position="pre_run", schedule=schedule)
+
+     def invoke_post_run(self, schedule: Optional[str] = None):
+         self._invoke_job(position="post_run", schedule=schedule)
+         self._invoke_step(position="post_run", schedule=schedule)
+
+     def _invoke_job(self, position: str, schedule: Optional[str] = None, **kwargs):
+         invokers = self.options.invokers.get_list(position)
+         if position == "run":
+             invokers = invokers if len(invokers) > 0 else [{}]  # run must work even without run invoker options
+
+         errors = []
+
+         if invokers:
+             for i, invoker in enumerate(invokers):
+                 DEFAULT_LOGGER.debug(f"invoke ({i}, {position})", extra={"label": self})
+                 try:
+                     path = kwargs.get("path")
+                     if path is None:
+                         notebook = invoker.get("notebook")
+                         assert notebook, "notebook mandatory"
+                         path = PATH_RUNTIME.joinpath(notebook)
+
+                     assert path is not None, "path mandatory"
+
+                     arguments = invoker.get("arguments") or {}
+                     timeout = invoker.get("timeout")
+
+                     schema_only = kwargs.get("schema_only")
+                     if schema_only is not None:
+                         arguments["schema_only"] = schema_only
+
+                     if len(invokers) == 1 and position == "run":
+                         return self._run_notebook(
+                             path=path,
+                             arguments=arguments,
+                             timeout=timeout,
+                             schedule=schedule,
+                         )
+                     else:
+                         self._run_notebook(
+                             path=path,
+                             arguments=arguments,
+                             timeout=timeout,
+                             schedule=schedule,
+                         )
+
+                 except Exception as e:
+                     DEFAULT_LOGGER.warning(f"fail to run invoker ({i}, {position})", extra={"label": self})
+
+                     if position == "pre_run":
+                         errors.append(PreRunInvokeException(e))
+                     elif position == "post_run":
+                         errors.append(PostRunInvokeException(e))
+                     else:
+                         errors.append(e)
+
+         if errors:
+             raise Exception(errors)
+
+     def _invoke_step(self, position: str, schedule: Optional[str] = None):
+         invokers = self.step_conf.get("invoker_options", {}).get(position, [])
+
+         errors = []
+
+         if invokers:
+             for i, invoker in enumerate(invokers):
+                 DEFAULT_LOGGER.debug(f"invoke by step ({i}, {position})", extra={"label": self})
+                 try:
+                     notebook = invoker.get("notebook")
+                     assert notebook, "notebook mandatory"
+                     path = PATH_RUNTIME.joinpath(notebook)
+
+                     arguments = invoker.get("arguments", {})
+                     timeout = invoker.get("timeout")
+
+                     self._run_notebook(
+                         path=path,
+                         arguments=arguments,
+                         timeout=timeout,
+                         schedule=schedule,
+                     )
+
+                 except Exception as e:
+                     DEFAULT_LOGGER.warning(f"fail to run invoker by step ({i}, {position})", extra={"label": self})
+
+                     if position == "pre_run":
+                         errors.append(PreRunInvokeException(e))
+                     elif position == "post_run":
+                         errors.append(PostRunInvokeException(e))
+                     else:
+                         errors.append(e)
+
+         if errors:
+             raise Exception(errors)
+
+     def _run_notebook(
+         self,
+         path: Path,
+         arguments: Optional[dict] = None,
+         timeout: Optional[int] = None,
+         schedule: Optional[str] = None,
+     ):
+         """
+         Runs a notebook via dbutils.
+
+         Args:
+             path (Path): The path to the notebook file (resolved with or without a .py/.ipynb extension).
+             arguments (Optional[dict]): Additional arguments to pass to the notebook, on top of step, topic and item.
+             timeout (Optional[int]): Timeout in seconds; defaults to the job timeout if not provided.
+             schedule (Optional[str]): The schedule for the job. If provided, the schedule variables are forwarded to the notebook.
+         """
+         from databricks.sdk.runtime import dbutils
+
+         for file_format in [None, ".py", ".ipynb"]:
+             path_with_file_format = path.append(file_format) if file_format else path
+             if path_with_file_format.exists():
+                 path = path_with_file_format
+                 break
+
+         if timeout is None:
+             timeout = self.timeout
+
+         assert timeout is not None
+
+         variables = None
+         if schedule is not None:
+             variables = get_schedule(name=schedule).get("options", {}).get("variables", {})
+
+         if variables is None:
+             variables = {}
+
+         if arguments is None:
+             arguments = {}
+
+         return dbutils.notebook.run(
+             path=path.get_notebook_path(),  # type: ignore
+             timeout_seconds=timeout,  # type: ignore
+             arguments={  # type: ignore
+                 "step": self.step,
+                 "topic": self.topic,
+                 "item": self.item,
+                 **arguments,
+                 "job_options": json.dumps(self.options.job.options),
+                 "schedule_variables": json.dumps(variables),
+             },
+         )
+
+     def extend_job(self, df: DataFrame) -> DataFrame:
+         from fabricks.core.extenders import get_extender
+
+         extenders = self.options.extenders
+         for e in extenders:
+             name = e.get("extender")
+             DEFAULT_LOGGER.debug(f"extend ({name})", extra={"label": self})
+             arguments = e.get("arguments") or {}
+
+             extender = get_extender(name)
+             df = extender(df, **arguments)
+
+         return df
+
+     def extend_step(self, df: DataFrame) -> DataFrame:
+         from fabricks.core.extenders import get_extender
+
+         extenders = self.step_conf.get("extender_options", {})
+         for e in extenders:
+             name = e.get("extender")
+             DEFAULT_LOGGER.debug(f"extend by step ({name})", extra={"label": self})
+             arguments = e.get("arguments", {})
+
+             extender = get_extender(name)
+             df = extender(df, **arguments)
+
+         return df
+
+     def extend(self, df: DataFrame) -> DataFrame:
+         df = self.extend_job(df)
+         df = self.extend_step(df)
+         return df
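extend_job and extend_step treat an extender as a plain callable resolved by name through get_extender: it receives the job's DataFrame plus the configured arguments and must return a DataFrame. Below is a minimal sketch of such a callable; it is hypothetical and not part of the package, and how the name-to-callable registration works is defined in fabricks/core/extenders.py, which is not shown in this diff.

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def add_ingestion_country(df: DataFrame, country: str = "CH") -> DataFrame:
    # hypothetical extender: tag every row with a constant column before the job writes it
    return df.withColumn("__country", F.lit(country))

In the configuration this maps onto the keys read in the loops above: `extender` names the callable and `arguments` supplies its keyword arguments (here, country).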
fabricks/core/jobs/base/job.py
@@ -0,0 +1,5 @@
+ from fabricks.core.jobs.base.processor import Processor
+
+
+ class BaseJob(Processor):
+     pass