fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/utils/helpers.py ADDED
@@ -0,0 +1,228 @@
+ import logging
+ from functools import reduce
+ from queue import Queue
+ from typing import Any, Callable, Iterable, List, Literal, Optional, Union
+
+ from pyspark.sql import DataFrame
+ from typing_extensions import deprecated
+
+ from fabricks.utils._types import DataFrameLike
+ from fabricks.utils.path import Path
+ from fabricks.utils.spark import spark
+
+
+ def concat_ws(fields: Union[str, List[str]], alias: Optional[str] = None) -> str:
+     if isinstance(fields, str):
+         fields = [fields]
+
+     if alias:
+         coalesce = [f"coalesce(cast({alias}.{f} as string), '-1')" for f in fields]
+     else:
+         coalesce = [f"coalesce(cast({f} as string), '-1')" for f in fields]
+
+     return "concat_ws('*', " + ",".join(coalesce) + ")"
+
+
+ def concat_dfs(dfs: Iterable[DataFrame]) -> Optional[DataFrame]:
+     dfs = [df for df in dfs if df is not None]
+     if len(dfs) == 0:
+         return None
+     return reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), dfs)
+
+
+ @deprecated("use run_in_parallel instead")
+ def run_threads(func: Callable, iter: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
+     return run_in_parallel(func, iter, workers)
+
+
+ def _process_queue_item(func: Callable, task_queue: Queue, result_queue: Queue, stop_signal: Any):
+     """Worker function that processes items from a queue."""
+     while True:
+         try:
+             item = task_queue.get(timeout=1)
+
+             if item is stop_signal:
+                 task_queue.put(stop_signal)  # Put it back for other workers
+                 break
+
+             result = func(item)
+             result_queue.put(result)
+         except Exception:
+             continue
+
+
+ def _run_in_parallel_legacy(
+     func: Callable,
+     iterable: Union[List, DataFrame, range, set],
+     workers: int = 8,
+     progress_bar: Optional[bool] = False,
+     position: Optional[int] = None,
+ ) -> List[Any]:
+     from concurrent.futures import ThreadPoolExecutor
+
+     iterable = iterable.collect() if isinstance(iterable, DataFrameLike) else iterable  # type: ignore
+
+     with ThreadPoolExecutor(max_workers=workers) as executor:
+         if progress_bar:
+             from tqdm import tqdm
+
+             results = list(tqdm(executor.map(func, iterable), total=len(iterable), position=position))
+         else:
+             results = list(executor.map(func, iterable))
+
+     return results
+
+
+ def run_in_parallel(
+     func: Callable,
+     iterable: Union[List, DataFrame, range, set],
+     workers: int = 8,
+     progress_bar: Optional[bool] = False,
+     position: Optional[int] = None,
+     loglevel: int = logging.CRITICAL,
+     logger: Optional[logging.Logger] = None,
+     run_as: Optional[Literal["ThreadPool", "ProcessPool", "Pool", "Queue", "Legacy"]] = "Legacy",
+ ) -> List[Any]:
+     """
+     Runs the given function in parallel on the elements of the iterable using multiple threads or processes.
+
+     Args:
+         func (Callable): The function to be executed in parallel.
+         iterable (Union[List, DataFrame, range, set]): The iterable containing the elements on which the function will be executed.
+         workers (int, optional): The number of worker threads/processes to use. Defaults to 8.
+         progress_bar (Optional[bool], optional): Whether to display a progress bar. Defaults to False.
+         position (Optional[int], optional): Position for the progress bar. Defaults to None.
+         loglevel (int, optional): Log level to set during execution. Defaults to logging.CRITICAL.
+         logger (Optional[logging.Logger], optional): Logger instance to use. Defaults to None.
+         run_as (Optional[Literal["ThreadPool", "ProcessPool", "Pool", "Queue", "Legacy"]], optional): Parallel backend to use. Defaults to "Legacy".
+
+     Returns:
+         List[Any]: A list containing the results of the function calls.
+
+     """
+     if logger is None:
+         logger = logging.getLogger()
+
+     current_loglevel = logger.getEffectiveLevel()
+     logger.setLevel(loglevel)
+
+     if run_as == "Legacy":
+         results = _run_in_parallel_legacy(
+             func=func,
+             iterable=iterable,
+             workers=workers,
+             progress_bar=progress_bar,
+             position=position,
+         )
+
+     else:
+         iterables = iterable.collect() if isinstance(iterable, DataFrameLike) else iterable  # type: ignore
+         results = []
+
+         if run_as == "Queue":
+             import threading
+
+             task_queue = Queue()
+             result_queue = Queue()
+             stop_signal = object()
+
+             for item in iterables:
+                 task_queue.put(item)
+
+             task_queue.put(stop_signal)
+
+             threads = []
+             for _ in range(workers):
+                 t = threading.Thread(target=_process_queue_item, args=(func, task_queue, result_queue, stop_signal))
+                 t.start()
+
+                 threads.append(t)
+
+             if progress_bar:
+                 from tqdm import tqdm
+
+                 with tqdm(total=len(iterables), position=position) as t:
+                     for _ in range(len(iterables)):
+                         result = result_queue.get()
+                         results.append(result)
+
+                         t.update()
+                         t.refresh()
+
+             else:
+                 for _ in range(len(iterables)):
+                     results.append(result_queue.get())
+
+             for t in threads:
+                 t.join()
+
+         elif run_as == "Pool":
+             from multiprocessing import Pool
+
+             with Pool(processes=workers) as p:
+                 if progress_bar:
+                     from tqdm import tqdm
+
+                     with tqdm(total=len(iterables), position=position) as t:
+                         for result in p.map(func, iterables):
+                             results.append(result)
+
+                             t.update()
+                             t.refresh()
+
+                 else:
+                     results = list(p.map(func, iterables))
+
+         else:
+             from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
+
+             Executor = ProcessPoolExecutor if run_as == "ProcessPool" else ThreadPoolExecutor
+             with Executor(max_workers=workers) as exe:
+                 if progress_bar:
+                     from tqdm import tqdm
+
+                     with tqdm(total=len(iterables), position=position) as t:
+                         for result in exe.map(func, iterables):
+                             results.append(result)
+
+                             t.update()
+                             t.refresh()
+
+                 else:
+                     results = list(exe.map(func, iterables))
+
+     logger.setLevel(current_loglevel)
+
+     return results
+
+
+ def run_notebook(path: Path, timeout: Optional[int] = None, **kwargs):
+     """
+     Runs a notebook located at the given path.
+
+     Args:
+         path (Path): The path to the notebook file.
+         timeout (Optional[int]): The maximum execution time for the notebook in seconds. Defaults to None.
+         **kwargs: Additional keyword arguments to be passed to the notebook.
+
+     Returns:
+         None
+     """
+     from databricks.sdk.runtime import dbutils
+
+     if timeout is None:
+         timeout = 3600
+
+     dbutils.notebook.run(path.get_notebook_path(), timeout, {**kwargs})  # type: ignore
+
+
+ def xxhash64(s: Any):
+     df = spark.sql(f"select xxhash64(cast('{s}' as string)) as xxhash64")
+     return df.collect()[0][0]
+
+
+ def md5(s: Any):
+     from hashlib import md5
+
+     md5 = md5(str(s).encode())
+     return md5.hexdigest()
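
Note (illustrative only, not part of the package contents): a minimal usage sketch of the helpers above, assuming the module path fabricks.utils.helpers from the file listing and made-up column names; importing the module requires a PySpark environment, since it pulls in a Spark session at import time.

    from fabricks.utils.helpers import concat_ws, run_in_parallel

    def square(x: int) -> int:
        return x * x

    # run a plain callable over an iterable with 4 worker threads (default "Legacy" backend)
    results = run_in_parallel(square, range(10), workers=4)

    # build the coalesced concat expression used for key columns (order_id and line_no are hypothetical)
    expr = concat_ws(["order_id", "line_no"], alias="src")
    # -> concat_ws('*', coalesce(cast(src.order_id as string), '-1'),coalesce(cast(src.line_no as string), '-1'))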
fabricks/utils/log.py ADDED
@@ -0,0 +1,236 @@
+ import hashlib
+ import json
+ import logging
+ import sys
+ from datetime import datetime
+ from datetime import timezone as tz
+ from typing import Optional, Tuple
+ from zoneinfo import ZoneInfo
+
+ from pyspark.sql import DataFrame
+
+ from fabricks.utils.azure_table import AzureTable
+
+
+ class LogFormatter(logging.Formatter):
+     def __init__(self, debugmode: Optional[bool] = False, timezone: Optional[str] = None):
+         super().__init__(fmt="%(levelname)s%(prefix)s%(message)s [%(timestamp)s]%(extra)s")
+
+         self.debugmode = False if debugmode is None else debugmode
+         self.timezone = ZoneInfo(timezone) if timezone else tz.utc
+
+     COLORS = {
+         logging.DEBUG: "\033[36m",
+         logging.INFO: "\033[32m",
+         logging.WARNING: "\033[33m",
+         logging.ERROR: "\033[31m",
+         logging.CRITICAL: "\033[41;31m",
+     }
+
+     RESET = "\033[0m"
+     BRIGHT = "\033[1m"
+
+     PADDINGS = {
+         "DEBUG": " ",
+         "INFO": " ",
+         "WARNING": " ",
+         "ERROR": " ",
+         "CRITICAL": "",
+     }
+
+     def formatTime(self, record) -> str:
+         ct = datetime.fromtimestamp(record.created, tz=tz.utc).astimezone(self.timezone)
+         s = ct.strftime("%d/%m/%y %H:%M:%S")
+         return f"{self.COLORS[logging.DEBUG]}{s}{self.RESET}"
+
+     def format(self, record):
+         levelname = record.levelname
+         padding = self.PADDINGS[levelname]
+         levelname_formatted = f"{self.COLORS[record.levelno]}{levelname}:{padding}{self.RESET}"
+
+         prefix = ""
+
+         if hasattr(record, "label"):
+             prefix = f"{record.__dict__.get('label')} - "
+         elif hasattr(record, "job"):
+             prefix = f"{record.__dict__.get('job')} - "  # keep for backward compatibility
+         elif hasattr(record, "step"):
+             prefix = f"{self.BRIGHT}{record.__dict__.get('step')}{self.RESET} - "
+
+         extra = ""
+         if hasattr(record, "exc_info") and record.exc_info:
+             exc_info = record.__dict__.get("exc_info", None)
+             extra += f" [{self.COLORS[logging.ERROR]}{exc_info[0].__name__}{self.RESET}]"
+
+         if self.debugmode:
+             if hasattr(record, "sql"):
+                 extra += f"\n---\n%sql\n{record.__dict__.get('sql')}\n---"
+
+             if hasattr(record, "content"):
+                 extra += f"\n---\n{record.__dict__.get('content')}\n---"
+
+             if hasattr(record, "context"):
+                 extra += f"\n---\n{json.dumps(record.__dict__.get('context'), indent=2, default=str)}\n---"
+
+             if hasattr(record, "df"):
+                 df = record.__dict__.get("df")
+                 if isinstance(df, DataFrame):
+                     extra += f"\n---\n%df\n{df.toPandas().to_string(index=True)}\n---"
+
+         record.levelname = levelname_formatted
+         record.prefix = prefix
+         record.timestamp = self.formatTime(record)
+         record.extra = extra
+
+         return super().format(record)
+
+
+ class AzureTableLogHandler(logging.Handler):
+     def __init__(self, table: AzureTable, debugmode: Optional[bool] = False, timezone: Optional[str] = None):
+         super().__init__()
+
+         self.buffer = []
+         self.table = table
+
+         self.debugmode = False if debugmode is None else debugmode
+         self.timezone = ZoneInfo(timezone) if timezone else tz.utc
+
+     def formatTime(self, record) -> str:
+         ct = datetime.fromtimestamp(record.created, tz=tz.utc).astimezone(self.timezone)
+         s = ct.strftime("%d/%m/%y %H:%M:%S")
+         return s
+
+     def emit(self, record):
+         if hasattr(record, "target"):
+             target = record.__dict__.get("target")
+
+             level = record.levelname
+             if "debug" in level.lower():
+                 level = "DEBUG"
+             elif "info" in level.lower():
+                 level = "INFO"
+             elif "warning" in level.lower():
+                 level = "WARNING"
+             elif "error" in level.lower():
+                 level = "ERROR"
+             elif "critical" in level.lower():
+                 level = "CRITICAL"
+             else:
+                 level = "INFO"
+
+             r = {
+                 "Created": self.formatTime(record),  # timestamp not present when querying Azure Table
+                 "Level": level,
+                 "Message": record.message,
+             }
+
+             if hasattr(record, "job"):
+                 j = str(record.__dict__.get("job", ""))
+                 r["Job"] = j
+                 r["JobId"] = hashlib.md5(j.encode()).hexdigest()
+
+             if hasattr(record, "table"):
+                 t = str(record.__dict__.get("table", ""))
+                 r["Job"] = t
+                 r["JobId"] = hashlib.md5(t.encode()).hexdigest()
+
+             if hasattr(record, "step"):
+                 r["Step"] = record.__dict__.get("step", "")
+
+             if hasattr(record, "schedule_id"):
+                 r["ScheduleId"] = record.__dict__.get("schedule_id", "")
+
+             if hasattr(record, "schedule"):
+                 r["Schedule"] = record.__dict__.get("schedule", "")
+
+             if hasattr(record, "notebook_id"):
+                 r["NotebookId"] = record.__dict__.get("notebook_id", "")
+
+             if hasattr(record, "exc_info"):
+                 e = record.__dict__.get("exc_info", None)
+                 if e is not None:
+                     d = {
+                         "type": str(e[0].__name__)[:1000],
+                         "message": str(e[1])[:1000],
+                         "traceback": str(logging.Formatter.formatException(self, e))[:1000],  # type: ignore
+                     }
+                     r["Exception"] = json.dumps(d)
+
+             if self.debugmode:
+                 if hasattr(record, "content"):
+                     r["Content"] = json.dumps(record.__dict__.get("content", ""))[:1000]
+                 if hasattr(record, "sql"):
+                     r["Sql"] = record.__dict__.get("sql", "")[:1000]
+
+             r["PartitionKey"] = record.__dict__.get("partition_key", "default")
+             if hasattr(record, "row_key"):
+                 r["RowKey"] = record.__dict__.get("row_key", "")
+             else:
+                 r["RowKey"] = hashlib.md5(json.dumps(r, sort_keys=True).encode()).hexdigest()
+
+             if target == "table":
+                 self.table.upsert(r)
+             else:
+                 self.buffer.append(r)
+
+         else:
+             pass
+
+     def flush(self):
+         self.table.upsert(self.buffer)
+         self.buffer.clear()
+
+     def clear_buffer(self):
+         self.buffer = []
+
+
+ class CustomConsoleHandler(logging.StreamHandler):
+     def __init__(self, stream=None, debugmode: Optional[bool] = False):
+         super().__init__(stream or sys.stderr)
+
+         self.debugmode = False if debugmode is None else debugmode
+
+     def emit(self, record):
+         if hasattr(record, "sql"):
+             if self.debugmode:
+                 super().emit(record)
+         else:
+             super().emit(record)
+
+
+ def get_logger(
+     name: str,
+     level: int,
+     table: Optional[AzureTable] = None,
+     debugmode: Optional[bool] = False,
+     timezone: Optional[str] = None,
+ ) -> Tuple[logging.Logger, Optional[AzureTableLogHandler]]:
+     logger = logging.getLogger(name)
+     if logger.hasHandlers():
+         logger.handlers.clear()
+
+     root = logging.getLogger()
+     if root.hasHandlers():
+         root.handlers.clear()
+
+     logger.setLevel(level)
+     logger.propagate = False
+
+     # Console handler
+     console_handler = CustomConsoleHandler(debugmode=debugmode)
+     console_handler.setLevel(level)
+     console_format = LogFormatter(debugmode=debugmode, timezone=timezone)
+     console_handler.setFormatter(console_format)
+
+     if table is not None:
+         # Azure Table handler
+         azure_table_handler = AzureTableLogHandler(table=table, debugmode=debugmode, timezone=timezone)
+         azure_table_handler.setLevel(level)
+     else:
+         azure_table_handler = None
+
+     logger.addHandler(console_handler)
+     if azure_table_handler is not None:
+         logger.addHandler(azure_table_handler)
+
+     return logger, azure_table_handler
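
Note (illustrative only, not part of the package contents): a minimal sketch of get_logger, assuming the module path fabricks.utils.log and that the package's PySpark and Azure dependencies are importable. Passing table=None keeps logging on the console only; the returned handler is then None.

    import logging

    from fabricks.utils.log import get_logger

    logger, handler = get_logger("fabricks", logging.INFO, table=None, debugmode=True)

    # "step" drives the console prefix; "sql" is only rendered (and only emitted by the console handler) in debug mode
    logger.info("table updated", extra={"step": "silver", "sql": "select 1"})

    # with an AzureTable instance, records carrying target="table" are upserted immediately,
    # other targeted records are buffered on the handler until handler.flush()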
fabricks/utils/mermaid.py ADDED
@@ -0,0 +1,32 @@
+ from pyspark.sql import DataFrame
+
+
+ def get_mermaid_diagram(df: DataFrame) -> str:
+     dependencies = df.select("parent_id", "parent", "job_id", "job").collect()
+
+     out = "flowchart TD\n"
+
+     unique_nodes = set()
+
+     for row in dependencies:
+         parent_id = str(row["parent_id"])
+         parent_name = str(row["parent"])
+         child_id = str(row["job_id"])
+         child_name = str(row["job"])
+
+         if parent_id != "0" and parent_id is not None:
+             if parent_id not in unique_nodes:
+                 out += f" {parent_id}[{parent_name}]\n"
+                 unique_nodes.add(parent_id)
+
+             if child_id not in unique_nodes:
+                 out += f" {child_id}[{child_name}]\n"
+                 unique_nodes.add(child_id)
+
+             out += f" {parent_id} --> {child_id}\n"
+         else:
+             if child_id not in unique_nodes:
+                 out += f" {child_id}[{child_name}]\n"
+                 unique_nodes.add(child_id)
+
+     return out
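
Note (illustrative only, not part of the package contents): a minimal sketch of get_mermaid_diagram, assuming an active Spark session and made-up job names; the input DataFrame only needs the four columns selected inside the function.

    from fabricks.utils.mermaid import get_mermaid_diagram
    from fabricks.utils.spark import spark

    df = spark.createDataFrame(
        [("1", "bronze.orders", "2", "silver.orders")],
        ["parent_id", "parent", "job_id", "job"],
    )

    # prints a "flowchart TD" graph with nodes 1[bronze.orders] and 2[silver.orders] and an edge 1 --> 2
    print(get_mermaid_diagram(df))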