fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/utils/helpers.py
ADDED

@@ -0,0 +1,228 @@

import logging
from functools import reduce
from queue import Queue
from typing import Any, Callable, Iterable, List, Literal, Optional, Union

from pyspark.sql import DataFrame
from typing_extensions import deprecated

from fabricks.utils._types import DataFrameLike
from fabricks.utils.path import Path
from fabricks.utils.spark import spark


def concat_ws(fields: Union[str, List[str]], alias: Optional[str] = None) -> str:
    if isinstance(fields, str):
        fields = [fields]

    if alias:
        coalesce = [f"coalesce(cast({alias}.{f} as string), '-1')" for f in fields]
    else:
        coalesce = [f"coalesce(cast({f} as string), '-1')" for f in fields]

    return "concat_ws('*', " + ",".join(coalesce) + ")"


def concat_dfs(dfs: Iterable[DataFrame]) -> Optional[DataFrame]:
    dfs = [df for df in dfs if df is not None]
    if len(dfs) == 0:
        return None
    return reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), dfs)


@deprecated("use run_in_parallel instead")
def run_threads(func: Callable, iter: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
    return run_in_parallel(func, iter, workers)


def _process_queue_item(func: Callable, task_queue: Queue, result_queue: Queue, stop_signal: Any):
    """Worker function that processes items from a queue."""
    while True:
        try:
            item = task_queue.get(timeout=1)

            if item is stop_signal:
                task_queue.put(stop_signal)  # Put it back for other workers
                break

            result = func(item)
            result_queue.put(result)
        except Exception:
            # queue.Empty on timeout; errors raised by func are also swallowed and the item is dropped
            continue


def _run_in_parallel_legacy(
    func: Callable,
    iterable: Union[List, DataFrame, range, set],
    workers: int = 8,
    progress_bar: Optional[bool] = False,
    position: Optional[int] = None,
) -> List[Any]:
    from concurrent.futures import ThreadPoolExecutor

    iterable = iterable.collect() if isinstance(iterable, DataFrameLike) else iterable  # type: ignore

    with ThreadPoolExecutor(max_workers=workers) as executor:
        if progress_bar:
            from tqdm import tqdm

            results = list(tqdm(executor.map(func, iterable), total=len(iterable), position=position))
        else:
            results = list(executor.map(func, iterable))

    return results


def run_in_parallel(
    func: Callable,
    iterable: Union[List, DataFrame, range, set],
    workers: int = 8,
    progress_bar: Optional[bool] = False,
    position: Optional[int] = None,
    loglevel: int = logging.CRITICAL,
    logger: Optional[logging.Logger] = None,
    run_as: Optional[Literal["ThreadPool", "ProcessPool", "Pool", "Queue", "Legacy"]] = "Legacy",
) -> List[Any]:
    """
    Runs the given function in parallel on the elements of the iterable using multiple threads or processes.

    Args:
        func (Callable): The function to be executed in parallel.
        iterable (Union[List, DataFrame, range, set]): The iterable containing the elements on which the function will be executed.
        workers (int, optional): The number of worker threads/processes to use. Defaults to 8.
        progress_bar (Optional[bool], optional): Whether to display a progress bar. Defaults to False.
        position (Optional[int], optional): Position for the progress bar. Defaults to None.
        loglevel (int, optional): Log level to set during execution. Defaults to logging.CRITICAL.
        logger (Optional[logging.Logger], optional): Logger instance to use. Defaults to None.
        run_as (Optional[Literal["ThreadPool", "ProcessPool", "Pool", "Queue", "Legacy"]], optional): Execution backend to use. Defaults to "Legacy".

    Returns:
        List[Any]: A list containing the results of the function calls.
    """
    if logger is None:
        logger = logging.getLogger()

    current_loglevel = logger.getEffectiveLevel()
    logger.setLevel(loglevel)

    if run_as == "Legacy":
        results = _run_in_parallel_legacy(
            func=func,
            iterable=iterable,
            workers=workers,
            progress_bar=progress_bar,
            position=position,
        )

    else:
        iterables = iterable.collect() if isinstance(iterable, DataFrameLike) else iterable  # type: ignore
        results = []

        if run_as == "Queue":
            import threading

            task_queue = Queue()
            result_queue = Queue()
            stop_signal = object()

            for item in iterables:
                task_queue.put(item)

            task_queue.put(stop_signal)

            threads = []
            for _ in range(workers):
                t = threading.Thread(target=_process_queue_item, args=(func, task_queue, result_queue, stop_signal))
                t.start()

                threads.append(t)

            if progress_bar:
                from tqdm import tqdm

                with tqdm(total=len(iterables), position=position) as t:
                    for _ in range(len(iterables)):
                        result = result_queue.get()
                        results.append(result)

                        t.update()
                        t.refresh()

            else:
                for _ in range(len(iterables)):
                    results.append(result_queue.get())

            for t in threads:
                t.join()

        elif run_as == "Pool":
            from multiprocessing import Pool

            with Pool(processes=workers) as p:
                if progress_bar:
                    from tqdm import tqdm

                    with tqdm(total=len(iterables), position=position) as t:
                        for result in p.map(func, iterables):
                            results.append(result)

                            t.update()
                            t.refresh()

                else:
                    results = list(p.map(func, iterables))

        else:
            from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

            Executor = ProcessPoolExecutor if run_as == "ProcessPool" else ThreadPoolExecutor
            with Executor(max_workers=workers) as exe:
                if progress_bar:
                    from tqdm import tqdm

                    with tqdm(total=len(iterables), position=position) as t:
                        for result in exe.map(func, iterables):
                            results.append(result)

                            t.update()
                            t.refresh()

                else:
                    results = list(exe.map(func, iterables))

    logger.setLevel(current_loglevel)

    return results


def run_notebook(path: Path, timeout: Optional[int] = None, **kwargs):
    """
    Runs a notebook located at the given path.

    Args:
        path (Path): The path to the notebook file.
        timeout (Optional[int]): The maximum execution time for the notebook in seconds. Defaults to None.
        **kwargs: Additional keyword arguments to be passed to the notebook.

    Returns:
        None
    """
    from databricks.sdk.runtime import dbutils

    if timeout is None:
        timeout = 3600

    dbutils.notebook.run(path.get_notebook_path(), timeout, {**kwargs})  # type: ignore


def xxhash64(s: Any):
    df = spark.sql(f"select xxhash64(cast('{s}' as string)) as xxhash64")
    return df.collect()[0][0]


def md5(s: Any):
    from hashlib import md5

    md5 = md5(str(s).encode())
    return md5.hexdigest()
fabricks/utils/log.py
ADDED

@@ -0,0 +1,236 @@

import hashlib
import json
import logging
import sys
from datetime import datetime
from datetime import timezone as tz
from typing import Optional, Tuple
from zoneinfo import ZoneInfo

from pyspark.sql import DataFrame

from fabricks.utils.azure_table import AzureTable


class LogFormatter(logging.Formatter):
    def __init__(self, debugmode: Optional[bool] = False, timezone: Optional[str] = None):
        super().__init__(fmt="%(levelname)s%(prefix)s%(message)s [%(timestamp)s]%(extra)s")

        self.debugmode = False if debugmode is None else debugmode
        self.timezone = ZoneInfo(timezone) if timezone else tz.utc

    COLORS = {
        logging.DEBUG: "\033[36m",
        logging.INFO: "\033[32m",
        logging.WARNING: "\033[33m",
        logging.ERROR: "\033[31m",
        logging.CRITICAL: "\033[41;31m",
    }

    RESET = "\033[0m"
    BRIGHT = "\033[1m"

    PADDINGS = {
        "DEBUG": " ",
        "INFO": " ",
        "WARNING": " ",
        "ERROR": " ",
        "CRITICAL": "",
    }

    def formatTime(self, record) -> str:
        ct = datetime.fromtimestamp(record.created, tz=tz.utc).astimezone(self.timezone)
        s = ct.strftime("%d/%m/%y %H:%M:%S")
        return f"{self.COLORS[logging.DEBUG]}{s}{self.RESET}"

    def format(self, record):
        levelname = record.levelname
        padding = self.PADDINGS[levelname]
        levelname_formatted = f"{self.COLORS[record.levelno]}{levelname}:{padding}{self.RESET}"

        prefix = ""

        if hasattr(record, "label"):
            prefix = f"{record.__dict__.get('label')} - "
        elif hasattr(record, "job"):
            prefix = f"{record.__dict__.get('job')} - "  # keep for backward compatibility
        elif hasattr(record, "step"):
            prefix = f"{self.BRIGHT}{record.__dict__.get('step')}{self.RESET} - "

        extra = ""
        if hasattr(record, "exc_info") and record.exc_info:
            exc_info = record.__dict__.get("exc_info", None)
            extra += f" [{self.COLORS[logging.ERROR]}{exc_info[0].__name__}{self.RESET}]"

        if self.debugmode:
            if hasattr(record, "sql"):
                extra += f"\n---\n%sql\n{record.__dict__.get('sql')}\n---"

            if hasattr(record, "content"):
                extra += f"\n---\n{record.__dict__.get('content')}\n---"

            if hasattr(record, "context"):
                extra += f"\n---\n{json.dumps(record.__dict__.get('context'), indent=2, default=str)}\n---"

            if hasattr(record, "df"):
                df = record.__dict__.get("df")
                if isinstance(df, DataFrame):
                    extra += f"\n---\n%df\n{df.toPandas().to_string(index=True)}\n---"

        record.levelname = levelname_formatted
        record.prefix = prefix
        record.timestamp = self.formatTime(record)
        record.extra = extra

        return super().format(record)


class AzureTableLogHandler(logging.Handler):
    def __init__(self, table: AzureTable, debugmode: Optional[bool] = False, timezone: Optional[str] = None):
        super().__init__()

        self.buffer = []
        self.table = table

        self.debugmode = False if debugmode is None else debugmode
        self.timezone = ZoneInfo(timezone) if timezone else tz.utc

    def formatTime(self, record) -> str:
        ct = datetime.fromtimestamp(record.created, tz=tz.utc).astimezone(self.timezone)
        s = ct.strftime("%d/%m/%y %H:%M:%S")
        return s

    def emit(self, record):
        if hasattr(record, "target"):
            target = record.__dict__.get("target")

            level = record.levelname
            if "debug" in level.lower():
                level = "DEBUG"
            elif "info" in level.lower():
                level = "INFO"
            elif "warning" in level.lower():
                level = "WARNING"
            elif "error" in level.lower():
                level = "ERROR"
            elif "critical" in level.lower():
                level = "CRITICAL"
            else:
                level = "INFO"

            r = {
                "Created": self.formatTime(record),  # timestamp not present when querying Azure Table
                "Level": level,
                "Message": record.message,
            }

            if hasattr(record, "job"):
                j = str(record.__dict__.get("job", ""))
                r["Job"] = j
                r["JobId"] = hashlib.md5(j.encode()).hexdigest()

            if hasattr(record, "table"):
                t = str(record.__dict__.get("table", ""))
                r["Job"] = t
                r["JobId"] = hashlib.md5(t.encode()).hexdigest()

            if hasattr(record, "step"):
                r["Step"] = record.__dict__.get("step", "")

            if hasattr(record, "schedule_id"):
                r["ScheduleId"] = record.__dict__.get("schedule_id", "")

            if hasattr(record, "schedule"):
                r["Schedule"] = record.__dict__.get("schedule", "")

            if hasattr(record, "notebook_id"):
                r["NotebookId"] = record.__dict__.get("notebook_id", "")

            if hasattr(record, "exc_info"):
                e = record.__dict__.get("exc_info", None)
                if e is not None:
                    d = {
                        "type": str(e[0].__name__)[:1000],
                        "message": str(e[1])[:1000],
                        "traceback": str(logging.Formatter.formatException(self, e))[:1000],  # type: ignore
                    }
                    r["Exception"] = json.dumps(d)

            if self.debugmode:
                if hasattr(record, "content"):
                    r["Content"] = json.dumps(record.__dict__.get("content", ""))[:1000]
                if hasattr(record, "sql"):
                    r["Sql"] = record.__dict__.get("sql", "")[:1000]

            r["PartitionKey"] = record.__dict__.get("partition_key", "default")
            if hasattr(record, "row_key"):
                r["RowKey"] = record.__dict__.get("row_key", "")
            else:
                r["RowKey"] = hashlib.md5(json.dumps(r, sort_keys=True).encode()).hexdigest()

            if target == "table":
                self.table.upsert(r)
            else:
                self.buffer.append(r)

        else:
            pass

    def flush(self):
        self.table.upsert(self.buffer)
        self.buffer.clear()

    def clear_buffer(self):
        self.buffer = []


class CustomConsoleHandler(logging.StreamHandler):
    def __init__(self, stream=None, debugmode: Optional[bool] = False):
        super().__init__(stream or sys.stderr)

        self.debugmode = False if debugmode is None else debugmode

    def emit(self, record):
        if hasattr(record, "sql"):
            if self.debugmode:
                super().emit(record)
        else:
            super().emit(record)


def get_logger(
    name: str,
    level: int,
    table: Optional[AzureTable] = None,
    debugmode: Optional[bool] = False,
    timezone: Optional[str] = None,
) -> Tuple[logging.Logger, Optional[AzureTableLogHandler]]:
    logger = logging.getLogger(name)
    if logger.hasHandlers():
        logger.handlers.clear()

    root = logging.getLogger()
    if root.hasHandlers():
        root.handlers.clear()

    logger.setLevel(level)
    logger.propagate = False

    # Console handler
    console_handler = CustomConsoleHandler(debugmode=debugmode)
    console_handler.setLevel(level)
    console_format = LogFormatter(debugmode=debugmode, timezone=timezone)
    console_handler.setFormatter(console_format)

    if table is not None:
        # Azure Table handler
        azure_table_handler = AzureTableLogHandler(table=table, debugmode=debugmode, timezone=timezone)
        azure_table_handler.setLevel(level)
    else:
        azure_table_handler = None

    logger.addHandler(console_handler)
    if azure_table_handler is not None:
        logger.addHandler(azure_table_handler)

    return logger, azure_table_handler
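
A minimal console-only usage sketch of get_logger from the module above. The "step" and "label" extras shown are ones the LogFormatter reconstructed above looks for; passing an AzureTable instance instead of None would additionally attach the AzureTableLogHandler.

# Hypothetical usage sketch (assumes the wheel is installed and importable).
import logging
from fabricks.utils.log import get_logger

logger, table_handler = get_logger("fabricks", logging.INFO)  # table_handler is None without an AzureTable

# "label" (or "step") drives the coloured prefix rendered by LogFormatter.
logger.info("loading orders", extra={"step": "bronze", "label": "bronze.orders"})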
fabricks/utils/mermaid.py
ADDED

@@ -0,0 +1,32 @@

from pyspark.sql import DataFrame


def get_mermaid_diagram(df: DataFrame) -> str:
    dependencies = df.select("parent_id", "parent", "job_id", "job").collect()

    out = "flowchart TD\n"

    unique_nodes = set()

    for row in dependencies:
        parent_id = str(row["parent_id"])
        parent_name = str(row["parent"])
        child_id = str(row["job_id"])
        child_name = str(row["job"])

        if parent_id != "0" and parent_id is not None:
            if parent_id not in unique_nodes:
                out += f" {parent_id}[{parent_name}]\n"
                unique_nodes.add(parent_id)

            if child_id not in unique_nodes:
                out += f" {child_id}[{child_name}]\n"
                unique_nodes.add(child_id)

            out += f" {parent_id} --> {child_id}\n"
        else:
            if child_id not in unique_nodes:
                out += f" {child_id}[{child_name}]\n"
                unique_nodes.add(child_id)

    return out
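
A usage sketch of get_mermaid_diagram under the assumption that a Spark session is available via fabricks.utils.spark; the sample rows only illustrate the expected dependency columns (parent_id, parent, job_id, job) and are not real package data.

# Hypothetical usage sketch (assumes a live Spark session).
from fabricks.utils.mermaid import get_mermaid_diagram
from fabricks.utils.spark import spark

df = spark.createDataFrame(
    [
        ("0", None, "1", "bronze.orders"),            # no parent: node only
        ("1", "bronze.orders", "2", "silver.orders"),  # edge bronze.orders --> silver.orders
    ],
    ["parent_id", "parent", "job_id", "job"],
)

print(get_mermaid_diagram(df))  # prints a Mermaid "flowchart TD" definition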