fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/utils/azure_queue.py ADDED
@@ -0,0 +1,63 @@
+ import json
+ from typing import Optional, Union
+
+ from azure.core.exceptions import ResourceExistsError
+ from azure.storage.queue import QueueClient
+
+
+ class AzureQueue(QueueClient):
+     def __init__(
+         self,
+         name: str,
+         storage_account: Optional[str] = None,
+         access_key: Optional[str] = None,
+         connection_string: Optional[str] = None,
+     ):
+         self.name = name
+         if connection_string is None:
+             assert storage_account
+             assert access_key
+             self.storage_account = storage_account
+             self.access_key = access_key
+             connection_string = f"DefaultEndpointsProtocol=https;AccountName={self.storage_account};AccountKey={self.access_key};EndpointSuffix=core.windows.net"
+
+         assert connection_string
+         self.connection_string = connection_string
+
+     @property
+     def queue_client(self) -> QueueClient:
+         return QueueClient.from_connection_string(self.connection_string, queue_name=self.name)
+
+     def create_if_not_exists(self):
+         try:
+             self.queue_client.create_queue()
+         except ResourceExistsError:
+             pass
+
+     @property
+     def sentinel(self):
+         return "SENTINEL"
+
+     def clear(self):
+         self.queue_client.clear_messages()
+
+     def send(self, message: Union[str, dict]):
+         if isinstance(message, dict):
+             message = json.dumps(message)
+         # print("sending ->", message)
+         self.queue_client.send_message(message)
+
+     def send_sentinel(self):
+         # print("sentinel", self.sentinel)
+         self.send(self.sentinel)
+
+     def receive(self):
+         msg = self.queue_client.receive_message()
+         if msg:
+             self.queue_client.delete_message(msg)
+             # print("receiving ->", msg.content)
+             return msg.content
+         return None
+
+     def delete(self):
+         self.queue_client.delete_queue()
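
For orientation, a minimal usage sketch of the class above (not part of the package; the queue name, credentials, and message contents are placeholders):

    from fabricks.utils.azure_queue import AzureQueue

    # placeholder credentials; real values would come from a secret store
    queue = AzureQueue("jobs", storage_account="<account>", access_key="<key>")
    queue.create_if_not_exists()

    queue.send({"job": "bronze.orders"})  # dicts are serialized to JSON
    queue.send_sentinel()                 # enqueues the "SENTINEL" marker

    # drain the queue; receive() deletes each message it reads
    while True:
        msg = queue.receive()
        if msg is None or msg == queue.sentinel:
            break
        print(msg)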
fabricks/utils/azure_table.py ADDED
@@ -0,0 +1,99 @@
+ import time
+ from typing import List, Optional, Union
+
+ from azure.data.tables import TableClient, TableServiceClient
+ from pyspark.sql import DataFrame
+
+
+ class AzureTable(TableClient):
+     def __init__(
+         self,
+         name: str,
+         storage_account: Optional[str] = None,
+         access_key: Optional[str] = None,
+         connection_string: Optional[str] = None,
+     ):
+         self.name = name
+         if connection_string is None:
+             assert storage_account
+             assert access_key
+             self.storage_account = storage_account
+             self.access_key = access_key
+             connection_string = f"DefaultEndpointsProtocol=https;AccountName={self.storage_account};AccountKey={self.access_key};EndpointSuffix=core.windows.net"
+
+         assert connection_string
+         self.connection_string = connection_string
+
+     @property
+     def table_service_client(self) -> TableServiceClient:
+         return TableServiceClient.from_connection_string(self.connection_string, table_name=self.name)
+
+     @property
+     def table(self) -> TableClient:
+         return self.create_if_not_exists()
+
+     def create_if_not_exists(self) -> TableClient:
+         return self.table_service_client.create_table_if_not_exists(table_name=self.name)
+
+     def drop(self):
+         self.table_service_client.delete_table(self.name)
+
+     def query(self, query: str) -> List:
+         return list(self.table.query_entities(query))
+
+     def list_all(self) -> List:
+         return self.query("")
+
+     def submit(self, operations: List, retry: Optional[bool] = True):
+         try:
+             partitions = set()
+             for d in operations:
+                 partitions.add(d[1]["PartitionKey"])
+
+             for p in partitions:
+                 _operations = [d for d in operations if d[1].get("PartitionKey") == p]
+                 t = 50
+                 if len(_operations) < t:
+                     self.table.submit_transaction(_operations)
+                 else:
+                     transactions = [_operations[i : i + t] for i in range(0, len(_operations), t)]
+                     for transaction in transactions:
+                         self.table.submit_transaction(transaction)
+         except Exception as e:
+             if retry:
+                 time.sleep(10)
+                 self.submit(operations, retry=False)
+             else:
+                 raise e
+
+     def delete(self, data: Union[List, DataFrame, dict]):
+         if isinstance(data, DataFrame):
+             data = [row.asDict() for row in data.collect()]
+         elif not isinstance(data, List):
+             data = [data]
+
+         operations = [("delete", d) for d in data]
+         self.submit(operations)
+
+     def upsert(self, data: Union[List, DataFrame, dict]):
+         if isinstance(data, DataFrame):
+             data = [row.asDict() for row in data.collect()]
+         elif not isinstance(data, List):
+             data = [data]
+
+         operations = [("upsert", d) for d in data]
+         self.submit(operations)
+
+     def truncate_partition(self, partition: str):
+         data = self.query(f"PartitionKey eq '{partition}'")
+         self.delete(data)
+
+     def truncate_all_partitions(self):
+         for p in self.list_all_partitions():
+             self.truncate_partition(p)
+
+     def list_all_partitions(self) -> List:
+         partitions = set()
+         for d in self.list_all():
+             partitions.add(d["PartitionKey"])
+         return sorted(list(partitions))
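
A minimal usage sketch (account, key, and entity values are placeholders; entities follow the Azure Table convention of PartitionKey/RowKey):

    from fabricks.utils.azure_table import AzureTable

    table = AzureTable("logs", storage_account="<account>", access_key="<key>")

    # upsert accepts a dict, a list of dicts, or a Spark DataFrame
    table.upsert({"PartitionKey": "schedule-1", "RowKey": "job-1", "Status": "done"})

    for entity in table.query("PartitionKey eq 'schedule-1'"):
        print(entity["RowKey"], entity.get("Status"))

    table.truncate_partition("schedule-1")  # deletes every entity in the partition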
fabricks/utils/console.py ADDED
@@ -0,0 +1,51 @@
+ from typing import Optional
+
+
+ class formatter:
+     END = "\33[0m"
+     BOLD = "\33[1m"
+     ITALIC = "\33[3m"
+     URL = "\33[4m"
+     BLINK = "\33[5m"
+     SELECTED = "\33[7m"
+
+     BLINK2 = "\33[6m"
+
+
+ class colors:
+     BLACK = "\33[30m"
+     RED = "\33[31m"
+     GREEN = "\33[32m"
+     YELLOW = "\33[33m"
+     BLUE = "\33[34m"
+     VIOLET = "\33[35m"
+     BEIGE = "\33[36m"
+     WHITE = "\33[37m"
+     GREY = "\33[90m"
+     ORANGE = "\33[33m"
+
+     RED2 = "\33[91m"
+     GREEN2 = "\33[92m"
+     YELLOW2 = "\33[93m"
+     BLUE2 = "\33[94m"
+     VIOLET2 = "\33[95m"
+     BEIGE2 = "\33[96m"
+     WHITE2 = "\33[97m"
+
+     RED3 = "\33[1;31m"
+
+
+ def progress_bar(progress: int = 0, width: int = 40, msg: Optional[str] = None):
+     if not isinstance(progress, int):
+         progress = int(progress)
+
+     left = width * progress // 100
+     right = width - left
+
+     tags = "#" * left
+     spaces = " " * right
+     pct = f" {progress}%"
+     if msg:
+         pct = f"{pct} ({msg})"
+
+     print("\r[", tags, spaces, "]", pct, sep="", end="", flush=True)
fabricks/utils/container.py ADDED
@@ -0,0 +1,57 @@
+ from databricks.sdk.runtime import dbutils
+
+ from fabricks.utils.secret import AccessKey, ApplicationRegistration, Secret
+
+
+ def create_container(storage_account: str, container: str, secret: Secret):
+     from azure.core.exceptions import ResourceExistsError
+     from azure.identity import ClientSecretCredential
+     from azure.storage.blob import BlobServiceClient
+
+     assert isinstance(secret, ApplicationRegistration)
+
+     cred = ClientSecretCredential(
+         tenant_id=secret.directory_id,
+         client_id=secret.application_id,
+         client_secret=secret.secret,
+     )
+
+     try:
+         blob_service_client = BlobServiceClient(f"https://{storage_account}.blob.core.windows.net", credential=cred)
+         blob_service_client.create_container(container)
+
+     except ResourceExistsError:
+         pass
+
+
+ def mount_container(storage_account: str, container: str, secret: Secret):
+     try:
+         dbutils.fs.unmount(f"/mnt/{container}")  # type: ignore
+
+     except Exception:
+         pass
+
+     if isinstance(secret, AccessKey):
+         dbutils.fs.mount(  # type: ignore
+             source=f"wasbs://{container}@{storage_account}.blob.core.windows.net",
+             mount_point=f"/mnt/{container}",
+             extra_configs={
+                 f"fs.azure.account.key.{storage_account}.blob.core.windows.net": f"{secret.key}",
+             },
+         )
+
+     elif isinstance(secret, ApplicationRegistration):
+         dbutils.fs.mount(  # type: ignore
+             source=f"abfss://{container}@{storage_account}.blob.core.windows.net/",
+             mount_point=f"/mnt/{container}",
+             extra_configs={
+                 "fs.azure.account.auth.type": "OAuth",
+                 "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredentialsTokenProvider",
+                 "fs.azure.account.oauth2.client.id": secret.application_id,
+                 "fs.azure.account.oauth2.client.secret": secret.secret,
+                 "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
+             },
+         )
+
+     else:
+         raise ValueError("secret is not valid")
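
A hedged usage sketch: mount_container only works inside a Databricks runtime (it relies on dbutils.fs.mount), and the ApplicationRegistration constructor is assumed here to accept these fields as keyword arguments, since fabricks/utils/secret.py is not shown in this hunk; all values are placeholders:

    from fabricks.utils.container import create_container, mount_container
    from fabricks.utils.secret import ApplicationRegistration

    # hypothetical service principal; field values are placeholders
    secret = ApplicationRegistration(
        directory_id="<tenant-id>",
        application_id="<client-id>",
        secret="<client-secret>",
    )

    create_container("<account>", "fabricks", secret)  # no-op if it already exists
    mount_container("<account>", "fabricks", secret)   # exposed at /mnt/fabricks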
fabricks/utils/fdict.py ADDED
@@ -0,0 +1,28 @@
+ from typing import Any, List, Optional
+
+
+ class FDict:
+     def __init__(self, options: Any):
+         self.options = options
+
+     def get(self, key: str) -> Optional[Any]:
+         return self.options.get(key)
+
+     def get_list(self, key: str) -> List[Any]:
+         values = self.options.get(key, [])
+         if values is None:
+             # an explicit null in the options still yields an empty list
+             return []
+         if not isinstance(values, List):
+             values = [values]
+         return values
+
+     def get_boolean(self, key: str, if_none: Optional[bool] = None) -> Optional[bool]:
+         o = self.options.get(key)
+         if isinstance(o, bool):
+             return o
+         elif o is not None:
+             return o.lower() == "true"
+         else:
+             return if_none
+
+     def get_dict(self, key) -> dict:
+         return self.options.get(key, {})
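
A minimal sketch of how the wrapper behaves (the option keys are illustrative):

    from fabricks.utils.fdict import FDict

    options = FDict({"keys": "id", "optimize": "true", "tags": ["a", "b"]})

    options.get("keys")                           # "id"
    options.get_list("keys")                      # ["id"]  (scalars are wrapped in a list)
    options.get_list("tags")                      # ["a", "b"]
    options.get_boolean("optimize")               # True    ("true"/"false" strings are parsed)
    options.get_boolean("vacuum", if_none=False)  # False   (key absent)
    options.get_dict("spark_options")             # {}      (key absent)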
fabricks/utils/helpers.py ADDED
@@ -0,0 +1,89 @@
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from functools import reduce
+ from typing import Any, Callable, Iterable, List, Optional, Union
+
+ from databricks.sdk.runtime import dbutils, spark
+ from pyspark.sql import DataFrame
+ from typing_extensions import deprecated
+
+ from fabricks.utils.path import Path
+
+
+ def concat_ws(fields: Union[str, List[str]], alias: Optional[str] = None) -> str:
+     if isinstance(fields, str):
+         fields = [fields]
+
+     if alias:
+         coalesce = [f"coalesce(cast({alias}.{f} as string), '-1')" for f in fields]
+     else:
+         coalesce = [f"coalesce(cast({f} as string), '-1')" for f in fields]
+
+     return "concat_ws('*', " + ",".join(coalesce) + ")"
+
+
+ def concat_dfs(dfs: Iterable[DataFrame]) -> DataFrame:
+     return reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), dfs)
+
+
+ @deprecated("use run_in_parallel instead")
+ def run_threads(func: Callable, iter: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
+     return run_in_parallel(func, iter, workers)
+
+
+ def run_in_parallel(func: Callable, iterable: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
+     """
+     Runs the given function in parallel on the elements of the iterable using multiple threads.
+
+     Args:
+         func (Callable): The function to be executed in parallel.
+         iterable (Union[List, DataFrame, range, set]): The iterable containing the elements on which the function will be executed.
+         workers (int, optional): The number of worker threads to use. Defaults to 8.
+
+     Returns:
+         List[Any]: A list containing the results of the function calls.
+
+     """
+     out = []
+
+     with ThreadPoolExecutor(max_workers=workers) as executor:
+         iterable = iterable.collect() if isinstance(iterable, DataFrame) else iterable
+         futures = {executor.submit(func, i): i for i in iterable}
+         for future in as_completed(futures):
+             try:
+                 r = future.result()
+                 if r:
+                     out.append(r)
+             except Exception:
+                 pass
+
+     return out
+
+
+ def run_notebook(path: Path, timeout: Optional[int] = None, **kwargs):
+     """
+     Runs a notebook located at the given path.
+
+     Args:
+         path (Path): The path to the notebook file.
+         timeout (Optional[int]): The maximum execution time for the notebook in seconds. Defaults to None.
+         **kwargs: Additional keyword arguments to be passed to the notebook.
+
+     Returns:
+         None
+     """
+     if timeout is None:
+         timeout = 3600
+
+     dbutils.notebook.run(path.get_notebook_path(), timeout, {**kwargs})
+
+
+ def xxhash64(s: Any):
+     df = spark.sql(f"select xxhash64(cast('{s}' as string)) as xxhash64")
+     return df.collect()[0][0]
+
+
+ def md5(s: Any):
+     from hashlib import md5
+
+     md5 = md5(str(s).encode())
+     return md5.hexdigest()
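
A short sketch of the helpers above (the column and table names are illustrative):

    from fabricks.utils.helpers import concat_ws, md5, run_in_parallel

    # null-safe concat expression, e.g. for hashing business keys in SQL
    concat_ws(["id", "source"], alias="s")
    # -> "concat_ws('*', coalesce(cast(s.id as string), '-1'),coalesce(cast(s.source as string), '-1'))"

    def square(i: int) -> int:
        return i * i

    # fans the calls out over 4 threads; note that falsy results (here, 0) are dropped
    run_in_parallel(square, range(10), workers=4)

    md5("gold.fact_sales")  # hex digest of the stringified argument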
fabricks/utils/log.py ADDED
@@ -0,0 +1,153 @@
+ import hashlib
+ import json
+ import logging
+ from datetime import datetime
+ from typing import Tuple
+
+ from fabricks.utils.azure_table import AzureTable
+
+ GREY = "\33[90m"
+ BLACK = "\33[30m"
+ YELLOW = "\33[93m"
+ RED = "\33[31m"
+ RESET = "\33[0m"
+ COLORS = {"DEBUG": GREY, "INFO": BLACK, "WARNING": YELLOW, "ERROR": RED, "CRITICAL": RED}
+
+
+ class CustomFormatter(logging.Formatter):
+     def format(self, record):
+         message = super().format(record)  # noqa: F841
+         out = f"{COLORS[record.levelname]}"
+
+         if hasattr(record, "created"):
+             d = datetime.fromtimestamp(record.created).strftime("%d/%m/%y %H:%M:%S")
+             out += f"[{d}]"
+
+         if hasattr(record, "job"):
+             j = f" - {record.__dict__.get('job')}"
+             out += str(j)
+
+         elif hasattr(record, "step"):
+             s = f" - {record.__dict__.get('step')}"
+             out += str(s)
+
+         if hasattr(record, "message"):
+             m = record.__dict__.get("message", "")
+             if hasattr(record, "job"):
+                 m = f" => {m}"
+             elif hasattr(record, "step"):
+                 m = f" => {m}"
+             else:
+                 m = " " + m
+             out += m
+
+         if hasattr(record, "exc_info"):
+             exc_info = record.__dict__.get("exc_info", None)
+             if exc_info is not None:
+                 e = f" !{exc_info[0].__name__.lower()}!"
+                 out += e
+
+         if hasattr(record, "sql"):
+             s = f"\n---\n%sql\n{record.__dict__.get('sql')}\n---"
+             out += s
+         if hasattr(record, "content"):
+             s = f"\n---\n{record.__dict__.get('content')}\n---"
+             out += s
+
+         out += f"{RESET}"
+         return out
+
+
+ class AzureTableHandler(logging.Handler):
+     def __init__(self, table: AzureTable):
+         super().__init__()
+         self.buffer = []
+         self.table = table
+
+     def emit(self, record):
+         if hasattr(record, "target"):
+             target = record.__dict__.get("target")
+
+             r = {
+                 "Created": str(
+                     datetime.fromtimestamp(record.created).strftime("%d/%m/%y %H:%M:%S")
+                 ),  # timestamp not present when querying Azure Table
+                 "Level": record.levelname,
+                 "Message": record.message,
+             }
+             if hasattr(record, "job"):
+                 j = str(record.__dict__.get("job", ""))
+                 r["Job"] = j
+                 r["JobId"] = hashlib.md5(j.encode()).hexdigest()
+             if hasattr(record, "table"):
+                 t = str(record.__dict__.get("table", ""))
+                 r["Job"] = t
+                 r["JobId"] = hashlib.md5(t.encode()).hexdigest()
+
+             if hasattr(record, "step"):
+                 r["Step"] = record.__dict__.get("step", "")
+
+             if hasattr(record, "schedule_id"):
+                 r["ScheduleId"] = record.__dict__.get("schedule_id", "")
+
+             if hasattr(record, "schedule"):
+                 r["Schedule"] = record.__dict__.get("schedule", "")
+
+             if hasattr(record, "notebook_id"):
+                 r["NotebookId"] = record.__dict__.get("notebook_id", "")
+
+             if hasattr(record, "exc_info"):
+                 e = record.__dict__.get("exc_info", None)
+                 if e is not None:
+                     d = {
+                         "type": str(e[0].__name__)[:1000],
+                         "message": str(e[1])[:1000],
+                         "traceback": str(logging.Formatter.formatException(self, e))[:1000],  # type: ignore
+                     }
+                     r["Exception"] = json.dumps(d)
+
+             if hasattr(record, "content"):
+                 r["Content"] = json.dumps(record.__dict__.get("content", ""))[:1000]
+             if hasattr(record, "sql"):
+                 r["Sql"] = record.__dict__.get("sql", "")[:1000]
+
+             r["PartitionKey"] = record.__dict__.get("partition_key", "default")
+             if hasattr(record, "row_key"):
+                 r["RowKey"] = record.__dict__.get("row_key", "")
+             else:
+                 r["RowKey"] = hashlib.md5(json.dumps(r, sort_keys=True).encode()).hexdigest()
+
+             if target == "table":
+                 self.table.upsert(r)
+             else:
+                 self.buffer.append(r)
+
+         else:
+             pass
+
+     def flush(self):
+         self.table.upsert(self.buffer)
+         self.buffer.clear()
+
+     def clear_buffer(self):
+         self.buffer = []
+
+
+ def get_logger(name: str, level: int, table: AzureTable) -> Tuple[logging.Logger, AzureTableHandler]:
+     logger = logging.getLogger(name)
+     logger.setLevel(level)
+
+     # Console handler
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(level)
+     console_format = CustomFormatter()
+     console_handler.setFormatter(console_format)
+
+     # Azure Table handler
+     azure_table_handler = AzureTableHandler(table=table)
+     azure_table_handler.setLevel(level)
+
+     logger.addHandler(console_handler)
+     logger.addHandler(azure_table_handler)
+
+     return logger, azure_table_handler
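
A minimal wiring sketch (the table name and record extras are illustrative; note that only records carrying a `target` extra reach the Azure Table handler):

    import logging

    from fabricks.utils.azure_table import AzureTable
    from fabricks.utils.log import get_logger

    table = AzureTable("logs", storage_account="<account>", access_key="<key>")
    logger, handler = get_logger("fabricks", logging.INFO, table)

    # target="table" upserts the record immediately; any other target buffers it
    logger.info("running", extra={"target": "buffer", "job": "bronze.orders", "partition_key": "schedule-1"})
    logger.info("done", extra={"target": "table", "job": "bronze.orders"})

    handler.flush()  # pushes buffered records to Azure Table in one upsert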