fabricks-2024.7.1.5-py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry, and is provided for informational purposes only.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0

fabricks/utils/azure_queue.py
ADDED
@@ -0,0 +1,63 @@
import json
from typing import Optional, Union

from azure.core.exceptions import ResourceExistsError
from azure.storage.queue import QueueClient


class AzureQueue(QueueClient):
    def __init__(
        self,
        name: str,
        storage_account: Optional[str] = None,
        access_key: Optional[str] = None,
        connection_string: Optional[str] = None,
    ):
        self.name = name
        if connection_string is None:
            assert storage_account
            assert access_key
            self.storage_account = storage_account
            self.access_key = access_key
            connection_string = f"DefaultEndpointsProtocol=https;AccountName={self.storage_account};AccountKey={self.access_key};EndpointSuffix=core.windows.net"

        assert connection_string
        self.connection_string = connection_string

    @property
    def queue_client(self) -> QueueClient:
        return QueueClient.from_connection_string(self.connection_string, queue_name=self.name)

    def create_if_not_exists(self):
        try:
            self.queue_client.create_queue()
        except ResourceExistsError:
            pass

    @property
    def sentinel(self):
        return "SENTINEL"

    def clear(self):
        self.queue_client.clear_messages()

    def send(self, message: Union[str, dict]):
        if isinstance(message, dict):
            message = json.dumps(message)
        # print("sending ->", message)
        self.queue_client.send_message(message)

    def send_sentinel(self):
        # print("sentinel", self.sentinel)
        self.send(self.sentinel)

    def receive(self):
        msg = self.queue_client.receive_message()
        if msg:
            self.queue_client.delete_message(msg)
            # print("receiving ->", msg.content)
            return msg.content
        return None

    def delete(self):
        self.queue_client.delete_queue()
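
A minimal usage sketch for AzureQueue (queue name and credentials below are illustrative, not part of the package): dict payloads are JSON-encoded before sending, receive pops and deletes a single message, and the SENTINEL payload is the conventional stop signal for consumers.

queue = AzureQueue("jobs", storage_account="myaccount", access_key="***")  # illustrative values
queue.create_if_not_exists()

queue.send({"step": "bronze", "job": "load_orders"})  # dict -> JSON string
queue.send_sentinel()                                 # literal "SENTINEL"

while True:
    msg = queue.receive()  # returns the message content, or None if the queue is empty
    if msg is None or msg == queue.sentinel:
        break
    print(msg)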
fabricks/utils/azure_table.py
ADDED
@@ -0,0 +1,99 @@
import time
from typing import List, Optional, Union

from azure.data.tables import TableClient, TableServiceClient
from pyspark.sql import DataFrame


class AzureTable(TableClient):
    def __init__(
        self,
        name: str,
        storage_account: Optional[str] = None,
        access_key: Optional[str] = None,
        connection_string: Optional[str] = None,
    ):
        self.name = name
        if connection_string is None:
            assert storage_account
            assert access_key
            self.storage_account = storage_account
            self.access_key = access_key
            connection_string = f"DefaultEndpointsProtocol=https;AccountName={self.storage_account};AccountKey={self.access_key};EndpointSuffix=core.windows.net"

        assert connection_string
        self.connection_string = connection_string

    @property
    def table_service_client(self) -> TableServiceClient:
        return TableServiceClient.from_connection_string(self.connection_string, table_name=self.name)

    @property
    def table(self) -> TableClient:
        return self.create_if_not_exists()

    def create_if_not_exists(self) -> TableClient:
        return self.table_service_client.create_table_if_not_exists(table_name=self.name)

    def drop(self):
        self.table_service_client.delete_table(self.name)

    def query(self, query: str) -> List:
        return list(self.table.query_entities(query))

    def list_all(self) -> List:
        return self.query("")

    def submit(self, operations: List, retry: Optional[bool] = True):
        try:
            partitions = set()
            for d in operations:
                partitions.add(d[1]["PartitionKey"])

            for p in partitions:
                _operations = [d for d in operations if d[1].get("PartitionKey") == p]
                t = 50
                if len(_operations) < t:
                    self.table.submit_transaction(_operations)
                else:
                    transactions = [_operations[i : i + t] for i in range(0, len(_operations), t)]
                    for transaction in transactions:
                        self.table.submit_transaction(transaction)
        except Exception as e:
            if retry:
                time.sleep(10)
                self.submit(operations, retry=False)
            else:
                raise e

    def delete(self, data: Union[List, DataFrame, dict]):
        if isinstance(data, DataFrame):
            data = [row.asDict() for row in data.collect()]
        elif not isinstance(data, List):
            data = [data]

        operations = [("delete", d) for d in data]
        self.submit(operations)

    def upsert(self, data: Union[List, DataFrame, dict]):
        if isinstance(data, DataFrame):
            data = [row.asDict() for row in data.collect()]
        elif not isinstance(data, List):
            data = [data]

        operations = [("upsert", d) for d in data]
        self.submit(operations)

    def truncate_partition(self, partition: str):
        data = self.query(f"PartitionKey eq '{partition}'")
        self.delete(data)

    def truncate_all_partitions(self):
        for p in self.list_all_partitions():
            self.truncate_partition(p)

    def list_all_partitions(self) -> List:
        partitions = set()
        for d in self.list_all():
            partitions.add(d["PartitionKey"])
        return sorted(list(partitions))
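
A sketch of the batching behaviour (entity values are illustrative): every entity carries a PartitionKey and RowKey, submit groups operations by partition, and more than 50 operations per partition are split into successive submit_transaction calls, keeping each transaction under Azure's 100-operation single-partition limit. A failed submit is retried once after 10 seconds before the exception propagates.

table = AzureTable("logs", storage_account="myaccount", access_key="***")  # illustrative values

entities = [{"PartitionKey": "2024-07-01", "RowKey": str(i), "Status": "done"} for i in range(120)]
table.upsert(entities)              # one partition, submitted in chunks of 50

print(table.list_all_partitions())  # ['2024-07-01']
table.truncate_partition("2024-07-01")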
fabricks/utils/console.py
ADDED
@@ -0,0 +1,51 @@
from typing import Optional


class formatter:
    END = "\33[0m"
    BOLD = "\33[1m"
    ITALIC = "\33[3m"
    URL = "\33[4m"
    BLINK = "\33[5m"
    SELECTED = "\33[7m"

    BLINK2 = "\33[6m"


class colors:
    BLACK = "\33[30m"
    RED = "\33[31m"
    GREEN = "\33[32m"
    YELLOW = "\33[33m"
    BLUE = "\33[34m"
    VIOLET = "\33[35m"
    BEIGE = "\33[36m"
    WHITE = "\33[37m"
    GREY = "\33[90m"
    ORANGE = "\33[33m"

    RED2 = "\33[91m"
    GREEN2 = "\33[92m"
    YELLOW2 = "\33[93m"
    BLUE2 = "\33[94m"
    VIOLET2 = "\33[95m"
    BEIGE2 = "\33[96m"
    WHITE2 = "\33[97m"

    RED3 = "\33[1;31m"


def progress_bar(progress: int = 0, width: int = 40, msg: Optional[str] = None):
    if not isinstance(progress, int):
        progress = int(progress)

    left = width * progress // 100
    right = width - left

    tags = "#" * left
    spaces = " " * right
    pct = f" {progress}%"
    if msg:
        pct = f"{pct} ({msg})"

    print("\r[", tags, spaces, "]", pct, sep="", end="", flush=True)
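
progress_bar redraws a single console line via the carriage return, so callers only pass the new percentage; a small illustrative loop:

import time

for pct in range(0, 101, 10):
    progress_bar(pct, width=40, msg=f"batch {pct // 10}")  # overwrites the same line
    time.sleep(0.1)
print()  # move off the bar line once done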
fabricks/utils/container.py
ADDED
@@ -0,0 +1,57 @@
from databricks.sdk.runtime import dbutils

from fabricks.utils.secret import AccessKey, ApplicationRegistration, Secret


def create_container(storage_account: str, container: str, secret: Secret):
    from azure.core.exceptions import ResourceExistsError
    from azure.identity import ClientSecretCredential
    from azure.storage.blob import BlobServiceClient

    assert isinstance(secret, ApplicationRegistration)

    cred = ClientSecretCredential(
        tenant_id=secret.directory_id,
        client_id=secret.application_id,
        client_secret=secret.secret,
    )

    try:
        blob_service_client = BlobServiceClient(f"https://{storage_account}.blob.core.windows.net", credential=cred)
        blob_service_client.create_container(container)

    except ResourceExistsError:
        pass


def mount_container(storage_account: str, container: str, secret: Secret):
    try:
        dbutils.fs.unmount(f"/mnt/{container}")  # type: ignore

    except Exception:
        pass

    if isinstance(secret, AccessKey):
        dbutils.fs.mount(  # type: ignore
            source=f"wasbs://{container}@{storage_account}.blob.core.windows.net",
            mount_point=f"/mnt/{container}",
            extra_configs={
                f"fs.azure.account.key.{storage_account}.blob.core.windows.net": f"{secret.key}",
            },
        )

    elif isinstance(secret, ApplicationRegistration):
        dbutils.fs.mount(  # type: ignore
            source=f"abfss://{container}@{storage_account}.blob.core.windows.net/",
            mount_point=f"/mnt/{container}",
            extra_configs={
                "fs.azure.account.auth.type": "OAuth",
                "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredentialsTokenProvider",
                "fs.azure.account.oauth2.client.id": secret.application_id,
                "fs.azure.account.oauth2.client.secret": secret.secret,
                "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
            },
        )

    else:
        raise ValueError("secret is not valid")
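
A usage sketch, assuming an AccessKey built from a storage key (the Secret constructors live in fabricks/utils/secret.py and are not shown in this diff). Note that the ApplicationRegistration branch mounts an abfss URI against the blob endpoint; ADLS Gen2 mounts conventionally target dfs.core.windows.net instead.

from fabricks.utils.secret import AccessKey

secret = AccessKey(key="***")  # assumed constructor; not shown in this diff
mount_container("myaccount", "landing", secret)
# the container is now reachable at /mnt/landing from the cluster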
fabricks/utils/fdict.py
ADDED
@@ -0,0 +1,28 @@
from typing import Any, List, Optional


class FDict:
    def __init__(self, options: Any):
        self.options = options

    def get(self, key: str) -> Optional[Any]:
        return self.options.get(key)

    def get_list(self, key: str) -> List[Any]:
        values = self.options.get(key, [])
        if values is None:
            # a key explicitly set to None still yields a list, as the annotation promises
            return []
        if not isinstance(values, List):
            values = [values]
        return values

    def get_boolean(self, key: str, if_none: Optional[bool] = None) -> Optional[bool]:
        o = self.options.get(key)
        if isinstance(o, bool):
            return o
        elif o is not None:
            return o.lower() == "true"
        else:
            return if_none

    def get_dict(self, key: str) -> dict:
        return self.options.get(key, {})
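
FDict is a thin wrapper that normalises option lookups; a small sketch of the coercions (option values are illustrative):

options = FDict({"keys": "id", "optimize": "true", "spark": {"shuffle": 200}})

options.get_list("keys")                       # ['id'] — a scalar is promoted to a list
options.get_list("missing")                    # []
options.get_boolean("optimize")                # True — the string is lower-cased and compared
options.get_boolean("missing", if_none=False)  # False
options.get_dict("spark")                      # {'shuffle': 200}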
fabricks/utils/helpers.py
ADDED
@@ -0,0 +1,89 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import reduce
from typing import Any, Callable, Iterable, List, Optional, Union

from databricks.sdk.runtime import dbutils, spark
from pyspark.sql import DataFrame
from typing_extensions import deprecated

from fabricks.utils.path import Path


def concat_ws(fields: Union[str, List[str]], alias: Optional[str] = None) -> str:
    if isinstance(fields, str):
        fields = [fields]

    if alias:
        coalesce = [f"coalesce(cast({alias}.{f} as string), '-1')" for f in fields]
    else:
        coalesce = [f"coalesce(cast({f} as string), '-1')" for f in fields]

    return "concat_ws('*', " + ",".join(coalesce) + ")"


def concat_dfs(dfs: Iterable[DataFrame]) -> DataFrame:
    return reduce(lambda x, y: x.unionByName(y, allowMissingColumns=True), dfs)


@deprecated("use run_in_parallel instead")
def run_threads(func: Callable, iter: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
    return run_in_parallel(func, iter, workers)


def run_in_parallel(func: Callable, iterable: Union[List, DataFrame, range, set], workers: int = 8) -> List[Any]:
    """
    Runs the given function in parallel on the elements of the iterable using multiple threads.

    Args:
        func (Callable): The function to be executed in parallel.
        iterable (Union[List, DataFrame, range, set]): The iterable containing the elements on which the function will be executed.
        workers (int, optional): The number of worker threads to use. Defaults to 8.

    Returns:
        List[Any]: A list containing the results of the function calls.
    """
    out = []

    with ThreadPoolExecutor(max_workers=workers) as executor:
        iterable = iterable.collect() if isinstance(iterable, DataFrame) else iterable
        futures = {executor.submit(func, i): i for i in iterable}
        for future in as_completed(futures):
            try:
                r = future.result()
                if r:
                    out.append(r)
            except Exception:
                pass

    return out


def run_notebook(path: Path, timeout: Optional[int] = None, **kwargs):
    """
    Runs a notebook located at the given path.

    Args:
        path (Path): The path to the notebook file.
        timeout (Optional[int]): The maximum execution time for the notebook in seconds. Defaults to None.
        **kwargs: Additional keyword arguments to be passed to the notebook.

    Returns:
        None
    """
    if timeout is None:
        timeout = 3600

    dbutils.notebook.run(path.get_notebook_path(), timeout, {**kwargs})


def xxhash64(s: Any):
    df = spark.sql(f"select xxhash64(cast('{s}' as string)) as xxhash64")
    return df.collect()[0][0]


def md5(s: Any):
    from hashlib import md5

    md5 = md5(str(s).encode())
    return md5.hexdigest()
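
A sketch of run_in_parallel; because only truthy results are collected and worker exceptions are swallowed, callers that need per-item failure reporting should return a status object rather than rely on raising:

def square(n: int) -> int:
    return n * n

results = run_in_parallel(square, range(10), workers=4)
sorted(results)  # [1, 4, 9, 16, 25, 36, 49, 64, 81] — 0 is dropped because falsy results are skipped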
fabricks/utils/log.py
ADDED
@@ -0,0 +1,153 @@
import hashlib
import json
import logging
from datetime import datetime
from typing import Tuple

from fabricks.utils.azure_table import AzureTable

GREY = "\33[90m"
BLACK = "\33[30m"
YELLOW = "\33[93m"
RED = "\33[31m"
RESET = "\33[0m"
COLORS = {"DEBUG": GREY, "INFO": BLACK, "WARNING": YELLOW, "ERROR": RED, "CRITICAL": RED}


class CustomFormatter(logging.Formatter):
    def format(self, record):
        message = super().format(record)  # noqa: F841
        out = f"{COLORS[record.levelname]}"

        if hasattr(record, "created"):
            d = datetime.fromtimestamp(record.created).strftime("%d/%m/%y %H:%M:%S")
            out += f"[{d}]"

        if hasattr(record, "job"):
            j = f" - {record.__dict__.get('job')}"
            out += str(j)

        elif hasattr(record, "step"):
            s = f" - {record.__dict__.get('step')}"
            out += str(s)

        if hasattr(record, "message"):
            m = record.__dict__.get("message", "")
            if hasattr(record, "job"):
                m = f" => {m}"
            elif hasattr(record, "step"):
                m = f" => {m}"
            else:
                m = " " + m
            out += m

        if hasattr(record, "exc_info"):
            exc_info = record.__dict__.get("exc_info", None)
            if exc_info is not None:
                e = f" !{exc_info[0].__name__.lower()}!"
                out += e

        if hasattr(record, "sql"):
            s = f"\n---\n%sql\n{record.__dict__.get('sql')}\n---"
            out += s
        if hasattr(record, "content"):
            s = f"\n---\n{record.__dict__.get('content')}\n---"
            out += s

        out += f"{RESET}"
        return out


class AzureTableHandler(logging.Handler):
    def __init__(self, table: AzureTable):
        super().__init__()
        self.buffer = []
        self.table = table

    def emit(self, record):
        if hasattr(record, "target"):
            target = record.__dict__.get("target")

            r = {
                "Created": str(
                    datetime.fromtimestamp(record.created).strftime("%d/%m/%y %H:%M:%S")
                ),  # timestamp not present when querying Azure Table
                "Level": record.levelname,
                "Message": record.message,
            }
            if hasattr(record, "job"):
                j = str(record.__dict__.get("job", ""))
                r["Job"] = j
                r["JobId"] = hashlib.md5(j.encode()).hexdigest()
            if hasattr(record, "table"):
                t = str(record.__dict__.get("table", ""))
                r["Job"] = t
                r["JobId"] = hashlib.md5(t.encode()).hexdigest()

            if hasattr(record, "step"):
                r["Step"] = record.__dict__.get("step", "")

            if hasattr(record, "schedule_id"):
                r["ScheduleId"] = record.__dict__.get("schedule_id", "")

            if hasattr(record, "schedule"):
                r["Schedule"] = record.__dict__.get("schedule", "")

            if hasattr(record, "notebook_id"):
                r["NotebookId"] = record.__dict__.get("notebook_id", "")

            if hasattr(record, "exc_info"):
                e = record.__dict__.get("exc_info", None)
                if e is not None:
                    d = {
                        "type": str(e[0].__name__)[:1000],
                        "message": str(e[1])[:1000],
                        "traceback": str(logging.Formatter.formatException(self, e))[:1000],  # type: ignore
                    }
                    r["Exception"] = json.dumps(d)

            if hasattr(record, "content"):
                r["Content"] = json.dumps(record.__dict__.get("content", ""))[:1000]
            if hasattr(record, "sql"):
                r["Sql"] = record.__dict__.get("sql", "")[:1000]

            r["PartitionKey"] = record.__dict__.get("partition_key", "default")
            if hasattr(record, "row_key"):
                r["RowKey"] = record.__dict__.get("row_key", "")
            else:
                r["RowKey"] = hashlib.md5(json.dumps(r, sort_keys=True).encode()).hexdigest()

            if target == "table":
                self.table.upsert(r)
            else:
                self.buffer.append(r)

        else:
            pass

    def flush(self):
        self.table.upsert(self.buffer)
        self.buffer.clear()

    def clear_buffer(self):
        self.buffer = []


def get_logger(name: str, level: int, table: AzureTable) -> Tuple[logging.Logger, AzureTableHandler]:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(level)
    console_format = CustomFormatter()
    console_handler.setFormatter(console_format)

    # Azure Table handler
    azure_table_handler = AzureTableHandler(table=table)
    azure_table_handler.setLevel(level)

    logger.addHandler(console_handler)
    logger.addHandler(azure_table_handler)

    return logger, azure_table_handler
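
A sketch wiring get_logger to an AzureTable (names are illustrative): the handler only persists records that carry a target attribute, passed through logging's extra mechanism; "table" upserts immediately, any other value buffers until flush.

import logging

table = AzureTable("fabrickslog", storage_account="myaccount", access_key="***")  # illustrative values
logger, handler = get_logger("fabricks", logging.INFO, table)

logger.info(
    "update done",
    extra={"job": "silver.orders", "target": "buffer", "partition_key": "2024-07-01"},
)
handler.flush()  # upsert everything buffered, then clear the buffer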