fabricks-3.0.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +11 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +27 -0
- fabricks/api/core.py +4 -0
- fabricks/api/deploy.py +3 -0
- fabricks/api/exceptions.py +19 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/job_schema.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/masks.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/initialize.py +42 -0
- fabricks/api/notebooks/process.py +54 -0
- fabricks/api/notebooks/run.py +59 -0
- fabricks/api/notebooks/schedule.py +75 -0
- fabricks/api/notebooks/terminate.py +31 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/schedules.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/api/version.py +3 -0
- fabricks/api/views.py +6 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/_types.py +10 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +223 -0
- fabricks/cdc/base/generator.py +177 -0
- fabricks/cdc/base/merger.py +110 -0
- fabricks/cdc/base/processor.py +471 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +20 -0
- fabricks/cdc/scd.py +22 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
- fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
- fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
- fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
- fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
- fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
- fabricks/cdc/templates/filter.sql.jinja +4 -0
- fabricks/cdc/templates/filters/final.sql.jinja +4 -0
- fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
- fabricks/cdc/templates/filters/update.sql.jinja +30 -0
- fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
- fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
- fabricks/cdc/templates/merge.sql.jinja +3 -0
- fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
- fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
- fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/queries/__init__.py +0 -0
- fabricks/cdc/templates/queries/context.sql.jinja +186 -0
- fabricks/cdc/templates/queries/final.sql.jinja +1 -0
- fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
- fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
- fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
- fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
- fabricks/cdc/templates/query.sql.jinja +15 -0
- fabricks/context/__init__.py +72 -0
- fabricks/context/_types.py +133 -0
- fabricks/context/config/__init__.py +92 -0
- fabricks/context/config/utils.py +53 -0
- fabricks/context/log.py +77 -0
- fabricks/context/runtime.py +117 -0
- fabricks/context/secret.py +103 -0
- fabricks/context/spark_session.py +82 -0
- fabricks/context/utils.py +80 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +99 -0
- fabricks/core/dags/generator.py +157 -0
- fabricks/core/dags/log.py +12 -0
- fabricks/core/dags/processor.py +228 -0
- fabricks/core/dags/run.py +39 -0
- fabricks/core/dags/terminator.py +25 -0
- fabricks/core/dags/utils.py +54 -0
- fabricks/core/extenders.py +33 -0
- fabricks/core/job_schema.py +32 -0
- fabricks/core/jobs/__init__.py +21 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/_types.py +284 -0
- fabricks/core/jobs/base/checker.py +139 -0
- fabricks/core/jobs/base/configurator.py +306 -0
- fabricks/core/jobs/base/exception.py +85 -0
- fabricks/core/jobs/base/generator.py +447 -0
- fabricks/core/jobs/base/invoker.py +206 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +249 -0
- fabricks/core/jobs/bronze.py +395 -0
- fabricks/core/jobs/get_job.py +127 -0
- fabricks/core/jobs/get_job_conf.py +152 -0
- fabricks/core/jobs/get_job_id.py +31 -0
- fabricks/core/jobs/get_jobs.py +107 -0
- fabricks/core/jobs/get_schedule.py +10 -0
- fabricks/core/jobs/get_schedules.py +32 -0
- fabricks/core/jobs/gold.py +415 -0
- fabricks/core/jobs/silver.py +373 -0
- fabricks/core/masks.py +52 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/_types.py +6 -0
- fabricks/core/parsers/base.py +95 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +26 -0
- fabricks/core/parsers/utils.py +69 -0
- fabricks/core/schedules/__init__.py +14 -0
- fabricks/core/schedules/diagrams.py +21 -0
- fabricks/core/schedules/generate.py +20 -0
- fabricks/core/schedules/get_schedule.py +5 -0
- fabricks/core/schedules/get_schedules.py +9 -0
- fabricks/core/schedules/process.py +9 -0
- fabricks/core/schedules/run.py +3 -0
- fabricks/core/schedules/terminate.py +6 -0
- fabricks/core/schedules/views.py +61 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/_types.py +7 -0
- fabricks/core/steps/base.py +423 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +26 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/views.py +41 -0
- fabricks/deploy/__init__.py +92 -0
- fabricks/deploy/masks.py +8 -0
- fabricks/deploy/notebooks.py +71 -0
- fabricks/deploy/schedules.py +10 -0
- fabricks/deploy/tables.py +82 -0
- fabricks/deploy/udfs.py +19 -0
- fabricks/deploy/utils.py +36 -0
- fabricks/deploy/views.py +509 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/_types.py +65 -0
- fabricks/metastore/database.py +65 -0
- fabricks/metastore/dbobject.py +66 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/table.py +768 -0
- fabricks/metastore/utils.py +51 -0
- fabricks/metastore/view.py +53 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/_types.py +6 -0
- fabricks/utils/azure_queue.py +93 -0
- fabricks/utils/azure_table.py +154 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/fdict.py +240 -0
- fabricks/utils/helpers.py +228 -0
- fabricks/utils/log.py +236 -0
- fabricks/utils/mermaid.py +32 -0
- fabricks/utils/path.py +242 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +94 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/_types.py +3 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +33 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +99 -0
- fabricks/utils/spark.py +76 -0
- fabricks/utils/sqlglot.py +56 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-3.0.11.dist-info/METADATA +23 -0
- fabricks-3.0.11.dist-info/RECORD +176 -0
- fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/context/runtime.py
@@ -0,0 +1,117 @@
+from typing import Final, List, Optional
+
+import yaml
+
+from fabricks.context.config import path_config, path_runtime
+from fabricks.utils.path import Path
+
+with open(str(path_config)) as f:
+    data = yaml.safe_load(f)
+
+conf: dict = [d["conf"] for d in data][0]
+assert conf, "conf mandatory"
+CONF_RUNTIME: Final[dict] = conf
+
+BRONZE = CONF_RUNTIME.get("bronze", [{}])
+SILVER = CONF_RUNTIME.get("silver", [{}])
+GOLD = CONF_RUNTIME.get("gold", [{}])
+STEPS = BRONZE + SILVER + GOLD
+
+databases = CONF_RUNTIME.get("databases", [{}])
+credentials = CONF_RUNTIME.get("credentials", {})
+variables = CONF_RUNTIME.get("variables", {})
+VARIABLES: dict = variables
+
+conf_options = CONF_RUNTIME.get("options", {})
+assert conf_options, "options mandatory"
+
+IS_UNITY_CATALOG: Final[bool] = str(conf_options.get("unity_catalog", "False")).lower() in ("true", "1", "yes")
+CATALOG: Optional[str] = conf_options.get("catalog")
+
+if IS_UNITY_CATALOG and not CATALOG:
+    raise ValueError("catalog mandatory in options when unity_catalog is enabled")
+
+secret_scope = conf_options.get("secret_scope")
+assert secret_scope, "secret_scope mandatory in options"
+SECRET_SCOPE: Final[str] = secret_scope
+
+timezone = conf_options.get("timezone")
+TIMEZONE: Final[str] = timezone
+
+IS_TYPE_WIDENING: Final[bool] = str(conf_options.get("type_widening", "True")).lower() in ("true", "1", "yes")
+
+path_options = CONF_RUNTIME.get("path_options", {})
+assert path_options, "options mandatory"
+
+fabricks_uri = path_options.get("storage")
+assert fabricks_uri, "storage mandatory in path options"
+FABRICKS_STORAGE: Final[Path] = Path.from_uri(fabricks_uri, regex=variables)
+
+FABRICKS_STORAGE_CREDENTIAL: Final[Optional[str]] = path_options.get("storage_credential")
+
+path_udfs = path_options.get("udfs", "fabricks/udfs")
+assert path_udfs, "path to udfs mandatory"
+PATH_UDFS: Final[Path] = path_runtime.joinpath(path_udfs)
+
+path_parsers = path_options.get("parsers", "fabricks/parsers")
+assert path_parsers, "path to parsers mandatory"
+PATH_PARSERS: Final[Path] = path_runtime.joinpath(path_parsers)
+
+path_extenders = path_options.get("extenders", "fabricks/extenders")
+assert path_extenders, "path to extenders mandatory"
+PATH_EXTENDERS: Final[Path] = path_runtime.joinpath(path_extenders)
+
+path_views = path_options.get("views", "fabricks/views")
+assert path_views, "path to views mandatory"
+PATH_VIEWS: Final[Path] = path_runtime.joinpath(path_views)
+
+path_schedules = path_options.get("schedules", "fabricks/schedules")
+assert path_schedules, "path to schedules mandatory"
+PATH_SCHEDULES: Final[Path] = path_runtime.joinpath(path_schedules)
+
+path_requirements = path_options.get("requirements", "fabricks/requirements")
+assert path_requirements, "path to requirements mandatory"
+PATH_REQUIREMENTS: Final[Path] = path_runtime.joinpath(path_requirements)
+
+path_masks = path_options.get("masks", "fabricks/masks")
+assert path_masks, "path to masks mandatory"
+PATH_MASKS: Final[Path] = path_runtime.joinpath(path_masks)
+
+
+def _get_storage_paths(objects: List[dict]) -> dict:
+    d = {}
+    for o in objects:
+        if o:
+            name = o.get("name")
+            assert name
+            uri = o.get("path_options", {}).get("storage")
+            assert uri
+            d[name] = Path.from_uri(uri, regex=variables)
+    return d
+
+
+PATHS_STORAGE: Final[dict[str, Path]] = {
+    "fabricks": FABRICKS_STORAGE,
+    **_get_storage_paths(BRONZE),
+    **_get_storage_paths(SILVER),
+    **_get_storage_paths(GOLD),
+    **_get_storage_paths(databases),
+}
+
+
+def _get_runtime_path(objects: List[dict]) -> dict:
+    d = {}
+    for o in objects:
+        name = o.get("name")
+        assert name
+        uri = o.get("path_options", {}).get("runtime")
+        assert uri
+        d[name] = path_runtime.joinpath(uri)
+    return d
+
+
+PATHS_RUNTIME: Final[dict[str, Path]] = {
+    **_get_runtime_path(BRONZE),
+    **_get_runtime_path(SILVER),
+    **_get_runtime_path(GOLD),
+}
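For orientation, a minimal sketch of the runtime configuration that `fabricks/context/runtime.py` expects once `yaml.safe_load` has parsed the file at `path_config`: a list containing one mapping with a `conf` key whose `options`, `path_options` and step lists satisfy the asserts above. Every name and URI below is an illustrative assumption, not a value shipped with the package.

```python
# Hypothetical parsed runtime config, written as the Python object yaml.safe_load would return.
data = [
    {
        "conf": {
            "options": {
                "secret_scope": "my-scope",  # mandatory
                "unity_catalog": "true",     # optional; parsed into IS_UNITY_CATALOG
                "catalog": "my_catalog",     # mandatory when unity_catalog is enabled
                "timezone": "Europe/Brussels",
            },
            "path_options": {
                "storage": "abfss://fabricks@myaccount.dfs.core.windows.net",  # mandatory
            },
            "bronze": [
                {
                    "name": "bronze",
                    "path_options": {
                        "storage": "abfss://bronze@myaccount.dfs.core.windows.net",
                        "runtime": "bronze",
                    },
                }
            ],
            "silver": [
                {"name": "silver", "path_options": {"storage": "abfss://silver@myaccount.dfs.core.windows.net", "runtime": "silver"}}
            ],
            "gold": [
                {"name": "gold", "path_options": {"storage": "abfss://gold@myaccount.dfs.core.windows.net", "runtime": "gold"}}
            ],
            "variables": {},
            "credentials": {},
        }
    }
]

conf = [d["conf"] for d in data][0]  # mirrors the lookup in runtime.py
```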
fabricks/context/secret.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Optional
+
+from pyspark.sql import SparkSession
+
+from fabricks.context import IS_UNITY_CATALOG
+from fabricks.utils.spark import spark as _spark
+
+
+@dataclass
+class Secret:
+    pass
+
+
+@dataclass
+class ApplicationRegistration(Secret):
+    secret: str
+    application_id: str
+    directory_id: str
+
+
+@dataclass
+class AccessKey(Secret):
+    key: str
+
+
+_scopes = None
+
+
+@lru_cache(maxsize=None)
+def _get_secret_from_secret_scope(secret_scope: str, name: str) -> str:
+    from databricks.sdk.runtime import dbutils
+
+    global _scopes
+
+    if not _scopes or secret_scope not in _scopes:  # we get the scopes only once, unless you search for something new
+        _scopes = [s.name for s in dbutils.secrets.listScopes()]
+
+    assert secret_scope in _scopes, f"scope {secret_scope} not found"
+
+    return dbutils.secrets.get(scope=secret_scope, key=name)
+
+
+def get_secret_from_secret_scope(secret_scope: str, name: str) -> Secret:
+    secret = _get_secret_from_secret_scope(secret_scope=secret_scope, name=name)
+
+    if name.endswith("application-registration"):
+        s = json.loads(secret)
+        assert s.get("secret"), f"no secret found in {name}"
+        assert s.get("application_id"), f"no application_id found in {name}"
+        assert s.get("directory_id"), f"no directory_id found in {name}"
+
+        return ApplicationRegistration(
+            secret=s.get("secret"),
+            application_id=s.get("application_id"),
+            directory_id=s.get("directory_id"),
+        )
+
+    elif name.endswith("access-key"):
+        return AccessKey(key=secret)
+
+    else:
+        raise ValueError(f"{name} is not valid")
+
+
+def _add_secret_to_spark(key: str, value: str, spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = _spark
+
+    spark.conf.set(key, value)  # needed for check (invalid configuration value detected for fs.azure.account.key)
+
+    if not IS_UNITY_CATALOG:
+        spark._jsc.hadoopConfiguration().set(key, value)  # type: ignore
+
+
+def add_secret_to_spark(secret: Secret, uri: str, spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = _spark
+
+    if isinstance(secret, ApplicationRegistration):
+        _add_secret_to_spark(f"fs.azure.account.auth.type.{uri}", "OAuth", spark=spark)
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth.provider.type.{uri}",
+            "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
+            spark=spark,
+        )
+        _add_secret_to_spark(f"fs.azure.account.oauth2.client.id.{uri}", secret.application_id, spark=spark)
+        _add_secret_to_spark(f"fs.azure.account.oauth2.client.secret.{uri}", secret.secret, spark=spark)
+        _add_secret_to_spark(
+            f"fs.azure.account.oauth2.client.endpoint.{uri}",
+            f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
+            spark=spark,
+        )
+
+    elif isinstance(secret, AccessKey):
+        _add_secret_to_spark(f"fs.azure.account.key.{uri}", secret.key, spark=spark)
+
+    else:
+        raise ValueError("secret is not valid")
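A short sketch of the two secret layouts that `get_secret_from_secret_scope` distinguishes by name suffix. The secret names and values are illustrative assumptions; on Databricks the raw strings would come from `dbutils.secrets.get`, and importing the module assumes a configured fabricks runtime.

```python
import json

from fabricks.context.secret import AccessKey, ApplicationRegistration

# Hypothetical value of a secret named "my-lake-application-registration":
# a JSON document with the three keys asserted above.
raw = json.dumps(
    {
        "secret": "client-secret-value",
        "application_id": "00000000-0000-0000-0000-000000000000",
        "directory_id": "11111111-1111-1111-1111-111111111111",
    }
)
payload = json.loads(raw)
app_reg = ApplicationRegistration(
    secret=payload["secret"],
    application_id=payload["application_id"],
    directory_id=payload["directory_id"],
)

# Hypothetical value of a secret named "my-lake-access-key": the raw storage account key.
access_key = AccessKey(key="storage-account-key-value")

# add_secret_to_spark(app_reg, uri="myaccount.dfs.core.windows.net") would then set the
# fs.azure.account OAuth options for that storage account on the active SparkSession.
```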
fabricks/context/spark_session.py
@@ -0,0 +1,82 @@
+from typing import Optional
+
+from pyspark.sql import SparkSession
+from typing_extensions import deprecated
+
+from fabricks.context import CATALOG, CONF_RUNTIME, IS_UNITY_CATALOG, SECRET_SCOPE
+from fabricks.context.secret import add_secret_to_spark, get_secret_from_secret_scope
+from fabricks.utils.spark import get_dbutils, get_spark
+
+
+def add_catalog_to_spark(spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = get_spark()
+
+    if CATALOG is not None:
+        spark.sql(f"use catalog {CATALOG};")
+
+
+def add_credentials_to_spark(spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = get_spark()
+
+    credentials = CONF_RUNTIME.get("credentials", {})
+    for uri, secret in credentials.items():
+        s = get_secret_from_secret_scope(secret_scope=SECRET_SCOPE, name=secret)
+        add_secret_to_spark(secret=s, uri=uri, spark=spark)
+
+
+def add_spark_options_to_spark(spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = get_spark()
+
+    # delta default options
+    spark.sql("set spark.databricks.delta.schema.autoMerge.enabled = True;")
+    spark.sql("set spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = True;")
+
+    # runtime options
+    spark_options = CONF_RUNTIME.get("spark_options", {})
+    if spark_options:
+        sql_options = spark_options.get("sql", {})
+        for key, value in sql_options.items():
+            spark.sql(f"set {key} = {value};")
+
+        conf_options = spark_options.get("conf", {})
+        for key, value in conf_options.items():
+            spark.conf.set(key, value)
+
+
+def build_spark_session(spark: Optional[SparkSession] = None, app_name: Optional[str] = "default") -> SparkSession:
+    if app_name is None:
+        app_name = "default"
+
+    if spark is not None:
+        _spark = spark
+        _spark.builder.appName(app_name)
+
+    else:
+        _spark = (
+            SparkSession.builder.appName(app_name)  # type: ignore
+            .config("spark.driver.allowMultipleContexts", "true")
+            .enableHiveSupport()
+            .getOrCreate()
+        )
+
+    add_catalog_to_spark(spark=_spark)
+    if not IS_UNITY_CATALOG:
+        add_credentials_to_spark(spark=_spark)
+    add_spark_options_to_spark(spark=_spark)
+
+    return _spark
+
+
+@deprecated("use build_spark_session instead")
+def init_spark_session(spark: Optional[SparkSession] = None):
+    if spark is None:
+        spark = get_spark()
+
+    return build_spark_session(spark=spark)
+
+
+SPARK = build_spark_session(app_name="default")
+DBUTILS = get_dbutils(SPARK)
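A usage sketch for the session builder above, assuming the module is importable in a configured Databricks workspace; the app name is an arbitrary example.

```python
from fabricks.context.spark_session import build_spark_session

# Builds (or wraps) a SparkSession, switches to the configured catalog,
# registers storage credentials when Unity Catalog is disabled, and applies
# the sql/conf spark_options from the runtime configuration.
spark = build_spark_session(app_name="example_app")
spark.sql("select current_catalog()").show()
```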
fabricks/context/utils.py
@@ -0,0 +1,80 @@
+import logging
+
+import fabricks.context.config as c
+import fabricks.context.runtime as r
+
+
+def pprint_runtime(extended: bool = False) -> None:
+    print("=" * 60)
+    print("FABRICKS RUNTIME CONFIGURATION")
+    print("=" * 60)
+
+    # Core Paths Section
+    print("\n📁 CORE CONFIG:")
+    print(f"  Runtime: {c.PATH_RUNTIME.string}")
+    print(f"  Notebooks: {c.PATH_NOTEBOOKS.string}")
+    print(f"  Config: {c.PATH_CONFIG.string}")
+    print(f"  Log Level: {logging.getLevelName(c.LOGLEVEL)}")
+    print(f"  Debug Mode: {'✓' if c.IS_DEBUGMODE else '✗'}")
+    print(f"  Job Config from YAML: {'✓' if c.IS_JOB_CONFIG_FROM_YAML else '✗'}")
+
+    print("\n⚙️ RUNTIME SETTINGS:")
+    print("\n🔄 PIPELINE STEPS:")
+
+    def _print_steps(steps_list, layer_name, icon):
+        if steps_list and any(step for step in steps_list if step):
+            print(f"  {icon} {layer_name}:")
+            for step in steps_list:
+                if step:
+                    step_name = step.get("name", "Unnamed")
+                    print(f"    • {step_name}")
+        else:
+            print(f"  {icon} {layer_name}: No steps")
+
+    _print_steps(r.BRONZE, "Bronze", "🥉")
+    _print_steps(r.SILVER, "Silver", "🥈")
+    _print_steps(r.GOLD, "Gold", "🥇")
+
+    # Storage Configuration Section
+    print("\n💾 STORAGE CONFIGURATION:")
+    print(f"  Storage URI: {r.FABRICKS_STORAGE.string}")
+    print(f"  Storage Credential: {r.FABRICKS_STORAGE_CREDENTIAL or 'Not configured'}")
+
+    # Unity Catalog Section
+    print("\n🏛️ UNITY CATALOG:")
+    print(f"  Enabled: {'✓' if r.IS_UNITY_CATALOG else '✗'}")
+    if r.IS_UNITY_CATALOG and r.CATALOG:
+        print(f"  Catalog: {r.CATALOG}")
+
+    # Security Section
+    print("\n🔐 SECURITY:")
+    print(f"  Secret Scope: {r.SECRET_SCOPE}")
+
+    print("\n🌐 ADDITIONAL SETTINGS:")
+    print(f"  Timezone: {r.TIMEZONE}")
+
+    if extended:
+        # Component Paths Section
+        print("\n🛠️ COMPONENT PATHS:")
+        components = [
+            ("UDFs", r.PATH_UDFS),
+            ("Parsers", r.PATH_PARSERS),
+            ("Extenders", r.PATH_EXTENDERS),
+            ("Views", r.PATH_VIEWS),
+            ("Schedules", r.PATH_SCHEDULES),
+        ]
+
+        for name, path in components:
+            print(f"  {name}: {path.string}")
+
+        # Storage Paths Section
+        print("\n📦 STORAGE PATHS:")
+        for name, path in sorted(r.PATHS_STORAGE.items()):
+            icon = "🏭" if name == "fabricks" else "📊"
+            print(f"  {icon} {name}: {path.string}")
+
+        # Runtime Paths Section
+        if r.PATHS_RUNTIME:
+            print("\n⚡ RUNTIME PATHS:")
+            for name, path in sorted(r.PATHS_RUNTIME.items()):
+                print(f"    📂 {name}: {path.string}")
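A quick usage sketch of the pretty-printer above; the import path follows the RECORD listing at the top of this diff.

```python
from fabricks.context.utils import pprint_runtime

# Prints core config, pipeline steps, storage and security settings;
# extended=True also lists component, storage and runtime paths.
pprint_runtime(extended=True)
```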
fabricks/core/dags/base.py
@@ -0,0 +1,99 @@
+import re
+from typing import Optional
+
+from azure.core.exceptions import AzureError
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import expr
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+
+from fabricks.context import FABRICKS_STORAGE, SPARK
+from fabricks.core.dags.log import TABLE_LOG_HANDLER
+from fabricks.core.dags.utils import get_connection_info
+from fabricks.metastore.table import Table
+from fabricks.utils.azure_table import AzureTable
+
+
+class BaseDags:
+    def __init__(self, schedule_id: str):
+        self.schedule_id = schedule_id
+        self._connection_info = None
+        self._table = None
+
+    @property
+    def storage_account(self) -> str:
+        return FABRICKS_STORAGE.get_storage_account()
+
+    def get_connection_info(self) -> dict:
+        if not self._connection_info:
+            self._connection_info = get_connection_info(self.storage_account)
+        return self._connection_info
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=1, min=1, max=10),
+        retry=retry_if_exception_type((Exception, AzureError)),
+        reraise=True,
+    )
+    def get_table(self) -> AzureTable:
+        if not self._table:
+            cs = self.get_connection_info()
+            self._table = AzureTable(f"t{self.schedule_id}", **dict(cs))  # type: ignore
+
+        if self._table is None:
+            raise ValueError("Azure table for logs not found")
+
+        return self._table
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        if self._table is not None:
+            self._table.__exit__()
+
+    def get_logs(self, step: Optional[str] = None) -> DataFrame:
+        q = f"PartitionKey eq '{self.schedule_id}'"
+        if step:
+            q += f" and Step eq '{step}'"
+
+        d = TABLE_LOG_HANDLER.table.query(q)
+        df = SPARK.createDataFrame(d)
+
+        if "Exception" not in df.columns:
+            df = df.withColumn("Exception", expr("null"))
+        if "NotebookId" not in df.columns:
+            df = df.withColumn("NotebookId", expr("null"))
+
+        df = SPARK.sql(
+            """
+            select
+                ScheduleId as schedule_id,
+                Schedule as schedule,
+                Step as step,
+                JobId as job_id,
+                Job as job,
+                NotebookId as notebook_id,
+                `Level` as `level`,
+                `Message` as `status`,
+                to_timestamp(`Created`, 'dd/MM/yy HH:mm:ss') as `timestamp`,
+                from_json(Exception, 'type STRING, message STRING, traceback STRING') as exception
+            from
+                {df}
+            """,
+            df=df,
+        )
+
+        return df
+
+    def write_logs(self, df: DataFrame):
+        (
+            df.write.format("delta")
+            .mode("overwrite")
+            .option("mergeSchema", "true")
+            .option("partitionOverwriteMode", "dynamic")
+            .save(Table("fabricks", "logs").delta_path.string)
+        )
+
+    def remove_invalid_characters(self, s: str) -> str:
+        out = re.sub("[^a-zA-Z0-9]", "", s)
+        return out
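A sketch of reading back run logs with the context-manager support above; the schedule id and step name are placeholders, and the calls assume the Azure log table and connection info are reachable from the session.

```python
from fabricks.core.dags.base import BaseDags

with BaseDags(schedule_id="0123456789abcdef") as dags:  # placeholder schedule id
    logs_df = dags.get_logs(step="gold")  # filters on PartitionKey and, optionally, Step
    logs_df.show()
    dags.write_logs(logs_df)  # dynamic partition overwrite into the fabricks logs delta table
```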
fabricks/core/dags/generator.py
@@ -0,0 +1,157 @@
+import time
+from typing import Optional, Tuple
+from uuid import uuid4
+
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import lit
+
+from fabricks.context import SPARK
+from fabricks.core.dags.base import BaseDags
+from fabricks.core.dags.log import TABLE_LOG_HANDLER
+from fabricks.utils.azure_queue import AzureQueue
+
+
+class DagGenerator(BaseDags):
+    def __init__(self, schedule: str):
+        self.schedule = schedule
+        schedule_id = str(uuid4().hex)
+        super().__init__(schedule_id=schedule_id)
+
+    def get_jobs(self) -> DataFrame:
+        return SPARK.sql(
+            f"""
+            with logs as (
+                select
+                    l.job_id,
+                    median(l.duration) as median_duration
+                from
+                    fabricks.logs_pivot l
+                where
+                    true
+                    and duration is not null
+                    and date_diff(day, l.start_time , current_date) < 10
+                group by
+                    l.job_id
+            )
+            select
+                'statuses' as PartitionKey,
+                '{self.schedule_id}' as ScheduleId,
+                '{self.schedule}' as Schedule,
+                j.job_id::string as RowKey,
+                j.step as Step,
+                j.job_id as JobId,
+                j.job as Job,
+                'scheduled' as `Status`,
+                max(median_duration) as `MedianDuration`,
+                dense_rank() over (order by max(median_duration) desc) as Rank
+            from
+                fabricks.jobs j
+                inner join fabricks.{self.schedule}_schedule v on j.job_id = v.job_id
+                left join logs l on j.job_id = l.job_id
+            group by all
+            """
+        )
+
+    def get_dependencies(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+        if job_df is None:
+            job_df = self.get_jobs()
+
+        df = SPARK.sql(
+            """
+            select
+                'dependencies' as PartitionKey,
+                d.dependency_id :: string as RowKey,
+                d.dependency_id as DependencyId,
+                j.Step as Step,
+                j.Job as Job,
+                j.JobId as JobId,
+                p.Step as ParentStep,
+                p.Job as Parent,
+                p.JobId as ParentId
+            from
+                fabricks.dependencies d
+                inner join {job} j on d.job_id = j.JobId
+                inner join {job} p on d.parent_id = p.JobId
+            where
+                true
+                and d.parent_id is not null
+                and not d.job_id = d.parent_id
+                and not exists (
+                    select 1
+                    from
+                        fabricks.dependencies_circular dc
+                    where
+                        true
+                        and d.job_id = dc.job_id
+                        and d.parent_id = dc.parent_id
+
+                )
+            group by all
+            """,
+            job=job_df,
+        )
+        df = df.withColumn("ScheduleId", lit(self.schedule_id))
+        return df.withColumn("Schedule", lit(self.schedule))
+
+    def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+        if job_df is None:
+            job_df = self.get_jobs()
+
+        return SPARK.sql(
+            """
+            select
+                Step
+            from
+                {job}
+            group by
+                Step
+            """,
+            job=job_df,
+        )
+
+    def generate(self) -> Tuple[str, DataFrame, DataFrame]:
+        job_df = self.get_jobs()
+        deps_df = self.get_dependencies(job_df)
+        step_df = self.get_steps(job_df)
+
+        table = self.get_table()
+
+        table.create_if_not_exists()
+        table.truncate_all_partitions()
+
+        table.upsert(job_df)
+        table.upsert(deps_df)
+
+        df = SPARK.sql(
+            """
+            select
+                ScheduleId as PartitionKey,
+                ScheduleId,
+                `Schedule`,
+                Step,
+                Job,
+                JobId,
+                date_format(current_timestamp(), 'dd/MM/yy HH:mm:ss') as Created,
+                'INFO' as `Level`,
+                `Status` as `Message`,
+                from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
+                md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, '-1'), "*")) as RowKey
+            from
+                {df}
+            """,
+            df=job_df,
+        )
+
+        TABLE_LOG_HANDLER.table.upsert(df)
+
+        cs = self.get_connection_info()
+        for row in step_df.collect():
+            step = self.remove_invalid_characters(row.Step)
+
+            with AzureQueue(f"q{step}{self.schedule_id}", **dict(cs)) as queue:  # type: ignore
+                queue.create_if_not_exists()
+                queue.clear()
+
+        time.sleep(60)
+
+        return self.schedule_id, job_df, deps_df
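A sketch of generating a DAG for a schedule with the class above; "daily" is an illustrative schedule name and presumes a matching `fabricks.daily_schedule` view exists.

```python
from fabricks.core.dags.generator import DagGenerator

generator = DagGenerator(schedule="daily")  # placeholder schedule name
schedule_id, job_df, dependency_df = generator.generate()

print(schedule_id)    # hex uuid, used as the Azure table suffix and log PartitionKey
job_df.show()         # scheduled jobs ranked by median duration over the last 10 days
dependency_df.show()  # non-circular dependencies between scheduled jobs
```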
fabricks/core/dags/log.py
@@ -0,0 +1,12 @@
+import logging
+from typing import Final
+
+from fabricks.core.dags.utils import get_table
+from fabricks.utils.log import AzureTableLogHandler, get_logger
+
+table = get_table()
+Logger, TableLogHandler = get_logger("dags", logging.INFO, table=table, debugmode=False)
+
+LOGGER: Final[logging.Logger] = Logger
+assert TableLogHandler is not None
+TABLE_LOG_HANDLER: Final[AzureTableLogHandler] = TableLogHandler