fabricks-2024.7.1.5-py3-none-any.whl
This diff shows the content of a publicly released package version as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in the registry.
- fabricks/__init__.py +0 -0
- fabricks/api/__init__.py +7 -0
- fabricks/api/cdc/__init__.py +6 -0
- fabricks/api/cdc/nocdc.py +3 -0
- fabricks/api/cdc/scd1.py +3 -0
- fabricks/api/cdc/scd2.py +3 -0
- fabricks/api/context.py +31 -0
- fabricks/api/core.py +4 -0
- fabricks/api/extenders.py +3 -0
- fabricks/api/log.py +3 -0
- fabricks/api/metastore/__init__.py +10 -0
- fabricks/api/metastore/database.py +3 -0
- fabricks/api/metastore/table.py +3 -0
- fabricks/api/metastore/view.py +6 -0
- fabricks/api/notebooks/__init__.py +0 -0
- fabricks/api/notebooks/cluster.py +6 -0
- fabricks/api/notebooks/deploy/__init__.py +0 -0
- fabricks/api/notebooks/deploy/fabricks.py +147 -0
- fabricks/api/notebooks/deploy/notebooks.py +86 -0
- fabricks/api/notebooks/initialize.py +38 -0
- fabricks/api/notebooks/optimize.py +25 -0
- fabricks/api/notebooks/process.py +50 -0
- fabricks/api/notebooks/run.py +87 -0
- fabricks/api/notebooks/terminate.py +27 -0
- fabricks/api/notebooks/vacuum.py +25 -0
- fabricks/api/parsers.py +3 -0
- fabricks/api/udfs.py +3 -0
- fabricks/api/utils.py +9 -0
- fabricks/cdc/__init__.py +14 -0
- fabricks/cdc/base/__init__.py +4 -0
- fabricks/cdc/base/cdc.py +5 -0
- fabricks/cdc/base/configurator.py +145 -0
- fabricks/cdc/base/generator.py +117 -0
- fabricks/cdc/base/merger.py +107 -0
- fabricks/cdc/base/processor.py +338 -0
- fabricks/cdc/base/types.py +3 -0
- fabricks/cdc/cdc.py +5 -0
- fabricks/cdc/nocdc.py +19 -0
- fabricks/cdc/scd.py +21 -0
- fabricks/cdc/scd1.py +15 -0
- fabricks/cdc/scd2.py +15 -0
- fabricks/cdc/templates/__init__.py +0 -0
- fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
- fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
- fabricks/cdc/templates/merge.sql.jinja +2 -0
- fabricks/cdc/templates/query/__init__.py +0 -0
- fabricks/cdc/templates/query/base.sql.jinja +34 -0
- fabricks/cdc/templates/query/context.sql.jinja +95 -0
- fabricks/cdc/templates/query/current.sql.jinja +32 -0
- fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
- fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
- fabricks/cdc/templates/query/filter.sql.jinja +71 -0
- fabricks/cdc/templates/query/final.sql.jinja +1 -0
- fabricks/cdc/templates/query/hash.sql.jinja +1 -0
- fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
- fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
- fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
- fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
- fabricks/cdc/templates/query.sql.jinja +11 -0
- fabricks/context/__init__.py +51 -0
- fabricks/context/log.py +26 -0
- fabricks/context/runtime.py +143 -0
- fabricks/context/spark.py +43 -0
- fabricks/context/types.py +123 -0
- fabricks/core/__init__.py +4 -0
- fabricks/core/dags/__init__.py +9 -0
- fabricks/core/dags/base.py +72 -0
- fabricks/core/dags/generator.py +154 -0
- fabricks/core/dags/log.py +14 -0
- fabricks/core/dags/processor.py +163 -0
- fabricks/core/dags/terminator.py +26 -0
- fabricks/core/deploy/__init__.py +12 -0
- fabricks/core/deploy/tables.py +76 -0
- fabricks/core/deploy/views.py +417 -0
- fabricks/core/extenders.py +29 -0
- fabricks/core/jobs/__init__.py +20 -0
- fabricks/core/jobs/base/__init__.py +10 -0
- fabricks/core/jobs/base/checker.py +89 -0
- fabricks/core/jobs/base/configurator.py +323 -0
- fabricks/core/jobs/base/error.py +16 -0
- fabricks/core/jobs/base/generator.py +391 -0
- fabricks/core/jobs/base/invoker.py +119 -0
- fabricks/core/jobs/base/job.py +5 -0
- fabricks/core/jobs/base/processor.py +204 -0
- fabricks/core/jobs/base/types.py +191 -0
- fabricks/core/jobs/bronze.py +333 -0
- fabricks/core/jobs/get_job.py +126 -0
- fabricks/core/jobs/get_job_conf.py +115 -0
- fabricks/core/jobs/get_job_id.py +26 -0
- fabricks/core/jobs/get_jobs.py +89 -0
- fabricks/core/jobs/gold.py +218 -0
- fabricks/core/jobs/silver.py +354 -0
- fabricks/core/parsers/__init__.py +12 -0
- fabricks/core/parsers/base.py +91 -0
- fabricks/core/parsers/decorator.py +11 -0
- fabricks/core/parsers/get_parser.py +25 -0
- fabricks/core/parsers/types.py +6 -0
- fabricks/core/schedules.py +89 -0
- fabricks/core/scripts/__init__.py +13 -0
- fabricks/core/scripts/armageddon.py +82 -0
- fabricks/core/scripts/generate.py +20 -0
- fabricks/core/scripts/job_schema.py +28 -0
- fabricks/core/scripts/optimize.py +45 -0
- fabricks/core/scripts/process.py +9 -0
- fabricks/core/scripts/stats.py +48 -0
- fabricks/core/scripts/steps.py +27 -0
- fabricks/core/scripts/terminate.py +6 -0
- fabricks/core/scripts/vacuum.py +45 -0
- fabricks/core/site_packages.py +55 -0
- fabricks/core/steps/__init__.py +4 -0
- fabricks/core/steps/base.py +282 -0
- fabricks/core/steps/get_step.py +10 -0
- fabricks/core/steps/get_step_conf.py +33 -0
- fabricks/core/steps/types.py +7 -0
- fabricks/core/udfs.py +106 -0
- fabricks/core/utils.py +69 -0
- fabricks/core/views.py +36 -0
- fabricks/metastore/README.md +3 -0
- fabricks/metastore/__init__.py +5 -0
- fabricks/metastore/database.py +71 -0
- fabricks/metastore/pyproject.toml +20 -0
- fabricks/metastore/relational.py +61 -0
- fabricks/metastore/table.py +529 -0
- fabricks/metastore/utils.py +35 -0
- fabricks/metastore/view.py +40 -0
- fabricks/utils/README.md +3 -0
- fabricks/utils/__init__.py +0 -0
- fabricks/utils/azure_queue.py +63 -0
- fabricks/utils/azure_table.py +99 -0
- fabricks/utils/console.py +51 -0
- fabricks/utils/container.py +57 -0
- fabricks/utils/fdict.py +28 -0
- fabricks/utils/helpers.py +89 -0
- fabricks/utils/log.py +153 -0
- fabricks/utils/path.py +206 -0
- fabricks/utils/pip.py +61 -0
- fabricks/utils/pydantic.py +92 -0
- fabricks/utils/pyproject.toml +18 -0
- fabricks/utils/read/__init__.py +11 -0
- fabricks/utils/read/read.py +305 -0
- fabricks/utils/read/read_excel.py +5 -0
- fabricks/utils/read/read_yaml.py +43 -0
- fabricks/utils/read/types.py +3 -0
- fabricks/utils/schema/__init__.py +7 -0
- fabricks/utils/schema/get_json_schema_for_type.py +161 -0
- fabricks/utils/schema/get_schema_for_type.py +93 -0
- fabricks/utils/secret.py +78 -0
- fabricks/utils/sqlglot.py +48 -0
- fabricks/utils/write/__init__.py +8 -0
- fabricks/utils/write/delta.py +46 -0
- fabricks/utils/write/stream.py +27 -0
- fabricks-2024.7.1.5.dist-info/METADATA +212 -0
- fabricks-2024.7.1.5.dist-info/RECORD +154 -0
- fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
Selected file contents from the wheel follow; every file in this release is new (all hunks are pure additions).

fabricks/context/runtime.py
@@ -0,0 +1,143 @@
```python
import os
import sys
from typing import Final, List

import yaml
from databricks.sdk.runtime import spark

from fabricks.utils.path import Path

try:
    runtime = Path(os.environ["FABRICKS_RUNTIME"], assume_git=True)
    assert runtime, "runtime mandatory in cluster config"
    PATH_RUNTIME: Final[Path] = runtime

    notebooks = Path(os.environ["FABRICKS_NOTEBOOKS"], assume_git=True)
    assert notebooks, "notebooks mandatory in cluster config"
    PATH_NOTEBOOKS: Final[Path] = notebooks

    version = os.environ["FABRICKS_VERSION"]
    assert version, "version mandatory in cluster config"
    VERSION: Final[str] = version

    PATH_LIBRARIES = "/dbfs/mnt/fabricks/site-packages"
    spark._sc._python_includes.append(PATH_LIBRARIES)  # type: ignore
    sys.path.append(PATH_LIBRARIES)

    try:
        is_test = os.environ["FABRICKS_IS_TEST"] == "TRUE"
    except Exception:
        is_test = False
    IS_TEST: Final[bool] = is_test

    try:
        is_debug = os.environ["FABRICKS_IS_DEBUG"] == "TRUE"
    except Exception:
        is_debug = False
    IS_DEBUG: Final[bool] = is_debug

    try:
        is_live = os.environ["FABRICKS_IS_LIVE"] == "TRUE"
    except Exception:
        is_live = False
    IS_LIVE: Final[bool] = is_live

    conf_path = PATH_RUNTIME.join(
        "fabricks",
        f"conf.{spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId')}.yml",
    )
    with open(conf_path.string) as f:
        data = yaml.safe_load(f)

    conf: dict = [d["conf"] for d in data][0]
    assert conf, "conf mandatory"
    CONF_RUNTIME: Final[dict] = conf

    BRONZE = CONF_RUNTIME.get("bronze", [{}])
    SILVER = CONF_RUNTIME.get("silver", [{}])
    GOLD = CONF_RUNTIME.get("gold", [{}])
    STEPS = BRONZE + SILVER + GOLD

    databases = CONF_RUNTIME.get("databases", [{}])
    credentials = CONF_RUNTIME.get("credentials", {})
    variables = CONF_RUNTIME.get("variables", {})
    VARIABLES: dict = variables

    conf_options = CONF_RUNTIME.get("options", {})
    assert conf_options, "options mandatory"

    secret_scope = conf_options.get("secret_scope")
    assert secret_scope, "secret_scope mandatory in options"
    SECRET_SCOPE: Final[str] = secret_scope

    path_options = CONF_RUNTIME.get("path_options", {})
    assert path_options, "options mandatory"

    fabricks_uri = path_options.get("storage")
    assert fabricks_uri, "storage mandatory in path options"
    FABRICKS_STORAGE: Final[Path] = Path.from_uri(fabricks_uri, regex=variables)

    path_udfs = path_options.get("udfs")
    assert path_udfs, "udfs mandatory in path options"
    PATH_UDFS: Final[Path] = PATH_RUNTIME.join(path_udfs)

    path_parsers = path_options.get("parsers")
    assert path_parsers, "parsers mandatory in path options"
    PATH_PARSERS: Final[Path] = PATH_RUNTIME.join(path_parsers)

    path_extenders = path_options.get("extenders")
    assert path_extenders, "extenders mandatory in path options"
    PATH_EXTENDERS: Final[Path] = PATH_RUNTIME.join(path_extenders)

    path_views = path_options.get("views")
    assert path_views, "views mandatory in path options"
    PATH_VIEWS: Final[Path] = PATH_RUNTIME.join(path_views)

    path_schedules = path_options.get("schedules")
    assert path_schedules, "schedules mandatory in path options"
    PATH_SCHEDULES: Final[Path] = PATH_RUNTIME.join(path_schedules)

    path_requirements = path_options.get("requirements")
    assert path_requirements, "requirements mandatory in path options"
    PATH_REQUIREMENTS: Final[Path] = PATH_RUNTIME.join(path_requirements)

    def _get_storage_paths(objects: List[dict]) -> dict:
        d = {}
        for o in objects:
            if o:
                name = o.get("name")
                assert name
                uri = o.get("path_options", {}).get("storage")
                assert uri
                d[name] = Path.from_uri(uri, regex=variables)
        return d

    PATHS_STORAGE: Final[dict[str, Path]] = {
        "fabricks": FABRICKS_STORAGE,
        **_get_storage_paths(BRONZE),
        **_get_storage_paths(SILVER),
        **_get_storage_paths(GOLD),
        **_get_storage_paths(databases),
    }

    def _get_runtime_path(objects: List[dict]) -> dict:
        d = {}
        for o in objects:
            name = o.get("name")
            assert name
            uri = o.get("path_options", {}).get("runtime")
            assert uri
            d[name] = PATH_RUNTIME.join(uri)
        return d

    PATHS_RUNTIME: Final[dict[str, Path]] = {
        **_get_runtime_path(BRONZE),
        **_get_runtime_path(SILVER),
        **_get_runtime_path(GOLD),
    }

except KeyError as e:
    raise e

except AssertionError as e:
    raise e
```
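Everything above is resolved at import time from three cluster environment variables plus a per-workspace `conf.<org id>.yml`, whose parsed form is a list containing a single `conf` mapping. A minimal sketch of those inputs, with made-up names and URIs (only the key names are taken from the asserts above):

```python
import os

# Illustrative cluster environment; runtime.py reads these at import time
# (variable names are real, values are made up).
os.environ["FABRICKS_RUNTIME"] = "/Workspace/Repos/acme/runtime"
os.environ["FABRICKS_NOTEBOOKS"] = "/Workspace/Repos/acme/notebooks"
os.environ["FABRICKS_VERSION"] = "2024.7.1.5"

# Parsed shape of conf.<org id>.yml: a list with one {"conf": ...} entry.
data = [
    {
        "conf": {
            "name": "acme",
            "options": {"secret_scope": "acme-kv"},
            "path_options": {
                "storage": "abfss://fabricks@acmelake.dfs.core.windows.net",
                "udfs": "fabricks/udfs",
                "parsers": "fabricks/parsers",
                "extenders": "fabricks/extenders",
                "views": "fabricks/views",
                "schedules": "fabricks/schedules",
                "requirements": "requirements.txt",
            },
            "bronze": [
                {
                    "name": "bronze",
                    "path_options": {
                        "storage": "abfss://bronze@acmelake.dfs.core.windows.net",
                        "runtime": "bronze",
                    },
                }
            ],
        }
    }
]
```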
fabricks/context/spark.py
@@ -0,0 +1,43 @@
```python
from typing import Optional, Tuple

from pyspark.dbutils import DBUtils
from pyspark.sql import SparkSession

from fabricks.context.runtime import CONF_RUNTIME, SECRET_SCOPE
from fabricks.utils.secret import add_secret_to_spark, get_secret_from_secret_scope


def build_spark_session(new: Optional[bool] = False, log: Optional[bool] = False) -> Tuple[SparkSession, DBUtils]:
    if new:
        spark = SparkSession.builder.getOrCreate().newSession()  # type: ignore

        catalog = CONF_RUNTIME.get("options", {}).get("catalog")
        if catalog:
            spark.sql(f"use catalog {catalog};")

        # delta
        spark.sql("set spark.databricks.delta.schema.autoMerge.enabled = True;")
        spark.sql("set spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = True;")

        spark_options = CONF_RUNTIME.get("spark_options", {})
        if spark_options:
            sql_options = spark_options.get("sql", {})
            for key, value in sql_options.items():
                spark.sql(f"set {key} = {value};")

            conf_options = spark_options.get("conf", {})
            for key, value in conf_options.items():
                spark.conf.set(key, value)

        credentials = CONF_RUNTIME.get("credentials", {})
        for uri, secret in credentials.items():
            s = get_secret_from_secret_scope(secret_scope=SECRET_SCOPE, name=secret)
            add_secret_to_spark(secret=s, uri=uri)

    else:
        spark = SparkSession.builder.getOrCreate()  # type: ignore

    return spark, DBUtils(spark)


build_spark_session(new=True, log=True)
```
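Note that the module calls `build_spark_session(new=True, log=True)` once at import time, and that the `log` flag is accepted but never used in this version. A minimal usage sketch, assuming a Databricks runtime where fabricks is importable:

```python
from fabricks.context.spark import build_spark_session

# Reuse the existing session (skips catalog, Delta and credential setup).
spark, dbutils = build_spark_session()

# Isolated session with the runtime's catalog, Delta merge options, extra
# spark_options and storage credentials applied.
spark, dbutils = build_spark_session(new=True)
```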
fabricks/context/types.py
@@ -0,0 +1,123 @@
```python
from typing import List, Optional, TypedDict


class RuntimePathOptions(TypedDict):
    storage: str
    udfs: str
    extenders: str
    parsers: str
    schedules: str
    views: str
    requirements: str


class RuntimeTimeoutOptions(TypedDict):
    step: int
    job: int
    pre_run: int
    post_run: int


class StepTimeoutOptions(TypedDict):
    step: Optional[int]
    job: Optional[int]
    pre_run: Optional[int]
    post_run: Optional[int]


class RuntimeOptions(TypedDict):
    secret_scope: str
    catalog: str
    workers: int
    timeouts: RuntimeTimeoutOptions
    retention_days: int


class SparkOptions(TypedDict):
    sql: dict
    conf: dict


class StepPathOptions(TypedDict):
    runtime: str
    storage: str


class InvokeOptions(TypedDict):
    notebook: str
    arguments: Optional[dict[str, str]]


class StepOptions(TypedDict):
    order: int
    workers: Optional[int]
    timeouts: StepTimeoutOptions
    extender: Optional[str]
    pre_run: Optional[InvokeOptions]
    post_run: Optional[InvokeOptions]


class SilverOptions(StepOptions):
    parent: str
    stream: Optional[bool]
    local_checkpoint: Optional[bool]


class GoldOptions(StepOptions):
    schema_drift: Optional[bool]


class Step(TypedDict):
    name: str


class TableOptions(TypedDict):
    powerbi: Optional[bool]
    liquid_clustering: Optional[bool]
    properties: Optional[dict[str, str]]
    retention_days: Optional[int]


class Bronze(Step):
    options: StepOptions
    path_options: StepPathOptions
    table_options: Optional[TableOptions]


class Silver(Step):
    options: SilverOptions
    path_options: StepPathOptions
    table_options: Optional[TableOptions]


class Gold(Step):
    options: GoldOptions
    path_options: StepPathOptions
    table_options: Optional[TableOptions]


class PowerBI(Step):
    pass


class DatabasePathOptions(TypedDict):
    storage: str


class Database(TypedDict):
    name: str
    path_options: DatabasePathOptions


class Conf(TypedDict):
    name: str
    options: RuntimeOptions
    path_options: RuntimePathOptions
    spark_options: SparkOptions
    bronze: Optional[List[Bronze]]
    silver: Optional[List[Silver]]
    gold: Optional[List[Gold]]
    powerbi: Optional[List[PowerBI]]
    databases: Optional[List[Database]]
    variables: Optional[List[dict[str, str]]]
    credentials: Optional[List[dict[str, str]]]
```
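These TypedDicts give the runtime configuration a typed shape. A hypothetical `Conf` literal that satisfies the schema (all values are illustrative; the `Optional[...]` fields are still required keys under a total TypedDict, so they are set to `None` here):

```python
from fabricks.context.types import Conf

conf: Conf = {
    "name": "acme",
    "options": {
        "secret_scope": "acme-kv",
        "catalog": "hive_metastore",
        "workers": 8,
        "timeouts": {"step": 3600, "job": 1800, "pre_run": 600, "post_run": 600},
        "retention_days": 7,
    },
    "path_options": {
        "storage": "abfss://fabricks@acmelake.dfs.core.windows.net",
        "udfs": "fabricks/udfs",
        "extenders": "fabricks/extenders",
        "parsers": "fabricks/parsers",
        "schedules": "fabricks/schedules",
        "views": "fabricks/views",
        "requirements": "requirements.txt",
    },
    "spark_options": {"sql": {}, "conf": {}},
    "bronze": None,
    "silver": None,
    "gold": None,
    "powerbi": None,
    "databases": None,
    "variables": None,
    "credentials": None,
}
```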
fabricks/core/dags/base.py
@@ -0,0 +1,72 @@
```python
import re
from typing import Optional, cast

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame
from pyspark.sql.functions import expr

from fabricks.context import FABRICKS_STORAGE
from fabricks.core.dags.log import DagsTableLogger
from fabricks.metastore.table import Table
from fabricks.utils.azure_table import AzureTable
from fabricks.utils.secret import AccessKey, get_secret_from_secret_scope


class BaseDags:
    def __init__(self, schedule_id: str):
        self.schedule_id = schedule_id

    def get_connection_string(self) -> str:
        storage_account = FABRICKS_STORAGE.get_storage_account()
        secret = get_secret_from_secret_scope("bmskv", f"{storage_account}-access-key")
        access_key = cast(AccessKey, secret).key
        connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account};AccountKey={access_key};EndpointSuffix=core.windows.net"
        return connection_string

    def get_table(self) -> AzureTable:
        cs = self.get_connection_string()
        table = AzureTable(f"t{self.schedule_id}", connection_string=cs)
        return table

    def get_logs(self, step: Optional[str] = None) -> DataFrame:
        q = f"PartitionKey eq '{self.schedule_id}'"
        if step:
            q += f" and Step eq '{step}'"

        d = DagsTableLogger.table.query(q)
        df = spark.createDataFrame(d)
        if "Exception" not in df.columns:
            df = df.withColumn("Exception", expr("null"))

        df = spark.sql(
            """
            select
              ScheduleId as schedule_id,
              Schedule as schedule,
              Step as step,
              JobId as job_id,
              Job as job,
              NotebookId as notebook_id,
              `Level` as `level`,
              `Message` as `status`,
              to_timestamp(`Created`, 'dd/MM/yy HH:mm:ss') as `timestamp`,
              from_json(Exception, 'type STRING, message STRING, traceback STRING') as exception
            from
              {df}
            """,
            df=df,
        )
        return df

    def write_logs(self, df: DataFrame):
        (
            df.write.format("delta")
            .mode("overwrite")
            .option("mergeSchema", "true")
            .option("partitionOverwriteMode", "dynamic")
            .save(Table("fabricks", "logs").deltapath.string)
        )

    def remove_invalid_characters(self, s: str) -> str:
        out = re.sub("[^a-zA-Z0-9]", "", s)
        return out
```
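Note that `get_connection_string` hardcodes the `bmskv` secret scope rather than using `SECRET_SCOPE` as `fabricks/core/dags/log.py` does. A usage sketch under an assumed deployment (the schedule id below is made up); `get_logs` reads back the Azure Table rows written during a run and `write_logs` persists them to the `fabricks.logs` Delta table:

```python
from fabricks.core.dags.base import BaseDags

dags = BaseDags(schedule_id="0f8c2e6d2c4b4b6e9d7a1b2c3d4e5f60")  # made-up id

logs_df = dags.get_logs(step="bronze")  # Azure Table rows as a DataFrame
dags.write_logs(logs_df)                # dynamic-partition overwrite into fabricks.logs
```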
fabricks/core/dags/generator.py
@@ -0,0 +1,154 @@
```python
import time
from typing import Optional, Tuple
from uuid import uuid4

from databricks.sdk.runtime import spark
from pyspark.sql import DataFrame

from fabricks.core.dags.base import BaseDags
from fabricks.core.dags.log import DagsTableLogger
from fabricks.utils.azure_queue import AzureQueue


class DagGenerator(BaseDags):
    def __init__(self, schedule: str):
        self.schedule = schedule
        schedule_id = str(uuid4().hex)
        super().__init__(schedule_id=schedule_id)

    def get_jobs(self) -> DataFrame:
        return spark.sql(
            f"""
            with logs as (
              select
                l.job_id,
                median(l.duration) as median_duration
              from
                fabricks.logs_pivot l
              where
                true
                and duration is not null
                and date_diff(day, l.start_time, current_date) < 10
              group by
                l.job_id
            )
            select
              'statuses' as PartitionKey,
              '{self.schedule_id}' as ScheduleId,
              '{self.schedule}' as Schedule,
              j.job_id::string as RowKey,
              j.step as Step,
              j.job_id as JobId,
              j.job as Job,
              'scheduled' as `Status`,
              max(median_duration) as `MedianDuration`,
              dense_rank() over (order by max(median_duration) desc) as Rank
            from
              fabricks.jobs j
              inner join fabricks.{self.schedule}_schedule v on j.job_id = v.job_id
              left join logs l on j.job_id = l.job_id
            group by all
            """
        )

    def get_dependencies(self, job_df: Optional[DataFrame] = None) -> DataFrame:
        if job_df is None:
            job_df = self.get_jobs()

        return spark.sql(
            """
            select
              'dependencies' as PartitionKey,
              d.dependency_id::string as RowKey,
              {schedule_id} as ScheduleId,
              {schedule} as Schedule,
              d.dependency_id as DependencyId,
              j.Step as Step,
              j.Job as Job,
              j.JobId as JobId,
              p.Step as ParentStep,
              p.Job as Parent,
              p.JobId as ParentId
            from
              fabricks.dependencies d
              inner join {job} j on d.job_id = j.JobId
              inner join {job} p on d.parent_id = p.JobId
            where
              true
              and d.parent_id is not null
              and not d.job_id = d.parent_id
              and not exists (
                select 1
                from
                  fabricks.dependencies_circular dc
                where
                  true
                  and d.job_id = dc.job_id
                  and d.parent_id = dc.parent_id
              )
            group by all
            """,
            job=job_df,
            schedule=self.schedule,
            schedule_id=self.schedule_id,
        )

    def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
        if job_df is None:
            job_df = self.get_jobs()

        return spark.sql(
            """
            select
              Step
            from
              {job}
            group by
              Step
            """,
            job=job_df,
        )

    def generate(self) -> Tuple[str, DataFrame, DataFrame]:
        job_df = self.get_jobs()
        deps_df = self.get_dependencies(job_df)
        step_df = self.get_steps(job_df)

        table = self.get_table()
        table.create_if_not_exists()
        table.truncate_all_partitions()
        table.upsert(job_df)
        table.upsert(deps_df)

        df = spark.sql(
            """
            select
              ScheduleId as PartitionKey,
              ScheduleId,
              `Schedule`,
              Step,
              Job,
              JobId,
              date_format(current_timestamp(), 'dd/MM/yy HH:mm:ss') as Created,
              'INFO' as `Level`,
              `Status` as `Message`,
              from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
              md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, -1), "*")) as RowKey
            from
              {df}
            """,
            df=job_df,
        )
        DagsTableLogger.table.upsert(df)

        cs = self.get_connection_string()
        for row in step_df.collect():
            step = self.remove_invalid_characters(row.Step)
            queue = AzureQueue(f"q{step}{self.schedule_id}", connection_string=cs)
            queue.create_if_not_exists()
            queue.clear()

        time.sleep(60)

        return self.schedule_id, job_df, deps_df
```
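`generate()` snapshots the scheduled jobs and their non-circular dependencies into the per-schedule Azure Table, writes a `scheduled` log row per job, provisions one queue per step (named `q<step><schedule_id>`), then sleeps 60 seconds before returning. A usage sketch; the schedule name is illustrative and must match an existing `fabricks.<schedule>_schedule` view:

```python
from fabricks.core.dags.generator import DagGenerator

generator = DagGenerator(schedule="nightly")  # "nightly" is a made-up schedule name
schedule_id, job_df, deps_df = generator.generate()
```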
fabricks/core/dags/log.py
@@ -0,0 +1,14 @@
```python
import logging
from typing import cast

from fabricks.context.runtime import FABRICKS_STORAGE, SECRET_SCOPE
from fabricks.utils.azure_table import AzureTable
from fabricks.utils.log import get_logger
from fabricks.utils.secret import AccessKey, get_secret_from_secret_scope

storage_account = FABRICKS_STORAGE.get_storage_account()
secret = get_secret_from_secret_scope(SECRET_SCOPE, f"{storage_account}-access-key")
access_key = cast(AccessKey, secret).key

table = AzureTable("dags", storage_account=storage_account, access_key=access_key)
DagsLogger, DagsTableLogger = get_logger("dags", logging.DEBUG, table=table)
```
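`get_logger` returns both a conventional logger and a table-backed logger. A minimal sketch, assuming `DagsLogger` follows the standard `logging.Logger` interface and `DagsTableLogger` exposes the backing Azure Table via `.table` (as used by `BaseDags.get_logs` and `DagGenerator.generate`):

```python
from fabricks.core.dags.log import DagsLogger, DagsTableLogger

DagsLogger.debug("dag processing started")  # assumption: logging.Logger interface
rows = DagsTableLogger.table.query("PartitionKey eq 'statuses'")
```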