fabricks-3.0.11-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
fabricks/context/runtime.py
@@ -0,0 +1,117 @@
+ from typing import Final, List, Optional
+
+ import yaml
+
+ from fabricks.context.config import path_config, path_runtime
+ from fabricks.utils.path import Path
+
+ with open(str(path_config)) as f:
+     data = yaml.safe_load(f)
+
+ conf: dict = [d["conf"] for d in data][0]
+ assert conf, "conf mandatory"
+ CONF_RUNTIME: Final[dict] = conf
+
+ BRONZE = CONF_RUNTIME.get("bronze", [{}])
+ SILVER = CONF_RUNTIME.get("silver", [{}])
+ GOLD = CONF_RUNTIME.get("gold", [{}])
+ STEPS = BRONZE + SILVER + GOLD
+
+ databases = CONF_RUNTIME.get("databases", [{}])
+ credentials = CONF_RUNTIME.get("credentials", {})
+ variables = CONF_RUNTIME.get("variables", {})
+ VARIABLES: dict = variables
+
+ conf_options = CONF_RUNTIME.get("options", {})
+ assert conf_options, "options mandatory"
+
+ IS_UNITY_CATALOG: Final[bool] = str(conf_options.get("unity_catalog", "False")).lower() in ("true", "1", "yes")
+ CATALOG: Optional[str] = conf_options.get("catalog")
+
+ if IS_UNITY_CATALOG and not CATALOG:
+     raise ValueError("catalog mandatory in options when unity_catalog is enabled")
+
+ secret_scope = conf_options.get("secret_scope")
+ assert secret_scope, "secret_scope mandatory in options"
+ SECRET_SCOPE: Final[str] = secret_scope
+
+ timezone = conf_options.get("timezone")
+ TIMEZONE: Final[str] = timezone
+
+ IS_TYPE_WIDENING: Final[bool] = str(conf_options.get("type_widening", "True")).lower() in ("true", "1", "yes")
+
+ path_options = CONF_RUNTIME.get("path_options", {})
+ assert path_options, "options mandatory"
+
+ fabricks_uri = path_options.get("storage")
+ assert fabricks_uri, "storage mandatory in path options"
+ FABRICKS_STORAGE: Final[Path] = Path.from_uri(fabricks_uri, regex=variables)
+
+ FABRICKS_STORAGE_CREDENTIAL: Final[Optional[str]] = path_options.get("storage_credential")
+
+ path_udfs = path_options.get("udfs", "fabricks/udfs")
+ assert path_udfs, "path to udfs mandatory"
+ PATH_UDFS: Final[Path] = path_runtime.joinpath(path_udfs)
+
+ path_parsers = path_options.get("parsers", "fabricks/parsers")
+ assert path_parsers, "path to parsers mandatory"
+ PATH_PARSERS: Final[Path] = path_runtime.joinpath(path_parsers)
+
+ path_extenders = path_options.get("extenders", "fabricks/extenders")
+ assert path_extenders, "path to extenders mandatory"
+ PATH_EXTENDERS: Final[Path] = path_runtime.joinpath(path_extenders)
+
+ path_views = path_options.get("views", "fabricks/views")
+ assert path_views, "path to views mandatory"
+ PATH_VIEWS: Final[Path] = path_runtime.joinpath(path_views)
+
+ path_schedules = path_options.get("schedules", "fabricks/schedules")
+ assert path_schedules, "path to schedules mandatory"
+ PATH_SCHEDULES: Final[Path] = path_runtime.joinpath(path_schedules)
+
+ path_requirements = path_options.get("requirements", "fabricks/requirements")
+ assert path_requirements, "path to requirements mandatory"
+ PATH_REQUIREMENTS: Final[Path] = path_runtime.joinpath(path_requirements)
+
+ path_masks = path_options.get("masks", "fabricks/masks")
+ assert path_masks, "path to masks mandatory"
+ PATH_MASKS: Final[Path] = path_runtime.joinpath(path_masks)
+
+
+ def _get_storage_paths(objects: List[dict]) -> dict:
+     d = {}
+     for o in objects:
+         if o:
+             name = o.get("name")
+             assert name
+             uri = o.get("path_options", {}).get("storage")
+             assert uri
+             d[name] = Path.from_uri(uri, regex=variables)
+     return d
+
+
+ PATHS_STORAGE: Final[dict[str, Path]] = {
+     "fabricks": FABRICKS_STORAGE,
+     **_get_storage_paths(BRONZE),
+     **_get_storage_paths(SILVER),
+     **_get_storage_paths(GOLD),
+     **_get_storage_paths(databases),
+ }
+
+
+ def _get_runtime_path(objects: List[dict]) -> dict:
+     d = {}
+     for o in objects:
+         name = o.get("name")
+         assert name
+         uri = o.get("path_options", {}).get("runtime")
+         assert uri
+         d[name] = path_runtime.joinpath(uri)
+     return d
+
+
+ PATHS_RUNTIME: Final[dict[str, Path]] = {
+     **_get_runtime_path(BRONZE),
+     **_get_runtime_path(SILVER),
+     **_get_runtime_path(GOLD),
+ }
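The module above builds its constants from the first YAML document's conf mapping. As a hedged illustration (every name, URI and scope below is hypothetical, not taken from any real runtime), this is the kind of parsed structure the accessors expect, i.e. the Python object yaml.safe_load would return:

# Illustrative only: the shape of the runtime YAML read by the module above.
example_data = [
    {
        "conf": {
            "options": {
                "unity_catalog": False,
                "secret_scope": "my-secret-scope",  # mandatory
                "timezone": "UTC",
                "type_widening": True,
            },
            "path_options": {
                # mandatory; udfs, parsers, extenders, views, schedules,
                # requirements and masks fall back to the fabricks/<name> defaults.
                "storage": "abfss://fabricks@mystorageaccount.dfs.core.windows.net",
            },
            "bronze": [
                {
                    "name": "bronze",
                    "path_options": {
                        "storage": "abfss://bronze@mystorageaccount.dfs.core.windows.net",
                        "runtime": "bronze",
                    },
                }
            ],
            "silver": [],
            "gold": [],
            "databases": [],
            "credentials": {},
            "variables": {},
        }
    }
]

With an input like this, STEPS would contain only the bronze step, and PATHS_STORAGE would map both "fabricks" and "bronze" to their storage paths without tripping any of the asserts.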
fabricks/context/secret.py
@@ -0,0 +1,103 @@
+ from __future__ import annotations
+
+ import json
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from typing import Optional
+
+ from pyspark.sql import SparkSession
+
+ from fabricks.context import IS_UNITY_CATALOG
+ from fabricks.utils.spark import spark as _spark
+
+
+ @dataclass
+ class Secret:
+     pass
+
+
+ @dataclass
+ class ApplicationRegistration(Secret):
+     secret: str
+     application_id: str
+     directory_id: str
+
+
+ @dataclass
+ class AccessKey(Secret):
+     key: str
+
+
+ _scopes = None
+
+
+ @lru_cache(maxsize=None)
+ def _get_secret_from_secret_scope(secret_scope: str, name: str) -> str:
+     from databricks.sdk.runtime import dbutils
+
+     global _scopes
+
+     if not _scopes or secret_scope not in _scopes:  # we get the scopes only once, unless you search for something new
+         _scopes = [s.name for s in dbutils.secrets.listScopes()]
+
+     assert secret_scope in _scopes, f"scope {secret_scope} not found"
+
+     return dbutils.secrets.get(scope=secret_scope, key=name)
+
+
+ def get_secret_from_secret_scope(secret_scope: str, name: str) -> Secret:
+     secret = _get_secret_from_secret_scope(secret_scope=secret_scope, name=name)
+
+     if name.endswith("application-registration"):
+         s = json.loads(secret)
+         assert s.get("secret"), f"no secret found in {name}"
+         assert s.get("application_id"), f"no application_id found in {name}"
+         assert s.get("directory_id"), f"no directory_id found in {name}"
+
+         return ApplicationRegistration(
+             secret=s.get("secret"),
+             application_id=s.get("application_id"),
+             directory_id=s.get("directory_id"),
+         )
+
+     elif name.endswith("access-key"):
+         return AccessKey(key=secret)
+
+     else:
+         raise ValueError(f"{name} is not valid")
+
+
+ def _add_secret_to_spark(key: str, value: str, spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = _spark
+
+     spark.conf.set(key, value)  # needed for check (invalid configuration value detected for fs.azure.account.key)
+
+     if not IS_UNITY_CATALOG:
+         spark._jsc.hadoopConfiguration().set(key, value)  # type: ignore
+
+
+ def add_secret_to_spark(secret: Secret, uri: str, spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = _spark
+
+     if isinstance(secret, ApplicationRegistration):
+         _add_secret_to_spark(f"fs.azure.account.auth.type.{uri}", "OAuth", spark=spark)
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth.provider.type.{uri}",
+             "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
+             spark=spark,
+         )
+         _add_secret_to_spark(f"fs.azure.account.oauth2.client.id.{uri}", secret.application_id, spark=spark)
+         _add_secret_to_spark(f"fs.azure.account.oauth2.client.secret.{uri}", secret.secret, spark=spark)
+         _add_secret_to_spark(
+             f"fs.azure.account.oauth2.client.endpoint.{uri}",
+             f"https://login.microsoftonline.com/{secret.directory_id}/oauth2/token",
+             spark=spark,
+         )
+
+     elif isinstance(secret, AccessKey):
+         _add_secret_to_spark(f"fs.azure.account.key.{uri}", secret.key, spark=spark)
+
+     else:
+         raise ValueError("secret is not valid")
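As a usage sketch (the scope name, secret name and storage endpoint below are hypothetical), the two public helpers above are meant to be chained: fetch a typed secret, then register it for an ADLS endpoint.

# Hypothetical names; assumes a Databricks runtime where dbutils is available.
from fabricks.context.secret import add_secret_to_spark, get_secret_from_secret_scope

# The secret name must end with "application-registration" (a JSON payload with
# secret/application_id/directory_id) or "access-key" (a raw storage account key).
secret = get_secret_from_secret_scope(secret_scope="my-secret-scope", name="lake-application-registration")

# uri is the storage endpoint that the fs.azure.* settings are scoped to.
add_secret_to_spark(secret, uri="mystorageaccount.dfs.core.windows.net")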
fabricks/context/spark_session.py
@@ -0,0 +1,82 @@
+ from typing import Optional
+
+ from pyspark.sql import SparkSession
+ from typing_extensions import deprecated
+
+ from fabricks.context import CATALOG, CONF_RUNTIME, IS_UNITY_CATALOG, SECRET_SCOPE
+ from fabricks.context.secret import add_secret_to_spark, get_secret_from_secret_scope
+ from fabricks.utils.spark import get_dbutils, get_spark
+
+
+ def add_catalog_to_spark(spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = get_spark()
+
+     if CATALOG is not None:
+         spark.sql(f"use catalog {CATALOG};")
+
+
+ def add_credentials_to_spark(spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = get_spark()
+
+     credentials = CONF_RUNTIME.get("credentials", {})
+     for uri, secret in credentials.items():
+         s = get_secret_from_secret_scope(secret_scope=SECRET_SCOPE, name=secret)
+         add_secret_to_spark(secret=s, uri=uri, spark=spark)
+
+
+ def add_spark_options_to_spark(spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = get_spark()
+
+     # delta default options
+     spark.sql("set spark.databricks.delta.schema.autoMerge.enabled = True;")
+     spark.sql("set spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = True;")
+
+     # runtime options
+     spark_options = CONF_RUNTIME.get("spark_options", {})
+     if spark_options:
+         sql_options = spark_options.get("sql", {})
+         for key, value in sql_options.items():
+             spark.sql(f"set {key} = {value};")
+
+         conf_options = spark_options.get("conf", {})
+         for key, value in conf_options.items():
+             spark.conf.set(key, value)
+
+
+ def build_spark_session(spark: Optional[SparkSession] = None, app_name: Optional[str] = "default") -> SparkSession:
+     if app_name is None:
+         app_name = "default"
+
+     if spark is not None:
+         _spark = spark
+         _spark.builder.appName(app_name)
+
+     else:
+         _spark = (
+             SparkSession.builder.appName(app_name)  # type: ignore
+             .config("spark.driver.allowMultipleContexts", "true")
+             .enableHiveSupport()
+             .getOrCreate()
+         )
+
+     add_catalog_to_spark(spark=_spark)
+     if not IS_UNITY_CATALOG:
+         add_credentials_to_spark(spark=_spark)
+     add_spark_options_to_spark(spark=_spark)
+
+     return _spark
+
+
+ @deprecated("use build_spark_session instead")
+ def init_spark_session(spark: Optional[SparkSession] = None):
+     if spark is None:
+         spark = get_spark()
+
+     return build_spark_session(spark=spark)
+
+
+ SPARK = build_spark_session(app_name="default")
+ DBUTILS = get_dbutils(SPARK)
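A minimal usage sketch for the builder above, assuming this hunk is fabricks/context/spark_session.py as the file list suggests (the app name is arbitrary); note the module already exposes a ready-made SPARK session on import:

from fabricks.context.spark_session import build_spark_session

# Applies the configured catalog, credentials (outside Unity Catalog) and Spark
# options from the runtime conf, then returns the session.
spark = build_spark_session(app_name="nightly-load")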
fabricks/context/utils.py
@@ -0,0 +1,80 @@
+ import logging
+
+ import fabricks.context.config as c
+ import fabricks.context.runtime as r
+
+
+ def pprint_runtime(extended: bool = False) -> None:
+     print("=" * 60)
+     print("FABRICKS RUNTIME CONFIGURATION")
+     print("=" * 60)
+
+     # Core Paths Section
+     print("\n📁 CORE CONFIG:")
+     print(f" Runtime: {c.PATH_RUNTIME.string}")
+     print(f" Notebooks: {c.PATH_NOTEBOOKS.string}")
+     print(f" Config: {c.PATH_CONFIG.string}")
+     print(f" Log Level: {logging.getLevelName(c.LOGLEVEL)}")
+     print(f" Debug Mode: {'✓' if c.IS_DEBUGMODE else '✗'}")
+     print(f" Job Config from YAML: {'✓' if c.IS_JOB_CONFIG_FROM_YAML else '✗'}")
+
+     print("\n⚙️ RUNTIME SETTINGS:")
+     print("\n🔄 PIPELINE STEPS:")
+
+     def _print_steps(steps_list, layer_name, icon):
+         if steps_list and any(step for step in steps_list if step):
+             print(f" {icon} {layer_name}:")
+             for step in steps_list:
+                 if step:
+                     step_name = step.get("name", "Unnamed")
+                     print(f" • {step_name}")
+         else:
+             print(f" {icon} {layer_name}: No steps")
+
+     _print_steps(r.BRONZE, "Bronze", "🥉")
+     _print_steps(r.SILVER, "Silver", "🥈")
+     _print_steps(r.GOLD, "Gold", "🥇")
+
+     # Storage Configuration Section
+     print("\n💾 STORAGE CONFIGURATION:")
+     print(f" Storage URI: {r.FABRICKS_STORAGE.string}")
+     print(f" Storage Credential: {r.FABRICKS_STORAGE_CREDENTIAL or 'Not configured'}")
+
+     # Unity Catalog Section
+     print("\n🏛️ UNITY CATALOG:")
+     print(f" Enabled: {'✓' if r.IS_UNITY_CATALOG else '✗'}")
+     if r.IS_UNITY_CATALOG and r.CATALOG:
+         print(f" Catalog: {r.CATALOG}")
+
+     # Security Section
+     print("\n🔐 SECURITY:")
+     print(f" Secret Scope: {r.SECRET_SCOPE}")
+
+     print("\n🌐 ADDITIONAL SETTINGS:")
+     print(f" Timezone: {r.TIMEZONE}")
+
+     if extended:
+         # Component Paths Section
+         print("\n🛠️ COMPONENT PATHS:")
+         components = [
+             ("UDFs", r.PATH_UDFS),
+             ("Parsers", r.PATH_PARSERS),
+             ("Extenders", r.PATH_EXTENDERS),
+             ("Views", r.PATH_VIEWS),
+             ("Schedules", r.PATH_SCHEDULES),
+         ]
+
+         for name, path in components:
+             print(f" {name}: {path.string}")
+
+         # Storage Paths Section
+         print("\n📦 STORAGE PATHS:")
+         for name, path in sorted(r.PATHS_STORAGE.items()):
+             icon = "🏭" if name == "fabricks" else "📊"
+             print(f" {icon} {name}: {path.string}")
+
+         # Runtime Paths Section
+         if r.PATHS_RUNTIME:
+             print("\n⚡ RUNTIME PATHS:")
+             for name, path in sorted(r.PATHS_RUNTIME.items()):
+                 print(f" 📂 {name}: {path.string}")
fabricks/core/__init__.py
@@ -0,0 +1,4 @@
+ from fabricks.core.jobs import Bronzes, Golds, Silvers, Steps, get_job, get_jobs
+ from fabricks.core.steps import get_step
+
+ __all__ = ["get_job", "get_jobs", "get_step", "Bronzes", "Silvers", "Golds", "Steps"]
fabricks/core/dags/__init__.py
@@ -0,0 +1,9 @@
+ from fabricks.core.dags.generator import DagGenerator
+ from fabricks.core.dags.processor import DagProcessor
+ from fabricks.core.dags.terminator import DagTerminator
+
+ __all__ = [
+     "DagGenerator",
+     "DagProcessor",
+     "DagTerminator",
+ ]
fabricks/core/dags/base.py
@@ -0,0 +1,99 @@
+ import re
+ from typing import Optional
+
+ from azure.core.exceptions import AzureError
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
+
+ from fabricks.context import FABRICKS_STORAGE, SPARK
+ from fabricks.core.dags.log import TABLE_LOG_HANDLER
+ from fabricks.core.dags.utils import get_connection_info
+ from fabricks.metastore.table import Table
+ from fabricks.utils.azure_table import AzureTable
+
+
+ class BaseDags:
+     def __init__(self, schedule_id: str):
+         self.schedule_id = schedule_id
+         self._connection_info = None
+         self._table = None
+
+     @property
+     def storage_account(self) -> str:
+         return FABRICKS_STORAGE.get_storage_account()
+
+     def get_connection_info(self) -> dict:
+         if not self._connection_info:
+             self._connection_info = get_connection_info(self.storage_account)
+         return self._connection_info
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=1, max=10),
+         retry=retry_if_exception_type((Exception, AzureError)),
+         reraise=True,
+     )
+     def get_table(self) -> AzureTable:
+         if not self._table:
+             cs = self.get_connection_info()
+             self._table = AzureTable(f"t{self.schedule_id}", **dict(cs))  # type: ignore
+
+         if self._table is None:
+             raise ValueError("Azure table for logs not found")
+
+         return self._table
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, *args, **kwargs):
+         if self._table is not None:
+             self._table.__exit__()
+
+     def get_logs(self, step: Optional[str] = None) -> DataFrame:
+         q = f"PartitionKey eq '{self.schedule_id}'"
+         if step:
+             q += f" and Step eq '{step}'"
+
+         d = TABLE_LOG_HANDLER.table.query(q)
+         df = SPARK.createDataFrame(d)
+
+         if "Exception" not in df.columns:
+             df = df.withColumn("Exception", expr("null"))
+         if "NotebookId" not in df.columns:
+             df = df.withColumn("NotebookId", expr("null"))
+
+         df = SPARK.sql(
+             """
+             select
+                 ScheduleId as schedule_id,
+                 Schedule as schedule,
+                 Step as step,
+                 JobId as job_id,
+                 Job as job,
+                 NotebookId as notebook_id,
+                 `Level` as `level`,
+                 `Message` as `status`,
+                 to_timestamp(`Created`, 'dd/MM/yy HH:mm:ss') as `timestamp`,
+                 from_json(Exception, 'type STRING, message STRING, traceback STRING') as exception
+             from
+                 {df}
+             """,
+             df=df,
+         )
+
+         return df
+
+     def write_logs(self, df: DataFrame):
+         (
+             df.write.format("delta")
+             .mode("overwrite")
+             .option("mergeSchema", "true")
+             .option("partitionOverwriteMode", "dynamic")
+             .save(Table("fabricks", "logs").delta_path.string)
+         )
+
+     def remove_invalid_characters(self, s: str) -> str:
+         out = re.sub("[^a-zA-Z0-9]", "", s)
+         return out
fabricks/core/dags/generator.py
@@ -0,0 +1,157 @@
+ import time
+ from typing import Optional, Tuple
+ from uuid import uuid4
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import lit
+
+ from fabricks.context import SPARK
+ from fabricks.core.dags.base import BaseDags
+ from fabricks.core.dags.log import TABLE_LOG_HANDLER
+ from fabricks.utils.azure_queue import AzureQueue
+
+
+ class DagGenerator(BaseDags):
+     def __init__(self, schedule: str):
+         self.schedule = schedule
+         schedule_id = str(uuid4().hex)
+         super().__init__(schedule_id=schedule_id)
+
+     def get_jobs(self) -> DataFrame:
+         return SPARK.sql(
+             f"""
+             with logs as (
+                 select
+                     l.job_id,
+                     median(l.duration) as median_duration
+                 from
+                     fabricks.logs_pivot l
+                 where
+                     true
+                     and duration is not null
+                     and date_diff(day, l.start_time, current_date) < 10
+                 group by
+                     l.job_id
+             )
+             select
+                 'statuses' as PartitionKey,
+                 '{self.schedule_id}' as ScheduleId,
+                 '{self.schedule}' as Schedule,
+                 j.job_id::string as RowKey,
+                 j.step as Step,
+                 j.job_id as JobId,
+                 j.job as Job,
+                 'scheduled' as `Status`,
+                 max(median_duration) as `MedianDuration`,
+                 dense_rank() over (order by max(median_duration) desc) as Rank
+             from
+                 fabricks.jobs j
+                 inner join fabricks.{self.schedule}_schedule v on j.job_id = v.job_id
+                 left join logs l on j.job_id = l.job_id
+             group by all
+             """
+         )
+
+     def get_dependencies(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+         if job_df is None:
+             job_df = self.get_jobs()
+
+         df = SPARK.sql(
+             """
+             select
+                 'dependencies' as PartitionKey,
+                 d.dependency_id :: string as RowKey,
+                 d.dependency_id as DependencyId,
+                 j.Step as Step,
+                 j.Job as Job,
+                 j.JobId as JobId,
+                 p.Step as ParentStep,
+                 p.Job as Parent,
+                 p.JobId as ParentId
+             from
+                 fabricks.dependencies d
+                 inner join {job} j on d.job_id = j.JobId
+                 inner join {job} p on d.parent_id = p.JobId
+             where
+                 true
+                 and d.parent_id is not null
+                 and not d.job_id = d.parent_id
+                 and not exists (
+                     select 1
+                     from
+                         fabricks.dependencies_circular dc
+                     where
+                         true
+                         and d.job_id = dc.job_id
+                         and d.parent_id = dc.parent_id
+
+                 )
+             group by all
+             """,
+             job=job_df,
+         )
+         df = df.withColumn("ScheduleId", lit(self.schedule_id))
+         return df.withColumn("Schedule", lit(self.schedule))
+
+     def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+         if job_df is None:
+             job_df = self.get_jobs()
+
+         return SPARK.sql(
+             """
+             select
+                 Step
+             from
+                 {job}
+             group by
+                 Step
+             """,
+             job=job_df,
+         )
+
+     def generate(self) -> Tuple[str, DataFrame, DataFrame]:
+         job_df = self.get_jobs()
+         deps_df = self.get_dependencies(job_df)
+         step_df = self.get_steps(job_df)
+
+         table = self.get_table()
+
+         table.create_if_not_exists()
+         table.truncate_all_partitions()
+
+         table.upsert(job_df)
+         table.upsert(deps_df)
+
+         df = SPARK.sql(
+             """
+             select
+                 ScheduleId as PartitionKey,
+                 ScheduleId,
+                 `Schedule`,
+                 Step,
+                 Job,
+                 JobId,
+                 date_format(current_timestamp(), 'dd/MM/yy HH:mm:ss') as Created,
+                 'INFO' as `Level`,
+                 `Status` as `Message`,
+                 from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
+                 md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, '-1'), "*")) as RowKey
+             from
+                 {df}
+             """,
+             df=df,
+         )
+
+         TABLE_LOG_HANDLER.table.upsert(df)
+
+         cs = self.get_connection_info()
+         for row in step_df.collect():
+             step = self.remove_invalid_characters(row.Step)
+
+             with AzureQueue(f"q{step}{self.schedule_id}", **dict(cs)) as queue:  # type: ignore
+                 queue.create_if_not_exists()
+                 queue.clear()
+
+         time.sleep(60)
+
+         return self.schedule_id, job_df, deps_df
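A short sketch of driving the generator above (the schedule name "daily" is assumed, not taken from the package): generate() seeds the Azure table and per-step queues, then returns the new schedule id together with the job and dependency DataFrames.

from fabricks.core.dags import DagGenerator

generator = DagGenerator("daily")  # "daily" is a hypothetical schedule name
schedule_id, job_df, deps_df = generator.generate()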
fabricks/core/dags/log.py
@@ -0,0 +1,12 @@
+ import logging
+ from typing import Final
+
+ from fabricks.core.dags.utils import get_table
+ from fabricks.utils.log import AzureTableLogHandler, get_logger
+
+ table = get_table()
+ Logger, TableLogHandler = get_logger("dags", logging.INFO, table=table, debugmode=False)
+
+ LOGGER: Final[logging.Logger] = Logger
+ assert TableLogHandler is not None
+ TABLE_LOG_HANDLER: Final[AzureTableLogHandler] = TableLogHandler