fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/context/runtime.py
@@ -0,0 +1,143 @@
+ import os
+ import sys
+ from typing import Final, List
+
+ import yaml
+ from databricks.sdk.runtime import spark
+
+ from fabricks.utils.path import Path
+
+ try:
+     runtime = Path(os.environ["FABRICKS_RUNTIME"], assume_git=True)
+     assert runtime, "runtime mandatory in cluster config"
+     PATH_RUNTIME: Final[Path] = runtime
+
+     notebooks = Path(os.environ["FABRICKS_NOTEBOOKS"], assume_git=True)
+     assert notebooks, "notebooks mandatory in cluster config"
+     PATH_NOTEBOOKS: Final[Path] = notebooks
+
+     version = os.environ["FABRICKS_VERSION"]
+     assert version, "version mandatory in cluster config"
+     VERSION: Final[str] = version
+
+     PATH_LIBRARIES = "/dbfs/mnt/fabricks/site-packages"
+     spark._sc._python_includes.append(PATH_LIBRARIES)  # type: ignore
+     sys.path.append(PATH_LIBRARIES)
+
+     try:
+         is_test = os.environ["FABRICKS_IS_TEST"] == "TRUE"
+     except Exception:
+         is_test = False
+     IS_TEST: Final[bool] = is_test
+
+     try:
+         is_debug = os.environ["FABRICKS_IS_DEBUG"] == "TRUE"
+     except Exception:
+         is_debug = False
+     IS_DEBUG: Final[bool] = is_debug
+
+     try:
+         is_live = os.environ["FABRICKS_IS_LIVE"] == "TRUE"
+     except Exception:
+         is_live = False
+     IS_LIVE: Final[bool] = is_live
+
+     conf_path = PATH_RUNTIME.join(
+         "fabricks",
+         f"conf.{spark.conf.get('spark.databricks.clusterUsageTags.clusterOwnerOrgId')}.yml",
+     )
+     with open(conf_path.string) as f:
+         data = yaml.safe_load(f)
+
+     conf: dict = [d["conf"] for d in data][0]
+     assert conf, "conf mandatory"
+     CONF_RUNTIME: Final[dict] = conf
+
+     BRONZE = CONF_RUNTIME.get("bronze", [{}])
+     SILVER = CONF_RUNTIME.get("silver", [{}])
+     GOLD = CONF_RUNTIME.get("gold", [{}])
+     STEPS = BRONZE + SILVER + GOLD
+
+     databases = CONF_RUNTIME.get("databases", [{}])
+     credentials = CONF_RUNTIME.get("credentials", {})
+     variables = CONF_RUNTIME.get("variables", {})
+     VARIABLES: dict = variables
+
+     conf_options = CONF_RUNTIME.get("options", {})
+     assert conf_options, "options mandatory"
+
+     secret_scope = conf_options.get("secret_scope")
+     assert secret_scope, "secret_scope mandatory in options"
+     SECRET_SCOPE: Final[str] = secret_scope
+
+     path_options = CONF_RUNTIME.get("path_options", {})
+     assert path_options, "options mandatory"
+
+     fabricks_uri = path_options.get("storage")
+     assert fabricks_uri, "storage mandatory in path options"
+     FABRICKS_STORAGE: Final[Path] = Path.from_uri(fabricks_uri, regex=variables)
+
+     path_udfs = path_options.get("udfs")
+     assert path_udfs, "udfs mandatory in path options"
+     PATH_UDFS: Final[Path] = PATH_RUNTIME.join(path_udfs)
+
+     path_parsers = path_options.get("parsers")
+     assert path_parsers, "parsers mandatory in path options"
+     PATH_PARSERS: Final[Path] = PATH_RUNTIME.join(path_parsers)
+
+     path_extenders = path_options.get("extenders")
+     assert path_extenders, "extenders mandatory in path options"
+     PATH_EXTENDERS: Final[Path] = PATH_RUNTIME.join(path_extenders)
+
+     path_views = path_options.get("views")
+     assert path_views, "views mandatory in path options"
+     PATH_VIEWS: Final[Path] = PATH_RUNTIME.join(path_views)
+
+     path_schedules = path_options.get("schedules")
+     assert path_schedules, "schedules mandatory in path options"
+     PATH_SCHEDULES: Final[Path] = PATH_RUNTIME.join(path_schedules)
+
+     path_requirements = path_options.get("requirements")
+     assert path_requirements, "requirements mandatory in path options"
+     PATH_REQUIREMENTS: Final[Path] = PATH_RUNTIME.join(path_requirements)
+
+     def _get_storage_paths(objects: List[dict]) -> dict:
+         d = {}
+         for o in objects:
+             if o:
+                 name = o.get("name")
+                 assert name
+                 uri = o.get("path_options", {}).get("storage")
+                 assert uri
+                 d[name] = Path.from_uri(uri, regex=variables)
+         return d
+
+     PATHS_STORAGE: Final[dict[str, Path]] = {
+         "fabricks": FABRICKS_STORAGE,
+         **_get_storage_paths(BRONZE),
+         **_get_storage_paths(SILVER),
+         **_get_storage_paths(GOLD),
+         **_get_storage_paths(databases),
+     }
+
+     def _get_runtime_path(objects: List[dict]) -> dict:
+         d = {}
+         for o in objects:
+             name = o.get("name")
+             assert name
+             uri = o.get("path_options", {}).get("runtime")
+             assert uri
+             d[name] = PATH_RUNTIME.join(uri)
+         return d
+
+     PATHS_RUNTIME: Final[dict[str, Path]] = {
+         **_get_runtime_path(BRONZE),
+         **_get_runtime_path(SILVER),
+         **_get_runtime_path(GOLD),
+     }
+
+ except KeyError as e:
+     raise e
+
+ except AssertionError as e:
+     raise e
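Note: fabricks/context/runtime.py reads its settings from cluster environment variables (FABRICKS_RUNTIME, FABRICKS_NOTEBOOKS, FABRICKS_VERSION), then loads conf.<orgId>.yml from the runtime repository and keeps only the first top-level `conf` mapping. A minimal sketch of the shape it expects, written as the already-parsed Python value; every name and value below is an illustrative placeholder, not taken from the package:

    # Hypothetical result of yaml.safe_load(open("conf.<orgId>.yml"))
    data = [
        {
            "conf": {
                "name": "prod",  # placeholder
                "options": {"secret_scope": "my-scope", "catalog": "hive_metastore"},
                "path_options": {
                    "storage": "abfss://fabricks@myaccount.dfs.core.windows.net",
                    "udfs": "udfs",
                    "parsers": "parsers",
                    "extenders": "extenders",
                    "views": "views",
                    "schedules": "schedules",
                    "requirements": "requirements.txt",
                },
                "bronze": [
                    {
                        "name": "bronze",
                        "path_options": {
                            "runtime": "bronze",
                            "storage": "abfss://bronze@myaccount.dfs.core.windows.net",
                        },
                    }
                ],
            }
        }
    ]
    conf = [d["conf"] for d in data][0]  # what the module binds to CONF_RUNTIME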
fabricks/context/spark.py
@@ -0,0 +1,43 @@
+ from typing import Optional, Tuple
+
+ from pyspark.dbutils import DBUtils
+ from pyspark.sql import SparkSession
+
+ from fabricks.context.runtime import CONF_RUNTIME, SECRET_SCOPE
+ from fabricks.utils.secret import add_secret_to_spark, get_secret_from_secret_scope
+
+
+ def build_spark_session(new: Optional[bool] = False, log: Optional[bool] = False) -> Tuple[SparkSession, DBUtils]:
+     if new:
+         spark = SparkSession.builder.getOrCreate().newSession()  # type: ignore
+
+         catalog = CONF_RUNTIME.get("options", {}).get("catalog")
+         if catalog:
+             spark.sql(f"use catalog {catalog};")
+
+         # delta
+         spark.sql("set spark.databricks.delta.schema.autoMerge.enabled = True;")
+         spark.sql("set spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = True;")
+
+         spark_options = CONF_RUNTIME.get("spark_options", {})
+         if spark_options:
+             sql_options = spark_options.get("sql", {})
+             for key, value in sql_options.items():
+                 spark.sql(f"set {key} = {value};")
+
+             conf_options = spark_options.get("conf", {})
+             for key, value in conf_options.items():
+                 spark.conf.set(key, value)
+
+         credentials = CONF_RUNTIME.get("credentials", {})
+         for uri, secret in credentials.items():
+             s = get_secret_from_secret_scope(secret_scope=SECRET_SCOPE, name=secret)
+             add_secret_to_spark(secret=s, uri=uri)
+
+     else:
+         spark = SparkSession.builder.getOrCreate()  # type: ignore
+
+     return spark, DBUtils(spark)
+
+
+ build_spark_session(new=True, log=True)
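Note: importing this module already builds a configured session once via the module-level call at the bottom. A hedged usage sketch, assuming a Databricks cluster where the Fabricks environment variables and runtime configuration above are in place:

    from fabricks.context.spark import build_spark_session

    # New session with the configured catalog, the Delta merge options and any
    # spark_options.sql / spark_options.conf entries from conf.<orgId>.yml applied.
    spark, dbutils = build_spark_session(new=True)
    spark.sql("select current_catalog()").show()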
fabricks/context/types.py
@@ -0,0 +1,123 @@
+ from typing import List, Optional, TypedDict
+
+
+ class RuntimePathOptions(TypedDict):
+     storage: str
+     udfs: str
+     extenders: str
+     parsers: str
+     schedules: str
+     views: str
+     requirements: str
+
+
+ class RuntimeTimeoutOptions(TypedDict):
+     step: int
+     job: int
+     pre_run: int
+     post_run: int
+
+
+ class StepTimeoutOptions(TypedDict):
+     step: Optional[int]
+     job: Optional[int]
+     pre_run: Optional[int]
+     post_run: Optional[int]
+
+
+ class RuntimeOptions(TypedDict):
+     secret_scope: str
+     catalog: str
+     workers: int
+     timeouts: RuntimeTimeoutOptions
+     retention_days: int
+
+
+ class SparkOptions(TypedDict):
+     sql: dict
+     conf: dict
+
+
+ class StepPathOptions(TypedDict):
+     runtime: str
+     storage: str
+
+
+ class InvokeOptions(TypedDict):
+     notebook: str
+     arguments: Optional[dict[str, str]]
+
+
+ class StepOptions(TypedDict):
+     order: int
+     workers: Optional[int]
+     timeouts: StepTimeoutOptions
+     extender: Optional[str]
+     pre_run: Optional[InvokeOptions]
+     post_run: Optional[InvokeOptions]
+
+
+ class SilverOptions(StepOptions):
+     parent: str
+     stream: Optional[bool]
+     local_checkpoint: Optional[bool]
+
+
+ class GoldOptions(StepOptions):
+     schema_drift: Optional[bool]
+
+
+ class Step(TypedDict):
+     name: str
+
+
+ class TableOptions(TypedDict):
+     powerbi: Optional[bool]
+     liquid_clustering: Optional[bool]
+     properties: Optional[dict[str, str]]
+     retention_days: Optional[int]
+
+
+ class Bronze(Step):
+     options: StepOptions
+     path_options: StepPathOptions
+     table_options: Optional[TableOptions]
+
+
+ class Silver(Step):
+     options: SilverOptions
+     path_options: StepPathOptions
+     table_options: Optional[TableOptions]
+
+
+ class Gold(Step):
+     options: GoldOptions
+     path_options: StepPathOptions
+     table_options: Optional[TableOptions]
+
+
+ class PowerBI(Step):
+     pass
+
+
+ class DatabasePathOptions(TypedDict):
+     storage: str
+
+
+ class Database(TypedDict):
+     name: str
+     path_options: DatabasePathOptions
+
+
+ class Conf(TypedDict):
+     name: str
+     options: RuntimeOptions
+     path_options: RuntimePathOptions
+     spark_options: SparkOptions
+     bronze: Optional[List[Bronze]]
+     silver: Optional[List[Silver]]
+     gold: Optional[List[Gold]]
+     powerbi: Optional[List[PowerBI]]
+     databases: Optional[List[Database]]
+     variables: Optional[List[dict[str, str]]]
+     credentials: Optional[List[dict[str, str]]]
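Note: these TypedDicts only describe the shape of the runtime configuration; at runtime the values stay plain dicts. A hedged sketch of how a single gold step entry lines up with the types above; all names and values are placeholders, and importing the package presumes an environment where fabricks is installed:

    from fabricks.context.types import Gold

    gold_step: Gold = {
        "name": "gold",  # placeholder step name
        "options": {
            "order": 3,
            "workers": 8,
            "timeouts": {"step": 3600, "job": 600, "pre_run": 300, "post_run": 300},
            "extender": None,
            "pre_run": None,
            "post_run": None,
            "schema_drift": True,
        },
        "path_options": {
            "runtime": "gold",
            "storage": "abfss://gold@myaccount.dfs.core.windows.net",
        },
        "table_options": {
            "powerbi": True,
            "liquid_clustering": False,
            "properties": None,
            "retention_days": 7,
        },
    }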
fabricks/core/__init__.py
@@ -0,0 +1,4 @@
+ from fabricks.core.jobs import Bronzes, Golds, Silvers, Steps, get_job, get_jobs
+ from fabricks.core.steps import get_step
+
+ __all__ = ["get_job", "get_jobs", "get_step", "Bronzes", "Silvers", "Golds", "Steps"]
fabricks/core/dags/__init__.py
@@ -0,0 +1,9 @@
+ from fabricks.core.dags.generator import DagGenerator
+ from fabricks.core.dags.processor import DagProcessor
+ from fabricks.core.dags.terminator import DagTerminator
+
+ __all__ = [
+     "DagGenerator",
+     "DagProcessor",
+     "DagTerminator",
+ ]
fabricks/core/dags/base.py
@@ -0,0 +1,72 @@
+ import re
+ from typing import Optional, cast
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr
+
+ from fabricks.context import FABRICKS_STORAGE
+ from fabricks.core.dags.log import DagsTableLogger
+ from fabricks.metastore.table import Table
+ from fabricks.utils.azure_table import AzureTable
+ from fabricks.utils.secret import AccessKey, get_secret_from_secret_scope
+
+
+ class BaseDags:
+     def __init__(self, schedule_id: str):
+         self.schedule_id = schedule_id
+
+     def get_connection_string(self) -> str:
+         storage_account = FABRICKS_STORAGE.get_storage_account()
+         secret = get_secret_from_secret_scope("bmskv", f"{storage_account}-access-key")
+         access_key = cast(AccessKey, secret).key
+         connection_string = f"DefaultEndpointsProtocol=https;AccountName={storage_account};AccountKey={access_key};EndpointSuffix=core.windows.net"
+         return connection_string
+
+     def get_table(self) -> AzureTable:
+         cs = self.get_connection_string()
+         table = AzureTable(f"t{self.schedule_id}", connection_string=cs)
+         return table
+
+     def get_logs(self, step: Optional[str] = None) -> DataFrame:
+         q = f"PartitionKey eq '{self.schedule_id}'"
+         if step:
+             q += f" and Step eq '{step}'"
+
+         d = DagsTableLogger.table.query(q)
+         df = spark.createDataFrame(d)
+         if "Exception" not in df.columns:
+             df = df.withColumn("Exception", expr("null"))
+
+         df = spark.sql(
+             """
+             select
+               ScheduleId as schedule_id,
+               Schedule as schedule,
+               Step as step,
+               JobId as job_id,
+               Job as job,
+               NotebookId as notebook_id,
+               `Level` as `level`,
+               `Message` as `status`,
+               to_timestamp(`Created`, 'dd/MM/yy HH:mm:ss') as `timestamp`,
+               from_json(Exception, 'type STRING, message STRING, traceback STRING') as exception
+             from
+               {df}
+             """,
+             df=df,
+         )
+         return df
+
+     def write_logs(self, df: DataFrame):
+         (
+             df.write.format("delta")
+             .mode("overwrite")
+             .option("mergeSchema", "true")
+             .option("partitionOverwriteMode", "dynamic")
+             .save(Table("fabricks", "logs").deltapath.string)
+         )
+
+     def remove_invalid_characters(self, s: str) -> str:
+         out = re.sub("[^a-zA-Z0-9]", "", s)
+         return out
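Note: a hedged usage sketch of BaseDags against an existing run; the schedule id below is a placeholder, and the Azure Table plus the fabricks.logs Delta table are assumed to already exist:

    from fabricks.core.dags.base import BaseDags

    dags = BaseDags(schedule_id="0123456789abcdef0123456789abcdef")  # placeholder id
    logs_df = dags.get_logs(step="silver")  # rows for one step of this run, read from the Azure Table
    dags.write_logs(logs_df)                # persisted into the fabricks.logs Delta table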
fabricks/core/dags/generator.py
@@ -0,0 +1,154 @@
+ import time
+ from typing import Optional, Tuple
+ from uuid import uuid4
+
+ from databricks.sdk.runtime import spark
+ from pyspark.sql import DataFrame
+
+ from fabricks.core.dags.base import BaseDags
+ from fabricks.core.dags.log import DagsTableLogger
+ from fabricks.utils.azure_queue import AzureQueue
+
+
+ class DagGenerator(BaseDags):
+     def __init__(self, schedule: str):
+         self.schedule = schedule
+         schedule_id = str(uuid4().hex)
+         super().__init__(schedule_id=schedule_id)
+
+     def get_jobs(self) -> DataFrame:
+         return spark.sql(
+             f"""
+             with logs as (
+               select
+                 l.job_id,
+                 median(l.duration) as median_duration
+               from
+                 fabricks.logs_pivot l
+               where
+                 true
+                 and duration is not null
+                 and date_diff(day, l.start_time, current_date) < 10
+               group by
+                 l.job_id
+             )
+             select
+               'statuses' as PartitionKey,
+               '{self.schedule_id}' as ScheduleId,
+               '{self.schedule}' as Schedule,
+               j.job_id::string as RowKey,
+               j.step as Step,
+               j.job_id as JobId,
+               j.job as Job,
+               'scheduled' as `Status`,
+               max(median_duration) as `MedianDuration`,
+               dense_rank() over (order by max(median_duration) desc) as Rank
+             from
+               fabricks.jobs j
+               inner join fabricks.{self.schedule}_schedule v on j.job_id = v.job_id
+               left join logs l on j.job_id = l.job_id
+             group by all
+             """
+         )
+
+     def get_dependencies(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+         if job_df is None:
+             job_df = self.get_jobs()
+
+         return spark.sql(
+             """
+             select
+               'dependencies' as PartitionKey,
+               d.dependency_id::string as RowKey,
+               {schedule_id} as ScheduleId,
+               {schedule} as Schedule,
+               d.dependency_id as DependencyId,
+               j.Step as Step,
+               j.Job as Job,
+               j.JobId as JobId,
+               p.Step as ParentStep,
+               p.Job as Parent,
+               p.JobId as ParentId
+             from
+               fabricks.dependencies d
+               inner join {job} j on d.job_id = j.JobId
+               inner join {job} p on d.parent_id = p.JobId
+             where
+               true
+               and d.parent_id is not null
+               and not d.job_id = d.parent_id
+               and not exists (
+                 select 1
+                 from
+                   fabricks.dependencies_circular dc
+                 where
+                   true
+                   and d.job_id = dc.job_id
+                   and d.parent_id = dc.parent_id
+
+               )
+             group by all
+             """,
+             job=job_df,
+             schedule=self.schedule,
+             schedule_id=self.schedule_id,
+         )
+
+     def get_steps(self, job_df: Optional[DataFrame] = None) -> DataFrame:
+         if job_df is None:
+             job_df = self.get_jobs()
+
+         return spark.sql(
+             """
+             select
+               Step
+             from
+               {job}
+             group by
+               Step
+             """,
+             job=job_df,
+         )
+
+     def generate(self) -> Tuple[str, DataFrame, DataFrame]:
+         job_df = self.get_jobs()
+         deps_df = self.get_dependencies(job_df)
+         step_df = self.get_steps(job_df)
+
+         table = self.get_table()
+         table.create_if_not_exists()
+         table.truncate_all_partitions()
+         table.upsert(job_df)
+         table.upsert(deps_df)
+
+         df = spark.sql(
+             """
+             select
+               ScheduleId as PartitionKey,
+               ScheduleId,
+               `Schedule`,
+               Step,
+               Job,
+               JobId,
+               date_format(current_timestamp(), 'dd/MM/yy HH:mm:ss') as Created,
+               'INFO' as `Level`,
+               `Status` as `Message`,
+               from_json(null, 'type STRING, message STRING, traceback STRING') as Exception,
+               md5(array_join(array(ScheduleId, `Schedule`, Step, Job, JobId, Created, `Level`, `Message`, -1), "*")) as RowKey
+             from
+               {df}
+             """,
+             df=job_df,
+         )
+         DagsTableLogger.table.upsert(df)
+
+         cs = self.get_connection_string()
+         for row in step_df.collect():
+             step = self.remove_invalid_characters(row.Step)
+             queue = AzureQueue(f"q{step}{self.schedule_id}", connection_string=cs)
+             queue.create_if_not_exists()
+             queue.clear()
+
+         time.sleep(60)
+
+         return self.schedule_id, job_df, deps_df
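Note: a hedged usage sketch, assuming the fabricks metastore tables and the Azure storage account referenced above already exist; "daily" is a placeholder schedule name:

    from fabricks.core.dags.generator import DagGenerator

    generator = DagGenerator(schedule="daily")
    # Seeds the t<schedule_id> Azure Table, the dags log table and one queue per step.
    schedule_id, job_df, deps_df = generator.generate()
    # The returned schedule_id is what DagProcessor / BaseDags use to locate this run's state.
    print(schedule_id, job_df.count(), deps_df.count())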
fabricks/core/dags/log.py
@@ -0,0 +1,14 @@
+ import logging
+ from typing import cast
+
+ from fabricks.context.runtime import FABRICKS_STORAGE, SECRET_SCOPE
+ from fabricks.utils.azure_table import AzureTable
+ from fabricks.utils.log import get_logger
+ from fabricks.utils.secret import AccessKey, get_secret_from_secret_scope
+
+ storage_account = FABRICKS_STORAGE.get_storage_account()
+ secret = get_secret_from_secret_scope(SECRET_SCOPE, f"{storage_account}-access-key")
+ access_key = cast(AccessKey, secret).key
+
+ table = AzureTable("dags", storage_account=storage_account, access_key=access_key)
+ DagsLogger, DagsTableLogger = get_logger("dags", logging.DEBUG, table=table)