fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/cdc/base/configurator.py
@@ -0,0 +1,145 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import List, Optional, Union
+
+ from databricks.sdk.runtime import dbutils as _dbutils
+ from databricks.sdk.runtime import spark as _spark
+ from pyspark.sql import DataFrame, SparkSession
+
+ from fabricks.metastore.database import Database
+ from fabricks.metastore.table import Table
+
+
+ class Configurator(ABC):
+     def __init__(
+         self,
+         database: str,
+         *levels: str,
+         change_data_capture: str,
+         spark: Optional[SparkSession] = None,
+     ):
+         if spark is None:
+             spark = _spark
+         assert spark is not None
+         self.spark: SparkSession = spark
+         self.dbutils = _dbutils
+
+         self.database = Database(database)
+         self.levels = levels
+         self.change_data_capture = change_data_capture
+         self.table = Table(self.database.name, *self.levels, spark=self.spark)
+
+     def is_view(self):
+         return self.table.is_view()
+
+     def registered(self):
+         return self.table.registered()
+
+     @abstractmethod
+     def get_query(self, src: Union[DataFrame, Table, str], **kwargs):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def get_data(self, src: Union[DataFrame, Table, str], **kwargs) -> DataFrame:
+         raise NotImplementedError()
+
+     @abstractmethod
+     def create_table(
+         self,
+         src: Union[DataFrame, Table, str],
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         **kwargs,
+     ):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def drop(self):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def create_or_replace_view(self, query: Optional[str] = None):
+         raise NotImplementedError()
+
+     @property
+     def allowed_leading_columns(self):
+         cols = ["__identity", "__key", "__timestamp", "__valid_from", "__valid_to"]
+         if self.change_data_capture == "scd1":
+             cols.remove("__valid_from")
+             cols.remove("__valid_to")
+         elif self.change_data_capture == "scd2":
+             cols.remove("__timestamp")
+         return cols
+
+     @property
+     def allowed_trailing_columns(self):
+         cols = [
+             "__source",
+             "__operation",
+             "__is_current",
+             "__is_deleted",
+             "__metadata",
+             "__hash",
+             "__rescued_data",
+         ]
+         if self.change_data_capture == "scd1":
+             cols.remove("__operation")
+         elif self.change_data_capture == "scd2":
+             cols.remove("__operation")
+         return cols
+
+     @property
+     def slowly_changing_dimension(self) -> bool:
+         return self.change_data_capture in ["scd1", "scd2"]
+
+     def get_src(self, src: Union[DataFrame, Table, str]) -> DataFrame:
+         if isinstance(src, DataFrame):
+             df = src
+         elif isinstance(src, Table):
+             df = self.table.dataframe
+         elif isinstance(src, str):
+             df = self.spark.sql(src)
+         else:
+             raise ValueError(f"{src} not allowed")
+
+         return df
+
+     def get_columns(self, src: Union[DataFrame, Table, str], backtick: Optional[bool] = True) -> List[str]:
+         if backtick:
+             backtick = True
+
+         df = self.get_src(src=src)
+         columns = df.columns
+
+         if backtick:
+             return [f"`{c}`" for c in columns]
+         else:
+             return columns
+
+     def reorder_columns(self, df: DataFrame) -> DataFrame:
+         fields = [f"`{c}`" for c in df.columns if not c.startswith("__")]
+         __leading = [c for c in self.allowed_leading_columns if c in df.columns]
+         __trailing = [c for c in self.allowed_trailing_columns if c in df.columns]
+
+         columns = __leading + fields + __trailing
+
+         return df.select(columns)
+
+     @abstractmethod
+     def optimize_table(self):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def update_schema(self, **kwargs):
+         raise NotImplementedError()
+
+     @abstractmethod
+     def overwrite_schema(self):
+         raise NotImplementedError()
+
+     def __str__(self):
+         return f"{self.table.qualified_name}"
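The Configurator above centres on a fixed ordering of system columns around the payload columns. The following is a standalone sketch of that ordering rule (no Spark session required); the incoming column list is invented for illustration, and the backticks that reorder_columns adds around payload columns are omitted for brevity.

# Standalone sketch of the column-ordering rule encoded by Configurator.
# The sample column names are invented; the real class applies the same rule
# to df.columns inside reorder_columns().
change_data_capture = "scd2"

leading = ["__identity", "__key", "__timestamp", "__valid_from", "__valid_to"]
trailing = ["__source", "__operation", "__is_current", "__is_deleted", "__metadata", "__hash", "__rescued_data"]

if change_data_capture == "scd1":
    leading.remove("__valid_from")
    leading.remove("__valid_to")
    trailing.remove("__operation")
elif change_data_capture == "scd2":
    leading.remove("__timestamp")
    trailing.remove("__operation")

# Columns as they might arrive from an upstream query (hypothetical).
incoming = ["customer_id", "__key", "__valid_from", "name", "__is_current", "__hash"]

fields = [c for c in incoming if not c.startswith("__")]
ordered = (
    [c for c in leading if c in incoming]
    + fields
    + [c for c in trailing if c in incoming]
)
print(ordered)
# ['__key', '__valid_from', 'customer_id', 'name', '__is_current', '__hash']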
fabricks/cdc/base/generator.py
@@ -0,0 +1,117 @@
+ from __future__ import annotations
+
+ from typing import List, Optional, Union
+
+ from py4j.protocol import Py4JJavaError
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base.configurator import Configurator
+ from fabricks.context.log import Logger
+ from fabricks.metastore.table import Table
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Generator(Configurator):
+     def drop(self):
+         self.table.drop()
+
+     def create_table(
+         self,
+         src: Union[DataFrame, Table, str],
+         partitioning: Optional[bool] = False,
+         partition_by: Optional[Union[List[str], str]] = None,
+         identity: Optional[bool] = False,
+         liquid_clustering: Optional[bool] = False,
+         cluster_by: Optional[Union[List[str], str]] = None,
+         properties: Optional[dict[str, str]] = None,
+         **kwargs,
+     ):
+         kwargs["mode"] = "complete"
+         kwargs["filter"] = False
+         kwargs["rectify"] = False
+         kwargs["deduplicate"] = False
+         df = self.get_data(src, **kwargs)
+
+         if liquid_clustering:
+             assert cluster_by, "clustering column not found"
+         elif partitioning:
+             assert partition_by, "partitioning column not found"
+
+         fields = [c for c in df.columns if not c.startswith("__")]
+         __leading = [c for c in self.allowed_leading_columns if c in df.columns]
+         __trailing = [c for c in self.allowed_trailing_columns if c in df.columns]
+         columns = __leading + fields + __trailing
+
+         df = df.select([f"`{c}`" for c in columns])
+
+         identity = False if identity is None else identity
+         liquid_clustering = False if liquid_clustering is None else liquid_clustering
+
+         self.table.create(
+             df=df,
+             partitioning=partitioning,
+             partition_by=partition_by,
+             identity=identity,
+             liquid_clustering=liquid_clustering,
+             cluster_by=cluster_by,
+             properties=properties,
+         )
+
+     def create_or_replace_view(self, src: Union[Table, str], **kwargs):
+         assert not isinstance(src, DataFrame), "dataframe not allowed"
+
+         assert kwargs["mode"] == "complete", f"{kwargs['mode']} not allowed"
+         sql = self.get_query(src, **kwargs)
+
+         df = self.spark.sql(sql)
+         df = self.reorder_columns(df)
+         columns = [f"`{c}`" for c in df.columns]
+
+         sql = f"""
+         create or replace view {self}
+         as
+         with __view as (
+             {sql}
+         )
+         select
+             {','.join(columns)}
+         from __view
+         """
+         sql = fix_sql(sql)
+         Logger.debug("create or replace view", extra={"job": self, "sql": sql})
+
+         try:
+             self.spark.sql(sql)
+         except Py4JJavaError:
+             Logger.exception("🙈", extra={"job": self})
+
+     def optimize_table(self):
+         liquid_clustering = self.table.get_property("delta.feature.liquid") == "supported"
+         if liquid_clustering:
+             self.table.optimize()
+         else:
+             columns = None
+             if self.change_data_capture == "scd1":
+                 columns = ["__key"]
+             elif self.change_data_capture == "scd2":
+                 columns = ["__key", "__valid_from"]
+             vorder = self.table.get_property("delta.parquet.vorder.enabled") or "false"
+             vorder = vorder.lower() == "true"
+             self.table.optimize(columns=columns, vorder=vorder)
+
+     def update_schema(self, src: Union[DataFrame, Table, str], **kwargs):
+         overwrite = kwargs.get("overwrite", False)
+
+         if self.is_view():
+             assert not isinstance(src, DataFrame), "dataframe not allowed"
+             self.create_or_replace_view(src=src, **kwargs)
+         else:
+             kwargs["mode"] = "complete"
+             df = self.get_data(src, **kwargs)
+             if overwrite:
+                 self.table.overwrite_schema(df)
+             else:
+                 self.table.update_schema(df)
+
+     def overwrite_schema(self, src: Union[DataFrame, Table, str]):
+         self.update_schema(src=src, overwrite=True)
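For context, the Generator methods above would typically be driven from a Databricks notebook. The sketch below is hypothetical: it assumes fabricks is installed on the cluster and that a concrete CDC class such as SCD1 (exported somewhere under fabricks.api.cdc; neither the exact name nor its constructor is shown in this diff) accepts the (database, *levels) arguments of the base Configurator.

# Hypothetical notebook usage of Generator.create_table / optimize_table.
# Assumptions (not confirmed by this diff): a class named SCD1 is importable
# from fabricks.api.cdc and takes the base Configurator arguments.
from fabricks.api.cdc import SCD1

cdc = SCD1("silver", "sales", "orders")  # illustrative database and level names

# create_table forces mode="complete" and disables filter/rectify/deduplicate,
# then materialises the Delta table with the leading/trailing column order.
cdc.create_table(
    "select * from bronze.sales_orders",  # src may be a DataFrame, Table, or SQL string
    liquid_clustering=True,
    cluster_by=["__key"],  # asserted to be present when liquid_clustering is requested
)

# optimize_table clusters on __key (scd1) or __key and __valid_from (scd2),
# unless the table already declares delta.feature.liquid = "supported".
cdc.optimize_table()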
fabricks/cdc/base/merger.py
@@ -0,0 +1,107 @@
+ from __future__ import annotations
+
+ from typing import Optional, Union
+
+ from jinja2 import Environment, PackageLoader
+ from pyspark.sql import DataFrame
+
+ from fabricks.cdc.base.processor import Processor
+ from fabricks.context.log import Logger
+ from fabricks.metastore.table import Table
+ from fabricks.metastore.view import create_or_replace_global_temp_view
+ from fabricks.utils.sqlglot import fix as fix_sql
+
+
+ class Merger(Processor):
+     def get_merge_context(self, src: Union[DataFrame, str], **kwargs) -> dict:
+         if isinstance(src, DataFrame):
+             format = "dataframe"
+             columns = self.get_columns(src, backtick=False)
+         elif isinstance(src, str):
+             format = "view"
+             columns = self.get_columns(f"select * from {src}", backtick=False)
+         else:
+             raise ValueError(f"{src} not allowed")
+
+         assert "__merge_key" in columns
+         assert "__merge_condition" in columns
+
+         keys = kwargs.get("keys")
+         if isinstance(keys, str):
+             keys = [keys]
+
+         columns = [c for c in columns if c not in ["__merge_condition", "__merge_key"]]
+         fields = [c for c in columns if not c.startswith("__")]
+         where = kwargs.get("update_where") if self.table.rows > 0 else None
+         soft_delete = "__is_deleted" in columns
+         has_source = "__source" in columns
+         has_key = "__key" in columns
+         has_metadata = "__metadata" in columns
+         has_hash = "__hash" in columns
+         has_timestamp = "__timestamp" in columns
+         has_identity = "__identity" in columns
+
+         # 'NoneType' object is not iterable
+         if keys:
+             keys = [f"`{k}`" for k in keys]
+         if columns:
+             columns = [f"`{c}`" for c in columns]
+         if fields:
+             fields = [f"`{c}`" for c in fields]
+
+         assert "__key" or keys, f"{self} - __key or keys not found"
+
+         return {
+             "src": src,
+             "format": format,
+             "tgt": self.table,
+             "cdc": self.change_data_capture,
+             "columns": columns,
+             "fields": fields,
+             "soft_delete": soft_delete,
+             "has_source": has_source,
+             "has_identity": has_identity,
+             "has_key": has_key,
+             "has_hash": has_hash,
+             "keys": keys,
+             "has_metadata": has_metadata,
+             "has_timestamp": has_timestamp,
+             "where": where,
+         }
+
+     def get_merge_query(self, src: Union[DataFrame, str], fix: Optional[bool] = True, **kwargs) -> str:
+         context = self.get_merge_context(src=src, **kwargs)
+         environment = Environment(loader=PackageLoader("fabricks.cdc", "templates"))
+         merge = environment.get_template("merge.sql.jinja")
+
+         try:
+             sql = merge.render(**context)
+         except Exception as e:
+             Logger.debug("context", extra={"job": self, "content": context})
+             raise e
+
+         if fix:
+             try:
+                 sql = sql.replace("{src}", "src")
+                 sql = fix_sql(sql)
+                 sql = sql.replace("`src`", "{src}")
+                 Logger.debug("merge", extra={"job": self, "sql": sql})
+             except Exception as e:
+                 Logger.exception("🙈", extra={"job": self, "sql": sql})
+                 raise e
+         else:
+             Logger.debug("merge", extra={"job": self, "sql": sql})
+
+         return sql
+
+     def merge(self, src: Union[DataFrame, Table, str], **kwargs):
+         if not self.table.exists():
+             self.create_table(src, **kwargs)
+
+         df = self.get_data(src, **kwargs)
+         if df:
+             global_temp_view = f"{self.database}_{'_'.join(self.levels)}__merge"
+             view = create_or_replace_global_temp_view(global_temp_view, df, uuid=kwargs.get("uuid", False))
+
+             merge = self.get_merge_query(view, **kwargs)
+             self.spark.sql(merge, src=view)
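To tie the Merger pieces together, here is a hedged sketch of a merge call as it might appear in a notebook. The SCD2 class name and its constructor arguments are assumptions (only the base Configurator signature appears in this diff), the staged batch is invented, and the numbered comments simply restate what merge() itself does.

# Hypothetical end-to-end merge call. Assumptions (not confirmed by this diff):
# an SCD2 class importable from fabricks.api.cdc with the base Configurator
# constructor, and an upstream batch that already carries the __merge_key and
# __merge_condition columns that get_merge_context() asserts on.
from pyspark.sql import SparkSession

from fabricks.api.cdc import SCD2

spark = SparkSession.getActiveSession()
cdc = SCD2("silver", "customers")

batch = spark.table("global_temp.customers_batch")  # hypothetical staged batch

# merge() will:
#   1. create the target Delta table on the first run,
#   2. publish the batch as a global temp view named "<database>_<levels>__merge",
#   3. render templates/merge.sql.jinja with the context built by get_merge_context(),
#   4. run the rendered MERGE with the view substituted for the {src} placeholder.
cdc.merge(batch, keys=["customer_id"])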