fabricks-2024.7.1.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +7 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +31 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/extenders.py +3 -0
  10. fabricks/api/log.py +3 -0
  11. fabricks/api/metastore/__init__.py +10 -0
  12. fabricks/api/metastore/database.py +3 -0
  13. fabricks/api/metastore/table.py +3 -0
  14. fabricks/api/metastore/view.py +6 -0
  15. fabricks/api/notebooks/__init__.py +0 -0
  16. fabricks/api/notebooks/cluster.py +6 -0
  17. fabricks/api/notebooks/deploy/__init__.py +0 -0
  18. fabricks/api/notebooks/deploy/fabricks.py +147 -0
  19. fabricks/api/notebooks/deploy/notebooks.py +86 -0
  20. fabricks/api/notebooks/initialize.py +38 -0
  21. fabricks/api/notebooks/optimize.py +25 -0
  22. fabricks/api/notebooks/process.py +50 -0
  23. fabricks/api/notebooks/run.py +87 -0
  24. fabricks/api/notebooks/terminate.py +27 -0
  25. fabricks/api/notebooks/vacuum.py +25 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/udfs.py +3 -0
  28. fabricks/api/utils.py +9 -0
  29. fabricks/cdc/__init__.py +14 -0
  30. fabricks/cdc/base/__init__.py +4 -0
  31. fabricks/cdc/base/cdc.py +5 -0
  32. fabricks/cdc/base/configurator.py +145 -0
  33. fabricks/cdc/base/generator.py +117 -0
  34. fabricks/cdc/base/merger.py +107 -0
  35. fabricks/cdc/base/processor.py +338 -0
  36. fabricks/cdc/base/types.py +3 -0
  37. fabricks/cdc/cdc.py +5 -0
  38. fabricks/cdc/nocdc.py +19 -0
  39. fabricks/cdc/scd.py +21 -0
  40. fabricks/cdc/scd1.py +15 -0
  41. fabricks/cdc/scd2.py +15 -0
  42. fabricks/cdc/templates/__init__.py +0 -0
  43. fabricks/cdc/templates/merge/scd1.sql.jinja +72 -0
  44. fabricks/cdc/templates/merge/scd2.sql.jinja +54 -0
  45. fabricks/cdc/templates/merge.sql.jinja +2 -0
  46. fabricks/cdc/templates/query/__init__.py +0 -0
  47. fabricks/cdc/templates/query/base.sql.jinja +34 -0
  48. fabricks/cdc/templates/query/context.sql.jinja +95 -0
  49. fabricks/cdc/templates/query/current.sql.jinja +32 -0
  50. fabricks/cdc/templates/query/deduplicate_hash.sql.jinja +21 -0
  51. fabricks/cdc/templates/query/deduplicate_key.sql.jinja +14 -0
  52. fabricks/cdc/templates/query/filter.sql.jinja +71 -0
  53. fabricks/cdc/templates/query/final.sql.jinja +1 -0
  54. fabricks/cdc/templates/query/hash.sql.jinja +1 -0
  55. fabricks/cdc/templates/query/nocdc.sql.jinja +10 -0
  56. fabricks/cdc/templates/query/rectify.sql.jinja +120 -0
  57. fabricks/cdc/templates/query/scd1.sql.jinja +112 -0
  58. fabricks/cdc/templates/query/scd2.sql.jinja +114 -0
  59. fabricks/cdc/templates/query.sql.jinja +11 -0
  60. fabricks/context/__init__.py +51 -0
  61. fabricks/context/log.py +26 -0
  62. fabricks/context/runtime.py +143 -0
  63. fabricks/context/spark.py +43 -0
  64. fabricks/context/types.py +123 -0
  65. fabricks/core/__init__.py +4 -0
  66. fabricks/core/dags/__init__.py +9 -0
  67. fabricks/core/dags/base.py +72 -0
  68. fabricks/core/dags/generator.py +154 -0
  69. fabricks/core/dags/log.py +14 -0
  70. fabricks/core/dags/processor.py +163 -0
  71. fabricks/core/dags/terminator.py +26 -0
  72. fabricks/core/deploy/__init__.py +12 -0
  73. fabricks/core/deploy/tables.py +76 -0
  74. fabricks/core/deploy/views.py +417 -0
  75. fabricks/core/extenders.py +29 -0
  76. fabricks/core/jobs/__init__.py +20 -0
  77. fabricks/core/jobs/base/__init__.py +10 -0
  78. fabricks/core/jobs/base/checker.py +89 -0
  79. fabricks/core/jobs/base/configurator.py +323 -0
  80. fabricks/core/jobs/base/error.py +16 -0
  81. fabricks/core/jobs/base/generator.py +391 -0
  82. fabricks/core/jobs/base/invoker.py +119 -0
  83. fabricks/core/jobs/base/job.py +5 -0
  84. fabricks/core/jobs/base/processor.py +204 -0
  85. fabricks/core/jobs/base/types.py +191 -0
  86. fabricks/core/jobs/bronze.py +333 -0
  87. fabricks/core/jobs/get_job.py +126 -0
  88. fabricks/core/jobs/get_job_conf.py +115 -0
  89. fabricks/core/jobs/get_job_id.py +26 -0
  90. fabricks/core/jobs/get_jobs.py +89 -0
  91. fabricks/core/jobs/gold.py +218 -0
  92. fabricks/core/jobs/silver.py +354 -0
  93. fabricks/core/parsers/__init__.py +12 -0
  94. fabricks/core/parsers/base.py +91 -0
  95. fabricks/core/parsers/decorator.py +11 -0
  96. fabricks/core/parsers/get_parser.py +25 -0
  97. fabricks/core/parsers/types.py +6 -0
  98. fabricks/core/schedules.py +89 -0
  99. fabricks/core/scripts/__init__.py +13 -0
  100. fabricks/core/scripts/armageddon.py +82 -0
  101. fabricks/core/scripts/generate.py +20 -0
  102. fabricks/core/scripts/job_schema.py +28 -0
  103. fabricks/core/scripts/optimize.py +45 -0
  104. fabricks/core/scripts/process.py +9 -0
  105. fabricks/core/scripts/stats.py +48 -0
  106. fabricks/core/scripts/steps.py +27 -0
  107. fabricks/core/scripts/terminate.py +6 -0
  108. fabricks/core/scripts/vacuum.py +45 -0
  109. fabricks/core/site_packages.py +55 -0
  110. fabricks/core/steps/__init__.py +4 -0
  111. fabricks/core/steps/base.py +282 -0
  112. fabricks/core/steps/get_step.py +10 -0
  113. fabricks/core/steps/get_step_conf.py +33 -0
  114. fabricks/core/steps/types.py +7 -0
  115. fabricks/core/udfs.py +106 -0
  116. fabricks/core/utils.py +69 -0
  117. fabricks/core/views.py +36 -0
  118. fabricks/metastore/README.md +3 -0
  119. fabricks/metastore/__init__.py +5 -0
  120. fabricks/metastore/database.py +71 -0
  121. fabricks/metastore/pyproject.toml +20 -0
  122. fabricks/metastore/relational.py +61 -0
  123. fabricks/metastore/table.py +529 -0
  124. fabricks/metastore/utils.py +35 -0
  125. fabricks/metastore/view.py +40 -0
  126. fabricks/utils/README.md +3 -0
  127. fabricks/utils/__init__.py +0 -0
  128. fabricks/utils/azure_queue.py +63 -0
  129. fabricks/utils/azure_table.py +99 -0
  130. fabricks/utils/console.py +51 -0
  131. fabricks/utils/container.py +57 -0
  132. fabricks/utils/fdict.py +28 -0
  133. fabricks/utils/helpers.py +89 -0
  134. fabricks/utils/log.py +153 -0
  135. fabricks/utils/path.py +206 -0
  136. fabricks/utils/pip.py +61 -0
  137. fabricks/utils/pydantic.py +92 -0
  138. fabricks/utils/pyproject.toml +18 -0
  139. fabricks/utils/read/__init__.py +11 -0
  140. fabricks/utils/read/read.py +305 -0
  141. fabricks/utils/read/read_excel.py +5 -0
  142. fabricks/utils/read/read_yaml.py +43 -0
  143. fabricks/utils/read/types.py +3 -0
  144. fabricks/utils/schema/__init__.py +7 -0
  145. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  146. fabricks/utils/schema/get_schema_for_type.py +93 -0
  147. fabricks/utils/secret.py +78 -0
  148. fabricks/utils/sqlglot.py +48 -0
  149. fabricks/utils/write/__init__.py +8 -0
  150. fabricks/utils/write/delta.py +46 -0
  151. fabricks/utils/write/stream.py +27 -0
  152. fabricks-2024.7.1.5.dist-info/METADATA +212 -0
  153. fabricks-2024.7.1.5.dist-info/RECORD +154 -0
  154. fabricks-2024.7.1.5.dist-info/WHEEL +4 -0
fabricks/core/jobs/base/processor.py
@@ -0,0 +1,204 @@
+ from typing import Optional
+
+ from pyspark.sql import DataFrame
+ from pyspark.sql.functions import expr
+
+ from fabricks.context import SECRET_SCOPE
+ from fabricks.context.log import Logger, flush
+ from fabricks.core.jobs.base.error import CheckFailedException, CheckWarningException, InvokerFailedException
+ from fabricks.core.jobs.base.invoker import Invoker
+ from fabricks.utils.write import write_stream
+
+
+ class Processor(Invoker):
+     def extender(self, df: DataFrame) -> DataFrame:
+         name = self.options.job.get("extender")
+         if not name:
+             name = self.step_conf.get("options", {}).get("extender", None)
+
+         if name:
+             from fabricks.core.extenders import get_extender
+
+             Logger.debug(f"extend ({name})", extra={"job": self})
+             df = df.transform(get_extender(name))
+         return df
+
+     def filter_where(self, df: DataFrame) -> DataFrame:
+         f = self.options.job.get("filter_where")
+         if f:
+             Logger.debug(f"filter where {f}", extra={"job": self})
+             df = df.where(f"{f}")
+         return df
+
+     def encrypt(self, df: DataFrame) -> DataFrame:
+         encrypted_columns = self.options.job.get_list("encrypted_columns")
+         if encrypted_columns:
+             key = self.dbutils.secrets.get(scope=SECRET_SCOPE, key="encryption-key")
+             assert key, "key not found"
+             for col in encrypted_columns:
+                 Logger.debug(f"encrypt column: {col}", extra={"job": self})
+                 df = df.withColumn(col, expr(f"aes_encrypt({col}, '{key}')"))
+         return df
+
+     def restore(self, last_version: Optional[str] = None, last_batch: Optional[str] = None):
+         """
+         Restores the processor to a specific version and batch.
+
+         Args:
+             last_version (Optional[str]): The last version to restore to. If None, no version restore will be performed.
+             last_batch (Optional[str]): The last batch to restore to. If None, no batch restore will be performed.
+         """
+         if self.persist:
+             if last_version is not None:
+                 _last_version = int(last_version)
+                 if self.table.get_last_version() > _last_version:
+                     self.table.restore_to_version(_last_version)
+
+             if last_batch is not None:
+                 current_batch = int(last_batch) + 1
+                 self.rm_commit(current_batch)
+                 assert last_batch == self.table.get_property("fabricks.last_batch")
+                 assert self.paths.commits.join(last_batch).exists()
+
+     def _for_each_batch(self, df: DataFrame, batch: Optional[int] = None):
+         Logger.debug("for each batch starts", extra={"job": self})
+         if batch is not None:
+             Logger.debug(f"batch {batch}", extra={"job": self})
+
+         df = self.base_transform(df)
+
+         drift = self.table.schema_drifted(df)
+         if drift:
+             if self.schema_drift:
+                 Logger.warning("schema drifted", extra={"job": self})
+                 self.update_schema(df=df)
+             else:
+                 raise ValueError("schema drifted")
+
+         self.for_each_batch(df, batch)
+
+         if batch is not None:
+             self.table.set_property("fabricks.last_batch", batch)
+
+         self.table.create_restore_point()
+         Logger.debug("for each batch ends", extra={"job": self})
+
+     def for_each_run(self, schedule: Optional[str] = None):
+         Logger.debug("for each run starts", extra={"job": self})
+
+         if self.virtual:
+             if self.schema_drift:
+                 self.create_or_replace_view()
+
+         elif self.persist:
+             assert self.table.exists(), "delta table not found"
+
+             df = self.get_data(self.stream)
+             assert df is not None, "no data"
+
+             if self.stream:
+                 Logger.debug("stream enabled", extra={"job": self})
+                 write_stream(
+                     df,
+                     checkpoints_path=self.paths.checkpoints,
+                     func=self._for_each_batch,
+                     timeout=self.timeouts.job,
+                 )
+             else:
+                 self._for_each_batch(df)
+
+         else:
+             raise ValueError(f"{self.mode} - not allowed")
+
+         Logger.debug("for each run ends", extra={"job": self})
+
+     @flush
+     def run(
+         self,
+         retry: Optional[bool] = True,
+         schedule: Optional[str] = None,
+         schedule_id: Optional[str] = None,
+         invoke: Optional[bool] = True,
+     ):
+         """
+         Run the processor.
+
+         Args:
+             retry (bool, optional): Whether to retry the execution in case of failure. Defaults to True.
+             schedule (str, optional): The schedule to run the processor on. Defaults to None.
+             schedule_id (str, optional): The ID of the schedule. Defaults to None.
+             invoke (bool, optional): Whether to invoke pre-run and post-run methods. Defaults to True.
+         """
+         last_version = None
+         last_batch = None
+
+         if self.persist:
+             last_version = self.table.get_property("fabricks.last_version")
+             if last_version is not None:
+                 Logger.debug(f"last version {last_version}", extra={"job": self})
+             else:
+                 last_version = str(self.table.last_version)
+
+             last_batch = self.table.get_property("fabricks.last_batch")
+             if last_batch is not None:
+                 Logger.debug(f"last batch {last_batch}", extra={"job": self})
+
+         try:
+             Logger.info("run starts", extra={"job": self})
+
+             if invoke:
+                 self.pre_run_invoke(schedule=schedule)
+
+             self.pre_run_check()
+
+             self.for_each_run(schedule=schedule)
+
+             self.post_run_check()
+             self.post_run_extra_check()
+
+             if invoke:
+                 self.post_run_invoke(schedule=schedule)
+
+             Logger.info("run ends", extra={"job": self})
+
+         except CheckWarningException as e:
+             Logger.exception("🙈 (no retry)", extra={"job": self})
+             raise e
+         except InvokerFailedException as e:
+             Logger.exception("🙈 (no retry)", extra={"job": self})
+             raise e
+         except CheckFailedException as e:
+             Logger.exception("🙈 (no retry)", extra={"job": self})
+             self.restore(last_version, last_batch)
+             raise e
+         except AssertionError as e:
+             Logger.exception("🙈", extra={"job": self})
+             self.restore(last_version, last_batch)
+             raise e
+         except Exception as e:
+             if not self.stream or not retry:
+                 Logger.exception("🙈 (no retry)", extra={"job": self})
+                 self.restore(last_version, last_batch)
+                 raise e
+             else:
+                 Logger.exception("🙈 (retry)", extra={"job": self})
+                 self.run(retry=False, schedule_id=schedule_id)
+
+     def overwrite(self):
+         """
+         Executes the overwrite job.
+
+         This method truncates the data, overwrites the schema, and runs the job.
+         If an exception occurs during the execution, it is logged and re-raised.
+
+         Raises:
+             Exception: If an error occurs during the execution of the job.
+         """
+         try:
+             Logger.warning("overwrite job", extra={"job": self})
+             self.truncate()
+             self.overwrite_schema()
+             self.run(retry=False)
+         except Exception as e:
+             Logger.exception("🙈", extra={"job": self})
+             raise e
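
Taken together, run implements a guard pattern: record the last known good table version and batch before doing any work, execute the pre-run hooks, the batch processing, and the post-run checks, and on an unexpected failure restore the table and retry once (streaming jobs only). Below is a minimal self-contained sketch of that pattern; the Table class and guarded_run function are hypothetical stand-ins for illustration, not the Fabricks API.

from typing import Callable

class Table:
    """Hypothetical stand-in for the Delta table handle used by Processor."""

    def __init__(self) -> None:
        self.version = 0

    def get_last_version(self) -> int:
        return self.version

    def restore_to_version(self, version: int) -> None:
        # a real implementation would issue RESTORE TABLE ... TO VERSION AS OF
        self.version = version

def guarded_run(table: Table, process: Callable[[Table], None], retry: bool = True) -> None:
    last_version = table.get_last_version()  # last known good state
    try:
        process(table)
    except Exception:
        # roll back whatever the failed attempt committed
        if table.get_last_version() > last_version:
            table.restore_to_version(last_version)
        if not retry:
            raise
        guarded_run(table, process, retry=False)  # one retry only, as in Processor.run

The recursion with retry=False mirrors how Processor.run re-invokes itself exactly once for streaming jobs, so a second failure propagates instead of looping.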
fabricks/core/jobs/base/types.py
@@ -0,0 +1,191 @@
+ from dataclasses import dataclass
+ from typing import List, Literal, Optional, TypedDict, Union
+
+ from fabricks.cdc.base.types import ChangeDataCaptures
+ from fabricks.context import BRONZE, GOLD, SILVER
+ from fabricks.core.parsers import ParserOptions
+ from fabricks.utils.fdict import FDict
+ from fabricks.utils.path import Path
+
+ TBronze = Literal["bronze"]
+ TSilver = Literal["silver"]
+ TGold = Literal["gold"]
+ TStep = Literal[TBronze, TSilver, TGold]
+
+ Bronzes: List[TBronze] = [b.get("name") for b in BRONZE]
+ Silvers: List[TSilver] = [s.get("name") for s in SILVER]
+ Golds: List[TGold] = [g.get("name") for g in GOLD]
+ Steps: List[TStep] = Bronzes + Silvers + Golds
+
+ BronzeModes = Literal["memory", "append", "register"]
+ SilverModes = Literal["memory", "append", "latest", "update", "combine"]
+ GoldModes = Literal["memory", "append", "complete", "update", "invoke"]
+ Modes = Literal[BronzeModes, SilverModes, GoldModes]
+
+ FileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
+ Operations = Literal["upsert", "reload", "delete"]
+ Types = Literal["manual", "default"]
+
+
+ class SparkOptions(TypedDict):
+     sql: Optional[dict[str, str]]
+     conf: Optional[dict[str, str]]
+
+
+ class TableOptions(TypedDict):
+     identity: Optional[bool]
+     liquid_clustering: Optional[bool]
+     partition_by: Optional[List[str]]
+     zorder_by: Optional[List[str]]
+     cluster_by: Optional[List[str]]
+     powerbi: Optional[bool]
+     bloomfilter_by: Optional[List[str]]
+     constraints: Optional[dict[str, str]]
+     properties: Optional[dict[str, str]]
+     comment: Optional[str]
+     calculated_columns: Optional[dict[str, str]]
+     retention_days: Optional[int]
+
+
+ class _InvokeOptions(TypedDict):
+     notebook: str
+     arguments: Optional[dict[str, str]]
+
+
+ class InvokerOptions(TypedDict):
+     notebook: str
+     arguments: Optional[dict[str, str]]
+     pre_run: Optional[_InvokeOptions]
+     post_run: Optional[_InvokeOptions]
+
+
+ class CheckOptions(TypedDict):
+     pre_run: Optional[bool]
+     post_run: Optional[bool]
+     min_rows: Optional[int]
+     max_rows: Optional[int]
+     count_must_equal: Optional[str]
+
+
+ class BronzeOptions(TypedDict):
+     type: Optional[Types]
+     mode: BronzeModes
+     uri: str
+     parser: str
+     source: str
+     keys: Optional[List[str]]
+     # default
+     parents: Optional[List[str]]
+     filter_where: Optional[str]
+     extender: Optional[str]
+     # extra
+     encrypted_columns: Optional[List[str]]
+     calculated_columns: Optional[dict[str, str]]
+     operation: Optional[Operations]
+
+
+ class SilverOptions(TypedDict):
+     type: Optional[Types]
+     mode: SilverModes
+     change_data_capture: ChangeDataCaptures
+     # default
+     parents: Optional[List[str]]
+     filter_where: Optional[str]
+     extender: Optional[str]
+     # extra
+     deduplicate: Optional[bool]
+     stream: Optional[bool]
+     # else
+     order_duplicate_by: Optional[dict[str, str]]
+
+
+ class GoldOptions(TypedDict):
+     type: Optional[Types]
+     mode: GoldModes
+     change_data_capture: ChangeDataCaptures
+     update_where: Optional[str]
+     # default
+     parents: Optional[List[str]]
+     # extra
+     deduplicate: Optional[bool]
+     # else
+     table: Optional[str]
+     notebook: Optional[bool]
+     requirements: Optional[bool]
+
+
+ StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
+
+
+ @dataclass
+ class BaseJobConf:
+     step: TStep
+     job_id: str
+     topic: str
+     item: str
+
+
+ @dataclass
+ class JobConfBronze(BaseJobConf):
+     step: TBronze
+     options: BronzeOptions
+     table_options: Optional[TableOptions] = None
+     parser_options: Optional[ParserOptions] = None
+     check_options: Optional[CheckOptions] = None
+     spark_options: Optional[SparkOptions] = None
+     invoker_options: Optional[InvokerOptions] = None
+     tags: Optional[List[str]] = None
+     comment: Optional[str] = None
+
+
+ @dataclass
+ class JobConfSilver(BaseJobConf):
+     step: TSilver
+     options: SilverOptions
+     table_options: Optional[TableOptions] = None
+     check_options: Optional[CheckOptions] = None
+     spark_options: Optional[SparkOptions] = None
+     invoker_options: Optional[InvokerOptions] = None
+     tags: Optional[List[str]] = None
+     comment: Optional[str] = None
+
+
+ @dataclass
+ class JobConfGold(BaseJobConf):
+     step: TGold
+     options: Optional[GoldOptions]
+     table_options: Optional[TableOptions] = None
+     check_options: Optional[CheckOptions] = None
+     spark_options: Optional[SparkOptions] = None
+     invoker_options: Optional[InvokerOptions] = None
+     tags: Optional[List[str]] = None
+     comment: Optional[str] = None
+
+
+ JobConf = Union[JobConfBronze, JobConfSilver, JobConfGold]
+
+
+ @dataclass
+ class Paths:
+     storage: Path
+     tmp: Path
+     checkpoints: Path
+     commits: Path
+     schema: Path
+     runtime: Path
+
+
+ @dataclass
+ class Options:
+     job: FDict
+     check: FDict
+     table: FDict
+     spark: FDict
+     invoker: FDict
+
+
+ @dataclass
+ class Timeouts:
+     job: int
+     pre_run: int
+     post_run: int
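
These dataclasses are the declarative surface of a job: each medallion step gets its own options TypedDict, plus shared table, check, spark, and invoker options. As an illustration only, a bronze job might be constructed like this; every field value below is invented, real configurations come from the runtime config, and since the TypedDicts are total a strict type checker would expect all keys to be present.

from fabricks.core.jobs.base.types import JobConfBronze

# hypothetical declaration; all values are made up for illustration
conf = JobConfBronze(
    step="bronze",
    job_id="a1b2c3",  # made-up identifier
    topic="erp",
    item="sales",
    options={  # type: ignore[typeddict-item]  # a full conf would set every key
        "mode": "append",  # one of BronzeModes
        "uri": "abfss://landing@account.dfs.core.windows.net/erp/sales",
        "parser": "json",
        "source": "erp",
        "keys": ["id"],
    },
    tags=["demo"],
)

print(conf.step, conf.options["mode"])

Splitting per-step options from the shared JobConf* dataclasses lets Bronzes, Silvers, and Golds share deployment and scheduling machinery while keeping mode and CDC choices type-narrowed per step.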