fabricks 3.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. fabricks/__init__.py +0 -0
  2. fabricks/api/__init__.py +11 -0
  3. fabricks/api/cdc/__init__.py +6 -0
  4. fabricks/api/cdc/nocdc.py +3 -0
  5. fabricks/api/cdc/scd1.py +3 -0
  6. fabricks/api/cdc/scd2.py +3 -0
  7. fabricks/api/context.py +27 -0
  8. fabricks/api/core.py +4 -0
  9. fabricks/api/deploy.py +3 -0
  10. fabricks/api/exceptions.py +19 -0
  11. fabricks/api/extenders.py +3 -0
  12. fabricks/api/job_schema.py +3 -0
  13. fabricks/api/log.py +3 -0
  14. fabricks/api/masks.py +3 -0
  15. fabricks/api/metastore/__init__.py +10 -0
  16. fabricks/api/metastore/database.py +3 -0
  17. fabricks/api/metastore/table.py +3 -0
  18. fabricks/api/metastore/view.py +6 -0
  19. fabricks/api/notebooks/__init__.py +0 -0
  20. fabricks/api/notebooks/cluster.py +6 -0
  21. fabricks/api/notebooks/initialize.py +42 -0
  22. fabricks/api/notebooks/process.py +54 -0
  23. fabricks/api/notebooks/run.py +59 -0
  24. fabricks/api/notebooks/schedule.py +75 -0
  25. fabricks/api/notebooks/terminate.py +31 -0
  26. fabricks/api/parsers.py +3 -0
  27. fabricks/api/schedules.py +3 -0
  28. fabricks/api/udfs.py +3 -0
  29. fabricks/api/utils.py +9 -0
  30. fabricks/api/version.py +3 -0
  31. fabricks/api/views.py +6 -0
  32. fabricks/cdc/__init__.py +14 -0
  33. fabricks/cdc/base/__init__.py +4 -0
  34. fabricks/cdc/base/_types.py +10 -0
  35. fabricks/cdc/base/cdc.py +5 -0
  36. fabricks/cdc/base/configurator.py +223 -0
  37. fabricks/cdc/base/generator.py +177 -0
  38. fabricks/cdc/base/merger.py +110 -0
  39. fabricks/cdc/base/processor.py +471 -0
  40. fabricks/cdc/cdc.py +5 -0
  41. fabricks/cdc/nocdc.py +20 -0
  42. fabricks/cdc/scd.py +22 -0
  43. fabricks/cdc/scd1.py +15 -0
  44. fabricks/cdc/scd2.py +15 -0
  45. fabricks/cdc/templates/__init__.py +0 -0
  46. fabricks/cdc/templates/ctes/base.sql.jinja +35 -0
  47. fabricks/cdc/templates/ctes/current.sql.jinja +28 -0
  48. fabricks/cdc/templates/ctes/deduplicate_hash.sql.jinja +32 -0
  49. fabricks/cdc/templates/ctes/deduplicate_key.sql.jinja +31 -0
  50. fabricks/cdc/templates/ctes/rectify.sql.jinja +113 -0
  51. fabricks/cdc/templates/ctes/slice.sql.jinja +1 -0
  52. fabricks/cdc/templates/filter.sql.jinja +4 -0
  53. fabricks/cdc/templates/filters/final.sql.jinja +4 -0
  54. fabricks/cdc/templates/filters/latest.sql.jinja +17 -0
  55. fabricks/cdc/templates/filters/update.sql.jinja +30 -0
  56. fabricks/cdc/templates/macros/bactick.sql.jinja +1 -0
  57. fabricks/cdc/templates/macros/hash.sql.jinja +18 -0
  58. fabricks/cdc/templates/merge.sql.jinja +3 -0
  59. fabricks/cdc/templates/merges/nocdc.sql.jinja +41 -0
  60. fabricks/cdc/templates/merges/scd1.sql.jinja +73 -0
  61. fabricks/cdc/templates/merges/scd2.sql.jinja +54 -0
  62. fabricks/cdc/templates/queries/__init__.py +0 -0
  63. fabricks/cdc/templates/queries/context.sql.jinja +186 -0
  64. fabricks/cdc/templates/queries/final.sql.jinja +1 -0
  65. fabricks/cdc/templates/queries/nocdc/complete.sql.jinja +10 -0
  66. fabricks/cdc/templates/queries/nocdc/update.sql.jinja +34 -0
  67. fabricks/cdc/templates/queries/scd1.sql.jinja +85 -0
  68. fabricks/cdc/templates/queries/scd2.sql.jinja +98 -0
  69. fabricks/cdc/templates/query.sql.jinja +15 -0
  70. fabricks/context/__init__.py +72 -0
  71. fabricks/context/_types.py +133 -0
  72. fabricks/context/config/__init__.py +92 -0
  73. fabricks/context/config/utils.py +53 -0
  74. fabricks/context/log.py +77 -0
  75. fabricks/context/runtime.py +117 -0
  76. fabricks/context/secret.py +103 -0
  77. fabricks/context/spark_session.py +82 -0
  78. fabricks/context/utils.py +80 -0
  79. fabricks/core/__init__.py +4 -0
  80. fabricks/core/dags/__init__.py +9 -0
  81. fabricks/core/dags/base.py +99 -0
  82. fabricks/core/dags/generator.py +157 -0
  83. fabricks/core/dags/log.py +12 -0
  84. fabricks/core/dags/processor.py +228 -0
  85. fabricks/core/dags/run.py +39 -0
  86. fabricks/core/dags/terminator.py +25 -0
  87. fabricks/core/dags/utils.py +54 -0
  88. fabricks/core/extenders.py +33 -0
  89. fabricks/core/job_schema.py +32 -0
  90. fabricks/core/jobs/__init__.py +21 -0
  91. fabricks/core/jobs/base/__init__.py +10 -0
  92. fabricks/core/jobs/base/_types.py +284 -0
  93. fabricks/core/jobs/base/checker.py +139 -0
  94. fabricks/core/jobs/base/configurator.py +306 -0
  95. fabricks/core/jobs/base/exception.py +85 -0
  96. fabricks/core/jobs/base/generator.py +447 -0
  97. fabricks/core/jobs/base/invoker.py +206 -0
  98. fabricks/core/jobs/base/job.py +5 -0
  99. fabricks/core/jobs/base/processor.py +249 -0
  100. fabricks/core/jobs/bronze.py +395 -0
  101. fabricks/core/jobs/get_job.py +127 -0
  102. fabricks/core/jobs/get_job_conf.py +152 -0
  103. fabricks/core/jobs/get_job_id.py +31 -0
  104. fabricks/core/jobs/get_jobs.py +107 -0
  105. fabricks/core/jobs/get_schedule.py +10 -0
  106. fabricks/core/jobs/get_schedules.py +32 -0
  107. fabricks/core/jobs/gold.py +415 -0
  108. fabricks/core/jobs/silver.py +373 -0
  109. fabricks/core/masks.py +52 -0
  110. fabricks/core/parsers/__init__.py +12 -0
  111. fabricks/core/parsers/_types.py +6 -0
  112. fabricks/core/parsers/base.py +95 -0
  113. fabricks/core/parsers/decorator.py +11 -0
  114. fabricks/core/parsers/get_parser.py +26 -0
  115. fabricks/core/parsers/utils.py +69 -0
  116. fabricks/core/schedules/__init__.py +14 -0
  117. fabricks/core/schedules/diagrams.py +21 -0
  118. fabricks/core/schedules/generate.py +20 -0
  119. fabricks/core/schedules/get_schedule.py +5 -0
  120. fabricks/core/schedules/get_schedules.py +9 -0
  121. fabricks/core/schedules/process.py +9 -0
  122. fabricks/core/schedules/run.py +3 -0
  123. fabricks/core/schedules/terminate.py +6 -0
  124. fabricks/core/schedules/views.py +61 -0
  125. fabricks/core/steps/__init__.py +4 -0
  126. fabricks/core/steps/_types.py +7 -0
  127. fabricks/core/steps/base.py +423 -0
  128. fabricks/core/steps/get_step.py +10 -0
  129. fabricks/core/steps/get_step_conf.py +26 -0
  130. fabricks/core/udfs.py +106 -0
  131. fabricks/core/views.py +41 -0
  132. fabricks/deploy/__init__.py +92 -0
  133. fabricks/deploy/masks.py +8 -0
  134. fabricks/deploy/notebooks.py +71 -0
  135. fabricks/deploy/schedules.py +10 -0
  136. fabricks/deploy/tables.py +82 -0
  137. fabricks/deploy/udfs.py +19 -0
  138. fabricks/deploy/utils.py +36 -0
  139. fabricks/deploy/views.py +509 -0
  140. fabricks/metastore/README.md +3 -0
  141. fabricks/metastore/__init__.py +5 -0
  142. fabricks/metastore/_types.py +65 -0
  143. fabricks/metastore/database.py +65 -0
  144. fabricks/metastore/dbobject.py +66 -0
  145. fabricks/metastore/pyproject.toml +20 -0
  146. fabricks/metastore/table.py +768 -0
  147. fabricks/metastore/utils.py +51 -0
  148. fabricks/metastore/view.py +53 -0
  149. fabricks/utils/__init__.py +0 -0
  150. fabricks/utils/_types.py +6 -0
  151. fabricks/utils/azure_queue.py +93 -0
  152. fabricks/utils/azure_table.py +154 -0
  153. fabricks/utils/console.py +51 -0
  154. fabricks/utils/fdict.py +240 -0
  155. fabricks/utils/helpers.py +228 -0
  156. fabricks/utils/log.py +236 -0
  157. fabricks/utils/mermaid.py +32 -0
  158. fabricks/utils/path.py +242 -0
  159. fabricks/utils/pip.py +61 -0
  160. fabricks/utils/pydantic.py +94 -0
  161. fabricks/utils/read/__init__.py +11 -0
  162. fabricks/utils/read/_types.py +3 -0
  163. fabricks/utils/read/read.py +305 -0
  164. fabricks/utils/read/read_excel.py +5 -0
  165. fabricks/utils/read/read_yaml.py +33 -0
  166. fabricks/utils/schema/__init__.py +7 -0
  167. fabricks/utils/schema/get_json_schema_for_type.py +161 -0
  168. fabricks/utils/schema/get_schema_for_type.py +99 -0
  169. fabricks/utils/spark.py +76 -0
  170. fabricks/utils/sqlglot.py +56 -0
  171. fabricks/utils/write/__init__.py +8 -0
  172. fabricks/utils/write/delta.py +46 -0
  173. fabricks/utils/write/stream.py +27 -0
  174. fabricks-3.0.11.dist-info/METADATA +23 -0
  175. fabricks-3.0.11.dist-info/RECORD +176 -0
  176. fabricks-3.0.11.dist-info/WHEEL +4 -0
@@ -0,0 +1,284 @@
1
+ from dataclasses import dataclass
2
+ from typing import List, Literal, Optional, TypedDict, Union
3
+
4
+ from pydantic import BaseModel, ConfigDict, model_validator
5
+ from pyspark.sql.types import StringType, StructField, StructType
6
+
7
+ from fabricks.cdc.base._types import AllowedChangeDataCaptures
8
+ from fabricks.context import BRONZE, GOLD, SILVER
9
+ from fabricks.core.jobs.get_job_id import get_dependency_id, get_job_id
10
+ from fabricks.core.parsers import ParserOptions
11
+ from fabricks.utils.fdict import FDict
12
+ from fabricks.utils.path import Path
13
+
14
+ TBronze = Literal["bronze"]
15
+ TSilver = Literal["silver"]
16
+ TGold = Literal["gold"]
17
+ TStep = Literal[TBronze, TSilver, TGold]
18
+
19
+ Bronzes: List[TBronze] = [b.get("name") for b in BRONZE]
20
+ Silvers: List[TSilver] = [s.get("name") for s in SILVER]
21
+ Golds: List[TGold] = [g.get("name") for g in GOLD]
22
+ Steps: List[TStep] = Bronzes + Silvers + Golds
23
+
24
+ AllowedModesBronze = Literal["memory", "append", "register"]
25
+ AllowedModesSilver = Literal["memory", "append", "latest", "update", "combine"]
26
+ AllowedModesGold = Literal["memory", "append", "complete", "update", "invoke"]
27
+ AllowedModes = Literal[AllowedModesBronze, AllowedModesSilver, AllowedModesGold]
28
+
29
+ AllowedFileFormats = Literal["json_array", "json", "jsonl", "csv", "parquet", "delta"]
30
+ AllowedOperations = Literal["upsert", "reload", "delete"]
31
+ AllowedTypes = Literal["manual", "default"]
32
+ AllowedOrigins = Literal["parser", "job"]
33
+
34
+ AllowedConstraintOptions = Literal["not enforced", "deferrable", "initially deferred", "norely", "rely"]
35
+ AllowedForeignKeyOptions = Literal["match full", "on update no action", "on delete no action"]
36
+
37
+
38
+ class SparkOptions(TypedDict):
39
+ sql: Optional[dict[str, str]]
40
+ conf: Optional[dict[str, str]]
41
+
42
+
43
+ class ForeignKeyOptions(TypedDict):
44
+ foreign_key: Optional[AllowedForeignKeyOptions]
45
+ constraint: Optional[AllowedConstraintOptions]
46
+
47
+
48
+ class PrimaryKeyOptions(TypedDict):
49
+ constraint: Optional[AllowedConstraintOptions]
50
+
51
+
52
+ class ForeignKey(TypedDict):
53
+ keys: List[str]
54
+ reference: str
55
+ options: Optional[ForeignKeyOptions]
56
+
57
+
58
+ class PrimaryKey(TypedDict):
59
+ keys: List[str]
60
+ options: Optional[PrimaryKeyOptions]
61
+
62
+
63
+ class TableOptions(TypedDict):
64
+ identity: Optional[bool]
65
+ liquid_clustering: Optional[bool]
66
+ partition_by: Optional[List[str]]
67
+ zorder_by: Optional[List[str]]
68
+ cluster_by: Optional[List[str]]
69
+ powerbi: Optional[bool]
70
+ maximum_compatibility: Optional[bool]
71
+ bloomfilter_by: Optional[List[str]]
72
+ constraints: Optional[dict[str, str]]
73
+ properties: Optional[dict[str, str]]
74
+ comment: Optional[str]
75
+ calculated_columns: Optional[dict[str, str]]
76
+ masks: Optional[dict[str, str]]
77
+ comments: Optional[dict[str, str]]
78
+ retention_days: Optional[int]
79
+ primary_key: Optional[dict[str, PrimaryKey]]
80
+ foreign_keys: Optional[dict[str, ForeignKey]]
81
+
82
+
83
+ class _InvokeOptions(TypedDict):
84
+ notebook: str
85
+ timeout: int
86
+ arguments: Optional[dict[str, str]]
87
+
88
+
89
+ class InvokerOptions(TypedDict):
90
+ pre_run: Optional[List[_InvokeOptions]]
91
+ run: Optional[List[_InvokeOptions]]
92
+ post_run: Optional[List[_InvokeOptions]]
93
+
94
+
95
+ class ExtenderOptions(TypedDict):
96
+ extender: str
97
+ arguments: Optional[dict[str, str]]
98
+
99
+
100
+ class CheckOptions(TypedDict):
101
+ skip: Optional[bool]
102
+ pre_run: Optional[bool]
103
+ post_run: Optional[bool]
104
+ min_rows: Optional[int]
105
+ max_rows: Optional[int]
106
+ count_must_equal: Optional[str]
107
+
108
+
109
+ class BronzeOptions(TypedDict):
110
+ type: Optional[AllowedTypes]
111
+ mode: AllowedModesBronze
112
+ uri: str
113
+ parser: str
114
+ source: str
115
+ keys: Optional[List[str]]
116
+ # default
117
+ parents: Optional[List[str]]
118
+ filter_where: Optional[str]
119
+ optimize: Optional[bool]
120
+ compute_statistics: Optional[bool]
121
+ vacuum: Optional[bool]
122
+ no_drop: Optional[bool]
123
+ # extra
124
+ encrypted_columns: Optional[List[str]]
125
+ calculated_columns: Optional[dict[str, str]]
126
+ operation: Optional[AllowedOperations]
127
+ timeout: Optional[int]
128
+
129
+
130
+ class SilverOptions(TypedDict):
131
+ type: Optional[AllowedTypes]
132
+ mode: AllowedModesSilver
133
+ change_data_capture: AllowedChangeDataCaptures
134
+ # default
135
+ parents: Optional[List[str]]
136
+ filter_where: Optional[str]
137
+ optimize: Optional[bool]
138
+ compute_statistics: Optional[bool]
139
+ vacuum: Optional[bool]
140
+ no_drop: Optional[bool]
141
+ # extra
142
+ deduplicate: Optional[bool]
143
+ stream: Optional[bool]
144
+ # else
145
+ order_duplicate_by: Optional[dict[str, str]]
146
+ timeout: Optional[int]
147
+
148
+
149
+ class GoldOptions(TypedDict):
150
+ type: Optional[AllowedTypes]
151
+ mode: AllowedModesGold
152
+ change_data_capture: AllowedChangeDataCaptures
153
+ update_where: Optional[str]
154
+ # default
155
+ parents: Optional[List[str]]
156
+ optimize: Optional[bool]
157
+ compute_statistics: Optional[bool]
158
+ vacuum: Optional[bool]
159
+ no_drop: Optional[bool]
160
+ # extra
161
+ deduplicate: Optional[bool] # remove duplicates on the keys and on the hash
162
+ rectify_as_upserts: Optional[bool] # convert reloads into upserts and deletes
163
+ correct_valid_from: Optional[bool] # update valid_from to '1900-01-01' for the first timestamp
164
+ persist_last_timestamp: Optional[bool] # persist the last timestamp to be used as a watermark for the next run
165
+ # delete_missing: Optional[bool] # delete missing records on update (to be implemented)
166
+ # else
167
+ table: Optional[str]
168
+ notebook: Optional[bool]
169
+ requirements: Optional[bool]
170
+ timeout: Optional[int]
171
+ metadata: Optional[bool]
172
+
173
+
174
+ StepOptions = Union[BronzeOptions, SilverOptions, GoldOptions]
175
+
176
+
177
+ @dataclass
178
+ class BaseJobConf:
179
+ job_id: str
180
+ topic: str
181
+ item: str
182
+
183
+
184
+ @dataclass
185
+ class JobConfBronze(BaseJobConf):
186
+ step: TBronze
187
+ options: BronzeOptions
188
+ table_options: Optional[TableOptions] = None
189
+ parser_options: Optional[ParserOptions] = None
190
+ check_options: Optional[CheckOptions] = None
191
+ spark_options: Optional[SparkOptions] = None
192
+ invoker_options: Optional[InvokerOptions] = None
193
+ extender_options: Optional[List[ExtenderOptions]] = None
194
+ tags: Optional[List[str]] = None
195
+ comment: Optional[str] = None
196
+
197
+
198
+ @dataclass
199
+ class JobConfSilver(BaseJobConf):
200
+ step: TSilver
201
+ options: SilverOptions
202
+ table_options: Optional[TableOptions] = None
203
+ check_options: Optional[CheckOptions] = None
204
+ spark_options: Optional[SparkOptions] = None
205
+ invoker_options: Optional[InvokerOptions] = None
206
+ extender_options: Optional[List[ExtenderOptions]] = None
207
+ tags: Optional[List[str]] = None
208
+ comment: Optional[str] = None
209
+
210
+
211
+ @dataclass
212
+ class JobConfGold(BaseJobConf):
213
+ step: TGold
214
+ options: Optional[GoldOptions]
215
+ table_options: Optional[TableOptions] = None
216
+ check_options: Optional[CheckOptions] = None
217
+ spark_options: Optional[SparkOptions] = None
218
+ invoker_options: Optional[InvokerOptions] = None
219
+ extender_options: Optional[List[ExtenderOptions]] = None
220
+ tags: Optional[List[str]] = None
221
+ comment: Optional[str] = None
222
+
223
+
224
+ JobConf = Union[JobConfBronze, JobConfSilver, JobConfGold]
225
+
226
+
227
+ @dataclass
228
+ class Paths:
229
+ storage: Path
230
+ tmp: Path
231
+ checkpoints: Path
232
+ commits: Path
233
+ schema: Path
234
+ runtime: Path
235
+
236
+
237
+ @dataclass
238
+ class Options:
239
+ job: FDict
240
+ check: FDict
241
+ table: FDict
242
+ spark: FDict
243
+ invokers: FDict
244
+ extenders: List
245
+
246
+
247
+ class JobDependency(BaseModel):
248
+ model_config = ConfigDict(extra="forbid", frozen=True)
249
+ origin: AllowedOrigins
250
+ job_id: str
251
+ parent: str
252
+ parent_id: str
253
+ dependency_id: str
254
+
255
+ def __str__(self) -> str:
256
+ return f"{self.job_id} -> {self.parent}"
257
+
258
+ @model_validator(mode="after")
259
+ def check_no_circular_dependency(self):
260
+ if self.job_id == self.parent_id:
261
+ raise ValueError("Circular dependency detected")
262
+ return self
263
+
264
+ @staticmethod
265
+ def from_parts(job_id: str, parent: str, origin: AllowedOrigins):
266
+ parent = parent.removesuffix("__current")
267
+ return JobDependency(
268
+ job_id=job_id,
269
+ origin=origin,
270
+ parent=parent,
271
+ parent_id=get_job_id(job=parent),
272
+ dependency_id=get_dependency_id(parent=parent, job_id=job_id),
273
+ )
274
+
275
+
276
+ SchemaDependencies = StructType(
277
+ [
278
+ StructField("dependency_id", StringType(), True),
279
+ StructField("origin", StringType(), True),
280
+ StructField("job_id", StringType(), True),
281
+ StructField("parent_id", StringType(), True),
282
+ StructField("parent", StringType(), True),
283
+ ]
284
+ )
@@ -0,0 +1,139 @@
1
+ from typing import Literal
2
+
3
+ from fabricks.context.log import DEFAULT_LOGGER
4
+ from fabricks.core.jobs.base.exception import (
5
+ PostRunCheckException,
6
+ PostRunCheckWarning,
7
+ PreRunCheckException,
8
+ PreRunCheckWarning,
9
+ SkipRunCheckWarning,
10
+ )
11
+ from fabricks.core.jobs.base.generator import Generator
12
+
13
+
14
class Checker(Generator):
    """Data-quality checks for a job: SQL-driven pre/post-run checks, row-count
    bounds, duplicate detection and skip-run evaluation.

    All checks raise a *CheckException / *CheckWarning from
    fabricks.core.jobs.base.exception; they never return a status.
    """

    def check_pre_run(self):
        """Run the job's pre-run SQL check, if configured."""
        self._check("pre_run")

    def check_post_run(self):
        """Run the job's post-run SQL check, if configured."""
        self._check("post_run")

    def _check(self, position: Literal["pre_run", "post_run"]):
        """Execute the job's <position>.sql check query and react to its rows.

        The query must return __action ('fail' or 'warning') and __message
        columns. Fix: every offending row is now logged before raising once
        (previously the exception was raised while iterating, so only the
        first offending row was ever reported). The raised message is still
        the first row's message, so the exception contract is unchanged.
        """
        if not self.options.check.get(position):
            return

        DEFAULT_LOGGER.debug(f"check {position}", extra={"label": self})

        p = self.paths.runtime.append(f".{position}.sql")
        assert p.exists(), f"{position} check not found ({p})"

        df = self.spark.sql(p.get_sql())
        fail_df = df.where("__action == 'fail'")
        warning_df = df.where("__action == 'warning'")

        if not fail_df.isEmpty():
            fails = fail_df.collect()
            for row in fails:
                DEFAULT_LOGGER.warning(
                    f"check {position} failed due to {row['__message']}",
                    extra={"label": self},
                )

            message = fails[0]["__message"]
            if position == "pre_run":
                raise PreRunCheckException(message, dataframe=df)
            raise PostRunCheckException(message, dataframe=df)

        # warnings are only considered when there is no failure
        if not warning_df.isEmpty():
            warnings = warning_df.collect()
            for row in warnings:
                DEFAULT_LOGGER.warning(
                    f"check {position} failed due to {row['__message']}",
                    extra={"label": self},
                )

            message = warnings[0]["__message"]
            if position == "pre_run":
                raise PreRunCheckWarning(message, dataframe=df)
            raise PostRunCheckWarning(message, dataframe=df)

    def check_post_run_extra(self):
        """Enforce the min_rows / max_rows / count_must_equal row-count checks.

        Raises PostRunCheckException on the first violated bound.
        """
        min_rows = self.options.check.get("min_rows")
        max_rows = self.options.check.get("max_rows")
        count_must_equal = self.options.check.get("count_must_equal")

        if not (min_rows or max_rows or count_must_equal):
            return

        df = self.spark.sql(f"select count(*) from {self}")
        rows = df.collect()[0][0]

        if min_rows:
            DEFAULT_LOGGER.debug("check min rows", extra={"label": self})
            if rows < min_rows:
                raise PostRunCheckException(f"min rows check failed ({rows} < {min_rows})", dataframe=df)

        if max_rows:
            DEFAULT_LOGGER.debug("check max rows", extra={"label": self})
            if rows > max_rows:
                raise PostRunCheckException(f"max rows check failed ({rows} > {max_rows})", dataframe=df)

        if count_must_equal:
            DEFAULT_LOGGER.debug("check count must equal", extra={"label": self})
            equals_rows = self.spark.read.table(count_must_equal).count()
            if rows != equals_rows:
                raise PostRunCheckException(
                    f"count must equal check failed ({count_must_equal} - {rows} != {equals_rows})",
                    dataframe=df,
                )

    def _check_duplicate_in_column(self, column: str):
        """Raise PostRunCheckException when `column` contains duplicates.

        The grouping is widened with __source and the CDC-specific columns
        (__valid_to / __timestamp) so that only rows expected to be unique
        are compared; at most 5 duplicate values are reported.
        """
        if column not in self.table.columns:
            DEFAULT_LOGGER.debug(f"could not find {column}", extra={"label": self})
            return

        DEFAULT_LOGGER.debug(f"check duplicate in {column}", extra={"label": self})

        group_cols = [column]

        if "__source" in self.table.columns:
            group_cols.append("__source")

        if self.change_data_capture == "scd2":
            # scd2 keeps history: only the open (same __valid_to) rows must be unique
            group_cols.append("__valid_to")
        elif self.change_data_capture == "nocdc":
            if "__valid_to" in self.table.columns:
                group_cols.append("__valid_to")
            elif self.mode == "append" and "__timestamp" in self.table.columns:
                group_cols.append("__timestamp")

        # fix: keep the list and the SQL string as separate names
        # (previously `cols` was rebound from List[str] to str)
        select_list = ", ".join(group_cols)
        df = self.spark.sql(f"select {select_list} from {self} group by all having count(*) > 1 limit 5")

        if not df.isEmpty():
            duplicates = ",".join([str(row[column]) for row in df.collect()])
            raise PostRunCheckException(
                f"duplicate {column} check failed ({duplicates})",
                dataframe=df,
            )

    def check_duplicate_key(self):
        """Fail when the table contains duplicate __key values."""
        self._check_duplicate_in_column("__key")

    def check_duplicate_hash(self):
        """Fail when the table contains duplicate __hash values."""
        self._check_duplicate_in_column("__hash")

    def check_duplicate_identity(self):
        """Fail when the table contains duplicate __identity values."""
        self._check_duplicate_in_column("__identity")

    def check_skip_run(self):
        """Evaluate the job's .skip.sql query and raise SkipRunCheckWarning when
        any returned row has __skip set.

        Fix: all skip rows are logged before raising once (previously the
        warning was raised inside the loop, hiding all but the first row).
        """
        if not self.options.check.get("skip"):
            return

        DEFAULT_LOGGER.debug("check if run should be skipped", extra={"label": self})

        p = self.paths.runtime.append(".skip.sql")
        assert p.exists(), "skip check not found"

        df = self.spark.sql(p.get_sql())
        skip_df = df.where("__skip")

        if not skip_df.isEmpty():
            skips = skip_df.collect()
            for row in skips:
                DEFAULT_LOGGER.warning(
                    f"skip run due to {row['__message']}",
                    extra={"label": self},
                )

            raise SkipRunCheckWarning(skips[0]["__message"], dataframe=df)