AQF 1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AQF/__init__.py +1 -0
- AQF/aqf_dataset_rules.py +138 -0
- AQF/aqf_engine.py +377 -0
- AQF/aqf_logging.py +192 -0
- AQF/aqf_row_rules.py +168 -0
- AQF/aqf_rule_retriever.py +97 -0
- AQF/aqf_utils.py +74 -0
- aqf-1.1.dist-info/METADATA +9 -0
- aqf-1.1.dist-info/RECORD +11 -0
- aqf-1.1.dist-info/WHEEL +5 -0
- aqf-1.1.dist-info/top_level.txt +1 -0
AQF/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import AQF.aqf_dataset_rules as aqf_dataset_rules, AQF.aqf_engine as aqf_engine, AQF.aqf_logging as aqf_logging, AQF.aqf_row_rules as aqf_row_rules, AQF.aqf_rule_retriever as aqf_rule_retriever, AQF.aqf_utils as aqf_utils
|
AQF/aqf_dataset_rules.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
from pyspark.sql import DataFrame
|
|
2
|
+
from .aqf_utils import create_spark, Status
|
|
3
|
+
from pyspark.sql.functions import col
|
|
4
|
+
from functools import reduce
|
|
5
|
+
import operator
|
|
6
|
+
|
|
7
|
+
def join_basic_inner_count_check(test, df: DataFrame, **kwargs) -> dict:
    """
    Check referential integrity by inner-joining ``df`` with a lookup table.

    Args:
        test: Test configuration row. Attributes read here:
            - join_table: name of the table to join against (loaded via Spark SQL)
            - columns: string holding a Python list literal of source column names
            - join_column: string holding a Python list literal of target column names
        df (DataFrame): Source DataFrame to validate.
        **kwargs: Must contain ``spark``, the SparkSession used to load the join table.

    Returns:
        dict: Logging values with the keys
            - status: Status.PASS if the join yields at least one row, else Status.WARNING
            - new_data_count: number of rows in ``df``
            - join_count: number of rows produced by the inner join

    Raises:
        KeyError: If ``spark`` is missing from ``kwargs``.
        Exception: If the number of source columns and join columns differ.
    """
    import ast

    spark = kwargs["spark"]

    # Load reference table for join operation.
    # NOTE(review): join_table is interpolated into SQL unescaped; it must come
    # from trusted test configuration only.
    df_join_target = spark.sql(f"SELECT * FROM {test.join_table}").alias("df_join_target")
    df_comp = df.alias("df_comp")

    # The column lists are stored as text. literal_eval only accepts Python
    # literals, so configuration values can no longer execute arbitrary code
    # (the previous implementation used eval here).
    columns = ast.literal_eval(test.columns)
    join_columns = ast.literal_eval(test.join_column)

    if len(columns) != len(join_columns):
        raise Exception("column count doesn't match join column count")

    # Pairwise equality of source and target columns, combined with AND
    join_condition = reduce(
        operator.and_,
        [col(f"df_comp.{c1}") == col(f"df_join_target.{c2}") for c1, c2 in zip(columns, join_columns)]
    )

    joined_df = df_comp.join(df_join_target, join_condition, "inner")

    join_count = joined_df.count()     # rows of the inner join
    new_data_count = df_comp.count()   # total source rows

    return {
        "status": Status.PASS if join_count > 0 else Status.WARNING,  # PASS if any matches found
        "new_data_count": new_data_count,
        "join_count": join_count,
    }
|
|
73
|
+
|
|
74
|
+
#----------------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def unique_check(test, df: DataFrame, **kwargs) -> dict:
    """
    Check uniqueness by comparing the distinct row count with the total row count.

    No rows are removed or quarantined here: with duplicates there is no way
    to decide which of the rows is the "bad" one.

    Args:
        test: Test configuration row. Attribute read here:
            - columns: string holding a Python list literal of column names.
              An empty list (or a missing value) means the whole row must be unique.
        df (DataFrame): Source DataFrame to validate.
        **kwargs: Accepted and ignored, so that the engine can call every
            dataset rule with the same connection arguments.

    Returns:
        dict: Logging values with the keys
            - status: Status.PASS if every (projected) row is distinct, else Status.WARNING
            - total_count: number of rows in ``df``
            - unique_count: number of distinct (projected) rows
            - difference: total_count - unique_count
    """
    import ast

    # literal_eval instead of eval: the stored value can only be a literal.
    # A missing value is treated like an empty list (previously it crashed).
    columns = ast.literal_eval(test.columns) if test.columns else None

    if not columns:
        unique_count = df.distinct().count()
    else:
        unique_count = df.select(columns).distinct().count()

    total_count = df.count()

    status = Status.PASS if unique_count == total_count else Status.WARNING

    return {
        "status": status,
        "total_count": total_count,
        "unique_count": unique_count,
        "difference": total_count - unique_count,  # surplus rows caused by duplicates
    }
|
|
138
|
+
|
AQF/aqf_engine.py
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
from .aqf_utils import generate_run_id, create_spark, Status, normalize_timestamps
|
|
2
|
+
from .aqf_rule_retriever import *
|
|
3
|
+
from .aqf_dataset_rules import *
|
|
4
|
+
from .aqf_row_rules import *
|
|
5
|
+
from .aqf_logging import result_writer, bad_data_writer
|
|
6
|
+
|
|
7
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
8
|
+
from pyspark.sql.functions import max
|
|
9
|
+
from functools import reduce
|
|
10
|
+
from datetime import datetime, timezone
|
|
11
|
+
|
|
12
|
+
import notebookutils
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AQF_Engine():
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
Class that manages the DQF testing procedure
|
|
19
|
+
|
|
20
|
+
Args:
|
|
21
|
+
run_id: id string to make every test run unique
|
|
22
|
+
job_id: id for every hsi session?
|
|
23
|
+
config: name of the variable in the workspace
|
|
24
|
+
spark: Optional SparkSession to use. If not provided, the active session is used.
|
|
25
|
+
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
    self,
    job_id: str,
    run_id: str,
    config: str,
    spark: SparkSession | None = None
):
    """
    Set up the engine state for one test session.

    Args:
        job_id: identifier of the surrounding job; stored on the instance
        run_id: id string that makes every test run unique
        config: name of the variable library to load via notebookutils
        spark: SparkSession to use; a session is created via create_spark() when omitted
    """
    self.spark = create_spark() if spark is None else spark
    # job_id used to be accepted and silently dropped; keep it for callers/logging
    self.job_id = job_id
    self.run_id = run_id

    # list of the outcomes of each test
    self.status_list = []

    # env variables
    self.config = notebookutils.variableLibrary.getLibrary(config)

    # represents the tested and updated dataframe
    self.consolidated_df = None

    # set to True by run_tests once a test ends with Status.FAIL
    self.cancel = False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def run_tests(
    self,
    table_path: str,
    table_name: str,
    df: DataFrame,
    check_type: str | None = None,
    spark: SparkSession | None = None,
) -> DataFrame:
    """
    Run every configured data quality test for one table.

    Retrieves the tests registered for ``table_path.table_name``, loads the
    matching rules, executes each test through ``self.testing`` and prints a
    summary.

    Args:
        table_path: prefix of the table id; joined with ``table_name`` by a dot
        table_name: table name; also used as the quarantine table name
        df (DataFrame): input Spark DataFrame to validate
        check_type: currently unused
        spark: optional SparkSession that replaces the engine's session

    Returns:
        DataFrame: ``df`` unchanged when no tests are configured, otherwise
        ``self.consolidated_df`` (``df`` reduced by the row-level tests that
        returned a filtered DataFrame).

    Raises:
        IndexError: If a test references a rule_id that is not in the rule table.

    Example:
        >>> clean_df = engine.run_tests(table_path="lh_bronze", table_name="bronze_hsi", df=df)
    """
    if spark is not None:
        self.spark = spark
    elif self.spark is None:
        self.spark = create_spark()

    # table id used to retrieve the applicable tests from the metadata table
    self.table_id = table_path + "." + table_name
    self.table_name = table_name

    # clean df for return after tests are done
    self.consolidated_df = df

    print(f"Starting Data Quality checks for {self.table_id}")

    df_tests = retrieve_aqf_tests_by_table_id(
        table_id=self.table_id,
        test_table=self.config.test_table,
        spark=self.spark,
        jdbc_url=self.config.jdbc_url,
    )
    tests = df_tests.collect()
    number_of_tests = len(tests)
    if not number_of_tests:
        print(f"No tests found for {self.table_id}")
        return df

    # The rows are already collected, so no second Spark job / pandas round trip is needed
    rule_ids = [test.rule_id for test in tests]
    rules = retrieve_aqf_rules_by_id(
        rule_ids,
        spark=self.spark,
        jdbc_url=self.config.jdbc_url,
        rule_table=self.config.rule_table,
    )

    # Collect the rules once instead of running one filter job per test.
    # setdefault keeps the first row per rule_id.
    rules_by_id = {}
    for rule in rules.collect():
        rules_by_id.setdefault(rule.rule_id, rule)

    # Execute each test sequentially
    for test in tests:
        print(test)
        rule = rules_by_id.get(test.rule_id)
        if rule is None:
            # same exception type as the former rule[0] lookup, with a usable message
            raise IndexError(f"no rule with rule_id {test.rule_id} found for test {test.test_id}")
        self.testing(test=test, df=df, rule=rule)

        # remember that a critical test failed; the remaining tests still run
        if self.status_list[-1] == Status.FAIL:
            self.cancel = True

    self.output_results(number_of_tests)

    return self.consolidated_df
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def consolidate(self, new_df: DataFrame):
    """
    Remove the rows a row-level test dropped from the consolidated DataFrame.

    Args:
        new_df: the DataFrame returned by a test (the rows that may stay)
    """
    # intersectAll keeps duplicate rows. The former intersect() is INTERSECT
    # DISTINCT and silently de-duplicated the whole consolidated DataFrame,
    # even when the test had not removed anything.
    self.consolidated_df = self.consolidated_df.intersectAll(new_df)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def output_results(self, number_of_tests: int):
    """
    Print a one-line summary of the outcomes recorded in ``self.status_list``.

    Warnings are reported together with failures in the "Failed" figure.

    Args:
        number_of_tests: number of tests retrieved at the start of the run
    """
    count_of = self.status_list.count

    errors = count_of(Status.ERROR)
    passed = count_of(Status.PASS)
    not_passed = count_of(Status.FAIL) + count_of(Status.WARNING)

    # Summary output for monitoring (minimal logging)
    print(f"AQF Results - Tests: {number_of_tests} | Passed: {passed} | Failed: {not_passed} | Errors: {errors}")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def get_kwargs(self, connection: str) -> dict:
    """
    Build the extra keyword arguments a rule needs for its connections.

    Args:
        connection: string holding a Python list literal of connection names.
            "spark" adds the engine's SparkSession, "sql" adds the JDBC url;
            other names are ignored. An empty or missing value yields no arguments.

    Returns:
        dict with the optional keys ``spark`` and ``jdbc_url``.

    Raises:
        ValueError / SyntaxError: If ``connection`` is not a Python literal.
    """
    import ast

    if not connection:
        return {}

    kwargs = {}
    # literal_eval instead of eval: configuration text must not be executable
    for name in ast.literal_eval(connection):
        if name == "spark":
            kwargs["spark"] = self.spark
        elif name == "sql":
            kwargs["jdbc_url"] = self.config.jdbc_url
    return kwargs
|
|
171
|
+
|
|
172
|
+
def row_level(self, test: dict, df: DataFrame, rule: dict, **kwargs) -> dict:
    """
    Run one row-based rule and process the DataFrames it returns.

    Args:
        test: row with the test info
        df: DataFrame to evaluate
        rule: row with the rule info; ``rule.name`` is the name of the rule function
        **kwargs: connection arguments forwarded to the rule function

    Returns:
        The rule's result values (logging data).

    Raises:
        NameError: If ``rule.name`` is not a callable known to this module.
    """
    # Resolve the rule function by name from the module namespace instead of
    # eval(), so a rule name can never execute an arbitrary expression.
    rule_function = globals().get(rule.name)
    if not callable(rule_function):
        raise NameError(f"name '{rule.name}' is not defined")

    # results = (result_values, good_df, bad_df)
    results = rule_function(test=test, df=df, **kwargs)
    result_values, good_df, bad_df = results[0], results[1], results[2]

    # Write bad data to the quarantine table for later analysis.
    # "is not None" states what the former truthiness test did for a DataFrame.
    if bad_df is not None:
        bad_data_writer(
            table_name=self.table_name,
            df=bad_df,
            run_id=self.run_id,
            spark=self.spark,
            jdbc_url=self.config.jdbc_url,
            quarantine_table=self.config.quarantine_table,
        )
    if good_df is not None:
        self.consolidate(good_df)

    return result_values
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def testing(self, test: dict, df: DataFrame, rule: dict):
    """
    Run a single test and log it.

    A "running" log entry is written first, then the rule is executed, and
    finally the outcome is logged. A failure while logging is printed and
    does not abort the test run. The outcome is appended to ``self.status_list``.

    Args:
        test: row with the test info
        df: DataFrame to evaluate
        rule: row with the rule info (``name``, ``rule_type``, ``connection``)
    """
    start_time = datetime.now(timezone.utc)

    def _write_log(**extra):
        # All attribute reads happen at call time, i.e. inside the caller's try block
        result_writer(
            log_path=self.config.log_table,
            run_id=self.run_id,
            test_id=test.test_id,
            rule_id=test.rule_id,
            description=test.description,
            table_id=test.table_id,
            start_time=start_time,
            bad_data_action=test.bad_data_action,
            criticality=test.criticality,
            spark=self.spark,
            jdbc_url=self.config.jdbc_url,
            **extra,
        )

    try:
        _write_log(result_values={})
        print("logging (start) successful")
    except Exception as e:
        # best effort as before, but no longer silent
        print(f"Logging (start) failed: {e}")

    error_message = None
    try:
        # Inside the try block so that a broken connection setting is recorded
        # as an ERROR for this test instead of aborting the whole run.
        kwargs = self.get_kwargs(rule.connection)

        if rule.rule_type == "dataset":
            # dataset level: resolve the rule function by name (no eval)
            rule_function = globals().get(rule.name)
            if not callable(rule_function):
                raise NameError(f"name '{rule.name}' is not defined")
            result_values = rule_function(test, df, **kwargs)
        elif rule.rule_type == "row":
            # row level
            result_values = self.row_level(test=test, df=df, rule=rule, **kwargs)
        else:
            raise Exception(f"Invalid rule type {rule.rule_type}")

        # a warning on a critical test counts as a failure
        if result_values["status"] == Status.WARNING and test.criticality:
            result_values["status"] = Status.FAIL

        self.status_list.append(result_values["status"])
        print(f"Test successful: {test.test_id}")
    except Exception as e:
        # Handle unexpected errors during test execution
        print(f"Testing (id: {test.test_id}) failed: {e}")
        error_message = str(e)
        result_values = {"status": Status.ERROR}
        self.status_list.append(Status.ERROR)

    end_time = datetime.now(timezone.utc)

    try:
        _write_log(
            end_time=end_time,
            result_values=result_values,
            error_message=error_message,
        )
        print("Logging successful")
    except Exception as e:
        print(f"Logging failed: {e}")
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def is_critical(self):
    """
    Return the engine's cancel flag.

    The flag is set by run_tests once a test ends with Status.FAIL.
    """
    cancel_flag = self.cancel
    return cancel_flag
|
|
290
|
+
|
|
291
|
+
def get_fail_count(self):
    """Return how many recorded test outcomes are Status.FAIL."""
    recorded_outcomes = self.status_list
    return recorded_outcomes.count(Status.FAIL)
|
|
293
|
+
|
|
294
|
+
def get_log_table(self):
    """Read the configured log table through the engine's Spark session and return it."""
    reader = self.spark.read.option("url", self.config.jdbc_url)
    return reader.mssql(self.config.log_table)
|
|
297
|
+
|
|
298
|
+
def create_test(
    jdbc_url: str,
    test_table: str,
    rule_table: str,
    rule_id: int,
    stage: str,
    table_id: str,
    desc: str = None,
    columns: list[str] = None,
    expression: str = None,
    join_table: str = None,
    join_column: str = None,
    bad_data_action: str = "process",  # in case no bad data action is given for row based test
    citicality: bool = False,  # misspelled name kept for backward compatibility
    spark: SparkSession | None = None,
    criticality: bool | None = None,  # correctly spelled alias; wins over citicality when given
) -> int:
    """
    Create an entry in the test table with all necessary values.

    Args:
        jdbc_url: connection url of the SQL database
        test_table: table the new test is appended to
        rule_table: table used to verify that ``rule_id`` exists
        rule_id: id of the rule the test uses
        stage: stored as-is in the ``stage`` column
        table_id: table under test; must be loadable as a delta path or via Spark SQL
        desc: optional description
        columns: column names; stored as the text of a Python list literal
        expression: optional expression for comparison rules
        join_table: optional lookup table for join rules
        join_column: join column(s); a list is stored as the text of a list literal
        bad_data_action: action for failed rows, "process" by default
        citicality: deprecated spelling of ``criticality``
        spark: SparkSession to use; created via create_spark() when omitted
        criticality: whether a warning of this test counts as a failure

    Returns:
        int: the test_id of the new entry.

    Raises:
        Exception: If the rule or the table does not exist.

    Note:
        The new id is max(test_id) + 1 read before the insert, so two
        concurrent callers can obtain the same id.
    """
    from pyspark.sql.functions import max as spark_max
    from pyspark.sql.types import BooleanType, LongType, StringType, StructField, StructType

    spark = create_spark() if spark is None else spark
    if criticality is None:
        criticality = citicality

    def _as_text(value):
        # list-like values are stored as a Python literal so the rules can parse them back
        if value is None or isinstance(value, str):
            return value
        return repr(list(value))

    # create new test_id (starts at 1 for an empty table instead of failing on None + 1)
    tests = spark.read.option("url", jdbc_url).mssql(test_table)
    current_max = tests.select(spark_max(tests.test_id).alias("test_id")).collect()[0][0]
    test_id = (current_max or 0) + 1

    # check rule_id validity; head(1) is an empty list when no row matches
    # (a DataFrame object itself is always truthy, so the old check never fired)
    rules = spark.read.option("url", jdbc_url).mssql(rule_table)
    if not rules.filter(rules.rule_id == rule_id).head(1):
        raise Exception(f"rule {rule_id} doesn't exist")

    # check for table: delta path, then SQL name, then workspace-qualified SQL name
    probes = (
        lambda: spark.read.format("delta").load(table_id),
        lambda: spark.sql(f"""SELECT * FROM {table_id}"""),
        lambda: spark.sql(
            f"""SELECT * FROM {notebookutils.runtime.context.get("currentWorkspaceName")}.{table_id}"""
        ),
    )
    for probe in probes:
        try:
            probe()
            break
        except Exception:
            continue
    else:
        raise Exception(f"table {table_id} doesn't exist")

    schema = StructType([
        StructField("test_id", LongType(), True),
        StructField("rule_id", LongType(), True),
        StructField("description", StringType(), True),
        StructField("stage", StringType(), True),
        StructField("table_id", StringType(), True),
        StructField("columns", StringType(), True),
        StructField("expression", StringType(), True),
        StructField("join_table", StringType(), True),
        StructField("join_column", StringType(), True),
        StructField("bad_data_action", StringType(), True),
        StructField("criticality", BooleanType(), True),
    ])

    data = (
        test_id,
        rule_id,
        desc,
        stage,
        table_id,
        _as_text(columns),
        expression,
        join_table,
        _as_text(join_column),
        bad_data_action,
        criticality,
    )

    # append only the new row (the old code re-appended the whole existing table)
    new_test = spark.createDataFrame(data=[data], schema=schema)
    new_test.write.option("url", jdbc_url).mode("append").mssql(test_table)

    return test_id
|
AQF/aqf_logging.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
from datetime import datetime, timezone
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import json
|
|
4
|
+
import ast
|
|
5
|
+
|
|
6
|
+
from pyspark.sql.functions import lit
|
|
7
|
+
from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, TimestampType
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from pyspark.sql import DataFrame, SparkSession, Row
|
|
10
|
+
import notebookutils
|
|
11
|
+
from .aqf_utils import create_spark, Status, normalize_timestamps
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def result_writer(
    log_path: str,
    run_id: str,
    test_id: int,
    rule_id: int,
    description: str,
    table_id: str,
    start_time: datetime,
    criticality: bool,
    bad_data_action: str,
    result_values,
    jdbc_url: str,
    spark: SparkSession,
    end_time: Optional[datetime] = None,
    error_message: Optional[str] = None,
    job_run_id: int = 0

) -> None:

    """
    Append one test log row to the table ``dbo.{log_path}``.

    Args:
        log_path: name of the log table (written as ``dbo.{log_path}``)
        run_id: unique identifier for each test session
        test_id (int): identifier of the executed test
        rule_id (int): identifier of the executed rule
        description (str): human-readable description of the test
        table_id (str): id of the tested table
        start_time (datetime): test execution start timestamp
        criticality: criticality flag of the test
        bad_data_action: action configured for failed records
        result_values: mapping with test-specific metrics. It is stored as JSON;
            ``new_data_count``, ``rows_test_failed`` and ``rows_test_passed``
            are additionally copied into their own columns when present.
        jdbc_url: connection url of the SQL database
        spark: SparkSession used to build and write the row
        end_time (datetime): completion timestamp. While it is missing, the
            status column is "running"; otherwise the status is taken from
            ``result_values["status"].as_name()``.
        error_message (Optional[str]): exception details, if any
        job_run_id (int): job run identifier, 0 by default
    """
    # A missing end time marks the entry written before the test runs
    status = result_values["status"].as_name() if end_time else "running"

    # (column name, column type, value) - one entry per column of the log table
    log_columns = [
        ('run_id', StringType(), run_id),
        ('job_run_id', LongType(), job_run_id),
        ('test_id', LongType(), test_id),
        ('rule_id', LongType(), rule_id),
        ('criticality', BooleanType(), criticality),
        ('description', StringType(), description),
        ('table_id', StringType(), table_id),
        ('start_time', TimestampType(), start_time),
        ('end_time', TimestampType(), end_time),
        ('status', StringType(), status),
        ('result_values', StringType(), json.dumps(result_values)),
        # optional metrics; None when the test did not report them
        ('new_data_count', LongType(), result_values.get("new_data_count")),
        ('rows_test_failed', LongType(), result_values.get("rows_test_failed")),
        ('rows_test_passed', LongType(), result_values.get("rows_test_passed")),
        ('bad_data_action', StringType(), bad_data_action),
        ('error_message', StringType(), error_message),
    ]

    schema = StructType([StructField(name, dtype, True) for name, dtype, _ in log_columns])
    log_row = tuple(value for _, _, value in log_columns)

    log_df = normalize_timestamps(spark.createDataFrame(data=[log_row], schema=schema))

    log_df.write.option("url", jdbc_url).mode("append").mssql(f"dbo.{log_path}")
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def bad_data_writer(table_name: str, df: DataFrame, run_id: str, spark: SparkSession, jdbc_url: str, quarantine_table: str) -> None:
    """
    Append rows that failed a test to the quarantine table.

    A ``run_id`` column is added to ``df`` and the result is appended to
    ``{quarantine_table}.{table_name}``. If that fails, the write is retried
    against ``{workspace}.{quarantine_table}.dbo.{table_name}``, where the
    workspace name is read from notebookutils.

    Args:
        table_name (str): name for the quarantine table
        df (DataFrame): rows to store
        run_id (str): unique identifier of the test run, stored in every row
        spark: SparkSession
        jdbc_url: connection string to the SQL database (not used here)
        quarantine_table: path to the quarantine lakehouse
    """
    df = df.withColumn("run_id", lit(run_id))

    try:
        ddl_error_table = (
            f"""CREATE TABLE IF NOT EXISTS {quarantine_table}.{table_name}""")
        spark.sql(ddl_error_table)

        df.write.mode("append").option("mergeSchema", "true").saveAsTable(f"{quarantine_table}.{table_name}")
    except Exception:
        # Exception instead of a bare except: KeyboardInterrupt / SystemExit
        # must not trigger the fallback write.
        ws = notebookutils.runtime.context.get("currentWorkspaceName")
        ddl_error_table = (
            f"""CREATE TABLE IF NOT EXISTS {ws}.{quarantine_table}.dbo.{table_name}""")
        spark.sql(ddl_error_table)

        df.write.mode("append").option("mergeSchema", "true").saveAsTable(f"{ws}.{quarantine_table}.dbo.{table_name}")
|
AQF/aqf_row_rules.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
from pyspark.sql import DataFrame
|
|
2
|
+
from .aqf_utils import Status
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
|
|
5
|
+
def null_check(test: dict, df: DataFrame) -> tuple[dict, DataFrame | None, DataFrame]:
    """
    Check the first configured column for NULL values.

    Args:
        test: Test configuration row. Attributes read here:
            - columns: string holding a Python list literal; only the first column is checked
            - bad_data_action: 'quarantine' and 'delete' remove the NULL rows
              from the returned DataFrame; any other value keeps them
        df (DataFrame): Source DataFrame to validate

    Returns:
        Tuple(
            result_values: status and additional infos for logging
            return_df: None if no NULLs were found, otherwise ``df`` with or
                without the NULL rows (depending on bad_data_action)
            df_bad_data: the rows whose column is NULL
        )
    """
    import ast

    # literal_eval instead of eval: the stored column list must be a literal
    column_name = ast.literal_eval(test.columns)[0]

    df_bad_data = df.filter(df[column_name].isNull())

    # Every count() is a Spark job, so each one is run once and reused
    null_count = df_bad_data.count()
    total_count = df.count()

    if null_count == 0:
        # no NULLs detected
        status = Status.PASS
        return_df = None
        # NOTE(review): 0 is kept from the previous behaviour (no DataFrame is
        # returned on PASS); confirm whether total_count is intended here.
        new_data_count = 0
    else:
        status = Status.WARNING
        if test.bad_data_action in ("quarantine", "delete"):
            return_df = df.filter(df[column_name].isNotNull())
            new_data_count = total_count - null_count
        else:  # ignore, process
            return_df = df
            new_data_count = total_count

    result_values = {
        "status": status,
        "new_data_count": new_data_count,
        "null_count": null_count,
        "none_null_count": total_count - null_count,
    }
    return (result_values, return_df, df_bad_data)
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
def compare(test: dict, df: DataFrame):
    """
    Validate a column by comparing it against a configured expression.

    The predicate is built by appending ``test.expression`` (e.g. ``"<10"``)
    to the first column listed in ``test.columns``. Rows for which the
    predicate is false are treated as bad data.

    Args:
        test: information about the test. Although annotated as ``dict`` it is
            accessed by attribute; ``columns`` (a string that evaluates to a
            list of column names), ``expression`` and ``bad_data_action`` are read.
        df: dataframe to validate

    Returns:
        Tuple(
            result_values: dict with "status", "new_data_count", "fail_count" and "pass_count"
            return_df: None when no row failed; otherwise df without the failed
                rows ('quarantine' / 'delete') or df unchanged (any other action)
            df_bad_data: df that has the failed rows
        )

    Bad Data Actions:
        - 'quarantine' / 'delete': failed rows are removed from return_df
        - anything else (e.g. 'ignore', 'process'): df is returned unfiltered
    """
    # NOTE(security): both eval() calls execute text that comes from the test
    # configuration. This is only safe while that configuration is trusted.
    comp = "df[eval(test.columns)[0]]" + test.expression
    passes = eval(comp)  # evaluated once and reused for both filters

    # NOTE(review): rows where the predicate evaluates to NULL match neither
    # `passes` nor `~passes`, so they appear in neither df_bad_data nor the
    # filtered return_df -- confirm this is intended.
    df_bad_data = df.filter(~passes)
    fail_count = df_bad_data.count()

    if fail_count == 0:
        # No failing rows: nothing to clean, so no dataframe is handed back.
        status = Status.PASS
        return_df = None
    else:
        status = Status.WARNING  # failures found, handled per bad_data_action
        if test.bad_data_action in ("quarantine", "delete"):
            return_df = df.filter(passes)
        else:  # ignore, process
            return_df = df

    # Explicit None check: a DataFrame must not be used for truthiness.
    new_data_count = return_df.count() if return_df is not None else 0

    # Metrics handed back for logging
    result_values = {
        "status": status,
        "new_data_count": new_data_count,
        "fail_count": fail_count,
        "pass_count": df.count() - fail_count
    }
    return (result_values, return_df, df_bad_data)
|
|
119
|
+
|
|
120
|
+
# -------------------------------------------------------------------------------------
|
|
121
|
+
|
|
122
|
+
def is_not_in_future(test: dict, df: DataFrame):
    """
    Validate that a date/timestamp column does not lie in the future.

    The first column listed in ``test.columns`` is compared against the
    current UTC time, taken once at the start of the call. Rows whose value
    is later than that instant are treated as bad data.

    Args:
        test: information about the test. Although annotated as ``dict`` it is
            accessed by attribute; ``columns`` (a string that evaluates to a
            list of column names) and ``bad_data_action`` are read.
        df: dataframe to validate

    Return:
        Tuple(
            result_values: dict with "status", "new_data_count", "fail_count" and "pass_count"
            return_df: None when no row failed; otherwise df without the failed
                rows ('quarantine' / 'delete') or df unchanged (any other action)
            df_bad_data: df that has the failed rows
        )

    Bad Data Actions:
        - 'quarantine' / 'delete': future-dated rows are removed from return_df
        - anything else (e.g. 'ignore', 'process'): df is returned unfiltered
    """
    now = datetime.now(timezone.utc)

    # NOTE(security): eval() executes text that comes from the test
    # configuration. This is only safe while that configuration is trusted.
    column = df[eval(test.columns)[0]]

    df_bad_data = df.filter(column > now)
    fail_count = df_bad_data.count()

    if fail_count == 0:
        # The original left return_df unassigned on this path, so the count
        # below raised UnboundLocalError. Mirror `compare`: no dataframe is
        # handed back when nothing failed.
        status = Status.PASS
        return_df = None
    else:
        status = Status.WARNING  # failures found, handled per bad_data_action
        if test.bad_data_action in ("quarantine", "delete"):
            return_df = df.filter(column <= now)
        else:  # ignore, process
            return_df = df

    new_data_count = return_df.count() if return_df is not None else 0

    # Metrics handed back for logging
    result_values = {
        "status": status,
        "new_data_count": new_data_count,
        "fail_count": fail_count,
        "pass_count": df.count() - fail_count
    }
    return (result_values, return_df, df_bad_data)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from pyspark.sql import DataFrame, SparkSession
|
|
2
|
+
from .aqf_utils import create_spark
|
|
3
|
+
import com.microsoft.sqlserver.jdbc.spark
|
|
4
|
+
from pyspark.sql.types import StructType
|
|
5
|
+
|
|
6
|
+
def retrieve_aqf_tests_by_table_id(table_id: str, test_table: str, spark: SparkSession, jdbc_url: str) -> DataFrame:
    """
    Retrieve all configured tests for a specific table.

    Reads ``dbo.<test_table>`` through ``spark.read`` and keeps only the rows
    whose ``table_id`` equals the given id, ordered by ``rule_id``.

    Args:
        table_id: id of the table whose tests should be returned
        test_table: name of the table (inside the ``dbo`` schema) that stores the tests
        spark: SparkSession used for reading
        jdbc_url: connection string to the sql db of the test table

    Returns:
        DataFrame: Spark DataFrame with the columns
            test_id, rule_id, description, stage, table_id, columns,
            expression, join_table, join_column, bad_data_action, criticality

    Example:
        >>> tests_df = retrieve_aqf_tests_by_table_id(table_id, test_table, spark, jdbc_url)
        >>> tests_list = tests_df.collect()  # Convert to list for iteration
    """
    # `.mssql` is presumably provided by the SQL Server Spark connector that
    # the module imports -- confirm against the runtime environment.
    all_tests = spark.read.option("url", jdbc_url).mssql(f"dbo.{test_table}")

    # The original registered this temp view; it is kept in case other code
    # in the same Spark session queries it.
    all_tests.createOrReplaceTempView("pdf")

    # Filter through the DataFrame API instead of interpolating table_id into
    # a SQL string, so quotes inside table_id cannot alter the query.
    tests = (
        all_tests
        .filter(all_tests["table_id"] == str(table_id))
        .select(
            "test_id",
            "rule_id",
            "description",
            "stage",
            "table_id",
            "columns",
            "expression",
            "join_table",
            "join_column",
            "bad_data_action",
            "criticality",
        )
        .orderBy("rule_id")  # ensure consistent execution order
    )

    if tests.rdd.isEmpty():
        print(f"Warning: No DQF tests for table '{table_id}' found.")

    print("Retrieving of tests successful")
    return tests
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _empty_rules_frame(spark):
    """Build an empty DataFrame carrying the fallback rule columns."""
    # Local import keeps this helper self-contained.
    from pyspark.sql.types import StringType, StructField, StructType

    # An explicit schema is required: Spark cannot infer column types from an
    # empty dataset when only column names are supplied.
    # NOTE(review): the real column types are not visible here, so StringType
    # is a placeholder. "descripton" is kept exactly as the original spelled
    # it -- confirm whether it should read "description".
    column_names = ["rule_id", "rule_type_id", "reference_type_id", "name", "descripton", "connection"]
    schema = StructType([StructField(name, StringType(), True) for name in column_names])
    return spark.createDataFrame([], schema=schema)


def retrieve_aqf_rules_by_id(id_list, spark: SparkSession, jdbc_url: str, rule_table: str) -> DataFrame:
    """
    Retrieve the rules whose ``rule_id`` is contained in ``id_list``.

    Args:
        id_list: rule ids to look up
        spark: SparkSession used for reading
        jdbc_url: connection string to the sql db of the rule table
        rule_table: name of the table (inside the ``dbo`` schema) that stores the rules

    Returns:
        DataFrame with the matching rules. If no rule matches, or if reading
        fails, an empty DataFrame with the columns rule_id, rule_type_id,
        reference_type_id, name, descripton and connection is returned.
    """
    try:
        all_rules = spark.read.option("url", jdbc_url).mssql(f"dbo.{rule_table}")
        rules = all_rules.filter(all_rules.rule_id.isin(id_list))

        if rules.rdd.isEmpty():
            return _empty_rules_frame(spark)

        print("Retrieving of rules successful")
        return rules

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty result.
        print(f"Error retrieving rules: {e}")
        return _empty_rules_frame(spark)
|
AQF/aqf_utils.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from pyspark.sql import SparkSession
|
|
3
|
+
from enum import IntEnum
|
|
4
|
+
from pyspark.sql import functions as F
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def generate_run_id() -> str:
    """
    Create the identifier used to tag one execution run.

    Returns:
        str: textual form of a freshly generated UUID1,
             e.g. '550e8400-e29b-11d4-a716-446655440000'

    Example:
        >>> run_id = generate_run_id()
        >>> print(f"Starting DQF execution with run_id: {run_id}")
    """
    run_identifier = uuid.uuid1()
    return str(run_identifier)
|
|
34
|
+
|
|
35
|
+
def create_spark():
    """
    Return a SparkSession obtained via ``SparkSession.builder.getOrCreate()``.

    Intended for code that is not run directly in a notebook.
    """
    session_builder = SparkSession.builder
    return session_builder.getOrCreate()
|
|
40
|
+
|
|
41
|
+
def normalize_timestamps(df):
    """
    Helper to fix timestamp issues.

    Every column whose dtype is "timestamp" is formatted as
    "yyyy-MM-dd HH:mm:ss" and parsed back into a timestamp; all other
    columns are left untouched.
    """
    timestamp_columns = [name for name, dtype in df.dtypes if dtype == "timestamp"]

    for name in timestamp_columns:
        formatted = F.date_format(F.col(name), "yyyy-MM-dd HH:mm:ss")
        df = df.withColumn(name, F.to_timestamp(formatted))

    return df
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Status(IntEnum):
    """
    User friendly enum for status type
    """
    PASS = 0
    WARNING = 1
    FAIL = 2
    ERROR = 3

    def as_name(self) -> str:
        """
        Return the lowercase label of this status.

        Returns:
            str: "pass", "warning", "fail" or "error"
        """
        # The member name is the single source of truth. The previous
        # match statement had a fallback branch that was missing its
        # `return` and therefore yielded None.
        return self.name.lower()
|
aqf-1.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
AQF/__init__.py,sha256=_wKo_YyN6sPfUK8RUyLKZ-x5jZp-Nki8mxXh9DMAKKU,221
|
|
2
|
+
AQF/aqf_dataset_rules.py,sha256=ATZTgap8FOXqcYbKoUndRK9mqD8v2c3xXYZwOlNVK8I,5224
|
|
3
|
+
AQF/aqf_engine.py,sha256=3Se7x1HyZhiGpC9tIV1iDcGtPnTnGlbQPsOzlhz7fmA,13369
|
|
4
|
+
AQF/aqf_logging.py,sha256=HUCDQ5ddOM3VmeQvl94VD8zTW3HxZkN2dz-Bh0UB_Sg,7672
|
|
5
|
+
AQF/aqf_row_rules.py,sha256=h4jtIqSOZ8UXgoIJcHFKx11tRSoroDLSPFnZMfXsp8k,5848
|
|
6
|
+
AQF/aqf_rule_retriever.py,sha256=6hqQukdWXdcBdCXkw4hqQBDXCE0os7AdsSe0MGMyWkE,4338
|
|
7
|
+
AQF/aqf_utils.py,sha256=QrIC6hM2XrUI4b0QcsdlF9wqaSSNEx5KzOagNrctFyI,2086
|
|
8
|
+
aqf-1.1.dist-info/METADATA,sha256=KlhD8zP1gE3Mqn2SMqt5TLbZ3wQYeIGRadwOuLbCOnw,204
|
|
9
|
+
aqf-1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
aqf-1.1.dist-info/top_level.txt,sha256=kd_ZM7nabvEJmgyyqllhL26UZV7OC_PxvShFSjS_XvQ,4
|
|
11
|
+
aqf-1.1.dist-info/RECORD,,
|
aqf-1.1.dist-info/WHEEL
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
AQF
|