AQF 1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
AQF/__init__.py ADDED
@@ -0,0 +1 @@
1
+ import AQF.aqf_dataset_rules as aqf_dataset_rules, AQF.aqf_engine as aqf_engine, AQF.aqf_logging as aqf_logging, AQF.aqf_row_rules as aqf_row_rules, AQF.aqf_rule_retriever as aqf_rule_retriever, AQF.aqf_utils as aqf_utils
@@ -0,0 +1,138 @@
1
+ from pyspark.sql import DataFrame
2
+ from .aqf_utils import create_spark, Status
3
+ from pyspark.sql.functions import col
4
+ from functools import reduce
5
+ import operator
6
+
7
def join_basic_inner_count_check(test, df: DataFrame, **kwargs) -> dict:
    """
    Check that an inner join between ``df`` and a reference table yields rows.

    The source DataFrame is joined to the table named in ``test.join_table``
    on pairwise column equality, and the join row count decides the status.

    Args:
        test: Test configuration object. Attributes read here:
            - join_table: name of the reference table; it is interpolated
              into a ``SELECT * FROM ...`` statement
            - columns: string holding a Python list literal of source columns
            - join_column: string holding a Python list literal of reference
              columns, paired positionally with ``columns``
        df (DataFrame): Source DataFrame to validate.
        **kwargs: Must contain ``spark``, the session used to read the
            reference table.

    Returns:
        dict: ``status`` (Status.PASS when the join returns at least one row,
        Status.WARNING otherwise), ``new_data_count`` (row count of ``df``)
        and ``join_count`` (row count of the inner join).

    Raises:
        KeyError: if ``spark`` is not supplied in ``kwargs``.
        ValueError: if no columns are configured or the two column lists
            differ in length.
    """
    import ast

    spark = kwargs["spark"]

    # Load reference table for join operation
    df_join_target = spark.sql(f"SELECT * FROM {test.join_table}").alias("df_join_target")
    df_comp = df.alias("df_comp")

    # The column lists are stored as list literals; literal_eval parses them
    # without executing arbitrary code from the test configuration.
    columns = ast.literal_eval(test.columns)
    join_columns = ast.literal_eval(test.join_column)

    if len(columns) != len(join_columns):
        raise ValueError("column count doesn't match join column count")
    if not columns:
        # reduce() over an empty list would fail with an unrelated TypeError
        raise ValueError("at least one join column is required")

    # AND together one equality predicate per column pair
    join_condition = reduce(
        operator.and_,
        [
            col(f"df_comp.{c1}") == col(f"df_join_target.{c2}")
            for c1, c2 in zip(columns, join_columns)
        ],
    )

    joined_df = df_comp.join(df_join_target, join_condition, "inner")

    join_count = joined_df.count()    # rows produced by the inner join
    new_data_count = df_comp.count()  # total source rows

    return {
        # PASS as soon as any match exists; the two counts are only reported
        "status": Status.PASS if join_count > 0 else Status.WARNING,
        "new_data_count": new_data_count,
        "join_count": join_count,
    }
73
+
74
+ #----------------------------------------------------------------------------------
75
+
76
+
77
def unique_check(test, df: DataFrame) -> dict:
    """
    Compare the number of distinct rows with the total number of rows.

    The check only reports; it never removes or quarantines rows, because a
    duplicate cannot be attributed to one specific record.

    Args:
        test: Test configuration object. Attribute read here:
            - columns: string holding a Python list literal of the columns
              that must be unique together; an empty list (or ``None``)
              means whole rows are compared
        df (DataFrame): Source DataFrame to validate.

    Returns:
        dict: ``status`` (Status.PASS when distinct count equals total count,
        Status.WARNING otherwise), ``total_count``, ``unique_count`` and
        ``difference`` (``total_count - unique_count``).
    """
    import ast

    # The column list is stored as a list literal; literal_eval parses it
    # without executing arbitrary code from the test configuration.
    columns = ast.literal_eval(test.columns)

    # Without configured columns, uniqueness is evaluated over entire rows
    candidate = df.select(columns) if columns else df
    unique_count = candidate.distinct().count()
    total_count = df.count()

    status = Status.PASS if unique_count == total_count else Status.WARNING

    return {
        "status": status,
        "total_count": total_count,
        "unique_count": unique_count,
        "difference": total_count - unique_count,  # surplus (duplicate) rows
    }
138
+
AQF/aqf_engine.py ADDED
@@ -0,0 +1,377 @@
1
+ from .aqf_utils import generate_run_id, create_spark, Status, normalize_timestamps
2
+ from .aqf_rule_retriever import *
3
+ from .aqf_dataset_rules import *
4
+ from .aqf_row_rules import *
5
+ from .aqf_logging import result_writer, bad_data_writer
6
+
7
+ from pyspark.sql import DataFrame, SparkSession
8
+ from pyspark.sql.functions import max
9
+ from functools import reduce
10
+ from datetime import datetime, timezone
11
+
12
+ import notebookutils
13
+
14
+
15
+ class AQF_Engine():
16
+
17
+ """
18
+ Class that manages the DQF testing procedure
19
+
20
+ Args:
21
+ run_id: id string to make every test run unique
22
+ job_id: id for every hsi session?
23
+ config: name of the variable in the workspace
24
+ spark: Optional SparkSession to use. If not provided, the active session is used.
25
+
26
+ """
27
+
28
+ def __init__(
29
+ self,
30
+ job_id: str,
31
+ run_id: str,
32
+ config: str,
33
+ spark: SparkSession | None = None
34
+ ):
35
+ self.spark = create_spark() if spark is None else spark
36
+ self.run_id = run_id
37
+
38
+ # list of the outcomes of each test
39
+ self.status_list = []
40
+
41
+ # env variables
42
+ self.config = notebookutils.variableLibrary.getLibrary(config)
43
+
44
+ # represents the tested and updated dataframe
45
+ self.consolidated_df = None
46
+
47
+ self.cancel = False
48
+
49
+
50
+ def run_tests(
51
+ self,
52
+ table_path: str,
53
+ table_name: str,
54
+ df: DataFrame,
55
+ check_type: str | None = None,
56
+ spark: SparkSession | None = None,
57
+ ) -> DataFrame:
58
+ """
59
+ Main orchestration function for the Data Quality Framework (DQF).
60
+
61
+ Executes all configured data quality tests for a given table and consolidates results.
62
+ This function retrieves rules, executes corresponding tests, and tracks success/failure rates.
63
+
64
+ Args:
65
+ config_id:
66
+ table_name: string for the table name, only used for quaratine table name
67
+ table_path:
68
+ df (DataFrame): Input Spark DataFrame to validate
69
+ check_type: ?
70
+
71
+
72
+ Returns:
73
+ DataFrame: Consolidated DataFrame containing only records that passed all tests
74
+ (when bad_data_action is 'quarantine' or 'delete')
75
+
76
+ Notes:
77
+ - Test results are automatically logged to dbo.dqf.dqf_test_log table
78
+ - Supports multiple bad data actions: 'process', 'delete', 'quarantine', ignore
79
+ - Generates unique run_id for tracking test execution sessions
80
+
81
+ Example:
82
+ >>> clean_df = engine.run_tests(test_table="dqf_tests", df=df, table_path="lh_bronze", table_name="bronze_hsi", check_type="")
83
+ """
84
+
85
+ if spark:
86
+ self.spark = spark
87
+ elif not self.spark:
88
+ self.spark = create_spark()
89
+
90
+ # table (str): Fully qualified table name (e.g., 'catalog.schema.table')
91
+ # Used to retrieve applicable DQ rules from metadata table
92
+ self.table_id = table_path + "." + table_name
93
+ self.table_name = table_name
94
+
95
+ # clean df for return after tests are done
96
+ self.consolidated_df = df
97
+
98
+ print(f"Starting Data Quality checks for {self.table_id}")
99
+
100
+ # Retrieve all configured DQ rules for the specified table
101
+ df_tests = retrieve_aqf_tests_by_table_id(table_id=self.table_id, test_table=self.config.test_table, spark=self.spark, jdbc_url=self.config.jdbc_url)
102
+ tests = df_tests.collect()
103
+ number_of_tests = len(tests)
104
+ if not number_of_tests:
105
+ print(f"No tests found for {self.table_id}")
106
+ return df
107
+
108
+ test_list = df_tests.toPandas()["rule_id"].tolist()
109
+ rules = retrieve_aqf_rules_by_id(test_list, spark=self.spark, jdbc_url=self.config.jdbc_url, rule_table=self.config.rule_table)
110
+
111
+ # Execute each rule sequentially
112
+ for test in tests:
113
+ print(test)
114
+ rule = rules.filter(rules.rule_id.isin([test.rule_id])).collect()
115
+ self.testing(test=test, df=df, rule=rule[0])
116
+
117
+ if self.status_list[-1] == Status.FAIL:
118
+ self.cancel = True
119
+
120
+ self.output_results(number_of_tests)
121
+
122
+ return self.consolidated_df
123
+
124
+
125
+
126
+ def consolidate(self, new_df: DataFrame):
127
+ """
128
+ Takes the dataframe that just got reduced by some row level test and removes the missing rows also from the consolidated df
129
+
130
+ Args:
131
+ new_df: the dataframe that got changed by a test
132
+ """
133
+ # Verwende INTERSECT statt UNION
134
+ self.consolidated_df = self.consolidated_df.intersect(new_df)
135
+ #self.consolidated_df.show()
136
+
137
+
138
+ def output_results(self, number_of_tests: int):
139
+ """
140
+ Might get scrapped or changed to something else since output to notebook doesnt make sense in a regular environment
141
+
142
+ Evaluates how the test session went
143
+
144
+ Args:
145
+ number_of_tests: number of tests is inferred from the test retrievel at the start
146
+ """
147
+ # Calculate final test statistics
148
+ incomplete_tests = self.status_list.count(Status.ERROR) # ERROR status
149
+ passed_tests = self.status_list.count(Status.PASS) # PASS status
150
+ failed_tests = self.status_list.count(Status.FAIL) # FAIL status
151
+ warnings = self.status_list.count(Status.WARNING)
152
+ #canceled = number_of_tests - len(self.status_list)
153
+
154
+ # Summary output for monitoring (minimal logging)
155
+ print(f"AQF Results - Tests: {number_of_tests} | Passed: {passed_tests} | Failed: {failed_tests+warnings} | Errors: {incomplete_tests}")
156
+
157
+
158
+ def get_kwargs(self, connection: str) -> dict:
159
+ """
160
+ Some rules require an additional table so this puts the connections into the extra parameters
161
+ """
162
+ con = eval(connection)
163
+ kwargs = {}
164
+ for c in con:
165
+ match c:
166
+ case "spark":
167
+ kwargs["spark"] = self.spark
168
+ case "sql":
169
+ kwargs["jdbc_url"] = self.config.jdbc_url
170
+ return kwargs
171
+
172
+ def row_level(self, test: dict, df: DataFrame, rule: dict, **kwargs) -> dict:
173
+ """
174
+ this is only for tests that are row based
175
+ checks each cell in a column with a value
176
+ reference value str optional
177
+
178
+ Args:
179
+ test: dict with the test info
180
+ df: Dataframe to evaluate
181
+ rule: dict with the rule info
182
+
183
+ Returns:
184
+ logging data
185
+ """
186
+ # results = (result_values, good_df, bad_df)
187
+ results = eval(rule.name)(test=test, df=df, **kwargs)
188
+
189
+ # Write bad data to quarantine table for later analysis
190
+ # Table naming: utility.bad_data.{source_table_name}
191
+ if results[2]:
192
+ bad_data_writer(table_name=self.table_name, df=results[2], run_id=self.run_id, spark=self.spark, jdbc_url=self.config.jdbc_url, quarantine_table=self.config.quarantine_table)
193
+ if results[1]:
194
+ self.consolidate(results[1])
195
+
196
+ return results[0]
197
+
198
+
199
+ def testing(self, test: dict, df: DataFrame, rule: dict):
200
+ """
201
+ This manages a single test
202
+ It runs it and logs it
203
+
204
+ Args:
205
+ test: dict with the test info
206
+ df: Dataframe to evaluate
207
+ rule: dict with the rule info
208
+ """
209
+
210
+ start_time = datetime.now(timezone.utc)
211
+ start_time.strftime("%Y-%m-%d %H:%M:%S.%f")
212
+ try:
213
+ result_writer(
214
+ log_path = self.config.log_table,
215
+ run_id = self.run_id,
216
+ test_id = test.test_id,
217
+ rule_id = test.rule_id,
218
+ description = test.description,
219
+ table_id = test.table_id,
220
+ start_time = start_time,
221
+ result_values = {},
222
+ bad_data_action = test.bad_data_action,
223
+ criticality = test.criticality,
224
+ spark = self.spark,
225
+ jdbc_url = self.config.jdbc_url
226
+ )
227
+ print("logging (start) successful")
228
+ except:
229
+ pass
230
+
231
+
232
+ kwargs = self.get_kwargs(rule.connection)
233
+
234
+ try:
235
+ # result: result_values
236
+ if rule.rule_type == "dataset":
237
+ # dataset level
238
+ result_values = eval(rule.name)(test, df, **kwargs)
239
+ elif rule.rule_type == "row":
240
+ # row level
241
+ result_values = self.row_level(test=test, df=df, rule=rule, **kwargs)
242
+ else:
243
+ raise Exception(f"Invalid rule type {rule.rule_type}")
244
+ #print(results)
245
+ if result_values["status"] == Status.WARNING and test.criticality:
246
+ result_values["status"] = Status.FAIL
247
+
248
+ self.status_list.append(result_values["status"])
249
+ end_time = datetime.now(timezone.utc)
250
+ end_time.strftime("%Y-%m-%d %H:%M:%S.%f")
251
+ error_message = None
252
+
253
+ print(f"Test successful: {test.test_id}")
254
+ except Exception as e:
255
+ # Handle unexpected errors during test execution
256
+ print(f"Testing (id: {test.test_id}) failed: {e}")
257
+ error_message = str(e)
258
+ result_values = {"status": Status.ERROR}
259
+ end_time = datetime.now(timezone.utc)
260
+ end_time.strftime("%Y-%m-%d %H:%M:%S.%f")
261
+ self.status_list.append(Status.ERROR)
262
+
263
+ try:
264
+ result_writer(
265
+ log_path = self.config.log_table,
266
+ run_id = self.run_id,
267
+ test_id = test.test_id,
268
+ rule_id = test.rule_id,
269
+ description = test.description,
270
+ table_id = test.table_id,
271
+ start_time = start_time,
272
+ end_time = end_time,
273
+ result_values = result_values,
274
+ error_message = error_message,
275
+ bad_data_action = test.bad_data_action,
276
+ criticality = test.criticality,
277
+ spark = self.spark,
278
+ jdbc_url = self.config.jdbc_url
279
+ )
280
+ print("Logging successful")
281
+ except Exception as e:
282
+ print(f"Logging failed: {e}")
283
+
284
+
285
+ def is_critical(self):
286
+ """
287
+ Returns wether a critical test failed
288
+ """
289
+ return self.cancel
290
+
291
+ def get_fail_count(self):
292
+ return self.status_list.count(Status.FAIL)
293
+
294
+ def get_log_table(self):
295
+ spark = self.spark
296
+ return spark.read.option("url", self.config.jdbc_url).mssql(self.config.log_table)
297
+
298
def create_test(
    jdbc_url: str,
    test_table: str,
    rule_table: str,
    rule_id: int,
    stage: str,
    table_id: str,
    desc: str = None,
    columns: list[str] = None,
    expression: str = None,
    join_table: str = None,
    join_column: str = None,
    bad_data_action: str = "process",  # in case no bad data action is given for row based test
    citicality: bool = False,  # in case no criticality is given
    spark: SparkSession | None = None
) -> int:
    """
    Append one new entry to the test table and return its ``test_id``.

    Args:
        jdbc_url: connection string of the SQL database
        test_table: table that stores the test definitions
        rule_table: table that stores the rule definitions
        rule_id: id of the rule the test applies; must exist in ``rule_table``
        stage: value stored in the ``stage`` column
        table_id: table the test targets; must be readable as a delta path,
            as a SQL table, or as a SQL table prefixed with the current
            workspace name
        desc: optional description
        columns: optional target columns; a list is stored as its list
            literal, a string is stored unchanged
        expression: optional expression value
        join_table: optional reference table
        join_column: optional reference column value, stored unchanged
        bad_data_action: action stored for the test
        citicality: stored in the ``criticality`` column (the parameter name
            keeps its historic spelling so existing keyword calls still work)
        spark: optional SparkSession; ``create_spark()`` is used otherwise

    Returns:
        int: the id assigned to the new test (current maximum + 1, or 1 for
        an empty test table).

    Raises:
        Exception: if the rule or the target table cannot be found.
    """
    from pyspark.sql.functions import max as spark_max
    from pyspark.sql.types import (
        BooleanType,
        LongType,
        StringType,
        StructField,
        StructType,
    )

    spark = create_spark() if not spark else spark

    # Next id = current maximum + 1; the aggregate is NULL for an empty table.
    # NOTE(review): concurrent callers could compute the same id.
    tests = spark.read.option("url", jdbc_url).mssql(test_table)
    current_max = tests.select(spark_max(tests.test_id).alias("test_id")).collect()[0][0]
    test_id = (current_max or 0) + 1

    # A DataFrame object is always truthy, so fetch a row to test existence
    rules = spark.read.option("url", jdbc_url).mssql(rule_table)
    if not rules.filter(rules.rule_id == rule_id).take(1):
        raise Exception(f"rule {rule_id} doesn't exist")

    # The target table must resolve in at least one of these ways
    def _as_delta_path():
        return spark.read.format("delta").load(table_id)

    def _as_sql_table():
        return spark.sql(f"""SELECT * FROM {table_id}""")

    def _as_workspace_table():
        ws = notebookutils.runtime.context.get("currentWorkspaceName")
        return spark.sql(f"""SELECT * FROM {ws}.{table_id}""")

    for probe in (_as_delta_path, _as_sql_table, _as_workspace_table):
        try:
            probe()
            break
        except Exception:
            continue
    else:
        raise Exception(f"table {table_id} doesn't exist")

    schema = StructType([
        StructField("test_id", LongType(), True),
        StructField("rule_id", LongType(), True),
        StructField("description", StringType(), True),
        StructField("stage", StringType(), True),
        StructField("table_id", StringType(), True),
        StructField("columns", StringType(), True),
        StructField("expression", StringType(), True),
        StructField("join_table", StringType(), True),
        StructField("join_column", StringType(), True),
        StructField("bad_data_action", StringType(), True),
        StructField("criticality", BooleanType(), True),
    ])

    # The columns field is a string column, so a list is stored as its
    # list literal.
    if columns is None or isinstance(columns, str):
        columns_value = columns
    else:
        columns_value = str(list(columns))

    data = (
        test_id,
        rule_id,
        desc,
        stage,
        table_id,
        columns_value,
        expression,
        join_table,
        join_column,
        bad_data_action,
        citicality,
    )

    # Append only the new row, not the table that was just read
    new_test = spark.createDataFrame(data=[data], schema=schema)
    new_test.write.option("url", jdbc_url).mode("append").mssql(test_table)

    return test_id
AQF/aqf_logging.py ADDED
@@ -0,0 +1,192 @@
1
+ from datetime import datetime, timezone
2
+ import pandas as pd
3
+ import json
4
+ import ast
5
+
6
+ from pyspark.sql.functions import lit
7
+ from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, TimestampType
8
+ from typing import Optional
9
+ from pyspark.sql import DataFrame, SparkSession, Row
10
+ import notebookutils
11
+ from .aqf_utils import create_spark, Status, normalize_timestamps
12
+
13
+
14
+
15
def result_writer(
    log_path: str,
    run_id: str,
    test_id: int,
    rule_id: int,
    description: str,
    table_id: str,
    start_time: datetime,
    criticality: bool,
    bad_data_action: str,
    result_values,
    jdbc_url: str,
    spark: SparkSession,
    end_time: Optional[datetime] = None,
    error_message: Optional[str] = None,
    job_run_id: int = 0

) -> None:
    """
    Append one test log row to the table ``dbo.{log_path}``.

    Args:
        log_path: name of the log table inside the ``dbo`` schema
        run_id: unique identifier of the test session
        test_id (int): identifier of the executed test
        rule_id (int): identifier of the executed rule
        description (str): human-readable description of the test
        table_id (str): id of the tested table
        start_time (datetime): test execution start timestamp
        criticality: criticality flag of the test
        bad_data_action (str): action configured for failed records
        result_values (Dict): test metrics; ``status`` is required once
            ``end_time`` is given, and ``new_data_count``,
            ``rows_test_failed`` and ``rows_test_passed`` are copied into
            their own columns when present
        jdbc_url: connection string of the SQL database
        spark: SparkSession used to build and write the row
        end_time (Optional[datetime]): completion timestamp; while it is
            missing the row is written with status ``running``
        error_message (Optional[str]): exception details, if any
        job_run_id (int): job run identifier, 0 by default
    """
    # Metrics that only some tests report; absent ones are stored as NULL
    new_data_count, rows_test_failed, rows_test_passed = (
        result_values[metric] if metric in result_values else None
        for metric in ("new_data_count", "rows_test_failed", "rows_test_passed")
    )

    # A row without an end time marks a test that has only just started
    status = result_values["status"].as_name() if end_time else "running"

    # Row values, in the same order as the schema fields below
    log_row = (
        run_id,
        job_run_id,
        test_id,
        rule_id,
        criticality,
        description,
        table_id,
        start_time,
        end_time,
        status,
        json.dumps(result_values),  # all metrics as one JSON string
        new_data_count,
        rows_test_failed,
        rows_test_passed,
        bad_data_action,
        error_message,
    )

    field_types = (
        ('run_id', StringType()),
        ('job_run_id', LongType()),
        ('test_id', LongType()),
        ('rule_id', LongType()),
        ('criticality', BooleanType()),
        ('description', StringType()),
        ('table_id', StringType()),
        ('start_time', TimestampType()),
        ('end_time', TimestampType()),
        ('status', StringType()),
        ('result_values', StringType()),
        ('new_data_count', LongType()),
        ('rows_test_failed', LongType()),
        ('rows_test_passed', LongType()),
        ('bad_data_action', StringType()),
        ('error_message', StringType()),
    )
    schema = StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in field_types
    ])

    single_row_df = spark.createDataFrame(data=[log_row], schema=schema)
    single_row_df = normalize_timestamps(single_row_df)

    writer = single_row_df.write.option("url", jdbc_url).mode("append")
    writer.mssql(f"dbo.{log_path}")
148
+
149
+
150
def bad_data_writer(table_name: str, df: DataFrame, run_id: str, spark: SparkSession, jdbc_url: str, quarantine_table: str) -> None:
    """
    Append rows that failed a row-level test to the quarantine table.

    The rows are tagged with ``run_id`` and appended to
    ``{quarantine_table}.{table_name}``. If that fails for any reason, the
    write is retried against
    ``{workspace}.{quarantine_table}.dbo.{table_name}``, where the workspace
    name comes from the notebook runtime context.

    Args:
        table_name (str): name of the quarantine table
        df (DataFrame): rows to store
        run_id (str): identifier of the test run, added as a ``run_id`` column
        spark: SparkSession used for the DDL statement
        jdbc_url: accepted for a uniform writer signature; not used here
        quarantine_table: quarantine lakehouse that holds the table

    Raises:
        Exception: whatever the fallback attempt raises when it fails too.
    """
    tagged_df = df.withColumn("run_id", lit(run_id))

    def _append_to(target: str) -> None:
        # Create the table if necessary, then append with schema merging
        spark.sql(f"""CREATE TABLE IF NOT EXISTS {target}""")
        tagged_df.write.mode("append").option("mergeSchema", "true").saveAsTable(target)

    try:
        _append_to(f"{quarantine_table}.{table_name}")
    except Exception as e:
        # Report why the short name failed before retrying with the
        # workspace-qualified name.
        print(f"Writing bad data to {quarantine_table}.{table_name} failed, retrying with workspace prefix: {e}")
        ws = notebookutils.runtime.context.get("currentWorkspaceName")
        _append_to(f"{ws}.{quarantine_table}.dbo.{table_name}")
AQF/aqf_row_rules.py ADDED
@@ -0,0 +1,168 @@
1
+ from pyspark.sql import DataFrame
2
+ from .aqf_utils import Status
3
+ from datetime import datetime, timezone
4
+
5
+ def null_check(test: dict, df: DataFrame) -> (dict, DataFrame, DataFrame):
6
+ """
7
+ Validates data completeness by checking for NULL values in specified column.
8
+
9
+ Identifies and optionally quarantines records with NULL values in critical fields.
10
+ Supports data quality enforcement through configurable bad data actions.
11
+
12
+ Args:
13
+ test: Test object containing test configuration with attributes:
14
+ - column_name: Target column for NULL validation
15
+ - bad_data_action: Action for NULL records ('quarantine', 'exclude', 'process')
16
+ - rule_id, test_type_id, description, table_name
17
+ df (DataFrame): Source DataFrame to validate
18
+
19
+ Returns:
20
+ Tuple(
21
+ result_values: status and additional infos for logging
22
+ return_df: df that has the failed rows removed
23
+ df_bad_data: df that has the failed rows
24
+ )
25
+
26
+ Bad Data Actions:
27
+ - 'quarantine': Move NULL records to bad_data table, continue with clean data
28
+ - 'delete': Remove NULL records from processing pipeline
29
+ - 'ignore': Allow NULL records to continue (no filtering)
30
+ - 'process': Copy NULL records to bad_data_table, dont remove bda data from df
31
+
32
+
33
+ """
34
+ # create df with only null values
35
+ df_bad_data = df.filter(df[eval(test.columns)[0]].isNull())
36
+ null_count = df_bad_data.count()
37
+
38
+ if not df_bad_data.count():
39
+ # Perfect data quality - no NULLs detected
40
+ status = Status.PASS # PASS
41
+ return_df = None
42
+ else:
43
+ match test.bad_data_action:
44
+ case "quarantine" | "delete":
45
+ return_df = df.filter(df[eval(test.columns)[0]].isNotNull())
46
+ case _: # ignore, process
47
+ return_df = df
48
+
49
+ status = Status.WARNING # FAIL (but handled appropriately)
50
+
51
+ if return_df:
52
+ new_data_count = return_df.count()
53
+ else:
54
+ new_data_count = 0
55
+
56
+ # Prepare comprehensive metrics for analysis
57
+ result_values = {
58
+ "status": status,
59
+ "new_data_count": new_data_count,
60
+ "null_count": null_count,
61
+ "none_null_count": df.count() - null_count
62
+ }
63
+ return (result_values, return_df, df_bad_data)
64
+
65
+ # ---------------------------------------------------------------------------------------
66
+
67
+ def compare(test: dict, df: DataFrame):
68
+ """
69
+ Validates data by comparing a value to a given expression
70
+
71
+ Args:
72
+ test: information about the test
73
+ df: dataframe to validate
74
+
75
+ Returns:
76
+ Tuple(
77
+ result_values: status and additional infos for logging
78
+ return_df: df that has the failed rows removed
79
+ df_bad_data: df that has the failed rows
80
+ )
81
+ Bad Data Actions:
82
+ - 'quarantine': Move NULL records to bad_data table, continue with clean data
83
+ - 'delete': Remove NULL records from processing pipeline
84
+ - 'ignore': Allow NULL records to continue (no filtering)
85
+ - 'process': Copy NULL records to bad_data_table, dont remove bda data from df
86
+ """
87
+ comp = ""
88
+ comp += "df[eval(test.columns)[0]]" + test.expression
89
+
90
+ df_bad_data = df.filter(~eval(comp))
91
+ fail_count = df_bad_data.count()
92
+
93
+ if not df_bad_data.count():
94
+ # Perfect data quality - no NULLs detected
95
+ status = Status.PASS # PASS
96
+ return_df = None
97
+ else:
98
+ match test.bad_data_action:
99
+ case "quarantine" | "delete":
100
+ return_df = df.filter(eval(comp))
101
+ case _: # ignore, process
102
+ return_df = df
103
+
104
+ status = Status.WARNING # FAIL (but handled appropriately)
105
+
106
+ if return_df:
107
+ new_data_count = return_df.count()
108
+ else:
109
+ new_data_count = 0
110
+
111
+ # Prepare comprehensive metrics for analysis
112
+ result_values = {
113
+ "status": status,
114
+ "new_data_count": new_data_count,
115
+ "fail_count": fail_count,
116
+ "pass_count": df.count() - fail_count
117
+ }
118
+ return (result_values, return_df, df_bad_data)
119
+
120
+ # -------------------------------------------------------------------------------------
121
+
122
+ def is_not_in_future(test: dict, df: DataFrame):
123
+ """
124
+ Validates data by checking if a date or timestamp is a valid past date.
125
+
126
+ Args:
127
+ test: information about the test
128
+ df: dataframe to validate
129
+
130
+ Return:
131
+ Tuple(
132
+ result_values: status and additional infos for logging
133
+ return_df: df that has the failed rows removed
134
+ df_bad_data: df that has the failed rows
135
+ )
136
+ Bad Data Actions:
137
+ - 'quarantine': Move NULL records to bad_data table, continue with clean data
138
+ - 'delete': Remove NULL records from processing pipeline
139
+ - 'ignore': Allow NULL records to continue (no filtering)
140
+ - 'process': Copy NULL records to bad_data_table, dont remove bda data from df
141
+ """
142
+ now = datetime.now(timezone.utc)
143
+
144
+ df_bad_data = df.filter(df[eval(test.columns)[0]] > now)
145
+ fail_count = df_bad_data.count()
146
+
147
+ if not df_bad_data.count():
148
+ # Perfect data quality - no NULLs detected
149
+ status = Status.PASS # PASS
150
+ else:
151
+ match test.bad_data_action:
152
+ case "quarantine" | "delete":
153
+ return_df = df.filter(df[eval(test.columns)[0]] <= now)
154
+ case _: # ignore, process
155
+ return_df = df
156
+
157
+ status = Status.WARNING # FAIL (but handled appropriately)
158
+
159
+ new_data_count = return_df.count()
160
+
161
+ # Prepare comprehensive metrics for analysis
162
+ result_values = {
163
+ "status": status,
164
+ "new_data_count": new_data_count,
165
+ "fail_count": fail_count,
166
+ "pass_count": df.count() - fail_count
167
+ }
168
+ return (result_values, return_df, df_bad_data)
@@ -0,0 +1,97 @@
1
+ from pyspark.sql import DataFrame, SparkSession
2
+ from .aqf_utils import create_spark
3
+ import com.microsoft.sqlserver.jdbc.spark
4
+ from pyspark.sql.types import StructType
5
+
6
def retrieve_aqf_tests_by_table_id(table_id: str, test_table: str, spark: SparkSession, jdbc_url: str) -> DataFrame:
    """
    Retrieve all configured tests for one table from the test metadata table.

    Reads ``dbo.<test_table>`` through the SQL Server connector and keeps only
    the rows whose ``table_id`` equals the requested one, ordered by ``rule_id``.

    Args:
        table_id: identifier of the table whose tests should be returned
        test_table: name of the table (inside the ``dbo`` schema) that stores the tests
        spark: Sparksession
        jdbc_url: connection string to the sql db of the test table

    Returns:
        DataFrame: Spark DataFrame containing the test configurations with columns:
            - test_id: Unique identifier for the test
            - rule_id: ID for the rule
            - description: Human-readable test description
            - stage
            - table_id: Target table for validation
            - columns: Target columns
            - expression: Join strategy (inner/left/right) for referential tests or comparison based expression ("<10")
            - join_table: Reference table for join operations
            - join_column: Reference column for join operations
            - bad_data_action: Bad data handling strategy
            - criticality: action in case the test fails

        The frame is empty (and a warning is printed) when no test matches.

    Example:
        >>> tests_df = retrieve_aqf_tests_by_table_id(table_id, test_table, spark, jdbc_url)
        >>> tests_list = tests_df.collect()  # Convert to list for iteration
    """
    all_tests = spark.read.option("url", jdbc_url).mssql(f"dbo.{test_table}")

    # Earlier versions registered this temp view and queried it with a SQL
    # string; the registration is kept so code that still reads "pdf" keeps working.
    all_tests.createOrReplaceTempView("pdf")

    # Filter with a Column expression instead of interpolating table_id into
    # SQL text: a quote inside table_id can no longer break or alter the query.
    tests = (
        all_tests
        .filter(all_tests["table_id"] == table_id)
        .select(
            "test_id",
            "rule_id",
            "description",
            "stage",
            "table_id",
            "columns",
            "expression",
            "join_table",
            "join_column",
            "bad_data_action",
            "criticality",
        )
        .orderBy("rule_id")  # Ensure consistent execution order
    )

    if tests.rdd.isEmpty():
        print(f"Warning: No DQF tests for table '{table_id}' found.")

    print("Retrieving of tests successful")
    return tests
70
+
71
+
72
def retrieve_aqf_rules_by_id(id_list, spark: SparkSession, jdbc_url: str, rule_table: str) -> DataFrame:
    """
    Retrieve the rules whose ``rule_id`` is contained in ``id_list``.

    Args:
        id_list: rule ids to look up
        spark: Sparksession
        jdbc_url: connection string to the sql db of the rule table
        rule_table: name of the table (inside the ``dbo`` schema) that stores the rules

    Returns:
        Dataframe with the info for the rules. When nothing matches, the
        (empty) filtered frame is returned with the rule table's own schema.
        When reading fails, the error is printed and an empty frame with the
        fallback columns below is returned instead of raising.
    """
    # Column names of the fallback frame, reproduced exactly as before.
    # NOTE(review): "descripton" looks like a typo for "description" - confirm
    # against the real rule table before renaming.
    fallback_columns = ["rule_id", "rule_type_id", "reference_type_id", "name", "descripton", "connection"]

    try:
        all_rules = spark.read.option("url", jdbc_url).mssql(f"dbo.{rule_table}")
        rules = all_rules.filter(all_rules.rule_id.isin(id_list))

        if rules.rdd.isEmpty():
            # The filtered frame is already empty and carries a valid schema.
            # Building a new frame from [] plus bare column names fails,
            # because Spark cannot infer column types from no rows.
            return rules

        print("Retrieving of rules successful")
        return rules

    except Exception as e:
        print(f"Error retrieving rules: {e}")
        # An explicit DDL schema is required for an empty frame.
        # NOTE(review): all columns are typed as string here because the real
        # column types are not visible from this module - adjust if needed.
        fallback_schema = ", ".join(f"{column_name} string" for column_name in fallback_columns)
        return spark.createDataFrame([], schema=fallback_schema)
AQF/aqf_utils.py ADDED
@@ -0,0 +1,74 @@
1
+ import uuid
2
+ from pyspark.sql import SparkSession
3
+ from enum import IntEnum
4
+ from pyspark.sql import functions as F
5
+
6
+
7
def generate_run_id() -> str:
    """
    Produce a fresh identifier for one execution of the framework.

    The identifier is a version-1 UUID rendered in its canonical textual
    form, so every call yields a new value that can be attached to all
    results belonging to the same run.

    Returns:
        str: UUID1 string in format 'xxxxxxxx-xxxx-1xxx-yxxx-xxxxxxxxxxxx'
            Example: '550e8400-e29b-11d4-a716-446655440000'

    Example:
        >>> run_id = generate_run_id()
        >>> print(f"Starting DQF execution with run_id: {run_id}")
    """
    run_uuid = uuid.uuid1()
    return str(run_uuid)
34
+
35
def create_spark():
    """
    Return the Spark session for this process, creating one if none exists yet
    (useful when the code is not run directly in a notebook).
    """
    session_builder = SparkSession.builder
    return session_builder.getOrCreate()
40
+
41
def normalize_timestamps(df):
    """
    Helper to fix timestamp issues.

    Every column whose dtype is exactly "timestamp" is rendered as
    "yyyy-MM-dd HH:mm:ss" text and parsed back into a timestamp; the pattern
    has no fractional part, so sub-second precision is not carried over.
    All other columns are left untouched.
    """
    # Collect the affected columns up front from the incoming frame's dtypes
    timestamp_columns = [name for name, type_name in df.dtypes if type_name == "timestamp"]

    normalized = df
    for column_name in timestamp_columns:
        as_text = F.date_format(F.col(column_name), "yyyy-MM-dd HH:mm:ss")
        normalized = normalized.withColumn(column_name, F.to_timestamp(as_text))
    return normalized
52
+
53
+
54
class Status(IntEnum):
    """
    User friendly enum for status type
    """
    PASS = 0
    WARNING = 1
    FAIL = 2
    ERROR = 3

    def as_name(self):
        """
        Return the lower-case name of this status
        ("pass", "warning", "fail" or "error").
        """
        # Derived from the member name so a newly added member automatically
        # gets a label; the previous hand-written match had a default branch
        # that evaluated a string without returning it (yielding None).
        return self.name.lower()
@@ -0,0 +1,9 @@
1
+ Metadata-Version: 2.4
2
+ Name: AQF
3
+ Version: 1.1
4
+ Summary: Data Quality Framework for Spark-based pipelines
5
+ Author: qjener
6
+ Author-email: dasxxq@gmail.com
7
+ Dynamic: author
8
+ Dynamic: author-email
9
+ Dynamic: summary
@@ -0,0 +1,11 @@
1
+ AQF/__init__.py,sha256=_wKo_YyN6sPfUK8RUyLKZ-x5jZp-Nki8mxXh9DMAKKU,221
2
+ AQF/aqf_dataset_rules.py,sha256=ATZTgap8FOXqcYbKoUndRK9mqD8v2c3xXYZwOlNVK8I,5224
3
+ AQF/aqf_engine.py,sha256=3Se7x1HyZhiGpC9tIV1iDcGtPnTnGlbQPsOzlhz7fmA,13369
4
+ AQF/aqf_logging.py,sha256=HUCDQ5ddOM3VmeQvl94VD8zTW3HxZkN2dz-Bh0UB_Sg,7672
5
+ AQF/aqf_row_rules.py,sha256=h4jtIqSOZ8UXgoIJcHFKx11tRSoroDLSPFnZMfXsp8k,5848
6
+ AQF/aqf_rule_retriever.py,sha256=6hqQukdWXdcBdCXkw4hqQBDXCE0os7AdsSe0MGMyWkE,4338
7
+ AQF/aqf_utils.py,sha256=QrIC6hM2XrUI4b0QcsdlF9wqaSSNEx5KzOagNrctFyI,2086
8
+ aqf-1.1.dist-info/METADATA,sha256=KlhD8zP1gE3Mqn2SMqt5TLbZ3wQYeIGRadwOuLbCOnw,204
9
+ aqf-1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
10
+ aqf-1.1.dist-info/top_level.txt,sha256=kd_ZM7nabvEJmgyyqllhL26UZV7OC_PxvShFSjS_XvQ,4
11
+ aqf-1.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ AQF