duckguard 2.3.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- duckguard/__init__.py +1 -1
- duckguard/checks/__init__.py +26 -0
- duckguard/checks/conditional.py +796 -0
- duckguard/checks/distributional.py +524 -0
- duckguard/checks/multicolumn.py +726 -0
- duckguard/checks/query_based.py +643 -0
- duckguard/connectors/factory.py +30 -2
- duckguard/connectors/files.py +7 -3
- duckguard/core/column.py +372 -0
- duckguard/core/dataset.py +330 -0
- duckguard/profiler/distribution_analyzer.py +384 -0
- duckguard/profiler/outlier_detector.py +497 -0
- duckguard/profiler/pattern_matcher.py +301 -0
- duckguard/profiler/quality_scorer.py +445 -0
- duckguard/rules/executor.py +642 -0
- duckguard/rules/schema.py +31 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/METADATA +120 -1
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/RECORD +21 -12
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/WHEEL +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/entry_points.txt +0 -0
- {duckguard-2.3.0.dist-info → duckguard-3.0.0.dist-info}/licenses/LICENSE +0 -0
duckguard/checks/conditional.py (new file)

@@ -0,0 +1,796 @@
+"""Conditional check implementations for DuckGuard 3.0.
+
+This module provides conditional validation checks that apply rules only when
+a specified condition is true. This enables sophisticated data quality checks
+like:
+- "Column must not be null when country = 'USA'"
+- "Amount must be positive when status = 'completed'"
+- "Email is required when customer_type = 'registered'"
+
+Security Note:
+    All SQL conditions are validated to prevent SQL injection. Only SELECT
+    queries with WHERE clauses are allowed. No data modification statements
+    (INSERT, UPDATE, DELETE, DROP, etc.) are permitted.
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import Any
+
+from duckguard.core.result import ValidationResult
+from duckguard.errors import ValidationError
+
+
+@dataclass
+class QueryValidationResult:
+    """Result of SQL query validation."""
+
+    is_valid: bool
+    error_message: str | None = None
+    warnings: list[str] = field(default_factory=list)
+    complexity_score: int = 0  # 0-100, higher = more complex
+
+
+class QueryValidator:
+    """Validates SQL conditions for security and correctness.
+
+    This validator prevents SQL injection and ensures queries are safe to execute.
+    It applies multiple layers of validation:
+    1. Keyword blacklist (no destructive operations)
+    2. Syntax validation via DuckDB parser
+    3. Complexity analysis
+    4. Read-only enforcement
+
+    Examples:
+        >>> validator = QueryValidator()
+        >>> result = validator.validate("country = 'USA'")
+        >>> assert result.is_valid
+
+        >>> result = validator.validate("DROP TABLE users")
+        >>> assert not result.is_valid
+        >>> assert "forbidden keyword" in result.error_message.lower()
+    """
+
+    # Keywords that are absolutely forbidden in conditions
+    FORBIDDEN_KEYWORDS = [
+        "INSERT", "UPDATE", "DELETE", "DROP", "CREATE",
+        "ALTER", "TRUNCATE", "GRANT", "REVOKE", "EXECUTE",
+        "EXEC", "CALL", "MERGE", "REPLACE", "PRAGMA"
+    ]
+
+    # Keywords that suggest dangerous operations
+    WARNING_KEYWORDS = [
+        "ATTACH", "DETACH", "IMPORT", "EXPORT", "COPY",
+        "LOAD", "INSTALL"
+    ]
+
+    def __init__(self, max_complexity: int = 50):
+        """Initialize validator.
+
+        Args:
+            max_complexity: Maximum allowed complexity score (0-100)
+        """
+        self.max_complexity = max_complexity
+
+    def validate(self, condition: str) -> QueryValidationResult:
+        """Validate a SQL condition string.
+
+        Args:
+            condition: SQL WHERE clause condition (without WHERE keyword)
+
+        Returns:
+            QueryValidationResult with validation status and details
+
+        Examples:
+            >>> validator = QueryValidator()
+            >>> result = validator.validate("status = 'active'")
+            >>> assert result.is_valid
+
+            >>> result = validator.validate("amount > 100 AND category = 'A'")
+            >>> assert result.is_valid
+        """
+        if not condition or not condition.strip():
+            return QueryValidationResult(
+                is_valid=False,
+                error_message="Condition cannot be empty"
+            )
+
+        condition = condition.strip()
+
+        # Step 1: Check for forbidden keywords
+        condition_upper = condition.upper()
+        for keyword in self.FORBIDDEN_KEYWORDS:
+            if re.search(rf'\b{keyword}\b', condition_upper):
+                return QueryValidationResult(
+                    is_valid=False,
+                    error_message=f"Forbidden keyword detected: {keyword}. "
+                                  f"Only SELECT queries with WHERE clauses are allowed."
+                )
+
+        # Step 2: Check for warning keywords
+        warnings = []
+        for keyword in self.WARNING_KEYWORDS:
+            if re.search(rf'\b{keyword}\b', condition_upper):
+                warnings.append(
+                    f"Warning: Potentially dangerous keyword detected: {keyword}"
+                )
+
+        # Step 3: Basic syntax checks
+        if condition.count('(') != condition.count(')'):
+            return QueryValidationResult(
+                is_valid=False,
+                error_message="Unbalanced parentheses in condition"
+            )
+
+        if condition.count("'") % 2 != 0:
+            return QueryValidationResult(
+                is_valid=False,
+                error_message="Unbalanced quotes in condition"
+            )
+
+        # Step 4: Check for common SQL injection patterns
+        injection_patterns = [
+            r'--',  # SQL comments
+            r'/\*',  # Block comment start
+            r'\*/',  # Block comment end
+            r';.*',  # Multiple statements
+            r'\bOR\s+[\'"]?1[\'"]?\s*=\s*[\'"]?1[\'"]?',  # OR 1=1
+            r'\bUNION\b.*\bSELECT\b',  # UNION SELECT
+        ]
+
+        for pattern in injection_patterns:
+            if re.search(pattern, condition_upper):
+                return QueryValidationResult(
+                    is_valid=False,
+                    error_message="Potential SQL injection detected. "
+                                  "Suspicious pattern found."
+                )
+
+        # Step 5: Calculate complexity score
+        complexity = self._calculate_complexity(condition)
+        if complexity > self.max_complexity:
+            return QueryValidationResult(
+                is_valid=False,
+                error_message=f"Condition too complex (score: {complexity}, "
+                              f"max: {self.max_complexity}). "
+                              f"Simplify your condition or use query-based checks."
+            )
+
+        # All checks passed
+        return QueryValidationResult(
+            is_valid=True,
+            warnings=warnings,
+            complexity_score=complexity
+        )
+
+    def _calculate_complexity(self, condition: str) -> int:
+        """Calculate complexity score for a condition (0-100).
+
+        Factors:
+        - Length of condition string
+        - Number of operators (AND, OR, NOT)
+        - Number of comparisons
+        - Nesting depth (parentheses)
+        - Number of function calls
+
+        Args:
+            condition: SQL condition string
+
+        Returns:
+            Complexity score (0-100)
+        """
+        score = 0
+        condition_upper = condition.upper()
+
+        # Length factor (0-20 points)
+        length_score = min(20, len(condition) // 10)
+        score += length_score
+
+        # Logical operators (5 points each)
+        logical_ops = len(re.findall(r'\b(AND|OR|NOT)\b', condition_upper))
+        score += logical_ops * 5
+
+        # Comparison operators (2 points each)
+        comparisons = len(re.findall(r'[<>=!]+', condition))
+        score += comparisons * 2
+
+        # Nesting depth (10 points per level)
+        max_depth = self._calculate_nesting_depth(condition)
+        score += max_depth * 10
+
+        # Function calls (8 points each)
+        functions = len(re.findall(r'\w+\s*\(', condition))
+        score += functions * 8
+
+        # Subqueries (20 points each - very complex)
+        subqueries = condition_upper.count('SELECT')
+        score += subqueries * 20
+
+        return min(100, score)
+
+    def _calculate_nesting_depth(self, condition: str) -> int:
+        """Calculate maximum nesting depth of parentheses.
+
+        Args:
+            condition: SQL condition string
+
+        Returns:
+            Maximum nesting depth
+        """
+        max_depth = 0
+        current_depth = 0
+
+        for char in condition:
+            if char == '(':
+                current_depth += 1
+                max_depth = max(max_depth, current_depth)
+            elif char == ')':
+                current_depth -= 1
+
+        return max_depth
+
+
+class ConditionalCheckHandler:
+    """Executes conditional validation checks.
+
+    This handler translates conditional checks into SQL queries that filter
+    data based on a condition before applying the validation rule.
+
+    Pattern:
+        WHERE (condition) AND NOT (check_passes)
+
+    Example:
+        For not_null_when(condition="country = 'USA'"):
+        SELECT COUNT(*) FROM table
+        WHERE (country = 'USA') AND (column IS NULL)
+
+    This counts rows where the condition is true BUT the check fails.
+
+    Attributes:
+        validator: QueryValidator instance for SQL validation
+        timeout_seconds: Maximum query execution time
+
+    Examples:
+        >>> handler = ConditionalCheckHandler()
+        >>> result = handler.execute_not_null_when(
+        ...     dataset=my_data,
+        ...     column="state",
+        ...     condition="country = 'USA'"
+        ... )
+    """
+
+    def __init__(
+        self,
+        validator: QueryValidator | None = None,
+        timeout_seconds: int = 30
+    ):
+        """Initialize conditional check handler.
+
+        Args:
+            validator: Query validator (creates default if None)
+            timeout_seconds: Maximum query execution time
+        """
+        self.validator = validator or QueryValidator()
+        self.timeout_seconds = timeout_seconds
+
+    def execute_not_null_when(
+        self,
+        dataset,
+        column: str,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is not null when condition is true.
+
+        Args:
+            dataset: Dataset to validate
+            column: Column name to check
+            condition: SQL WHERE clause condition
+            threshold: Maximum allowed failure rate (0.0-1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Raises:
+            ValidationError: If condition is invalid or unsafe
+
+        Examples:
+            >>> data = connect("customers.csv")
+            >>> result = handler.execute_not_null_when(
+            ...     dataset=data,
+            ...     column="state",
+            ...     condition="country = 'USA'"
+            ... )
+            >>> assert result.passed
+        """
+        # Validate condition
+        validation = self.validator.validate(condition)
+        if not validation.is_valid:
+            raise ValidationError(
+                f"Invalid condition: {validation.error_message}"
+            )
+
+        # Validate threshold
+        if not 0.0 <= threshold <= 1.0:
+            raise ValueError(f"threshold must be between 0.0 and 1.0, got {threshold}")
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        source_path = dataset._source.replace('\\', '/')
+
+        # Build SQL query
+        sql = f"""
+            SELECT COUNT(*) as violations
+            FROM '{source_path}'
+            WHERE ({condition}) AND ({column} IS NULL)
+        """
+
+        try:
+            # Execute query with timeout
+            violations = dataset._engine.fetch_value(sql)
+
+            # Count rows matching condition
+            count_sql = f"""
+                SELECT COUNT(*) as total
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            total_matching = dataset._engine.fetch_value(count_sql)
+
+            if total_matching == 0:
+                # No rows match condition - check passes vacuously
+                return ValidationResult(
+                    passed=True,
+                    actual_value=0,
+                    expected_value=0,
+                    message=f"No rows match condition: {condition}. "
+                            f"Check passes vacuously.",
+                    details={
+                        "condition": condition,
+                        "matching_rows": 0,
+                        "violations": 0
+                    }
+                )
+
+            # Calculate violation rate
+            violation_rate = violations / total_matching
+            passed = violation_rate <= (1.0 - threshold)
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=violations,
+                expected_value=0,
+                message=self._format_message(
+                    passed=passed,
+                    column=column,
+                    check="not null",
+                    condition=condition,
+                    violations=violations,
+                    total=total_matching,
+                    violation_rate=violation_rate
+                ),
+                details={
+                    "condition": condition,
+                    "matching_rows": total_matching,
+                    "violations": violations,
+                    "violation_rate": violation_rate,
+                    "threshold": threshold,
+                    "complexity_score": validation.complexity_score
+                }
+            )
+
+        except Exception as e:
+            raise ValidationError(
+                f"Error executing conditional check: {str(e)}"
+            ) from e
+
+    def execute_unique_when(
+        self,
+        dataset,
+        column: str,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is unique when condition is true.
+
+        Args:
+            dataset: Dataset to validate
+            column: Column name to check
+            condition: SQL WHERE clause condition
+            threshold: Minimum required uniqueness rate (0.0-1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+
+        Examples:
+            >>> result = handler.execute_unique_when(
+            ...     dataset=data,
+            ...     column="order_id",
+            ...     condition="status = 'completed'"
+            ... )
+        """
+        # Validate condition
+        validation = self.validator.validate(condition)
+        if not validation.is_valid:
+            raise ValidationError(
+                f"Invalid condition: {validation.error_message}"
+            )
+
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        source_path = dataset._source.replace('\\', '/')
+
+        # Check for duplicates in rows matching condition
+        sql = f"""
+            SELECT COUNT(*) as duplicates
+            FROM (
+                SELECT {column}, COUNT(*) as cnt
+                FROM '{source_path}'
+                WHERE ({condition})
+                GROUP BY {column}
+                HAVING cnt > 1
+            ) as dups
+        """
+
+        try:
+            duplicate_values = dataset._engine.fetch_value(sql)
+
+            # Count distinct values matching condition
+            distinct_sql = f"""
+                SELECT COUNT(DISTINCT {column}) as distinct_count
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            distinct_count = dataset._engine.fetch_value(distinct_sql)
+
+            # Count total rows matching condition
+            total_sql = f"""
+                SELECT COUNT(*) as total
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            total_matching = dataset._engine.fetch_value(total_sql)
+
+            if total_matching == 0:
+                return ValidationResult(
+                    passed=True,
+                    actual_value=0,
+                    expected_value=0,
+                    message=f"No rows match condition: {condition}",
+                    details={"condition": condition, "matching_rows": 0}
+                )
+
+            # Calculate uniqueness rate
+            uniqueness_rate = distinct_count / total_matching
+            passed = uniqueness_rate >= threshold
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=duplicate_values,
+                expected_value=0,
+                message=self._format_message(
+                    passed=passed,
+                    column=column,
+                    check="unique",
+                    condition=condition,
+                    violations=duplicate_values,
+                    total=total_matching,
+                    violation_rate=1.0 - uniqueness_rate
+                ),
+                details={
+                    "condition": condition,
+                    "matching_rows": total_matching,
+                    "distinct_values": distinct_count,
+                    "duplicate_values": duplicate_values,
+                    "uniqueness_rate": uniqueness_rate,
+                    "threshold": threshold
+                }
+            )
+
+        except Exception as e:
+            raise ValidationError(
+                f"Error executing conditional unique check: {str(e)}"
+            ) from e
+
+    def execute_between_when(
+        self,
+        dataset,
+        column: str,
+        min_value: float,
+        max_value: float,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is between min and max when condition is true.
+
+        Args:
+            dataset: Dataset to validate
+            column: Column name to check
+            min_value: Minimum allowed value
+            max_value: Maximum allowed value
+            condition: SQL WHERE clause condition
+            threshold: Maximum allowed failure rate (0.0-1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+        """
+        validation = self.validator.validate(condition)
+        if not validation.is_valid:
+            raise ValidationError(
+                f"Invalid condition: {validation.error_message}"
+            )
+
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        source_path = dataset._source.replace('\\', '/')
+
+        sql = f"""
+            SELECT COUNT(*) as violations
+            FROM '{source_path}'
+            WHERE ({condition})
+              AND ({column} < {min_value} OR {column} > {max_value})
+        """
+
+        try:
+            violations = dataset._engine.fetch_value(sql)
+
+            count_sql = f"""
+                SELECT COUNT(*) as total
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            total_matching = dataset._engine.fetch_value(count_sql)
+
+            if total_matching == 0:
+                return ValidationResult(
+                    passed=True,
+                    actual_value=0,
+                    expected_value=0,
+                    message=f"No rows match condition: {condition}",
+                    details={"condition": condition, "matching_rows": 0}
+                )
+
+            violation_rate = violations / total_matching
+            passed = violation_rate <= (1.0 - threshold)
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=violations,
+                expected_value=0,
+                message=self._format_message(
+                    passed=passed,
+                    column=column,
+                    check=f"between {min_value} and {max_value}",
+                    condition=condition,
+                    violations=violations,
+                    total=total_matching,
+                    violation_rate=violation_rate
+                ),
+                details={
+                    "condition": condition,
+                    "matching_rows": total_matching,
+                    "violations": violations,
+                    "violation_rate": violation_rate,
+                    "min_value": min_value,
+                    "max_value": max_value,
+                    "threshold": threshold
+                }
+            )
+
+        except Exception as e:
+            raise ValidationError(
+                f"Error executing conditional between check: {str(e)}"
+            ) from e
+
+    def execute_isin_when(
+        self,
+        dataset,
+        column: str,
+        allowed_values: list[Any],
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column is in allowed values when condition is true.
+
+        Args:
+            dataset: Dataset to validate
+            column: Column name to check
+            allowed_values: List of allowed values
+            condition: SQL WHERE clause condition
+            threshold: Maximum allowed failure rate (0.0-1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+        """
+        validation = self.validator.validate(condition)
+        if not validation.is_valid:
+            raise ValidationError(
+                f"Invalid condition: {validation.error_message}"
+            )
+
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        source_path = dataset._source.replace('\\', '/')
+
+        # Format allowed values for SQL IN clause
+        if isinstance(allowed_values[0], str):
+            values_str = ", ".join(f"'{v}'" for v in allowed_values)
+        else:
+            values_str = ", ".join(str(v) for v in allowed_values)
+
+        sql = f"""
+            SELECT COUNT(*) as violations
+            FROM '{source_path}'
+            WHERE ({condition})
+              AND {column} NOT IN ({values_str})
+        """
+
+        try:
+            violations = dataset._engine.fetch_value(sql)
+
+            count_sql = f"""
+                SELECT COUNT(*) as total
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            total_matching = dataset._engine.fetch_value(count_sql)
+
+            if total_matching == 0:
+                return ValidationResult(
+                    passed=True,
+                    actual_value=0,
+                    expected_value=0,
+                    message=f"No rows match condition: {condition}",
+                    details={"condition": condition, "matching_rows": 0}
+                )
+
+            violation_rate = violations / total_matching
+            passed = violation_rate <= (1.0 - threshold)
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=violations,
+                expected_value=0,
+                message=self._format_message(
+                    passed=passed,
+                    column=column,
+                    check=f"in {allowed_values}",
+                    condition=condition,
+                    violations=violations,
+                    total=total_matching,
+                    violation_rate=violation_rate
+                ),
+                details={
+                    "condition": condition,
+                    "matching_rows": total_matching,
+                    "violations": violations,
+                    "violation_rate": violation_rate,
+                    "allowed_values": allowed_values,
+                    "threshold": threshold
+                }
+            )
+
+        except Exception as e:
+            raise ValidationError(
+                f"Error executing conditional isin check: {str(e)}"
+            ) from e
+
+    def execute_pattern_when(
+        self,
+        dataset,
+        column: str,
+        pattern: str,
+        condition: str,
+        threshold: float = 1.0
+    ) -> ValidationResult:
+        """Check column matches pattern when condition is true.
+
+        Args:
+            dataset: Dataset to validate
+            column: Column name to check
+            pattern: Regex pattern to match
+            condition: SQL WHERE clause condition
+            threshold: Maximum allowed failure rate (0.0-1.0)
+
+        Returns:
+            ValidationResult with pass/fail status
+        """
+        validation = self.validator.validate(condition)
+        if not validation.is_valid:
+            raise ValidationError(
+                f"Invalid condition: {validation.error_message}"
+            )
+
+        # Normalize path for DuckDB (forward slashes work on all platforms)
+        source_path = dataset._source.replace('\\', '/')
+
+        sql = f"""
+            SELECT COUNT(*) as violations
+            FROM '{source_path}'
+            WHERE ({condition})
+              AND NOT regexp_matches({column}::VARCHAR, '{pattern}')
+        """
+
+        try:
+            violations = dataset._engine.fetch_value(sql)
+
+            count_sql = f"""
+                SELECT COUNT(*) as total
+                FROM '{source_path}'
+                WHERE ({condition})
+            """
+            total_matching = dataset._engine.fetch_value(count_sql)
+
+            if total_matching == 0:
+                return ValidationResult(
+                    passed=True,
+                    actual_value=0,
+                    expected_value=0,
+                    message=f"No rows match condition: {condition}",
+                    details={"condition": condition, "matching_rows": 0}
+                )
+
+            violation_rate = violations / total_matching
+            passed = violation_rate <= (1.0 - threshold)
+
+            return ValidationResult(
+                passed=passed,
+                actual_value=violations,
+                expected_value=0,
+                message=self._format_message(
+                    passed=passed,
+                    column=column,
+                    check=f"matches pattern '{pattern}'",
+                    condition=condition,
+                    violations=violations,
+                    total=total_matching,
+                    violation_rate=violation_rate
+                ),
+                details={
+                    "condition": condition,
+                    "matching_rows": total_matching,
+                    "violations": violations,
+                    "violation_rate": violation_rate,
+                    "pattern": pattern,
+                    "threshold": threshold
+                }
+            )
+
+        except Exception as e:
+            raise ValidationError(
+                f"Error executing conditional pattern check: {str(e)}"
+            ) from e
+
+    def _format_message(
+        self,
+        passed: bool,
+        column: str,
+        check: str,
+        condition: str,
+        violations: int,
+        total: int,
+        violation_rate: float
+    ) -> str:
+        """Format human-readable validation message.
+
+        Args:
+            passed: Whether check passed
+            column: Column name
+            check: Check description
+            condition: SQL condition
+            violations: Number of violations
+            total: Total rows matching condition
+            violation_rate: Violation rate (0.0-1.0)
+
+        Returns:
+            Formatted message string
+        """
+        if passed:
+            return (
+                f"Column '{column}' {check} when {condition}: "
+                f"PASSED ({violations}/{total} violations, "
+                f"{violation_rate:.1%} failure rate)"
+            )
+        else:
+            return (
+                f"Column '{column}' {check} when {condition}: "
+                f"FAILED ({violations}/{total} violations, "
+                f"{violation_rate:.1%} failure rate)"
+            )
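
For readers evaluating the upgrade, here is a minimal usage sketch of the new conditional-check API, pieced together from the docstrings in the diff above. The top-level `connect` import and the `customers.csv` path are assumptions taken from the doctest examples; the handler and validator paths follow the module added in this release:

    # Minimal sketch based on the docstrings in conditional.py (assumptions noted inline).
    from duckguard import connect  # assumed export; the docstrings call connect("customers.csv")
    from duckguard.checks.conditional import ConditionalCheckHandler, QueryValidator

    # Screen a condition before it ever reaches DuckDB.
    validator = QueryValidator(max_complexity=50)
    screened = validator.validate("country = 'USA' AND status = 'active'")
    print(screened.is_valid, screened.complexity_score)

    # Conditional not-null rule: `state` must be populated for USA rows.
    data = connect("customers.csv")  # illustrative file path
    handler = ConditionalCheckHandler(validator=validator)
    result = handler.execute_not_null_when(
        dataset=data,
        column="state",
        condition="country = 'USA'",
        threshold=1.0,  # per the implementation, threshold=1.0 allows zero violations
    )
    print(result.passed, result.message)

As the implementation shows, each conditional check issues one COUNT(*) query for violations and one for rows matching the condition, and passes vacuously when no rows match.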