duckguard 2.3.0__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,796 @@
1
+ """Conditional check implementations for DuckGuard 3.0.
2
+
3
+ This module provides conditional validation checks that apply rules only when
4
+ a specified condition is true. This enables sophisticated data quality checks
5
+ like:
6
+ - "Column must not be null when country = 'USA'"
7
+ - "Amount must be positive when status = 'completed'"
8
+ - "Email is required when customer_type = 'registered'"
9
+
10
+ Security Note:
11
+ All SQL conditions are validated to prevent SQL injection. Only SELECT
12
+ queries with WHERE clauses are allowed. No data modification statements
13
+ (INSERT, UPDATE, DELETE, DROP, etc.) are permitted.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import re
19
+ from dataclasses import dataclass, field
20
+ from typing import Any
21
+
22
+ from duckguard.core.result import ValidationResult
23
+ from duckguard.errors import ValidationError
24
+
25
+
26
@dataclass
class QueryValidationResult:
    """Outcome of validating a SQL condition string.

    The field order below is part of the generated constructor signature,
    so positional construction depends on it.
    """

    # True when the condition passed validation, False when it was rejected.
    is_valid: bool
    # Human-readable rejection reason; defaults to None.
    error_message: str | None = None
    # Non-fatal findings; every instance receives its own fresh list.
    warnings: list[str] = field(default_factory=list)
    # Complexity rating on a 0-100 scale; a larger value means more complex.
    complexity_score: int = 0
34
+
35
+
36
class QueryValidator:
    """Validates SQL conditions for security and basic well-formedness.

    All checks are purely textual; no SQL parser is invoked. They run in
    this order and the first failing check determines the error message:
        1. Keyword blacklist (no destructive operations)
        2. Warning keywords (recorded as warnings, not rejected)
        3. Balanced parentheses and balanced single quotes
        4. Known SQL injection patterns
        5. Complexity limit

    Keyword and pattern matching is performed on the upper-cased condition
    text as a whole, which includes the content of string literals. A
    condition such as ``status = 'DELETE'`` is therefore rejected too.

    Attributes:
        max_complexity: Highest complexity score that is still accepted.

    Examples:
        >>> validator = QueryValidator()
        >>> result = validator.validate("country = 'USA'")
        >>> assert result.is_valid

        >>> result = validator.validate("DROP TABLE users")
        >>> assert not result.is_valid
        >>> assert "forbidden keyword" in result.error_message.lower()
    """

    # Keywords that are absolutely forbidden in conditions
    FORBIDDEN_KEYWORDS = [
        "INSERT", "UPDATE", "DELETE", "DROP", "CREATE",
        "ALTER", "TRUNCATE", "GRANT", "REVOKE", "EXECUTE",
        "EXEC", "CALL", "MERGE", "REPLACE", "PRAGMA"
    ]

    # Keywords that suggest dangerous operations
    WARNING_KEYWORDS = [
        "ATTACH", "DETACH", "IMPORT", "EXPORT", "COPY",
        "LOAD", "INSTALL"
    ]

    # Regexes (matched against the upper-cased condition) that reject it.
    _INJECTION_PATTERNS = (
        r'--',  # SQL comments
        r'/\*',  # Block comment start
        r'\*/',  # Block comment end
        r';.*',  # Multiple statements
        r'\bOR\s+[\'"]?1[\'"]?\s*=\s*[\'"]?1[\'"]?',  # OR 1=1
        r'\bUNION\b.*\bSELECT\b',  # UNION SELECT
    )

    def __init__(self, max_complexity: int = 50):
        """Initialize validator.

        Args:
            max_complexity: Maximum allowed complexity score (0-100)
        """
        self.max_complexity = max_complexity

    def validate(self, condition: str) -> QueryValidationResult:
        """Validate a SQL condition string.

        Args:
            condition: SQL WHERE clause condition (without WHERE keyword)

        Returns:
            QueryValidationResult with validation status and details.
            Warnings and the complexity score are only populated on the
            result of a successful validation.

        Examples:
            >>> validator = QueryValidator()
            >>> result = validator.validate("status = 'active'")
            >>> assert result.is_valid

            >>> result = validator.validate("amount > 100 AND category = 'A'")
            >>> assert result.is_valid
        """
        if not condition or not condition.strip():
            return QueryValidationResult(
                is_valid=False,
                error_message="Condition cannot be empty"
            )

        condition = condition.strip()
        condition_upper = condition.upper()

        # Step 1: Check for forbidden keywords (first match in list order wins)
        for keyword in self.FORBIDDEN_KEYWORDS:
            if re.search(rf'\b{keyword}\b', condition_upper):
                return QueryValidationResult(
                    is_valid=False,
                    error_message=f"Forbidden keyword detected: {keyword}. "
                                  f"Only SELECT queries with WHERE clauses are allowed."
                )

        # Step 2: Collect warning keywords (non-fatal)
        warnings = [
            f"Warning: Potentially dangerous keyword detected: {keyword}"
            for keyword in self.WARNING_KEYWORDS
            if re.search(rf'\b{keyword}\b', condition_upper)
        ]

        # Step 3: Basic syntax checks.
        # The parenthesis check is order-aware: a condition like
        # "a = 1) OR (b = 2" has equal counts of '(' and ')' but would
        # close a surrounding "WHERE (...)" wrapper early, so it is rejected.
        if not self._parentheses_balanced(condition):
            return QueryValidationResult(
                is_valid=False,
                error_message="Unbalanced parentheses in condition"
            )

        if condition.count("'") % 2 != 0:
            return QueryValidationResult(
                is_valid=False,
                error_message="Unbalanced quotes in condition"
            )

        # Step 4: Check for common SQL injection patterns
        if any(
            re.search(pattern, condition_upper)
            for pattern in self._INJECTION_PATTERNS
        ):
            return QueryValidationResult(
                is_valid=False,
                error_message="Potential SQL injection detected. "
                              "Suspicious pattern found."
            )

        # Step 5: Calculate complexity score
        complexity = self._calculate_complexity(condition)
        if complexity > self.max_complexity:
            return QueryValidationResult(
                is_valid=False,
                error_message=f"Condition too complex (score: {complexity}, "
                              f"max: {self.max_complexity}). "
                              f"Simplify your condition or use query-based checks."
            )

        # All checks passed
        return QueryValidationResult(
            is_valid=True,
            warnings=warnings,
            complexity_score=complexity
        )

    @staticmethod
    def _parentheses_balanced(condition: str) -> bool:
        """Return True when parentheses outside quotes are properly nested.

        Parentheses inside single-quoted strings or double-quoted
        identifiers are ignored. The scan fails as soon as a ')' appears
        without a preceding unmatched '(', and at the end if any '(' is
        still open.

        Args:
            condition: SQL condition string

        Returns:
            True if every ')' closes an earlier '(' and none remain open
        """
        depth = 0
        open_quote = None  # the quote character we are currently inside, if any

        for char in condition:
            if open_quote is not None:
                # A doubled quote ('') closes and immediately reopens the
                # literal, which leaves the state correct.
                if char == open_quote:
                    open_quote = None
            elif char in ("'", '"'):
                open_quote = char
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth < 0:
                    return False

        return depth == 0

    def _calculate_complexity(self, condition: str) -> int:
        """Calculate complexity score for a condition (0-100).

        Factors:
            - Length of condition string (1 point per 10 chars, max 20)
            - Logical operators AND / OR / NOT (5 points each)
            - Runs of comparison characters ``< > = !`` (2 points each)
            - Maximum parenthesis nesting depth (10 points per level)
            - Word followed by '(' (8 points each; this also matches a
              keyword directly followed by a parenthesis, e.g. ``AND (``)
            - Occurrences of the substring SELECT (20 points each)

        Args:
            condition: SQL condition string

        Returns:
            Complexity score, capped at 100
        """
        condition_upper = condition.upper()

        score = sum((
            min(20, len(condition) // 10),
            5 * len(re.findall(r'\b(AND|OR|NOT)\b', condition_upper)),
            2 * len(re.findall(r'[<>=!]+', condition)),
            10 * self._calculate_nesting_depth(condition),
            8 * len(re.findall(r'\w+\s*\(', condition)),
            20 * condition_upper.count('SELECT'),
        ))

        return min(100, score)

    def _calculate_nesting_depth(self, condition: str) -> int:
        """Calculate maximum nesting depth of parentheses.

        Every '(' and ')' character is counted, including those inside
        string literals.

        Args:
            condition: SQL condition string

        Returns:
            Maximum nesting depth
        """
        max_depth = 0
        current_depth = 0

        for char in condition:
            if char == '(':
                current_depth += 1
                max_depth = max(max_depth, current_depth)
            elif char == ')':
                current_depth -= 1

        return max_depth
233
+
234
+
235
class ConditionalCheckHandler:
    """Executes conditional validation checks.

    This handler translates conditional checks into SQL queries that filter
    data based on a condition before applying the validation rule.

    Pattern:
        WHERE (condition) AND NOT (check_passes)

    Example:
        For not_null_when(condition="country = 'USA'"):
            SELECT COUNT(*) FROM table
            WHERE (country = 'USA') AND (column IS NULL)

        This counts rows where the condition is true BUT the check fails.

    Threshold semantics:
        For the not-null, between, isin and pattern checks, ``threshold``
        is the minimum fraction of matching rows that must satisfy the
        rule: the check passes when ``violation_rate <= 1.0 - threshold``
        (within a small floating-point tolerance). The default of 1.0
        therefore allows no violations. For the unique check it is the
        minimum ratio of distinct values to matching rows.

    NOTE(review): ``column`` arguments are interpolated into the SQL text
    unquoted and unvalidated; only ``condition`` goes through the
    validator. Callers must pass trusted column identifiers.

    Attributes:
        validator: QueryValidator instance for SQL validation
        timeout_seconds: Maximum query execution time. The value is stored
            on the instance; nothing in this class applies it to queries.

    Examples:
        >>> handler = ConditionalCheckHandler()
        >>> result = handler.execute_not_null_when(
        ...     dataset=my_data,
        ...     column="state",
        ...     condition="country = 'USA'"
        ... )
    """

    # Absolute tolerance for rate comparisons. Without it, threshold=0.9
    # with exactly 10% violations fails because 1.0 - 0.9 evaluates to
    # 0.09999999999999998 in binary floating point.
    _RATE_TOLERANCE = 1e-9

    def __init__(
        self,
        validator: QueryValidator | None = None,
        timeout_seconds: int = 30
    ):
        """Initialize conditional check handler.

        Args:
            validator: Query validator (creates default if None)
            timeout_seconds: Maximum query execution time
        """
        self.validator = validator or QueryValidator()
        self.timeout_seconds = timeout_seconds

    def execute_not_null_when(
        self,
        dataset,
        column: str,
        condition: str,
        threshold: float = 1.0
    ) -> ValidationResult:
        """Check column is not null when condition is true.

        Args:
            dataset: Dataset to validate
            column: Column name to check
            condition: SQL WHERE clause condition
            threshold: Minimum required pass rate (0.0-1.0)

        Returns:
            ValidationResult with pass/fail status

        Raises:
            ValidationError: If condition is invalid or unsafe, or if
                executing the queries fails
            ValueError: If threshold is outside 0.0-1.0

        Examples:
            >>> data = connect("customers.csv")
            >>> result = handler.execute_not_null_when(
            ...     dataset=data,
            ...     column="state",
            ...     condition="country = 'USA'"
            ... )
            >>> assert result.passed
        """
        validation = self._require_valid_condition(condition)
        self._require_valid_threshold(threshold)
        source = self._source_literal(dataset)

        try:
            violations = self._count_rows(
                dataset, source, condition,
                predicate=f"{column} IS NULL", alias="violations"
            )
            total_matching = self._count_rows(dataset, source, condition)

            if total_matching == 0:
                # No rows match condition - check passes vacuously
                return ValidationResult(
                    passed=True,
                    actual_value=0,
                    expected_value=0,
                    message=f"No rows match condition: {condition}. "
                            f"Check passes vacuously.",
                    details={
                        "condition": condition,
                        "matching_rows": 0,
                        "violations": 0
                    }
                )

            return self._violation_result(
                column=column,
                check="not null",
                condition=condition,
                violations=violations,
                total=total_matching,
                threshold=threshold,
                extra_details={
                    "threshold": threshold,
                    "complexity_score": validation.complexity_score
                }
            )

        except Exception as e:
            raise ValidationError(
                f"Error executing conditional check: {str(e)}"
            ) from e

    def execute_unique_when(
        self,
        dataset,
        column: str,
        condition: str,
        threshold: float = 1.0
    ) -> ValidationResult:
        """Check column is unique when condition is true.

        The check passes when ``COUNT(DISTINCT column) / matching rows``
        is at least ``threshold``. The reported ``actual_value`` is the
        number of distinct values that occur more than once.

        Args:
            dataset: Dataset to validate
            column: Column name to check
            condition: SQL WHERE clause condition
            threshold: Minimum required uniqueness rate (0.0-1.0)

        Returns:
            ValidationResult with pass/fail status

        Raises:
            ValidationError: If condition is invalid or unsafe, or if
                executing the queries fails
            ValueError: If threshold is outside 0.0-1.0

        Examples:
            >>> result = handler.execute_unique_when(
            ...     dataset=data,
            ...     column="order_id",
            ...     condition="status = 'completed'"
            ... )
        """
        self._require_valid_condition(condition)
        self._require_valid_threshold(threshold)
        source = self._source_literal(dataset)

        # Number of values that appear more than once among matching rows
        duplicates_sql = f"""
            SELECT COUNT(*) as duplicates
            FROM (
                SELECT {column}, COUNT(*) as cnt
                FROM {source}
                WHERE ({condition})
                GROUP BY {column}
                HAVING cnt > 1
            ) as dups
        """
        distinct_sql = f"""
            SELECT COUNT(DISTINCT {column}) as distinct_count
            FROM {source}
            WHERE ({condition})
        """

        try:
            duplicate_values = dataset._engine.fetch_value(duplicates_sql)
            distinct_count = dataset._engine.fetch_value(distinct_sql)
            total_matching = self._count_rows(dataset, source, condition)

            if total_matching == 0:
                return self._no_match_result(condition)

            uniqueness_rate = distinct_count / total_matching
            passed = uniqueness_rate >= threshold

            return ValidationResult(
                passed=passed,
                actual_value=duplicate_values,
                expected_value=0,
                message=self._format_message(
                    passed=passed,
                    column=column,
                    check="unique",
                    condition=condition,
                    violations=duplicate_values,
                    total=total_matching,
                    violation_rate=1.0 - uniqueness_rate
                ),
                details={
                    "condition": condition,
                    "matching_rows": total_matching,
                    "distinct_values": distinct_count,
                    "duplicate_values": duplicate_values,
                    "uniqueness_rate": uniqueness_rate,
                    "threshold": threshold
                }
            )

        except Exception as e:
            raise ValidationError(
                f"Error executing conditional unique check: {str(e)}"
            ) from e

    def execute_between_when(
        self,
        dataset,
        column: str,
        min_value: float,
        max_value: float,
        condition: str,
        threshold: float = 1.0
    ) -> ValidationResult:
        """Check column is between min and max when condition is true.

        Both bounds are inclusive: a row is a violation only when the
        column is strictly below ``min_value`` or strictly above
        ``max_value``.

        Args:
            dataset: Dataset to validate
            column: Column name to check
            min_value: Minimum allowed value
            max_value: Maximum allowed value
            condition: SQL WHERE clause condition
            threshold: Minimum required pass rate (0.0-1.0)

        Returns:
            ValidationResult with pass/fail status

        Raises:
            ValidationError: If condition is invalid or unsafe, or if
                executing the queries fails
            ValueError: If threshold is outside 0.0-1.0
        """
        self._require_valid_condition(condition)
        self._require_valid_threshold(threshold)
        source = self._source_literal(dataset)

        try:
            violations = self._count_rows(
                dataset, source, condition,
                predicate=f"{column} < {min_value} OR {column} > {max_value}",
                alias="violations"
            )
            total_matching = self._count_rows(dataset, source, condition)

            if total_matching == 0:
                return self._no_match_result(condition)

            return self._violation_result(
                column=column,
                check=f"between {min_value} and {max_value}",
                condition=condition,
                violations=violations,
                total=total_matching,
                threshold=threshold,
                extra_details={
                    "min_value": min_value,
                    "max_value": max_value,
                    "threshold": threshold
                }
            )

        except Exception as e:
            raise ValidationError(
                f"Error executing conditional between check: {str(e)}"
            ) from e

    def execute_isin_when(
        self,
        dataset,
        column: str,
        allowed_values: list[Any],
        condition: str,
        threshold: float = 1.0
    ) -> ValidationResult:
        """Check column is in allowed values when condition is true.

        The type of the first allowed value decides the SQL formatting of
        the whole list: if it is a string, every value is rendered as a
        quoted (and quote-escaped) string literal; otherwise every value
        is rendered with ``str()``.

        Args:
            dataset: Dataset to validate
            column: Column name to check
            allowed_values: Non-empty list of allowed values
            condition: SQL WHERE clause condition
            threshold: Minimum required pass rate (0.0-1.0)

        Returns:
            ValidationResult with pass/fail status

        Raises:
            ValidationError: If condition is invalid or unsafe, or if
                executing the queries fails
            ValueError: If threshold is outside 0.0-1.0 or allowed_values
                is empty
        """
        self._require_valid_condition(condition)
        self._require_valid_threshold(threshold)
        source = self._source_literal(dataset)

        values = list(allowed_values)
        if not values:
            # An empty list would otherwise produce the invalid "NOT IN ()"
            raise ValueError("allowed_values must not be empty")

        # Format allowed values for SQL IN clause
        if isinstance(values[0], str):
            values_str = ", ".join(self._sql_string(v) for v in values)
        else:
            values_str = ", ".join(str(v) for v in values)

        try:
            violations = self._count_rows(
                dataset, source, condition,
                predicate=f"{column} NOT IN ({values_str})",
                alias="violations"
            )
            total_matching = self._count_rows(dataset, source, condition)

            if total_matching == 0:
                return self._no_match_result(condition)

            return self._violation_result(
                column=column,
                check=f"in {allowed_values}",
                condition=condition,
                violations=violations,
                total=total_matching,
                threshold=threshold,
                extra_details={
                    "allowed_values": allowed_values,
                    "threshold": threshold
                }
            )

        except Exception as e:
            raise ValidationError(
                f"Error executing conditional isin check: {str(e)}"
            ) from e

    def execute_pattern_when(
        self,
        dataset,
        column: str,
        pattern: str,
        condition: str,
        threshold: float = 1.0
    ) -> ValidationResult:
        """Check column matches pattern when condition is true.

        The column is cast to VARCHAR and tested with ``regexp_matches``.
        Single quotes in ``pattern`` are escaped before it is embedded in
        the query.

        Args:
            dataset: Dataset to validate
            column: Column name to check
            pattern: Regex pattern to match
            condition: SQL WHERE clause condition
            threshold: Minimum required pass rate (0.0-1.0)

        Returns:
            ValidationResult with pass/fail status

        Raises:
            ValidationError: If condition is invalid or unsafe, or if
                executing the queries fails
            ValueError: If threshold is outside 0.0-1.0
        """
        self._require_valid_condition(condition)
        self._require_valid_threshold(threshold)
        source = self._source_literal(dataset)
        pattern_literal = self._sql_string(pattern)

        try:
            violations = self._count_rows(
                dataset, source, condition,
                predicate=(
                    f"NOT regexp_matches({column}::VARCHAR, {pattern_literal})"
                ),
                alias="violations"
            )
            total_matching = self._count_rows(dataset, source, condition)

            if total_matching == 0:
                return self._no_match_result(condition)

            return self._violation_result(
                column=column,
                check=f"matches pattern '{pattern}'",
                condition=condition,
                violations=violations,
                total=total_matching,
                threshold=threshold,
                extra_details={
                    "pattern": pattern,
                    "threshold": threshold
                }
            )

        except Exception as e:
            raise ValidationError(
                f"Error executing conditional pattern check: {str(e)}"
            ) from e

    def _require_valid_condition(self, condition: str):
        """Run the validator on ``condition`` and return its result.

        Raises:
            ValidationError: If the validator rejects the condition
        """
        validation = self.validator.validate(condition)
        if not validation.is_valid:
            raise ValidationError(
                f"Invalid condition: {validation.error_message}"
            )
        return validation

    @staticmethod
    def _require_valid_threshold(threshold: float) -> None:
        """Raise ValueError unless ``0.0 <= threshold <= 1.0``."""
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be between 0.0 and 1.0, got {threshold}")

    @staticmethod
    def _sql_string(value: Any) -> str:
        """Render ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so the value cannot terminate
        the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    @classmethod
    def _source_literal(cls, dataset) -> str:
        """Return the dataset source as a quoted literal for a FROM clause.

        Backslashes are converted to forward slashes (forward slashes work
        on all platforms for DuckDB) and single quotes are escaped.
        """
        return cls._sql_string(dataset._source.replace('\\', '/'))

    @staticmethod
    def _count_rows(
        dataset,
        source: str,
        condition: str,
        predicate: str | None = None,
        alias: str = "total"
    ) -> int:
        """Count rows matching ``condition`` and, optionally, ``predicate``.

        Args:
            dataset: Dataset whose engine executes the query
            source: Already-quoted source literal for the FROM clause
            condition: Validated SQL condition
            predicate: Extra SQL predicate ANDed with the condition
            alias: Column alias used for the count in the query

        Returns:
            The value returned by the engine for the COUNT(*) query
        """
        where = f"({condition})"
        if predicate is not None:
            where = f"{where} AND ({predicate})"

        sql = f"""
            SELECT COUNT(*) as {alias}
            FROM {source}
            WHERE {where}
        """
        return dataset._engine.fetch_value(sql)

    @classmethod
    def _passes_threshold(cls, violation_rate: float, threshold: float) -> bool:
        """Return True when the violation rate is within the allowed rate.

        The allowed rate is ``1.0 - threshold``; a small absolute tolerance
        absorbs binary floating-point rounding of that subtraction.
        """
        return violation_rate <= (1.0 - threshold) + cls._RATE_TOLERANCE

    @staticmethod
    def _no_match_result(condition: str) -> ValidationResult:
        """Build the passing result used when no row matches the condition."""
        return ValidationResult(
            passed=True,
            actual_value=0,
            expected_value=0,
            message=f"No rows match condition: {condition}",
            details={"condition": condition, "matching_rows": 0}
        )

    def _violation_result(
        self,
        *,
        column: str,
        check: str,
        condition: str,
        violations: int,
        total: int,
        threshold: float,
        extra_details: dict[str, Any]
    ) -> ValidationResult:
        """Build the result for a violation-counting check.

        Args:
            column: Column name
            check: Check description used in the message
            condition: SQL condition
            violations: Number of matching rows that fail the rule
            total: Number of rows matching the condition (must be > 0)
            threshold: Minimum required pass rate
            extra_details: Check-specific entries appended to the common
                details (condition, matching_rows, violations,
                violation_rate), in the given order
        """
        violation_rate = violations / total
        passed = self._passes_threshold(violation_rate, threshold)

        details = {
            "condition": condition,
            "matching_rows": total,
            "violations": violations,
            "violation_rate": violation_rate,
        }
        details.update(extra_details)

        return ValidationResult(
            passed=passed,
            actual_value=violations,
            expected_value=0,
            message=self._format_message(
                passed=passed,
                column=column,
                check=check,
                condition=condition,
                violations=violations,
                total=total,
                violation_rate=violation_rate
            ),
            details=details
        )

    def _format_message(
        self,
        passed: bool,
        column: str,
        check: str,
        condition: str,
        violations: int,
        total: int,
        violation_rate: float
    ) -> str:
        """Format human-readable validation message.

        Args:
            passed: Whether check passed
            column: Column name
            check: Check description
            condition: SQL condition
            violations: Number of violations
            total: Total rows matching condition
            violation_rate: Violation rate (0.0-1.0)

        Returns:
            Formatted message string
        """
        status = "PASSED" if passed else "FAILED"
        return (
            f"Column '{column}' {check} when {condition}: "
            f"{status} ({violations}/{total} violations, "
            f"{violation_rate:.1%} failure rate)"
        )