daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ """PII (Personally Identifiable Information) detection rules."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+ from daytashield.rules.base import ComplianceRule, ComplianceViolation
9
+
10
+
11
class PIIDetector(ComplianceRule):
    """Detects personally identifiable information in text content.

    Scans for the following patterns (see ``PATTERNS``):
    - Social Security Numbers (SSN)
    - Credit card numbers
    - Email addresses
    - US phone numbers
    - IPv4 addresses
    - Passport numbers
    - Dates of birth (MM-DD-YYYY or MM/DD/YYYY)
    - Driver's license numbers

    Every pattern is compiled with ``re.IGNORECASE``, so the letter prefixes
    of the passport and driver's license patterns also match lowercase text.

    Example:
        >>> detector = PIIDetector()
        >>> violations = detector.check(data, [("email", "john@example.com")])
        >>> for v in violations:
        ...     print(f"{v.code}: {v.message}")
    """

    name = "pii"
    description = "Detects personally identifiable information"

    # One entry per detectable PII type. "name" is the key used by the
    # ``patterns`` and ``severity_overrides`` constructor arguments; the
    # remaining keys are copied onto the reported ComplianceViolation.
    PATTERNS: list[dict[str, Any]] = [
        {
            "name": "ssn",
            "pattern": r"\b\d{3}[-.\s]?\d{2}[-.\s]?\d{4}\b",
            "code": "PII_SSN",
            "message": "Social Security Number detected",
            "category": "ssn",
            "severity": "error",
            "recommendation": "Remove or encrypt SSN before processing",
        },
        {
            "name": "credit_card",
            "pattern": r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
            "code": "PII_CREDIT_CARD",
            "message": "Credit card number detected",
            "category": "financial",
            "severity": "error",
            "recommendation": "Remove or tokenize credit card numbers",
        },
        {
            "name": "email",
            # The TLD class is [A-Za-z]: inside a character set "|" is a
            # literal pipe, not alternation, so it must not appear here.
            "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            "code": "PII_EMAIL",
            "message": "Email address detected",
            "category": "contact",
            "severity": "warning",
            "recommendation": "Consider if email is necessary or should be hashed",
        },
        {
            "name": "phone_us",
            "pattern": r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b",
            "code": "PII_PHONE",
            "message": "Phone number detected",
            "category": "contact",
            "severity": "warning",
            "recommendation": "Consider if phone number is necessary",
        },
        {
            "name": "ip_address",
            "pattern": r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
            "code": "PII_IP_ADDRESS",
            "message": "IP address detected",
            "category": "technical",
            "severity": "warning",
            "recommendation": "Consider anonymizing IP addresses",
        },
        {
            "name": "passport_us",
            "pattern": r"\b[A-Z]{1,2}[0-9]{6,9}\b",
            "code": "PII_PASSPORT",
            "message": "Potential passport number detected",
            "category": "identity",
            "severity": "error",
            "recommendation": "Remove passport numbers from data",
        },
        {
            "name": "date_of_birth",
            "pattern": r"\b(?:0[1-9]|1[0-2])[-/](?:0[1-9]|[12][0-9]|3[01])[-/](?:19|20)\d{2}\b",
            "code": "PII_DOB",
            "message": "Date of birth detected",
            "category": "identity",
            "severity": "warning",
            "recommendation": "Consider if full DOB is necessary (year might suffice)",
        },
        {
            "name": "drivers_license",
            "pattern": r"\b[A-Z][0-9]{7,8}\b",
            "code": "PII_DRIVERS_LICENSE",
            "message": "Potential driver's license number detected",
            "category": "identity",
            "severity": "error",
            "recommendation": "Remove driver's license numbers",
        },
    ]

    def __init__(
        self,
        patterns: list[str] | None = None,
        severity_overrides: dict[str, str] | None = None,
    ):
        """Initialize the PII detector.

        Args:
            patterns: Names of the patterns to enable. ``None`` or an empty
                list enables every entry of ``PATTERNS``; names that match
                no entry are ignored.
            severity_overrides: Mapping of pattern name to the severity that
                should be reported instead of the pattern's default.
        """
        self.enabled_patterns = patterns
        self.severity_overrides = severity_overrides or {}
        self._compiled_patterns: list[tuple[re.Pattern[str], dict[str, Any]]] = []
        self._compile_patterns()

    def _compile_patterns(self) -> None:
        """Compile the enabled regex patterns once, up front.

        The list is rebuilt from scratch so that calling this method again
        never leaves duplicate entries behind.
        """
        enabled = self.enabled_patterns
        self._compiled_patterns = [
            (re.compile(pattern_config["pattern"], re.IGNORECASE), pattern_config)
            for pattern_config in self.PATTERNS
            # A falsy selection (None or empty) means "everything".
            if not enabled or pattern_config["name"] in enabled
        ]

    def check(
        self, data: Any, text_content: list[tuple[str, str]]
    ) -> list[ComplianceViolation]:
        """Check for PII in the text content.

        Args:
            data: The original data structure (unused for pattern matching).
            text_content: List of (field_path, text_value) tuples.

        Returns:
            One ComplianceViolation per distinct (field, code, matched text)
            combination, in the order the matches were found. The reported
            ``matched_value`` is redacted.
        """
        violations: list[ComplianceViolation] = []
        seen: set[tuple[str, str, str]] = set()  # Dedupe by (field, code, match)

        for field_path, text in text_content:
            for pattern, config in self._compiled_patterns:
                # finditer + group(0) always yields the full matched text,
                # even if a pattern ever contains a capturing group
                # (findall would return only the group contents then).
                for match in pattern.finditer(text):
                    matched_str = match.group(0)

                    key = (field_path, config["code"], matched_str)
                    if key in seen:
                        continue
                    seen.add(key)

                    # A per-pattern override wins over the default severity.
                    severity = self.severity_overrides.get(
                        config["name"], config["severity"]
                    )

                    violations.append(
                        ComplianceViolation(
                            code=config["code"],
                            message=config["message"],
                            severity=severity,
                            category=config["category"],
                            # An empty path (bare string input) is reported
                            # as "no field".
                            field=field_path or None,
                            # Never carry the raw PII value forward.
                            matched_value=self._redact(matched_str, config["name"]),
                            recommendation=config["recommendation"],
                        )
                    )

        return violations

    def _redact(self, value: str, pattern_name: str) -> str:
        """Redact a matched value for safe logging.

        Args:
            value: The matched value.
            pattern_name: Name of the pattern that matched.

        Returns:
            The value with most characters replaced by ``*``:
            - values of four characters or fewer are fully masked;
            - ``ssn``, ``credit_card`` and ``phone_us`` keep the last four
              characters;
            - ``email`` keeps the first character and everything from ``@``;
            - anything else keeps only the first and last character.
        """
        if len(value) <= 4:
            return "*" * len(value)

        if pattern_name in ("ssn", "credit_card", "phone_us"):
            # Show last 4 characters
            return "*" * (len(value) - 4) + value[-4:]

        if pattern_name == "email":
            # Show first char and domain
            at_idx = value.find("@")
            if at_idx > 0:
                return value[0] + "*" * (at_idx - 1) + value[at_idx:]
            return "*" * len(value)

        # Generic: show first and last char
        return value[0] + "*" * (len(value) - 2) + value[-1]
@@ -0,0 +1,15 @@
1
+ """DaytaShield validators for data quality assurance."""
2
+
3
+ from daytashield.validators.base import BaseValidator
4
+ from daytashield.validators.compliance import ComplianceValidator
5
+ from daytashield.validators.freshness import FreshnessValidator
6
+ from daytashield.validators.schema import SchemaValidator
7
+ from daytashield.validators.semantic import SemanticValidator
8
+
9
+ __all__ = [
10
+ "BaseValidator",
11
+ "SchemaValidator",
12
+ "SemanticValidator",
13
+ "FreshnessValidator",
14
+ "ComplianceValidator",
15
+ ]
@@ -0,0 +1,103 @@
1
+ """Base validator abstract class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING, Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ if TYPE_CHECKING:
11
+ from daytashield.core.result import ValidationResult
12
+
13
+
14
class ValidatorConfig(BaseModel):
    """Options common to every validator.

    Keys that are not declared below are accepted rather than rejected
    (``extra`` is set to ``"allow"``).
    """

    model_config = {"extra": "allow"}

    enabled: bool = Field(default=True, description="Whether this validator is enabled")
    fail_fast: bool = Field(default=False, description="Stop validation on first failure")
    severity_override: str | None = Field(
        default=None,
        description="Override default severity (warning, failed)",
    )
24
+
25
+
26
class BaseValidator(ABC):
    """Common base for DaytaShield validators.

    A validator inspects one aspect of the data (schema, freshness, ...)
    and records what it finds on a shared ValidationResult. Subclasses
    implement :meth:`validate`; :meth:`should_run` decides whether the
    validator takes part in a given run.

    Example:
        >>> class MyValidator(BaseValidator):
        ...     name = "my_validator"
        ...
        ...     def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
        ...         if not self._is_valid(data):
        ...             result.add_message(
        ...                 code="MY_001",
        ...                 message="Data failed custom validation",
        ...                 severity=ValidationStatus.FAILED,
        ...                 validator=self.name,
        ...             )
        ...         return result
    """

    name: str = "base_validator"

    def __init__(self, config: ValidatorConfig | dict[str, Any] | None = None):
        """Store the validator configuration.

        Args:
            config: A ready ValidatorConfig, a dict of keyword arguments for
                one, or None to use the defaults.
        """
        if isinstance(config, dict):
            config = ValidatorConfig(**config)
        elif config is None:
            config = ValidatorConfig()
        self.config = config

    @abstractmethod
    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Check ``data`` and record the findings on ``result``.

        Implementations are expected to:
        1. Check the data against this validator's rules
        2. Add a message to the result for every issue found
        3. Optionally transform the data and update result.data
        4. Return the updated result

        Args:
            data: The data to validate (type depends on the validator)
            result: The ValidationResult to update with findings

        Returns:
            The updated ValidationResult
        """

    def should_run(self, data: Any, result: ValidationResult) -> bool:
        """Decide whether this validator takes part in the current run.

        Subclasses may override this to add their own conditions.

        Args:
            data: The data that would be validated
            result: The validation result accumulated so far

        Returns:
            False when the validator is disabled, or when it is in
            fail-fast mode and the result has already failed; True otherwise.
        """
        settings = self.config
        # Disabled validators never run; fail-fast ones stop after a failure.
        if settings.enabled and not (settings.fail_fast and result.failed):
            return True
        return False

    def __repr__(self) -> str:
        cls_name = type(self).__name__
        return f"{cls_name}(name={self.name!r})"
@@ -0,0 +1,222 @@
1
+ """Compliance validation for regulatory requirements."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from pydantic import Field
8
+
9
+ from daytashield.core.result import ValidationResult, ValidationStatus
10
+ from daytashield.validators.base import BaseValidator, ValidatorConfig
11
+
12
+ if TYPE_CHECKING:
13
+ from daytashield.rules.base import ComplianceRule
14
+
15
+
16
class ComplianceValidatorConfig(ValidatorConfig):
    """Options specific to compliance validation, on top of ValidatorConfig."""

    fail_on_warning: bool = Field(
        default=False, description="Treat warnings as failures"
    )
    include_passed_rules: bool = Field(
        default=False, description="Include passed rules in messages"
    )
    scan_nested: bool = Field(
        default=True, description="Scan nested objects and arrays"
    )
    max_depth: int = Field(
        default=10, description="Maximum nesting depth to scan"
    )
23
+
24
+
25
class ComplianceValidator(BaseValidator):
    """Validates data against compliance rules (HIPAA, GDPR, PII, etc.).

    Uses pluggable rule packs to check for compliance violations. Every
    string found in the data is collected together with its field path and
    handed to each rule's ``check`` method; the violations the rules return
    are recorded as messages on the ValidationResult.

    Example:
        >>> from daytashield.rules import HIPAARules, GDPRRules
        >>> validator = ComplianceValidator(
        ...     rules=[HIPAARules(), GDPRRules()],
        ... )
        >>> result = validator.validate(patient_data, result)

    Rule packs that can be loaded by name:
    - "hipaa": HIPAARules
    - "gdpr": GDPRRules
    - "pii": PIIDetector
    """

    name = "compliance"

    def __init__(
        self,
        rules: list[ComplianceRule] | list[str] | None = None,
        config: ComplianceValidatorConfig | dict[str, Any] | None = None,
    ):
        """Initialize the compliance validator.

        Args:
            rules: Rule objects and/or rule pack names to load.
            config: Validator configuration, as a config object or a dict
                of keyword arguments for ComplianceValidatorConfig.

        Raises:
            ValueError: If a rule name is unknown.
        """
        if config is None:
            config = ComplianceValidatorConfig()
        elif isinstance(config, dict):
            config = ComplianceValidatorConfig(**config)
        super().__init__(config)

        self.rules: list[ComplianceRule] = []
        # Reuse add_rule so name resolution lives in exactly one place.
        for rule in rules or []:
            self.add_rule(rule)

    def _load_rule_by_name(self, name: str) -> ComplianceRule:
        """Load a rule pack by name.

        Args:
            name: Rule name (hipaa, gdpr, pii); matched case-insensitively.

        Returns:
            ComplianceRule instance

        Raises:
            ValueError: If rule name is unknown
        """
        name_lower = name.lower()

        # Rule modules are imported lazily, only for the pack requested.
        if name_lower == "hipaa":
            from daytashield.rules.hipaa import HIPAARules

            return HIPAARules()
        if name_lower == "gdpr":
            from daytashield.rules.gdpr import GDPRRules

            return GDPRRules()
        if name_lower == "pii":
            from daytashield.rules.pii import PIIDetector

            return PIIDetector()

        raise ValueError(f"Unknown rule pack: {name}. Available: hipaa, gdpr, pii")

    def add_rule(self, rule: ComplianceRule | str) -> ComplianceValidator:
        """Add a rule to the validator.

        Args:
            rule: Rule object or rule pack name.

        Returns:
            Self for method chaining

        Raises:
            ValueError: If ``rule`` is a name that is not known.
        """
        if isinstance(rule, str):
            rule = self._load_rule_by_name(rule)
        self.rules.append(rule)
        return self

    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
        """Validate data against all compliance rules.

        Adds one message per violation, stores summary counters in
        ``result.metadata`` and updates ``result.status``. When no rules
        are configured, a single warning message is added and nothing else
        is changed.

        Args:
            data: The data to validate
            result: The ValidationResult to update

        Returns:
            Updated ValidationResult
        """
        if not self.rules:
            result.add_message(
                code="COMPLIANCE_NO_RULES",
                message="No compliance rules configured",
                severity=ValidationStatus.WARNING,
                validator=self.name,
            )
            return result

        config = self.config
        if not isinstance(config, ComplianceValidatorConfig):
            # A plain ValidatorConfig lacks the compliance options; fall
            # back to their defaults.
            config = ComplianceValidatorConfig()

        # With nested scanning off, only the top level is inspected.
        scan_depth = config.max_depth if config.scan_nested else 1
        text_content = self._extract_text_content(data, scan_depth)

        total_violations = 0
        total_warnings = 0

        for rule in self.rules:
            for violation in rule.check(data, text_content):
                is_error = violation.severity == "error"
                if is_error:
                    total_violations += 1
                elif violation.severity == "warning":
                    total_warnings += 1

                matched = violation.matched_value
                result.add_message(
                    code=f"COMPLIANCE_{violation.code}",
                    message=violation.message,
                    # Anything that is not an error is reported as a warning.
                    severity=(
                        ValidationStatus.FAILED if is_error else ValidationStatus.WARNING
                    ),
                    validator=self.name,
                    field=violation.field,
                    details={
                        "rule": rule.name,
                        "category": violation.category,
                        # Cap the stored sample at 50 characters.
                        "matched_value": matched[:50] if matched else None,
                        "recommendation": violation.recommendation,
                    },
                )

        # Update result metadata
        result.metadata["compliance_rules_run"] = [r.name for r in self.rules]
        result.metadata["compliance_violations"] = total_violations
        result.metadata["compliance_warnings"] = total_warnings

        # Update status: errors always fail; warnings fail only on request
        # and otherwise downgrade a PASSED result to WARNING.
        if total_violations > 0:
            result.status = ValidationStatus.FAILED
        elif total_warnings > 0:
            if config.fail_on_warning:
                result.status = ValidationStatus.FAILED
            elif result.status == ValidationStatus.PASSED:
                result.status = ValidationStatus.WARNING

        return result

    def _extract_text_content(
        self, data: Any, max_depth: int, current_depth: int = 0
    ) -> list[tuple[str, str]]:
        """Extract all text content from data for scanning.

        Strings are collected from ``str``, ``dict`` and ``list`` values;
        any other type contributes nothing. A string held directly by a
        container belongs to that container's depth, for dicts and lists
        alike, so a top-level list of strings is still scanned when only
        one level is allowed.

        Args:
            data: Data to extract from
            max_depth: Maximum recursion depth
            current_depth: Current recursion depth

        Returns:
            List of (field_path, text_value) tuples. Dict keys are joined
            with ".", list positions appear as "[i]", and a bare string has
            the empty path.
        """
        if current_depth >= max_depth:
            return []

        results: list[tuple[str, str]] = []

        if isinstance(data, str):
            results.append(("", data))
        elif isinstance(data, dict):
            for key, value in data.items():
                if isinstance(value, str):
                    # str() keeps the path a string even for non-string
                    # keys (e.g. 0), which would otherwise look "empty".
                    results.append((str(key), value))
                else:
                    nested = self._extract_text_content(value, max_depth, current_depth + 1)
                    results.extend(
                        (f"{key}.{path}" if path else f"{key}", text)
                        for path, text in nested
                    )
        elif isinstance(data, list):
            for i, item in enumerate(data):
                if isinstance(item, str):
                    # Handle strings here, as the dict branch does, instead
                    # of spending a depth level on them.
                    results.append((f"[{i}]", item))
                else:
                    nested = self._extract_text_content(item, max_depth, current_depth + 1)
                    results.extend(
                        (f"[{i}].{path}" if path else f"[{i}]", text)
                        for path, text in nested
                    )

        return results

    def __repr__(self) -> str:
        rule_names = [r.name for r in self.rules]
        return f"ComplianceValidator(rules={rule_names})"