PyPI - daytashield - Versions diffs - 0.1.1__py3-none-any.whl - Mend

daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

daytashield/__init__.py +55 -0
daytashield/cli/__init__.py +5 -0
daytashield/cli/main.py +541 -0
daytashield/core/__init__.py +15 -0
daytashield/core/audit.py +275 -0
daytashield/core/pipeline.py +240 -0
daytashield/core/result.py +185 -0
daytashield/core/router.py +217 -0
daytashield/integrations/__init__.py +7 -0
daytashield/integrations/langchain.py +391 -0
daytashield/processors/__init__.py +13 -0
daytashield/processors/base.py +182 -0
daytashield/processors/csv.py +269 -0
daytashield/processors/json.py +260 -0
daytashield/processors/pdf.py +232 -0
daytashield/rules/__init__.py +14 -0
daytashield/rules/base.py +67 -0
daytashield/rules/gdpr.py +348 -0
daytashield/rules/hipaa.py +229 -0
daytashield/rules/pii.py +208 -0
daytashield/validators/__init__.py +15 -0
daytashield/validators/base.py +103 -0
daytashield/validators/compliance.py +222 -0
daytashield/validators/freshness.py +337 -0
daytashield/validators/schema.py +176 -0
daytashield/validators/semantic.py +256 -0
daytashield-0.1.1.dist-info/METADATA +316 -0
daytashield-0.1.1.dist-info/RECORD +31 -0
daytashield-0.1.1.dist-info/WHEEL +4 -0
daytashield-0.1.1.dist-info/entry_points.txt +2 -0
daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0

daytashield/rules/pii.py ADDED Viewed

@@ -0,0 +1,208 @@
+"""PII (Personally Identifiable Information) detection rules."""
+from __future__ import annotations
+import re
+from typing import Any
+from daytashield.rules.base import ComplianceRule, ComplianceViolation
+class PIIDetector(ComplianceRule):
+    """Detects personally identifiable information in data.
+    Scans for common PII patterns including:
+    - Social Security Numbers (SSN)
+    - Credit card numbers
+    - Email addresses
+    - Phone numbers
+    - IP addresses
+    - Passport numbers
+    - Driver's license numbers
+    Example:
+        >>> detector = PIIDetector()
+        >>> violations = detector.check(data, [("email", "john@example.com")])
+        >>> for v in violations:
+        ...     print(f"{v.code}: {v.message}")
+    """
+    name = "pii"
+    description = "Detects personally identifiable information"
+    # PII patterns with their metadata
+    PATTERNS: list[dict[str, Any]] = [
+        {
+            "name": "ssn",
+            "pattern": r"\b\d{3}[-.\s]?\d{2}[-.\s]?\d{4}\b",
+            "code": "PII_SSN",
+            "message": "Social Security Number detected",
+            "category": "ssn",
+            "severity": "error",
+            "recommendation": "Remove or encrypt SSN before processing",
+        },
+        {
+            "name": "credit_card",
+            "pattern": r"\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})\b",
+            "code": "PII_CREDIT_CARD",
+            "message": "Credit card number detected",
+            "category": "financial",
+            "severity": "error",
+            "recommendation": "Remove or tokenize credit card numbers",
+        },
+        {
+            "name": "email",
+            "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
+            "code": "PII_EMAIL",
+            "message": "Email address detected",
+            "category": "contact",
+            "severity": "warning",
+            "recommendation": "Consider if email is necessary or should be hashed",
+        },
+        {
+            "name": "phone_us",
+            "pattern": r"\b(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}\b",
+            "code": "PII_PHONE",
+            "message": "Phone number detected",
+            "category": "contact",
+            "severity": "warning",
+            "recommendation": "Consider if phone number is necessary",
+        },
+        {
+            "name": "ip_address",
+            "pattern": r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b",
+            "code": "PII_IP_ADDRESS",
+            "message": "IP address detected",
+            "category": "technical",
+            "severity": "warning",
+            "recommendation": "Consider anonymizing IP addresses",
+        },
+        {
+            "name": "passport_us",
+            "pattern": r"\b[A-Z]{1,2}[0-9]{6,9}\b",
+            "code": "PII_PASSPORT",
+            "message": "Potential passport number detected",
+            "category": "identity",
+            "severity": "error",
+            "recommendation": "Remove passport numbers from data",
+        },
+        {
+            "name": "date_of_birth",
+            "pattern": r"\b(?:0[1-9]|1[0-2])[-/](?:0[1-9]|[12][0-9]|3[01])[-/](?:19|20)\d{2}\b",
+            "code": "PII_DOB",
+            "message": "Date of birth detected",
+            "category": "identity",
+            "severity": "warning",
+            "recommendation": "Consider if full DOB is necessary (year might suffice)",
+        },
+        {
+            "name": "drivers_license",
+            "pattern": r"\b[A-Z][0-9]{7,8}\b",
+            "code": "PII_DRIVERS_LICENSE",
+            "message": "Potential driver's license number detected",
+            "category": "identity",
+            "severity": "error",
+            "recommendation": "Remove driver's license numbers",
+        },
+    ]
+    def __init__(
+        self,
+        patterns: list[str] | None = None,
+        severity_overrides: dict[str, str] | None = None,
+    ):
+        """Initialize the PII detector.
+        Args:
+            patterns: List of pattern names to enable (None = all)
+            severity_overrides: Override severity for specific patterns
+        """
+        self.enabled_patterns = patterns
+        self.severity_overrides = severity_overrides or {}
+        self._compiled_patterns: list[tuple[re.Pattern[str], dict[str, Any]]] = []
+        self._compile_patterns()
+    def _compile_patterns(self) -> None:
+        """Compile regex patterns for efficiency."""
+        for pattern_config in self.PATTERNS:
+            if self.enabled_patterns and pattern_config["name"] not in self.enabled_patterns:
+                continue
+            compiled = re.compile(pattern_config["pattern"], re.IGNORECASE)
+            self._compiled_patterns.append((compiled, pattern_config))
+    def check(
+        self, data: Any, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for PII in the text content.
+        Args:
+            data: The original data structure (unused for pattern matching)
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of ComplianceViolation objects
+        """
+        violations: list[ComplianceViolation] = []
+        seen: set[tuple[str, str, str]] = set()  # Dedupe by (field, code, match)
+        for field_path, text in text_content:
+            for pattern, config in self._compiled_patterns:
+                matches = pattern.findall(text)
+                for match in matches:
+                    # Get the matched string
+                    matched_str = match if isinstance(match, str) else match[0]
+                    # Dedupe
+                    key = (field_path, config["code"], matched_str)
+                    if key in seen:
+                        continue
+                    seen.add(key)
+                    # Get severity (with possible override)
+                    severity = self.severity_overrides.get(
+                        config["name"], config["severity"]
+                    )
+                    # Redact the matched value for logging
+                    redacted = self._redact(matched_str, config["name"])
+                    violations.append(
+                        ComplianceViolation(
+                            code=config["code"],
+                            message=config["message"],
+                            severity=severity,
+                            category=config["category"],
+                            field=field_path or None,
+                            matched_value=redacted,
+                            recommendation=config["recommendation"],
+                        )
+                    )
+        return violations
+    def _redact(self, value: str, pattern_name: str) -> str:
+        """Redact a matched value for safe logging.
+        Args:
+            value: The matched value
+            pattern_name: Name of the pattern that matched
+        Returns:
+            Redacted value
+        """
+        if len(value) <= 4:
+            return "*" * len(value)
+        if pattern_name in ("ssn", "credit_card", "phone_us"):
+            # Show last 4 digits
+            return "*" * (len(value) - 4) + value[-4:]
+        elif pattern_name == "email":
+            # Show first char and domain
+            at_idx = value.find("@")
+            if at_idx > 0:
+                return value[0] + "*" * (at_idx - 1) + value[at_idx:]
+            return "*" * len(value)
+        else:
+            # Generic: show first and last char
+            return value[0] + "*" * (len(value) - 2) + value[-1]

daytashield/validators/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""DaytaShield validators for data quality assurance."""
+from daytashield.validators.base import BaseValidator
+from daytashield.validators.compliance import ComplianceValidator
+from daytashield.validators.freshness import FreshnessValidator
+from daytashield.validators.schema import SchemaValidator
+from daytashield.validators.semantic import SemanticValidator
+# Names exported by ``from daytashield.validators import *``.
+__all__ = [
+    "BaseValidator",
+    "SchemaValidator",
+    "SemanticValidator",
+    "FreshnessValidator",
+    "ComplianceValidator",
+]

daytashield/validators/base.py ADDED Viewed

@@ -0,0 +1,103 @@
+"""Base validator abstract class."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any
+from pydantic import BaseModel, Field
+if TYPE_CHECKING:
+    from daytashield.core.result import ValidationResult
+class ValidatorConfig(BaseModel):
+    """Base configuration for all validators."""
+    enabled: bool = Field(True, description="Whether this validator is enabled")
+    fail_fast: bool = Field(False, description="Stop validation on first failure")
+    severity_override: str | None = Field(
+        None, description="Override default severity (warning, failed)"
+    )
+    model_config = {"extra": "allow"}
+class BaseValidator(ABC):
+    """Abstract base class for all DaytaShield validators.
+    Validators are responsible for checking specific aspects of data quality.
+    Each validator should focus on a single concern (schema, freshness, etc.).
+    Example:
+        >>> class MyValidator(BaseValidator):
+        ...     name = "my_validator"
+        ...
+        ...     def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
+        ...         if not self._is_valid(data):
+        ...             result.add_message(
+        ...                 code="MY_001",
+        ...                 message="Data failed custom validation",
+        ...                 severity=ValidationStatus.FAILED,
+        ...                 validator=self.name,
+        ...             )
+        ...         return result
+    """
+    name: str = "base_validator"
+    def __init__(self, config: ValidatorConfig | dict[str, Any] | None = None):
+        """Initialize the validator with optional configuration.
+        Args:
+            config: Validator configuration, either as ValidatorConfig or dict.
+        """
+        if config is None:
+            self.config = ValidatorConfig()
+        elif isinstance(config, dict):
+            self.config = ValidatorConfig(**config)
+        else:
+            self.config = config
+    @abstractmethod
+    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
+        """Validate the provided data and update the result.
+        This is the main method that subclasses must implement. It should:
+        1. Check the data against this validator's rules
+        2. Add messages to the result for any issues found
+        3. Optionally transform the data and update result.data
+        4. Return the updated result
+        Args:
+            data: The data to validate (type depends on validator)
+            result: The ValidationResult to update with findings
+        Returns:
+            The updated ValidationResult
+        """
+        pass
+    def should_run(self, data: Any, result: ValidationResult) -> bool:
+        """Check if this validator should run.
+        Override this method to implement conditional validation logic.
+        Args:
+            data: The data to potentially validate
+            result: The current validation result
+        Returns:
+            True if this validator should run, False otherwise
+        """
+        if not self.config.enabled:
+            return False
+        # Skip if previous validator failed and we're in fail-fast mode
+        if self.config.fail_fast and result.failed:
+            return False
+        return True
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(name={self.name!r})"

daytashield/validators/compliance.py ADDED Viewed

@@ -0,0 +1,222 @@
+"""Compliance validation for regulatory requirements."""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
+from pydantic import Field
+from daytashield.core.result import ValidationResult, ValidationStatus
+from daytashield.validators.base import BaseValidator, ValidatorConfig
+if TYPE_CHECKING:
+    from daytashield.rules.base import ComplianceRule
+class ComplianceValidatorConfig(ValidatorConfig):
+    """Configuration for compliance validation."""
+    fail_on_warning: bool = Field(False, description="Treat warnings as failures")
+    include_passed_rules: bool = Field(False, description="Include passed rules in messages")
+    scan_nested: bool = Field(True, description="Scan nested objects and arrays")
+    max_depth: int = Field(10, description="Maximum nesting depth to scan")
+class ComplianceValidator(BaseValidator):
+    """Validates data against compliance rules (HIPAA, GDPR, PII, etc.).
+    Uses pluggable rule packs to check for compliance violations. Rules
+    can detect sensitive data, check for required fields, validate
+    consent flags, etc.
+    Example:
+        >>> from daytashield.rules import HIPAARules, GDPRRules
+        >>> validator = ComplianceValidator(
+        ...     rules=[HIPAARules(), GDPRRules()],
+        ... )
+        >>> result = validator.validate(patient_data, result)
+    Built-in rule packs:
+    - HIPAARules: Healthcare data compliance (PHI detection)
+    - GDPRRules: EU data protection (consent, data subject rights)
+    - PIIDetector: Personal information detection (SSN, emails, etc.)
+    """
+    name = "compliance"
+    def __init__(
+        self,
+        rules: list[ComplianceRule] | list[str] | None = None,
+        config: ComplianceValidatorConfig | dict[str, Any] | None = None,
+    ):
+        """Initialize the compliance validator.
+        Args:
+            rules: List of rule objects or rule names to load
+            config: Validator configuration
+        """
+        if config is None:
+            super().__init__(ComplianceValidatorConfig())
+        elif isinstance(config, dict):
+            super().__init__(ComplianceValidatorConfig(**config))
+        else:
+            super().__init__(config)
+        self.rules: list[ComplianceRule] = []
+        if rules:
+            for rule in rules:
+                if isinstance(rule, str):
+                    self.rules.append(self._load_rule_by_name(rule))
+                else:
+                    self.rules.append(rule)
+    def _load_rule_by_name(self, name: str) -> ComplianceRule:
+        """Load a rule pack by name.
+        Args:
+            name: Rule name (hipaa, gdpr, pii)
+        Returns:
+            ComplianceRule instance
+        Raises:
+            ValueError: If rule name is unknown
+        """
+        name_lower = name.lower()
+        if name_lower == "hipaa":
+            from daytashield.rules.hipaa import HIPAARules
+            return HIPAARules()
+        elif name_lower == "gdpr":
+            from daytashield.rules.gdpr import GDPRRules
+            return GDPRRules()
+        elif name_lower == "pii":
+            from daytashield.rules.pii import PIIDetector
+            return PIIDetector()
+        else:
+            raise ValueError(f"Unknown rule pack: {name}. Available: hipaa, gdpr, pii")
+    def add_rule(self, rule: ComplianceRule | str) -> ComplianceValidator:
+        """Add a rule to the validator.
+        Args:
+            rule: Rule object or rule name
+        Returns:
+            Self for method chaining
+        """
+        if isinstance(rule, str):
+            self.rules.append(self._load_rule_by_name(rule))
+        else:
+            self.rules.append(rule)
+        return self
+    def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
+        """Validate data against all compliance rules.
+        Args:
+            data: The data to validate
+            result: The ValidationResult to update
+        Returns:
+            Updated ValidationResult
+        """
+        if not self.rules:
+            result.add_message(
+                code="COMPLIANCE_NO_RULES",
+                message="No compliance rules configured",
+                severity=ValidationStatus.WARNING,
+                validator=self.name,
+            )
+            return result
+        config = self.config
+        if not isinstance(config, ComplianceValidatorConfig):
+            config = ComplianceValidatorConfig()
+        # Collect all text content to scan
+        text_content = self._extract_text_content(data, config.max_depth if config.scan_nested else 1)
+        # Run each rule
+        total_violations = 0
+        total_warnings = 0
+        for rule in self.rules:
+            violations = rule.check(data, text_content)
+            for violation in violations:
+                total_violations += 1 if violation.severity == "error" else 0
+                total_warnings += 1 if violation.severity == "warning" else 0
+                severity = (
+                    ValidationStatus.FAILED
+                    if violation.severity == "error"
+                    else ValidationStatus.WARNING
+                )
+                result.add_message(
+                    code=f"COMPLIANCE_{violation.code}",
+                    message=violation.message,
+                    severity=severity,
+                    validator=self.name,
+                    field=violation.field,
+                    details={
+                        "rule": rule.name,
+                        "category": violation.category,
+                        "matched_value": violation.matched_value[:50] if violation.matched_value else None,
+                        "recommendation": violation.recommendation,
+                    },
+                )
+        # Update result metadata
+        result.metadata["compliance_rules_run"] = [r.name for r in self.rules]
+        result.metadata["compliance_violations"] = total_violations
+        result.metadata["compliance_warnings"] = total_warnings
+        # Update status
+        if total_violations > 0:
+            result.status = ValidationStatus.FAILED
+        elif total_warnings > 0:
+            if config.fail_on_warning:
+                result.status = ValidationStatus.FAILED
+            elif result.status == ValidationStatus.PASSED:
+                result.status = ValidationStatus.WARNING
+        return result
+    def _extract_text_content(self, data: Any, max_depth: int, current_depth: int = 0) -> list[tuple[str, str]]:
+        """Extract all text content from data for scanning.
+        Args:
+            data: Data to extract from
+            max_depth: Maximum recursion depth
+            current_depth: Current recursion depth
+        Returns:
+            List of (field_path, text_value) tuples
+        """
+        if current_depth >= max_depth:
+            return []
+        results: list[tuple[str, str]] = []
+        if isinstance(data, str):
+            results.append(("", data))
+        elif isinstance(data, dict):
+            for key, value in data.items():
+                if isinstance(value, str):
+                    results.append((key, value))
+                else:
+                    nested = self._extract_text_content(value, max_depth, current_depth + 1)
+                    results.extend((f"{key}.{path}" if path else key, text) for path, text in nested)
+        elif isinstance(data, list):
+            for i, item in enumerate(data):
+                nested = self._extract_text_content(item, max_depth, current_depth + 1)
+                results.extend((f"[{i}].{path}" if path else f"[{i}]", text) for path, text in nested)
+        return results
+    def __repr__(self) -> str:
+        rule_names = [r.name for r in self.rules]
+        return f"ComplianceValidator(rules={rule_names})"