PyPI - daytashield - Versions diffs - 0.1.1__py3-none-any.whl - Mend

daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

daytashield/__init__.py +55 -0
daytashield/cli/__init__.py +5 -0
daytashield/cli/main.py +541 -0
daytashield/core/__init__.py +15 -0
daytashield/core/audit.py +275 -0
daytashield/core/pipeline.py +240 -0
daytashield/core/result.py +185 -0
daytashield/core/router.py +217 -0
daytashield/integrations/__init__.py +7 -0
daytashield/integrations/langchain.py +391 -0
daytashield/processors/__init__.py +13 -0
daytashield/processors/base.py +182 -0
daytashield/processors/csv.py +269 -0
daytashield/processors/json.py +260 -0
daytashield/processors/pdf.py +232 -0
daytashield/rules/__init__.py +14 -0
daytashield/rules/base.py +67 -0
daytashield/rules/gdpr.py +348 -0
daytashield/rules/hipaa.py +229 -0
daytashield/rules/pii.py +208 -0
daytashield/validators/__init__.py +15 -0
daytashield/validators/base.py +103 -0
daytashield/validators/compliance.py +222 -0
daytashield/validators/freshness.py +337 -0
daytashield/validators/schema.py +176 -0
daytashield/validators/semantic.py +256 -0
daytashield-0.1.1.dist-info/METADATA +316 -0
daytashield-0.1.1.dist-info/RECORD +31 -0
daytashield-0.1.1.dist-info/WHEEL +4 -0
daytashield-0.1.1.dist-info/entry_points.txt +2 -0
daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0

daytashield/rules/gdpr.py ADDED Viewed

@@ -0,0 +1,348 @@
+"""GDPR compliance rules for EU data protection."""
+from __future__ import annotations
+import re
+from typing import Any
+from daytashield.rules.base import ComplianceRule, ComplianceViolation
+class GDPRRules(ComplianceRule):
+    """GDPR compliance rules for EU data protection.
+    Checks for:
+    - Personal data without consent indicators
+    - Special category data (Article 9)
+    - Data subject rights compliance
+    - Cross-border data transfer indicators
+    - Data retention issues
+    Example:
+        >>> rules = GDPRRules()
+        >>> violations = rules.check(data, text_content)
+        >>> for v in violations:
+        ...     print(f"GDPR issue: {v.message}")
+    """
+    name = "gdpr"
+    description = "GDPR compliance rules for EU data protection"
+    # GDPR-specific patterns
+    PATTERNS: list[dict[str, Any]] = [
+        {
+            "name": "eu_national_id",
+            "pattern": r"\b[A-Z]{2}[0-9]{8,12}\b",
+            "code": "GDPR_NATIONAL_ID",
+            "message": "Potential EU national ID number detected",
+            "category": "personal_data",
+            "severity": "error",
+            "recommendation": "National IDs require explicit consent and purpose limitation",
+        },
+        {
+            "name": "iban",
+            "pattern": r"\b[A-Z]{2}[0-9]{2}[A-Z0-9]{4}[0-9]{7}([A-Z0-9]?){0,16}\b",
+            "code": "GDPR_IBAN",
+            "message": "IBAN (bank account) detected",
+            "category": "financial",
+            "severity": "warning",
+            "recommendation": "Financial data requires appropriate safeguards",
+        },
+        {
+            "name": "vat_number",
+            "pattern": r"\b[A-Z]{2}[0-9A-Z]{8,12}\b",
+            "code": "GDPR_VAT",
+            "message": "Potential VAT number detected",
+            "category": "business",
+            "severity": "info",
+            "recommendation": "VAT numbers may be processed for legitimate business purposes",
+        },
+    ]
+    # Special category data keywords (Article 9)
+    SPECIAL_CATEGORY_KEYWORDS = {
+        "racial_ethnic": [
+            "race",
+            "ethnicity",
+            "ethnic origin",
+            "nationality",
+            "national origin",
+        ],
+        "political": [
+            "political opinion",
+            "political party",
+            "political view",
+            "voting",
+            "election",
+        ],
+        "religious": [
+            "religion",
+            "religious belief",
+            "faith",
+            "church",
+            "mosque",
+            "synagogue",
+            "temple",
+        ],
+        "trade_union": [
+            "trade union",
+            "labor union",
+            "union member",
+            "union membership",
+        ],
+        "genetic": [
+            "genetic data",
+            "dna",
+            "genome",
+            "genetic test",
+            "hereditary",
+        ],
+        "biometric": [
+            "fingerprint",
+            "facial recognition",
+            "iris scan",
+            "biometric",
+            "voice print",
+        ],
+        "health": [
+            "health data",
+            "medical condition",
+            "diagnosis",
+            "treatment",
+            "prescription",
+            "disability",
+        ],
+        "sexual": [
+            "sexual orientation",
+            "sex life",
+            "sexual preference",
+            "gender identity",
+        ],
+    }
+    # Required consent indicators
+    CONSENT_FIELDS = [
+        "consent",
+        "consent_given",
+        "gdpr_consent",
+        "data_consent",
+        "privacy_consent",
+        "marketing_consent",
+        "opted_in",
+        "consent_date",
+        "consent_timestamp",
+    ]
+    def __init__(
+        self,
+        check_consent: bool = True,
+        check_special_categories: bool = True,
+        check_data_minimization: bool = True,
+    ):
+        """Initialize GDPR rules.
+        Args:
+            check_consent: Check for consent indicators
+            check_special_categories: Check for Article 9 special category data
+            check_data_minimization: Check for potential data minimization issues
+        """
+        self.check_consent = check_consent
+        self.check_special_categories = check_special_categories
+        self.check_data_minimization = check_data_minimization
+        self._compiled_patterns: list[tuple[re.Pattern[str], dict[str, Any]]] = []
+        self._compile_patterns()
+    def _compile_patterns(self) -> None:
+        """Compile regex patterns for efficiency."""
+        for pattern_config in self.PATTERNS:
+            compiled = re.compile(pattern_config["pattern"], re.IGNORECASE)
+            self._compiled_patterns.append((compiled, pattern_config))
+    def check(
+        self, data: Any, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for GDPR compliance issues.
+        Args:
+            data: The original data structure
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of ComplianceViolation objects
+        """
+        violations: list[ComplianceViolation] = []
+        # Check for specific patterns
+        for field_path, text in text_content:
+            for pattern, config in self._compiled_patterns:
+                if pattern.search(text):
+                    violations.append(
+                        ComplianceViolation(
+                            code=config["code"],
+                            message=config["message"],
+                            severity=config["severity"],
+                            category=config["category"],
+                            field=field_path or None,
+                            recommendation=config["recommendation"],
+                        )
+                    )
+        # Check for special category data
+        if self.check_special_categories:
+            violations.extend(self._check_special_categories(text_content))
+        # Check for consent indicators
+        if self.check_consent:
+            violations.extend(self._check_consent(data, text_content))
+        # Check for data minimization
+        if self.check_data_minimization:
+            violations.extend(self._check_data_minimization(data, text_content))
+        return violations
+    def _check_special_categories(
+        self, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for Article 9 special category data.
+        Args:
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of violations
+        """
+        violations: list[ComplianceViolation] = []
+        all_text = " ".join(text for _, text in text_content).lower()
+        for category, keywords in self.SPECIAL_CATEGORY_KEYWORDS.items():
+            for keyword in keywords:
+                if keyword in all_text:
+                    violations.append(
+                        ComplianceViolation(
+                            code=f"GDPR_SPECIAL_CATEGORY_{category.upper()}",
+                            message=f"Special category data detected: {category.replace('_', ' ')}",
+                            severity="error",
+                            category="special_category",
+                            recommendation=(
+                                "Article 9 data requires explicit consent and "
+                                "one of the specific lawful bases for processing"
+                            ),
+                        )
+                    )
+                    break  # One violation per category
+        return violations
+    def _check_consent(
+        self, data: Any, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for consent indicators in data.
+        Args:
+            data: The original data structure
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of violations
+        """
+        violations: list[ComplianceViolation] = []
+        # Check if data contains personal information
+        has_personal_data = self._contains_personal_data(text_content)
+        if not has_personal_data:
+            return violations
+        # Look for consent fields
+        consent_found = False
+        if isinstance(data, dict):
+            for field in self.CONSENT_FIELDS:
+                if field in data:
+                    consent_value = data[field]
+                    if consent_value in (True, "true", "yes", "1", 1):
+                        consent_found = True
+                        break
+        if not consent_found:
+            violations.append(
+                ComplianceViolation(
+                    code="GDPR_NO_CONSENT",
+                    message="Personal data found without consent indicator",
+                    severity="warning",
+                    category="consent",
+                    recommendation=(
+                        "Ensure valid consent is obtained and recorded, "
+                        "or document another lawful basis for processing"
+                    ),
+                )
+            )
+        return violations
+    def _check_data_minimization(
+        self, data: Any, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for potential data minimization issues.
+        Args:
+            data: The original data structure
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of violations
+        """
+        violations: list[ComplianceViolation] = []
+        # Check for excessive personal data collection
+        personal_data_fields = 0
+        excessive_threshold = 10
+        if isinstance(data, dict):
+            personal_field_patterns = [
+                "name",
+                "email",
+                "phone",
+                "address",
+                "birth",
+                "age",
+                "gender",
+                "salary",
+                "income",
+            ]
+            for key in data:
+                key_lower = key.lower()
+                if any(pattern in key_lower for pattern in personal_field_patterns):
+                    personal_data_fields += 1
+        if personal_data_fields > excessive_threshold:
+            violations.append(
+                ComplianceViolation(
+                    code="GDPR_DATA_MINIMIZATION",
+                    message=f"Potential data minimization issue: {personal_data_fields} personal data fields",
+                    severity="warning",
+                    category="data_minimization",
+                    recommendation=(
+                        "Review if all personal data fields are necessary "
+                        "for the stated purpose (Article 5(1)(c))"
+                    ),
+                )
+            )
+        return violations
+    def _contains_personal_data(self, text_content: list[tuple[str, str]]) -> bool:
+        """Check if text content appears to contain personal data.
+        Args:
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            True if personal data is likely present
+        """
+        from daytashield.rules.pii import PIIDetector
+        detector = PIIDetector(patterns=["email", "phone_us", "ssn"])
+        violations = detector.check(None, text_content)
+        return len(violations) > 0

daytashield/rules/hipaa.py ADDED Viewed

@@ -0,0 +1,229 @@
+"""HIPAA compliance rules for healthcare data."""
+from __future__ import annotations
+import re
+from typing import Any
+from daytashield.rules.base import ComplianceRule, ComplianceViolation
+class HIPAARules(ComplianceRule):
+    """HIPAA compliance rules for Protected Health Information (PHI).
+    Checks for the 18 HIPAA identifiers:
+    1. Names
+    2. Geographic data (addresses, zip codes)
+    3. Dates (except year)
+    4. Phone numbers
+    5. Fax numbers
+    6. Email addresses
+    7. Social Security numbers
+    8. Medical record numbers
+    9. Health plan beneficiary numbers
+    10. Account numbers
+    11. Certificate/license numbers
+    12. Vehicle identifiers
+    13. Device identifiers
+    14. Web URLs
+    15. IP addresses
+    16. Biometric identifiers
+    17. Full-face photographs
+    18. Any other unique identifying characteristic
+    Example:
+        >>> rules = HIPAARules()
+        >>> violations = rules.check(data, text_content)
+        >>> for v in violations:
+        ...     print(f"HIPAA violation: {v.message}")
+    """
+    name = "hipaa"
+    description = "HIPAA compliance rules for Protected Health Information"
+    # HIPAA-specific patterns
+    PATTERNS: list[dict[str, Any]] = [
+        {
+            "name": "mrn",
+            "pattern": r"\b(?:MRN|Medical Record|Record #|Patient ID)[:\s#]*([A-Z0-9]{6,12})\b",
+            "code": "HIPAA_MRN",
+            "message": "Medical Record Number (MRN) detected",
+            "category": "phi",
+            "severity": "error",
+            "recommendation": "Remove or encrypt MRN per HIPAA requirements",
+        },
+        {
+            "name": "health_plan_id",
+            "pattern": r"\b(?:Health Plan|Insurance|Plan ID|Member ID)[:\s#]*([A-Z0-9]{8,15})\b",
+            "code": "HIPAA_HEALTH_PLAN",
+            "message": "Health plan beneficiary number detected",
+            "category": "phi",
+            "severity": "error",
+            "recommendation": "Remove or encrypt health plan identifiers",
+        },
+        {
+            "name": "diagnosis_code",
+            "pattern": r"\b(?:ICD-?10|ICD-?9|Diagnosis)[:\s]*([A-Z][0-9]{2}\.?[0-9A-Z]{0,4})\b",
+            "code": "HIPAA_DIAGNOSIS",
+            "message": "Diagnosis code detected (ICD)",
+            "category": "clinical",
+            "severity": "warning",
+            "recommendation": "Ensure diagnosis codes are de-identified when required",
+        },
+        {
+            "name": "prescription",
+            "pattern": r"\b(?:Rx|Prescription|NDC)[:\s#]*([0-9]{10,11})\b",
+            "code": "HIPAA_PRESCRIPTION",
+            "message": "Prescription/NDC number detected",
+            "category": "clinical",
+            "severity": "warning",
+            "recommendation": "Review if prescription details need de-identification",
+        },
+        {
+            "name": "provider_npi",
+            "pattern": r"\b(?:NPI|Provider ID)[:\s#]*([0-9]{10})\b",
+            "code": "HIPAA_NPI",
+            "message": "National Provider Identifier (NPI) detected",
+            "category": "provider",
+            "severity": "warning",
+            "recommendation": "NPI may be included but verify context",
+        },
+        {
+            "name": "dea_number",
+            "pattern": r"\b(?:DEA)[:\s#]*([A-Z]{2}[0-9]{7})\b",
+            "code": "HIPAA_DEA",
+            "message": "DEA number detected",
+            "category": "provider",
+            "severity": "error",
+            "recommendation": "DEA numbers should not be exposed",
+        },
+    ]
+    # Keywords that suggest PHI context
+    PHI_CONTEXT_KEYWORDS = [
+        "patient",
+        "diagnosis",
+        "treatment",
+        "prescription",
+        "medical",
+        "health",
+        "hospital",
+        "doctor",
+        "physician",
+        "nurse",
+        "clinic",
+        "symptom",
+        "medication",
+        "allergy",
+        "procedure",
+        "surgery",
+        "lab result",
+        "test result",
+        "vital sign",
+        "blood pressure",
+        "heart rate",
+        "temperature",
+    ]
+    def __init__(self, strict: bool = True):
+        """Initialize HIPAA rules.
+        Args:
+            strict: If True, flag any data that appears to be in healthcare context
+        """
+        self.strict = strict
+        self._compiled_patterns: list[tuple[re.Pattern[str], dict[str, Any]]] = []
+        self._compile_patterns()
+    def _compile_patterns(self) -> None:
+        """Compile regex patterns for efficiency."""
+        for pattern_config in self.PATTERNS:
+            compiled = re.compile(pattern_config["pattern"], re.IGNORECASE)
+            self._compiled_patterns.append((compiled, pattern_config))
+    def check(
+        self, data: Any, text_content: list[tuple[str, str]]
+    ) -> list[ComplianceViolation]:
+        """Check for HIPAA violations.
+        Args:
+            data: The original data structure
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            List of ComplianceViolation objects
+        """
+        violations: list[ComplianceViolation] = []
+        # Check if data appears to be in healthcare context
+        is_healthcare_context = self._detect_healthcare_context(text_content)
+        # Check specific patterns
+        for field_path, text in text_content:
+            for pattern, config in self._compiled_patterns:
+                matches = pattern.finditer(text)
+                for match in matches:
+                    # Get the captured group (the actual identifier)
+                    matched_value = match.group(1) if match.groups() else match.group(0)
+                    violations.append(
+                        ComplianceViolation(
+                            code=config["code"],
+                            message=config["message"],
+                            severity=config["severity"],
+                            category=config["category"],
+                            field=field_path or None,
+                            matched_value=self._redact(matched_value),
+                            recommendation=config["recommendation"],
+                        )
+                    )
+        # In strict mode, check for general PII in healthcare context
+        if self.strict and is_healthcare_context:
+            from daytashield.rules.pii import PIIDetector
+            pii_detector = PIIDetector(
+                patterns=["ssn", "email", "phone_us", "date_of_birth"],
+                severity_overrides={
+                    "ssn": "error",
+                    "email": "error",  # Elevate to error in healthcare context
+                    "phone_us": "error",
+                    "date_of_birth": "error",
+                },
+            )
+            pii_violations = pii_detector.check(data, text_content)
+            # Add HIPAA context to PII violations
+            for v in pii_violations:
+                v.code = f"HIPAA_{v.code}"
+                v.message = f"{v.message} (in healthcare context)"
+                v.recommendation = (
+                    f"HIPAA requires protection of this data. {v.recommendation}"
+                )
+                violations.append(v)
+        return violations
+    def _detect_healthcare_context(self, text_content: list[tuple[str, str]]) -> bool:
+        """Detect if data appears to be in a healthcare context.
+        Args:
+            text_content: List of (field_path, text_value) tuples
+        Returns:
+            True if healthcare context is detected
+        """
+        all_text = " ".join(text for _, text in text_content).lower()
+        keyword_count = sum(
+            1 for keyword in self.PHI_CONTEXT_KEYWORDS if keyword in all_text
+        )
+        # Consider healthcare context if 2+ keywords found
+        return keyword_count >= 2
+    def _redact(self, value: str) -> str:
+        """Redact a matched value for safe logging."""
+        if len(value) <= 4:
+            return "*" * len(value)
+        return value[:2] + "*" * (len(value) - 4) + value[-2:]