npm - sage-governance - Versions diffs - 1.0.0 - Mend

sage-governance 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/AGENTS.MD +481 -0
package/LICENSE +21 -0
package/README.md +319 -0
package/bin/sage.js +55 -0
package/claude.json +16 -0
package/codex.json +22 -0
package/cursor.json +27 -0
package/docs/architecture.md +38 -0
package/opencode.json +24 -0
package/package.json +58 -0
package/requirements.txt +7 -0
package/rules/general/EU_AI_Act_Annex_III.md +29 -0
package/rules/general/OECD_Principles.md +20 -0
package/rules/general/UNESCO_AI_Ethics.md +237 -0
package/rules/general/UN_Human_Rights.md +183 -0
package/rules/index.json +145 -0
package/sage/mcp_server.py +459 -0
package/sage/report_gen.py +408 -0
package/sage/sage_agent.py +710 -0
package/sage/security_agent.py +455 -0
package/sage/startup.py +311 -0

package/sage/security_agent.py ADDED Viewed

@@ -0,0 +1,455 @@
+"""
+security_agent.py — SAGE Code & Infrastructure Security Agent
+═════════════════════════════════════════════════════════════
+Full-spectrum, deterministic code scanner. No LLM — results must be
+reproducible across every run. Covers:
+  • API key / secret credential exposure
+  • PII field detection (GDPR special categories)
+  • Direct protected-attribute use in ML code
+  • Proxy attribute discrimination risk
+  • EU AI Act compliance gaps (black-box models, missing fairness metrics)
+  • Model inversion attack surface
+SEVERITY SCALE  (matches Beunec BAAP P0-P4 system)
+  P0 — Critical  : secrets in code, biometric/medical PII, direct protected attr
+  P1 — High      : indirect PII, major compliance gap
+  P2 — Medium    : proxy risk, black-box model, missing fairness audit
+  P3 — Low       : data quality, encoding choices, minor gaps
+  P4 — Info      : best-practice suggestions
+Author: SAGE Team / Team SAGE (Hackathon)
+License: MIT
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import Literal
+from startup import PROTECTED_ATTRIBUTES, PROXY_ATTRIBUTE_MAP
+# ══════════════════════════════════════════════════════════════════════════════
+# DATA MODELS
+# ══════════════════════════════════════════════════════════════════════════════
# Allowed priority labels, from most severe ("P0") to least severe ("P4").
Severity = Literal["P0", "P1", "P2", "P3", "P4"]


@dataclass
class SecurityFinding:
    """A single issue reported by the scanner for one line of input."""

    severity: Severity  # priority label, "P0" .. "P4"
    category: str  # finding family, e.g. "SECRET_EXPOSURE"
    line_number: int  # line index assigned by the scanner (counted from 1)
    snippet: str  # excerpt of the offending line
    description: str  # what was detected
    fix: str  # suggested remediation
    regulation: str = ""  # regulation reference(s); empty when none supplied

    def to_dict(self) -> dict:
        """Return the finding as a plain dict keyed by field name."""
        keys = (
            "severity",
            "category",
            "line_number",
            "snippet",
            "description",
            "fix",
            "regulation",
        )
        return {key: getattr(self, key) for key in keys}
@dataclass
class SecurityReport:
    """Aggregated outcome of one scan."""

    findings: list[SecurityFinding]  # every reported finding
    total_findings: int  # number of entries in ``findings``
    highest_severity: str  # severity label of the top finding
    protected_attributes_found: list[str]  # protected attribute names seen
    secrets_found: list[str]  # snippets of lines flagged as secrets
    pii_fields_found: list[str]  # descriptions of PII findings
    passed: bool  # True only when zero P0/P1 findings

    def top_finding(self) -> SecurityFinding | None:
        """Return the first entry of ``findings``, or ``None`` if it is empty."""
        if not self.findings:
            return None
        return self.findings[0]
+# ══════════════════════════════════════════════════════════════════════════════
+# PATTERN BANKS
+# ══════════════════════════════════════════════════════════════════════════════
+# (regex, description, severity)
# Each entry is (regex, description, severity). The regexes are applied with
# re.search() to one line at a time; patterns that need case-insensitive
# matching carry their own inline (?i) flag.
_SECRET_PATTERNS: list[tuple[str, str, Severity]] = [
    # AI provider keys
    (r"sk-ant-[A-Za-z0-9\-_]{40,}", "Anthropic API key hardcoded", "P0"),
    (r"sk-[A-Za-z0-9]{48}", "OpenAI API key hardcoded", "P0"),
    (r"AIza[0-9A-Za-z\-_]{35}", "Google API key hardcoded", "P0"),
    # Generic credentials
    (
        r'(?i)(api[_-]?key|apikey)\s*=\s*["\']([A-Za-z0-9\-_]{20,})["\']',
        "API key hardcoded in source",
        "P0",
    ),
    (
        r'(?i)(secret[_-]?key|client[_-]?secret)\s*=\s*["\']([A-Za-z0-9\-_+/=]{20,})["\']',
        "Secret key hardcoded in source",
        "P0",
    ),
    (
        r'(?i)(password|passwd|pwd)\s*=\s*["\'][^"\']{6,}["\']',
        "Hardcoded password in source",
        "P0",
    ),
    (
        r'(?i)(token|auth[_-]?token|bearer)\s*=\s*["\'][A-Za-z0-9\-_.]{20,}["\']',
        "Hardcoded auth token in source",
        "P0",
    ),
    # Cloud credentials
    (
        # AWS identifiers are upper-case: a 4-letter prefix followed by 16
        # upper-case alphanumerics. The pattern is deliberately case-sensitive
        # and bounded on both sides, otherwise ordinary identifiers such as
        # "AsianAmericanPopulation2020" would be reported as P0 secrets.
        r"(?<![A-Z0-9])(?:AKIA|ASIA|AROA)[A-Z0-9]{16}(?![A-Z0-9])",
        "AWS access key ID detected",
        "P0",
    ),
    (
        r'(?i)(aws[_-]?secret[_-]?access[_-]?key)\s*=\s*["\'][A-Za-z0-9/+=]{40}["\']',
        "AWS secret access key hardcoded",
        "P0",
    ),
    (
        r'(?i)(database[_-]?url|db[_-]?url|mongo[_-]?uri|postgres[_-]?url)\s*=\s*["\'][^"\']{10,}["\']',
        "Database connection string hardcoded",
        "P0",
    ),
    (
        r'(?i)(connection[_-]?string)\s*=\s*["\'][^"\']{10,}["\']',
        "Connection string hardcoded",
        "P0",
    ),
]
+# (regex, description, severity)
# Each entry is (regex, description, severity).
_PII_PATTERNS: list[tuple[str, str, Severity]] = [
    # --- identifiers and special-category data --------------------------------
    (
        r"\b(ssn|social[_\s]?security[_\s]?number)\b",
        "Social Security Number field — GDPR Article 9 special category",
        "P0",
    ),
    (
        r"\b(biometric|fingerprint|face[_\s]?recognition|voice[_\s]?print|retina)\b",
        "Biometric data — GDPR Article 9 special category",
        "P0",
    ),
    (
        r"\b(medical[_\s]?record|health[_\s]?data|diagnosis|prescription|icd[_\s]?code)\b",
        "Medical/health data — GDPR Article 9 special category",
        "P0",
    ),
    (
        r"\b(passport[_\s]?number|national[_\s]?id[_\s]?number)\b",
        "Government ID number — high-sensitivity PII",
        "P1",
    ),
    (
        r"\b(date[_\s]?of[_\s]?birth|dob|birth[_\s]?date)\b",
        "Date of birth field",
        "P1",
    ),
    (
        r"\b(gps[_\s]?coord|latitude.*longitude|geolocation|location[_\s]?data)\b",
        "Precise geolocation — privacy risk",
        "P1",
    ),
    (
        r"\b(sexual[_\s]?orientation|religion|political[_\s]?opinion)\b",
        "GDPR Article 9 sensitive category",
        "P0",
    ),
    # --- contact and network details ------------------------------------------
    (
        r"\b(phone[_\s]?number|mobile[_\s]?number|cell[_\s]?number)\b",
        "Phone number field",
        "P2",
    ),
    (
        r"\b(email[_\s]?address|email)\b",
        "Email address field",
        "P2",
    ),
    (
        r"\b(ip[_\s]?address|ip[_\s]?addr|ipv4|ipv6)\b",
        "IP address — personal data under GDPR",
        "P2",
    ),
    # --- names ----------------------------------------------------------------
    (
        r"\b(full[_\s]?name|first[_\s]?name|last[_\s]?name|surname)\b",
        "Name field — PII",
        "P3",
    ),
]
+# (regex, description, severity)
# Each entry is (regex, description, severity).
_COMPLIANCE_PATTERNS: list[tuple[str, str, Severity]] = [
    # Ensemble classifiers
    (
        r"RandomForestClassifier|GradientBoostingClassifier|XGBClassifier|LGBMClassifier",
        "Black-box model detected — EU AI Act Article 13 requires sufficient "
        "transparency for users to interpret outputs in high-risk systems. "
        "Consider Decision Tree (depth≤4) or GAM for explainability-first approach.",
        "P2",
    ),
    # Accuracy metric call
    (
        r"accuracy_score\s*\(",
        "Only accuracy_score reported — no fairness metrics detected. EU AI Act "
        "Article 10 requires bias monitoring. Add: fairlearn.metrics.MetricFrame "
        "or demographic_parity_difference.",
        "P2",
    ),
    # Dataset splitting
    (
        r"train_test_split\s*\(",
        "Standard train/test split — verify stratify= parameter "
        "preserves demographic representation across splits.",
        "P3",
    ),
    # Argument-less dropna()
    (
        r"\.dropna\(\)",
        "Missing value removal may disproportionately affect minority "
        "groups. Audit which rows are dropped and whether removal is "
        "demographically skewed.",
        "P3",
    ),
    # Argument-less LabelEncoder()
    (
        r"LabelEncoder\(\)",
        "LabelEncoder imposes arbitrary ordinal ordering on categorical "
        "data. Use OneHotEncoder for nominal categories. Ordinal encoding "
        "of 'race' or 'gender' introduces implicit ranking.",
        "P3",
    ),
    # Model serialization
    (
        r"pickle\.dump|joblib\.dump",
        "Model serialized without integrity hash. Add SHA-256 hash of "
        "model file to audit trail before deployment.",
        "P3",
    ),
    # Outbound HTTP libraries
    (
        r"import\s+requests|urllib\.request",
        "External HTTP call in ML code — verify no PII is sent to "
        "third-party endpoints without data processing agreement "
        "(GDPR Article 28).",
        "P3",
    ),
]
+# (regex, description, severity, fix, regulation)
# Each entry is (regex, description, severity, fix, regulation).
_SAFEGUARDING_PATTERNS: list[tuple[str, str, Severity, str, str]] = [
    # Identifiers that name automated escalation or reporting
    (
        r"\b(auto_escalate|escalate_auto|automatic_escalation|auto_report)\b",
        "Automated escalation/reporting without human review in child safety "
        "domain. Auto-reporting without human oversight violates UN CRC "
        "Article 3 best interests and EU AI Act human oversight principles.",
        "P0",
        "Implement a human-in-the-loop review queue or verification step "
        "before escalating alerts.",
        "UN CRC Article 3 (best interests); "
        "EU AI Act Article 14 (human oversight)",
    ),
    # Identifiers that name storage of complete chat transcripts
    (
        r"\b(store_full_conversation|save_chat_history|store_chat_logs|save_full_chat|log_chat_history)\b",
        "Storing full conversation logs/PII of minors. Storing complete chats "
        "violates GDPR Article 25 (privacy by design) and data minimization "
        "principles.",
        "P0",
        "Store metadata only (e.g. event hashes, timestamps) and discard "
        "full chat text immediately after scan.",
        "GDPR Article 5.1.c (data minimization), Article 25 (privacy by design); "
        "UN CRC Article 16 (privacy)",
    ),
]
+# ══════════════════════════════════════════════════════════════════════════════
+# SCAN FUNCTIONS
+# ══════════════════════════════════════════════════════════════════════════════
def _lines(code: str) -> list[str]:
    """Break *code* into pieces at every line-feed character.

    An empty input yields a list holding one empty string, and a trailing
    line feed yields a trailing empty string.
    """
    line_feed = "\n"
    return code.split(line_feed)
def _is_comment_line(line: str) -> bool:
    """
    Tell whether *line* starts (after leading whitespace) with a comment marker.

    Such lines are meant to be exempt from protected-attribute and PII checks
    so that SAGE governance annotations are never penalised, e.g.:
      # SAGE: race removed from features
      // SAGE: sex excluded via ThresholdOptimizer
      /* GDPR Art.9 — no biometric data */
      -- SQL comment

    Only the first characters are inspected, so any line that begins with one
    of the markers counts as a comment. Blank lines are not comments.
    """
    comment_openers = (
        "#",   # Python / Shell / YAML
        "//",  # JS / TS / Java / Go
        "/*",  # C-style block comment opener
        "*",   # C-style block comment continuation
        "*/",  # C-style block comment closer
        "--",  # SQL single-line comment
    )
    text = line.strip()
    return bool(text) and text.startswith(comment_openers)
def _scan_secrets(code: str) -> list[SecurityFinding]:
    """Report every line of *code* that matches a secret pattern.

    Each line is tested against every entry of ``_SECRET_PATTERNS``; one
    finding is produced per (line, matching pattern) pair, ordered by line and
    then by pattern order. Comment lines are scanned like any other line.
    """
    remediation = (
        "Move to environment variable: os.environ.get('KEY_NAME'). "
        "Use python-dotenv for local development. "
        "Never commit secrets to version control."
    )
    legal_refs = "OWASP A02:2021 Cryptographic Failures; GDPR Article 32"

    hits: list[SecurityFinding] = []
    for line_no, text in enumerate(_lines(code), start=1):
        excerpt = text.strip()[:120]  # snippets are capped at 120 characters
        hits.extend(
            SecurityFinding(
                severity=level,
                category="SECRET_EXPOSURE",
                line_number=line_no,
                snippet=excerpt,
                description=label,
                fix=remediation,
                regulation=legal_refs,
            )
            for regex, label, level in _SECRET_PATTERNS
            if re.search(regex, text)
        )
    return hits
def _scan_pii(code: str) -> list[SecurityFinding]:
    """Report non-comment lines of *code* that mention a PII field name.

    Lines for which ``_is_comment_line`` is true are skipped. Every other line
    is tested case-insensitively against each entry of ``_PII_PATTERNS``; one
    finding is produced per (line, matching pattern) pair.
    """
    guidance = (
        "Apply data minimization (GDPR Article 5.1.c): "
        "collect only what is strictly necessary. "
        "Pseudonymize or anonymize before model training. "
        "If required for fairness auditing, document legal basis under GDPR Art. 9(2)(g)."
    )
    legal_refs = "GDPR Article 5 (data minimization), Article 9 (special categories)"

    results: list[SecurityFinding] = []
    for line_no, text in enumerate(_lines(code), start=1):
        if _is_comment_line(text):
            # Governance annotation comments are never flagged.
            continue
        excerpt = text.strip()[:120]
        results.extend(
            SecurityFinding(
                severity=level,
                category="PII_EXPOSURE",
                line_number=line_no,
                snippet=excerpt,
                description=f"PII field detected: {label}",
                fix=guidance,
                regulation=legal_refs,
            )
            for regex, label, level in _PII_PATTERNS
            if re.search(regex, text, re.IGNORECASE)
        )
    return results
def _scan_protected_attributes(code: str) -> tuple[list[SecurityFinding], list[str]]:
    """Flag every non-comment line that mentions a protected attribute.

    Each name in ``PROTECTED_ATTRIBUTES`` is matched case-insensitively, either
    wrapped in quotes/brackets or as a whole word. Lines for which
    ``_is_comment_line`` is true are skipped.

    Returns:
        ``(findings, attribute_names)``. ``findings`` holds one P1 entry per
        (line, attribute) match. ``attribute_names`` lists each matched
        attribute once, in the order it was first matched, so the result is
        the same on every run.
    """
    # Build both matchers once per attribute instead of once per scanned line.
    matchers = [
        (
            attr,
            # Attribute wrapped in quotes or brackets, e.g. ['race'] or "sex".
            re.compile(
                r"""(?:['"\[])\s*""" + re.escape(attr) + r"""\s*(?:['"\]])""",
                re.IGNORECASE,
            ),
            # Attribute as a standalone word.
            re.compile(r"\b" + re.escape(attr) + r"\b", re.IGNORECASE),
        )
        for attr in PROTECTED_ATTRIBUTES
    ]

    findings: list[SecurityFinding] = []
    found_attrs: list[str] = []
    for i, line in enumerate(_lines(code), 1):
        if _is_comment_line(line):  # skip SAGE governance annotation comments
            continue
        for attr, quoted, bare in matchers:
            if not (quoted.search(line) or bare.search(line)):
                continue
            if attr not in found_attrs:
                found_attrs.append(attr)
            findings.append(SecurityFinding(
                severity="P1",
                category="PROTECTED_ATTRIBUTE_DIRECT_USE",
                line_number=i,
                snippet=line.strip()[:120],
                description=f"Protected attribute '{attr}' used directly in model code",
                fix=(
                    f"Option A (Remove): Drop '{attr}' from features and audit remaining "
                    f"columns for proxy correlation.\n"
                    f"Option B (Post-process): Use Fairlearn ThresholdOptimizer — "
                    f"'{attr}' used only for threshold calibration at deployment, "
                    f"never embedded in model weights."
                ),
                regulation=(
                    "EU AI Act Annex III; GDPR Article 9; "
                    "UDHR Article 7 (equal protection); "
                    "Equal Credit Opportunity Act / Equal Employment Opportunity Act (US)"
                ),
            ))
    # found_attrs is already duplicate-free. Passing it through set() would
    # only shuffle it, and string hashing makes that order differ per run.
    return findings, found_attrs
def _scan_proxy_attributes(code: str) -> list[SecurityFinding]:
    """Report lines of *code* that contain a known proxy feature name.

    ``PROXY_ATTRIBUTE_MAP`` maps each protected attribute to its proxy names.
    A proxy counts as present when it occurs anywhere in the line, compared
    case-insensitively as a plain substring. A given proxy is reported at most
    once per line, for the first protected attribute that lists it.
    """
    results: list[SecurityFinding] = []
    reported: set[tuple[int, str]] = set()
    for line_no, text in enumerate(_lines(code), start=1):
        lowered = text.lower()
        for protected, proxies in PROXY_ATTRIBUTE_MAP.items():
            for proxy in proxies:
                marker = (line_no, proxy)
                if proxy.lower() not in lowered or marker in reported:
                    continue
                reported.add(marker)
                explanation = (
                    f"'{proxy}' is a documented proxy for '{protected}'. "
                    "The model may learn indirect discrimination without "
                    "any protected attribute appearing in the feature list."
                )
                remediation = (
                    f"Audit correlation between '{proxy}' and '{protected}' "
                    "using Fairlearn MetricFrame BEFORE training. "
                    "Document the correlation coefficient and the decision "
                    "to keep or remove the feature in the audit trail."
                )
                legal_refs = (
                    "EU AI Act Article 10(5) — indirect discrimination; "
                    "GDPR Recital 71 (automated profiling)"
                )
                results.append(SecurityFinding(
                    severity="P2",
                    category="PROXY_DISCRIMINATION_RISK",
                    line_number=line_no,
                    snippet=text.strip()[:120],
                    description=explanation,
                    fix=remediation,
                    regulation=legal_refs,
                ))
    return results
def _scan_compliance(code: str) -> list[SecurityFinding]:
    """Report the first line of *code* that matches each compliance pattern.

    At most one finding is produced per entry of ``_COMPLIANCE_PATTERNS``, and
    the findings follow the order of that list.
    """
    source_lines = _lines(code)
    results: list[SecurityFinding] = []
    for regex, message, level in _COMPLIANCE_PATTERNS:
        # Only the earliest matching line is reported for each pattern.
        first_hit = next(
            (
                (line_no, text)
                for line_no, text in enumerate(source_lines, start=1)
                if re.search(regex, text)
            ),
            None,
        )
        if first_hit is None:
            continue
        line_no, text = first_hit
        results.append(SecurityFinding(
            severity=level,
            category="COMPLIANCE_GAP",
            line_number=line_no,
            snippet=text.strip()[:120],
            description=message,
            fix="Consult SAGE fairness options (sage_evaluate) for compliant alternatives.",
            regulation="EU AI Act Article 13 (transparency); Article 10 (data governance)",
        ))
    return results
def _scan_safeguarding(code: str) -> list[SecurityFinding]:
    """Report every line of *code* that matches a safeguarding pattern.

    Patterns from ``_SAFEGUARDING_PATTERNS`` are matched case-insensitively.
    Findings are grouped by pattern, then ordered by line; each pattern entry
    supplies its own fix and regulation text.
    """
    source_lines = _lines(code)
    results: list[SecurityFinding] = []
    for regex, message, level, remedy, legal_refs in _SAFEGUARDING_PATTERNS:
        results.extend(
            SecurityFinding(
                severity=level,
                category="SAFEGUARDING_VIOLATION",
                line_number=line_no,
                snippet=text.strip()[:120],
                description=message,
                fix=remedy,
                regulation=legal_refs,
            )
            for line_no, text in enumerate(source_lines, start=1)
            if re.search(regex, text, re.IGNORECASE)
        )
    return results
+# ══════════════════════════════════════════════════════════════════════════════
+# PUBLIC API
+# ══════════════════════════════════════════════════════════════════════════════
def scan(code: str) -> SecurityReport:
    """
    Run every scanner over *code* and bundle the results into a SecurityReport.

    Findings are de-duplicated on (line_number, category, first 40 characters
    of the description), keeping the first occurrence, then sorted by severity
    (P0 first; ties keep their scanner order). ``highest_severity`` is "PASS"
    when nothing was found, and ``passed`` is True only when there is no P0 or
    P1 finding. ``secrets_found`` and ``pii_fields_found`` are taken from the
    scanner output before de-duplication.
    """
    secret_hits = _scan_secrets(code)
    pii_hits = _scan_pii(code)
    protected_hits, attr_names = _scan_protected_attributes(code)
    proxy_hits = _scan_proxy_attributes(code)
    compliance_hits = _scan_compliance(code) or []
    safeguarding_hits = _scan_safeguarding(code)

    combined: list[SecurityFinding] = [
        *secret_hits,
        *pii_hits,
        *protected_hits,
        *proxy_hits,
        *compliance_hits,
        *safeguarding_hits,
    ]

    # First finding wins for each (line_number, category, description[:40]) key.
    first_by_key: dict[tuple, SecurityFinding] = {}
    for finding in combined:
        dedup_key = (finding.line_number, finding.category, finding.description[:40])
        first_by_key.setdefault(dedup_key, finding)

    rank = {"P0": 0, "P1": 1, "P2": 2, "P3": 3, "P4": 4}
    ordered = sorted(first_by_key.values(), key=lambda item: rank.get(item.severity, 99))

    if ordered:
        worst = ordered[0].severity
        ok = worst not in ("P0", "P1")
    else:
        worst = "PASS"
        ok = True

    return SecurityReport(
        findings=ordered,
        total_findings=len(ordered),
        highest_severity=worst,
        protected_attributes_found=attr_names,
        secrets_found=[hit.snippet for hit in secret_hits],
        pii_fields_found=[hit.description for hit in pii_hits],
        passed=ok,
    )