PyPI - datagate-llm - Versions diffs - 0.1.0__py3-none-any.whl - Mend

datagate-llm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

datagate_llm/__init__.py +61 -0
datagate_llm/engine.py +127 -0
datagate_llm/loader.py +56 -0
datagate_llm/rules/finance.json +72 -0
datagate_llm/rules/healthcare.json +52 -0
datagate_llm/rules/technology.json +82 -0
datagate_llm/rules/universal.json +52 -0
datagate_llm-0.1.0.dist-info/METADATA +178 -0
datagate_llm-0.1.0.dist-info/RECORD +12 -0
datagate_llm-0.1.0.dist-info/WHEEL +5 -0
datagate_llm-0.1.0.dist-info/licenses/LICENSE +21 -0
datagate_llm-0.1.0.dist-info/top_level.txt +1 -0

datagate_llm/__init__.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""
+datagate-llm: The inference boundary layer between your data and outbound AI requests.
+Public API
+----------
+scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR) -> dict
+Args:
+    text (str): Input text to analyse.
+    sectors (list[str] | None): Domain rule sets to load in addition to
+        universal rules. Supported: "technology", "healthcare", "finance".
+    mode (str): One of "flag", "redact", or "block".
+    rules_dir (str): Path to directory containing JSON rule files.
+Returns a dict with keys:
+    safe (bool)          - True when risk_score == 0.0
+    risk_score (float)   - 0.0-1.0 aggregate risk
+    action (str)         - "allow", "flag", or "block"
+    findings (list)      - matched spans with metadata
+    redacted_text (str)  - text with spans replaced (mode=redact)
+    fingerprint (str)    - first 16 hex chars of sha256(text+rule_version)
+    rule_version (str)   - hash of loaded rule set for audit
+    trace (list[str])    - human-readable decision log
+"""
+import os
+from .engine import tokenize, match, score, resolve, aggregate, build_result
+from .loader import load_rules
# Only ``scan`` is part of the supported public surface.
__all__ = ["scan"]
__version__ = "0.1.0"

# Default location of the JSON rule files: the ``rules`` directory that sits
# next to this module.
_RULES_DIR = os.path.join(
    os.path.dirname(__file__),
    "rules",
)
def scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR):
    """Run the full detection pipeline on *text*.

    Args:
        text (str): Input text to analyse.
        sectors (str | iterable of str | None): Extra rule sets to load on top
            of the universal rules. A single sector name may be passed as a
            plain string; ``None`` or an empty value loads universal rules
            only.
        mode (str): Passed through to ``build_result``.
        rules_dir (str): Directory containing the JSON rule files.

    Returns:
        dict: The dict produced by ``build_result``, with its ``trace`` entry
        replaced by the step-by-step log collected here.
    """
    # Normalise to a real list: a tuple would break the list concatenation in
    # the trace line below, and a bare string would otherwise be iterated
    # character by character when the rules are loaded.
    if sectors is None:
        sectors = []
    elif isinstance(sectors, str):
        sectors = [sectors]
    else:
        sectors = list(sectors)

    trace = []
    rules = load_rules(sectors, rules_dir)
    trace.append(f"loaded {len(rules)} rules for sectors={['universal'] + sectors}")

    cleaned = tokenize(text)
    trace.append("tokenized input")

    spans = match(cleaned, rules)
    trace.append(f"matched {len(spans)} raw spans")

    # Spans index into the cleaned text, so scoring must use it too.
    scored = [dict(span, confidence=score(span, cleaned)) for span in spans]
    clean_spans = resolve(scored)
    trace.append(f"resolved to {len(clean_spans)} non-overlapping spans")

    risk = aggregate(clean_spans)
    trace.append(f"risk_score={risk:.3f}")

    # The version is read from the first loaded rule; "unknown" when no rule
    # carries one or no rules were loaded at all.
    rule_version = rules[0].get("rule_version", "unknown") if rules else "unknown"
    result = build_result(cleaned, clean_spans, risk, mode, rule_version)
    result["trace"] = trace
    return result

datagate_llm/engine.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""
+Pure-function detection engine. No side effects. No I/O. stdlib only.
+"""
+import hashlib
+import re
+import unicodedata
+from math import log1p
# Invisible format characters that NFKC normalisation leaves in place. They
# are stripped so that a hidden character inserted inside an identifier cannot
# split it and defeat the rule patterns.
_ZERO_WIDTH = re.compile(
    r"["
    r"\u00ad"         # soft hyphen
    r"\u180e"         # Mongolian vowel separator
    r"\u200b-\u200f"  # zero-width space/joiners, LRM, RLM
    r"\u202a-\u202e"  # bidi embeddings and overrides
    r"\u2060-\u2064"  # word joiner, invisible math operators
    r"\u2066-\u2069"  # bidi isolates
    r"\ufeff"         # zero-width no-break space / BOM
    r"]"
)

# Starting confidence per rule severity.
_SEVERITY_BASE = {
    "critical": 1.0,
    "high": 0.8,
    "medium": 0.5,
    "low": 0.3,
}

_WINDOW = 30       # characters of context inspected on each side of a span
_BOOST = 0.15      # confidence added when a boost word is in the window
_SUPPRESS = 0.25   # confidence removed when a suppress word is in the window
_LOG_SCALE = 0.05  # weight of the log(span count) term in the aggregate risk


def tokenize(text):
    """Normalise *text* to NFKC and strip invisible format characters.

    Args:
        text (str): Raw input text.

    Returns:
        str: The NFKC-normalised text with every character matched by
        ``_ZERO_WIDTH`` removed.
    """
    normalised = unicodedata.normalize("NFKC", text)
    return _ZERO_WIDTH.sub("", normalised)
def match(text, rules):
    """Return every non-empty span matched by *rules* against *text*.

    Args:
        text (str): Text to search.
        rules (list[dict]): Rule dicts. Rules without a ``compiled`` pattern
            are skipped; every other rule must provide ``id``.

    Returns:
        list[dict]: One dict per match with the keys ``start``, ``end``,
        ``text``, ``rule_id``, ``sector``, ``severity`` and ``context``, in
        rule order and then match order.
    """
    spans = []
    for rule in rules:
        compiled = rule.get("compiled")
        if compiled is None:
            continue
        for m in compiled.finditer(text):
            # A pattern that can match the empty string (e.g. "\d*") yields a
            # zero-length hit at almost every position; those carry no data
            # and must not count as findings.
            if m.start() == m.end():
                continue
            spans.append({
                "start": m.start(),
                "end": m.end(),
                "text": m.group(),
                "rule_id": rule["id"],
                "sector": rule.get("sector", "universal"),
                "severity": rule.get("severity", "medium"),
                "context": rule.get("context", {}),
            })
    return spans
def score(span, text):
    """Return a confidence float in [0.0, 1.0] for *span* inside *text*.

    The score starts from the base value for the span's severity and is then
    adjusted by the span's ``context`` words: it is raised by ``_BOOST`` if
    any boost word occurs in the surrounding window, and lowered by
    ``_SUPPRESS`` if any suppress word does. Words are compared
    case-insensitively as plain substrings.

    Args:
        span (dict): Span with ``start`` and ``end`` offsets into *text*, and
            optionally ``severity`` and ``context``.
        text (str): The text the span offsets refer to.

    Returns:
        float: Confidence rounded to four decimal places.
    """
    base = _SEVERITY_BASE.get(span.get("severity", "medium"), 0.5)

    # The window covers the matched text itself plus _WINDOW characters on
    # either side, clamped to the text bounds.
    start = max(0, span["start"] - _WINDOW)
    end = min(len(text), span["end"] + _WINDOW)
    window = text[start:end].lower()

    # "context" (or either word list) may be missing or null in rule JSON.
    ctx = span.get("context") or {}

    def mentioned(words):
        # The window is lower-cased, so the words must be as well; empty or
        # non-string entries are ignored because "" is a substring of
        # everything.
        return any(
            isinstance(w, str) and w and w.lower() in window
            for w in words or ()
        )

    if mentioned(ctx.get("boost")):
        base = min(1.0, base + _BOOST)
    if mentioned(ctx.get("suppress")):
        base = max(0.0, base - _SUPPRESS)
    return round(base, 4)
def resolve(spans):
    """Remove overlapping spans, keeping the highest-confidence ones.

    Spans are considered from highest to lowest confidence (ties go to the
    earlier start, then the smaller ``rule_id``), and a span is kept only if
    it does not overlap one already kept. Spans that merely touch (one ends
    where the next starts) do not overlap.

    Args:
        spans (list[dict]): Spans with ``start``, ``end`` and ``rule_id``;
            a missing ``confidence`` counts as 0.

    Returns:
        list[dict]: The surviving spans, ordered by ``start``.
    """
    ranked = sorted(
        spans,
        key=lambda s: (-s.get("confidence", 0), s["start"], s["rule_id"]),
    )
    kept = []
    for candidate in ranked:
        # Ranking by confidence first is what stops an early, weak match from
        # shadowing a stronger match that starts a little later.
        if all(
            candidate["start"] >= other["end"]
            or candidate["end"] <= other["start"]
            for other in kept
        ):
            kept.append(candidate)
    # Callers rely on positional order (e.g. right-to-left redaction).
    kept.sort(key=lambda s: s["start"])
    return kept
def aggregate(spans):
    """Collapse resolved *spans* into one risk value in [0.0, 1.0].

    The highest span confidence is scaled up by a factor that grows with the
    logarithm of the number of spans, then capped at 1.0 and rounded to four
    decimal places. No spans means no risk.
    """
    if not spans:
        return 0.0
    confidences = [item.get("confidence", 0.0) for item in spans]
    peak = max(confidences)
    volume_factor = 1 + _LOG_SCALE * log1p(len(spans))
    capped = min(1.0, peak * volume_factor)
    return round(capped, 4)
def build_result(text, spans, risk, mode, rule_version):
    """Assemble the final result dict.

    Args:
        text (str): The text the span offsets refer to.
        spans (list[dict]): Non-overlapping spans with ``start``, ``end`` and
            ``rule_id``.
        risk (float): Aggregate risk score.
        mode (str): ``"flag"``, ``"redact"`` or ``"block"``.
        rule_version (str): Identifier of the rule set that produced *spans*.

    Returns:
        dict: ``safe``, ``risk_score``, ``action``, ``findings``,
        ``redacted_text``, ``fingerprint``, ``rule_version`` and an empty
        ``trace`` list for the caller to fill in. ``redacted_text`` equals
        *text* unless *mode* is ``"redact"``.
    """
    # Findings are the spans minus internal fields.
    findings = [
        {k: v for k, v in s.items() if k not in ("compiled", "context")}
        for s in spans
    ]

    redacted = text
    if mode == "redact":
        # Replace right-to-left so the offsets of spans still to be processed
        # stay valid; sort here rather than trusting the caller's order.
        for span in sorted(spans, key=lambda s: s["start"], reverse=True):
            placeholder = f"[REDACTED:{span['rule_id']}]"
            redacted = redacted[: span["start"]] + placeholder + redacted[span["end"]:]

    action = _resolve_action(risk, mode)
    fp_raw = hashlib.sha256(f"{text}{rule_version}".encode()).hexdigest()
    return {
        "safe": risk == 0.0,
        "risk_score": risk,
        "action": action,
        "findings": findings,
        "redacted_text": redacted,
        "fingerprint": fp_raw[:16],
        "rule_version": rule_version,
        "trace": [],
    }


def _resolve_action(risk, mode):
    """Map *risk* and *mode* to ``"allow"``, ``"flag"`` or ``"block"``.

    Zero risk is always allowed. Otherwise ``"block"`` mode blocks and every
    other mode flags, including ``"redact"`` and unrecognised values, so that
    risky input is never reported as ``"allow"``.
    """
    if risk == 0.0:
        return "allow"
    if mode == "block":
        return "block"
    return "flag"

datagate_llm/loader.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""
+Rule loader with in-process cache. stdlib only.
+"""
+import json
+import os
+import re
# Compiled rule lists keyed by (sorted unique sector names, rules_dir).
_cache = {}
_SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}
# Rule fields that determine detection behaviour and therefore feed the
# rule-set version hash.
_VERSIONED_FIELDS = ("id", "sector", "pattern", "severity", "context")


def load_rules(sectors, rules_dir):
    """Return compiled rules for *sectors* (universal always included).

    Args:
        sectors (str | iterable of str | None): Sector names; each maps to
            ``<rules_dir>/<sector>.json``. Duplicates and an explicit
            ``"universal"`` entry are ignored, and order does not matter.
        rules_dir (str): Directory containing the JSON rule files.

    Returns:
        list[dict]: Rule dicts sorted by severity, each carrying a
        ``compiled`` pattern and a ``rule_version`` string. The list is a
        fresh copy on every call; the rule dicts themselves are shared with
        the cache.
    """
    if isinstance(sectors, str):
        # A bare string would otherwise be iterated character by character.
        sectors = [sectors]
    # Load in the same canonical order the cache key uses, so the result does
    # not depend on which ordering of the same sectors was requested first.
    wanted = sorted({s for s in (sectors or ()) if s != "universal"})
    key = (tuple(wanted), rules_dir)
    cached = _cache.get(key)
    if cached is None:
        raw = _read(os.path.join(rules_dir, "universal.json"))
        for sector in wanted:
            raw.extend(_read(os.path.join(rules_dir, f"{sector}.json")))
        cached = _cache[key] = _compile(raw)
    # Hand out a copy so a caller mutating the list cannot corrupt the cache.
    return list(cached)


def _warn(message):
    """Report a skipped rule or rule file as a ``RuntimeWarning``."""
    import warnings

    warnings.warn(message, RuntimeWarning, stacklevel=3)


def _read(path):
    """Load a JSON rule list from *path*; warn and return [] on any error."""
    try:
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        _warn(f"datagate-llm: could not load rule file {path!r}: {exc}")
        return []
    if not isinstance(data, list):
        _warn(f"datagate-llm: rule file {path!r} does not contain a JSON list")
        return []
    return data


def _rule_version(rules):
    """Return a 16-hex-character hash identifying the content of *rules*."""
    import hashlib

    # Serialise each rule canonically and sort the results so the hash does
    # not depend on key order or rule order. default=str is deliberate: the
    # output is only hashed, never parsed back.
    canonical = sorted(
        json.dumps(
            {field: rule.get(field) for field in _VERSIONED_FIELDS},
            sort_keys=True,
            default=str,
        )
        for rule in rules
    )
    digest = hashlib.sha256("\n".join(canonical).encode("utf-8"))
    return digest.hexdigest()[:16]


def _compile(rules):
    """Compile, version and severity-sort *rules*.

    Entries that are not dicts or have no ``pattern`` are dropped silently;
    entries without an ``id`` or with an invalid pattern are dropped with a
    warning. Every surviving rule is copied, gets a ``compiled`` regex, and
    gets a ``rule_version`` (the hash of the surviving rule set) unless the
    rule already defines one.
    """
    out = []
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        pattern = rule.get("pattern", "")
        if not pattern:
            continue
        if "id" not in rule:
            # Matching reads rule["id"]; an id-less rule would crash the scan.
            _warn(f"datagate-llm: skipping rule without an id (pattern {pattern!r})")
            continue
        try:
            compiled = re.compile(pattern)
        except (re.error, TypeError) as exc:
            _warn(f"datagate-llm: skipping rule {rule['id']!r}: {exc}")
            continue
        entry = dict(rule)
        entry["compiled"] = compiled
        out.append(entry)

    version = _rule_version(out)
    for entry in out:
        entry.setdefault("rule_version", version)

    out.sort(key=lambda r: _SEVERITY_ORDER.get(r.get("severity", "medium"), 2))
    return out

datagate_llm/rules/finance.json ADDED Viewed

@@ -0,0 +1,72 @@
+[
+  {
+    "id": "finance/iban",
+    "sector": "finance",
+    "pattern": "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}(?:[A-Z0-9]?){0,16}\\b",
+    "severity": "high",
+    "context": {
+      "boost": ["iban", "bank", "account", "transfer", "wire"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/swift_bic",
+    "sector": "finance",
+    "pattern": "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
+    "severity": "medium",
+    "context": {
+      "boost": ["swift", "bic", "bank", "wire", "transfer", "international"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/routing_number",
+    "sector": "finance",
+    "pattern": "\\b(?:routing|aba|aba routing)[:\\s#]*([0-9]{9})\\b",
+    "severity": "high",
+    "context": {
+      "boost": ["routing", "aba", "transit", "bank", "account"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/bank_account",
+    "sector": "finance",
+    "pattern": "\\b(?:account|acct)[\\s\\-#:]*([0-9]{8,17})\\b",
+    "severity": "high",
+    "context": {
+      "boost": ["account", "bank", "checking", "savings", "deposit"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/tax_id_ein",
+    "sector": "finance",
+    "pattern": "\\b(?:EIN|FEIN|Tax ID)[:\\s#]*([0-9]{2}-[0-9]{7})\\b",
+    "severity": "critical",
+    "context": {
+      "boost": ["ein", "fein", "tax", "employer", "federal"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/crypto_btc",
+    "sector": "finance",
+    "pattern": "\\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,62}\\b",
+    "severity": "medium",
+    "context": {
+      "boost": ["bitcoin", "btc", "wallet", "crypto", "address"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "finance/crypto_eth",
+    "sector": "finance",
+    "pattern": "\\b0x[a-fA-F0-9]{40}\\b",
+    "severity": "medium",
+    "context": {
+      "boost": ["ethereum", "eth", "wallet", "crypto", "address", "0x"],
+      "suppress": ["example", "test", "sample"]
+    }
+  }
+]

datagate_llm/rules/healthcare.json ADDED Viewed

@@ -0,0 +1,52 @@
+[
+  {
+    "id": "healthcare/npi_number",
+    "sector": "healthcare",
+    "pattern": "\\bNPI[:\\s#]*([1-9]\\d{9})\\b",
+    "severity": "high",
+    "context": {
+      "boost": ["npi", "provider", "physician", "practitioner", "clinic"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "healthcare/icd10_code",
+    "sector": "healthcare",
+    "pattern": "\\b[A-TV-Z][0-9][0-9A-Z](?:\\.[0-9A-Z]{1,4})?\\b",
+    "severity": "medium",
+    "context": {
+      "boost": ["icd", "diagnosis", "code", "condition", "disease"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "healthcare/insurance_member_id",
+    "sector": "healthcare",
+    "pattern": "\\b(?:member|subscriber|insured)[\\s\\-#:]*([A-Z0-9]{6,15})\\b",
+    "severity": "high",
+    "context": {
+      "boost": ["insurance", "member", "subscriber", "plan", "policy"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "healthcare/medical_record_number",
+    "sector": "healthcare",
+    "pattern": "\\b(?:MRN|medical record)[:\\s#]*([A-Z0-9]{5,12})\\b",
+    "severity": "critical",
+    "context": {
+      "boost": ["mrn", "medical record", "patient", "hospital", "chart"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "healthcare/dea_number",
+    "sector": "healthcare",
+    "pattern": "\\b[A-Z]{2}[0-9]{7}\\b",
+    "severity": "critical",
+    "context": {
+      "boost": ["dea", "drug", "prescribe", "controlled", "substance"],
+      "suppress": ["example", "test", "sample"]
+    }
+  }
+]

datagate_llm/rules/technology.json ADDED Viewed

@@ -0,0 +1,82 @@
+[
+  {
+    "id": "technology/aws_access_key",
+    "sector": "technology",
+    "pattern": "(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}",
+    "severity": "critical",
+    "context": {
+      "boost": ["aws", "amazon", "access", "key", "iam"],
+      "suppress": ["example", "test", "sample", "fake"]
+    }
+  },
+  {
+    "id": "technology/openai_key",
+    "sector": "technology",
+    "pattern": "sk-[A-Za-z0-9]{20,}T3BlbkFJ[A-Za-z0-9]{20,}",
+    "severity": "critical",
+    "context": {
+      "boost": ["openai", "gpt", "api", "key", "secret"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/anthropic_key",
+    "sector": "technology",
+    "pattern": "sk-ant-[A-Za-z0-9\\-_]{40,}",
+    "severity": "critical",
+    "context": {
+      "boost": ["anthropic", "claude", "api", "key", "secret"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/github_token",
+    "sector": "technology",
+    "pattern": "gh[pousr]_[A-Za-z0-9]{36,}",
+    "severity": "critical",
+    "context": {
+      "boost": ["github", "token", "git", "repo", "personal access"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/stripe_key",
+    "sector": "technology",
+    "pattern": "(?:sk|pk)_(?:live|test)_[A-Za-z0-9]{24,}",
+    "severity": "critical",
+    "context": {
+      "boost": ["stripe", "payment", "api", "key", "secret"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/jwt_token",
+    "sector": "technology",
+    "pattern": "eyJ[A-Za-z0-9_\\-]+\\.eyJ[A-Za-z0-9_\\-]+\\.[A-Za-z0-9_\\-]+",
+    "severity": "high",
+    "context": {
+      "boost": ["jwt", "token", "bearer", "auth", "authorization"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/private_key",
+    "sector": "technology",
+    "pattern": "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----",
+    "severity": "critical",
+    "context": {
+      "boost": ["private", "key", "pem", "cert", "rsa", "ssh"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "technology/connection_string",
+    "sector": "technology",
+    "pattern": "(?:mongodb|postgresql|mysql|redis|amqp)(?:\\+srv)?://[^:]+:[^@]+@[^/\\s]+",
+    "severity": "critical",
+    "context": {
+      "boost": ["database", "db", "connection", "uri", "dsn"],
+      "suppress": ["example", "test", "localhost", "sample"]
+    }
+  }
+]

datagate_llm/rules/universal.json ADDED Viewed

@@ -0,0 +1,52 @@
+[
+  {
+    "id": "universal/email",
+    "sector": "universal",
+    "pattern": "[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}",
+    "severity": "high",
+    "context": {
+      "boost": ["email", "contact", "reach", "send", "mail"],
+      "suppress": ["example", "test", "sample", "foo", "bar"]
+    }
+  },
+  {
+    "id": "universal/phone_us",
+    "sector": "universal",
+    "pattern": "(?:\\+1[\\s\\-]?)?(?:\\(?[2-9]\\d{2}\\)?[\\s\\-]?)[2-9]\\d{2}[\\s\\-]?\\d{4}",
+    "severity": "medium",
+    "context": {
+      "boost": ["phone", "call", "mobile", "cell", "tel", "fax"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "universal/ssn",
+    "sector": "universal",
+    "pattern": "(?!000|666|9\\d{2})\\d{3}[\\s\\-](?!00)\\d{2}[\\s\\-](?!0000)\\d{4}",
+    "severity": "critical",
+    "context": {
+      "boost": ["ssn", "social security", "taxpayer", "government id"],
+      "suppress": ["example", "test", "sample", "fake"]
+    }
+  },
+  {
+    "id": "universal/credit_card",
+    "sector": "universal",
+    "pattern": "(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})",
+    "severity": "critical",
+    "context": {
+      "boost": ["card", "credit", "debit", "payment", "billing"],
+      "suppress": ["example", "test", "sample"]
+    }
+  },
+  {
+    "id": "universal/ip_address",
+    "sector": "universal",
+    "pattern": "(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
+    "severity": "low",
+    "context": {
+      "boost": ["server", "host", "ip", "address", "network", "connect"],
+      "suppress": ["example", "test", "localhost", "127"]
+    }
+  }
+]

datagate_llm-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,178 @@
+Metadata-Version: 2.4
+Name: datagate-llm
+Version: 0.1.0
+Summary: The inference boundary layer between your data and outbound AI requests
+License: MIT
+Keywords: llm,guardrails,pii-detection,prompt-injection,data-privacy,ai-security,data-gate,inference-boundary
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Security
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Provides-Extra: semantic
+Requires-Dist: onnxruntime; extra == "semantic"
+Dynamic: license-file
+# datagate-llm
+[![PyPI version](https://img.shields.io/pypi/v/datagate-llm.svg)](https://pypi.org/project/datagate-llm/)
+[![Python versions](https://img.shields.io/pypi/pyversions/datagate-llm.svg)](https://pypi.org/project/datagate-llm/)
+[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+[![Tests](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml/badge.svg)](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml)
+**The inference boundary layer between your data and outbound AI requests.**
+Scan text for sensitive data — PII, secrets, credentials, and sector-specific identifiers — before it leaves your system and reaches an LLM API.
+---
+## The Problem
+In 2023, Samsung engineers accidentally leaked proprietary source code and internal meeting notes by pasting them into ChatGPT. The data was retained and potentially used for training. This is not a hypothetical risk — it is the default behavior when you send unrestricted text to an external AI model.
+datagate-llm is the layer you put in front of that API call. It checks what you are about to send, tells you what it found, and lets you decide: flag it, redact it, or block it.
+---
+## Install
+```bash
+pip install datagate-llm
+```
+Zero dependencies. Python 3.9+. Works offline.
+---
+## Quickstart
+```python
+from datagate_llm import scan
+# Basic scan
+result = scan("Contact Alice at alice@company.com or call 415-555-0192")
+print(result["safe"])        # False
+print(result["risk_score"])  # 1.0 (the email is boosted by "Contact", and the phone match adds to it)
+print(result["findings"])    # list of matched spans
+# Redact mode — replace PII before sending to an LLM
+result = scan(
+    "My SSN is 123-45-6789 and card number 4111111111111111",
+    mode="redact"
+)
+print(result["redacted_text"])
+# "My SSN is [REDACTED:universal/ssn] and card number [REDACTED:universal/credit_card]"
+# Block mode — hard stop on high-risk content
+result = scan("AKIAIOSFODNN7EXAMPLEKEY", sectors=["technology"], mode="block")
+if result["action"] == "block":
+    raise ValueError("Refusing to send credentials to LLM")
+# Multi-sector scan
+result = scan(
+    "Patient MRN: AB12345, account 123456789012",
+    sectors=["healthcare", "finance"]
+)
+for finding in result["findings"]:
+    print(finding["rule_id"], finding["severity"], finding["confidence"])
+```
+---
+## What It Detects
+| Category | Rule ID | Severity |
+|----------|---------|----------|
+| Email address | `universal/email` | high |
+| US phone number | `universal/phone_us` | medium |
+| Social Security Number | `universal/ssn` | critical |
+| Credit card number | `universal/credit_card` | critical |
+| IP address | `universal/ip_address` | low |
+| AWS access key | `technology/aws_access_key` | critical |
+| OpenAI API key | `technology/openai_key` | critical |
+| Anthropic API key | `technology/anthropic_key` | critical |
+| GitHub token | `technology/github_token` | critical |
+| Stripe key | `technology/stripe_key` | critical |
+| JWT token | `technology/jwt_token` | high |
+| Private key (PEM) | `technology/private_key` | critical |
+| Database connection string | `technology/connection_string` | critical |
+| NPI number | `healthcare/npi_number` | high |
+| ICD-10 diagnosis code | `healthcare/icd10_code` | medium |
+| Insurance member ID | `healthcare/insurance_member_id` | high |
+| Medical record number | `healthcare/medical_record_number` | critical |
+| DEA number | `healthcare/dea_number` | critical |
+| IBAN | `finance/iban` | high |
+| SWIFT/BIC code | `finance/swift_bic` | medium |
+| ABA routing number | `finance/routing_number` | high |
+| Bank account number | `finance/bank_account` | high |
+| Tax ID / EIN | `finance/tax_id_ein` | critical |
+| Bitcoin address | `finance/crypto_btc` | medium |
+| Ethereum address | `finance/crypto_eth` | medium |
+---
+## How It Works
+```
+text input
+    │
+    ▼
+tokenize()          ← NFKC normalization, zero-width char removal
+    │
+    ▼
+match()             ← regex scan against compiled rule set
+    │
+    ▼
+score()             ← context-aware confidence (boost / suppress words)
+    │
+    ▼
+resolve()           ← remove overlapping spans, keep highest confidence
+    │
+    ▼
+aggregate()         ← single risk_score in [0.0, 1.0]
+    │
+    ▼
+build_result()      ← assemble final dict with action, findings, fingerprint
+```
+Every step is a pure function. No network calls. No disk writes. No global state except the in-process rule cache.
+---
+## Scan Modes
+| Mode | When risk > 0 | Use case |
+|------|---------------|----------|
+| `flag` (default) | `action = "flag"` | Log and review before sending |
+| `redact` | `action = "flag"`, spans replaced in `redacted_text` | Strip PII, send cleaned text |
+| `block` | `action = "block"` | Hard stop — raise an error upstream |
+---
+## Honest Limits
+- **Regex-only**: datagate-llm uses deterministic pattern matching. It will not catch PII embedded in obfuscated prose, paraphrased content, or novel formats it has never seen.
+- **US-centric**: Phone and national-ID patterns currently target US formats (for example US phone numbers and SSNs). International variants may be missed.
+- **No semantic understanding**: "The patient's temperature was 98.6" will not be flagged as health data because there is no pattern for it. Semantic scanning requires the optional `onnxruntime` layer (not yet released).
+- **False positives are possible**: Short patterns like SWIFT codes can match arbitrary uppercase strings. Use `context.suppress` words in your rule JSON to reduce noise.
+- **Not a compliance tool**: Passing a scan does not mean a document is HIPAA, GDPR, or PCI-DSS compliant. Use this as one layer of defense, not the only one.
+---
+## Contributing
+See [CONTRIBUTING.md](CONTRIBUTING.md). In short: add rules in JSON, add tests, open a PR.
+---
+## License
+MIT. See [LICENSE](LICENSE).

datagate_llm-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,12 @@
+datagate_llm/__init__.py,sha256=VswyM_1dmvAZNpzgDkOypoDtFijtX-qK_gGUZKf0jPA,2136
+datagate_llm/engine.py,sha256=3e3LHxKjIchSzqwesGlU_hRVzXLp6wueKVo7g-bouIA,3527
+datagate_llm/loader.py,sha256=EFzR7NH_mP0EF5za17vz1745WIQNd2MvU37litddR98,1437
+datagate_llm/rules/finance.json,sha256=p4781hxooYAuv386O0plp8fFv7kdmJDZ2hEBSMQB-W0,2069
+datagate_llm/rules/healthcare.json,sha256=ePdhcGZg_X_zuWIMJTRdbwkGa2bKoKtYUL1C42jEqXM,1539
+datagate_llm/rules/technology.json,sha256=6vf_bnsGCTbq0IhMHrMGcN8OxjrC0RURRaxqpHBe3cI,2453
+datagate_llm/rules/universal.json,sha256=JxqCusZerVZ7UO5Np9yULbNEOWfFEFeL0tTBPrCk418,1649
+datagate_llm-0.1.0.dist-info/licenses/LICENSE,sha256=6SFa0ejw6BzmU13JtKRhpx3XWnMdAI1YYIBdmMg_RNc,1082
+datagate_llm-0.1.0.dist-info/METADATA,sha256=fBeBdyTeKWzIICGeqpOqFdygk1c-eaw_toBm0cuCH08,6805
+datagate_llm-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+datagate_llm-0.1.0.dist-info/top_level.txt,sha256=eWfgugyYvFG16UhP-Ek2u30Tuv8zlqJ0xEbN8Sp-Akg,13
+datagate_llm-0.1.0.dist-info/RECORD,,

datagate_llm-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

datagate_llm-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 datagate-llm Contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

datagate_llm-0.1.0.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ datagate_llm