datagate-llm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,61 @@
1
+ """
2
+ datagate-llm: The inference boundary layer between your data and outbound AI requests.
3
+
4
+ Public API
5
+ ----------
6
+ scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR) -> dict
7
+
8
+ Args:
9
+ text (str): Input text to analyse.
10
+ sectors (list[str] | None): Domain rule sets to load in addition to
11
+ universal rules. Supported: "technology", "healthcare", "finance".
12
+ mode (str): One of "flag", "redact", or "block".
13
+ rules_dir (str): Path to directory containing JSON rule files.
14
+
15
+ Returns a dict with keys:
16
+ safe (bool) - True when risk_score == 0.0
17
+ risk_score (float) - 0.0-1.0 aggregate risk
18
+ action (str) - "allow", "flag", or "block"
19
+ findings (list) - matched spans with metadata
20
+ redacted_text (str) - normalised text; matched spans are replaced only when mode="redact"
21
+ fingerprint (str) - first 16 hex chars of sha256(text+rule_version)
22
+ rule_version (str) - hash of loaded rule set for audit
23
+ trace (list[str]) - human-readable decision log
24
+ """
25
+
26
+ import os
27
+
28
+ from .engine import tokenize, match, score, resolve, aggregate, build_result
29
+ from .loader import load_rules
30
+
31
+ __version__ = "0.1.0"
32
+ __all__ = ["scan"]
33
+
34
+ _RULES_DIR = os.path.join(os.path.dirname(__file__), "rules")
35
+
36
+
37
def _rule_set_version(rules):
    """Return a 16-hex-char digest identifying the content of *rules*.

    Every rule is serialised without its ``compiled`` regex object, with
    sorted keys, and the serialised rules are themselves sorted, so the
    digest does not depend on rule order or on dict key order.
    """
    # Imported locally because this module's import block only provides ``os``.
    import hashlib
    import json

    serialised = sorted(
        json.dumps(
            {key: value for key, value in rule.items() if key != "compiled"},
            sort_keys=True,
            default=str,
        )
        for rule in rules
    )
    digest = hashlib.sha256("\n".join(serialised).encode("utf-8"))
    return digest.hexdigest()[:16]


def scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR):
    """Run the full detection pipeline on *text*.

    Args:
        text: Input text to analyse.
        sectors: Extra rule sets to load on top of the universal rules.
            ``None``, a single sector name, or any iterable of names.
        mode: Passed through to ``build_result`` to choose the action and
            whether ``redacted_text`` has its spans replaced.
        rules_dir: Directory containing the JSON rule files.

    Returns:
        The dict assembled by ``build_result`` with ``trace`` filled in with
        one human-readable line per pipeline stage.
    """
    # Normalise *sectors* to a list so the trace line below can concatenate
    # it; a tuple or a bare string used to raise TypeError there.
    if sectors is None:
        sectors = []
    elif isinstance(sectors, str):
        sectors = [sectors]
    else:
        sectors = list(sectors)
    trace = []

    rules = load_rules(sectors, rules_dir)
    trace.append(f"loaded {len(rules)} rules for sectors={['universal'] + sectors}")

    cleaned = tokenize(text)
    trace.append("tokenized input")

    spans = match(cleaned, rules)
    trace.append(f"matched {len(spans)} raw spans")

    scored = [dict(span, confidence=score(span, cleaned)) for span in spans]
    clean_spans = resolve(scored)
    trace.append(f"resolved to {len(clean_spans)} non-overlapping spans")

    risk = aggregate(clean_spans)
    trace.append(f"risk_score={risk:.3f}")

    # An explicit "rule_version" on the first rule still wins, as before.
    # Otherwise derive the version from the rule content, so that the value
    # (and the fingerprint built from it) changes when the rule set changes.
    explicit = rules[0].get("rule_version") if rules else None
    if explicit:
        rule_version = explicit
    elif rules:
        rule_version = _rule_set_version(rules)
    else:
        rule_version = "unknown"

    result = build_result(cleaned, clean_spans, risk, mode, rule_version)
    result["trace"] = trace
    return result
datagate_llm/engine.py ADDED
@@ -0,0 +1,127 @@
1
+ """
2
+ Pure-function detection engine. No side effects. No I/O. stdlib only.
3
+ """
4
+
5
+ import hashlib
6
+ import re
7
+ import unicodedata
8
+ from math import log1p
9
+
10
+ _ZERO_WIDTH = re.compile(r"[\u200b-\u200f\u202a-\u202e\ufeff\u00ad]")
11
+
12
+ _SEVERITY_BASE = {
13
+ "critical": 1.0,
14
+ "high": 0.8,
15
+ "medium": 0.5,
16
+ "low": 0.3,
17
+ }
18
+ _WINDOW = 30
19
+ _BOOST = 0.15
20
+ _SUPPRESS = 0.25
21
+ _LOG_SCALE = 0.05
22
+
23
+
24
def tokenize(text):
    """Return *text* in NFKC form with ``_ZERO_WIDTH`` characters removed.

    Normalisation runs first, so the characters are stripped from the
    already-normalised string. Offsets reported later in the pipeline refer
    to this cleaned string, not to the caller's original input.
    """
    return _ZERO_WIDTH.sub("", unicodedata.normalize("NFKC", text))
28
+
29
+
30
def match(text, rules):
    """Collect one span dict per regex hit of every compiled rule in *rules*.

    Rules without a ``compiled`` pattern are ignored. Spans are produced in
    rule order, then in match order inside each rule. ``text`` in each span
    is the whole match (``group()``), not a capture group.
    """
    return [
        {
            "start": hit.start(),
            "end": hit.end(),
            "text": hit.group(),
            "rule_id": rule["id"],
            "sector": rule.get("sector", "universal"),
            "severity": rule.get("severity", "medium"),
            "context": rule.get("context", {}),
        }
        for rule in rules
        if rule.get("compiled") is not None
        for hit in rule["compiled"].finditer(text)
    ]
48
+
49
+
50
def _context_words(words):
    """Return the non-empty string entries of *words*, lower-cased."""
    if isinstance(words, str):
        words = [words]
    return [w.lower() for w in (words or ()) if isinstance(w, str) and w]


def score(span, text):
    """Return a confidence float in [0.0, 1.0] for *span* inside *text*.

    The base value comes from the span's severity. It is raised by
    ``_BOOST`` when any boost word occurs within ``_WINDOW`` characters
    either side of the span, and lowered by ``_SUPPRESS`` when any suppress
    word does. The window includes the matched text itself.

    Args:
        span: Span dict with ``start``, ``end`` and optionally ``severity``
            and ``context`` (``{"boost": [...], "suppress": [...]}``).
        text: The string the span offsets refer to.
    """
    base = _SEVERITY_BASE.get(span.get("severity", "medium"), 0.5)
    lo = max(0, span["start"] - _WINDOW)
    hi = min(len(text), span["end"] + _WINDOW)
    window = text[lo:hi].lower()

    # ``context`` may be missing or JSON null; treat both as "no hints".
    ctx = span.get("context") or {}
    # The window is lower-cased, so the words must be too; otherwise a rule
    # word such as "IBAN" could never match. Empty words are dropped because
    # "" is a substring of everything.
    boost_words = _context_words(ctx.get("boost"))
    suppress_words = _context_words(ctx.get("suppress"))

    if any(w in window for w in boost_words):
        base = min(1.0, base + _BOOST)
    if any(w in window for w in suppress_words):
        base = max(0.0, base - _SUPPRESS)
    return round(base, 4)
66
+
67
+
68
def resolve(spans):
    """Remove overlapping spans, keeping the highest-confidence ones.

    Spans are considered from highest to lowest confidence (ties: earlier
    start, then ``rule_id``). A span is kept only if it does not overlap any
    span already kept. Previously the scan went by start offset, so an
    earlier low-confidence span could evict a later, overlapping,
    higher-confidence one.

    Args:
        spans: Iterable of span dicts with ``start``, ``end``, ``rule_id``
            and optionally ``confidence``.

    Returns:
        A new list of non-overlapping spans sorted by ``start``, which is the
        order ``build_result`` relies on when it redacts back to front.
    """
    ranked = sorted(
        spans,
        key=lambda s: (-s.get("confidence", 0), s["start"], s["rule_id"]),
    )
    kept = []
    for cand in ranked:
        # Touching spans (one ends where the next starts) do not overlap.
        if all(
            cand["start"] >= other["end"] or cand["end"] <= other["start"]
            for other in kept
        ):
            kept.append(cand)
    kept.sort(key=lambda s: (s["start"], s["end"]))
    return kept
81
+
82
+
83
def aggregate(spans):
    """Fold the resolved *spans* into one risk value in [0.0, 1.0].

    The highest span confidence is scaled up by a factor that grows with the
    logarithm of the number of spans, then capped at 1.0 and rounded to four
    decimals. No spans means no risk.
    """
    if not spans:
        return 0.0
    peak = max(item.get("confidence", 0.0) for item in spans)
    volume_factor = 1 + _LOG_SCALE * log1p(len(spans))
    capped = min(1.0, peak * volume_factor)
    return round(capped, 4)
90
+
91
+
92
def build_result(text, spans, risk, mode, rule_version):
    """Assemble the final result dict. Never raises.

    Args:
        text: The normalised text the span offsets refer to.
        spans: Resolved spans, in ascending start order and non-overlapping.
        risk: Aggregate risk score.
        mode: Scan mode; spans are replaced in ``redacted_text`` only when
            this is ``"redact"``.
        rule_version: Version string mixed into the fingerprint.

    Returns:
        Dict with ``safe``, ``risk_score``, ``action``, ``findings``,
        ``redacted_text``, ``fingerprint``, ``rule_version`` and an empty
        ``trace`` list for the caller to fill in.
    """
    hidden = ("compiled", "context")
    # Findings keep every other span key, including the matched "text".
    findings = []
    for span in spans:
        findings.append({key: span[key] for key in span if key not in hidden})

    redacted = text
    if mode == "redact":
        # Walk back to front so earlier offsets stay valid after each splice.
        for span in reversed(spans):
            head = redacted[: span["start"]]
            tail = redacted[span["end"]:]
            redacted = head + f"[REDACTED:{span['rule_id']}]" + tail

    action = _resolve_action(risk, mode)
    digest = hashlib.sha256(f"{text}{rule_version}".encode()).hexdigest()

    result = {}
    result["safe"] = risk == 0.0
    result["risk_score"] = risk
    result["action"] = action
    result["findings"] = findings
    result["redacted_text"] = redacted
    result["fingerprint"] = digest[:16]
    result["rule_version"] = rule_version
    result["trace"] = []
    return result
118
+
119
+
120
def _resolve_action(risk, mode):
    """Map a risk score and scan mode to ``"allow"``, ``"flag"`` or ``"block"``.

    Zero risk is always allowed. Any non-zero risk is blocked in ``"block"``
    mode and flagged in every other mode. ``"redact"`` is flagged, as the
    README's scan-mode table documents. Unrecognised modes are flagged too,
    so a typo in *mode* cannot silently allow risky text.
    """
    if risk == 0.0:
        return "allow"
    if mode == "block":
        return "block"
    return "flag"
datagate_llm/loader.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ Rule loader with in-process cache. stdlib only.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import re
8
+
9
+ _cache = {}
10
+
11
+ _SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}
12
+
13
+
14
def load_rules(sectors, rules_dir):
    """Return compiled rules for *sectors* (universal always included).

    Results are cached per (sector set, rules_dir) for the life of the
    process. The cached list is returned as-is, so callers must not mutate it.

    Args:
        sectors: Iterable of sector names; each maps to ``<name>.json`` in
            *rules_dir*. Duplicates and an explicit ``"universal"`` are
            ignored, because the universal file is always loaded once.
        rules_dir: Directory containing the JSON rule files.

    Returns:
        List of rule dicts as produced by ``_compile``. Missing or unreadable
        files contribute no rules (see ``_read``).
    """
    # De-duplicate and sort once, then use the same sequence for both the
    # cache key and the load order. Before, the key was order-insensitive but
    # the load order was not, so the cached result depended on which
    # ordering happened to be requested first.
    wanted = tuple(sorted({name for name in sectors if name != "universal"}))
    key = (wanted, rules_dir)
    if key in _cache:
        return _cache[key]

    raw = _read(os.path.join(rules_dir, "universal.json"))
    for sector in wanted:
        raw.extend(_read(os.path.join(rules_dir, f"{sector}.json")))

    compiled = _compile(raw)
    _cache[key] = compiled
    return compiled
28
+
29
+
30
def _read(path):
    """Load a JSON rule list from *path*; return [] on any read or parse error.

    Returns [] when the file cannot be opened, is not valid UTF-8, is not
    valid JSON, or when its top-level JSON value is not a list.
    """
    try:
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
    # the latter used to escape for files that are not valid UTF-8.
    except (OSError, ValueError):
        return []
    return data if isinstance(data, list) else []
38
+
39
+
40
def _compile(rules):
    """Add a *compiled* regex key to each usable rule and sort by severity.

    Entries that are not dicts, or whose ``pattern`` is missing, empty, not a
    string, or not a valid regular expression, are skipped. The input dicts
    are not modified; each returned rule is a shallow copy.

    Returns:
        New list of rule dicts ordered by ``_SEVERITY_ORDER``. Unknown or
        malformed severities rank as "medium". The sort is stable, so rules
        of equal severity keep their input order.
    """
    def _rank(rule):
        severity = rule.get("severity", "medium")
        # A non-string severity (e.g. a JSON list) would be unhashable.
        if not isinstance(severity, str):
            return 2
        return _SEVERITY_ORDER.get(severity, 2)

    out = []
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        pattern = rule.get("pattern", "")
        # re.compile raises TypeError, not re.error, for non-string input.
        if not pattern or not isinstance(pattern, str):
            continue
        try:
            compiled = re.compile(pattern)
        except re.error:
            continue
        entry = dict(rule)
        entry["compiled"] = compiled
        out.append(entry)

    out.sort(key=_rank)
    return out
@@ -0,0 +1,72 @@
1
+ [
2
+ {
3
+ "id": "finance/iban",
4
+ "sector": "finance",
5
+ "pattern": "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}(?:[A-Z0-9]?){0,16}\\b",
6
+ "severity": "high",
7
+ "context": {
8
+ "boost": ["iban", "bank", "account", "transfer", "wire"],
9
+ "suppress": ["example", "test", "sample"]
10
+ }
11
+ },
12
+ {
13
+ "id": "finance/swift_bic",
14
+ "sector": "finance",
15
+ "pattern": "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
16
+ "severity": "medium",
17
+ "context": {
18
+ "boost": ["swift", "bic", "bank", "wire", "transfer", "international"],
19
+ "suppress": ["example", "test", "sample"]
20
+ }
21
+ },
22
+ {
23
+ "id": "finance/routing_number",
24
+ "sector": "finance",
25
+ "pattern": "\\b(?:routing|aba|aba routing)[:\\s#]*([0-9]{9})\\b",
26
+ "severity": "high",
27
+ "context": {
28
+ "boost": ["routing", "aba", "transit", "bank", "account"],
29
+ "suppress": ["example", "test", "sample"]
30
+ }
31
+ },
32
+ {
33
+ "id": "finance/bank_account",
34
+ "sector": "finance",
35
+ "pattern": "\\b(?:account|acct)[\\s\\-#:]*([0-9]{8,17})\\b",
36
+ "severity": "high",
37
+ "context": {
38
+ "boost": ["account", "bank", "checking", "savings", "deposit"],
39
+ "suppress": ["example", "test", "sample"]
40
+ }
41
+ },
42
+ {
43
+ "id": "finance/tax_id_ein",
44
+ "sector": "finance",
45
+ "pattern": "\\b(?:EIN|FEIN|Tax ID)[:\\s#]*([0-9]{2}-[0-9]{7})\\b",
46
+ "severity": "critical",
47
+ "context": {
48
+ "boost": ["ein", "fein", "tax", "employer", "federal"],
49
+ "suppress": ["example", "test", "sample"]
50
+ }
51
+ },
52
+ {
53
+ "id": "finance/crypto_btc",
54
+ "sector": "finance",
55
+ "pattern": "\\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,62}\\b",
56
+ "severity": "medium",
57
+ "context": {
58
+ "boost": ["bitcoin", "btc", "wallet", "crypto", "address"],
59
+ "suppress": ["example", "test", "sample"]
60
+ }
61
+ },
62
+ {
63
+ "id": "finance/crypto_eth",
64
+ "sector": "finance",
65
+ "pattern": "\\b0x[a-fA-F0-9]{40}\\b",
66
+ "severity": "medium",
67
+ "context": {
68
+ "boost": ["ethereum", "eth", "wallet", "crypto", "address", "0x"],
69
+ "suppress": ["example", "test", "sample"]
70
+ }
71
+ }
72
+ ]
@@ -0,0 +1,52 @@
1
+ [
2
+ {
3
+ "id": "healthcare/npi_number",
4
+ "sector": "healthcare",
5
+ "pattern": "\\bNPI[:\\s#]*([1-9]\\d{9})\\b",
6
+ "severity": "high",
7
+ "context": {
8
+ "boost": ["npi", "provider", "physician", "practitioner", "clinic"],
9
+ "suppress": ["example", "test", "sample"]
10
+ }
11
+ },
12
+ {
13
+ "id": "healthcare/icd10_code",
14
+ "sector": "healthcare",
15
+ "pattern": "\\b[A-TV-Z][0-9][0-9A-Z](?:\\.[0-9A-Z]{1,4})?\\b",
16
+ "severity": "medium",
17
+ "context": {
18
+ "boost": ["icd", "diagnosis", "code", "condition", "disease"],
19
+ "suppress": ["example", "test", "sample"]
20
+ }
21
+ },
22
+ {
23
+ "id": "healthcare/insurance_member_id",
24
+ "sector": "healthcare",
25
+ "pattern": "\\b(?:member|subscriber|insured)[\\s\\-#:]*([A-Z0-9]{6,15})\\b",
26
+ "severity": "high",
27
+ "context": {
28
+ "boost": ["insurance", "member", "subscriber", "plan", "policy"],
29
+ "suppress": ["example", "test", "sample"]
30
+ }
31
+ },
32
+ {
33
+ "id": "healthcare/medical_record_number",
34
+ "sector": "healthcare",
35
+ "pattern": "\\b(?:MRN|medical record)[:\\s#]*([A-Z0-9]{5,12})\\b",
36
+ "severity": "critical",
37
+ "context": {
38
+ "boost": ["mrn", "medical record", "patient", "hospital", "chart"],
39
+ "suppress": ["example", "test", "sample"]
40
+ }
41
+ },
42
+ {
43
+ "id": "healthcare/dea_number",
44
+ "sector": "healthcare",
45
+ "pattern": "\\b[A-Z]{2}[0-9]{7}\\b",
46
+ "severity": "critical",
47
+ "context": {
48
+ "boost": ["dea", "drug", "prescribe", "controlled", "substance"],
49
+ "suppress": ["example", "test", "sample"]
50
+ }
51
+ }
52
+ ]
@@ -0,0 +1,82 @@
1
+ [
2
+ {
3
+ "id": "technology/aws_access_key",
4
+ "sector": "technology",
5
+ "pattern": "(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}",
6
+ "severity": "critical",
7
+ "context": {
8
+ "boost": ["aws", "amazon", "access", "key", "iam"],
9
+ "suppress": ["example", "test", "sample", "fake"]
10
+ }
11
+ },
12
+ {
13
+ "id": "technology/openai_key",
14
+ "sector": "technology",
15
+ "pattern": "sk-[A-Za-z0-9]{20,}T3BlbkFJ[A-Za-z0-9]{20,}",
16
+ "severity": "critical",
17
+ "context": {
18
+ "boost": ["openai", "gpt", "api", "key", "secret"],
19
+ "suppress": ["example", "test", "sample"]
20
+ }
21
+ },
22
+ {
23
+ "id": "technology/anthropic_key",
24
+ "sector": "technology",
25
+ "pattern": "sk-ant-[A-Za-z0-9\\-_]{40,}",
26
+ "severity": "critical",
27
+ "context": {
28
+ "boost": ["anthropic", "claude", "api", "key", "secret"],
29
+ "suppress": ["example", "test", "sample"]
30
+ }
31
+ },
32
+ {
33
+ "id": "technology/github_token",
34
+ "sector": "technology",
35
+ "pattern": "gh[pousr]_[A-Za-z0-9]{36,}",
36
+ "severity": "critical",
37
+ "context": {
38
+ "boost": ["github", "token", "git", "repo", "personal access"],
39
+ "suppress": ["example", "test", "sample"]
40
+ }
41
+ },
42
+ {
43
+ "id": "technology/stripe_key",
44
+ "sector": "technology",
45
+ "pattern": "(?:sk|pk)_(?:live|test)_[A-Za-z0-9]{24,}",
46
+ "severity": "critical",
47
+ "context": {
48
+ "boost": ["stripe", "payment", "api", "key", "secret"],
49
+ "suppress": ["example", "test", "sample"]
50
+ }
51
+ },
52
+ {
53
+ "id": "technology/jwt_token",
54
+ "sector": "technology",
55
+ "pattern": "eyJ[A-Za-z0-9_\\-]+\\.eyJ[A-Za-z0-9_\\-]+\\.[A-Za-z0-9_\\-]+",
56
+ "severity": "high",
57
+ "context": {
58
+ "boost": ["jwt", "token", "bearer", "auth", "authorization"],
59
+ "suppress": ["example", "test", "sample"]
60
+ }
61
+ },
62
+ {
63
+ "id": "technology/private_key",
64
+ "sector": "technology",
65
+ "pattern": "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----",
66
+ "severity": "critical",
67
+ "context": {
68
+ "boost": ["private", "key", "pem", "cert", "rsa", "ssh"],
69
+ "suppress": ["example", "test", "sample"]
70
+ }
71
+ },
72
+ {
73
+ "id": "technology/connection_string",
74
+ "sector": "technology",
75
+ "pattern": "(?:mongodb|postgresql|mysql|redis|amqp)(?:\\+srv)?://[^:]+:[^@]+@[^/\\s]+",
76
+ "severity": "critical",
77
+ "context": {
78
+ "boost": ["database", "db", "connection", "uri", "dsn"],
79
+ "suppress": ["example", "test", "localhost", "sample"]
80
+ }
81
+ }
82
+ ]
@@ -0,0 +1,52 @@
1
+ [
2
+ {
3
+ "id": "universal/email",
4
+ "sector": "universal",
5
+ "pattern": "[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}",
6
+ "severity": "high",
7
+ "context": {
8
+ "boost": ["email", "contact", "reach", "send", "mail"],
9
+ "suppress": ["example", "test", "sample", "foo", "bar"]
10
+ }
11
+ },
12
+ {
13
+ "id": "universal/phone_us",
14
+ "sector": "universal",
15
+ "pattern": "(?:\\+1[\\s\\-]?)?(?:\\(?[2-9]\\d{2}\\)?[\\s\\-]?)[2-9]\\d{2}[\\s\\-]?\\d{4}",
16
+ "severity": "medium",
17
+ "context": {
18
+ "boost": ["phone", "call", "mobile", "cell", "tel", "fax"],
19
+ "suppress": ["example", "test", "sample"]
20
+ }
21
+ },
22
+ {
23
+ "id": "universal/ssn",
24
+ "sector": "universal",
25
+ "pattern": "(?!000|666|9\\d{2})\\d{3}[\\s\\-](?!00)\\d{2}[\\s\\-](?!0000)\\d{4}",
26
+ "severity": "critical",
27
+ "context": {
28
+ "boost": ["ssn", "social security", "taxpayer", "government id"],
29
+ "suppress": ["example", "test", "sample", "fake"]
30
+ }
31
+ },
32
+ {
33
+ "id": "universal/credit_card",
34
+ "sector": "universal",
35
+ "pattern": "(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})",
36
+ "severity": "critical",
37
+ "context": {
38
+ "boost": ["card", "credit", "debit", "payment", "billing"],
39
+ "suppress": ["example", "test", "sample"]
40
+ }
41
+ },
42
+ {
43
+ "id": "universal/ip_address",
44
+ "sector": "universal",
45
+ "pattern": "(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
46
+ "severity": "low",
47
+ "context": {
48
+ "boost": ["server", "host", "ip", "address", "network", "connect"],
49
+ "suppress": ["example", "test", "localhost", "127"]
50
+ }
51
+ }
52
+ ]
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: datagate-llm
3
+ Version: 0.1.0
4
+ Summary: The inference boundary layer between your data and outbound AI requests
5
+ License: MIT
6
+ Keywords: llm,guardrails,pii-detection,prompt-injection,data-privacy,ai-security,data-gate,inference-boundary
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Security
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Provides-Extra: semantic
21
+ Requires-Dist: onnxruntime; extra == "semantic"
22
+ Dynamic: license-file
23
+
24
+ # datagate-llm
25
+
26
+ [![PyPI version](https://img.shields.io/pypi/v/datagate-llm.svg)](https://pypi.org/project/datagate-llm/)
27
+ [![Python versions](https://img.shields.io/pypi/pyversions/datagate-llm.svg)](https://pypi.org/project/datagate-llm/)
28
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
29
+ [![Tests](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml/badge.svg)](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml)
30
+
31
+ **The inference boundary layer between your data and outbound AI requests.**
32
+
33
+ Scan text for sensitive data — PII, secrets, credentials, and sector-specific identifiers — before it leaves your system and reaches an LLM API.
34
+
35
+ ---
36
+
37
+ ## The Problem
38
+
39
+ In 2023, Samsung engineers accidentally leaked proprietary source code and internal meeting notes by pasting them into ChatGPT. The data was retained and potentially used for training. This is not a hypothetical risk — it is the default behavior when you send unrestricted text to an external AI model.
40
+
41
+ datagate-llm is the layer you put in front of that API call. It checks what you are about to send, tells you what it found, and lets you decide: flag it, redact it, or block it.
42
+
43
+ ---
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ pip install datagate-llm
49
+ ```
50
+
51
+ Zero dependencies. Python 3.9+. Works offline.
52
+
53
+ ---
54
+
55
+ ## Quickstart
56
+
57
+ ```python
58
+ from datagate_llm import scan
59
+
60
+ # Basic scan
61
+ result = scan("Contact Alice at alice@company.com or call 415-555-0192")
62
+ print(result["safe"]) # False
63
+ print(result["risk_score"]) # 0.8 (or similar)
64
+ print(result["findings"]) # list of matched spans
65
+
66
+ # Redact mode — replace PII before sending to an LLM
67
+ result = scan(
68
+ "My SSN is 123-45-6789 and card number 4111111111111111",
69
+ mode="redact"
70
+ )
71
+ print(result["redacted_text"])
72
+ # "My SSN is [REDACTED:universal/ssn] and card number [REDACTED:universal/credit_card]"
73
+
74
+ # Block mode — hard stop on high-risk content
75
+ result = scan("AKIAIOSFODNN7EXAMPLEKEY", sectors=["technology"], mode="block")
76
+ if result["action"] == "block":
77
+ raise ValueError("Refusing to send credentials to LLM")
78
+
79
+ # Multi-sector scan
80
+ result = scan(
81
+ "Patient MRN: AB12345, account 123456789012",
82
+ sectors=["healthcare", "finance"]
83
+ )
84
+ for finding in result["findings"]:
85
+ print(finding["rule_id"], finding["severity"], finding["confidence"])
86
+ ```
87
+
88
+ ---
89
+
90
+ ## What It Detects
91
+
92
+ | Category | Rule ID | Severity |
93
+ |----------|---------|----------|
94
+ | Email address | `universal/email` | high |
95
+ | US phone number | `universal/phone_us` | medium |
96
+ | Social Security Number | `universal/ssn` | critical |
97
+ | Credit card number | `universal/credit_card` | critical |
98
+ | IP address | `universal/ip_address` | low |
99
+ | AWS access key | `technology/aws_access_key` | critical |
100
+ | OpenAI API key | `technology/openai_key` | critical |
101
+ | Anthropic API key | `technology/anthropic_key` | critical |
102
+ | GitHub token | `technology/github_token` | critical |
103
+ | Stripe key | `technology/stripe_key` | critical |
104
+ | JWT token | `technology/jwt_token` | high |
105
+ | Private key (PEM) | `technology/private_key` | critical |
106
+ | Database connection string | `technology/connection_string` | critical |
107
+ | NPI number | `healthcare/npi_number` | high |
108
+ | ICD-10 diagnosis code | `healthcare/icd10_code` | medium |
109
+ | Insurance member ID | `healthcare/insurance_member_id` | high |
110
+ | Medical record number | `healthcare/medical_record_number` | critical |
111
+ | DEA number | `healthcare/dea_number` | critical |
112
+ | IBAN | `finance/iban` | high |
113
+ | SWIFT/BIC code | `finance/swift_bic` | medium |
114
+ | ABA routing number | `finance/routing_number` | high |
115
+ | Bank account number | `finance/bank_account` | high |
116
+ | Tax ID / EIN | `finance/tax_id_ein` | critical |
117
+ | Bitcoin address | `finance/crypto_btc` | medium |
118
+ | Ethereum address | `finance/crypto_eth` | medium |
119
+
120
+ ---
121
+
122
+ ## How It Works
123
+
124
+ ```
125
+ text input
126
+
127
+
128
+ tokenize() ← NFKC normalization, zero-width char removal
129
+
130
+
131
+ match() ← regex scan against compiled rule set
132
+
133
+
134
+ score() ← context-aware confidence (boost / suppress words)
135
+
136
+
137
+ resolve() ← remove overlapping spans, keep highest confidence
138
+
139
+
140
+ aggregate() ← single risk_score in [0.0, 1.0]
141
+
142
+
143
+ build_result() ← assemble final dict with action, findings, fingerprint
144
+ ```
145
+
146
+ Every step is a pure function. No network calls. No disk writes. No global state except the in-process rule cache.
147
+
148
+ ---
149
+
150
+ ## Scan Modes
151
+
152
+ | Mode | When risk > 0 | Use case |
153
+ |------|---------------|----------|
154
+ | `flag` (default) | `action = "flag"` | Log and review before sending |
155
+ | `redact` | `action = "flag"`, spans replaced in `redacted_text` | Strip PII, send cleaned text |
156
+ | `block` | `action = "block"` | Hard stop — raise an error upstream |
157
+
158
+ ---
159
+
160
+ ## Honest Limits
161
+
162
+ - **Regex-only**: datagate-llm uses deterministic pattern matching. It will not catch PII embedded in obfuscated prose, paraphrased content, or novel formats it has never seen.
163
+ - **US-centric**: Phone and ID patterns currently target US formats. International variants may be missed.
164
+ - **No semantic understanding**: "The patient's temperature was 98.6" will not be flagged as health data because there is no pattern for it. Semantic scanning requires the optional `onnxruntime` layer (not yet released).
165
+ - **False positives are possible**: Short patterns like SWIFT codes can match arbitrary uppercase strings. Use `context.suppress` words in your rule JSON to reduce noise.
166
+ - **Not a compliance tool**: Passing a scan does not mean a document is HIPAA, GDPR, or PCI-DSS compliant. Use this as one layer of defense, not the only one.
167
+
168
+ ---
169
+
170
+ ## Contributing
171
+
172
+ See [CONTRIBUTING.md](CONTRIBUTING.md). In short: add rules in JSON, add tests, open a PR.
173
+
174
+ ---
175
+
176
+ ## License
177
+
178
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,12 @@
1
+ datagate_llm/__init__.py,sha256=VswyM_1dmvAZNpzgDkOypoDtFijtX-qK_gGUZKf0jPA,2136
2
+ datagate_llm/engine.py,sha256=3e3LHxKjIchSzqwesGlU_hRVzXLp6wueKVo7g-bouIA,3527
3
+ datagate_llm/loader.py,sha256=EFzR7NH_mP0EF5za17vz1745WIQNd2MvU37litddR98,1437
4
+ datagate_llm/rules/finance.json,sha256=p4781hxooYAuv386O0plp8fFv7kdmJDZ2hEBSMQB-W0,2069
5
+ datagate_llm/rules/healthcare.json,sha256=ePdhcGZg_X_zuWIMJTRdbwkGa2bKoKtYUL1C42jEqXM,1539
6
+ datagate_llm/rules/technology.json,sha256=6vf_bnsGCTbq0IhMHrMGcN8OxjrC0RURRaxqpHBe3cI,2453
7
+ datagate_llm/rules/universal.json,sha256=JxqCusZerVZ7UO5Np9yULbNEOWfFEFeL0tTBPrCk418,1649
8
+ datagate_llm-0.1.0.dist-info/licenses/LICENSE,sha256=6SFa0ejw6BzmU13JtKRhpx3XWnMdAI1YYIBdmMg_RNc,1082
9
+ datagate_llm-0.1.0.dist-info/METADATA,sha256=fBeBdyTeKWzIICGeqpOqFdygk1c-eaw_toBm0cuCH08,6805
10
+ datagate_llm-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
11
+ datagate_llm-0.1.0.dist-info/top_level.txt,sha256=eWfgugyYvFG16UhP-Ek2u30Tuv8zlqJ0xEbN8Sp-Akg,13
12
+ datagate_llm-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 datagate-llm Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ datagate_llm