datagate-llm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datagate_llm-0.1.0/LICENSE +21 -0
- datagate_llm-0.1.0/PKG-INFO +178 -0
- datagate_llm-0.1.0/README.md +155 -0
- datagate_llm-0.1.0/pyproject.toml +43 -0
- datagate_llm-0.1.0/setup.cfg +4 -0
- datagate_llm-0.1.0/src/datagate_llm/__init__.py +61 -0
- datagate_llm-0.1.0/src/datagate_llm/engine.py +127 -0
- datagate_llm-0.1.0/src/datagate_llm/loader.py +56 -0
- datagate_llm-0.1.0/src/datagate_llm/rules/finance.json +72 -0
- datagate_llm-0.1.0/src/datagate_llm/rules/healthcare.json +52 -0
- datagate_llm-0.1.0/src/datagate_llm/rules/technology.json +82 -0
- datagate_llm-0.1.0/src/datagate_llm/rules/universal.json +52 -0
- datagate_llm-0.1.0/src/datagate_llm.egg-info/PKG-INFO +178 -0
- datagate_llm-0.1.0/src/datagate_llm.egg-info/SOURCES.txt +18 -0
- datagate_llm-0.1.0/src/datagate_llm.egg-info/dependency_links.txt +1 -0
- datagate_llm-0.1.0/src/datagate_llm.egg-info/requires.txt +3 -0
- datagate_llm-0.1.0/src/datagate_llm.egg-info/top_level.txt +1 -0
- datagate_llm-0.1.0/tests/test_engine.py +84 -0
- datagate_llm-0.1.0/tests/test_loader.py +50 -0
- datagate_llm-0.1.0/tests/test_scan.py +63 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 datagate-llm Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datagate-llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The inference boundary layer between your data and outbound AI requests
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: llm,guardrails,pii-detection,prompt-injection,data-privacy,ai-security,data-gate,inference-boundary
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Security
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Provides-Extra: semantic
|
|
21
|
+
Requires-Dist: onnxruntime; extra == "semantic"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# datagate-llm
|
|
25
|
+
|
|
26
|
+
[PyPI version](https://pypi.org/project/datagate-llm/)
|
|
27
|
+
[Python versions](https://pypi.org/project/datagate-llm/)
|
|
28
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
29
|
+
[Tests](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml)
|
|
30
|
+
|
|
31
|
+
**The inference boundary layer between your data and outbound AI requests.**
|
|
32
|
+
|
|
33
|
+
Scan text for sensitive data — PII, secrets, credentials, and sector-specific identifiers — before it leaves your system and reaches an LLM API.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## The Problem
|
|
38
|
+
|
|
39
|
+
In 2023, Samsung engineers accidentally leaked proprietary source code and internal meeting notes by pasting them into ChatGPT. The data was retained and potentially used for training. This is not a hypothetical risk — it is the default behavior when you send unrestricted text to an external AI model.
|
|
40
|
+
|
|
41
|
+
datagate-llm is the layer you put in front of that API call. It checks what you are about to send, tells you what it found, and lets you decide: flag it, redact it, or block it.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install datagate-llm
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Zero dependencies. Python 3.9+. Works offline.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Quickstart
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from datagate_llm import scan
|
|
59
|
+
|
|
60
|
+
# Basic scan
|
|
61
|
+
result = scan("Contact Alice at alice@company.com or call 415-555-0192")
|
|
62
|
+
print(result["safe"]) # False
|
|
63
|
+
print(result["risk_score"]) # 0.8 (or similar)
|
|
64
|
+
print(result["findings"]) # list of matched spans
|
|
65
|
+
|
|
66
|
+
# Redact mode — replace PII before sending to an LLM
|
|
67
|
+
result = scan(
|
|
68
|
+
"My SSN is 123-45-6789 and card number 4111111111111111",
|
|
69
|
+
mode="redact"
|
|
70
|
+
)
|
|
71
|
+
print(result["redacted_text"])
|
|
72
|
+
# "My SSN is [REDACTED:universal/ssn] and card number [REDACTED:universal/credit_card]"
|
|
73
|
+
|
|
74
|
+
# Block mode — hard stop on high-risk content
|
|
75
|
+
result = scan("AKIAIOSFODNN7EXAMPLEKEY", sectors=["technology"], mode="block")
|
|
76
|
+
if result["action"] == "block":
|
|
77
|
+
raise ValueError("Refusing to send credentials to LLM")
|
|
78
|
+
|
|
79
|
+
# Multi-sector scan
|
|
80
|
+
result = scan(
|
|
81
|
+
"Patient MRN: AB12345, account 123456789012",
|
|
82
|
+
sectors=["healthcare", "finance"]
|
|
83
|
+
)
|
|
84
|
+
for finding in result["findings"]:
|
|
85
|
+
print(finding["rule_id"], finding["severity"], finding["confidence"])
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## What It Detects
|
|
91
|
+
|
|
92
|
+
| Category | Rule ID | Severity |
|
|
93
|
+
|----------|---------|----------|
|
|
94
|
+
| Email address | `universal/email` | high |
|
|
95
|
+
| US phone number | `universal/phone_us` | medium |
|
|
96
|
+
| Social Security Number | `universal/ssn` | critical |
|
|
97
|
+
| Credit card number | `universal/credit_card` | critical |
|
|
98
|
+
| IP address | `universal/ip_address` | low |
|
|
99
|
+
| AWS access key | `technology/aws_access_key` | critical |
|
|
100
|
+
| OpenAI API key | `technology/openai_key` | critical |
|
|
101
|
+
| Anthropic API key | `technology/anthropic_key` | critical |
|
|
102
|
+
| GitHub token | `technology/github_token` | critical |
|
|
103
|
+
| Stripe key | `technology/stripe_key` | critical |
|
|
104
|
+
| JWT token | `technology/jwt_token` | high |
|
|
105
|
+
| Private key (PEM) | `technology/private_key` | critical |
|
|
106
|
+
| Database connection string | `technology/connection_string` | critical |
|
|
107
|
+
| NPI number | `healthcare/npi_number` | high |
|
|
108
|
+
| ICD-10 diagnosis code | `healthcare/icd10_code` | medium |
|
|
109
|
+
| Insurance member ID | `healthcare/insurance_member_id` | high |
|
|
110
|
+
| Medical record number | `healthcare/medical_record_number` | critical |
|
|
111
|
+
| DEA number | `healthcare/dea_number` | critical |
|
|
112
|
+
| IBAN | `finance/iban` | high |
|
|
113
|
+
| SWIFT/BIC code | `finance/swift_bic` | medium |
|
|
114
|
+
| ABA routing number | `finance/routing_number` | high |
|
|
115
|
+
| Bank account number | `finance/bank_account` | high |
|
|
116
|
+
| Tax ID / EIN | `finance/tax_id_ein` | critical |
|
|
117
|
+
| Bitcoin address | `finance/crypto_btc` | medium |
|
|
118
|
+
| Ethereum address | `finance/crypto_eth` | medium |
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## How It Works
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
text input
|
|
126
|
+
│
|
|
127
|
+
▼
|
|
128
|
+
tokenize() ← NFKC normalization, zero-width char removal
|
|
129
|
+
│
|
|
130
|
+
▼
|
|
131
|
+
match() ← regex scan against compiled rule set
|
|
132
|
+
│
|
|
133
|
+
▼
|
|
134
|
+
score() ← context-aware confidence (boost / suppress words)
|
|
135
|
+
│
|
|
136
|
+
▼
|
|
137
|
+
resolve() ← remove overlapping spans, keep highest confidence
|
|
138
|
+
│
|
|
139
|
+
▼
|
|
140
|
+
aggregate() ← single risk_score in [0.0, 1.0]
|
|
141
|
+
│
|
|
142
|
+
▼
|
|
143
|
+
build_result() ← assemble final dict with action, findings, fingerprint
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Every step is a pure function. No network calls. No disk writes. No global state except the in-process rule cache.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Scan Modes
|
|
151
|
+
|
|
152
|
+
| Mode | When risk > 0 | Use case |
|
|
153
|
+
|------|---------------|----------|
|
|
154
|
+
| `flag` (default) | `action = "flag"` | Log and review before sending |
|
|
155
|
+
| `redact` | `action = "flag"`, spans replaced in `redacted_text` | Strip PII, send cleaned text |
|
|
156
|
+
| `block` | `action = "block"` | Hard stop — raise an error upstream |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Honest Limits
|
|
161
|
+
|
|
162
|
+
- **Regex-only**: datagate-llm uses deterministic pattern matching. It will not catch PII embedded in obfuscated prose, paraphrased content, or novel formats it has never seen.
|
|
163
|
+
- **US-centric**: Phone and ID patterns currently target US formats. International variants may be missed.
|
|
164
|
+
- **No semantic understanding**: "The patient's temperature was 98.6" will not be flagged as health data because there is no pattern for it. Semantic scanning requires the optional `onnxruntime` layer (not yet released).
|
|
165
|
+
- **False positives are possible**: Short patterns like SWIFT codes can match arbitrary uppercase strings. Use `context.suppress` words in your rule JSON to reduce noise.
|
|
166
|
+
- **Not a compliance tool**: Passing a scan does not mean a document is HIPAA, GDPR, or PCI-DSS compliant. Use this as one layer of defense, not the only one.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Contributing
|
|
171
|
+
|
|
172
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). In short: add rules in JSON, add tests, open a PR.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# datagate-llm
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/datagate-llm/)
|
|
4
|
+
[Python versions](https://pypi.org/project/datagate-llm/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
[Tests](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml)
|
|
7
|
+
|
|
8
|
+
**The inference boundary layer between your data and outbound AI requests.**
|
|
9
|
+
|
|
10
|
+
Scan text for sensitive data — PII, secrets, credentials, and sector-specific identifiers — before it leaves your system and reaches an LLM API.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## The Problem
|
|
15
|
+
|
|
16
|
+
In 2023, Samsung engineers accidentally leaked proprietary source code and internal meeting notes by pasting them into ChatGPT. The data was retained and potentially used for training. This is not a hypothetical risk — it is the default behavior when you send unrestricted text to an external AI model.
|
|
17
|
+
|
|
18
|
+
datagate-llm is the layer you put in front of that API call. It checks what you are about to send, tells you what it found, and lets you decide: flag it, redact it, or block it.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install datagate-llm
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Zero dependencies. Python 3.9+. Works offline.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Quickstart
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from datagate_llm import scan
|
|
36
|
+
|
|
37
|
+
# Basic scan
|
|
38
|
+
result = scan("Contact Alice at alice@company.com or call 415-555-0192")
|
|
39
|
+
print(result["safe"]) # False
|
|
40
|
+
print(result["risk_score"]) # 0.8 (or similar)
|
|
41
|
+
print(result["findings"]) # list of matched spans
|
|
42
|
+
|
|
43
|
+
# Redact mode — replace PII before sending to an LLM
|
|
44
|
+
result = scan(
|
|
45
|
+
"My SSN is 123-45-6789 and card number 4111111111111111",
|
|
46
|
+
mode="redact"
|
|
47
|
+
)
|
|
48
|
+
print(result["redacted_text"])
|
|
49
|
+
# "My SSN is [REDACTED:universal/ssn] and card number [REDACTED:universal/credit_card]"
|
|
50
|
+
|
|
51
|
+
# Block mode — hard stop on high-risk content
|
|
52
|
+
result = scan("AKIAIOSFODNN7EXAMPLEKEY", sectors=["technology"], mode="block")
|
|
53
|
+
if result["action"] == "block":
|
|
54
|
+
raise ValueError("Refusing to send credentials to LLM")
|
|
55
|
+
|
|
56
|
+
# Multi-sector scan
|
|
57
|
+
result = scan(
|
|
58
|
+
"Patient MRN: AB12345, account 123456789012",
|
|
59
|
+
sectors=["healthcare", "finance"]
|
|
60
|
+
)
|
|
61
|
+
for finding in result["findings"]:
|
|
62
|
+
print(finding["rule_id"], finding["severity"], finding["confidence"])
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## What It Detects
|
|
68
|
+
|
|
69
|
+
| Category | Rule ID | Severity |
|
|
70
|
+
|----------|---------|----------|
|
|
71
|
+
| Email address | `universal/email` | high |
|
|
72
|
+
| US phone number | `universal/phone_us` | medium |
|
|
73
|
+
| Social Security Number | `universal/ssn` | critical |
|
|
74
|
+
| Credit card number | `universal/credit_card` | critical |
|
|
75
|
+
| IP address | `universal/ip_address` | low |
|
|
76
|
+
| AWS access key | `technology/aws_access_key` | critical |
|
|
77
|
+
| OpenAI API key | `technology/openai_key` | critical |
|
|
78
|
+
| Anthropic API key | `technology/anthropic_key` | critical |
|
|
79
|
+
| GitHub token | `technology/github_token` | critical |
|
|
80
|
+
| Stripe key | `technology/stripe_key` | critical |
|
|
81
|
+
| JWT token | `technology/jwt_token` | high |
|
|
82
|
+
| Private key (PEM) | `technology/private_key` | critical |
|
|
83
|
+
| Database connection string | `technology/connection_string` | critical |
|
|
84
|
+
| NPI number | `healthcare/npi_number` | high |
|
|
85
|
+
| ICD-10 diagnosis code | `healthcare/icd10_code` | medium |
|
|
86
|
+
| Insurance member ID | `healthcare/insurance_member_id` | high |
|
|
87
|
+
| Medical record number | `healthcare/medical_record_number` | critical |
|
|
88
|
+
| DEA number | `healthcare/dea_number` | critical |
|
|
89
|
+
| IBAN | `finance/iban` | high |
|
|
90
|
+
| SWIFT/BIC code | `finance/swift_bic` | medium |
|
|
91
|
+
| ABA routing number | `finance/routing_number` | high |
|
|
92
|
+
| Bank account number | `finance/bank_account` | high |
|
|
93
|
+
| Tax ID / EIN | `finance/tax_id_ein` | critical |
|
|
94
|
+
| Bitcoin address | `finance/crypto_btc` | medium |
|
|
95
|
+
| Ethereum address | `finance/crypto_eth` | medium |
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## How It Works
|
|
100
|
+
|
|
101
|
+
```
|
|
102
|
+
text input
|
|
103
|
+
│
|
|
104
|
+
▼
|
|
105
|
+
tokenize() ← NFKC normalization, zero-width char removal
|
|
106
|
+
│
|
|
107
|
+
▼
|
|
108
|
+
match() ← regex scan against compiled rule set
|
|
109
|
+
│
|
|
110
|
+
▼
|
|
111
|
+
score() ← context-aware confidence (boost / suppress words)
|
|
112
|
+
│
|
|
113
|
+
▼
|
|
114
|
+
resolve() ← remove overlapping spans, keep highest confidence
|
|
115
|
+
│
|
|
116
|
+
▼
|
|
117
|
+
aggregate() ← single risk_score in [0.0, 1.0]
|
|
118
|
+
│
|
|
119
|
+
▼
|
|
120
|
+
build_result() ← assemble final dict with action, findings, fingerprint
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
Every step is a pure function. No network calls. No disk writes. No global state except the in-process rule cache.
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## Scan Modes
|
|
128
|
+
|
|
129
|
+
| Mode | When risk > 0 | Use case |
|
|
130
|
+
|------|---------------|----------|
|
|
131
|
+
| `flag` (default) | `action = "flag"` | Log and review before sending |
|
|
132
|
+
| `redact` | `action = "flag"`, spans replaced in `redacted_text` | Strip PII, send cleaned text |
|
|
133
|
+
| `block` | `action = "block"` | Hard stop — raise an error upstream |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Honest Limits
|
|
138
|
+
|
|
139
|
+
- **Regex-only**: datagate-llm uses deterministic pattern matching. It will not catch PII embedded in obfuscated prose, paraphrased content, or novel formats it has never seen.
|
|
140
|
+
- **US-centric**: Phone and ID patterns currently target US formats. International variants may be missed.
|
|
141
|
+
- **No semantic understanding**: "The patient's temperature was 98.6" will not be flagged as health data because there is no pattern for it. Semantic scanning requires the optional `onnxruntime` layer (not yet released).
|
|
142
|
+
- **False positives are possible**: Short patterns like SWIFT codes can match arbitrary uppercase strings. Use `context.suppress` words in your rule JSON to reduce noise.
|
|
143
|
+
- **Not a compliance tool**: Passing a scan does not mean a document is HIPAA, GDPR, or PCI-DSS compliant. Use this as one layer of defense, not the only one.
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Contributing
|
|
148
|
+
|
|
149
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). In short: add rules in JSON, add tests, open a PR.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
# The [project] table below (PEP 621 metadata) is only read by setuptools >= 61;
# older releases would build the package without name/version/metadata.
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "datagate-llm"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "The inference boundary layer between your data and outbound AI requests"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
dependencies = []
|
|
13
|
+
keywords = [
|
|
14
|
+
"llm",
|
|
15
|
+
"guardrails",
|
|
16
|
+
"pii-detection",
|
|
17
|
+
"prompt-injection",
|
|
18
|
+
"data-privacy",
|
|
19
|
+
"ai-security",
|
|
20
|
+
"data-gate",
|
|
21
|
+
"inference-boundary",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 3 - Alpha",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"License :: OSI Approved :: MIT License",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.9",
|
|
29
|
+
"Programming Language :: Python :: 3.10",
|
|
30
|
+
"Programming Language :: Python :: 3.11",
|
|
31
|
+
"Programming Language :: Python :: 3.12",
|
|
32
|
+
"Topic :: Security",
|
|
33
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
semantic = ["onnxruntime"]
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.packages.find]
|
|
40
|
+
where = ["src"]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.package-data]
|
|
43
|
+
datagate_llm = ["rules/*.json"]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
datagate-llm: The inference boundary layer between your data and outbound AI requests.
|
|
3
|
+
|
|
4
|
+
Public API
|
|
5
|
+
----------
|
|
6
|
+
scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR) -> dict
|
|
7
|
+
|
|
8
|
+
Args:
|
|
9
|
+
text (str): Input text to analyse.
|
|
10
|
+
sectors (list[str] | None): Domain rule sets to load in addition to
|
|
11
|
+
universal rules. Supported: "technology", "healthcare", "finance".
|
|
12
|
+
mode (str): One of "flag", "redact", or "block".
|
|
13
|
+
rules_dir (str): Path to directory containing JSON rule files.
|
|
14
|
+
|
|
15
|
+
Returns a dict with keys:
|
|
16
|
+
safe (bool) - True when risk_score == 0.0
|
|
17
|
+
risk_score (float) - 0.0-1.0 aggregate risk
|
|
18
|
+
action (str) - "allow", "flag", or "block"
|
|
19
|
+
findings (list) - matched spans with metadata
|
|
20
|
+
redacted_text (str) - text with spans replaced (mode=redact)
|
|
21
|
+
fingerprint (str) - first 16 hex chars of sha256(text+rule_version)
|
|
22
|
+
rule_version (str) - "rule_version" value of the first loaded rule, or "unknown" when absent
|
|
23
|
+
trace (list[str]) - human-readable decision log
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
import os
|
|
27
|
+
|
|
28
|
+
from .engine import tokenize, match, score, resolve, aggregate, build_result
|
|
29
|
+
from .loader import load_rules
|
|
30
|
+
|
|
31
|
+
__version__ = "0.1.0"
|
|
32
|
+
__all__ = ["scan"]
|
|
33
|
+
|
|
34
|
+
_RULES_DIR = os.path.join(os.path.dirname(__file__), "rules")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def scan(text, sectors=None, mode="flag", rules_dir=_RULES_DIR):
    """Run the full detection pipeline on *text*.

    Args:
        text: Input text to analyse.
        sectors: Sector rule sets to load in addition to the universal
            rules. Accepts ``None``, a single sector name, or any iterable
            of sector names (list, tuple, set, ...).
        mode: Forwarded to ``build_result``, which uses it to pick the
            action and to decide whether ``redacted_text`` is rewritten.
        rules_dir: Directory holding the JSON rule files.

    Returns:
        The dict assembled by ``build_result``, with its ``trace`` entry set
        to the step-by-step log recorded during this call.
    """
    # Normalise *sectors* to a list up front: the trace line below
    # concatenates it with a list (TypeError for tuples/sets), and a bare
    # string would otherwise be iterated one character at a time.
    if not sectors:
        sectors = []
    elif isinstance(sectors, str):
        sectors = [sectors]
    else:
        sectors = list(sectors)
    trace = []

    rules = load_rules(sectors, rules_dir)
    trace.append(f"loaded {len(rules)} rules for sectors={['universal'] + sectors}")

    # All span offsets from here on index into the normalised text, which is
    # also the text handed to build_result for redaction and fingerprinting.
    cleaned = tokenize(text)
    trace.append("tokenized input")

    spans = match(cleaned, rules)
    trace.append(f"matched {len(spans)} raw spans")

    # Copy each span with its context-adjusted confidence attached.
    scored = [dict(span, confidence=score(span, cleaned)) for span in spans]
    clean_spans = resolve(scored)
    trace.append(f"resolved to {len(clean_spans)} non-overlapping spans")

    risk = aggregate(clean_spans)
    trace.append(f"risk_score={risk:.3f}")

    # The version label comes from the first loaded rule; rules that carry no
    # "rule_version" key (or an empty rule set) yield "unknown".
    rule_version = rules[0].get("rule_version", "unknown") if rules else "unknown"
    result = build_result(cleaned, clean_spans, risk, mode, rule_version)
    result["trace"] = trace
    return result
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pure-function detection engine. No side effects. No I/O. stdlib only.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import re
|
|
7
|
+
import unicodedata
|
|
8
|
+
from math import log1p
|
|
9
|
+
|
|
10
|
+
_ZERO_WIDTH = re.compile(r"[\u200b-\u200f\u202a-\u202e\ufeff\u00ad]")
|
|
11
|
+
|
|
12
|
+
_SEVERITY_BASE = {
|
|
13
|
+
"critical": 1.0,
|
|
14
|
+
"high": 0.8,
|
|
15
|
+
"medium": 0.5,
|
|
16
|
+
"low": 0.3,
|
|
17
|
+
}
|
|
18
|
+
_WINDOW = 30
|
|
19
|
+
_BOOST = 0.15
|
|
20
|
+
_SUPPRESS = 0.25
|
|
21
|
+
_LOG_SCALE = 0.05
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def tokenize(text):
    """Return *text* in NFKC form with every ``_ZERO_WIDTH`` match removed."""
    return _ZERO_WIDTH.sub("", unicodedata.normalize("NFKC", text))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def match(text, rules):
    """Collect one span dict per regex hit of every compiled rule in *rules*.

    Rules without a ``compiled`` pattern are skipped. Spans are returned in
    rule order, then in match order within each rule.
    """
    return [
        {
            "start": hit.start(),
            "end": hit.end(),
            "text": hit.group(),
            "rule_id": rule["id"],
            "sector": rule.get("sector", "universal"),
            "severity": rule.get("severity", "medium"),
            "context": rule.get("context", {}),
        }
        for rule in rules
        if rule.get("compiled") is not None
        for hit in rule["compiled"].finditer(text)
    ]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def score(span, text):
    """Compute the confidence (0.0-1.0, rounded to 4 places) of *span* in *text*.

    The severity's base value is raised by ``_BOOST`` when any boost word
    occurs in the lower-cased text within ``_WINDOW`` characters of the span
    (the span itself included), then lowered by ``_SUPPRESS`` when any
    suppress word occurs there.
    """
    confidence = _SEVERITY_BASE.get(span.get("severity", "medium"), 0.5)

    lo = max(0, span["start"] - _WINDOW)
    hi = min(len(text), span["end"] + _WINDOW)
    nearby = text[lo:hi].lower()

    context = span.get("context", {})
    boosted = any(word in nearby for word in context.get("boost", []))
    suppressed = any(word in nearby for word in context.get("suppress", []))

    if boosted:
        confidence = min(1.0, confidence + _BOOST)
    if suppressed:
        confidence = max(0.0, confidence - _SUPPRESS)
    return round(confidence, 4)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def resolve(spans):
    """Remove overlapping spans, keeping the highest-confidence one per clash.

    Spans are considered from highest to lowest confidence (ties: earlier
    start, then ``rule_id``) and a span is kept only if it does not overlap
    one already kept. Spans that merely touch (one ends where the next
    starts) are not treated as overlapping.

    Args:
        spans: Span dicts with ``start``, ``end`` and ``rule_id`` keys and an
            optional ``confidence`` (treated as 0 when missing).

    Returns:
        A new list of the surviving spans ordered by start offset, so callers
        can walk it back-to-front when rewriting text.
    """
    # Rank by confidence first. Ranking by start offset (the previous
    # behaviour) let an early low-confidence span evict a later-starting,
    # higher-confidence span that overlapped it.
    by_priority = sorted(
        spans,
        key=lambda s: (-s.get("confidence", 0), s["start"], s["rule_id"]),
    )
    kept = []
    for span in by_priority:
        if all(
            span["end"] <= other["start"] or span["start"] >= other["end"]
            for other in kept
        ):
            kept.append(span)
    kept.sort(key=lambda s: (s["start"], s["end"]))
    return kept
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def aggregate(spans):
    """Fold resolved *spans* into one risk value in [0.0, 1.0].

    No spans means 0.0. Otherwise the highest span confidence is scaled up
    by a logarithmic term in the number of spans, capped at 1.0 and rounded
    to 4 decimal places.
    """
    if not spans:
        return 0.0
    peak = max(item.get("confidence", 0.0) for item in spans)
    volume_factor = 1 + _LOG_SCALE * log1p(len(spans))
    capped = min(1.0, peak * volume_factor)
    return round(capped, 4)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def build_result(text, spans, risk, mode, rule_version):
    """Build the result dict returned to callers of the pipeline.

    ``findings`` are copies of *spans* without the ``compiled`` and
    ``context`` keys. ``redacted_text`` equals *text* unless *mode* is
    ``"redact"``, in which case every span is replaced by a
    ``[REDACTED:<rule_id>]`` placeholder. ``fingerprint`` is the first 16 hex
    digits of the SHA-256 of *text* followed by *rule_version*. ``trace`` is
    returned empty.
    """
    hidden_keys = ("compiled", "context")
    findings = []
    for span in spans:
        findings.append(
            {key: value for key, value in span.items() if key not in hidden_keys}
        )

    redacted = text
    if mode == "redact":
        # Walk the spans back-to-front so earlier offsets stay valid while
        # the string changes length.
        for span in reversed(spans):
            head = redacted[: span["start"]]
            tail = redacted[span["end"]:]
            redacted = head + f"[REDACTED:{span['rule_id']}]" + tail

    digest = hashlib.sha256(f"{text}{rule_version}".encode()).hexdigest()

    return {
        "safe": risk == 0.0,
        "risk_score": risk,
        "action": _resolve_action(risk, mode),
        "findings": findings,
        "redacted_text": redacted,
        "fingerprint": digest[:16],
        "rule_version": rule_version,
        "trace": [],
    }
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _resolve_action(risk, mode):
|
|
121
|
+
if risk == 0.0:
|
|
122
|
+
return "allow"
|
|
123
|
+
if mode == "block":
|
|
124
|
+
return "block"
|
|
125
|
+
if mode == "flag":
|
|
126
|
+
return "flag"
|
|
127
|
+
return "allow"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rule loader with in-process cache. stdlib only.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
_cache = {}
|
|
10
|
+
|
|
11
|
+
_SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def load_rules(sectors, rules_dir):
    """Return the compiled rule list for *sectors* plus the universal rules.

    Results are memoised in ``_cache`` under the sorted sector names and
    *rules_dir*, so a repeated request returns the same list object.
    """
    cache_key = (tuple(sorted(sectors)), rules_dir)
    try:
        return _cache[cache_key]
    except KeyError:
        pass

    # universal.json always comes first, followed by one file per sector.
    paths = [os.path.join(rules_dir, "universal.json")]
    paths.extend(os.path.join(rules_dir, f"{sector}.json") for sector in sectors)

    raw_rules = []
    for path in paths:
        raw_rules.extend(_read(path))

    _cache[cache_key] = _compile(raw_rules)
    return _cache[cache_key]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _read(path):
|
|
31
|
+
"""Load JSON rule list from *path*; return [] on any error."""
|
|
32
|
+
try:
|
|
33
|
+
with open(path, encoding="utf-8") as fh:
|
|
34
|
+
data = json.loads(fh.read())
|
|
35
|
+
return data if isinstance(data, list) else []
|
|
36
|
+
except (OSError, json.JSONDecodeError):
|
|
37
|
+
return []
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _compile(rules):
|
|
41
|
+
"""Add *compiled* regex key to each rule and sort by severity."""
|
|
42
|
+
out = []
|
|
43
|
+
for rule in rules:
|
|
44
|
+
pattern = rule.get("pattern", "")
|
|
45
|
+
if not pattern:
|
|
46
|
+
continue
|
|
47
|
+
try:
|
|
48
|
+
compiled = re.compile(pattern)
|
|
49
|
+
except re.error:
|
|
50
|
+
continue
|
|
51
|
+
entry = dict(rule)
|
|
52
|
+
entry["compiled"] = compiled
|
|
53
|
+
out.append(entry)
|
|
54
|
+
|
|
55
|
+
out.sort(key=lambda r: _SEVERITY_ORDER.get(r.get("severity", "medium"), 2))
|
|
56
|
+
return out
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "finance/iban",
|
|
4
|
+
"sector": "finance",
|
|
5
|
+
"pattern": "\\b[A-Z]{2}\\d{2}[A-Z0-9]{4}\\d{7}(?:[A-Z0-9]?){0,16}\\b",
|
|
6
|
+
"severity": "high",
|
|
7
|
+
"context": {
|
|
8
|
+
"boost": ["iban", "bank", "account", "transfer", "wire"],
|
|
9
|
+
"suppress": ["example", "test", "sample"]
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "finance/swift_bic",
|
|
14
|
+
"sector": "finance",
|
|
15
|
+
"pattern": "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b",
|
|
16
|
+
"severity": "medium",
|
|
17
|
+
"context": {
|
|
18
|
+
"boost": ["swift", "bic", "bank", "wire", "transfer", "international"],
|
|
19
|
+
"suppress": ["example", "test", "sample"]
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"id": "finance/routing_number",
|
|
24
|
+
"sector": "finance",
|
|
25
|
+
"pattern": "\\b(?:routing|aba|aba routing)[:\\s#]*([0-9]{9})\\b",
|
|
26
|
+
"severity": "high",
|
|
27
|
+
"context": {
|
|
28
|
+
"boost": ["routing", "aba", "transit", "bank", "account"],
|
|
29
|
+
"suppress": ["example", "test", "sample"]
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"id": "finance/bank_account",
|
|
34
|
+
"sector": "finance",
|
|
35
|
+
"pattern": "\\b(?:account|acct)[\\s\\-#:]*([0-9]{8,17})\\b",
|
|
36
|
+
"severity": "high",
|
|
37
|
+
"context": {
|
|
38
|
+
"boost": ["account", "bank", "checking", "savings", "deposit"],
|
|
39
|
+
"suppress": ["example", "test", "sample"]
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "finance/tax_id_ein",
|
|
44
|
+
"sector": "finance",
|
|
45
|
+
"pattern": "\\b(?:EIN|FEIN|Tax ID)[:\\s#]*([0-9]{2}-[0-9]{7})\\b",
|
|
46
|
+
"severity": "critical",
|
|
47
|
+
"context": {
|
|
48
|
+
"boost": ["ein", "fein", "tax", "employer", "federal"],
|
|
49
|
+
"suppress": ["example", "test", "sample"]
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"id": "finance/crypto_btc",
|
|
54
|
+
"sector": "finance",
|
|
55
|
+
"pattern": "\\b(?:bc1|[13])[a-zA-HJ-NP-Z0-9]{25,62}\\b",
|
|
56
|
+
"severity": "medium",
|
|
57
|
+
"context": {
|
|
58
|
+
"boost": ["bitcoin", "btc", "wallet", "crypto", "address"],
|
|
59
|
+
"suppress": ["example", "test", "sample"]
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"id": "finance/crypto_eth",
|
|
64
|
+
"sector": "finance",
|
|
65
|
+
"pattern": "\\b0x[a-fA-F0-9]{40}\\b",
|
|
66
|
+
"severity": "medium",
|
|
67
|
+
"context": {
|
|
68
|
+
"boost": ["ethereum", "eth", "wallet", "crypto", "address", "0x"],
|
|
69
|
+
"suppress": ["example", "test", "sample"]
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
]
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "healthcare/npi_number",
|
|
4
|
+
"sector": "healthcare",
|
|
5
|
+
"pattern": "\\bNPI[:\\s#]*([1-9]\\d{9})\\b",
|
|
6
|
+
"severity": "high",
|
|
7
|
+
"context": {
|
|
8
|
+
"boost": ["npi", "provider", "physician", "practitioner", "clinic"],
|
|
9
|
+
"suppress": ["example", "test", "sample"]
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "healthcare/icd10_code",
|
|
14
|
+
"sector": "healthcare",
|
|
15
|
+
"pattern": "\\b[A-TV-Z][0-9][0-9A-Z](?:\\.[0-9A-Z]{1,4})?\\b",
|
|
16
|
+
"severity": "medium",
|
|
17
|
+
"context": {
|
|
18
|
+
"boost": ["icd", "diagnosis", "code", "condition", "disease"],
|
|
19
|
+
"suppress": ["example", "test", "sample"]
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"id": "healthcare/insurance_member_id",
|
|
24
|
+
"sector": "healthcare",
|
|
25
|
+
"pattern": "\\b(?:member|subscriber|insured)[\\s\\-#:]*([A-Z0-9]{6,15})\\b",
|
|
26
|
+
"severity": "high",
|
|
27
|
+
"context": {
|
|
28
|
+
"boost": ["insurance", "member", "subscriber", "plan", "policy"],
|
|
29
|
+
"suppress": ["example", "test", "sample"]
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"id": "healthcare/medical_record_number",
|
|
34
|
+
"sector": "healthcare",
|
|
35
|
+
"pattern": "\\b(?:MRN|medical record)[:\\s#]*([A-Z0-9]{5,12})\\b",
|
|
36
|
+
"severity": "critical",
|
|
37
|
+
"context": {
|
|
38
|
+
"boost": ["mrn", "medical record", "patient", "hospital", "chart"],
|
|
39
|
+
"suppress": ["example", "test", "sample"]
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "healthcare/dea_number",
|
|
44
|
+
"sector": "healthcare",
|
|
45
|
+
"pattern": "\\b[A-Z]{2}[0-9]{7}\\b",
|
|
46
|
+
"severity": "critical",
|
|
47
|
+
"context": {
|
|
48
|
+
"boost": ["dea", "drug", "prescribe", "controlled", "substance"],
|
|
49
|
+
"suppress": ["example", "test", "sample"]
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "technology/aws_access_key",
|
|
4
|
+
"sector": "technology",
|
|
5
|
+
"pattern": "(?:A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}",
|
|
6
|
+
"severity": "critical",
|
|
7
|
+
"context": {
|
|
8
|
+
"boost": ["aws", "amazon", "access", "key", "iam"],
|
|
9
|
+
"suppress": ["example", "test", "sample", "fake"]
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "technology/openai_key",
|
|
14
|
+
"sector": "technology",
|
|
15
|
+
"pattern": "sk-[A-Za-z0-9]{20,}T3BlbkFJ[A-Za-z0-9]{20,}",
|
|
16
|
+
"severity": "critical",
|
|
17
|
+
"context": {
|
|
18
|
+
"boost": ["openai", "gpt", "api", "key", "secret"],
|
|
19
|
+
"suppress": ["example", "test", "sample"]
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"id": "technology/anthropic_key",
|
|
24
|
+
"sector": "technology",
|
|
25
|
+
"pattern": "sk-ant-[A-Za-z0-9\\-_]{40,}",
|
|
26
|
+
"severity": "critical",
|
|
27
|
+
"context": {
|
|
28
|
+
"boost": ["anthropic", "claude", "api", "key", "secret"],
|
|
29
|
+
"suppress": ["example", "test", "sample"]
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"id": "technology/github_token",
|
|
34
|
+
"sector": "technology",
|
|
35
|
+
"pattern": "gh[pousr]_[A-Za-z0-9]{36,}",
|
|
36
|
+
"severity": "critical",
|
|
37
|
+
"context": {
|
|
38
|
+
"boost": ["github", "token", "git", "repo", "personal access"],
|
|
39
|
+
"suppress": ["example", "test", "sample"]
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "technology/stripe_key",
|
|
44
|
+
"sector": "technology",
|
|
45
|
+
"pattern": "(?:sk|pk)_(?:live|test)_[A-Za-z0-9]{24,}",
|
|
46
|
+
"severity": "critical",
|
|
47
|
+
"context": {
|
|
48
|
+
"boost": ["stripe", "payment", "api", "key", "secret"],
|
|
49
|
+
"suppress": ["example", "test", "sample"]
|
|
50
|
+
}
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"id": "technology/jwt_token",
|
|
54
|
+
"sector": "technology",
|
|
55
|
+
"pattern": "eyJ[A-Za-z0-9_\\-]+\\.eyJ[A-Za-z0-9_\\-]+\\.[A-Za-z0-9_\\-]+",
|
|
56
|
+
"severity": "high",
|
|
57
|
+
"context": {
|
|
58
|
+
"boost": ["jwt", "token", "bearer", "auth", "authorization"],
|
|
59
|
+
"suppress": ["example", "test", "sample"]
|
|
60
|
+
}
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"id": "technology/private_key",
|
|
64
|
+
"sector": "technology",
|
|
65
|
+
"pattern": "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----",
|
|
66
|
+
"severity": "critical",
|
|
67
|
+
"context": {
|
|
68
|
+
"boost": ["private", "key", "pem", "cert", "rsa", "ssh"],
|
|
69
|
+
"suppress": ["example", "test", "sample"]
|
|
70
|
+
}
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"id": "technology/connection_string",
|
|
74
|
+
"sector": "technology",
|
|
75
|
+
"pattern": "(?:mongodb|postgresql|mysql|redis|amqp)(?:\\+srv)?://[^:]+:[^@]+@[^/\\s]+",
|
|
76
|
+
"severity": "critical",
|
|
77
|
+
"context": {
|
|
78
|
+
"boost": ["database", "db", "connection", "uri", "dsn"],
|
|
79
|
+
"suppress": ["example", "test", "localhost", "sample"]
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
]
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "universal/email",
|
|
4
|
+
"sector": "universal",
|
|
5
|
+
"pattern": "[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}",
|
|
6
|
+
"severity": "high",
|
|
7
|
+
"context": {
|
|
8
|
+
"boost": ["email", "contact", "reach", "send", "mail"],
|
|
9
|
+
"suppress": ["example", "test", "sample", "foo", "bar"]
|
|
10
|
+
}
|
|
11
|
+
},
|
|
12
|
+
{
|
|
13
|
+
"id": "universal/phone_us",
|
|
14
|
+
"sector": "universal",
|
|
15
|
+
"pattern": "(?:\\+1[\\s\\-]?)?(?:\\(?[2-9]\\d{2}\\)?[\\s\\-]?)[2-9]\\d{2}[\\s\\-]?\\d{4}",
|
|
16
|
+
"severity": "medium",
|
|
17
|
+
"context": {
|
|
18
|
+
"boost": ["phone", "call", "mobile", "cell", "tel", "fax"],
|
|
19
|
+
"suppress": ["example", "test", "sample"]
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"id": "universal/ssn",
|
|
24
|
+
"sector": "universal",
|
|
25
|
+
"pattern": "(?!000|666|9\\d{2})\\d{3}[\\s\\-](?!00)\\d{2}[\\s\\-](?!0000)\\d{4}",
|
|
26
|
+
"severity": "critical",
|
|
27
|
+
"context": {
|
|
28
|
+
"boost": ["ssn", "social security", "taxpayer", "government id"],
|
|
29
|
+
"suppress": ["example", "test", "sample", "fake"]
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"id": "universal/credit_card",
|
|
34
|
+
"sector": "universal",
|
|
35
|
+
"pattern": "(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13}|6(?:011|5[0-9]{2})[0-9]{12})",
|
|
36
|
+
"severity": "critical",
|
|
37
|
+
"context": {
|
|
38
|
+
"boost": ["card", "credit", "debit", "payment", "billing"],
|
|
39
|
+
"suppress": ["example", "test", "sample"]
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "universal/ip_address",
|
|
44
|
+
"sector": "universal",
|
|
45
|
+
"pattern": "(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)",
|
|
46
|
+
"severity": "low",
|
|
47
|
+
"context": {
|
|
48
|
+
"boost": ["server", "host", "ip", "address", "network", "connect"],
|
|
49
|
+
"suppress": ["example", "test", "localhost", "127"]
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
]
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datagate-llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The inference boundary layer between your data and outbound AI requests
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: llm,guardrails,pii-detection,prompt-injection,data-privacy,ai-security,data-gate,inference-boundary
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Topic :: Security
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Provides-Extra: semantic
|
|
21
|
+
Requires-Dist: onnxruntime; extra == "semantic"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# datagate-llm
|
|
25
|
+
|
|
26
|
+
[PyPI](https://pypi.org/project/datagate-llm/)
|
|
27
|
+
[Python versions](https://pypi.org/project/datagate-llm/)
|
|
28
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
29
|
+
[Tests](https://github.com/datagate-llm/datagate-llm/actions/workflows/test.yml)
|
|
30
|
+
|
|
31
|
+
**The inference boundary layer between your data and outbound AI requests.**
|
|
32
|
+
|
|
33
|
+
Scan text for sensitive data — PII, secrets, credentials, and sector-specific identifiers — before it leaves your system and reaches an LLM API.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## The Problem
|
|
38
|
+
|
|
39
|
+
In 2023, Samsung engineers accidentally leaked proprietary source code and internal meeting notes by pasting them into ChatGPT. The data was retained and potentially used for training. This is not a hypothetical risk — it is the default behavior when you send unrestricted text to an external AI model.
|
|
40
|
+
|
|
41
|
+
datagate-llm is the layer you put in front of that API call. It checks what you are about to send, tells you what it found, and lets you decide: flag it, redact it, or block it.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Install
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install datagate-llm
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Zero dependencies. Python 3.9+. Works offline.
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Quickstart
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from datagate_llm import scan
|
|
59
|
+
|
|
60
|
+
# Basic scan
|
|
61
|
+
result = scan("Contact Alice at alice@company.com or call 415-555-0192")
|
|
62
|
+
print(result["safe"]) # False
|
|
63
|
+
print(result["risk_score"]) # 0.8 (or similar)
|
|
64
|
+
print(result["findings"]) # list of matched spans
|
|
65
|
+
|
|
66
|
+
# Redact mode — replace PII before sending to an LLM
|
|
67
|
+
result = scan(
|
|
68
|
+
"My SSN is 123-45-6789 and card number 4111111111111111",
|
|
69
|
+
mode="redact"
|
|
70
|
+
)
|
|
71
|
+
print(result["redacted_text"])
|
|
72
|
+
# "My SSN is [REDACTED:universal/ssn] and card number [REDACTED:universal/credit_card]"
|
|
73
|
+
|
|
74
|
+
# Block mode — hard stop on high-risk content
|
|
75
|
+
result = scan("AKIAIOSFODNN7EXAMPLEKEY", sectors=["technology"], mode="block")
|
|
76
|
+
if result["action"] == "block":
|
|
77
|
+
raise ValueError("Refusing to send credentials to LLM")
|
|
78
|
+
|
|
79
|
+
# Multi-sector scan
|
|
80
|
+
result = scan(
|
|
81
|
+
"Patient MRN: AB12345, account 123456789012",
|
|
82
|
+
sectors=["healthcare", "finance"]
|
|
83
|
+
)
|
|
84
|
+
for finding in result["findings"]:
|
|
85
|
+
print(finding["rule_id"], finding["severity"], finding["confidence"])
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## What It Detects
|
|
91
|
+
|
|
92
|
+
| Category | Rule ID | Severity |
|
|
93
|
+
|----------|---------|----------|
|
|
94
|
+
| Email address | `universal/email` | high |
|
|
95
|
+
| US phone number | `universal/phone_us` | medium |
|
|
96
|
+
| Social Security Number | `universal/ssn` | critical |
|
|
97
|
+
| Credit card number | `universal/credit_card` | critical |
|
|
98
|
+
| IP address | `universal/ip_address` | low |
|
|
99
|
+
| AWS access key | `technology/aws_access_key` | critical |
|
|
100
|
+
| OpenAI API key | `technology/openai_key` | critical |
|
|
101
|
+
| Anthropic API key | `technology/anthropic_key` | critical |
|
|
102
|
+
| GitHub token | `technology/github_token` | critical |
|
|
103
|
+
| Stripe key | `technology/stripe_key` | critical |
|
|
104
|
+
| JWT token | `technology/jwt_token` | high |
|
|
105
|
+
| Private key (PEM) | `technology/private_key` | critical |
|
|
106
|
+
| Database connection string | `technology/connection_string` | critical |
|
|
107
|
+
| NPI number | `healthcare/npi_number` | high |
|
|
108
|
+
| ICD-10 diagnosis code | `healthcare/icd10_code` | medium |
|
|
109
|
+
| Insurance member ID | `healthcare/insurance_member_id` | high |
|
|
110
|
+
| Medical record number | `healthcare/medical_record_number` | critical |
|
|
111
|
+
| DEA number | `healthcare/dea_number` | critical |
|
|
112
|
+
| IBAN | `finance/iban` | high |
|
|
113
|
+
| SWIFT/BIC code | `finance/swift_bic` | medium |
|
|
114
|
+
| ABA routing number | `finance/routing_number` | high |
|
|
115
|
+
| Bank account number | `finance/bank_account` | high |
|
|
116
|
+
| Tax ID / EIN | `finance/tax_id_ein` | critical |
|
|
117
|
+
| Bitcoin address | `finance/crypto_btc` | medium |
|
|
118
|
+
| Ethereum address | `finance/crypto_eth` | medium |
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## How It Works
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
text input
|
|
126
|
+
│
|
|
127
|
+
▼
|
|
128
|
+
tokenize() ← NFKC normalization, zero-width char removal
|
|
129
|
+
│
|
|
130
|
+
▼
|
|
131
|
+
match() ← regex scan against compiled rule set
|
|
132
|
+
│
|
|
133
|
+
▼
|
|
134
|
+
score() ← context-aware confidence (boost / suppress words)
|
|
135
|
+
│
|
|
136
|
+
▼
|
|
137
|
+
resolve() ← remove overlapping spans, keep highest confidence
|
|
138
|
+
│
|
|
139
|
+
▼
|
|
140
|
+
aggregate() ← single risk_score in [0.0, 1.0]
|
|
141
|
+
│
|
|
142
|
+
▼
|
|
143
|
+
build_result() ← assemble final dict with action, findings, fingerprint
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Every step is a pure function. No network calls. No disk writes. No global state except the in-process rule cache.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Scan Modes
|
|
151
|
+
|
|
152
|
+
| Mode | When risk > 0 | Use case |
|
|
153
|
+
|------|---------------|----------|
|
|
154
|
+
| `flag` (default) | `action = "flag"` | Log and review before sending |
|
|
155
|
+
| `redact` | `action = "allow"`, spans replaced in `redacted_text` | Strip PII, send cleaned text |
|
|
156
|
+
| `block` | `action = "block"` | Hard stop — raise an error upstream |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Honest Limits
|
|
161
|
+
|
|
162
|
+
- **Regex-only**: datagate-llm uses deterministic pattern matching. It will not catch PII embedded in obfuscated prose, paraphrased content, or novel formats it has never seen.
|
|
163
|
+
- **US-centric**: Phone and ID patterns currently target US formats. International variants may be missed.
|
|
164
|
+
- **No semantic understanding**: "The patient's temperature was 98.6" will not be flagged as health data because there is no pattern for it. Semantic scanning requires the optional `onnxruntime` layer (not yet released).
|
|
165
|
+
- **False positives are possible**: Short patterns like SWIFT codes can match arbitrary uppercase strings. Use `context.suppress` words in your rule JSON to reduce noise.
|
|
166
|
+
- **Not a compliance tool**: Passing a scan does not mean a document is HIPAA, GDPR, or PCI-DSS compliant. Use this as one layer of defense, not the only one.
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Contributing
|
|
171
|
+
|
|
172
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). In short: add rules in JSON, add tests, open a PR.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## License
|
|
177
|
+
|
|
178
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/datagate_llm/__init__.py
|
|
5
|
+
src/datagate_llm/engine.py
|
|
6
|
+
src/datagate_llm/loader.py
|
|
7
|
+
src/datagate_llm.egg-info/PKG-INFO
|
|
8
|
+
src/datagate_llm.egg-info/SOURCES.txt
|
|
9
|
+
src/datagate_llm.egg-info/dependency_links.txt
|
|
10
|
+
src/datagate_llm.egg-info/requires.txt
|
|
11
|
+
src/datagate_llm.egg-info/top_level.txt
|
|
12
|
+
src/datagate_llm/rules/finance.json
|
|
13
|
+
src/datagate_llm/rules/healthcare.json
|
|
14
|
+
src/datagate_llm/rules/technology.json
|
|
15
|
+
src/datagate_llm/rules/universal.json
|
|
16
|
+
tests/test_engine.py
|
|
17
|
+
tests/test_loader.py
|
|
18
|
+
tests/test_scan.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datagate_llm
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Tests for pure engine functions."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from datagate_llm.engine import (
|
|
9
|
+
tokenize,
|
|
10
|
+
match,
|
|
11
|
+
score,
|
|
12
|
+
resolve,
|
|
13
|
+
aggregate,
|
|
14
|
+
build_result,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Shared fixture: a single e-mail rule dict. "compiled" is the compiled form
# of the very same expression stored under "pattern".
_EMAIL_PATTERN = r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"
_EMAIL_RULE = dict(
    id="universal/email",
    sector="universal",
    severity="high",
    pattern=_EMAIL_PATTERN,
    compiled=re.compile(_EMAIL_PATTERN),
    context=dict(boost=["email", "contact"], suppress=["example", "test"]),
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_tokenize_removes_zero_width():
    """tokenize() output must lack the zero-width space and contain "hello"."""
    cleaned = tokenize("hel\u200blo")
    assert "\u200b" not in cleaned
    assert "hello" in cleaned
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_match_finds_email():
    """match() with the e-mail rule yields exactly one span: the address itself."""
    found = match("Contact: user@example.com today", [_EMAIL_RULE])
    matched_texts = [span["text"] for span in found]
    assert len(found) == 1
    assert matched_texts[0] == "user@example.com"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_score_boosts_with_context():
    """A boost word in the surrounding text must raise the score.

    Both spans point at the same e-mail address in the same text and differ
    only in their ``context`` entry, so any score difference comes from the
    boost word list alone.
    """
    text = "send email user@example.com to team"
    # Derive offsets from the text so the span always covers the address.
    start = text.index("user@example.com")
    end = start + len("user@example.com")

    span_no_ctx = {
        "start": start, "end": end, "text": "x",
        "severity": "high", "context": {},
    }
    span_boost = {
        "start": start, "end": end, "text": "x",
        "severity": "high",
        "context": {"boost": ["email"], "suppress": []},
    }

    base = score(span_no_ctx, text)
    boosted = score(span_boost, text)
    assert boosted > base
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_resolve_removes_overlap():
    """Two overlapping spans collapse to one; the 0.8-confidence rule "a" remains."""
    stronger = {"start": 0, "end": 10, "rule_id": "a", "confidence": 0.8}
    weaker = {"start": 5, "end": 15, "rule_id": "b", "confidence": 0.6}
    kept = resolve([stronger, weaker])
    assert len(kept) == 1
    assert kept[0]["rule_id"] == "a"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def test_aggregate_empty_returns_zero():
    """With no spans at all, aggregate() must return exactly 0.0."""
    no_spans = []
    assert aggregate(no_spans) == 0.0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_aggregate_caps_at_one():
    """One hundred full-confidence spans must not push the score above 1.0."""
    certain = {"confidence": 1.0}
    total = aggregate([certain] * 100)
    assert total <= 1.0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_build_result_redact_mode():
    """Redact mode inserts a [REDACTED:<rule_id>] marker and reports "allow"."""
    finding = {
        "start": 0,
        "end": 4,
        "rule_id": "x/id",
        "confidence": 0.8,
        "severity": "high",
    }
    outcome = build_result("test text", [finding], 0.8, "redact", "v1")
    assert "[REDACTED:x/id]" in outcome["redacted_text"]
    assert outcome["action"] == "allow"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_fingerprint_is_deterministic():
    """Identical build_result() inputs give identical 16-character fingerprints."""
    first, second = (
        build_result("hello", [], 0.0, "flag", "v1")["fingerprint"]
        for _ in range(2)
    )
    assert first == second
    assert len(first) == 16
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Tests for the rule loader."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
|
6
|
+
|
|
7
|
+
import datagate_llm.loader as loader_mod
|
|
8
|
+
|
|
9
|
+
# Path to the packaged rule files, expressed relative to this tests directory.
_TESTS_DIR = os.path.dirname(__file__)
_RULES_DIR = os.path.join(_TESTS_DIR, "..", "src", "datagate_llm", "rules")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _fresh_load(sectors):
    """Empty the loader's module-level cache, then load rules for *sectors*."""
    cache = loader_mod._cache
    cache.clear()
    return loader_mod.load_rules(sectors, _RULES_DIR)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_load_universal_always_included():
    """Loading with an empty sector list still returns universal/* rules."""
    loaded = _fresh_load([])
    universal_ids = [
        rule_id
        for rule_id in (rule["id"] for rule in loaded)
        if rule_id.startswith("universal/")
    ]
    assert universal_ids
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_load_missing_sector_safe():
    """Requesting an unknown sector name must still return a list."""
    unknown = ["nonexistent_sector"]
    loaded = _fresh_load(unknown)
    assert isinstance(loaded, list)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_cache_hit_on_second_call():
    """A repeated load_rules() call must leave the very same object in the cache."""
    cache_key = (("technology",), _RULES_DIR)

    _fresh_load(["technology"])
    assert cache_key in loader_mod._cache
    cached_before = loader_mod._cache[cache_key]

    loader_mod.load_rules(["technology"], _RULES_DIR)
    cached_after = loader_mod._cache[cache_key]
    assert cached_after is cached_before
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_compile_adds_compiled_key():
    """Every loaded rule must carry a "compiled" entry."""
    lacking = [rule for rule in _fresh_load([]) if "compiled" not in rule]
    assert not lacking
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_critical_rules_sorted_first():
    """When both severities are loaded, the first critical rule precedes the first medium one."""
    order = [rule.get("severity") for rule in _fresh_load(["technology"])]
    # Only compare positions when both severities are present, as before.
    if "critical" in order and "medium" in order:
        assert order.index("critical") < order.index("medium")
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Integration tests for the public scan() API."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import os
|
|
5
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "src"))
|
|
6
|
+
|
|
7
|
+
import datagate_llm.loader as loader_mod
|
|
8
|
+
from datagate_llm import scan
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def setup_function():
    """Empty the loader's rule cache (pytest runs setup_function before each test)."""
    cache = loader_mod._cache
    cache.clear()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_email_detected():
    """Text with an e-mail address is unsafe, scores above zero, and has an email finding."""
    outcome = scan("Please contact alice@company.com for details")
    assert not outcome["safe"]
    assert outcome["risk_score"] > 0
    email_hits = [f for f in outcome["findings"] if "email" in f["rule_id"]]
    assert email_hits
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_clean_text_is_safe():
    """Harmless prose is safe, scores exactly 0.0, and is allowed."""
    verdict = scan("The weather is great today in the mountains.")
    assert verdict["safe"]
    assert verdict["risk_score"] == 0.0
    assert verdict["action"] == "allow"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def test_redact_mode_removes_pii():
    """In redact mode the address disappears and a REDACTED marker shows up."""
    outcome = scan("Email me at bob@acme.org please", mode="redact")
    cleaned = outcome["redacted_text"]
    assert "bob@acme.org" not in cleaned
    assert "REDACTED" in cleaned
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_block_mode_on_risky_input():
    """Block mode on text containing an SSN must actually detect the SSN.

    The original test only checked that ``action`` was one of
    "block"/"flag"/"allow", which holds for any of the action values used
    elsewhere in this package, so it could not fail on a missed detection.
    """
    r = scan("My SSN is 123-45-6789", mode="block")

    # The input must be recognised as risky, via an SSN finding.
    assert not r["safe"]
    assert r["risk_score"] > 0
    assert any("ssn" in f["rule_id"] for f in r["findings"])

    # NOTE(review): the README says block mode yields "block" whenever
    # risk > 0; tighten this to `== "block"` once confirmed against the engine.
    assert r["action"] in ("block", "flag", "allow")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_fingerprint_stable():
    """The same text gives the same fingerprint before and after a cache reset."""
    sample = "user@example.com"
    before = scan(sample)
    loader_mod._cache.clear()
    after = scan(sample)
    assert before["fingerprint"] == after["fingerprint"]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def test_technology_sector_detects_api_key():
    """With the technology sector on, an AKIA-prefixed key gives an "aws" finding."""
    outcome = scan("Key: AKIAIOSFODNN7EXAMPLE", sectors=["technology"])
    aws_hits = [f for f in outcome["findings"] if "aws" in f["rule_id"]]
    assert aws_hits
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_healthcare_sector_detects_npi():
    """With the healthcare sector on, a labelled NPI number gives an "npi" finding."""
    outcome = scan("Provider NPI: 1234567890", sectors=["healthcare"])
    npi_hits = [f for f in outcome["findings"] if "npi" in f["rule_id"]]
    assert npi_hits
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_finance_sector_detects_iban():
    """With the finance sector on, an IBAN in the text gives an "iban" finding."""
    outcome = scan("Wire to GB29NWBK60161331926819", sectors=["finance"])
    iban_hits = [f for f in outcome["findings"] if "iban" in f["rule_id"]]
    assert iban_hits
|