llm-redacted 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_redacted-0.2.0/LICENSE +21 -0
- llm_redacted-0.2.0/PKG-INFO +80 -0
- llm_redacted-0.2.0/README.md +52 -0
- llm_redacted-0.2.0/llm_redacted.egg-info/PKG-INFO +80 -0
- llm_redacted-0.2.0/llm_redacted.egg-info/SOURCES.txt +33 -0
- llm_redacted-0.2.0/llm_redacted.egg-info/dependency_links.txt +1 -0
- llm_redacted-0.2.0/llm_redacted.egg-info/top_level.txt +1 -0
- llm_redacted-0.2.0/main/__init__.py +17 -0
- llm_redacted-0.2.0/main/detectors/__init__.py +20 -0
- llm_redacted-0.2.0/main/detectors/aws.py +4 -0
- llm_redacted-0.2.0/main/detectors/azure.py +5 -0
- llm_redacted-0.2.0/main/detectors/bearer.py +4 -0
- llm_redacted-0.2.0/main/detectors/connection_string.py +5 -0
- llm_redacted-0.2.0/main/detectors/credit_card.py +5 -0
- llm_redacted-0.2.0/main/detectors/discord.py +4 -0
- llm_redacted-0.2.0/main/detectors/email.py +4 -0
- llm_redacted-0.2.0/main/detectors/env.py +6 -0
- llm_redacted-0.2.0/main/detectors/gcp.py +5 -0
- llm_redacted-0.2.0/main/detectors/github.py +4 -0
- llm_redacted-0.2.0/main/detectors/ipv4.py +4 -0
- llm_redacted-0.2.0/main/detectors/jwt.py +5 -0
- llm_redacted-0.2.0/main/detectors/local_path.py +5 -0
- llm_redacted-0.2.0/main/detectors/openai.py +5 -0
- llm_redacted-0.2.0/main/detectors/pem.py +4 -0
- llm_redacted-0.2.0/main/detectors/phone.py +5 -0
- llm_redacted-0.2.0/main/detectors/slack.py +4 -0
- llm_redacted-0.2.0/main/detectors/ssn.py +5 -0
- llm_redacted-0.2.0/main/detectors/stripe.py +4 -0
- llm_redacted-0.2.0/main/detectors/twilio.py +4 -0
- llm_redacted-0.2.0/main/result.py +26 -0
- llm_redacted-0.2.0/main/sanitizer.py +117 -0
- llm_redacted-0.2.0/main/streaming.py +132 -0
- llm_redacted-0.2.0/pyproject.toml +41 -0
- llm_redacted-0.2.0/setup.cfg +4 -0
- llm_redacted-0.2.0/tests/test_sanitizer.py +21 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 redacted contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-redacted
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII.
|
|
5
|
+
Author: redacted
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/redacted/redacted
|
|
8
|
+
Project-URL: Repository, https://github.com/redacted/redacted
|
|
9
|
+
Project-URL: Issues, https://github.com/redacted/redacted/issues
|
|
10
|
+
Keywords: sanitizer,redaction,secrets,pii,llm,security,api-keys
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Security
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# redacted
|
|
30
|
+
|
|
31
|
+
Stop leaking secrets. One function call.
|
|
32
|
+
|
|
33
|
+
A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install llm-redacted
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from main import sanitize_output
|
|
45
|
+
|
|
46
|
+
text = "My API key is sk-12345678901234567890"
|
|
47
|
+
clean = sanitize_output(text)
|
|
48
|
+
print(clean)
|
|
49
|
+
# Output: My API key is [OPENAI_KEY]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
|
|
55
|
+
- **Zero Dependencies**: Pure Python stdlib.
|
|
56
|
+
- **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
|
|
57
|
+
- **Streaming Support**: Sanitize LLM streams token-by-token.
|
|
58
|
+
- **Extensible**: Add custom regex detectors easily.
|
|
59
|
+
|
|
60
|
+
## Advanced Usage
|
|
61
|
+
|
|
62
|
+
### Reversible Redaction
|
|
63
|
+
```python
|
|
64
|
+
from main import sanitize_output, restore_output
|
|
65
|
+
|
|
66
|
+
result = sanitize_output("Email admin@test.com", detailed=True)
|
|
67
|
+
print(result.cleaned) # Email [EMAIL_1]
|
|
68
|
+
|
|
69
|
+
restored = restore_output(result.cleaned, result.mapping)
|
|
70
|
+
print(restored) # Email admin@test.com
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Streaming
|
|
74
|
+
```python
|
|
75
|
+
from main import StreamingSanitizer
|
|
76
|
+
|
|
77
|
+
sanitizer = StreamingSanitizer()
|
|
78
|
+
# Feed tokens one by one; call sanitizer.flush() when the stream ends to emit any buffered text.
|
|
79
|
+
safe_token = sanitizer.feed("sk-")
|
|
80
|
+
```
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# redacted
|
|
2
|
+
|
|
3
|
+
Stop leaking secrets. One function call.
|
|
4
|
+
|
|
5
|
+
A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install llm-redacted
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Quick Start
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from main import sanitize_output
|
|
17
|
+
|
|
18
|
+
text = "My API key is sk-12345678901234567890"
|
|
19
|
+
clean = sanitize_output(text)
|
|
20
|
+
print(clean)
|
|
21
|
+
# Output: My API key is [OPENAI_KEY]
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
|
|
27
|
+
- **Zero Dependencies**: Pure Python stdlib.
|
|
28
|
+
- **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
|
|
29
|
+
- **Streaming Support**: Sanitize LLM streams token-by-token.
|
|
30
|
+
- **Extensible**: Add custom regex detectors easily.
|
|
31
|
+
|
|
32
|
+
## Advanced Usage
|
|
33
|
+
|
|
34
|
+
### Reversible Redaction
|
|
35
|
+
```python
|
|
36
|
+
from main import sanitize_output, restore_output
|
|
37
|
+
|
|
38
|
+
result = sanitize_output("Email admin@test.com", detailed=True)
|
|
39
|
+
print(result.cleaned) # Email [EMAIL_1]
|
|
40
|
+
|
|
41
|
+
restored = restore_output(result.cleaned, result.mapping)
|
|
42
|
+
print(restored) # Email admin@test.com
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Streaming
|
|
46
|
+
```python
|
|
47
|
+
from main import StreamingSanitizer
|
|
48
|
+
|
|
49
|
+
sanitizer = StreamingSanitizer()
|
|
50
|
+
# Feed tokens one by one; call sanitizer.flush() when the stream ends to emit any buffered text.
|
|
51
|
+
safe_token = sanitizer.feed("sk-")
|
|
52
|
+
```
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-redacted
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII.
|
|
5
|
+
Author: redacted
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/redacted/redacted
|
|
8
|
+
Project-URL: Repository, https://github.com/redacted/redacted
|
|
9
|
+
Project-URL: Issues, https://github.com/redacted/redacted/issues
|
|
10
|
+
Keywords: sanitizer,redaction,secrets,pii,llm,security,api-keys
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Security
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.8
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# redacted
|
|
30
|
+
|
|
31
|
+
Stop leaking secrets. One function call.
|
|
32
|
+
|
|
33
|
+
A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
|
|
34
|
+
|
|
35
|
+
## Installation
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install llm-redacted
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick Start
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from main import sanitize_output
|
|
45
|
+
|
|
46
|
+
text = "My API key is sk-12345678901234567890"
|
|
47
|
+
clean = sanitize_output(text)
|
|
48
|
+
print(clean)
|
|
49
|
+
# Output: My API key is [OPENAI_KEY]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
- **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
|
|
55
|
+
- **Zero Dependencies**: Pure Python stdlib.
|
|
56
|
+
- **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
|
|
57
|
+
- **Streaming Support**: Sanitize LLM streams token-by-token.
|
|
58
|
+
- **Extensible**: Add custom regex detectors easily.
|
|
59
|
+
|
|
60
|
+
## Advanced Usage
|
|
61
|
+
|
|
62
|
+
### Reversible Redaction
|
|
63
|
+
```python
|
|
64
|
+
from main import sanitize_output, restore_output
|
|
65
|
+
|
|
66
|
+
result = sanitize_output("Email admin@test.com", detailed=True)
|
|
67
|
+
print(result.cleaned) # Email [EMAIL_1]
|
|
68
|
+
|
|
69
|
+
restored = restore_output(result.cleaned, result.mapping)
|
|
70
|
+
print(restored) # Email admin@test.com
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Streaming
|
|
74
|
+
```python
|
|
75
|
+
from main import StreamingSanitizer
|
|
76
|
+
|
|
77
|
+
sanitizer = StreamingSanitizer()
|
|
78
|
+
# Feed tokens one by one; call sanitizer.flush() when the stream ends to emit any buffered text.
|
|
79
|
+
safe_token = sanitizer.feed("sk-")
|
|
80
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
llm_redacted.egg-info/PKG-INFO
|
|
5
|
+
llm_redacted.egg-info/SOURCES.txt
|
|
6
|
+
llm_redacted.egg-info/dependency_links.txt
|
|
7
|
+
llm_redacted.egg-info/top_level.txt
|
|
8
|
+
main/__init__.py
|
|
9
|
+
main/result.py
|
|
10
|
+
main/sanitizer.py
|
|
11
|
+
main/streaming.py
|
|
12
|
+
main/detectors/__init__.py
|
|
13
|
+
main/detectors/aws.py
|
|
14
|
+
main/detectors/azure.py
|
|
15
|
+
main/detectors/bearer.py
|
|
16
|
+
main/detectors/connection_string.py
|
|
17
|
+
main/detectors/credit_card.py
|
|
18
|
+
main/detectors/discord.py
|
|
19
|
+
main/detectors/email.py
|
|
20
|
+
main/detectors/env.py
|
|
21
|
+
main/detectors/gcp.py
|
|
22
|
+
main/detectors/github.py
|
|
23
|
+
main/detectors/ipv4.py
|
|
24
|
+
main/detectors/jwt.py
|
|
25
|
+
main/detectors/local_path.py
|
|
26
|
+
main/detectors/openai.py
|
|
27
|
+
main/detectors/pem.py
|
|
28
|
+
main/detectors/phone.py
|
|
29
|
+
main/detectors/slack.py
|
|
30
|
+
main/detectors/ssn.py
|
|
31
|
+
main/detectors/stripe.py
|
|
32
|
+
main/detectors/twilio.py
|
|
33
|
+
tests/test_sanitizer.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
main
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""redacted — A lightweight, zero-dependency output sanitizer.
|
|
2
|
+
|
|
3
|
+
Sanitizes secrets, API keys, PII, and sensitive data from text.
|
|
4
|
+
Designed for LLM output pipelines, logging, and data sharing.
|
|
5
|
+
"""
|
|
6
|
+
from main.sanitizer import sanitize_output, restore_output
|
|
7
|
+
from main.result import SanitizeResult, Finding
|
|
8
|
+
from main.streaming import StreamingSanitizer
|
|
9
|
+
|
|
10
|
+
# Package version; matches the `version` field declared in pyproject.toml.
__version__ = "0.2.0"

# Names re-exported from the submodules as the package's public API.
__all__ = [
    "sanitize_output",
    "restore_output",
    "SanitizeResult",
    "Finding",
    "StreamingSanitizer",
]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Auto-discovery of all detector modules in this package."""
|
|
2
|
+
import importlib
|
|
3
|
+
import pkgutil
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _discover_detectors():
    """Import every detector module that lives next to this file.

    Modules whose name starts with an underscore are skipped. A module
    counts as a detector only if it defines all of ``TYPE``, ``PATTERN``
    and ``PLACEHOLDER``.

    Returns:
        list of the imported detector module objects.
    """
    required_attrs = ('TYPE', 'PATTERN', 'PLACEHOLDER')
    here = os.path.dirname(__file__)
    found = []
    for _finder, name, _is_pkg in pkgutil.iter_modules([here]):
        if name.startswith('_'):
            continue
        candidate = importlib.import_module(f'.{name}', package=__package__)
        if all(hasattr(candidate, attr) for attr in required_attrs):
            found.append(candidate)
    return found
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Detector modules found by _discover_detectors(); built once, when this
# package is first imported.
ALL_DETECTORS = _discover_detectors()
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Detector: connection URLs with embedded credentials, i.e.
# scheme://user:password@host...
# Recognized schemes: postgres / postgresql, mysql, mongodb, mongodb+srv,
# redis, amqp and mssql. Whitespace, quotes and angle brackets end the match.
TYPE = "CONNECTION_STRING"
PATTERN = r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp|mssql)://[^\s'\"<>]+:[^\s'\"<>]+@[^\s'\"<>]+"
PLACEHOLDER = "[CONNECTION_STRING]"
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Detector: major credit card number formats, with optional "-" or " "
# separators between digit groups (4-4-4-3 or 4-4-4-4).
# Leading digits accepted:
#   Visa        4xxx
#   Mastercard  51xx-55xx and the 2-series range 2221-2720
#   Amex        34xx / 37xx
#   Discover    6011 / 65xx
TYPE = "CREDIT_CARD"
PATTERN = r"\b(?:4[0-9]{3}|5[1-5][0-9]{2}|2(?:22[1-9]|2[3-9][0-9]|[3-6][0-9]{2}|7[01][0-9]|720)|3[47][0-9]{2}|6(?:011|5[0-9]{2}))[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{3,4}\b"
PLACEHOLDER = "[CREDIT_CARD]"
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# Detector: environment-variable style assignments with sensitive-looking keys.
# The key must contain SECRET, PASSWORD, PASSWD, TOKEN, APIKEY / API_KEY, AUTH
# or CREDENTIAL, may be preceded by "export", and may contain digits
# (e.g. AWS_S3_SECRET_KEY, OAUTH2_TOKEN).
# The value must be at least 6 chars to avoid matching docs.
TYPE = "ENV_VAR"
PATTERN = r"(?:^|\b)(?:export\s+)?[A-Z0-9_]*(?:SECRET|PASSWORD|PASSWD|TOKEN|API_?KEY|AUTH|CREDENTIAL)[A-Z0-9_]*\s*=\s*[\"']?[^\s\"'>]{6,}[\"']?"
PLACEHOLDER = "[ENV_VAR_REDACTED]"
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Detector: local file paths (Unix and Windows).
# Windows branch: an uppercase drive letter, ":\", then backslash-separated
# segments that contain no whitespace.
# Unix branch: paths rooted at /home, /Users, /root, /etc, /var, /tmp, /opt
# or /srv, continued by word characters, ".", "/" or "-".
TYPE = "LOCAL_PATH"
PATTERN = r"(?:[A-Z]:\\(?:[^\\\s]+\\)*[^\\\s]+|/(?:home|Users|root|etc|var|tmp|opt|srv)/[\w./-]+)"
PLACEHOLDER = "[LOCAL_PATH]"
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# Detector: phone numbers.
# Branch 1 (international): "+" and a 1-3 digit country code are required to
# reduce false positives, followed by an optionally parenthesised group of
# 1-4 digits and two groups of 2-5 digits.
# Branch 2: the (XXX) XXX-XXXX format.
# In both branches each separator (whitespace, "." or "-") is optional.
TYPE = "PHONE"
PATTERN = r"(?:\+\d{1,3}[\s.-]?\(?\d{1,4}\)?[\s.-]?\d{2,5}[\s.-]?\d{2,5})|(?:\(\d{3}\)[\s.-]?\d{3}[\s.-]?\d{4})"
PLACEHOLDER = "[PHONE]"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Result types for the redacted sanitizer."""
|
|
2
|
+
from dataclasses import dataclass, field
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class Finding:
    """A single detected sensitive item.

    sanitize_output() creates one Finding per accepted match when it is
    called with detailed=True.
    """
    # TYPE of the detector that produced the match.
    detector_type: str
    # The matched text exactly as it appeared in the input.
    original: str
    # Indexed placeholder that replaced the match, e.g. "[EMAIL_1]".
    placeholder: str
    # Start offset of the match in the input text.
    start: int
    # End offset of the match in the input text (exclusive).
    end: int
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class SanitizeResult:
    """Outcome of a detailed sanitization run.

    ``cleaned`` is the sanitized text, ``findings`` the list of detected
    items and ``mapping`` a dict from placeholder to original value.
    """
    cleaned: str
    findings: list = field(default_factory=list)
    mapping: dict = field(default_factory=dict)

    def __repr__(self):
        """Return a short summary with counts only, not the stored values."""
        n_findings = len(self.findings)
        n_mapped = len(self.mapping)
        return f"SanitizeResult(findings={n_findings}, mapping_size={n_mapped})"

    def __str__(self):
        """Return the sanitized text so the result can be printed directly."""
        return self.cleaned
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Core sanitization engine for the redacted library."""
|
|
2
|
+
import re
|
|
3
|
+
from main.detectors import ALL_DETECTORS
|
|
4
|
+
from main.result import SanitizeResult, Finding
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Precompile regexes for built-in detectors, once at import time.
# Each entry is a (compiled pattern, TYPE, PLACEHOLDER) tuple. re.MULTILINE
# makes "^" in a detector pattern match at the start of every line.
_COMPILED_DETECTORS = [
    (re.compile(det.PATTERN, re.MULTILINE), det.TYPE, det.PLACEHOLDER)
    for det in ALL_DETECTORS
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def sanitize_output(text, custom_detectors=None, detailed=False):
    """Redact sensitive data found in ``text``.

    Args:
        text: Input text to sanitize.
        custom_detectors: Optional list of custom detector dicts with
            'TYPE', 'PATTERN', and 'PLACEHOLDER' keys. They are applied
            after the built-in detectors.
        detailed: If False, every match is replaced by the detector's
            static placeholder and a plain string is returned. If True,
            placeholders are numbered per placeholder name
            ([EMAIL_1], [EMAIL_2], ...) and a SanitizeResult with the
            findings and a placeholder -> original mapping is returned.

    Returns:
        str if detailed=False, SanitizeResult if detailed=True.
    """
    detectors = list(_COMPILED_DETECTORS)
    for custom in custom_detectors or ():
        detectors.append((
            re.compile(custom["PATTERN"], re.MULTILINE),
            custom["TYPE"],
            custom["PLACEHOLDER"],
        ))

    if not detailed:
        # Simple mode: detectors run one after another, each on the output
        # of the previous one.
        result = text
        for regex, _dtype, replacement in detectors:
            result = regex.sub(replacement, result)
        return result

    # Detailed mode: collect every hit of every detector on the untouched
    # input as (start, end, matched text, detector type, placeholder).
    hits = []
    for regex, dtype, base in detectors:
        hits.extend(
            (found.start(), found.end(), found.group(), dtype, base)
            for found in regex.finditer(text)
        )

    # Earliest hit first; among hits with the same start, the longest first.
    hits.sort(key=lambda hit: (hit[0], hit[0] - hit[1]))

    counters = {}
    findings = []
    mapping = {}
    pieces = []
    cursor = 0  # end of the last accepted hit == next unread input offset
    for start, end, original, dtype, base in hits:
        if start < cursor:
            # Overlaps a hit that was already accepted; drop it.
            continue

        # Strip brackets, add a per-name running index, re-bracket.
        label = base.strip('[]')
        number = counters.get(label, 0) + 1
        counters[label] = number
        token = f"[{label}_{number}]"

        findings.append(Finding(
            detector_type=dtype,
            original=original,
            placeholder=token,
            start=start,
            end=end,
        ))
        mapping[token] = original

        pieces.append(text[cursor:start])
        pieces.append(token)
        cursor = end

    pieces.append(text[cursor:])

    return SanitizeResult(
        cleaned="".join(pieces),
        findings=findings,
        mapping=mapping,
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def restore_output(cleaned_text, mapping):
    """Restore original values from a sanitized text using a mapping.

    All placeholders are substituted in a single left-to-right pass, so
    text inserted for one placeholder is never scanned again. An original
    value that happens to look like another placeholder is therefore
    restored verbatim instead of being expanded a second time.

    Args:
        cleaned_text: Text with indexed placeholders (e.g., [EMAIL_1]).
        mapping: Dict mapping placeholders to original values,
            as returned by SanitizeResult.mapping. Empty-string keys are
            ignored because they would match at every position.

    Returns:
        str with placeholders replaced by original values.
    """
    # Longest placeholders first, so a placeholder that is a prefix of
    # another one can never shadow it in the alternation.
    placeholders = sorted((key for key in mapping if key), key=len, reverse=True)
    if not placeholders:
        return cleaned_text

    matcher = re.compile("|".join(re.escape(key) for key in placeholders))
    # A callable replacement is inserted literally, so backslashes in the
    # original values are not interpreted as regex escapes.
    return matcher.sub(lambda hit: mapping[hit.group()], cleaned_text)
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Streaming sanitizer for real-time LLM output processing."""
|
|
2
|
+
import re
|
|
3
|
+
from main.detectors import ALL_DETECTORS
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class StreamingSanitizer:
    """Sanitizes text token-by-token for streaming LLM output.

    Incoming tokens are appended to an internal buffer of raw text. The
    newest ``_MAX_PATTERN_LEN`` characters are always held back, because
    a secret ending there may still be growing; only older text is
    sanitized and emitted. The emit boundary is never placed inside a
    detector match, so a detected secret is not split across two chunks.

    Consequences:
        * Output lags the input by up to ``_MAX_PATTERN_LEN`` characters,
          so ``flush()`` must be called when the stream ends.
        * A secret longer than ``_MAX_PATTERN_LEN`` can still be emitted
          in part before it is recognised; raise ``_MAX_PATTERN_LEN`` on
          a subclass or instance if longer secrets are expected.

    Usage:
        sanitizer = StreamingSanitizer()
        for token in llm_stream:
            safe_text = sanitizer.feed(token)
            print(safe_text, end='', flush=True)
        # Don't forget to flush at the end
        print(sanitizer.flush(), end='', flush=True)
    """

    # Number of trailing characters that are always held back. It should be
    # at least as long as the longest secret the detectors are meant to catch.
    _MAX_PATTERN_LEN = 512

    def __init__(self, custom_detectors=None):
        """Initialize the streaming sanitizer.

        Args:
            custom_detectors: Optional list of custom detector dicts
                with 'TYPE', 'PATTERN', and 'PLACEHOLDER' keys.
        """
        self._buffer = ""
        self._detectors = [
            (re.compile(det.PATTERN, re.MULTILINE), det.PLACEHOLDER)
            for det in ALL_DETECTORS
        ]
        if custom_detectors:
            for det in custom_detectors:
                self._detectors.append((
                    re.compile(det["PATTERN"], re.MULTILINE),
                    det["PLACEHOLDER"],
                ))

    def feed(self, token):
        """Feed a token into the sanitizer.

        Args:
            token: A string token from the LLM stream.

        Returns:
            A string of sanitized text that is safe to emit.
            May be empty if the sanitizer is buffering.
        """
        self._buffer += token

        # Everything before `cut` is old enough to be emitted, unless a
        # detector match straddles that position.
        cut = self._safe_cut(len(self._buffer) - self._MAX_PATTERN_LEN)
        if cut <= 0:
            return ""

        # The buffer keeps RAW text: re-scanning already sanitized text
        # would let the tail of a still-growing secret slip out after its
        # placeholder.
        ready, self._buffer = self._buffer[:cut], self._buffer[cut:]
        return self._sanitize(ready)

    def _sanitize(self, text):
        """Apply every detector to ``text``, one after another."""
        for pattern, placeholder in self._detectors:
            text = pattern.sub(placeholder, text)
        return text

    def _safe_cut(self, cut):
        """Move ``cut`` left until it no longer falls inside a detector match.

        Args:
            cut: Proposed emit boundary (offset into the buffer).

        Returns:
            The largest offset <= ``cut`` that is not strictly inside any
            match of any detector on the current buffer. Values <= 0 are
            returned unchanged.
        """
        buf = self._buffer
        moved = True
        # Moving the boundary for one detector can put it inside a match of
        # another one, so repeat until a full pass leaves it untouched.
        # `cut` only ever decreases, so the loop terminates.
        while moved and cut > 0:
            moved = False
            for pattern, _placeholder in self._detectors:
                for hit in pattern.finditer(buf):
                    if hit.start() >= cut:
                        break  # matches arrive in order; the rest lie past the cut
                    if hit.end() > cut:
                        cut = hit.start()
                        moved = True
                        break
        return cut

    def _has_partial_match(self):
        """Check whether the buffer ends with a prefix of a known key marker.

        Not used by feed(), which relies on the hold-back window instead;
        kept so that code calling this helper keeps working.
        """
        starters = (
            'sk-',           # OpenAI, Stripe
            'eyJ',           # JWT
            'AKIA', 'ASIA',  # AWS
            'gh',            # GitHub
            'xox',           # Slack
            'AC',            # Twilio
            'AIza',          # GCP
            'Bearer',        # Bearer tokens
            '-----BEGIN',    # PEM
            'AccountKey=',   # Azure
        )
        prefixes = tuple(
            starter[:size]
            for starter in starters
            for size in range(1, len(starter) + 1)
        )
        return self._buffer.endswith(prefixes)

    def flush(self):
        """Flush remaining buffer content.

        Call this when the stream ends to get any remaining text.

        Returns:
            Sanitized remaining buffer content.
        """
        if not self._buffer:
            return ""
        remaining, self._buffer = self._buffer, ""
        return self._sanitize(remaining)

    def reset(self):
        """Reset the sanitizer state, clearing the buffer."""
        self._buffer = ""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "llm-redacted"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "redacted"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["sanitizer", "redaction", "secrets", "pii", "llm", "security", "api-keys"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Security",
|
|
28
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
29
|
+
"Typing :: Typed",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/redacted/redacted"
|
|
34
|
+
Repository = "https://github.com/redacted/redacted"
|
|
35
|
+
Issues = "https://github.com/redacted/redacted/issues"
|
|
36
|
+
|
|
37
|
+
[tool.setuptools.packages.find]
|
|
38
|
+
include = ["main*"]
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from main import sanitize_output, restore_output
|
|
3
|
+
|
|
4
|
+
class TestSanitizer(unittest.TestCase):
    """Smoke tests for sanitize_output() and restore_output()."""

    def test_basic_sanitize(self):
        """Simple mode replaces an email and an API key with static placeholders."""
        sample = "My email is test@gmail.com and key is sk-12345678901234567890"
        cleaned = sanitize_output(sample)
        for expected in ("[EMAIL]", "[OPENAI_KEY]"):
            self.assertIn(expected, cleaned)

    def test_detailed_sanitize(self):
        """Detailed mode numbers its placeholders and can be reversed."""
        source = "Contact a@b.com or c@d.com"
        outcome = sanitize_output(source, detailed=True)
        for expected in ("[EMAIL_1]", "[EMAIL_2]"):
            self.assertIn(expected, outcome.cleaned)

        self.assertEqual(restore_output(outcome.cleaned, outcome.mapping), source)
|
|
19
|
+
|
|
20
|
+
# Run the tests when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|