llm-redacted 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. llm_redacted-0.2.0/LICENSE +21 -0
  2. llm_redacted-0.2.0/PKG-INFO +80 -0
  3. llm_redacted-0.2.0/README.md +52 -0
  4. llm_redacted-0.2.0/llm_redacted.egg-info/PKG-INFO +80 -0
  5. llm_redacted-0.2.0/llm_redacted.egg-info/SOURCES.txt +33 -0
  6. llm_redacted-0.2.0/llm_redacted.egg-info/dependency_links.txt +1 -0
  7. llm_redacted-0.2.0/llm_redacted.egg-info/top_level.txt +1 -0
  8. llm_redacted-0.2.0/main/__init__.py +17 -0
  9. llm_redacted-0.2.0/main/detectors/__init__.py +20 -0
  10. llm_redacted-0.2.0/main/detectors/aws.py +4 -0
  11. llm_redacted-0.2.0/main/detectors/azure.py +5 -0
  12. llm_redacted-0.2.0/main/detectors/bearer.py +4 -0
  13. llm_redacted-0.2.0/main/detectors/connection_string.py +5 -0
  14. llm_redacted-0.2.0/main/detectors/credit_card.py +5 -0
  15. llm_redacted-0.2.0/main/detectors/discord.py +4 -0
  16. llm_redacted-0.2.0/main/detectors/email.py +4 -0
  17. llm_redacted-0.2.0/main/detectors/env.py +6 -0
  18. llm_redacted-0.2.0/main/detectors/gcp.py +5 -0
  19. llm_redacted-0.2.0/main/detectors/github.py +4 -0
  20. llm_redacted-0.2.0/main/detectors/ipv4.py +4 -0
  21. llm_redacted-0.2.0/main/detectors/jwt.py +5 -0
  22. llm_redacted-0.2.0/main/detectors/local_path.py +5 -0
  23. llm_redacted-0.2.0/main/detectors/openai.py +5 -0
  24. llm_redacted-0.2.0/main/detectors/pem.py +4 -0
  25. llm_redacted-0.2.0/main/detectors/phone.py +5 -0
  26. llm_redacted-0.2.0/main/detectors/slack.py +4 -0
  27. llm_redacted-0.2.0/main/detectors/ssn.py +5 -0
  28. llm_redacted-0.2.0/main/detectors/stripe.py +4 -0
  29. llm_redacted-0.2.0/main/detectors/twilio.py +4 -0
  30. llm_redacted-0.2.0/main/result.py +26 -0
  31. llm_redacted-0.2.0/main/sanitizer.py +117 -0
  32. llm_redacted-0.2.0/main/streaming.py +132 -0
  33. llm_redacted-0.2.0/pyproject.toml +41 -0
  34. llm_redacted-0.2.0/setup.cfg +4 -0
  35. llm_redacted-0.2.0/tests/test_sanitizer.py +21 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 redacted contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-redacted
3
+ Version: 0.2.0
4
+ Summary: A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII.
5
+ Author: redacted
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/redacted/redacted
8
+ Project-URL: Repository, https://github.com/redacted/redacted
9
+ Project-URL: Issues, https://github.com/redacted/redacted/issues
10
+ Keywords: sanitizer,redaction,secrets,pii,llm,security,api-keys
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Security
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Dynamic: license-file
28
+
29
+ # redacted
30
+
31
+ Stop leaking secrets. One function call.
32
+
33
+ A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install llm-redacted
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from main import sanitize_output
45
+
46
+ text = "My API key is sk-12345678901234567890"
47
+ clean = sanitize_output(text)
48
+ print(clean)
49
+ # Output: My API key is [OPENAI_KEY]
50
+ ```
51
+
52
+ ## Features
53
+
54
+ - **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
55
+ - **Zero Dependencies**: Pure Python stdlib.
56
+ - **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
57
+ - **Streaming Support**: Sanitize LLM streams token-by-token.
58
+ - **Extensible**: Add custom regex detectors easily.
59
+
60
+ ## Advanced Usage
61
+
62
+ ### Reversible Redaction
63
+ ```python
64
+ from main import sanitize_output, restore_output
65
+
66
+ result = sanitize_output("Email admin@test.com", detailed=True)
67
+ print(result.cleaned) # Email [EMAIL_1]
68
+
69
+ restored = restore_output(result.cleaned, result.mapping)
70
+ print(restored) # Email admin@test.com
71
+ ```
72
+
73
+ ### Streaming
74
+ ```python
75
+ from main import StreamingSanitizer
76
+
77
+ sanitizer = StreamingSanitizer()
78
+ # feed tokens one by one
79
+ safe_token = sanitizer.feed("sk-")
80
+ ```
@@ -0,0 +1,52 @@
1
+ # redacted
2
+
3
+ Stop leaking secrets. One function call.
4
+
5
+ A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install llm-redacted
11
+ ```
12
+
13
+ ## Quick Start
14
+
15
+ ```python
16
+ from main import sanitize_output
17
+
18
+ text = "My API key is sk-12345678901234567890"
19
+ clean = sanitize_output(text)
20
+ print(clean)
21
+ # Output: My API key is [OPENAI_KEY]
22
+ ```
23
+
24
+ ## Features
25
+
26
+ - **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
27
+ - **Zero Dependencies**: Pure Python stdlib.
28
+ - **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
29
+ - **Streaming Support**: Sanitize LLM streams token-by-token.
30
+ - **Extensible**: Add custom regex detectors easily.
31
+
32
+ ## Advanced Usage
33
+
34
+ ### Reversible Redaction
35
+ ```python
36
+ from main import sanitize_output, restore_output
37
+
38
+ result = sanitize_output("Email admin@test.com", detailed=True)
39
+ print(result.cleaned) # Email [EMAIL_1]
40
+
41
+ restored = restore_output(result.cleaned, result.mapping)
42
+ print(restored) # Email admin@test.com
43
+ ```
44
+
45
+ ### Streaming
46
+ ```python
47
+ from main import StreamingSanitizer
48
+
49
+ sanitizer = StreamingSanitizer()
50
+ # feed tokens one by one
51
+ safe_token = sanitizer.feed("sk-")
52
+ ```
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-redacted
3
+ Version: 0.2.0
4
+ Summary: A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII.
5
+ Author: redacted
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/redacted/redacted
8
+ Project-URL: Repository, https://github.com/redacted/redacted
9
+ Project-URL: Issues, https://github.com/redacted/redacted/issues
10
+ Keywords: sanitizer,redaction,secrets,pii,llm,security,api-keys
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.8
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Security
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.8
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Dynamic: license-file
28
+
29
+ # redacted
30
+
31
+ Stop leaking secrets. One function call.
32
+
33
+ A lightweight, zero-dependency Python library that sanitizes secrets, API keys, PII, and sensitive data from text. Designed for LLM output pipelines.
34
+
35
+ ## Installation
36
+
37
+ ```bash
38
+ pip install llm-redacted
39
+ ```
40
+
41
+ ## Quick Start
42
+
43
+ ```python
44
+ from main import sanitize_output
45
+
46
+ text = "My API key is sk-12345678901234567890"
47
+ clean = sanitize_output(text)
48
+ print(clean)
49
+ # Output: My API key is [OPENAI_KEY]
50
+ ```
51
+
52
+ ## Features
53
+
54
+ - **20 Built-in Detectors**: OpenAI, AWS, GitHub, Stripe, JWT, GCP, Slack, Twilio, Azure, Discord, PEM, Email, Phone, IPv4, SSN, Credit Cards, Env Vars, Local Paths, Connection Strings, Bearer Tokens.
55
+ - **Zero Dependencies**: Pure Python stdlib.
56
+ - **Reversible Redaction**: Keep a mapping of original secrets to restore them later.
57
+ - **Streaming Support**: Sanitize LLM streams token-by-token.
58
+ - **Extensible**: Add custom regex detectors easily.
59
+
60
+ ## Advanced Usage
61
+
62
+ ### Reversible Redaction
63
+ ```python
64
+ from main import sanitize_output, restore_output
65
+
66
+ result = sanitize_output("Email admin@test.com", detailed=True)
67
+ print(result.cleaned) # Email [EMAIL_1]
68
+
69
+ restored = restore_output(result.cleaned, result.mapping)
70
+ print(restored) # Email admin@test.com
71
+ ```
72
+
73
+ ### Streaming
74
+ ```python
75
+ from main import StreamingSanitizer
76
+
77
+ sanitizer = StreamingSanitizer()
78
+ # feed tokens one by one
79
+ safe_token = sanitizer.feed("sk-")
80
+ ```
@@ -0,0 +1,33 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ llm_redacted.egg-info/PKG-INFO
5
+ llm_redacted.egg-info/SOURCES.txt
6
+ llm_redacted.egg-info/dependency_links.txt
7
+ llm_redacted.egg-info/top_level.txt
8
+ main/__init__.py
9
+ main/result.py
10
+ main/sanitizer.py
11
+ main/streaming.py
12
+ main/detectors/__init__.py
13
+ main/detectors/aws.py
14
+ main/detectors/azure.py
15
+ main/detectors/bearer.py
16
+ main/detectors/connection_string.py
17
+ main/detectors/credit_card.py
18
+ main/detectors/discord.py
19
+ main/detectors/email.py
20
+ main/detectors/env.py
21
+ main/detectors/gcp.py
22
+ main/detectors/github.py
23
+ main/detectors/ipv4.py
24
+ main/detectors/jwt.py
25
+ main/detectors/local_path.py
26
+ main/detectors/openai.py
27
+ main/detectors/pem.py
28
+ main/detectors/phone.py
29
+ main/detectors/slack.py
30
+ main/detectors/ssn.py
31
+ main/detectors/stripe.py
32
+ main/detectors/twilio.py
33
+ tests/test_sanitizer.py
@@ -0,0 +1,17 @@
1
+ """redacted — A lightweight, zero-dependency output sanitizer.
2
+
3
+ Sanitizes secrets, API keys, PII, and sensitive data from text.
4
+ Designed for LLM output pipelines, logging, and data sharing.
5
+ """
6
+ from main.sanitizer import sanitize_output, restore_output
7
+ from main.result import SanitizeResult, Finding
8
+ from main.streaming import StreamingSanitizer
9
+
10
+ __version__ = "0.2.0"
11
+ __all__ = [
12
+ "sanitize_output",
13
+ "restore_output",
14
+ "SanitizeResult",
15
+ "Finding",
16
+ "StreamingSanitizer",
17
+ ]
@@ -0,0 +1,20 @@
1
+ """Auto-discovery of all detector modules in this package."""
2
+ import importlib
3
+ import pkgutil
4
+ import os
5
+
6
+
7
def _discover_detectors():
    """Import every public module in this package and collect the detectors.

    A module counts as a detector when it defines all three of ``TYPE``,
    ``PATTERN`` and ``PLACEHOLDER``. Modules whose name starts with an
    underscore are skipped without being imported.

    Returns:
        list: The detector modules, in the order ``pkgutil.iter_modules``
        yields them.
    """
    required = ('TYPE', 'PATTERN', 'PLACEHOLDER')
    here = os.path.dirname(__file__)
    found = []
    for _finder, name, _is_pkg in pkgutil.iter_modules([here]):
        if name.startswith('_'):
            continue
        candidate = importlib.import_module(f'.{name}', package=__package__)
        if all(hasattr(candidate, attr) for attr in required):
            found.append(candidate)
    return found
18
+
19
+
20
+ ALL_DETECTORS = _discover_detectors()
@@ -0,0 +1,4 @@
1
+ # Matches AWS Access Key IDs: AKIA, ASIA (temp creds), AGPA, ANPA, A3T prefixes
2
+ TYPE = "AWS_ACCESS_KEY"
3
+ PATTERN = r"\b(?:AKIA|ASIA|AGPA|ANPA|A3T)[0-9A-Z]{16}\b"
4
+ PLACEHOLDER = "[AWS_KEY]"
@@ -0,0 +1,5 @@
1
+ # Matches Azure storage account keys (base64, 88 chars)
2
+ # and Azure SAS tokens
3
+ TYPE = "AZURE_KEY"
4
+ PATTERN = r"(?:AccountKey=)[A-Za-z0-9+/=]{80,}|(?:sig=)[A-Za-z0-9%+/=]{40,}"
5
+ PLACEHOLDER = "[AZURE_KEY]"
@@ -0,0 +1,4 @@
1
# Matches Bearer tokens in HTTP authorization headers.
# The credential is either a token followed by "=" padding of any length, or
# an unpadded token of at least 16 characters. The length floor keeps ordinary
# prose such as "a Bearer token" from being redacted.
TYPE = "BEARER_TOKEN"
PATTERN = r"(?:Bearer|Authorization:\s*Bearer)\s+(?:[A-Za-z0-9_\-.~+/]+=+|[A-Za-z0-9_\-.~+/]{16,})"
PLACEHOLDER = "[BEARER_TOKEN]"
@@ -0,0 +1,5 @@
1
+ # Matches database connection strings with embedded credentials
2
+ # postgres://, mysql://, mongodb://, redis://, etc.
3
+ TYPE = "CONNECTION_STRING"
4
+ PATTERN = r"(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp|mssql)://[^\s'\"<>]+:[^\s'\"<>]+@[^\s'\"<>]+"
5
+ PLACEHOLDER = "[CONNECTION_STRING]"
@@ -0,0 +1,5 @@
1
# Matches major credit card number formats (with optional separators)
# Visa (4xxx), Mastercard (51xx-55xx and the 2221-2720 range),
# Amex (34xx/37xx), Discover (6011/65xx)
TYPE = "CREDIT_CARD"
PATTERN = r"\b(?:4[0-9]{3}|5[1-5][0-9]{2}|2(?:22[1-9]|2[3-9][0-9]|[3-6][0-9]{2}|7[01][0-9]|720)|3[47][0-9]{2}|6(?:011|5[0-9]{2}))[- ]?[0-9]{4}[- ]?[0-9]{4}[- ]?[0-9]{3,4}\b"
PLACEHOLDER = "[CREDIT_CARD]"
@@ -0,0 +1,4 @@
1
+ # Matches Discord bot tokens and webhook URLs
2
+ TYPE = "DISCORD_TOKEN"
3
+ PATTERN = r"(?:discord(?:app)?\.com/api/webhooks/\d+/[\w-]+|[MN][A-Za-z\d]{23,}\.[\w-]{6}\.[\w-]{27,})"
4
+ PLACEHOLDER = "[DISCORD_TOKEN]"
@@ -0,0 +1,4 @@
1
# Matches email addresses, including domains with several labels
# (e.g. user@mail.example.co.uk) so no part of the domain is left behind.
TYPE = "EMAIL"
PATTERN = r"\b[a-zA-Z0-9_.+-]+@(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}\b"
PLACEHOLDER = "[EMAIL]"
@@ -0,0 +1,6 @@
1
+ # Matches env var assignments with sensitive-looking keys
2
+ # Requires key to contain SECRET, PASSWORD, TOKEN, API_KEY, etc.
3
+ # Value must be at least 6 chars to avoid matching docs
4
+ TYPE = "ENV_VAR"
5
+ PATTERN = r"(?:^|\b)(?:export\s+)?[A-Z_]*(?:SECRET|PASSWORD|PASSWD|TOKEN|API_?KEY|AUTH|CREDENTIAL)[A-Z_]*\s*=\s*[\"']?[^\s\"'>]{6,}[\"']?"
6
+ PLACEHOLDER = "[ENV_VAR_REDACTED]"
@@ -0,0 +1,5 @@
1
+ # Matches Google Cloud API keys (AIzaSy prefix, 39 chars total)
2
+ # (Google OAuth client IDs are not matched by this pattern)
3
+ TYPE = "GCP_API_KEY"
4
+ PATTERN = r"\bAIzaSy[A-Za-z0-9_-]{33}\b"
5
+ PLACEHOLDER = "[GCP_KEY]"
@@ -0,0 +1,4 @@
1
+ # Matches GitHub tokens: ghp_, gho_, ghu_, ghs_, ghr_ followed by 36+ alphanumeric chars
2
+ TYPE = "GITHUB_TOKEN"
3
+ PATTERN = r"\bgh[pousr]_[A-Za-z0-9_]{36,}\b"
4
+ PLACEHOLDER = "[GITHUB_TOKEN]"
@@ -0,0 +1,4 @@
1
+ # Matches valid IPv4 addresses: each octet 0-255
2
+ TYPE = "IPV4_ADDRESS"
3
+ PATTERN = r"\b(?:(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\.){3}(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)\b"
4
+ PLACEHOLDER = "[IP_ADDRESS]"
@@ -0,0 +1,5 @@
1
+ # Matches JWT tokens: must start with eyJ (base64 for '{"')
2
+ # Three base64url sections separated by dots
3
+ TYPE = "JWT_TOKEN"
4
+ PATTERN = r"\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b"
5
+ PLACEHOLDER = "[JWT_TOKEN]"
@@ -0,0 +1,5 @@
1
+ # Matches local file paths (Unix and Windows)
2
+ # Covers /home, /Users, /root, /etc, /var, /tmp, /opt, /srv and Windows drive paths
3
+ TYPE = "LOCAL_PATH"
4
+ PATTERN = r"(?:[A-Z]:\\(?:[^\\\s]+\\)*[^\\\s]+|/(?:home|Users|root|etc|var|tmp|opt|srv)/[\w./-]+)"
5
+ PLACEHOLDER = "[LOCAL_PATH]"
@@ -0,0 +1,5 @@
1
+ # Matches OpenAI API keys: sk-<20+ alphanumeric chars>
2
+ # Supports sk-proj- and sk-org- prefixes
3
+ TYPE = "OPENAI_API_KEY"
4
+ PATTERN = r"\bsk-(?:proj-|org-)?[A-Za-z0-9_-]{20,}\b"
5
+ PLACEHOLDER = "[OPENAI_KEY]"
@@ -0,0 +1,4 @@
1
+ # Matches PEM-encoded private keys: RSA, DSA, EC, OPENSSH, ENCRYPTED
2
+ TYPE = "PEM_KEY"
3
+ PATTERN = r"-----BEGIN[\w ]*PRIVATE KEY-----[\s\S]+?-----END[\w ]*PRIVATE KEY-----"
4
+ PLACEHOLDER = "[PEM_KEY]"
@@ -0,0 +1,5 @@
1
+ # Matches international phone numbers - requires + country code to reduce false positives
2
+ # Or matches common formats like (XXX) XXX-XXXX
3
+ TYPE = "PHONE"
4
+ PATTERN = r"(?:\+\d{1,3}[\s.-]?\(?\d{1,4}\)?[\s.-]?\d{2,5}[\s.-]?\d{2,5})|(?:\(\d{3}\)[\s.-]?\d{3}[\s.-]?\d{4})"
5
+ PLACEHOLDER = "[PHONE]"
@@ -0,0 +1,4 @@
1
# Matches Slack tokens: bot (xoxb-), user (xoxp-), workspace (xoxa-),
# legacy (xoxs-) and app-level (xapp-)
TYPE = "SLACK_TOKEN"
PATTERN = r"\b(?:xox[bpas]|xapp)-[A-Za-z0-9-]{10,}\b"
PLACEHOLDER = "[SLACK_TOKEN]"
@@ -0,0 +1,5 @@
1
+ # Matches US Social Security Numbers: XXX-XX-XXXX format
2
+ # Excludes known invalid ranges (000, 666, 900-999 in area)
3
+ TYPE = "SSN"
4
+ PATTERN = r"\b(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}\b"
5
+ PLACEHOLDER = "[SSN]"
@@ -0,0 +1,4 @@
1
+ # Matches Stripe API keys: sk_live_ or sk_test_ followed by 24+ alphanumeric chars
2
+ TYPE = "STRIPE_KEY"
3
+ PATTERN = r"\bsk_(?:live|test)_[A-Za-z0-9]{24,}\b"
4
+ PLACEHOLDER = "[STRIPE_KEY]"
@@ -0,0 +1,4 @@
1
+ # Matches Twilio Account SIDs (AC + 32 hex); bare 32-hex Auth Tokens are not matched by this pattern
2
+ TYPE = "TWILIO_KEY"
3
+ PATTERN = r"\bAC[a-f0-9]{32}\b"
4
+ PLACEHOLDER = "[TWILIO_SID]"
@@ -0,0 +1,26 @@
1
+ """Result types for the redacted sanitizer."""
2
+ from dataclasses import dataclass, field
3
+
4
+
5
@dataclass
class Finding:
    """A single detected sensitive item."""
    # TYPE string of the detector that produced the match.
    detector_type: str
    # Exact text that was matched in the input.
    original: str
    # Indexed placeholder substituted for the match (e.g. "[EMAIL_1]").
    placeholder: str
    # Offset in the input text where the match starts.
    start: int
    # Offset in the input text just past the end of the match.
    end: int
13
+
14
+
15
@dataclass
class SanitizeResult:
    """Outcome of a detailed sanitization run.

    ``str()`` of a result is the cleaned text. ``repr()`` reports only how
    many findings and mapping entries there are, not the text itself.
    """
    cleaned: str
    findings: list = field(default_factory=list)
    mapping: dict = field(default_factory=dict)

    def __str__(self):
        return self.cleaned

    def __repr__(self):
        n_findings = len(self.findings)
        n_mapped = len(self.mapping)
        return f"SanitizeResult(findings={n_findings}, mapping_size={n_mapped})"
@@ -0,0 +1,117 @@
1
+ """Core sanitization engine for the redacted library."""
2
+ import re
3
+ from main.detectors import ALL_DETECTORS
4
+ from main.result import SanitizeResult, Finding
5
+
6
+
7
+ # Precompile regexes for built-in detectors
8
+ _COMPILED_DETECTORS = [
9
+ (re.compile(det.PATTERN, re.MULTILINE), det.TYPE, det.PLACEHOLDER)
10
+ for det in ALL_DETECTORS
11
+ ]
12
+
13
+
14
def sanitize_output(text, custom_detectors=None, detailed=False):
    """Redact sensitive data from ``text``.

    Args:
        text: Input text to sanitize.
        custom_detectors: Optional list of dicts with 'TYPE', 'PATTERN' and
            'PLACEHOLDER' keys. They are applied after the built-in
            detectors.
        detailed: When False, every match is replaced by its detector's
            static placeholder and a plain string is returned. When True,
            matches get indexed placeholders (``[EMAIL_1]``, ``[EMAIL_2]``,
            ...) and a SanitizeResult with findings and a reversible
            mapping is returned.

    Returns:
        str if detailed=False, SanitizeResult if detailed=True.
    """
    detectors = _COMPILED_DETECTORS + [
        (re.compile(det["PATTERN"], re.MULTILINE), det["TYPE"], det["PLACEHOLDER"])
        for det in (custom_detectors or [])
    ]

    if not detailed:
        # Simple mode: detectors run one after another over the evolving text.
        for regex, _kind, token in detectors:
            text = regex.sub(token, text)
        return text

    # Detailed mode: gather every match of every detector on the raw text.
    # Each hit is (start, end, matched text, detector type, placeholder base).
    hits = [
        (found.start(), found.end(), found.group(), kind, token)
        for regex, kind, token in detectors
        for found in regex.finditer(text)
    ]

    # Earliest start first; among equal starts the longest match first.
    hits.sort(key=lambda hit: (hit[0], hit[0] - hit[1]))

    counters = {}
    findings = []
    mapping = {}
    pieces = []
    cursor = 0
    for start, end, original, kind, base in hits:
        if start < cursor:
            # Overlaps a match that was already accepted.
            continue

        # "[EMAIL]" -> "EMAIL" -> "[EMAIL_<n>]"
        label = base.strip('[]')
        counters[label] = counters.get(label, 0) + 1
        indexed = f"[{label}_{counters[label]}]"

        findings.append(Finding(
            detector_type=kind,
            original=original,
            placeholder=indexed,
            start=start,
            end=end,
        ))
        mapping[indexed] = original

        # Untouched text since the previous match, then the placeholder.
        pieces.append(text[cursor:start])
        pieces.append(indexed)
        cursor = end

    pieces.append(text[cursor:])

    return SanitizeResult(
        cleaned="".join(pieces),
        findings=findings,
        mapping=mapping,
    )
100
+
101
+
102
def restore_output(cleaned_text, mapping):
    """Restore original values from a sanitized text using a mapping.

    All placeholders are substituted in a single pass over ``cleaned_text``.
    Restored values are inserted literally and never re-scanned, so an
    original value that happens to contain another placeholder (or a
    backslash) comes back unchanged.

    Args:
        cleaned_text: Text with indexed placeholders (e.g., [EMAIL_1]).
        mapping: Dict mapping placeholders to original values,
            as returned by SanitizeResult.mapping. Empty-string keys are
            ignored.

    Returns:
        str with placeholders replaced by original values.
    """
    # Longest first, so when one placeholder is a prefix of another the
    # longer one wins at a given position.
    placeholders = sorted((key for key in mapping if key), key=len, reverse=True)
    if not placeholders:
        return cleaned_text

    matcher = re.compile("|".join(re.escape(key) for key in placeholders))
    # A function replacement keeps backslashes in the originals literal.
    return matcher.sub(lambda found: mapping[found.group(0)], cleaned_text)
@@ -0,0 +1,132 @@
1
+ """Streaming sanitizer for real-time LLM output processing."""
2
+ import re
3
+ from main.detectors import ALL_DETECTORS
4
+
5
+
6
class StreamingSanitizer:
    """Sanitizes text token-by-token for streaming LLM output.

    Incoming tokens are appended to an internal buffer. Text is released
    only once it can no longer be part of a match that is still growing:

    * complete lines (everything up to the last newline) are released;
    * an unfinished line is held back, except that once more than
      ``_MAX_PATTERN_LEN`` characters are pending the oldest part is
      released at a whitespace boundary, always keeping a tail of that
      size;
    * a PEM private-key block whose END marker has not arrived yet is held
      back in full, up to ``_MAX_PEM_LEN`` characters;
    * a release never cuts through a match already visible in the buffer.

    Released text is sanitized before it is returned. A match that spans a
    line break is not detected unless it is a PEM block.

    Usage:
        sanitizer = StreamingSanitizer()
        for token in llm_stream:
            safe_text = sanitizer.feed(token)
            print(safe_text, end='', flush=True)
        # The last (unfinished) line is only returned by flush()
        print(sanitizer.flush(), end='', flush=True)
    """

    # Tail that stays buffered on an over-long line so a pattern spanning
    # several tokens is still seen as a whole.
    _MAX_PATTERN_LEN = 512

    # Upper bound on how much text is held while waiting for a PEM END
    # marker, so a header without a footer cannot stall the stream forever.
    _MAX_PEM_LEN = 16384

    _PEM_BEGIN = re.compile(r"-----BEGIN[\w ]*PRIVATE KEY-----")
    _PEM_END = re.compile(r"-----END[\w ]*PRIVATE KEY-----")

    def __init__(self, custom_detectors=None):
        """Initialize the streaming sanitizer.

        Args:
            custom_detectors: Optional list of custom detector dicts
                with 'TYPE', 'PATTERN', and 'PLACEHOLDER' keys.
        """
        self._buffer = ""
        self._detectors = [
            (re.compile(det.PATTERN, re.MULTILINE), det.PLACEHOLDER)
            for det in ALL_DETECTORS
        ]
        if custom_detectors:
            for det in custom_detectors:
                self._detectors.append((
                    re.compile(det["PATTERN"], re.MULTILINE),
                    det["PLACEHOLDER"]
                ))

    def feed(self, token):
        """Feed a token into the sanitizer.

        Args:
            token: A string token from the LLM stream.

        Returns:
            A string of sanitized text that is safe to emit.
            May be empty if the sanitizer is buffering.
        """
        self._buffer += token
        cut = self._release_point()
        if cut <= 0:
            return ""
        ready = self._buffer[:cut]
        self._buffer = self._buffer[cut:]
        return self._sanitize(ready)

    def _release_point(self):
        """Return how many leading buffer characters may be released now."""
        buf = self._buffer

        # Everything up to and including the last newline is a finished line.
        cut = buf.rfind("\n") + 1

        # Over-long unfinished line: release its oldest part but keep a tail
        # of _MAX_PATTERN_LEN characters. Prefer cutting right after
        # whitespace so a word is not split in two.
        window = len(buf) - self._MAX_PATTERN_LEN
        if window > cut:
            snapped = window
            while snapped > cut and not buf[snapped - 1].isspace():
                snapped -= 1
            cut = snapped if snapped > cut else window

        # A private-key header without its END marker: hold the whole block
        # so the key body is never released before it can be matched.
        for header in self._PEM_BEGIN.finditer(buf):
            if header.start() >= cut:
                break
            if self._PEM_END.search(buf, header.end()) is None:
                if len(buf) - header.start() <= self._MAX_PEM_LEN:
                    cut = header.start()
                break

        return self._clear_of_matches(cut)

    def _clear_of_matches(self, cut):
        """Move ``cut`` back until no detector match straddles it."""
        buf = self._buffer
        if cut >= len(buf):
            # Nothing stays behind, so nothing can be split.
            return cut
        moved = True
        while moved and cut > 0:
            moved = False
            for pattern, _placeholder in self._detectors:
                for match in pattern.finditer(buf):
                    if match.start() >= cut:
                        break
                    if match.end() > cut:
                        # Keep the whole match buffered; it is released
                        # (and redacted) once the text after it is final.
                        cut = match.start()
                        moved = True
                        break
        return cut

    def _sanitize(self, text):
        """Apply every detector to ``text`` in order and return the result."""
        for pattern, placeholder in self._detectors:
            text = pattern.sub(placeholder, text)
        return text

    def flush(self):
        """Flush remaining buffer content.

        Call this when the stream ends to get any remaining text.

        Returns:
            Sanitized remaining buffer content.
        """
        pending, self._buffer = self._buffer, ""
        if not pending:
            return ""
        return self._sanitize(pending)

    def reset(self):
        """Reset the sanitizer state, clearing the buffer."""
        self._buffer = ""
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "llm-redacted"
7
+ version = "0.2.0"
8
+ description = "A lightweight, zero-dependency output sanitizer for secrets, API keys, and PII."
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ {name = "redacted"},
14
+ ]
15
+ keywords = ["sanitizer", "redaction", "secrets", "pii", "llm", "security", "api-keys"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Security",
28
+ "Topic :: Software Development :: Libraries :: Python Modules",
29
+ "Typing :: Typed",
30
+ ]
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/redacted/redacted"
34
+ Repository = "https://github.com/redacted/redacted"
35
+ Issues = "https://github.com/redacted/redacted/issues"
36
+
37
+ [tool.setuptools.packages.find]
38
+ include = ["main*"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
+ import unittest
2
+ from main import sanitize_output, restore_output
3
+
4
class TestSanitizer(unittest.TestCase):
    """Smoke tests for simple and detailed sanitization."""

    def test_basic_sanitize(self):
        """Simple mode swaps matches for their static placeholders."""
        sample = "My email is test@gmail.com and key is sk-12345678901234567890"
        cleaned = sanitize_output(sample)
        for expected in ("[EMAIL]", "[OPENAI_KEY]"):
            self.assertIn(expected, cleaned)

    def test_detailed_sanitize(self):
        """Detailed mode numbers its placeholders and can be reversed."""
        source = "Contact a@b.com or c@d.com"
        outcome = sanitize_output(source, detailed=True)
        for expected in ("[EMAIL_1]", "[EMAIL_2]"):
            self.assertIn(expected, outcome.cleaned)

        self.assertEqual(restore_output(outcome.cleaned, outcome.mapping), source)
20
+ if __name__ == '__main__':
21
+ unittest.main()