redsafe 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- redsafe-0.1.0/PKG-INFO +148 -0
- redsafe-0.1.0/README.md +134 -0
- redsafe-0.1.0/core/__init__.py +0 -0
- redsafe-0.1.0/core/data_models.py +64 -0
- redsafe-0.1.0/core/redaction_pipeline.py +169 -0
- redsafe-0.1.0/detection/__init__.py +0 -0
- redsafe-0.1.0/detection/context_detection.py +31 -0
- redsafe-0.1.0/detection/entity_detection.py +72 -0
- redsafe-0.1.0/detection/entropy_detection.py +24 -0
- redsafe-0.1.0/detection/secret_detection.py +110 -0
- redsafe-0.1.0/main.py +56 -0
- redsafe-0.1.0/parsers/__init__.py +0 -0
- redsafe-0.1.0/parsers/burp_xml_parser.py +43 -0
- redsafe-0.1.0/parsers/http_parser.py +124 -0
- redsafe-0.1.0/parsers/image_parser.py +41 -0
- redsafe-0.1.0/parsers/log_parser.py +19 -0
- redsafe-0.1.0/parsers/pcap_parser.py +39 -0
- redsafe-0.1.0/pyproject.toml +28 -0
- redsafe-0.1.0/redaction/__init__.py +0 -0
- redsafe-0.1.0/redaction/placeholder_mapper.py +26 -0
- redsafe-0.1.0/redaction/redact_image.py +47 -0
- redsafe-0.1.0/redaction/redact_text.py +31 -0
- redsafe-0.1.0/redsafe.egg-info/PKG-INFO +148 -0
- redsafe-0.1.0/redsafe.egg-info/SOURCES.txt +34 -0
- redsafe-0.1.0/redsafe.egg-info/dependency_links.txt +1 -0
- redsafe-0.1.0/redsafe.egg-info/entry_points.txt +2 -0
- redsafe-0.1.0/redsafe.egg-info/requires.txt +6 -0
- redsafe-0.1.0/redsafe.egg-info/top_level.txt +6 -0
- redsafe-0.1.0/setup.cfg +4 -0
- redsafe-0.1.0/tests/test_detection.py +46 -0
- redsafe-0.1.0/tests/test_parsers.py +67 -0
- redsafe-0.1.0/tests/test_pipeline.py +40 -0
- redsafe-0.1.0/tests/test_redaction.py +34 -0
- redsafe-0.1.0/utils/__init__.py +0 -0
- redsafe-0.1.0/utils/encoding_utils.py +12 -0
- redsafe-0.1.0/utils/file_utils.py +17 -0
redsafe-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: redsafe
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local AI-safe redaction engine for security data
|
|
5
|
+
Author: AI Safe Redaction Engine
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: spacy
|
|
9
|
+
Requires-Dist: pytesseract
|
|
10
|
+
Requires-Dist: opencv-python
|
|
11
|
+
Requires-Dist: lxml
|
|
12
|
+
Requires-Dist: numpy
|
|
13
|
+
Requires-Dist: scikit-learn
|
|
14
|
+
|
|
15
|
+
# AI Safe Redaction Engine
|
|
16
|
+
|
|
17
|
+
A local-first, modular redaction system for security data before AI processing.
|
|
18
|
+
|
|
19
|
+
## Single-Word CLI Name
|
|
20
|
+
This project is packaged for `pipx` as:
|
|
21
|
+
- Command: `redsafe`
|
|
22
|
+
- Package name: `redsafe`
|
|
23
|
+
|
|
24
|
+
Other good single-word alternatives if you want to rename later:
|
|
25
|
+
- `safescrub`
|
|
26
|
+
- `cloaknet`
|
|
27
|
+
- `privashield`
|
|
28
|
+
|
|
29
|
+
## Supported Inputs
|
|
30
|
+
- Burp Suite XML exports
|
|
31
|
+
- Raw HTTP requests/responses
|
|
32
|
+
- Network logs
|
|
33
|
+
- Basic PCAP parsing
|
|
34
|
+
- Screenshots (OCR + masking)
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
- Analysis-based sensitive data detection (not regex-only)
|
|
38
|
+
- Named Entity Recognition (spaCy)
|
|
39
|
+
- Secret detection with heuristics and entropy scoring
|
|
40
|
+
- Context-aware detection from header/parameter names
|
|
41
|
+
- Consistent placeholder mapping (`<EMAIL_1>`, `<JWT_TOKEN_1>`, etc.)
|
|
42
|
+
- Local-only processing, no external API calls
|
|
43
|
+
|
|
44
|
+
## Project Structure
|
|
45
|
+
- `parsers/`: input ingestion modules
|
|
46
|
+
- `detection/`: entity, secret, entropy, and context detectors
|
|
47
|
+
- `redaction/`: placeholder and redaction logic
|
|
48
|
+
- `core/`: data models and orchestration pipeline
|
|
49
|
+
- `utils/`: file/encoding helpers
|
|
50
|
+
- `tests/`: sample files + pytest coverage
|
|
51
|
+
|
|
52
|
+
## Install With pipx (Recommended)
|
|
53
|
+
```bash
|
|
54
|
+
cd /path/to/ai-redaction-engine
|
|
55
|
+
pipx install .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Run:
|
|
59
|
+
```bash
|
|
60
|
+
redsafe --input tests/sample_burp.xml --type burp
|
|
61
|
+
redsafe --input tests/sample_http.txt --type http
|
|
62
|
+
redsafe --input tests/sample_log.txt --type log
|
|
63
|
+
redsafe --input tests/sample_image.png --type image
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
To reinstall after local code changes:
|
|
67
|
+
```bash
|
|
68
|
+
pipx reinstall redsafe
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Install With venv (Alternative)
|
|
72
|
+
```bash
|
|
73
|
+
cd /path/to/ai-redaction-engine
|
|
74
|
+
python -m venv .venv
|
|
75
|
+
source .venv/bin/activate
|
|
76
|
+
pip install -r requirements.txt
|
|
77
|
+
python -m spacy download en_core_web_sm
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Run examples:
|
|
81
|
+
```bash
|
|
82
|
+
python main.py --input tests/sample_burp.xml --type burp
|
|
83
|
+
python main.py --input tests/sample_http.txt --type http
|
|
84
|
+
python main.py --input tests/sample_log.txt --type log
|
|
85
|
+
python main.py --input tests/sample_image.png --type image
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Outputs are written to `sanitized_output/`.
|
|
89
|
+
|
|
90
|
+
## Redaction Tuning (False Positive Control)
|
|
91
|
+
Secret entropy detection can be tuned via environment variables:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
export REDACTION_ENTROPY_THRESHOLD=4.2
|
|
95
|
+
export REDACTION_MIN_SECRET_LEN=24
|
|
96
|
+
export REDACTION_MIN_BASE64_LEN=28
|
|
97
|
+
export REDACTION_IGNORE_VALUES="application/x-www-form-urlencoded,text/plain"
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
These values are consumed by `SecretDetectionConfig` in `detection/secret_detection.py`.
|
|
101
|
+
|
|
102
|
+
## Run Tests
|
|
103
|
+
```bash
|
|
104
|
+
pytest -q
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Example: Burp XML Sanitization
|
|
108
|
+
Input Burp request header:
|
|
109
|
+
```text
|
|
110
|
+
Authorization: Bearer eyJhbGciOiJIUzI1Ni.eyJzdWIiOiIxMjM0NTYifQ.signature123456789
|
|
111
|
+
Cookie: sessionid=abcDEF1234567890
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Sanitized output:
|
|
115
|
+
```text
|
|
116
|
+
Authorization: Bearer <JWT_TOKEN_1>
|
|
117
|
+
Cookie: sessionid=<SESSION_COOKIE_1>
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Example: HTTP Sanitization
|
|
121
|
+
Input:
|
|
122
|
+
```text
|
|
123
|
+
POST /login HTTP/1.1
|
|
124
|
+
Host: app.local
|
|
125
|
+
Content-Type: application/x-www-form-urlencoded
|
|
126
|
+
|
|
127
|
+
email=john@example.com&password=SuperSecret123
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Sanitized output:
|
|
131
|
+
```text
|
|
132
|
+
POST /login HTTP/1.1
|
|
133
|
+
Host: app.local
|
|
134
|
+
Content-Type: application/x-www-form-urlencoded
|
|
135
|
+
|
|
136
|
+
email=<EMAIL_1>&password=<PASSWORD_1>
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Example: Image Sanitization
|
|
140
|
+
- OCR extracts text from screenshot.
|
|
141
|
+
- Detection engine flags sensitive values.
|
|
142
|
+
- Matching OCR boxes are masked in output image.
|
|
143
|
+
|
|
144
|
+
## Notes
|
|
145
|
+
- Designed for integration into a future AI pentesting engine.
|
|
146
|
+
- All processing is local.
|
|
147
|
+
- If `en_core_web_sm` is unavailable, regex/heuristic detection still works.
|
|
148
|
+
- Image redaction needs local `opencv-python`, `pytesseract`, and system `tesseract` binary installed.
|
redsafe-0.1.0/README.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# AI Safe Redaction Engine
|
|
2
|
+
|
|
3
|
+
A local-first, modular redaction system for security data before AI processing.
|
|
4
|
+
|
|
5
|
+
## Single-Word CLI Name
|
|
6
|
+
This project is packaged for `pipx` as:
|
|
7
|
+
- Command: `redsafe`
|
|
8
|
+
- Package name: `redsafe`
|
|
9
|
+
|
|
10
|
+
Other good single-word alternatives if you want to rename later:
|
|
11
|
+
- `safescrub`
|
|
12
|
+
- `cloaknet`
|
|
13
|
+
- `privashield`
|
|
14
|
+
|
|
15
|
+
## Supported Inputs
|
|
16
|
+
- Burp Suite XML exports
|
|
17
|
+
- Raw HTTP requests/responses
|
|
18
|
+
- Network logs
|
|
19
|
+
- Basic PCAP parsing
|
|
20
|
+
- Screenshots (OCR + masking)
|
|
21
|
+
|
|
22
|
+
## Features
|
|
23
|
+
- Analysis-based sensitive data detection (not regex-only)
|
|
24
|
+
- Named Entity Recognition (spaCy)
|
|
25
|
+
- Secret detection with heuristics and entropy scoring
|
|
26
|
+
- Context-aware detection from header/parameter names
|
|
27
|
+
- Consistent placeholder mapping (`<EMAIL_1>`, `<JWT_TOKEN_1>`, etc.)
|
|
28
|
+
- Local-only processing, no external API calls
|
|
29
|
+
|
|
30
|
+
## Project Structure
|
|
31
|
+
- `parsers/`: input ingestion modules
|
|
32
|
+
- `detection/`: entity, secret, entropy, and context detectors
|
|
33
|
+
- `redaction/`: placeholder and redaction logic
|
|
34
|
+
- `core/`: data models and orchestration pipeline
|
|
35
|
+
- `utils/`: file/encoding helpers
|
|
36
|
+
- `tests/`: sample files + pytest coverage
|
|
37
|
+
|
|
38
|
+
## Install With pipx (Recommended)
|
|
39
|
+
```bash
|
|
40
|
+
cd /path/to/ai-redaction-engine
|
|
41
|
+
pipx install .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Run:
|
|
45
|
+
```bash
|
|
46
|
+
redsafe --input tests/sample_burp.xml --type burp
|
|
47
|
+
redsafe --input tests/sample_http.txt --type http
|
|
48
|
+
redsafe --input tests/sample_log.txt --type log
|
|
49
|
+
redsafe --input tests/sample_image.png --type image
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To reinstall after local code changes:
|
|
53
|
+
```bash
|
|
54
|
+
pipx reinstall redsafe
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Install With venv (Alternative)
|
|
58
|
+
```bash
|
|
59
|
+
cd /path/to/ai-redaction-engine
|
|
60
|
+
python -m venv .venv
|
|
61
|
+
source .venv/bin/activate
|
|
62
|
+
pip install -r requirements.txt
|
|
63
|
+
python -m spacy download en_core_web_sm
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Run examples:
|
|
67
|
+
```bash
|
|
68
|
+
python main.py --input tests/sample_burp.xml --type burp
|
|
69
|
+
python main.py --input tests/sample_http.txt --type http
|
|
70
|
+
python main.py --input tests/sample_log.txt --type log
|
|
71
|
+
python main.py --input tests/sample_image.png --type image
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Outputs are written to `sanitized_output/`.
|
|
75
|
+
|
|
76
|
+
## Redaction Tuning (False Positive Control)
|
|
77
|
+
Secret entropy detection can be tuned via environment variables:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
export REDACTION_ENTROPY_THRESHOLD=4.2
|
|
81
|
+
export REDACTION_MIN_SECRET_LEN=24
|
|
82
|
+
export REDACTION_MIN_BASE64_LEN=28
|
|
83
|
+
export REDACTION_IGNORE_VALUES="application/x-www-form-urlencoded,text/plain"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
These values are consumed by `SecretDetectionConfig` in `detection/secret_detection.py`.
|
|
87
|
+
|
|
88
|
+
## Run Tests
|
|
89
|
+
```bash
|
|
90
|
+
pytest -q
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Example: Burp XML Sanitization
|
|
94
|
+
Input Burp request header:
|
|
95
|
+
```text
|
|
96
|
+
Authorization: Bearer eyJhbGciOiJIUzI1Ni.eyJzdWIiOiIxMjM0NTYifQ.signature123456789
|
|
97
|
+
Cookie: sessionid=abcDEF1234567890
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Sanitized output:
|
|
101
|
+
```text
|
|
102
|
+
Authorization: Bearer <JWT_TOKEN_1>
|
|
103
|
+
Cookie: sessionid=<SESSION_COOKIE_1>
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## Example: HTTP Sanitization
|
|
107
|
+
Input:
|
|
108
|
+
```text
|
|
109
|
+
POST /login HTTP/1.1
|
|
110
|
+
Host: app.local
|
|
111
|
+
Content-Type: application/x-www-form-urlencoded
|
|
112
|
+
|
|
113
|
+
email=john@example.com&password=SuperSecret123
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Sanitized output:
|
|
117
|
+
```text
|
|
118
|
+
POST /login HTTP/1.1
|
|
119
|
+
Host: app.local
|
|
120
|
+
Content-Type: application/x-www-form-urlencoded
|
|
121
|
+
|
|
122
|
+
email=<EMAIL_1>&password=<PASSWORD_1>
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Example: Image Sanitization
|
|
126
|
+
- OCR extracts text from screenshot.
|
|
127
|
+
- Detection engine flags sensitive values.
|
|
128
|
+
- Matching OCR boxes are masked in output image.
|
|
129
|
+
|
|
130
|
+
## Notes
|
|
131
|
+
- Designed for integration into a future AI pentesting engine.
|
|
132
|
+
- All processing is local.
|
|
133
|
+
- If `en_core_web_sm` is unavailable, regex/heuristic detection still works.
|
|
134
|
+
- Image redaction needs local `opencv-python`, `pytesseract`, and system `tesseract` binary installed.
|
|
File without changes
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class HTTPMessage:
    """A parsed HTTP request: start line, headers, parameters, and body."""

    method: str = ""  # e.g. "GET", "POST"; empty when no start line was parsed
    path: str = ""
    version: str = ""  # e.g. "HTTP/1.1"
    headers: Dict[str, str] = field(default_factory=dict)
    # Key/value parameters — presumably query/form data; populated by the
    # parsers (TODO confirm against parsers/http_parser.py).
    parameters: Dict[str, str] = field(default_factory=dict)
    body: str = ""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class HTTPResponse:
    """A parsed HTTP response: status line, headers, and body."""

    status_code: int = 0  # 0 when no status line was parsed
    reason: str = ""
    version: str = ""  # e.g. "HTTP/1.1"
    headers: Dict[str, str] = field(default_factory=dict)
    body: str = ""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class HTTPExchange:
    """An HTTP request paired with its (optional) response."""

    request: HTTPMessage
    response: Optional[HTTPResponse] = None
    source: str = ""  # origin of the exchange, e.g. the input file path
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class Detection:
    """A single sensitive value found in the scanned input."""

    kind: str  # redaction category, e.g. "EMAIL", "PASSWORD"
    value: str  # the literal sensitive text that was matched
    start: int = -1  # character offset in the scanned text; -1 when unknown
    end: int = -1  # exclusive end offset; -1 when unknown
    confidence: float = 1.0
    source: str = ""  # which detector produced this, e.g. "regex", "spacy"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class RedactionSummary:
    """Counts and placeholder mappings produced by one pipeline run."""

    detected_count: int = 0  # detections after de-duplication
    redacted_count: int = 0  # entries in the placeholder mapper's table
    # Mapping table from PlaceholderMapper.mapping() — presumably original
    # value -> placeholder; verify against redaction/placeholder_mapper.py.
    mappings: Dict[str, str] = field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class ImageOCRToken:
    """One OCR word plus its bounding box in the source image.

    Box units come from the OCR engine (pytesseract) — presumably pixels.
    """

    text: str
    left: int
    top: int
    width: int
    height: int
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class PipelineResult:
    """Output of a pipeline run: sanitized artifact, detections, and summary."""

    sanitized_text: Optional[str] = None  # set for text-based inputs
    sanitized_image_path: Optional[str] = None  # set for image inputs
    detections: List[Detection] = field(default_factory=list)
    summary: RedactionSummary = field(default_factory=RedactionSummary)
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, Iterable, List
|
|
5
|
+
|
|
6
|
+
from core.data_models import Detection, HTTPExchange, HTTPMessage, HTTPResponse, PipelineResult, RedactionSummary
|
|
7
|
+
from detection.context_detection import ContextDetector
|
|
8
|
+
from detection.entity_detection import EntityDetector
|
|
9
|
+
from detection.secret_detection import SecretDetector
|
|
10
|
+
from parsers.burp_xml_parser import parse_burp_xml
|
|
11
|
+
from parsers.http_parser import parse_http_file
|
|
12
|
+
from parsers.image_parser import extract_ocr_tokens
|
|
13
|
+
from parsers.log_parser import parse_network_log
|
|
14
|
+
from parsers.pcap_parser import parse_pcap_basic
|
|
15
|
+
from redaction.placeholder_mapper import PlaceholderMapper
|
|
16
|
+
from redaction.redact_image import redact_image_tokens
|
|
17
|
+
from redaction.redact_text import redact_text_content
|
|
18
|
+
from utils.file_utils import ensure_dir, read_text, write_text
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RedactionPipeline:
    """Orchestrates parse -> detect -> dedupe -> redact -> write per input type.

    One instance holds a single PlaceholderMapper, so placeholder numbering
    (and the summary's mapping table) accumulates across multiple process_*
    calls on the same instance.
    """

    def __init__(self) -> None:
        self.entity_detector = EntityDetector()
        self.secret_detector = SecretDetector()
        self.context_detector = ContextDetector()
        self.mapper = PlaceholderMapper()

    def _serialize_request(self, req: HTTPMessage) -> str:
        """Render a request as 'METHOD PATH VERSION', headers, blank line, body."""
        start = f"{req.method} {req.path} {req.version}".strip()
        lines = [start] if start else []
        lines.extend([f"{k}: {v}" for k, v in req.headers.items()])
        if req.body:
            lines.append("")
            lines.append(req.body)
        return "\n".join(lines).strip()

    def _serialize_response(self, resp: HTTPResponse) -> str:
        """Render a response as 'VERSION STATUS REASON', headers, blank line, body."""
        start = f"{resp.version} {resp.status_code} {resp.reason}".strip()
        lines = [start] if start else []
        lines.extend([f"{k}: {v}" for k, v in resp.headers.items()])
        if resp.body:
            lines.append("")
            lines.append(resp.body)
        return "\n".join(lines).strip()

    def _serialize_exchanges(self, exchanges: Iterable[HTTPExchange]) -> str:
        """Join non-empty exchanges into one blob, separated by '###' lines."""
        blocks: List[str] = []
        for ex in exchanges:
            parts = []
            if ex.request:
                req_txt = self._serialize_request(ex.request)
                if req_txt:
                    parts.append(req_txt)
            if ex.response:
                resp_txt = self._serialize_response(ex.response)
                if resp_txt:
                    parts.append(resp_txt)
            if parts:
                blocks.append("\n\n".join(parts))
        return "\n\n###\n\n".join(blocks)

    def _collect_detections_from_text(self, text: str) -> List[Detection]:
        """Run the content-based detectors (entities + secrets) over raw text."""
        detections = []
        detections.extend(self.entity_detector.detect(text))
        detections.extend(self.secret_detector.detect(text))
        return detections

    def _collect_context_from_exchanges(self, exchanges: Iterable[HTTPExchange]) -> List[Detection]:
        """Run key-name-based context detection over headers and parameters."""
        detections: List[Detection] = []
        for ex in exchanges:
            detections.extend(self.context_detector.detect_pairs(ex.request.headers, source="request_headers"))
            detections.extend(self.context_detector.detect_pairs(ex.request.parameters, source="request_params"))
            if ex.response:
                detections.extend(self.context_detector.detect_pairs(ex.response.headers, source="response_headers"))
        return detections

    def _collect_context_from_logs(self, entries: Iterable[Dict[str, str]]) -> List[Detection]:
        """Run context detection over parsed log fields, skipping the raw line."""
        detections: List[Detection] = []
        for entry in entries:
            # "raw" presumably holds the original log line — only structured
            # fields carry key names that context detection can reason about.
            pairs = {k: v for k, v in entry.items() if k != "raw"}
            detections.extend(self.context_detector.detect_pairs(pairs, source="log_fields"))
        return detections

    def _dedupe(self, detections: Iterable[Detection]) -> List[Detection]:
        """Keep the first detection per (kind, value) pair; drop empty values."""
        unique = {}
        for d in detections:
            key = (d.kind, d.value)
            if d.value and key not in unique:
                unique[key] = d
        return list(unique.values())

    def _build_summary(self, detections: List[Detection]) -> RedactionSummary:
        """Summarize counts plus the mapper's current mapping table."""
        return RedactionSummary(
            detected_count=len(detections),
            redacted_count=len(self.mapper.mapping()),
            mappings=self.mapper.mapping(),
        )

    def process_burp(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a Burp Suite XML export; writes <stem>.sanitized.txt."""
        exchanges = parse_burp_xml(input_path)
        original_text = self._serialize_exchanges(exchanges)
        detections = self._collect_detections_from_text(original_text)
        detections.extend(self._collect_context_from_exchanges(exchanges))
        detections = self._dedupe(detections)

        redacted = redact_text_content(original_text, detections, self.mapper)
        out = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.txt")
        write_text(out, redacted)

        return PipelineResult(sanitized_text=redacted, detections=detections, summary=self._build_summary(detections))

    def process_http(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize raw HTTP request/response text; writes <stem>.sanitized.txt."""
        raw = read_text(input_path)
        exchanges = parse_http_file(raw, source=input_path)
        # Fall back to the raw text when parsing yields no exchanges so
        # detection/redaction still runs on the original content.
        original_text = self._serialize_exchanges(exchanges) if exchanges else raw

        detections = self._collect_detections_from_text(original_text)
        detections.extend(self._collect_context_from_exchanges(exchanges))
        detections = self._dedupe(detections)

        redacted = redact_text_content(original_text, detections, self.mapper)
        out = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.txt")
        write_text(out, redacted)

        return PipelineResult(sanitized_text=redacted, detections=detections, summary=self._build_summary(detections))

    def process_log(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a network log file; writes <stem>.sanitized.txt."""
        raw = read_text(input_path)
        entries = parse_network_log(raw)

        detections = self._collect_detections_from_text(raw)
        detections.extend(self._collect_context_from_logs(entries))
        detections = self._dedupe(detections)

        redacted = redact_text_content(raw, detections, self.mapper)
        out = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.txt")
        write_text(out, redacted)

        return PipelineResult(sanitized_text=redacted, detections=detections, summary=self._build_summary(detections))

    def process_pcap(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize HTTP traffic extracted from a PCAP; writes <stem>.sanitized.txt."""
        exchanges = parse_pcap_basic(input_path)
        original_text = self._serialize_exchanges(exchanges)

        detections = self._collect_detections_from_text(original_text)
        detections.extend(self._collect_context_from_exchanges(exchanges))
        detections = self._dedupe(detections)

        redacted = redact_text_content(original_text, detections, self.mapper)
        out = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.txt")
        write_text(out, redacted)

        return PipelineResult(sanitized_text=redacted, detections=detections, summary=self._build_summary(detections))

    def process_image(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a screenshot via OCR + masking; writes <stem>.sanitized.png."""
        tokens = extract_ocr_tokens(input_path)
        full_text = "\n".join(t.text for t in tokens)

        detections = self._collect_detections_from_text(full_text)
        detections = self._dedupe(detections)

        out = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.png")
        redact_image_tokens(input_path, str(out), tokens, detections, self.mapper)

        return PipelineResult(
            sanitized_image_path=str(out),
            detections=detections,
            summary=self._build_summary(detections),
        )
|
|
File without changes
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from core.data_models import Detection
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Substring -> redaction kind. A key/header/parameter NAME that contains one
# of these substrings (case-insensitive) marks its VALUE as sensitive; the
# first match in insertion order wins (see ContextDetector below).
SENSITIVE_KEYS: Dict[str, str] = {
    "password": "PASSWORD",
    "passwd": "PASSWORD",
    "secret": "SECRET",
    "token": "TOKEN",
    "apikey": "API_KEY",
    "api_key": "API_KEY",
    "authorization": "AUTHORIZATION",
    "session": "SESSION",
    "cookie": "COOKIE",
    "private_key": "PRIVATE_KEY",
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ContextDetector:
    """Flags values whose key/header/parameter name suggests sensitive content."""

    def detect_pairs(self, pairs: Dict[str, str], source: str = "context") -> List[Detection]:
        """Return one Detection per pair whose name contains a sensitive marker.

        Matching is case-insensitive substring search against SENSITIVE_KEYS;
        the first matching marker (in dict order) decides the detection kind.
        Pairs with empty values are skipped.
        """
        found: List[Detection] = []
        for name, value in pairs.items():
            if not value:
                continue
            lowered = name.lower()
            kind = next(
                (label for marker, label in SENSITIVE_KEYS.items() if marker in lowered),
                None,
            )
            if kind is not None:
                found.append(Detection(kind=kind, value=value, source=source, confidence=0.95))
        return found
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from core.data_models import Detection
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import spacy # type: ignore
|
|
10
|
+
except Exception: # pragma: no cover
|
|
11
|
+
spacy = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Standard email pattern: local-part @ domain . tld (tld >= 2 letters).
EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
# Loose phone pattern (optional country/area code, ./-/space separators).
# Deliberately broad — EntityDetector additionally requires >= 7 digits
# before reporting a match, to reduce false positives.
PHONE_RE = re.compile(r"\b(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?)?\d{3}[\s.-]?\d{4}\b")
# Street-address hint: house number + name + common street-type suffix.
ADDRESS_HINT_RE = re.compile(r"\b\d{1,6}\s+[A-Za-z0-9\s]{3,}\s(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd|Drive|Dr)\b", re.IGNORECASE)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class EntityDetector:
    """Detects PII entities (emails, phones, addresses, names, orgs) in text.

    Regex detectors always run; spaCy NER augments them when the spacy
    package and one of its small English models are installed. If neither
    model loads, ``detect`` silently returns regex results only.
    """

    # spaCy entity label -> redaction kind. Hoisted to a class-level constant
    # so the mapping is built once instead of once per detected entity.
    _SPACY_LABEL_MAP = {
        "PERSON": "NAME",
        "ORG": "ORGANIZATION",
        "GPE": "ADDRESS",
        "LOC": "ADDRESS",
        "FAC": "ADDRESS",
    }

    def __init__(self) -> None:
        self._nlp = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the first available spaCy English model; degrade silently."""
        if spacy is None:
            return
        for model in ("en_core_web_sm", "en_core_web_md"):
            try:
                self._nlp = spacy.load(model)
                return
            except Exception:
                continue

    def detect(self, text: str) -> List[Detection]:
        """Return regex-based (and, when available, NER-based) detections.

        Offsets (``start``/``end``) are character positions within *text*.
        """
        detections: List[Detection] = []

        for m in EMAIL_RE.finditer(text):
            detections.append(Detection(kind="EMAIL", value=m.group(0), start=m.start(), end=m.end(), source="regex"))

        for m in PHONE_RE.finditer(text):
            # Require at least 7 digits so short numeric runs (dates, IDs)
            # matched by the loose phone pattern are not reported.
            if len(re.sub(r"\D", "", m.group(0))) >= 7:
                detections.append(Detection(kind="PHONE", value=m.group(0), start=m.start(), end=m.end(), source="regex"))

        for m in ADDRESS_HINT_RE.finditer(text):
            detections.append(Detection(kind="ADDRESS", value=m.group(0), start=m.start(), end=m.end(), source="regex"))

        if self._nlp is None:
            return detections

        doc = self._nlp(text)
        for ent in doc.ents:
            mapped = self._SPACY_LABEL_MAP.get(ent.label_)
            if mapped:
                detections.append(
                    Detection(
                        kind=mapped,
                        value=ent.text,
                        start=ent.start_char,
                        end=ent.end_char,
                        source="spacy",
                        confidence=0.9,
                    )
                )

        return detections
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from collections import Counter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EntropyDetector:
    """Shannon-entropy scoring used to spot random-looking secret strings."""

    @staticmethod
    def shannon_entropy(value: str) -> float:
        """Return the Shannon entropy of *value* in bits per character.

        An empty string scores 0.0; a string of one repeated character also
        scores 0.0, and entropy grows with character diversity.
        """
        if not value:
            return 0.0
        total = len(value)
        probabilities = (n / total for n in Counter(value).values())
        return -sum(p * math.log2(p) for p in probabilities)

    @staticmethod
    def is_high_entropy(value: str, threshold: float = 3.5, min_len: int = 16) -> bool:
        """True when *value* is at least *min_len* chars and meets *threshold*."""
        return len(value) >= min_len and EntropyDetector.shannon_entropy(value) >= threshold
|