redsafe 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. redsafe-0.1.0/PKG-INFO +148 -0
  2. redsafe-0.1.0/README.md +134 -0
  3. redsafe-0.1.0/core/__init__.py +0 -0
  4. redsafe-0.1.0/core/data_models.py +64 -0
  5. redsafe-0.1.0/core/redaction_pipeline.py +169 -0
  6. redsafe-0.1.0/detection/__init__.py +0 -0
  7. redsafe-0.1.0/detection/context_detection.py +31 -0
  8. redsafe-0.1.0/detection/entity_detection.py +72 -0
  9. redsafe-0.1.0/detection/entropy_detection.py +24 -0
  10. redsafe-0.1.0/detection/secret_detection.py +110 -0
  11. redsafe-0.1.0/main.py +56 -0
  12. redsafe-0.1.0/parsers/__init__.py +0 -0
  13. redsafe-0.1.0/parsers/burp_xml_parser.py +43 -0
  14. redsafe-0.1.0/parsers/http_parser.py +124 -0
  15. redsafe-0.1.0/parsers/image_parser.py +41 -0
  16. redsafe-0.1.0/parsers/log_parser.py +19 -0
  17. redsafe-0.1.0/parsers/pcap_parser.py +39 -0
  18. redsafe-0.1.0/pyproject.toml +28 -0
  19. redsafe-0.1.0/redaction/__init__.py +0 -0
  20. redsafe-0.1.0/redaction/placeholder_mapper.py +26 -0
  21. redsafe-0.1.0/redaction/redact_image.py +47 -0
  22. redsafe-0.1.0/redaction/redact_text.py +31 -0
  23. redsafe-0.1.0/redsafe.egg-info/PKG-INFO +148 -0
  24. redsafe-0.1.0/redsafe.egg-info/SOURCES.txt +34 -0
  25. redsafe-0.1.0/redsafe.egg-info/dependency_links.txt +1 -0
  26. redsafe-0.1.0/redsafe.egg-info/entry_points.txt +2 -0
  27. redsafe-0.1.0/redsafe.egg-info/requires.txt +6 -0
  28. redsafe-0.1.0/redsafe.egg-info/top_level.txt +6 -0
  29. redsafe-0.1.0/setup.cfg +4 -0
  30. redsafe-0.1.0/tests/test_detection.py +46 -0
  31. redsafe-0.1.0/tests/test_parsers.py +67 -0
  32. redsafe-0.1.0/tests/test_pipeline.py +40 -0
  33. redsafe-0.1.0/tests/test_redaction.py +34 -0
  34. redsafe-0.1.0/utils/__init__.py +0 -0
  35. redsafe-0.1.0/utils/encoding_utils.py +12 -0
  36. redsafe-0.1.0/utils/file_utils.py +17 -0
redsafe-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,148 @@
1
+ Metadata-Version: 2.4
2
+ Name: redsafe
3
+ Version: 0.1.0
4
+ Summary: Local AI-safe redaction engine for security data
5
+ Author: AI Safe Redaction Engine
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: spacy
9
+ Requires-Dist: pytesseract
10
+ Requires-Dist: opencv-python
11
+ Requires-Dist: lxml
12
+ Requires-Dist: numpy
13
+ Requires-Dist: scikit-learn
14
+
15
+ # AI Safe Redaction Engine
16
+
17
+ A local-first, modular redaction system for security data before AI processing.
18
+
19
+ ## Single-Word CLI Name
20
+ This project is packaged for `pipx` as:
21
+ - Command: `redsafe`
22
+ - Package name: `redsafe`
23
+
24
+ Other good single-word alternatives if you want to rename later:
25
+ - `safescrub`
26
+ - `cloaknet`
27
+ - `privashield`
28
+
29
+ ## Supported Inputs
30
+ - Burp Suite XML exports
31
+ - Raw HTTP requests/responses
32
+ - Network logs
33
+ - Basic PCAP parsing
34
+ - Screenshots (OCR + masking)
35
+
36
+ ## Features
37
+ - Analysis-based sensitive data detection (not regex-only)
38
+ - Named Entity Recognition (spaCy)
39
+ - Secret detection with heuristics and entropy scoring
40
+ - Context-aware detection from header/parameter names
41
+ - Consistent placeholder mapping (`<EMAIL_1>`, `<JWT_TOKEN_1>`, etc.)
42
+ - Local-only processing, no external API calls
43
+
44
+ ## Project Structure
45
+ - `parsers/`: input ingestion modules
46
+ - `detection/`: entity, secret, entropy, and context detectors
47
+ - `redaction/`: placeholder and redaction logic
48
+ - `core/`: data models and orchestration pipeline
49
+ - `utils/`: file/encoding helpers
50
+ - `tests/`: sample files + pytest coverage
51
+
52
+ ## Install With pipx (Recommended)
53
+ ```bash
54
+ cd /path/to/ai-redaction-engine
55
+ pipx install .
56
+ ```
57
+
58
+ Run:
59
+ ```bash
60
+ redsafe --input tests/sample_burp.xml --type burp
61
+ redsafe --input tests/sample_http.txt --type http
62
+ redsafe --input tests/sample_log.txt --type log
63
+ redsafe --input tests/sample_image.png --type image
64
+ ```
65
+
66
+ To reinstall after local code changes:
67
+ ```bash
68
+ pipx reinstall redsafe
69
+ ```
70
+
71
+ ## Install With venv (Alternative)
72
+ ```bash
73
+ cd /path/to/ai-redaction-engine
74
+ python -m venv .venv
75
+ source .venv/bin/activate
76
+ pip install -r requirements.txt
77
+ python -m spacy download en_core_web_sm
78
+ ```
79
+
80
+ Run examples:
81
+ ```bash
82
+ python main.py --input tests/sample_burp.xml --type burp
83
+ python main.py --input tests/sample_http.txt --type http
84
+ python main.py --input tests/sample_log.txt --type log
85
+ python main.py --input tests/sample_image.png --type image
86
+ ```
87
+
88
+ Outputs are written to `sanitized_output/`.
89
+
90
+ ## Redaction Tuning (False Positive Control)
91
+ Secret entropy detection can be tuned via environment variables:
92
+
93
+ ```bash
94
+ export REDACTION_ENTROPY_THRESHOLD=4.2
95
+ export REDACTION_MIN_SECRET_LEN=24
96
+ export REDACTION_MIN_BASE64_LEN=28
97
+ export REDACTION_IGNORE_VALUES="application/x-www-form-urlencoded,text/plain"
98
+ ```
99
+
100
+ These values are consumed by `SecretDetectionConfig` in `detection/secret_detection.py`.
101
+
102
+ ## Run Tests
103
+ ```bash
104
+ pytest -q
105
+ ```
106
+
107
+ ## Example: Burp XML Sanitization
108
+ Input Burp request header:
109
+ ```text
110
+ Authorization: Bearer eyJhbGciOiJIUzI1Ni.eyJzdWIiOiIxMjM0NTYifQ.signature123456789
111
+ Cookie: sessionid=abcDEF1234567890
112
+ ```
113
+
114
+ Sanitized output:
115
+ ```text
116
+ Authorization: Bearer <JWT_TOKEN_1>
117
+ Cookie: sessionid=<SESSION_COOKIE_1>
118
+ ```
119
+
120
+ ## Example: HTTP Sanitization
121
+ Input:
122
+ ```text
123
+ POST /login HTTP/1.1
124
+ Host: app.local
125
+ Content-Type: application/x-www-form-urlencoded
126
+
127
+ email=john@example.com&password=SuperSecret123
128
+ ```
129
+
130
+ Sanitized output:
131
+ ```text
132
+ POST /login HTTP/1.1
133
+ Host: app.local
134
+ Content-Type: application/x-www-form-urlencoded
135
+
136
+ email=<EMAIL_1>&password=<PASSWORD_1>
137
+ ```
138
+
139
+ ## Example: Image Sanitization
140
+ - OCR extracts text from screenshot.
141
+ - Detection engine flags sensitive values.
142
+ - Matching OCR boxes are masked in output image.
143
+
144
+ ## Notes
145
+ - Designed for integration into a future AI pentesting engine.
146
+ - All processing is local.
147
+ - If `en_core_web_sm` is unavailable, regex/heuristic detection still works.
148
+ - Image redaction needs local `opencv-python`, `pytesseract`, and system `tesseract` binary installed.
@@ -0,0 +1,134 @@
1
+ # AI Safe Redaction Engine
2
+
3
+ A local-first, modular redaction system for security data before AI processing.
4
+
5
+ ## Single-Word CLI Name
6
+ This project is packaged for `pipx` as:
7
+ - Command: `redsafe`
8
+ - Package name: `redsafe`
9
+
10
+ Other good single-word alternatives if you want to rename later:
11
+ - `safescrub`
12
+ - `cloaknet`
13
+ - `privashield`
14
+
15
+ ## Supported Inputs
16
+ - Burp Suite XML exports
17
+ - Raw HTTP requests/responses
18
+ - Network logs
19
+ - Basic PCAP parsing
20
+ - Screenshots (OCR + masking)
21
+
22
+ ## Features
23
+ - Analysis-based sensitive data detection (not regex-only)
24
+ - Named Entity Recognition (spaCy)
25
+ - Secret detection with heuristics and entropy scoring
26
+ - Context-aware detection from header/parameter names
27
+ - Consistent placeholder mapping (`<EMAIL_1>`, `<JWT_TOKEN_1>`, etc.)
28
+ - Local-only processing, no external API calls
29
+
30
+ ## Project Structure
31
+ - `parsers/`: input ingestion modules
32
+ - `detection/`: entity, secret, entropy, and context detectors
33
+ - `redaction/`: placeholder and redaction logic
34
+ - `core/`: data models and orchestration pipeline
35
+ - `utils/`: file/encoding helpers
36
+ - `tests/`: sample files + pytest coverage
37
+
38
+ ## Install With pipx (Recommended)
39
+ ```bash
40
+ cd /path/to/ai-redaction-engine
41
+ pipx install .
42
+ ```
43
+
44
+ Run:
45
+ ```bash
46
+ redsafe --input tests/sample_burp.xml --type burp
47
+ redsafe --input tests/sample_http.txt --type http
48
+ redsafe --input tests/sample_log.txt --type log
49
+ redsafe --input tests/sample_image.png --type image
50
+ ```
51
+
52
+ To reinstall after local code changes:
53
+ ```bash
54
+ pipx reinstall redsafe
55
+ ```
56
+
57
+ ## Install With venv (Alternative)
58
+ ```bash
59
+ cd /path/to/ai-redaction-engine
60
+ python -m venv .venv
61
+ source .venv/bin/activate
62
+ pip install -r requirements.txt
63
+ python -m spacy download en_core_web_sm
64
+ ```
65
+
66
+ Run examples:
67
+ ```bash
68
+ python main.py --input tests/sample_burp.xml --type burp
69
+ python main.py --input tests/sample_http.txt --type http
70
+ python main.py --input tests/sample_log.txt --type log
71
+ python main.py --input tests/sample_image.png --type image
72
+ ```
73
+
74
+ Outputs are written to `sanitized_output/`.
75
+
76
+ ## Redaction Tuning (False Positive Control)
77
+ Secret entropy detection can be tuned via environment variables:
78
+
79
+ ```bash
80
+ export REDACTION_ENTROPY_THRESHOLD=4.2
81
+ export REDACTION_MIN_SECRET_LEN=24
82
+ export REDACTION_MIN_BASE64_LEN=28
83
+ export REDACTION_IGNORE_VALUES="application/x-www-form-urlencoded,text/plain"
84
+ ```
85
+
86
+ These values are consumed by `SecretDetectionConfig` in `detection/secret_detection.py`.
87
+
88
+ ## Run Tests
89
+ ```bash
90
+ pytest -q
91
+ ```
92
+
93
+ ## Example: Burp XML Sanitization
94
+ Input Burp request header:
95
+ ```text
96
+ Authorization: Bearer eyJhbGciOiJIUzI1Ni.eyJzdWIiOiIxMjM0NTYifQ.signature123456789
97
+ Cookie: sessionid=abcDEF1234567890
98
+ ```
99
+
100
+ Sanitized output:
101
+ ```text
102
+ Authorization: Bearer <JWT_TOKEN_1>
103
+ Cookie: sessionid=<SESSION_COOKIE_1>
104
+ ```
105
+
106
+ ## Example: HTTP Sanitization
107
+ Input:
108
+ ```text
109
+ POST /login HTTP/1.1
110
+ Host: app.local
111
+ Content-Type: application/x-www-form-urlencoded
112
+
113
+ email=john@example.com&password=SuperSecret123
114
+ ```
115
+
116
+ Sanitized output:
117
+ ```text
118
+ POST /login HTTP/1.1
119
+ Host: app.local
120
+ Content-Type: application/x-www-form-urlencoded
121
+
122
+ email=<EMAIL_1>&password=<PASSWORD_1>
123
+ ```
124
+
125
+ ## Example: Image Sanitization
126
+ - OCR extracts text from screenshot.
127
+ - Detection engine flags sensitive values.
128
+ - Matching OCR boxes are masked in output image.
129
+
130
+ ## Notes
131
+ - Designed for integration into a future AI pentesting engine.
132
+ - All processing is local.
133
+ - If `en_core_web_sm` is unavailable, regex/heuristic detection still works.
134
+ - Image redaction needs local `opencv-python`, `pytesseract`, and system `tesseract` binary installed.
File without changes
@@ -0,0 +1,64 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List, Optional
5
+
6
+
7
@dataclass
class HTTPMessage:
    """A parsed HTTP request: start line, headers, parameters, and body."""

    method: str = ""   # request method, e.g. "GET"; empty when unknown
    path: str = ""     # request target from the start line
    version: str = ""  # protocol version string, e.g. "HTTP/1.1"
    headers: Dict[str, str] = field(default_factory=dict)     # header name -> value
    parameters: Dict[str, str] = field(default_factory=dict)  # query/form parameter -> value
    body: str = ""     # message body text, if any
15
+
16
+
17
@dataclass
class HTTPResponse:
    """A parsed HTTP response: status line, headers, and body."""

    status_code: int = 0  # numeric status; 0 when unknown
    reason: str = ""      # reason phrase, e.g. "OK"
    version: str = ""     # protocol version string, e.g. "HTTP/1.1"
    headers: Dict[str, str] = field(default_factory=dict)  # header name -> value
    body: str = ""        # response body text, if any
24
+
25
+
26
@dataclass
class HTTPExchange:
    """A request paired with its (optional) response, plus provenance."""

    request: HTTPMessage                     # the parsed request (required)
    response: Optional[HTTPResponse] = None  # parsed response, when one was captured
    source: str = ""                         # where the exchange came from (file path, tool, ...)
31
+
32
+
33
@dataclass
class Detection:
    """One sensitive value found in scanned input."""

    kind: str                # placeholder category, e.g. "EMAIL", "TOKEN"
    value: str               # the literal matched text
    start: int = -1          # character offset in the scanned text; -1 when unknown
    end: int = -1            # end offset (exclusive); -1 when unknown
    confidence: float = 1.0  # detector-assigned confidence in [0, 1]
    source: str = ""         # which detector produced this ("regex", "spacy", ...)
41
+
42
+
43
@dataclass
class RedactionSummary:
    """Counters plus the value -> placeholder mapping for one pipeline run."""

    detected_count: int = 0  # number of detections reported
    redacted_count: int = 0  # number of placeholder mappings created
    mappings: Dict[str, str] = field(default_factory=dict)  # original value -> placeholder
48
+
49
+
50
@dataclass
class ImageOCRToken:
    """A single OCR word with its bounding box in image pixel coordinates."""

    text: str    # recognized text of this token
    left: int    # x of the bounding box's left edge
    top: int     # y of the bounding box's top edge
    width: int   # bounding-box width in pixels
    height: int  # bounding-box height in pixels
57
+
58
+
59
@dataclass
class PipelineResult:
    """What one pipeline run produced: sanitized output plus bookkeeping."""

    sanitized_text: Optional[str] = None        # set for text-based inputs
    sanitized_image_path: Optional[str] = None  # set for image inputs
    detections: List[Detection] = field(default_factory=list)  # all (deduped) detections
    summary: RedactionSummary = field(default_factory=RedactionSummary)  # counts + mappings
@@ -0,0 +1,169 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Iterable, List
5
+
6
+ from core.data_models import Detection, HTTPExchange, HTTPMessage, HTTPResponse, PipelineResult, RedactionSummary
7
+ from detection.context_detection import ContextDetector
8
+ from detection.entity_detection import EntityDetector
9
+ from detection.secret_detection import SecretDetector
10
+ from parsers.burp_xml_parser import parse_burp_xml
11
+ from parsers.http_parser import parse_http_file
12
+ from parsers.image_parser import extract_ocr_tokens
13
+ from parsers.log_parser import parse_network_log
14
+ from parsers.pcap_parser import parse_pcap_basic
15
+ from redaction.placeholder_mapper import PlaceholderMapper
16
+ from redaction.redact_image import redact_image_tokens
17
+ from redaction.redact_text import redact_text_content
18
+ from utils.file_utils import ensure_dir, read_text, write_text
19
+
20
+
21
class RedactionPipeline:
    """Orchestrates parsing, detection, and redaction for every input type.

    One instance keeps a single ``PlaceholderMapper`` so placeholders stay
    consistent across all inputs it processes.
    """

    def __init__(self) -> None:
        self.entity_detector = EntityDetector()
        self.secret_detector = SecretDetector()
        self.context_detector = ContextDetector()
        self.mapper = PlaceholderMapper()

    def _serialize_request(self, req: HTTPMessage) -> str:
        """Render a parsed request back to raw-HTTP-like text."""
        start_line = f"{req.method} {req.path} {req.version}".strip()
        parts: List[str] = [start_line] if start_line else []
        parts += [f"{name}: {value}" for name, value in req.headers.items()]
        if req.body:
            parts += ["", req.body]
        return "\n".join(parts).strip()

    def _serialize_response(self, resp: HTTPResponse) -> str:
        """Render a parsed response back to raw-HTTP-like text."""
        status_line = f"{resp.version} {resp.status_code} {resp.reason}".strip()
        parts: List[str] = [status_line] if status_line else []
        parts += [f"{name}: {value}" for name, value in resp.headers.items()]
        if resp.body:
            parts += ["", resp.body]
        return "\n".join(parts).strip()

    def _serialize_exchanges(self, exchanges: Iterable[HTTPExchange]) -> str:
        """Join all exchanges into one blob separated by '###' markers."""
        blocks: List[str] = []
        for exchange in exchanges:
            pieces: List[str] = []
            if exchange.request:
                rendered = self._serialize_request(exchange.request)
                if rendered:
                    pieces.append(rendered)
            if exchange.response:
                rendered = self._serialize_response(exchange.response)
                if rendered:
                    pieces.append(rendered)
            if pieces:
                blocks.append("\n\n".join(pieces))
        return "\n\n###\n\n".join(blocks)

    def _collect_detections_from_text(self, text: str) -> List[Detection]:
        """Run the entity and secret detectors over raw text."""
        return list(self.entity_detector.detect(text)) + list(self.secret_detector.detect(text))

    def _collect_context_from_exchanges(self, exchanges: Iterable[HTTPExchange]) -> List[Detection]:
        """Name-based context detections from headers/parameters of each exchange."""
        found: List[Detection] = []
        for exchange in exchanges:
            found.extend(self.context_detector.detect_pairs(exchange.request.headers, source="request_headers"))
            found.extend(self.context_detector.detect_pairs(exchange.request.parameters, source="request_params"))
            if exchange.response:
                found.extend(self.context_detector.detect_pairs(exchange.response.headers, source="response_headers"))
        return found

    def _collect_context_from_logs(self, entries: Iterable[Dict[str, str]]) -> List[Detection]:
        """Name-based context detections from structured log fields ('raw' excluded)."""
        found: List[Detection] = []
        for entry in entries:
            fields = {key: value for key, value in entry.items() if key != "raw"}
            found.extend(self.context_detector.detect_pairs(fields, source="log_fields"))
        return found

    def _dedupe(self, detections: Iterable[Detection]) -> List[Detection]:
        """Keep the first detection per (kind, value) pair; drop empty values."""
        seen: Dict[tuple, Detection] = {}
        for detection in detections:
            key = (detection.kind, detection.value)
            if detection.value and key not in seen:
                seen[key] = detection
        return list(seen.values())

    def _build_summary(self, detections: List[Detection]) -> RedactionSummary:
        """Summarize counts plus the placeholder mapping accumulated so far."""
        mapping = self.mapper.mapping()
        return RedactionSummary(
            detected_count=len(detections),
            redacted_count=len(mapping),
            mappings=mapping,
        )

    def _redact_to_file(self, text: str, detections: List[Detection], input_path: str, output_dir: str) -> PipelineResult:
        """Redact *text*, write '<stem>.sanitized.txt' under *output_dir*, package the result."""
        redacted = redact_text_content(text, detections, self.mapper)
        out_path = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.txt")
        write_text(out_path, redacted)
        return PipelineResult(sanitized_text=redacted, detections=detections, summary=self._build_summary(detections))

    def process_burp(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a Burp Suite XML export."""
        exchanges = parse_burp_xml(input_path)
        text = self._serialize_exchanges(exchanges)
        detections = self._dedupe(
            self._collect_detections_from_text(text) + self._collect_context_from_exchanges(exchanges)
        )
        return self._redact_to_file(text, detections, input_path, output_dir)

    def process_http(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a raw HTTP request/response text file."""
        raw = read_text(input_path)
        exchanges = parse_http_file(raw, source=input_path)
        # Fall back to the raw text when nothing parseable was found.
        text = self._serialize_exchanges(exchanges) if exchanges else raw
        detections = self._dedupe(
            self._collect_detections_from_text(text) + self._collect_context_from_exchanges(exchanges)
        )
        return self._redact_to_file(text, detections, input_path, output_dir)

    def process_log(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a network log file."""
        raw = read_text(input_path)
        entries = parse_network_log(raw)
        detections = self._dedupe(
            self._collect_detections_from_text(raw) + self._collect_context_from_logs(entries)
        )
        return self._redact_to_file(raw, detections, input_path, output_dir)

    def process_pcap(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize HTTP traffic extracted from a PCAP capture."""
        exchanges = parse_pcap_basic(input_path)
        text = self._serialize_exchanges(exchanges)
        detections = self._dedupe(
            self._collect_detections_from_text(text) + self._collect_context_from_exchanges(exchanges)
        )
        return self._redact_to_file(text, detections, input_path, output_dir)

    def process_image(self, input_path: str, output_dir: str) -> PipelineResult:
        """Sanitize a screenshot: OCR the text, detect, then mask matching boxes."""
        tokens = extract_ocr_tokens(input_path)
        ocr_text = "\n".join(token.text for token in tokens)
        detections = self._dedupe(self._collect_detections_from_text(ocr_text))
        out_path = ensure_dir(output_dir) / (Path(input_path).stem + ".sanitized.png")
        redact_image_tokens(input_path, str(out_path), tokens, detections, self.mapper)
        return PipelineResult(
            sanitized_image_path=str(out_path),
            detections=detections,
            summary=self._build_summary(detections),
        )
+ )
File without changes
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List
4
+
5
+ from core.data_models import Detection
6
+
7
+
8
+ SENSITIVE_KEYS = {
9
+ "password": "PASSWORD",
10
+ "passwd": "PASSWORD",
11
+ "secret": "SECRET",
12
+ "token": "TOKEN",
13
+ "apikey": "API_KEY",
14
+ "api_key": "API_KEY",
15
+ "authorization": "AUTHORIZATION",
16
+ "session": "SESSION",
17
+ "cookie": "COOKIE",
18
+ "private_key": "PRIVATE_KEY",
19
+ }
20
+
21
+
22
+ class ContextDetector:
23
+ def detect_pairs(self, pairs: Dict[str, str], source: str = "context") -> List[Detection]:
24
+ detections: List[Detection] = []
25
+ for k, v in pairs.items():
26
+ key_lower = k.lower()
27
+ for skey, kind in SENSITIVE_KEYS.items():
28
+ if skey in key_lower and v:
29
+ detections.append(Detection(kind=kind, value=v, source=source, confidence=0.95))
30
+ break
31
+ return detections
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from typing import List
5
+
6
+ from core.data_models import Detection
7
+
8
+ try:
9
+ import spacy # type: ignore
10
+ except Exception: # pragma: no cover
11
+ spacy = None
12
+
13
+
14
+ EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
15
+ PHONE_RE = re.compile(r"\b(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{2,4}\)?[\s.-]?)?\d{3}[\s.-]?\d{4}\b")
16
+ ADDRESS_HINT_RE = re.compile(r"\b\d{1,6}\s+[A-Za-z0-9\s]{3,}\s(?:Street|St|Road|Rd|Avenue|Ave|Lane|Ln|Boulevard|Blvd|Drive|Dr)\b", re.IGNORECASE)
17
+
18
+
19
+ class EntityDetector:
20
+ def __init__(self) -> None:
21
+ self._nlp = None
22
+ self._load_model()
23
+
24
+ def _load_model(self) -> None:
25
+ if spacy is None:
26
+ return
27
+ for model in ("en_core_web_sm", "en_core_web_md"):
28
+ try:
29
+ self._nlp = spacy.load(model)
30
+ return
31
+ except Exception:
32
+ continue
33
+
34
+ def detect(self, text: str) -> List[Detection]:
35
+ detections: List[Detection] = []
36
+
37
+ for m in EMAIL_RE.finditer(text):
38
+ detections.append(Detection(kind="EMAIL", value=m.group(0), start=m.start(), end=m.end(), source="regex"))
39
+
40
+ for m in PHONE_RE.finditer(text):
41
+ if len(re.sub(r"\D", "", m.group(0))) >= 7:
42
+ detections.append(Detection(kind="PHONE", value=m.group(0), start=m.start(), end=m.end(), source="regex"))
43
+
44
+ for m in ADDRESS_HINT_RE.finditer(text):
45
+ detections.append(Detection(kind="ADDRESS", value=m.group(0), start=m.start(), end=m.end(), source="regex"))
46
+
47
+ if self._nlp is None:
48
+ return detections
49
+
50
+ doc = self._nlp(text)
51
+ for ent in doc.ents:
52
+ label_map = {
53
+ "PERSON": "NAME",
54
+ "ORG": "ORGANIZATION",
55
+ "GPE": "ADDRESS",
56
+ "LOC": "ADDRESS",
57
+ "FAC": "ADDRESS",
58
+ }
59
+ mapped = label_map.get(ent.label_)
60
+ if mapped:
61
+ detections.append(
62
+ Detection(
63
+ kind=mapped,
64
+ value=ent.text,
65
+ start=ent.start_char,
66
+ end=ent.end_char,
67
+ source="spacy",
68
+ confidence=0.9,
69
+ )
70
+ )
71
+
72
+ return detections
@@ -0,0 +1,24 @@
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import Counter
5
+
6
+
7
class EntropyDetector:
    """Shannon-entropy scoring used to flag random-looking secret strings."""

    @staticmethod
    def shannon_entropy(value: str) -> float:
        """Return the Shannon entropy of *value* in bits per character.

        Empty input yields 0.0.
        """
        if not value:
            return 0.0
        total = len(value)
        return -sum(
            (count / total) * math.log2(count / total)
            for count in Counter(value).values()
        )

    @staticmethod
    def is_high_entropy(value: str, threshold: float = 3.5, min_len: int = 16) -> bool:
        """True when *value* is at least *min_len* chars and its entropy reaches *threshold*."""
        return len(value) >= min_len and EntropyDetector.shannon_entropy(value) >= threshold