tabayyan 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tabayyan/__init__.py +46 -0
- tabayyan/checksums.py +82 -0
- tabayyan/cli.py +220 -0
- tabayyan/config.py +92 -0
- tabayyan/confusables.py +99 -0
- tabayyan/detectors/__init__.py +12 -0
- tabayyan/detectors/base.py +15 -0
- tabayyan/detectors/domains.py +37 -0
- tabayyan/detectors/generic.py +50 -0
- tabayyan/detectors/names.py +94 -0
- tabayyan/detectors/saudi.py +114 -0
- tabayyan/engine.py +50 -0
- tabayyan/entities.py +71 -0
- tabayyan/homoglyph.py +166 -0
- tabayyan/integrations/__init__.py +4 -0
- tabayyan/integrations/presidio.py +132 -0
- tabayyan/middleware.py +281 -0
- tabayyan/redaction.py +166 -0
- tabayyan/streaming.py +72 -0
- tabayyan-0.5.1.dist-info/METADATA +329 -0
- tabayyan-0.5.1.dist-info/RECORD +24 -0
- tabayyan-0.5.1.dist-info/WHEEL +4 -0
- tabayyan-0.5.1.dist-info/entry_points.txt +2 -0
- tabayyan-0.5.1.dist-info/licenses/LICENSE +143 -0
tabayyan/__init__.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Tabayyan (تبيّن) — Saudi-aware PII detection & redaction for LLM pipelines.
|
|
2
|
+
|
|
3
|
+
Local-first. Zero telemetry. No network calls in the detection core.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from .engine import DetectionEngine
|
|
8
|
+
from .entities import Category, Confidence, EntityType, Match
|
|
9
|
+
from .middleware import AuditLog, AuditRecord, Guard, ProtectResult, is_in_kingdom
|
|
10
|
+
from .redaction import RedactionItem, RedactionMode, RedactionResult, redact, restore
|
|
11
|
+
|
|
12
|
+
__version__ = "0.5.1"
|
|
13
|
+
__all__ = [
|
|
14
|
+
"DetectionEngine",
|
|
15
|
+
"Match",
|
|
16
|
+
"EntityType",
|
|
17
|
+
"Category",
|
|
18
|
+
"Confidence",
|
|
19
|
+
"RedactionMode",
|
|
20
|
+
"RedactionResult",
|
|
21
|
+
"RedactionItem",
|
|
22
|
+
"redact",
|
|
23
|
+
"restore",
|
|
24
|
+
"Guard",
|
|
25
|
+
"AuditLog",
|
|
26
|
+
"AuditRecord",
|
|
27
|
+
"ProtectResult",
|
|
28
|
+
"is_in_kingdom",
|
|
29
|
+
"scan",
|
|
30
|
+
"scan_and_redact",
|
|
31
|
+
"__version__",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def scan(text: str) -> list[Match]:
    """Detect entities in ``text`` with a default-constructed engine.

    A new ``DetectionEngine`` is created (with no arguments) on every call,
    so this is a one-shot convenience wrapper rather than a reusable scanner.

    Args:
        text: The text to inspect.

    Returns:
        Whatever ``DetectionEngine().scan(text)`` returns.
    """
    engine = DetectionEngine()
    return engine.scan(text)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def scan_and_redact(
    text: str,
    mode: "RedactionMode | str" = RedactionMode.MASK,
    **kwargs,
) -> RedactionResult:
    """Scan ``text`` with a default engine, then redact what was found.

    Args:
        text: The text to inspect and sanitise.
        mode: Redaction mode handed to ``redact()``; defaults to
            ``RedactionMode.MASK``.
        **kwargs: Forwarded unchanged to ``redact()``.

    Returns:
        The result of ``redact()`` for the detected matches.
    """
    matches = DetectionEngine().scan(text)
    return redact(text, matches, mode, **kwargs)
|
tabayyan/checksums.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Deterministic, offline checksum primitives.
|
|
2
|
+
|
|
3
|
+
Pure functions only. Passing a checksum confirms a value is *structurally
|
|
4
|
+
valid*, NOT that it was ever issued.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def luhn_is_valid(number: str) -> bool:
    """Return True when ``number`` is a digit string that passes the Luhn check.

    Args:
        number: Candidate number with separators already removed.

    Returns:
        False for an empty string or any string containing a character that
        is not a decimal digit; otherwise whether the Luhn sum is a multiple
        of 10.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript "²" which int() cannot parse, so a validator that
    # should simply answer False used to raise ValueError instead.
    if not number.isdecimal():
        return False
    # Counting from the right, every second digit is doubled; `parity` is the
    # left-to-right index parity those digits have for this length.
    parity = len(number) % 2
    total = 0
    for i, ch in enumerate(number):
        d = int(ch)
        if i % 2 == parity:
            d *= 2
            if d > 9:
                d -= 9  # same as summing the two digits of the product
        total += d
    return total % 10 == 0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def luhn_check_digit(number_without_check: str) -> int:
    """Compute the Luhn check digit to append to ``number_without_check``.

    Args:
        number_without_check: The payload digits, without the check digit.

    Returns:
        The digit (0-9) that makes payload + digit pass the Luhn check.

    Raises:
        ValueError: If a character cannot be converted by ``int()``.
    """
    payload = number_without_check
    # The check digit will occupy the rightmost slot, so the doubled
    # positions are computed for a number one digit longer than the payload.
    doubled_parity = (len(payload) + 1) % 2
    checksum = 0
    for position, char in enumerate(payload):
        digit = int(char)
        if position % 2 != doubled_parity:
            checksum += digit
            continue
        digit *= 2
        checksum += digit - 9 if digit > 9 else digit
    return (10 - checksum % 10) % 10
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def saudi_id_is_valid(value: str) -> bool:
    """Return True when ``value`` is a structurally valid 10-digit Saudi ID.

    The value must be exactly 10 decimal digits, start with "1" or "2", and
    satisfy the checksum below (digits at even 0-based positions are doubled
    and their decimal digits summed; the total must be a multiple of 10).

    Args:
        value: Candidate identifier with no separators.

    Returns:
        True if all structural checks pass, False otherwise. Never raises for
        string input.
    """
    # isdecimal() rather than isdigit(): isdigit() lets through characters
    # such as "²" that int() rejects, which previously raised ValueError.
    if len(value) != 10 or not value.isdecimal() or value[0] not in ("1", "2"):
        return False
    total = 0
    for i, ch in enumerate(value):
        d = int(ch)
        if i % 2 == 0:
            d *= 2
            total += d // 10 + d % 10
        else:
            total += d
    return total % 10 == 0
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def saudi_id_check_digit(first_nine: str) -> int:
    """Compute the tenth (check) digit for the first nine digits of an ID.

    Args:
        first_nine: Exactly nine digits.

    Returns:
        The digit (0-9) that brings the weighted sum to a multiple of 10.

    Raises:
        ValueError: If ``first_nine`` is not exactly nine digits.
    """
    well_formed = len(first_nine) == 9 and first_nine.isdigit()
    if not well_formed:
        raise ValueError("first_nine must be exactly 9 digits")
    weighted = 0
    for position, char in enumerate(first_nine):
        digit = int(char)
        if position % 2:
            weighted += digit
        else:
            # Even positions are doubled, then the product's digits are summed.
            tens, units = divmod(digit * 2, 10)
            weighted += tens + units
    return (10 - weighted % 10) % 10
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _iban_to_numeric(iban: str) -> str:
|
|
66
|
+
rearranged = iban[4:] + iban[:4]
|
|
67
|
+
out = []
|
|
68
|
+
for ch in rearranged:
|
|
69
|
+
out.append(ch if ch.isdigit() else str(ord(ch.upper()) - 55))
|
|
70
|
+
return "".join(out)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def iban_mod97_is_valid(iban: str) -> bool:
|
|
74
|
+
iban = iban.replace(" ", "").upper()
|
|
75
|
+
if len(iban) < 5 or not iban[:2].isalpha() or not iban[2:4].isdigit():
|
|
76
|
+
return False
|
|
77
|
+
return int(_iban_to_numeric(iban)) % 97 == 1
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def iban_check_digits(country: str, bban: str) -> str:
|
|
81
|
+
trial = country.upper() + "00" + bban.upper()
|
|
82
|
+
return f"{98 - (int(_iban_to_numeric(trial)) % 97):02d}"
|
tabayyan/cli.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Command-line interface for Tabayyan. Stdlib only.
|
|
2
|
+
|
|
3
|
+
Commands:
|
|
4
|
+
tabayyan scan [paths...] detect entities, print findings
|
|
5
|
+
tabayyan redact [paths...] detect + redact, print sanitised text
tabayyan domains [paths...] detect lookalike / homoglyph domains
|
|
6
|
+
|
|
7
|
+
Reads stdin when no path is given or path is '-'. Supports batch over
|
|
8
|
+
files and directories. With --fail-on-find, the exit code is non-zero when entities are found,
|
|
9
|
+
so it slots into CI / pre-commit gates.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Iterable
|
|
18
|
+
|
|
19
|
+
from . import __version__
|
|
20
|
+
from .config import Config
|
|
21
|
+
from .engine import DetectionEngine
|
|
22
|
+
from .streaming import scan_file
|
|
23
|
+
from .entities import Confidence, Match
|
|
24
|
+
from .homoglyph import scan_text as _scan_domains
|
|
25
|
+
from .redaction import RedactionMode, redact
|
|
26
|
+
|
|
27
|
+
_CONFIDENCE_ORDER = {Confidence.LOW: 0, Confidence.MEDIUM: 1, Confidence.HIGH: 2}
|
|
28
|
+
_TEXT_SUFFIXES = {".txt", ".md", ".log", ".json", ".csv", ".eml", ".text"}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _iter_inputs(paths: list[str]) -> Iterable[tuple[str, str]]:
|
|
32
|
+
"""Yield (source_name, text). '-' or empty -> stdin."""
|
|
33
|
+
if not paths or paths == ["-"]:
|
|
34
|
+
yield ("<stdin>", sys.stdin.read())
|
|
35
|
+
return
|
|
36
|
+
for raw in paths:
|
|
37
|
+
if raw == "-":
|
|
38
|
+
yield ("<stdin>", sys.stdin.read())
|
|
39
|
+
continue
|
|
40
|
+
p = Path(raw)
|
|
41
|
+
if p.is_dir():
|
|
42
|
+
for child in sorted(p.rglob("*")):
|
|
43
|
+
if child.is_file() and child.suffix.lower() in _TEXT_SUFFIXES:
|
|
44
|
+
yield (str(child), child.read_text(encoding="utf-8", errors="replace"))
|
|
45
|
+
elif p.is_file():
|
|
46
|
+
yield (str(p), p.read_text(encoding="utf-8", errors="replace"))
|
|
47
|
+
else:
|
|
48
|
+
print(f"tabayyan: cannot read '{raw}'", file=sys.stderr)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _engine_from_args(args) -> DetectionEngine:
    """Build the detection engine for a CLI invocation.

    If ``args`` carries a truthy ``config`` attribute, the engine is built
    from that config file; otherwise a default ``DetectionEngine`` is used.
    Sub-commands without a ``--config`` option are handled via ``getattr``.
    """
    config_path = getattr(args, "config", None)
    if not config_path:
        return DetectionEngine()
    return Config.from_file(config_path).build_engine()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _filter_matches(matches: list[Match], args) -> list[Match]:
|
|
57
|
+
out = matches
|
|
58
|
+
if args.min_confidence:
|
|
59
|
+
floor = _CONFIDENCE_ORDER[Confidence(args.min_confidence)]
|
|
60
|
+
out = [m for m in out if _CONFIDENCE_ORDER[m.confidence] >= floor]
|
|
61
|
+
if args.only:
|
|
62
|
+
only = set(args.only)
|
|
63
|
+
out = [m for m in out if m.entity_type.value in only]
|
|
64
|
+
if args.exclude:
|
|
65
|
+
excl = set(args.exclude)
|
|
66
|
+
out = [m for m in out if m.entity_type.value not in excl]
|
|
67
|
+
return out
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _scan_sources(args, engine):
    """Yield ``(source_name, unfiltered_matches)`` for the ``scan`` command.

    With ``--stream`` each path is handed to ``scan_file``; stdin markers
    ('' or '-') are reported on stderr and skipped. Otherwise inputs come
    from ``_iter_inputs`` and are scanned in memory.
    """
    if args.stream:
        for raw in args.paths:
            if raw in ("", "-"):
                print("tabayyan: --stream requires file paths, not stdin", file=sys.stderr)
                continue
            yield raw, list(scan_file(raw, engine))
    else:
        for name, text in _iter_inputs(args.paths):
            yield name, engine.scan(text)


def _cmd_scan(args) -> int:
    """Run the ``scan`` sub-command.

    Prints one tab-separated line per match (raw values omitted with
    ``--no-values``), or a single JSON report with ``--json``.

    Returns:
        1 if ``--fail-on-find`` was given and at least one match survived the
        filters, otherwise 0.
    """
    engine = _engine_from_args(args)
    found_any = False
    report = []
    for source, raw_matches in _scan_sources(args, engine):
        matches = _filter_matches(raw_matches, args)
        found_any = found_any or bool(matches)
        if args.json:
            report.append({"source": source, "matches": [m.to_dict() for m in matches]})
            continue
        for m in matches:
            shown = "" if args.no_values else f" {m.value!r}"
            print(f"{source}:{m.start}-{m.end}\t{m.entity_type.value}\t"
                  f"{m.confidence.value}\t{m.category.value}{shown}")
    if args.json:
        json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")
    if not args.fail_on_find:
        return 0
    return 1 if found_any else 0
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _cmd_redact(args) -> int:
    """Run the ``redact`` sub-command.

    All inputs are read up front so that a ``===== name =====`` header can be
    printed before each result when there is more than one. With ``--json``
    each result is dumped as its own JSON document with a ``source`` key.
    In text mode a hint is written to stderr when the result has a vault.

    Returns:
        1 if ``--fail-on-find`` was given and at least one match survived the
        filters, otherwise 0.
    """
    engine = _engine_from_args(args)
    mode = RedactionMode(args.mode)
    inputs = list(_iter_inputs(args.paths))
    show_headers = len(inputs) > 1
    found_any = False
    for name, text in inputs:
        matches = _filter_matches(engine.scan(text), args)
        found_any = found_any or bool(matches)
        result = redact(
            text,
            matches,
            mode,
            salt=args.salt,
            hash_length=args.hash_length,
            partial_keep_last=args.keep_last,
        )
        if args.json:
            payload = result.to_dict()
            payload["source"] = name
            json.dump(payload, sys.stdout, ensure_ascii=False, indent=2)
            sys.stdout.write("\n")
            continue
        if show_headers:
            print(f"===== {name} =====")
        sys.stdout.write(result.text)
        if not result.text.endswith("\n"):
            sys.stdout.write("\n")
        if result.vault:
            print(f"# vault ({len(result.vault)} tokens) — store securely; "
                  f"use --json to capture", file=sys.stderr)
    if not args.fail_on_find:
        return 0
    return 1 if found_any else 0
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _load_watchlist(path: str | None) -> list[str]:
|
|
143
|
+
if not path:
|
|
144
|
+
return []
|
|
145
|
+
return [ln.strip() for ln in Path(path).read_text(encoding="utf-8").splitlines()
|
|
146
|
+
if ln.strip() and not ln.startswith("#")]
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _cmd_domains(args) -> int:
    """Run the ``domains`` sub-command (lookalike / homoglyph domains).

    Each input is passed to the homoglyph scanner together with the optional
    watchlist and ``--max-distance``. Findings are printed one per line, or
    collected into a single JSON report with ``--json``.

    Returns:
        1 if ``--fail-on-find`` was given and any finding was produced,
        otherwise 0.
    """
    watchlist = _load_watchlist(args.watchlist)
    report = []
    found_any = False
    for name, text in _iter_inputs(args.paths):
        findings = _scan_domains(
            text, watchlist, typosquat_max_distance=args.max_distance
        )
        found_any = found_any or bool(findings)
        if args.json:
            report.append({"source": name, "findings": [vars(f) for f in findings]})
            continue
        for f in findings:
            target_suffix = f" -> {f.target}" if f.target else ""
            print(f"{name}:{f.start}-{f.end}\t{f.domain}\t{f.reason}\t"
                  f"{f.confidence}{target_suffix}\t{f.detail}")
    if args.json:
        json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")
    if not args.fail_on_find:
        return 0
    return 1 if found_any else 0
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _add_common_filters(p: argparse.ArgumentParser) -> None:
|
|
172
|
+
p.add_argument("paths", nargs="*", help="files/dirs, or '-' for stdin")
|
|
173
|
+
p.add_argument("--min-confidence", choices=["low", "medium", "high"],
|
|
174
|
+
help="drop matches below this confidence")
|
|
175
|
+
p.add_argument("--only", nargs="+", metavar="TYPE", help="keep only these entity types")
|
|
176
|
+
p.add_argument("--exclude", nargs="+", metavar="TYPE", help="drop these entity types")
|
|
177
|
+
p.add_argument("--json", action="store_true", help="emit JSON")
|
|
178
|
+
p.add_argument("--fail-on-find", action="store_true",
|
|
179
|
+
help="exit 1 if any entity is found (for CI / pre-commit)")
|
|
180
|
+
p.add_argument("--config", help="JSON config: disable/add detectors, thresholds")
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``tabayyan`` argument parser with its three sub-commands.

    Sub-commands: ``scan``, ``redact`` and ``domains``. Each sets ``func`` to
    its handler so that ``main`` can dispatch on ``args.func``.

    Returns:
        The configured top-level parser.
    """
    # The description is the first line of the module docstring. When Python
    # runs with -OO docstrings are stripped and __doc__ is None, which used to
    # crash here with AttributeError; fall back to a fixed summary instead.
    doc = __doc__ or "Command-line interface for Tabayyan."
    parser = argparse.ArgumentParser(prog="tabayyan", description=doc.splitlines()[0])
    parser.add_argument("--version", action="version", version=f"tabayyan {__version__}")
    sub = parser.add_subparsers(dest="command", required=True)

    ps = sub.add_parser("scan", help="detect entities")
    _add_common_filters(ps)
    ps.add_argument("--no-values", action="store_true", help="hide raw values in output")
    ps.add_argument("--stream", action="store_true",
                    help="scan large files incrementally (file paths only)")
    ps.set_defaults(func=_cmd_scan)

    pr = sub.add_parser("redact", help="detect and redact")
    _add_common_filters(pr)
    pr.add_argument("--mode", choices=[m.value for m in RedactionMode], default="mask")
    pr.add_argument("--salt", default="", help="salt for hash mode")
    pr.add_argument("--hash-length", type=int, default=12, help="hash token length")
    pr.add_argument("--keep-last", type=int, default=4, help="kept chars in partial mode")
    pr.set_defaults(func=_cmd_redact)

    # `domains` has its own option set (no confidence/type filters, no config).
    pd = sub.add_parser("domains", help="detect lookalike / homoglyph domains")
    pd.add_argument("paths", nargs="*", help="files/dirs, or '-' for stdin")
    pd.add_argument("--watchlist", help="file of legitimate domains, one per line")
    pd.add_argument("--max-distance", type=int, default=1,
                    help="max edit distance for typosquat flag")
    pd.add_argument("--json", action="store_true", help="emit JSON")
    pd.add_argument("--fail-on-find", action="store_true",
                    help="exit 1 if any suspicious domain is found")
    pd.set_defaults(func=_cmd_domains)
    return parser
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse ``argv`` and dispatch to the chosen sub-command.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit status returned by the sub-command handler.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    handler = args.func
    return handler(args)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
if __name__ == "__main__":
|
|
220
|
+
raise SystemExit(main())
|
tabayyan/config.py
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Configuration: customise detection without editing code.
|
|
2
|
+
|
|
3
|
+
Load a JSON config to enable/disable detectors, add custom regex
|
|
4
|
+
detectors, extend the confusable map, and tune thresholds. JSON is used
|
|
5
|
+
(not TOML) to keep zero runtime dependencies on Python 3.9.
|
|
6
|
+
|
|
7
|
+
Schema (all keys optional):
|
|
8
|
+
{
|
|
9
|
+
"disable": ["saudi_cr", "arabic_name"],
|
|
10
|
+
"typosquat_max_distance": 2,
|
|
11
|
+
"confusables": {"ⅴ": "v"},
|
|
12
|
+
"custom_detectors": [
|
|
13
|
+
{"label": "employee_id", "pattern": "EMP-\\\\d{6}",
|
|
14
|
+
"category": "organisation", "confidence": "medium"}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
Custom-detector matches use entity_type CUSTOM; their configured label is
|
|
19
|
+
preserved in the `detector` and `notes` fields and in the mask placeholder.
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import re
|
|
25
|
+
from dataclasses import dataclass, field
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Iterable
|
|
28
|
+
|
|
29
|
+
from .confusables import register_confusables
|
|
30
|
+
from .detectors import DEFAULT_DETECTORS, Detector
|
|
31
|
+
from .engine import DetectionEngine
|
|
32
|
+
from .entities import Category, Confidence, EntityType, Match
|
|
33
|
+
|
|
34
|
+
_CONF = {c.value: c for c in Confidence}
|
|
35
|
+
_CAT = {c.value: c for c in Category}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CustomRegexDetector(Detector):
    """A user-defined regex detector loaded from config.

    Matches are reported with entity type ``CUSTOM``; the configured label is
    carried in the match's ``detector``, ``label`` and ``notes`` fields.
    """

    def __init__(self, label: str, pattern: str, category: Category,
                 confidence: Confidence) -> None:
        """Compile ``pattern`` and remember how its matches are classified.

        Args:
            label: Human-readable name of the detector.
            pattern: Regular expression source; compiled with no flags.
            category: Category assigned to every match.
            confidence: Confidence assigned to every match.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        self.name = f"custom:{label}"
        self.label = label
        self._rx = re.compile(pattern)
        self._category = category
        self._confidence = confidence

    def detect(self, text: str) -> Iterable[Match]:
        """Yield one ``Match`` per non-empty regex match in ``text``."""
        for m in self._rx.finditer(text):
            # finditer() also reports empty matches, so a pattern that can
            # match "" (e.g. r"\d*") would otherwise emit a zero-width
            # finding at every position with nothing to redact.
            if m.start() == m.end():
                continue
            yield Match(
                entity_type=EntityType.CUSTOM, category=self._category,
                confidence=self._confidence, start=m.start(), end=m.end(),
                value=m.group(0), detector=self.name, label=self.label,
                notes=f"custom detector '{self.label}'",
            )

    def mask_label(self) -> str:
        """Return the bracketed, upper-cased label, e.g. ``[EMPLOYEE_ID]``."""
        return f"[{self.label.upper()}]"
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class Config:
    """Detection settings loaded from a JSON document (all keys optional)."""

    # Names of default detectors to leave out of the engine.
    disable: set[str] = field(default_factory=set)
    # Stored from the config; build_engine() below does not consume it.
    typosquat_max_distance: int = 1
    # Extra regex detectors appended after the default ones.
    custom_detectors: list[CustomRegexDetector] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict) -> "Config":
        """Build a ``Config`` from an already-parsed JSON object.

        A key that is missing or explicitly ``null`` falls back to its
        default. Side effect: entries under ``"confusables"`` are registered
        into the process-wide confusables map.

        Args:
            data: Mapping following the schema in the module docstring.

        Returns:
            The populated ``Config``.

        Raises:
            KeyError: If a custom detector lacks ``label``/``pattern`` or
                names an unknown category or confidence.
        """
        # `or` fallbacks make an explicit JSON null behave like a missing
        # key for every section; previously only "confusables" tolerated it
        # and the others raised TypeError.
        confusables = data.get("confusables") or {}
        if confusables:
            register_confusables(confusables)
        customs = [
            CustomRegexDetector(
                label=spec["label"],
                pattern=spec["pattern"],
                category=_CAT[spec.get("category", "organisation")],
                confidence=_CONF[spec.get("confidence", "medium")],
            )
            for spec in data.get("custom_detectors") or []
        ]
        max_distance = data.get("typosquat_max_distance")
        return cls(
            disable=set(data.get("disable") or []),
            # `is None` rather than `or`, so an explicit 0 is preserved.
            typosquat_max_distance=1 if max_distance is None else int(max_distance),
            custom_detectors=customs,
        )

    @classmethod
    def from_file(cls, path: str | Path) -> "Config":
        """Load a ``Config`` from a UTF-8 encoded JSON file at ``path``."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))

    def build_engine(self) -> DetectionEngine:
        """Create an engine with the enabled default and custom detectors.

        Default detectors whose ``name`` is listed in ``disable`` are left
        out; custom detectors are appended after the remaining defaults.
        """
        detectors = [d for d in DEFAULT_DETECTORS if d.name not in self.disable]
        detectors.extend(self.custom_detectors)
        return DetectionEngine(detectors)
|
tabayyan/confusables.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Script classification and confusable-skeleton folding.
|
|
2
|
+
|
|
3
|
+
Used by the homoglyph subsystem to catch domains that impersonate a
|
|
4
|
+
target using visually-confusable characters (IDN homograph attacks) or
|
|
5
|
+
mix scripts. The confusables map is a curated, practical subset of the
|
|
6
|
+
Unicode confusables data — extensible, not exhaustive. See
|
|
7
|
+
homoglyph.py for the detection logic that consumes these primitives.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
# Curated confusable -> ASCII/canonical skeleton map.
|
|
12
|
+
# Focus: characters realistically used to spoof Latin domains
|
|
13
|
+
# (Cyrillic, Greek, fullwidth, digit/letter swaps) plus Arabic-Indic
|
|
14
|
+
# digits which appear in mixed Arabic/Latin spoofs.
|
|
15
|
+
_CONFUSABLES: dict[str, str] = {
|
|
16
|
+
# Cyrillic -> Latin
|
|
17
|
+
"а": "a", "е": "e", "о": "o", "р": "p", "с": "c", "х": "x",
|
|
18
|
+
"у": "y", "к": "k", "м": "m", "н": "h", "т": "t", "в": "b",
|
|
19
|
+
"і": "i", "ј": "j", "ѕ": "s", "ԁ": "d", "ɡ": "g",
|
|
20
|
+
"ӏ": "l", "Ӏ": "l", "ⅼ": "l", "j": "j",
|
|
21
|
+
# Greek -> Latin
|
|
22
|
+
"α": "a", "ο": "o", "ν": "v", "ρ": "p", "τ": "t", "υ": "u",
|
|
23
|
+
"ι": "i", "κ": "k", "χ": "x", "ε": "e",
|
|
24
|
+
# Fullwidth Latin -> ASCII
|
|
25
|
+
"a": "a", "b": "b", "c": "c", "d": "d", "e": "e", "o": "o",
|
|
26
|
+
"p": "p", "s": "s", "x": "x", "i": "i", "l": "l", "m": "m",
|
|
27
|
+
"n": "n", "g": "g", "t": "t", "r": "r", "u": "u",
|
|
28
|
+
# Digit/letter confusions folded to a single skeleton
|
|
29
|
+
"0": "o", "1": "l", "5": "s", "8": "b",
|
|
30
|
+
"l": "l", "I": "l", "|": "l",
|
|
31
|
+
# Arabic-Indic and Eastern-Arabic digits -> ASCII digits then folded
|
|
32
|
+
"٠": "o", "١": "l", "٢": "2", "٣": "3", "٤": "4", "٥": "s",
|
|
33
|
+
"٦": "6", "٧": "7", "٨": "b", "٩": "9",
|
|
34
|
+
"۰": "o", "۱": "l", "۲": "2", "۳": "3", "۴": "4", "۵": "s",
|
|
35
|
+
"۶": "6", "۷": "7", "۸": "b", "۹": "9",
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def register_confusables(extra: dict[str, str]) -> None:
    """Add ``extra`` mappings to the module-level confusables table.

    The table is updated in place, so the change is visible to every later
    ``skeleton()`` call in this process. An entry in ``extra`` replaces any
    existing mapping for the same character.

    Args:
        extra: Mapping of confusable character -> skeleton replacement.
    """
    _CONFUSABLES.update(extra)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def skeleton(label: str) -> str:
    """Return the confusable skeleton of ``label``.

    The label is lower-cased, then each character is replaced by its entry
    in the confusables table (characters without an entry are kept). Two
    labels with equal skeletons are treated as look-alikes.

    NOTE(review): because lower-casing happens before the lookup, table keys
    that are upper-case letters (e.g. "I") are never consulted — confirm
    whether that is intended.
    """
    return "".join(_CONFUSABLES.get(ch, ch) for ch in label.lower())
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Script ranges (start, end, name). Order matters only for first-match.
|
|
57
|
+
_SCRIPT_RANGES = [
|
|
58
|
+
(0x0041, 0x005A, "Latin"), (0x0061, 0x007A, "Latin"),
|
|
59
|
+
(0x00C0, 0x024F, "Latin"),
|
|
60
|
+
(0x0370, 0x03FF, "Greek"), (0x1F00, 0x1FFF, "Greek"),
|
|
61
|
+
(0x0400, 0x04FF, "Cyrillic"), (0x0500, 0x052F, "Cyrillic"),
|
|
62
|
+
(0x0600, 0x06FF, "Arabic"), (0x0750, 0x077F, "Arabic"),
|
|
63
|
+
(0x08A0, 0x08FF, "Arabic"), (0xFB50, 0xFDFF, "Arabic"),
|
|
64
|
+
(0xFE70, 0xFEFF, "Arabic"),
|
|
65
|
+
(0x0590, 0x05FF, "Hebrew"),
|
|
66
|
+
(0x4E00, 0x9FFF, "Han"),
|
|
67
|
+
(0x3040, 0x30FF, "Kana"),
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def script_of(ch: str) -> str:
    """Classify a single character into a coarse script name.

    Digits below U+0660 and the characters ``- . _ / :`` are reported as
    ``"Common"`` so they never count towards a mixed-script verdict. Any
    other character gets the name of the first range in ``_SCRIPT_RANGES``
    containing its code point, or ``"Common"`` when none does.

    Args:
        ch: Exactly one character.

    Returns:
        A script name such as ``"Latin"``, or ``"Common"``.
    """
    cp = ord(ch)
    neutral_digit = ch.isdigit() and cp < 0x0660
    if neutral_digit or ch in "-._/:":
        return "Common"
    return next(
        (name for low, high, name in _SCRIPT_RANGES if low <= cp <= high),
        "Common",
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def scripts_in(label: str) -> set[str]:
    """Collect the scripts used in ``label``, ignoring ``"Common"``.

    Args:
        label: Text to classify character by character.

    Returns:
        The set of script names reported by ``script_of`` for the label's
        characters, without ``"Common"``; empty for an empty label.
    """
    found = set()
    for ch in label:
        script = script_of(ch)
        if script != "Common":
            found.add(script)
    return found
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def is_mixed_script(label: str) -> bool:
    """Tell whether ``label`` contains characters from more than one script.

    "Common" characters (see ``script_of``) are not counted. Mixing scripts
    within one domain label, e.g. Latin with Cyrillic, is treated as a
    strong spoofing signal.
    """
    return len(scripts_in(label)) > 1
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Detector registry."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .base import Detector
|
|
5
|
+
from .generic import GENERIC_DETECTORS
|
|
6
|
+
from .names import ArabicNameDetector
|
|
7
|
+
from .saudi import SAUDI_DETECTORS
|
|
8
|
+
|
|
9
|
+
DEFAULT_DETECTORS = [*SAUDI_DETECTORS, *GENERIC_DETECTORS, ArabicNameDetector()]
|
|
10
|
+
|
|
11
|
+
__all__ = ["Detector", "DEFAULT_DETECTORS", "SAUDI_DETECTORS", "GENERIC_DETECTORS",
|
|
12
|
+
"ArabicNameDetector"]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Detector base class. Detectors are pure: text in, matches out."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import Iterable
|
|
6
|
+
|
|
7
|
+
from ..entities import Match
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Detector(ABC):
    """Abstract base for all detectors: text in, matches out."""

    # Identifier of the detector; subclasses override it.
    name: str = "detector"

    @abstractmethod
    def detect(self, text: str) -> Iterable[Match]:
        """Return the matches found in ``text``.

        Subclasses must override this method; the base implementation only
        raises ``NotImplementedError``.
        """
        raise NotImplementedError
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Opt-in lookalike/homoglyph domain detector.
|
|
2
|
+
|
|
3
|
+
Not part of DEFAULT_DETECTORS: it needs a watchlist to be most useful and
|
|
4
|
+
it emits THREAT findings, not PII. Construct it explicitly (optionally
|
|
5
|
+
with a watchlist) and pass it to DetectionEngine, or use the `domains`
|
|
6
|
+
CLI command.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Iterable, Sequence
|
|
11
|
+
|
|
12
|
+
from ..entities import Category, Confidence, EntityType, Match
|
|
13
|
+
from ..homoglyph import scan_text
|
|
14
|
+
from .base import Detector
|
|
15
|
+
|
|
16
|
+
_CONF = {"high": Confidence.HIGH, "medium": Confidence.MEDIUM, "low": Confidence.LOW}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LookalikeDomainDetector(Detector):
    """Adapter exposing the homoglyph domain scanner as a ``Detector``.

    Every finding from ``scan_text`` becomes a ``SUSPICIOUS_DOMAIN`` match in
    the ``THREAT`` category.
    """

    name = "lookalike_domain"

    def __init__(self, watchlist: Sequence[str] | None = None,
                 typosquat_max_distance: int = 1) -> None:
        """Store the scanner settings.

        Args:
            watchlist: Domains passed to the scanner; copied into a list.
                ``None`` or an empty sequence means no watchlist.
            typosquat_max_distance: Forwarded to ``scan_text``.
        """
        self.watchlist = [] if not watchlist else list(watchlist)
        self.typosquat_max_distance = typosquat_max_distance

    def detect(self, text: str) -> Iterable[Match]:
        """Yield one ``Match`` per suspicious domain found in ``text``."""
        findings = scan_text(
            text,
            self.watchlist,
            typosquat_max_distance=self.typosquat_max_distance,
        )
        for finding in findings:
            # Findings tied to a target get the reason prefixed to the note.
            if finding.target is None:
                note = finding.detail
            else:
                note = f"{finding.reason}: {finding.detail}"
            yield Match(
                entity_type=EntityType.SUSPICIOUS_DOMAIN,
                category=Category.THREAT,
                confidence=_CONF[finding.confidence],
                start=finding.start,
                end=finding.end,
                value=finding.domain,
                detector=self.name,
                notes=note,
            )
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Generic, locale-independent detectors: email, credit card, IP."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import ipaddress
|
|
5
|
+
import re
|
|
6
|
+
from typing import Iterable
|
|
7
|
+
|
|
8
|
+
from ..checksums import luhn_is_valid
|
|
9
|
+
from ..entities import Category, Confidence, EntityType, Match
|
|
10
|
+
from .base import Detector
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EmailDetector(Detector):
    """Regex-based e-mail address detector (medium confidence)."""

    name = "email"
    # local-part @ domain . TLD(2+ letters); the look-behind stops a match
    # from starting in the middle of a longer local-part.
    _pattern = re.compile(r"(?<![A-Za-z0-9._%+\-])[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a ``Match`` for every e-mail-shaped substring of ``text``."""
        for hit in self._pattern.finditer(text):
            yield Match(
                EntityType.EMAIL,
                Category.CONTACT,
                Confidence.MEDIUM,
                hit.start(),
                hit.end(),
                hit.group(0),
                self.name,
            )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CreditCardDetector(Detector):
    """Payment-card detector: digit runs of card length that pass Luhn."""

    name = "credit_card"
    # 13-19 digits, each optionally followed by one space or hyphen, not
    # embedded in a longer digit run.
    _pattern = re.compile(r"(?<!\d)(?:\d[ \-]?){12,18}\d(?!\d)")

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a high-confidence ``Match`` for each Luhn-valid candidate.

        The match span covers the text as written (including separators);
        the reported value is the digits with separators removed.
        """
        for hit in self._pattern.finditer(text):
            digits = re.sub(r"[ \-]", "", hit.group(0))
            plausible = 13 <= len(digits) <= 19 and luhn_is_valid(digits)
            if plausible:
                yield Match(
                    EntityType.CREDIT_CARD,
                    Category.FINANCIAL,
                    Confidence.HIGH,
                    hit.start(),
                    hit.end(),
                    digits,
                    self.name,
                    "Luhn-valid",
                )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class IpAddressDetector(Detector):
    """IPv4/IPv6 detector: a loose regex proposes, ``ipaddress`` decides."""

    name = "ip_address"
    # Deliberately permissive candidate pattern (dotted quad, or hex/colon
    # run containing a colon); real validation happens in detect().
    # NOTE(review): the second alternative needs at least two characters
    # before a colon, so short forms such as "::1" are never proposed.
    _candidate = re.compile(r"(?<![\w.])(?:\d{1,3}(?:\.\d{1,3}){3}|[0-9A-Fa-f:]{2,}:[0-9A-Fa-f:]*)(?![\w.])")

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a ``Match`` for each candidate ``ipaddress`` accepts."""
        for hit in self._candidate.finditer(text):
            candidate = hit.group(0)
            try:
                ipaddress.ip_address(candidate)
            except ValueError:
                # Looked like an address (e.g. a timestamp) but is not one.
                continue
            yield Match(
                EntityType.IP_ADDRESS,
                Category.NETWORK,
                Confidence.MEDIUM,
                hit.start(),
                hit.end(),
                candidate,
                self.name,
            )
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
GENERIC_DETECTORS = [EmailDetector(), CreditCardDetector(), IpAddressDetector()]
|