tabayyan 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tabayyan/__init__.py ADDED
@@ -0,0 +1,46 @@
1
+ """Tabayyan (تبيّن) — Saudi-aware PII detection & redaction for LLM pipelines.
2
+
3
+ Local-first. Zero telemetry. No network calls in the detection core.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from .engine import DetectionEngine
8
+ from .entities import Category, Confidence, EntityType, Match
9
+ from .middleware import AuditLog, AuditRecord, Guard, ProtectResult, is_in_kingdom
10
+ from .redaction import RedactionItem, RedactionMode, RedactionResult, redact, restore
11
+
12
__version__ = "0.5.1"

# Names re-exported as the package's public API. The two convenience
# wrappers defined in this module (scan, scan_and_redact) are listed after
# the names imported from the submodules.
__all__ = [
    # engine / entities
    "DetectionEngine", "Match", "EntityType", "Category", "Confidence",
    # redaction
    "RedactionMode", "RedactionResult", "RedactionItem", "redact", "restore",
    # middleware
    "Guard", "AuditLog", "AuditRecord", "ProtectResult", "is_in_kingdom",
    # helpers defined in this module
    "scan", "scan_and_redact",
    "__version__",
]
33
+
34
+
35
def scan(text: str) -> list[Match]:
    """Run one-shot detection over *text*.

    A new ``DetectionEngine()`` (constructed with no arguments) is built on
    every call and its ``scan`` result is returned unchanged.
    """
    engine = DetectionEngine()
    return engine.scan(text)
38
+
39
+
40
def scan_and_redact(
    text: str,
    mode: "RedactionMode | str" = RedactionMode.MASK,
    **kwargs,
) -> RedactionResult:
    """Detect entities in *text* and redact them in a single call.

    Detection uses a freshly constructed ``DetectionEngine()``. ``mode`` and
    any extra keyword arguments are forwarded to ``redact()`` untouched.
    """
    matches = DetectionEngine().scan(text)
    return redact(text, matches, mode, **kwargs)
tabayyan/checksums.py ADDED
@@ -0,0 +1,82 @@
1
+ """Deterministic, offline checksum primitives.
2
+
3
+ Pure functions only. Passing a checksum confirms a value is *structurally
4
+ valid*, NOT that it was ever issued.
5
+ """
6
+ from __future__ import annotations
7
+
8
+
9
def luhn_is_valid(number: str) -> bool:
    """Return True if *number* passes the Luhn (mod-10) checksum.

    *number* must consist solely of decimal digits, with no separators;
    anything else (including the empty string) yields False rather than an
    exception. A passing checksum means the value is structurally valid, not
    that it was ever issued.
    """
    # isdecimal(), not isdigit(): isdigit() also accepts characters such as
    # superscript "²" that int() cannot convert, which used to raise
    # ValueError here instead of returning False.
    if not number.isdecimal():
        return False
    total = 0
    # Counting from the right, every second digit (offsets 1, 3, 5, ...) is
    # doubled; a two-digit product is reduced by 9 (same as summing its digits).
    for offset, ch in enumerate(reversed(number)):
        d = int(ch)
        if offset % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        total += d
    return total % 10 == 0
22
+
23
+
24
def luhn_check_digit(number_without_check: str) -> int:
    """Compute the Luhn check digit to append to *number_without_check*.

    The payload is not validated here: a non-numeric character makes
    ``int()`` raise ``ValueError``. An empty payload yields 0.
    """
    # The payload's right-most digit will sit next to the check digit, so it
    # is one of the doubled positions; this parity selects those indices.
    doubled_parity = (len(number_without_check) + 1) % 2
    running = 0
    for position, char in enumerate(number_without_check):
        digit = int(char)
        if position % 2 == doubled_parity:
            digit = digit * 2
            digit = digit - 9 if digit > 9 else digit
        running += digit
    # Smallest non-negative digit that brings the total to a multiple of 10.
    return (-running) % 10
35
+
36
+
37
def saudi_id_is_valid(value: str) -> bool:
    """Return True if *value* is a structurally valid 10-digit Saudi ID.

    The value must be exactly ten decimal digits, start with "1" or "2", and
    satisfy the checksum below. Any other input returns False; no exception
    is raised for malformed text.
    """
    # isdecimal(), not isdigit(): isdigit() lets through characters such as
    # "²" that int() rejects, which previously crashed with ValueError.
    if len(value) != 10 or not value.isdecimal() or value[0] not in ("1", "2"):
        return False
    total = 0
    for i, ch in enumerate(value):
        d = int(ch)
        if i % 2 == 0:
            # Even 0-based positions are doubled and contribute the sum of
            # the product's decimal digits.
            d *= 2
            total += d // 10 + d % 10
        else:
            total += d
    return total % 10 == 0
49
+
50
+
51
def saudi_id_check_digit(first_nine: str) -> int:
    """Compute the tenth (check) digit for the first nine digits of a Saudi ID.

    Raises:
        ValueError: if *first_nine* is not exactly nine decimal digits.
    """
    # isdecimal(), not isdigit(): isdigit() accepts characters like "²" that
    # int() rejects, which bypassed this guard and surfaced a raw int() error
    # instead of the documented message.
    if len(first_nine) != 9 or not first_nine.isdecimal():
        raise ValueError("first_nine must be exactly 9 digits")
    partial = 0
    for i, ch in enumerate(first_nine):
        d = int(ch)
        if i % 2 == 0:
            # Even 0-based positions are doubled; add the product's digits.
            d *= 2
            partial += d // 10 + d % 10
        else:
            partial += d
    # The check digit sits at index 9 (odd), so it is added undoubled.
    return (10 - (partial % 10)) % 10
63
+
64
+
65
def _iban_to_numeric(iban: str) -> str:
    """Expand an IBAN-shaped string into the decimal string used for mod 97.

    The first four characters are moved to the end; digits are kept and every
    other character is replaced by ``ord(upper) - 55`` (so A=10 ... Z=35).
    No validation happens here: callers must pass ASCII letters and digits
    only, otherwise the result is not a usable number.
    """
    rearranged = iban[4:] + iban[:4]
    return "".join(
        ch if ch.isdigit() else str(ord(ch.upper()) - 55) for ch in rearranged
    )


def iban_mod97_is_valid(iban: str) -> bool:
    """Return True if *iban* passes the mod-97 check (remainder 1).

    Spaces are ignored and case is folded. The value must then be at least
    five ASCII alphanumeric characters, starting with two letters followed by
    two digits. Anything else returns False.
    """
    compact = iban.replace(" ", "")
    # The whole string has to be ASCII alphanumeric, not just the first four
    # characters: punctuation such as "-" used to reach _iban_to_numeric,
    # produce a negative chunk and make int() raise ValueError. The check is
    # done before upper() because upper() can turn non-ASCII into ASCII.
    if not (compact.isascii() and compact.isalnum()):
        return False
    compact = compact.upper()
    if len(compact) < 5 or not compact[:2].isalpha() or not compact[2:4].isdigit():
        return False
    return int(_iban_to_numeric(compact)) % 97 == 1


def iban_check_digits(country: str, bban: str) -> str:
    """Return the two check digits for *country* + *bban* as a 2-char string.

    Inputs are upper-cased but not otherwise validated; characters other than
    ASCII letters and digits may make ``int()`` raise ``ValueError``.
    """
    # Check digits are computed against a placeholder "00".
    trial = country.upper() + "00" + bban.upper()
    remainder = int(_iban_to_numeric(trial)) % 97
    return f"{98 - remainder:02d}"
tabayyan/cli.py ADDED
@@ -0,0 +1,220 @@
1
+ """Command-line interface for Tabayyan. Stdlib only.
2
+
3
+ Commands:
4
+ tabayyan scan [paths...] detect entities, print findings
5
+ tabayyan redact [paths...] detect + redact, print sanitised text
6
+
7
+ Reads stdin when no path is given or path is '-'. Supports batch over
8
+ files and directories. Exit code is non-zero when entities are found,
9
+ so it slots into CI / pre-commit gates.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import sys
16
+ from pathlib import Path
17
+ from typing import Iterable
18
+
19
+ from . import __version__
20
+ from .config import Config
21
+ from .engine import DetectionEngine
22
+ from .streaming import scan_file
23
+ from .entities import Confidence, Match
24
+ from .homoglyph import scan_text as _scan_domains
25
+ from .redaction import RedactionMode, redact
26
+
27
# Numeric rank per confidence level (higher = more confident); used to apply
# the --min-confidence floor.
_CONFIDENCE_ORDER = {
    Confidence.LOW: 0,
    Confidence.MEDIUM: 1,
    Confidence.HIGH: 2,
}
# Extensions (compared lower-cased) that are read when a directory is walked.
_TEXT_SUFFIXES = {
    ".txt", ".md", ".log", ".json", ".csv", ".eml", ".text",
}
29
+
30
+
31
def _iter_inputs(paths: list[str]) -> Iterable[tuple[str, str]]:
    """Yield ``(source_name, text)`` pairs for the CLI path arguments.

    * No paths at all is treated like a single ``'-'``: stdin is read and
      reported as ``<stdin>``.
    * A directory is walked recursively in sorted order; only regular files
      whose lower-cased suffix is in ``_TEXT_SUFFIXES`` are read.
    * A regular file is read whatever its suffix.
    * Anything else produces a message on stderr and is skipped.

    Files are decoded as UTF-8 with undecodable bytes replaced.
    """
    for raw in paths or ["-"]:
        if raw == "-":
            yield ("<stdin>", sys.stdin.read())
            continue
        target = Path(raw)
        if target.is_dir():
            for entry in sorted(target.rglob("*")):
                if not entry.is_file():
                    continue
                if entry.suffix.lower() not in _TEXT_SUFFIXES:
                    continue
                yield (str(entry), entry.read_text(encoding="utf-8", errors="replace"))
            continue
        if target.is_file():
            yield (str(target), target.read_text(encoding="utf-8", errors="replace"))
            continue
        print(f"tabayyan: cannot read '{raw}'", file=sys.stderr)
49
+
50
+
51
def _engine_from_args(args) -> DetectionEngine:
    """Build the detection engine for a CLI invocation.

    When ``args.config`` is present and truthy the engine comes from that
    config file; otherwise a default ``DetectionEngine()`` is returned.
    ``getattr`` is used so namespaces without a ``config`` attribute work.
    """
    config_path = getattr(args, "config", None)
    if not config_path:
        return DetectionEngine()
    return Config.from_file(config_path).build_engine()
54
+
55
+
56
def _filter_matches(matches: list[Match], args) -> list[Match]:
    """Apply the ``--min-confidence``, ``--only`` and ``--exclude`` filters.

    Each filter is active only when its argument is truthy. With no active
    filter the input list object itself is returned; otherwise a new list is
    built, preserving order.
    """
    floor = _CONFIDENCE_ORDER[Confidence(args.min_confidence)] if args.min_confidence else None
    only = set(args.only) if args.only else None
    excluded = set(args.exclude) if args.exclude else None

    if floor is None and only is None and excluded is None:
        return matches

    def _keeps(m: Match) -> bool:
        # A match survives only if it clears every active filter.
        if floor is not None and _CONFIDENCE_ORDER[m.confidence] < floor:
            return False
        if only is not None and m.entity_type.value not in only:
            return False
        if excluded is not None and m.entity_type.value in excluded:
            return False
        return True

    return [m for m in matches if _keeps(m)]
68
+
69
+
70
def _cmd_scan(args) -> int:
    """Handle ``tabayyan scan``: detect entities and print the findings.

    With ``--stream`` each path is scanned through ``scan_file``; an empty
    path or ``'-'`` is reported on stderr and skipped. Otherwise inputs come
    from ``_iter_inputs``. Findings are printed as tab-separated lines, or
    collected and dumped as one JSON list when ``--json`` is set.

    Returns 1 only when ``--fail-on-find`` is set and at least one match
    survived filtering; 0 otherwise.
    """
    engine = _engine_from_args(args)

    def _sources():
        # Yield (source_name, unfiltered_matches) for the active input mode.
        if args.stream:
            for raw in args.paths:
                if raw in ("", "-"):
                    print("tabayyan: --stream requires file paths, not stdin", file=sys.stderr)
                    continue
                yield raw, list(scan_file(raw, engine))
        else:
            for name, text in _iter_inputs(args.paths):
                yield name, engine.scan(text)

    found_any = False
    report = []
    for source, unfiltered in _sources():
        matches = _filter_matches(unfiltered, args)
        if matches:
            found_any = True
        if args.json:
            report.append({"source": source, "matches": [m.to_dict() for m in matches]})
            continue
        for m in matches:
            val = "" if args.no_values else f" {m.value!r}"
            print(f"{source}:{m.start}-{m.end}\t{m.entity_type.value}\t"
                  f"{m.confidence.value}\t{m.category.value}{val}")
    if args.json:
        json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")
    return 1 if (args.fail_on_find and found_any) else 0
108
+
109
+
110
def _cmd_redact(args) -> int:
    """Handle ``tabayyan redact``: detect, redact and print sanitised text.

    All inputs are read up front so that a ``===== name =====`` header can be
    printed when there is more than one. In ``--json`` mode one JSON document
    per input is written (the redaction result plus a ``source`` key). In
    text mode a vault notice goes to stderr when the result carries a vault.

    Returns 1 only when ``--fail-on-find`` is set and something was found.
    """
    engine = _engine_from_args(args)
    mode = RedactionMode(args.mode)
    inputs = list(_iter_inputs(args.paths))
    show_headers = len(inputs) > 1
    found_any = False
    for name, text in inputs:
        matches = _filter_matches(engine.scan(text), args)
        found_any = found_any or bool(matches)
        result = redact(
            text,
            matches,
            mode,
            salt=args.salt,
            hash_length=args.hash_length,
            partial_keep_last=args.keep_last,
        )
        if args.json:
            payload = result.to_dict()
            payload["source"] = name
            json.dump(payload, sys.stdout, ensure_ascii=False, indent=2)
            sys.stdout.write("\n")
            continue
        if show_headers:
            print(f"===== {name} =====")
        sys.stdout.write(result.text)
        # Keep the output newline-terminated.
        if not result.text.endswith("\n"):
            sys.stdout.write("\n")
        if result.vault:
            print(f"# vault ({len(result.vault)} tokens) — store securely; "
                  f"use --json to capture", file=sys.stderr)
    return 1 if (args.fail_on_find and found_any) else 0
140
+
141
+
142
def _load_watchlist(path: str | None) -> list[str]:
    """Read a watchlist file: one entry per line, ``#`` starts a comment line.

    Returns an empty list when *path* is None or empty. Surrounding
    whitespace is stripped from each entry and blank lines are dropped.
    """
    if not path:
        return []
    stripped = (ln.strip() for ln in Path(path).read_text(encoding="utf-8").splitlines())
    # The comment test runs on the stripped text: checking the raw line let
    # indented "  # ..." comments through as if they were domains.
    return [entry for entry in stripped if entry and not entry.startswith("#")]
147
+
148
+
149
def _cmd_domains(args) -> int:
    """Handle ``tabayyan domains``: report lookalike / homoglyph domains.

    Each input is passed to the homoglyph scanner together with the optional
    watchlist and ``--max-distance``. Findings are printed as tab-separated
    lines, or gathered into one JSON list (via ``vars()`` on each finding)
    when ``--json`` is set.

    Returns 1 only when ``--fail-on-find`` is set and something was found.
    """
    watchlist = _load_watchlist(args.watchlist)
    flagged = False
    report = []
    for name, text in _iter_inputs(args.paths):
        findings = _scan_domains(
            text, watchlist, typosquat_max_distance=args.max_distance
        )
        flagged = flagged or bool(findings)
        if args.json:
            report.append({"source": name, "findings": [vars(f) for f in findings]})
            continue
        for f in findings:
            tgt = f" -> {f.target}" if f.target else ""
            print(f"{name}:{f.start}-{f.end}\t{f.domain}\t{f.reason}\t"
                  f"{f.confidence}{tgt}\t{f.detail}")
    if args.json:
        json.dump(report, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")
    return 1 if (args.fail_on_find and flagged) else 0
169
+
170
+
171
def _add_common_filters(p: argparse.ArgumentParser) -> None:
    """Register the arguments shared by the ``scan`` and ``redact`` commands."""
    # (flags, add_argument keyword options), in registration order.
    shared_arguments = (
        (("paths",), {"nargs": "*", "help": "files/dirs, or '-' for stdin"}),
        (("--min-confidence",), {"choices": ["low", "medium", "high"],
                                 "help": "drop matches below this confidence"}),
        (("--only",), {"nargs": "+", "metavar": "TYPE",
                       "help": "keep only these entity types"}),
        (("--exclude",), {"nargs": "+", "metavar": "TYPE",
                          "help": "drop these entity types"}),
        (("--json",), {"action": "store_true", "help": "emit JSON"}),
        (("--fail-on-find",), {"action": "store_true",
                               "help": "exit 1 if any entity is found (for CI / pre-commit)"}),
        (("--config",), {"help": "JSON config: disable/add detectors, thresholds"}),
    )
    for flags, options in shared_arguments:
        p.add_argument(*flags, **options)
181
+
182
+
183
def build_parser() -> argparse.ArgumentParser:
    """Build the top-level ``tabayyan`` parser with its three sub-commands.

    Sub-commands: ``scan``, ``redact`` and ``domains``. Each one stores its
    handler in ``func`` via ``set_defaults`` so ``main`` can dispatch on it.
    """
    # The description is the first line of the module docstring. Under
    # ``python -OO`` docstrings are stripped and __doc__ is None, which used
    # to raise AttributeError here; fall back to no description instead.
    doc_lines = (__doc__ or "").splitlines()
    parser = argparse.ArgumentParser(
        prog="tabayyan", description=doc_lines[0] if doc_lines else None
    )
    parser.add_argument("--version", action="version", version=f"tabayyan {__version__}")
    sub = parser.add_subparsers(dest="command", required=True)

    ps = sub.add_parser("scan", help="detect entities")
    _add_common_filters(ps)
    ps.add_argument("--no-values", action="store_true", help="hide raw values in output")
    ps.add_argument("--stream", action="store_true",
                    help="scan large files incrementally (file paths only)")
    ps.set_defaults(func=_cmd_scan)

    pr = sub.add_parser("redact", help="detect and redact")
    _add_common_filters(pr)
    pr.add_argument("--mode", choices=[m.value for m in RedactionMode], default="mask")
    pr.add_argument("--salt", default="", help="salt for hash mode")
    pr.add_argument("--hash-length", type=int, default=12, help="hash token length")
    pr.add_argument("--keep-last", type=int, default=4, help="kept chars in partial mode")
    pr.set_defaults(func=_cmd_redact)

    # "domains" declares its own arguments: it has no entity filters or
    # --config option.
    pd = sub.add_parser("domains", help="detect lookalike / homoglyph domains")
    pd.add_argument("paths", nargs="*", help="files/dirs, or '-' for stdin")
    pd.add_argument("--watchlist", help="file of legitimate domains, one per line")
    pd.add_argument("--max-distance", type=int, default=1,
                    help="max edit distance for typosquat flag")
    pd.add_argument("--json", action="store_true", help="emit JSON")
    pd.add_argument("--fail-on-find", action="store_true",
                    help="exit 1 if any suspicious domain is found")
    pd.set_defaults(func=_cmd_domains)
    return parser
212
+
213
+
214
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: parse *argv* and run the selected sub-command.

    ``argv=None`` lets argparse read ``sys.argv``. The return value is the
    handler's exit code.
    """
    parser = build_parser()
    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    raise SystemExit(main())
tabayyan/config.py ADDED
@@ -0,0 +1,92 @@
1
+ """Configuration: customise detection without editing code.
2
+
3
+ Load a JSON config to enable/disable detectors, add custom regex
4
+ detectors, extend the confusable map, and tune thresholds. JSON is used
5
+ (not TOML) to keep zero runtime dependencies on Python 3.9.
6
+
7
+ Schema (all keys optional):
8
+ {
9
+ "disable": ["saudi_cr", "arabic_name"],
10
+ "typosquat_max_distance": 2,
11
+ "confusables": {"ⅴ": "v"},
12
+ "custom_detectors": [
13
+ {"label": "employee_id", "pattern": "EMP-\\\\d{6}",
14
+ "category": "organisation", "confidence": "medium"}
15
+ ]
16
+ }
17
+
18
+ Custom-detector matches use entity_type CUSTOM; their configured label is
19
+ preserved in the `detector` and `notes` fields and in the mask placeholder.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import re
25
+ from dataclasses import dataclass, field
26
+ from pathlib import Path
27
+ from typing import Iterable
28
+
29
+ from .confusables import register_confusables
30
+ from .detectors import DEFAULT_DETECTORS, Detector
31
+ from .engine import DetectionEngine
32
+ from .entities import Category, Confidence, EntityType, Match
33
+
34
# Lookup tables from the string spelling used in the JSON config to the
# corresponding enum member.
_CONF = {level.value: level for level in Confidence}
_CAT = {kind.value: kind for kind in Category}
36
+
37
+
38
class CustomRegexDetector(Detector):
    """A user-defined regex detector loaded from config.

    Every non-empty regex match is reported as an ``EntityType.CUSTOM`` match
    carrying the configured category and confidence. The configured label is
    kept in the detector name (``custom:<label>``), in the match ``label``
    and in its ``notes``.
    """

    def __init__(self, label: str, pattern: str, category: Category,
                 confidence: Confidence) -> None:
        """Compile *pattern*; ``re.error`` propagates if it is not valid."""
        self.name = f"custom:{label}"
        self.label = label
        self._rx = re.compile(pattern)
        self._category = category
        self._confidence = confidence

    def detect(self, text: str) -> Iterable[Match]:
        """Yield one match per non-empty occurrence of the pattern in *text*."""
        for m in self._rx.finditer(text):
            # Patterns that can match the empty string (e.g. "x*") yield
            # zero-width hits at many positions; they carry no value to
            # redact, so they are skipped.
            if m.start() == m.end():
                continue
            yield Match(
                entity_type=EntityType.CUSTOM, category=self._category,
                confidence=self._confidence, start=m.start(), end=m.end(),
                value=m.group(0), detector=self.name, label=self.label,
                notes=f"custom detector '{self.label}'",
            )

    def mask_label(self) -> str:
        """Return the upper-cased label wrapped in square brackets."""
        return f"[{self.label.upper()}]"
60
+
61
+
62
@dataclass
class Config:
    """Parsed detection configuration.

    Holds the detector names to disable, the typosquat distance threshold and
    the custom regex detectors built from the config.
    """

    disable: set[str] = field(default_factory=set)
    typosquat_max_distance: int = 1
    custom_detectors: list[CustomRegexDetector] = field(default_factory=list)

    @classmethod
    def from_dict(cls, data: dict) -> "Config":
        """Build a Config from an already-parsed JSON object.

        All keys are optional, and a key explicitly set to ``null`` is treated
        like a missing key. Side effect: any ``confusables`` entries are
        registered into the global confusable map.

        Raises:
            KeyError: a custom detector lacks ``label``/``pattern`` or names
                an unknown ``category``/``confidence``.
            re.error: a custom detector pattern does not compile.
        """
        extra_confusables = data.get("confusables") or {}
        if extra_confusables:
            register_confusables(dict(extra_confusables))

        # "or []" keeps null handling consistent with "confusables" above;
        # iterating None used to raise TypeError.
        customs = [
            CustomRegexDetector(
                label=spec["label"], pattern=spec["pattern"],
                category=_CAT[spec.get("category", "organisation")],
                confidence=_CONF[spec.get("confidence", "medium")],
            )
            for spec in data.get("custom_detectors") or []
        ]

        disabled = data.get("disable") or []
        if isinstance(disabled, str):
            # A bare string would otherwise be split into single characters
            # by set(), silently disabling nothing.
            disabled = [disabled]

        distance = data.get("typosquat_max_distance")
        return cls(
            disable=set(disabled),
            typosquat_max_distance=1 if distance is None else int(distance),
            custom_detectors=customs,
        )

    @classmethod
    def from_file(cls, path: str | Path) -> "Config":
        """Load a UTF-8 JSON config file and build a Config from it."""
        return cls.from_dict(json.loads(Path(path).read_text(encoding="utf-8")))

    def build_engine(self) -> DetectionEngine:
        """Return an engine with the non-disabled default detectors followed
        by the custom detectors."""
        detectors = [d for d in DEFAULT_DETECTORS if d.name not in self.disable]
        detectors.extend(self.custom_detectors)
        return DetectionEngine(detectors)
@@ -0,0 +1,99 @@
1
+ """Script classification and confusable-skeleton folding.
2
+
3
+ Used by the homoglyph subsystem to catch domains that impersonate a
4
+ target using visually-confusable characters (IDN homograph attacks) or
5
+ mix scripts. The confusables map is a curated, practical subset of the
6
+ Unicode confusables data — extensible, not exhaustive. See
7
+ homoglyph.py for the detection logic that consumes these primitives.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ # Curated confusable -> ASCII/canonical skeleton map.
12
+ # Focus: characters realistically used to spoof Latin domains
13
+ # (Cyrillic, Greek, fullwidth, digit/letter swaps) plus Arabic-Indic
14
+ # digits which appear in mixed Arabic/Latin spoofs.
15
_CONFUSABLES: dict[str, str] = {
    # Cyrillic (and a few Latin-extension / Roman-numeral lookalikes) -> Latin
    "а": "a", "е": "e", "о": "o", "р": "p",
    "с": "c", "х": "x", "у": "y", "к": "k",
    "м": "m", "н": "h", "т": "t", "в": "b",
    "і": "i", "ј": "j", "ѕ": "s", "ԁ": "d",
    "ɡ": "g", "ӏ": "l", "Ӏ": "l", "ⅼ": "l",
    "j": "j",
    # Greek -> Latin
    "α": "a", "ο": "o", "ν": "v", "ρ": "p",
    "τ": "t", "υ": "u", "ι": "i", "κ": "k",
    "χ": "x", "ε": "e",
    # NOTE(review): the entries originally listed under "Fullwidth Latin ->
    # ASCII" are ASCII -> ASCII identity pairs in the file as received, so
    # they fold nothing. They are kept so the existing key set is unchanged;
    # the real fullwidth letters are added after the literal.
    "a": "a", "b": "b", "c": "c", "d": "d",
    "e": "e", "o": "o", "p": "p", "s": "s",
    "x": "x", "i": "i", "l": "l", "m": "m",
    "n": "n", "g": "g", "t": "t", "r": "r",
    "u": "u",
    # Digit/letter confusions folded to a single skeleton.
    # NOTE(review): skeleton() lower-cases its input before the lookup, so
    # the upper-case "I" key is never reached through that function.
    "0": "o", "1": "l", "5": "s", "8": "b",
    "I": "l", "|": "l",
    # Arabic-Indic digits -> ASCII digit, then folded like the digits above
    "٠": "o", "١": "l", "٢": "2", "٣": "3", "٤": "4",
    "٥": "s", "٦": "6", "٧": "7", "٨": "b", "٩": "9",
    # Eastern Arabic-Indic digits, same folding
    "۰": "o", "۱": "l", "۲": "2", "۳": "3", "۴": "4",
    "۵": "s", "۶": "6", "۷": "7", "۸": "b", "۹": "9",
}

# Fullwidth small letters U+FF41..U+FF5A -> ASCII a..z. Fullwidth capitals
# need no entry for skeleton(): lower-casing maps them onto these first.
_CONFUSABLES.update({chr(0xFF41 + i): chr(0x61 + i) for i in range(26)})
37
+
38
+
39
def register_confusables(extra: dict[str, str]) -> None:
    """Add or override entries in the module-wide confusables map.

    The change is global and lasts for the life of the process: every later
    ``skeleton()`` call sees it.
    """
    for source_char, folded in dict(extra).items():
        _CONFUSABLES[source_char] = folded
42
+
43
+
44
def skeleton(label: str) -> str:
    """Return the confusable skeleton of *label*.

    The label is lower-cased, then each character is replaced by its entry
    in the confusables map (characters without an entry are kept). Strings
    that share a skeleton are treated as visual lookalikes.
    """
    lowered = label.lower()
    return "".join(_CONFUSABLES.get(ch, ch) for ch in lowered)
54
+
55
+
56
+ # Script ranges (start, end, name). Order matters only for first-match.
57
_SCRIPT_RANGES = [
    # Latin: basic upper/lower case, then Latin-1 letters through Extended-B
    (0x0041, 0x005A, "Latin"),
    (0x0061, 0x007A, "Latin"),
    (0x00C0, 0x024F, "Latin"),
    # Greek
    (0x0370, 0x03FF, "Greek"),
    (0x1F00, 0x1FFF, "Greek"),
    # Cyrillic
    (0x0400, 0x04FF, "Cyrillic"),
    (0x0500, 0x052F, "Cyrillic"),
    # Arabic
    (0x0600, 0x06FF, "Arabic"),
    (0x0750, 0x077F, "Arabic"),
    (0x08A0, 0x08FF, "Arabic"),
    (0xFB50, 0xFDFF, "Arabic"),
    (0xFE70, 0xFEFF, "Arabic"),
    # Others
    (0x0590, 0x05FF, "Hebrew"),
    (0x4E00, 0x9FFF, "Han"),
    (0x3040, 0x30FF, "Kana"),
]


def script_of(ch: str) -> str:
    """Classify a single character into a coarse script name.

    Returns "Common" for digits below U+0660, for the separators
    ``- . _ / :`` and for anything outside the ranges in ``_SCRIPT_RANGES``.
    Digits at or above U+0660 are not short-circuited, so those inside a
    listed range take that range's script (e.g. Arabic-Indic digits are
    "Arabic").
    """
    cp = ord(ch)
    is_neutral_digit = ch.isdigit() and cp < 0x0660
    if is_neutral_digit or ch in "-._/:":
        return "Common"
    # First matching range wins; unlisted code points are script-neutral.
    return next(
        (name for low, high, name in _SCRIPT_RANGES if low <= cp <= high),
        "Common",
    )
86
+
87
+
88
def scripts_in(label: str) -> set[str]:
    """Collect the distinct scripts used in *label*, leaving out "Common"."""
    seen = {script_of(ch) for ch in label}
    return seen - {"Common"}
91
+
92
+
93
def is_mixed_script(label: str) -> bool:
    """Report whether *label* contains characters from more than one script.

    Script-neutral ("Common") characters are ignored. Two or more distinct
    scripts inside one label, such as Latin together with Cyrillic, is
    treated as a spoofing signal.
    """
    distinct_scripts = scripts_in(label)
    return len(distinct_scripts) > 1
@@ -0,0 +1,12 @@
1
+ """Detector registry."""
2
+ from __future__ import annotations
3
+
4
+ from .base import Detector
5
+ from .generic import GENERIC_DETECTORS
6
+ from .names import ArabicNameDetector
7
+ from .saudi import SAUDI_DETECTORS
8
+
9
# Default detector set, in order: Saudi-specific detectors, then the generic
# ones, then the Arabic name detector.
DEFAULT_DETECTORS = list(SAUDI_DETECTORS)
DEFAULT_DETECTORS.extend(GENERIC_DETECTORS)
DEFAULT_DETECTORS.append(ArabicNameDetector())

__all__ = [
    "Detector",
    "DEFAULT_DETECTORS",
    "SAUDI_DETECTORS",
    "GENERIC_DETECTORS",
    "ArabicNameDetector",
]
@@ -0,0 +1,15 @@
1
+ """Detector base class. Detectors are pure: text in, matches out."""
2
+ from __future__ import annotations
3
+
4
+ from abc import ABC, abstractmethod
5
+ from typing import Iterable
6
+
7
+ from ..entities import Match
8
+
9
+
10
class Detector(ABC):
    """Abstract interface implemented by every detector.

    Subclasses provide ``detect`` and normally override ``name``.
    """

    # Identifier of the detector; subclasses override this default.
    name: str = "detector"

    @abstractmethod
    def detect(self, text: str) -> Iterable[Match]:
        """Return the matches found in *text*. Subclasses must override."""
        raise NotImplementedError
@@ -0,0 +1,37 @@
1
+ """Opt-in lookalike/homoglyph domain detector.
2
+
3
+ Not part of DEFAULT_DETECTORS: it needs a watchlist to be most useful and
4
+ it emits THREAT findings, not PII. Construct it explicitly (optionally
5
+ with a watchlist) and pass it to DetectionEngine, or use the `domains`
6
+ CLI command.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Iterable, Sequence
11
+
12
+ from ..entities import Category, Confidence, EntityType, Match
13
+ from ..homoglyph import scan_text
14
+ from .base import Detector
15
+
16
# Confidence strings reported by the homoglyph scanner -> Confidence members.
_CONF = {
    "high": Confidence.HIGH,
    "medium": Confidence.MEDIUM,
    "low": Confidence.LOW,
}


class LookalikeDomainDetector(Detector):
    """Adapter exposing the homoglyph domain scanner as a ``Detector``.

    Each finding from ``scan_text`` becomes a ``SUSPICIOUS_DOMAIN`` match in
    the ``THREAT`` category.
    """

    name = "lookalike_domain"

    def __init__(self, watchlist: Sequence[str] | None = None,
                 typosquat_max_distance: int = 1) -> None:
        """Store a copy of *watchlist* (empty when None) and the distance."""
        self.watchlist = list(watchlist or [])
        self.typosquat_max_distance = typosquat_max_distance

    def detect(self, text: str) -> Iterable[Match]:
        """Yield one match per suspicious domain found in *text*."""
        findings = scan_text(
            text, self.watchlist,
            typosquat_max_distance=self.typosquat_max_distance,
        )
        for finding in findings:
            # With a target, prefix the detail with the reason; without one
            # the detail is used as-is.
            if finding.target is None:
                note = finding.detail
            else:
                note = f"{finding.reason}: {finding.detail}"
            yield Match(
                entity_type=EntityType.SUSPICIOUS_DOMAIN,
                category=Category.THREAT,
                confidence=_CONF[finding.confidence],
                start=finding.start,
                end=finding.end,
                value=finding.domain,
                detector=self.name,
                notes=note,
            )
@@ -0,0 +1,50 @@
1
+ """Generic, locale-independent detectors: email, credit card, IP."""
2
+ from __future__ import annotations
3
+
4
+ import ipaddress
5
+ import re
6
+ from typing import Iterable
7
+
8
+ from ..checksums import luhn_is_valid
9
+ from ..entities import Category, Confidence, EntityType, Match
10
+ from .base import Detector
11
+
12
+
13
class EmailDetector(Detector):
    """Regex-based e-mail address detector (ASCII local part and domain)."""

    name = "email"
    # The lookbehind stops a match from starting in the middle of a longer
    # local part.
    _pattern = re.compile(
        r"(?<![A-Za-z0-9._%+\-])"
        r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}"
    )

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a MEDIUM-confidence CONTACT match for each address found."""
        for hit in self._pattern.finditer(text):
            begin, finish = hit.span()
            yield Match(EntityType.EMAIL, Category.CONTACT, Confidence.MEDIUM,
                        begin, finish, hit.group(0), self.name)
21
+
22
+
23
class CreditCardDetector(Detector):
    """Detect 13-19 digit card numbers that pass the Luhn checksum."""

    name = "credit_card"
    # 13 to 19 digits, each optionally followed by one space or hyphen, not
    # adjacent to further digits.
    _pattern = re.compile(r"(?<!\d)(?:\d[ \-]?){12,18}\d(?!\d)")

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a HIGH-confidence match per Luhn-valid candidate.

        The reported span covers the text as written, while the reported
        value is the number with spaces and hyphens removed.
        """
        for hit in self._pattern.finditer(text):
            digits = "".join(ch for ch in hit.group(0) if ch not in " -")
            if 13 <= len(digits) <= 19 and luhn_is_valid(digits):
                yield Match(EntityType.CREDIT_CARD, Category.FINANCIAL, Confidence.HIGH,
                            hit.start(), hit.end(), digits, self.name, "Luhn-valid")
34
+
35
+
36
class IpAddressDetector(Detector):
    """Detect IPv4 and IPv6 addresses.

    A loose regex proposes candidates; each one is then confirmed with
    ``ipaddress.ip_address`` so that only parseable addresses are reported.
    """

    name = "ip_address"
    # The trailing lookahead rejects a candidate followed by a word character
    # or by ".<word character>" (so "1.2.3.4.5" and "1.2.3.4.example" are not
    # matched), but accepts one followed by a plain period. The earlier
    # (?![\w.]) form dropped any address ending a sentence, e.g.
    # "the server is 10.0.0.1."
    _candidate = re.compile(
        r"(?<![\w.])"
        r"(?:\d{1,3}(?:\.\d{1,3}){3}|[0-9A-Fa-f:]{2,}:[0-9A-Fa-f:]*)"
        r"(?!\w|\.\w)"
    )

    def detect(self, text: str) -> Iterable[Match]:
        """Yield a MEDIUM-confidence NETWORK match per valid address."""
        for m in self._candidate.finditer(text):
            try:
                ipaddress.ip_address(m.group(0))
            except ValueError:
                # Shaped like an address but not one (e.g. "999.1.1.1").
                continue
            yield Match(EntityType.IP_ADDRESS, Category.NETWORK, Confidence.MEDIUM,
                        m.start(), m.end(), m.group(0), self.name)
48
+
49
+
50
# One shared instance of each locale-independent detector, in run order.
GENERIC_DETECTORS = [
    EmailDetector(),
    CreditCardDetector(),
    IpAddressDetector(),
]