datacloak 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datacloak/__init__.py +177 -0
- datacloak/cli.py +222 -0
- datacloak/detectors/__init__.py +40 -0
- datacloak/detectors/aadhaar.py +53 -0
- datacloak/detectors/base.py +97 -0
- datacloak/detectors/credit_card.py +60 -0
- datacloak/detectors/email.py +50 -0
- datacloak/detectors/ifsc.py +57 -0
- datacloak/detectors/ip_address.py +86 -0
- datacloak/detectors/mobile.py +60 -0
- datacloak/detectors/pan.py +57 -0
- datacloak/detectors/upi.py +64 -0
- datacloak/file_scanner.py +272 -0
- datacloak/masker.py +196 -0
- datacloak/py.typed +0 -0
- datacloak/reporter.py +126 -0
- datacloak/scanner.py +76 -0
- datacloak-0.1.0.dist-info/METADATA +364 -0
- datacloak-0.1.0.dist-info/RECORD +22 -0
- datacloak-0.1.0.dist-info/WHEEL +4 -0
- datacloak-0.1.0.dist-info/entry_points.txt +2 -0
- datacloak-0.1.0.dist-info/licenses/LICENSE +21 -0
datacloak/__init__.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataCloak — Privacy Protection Library
|
|
3
|
+
=======================================
|
|
4
|
+
|
|
5
|
+
A production-ready Python library for detecting and masking Personally
|
|
6
|
+
Identifiable Information (PII) in text, logs, files, and application data.
|
|
7
|
+
|
|
8
|
+
Quick start::
|
|
9
|
+
|
|
10
|
+
from datacloak import mask, scan
|
|
11
|
+
|
|
12
|
+
text = \"\"\"
|
|
13
|
+
Aadhaar: 2345 6789 0123
|
|
14
|
+
PAN: ABCDE1234F
|
|
15
|
+
Email: alice@example.com
|
|
16
|
+
Phone: 9876543210
|
|
17
|
+
\"\"\"
|
|
18
|
+
|
|
19
|
+
print(mask(text)) # partial masking (default)
|
|
20
|
+
print(mask(text, mode="full"))
|
|
21
|
+
print(mask(text, mode="hash"))
|
|
22
|
+
|
|
23
|
+
findings = scan(text)
|
|
24
|
+
# {"aadhaar": ["2345 6789 0123"], "pan": ["ABCDE1234F"], ...}
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
from typing import TYPE_CHECKING
|
|
31
|
+
|
|
32
|
+
from .detectors import (
|
|
33
|
+
DEFAULT_DETECTORS,
|
|
34
|
+
AadhaarDetector,
|
|
35
|
+
BaseDetector,
|
|
36
|
+
CreditCardDetector,
|
|
37
|
+
Detection,
|
|
38
|
+
EmailDetector,
|
|
39
|
+
IFSCDetector,
|
|
40
|
+
IPAddressDetector,
|
|
41
|
+
MobileDetector,
|
|
42
|
+
PANDetector,
|
|
43
|
+
UPIDetector,
|
|
44
|
+
)
|
|
45
|
+
from .file_scanner import FileScanResult, scan_file
|
|
46
|
+
from .masker import MaskMode, mask_text
|
|
47
|
+
from .reporter import Report, generate_report_from_file, generate_report_from_text
|
|
48
|
+
from .scanner import ScanResult, scan_summary, scan_text
|
|
49
|
+
|
|
50
|
+
if TYPE_CHECKING:
|
|
51
|
+
from pathlib import Path
|
|
52
|
+
|
|
53
|
+
# Package metadata.
__version__ = "0.1.0"
__author__ = "DataCloak Contributors"
__license__ = "MIT"

# Attach a NullHandler to the package logger so the library emits no log
# output unless the calling application configures logging itself.
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Convenience aliases (the primary public surface)
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def mask(
    text: str,
    mode: MaskMode = "partial",
    detectors: list[BaseDetector] | None = None,
) -> str:
    """
    Return *text* with every detected PII value masked.

    Convenience alias that forwards its arguments unchanged to
    :func:`~datacloak.masker.mask_text`.

    Parameters
    ----------
    text:
        Input string containing potential PII.
    mode:
        Masking strategy:

        * ``"partial"`` (default) — keep trailing characters visible.
        * ``"full"`` — replace with descriptive tags like ``[EMAIL_REDACTED]``.
        * ``"hash"`` — replace with SHA-256 digest.
    detectors:
        Optional list of :class:`~datacloak.detectors.BaseDetector` instances.
        Defaults to all built-in detectors.

    Returns
    -------
    str
        The masked string.

    Example::

        >>> from datacloak import mask
        >>> mask("Call me at 9876543210")
        'Call me at ******3210'
    """
    masked = mask_text(text, mode=mode, detectors=detectors)
    return masked
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def scan(
    text: str,
    detectors: list[BaseDetector] | None = None,
) -> ScanResult:
    """
    Look for PII in *text* and report what was found; *text* is not modified.

    Convenience alias that forwards its arguments unchanged to
    :func:`~datacloak.scanner.scan_text`.

    Parameters
    ----------
    text:
        Input string to scan.
    detectors:
        Optional custom detectors.

    Returns
    -------
    ScanResult
        Mapping of PII type name → list of detected values.

    Example::

        >>> from datacloak import scan
        >>> scan("Email me at bob@example.com")
        {'email': ['bob@example.com']}
    """
    findings = scan_text(text, detectors=detectors)
    return findings
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def report(
    text: str,
    source_label: str = "<inline text>",
    detectors: list[BaseDetector] | None = None,
) -> Report:
    """
    Build a structured :class:`~datacloak.reporter.Report` for *text*.

    Convenience alias that forwards its arguments unchanged to
    :func:`~datacloak.reporter.generate_report_from_text`.

    Parameters
    ----------
    text:
        Input string to analyse.
    source_label:
        Label passed through to the report generator to identify where the
        text came from.  Defaults to ``"<inline text>"``.
    detectors:
        Optional custom detectors.

    Returns
    -------
    Report
        The generated report.

    Example::

        >>> from datacloak import report
        >>> r = report("john@example.com called 9876543210")
        >>> print(r.to_json())
    """
    generated = generate_report_from_text(
        text,
        source_label=source_label,
        detectors=detectors,
    )
    return generated
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# Names exported by ``from datacloak import *``.  Kept as a plain list
# literal, grouped by role.
__all__ = [
    # Version
    "__version__",
    # Core API (convenience aliases defined in this module)
    "mask",
    "scan",
    "report",
    # File operations
    "scan_file",
    # Lower-level API (re-exported from the masker/scanner/reporter modules)
    "mask_text",
    "scan_text",
    "scan_summary",
    "generate_report_from_text",
    "generate_report_from_file",
    # Detectors
    "BaseDetector",
    "Detection",
    "DEFAULT_DETECTORS",
    "AadhaarDetector",
    "PANDetector",
    "MobileDetector",
    "EmailDetector",
    "UPIDetector",
    "CreditCardDetector",
    "IFSCDetector",
    "IPAddressDetector",
    # Types
    "MaskMode",
    "ScanResult",
    "FileScanResult",
    "Report",
]
|
datacloak/cli.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataCloak Command-Line Interface.
|
|
3
|
+
|
|
4
|
+
Usage::
|
|
5
|
+
|
|
6
|
+
datacloak scan file.txt
|
|
7
|
+
datacloak mask file.txt
|
|
8
|
+
datacloak mask file.txt --mode full --output masked.txt
|
|
9
|
+
datacloak report file.txt
|
|
10
|
+
datacloak report file.txt --output report.json
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
import click
|
|
20
|
+
|
|
21
|
+
import datacloak
|
|
22
|
+
from datacloak.file_scanner import mask_file, scan_file
|
|
23
|
+
from datacloak.masker import MaskMode
|
|
24
|
+
from datacloak.reporter import generate_report_from_file
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# CLI root
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(datacloak.__version__, "-V", "--version")
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Enable verbose logging output.",
)
def cli(verbose: bool) -> None:
    """
    \b
    DataCloak — Privacy Protection CLI
    ===================================
    Detect and mask PII in text files.
    """
    # The docstring above is the --help text; keep it unchanged.
    if not verbose:
        return

    # Logging is only configured when -v/--verbose is given, so the import
    # stays local to this branch.
    import logging

    logging.basicConfig(
        format="%(levelname)s %(name)s: %(message)s",
        level=logging.DEBUG,
    )
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
# scan command
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@cli.command("scan")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "table"], case_sensitive=False),
    default="table",
    show_default=True,
    help="Output format.",
)
def scan_cmd(file: Path, output_format: str) -> None:
    """
    Scan FILE for PII and display findings.

    \b
    Examples:
      datacloak scan customer_data.txt
      datacloak scan --format json logs.txt
    """
    # The docstring above is the --help text shown by click.
    result = scan_file(file)

    # A scan error is reported on stderr and ends the process with status 1.
    if result.error:
        click.secho(f"Error: {result.error}", fg="red", err=True)
        sys.exit(1)

    # JSON mode is handled before the "no findings" shortcut so that its
    # output is always a JSON document, even when nothing was detected.
    # NOTE(review): assumes ``by_type`` is an empty mapping when there are no
    # findings — confirm against FileScanResult.
    if output_format == "json":
        click.echo(json.dumps(result.by_type, indent=2, ensure_ascii=False))
        return

    if not result.findings:
        click.secho("✓ No PII detected.", fg="green")
        return

    # Table output: header, one row per finding, then summary and risk level.
    click.secho(f"\n📄 File: {file}", bold=True)
    click.secho(f"{'PII Type':<18} {'Value':<40} {'Line':>6}", fg="cyan")
    click.secho("─" * 66, fg="cyan")
    for finding in result.findings:
        # A falsy line number (None or 0) is rendered as a dash.
        line_str = str(finding.line_number) if finding.line_number else "—"
        click.echo(f"{finding.pii_type:<18} {finding.value:<40} {line_str:>6}")

    click.secho("─" * 66, fg="cyan")
    click.secho(f"\nSummary: {result.summary}", fg="yellow")
    risk_colours = {"NONE": "green", "LOW": "yellow", "MEDIUM": "magenta", "HIGH": "red"}
    from datacloak.reporter import _risk_level

    # The risk level is derived from the total number of findings; unknown
    # levels fall back to white.
    risk = _risk_level(len(result.findings))
    colour = risk_colours.get(risk, "white")
    click.secho(f"Risk level: {risk}", fg=colour, bold=True)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
# ---------------------------------------------------------------------------
|
|
116
|
+
# mask command
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@cli.command("mask")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "-m",
    "--mode",
    type=click.Choice(["partial", "full", "hash"], case_sensitive=False),
    default="partial",
    show_default=True,
    help="Masking mode.",
)
@click.option(
    "-o",
    "--output",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Output file path (default: <file>.masked<ext>).",
)
@click.option(
    "--stdout",
    is_flag=True,
    default=False,
    help="Print masked output to stdout instead of writing a file.",
)
def mask_cmd(file: Path, mode: str, output_path: Path | None, stdout: bool) -> None:
    """
    Mask PII in FILE.

    \b
    Masking modes:
      partial Keep last characters visible (default)
      full Replace with descriptive tags, e.g. [EMAIL_REDACTED]
      hash Replace with SHA-256 digest

    \b
    Examples:
      datacloak mask logs.txt
      datacloak mask logs.txt --mode full --output clean_logs.txt
      datacloak mask logs.txt --stdout | less
    """
    # The docstring above is the --help text shown by click.
    if not stdout:
        # File mode: mask_file writes the masked copy and returns its path.
        written_to = mask_file(file, output_path=output_path, mode=mode)  # type: ignore[arg-type]
        click.secho(f"✓ Masked file written to: {written_to}", fg="green")
        return

    # Stdout mode: mask the file's text in memory; --output is not consulted.
    # Undecodable bytes are replaced rather than raising.
    raw_text = file.read_text(encoding="utf-8", errors="replace")
    from datacloak.masker import mask_text

    masked_text = mask_text(raw_text, mode=mode)  # type: ignore[arg-type]
    click.echo(masked_text)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ---------------------------------------------------------------------------
|
|
172
|
+
# report command
|
|
173
|
+
# ---------------------------------------------------------------------------
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@cli.command("report")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "-o",
    "--output",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Save report as JSON to this path.",
)
@click.option(
    # Paired on/off switch: a bare ``--pretty`` flag that defaults to True
    # could never be turned off.  ``--pretty`` keeps working as before.
    "--pretty/--no-pretty",
    default=True,
    help="Pretty-print JSON output (default: True).",
)
def report_cmd(file: Path, output_path: Path | None, pretty: bool) -> None:
    """
    Generate a PII scan report for FILE.

    \b
    Examples:
      datacloak report data.txt
      datacloak report data.csv --output report.json
    """
    # The docstring above is the --help text shown by click.
    rep = generate_report_from_file(file)

    if output_path:
        # NOTE(review): save() is called without the pretty setting — confirm
        # whether saved reports should honour --no-pretty as well.
        rep.save(output_path)
        click.secho(f"✓ Report saved to: {output_path}", fg="green")
        return

    # The JSON text is only needed when printing to stdout.
    click.echo(rep.to_json(indent=2 if pretty else None))
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Entry point
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def main() -> None:
    """Entry point: invoke the ``cli`` command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DataCloak detector modules.
|
|
3
|
+
|
|
4
|
+
All built-in detectors are exported from this package.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .aadhaar import AadhaarDetector
|
|
8
|
+
from .base import BaseDetector, Detection
|
|
9
|
+
from .credit_card import CreditCardDetector
|
|
10
|
+
from .email import EmailDetector
|
|
11
|
+
from .ifsc import IFSCDetector
|
|
12
|
+
from .ip_address import IPAddressDetector
|
|
13
|
+
from .mobile import MobileDetector
|
|
14
|
+
from .pan import PANDetector
|
|
15
|
+
from .upi import UPIDetector
|
|
16
|
+
|
|
17
|
+
# Public names of the detectors package.  ``DEFAULT_DETECTORS`` is listed
# because it is part of the public surface (the package root re-exports it).
__all__ = [
    "BaseDetector",
    "Detection",
    "DEFAULT_DETECTORS",
    "AadhaarDetector",
    "PANDetector",
    "MobileDetector",
    "EmailDetector",
    "UPIDetector",
    "CreditCardDetector",
    "IFSCDetector",
    "IPAddressDetector",
]

#: Registry of all built-in detectors (ordered by detection priority)
DEFAULT_DETECTORS: list[BaseDetector] = [
    AadhaarDetector(),
    PANDetector(),
    MobileDetector(),
    EmailDetector(),
    UPIDetector(),
    CreditCardDetector(),
    IFSCDetector(),
    IPAddressDetector(),
]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Detector for Indian Aadhaar numbers (12-digit UIDs)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .base import BaseDetector, Detection
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AadhaarDetector(BaseDetector):
    """
    Detector for Indian Aadhaar numbers (12 digits, first digit 2-9).

    Recognised layouts:
        - ``2345 6789 0123``  (space-separated groups of 4)
        - ``2345-6789-0123``  (hyphen-separated)
        - ``234567890123``    (no separator)

    Candidates whose first digit is 0 or 1 are rejected (invalid Aadhaar prefix).
    """

    name = "aadhaar"
    description = "Indian Aadhaar UID (12-digit unique identifier)"

    # Three groups of four digits, each optionally followed by one whitespace
    # or hyphen separator; the leading digit is restricted to 2-9.
    _pattern: re.Pattern = re.compile(
        r"\b([2-9]\d{3}[\s\-]?\d{4}[\s\-]?\d{4})\b"
    )

    def _validate(self, value: str) -> bool:
        """Accept *value* only if it holds exactly 12 digits not starting with 0 or 1."""
        # Strip separators before counting digits.
        digits = re.sub(r"[\s\-]", "", value)
        return len(digits) == 12 and digits[0] not in ("0", "1")

    def detect(self, text: str) -> list[Detection]:
        """Return one :class:`Detection` per validated pattern match in *text*, in match order."""
        accepted = (
            hit for hit in self._pattern.finditer(text) if self._validate(hit.group())
        )
        return [
            Detection(
                detector_name=self.name,
                value=hit.group(),
                start=hit.start(),
                end=hit.end(),
                confidence=self._confidence(hit.group()),
            )
            for hit in accepted
        ]
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base detector interface for DataCloak PII detection framework.
|
|
3
|
+
All custom detectors must subclass BaseDetector.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
from typing import Iterator
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Detection:
    """
    Immutable record of a single PII detection result.

    Instances are hashable, so they can be stored in sets or used as
    dictionary keys.  ``metadata`` still takes part in equality comparison
    but is left out of the hash.
    """

    # Name of the detector that produced the match (e.g. "email").
    detector_name: str
    # The matched text.
    value: str
    # Start and end offsets of the match, as supplied by the detector.
    start: int
    end: int
    # Confidence score for the match; defaults to full confidence.
    confidence: float = 1.0
    # Optional extra data.  ``hash=False`` keeps this dict out of the
    # generated ``__hash__``; hashing a dict would raise TypeError.
    metadata: dict = field(default_factory=dict, hash=False)

    def __repr__(self) -> str:
        """Return a compact representation; ``metadata`` is omitted."""
        return (
            f"Detection(type={self.detector_name!r}, value={self.value!r}, "
            f"span=({self.start}, {self.end}), confidence={self.confidence:.2f})"
        )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class BaseDetector(ABC):
    """
    Common base class for PII detectors.

    A subclass becomes a pluggable detector either by overriding
    :meth:`detect` or by assigning a compiled regex to :attr:`_pattern`,
    in which case the default :meth:`detect` is used.

    Attributes:
        name: Unique identifier for this detector (e.g. ``"email"``).
        description: Human-readable description of what this detector finds.
    """

    name: str = ""
    description: str = ""

    # Compiled regex used by the default detect(); None means the subclass
    # has to provide its own detect().
    _pattern: re.Pattern | None = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def detect(self, text: str) -> list[Detection]:
        """
        Find every PII occurrence in *text*.

        The default implementation scans with :attr:`_pattern`, drops
        matches rejected by :meth:`_validate`, and scores the rest with
        :meth:`_confidence`.  Results are in match order.

        Raises:
            NotImplementedError: if :attr:`_pattern` is ``None`` and the
                subclass did not override this method.
        """
        pattern = self._pattern
        if pattern is None:
            raise NotImplementedError(
                f"{self.__class__.__name__} must implement detect() "
                "or set _pattern."
            )
        accepted = (
            hit for hit in pattern.finditer(text) if self._validate(hit.group())
        )
        return [
            Detection(
                detector_name=self.name,
                value=hit.group(),
                start=hit.start(),
                end=hit.end(),
                confidence=self._confidence(hit.group()),
            )
            for hit in accepted
        ]

    def detect_iter(self, text: str) -> Iterator[Detection]:
        """Iterator variant of :meth:`detect`; yields the same detections one at a time."""
        for detection in self.detect(text):
            yield detection

    # ------------------------------------------------------------------
    # Optional hooks
    # ------------------------------------------------------------------

    def _validate(self, value: str) -> bool:  # noqa: ARG002
        """Secondary validation hook; return ``False`` to discard a regex match.  Accepts everything by default."""
        return True

    def _confidence(self, value: str) -> float:  # noqa: ARG002
        """Confidence score in [0, 1] for the detected value; ``1.0`` by default."""
        return 1.0

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} name={self.name!r}>"
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Detector for credit/debit card numbers with Luhn algorithm validation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .base import BaseDetector, Detection
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _luhn_check(number: str) -> bool:
|
|
11
|
+
"""Return True if *number* (digits only) passes the Luhn algorithm."""
|
|
12
|
+
digits = [int(d) for d in reversed(number)]
|
|
13
|
+
total = 0
|
|
14
|
+
for i, digit in enumerate(digits):
|
|
15
|
+
if i % 2 == 1:
|
|
16
|
+
digit *= 2
|
|
17
|
+
if digit > 9:
|
|
18
|
+
digit -= 9
|
|
19
|
+
total += digit
|
|
20
|
+
return total % 10 == 0
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CreditCardDetector(BaseDetector):
    """
    Detector for credit and debit card numbers (13–19 digits).

    Every candidate must pass the Luhn algorithm, which eliminates most
    false positives.

    Recognised layouts:
        - ``4111111111111111``      (no separator)
        - ``4111 1111 1111 1111``   (space-separated)
        - ``4111-1111-1111-1111``   (hyphen-separated)

    Covers: Visa, Mastercard, Amex, RuPay, Discover, JCB, etc.
    """

    name = "credit_card"
    description = "Credit/debit card number"

    _pattern: re.Pattern = re.compile(
        r"\b"
        r"(\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{1,4}"  # 13-16 digit
        r"(?:[\s\-]?\d{1,3})?)"  # up to 19 digits
        r"\b"
    )

    def _validate(self, value: str) -> bool:
        """Accept *value* if, without separators, it is 13–19 digits long and passes Luhn."""
        bare = re.sub(r"[\s\-]", "", value)
        return 13 <= len(bare) <= 19 and bare.isdigit() and _luhn_check(bare)

    def _confidence(self, value: str) -> float:
        """Return ``0.95`` for a Luhn-valid number and ``0.5`` otherwise."""
        bare = re.sub(r"[\s\-]", "", value)
        return 0.95 if _luhn_check(bare) else 0.5
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Detector for email addresses."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .base import BaseDetector, Detection
|
|
8
|
+
|
|
9
|
+
# Minimum number of characters a top-level domain must have to be accepted.
|
|
10
|
+
_MIN_TLD_LENGTH = 2
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EmailDetector(BaseDetector):
    """
    Detects email addresses using a permissive but practical pattern.

    The local part is 1–64 characters, starts and ends with an alphanumeric
    character, and may contain ``. _ % + -`` in between.  The domain is made
    of dot-separated labels and ends in an alphabetic TLD.

    Examples:
        - ``john.doe@example.com``
        - ``user+tag@sub.domain.org``
        - ``first.last@company.co.in``
        - ``a@example.com`` (single-character local part)
    """

    name = "email"
    description = "Email address"

    # Permissive but practical email regex.  Group 1 is the local part and
    # group 2 the domain.
    _pattern: re.Pattern = re.compile(
        r"\b"
        r"([a-zA-Z0-9]"  # local: must start with alnum
        # local: optional remainder (body + closing alnum).  The whole
        # remainder is optional so that a 1-character local part matches.
        r"(?:[a-zA-Z0-9._%+\-]{0,62}[a-zA-Z0-9])?)"
        r"@"
        r"([a-zA-Z0-9]"  # domain
        r"(?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?"  # domain labels
        r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*"
        r"\.[a-zA-Z]{2,})"  # TLD
        r"\b",
        re.ASCII,
    )

    def _validate(self, value: str) -> bool:
        """
        Reject matches that the regex alone cannot rule out.

        Returns ``False`` when *value* has no ``@``, has an empty local part
        or domain, contains consecutive dots on either side, or has a TLD
        shorter than ``_MIN_TLD_LENGTH``.
        """
        if "@" not in value:
            return False
        # Split on the last "@" into local part and domain.
        local, _, domain = value.rpartition("@")
        if not local or not domain:
            return False
        if ".." in local or ".." in domain:
            return False
        tld = domain.rsplit(".", 1)[-1]
        return len(tld) >= _MIN_TLD_LENGTH
|