datacloak 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datacloak/__init__.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ DataCloak — Privacy Protection Library
3
+ =======================================
4
+
5
+ A production-ready Python library for detecting and masking Personally
6
+ Identifiable Information (PII) in text, logs, files, and application data.
7
+
8
+ Quick start::
9
+
10
+ from datacloak import mask, scan
11
+
12
+ text = \"\"\"
13
+ Aadhaar: 2345 6789 0123
14
+ PAN: ABCDE1234F
15
+ Email: alice@example.com
16
+ Phone: 9876543210
17
+ \"\"\"
18
+
19
+ print(mask(text)) # partial masking (default)
20
+ print(mask(text, mode="full"))
21
+ print(mask(text, mode="hash"))
22
+
23
+ findings = scan(text)
24
+ # {"aadhaar": ["2345 6789 0123"], "pan": ["ABCDE1234F"], ...}
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ from typing import TYPE_CHECKING
31
+
32
+ from .detectors import (
33
+ DEFAULT_DETECTORS,
34
+ AadhaarDetector,
35
+ BaseDetector,
36
+ CreditCardDetector,
37
+ Detection,
38
+ EmailDetector,
39
+ IFSCDetector,
40
+ IPAddressDetector,
41
+ MobileDetector,
42
+ PANDetector,
43
+ UPIDetector,
44
+ )
45
+ from .file_scanner import FileScanResult, scan_file
46
+ from .masker import MaskMode, mask_text
47
+ from .reporter import Report, generate_report_from_file, generate_report_from_text
48
+ from .scanner import ScanResult, scan_summary, scan_text
49
+
50
+ if TYPE_CHECKING:
51
+ from pathlib import Path
52
+
53
+ __version__ = "0.1.0"
54
+ __author__ = "DataCloak Contributors"
55
+ __license__ = "MIT"
56
+
57
+ # Set up a NullHandler so the library is silent unless the caller configures logging.
58
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Convenience aliases (the primary public surface)
63
+ # ---------------------------------------------------------------------------
64
+
65
+
66
def mask(
    text: str,
    mode: MaskMode = "partial",
    detectors: list[BaseDetector] | None = None,
) -> str:
    """
    Return a copy of *text* in which detected PII has been masked.

    This is a convenience alias that forwards its arguments unchanged to
    :func:`datacloak.masker.mask_text`.

    Parameters
    ----------
    text:
        The string to sanitise.
    mode:
        Masking strategy:

        * ``"partial"`` (default) — trailing characters stay visible.
        * ``"full"`` — values become descriptive tags such as
          ``[EMAIL_REDACTED]``.
        * ``"hash"`` — values are replaced by a SHA-256 digest.
    detectors:
        Optional list of :class:`~datacloak.detectors.BaseDetector`
        instances to use. ``None`` selects the built-in detectors.

    Returns
    -------
    str
        The masked text.

    Example::

        >>> from datacloak import mask
        >>> mask("Call me at 9876543210")
        'Call me at ******3210'
    """
    masked = mask_text(text, mode=mode, detectors=detectors)
    return masked
98
+
99
+
100
def scan(
    text: str,
    detectors: list[BaseDetector] | None = None,
) -> ScanResult:
    """
    Look for PII in *text* and report what was found; *text* is not changed.

    Convenience alias that forwards to :func:`datacloak.scanner.scan_text`.

    Parameters
    ----------
    text:
        The string to inspect.
    detectors:
        Optional custom detectors; passed through as-is.

    Returns
    -------
    ScanResult
        Mapping of PII type name → list of detected values.

    Example::

        >>> from datacloak import scan
        >>> scan("Email me at bob@example.com")
        {'email': ['bob@example.com']}
    """
    findings = scan_text(text, detectors=detectors)
    return findings
126
+
127
+
128
def report(
    text: str,
    source_label: str = "<inline text>",
    detectors: list[BaseDetector] | None = None,
) -> Report:
    """
    Build a structured :class:`~datacloak.reporter.Report` for *text*.

    Convenience alias that forwards to
    :func:`datacloak.reporter.generate_report_from_text`.

    Parameters
    ----------
    text:
        The string to analyse.
    source_label:
        Label passed through to the report generator to identify the input.
    detectors:
        Optional custom detectors; passed through as-is.

    Example::

        >>> from datacloak import report
        >>> r = report("john@example.com called 9876543210")
        >>> print(r.to_json())
    """
    built = generate_report_from_text(
        text,
        source_label=source_label,
        detectors=detectors,
    )
    return built
143
+
144
+
145
# Names re-exported by ``from datacloak import *`` (order preserved).
__all__ = [
    # -- package metadata --------------------------------------------------
    "__version__",
    # -- high-level convenience functions defined in this module -----------
    "mask",
    "scan",
    "report",
    # -- file helpers ------------------------------------------------------
    "scan_file",
    # -- lower-level functions re-exported from the submodules -------------
    "mask_text",
    "scan_text",
    "scan_summary",
    "generate_report_from_text",
    "generate_report_from_file",
    # -- detector framework and built-in detectors -------------------------
    "BaseDetector",
    "Detection",
    "DEFAULT_DETECTORS",
    "AadhaarDetector",
    "PANDetector",
    "MobileDetector",
    "EmailDetector",
    "UPIDetector",
    "CreditCardDetector",
    "IFSCDetector",
    "IPAddressDetector",
    # -- result / option types ---------------------------------------------
    "MaskMode",
    "ScanResult",
    "FileScanResult",
    "Report",
]
datacloak/cli.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ DataCloak Command-Line Interface.
3
+
4
+ Usage::
5
+
6
+ datacloak scan file.txt
7
+ datacloak mask file.txt
8
+ datacloak mask file.txt --mode full --output masked.txt
9
+ datacloak report file.txt
10
+ datacloak report file.txt --output report.json
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import click
20
+
21
+ import datacloak
22
+ from datacloak.file_scanner import mask_file, scan_file
23
+ from datacloak.masker import MaskMode
24
+ from datacloak.reporter import generate_report_from_file
25
+
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # CLI root
29
+ # ---------------------------------------------------------------------------
30
+
31
+
32
# Root command group: adds -h as a help alias, -V/--version, and -v/--verbose.
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(datacloak.__version__, "-V", "--version")
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Enable verbose logging output.",
)
def cli(verbose: bool) -> None:
    """
    \b
    DataCloak — Privacy Protection CLI
    ===================================
    Detect and mask PII in text files.
    """
    # NOTE: the docstring above is the help text click shows to users.
    if not verbose:
        return

    # Imported only when needed; verbose mode turns on DEBUG-level logging.
    import logging

    log_format = "%(levelname)s %(name)s: %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # scan command
61
+ # ---------------------------------------------------------------------------
62
+
63
+
64
@cli.command("scan")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "table"], case_sensitive=False),
    default="table",
    show_default=True,
    help="Output format.",
)
def scan_cmd(file: Path, output_format: str) -> None:
    """
    Scan FILE for PII and display findings.

    \b
    Examples:
      datacloak scan customer_data.txt
      datacloak scan --format json logs.txt
    """
    # NOTE: the docstring above is the help text click shows to users.
    result = scan_file(file)

    # A scan error is reported on stderr and ends the process with status 1.
    if result.error:
        click.secho(f"Error: {result.error}", fg="red", err=True)
        sys.exit(1)

    # JSON mode must always emit valid JSON, including when nothing was
    # found; previously the human-readable "No PII detected" banner was
    # printed here, which broke consumers piping the output to a JSON parser.
    if output_format == "json":
        payload = result.by_type if result.findings else {}
        click.echo(json.dumps(payload, indent=2, ensure_ascii=False))
        return

    if not result.findings:
        click.secho("✓ No PII detected.", fg="green")
        return

    # Table output: header, one row per finding, then summary and risk level.
    rule = "─" * 66
    click.secho(f"\n📄 File: {file}", bold=True)
    click.secho(f"{'PII Type':<18} {'Value':<40} {'Line':>6}", fg="cyan")
    click.secho(rule, fg="cyan")
    for finding in result.findings:
        # A falsy line number (None or 0) is rendered as a dash.
        line_str = str(finding.line_number) if finding.line_number else "—"
        click.echo(f"{finding.pii_type:<18} {finding.value:<40} {line_str:>6}")
    click.secho(rule, fg="cyan")

    click.secho(f"\nSummary: {result.summary}", fg="yellow")

    # Local import, as in the original; the risk level is derived from the
    # number of findings by the reporter module's helper.
    from datacloak.reporter import _risk_level

    risk_colours = {"NONE": "green", "LOW": "yellow", "MEDIUM": "magenta", "HIGH": "red"}
    risk = _risk_level(len(result.findings))
    click.secho(f"Risk level: {risk}", fg=risk_colours.get(risk, "white"), bold=True)
113
+
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # mask command
117
+ # ---------------------------------------------------------------------------
118
+
119
+
120
@cli.command("mask")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "-m",
    "--mode",
    type=click.Choice(["partial", "full", "hash"], case_sensitive=False),
    default="partial",
    show_default=True,
    help="Masking mode.",
)
@click.option(
    "-o",
    "--output",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Output file path (default: <file>.masked<ext>).",
)
@click.option(
    "--stdout",
    is_flag=True,
    default=False,
    help="Print masked output to stdout instead of writing a file.",
)
def mask_cmd(file: Path, mode: str, output_path: Path | None, stdout: bool) -> None:
    """
    Mask PII in FILE.

    \b
    Masking modes:
      partial   Keep last characters visible (default)
      full      Replace with descriptive tags, e.g. [EMAIL_REDACTED]
      hash      Replace with SHA-256 digest

    \b
    Examples:
      datacloak mask logs.txt
      datacloak mask logs.txt --mode full --output clean_logs.txt
      datacloak mask logs.txt --stdout | less
    """
    # NOTE: the docstring above is the help text click shows to users.
    if not stdout:
        # File mode: mask_file writes the masked copy and returns its path.
        written_to = mask_file(file, output_path=output_path, mode=mode)  # type: ignore[arg-type]
        click.secho(f"✓ Masked file written to: {written_to}", fg="green")
        return

    # Stdout mode: read the file ourselves (undecodable bytes are replaced)
    # and echo the masked text; --output is not consulted on this path.
    raw_text = file.read_text(encoding="utf-8", errors="replace")
    from datacloak.masker import mask_text

    masked_text = mask_text(raw_text, mode=mode)  # type: ignore[arg-type]
    click.echo(masked_text)
169
+
170
+
171
+ # ---------------------------------------------------------------------------
172
+ # report command
173
+ # ---------------------------------------------------------------------------
174
+
175
+
176
@cli.command("report")
@click.argument("file", type=click.Path(exists=True, readable=True, path_type=Path))
@click.option(
    "-o",
    "--output",
    "output_path",
    type=click.Path(path_type=Path),
    default=None,
    help="Save report as JSON to this path.",
)
@click.option(
    # Declared as an on/off pair: with the previous ``is_flag=True,
    # default=True`` declaration the flag could never be turned off, so the
    # compact-output branch below was unreachable. ``--pretty`` still works.
    "--pretty/--no-pretty",
    default=True,
    help="Pretty-print JSON output (default: True).",
)
def report_cmd(file: Path, output_path: Path | None, pretty: bool) -> None:
    """
    Generate a PII scan report for FILE.

    \b
    Examples:
      datacloak report data.txt
      datacloak report data.csv --output report.json
    """
    # NOTE: the docstring above is the help text click shows to users.
    rep = generate_report_from_file(file)

    if output_path:
        # Saving is delegated to the report object; ``pretty`` only affects
        # what is printed to stdout below.
        rep.save(output_path)
        click.secho(f"✓ Report saved to: {output_path}", fg="green")
        return

    # No output path: print the JSON, indented unless --no-pretty was given.
    click.echo(rep.to_json(indent=2 if pretty else None))
210
+
211
+
212
+ # ---------------------------------------------------------------------------
213
+ # Entry point
214
+ # ---------------------------------------------------------------------------
215
+
216
+
217
def main() -> None:
    """Entry point: invoke the ``cli`` command group."""
    cli()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,40 @@
1
+ """
2
+ DataCloak detector modules.
3
+
4
+ All built-in detectors are exported from this package.
5
+ """
6
+
7
+ from .aadhaar import AadhaarDetector
8
+ from .base import BaseDetector, Detection
9
+ from .credit_card import CreditCardDetector
10
+ from .email import EmailDetector
11
+ from .ifsc import IFSCDetector
12
+ from .ip_address import IPAddressDetector
13
+ from .mobile import MobileDetector
14
+ from .pan import PANDetector
15
+ from .upi import UPIDetector
16
+
17
# Public names of the detectors package: the framework types first, then
# the built-in detector classes.
__all__ = [
    "BaseDetector",
    "Detection",
    "AadhaarDetector",
    "PANDetector",
    "MobileDetector",
    "EmailDetector",
    "UPIDetector",
    "CreditCardDetector",
    "IFSCDetector",
    "IPAddressDetector",
]

#: Registry of all built-in detectors (ordered by detection priority).
#: One shared instance of each detector is created at import time.
DEFAULT_DETECTORS: list[BaseDetector] = [
    AadhaarDetector(),
    PANDetector(),
    MobileDetector(),
    EmailDetector(),
    UPIDetector(),
    CreditCardDetector(),
    IFSCDetector(),
    IPAddressDetector(),
]
@@ -0,0 +1,53 @@
1
+ """Detector for Indian Aadhaar numbers (12-digit UIDs)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .base import BaseDetector, Detection
8
+
9
+
10
class AadhaarDetector(BaseDetector):
    """
    Detector for Indian Aadhaar numbers (12 digits).

    Accepted layouts:
      - ``1234 5678 9012``  (groups of four separated by whitespace)
      - ``1234-5678-9012``  (groups of four separated by hyphens)
      - ``123456789012``    (no separator)

    A candidate is rejected when its first digit is 0 or 1, which is not a
    valid Aadhaar prefix.
    """

    name = "aadhaar"
    description = "Indian Aadhaar UID (12-digit unique identifier)"

    # Three groups of four digits, each optionally separated by one
    # whitespace character or hyphen; the leading digit must be 2-9.
    _pattern: re.Pattern = re.compile(
        r"\b([2-9]\d{3}[\s\-]?\d{4}[\s\-]?\d{4})\b"
    )

    def _validate(self, value: str) -> bool:
        """Return True when *value* has exactly 12 digits and a 2-9 prefix."""
        stripped = re.sub(r"[\s\-]", "", value)
        # Length is checked first so the index below is never out of range.
        return len(stripped) == 12 and stripped[0] not in ("0", "1")

    def detect(self, text: str) -> list[Detection]:
        """Return a :class:`Detection` for every valid Aadhaar match in *text*."""
        accepted = (
            hit
            for hit in self._pattern.finditer(text)
            if self._validate(hit.group())
        )
        return [
            Detection(
                detector_name=self.name,
                value=hit.group(),
                start=hit.start(),
                end=hit.end(),
                confidence=self._confidence(hit.group()),
            )
            for hit in accepted
        ]
@@ -0,0 +1,97 @@
1
+ """
2
+ Base detector interface for DataCloak PII detection framework.
3
+ All custom detectors must subclass BaseDetector.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import re
9
+ from abc import ABC, abstractmethod
10
+ from dataclasses import dataclass, field
11
+ from typing import Iterator
12
+
13
+
14
@dataclass(frozen=True)
class Detection:
    """
    Represents a single PII detection result.

    Instances are immutable and hashable, so they can be de-duplicated in a
    ``set`` or used as dictionary keys.
    """

    # Name of the detector that produced this result (e.g. "email").
    detector_name: str
    # The matched text exactly as it appeared in the input.
    value: str
    # Start and end offsets of the match within the scanned text.
    start: int
    end: int
    # Confidence score reported by the detector.
    confidence: float = 1.0
    # Free-form extra data. Excluded from the generated ``__hash__`` because
    # a dict is unhashable: without ``hash=False`` calling ``hash()`` on any
    # Detection raised ``TypeError``. It still takes part in equality.
    metadata: dict = field(default_factory=dict, hash=False)

    def __repr__(self) -> str:
        """Compact representation; ``metadata`` is intentionally omitted."""
        return (
            f"Detection(type={self.detector_name!r}, value={self.value!r}, "
            f"span=({self.start}, {self.end}), confidence={self.confidence:.2f})"
        )
30
+
31
+
32
class BaseDetector(ABC):
    """
    Common base class for PII detectors.

    A subclass either sets :attr:`_pattern` to a compiled regex and inherits
    :meth:`detect`, or overrides :meth:`detect` itself.

    Attributes:
        name: Unique identifier for this detector (e.g. ``"email"``).
        description: Human-readable description of what this detector finds.
    """

    name: str = ""
    description: str = ""

    # Optional compiled regex consumed by the default detect() implementation.
    _pattern: re.Pattern | None = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def detect(self, text: str) -> list[Detection]:
        """
        Find every PII occurrence in *text*.

        The default implementation walks the matches of :attr:`_pattern` in
        the order the regex yields them, keeps those accepted by
        :meth:`_validate`, and scores each with :meth:`_confidence`.

        Raises:
            NotImplementedError: if :attr:`_pattern` is ``None`` and the
                subclass did not override this method.
        """
        pattern = self._pattern
        if pattern is None:
            raise NotImplementedError(
                f"{self.__class__.__name__} must implement detect() "
                "or set _pattern."
            )

        found: list[Detection] = []
        for hit in pattern.finditer(text):
            candidate = hit.group()
            if not self._validate(candidate):
                continue
            found.append(
                Detection(
                    detector_name=self.name,
                    value=candidate,
                    start=hit.start(),
                    end=hit.end(),
                    confidence=self._confidence(candidate),
                )
            )
        return found

    def detect_iter(self, text: str) -> Iterator[Detection]:
        """
        Generator variant of :meth:`detect`.

        :meth:`detect` is called when iteration starts and its results are
        then yielded one at a time.
        """
        for detection in self.detect(text):
            yield detection

    # ------------------------------------------------------------------
    # Optional hooks
    # ------------------------------------------------------------------

    def _validate(self, value: str) -> bool:  # noqa: ARG002
        """Secondary check on a regex match; return ``False`` to reject it."""
        return True

    def _confidence(self, value: str) -> float:  # noqa: ARG002
        """Confidence score in [0, 1] for *value*; the base class returns 1.0."""
        return 1.0

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"<{cls_name} name={self.name!r}>"
@@ -0,0 +1,60 @@
1
+ """Detector for credit/debit card numbers with Luhn algorithm validation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .base import BaseDetector, Detection
8
+
9
+
10
+ def _luhn_check(number: str) -> bool:
11
+ """Return True if *number* (digits only) passes the Luhn algorithm."""
12
+ digits = [int(d) for d in reversed(number)]
13
+ total = 0
14
+ for i, digit in enumerate(digits):
15
+ if i % 2 == 1:
16
+ digit *= 2
17
+ if digit > 9:
18
+ digit -= 9
19
+ total += digit
20
+ return total % 10 == 0
21
+
22
+
23
class CreditCardDetector(BaseDetector):
    """
    Detector for credit and debit card numbers (13–19 digits).

    Candidates found by the regex are kept only if they pass the Luhn
    checksum, which removes most random digit runs.

    Accepted layouts:
      - ``4111111111111111``     (no separator)
      - ``4111 1111 1111 1111``  (whitespace-separated)
      - ``4111-1111-1111-1111``  (hyphen-separated)

    Covers: Visa, Mastercard, Amex, RuPay, Discover, JCB, etc.
    """

    name = "credit_card"
    description = "Credit/debit card number"

    _pattern: re.Pattern = re.compile(
        r"\b"
        r"(\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{1,4}"   # 13-16 digit
        r"(?:[\s\-]?\d{1,3})?)"                            # up to 19 digits
        r"\b"
    )

    def _validate(self, value: str) -> bool:
        """Accept *value* when it has 13-19 digits and passes the Luhn check."""
        stripped = re.sub(r"[\s\-]", "", value)
        return (
            13 <= len(stripped) <= 19
            and stripped.isdigit()
            and _luhn_check(stripped)
        )

    def _confidence(self, value: str) -> float:
        """Return 0.95 for a Luhn-valid number, otherwise 0.5."""
        stripped = re.sub(r"[\s\-]", "", value)
        return 0.95 if _luhn_check(stripped) else 0.5
@@ -0,0 +1,50 @@
1
+ """Detector for email addresses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from .base import BaseDetector, Detection
8
+
9
+ # Minimum number of characters the top-level domain must have for a match to be accepted.
10
+ _MIN_TLD_LENGTH = 2
11
+
12
+
13
class EmailDetector(BaseDetector):
    """
    Detects email addresses using a permissive but practical pattern.

    Examples:
      - ``john.doe@example.com``
      - ``user+tag@sub.domain.org``
      - ``first.last@company.co.in``
      - ``a@example.com`` (single-character local part)
    """

    name = "email"
    description = "Email address"

    # Local part: 1-64 characters, starting and ending with an alphanumeric.
    # The body and the trailing alphanumeric form ONE optional group so that
    # a single-character local part matches; previously the trailing
    # alphanumeric was mandatory, which silently required two characters and
    # left addresses such as ``a@example.com`` undetected.
    _pattern: re.Pattern = re.compile(
        r"\b"
        r"([a-zA-Z0-9]"                          # local: must start with alnum
        r"(?:[a-zA-Z0-9._%+\-]{0,62}"            # local: optional body ...
        r"[a-zA-Z0-9])?)"                        # ... which must end with alnum
        r"@"
        r"([a-zA-Z0-9]"                          # domain
        r"(?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?"  # domain labels
        r"(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?)*"
        r"\.[a-zA-Z]{2,})"                       # TLD
        r"\b",
        re.ASCII,
    )

    def _validate(self, value: str) -> bool:
        """
        Secondary check on a regex match.

        Rejects values without an ``@``, with an empty local part or domain,
        with consecutive dots on either side, or whose top-level domain is
        shorter than ``_MIN_TLD_LENGTH``.
        """
        if "@" not in value:
            return False
        # Split on the last "@": everything before it is the local part.
        local, _, domain = value.rpartition("@")
        if not local or not domain:
            return False
        if ".." in local or ".." in domain:
            return False
        tld = domain.rsplit(".", 1)[-1]
        return len(tld) >= _MIN_TLD_LENGTH