piifill-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Developer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,90 @@
1
+ Metadata-Version: 2.4
2
+ Name: piifill-cli
3
+ Version: 0.1.0
4
+ Summary: PIIFILL: Professional Local-Logic PII Sanitization CLI
5
+ Author-email: Bhavin Sachaniya <bhavinsachaniya200@gmail.com>
6
+ Requires-Python: >=3.9
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: typer>=0.12.0
10
+ Requires-Dist: rich>=13.0.0
11
+ Requires-Dist: loguru>=0.7.0
12
+ Requires-Dist: pydantic>=2.0.0
13
+ Requires-Dist: pydantic-settings>=2.0.0
14
+ Requires-Dist: easyocr
15
+ Requires-Dist: opencv-python-headless
16
+ Requires-Dist: numpy<2
17
+ Requires-Dist: pillow
18
+ Requires-Dist: pandas
19
+ Requires-Dist: openpyxl
20
+ Requires-Dist: pymupdf
21
+ Requires-Dist: python-docx
22
+ Dynamic: license-file
23
+
24
+ # PIIFILL CLI
25
+
26
+ > **Automated Enterprise-Grade Local PII Sanitization CLI.**
27
+
28
+ PIIFILL is a high-performance terminal utility for detecting and masking sensitive data locally and securely.
29
+
30
+ ---
31
+
32
+ ## ⚡ Quick Start
33
+
34
+ ### 1. Installation
35
+ Install the CLI in editable mode for local development:
36
+ ```bash
37
+ cd CLI/piifill
38
+ pip install -e .
39
+ ```
40
+
41
+ ### 2. Basic Usage
42
+ Scan and mask PII in a file:
43
+ ```bash
44
+ piifill mask path/to/file.json
45
+ ```
46
+
47
+ ---
48
+
49
+ ## 🛠 Command Reference
50
+
51
+ ### `piifill mask`
52
+ Securely mask sensitive data in assets.
53
+ - `piifill mask <path>`: Direct masking of a file.
54
+ - `piifill mask -o <output_path>`: Specify custom output path.
55
+ - `--mode`: Sanitization strategy (`mask`, `redact`, `tokenize`).
56
+ - `--local`: Rapid local directory protection (scans current directory by default).
57
+
58
+ ### `piifill scan`
59
+ Detect PII in a file or directory without modifying it.
60
+ - `piifill scan <path>`: Scan an asset.
61
+ - `--recursive`: Scan entire directories.
62
+
63
+ ### `piifill config`
64
+ Manage PIIFILL configuration.
65
+
66
+ ### `piifill version`
67
+ Show PIIFILL version information.
68
+
69
+ ---
70
+
71
+ ## 📊 Security Analysis
72
+
73
+ Every sanitization run performs high-fidelity risk analysis:
74
+ - **Security Grading**: A-F scale based on PII density.
75
+ - **Risk Scoring**: 0-100 technical protection score.
76
+ - **Entity Breakdown**: Categorical distribution (emails, SSNs, credit cards, etc.).
77
+
78
+ ---
79
+
80
+ ## 🧪 Verification
81
+
82
+ Run the CLI against sample data to verify detection:
83
+ ```bash
84
+ piifill scan ./test_data/
85
+ ```
86
+
87
+ ---
88
+
89
+ ## 📜 License
90
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,67 @@
1
+ # PIIFILL CLI
2
+
3
+ > **Automated Enterprise-Grade Local PII Sanitization CLI.**
4
+
5
+ PIIFILL is a high-performance terminal utility for detecting and masking sensitive data locally and securely.
6
+
7
+ ---
8
+
9
+ ## ⚡ Quick Start
10
+
11
+ ### 1. Installation
12
+ Install the CLI in editable mode for local development:
13
+ ```bash
14
+ cd CLI/piifill
15
+ pip install -e .
16
+ ```
17
+
18
+ ### 2. Basic Usage
19
+ Scan and mask PII in a file:
20
+ ```bash
21
+ piifill mask path/to/file.json
22
+ ```
23
+
24
+ ---
25
+
26
+ ## 🛠 Command Reference
27
+
28
+ ### `piifill mask`
29
+ Securely mask sensitive data in assets.
30
+ - `piifill mask <path>`: Direct masking of a file.
31
+ - `piifill mask -o <output_path>`: Specify custom output path.
32
+ - `--mode`: Sanitization strategy (`mask`, `redact`, `tokenize`).
33
+ - `--local`: Rapid local directory protection (scans current directory by default).
34
+
35
+ ### `piifill scan`
36
+ Detect PII in a file or directory without modifying it.
37
+ - `piifill scan <path>`: Scan an asset.
38
+ - `--recursive`: Scan entire directories.
39
+
40
+ ### `piifill config`
41
+ Manage PIIFILL configuration.
42
+
43
+ ### `piifill version`
44
+ Show PIIFILL version information.
45
+
46
+ ---
47
+
48
+ ## 📊 Security Analysis
49
+
50
+ Every sanitization run performs high-fidelity risk analysis:
51
+ - **Security Grading**: A-F scale based on PII density.
52
+ - **Risk Scoring**: 0-100 technical protection score.
53
+ - **Entity Breakdown**: Categorical distribution (emails, SSNs, credit cards, etc.).
54
+
55
+ ---
56
+
57
+ ## 🧪 Verification
58
+
59
+ Run the CLI against sample data to verify detection:
60
+ ```bash
61
+ piifill scan ./test_data/
62
+ ```
63
+
64
+ ---
65
+
66
+ ## 📜 License
67
+ See [LICENSE](LICENSE) for details.
@@ -0,0 +1,2 @@
1
"""PIIFILL CLI tool."""
# Package version string; matches the `Version: 0.1.0` field in the package metadata.
__version__ = "0.1.0"
@@ -0,0 +1,244 @@
1
+ import typer
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Optional, List
6
+ from concurrent.futures import ProcessPoolExecutor, as_completed
7
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
8
+ from rich.table import Table
9
+
10
+ # --- Internal Imports ---
11
+ from .shared.config import settings
12
+ from .shared.ui import (
13
+ setup_logger, print_banner, t, print_step, print_status,
14
+ print_error, print_success, format_detection_table, console,
15
+ translator
16
+ )
17
+ from .engine.manager import filtration_manager, FiltrationManager
18
+ from .logic.factory import FileProcessor
19
+
20
# --- Initialization ---
# Build the module-wide logger from the log file and log level held in settings.
logger = setup_logger(settings.log_file, settings.log_level)

# --- CLI Application ---
# Root Typer application. `no_args_is_help` is disabled so that a bare
# invocation reaches the `main` callback instead of printing the help screen.
app = typer.Typer(
    help="PIIFILL: Professional Enterprise-Grade PII Sanitization CLI.",
    no_args_is_help=False,
    add_completion=False,
)
29
+
30
+ # --- Helpers ---
31
def version_callback(value: bool):
    """Eager callback for ``--version``: show the banner and version, then exit.

    Does nothing when ``value`` is falsy (the flag was not supplied).

    Raises:
        typer.Exit: after the version information has been printed.
    """
    if not value:
        return
    print_banner()
    for line in (
        "[bold cyan]PIIFILL CLI Version:[/bold cyan] 0.1.0",
        "[dim]Enterprise-Grade Local PII Sanitization CLI[/dim]",
    ):
        console.print(line)
    raise typer.Exit()
37
+
38
def _worker_process_file(fpath: Path, input_path: Path, output_dir: Path, final_name: Optional[str], mode: str):
    """Worker function for multi-core processing.

    Sanitizes a single file inside a worker process using its own
    ``FiltrationManager`` instance.

    Args:
        fpath: File to sanitize.
        input_path: Path originally given by the user. When it is a directory,
            the file's location relative to it is mirrored under ``output_dir``.
        output_dir: Directory that receives the sanitized output.
        final_name: Output file name, used only when ``input_path`` is a file.
        mode: Sanitization strategy forwarded to the parser.

    Returns:
        ``(filename, True, session_report)`` on success, or
        ``(filename, False, error_message)`` on any failure.
    """
    filename = fpath.name
    try:
        # Build the engine inside the try block so that a construction failure
        # is reported as a per-file failure instead of escaping through
        # ``future.result()`` and aborting the whole batch.
        worker_engine = FiltrationManager()

        if input_path.is_dir():
            relative_path = fpath.relative_to(input_path)
            final_output_path = output_dir / relative_path
        else:
            final_output_path = output_dir / final_name

        final_output_path.parent.mkdir(parents=True, exist_ok=True)
        parser_instance = FileProcessor.get_parser(fpath.suffix)
        parser_instance.parse_file(fpath, final_output_path, worker_engine, mode)
        return filename, True, worker_engine.get_session_report()
    except Exception as e:
        return filename, False, str(e)
55
+
56
+ # --- Commands ---
57
@app.command(name="mask", help="Securely protect assets (Files/Directories)")
def mask_command(
    path: Optional[Path] = typer.Argument(None, help="File or directory path to secure"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Custom output path"),
    mode: str = typer.Option("mask", "--mode", "-m", help="Sanitization strategy (mask/redact/tokenize)"),
    local: bool = typer.Option(False, "--local", "-l", help="Rapid local directory protection (scans current dir)")
):
    """Securely mask sensitive data in assets.

    Args:
        path: File or directory to sanitize. May be omitted only together with
            ``--local``, in which case the current directory is used.
        output: Optional output location. For a single input file, a value
            with a suffix is treated as the output file, otherwise as the
            output directory. For a directory input it is the output directory.
        mode: Sanitization strategy forwarded to the parsers.
        local: Fall back to the current directory when no path is given.

    Raises:
        typer.Exit: when the path is missing or does not exist (code 1), or
            when no files were found to process.
    """
    # The argument is optional so that `--local` can supply the default path;
    # without `--local` a missing path is still an error.
    if path is None:
        if not local:
            print_error("Path is required.")
            raise typer.Exit(code=1)
        path = Path(".")

    input_path = path.resolve()
    if not input_path.exists():
        print_error(t("error_path_exists", path=str(input_path)))
        raise typer.Exit(code=1)

    if input_path.is_file():
        files_to_process = [input_path]
        if output is not None:
            if output.suffix:
                # An output with a suffix names the output file itself.
                final_output_path_base = output.resolve()
                output_dir = final_output_path_base.parent
                final_name = final_output_path_base.name
            else:
                output_dir = output.resolve()
                final_name = input_path.name
        else:
            output_dir = input_path.parent / "out"
            final_name = input_path.name
    else:
        output_dir = output.resolve() if output is not None else input_path / "out"
        final_name = None
        # When the output directory lives inside the input tree (the default
        # `<input>/out`), skip its contents so that a re-run does not sanitize
        # the previous run's output into `out/out/...`.
        skip_output_tree = output_dir != input_path and output_dir.is_relative_to(input_path)
        files_to_process = [
            p for p in input_path.rglob("*")
            if p.is_file()
            and not p.name.startswith('.')
            and not (skip_output_tree and p.is_relative_to(output_dir))
        ]

    if not files_to_process:
        print_error(t("error_no_files", path=str(input_path)))
        raise typer.Exit()

    output_dir.mkdir(parents=True, exist_ok=True)

    results_table = Table(title=t("results_table_title"), box=None, header_style="bold cyan")
    results_table.add_column(t("results_asset"), style="cyan")
    results_table.add_column(t("results_status"), justify="center")
    results_table.add_column(t("results_grade"), justify="center")

    if len(files_to_process) > 1:
        # Several files: fan out across processes, one engine per worker call.
        print_step(t("step_deploy_parallel", count=f"[bold]{os.cpu_count()}[/bold]", file_count=f"[bold]{len(files_to_process)}[/bold]"))
        with Progress(SpinnerColumn(), TextColumn(f"[cyan]{t('status_parallel')}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console, transient=True) as progress:
            task = progress.add_task(t("status_securing"), total=len(files_to_process))
            with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
                futures = [executor.submit(_worker_process_file, f, input_path, output_dir, final_name, mode) for f in files_to_process]
                for future in as_completed(futures):
                    fname, success, result = future.result()
                    if success:
                        results_table.add_row(fname, f"[bold green]{t('status_secured')}[/bold green]", f"[bold]{result['risk_grade']}[/bold]")
                    else:
                        results_table.add_row(fname, f"[red]{t('status_failed')}[/red]", "N/A")
                        logger.error(f"Protection Error on {fname}: {result}")
                    progress.advance(task)
    else:
        # Single file: process in-process with the shared engine and a live
        # per-file progress bar driven by the parser's progress callback.
        print_step(t("step_deploy_protection", count=f"[bold]{len(files_to_process)}[/bold]"))
        for fpath in files_to_process:
            filename = fpath.name
            try:
                if input_path.is_dir():
                    final_output_path = output_dir / fpath.relative_to(input_path)
                else:
                    final_output_path = output_dir / final_name

                final_output_path.parent.mkdir(parents=True, exist_ok=True)
                filtration_manager.reset_session()
                parser = FileProcessor.get_parser(fpath.suffix)

                with Progress(SpinnerColumn(), TextColumn(f"[cyan]{t('securing_file', name=filename)}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console, transient=True) as progress:
                    task_id = progress.add_task(t("sanitizing"), total=1.0)
                    parser.parse_file(fpath, final_output_path, filtration_manager, mode, progress_callback=lambda p: progress.update(task_id, completed=p))

                report = filtration_manager.get_session_report()
                results_table.add_row(filename, f"[bold green]{t('status_secured')}[/bold green]", f"[bold]{report['risk_grade']}[/bold]")
            except Exception as e:
                results_table.add_row(filename, f"[red]{t('status_failed')}[/red]", "N/A")
                logger.error(f"Protection Error on {filename}: {e}")

    console.print("")
    console.print(results_table)
    print_status(t("status_protection_complete", path=f"[bold cyan]{output_dir}[/bold cyan]"), "success")
147
+
148
@app.command(name="scan", help="Scan assets for PII without modification")
def scan_command(
    path: Path = typer.Argument(..., help="File or directory path to scan for PII"),
    recursive: bool = typer.Option(False, "--recursive", "-r", help="Scan directory recursively")
):
    """Detect PII in a file or directory

    A single file is read in 1 MiB chunks and up to 50 detections are shown in
    a table. A directory is only scanned when ``--recursive`` is given; in that
    case only the total number of detections is reported.

    Raises:
        typer.Exit: when the path does not exist (code 1), when a directory is
            given without ``--recursive``, or when the directory has no files.
    """
    # Local import keeps this command self-contained; used for chunk-safe decoding.
    import codecs

    if not path.exists():
        print_error(t("error_path_exists", path=str(path)))
        raise typer.Exit(code=1)

    if path.is_file():
        logger.info(f"Scanning file: {path}")
        print_step(f"Analyzing asset: [bold]{path.name}[/bold]")
        file_size = path.stat().st_size
        pii_entities = []
        filtration_manager.reset_session()
        try:
            with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(bar_width=40, pulse_style="magenta"), TaskProgressColumn(), console=console, transient=True) as progress:
                task = progress.add_task(f"[magenta]{t('status_processing')}", total=file_size)
                # An incremental decoder carries a partial multi-byte sequence
                # over to the next chunk, so characters that straddle a chunk
                # boundary are not turned into replacement characters.
                decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
                with open(path, "rb") as f:
                    while True:
                        chunk = f.read(1024 * 1024)
                        # An empty chunk marks EOF; final=True flushes leftovers.
                        text = decoder.decode(chunk, final=not chunk)
                        if text:
                            pii_entities.extend(filtration_manager.detect(text))
                        if not chunk:
                            break
                        progress.advance(task, len(chunk))
            if not pii_entities:
                print_status(t("status_no_pii"), "success")
            else:
                print_status(t("status_found_pii", count=len(pii_entities)), "warning")
                # Cap the table at 50 rows; the remainder is summarized below.
                display_entities = pii_entities[:50]
                table = format_detection_table([e.model_dump() for e in display_entities])
                console.print(table)
                if len(pii_entities) > 50:
                    console.print(f"[dim]{t('status_hidden_items', count=len(pii_entities)-50)}[/dim]")
            print_status(t("status_scan_complete", name=f"[bold]{path.name}[/bold]"), "success")
        except Exception as e:
            logger.error(f"Error scanning {path}: {e}")
            print_error(t("error_scan_failed", name=path.name))
    elif path.is_dir():
        if not recursive:
            # `print_warning` is not imported in this module; emit the warning
            # through print_status, which is already used with a "warning" level.
            print_status(t("warn_dir_recursive"), "warning")
            raise typer.Exit()
        files = [p for p in path.rglob("*") if p.is_file() and not p.name.startswith('.')]
        if not files:
            print_error(t("error_no_files", path=str(path)))
            raise typer.Exit()
        total_pii = 0
        print_step(t("step_deploy_batch", count=f"[bold]{len(files)}[/bold]"))
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console) as progress:
            task = progress.add_task(f"[cyan]{t('status_processing')}", total=len(files))
            for fpath in files:
                try:
                    with open(fpath, "rb") as f:
                        text = f.read().decode('utf-8', errors='replace')
                    total_pii += len(filtration_manager.detect(text))
                except Exception as e:
                    # Best effort: keep scanning the remaining files, but leave
                    # a trace instead of silently dropping the failure.
                    logger.error(f"Error scanning {fpath}: {e}")
                progress.advance(task)
        console.print("")
        print_status(t("status_batch_complete", pii_count=f"[bold]{total_pii}[/bold]", file_count=len(files)), "success")
210
+
211
@app.command(name="config", help="Manage PIIFILL configuration")
def config_command():
    """List current configurations."""
    # Assemble the whole report first, then print it line by line.
    formats = ', '.join(settings.supported_extensions)
    report_lines = (
        "\n[cyan]--- PIIFILL CONFIGURATION ---[/cyan]",
        f"Log Level: [bold]{settings.log_level}[/bold]",
        f"Log File: [bold]{settings.log_file}[/bold]",
        f"Default Mask: [bold]{settings.default_mask_style}[/bold]",
        f"Supported Formats: [bold]{formats}[/bold]",
        "[cyan]-----------------------------[/cyan]\n",
    )
    for line in report_lines:
        console.print(line)
220
+
221
@app.command(name="version", help="Show version information")
def version_command():
    """Show the application version."""
    # Same two lines that the --version flag prints after the banner.
    for line in (
        "[bold cyan]PIIFILL CLI Version:[/bold cyan] 0.1.0",
        "[dim]Enterprise-Grade Local PII Sanitization CLI[/dim]",
    ):
        console.print(line)
226
+
227
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    version: bool = typer.Option(None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version info"),
    lang: str = typer.Option(settings.language, "--lang", "-L", help="Set language (en, hi, es, fr)"),
):
    """Automated Enterprise-Grade PII Sanitization Logic."""
    # Runs before every subcommand: select the UI language, then show the banner.
    translator.set_language(lang)
    print_banner()

    # A subcommand was requested; it takes over from here.
    if ctx.invoked_subcommand is not None:
        return

    # Bare invocation: greet the user and point at --help.
    console.print("\n[bold cyan]Welcome to PIIFILL![/bold cyan]")
    console.print("Use [bold]--help[/bold] to see available commands.")
239
+
240
def entrypoint():
    """Run the Typer application; a plain callable wrapper around ``app``."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
File without changes
@@ -0,0 +1,158 @@
1
+ import re
2
+ from typing import Dict, List, Pattern, Optional, Set
3
+ from pydantic import BaseModel
4
+
5
class PIIEntity(BaseModel):
    """A single PII match found in a piece of text."""

    # The matched text exactly as it appears in the input.
    value: str
    # Name of the pattern that produced the match (e.g. "EMAIL").
    entity_type: str
    # Start offset of the match within the scanned text.
    start: int
    # End offset of the match within the scanned text (exclusive, as returned by match.end()).
    end: int
10
+
11
class PatternRegistry:
    """
    Store of compiled PII regexes, grouped by category (region).

    Patterns are kept per category and can also be retrieved as one flattened
    mapping, which is cached until the next registration.
    """
    def __init__(self):
        # Pre-seeded categories; `register` may add further ones on demand.
        self._patterns: Dict[str, Dict[str, Pattern]] = {
            name: {} for name in ("GLOBAL", "AMERICAS", "EMEA", "APAC")
        }
        # Flattened view of all categories; None means "needs rebuilding".
        self._compiled_cache: Optional[Dict[str, Pattern]] = None

    def register(self, category: str, type_name: str, pattern: str, flags: int = 0):
        """Compile ``pattern`` with ``flags`` and store it as ``type_name`` in ``category``.

        An unknown category is created. The caller chooses the flags: nothing
        here adds or strips ``re.IGNORECASE``.
        """
        bucket = self._patterns.setdefault(category, {})
        bucket[type_name] = re.compile(pattern, flags)
        # Any registration makes the flattened view stale.
        self._compiled_cache = None

    def get_all_patterns(self) -> Dict[str, Pattern]:
        """Return every registered pattern as a single ``{type_name: pattern}`` dict.

        The result is cached and the same dict object is returned until the
        next call to ``register``.
        """
        if self._compiled_cache is None:
            merged = {}
            for group in self._patterns.values():
                merged.update(group)
            self._compiled_cache = merged
        return self._compiled_cache
43
+
44
class PIIDetector:
    """
    Regex-based PII detection engine backed by a PatternRegistry.

    Patterns are registered per region at construction time; `detect` runs the
    patterns of the regions associated with a language code and resolves
    overlapping matches in favor of the longer one.
    """

    # Mapping of common language codes to their primary regions
    _LANG_REGION_MAP = {
        "en": ["GLOBAL", "AMERICAS", "EMEA", "APAC"],  # English is global
        "de": ["GLOBAL", "EMEA"],
        "es": ["GLOBAL", "EMEA", "AMERICAS"],
        "fr": ["GLOBAL", "EMEA", "AMERICAS"],
        "hi": ["GLOBAL", "APAC"],
        "pt": ["GLOBAL", "AMERICAS", "EMEA"],
        "zh": ["GLOBAL", "APAC"],
        "ja": ["GLOBAL", "APAC"],
    }

    def __init__(self):
        self.registry = PatternRegistry()
        self._initialize_core_patterns()

    def _initialize_core_patterns(self):
        """Register the built-in patterns for every region."""
        # --- CATEGORY: CONTACT & INTERNET (GLOBAL) ---
        g = "GLOBAL"
        ign = re.IGNORECASE
        self.registry.register(g, "EMAIL", r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', ign)
        self.registry.register(g, "PHONE_GLOBAL", r'(?<!\d)\+(?:[0-9] ?){6,14}[0-9](?!\d)')
        self.registry.register(g, "IPV4", r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b')
        self.registry.register(g, "IPV6", r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b', ign)
        self.registry.register(g, "MAC_ADDRESS", r'\b([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})\b', ign)

        # --- CATEGORY: FINANCIAL (GLOBAL) ---
        self.registry.register(g, "CREDIT_CARD", r'\b(?:\d{4}[-\s]?){3}\d{4}\b')
        self.registry.register(g, "IBAN", r'\b[A-Z]{2}[0-9]{2}[A-Z0-9]{11,30}\b')  # Critical: case-sensitive
        self.registry.register(g, "SWIFT_BIC", r'\b[A-Z]{6}[A-Z0-9]{2}([A-Z0-9]{3})?\b')  # Critical: case-sensitive
        self.registry.register(g, "CRYPTO_BTC", r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b')
        self.registry.register(g, "CRYPTO_ETH", r'\b0x[a-fA-F0-9]{40}\b', ign)

        # --- REGION: AMERICAS ---
        am = "AMERICAS"
        self.registry.register(am, "SSN_US", r'\b\d{3}-\d{2}-\d{4}\b')
        self.registry.register(am, "PASSPORT_US", r'\b[0-9]{9}\b')
        self.registry.register(am, "SIN_CA", r'\b\d{3} \d{3} \d{3}\b')
        self.registry.register(am, "CPF_BR", r'\b\d{3}\.\d{3}\.\d{3}-\d{2}\b')
        self.registry.register(am, "CURP_MX", r'\b[A-Z]{4}\d{6}[HM][A-Z]{5}[A-Z0-9]\d\b')

        # --- REGION: APAC (Asia Pacific) ---
        ap = "APAC"
        self.registry.register(ap, "AADHAAR_IN", r'\b[2-9][0-9]{3}\s?[0-9]{4}\s?[0-9]{4}\b')
        self.registry.register(ap, "PAN_IN", r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
        self.registry.register(ap, "PASSPORT_IN", r'\b[A-Z][0-9]{7}\b')
        self.registry.register(ap, "TFN_AU", r'\b\d{3} \d{3} \d{3}\b')
        self.registry.register(ap, "HKID_HK", r'\b[A-Z]{1,2}[0-9]{6}\([0-9A]\)\b')
        self.registry.register(ap, "NRIC_SG", r'\b[STFG][0-9]{7}[A-Z]\b')
        self.registry.register(ap, "RESIDENT_ID_CN", r'\b[1-9]\d{5}(?:18|19|20)\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[0-1])\d{3}[\dXx]\b')
        self.registry.register(ap, "MYKAD_MY", r'\b\d{6}-\d{2}-\d{4}\b')
        self.registry.register(ap, "ARC_TW", r'\b[A-Z][A-D0-9][0-9]{8}\b')
        self.registry.register(ap, "PASSPORT_JP", r'\b[A-Z]{2}[0-9]{7}\b')

        # --- REGION: EMEA (Europe, Middle East, Africa) ---
        em = "EMEA"
        self.registry.register(em, "NINO_UK", r'\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b')
        self.registry.register(em, "PPSN_IE", r'\b\d{7}[A-W][A-Z]?\b')
        self.registry.register(em, "NIE_ES", r'\b[XYZ][0-9]{7}[A-Z]\b')
        self.registry.register(em, "DNI_ES", r'\b[0-9]{8}[A-Z]\b')
        self.registry.register(em, "NIR_FR", r'\b[12]\s?\d{2}\s?(?:0[1-9]|1[0-2])\s?\d{2}\s?\d{3}\s?\d{3}\s?\d{2}\b')
        self.registry.register(em, "STEUERID_DE", r'\b\d{11}\b')
        self.registry.register(em, "PESEL_PL", r'\b\d{11}\b')
        self.registry.register(em, "CODICE_FISCALE_IT", r'\b[A-Z]{6}\d{2}[A-EHLMPR-T]\d{2}[A-Z]\d{3}[A-Z]\b')
        self.registry.register(em, "CNP_RO", r'\b[1-9]\d{12}\b')
        self.registry.register(em, "BSN_NL", r'\b\d{8,9}\b')
        self.registry.register(em, "EMIRATES_ID_UAE", r'\b784-\d{4}-\d{7}-\d\b')
        self.registry.register(em, "ID_SA", r'\b1\d{9}\b')
        self.registry.register(em, "ID_ZA", r'\b\d{13}\b')

    def detect(self, text: str, lang: Optional[str] = None) -> List[PIIEntity]:
        """Scans text and returns isolated PII entities, handling overlaps.

        Args:
            text: Text to scan.
            lang: Optional language code used to pick the regions to scan.
                An unknown or missing code scans every built-in region.

        Returns:
            Non-overlapping entities ordered by start offset. When two matches
            overlap, the longer one is kept; on equal length the earlier one
            (by start offset, then registration order) wins.
        """
        entities = []

        # Determine which categories to scan based on language. Copy the list
        # so the shared class-level mapping is never mutated by the append below.
        active_categories = list(
            self._LANG_REGION_MAP.get(lang, ["GLOBAL", "AMERICAS", "EMEA", "APAC"])
        )
        if "GLOBAL" not in active_categories:
            active_categories.append("GLOBAL")

        for cat in active_categories:
            cat_patterns = self.registry._patterns.get(cat, {})
            for entity_type, pattern in cat_patterns.items():
                for match in pattern.finditer(text):
                    entities.append(PIIEntity(
                        value=match.group(),
                        entity_type=entity_type,
                        start=match.start(),
                        end=match.end()
                    ))

        if not entities:
            return []

        # Handle overlaps: prioritize longer matches.
        # The sort is stable, so matches with the same start keep scan order.
        entities.sort(key=lambda x: x.start)
        filtered = []

        current = entities[0]
        for next_ent in entities[1:]:
            if next_ent.start < current.end:
                # Overlap detected
                if (next_ent.end - next_ent.start) > (current.end - current.start):
                    current = next_ent  # Keep the longer one
            else:
                filtered.append(current)
                current = next_ent
        filtered.append(current)

        return filtered