piifill-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- piifill_cli-0.1.0/LICENSE +21 -0
- piifill_cli-0.1.0/PKG-INFO +90 -0
- piifill_cli-0.1.0/README.md +67 -0
- piifill_cli-0.1.0/piifill/__init__.py +2 -0
- piifill_cli-0.1.0/piifill/cli.py +244 -0
- piifill_cli-0.1.0/piifill/engine/__init__.py +0 -0
- piifill_cli-0.1.0/piifill/engine/filtration.py +158 -0
- piifill_cli-0.1.0/piifill/engine/manager.py +143 -0
- piifill_cli-0.1.0/piifill/engine/vault.py +91 -0
- piifill_cli-0.1.0/piifill/logic/__init__.py +0 -0
- piifill_cli-0.1.0/piifill/logic/factory.py +26 -0
- piifill_cli-0.1.0/piifill/logic/linguistics.py +50 -0
- piifill_cli-0.1.0/piifill/logic/parsers.py +400 -0
- piifill_cli-0.1.0/piifill/logic/vision.py +100 -0
- piifill_cli-0.1.0/piifill/shared/__init__.py +0 -0
- piifill_cli-0.1.0/piifill/shared/config.py +16 -0
- piifill_cli-0.1.0/piifill/shared/ui.py +137 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/PKG-INFO +90 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/SOURCES.txt +23 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/dependency_links.txt +1 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/entry_points.txt +2 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/requires.txt +13 -0
- piifill_cli-0.1.0/piifill_cli.egg-info/top_level.txt +1 -0
- piifill_cli-0.1.0/pyproject.toml +35 -0
- piifill_cli-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Developer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: piifill-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PIIFILL: Professional Local-Logic PII Sanitization CLI
|
|
5
|
+
Author-email: Bhavin Sachaniya <bhavinsachaniya200@gmail.com>
|
|
6
|
+
Requires-Python: >=3.9
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: typer>=0.12.0
|
|
10
|
+
Requires-Dist: rich>=13.0.0
|
|
11
|
+
Requires-Dist: loguru>=0.7.0
|
|
12
|
+
Requires-Dist: pydantic>=2.0.0
|
|
13
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
14
|
+
Requires-Dist: easyocr
|
|
15
|
+
Requires-Dist: opencv-python-headless
|
|
16
|
+
Requires-Dist: numpy<2
|
|
17
|
+
Requires-Dist: pillow
|
|
18
|
+
Requires-Dist: pandas
|
|
19
|
+
Requires-Dist: openpyxl
|
|
20
|
+
Requires-Dist: pymupdf
|
|
21
|
+
Requires-Dist: python-docx
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# PIIFILL CLI
|
|
25
|
+
|
|
26
|
+
> **Automated Enterprise-Grade Local PII Sanitization CLI.**
|
|
27
|
+
|
|
28
|
+
PIIFILL is a high-performance terminal utility for detecting and masking sensitive data locally and securely.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## ⚡ Quick Start
|
|
33
|
+
|
|
34
|
+
### 1. Installation
|
|
35
|
+
Install the CLI in editable mode for local development:
|
|
36
|
+
```bash
|
|
37
|
+
cd CLI/piifill
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### 2. Basic Usage
|
|
42
|
+
Scan and mask PII in a file:
|
|
43
|
+
```bash
|
|
44
|
+
piifill mask path/to/file.json
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Command Reference
|
|
50
|
+
|
|
51
|
+
### `piifill mask`
|
|
52
|
+
Securely mask sensitive data in assets.
|
|
53
|
+
- `piifill mask <path>`: Direct masking of a file.
|
|
54
|
+
- `piifill mask <path> -o <output_path>`: Specify custom output path.
|
|
55
|
+
- `--mode`: Sanitization strategy (`mask`, `redact`, `tokenize`).
|
|
56
|
+
- `--local`: Rapid local directory protection (scans current directory by default).
|
|
57
|
+
|
|
58
|
+
### `piifill scan`
|
|
59
|
+
Detect PII in a file or directory without modifying it.
|
|
60
|
+
- `piifill scan <path>`: Scan an asset.
|
|
61
|
+
- `--recursive`: Scan entire directories.
|
|
62
|
+
|
|
63
|
+
### `piifill config`
|
|
64
|
+
Manage PIIFILL configuration.
|
|
65
|
+
|
|
66
|
+
### `piifill version`
|
|
67
|
+
Show PIIFILL version information.
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Security Analysis
|
|
72
|
+
|
|
73
|
+
Every sanitization run performs high-fidelity risk analysis:
|
|
74
|
+
- **Security Grading**: A-F scale based on PII density.
|
|
75
|
+
- **Risk Scoring**: 0-100 technical protection score.
|
|
76
|
+
- **Entity Breakdown**: Categorical distribution (emails, SSNs, credit cards, etc.).
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## 🧪 Verification
|
|
81
|
+
|
|
82
|
+
Run the CLI against sample data to verify detection:
|
|
83
|
+
```bash
|
|
84
|
+
piifill scan ./test_data/
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## License
|
|
90
|
+
See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# PIIFILL CLI
|
|
2
|
+
|
|
3
|
+
> **Automated Enterprise-Grade Local PII Sanitization CLI.**
|
|
4
|
+
|
|
5
|
+
PIIFILL is a high-performance terminal utility for detecting and masking sensitive data locally and securely.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## ⚡ Quick Start
|
|
10
|
+
|
|
11
|
+
### 1. Installation
|
|
12
|
+
Install the CLI in editable mode for local development:
|
|
13
|
+
```bash
|
|
14
|
+
cd CLI/piifill
|
|
15
|
+
pip install -e .
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
### 2. Basic Usage
|
|
19
|
+
Scan and mask PII in a file:
|
|
20
|
+
```bash
|
|
21
|
+
piifill mask path/to/file.json
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Command Reference
|
|
27
|
+
|
|
28
|
+
### `piifill mask`
|
|
29
|
+
Securely mask sensitive data in assets.
|
|
30
|
+
- `piifill mask <path>`: Direct masking of a file.
|
|
31
|
+
- `piifill mask <path> -o <output_path>`: Specify custom output path.
|
|
32
|
+
- `--mode`: Sanitization strategy (`mask`, `redact`, `tokenize`).
|
|
33
|
+
- `--local`: Rapid local directory protection (scans current directory by default).
|
|
34
|
+
|
|
35
|
+
### `piifill scan`
|
|
36
|
+
Detect PII in a file or directory without modifying it.
|
|
37
|
+
- `piifill scan <path>`: Scan an asset.
|
|
38
|
+
- `--recursive`: Scan entire directories.
|
|
39
|
+
|
|
40
|
+
### `piifill config`
|
|
41
|
+
Manage PIIFILL configuration.
|
|
42
|
+
|
|
43
|
+
### `piifill version`
|
|
44
|
+
Show PIIFILL version information.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Security Analysis
|
|
49
|
+
|
|
50
|
+
Every sanitization run performs high-fidelity risk analysis:
|
|
51
|
+
- **Security Grading**: A-F scale based on PII density.
|
|
52
|
+
- **Risk Scoring**: 0-100 technical protection score.
|
|
53
|
+
- **Entity Breakdown**: Categorical distribution (emails, SSNs, credit cards, etc.).
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## 🧪 Verification
|
|
58
|
+
|
|
59
|
+
Run the CLI against sample data to verify detection:
|
|
60
|
+
```bash
|
|
61
|
+
piifill scan ./test_data/
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## License
|
|
67
|
+
See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, List
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
7
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
# --- Internal Imports ---
|
|
11
|
+
from .shared.config import settings
|
|
12
|
+
from .shared.ui import (
|
|
13
|
+
setup_logger, print_banner, t, print_step, print_status,
|
|
14
|
+
print_error, print_success, format_detection_table, console,
|
|
15
|
+
translator
|
|
16
|
+
)
|
|
17
|
+
from .engine.manager import filtration_manager, FiltrationManager
|
|
18
|
+
from .logic.factory import FileProcessor
|
|
19
|
+
|
|
20
|
+
# --- Initialization ---
# Module-level logger, configured from the shared settings object
# (log file path and log level).
logger = setup_logger(settings.log_file, settings.log_level)

# --- CLI Application ---
# Root Typer application. `no_args_is_help=False` means invoking the CLI with
# no arguments does not short-circuit into the help screen, so the `main`
# callback below can print its own welcome text instead.
app = typer.Typer(
    help="PIIFILL: Professional Enterprise-Grade PII Sanitization CLI.",
    no_args_is_help=False,
    add_completion=False,
)
|
|
29
|
+
|
|
30
|
+
# --- Helpers ---
|
|
31
|
+
def version_callback(value: bool):
    """Handle the eager ``--version`` flag.

    When ``value`` is truthy, print the banner and the version lines and then
    stop the CLI by raising ``typer.Exit``. When it is falsy, do nothing.
    """
    if not value:
        return

    print_banner()
    for line in (
        "[bold cyan]PIIFILL CLI Version:[/bold cyan] 0.1.0",
        "[dim]Enterprise-Grade Local PII Sanitization CLI[/dim]",
    ):
        console.print(line)
    raise typer.Exit()
|
|
37
|
+
|
|
38
|
+
def _worker_process_file(fpath: Path, input_path: Path, output_dir: Path, final_name: Optional[str], mode: str):
    """Sanitize one file; intended to run inside a worker process.

    A fresh ``FiltrationManager`` is created for every call.

    Returns:
        A 3-tuple ``(filename, success, payload)``. On success ``payload`` is
        the manager's session report; on failure it is the exception text.
    """
    engine = FiltrationManager()
    name = fpath.name
    try:
        # Directory runs mirror the input tree below output_dir; single-file
        # runs write to the explicitly chosen file name.
        target = (
            output_dir / fpath.relative_to(input_path)
            if input_path.is_dir()
            else output_dir / final_name
        )
        target.parent.mkdir(parents=True, exist_ok=True)
        FileProcessor.get_parser(fpath.suffix).parse_file(fpath, target, engine, mode)
        return name, True, engine.get_session_report()
    except Exception as exc:
        # Failures are returned rather than raised so the parent process can
        # report them per file.
        return name, False, str(exc)
|
|
55
|
+
|
|
56
|
+
# --- Commands ---
|
|
57
|
+
@app.command(name="mask", help="Securely protect assets (Files/Directories)")
def mask_command(
    path: Optional[Path] = typer.Argument(None, help="File or directory path to secure"),
    output: Optional[Path] = typer.Option(None, "--output", "-o", help="Custom output path"),
    mode: str = typer.Option("mask", "--mode", "-m", help="Sanitization strategy (mask/redact/tokenize)"),
    local: bool = typer.Option(False, "--local", "-l", help="Rapid local directory protection (scans current dir)")
):
    """Securely mask sensitive data in assets.

    Args:
        path: File or directory to sanitize. Optional so that ``--local`` can
            fall back to the current directory; without ``--local`` a missing
            path is reported as an error.
        output: Destination. For a single input file, a value with a suffix is
            treated as the output file and a value without one as the output
            directory. Defaults to an ``out`` directory next to (file input)
            or inside (directory input) the source.
        mode: Sanitization strategy, passed through to the parser.
        local: Use the current directory when no path is given.

    Raises:
        typer.Exit: code 1 when the path is missing or does not exist; code 0
            when there is nothing to process.
    """
    if local and path is None:
        path = Path(".")

    if path is None:
        print_error("Path is required.")
        raise typer.Exit(code=1)

    input_path = path.resolve()
    if not input_path.exists():
        print_error(t("error_path_exists", path=str(input_path)))
        raise typer.Exit(code=1)

    if input_path.is_file():
        files_to_process = [input_path]
        if output:
            if output.suffix:
                final_output_path_base = output.resolve()
                output_dir = final_output_path_base.parent
                final_name = final_output_path_base.name
            else:
                output_dir = output.resolve()
                final_name = input_path.name
        else:
            output_dir = input_path.parent / "out"
            final_name = input_path.name
    else:
        output_dir = output.resolve() if output else input_path / "out"
        # Skip hidden files and anything already inside the output directory,
        # otherwise a second run would re-mask earlier results into out/out/...
        files_to_process = [
            p for p in input_path.rglob("*")
            if p.is_file()
            and not p.name.startswith('.')
            and output_dir not in p.parents
        ]
        final_name = None

    if not files_to_process:
        print_error(t("error_no_files", path=str(input_path)))
        raise typer.Exit()

    output_dir.mkdir(parents=True, exist_ok=True)

    results_table = Table(title=t("results_table_title"), box=None, header_style="bold cyan")
    results_table.add_column(t("results_asset"), style="cyan")
    results_table.add_column(t("results_status"), justify="center")
    results_table.add_column(t("results_grade"), justify="center")

    if len(files_to_process) > 1:
        # Several files: fan out across worker processes.
        print_step(t("step_deploy_parallel", count=f"[bold]{os.cpu_count()}[/bold]", file_count=f"[bold]{len(files_to_process)}[/bold]"))
        with Progress(SpinnerColumn(), TextColumn(f"[cyan]{t('status_parallel')}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console, transient=True) as progress:
            task = progress.add_task(t("status_securing"), total=len(files_to_process))
            with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
                # Map each future back to its file so a crashed worker can
                # still be reported under the right name.
                futures = {
                    executor.submit(_worker_process_file, f, input_path, output_dir, final_name, mode): f
                    for f in files_to_process
                }
                for future in as_completed(futures):
                    try:
                        fname, success, result = future.result()
                    except Exception as e:
                        # The worker itself failed (e.g. the process died);
                        # record it as a per-file failure instead of aborting.
                        fname, success, result = futures[future].name, False, str(e)
                    if success:
                        results_table.add_row(fname, f"[bold green]{t('status_secured')}[/bold green]", f"[bold]{result['risk_grade']}[/bold]")
                    else:
                        results_table.add_row(fname, f"[red]{t('status_failed')}[/red]", "N/A")
                        logger.error(f"Protection Error on {fname}: {result}")
                    progress.advance(task)
    else:
        # Single file: process in this process with a per-file progress bar.
        print_step(t("step_deploy_protection", count=f"[bold]{len(files_to_process)}[/bold]"))
        for fpath in files_to_process:
            filename = fpath.name
            try:
                if input_path.is_dir():
                    final_output_path = output_dir / fpath.relative_to(input_path)
                else:
                    final_output_path = output_dir / final_name

                final_output_path.parent.mkdir(parents=True, exist_ok=True)
                filtration_manager.reset_session()
                parser = FileProcessor.get_parser(fpath.suffix)

                with Progress(SpinnerColumn(), TextColumn(f"[cyan]{t('securing_file', name=filename)}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console, transient=True) as progress:
                    task_id = progress.add_task(t("sanitizing"), total=1.0)
                    parser.parse_file(fpath, final_output_path, filtration_manager, mode, progress_callback=lambda p: progress.update(task_id, completed=p))

                report = filtration_manager.get_session_report()
                results_table.add_row(filename, f"[bold green]{t('status_secured')}[/bold green]", f"[bold]{report['risk_grade']}[/bold]")
            except Exception as e:
                results_table.add_row(filename, f"[red]{t('status_failed')}[/red]", "N/A")
                logger.error(f"Protection Error on {filename}: {e}")

    console.print("")
    console.print(results_table)
    print_status(t("status_protection_complete", path=f"[bold cyan]{output_dir}[/bold cyan]"), "success")
|
|
147
|
+
|
|
148
|
+
@app.command(name="scan", help="Scan assets for PII without modification")
def scan_command(
    path: Path = typer.Argument(..., help="File or directory path to scan for PII"),
    recursive: bool = typer.Option(False, "--recursive", "-r", help="Scan directory recursively")
):
    """Detect PII in a file or directory

    A single file is read in 1 MiB chunks and every detection is listed (at
    most 50 rows are shown). A directory is only scanned when ``--recursive``
    is given, and only the total detection count is reported.

    Raises:
        typer.Exit: code 1 when the path does not exist; code 0 when a
            directory is given without ``--recursive`` or contains no files.
    """
    # Local import: only the single-file branch needs it.
    import codecs

    if not path.exists():
        print_error(t("error_path_exists", path=str(path)))
        raise typer.Exit(code=1)

    if path.is_file():
        logger.info(f"Scanning file: {path}")
        print_step(f"Analyzing asset: [bold]{path.name}[/bold]")
        file_size = path.stat().st_size
        pii_entities = []
        filtration_manager.reset_session()
        try:
            # An incremental decoder keeps a multi-byte UTF-8 character that
            # straddles a chunk boundary intact instead of turning both halves
            # into replacement characters.
            decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
            with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(bar_width=40, pulse_style="magenta"), TaskProgressColumn(), console=console, transient=True) as progress:
                task = progress.add_task(f"[magenta]{t('status_processing')}", total=file_size)
                with open(path, "rb") as f:
                    while True:
                        chunk = f.read(1024 * 1024)
                        # final=True on the empty read flushes any dangling bytes.
                        text = decoder.decode(chunk, final=not chunk)
                        if text:
                            # NOTE: offsets are relative to the chunk, and a PII
                            # value split across two chunks is not detected.
                            pii_entities.extend(filtration_manager.detect(text))
                        if not chunk:
                            break
                        progress.advance(task, len(chunk))
            if not pii_entities:
                print_status(t("status_no_pii"), "success")
            else:
                print_status(t("status_found_pii", count=len(pii_entities)), "warning")
                display_entities = pii_entities[:50]
                table = format_detection_table([e.model_dump() for e in display_entities])
                console.print(table)
                if len(pii_entities) > 50:
                    console.print(f"[dim]{t('status_hidden_items', count=len(pii_entities)-50)}[/dim]")
            print_status(t("status_scan_complete", name=f"[bold]{path.name}[/bold]"), "success")
        except Exception as e:
            logger.error(f"Error scanning {path}: {e}")
            print_error(t("error_scan_failed", name=path.name))
    elif path.is_dir():
        if not recursive:
            # print_warning is not imported in this module; print_status with
            # the "warning" level is the helper already used for warnings here.
            print_status(t("warn_dir_recursive"), "warning")
            raise typer.Exit()
        files = [p for p in path.rglob("*") if p.is_file() and not p.name.startswith('.')]
        if not files:
            print_error(t("error_no_files", path=str(path)))
            raise typer.Exit()
        total_pii = 0
        print_step(t("step_deploy_batch", count=f"[bold]{len(files)}[/bold]"))
        with Progress(SpinnerColumn(), TextColumn("[progress.description]{task.description}"), BarColumn(bar_width=40, pulse_style="cyan"), TaskProgressColumn(), console=console) as progress:
            task = progress.add_task(f"[cyan]{t('status_processing')}", total=len(files))
            for fpath in files:
                try:
                    with open(fpath, "rb") as f:
                        text = f.read().decode('utf-8', errors='replace')
                    total_pii += len(filtration_manager.detect(text))
                except Exception as e:
                    # Best effort: keep scanning the other files, but leave a
                    # trace instead of silently skipping this one.
                    logger.error(f"Error scanning {fpath}: {e}")
                progress.advance(task)
        console.print("")
        print_status(t("status_batch_complete", pii_count=f"[bold]{total_pii}[/bold]", file_count=len(files)), "success")
|
|
210
|
+
|
|
211
|
+
@app.command(name="config", help="Manage PIIFILL configuration")
def config_command():
    """List current configurations.

    Prints the log level, log file, default mask style and supported file
    extensions from the shared ``settings`` object, framed by header and
    footer rules.
    """
    emit = console.print
    emit("\n[cyan]--- PIIFILL CONFIGURATION ---[/cyan]")
    emit(f"Log Level: [bold]{settings.log_level}[/bold]")
    emit(f"Log File: [bold]{settings.log_file}[/bold]")
    emit(f"Default Mask: [bold]{settings.default_mask_style}[/bold]")
    formats = ', '.join(settings.supported_extensions)
    emit(f"Supported Formats: [bold]{formats}[/bold]")
    emit("[cyan]-----------------------------[/cyan]\n")
|
|
220
|
+
|
|
221
|
+
@app.command(name="version", help="Show version information")
def version_command():
    """Show the application version.

    Prints the version line followed by the one-line product description.
    """
    for line in (
        "[bold cyan]PIIFILL CLI Version:[/bold cyan] 0.1.0",
        "[dim]Enterprise-Grade Local PII Sanitization CLI[/dim]",
    ):
        console.print(line)
|
|
226
|
+
|
|
227
|
+
@app.callback(invoke_without_command=True)
def main(
    ctx: typer.Context,
    version: bool = typer.Option(None, "--version", "-v", callback=version_callback, is_eager=True, help="Show version info"),
    lang: str = typer.Option(settings.language, "--lang", "-L", help="Set language (en, hi, es, fr)"),
):
    """Automated Enterprise-Grade PII Sanitization Logic."""
    # Runs before every subcommand: select the UI language, then show the banner.
    translator.set_language(lang)
    print_banner()

    # A subcommand will take over from here; nothing more to print.
    if ctx.invoked_subcommand is not None:
        return

    # Bare invocation: greet the user and point at --help.
    console.print("\n[bold cyan]Welcome to PIIFILL![/bold cyan]")
    console.print("Use [bold]--help[/bold] to see available commands.")
|
|
239
|
+
|
|
240
|
+
def entrypoint():
    """Run the Typer application (thin wrapper around ``app()``)."""
    # NOTE(review): presumably the target of the package's console-script
    # entry point -- confirm against entry_points.txt / pyproject.toml.
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    app()
|
|
File without changes
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Dict, List, Pattern, Optional, Set
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
|
|
5
|
+
class PIIEntity(BaseModel):
    """A single PII match produced by the detector."""
    # The matched text itself.
    value: str
    # Type name of the pattern that matched (e.g. "EMAIL").
    entity_type: str
    # Start offset of the match within the scanned text.
    start: int
    # End offset of the match within the scanned text (exclusive).
    end: int
|
|
10
|
+
|
|
11
|
+
class PatternRegistry:
    """
    Store of compiled PII regexes, grouped by category (region) name.

    A flattened ``type_name -> pattern`` view is built lazily and kept until
    the next registration invalidates it.
    """

    def __init__(self):
        # Built-in categories: GLOBAL (multi-country patterns such as email
        # or IP), AMERICAS, EMEA and APAC. More are created on first use.
        self._patterns: Dict[str, Dict[str, Pattern]] = {
            name: {} for name in ("GLOBAL", "AMERICAS", "EMEA", "APAC")
        }
        self._compiled_cache: Optional[Dict[str, Pattern]] = None

    def register(self, category: str, type_name: str, pattern: str, flags: int = 0):
        """Compile ``pattern`` and store it as ``type_name`` under ``category``.

        Unknown categories are created. ``flags`` defaults to 0, so matching
        is case-sensitive unless the caller asks otherwise.
        """
        bucket = self._patterns.setdefault(category, {})
        bucket[type_name] = re.compile(pattern, flags)
        # The flattened view is stale now; rebuild it on the next lookup.
        self._compiled_cache = None

    def get_all_patterns(self) -> Dict[str, Pattern]:
        """Return one dict with every registered pattern across categories.

        The dict is cached and the same object is returned until ``register``
        is called again. When two categories share a type name, the category
        inserted later wins.
        """
        merged = self._compiled_cache
        if merged is None:
            merged = {}
            for bucket in self._patterns.values():
                merged.update(bucket)
            self._compiled_cache = merged
        return merged
|
|
43
|
+
|
|
44
|
+
class PIIDetector:
    """
    Professional-grade PII detection engine using a modular PatternRegistry.
    Adheres to PIIFILL high-performance and type-safety standards.

    Patterns are grouped by region; ``detect`` picks the regions to scan from
    an optional language code and resolves overlapping matches.
    """

    # Mapping of common language codes to their primary regions
    _LANG_REGION_MAP = {
        "en": ["GLOBAL", "AMERICAS", "EMEA", "APAC"],  # English is global
        "de": ["GLOBAL", "EMEA"],
        "es": ["GLOBAL", "EMEA", "AMERICAS"],
        "fr": ["GLOBAL", "EMEA", "AMERICAS"],
        "hi": ["GLOBAL", "APAC"],
        "pt": ["GLOBAL", "AMERICAS", "EMEA"],
        "zh": ["GLOBAL", "APAC"],
        "ja": ["GLOBAL", "APAC"],
    }

    def __init__(self):
        self.registry = PatternRegistry()
        self._initialize_core_patterns()

    def _initialize_core_patterns(self):
        """Register the built-in patterns, grouped by region.

        Some identifiers share the same regex (SIN_CA/TFN_AU and
        STEUERID_DE/PESEL_PL), so one span can match under several type
        names; ``detect`` then reports only one of them.
        """
        # --- CATEGORY: CONTACT & INTERNET (GLOBAL) ---
        g = "GLOBAL"
        ign = re.IGNORECASE
        self.registry.register(g, "EMAIL", r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', ign)
        self.registry.register(g, "PHONE_GLOBAL", r'(?<!\d)\+(?:[0-9] ?){6,14}[0-9](?!\d)')
        self.registry.register(g, "IPV4", r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b')
        self.registry.register(g, "IPV6", r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b', ign)
        self.registry.register(g, "MAC_ADDRESS", r'\b([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})\b', ign)

        # --- CATEGORY: FINANCIAL (GLOBAL) ---
        self.registry.register(g, "CREDIT_CARD", r'\b(?:\d{4}[-\s]?){3}\d{4}\b')
        self.registry.register(g, "IBAN", r'\b[A-Z]{2}[0-9]{2}[A-Z0-9]{11,30}\b')  # Critical: case-sensitive
        self.registry.register(g, "SWIFT_BIC", r'\b[A-Z]{6}[A-Z0-9]{2}([A-Z0-9]{3})?\b')  # Critical: case-sensitive
        self.registry.register(g, "CRYPTO_BTC", r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b')
        self.registry.register(g, "CRYPTO_ETH", r'\b0x[a-fA-F0-9]{40}\b', ign)

        # --- REGION: AMERICAS ---
        am = "AMERICAS"
        self.registry.register(am, "SSN_US", r'\b\d{3}-\d{2}-\d{4}\b')
        self.registry.register(am, "PASSPORT_US", r'\b[0-9]{9}\b')
        self.registry.register(am, "SIN_CA", r'\b\d{3} \d{3} \d{3}\b')
        self.registry.register(am, "CPF_BR", r'\b\d{3}\.\d{3}\.\d{3}-\d{2}\b')
        self.registry.register(am, "CURP_MX", r'\b[A-Z]{4}\d{6}[HM][A-Z]{5}[A-Z0-9]\d\b')

        # --- REGION: APAC (Asia Pacific) ---
        ap = "APAC"
        self.registry.register(ap, "AADHAAR_IN", r'\b[2-9][0-9]{3}\s?[0-9]{4}\s?[0-9]{4}\b')
        self.registry.register(ap, "PAN_IN", r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
        self.registry.register(ap, "PASSPORT_IN", r'\b[A-Z][0-9]{7}\b')
        self.registry.register(ap, "TFN_AU", r'\b\d{3} \d{3} \d{3}\b')
        self.registry.register(ap, "HKID_HK", r'\b[A-Z]{1,2}[0-9]{6}\([0-9A]\)\b')
        self.registry.register(ap, "NRIC_SG", r'\b[STFG][0-9]{7}[A-Z]\b')
        self.registry.register(ap, "RESIDENT_ID_CN", r'\b[1-9]\d{5}(?:18|19|20)\d{2}(?:0[1-9]|1[0-2])(?:0[1-9]|[1-2]\d|3[0-1])\d{3}[\dXx]\b')
        self.registry.register(ap, "MYKAD_MY", r'\b\d{6}-\d{2}-\d{4}\b')
        self.registry.register(ap, "ARC_TW", r'\b[A-Z][A-D0-9][0-9]{8}\b')
        self.registry.register(ap, "PASSPORT_JP", r'\b[A-Z]{2}[0-9]{7}\b')

        # --- REGION: EMEA (Europe, Middle East, Africa) ---
        em = "EMEA"
        self.registry.register(em, "NINO_UK", r'\b[A-CEGHJ-PR-TW-Z]{2}\s?\d{2}\s?\d{2}\s?\d{2}\s?[A-D]\b')
        self.registry.register(em, "PPSN_IE", r'\b\d{7}[A-W][A-Z]?\b')
        self.registry.register(em, "NIE_ES", r'\b[XYZ][0-9]{7}[A-Z]\b')
        self.registry.register(em, "DNI_ES", r'\b[0-9]{8}[A-Z]\b')
        self.registry.register(em, "NIR_FR", r'\b[12]\s?\d{2}\s?(?:0[1-9]|1[0-2])\s?\d{2}\s?\d{3}\s?\d{3}\s?\d{2}\b')
        self.registry.register(em, "STEUERID_DE", r'\b\d{11}\b')
        self.registry.register(em, "PESEL_PL", r'\b\d{11}\b')
        self.registry.register(em, "CODICE_FISCALE_IT", r'\b[A-Z]{6}\d{2}[A-EHLMPR-T]\d{2}[A-Z]\d{3}[A-Z]\b')
        self.registry.register(em, "CNP_RO", r'\b[1-9]\d{12}\b')
        self.registry.register(em, "BSN_NL", r'\b\d{8,9}\b')
        self.registry.register(em, "EMIRATES_ID_UAE", r'\b784-\d{4}-\d{7}-\d\b')
        self.registry.register(em, "ID_SA", r'\b1\d{9}\b')
        self.registry.register(em, "ID_ZA", r'\b\d{13}\b')

    def detect(self, text: str, lang: Optional[str] = None) -> List[PIIEntity]:
        """Scans text and returns isolated PII entities, handling overlaps.

        Args:
            text: The text to scan.
            lang: Optional language code used to choose which regions to scan.
                Unknown codes and ``None`` scan every region; GLOBAL is always
                included.

        Returns:
            Entities ordered by start offset. Among overlapping matches a
            strictly longer one replaces the current one; otherwise the
            earlier one is kept.
        """
        entities = []

        # Determine which categories to scan based on language.
        # Copy the list: the mapping's values are shared class-level state and
        # must not be mutated by the append below.
        active_categories = list(self._LANG_REGION_MAP.get(lang, ["GLOBAL", "AMERICAS", "EMEA", "APAC"]))
        if "GLOBAL" not in active_categories:
            active_categories.append("GLOBAL")

        for cat in active_categories:
            cat_patterns = self.registry._patterns.get(cat, {})
            for entity_type, pattern in cat_patterns.items():
                for match in pattern.finditer(text):
                    entities.append(PIIEntity(
                        value=match.group(),
                        entity_type=entity_type,
                        start=match.start(),
                        end=match.end()
                    ))

        if not entities:
            return []

        # Handle overlaps: prioritize longer matches.
        # The sort is stable, so matches with the same start keep their
        # category/registration order.
        entities.sort(key=lambda x: x.start)
        filtered = []

        current = entities[0]
        for next_ent in entities[1:]:
            if next_ent.start < current.end:
                # Overlap detected
                if (next_ent.end - next_ent.start) > (current.end - current.start):
                    current = next_ent  # Keep the longer one
            else:
                filtered.append(current)
                current = next_ent
        filtered.append(current)

        return filtered
|