piiscrub 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- piiscrub-0.1.0/LICENSE +21 -0
- piiscrub-0.1.0/PKG-INFO +198 -0
- piiscrub-0.1.0/README.md +174 -0
- piiscrub-0.1.0/piiscrub/__init__.py +4 -0
- piiscrub-0.1.0/piiscrub/cli.py +175 -0
- piiscrub-0.1.0/piiscrub/core.py +258 -0
- piiscrub-0.1.0/piiscrub/patterns.py +24 -0
- piiscrub-0.1.0/piiscrub/profiles.py +25 -0
- piiscrub-0.1.0/piiscrub/validators.py +48 -0
- piiscrub-0.1.0/piiscrub.egg-info/PKG-INFO +198 -0
- piiscrub-0.1.0/piiscrub.egg-info/SOURCES.txt +18 -0
- piiscrub-0.1.0/piiscrub.egg-info/dependency_links.txt +1 -0
- piiscrub-0.1.0/piiscrub.egg-info/entry_points.txt +2 -0
- piiscrub-0.1.0/piiscrub.egg-info/requires.txt +1 -0
- piiscrub-0.1.0/piiscrub.egg-info/top_level.txt +1 -0
- piiscrub-0.1.0/pyproject.toml +41 -0
- piiscrub-0.1.0/setup.cfg +4 -0
- piiscrub-0.1.0/tests/test_cli.py +94 -0
- piiscrub-0.1.0/tests/test_core.py +188 -0
- piiscrub-0.1.0/tests/test_validators.py +49 -0
piiscrub-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Omkar Pathak
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
piiscrub-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: piiscrub
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII)
|
|
5
|
+
Author: Omkar Pathak
|
|
6
|
+
Project-URL: Homepage, https://github.com/OmkarPathak/cleanslate
|
|
7
|
+
Project-URL: Issues, https://github.com/OmkarPathak/cleanslate/issues
|
|
8
|
+
Keywords: pii,scrub,anonymization,privacy,llm,rag
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
18
|
+
Classifier: Topic :: Security
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: faker
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# PiiScrub
|
|
26
|
+
|
|
27
|
+
A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII) from datasets for LLM training and RAG pipelines.
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
|
|
31
|
+
- **Maximum Speed & Minimal Dependencies:** Detection relies exclusively on Python's standard library; the only third-party requirement is `faker`, used for synthetic data generation. No `pandas`, `spaCy`, or other heavy external packages.
|
|
32
|
+
- **Deterministic Validation:** Raw regex matches for high-risk entities (like credit cards and IPs) pass algorithmic checksums (e.g., Luhn algorithm, octet range checks) before being flagged to eliminate false positives.
|
|
33
|
+
- **Pre-compiled Regex:** All regular expressions are compiled at the module level using `re.compile()` for O(1) setup time during execution.
|
|
34
|
+
- **Large Dataset Streaming:** Features `scrub_stream` and `extract_stream` to process massive datasets chunk-by-chunk without running out of memory.
|
|
35
|
+
- **Multi-Core Parallel Processing:** Leverage multiple CPU cores to scrub large files at blazing speed using `--parallel`.
|
|
36
|
+
- **Pre-Bundled Compliance Profiles:** Quickly target specific standards like `hipaa`, `pci-dss`, or `gdpr` using the `--profile` flag.
|
|
37
|
+
- **Compliance Auditing & Metric Reports:** Generate detailed JSON reports with statistics on redacted entities and execution time using `--report`.
|
|
38
|
+
- **High-Value Secret Detection:** Locates critical secrets such as AWS Access Keys, GitHub Tokens, and RSA Private Keys out of the box.
|
|
39
|
+
- **Deterministic Hashing:** Replace PII with deterministic SHA-256 hashes instead of generic tags to track uniqueness without leaking data.
|
|
40
|
+
- **Synthetic Data Generation:** Replace real PII with realistic "fake" data using the `faker` library (beta).
|
|
41
|
+
- **Configuration File Support:** Manage complex settings via `piiscrub.json` instead of long CLI commands.
|
|
42
|
+
- **Custom Pattern Injection:** Dynamically inject your own regex patterns and validators directly into the engine without modifying the core library.
|
|
43
|
+
- **Allowlist Support:** Explicitly bypass scrubbing for public figures, system emails, or company identifiers to prevent false positives.
|
|
44
|
+
|
|
45
|
+
## Supported Entities
|
|
46
|
+
|
|
47
|
+
- **Global:**
|
|
48
|
+
- `EMAIL`
|
|
49
|
+
- `PHONE_GENERIC` (international)
|
|
50
|
+
- `CREDIT_CARD` (13-16 digits with Luhn algorithm validation)
|
|
51
|
+
- `IPV4` (validation ensuring all octets <= 255)
|
|
52
|
+
- `IPV6`
|
|
53
|
+
- **US Specific:**
|
|
54
|
+
- `US_SSN`
|
|
55
|
+
- **India Specific:**
|
|
56
|
+
- `IN_AADHAAR` (12 digits, cannot start with 0 or 1)
|
|
57
|
+
- `IN_PAN` (5 uppercase letters, 4 digits, 1 uppercase letter)
|
|
58
|
+
- **Secrets & Credentials (V2):**
|
|
59
|
+
- `AWS_ACCESS_KEY`
|
|
60
|
+
- `GITHUB_TOKEN`
|
|
61
|
+
- `RSA_PRIVATE_KEY`
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## CLI Usage
|
|
70
|
+
|
|
71
|
+
### Extract PII
|
|
72
|
+
```bash
|
|
73
|
+
piiscrub extract --text "My email is test@example.com"
|
|
74
|
+
piiscrub extract --file text.txt
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Scrub PII
|
|
78
|
+
```bash
|
|
79
|
+
piiscrub scrub --text "My email is test@example.com"
|
|
80
|
+
piiscrub scrub --file text.txt
|
|
81
|
+
|
|
82
|
+
# Use deterministic hashing instead of standard tags
|
|
83
|
+
piiscrub scrub --text "My email is test@example.com" --style hash
|
|
84
|
+
# Output: My email is <EMAIL_a1517717>
|
|
85
|
+
|
|
86
|
+
# Bypass scrubbing for specific public strings
|
|
87
|
+
piiscrub scrub --text "Contact support@example.com or user@example.com" --allowlist support@example.com
|
|
88
|
+
# Output: Contact support@example.com or <EMAIL>
|
|
89
|
+
|
|
90
|
+
# Inject Custom Pattern from the CLI
|
|
91
|
+
piiscrub scrub --text "This is employee EMP-99881 and email a@b.com" --custom-pattern EMP_ID "\bEMP-\d{5}\b" --entities EMP_ID EMAIL
|
|
92
|
+
# Output: This is employee <EMP_ID> and email <EMAIL>
|
|
93
|
+
|
|
94
|
+
# Synthetic Data Generation
|
|
95
|
+
piiscrub scrub --text "Contact me at omkar@example.com" --style synthetic
|
|
96
|
+
# Output: Contact me at victoria12@gmail.com
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Advanced Features
|
|
100
|
+
|
|
101
|
+
#### 1. Configuration File (`piiscrub.json`)
|
|
102
|
+
You can define a `piiscrub.json` file in your working directory to simplify your commands:
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
{
|
|
106
|
+
"style": "hash",
|
|
107
|
+
"entities": ["EMAIL", "PHONE_GENERIC"],
|
|
108
|
+
"allowlist": ["support@mycompany.com"],
|
|
109
|
+
"custom_patterns": {
|
|
110
|
+
"ORDER_ID": "ORD-\\d{5}"
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Now just run:
|
|
116
|
+
```bash
|
|
117
|
+
piiscrub scrub --file data.txt
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
#### 2. Parallel Processing
|
|
121
|
+
For large files, use multi-core processing:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
piiscrub scrub --file large_dataset.txt --parallel --output cleaned.txt
|
|
125
|
+
```
|
|
126
|
+
> [!TIP]
|
|
127
|
+
> Parallel mode automatically handles file I/O efficiently and defaults to using all available CPU cores.
|
|
128
|
+
|
|
129
|
+
#### 3. Pre-Bundled Compliance Profiles
|
|
130
|
+
Quickly target common privacy standards without remembering every entity name:
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
# Scrub only PCI-DSS related data (Credit Cards)
|
|
134
|
+
piiscrub scrub --file transactions.txt --profile pci-dss
|
|
135
|
+
|
|
136
|
+
# Scrub HIPAA related data (SSN, Phone, Email, IP)
|
|
137
|
+
piiscrub scrub --file medical_records.txt --profile hipaa
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Available profiles: `pci-dss`, `hipaa`, `gdpr`, `strict`.
|
|
141
|
+
|
|
142
|
+
#### 4. Compliance Auditing & Metric Reports
|
|
143
|
+
Data compliance teams can generate a statistical summary of the scrubbing process as proof of redaction:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
piiscrub scrub --file sensitive_data.txt --report audit.json
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
**Sample `audit.json` output:**
|
|
150
|
+
```json
|
|
151
|
+
{
|
|
152
|
+
"command": "scrub",
|
|
153
|
+
"total_lines_processed": 5000,
|
|
154
|
+
"execution_time_seconds": 1.25,
|
|
155
|
+
"entities_redacted": {
|
|
156
|
+
"EMAIL": 142,
|
|
157
|
+
"CREDIT_CARD": 12,
|
|
158
|
+
"PHONE_GENERIC": 5
|
|
159
|
+
},
|
|
160
|
+
"style": "tag"
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### Stream Processing
|
|
165
|
+
For extremely large files (e.g. LLM corpus data logs):
|
|
166
|
+
```bash
|
|
167
|
+
piiscrub scrub --file huge_dataset.jsonl --stream > scrubbed.jsonl
|
|
168
|
+
piiscrub extract --file huge_dataset.jsonl --stream > entities.json
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Library Usage
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from piiscrub.core import PiiScrub
|
|
175
|
+
import re
|
|
176
|
+
|
|
177
|
+
# Initialize with custom generic entities or pattern injection!
|
|
178
|
+
custom_patterns = {
|
|
179
|
+
"INTERNAL_ID": re.compile(r"\bEMP-\d{5}\b")
|
|
180
|
+
}
|
|
181
|
+
cs = PiiScrub(
|
|
182
|
+
entities=["EMAIL", "CREDIT_CARD", "INTERNAL_ID"],
|
|
183
|
+
custom_patterns=custom_patterns,
|
|
184
|
+
allowlist=["public@example.com"]
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
code = "Contact test@example.com for info on EMP-12345."
|
|
188
|
+
|
|
189
|
+
# Extract entities
|
|
190
|
+
extracted = cs.extract_entities(code)
|
|
191
|
+
print(extracted)
|
|
192
|
+
# {'EMAIL': ['test@example.com'], 'INTERNAL_ID': ['EMP-12345']}
|
|
193
|
+
|
|
194
|
+
# Scrub entities using hashing
|
|
195
|
+
scrubbed_code = cs.scrub_text(code, replacement_style="hash")
|
|
196
|
+
print(scrubbed_code)
|
|
197
|
+
# Contact <EMAIL_a1517717> for info on <INTERNAL_ID_b5fb38c3>.
|
|
198
|
+
```
|
piiscrub-0.1.0/README.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# PiiScrub
|
|
2
|
+
|
|
3
|
+
A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII) from datasets for LLM training and RAG pipelines.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Maximum Speed & Minimal Dependencies:** Detection relies exclusively on Python's standard library; the only third-party requirement is `faker`, used for synthetic data generation. No `pandas`, `spaCy`, or other heavy external packages.
|
|
8
|
+
- **Deterministic Validation:** Raw regex matches for high-risk entities (like credit cards and IPs) pass algorithmic checksums (e.g., Luhn algorithm, octet range checks) before being flagged to eliminate false positives.
|
|
9
|
+
- **Pre-compiled Regex:** All regular expressions are compiled at the module level using `re.compile()` for O(1) setup time during execution.
|
|
10
|
+
- **Large Dataset Streaming:** Features `scrub_stream` and `extract_stream` to process massive datasets chunk-by-chunk without running out of memory.
|
|
11
|
+
- **Multi-Core Parallel Processing:** Leverage multiple CPU cores to scrub large files at blazing speed using `--parallel`.
|
|
12
|
+
- **Pre-Bundled Compliance Profiles:** Quickly target specific standards like `hipaa`, `pci-dss`, or `gdpr` using the `--profile` flag.
|
|
13
|
+
- **Compliance Auditing & Metric Reports:** Generate detailed JSON reports with statistics on redacted entities and execution time using `--report`.
|
|
14
|
+
- **High-Value Secret Detection:** Locates critical secrets such as AWS Access Keys, GitHub Tokens, and RSA Private Keys out of the box.
|
|
15
|
+
- **Deterministic Hashing:** Replace PII with deterministic SHA-256 hashes instead of generic tags to track uniqueness without leaking data.
|
|
16
|
+
- **Synthetic Data Generation:** Replace real PII with realistic "fake" data using the `faker` library (beta).
|
|
17
|
+
- **Configuration File Support:** Manage complex settings via `piiscrub.json` instead of long CLI commands.
|
|
18
|
+
- **Custom Pattern Injection:** Dynamically inject your own regex patterns and validators directly into the engine without modifying the core library.
|
|
19
|
+
- **Allowlist Support:** Explicitly bypass scrubbing for public figures, system emails, or company identifiers to prevent false positives.
|
|
20
|
+
|
|
21
|
+
## Supported Entities
|
|
22
|
+
|
|
23
|
+
- **Global:**
|
|
24
|
+
- `EMAIL`
|
|
25
|
+
- `PHONE_GENERIC` (international)
|
|
26
|
+
- `CREDIT_CARD` (13-16 digits with Luhn algorithm validation)
|
|
27
|
+
- `IPV4` (validation ensuring all octets <= 255)
|
|
28
|
+
- `IPV6`
|
|
29
|
+
- **US Specific:**
|
|
30
|
+
- `US_SSN`
|
|
31
|
+
- **India Specific:**
|
|
32
|
+
- `IN_AADHAAR` (12 digits, cannot start with 0 or 1)
|
|
33
|
+
- `IN_PAN` (5 uppercase letters, 4 digits, 1 uppercase letter)
|
|
34
|
+
- **Secrets & Credentials (V2):**
|
|
35
|
+
- `AWS_ACCESS_KEY`
|
|
36
|
+
- `GITHUB_TOKEN`
|
|
37
|
+
- `RSA_PRIVATE_KEY`
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install .
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## CLI Usage
|
|
46
|
+
|
|
47
|
+
### Extract PII
|
|
48
|
+
```bash
|
|
49
|
+
piiscrub extract --text "My email is test@example.com"
|
|
50
|
+
piiscrub extract --file text.txt
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Scrub PII
|
|
54
|
+
```bash
|
|
55
|
+
piiscrub scrub --text "My email is test@example.com"
|
|
56
|
+
piiscrub scrub --file text.txt
|
|
57
|
+
|
|
58
|
+
# Use deterministic hashing instead of standard tags
|
|
59
|
+
piiscrub scrub --text "My email is test@example.com" --style hash
|
|
60
|
+
# Output: My email is <EMAIL_a1517717>
|
|
61
|
+
|
|
62
|
+
# Bypass scrubbing for specific public strings
|
|
63
|
+
piiscrub scrub --text "Contact support@example.com or user@example.com" --allowlist support@example.com
|
|
64
|
+
# Output: Contact support@example.com or <EMAIL>
|
|
65
|
+
|
|
66
|
+
# Inject Custom Pattern from the CLI
|
|
67
|
+
piiscrub scrub --text "This is employee EMP-99881 and email a@b.com" --custom-pattern EMP_ID "\bEMP-\d{5}\b" --entities EMP_ID EMAIL
|
|
68
|
+
# Output: This is employee <EMP_ID> and email <EMAIL>
|
|
69
|
+
|
|
70
|
+
# Synthetic Data Generation
|
|
71
|
+
piiscrub scrub --text "Contact me at omkar@example.com" --style synthetic
|
|
72
|
+
# Output: Contact me at victoria12@gmail.com
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### Advanced Features
|
|
76
|
+
|
|
77
|
+
#### 1. Configuration File (`piiscrub.json`)
|
|
78
|
+
You can define a `piiscrub.json` file in your working directory to simplify your commands:
|
|
79
|
+
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"style": "hash",
|
|
83
|
+
"entities": ["EMAIL", "PHONE_GENERIC"],
|
|
84
|
+
"allowlist": ["support@mycompany.com"],
|
|
85
|
+
"custom_patterns": {
|
|
86
|
+
"ORDER_ID": "ORD-\\d{5}"
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Now just run:
|
|
92
|
+
```bash
|
|
93
|
+
piiscrub scrub --file data.txt
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
#### 2. Parallel Processing
|
|
97
|
+
For large files, use multi-core processing:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
piiscrub scrub --file large_dataset.txt --parallel --output cleaned.txt
|
|
101
|
+
```
|
|
102
|
+
> [!TIP]
|
|
103
|
+
> Parallel mode automatically handles file I/O efficiently and defaults to using all available CPU cores.
|
|
104
|
+
|
|
105
|
+
#### 3. Pre-Bundled Compliance Profiles
|
|
106
|
+
Quickly target common privacy standards without remembering every entity name:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Scrub only PCI-DSS related data (Credit Cards)
|
|
110
|
+
piiscrub scrub --file transactions.txt --profile pci-dss
|
|
111
|
+
|
|
112
|
+
# Scrub HIPAA related data (SSN, Phone, Email, IP)
|
|
113
|
+
piiscrub scrub --file medical_records.txt --profile hipaa
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Available profiles: `pci-dss`, `hipaa`, `gdpr`, `strict`.
|
|
117
|
+
|
|
118
|
+
#### 4. Compliance Auditing & Metric Reports
|
|
119
|
+
Data compliance teams can generate a statistical summary of the scrubbing process as proof of redaction:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
piiscrub scrub --file sensitive_data.txt --report audit.json
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
**Sample `audit.json` output:**
|
|
126
|
+
```json
|
|
127
|
+
{
|
|
128
|
+
"command": "scrub",
|
|
129
|
+
"total_lines_processed": 5000,
|
|
130
|
+
"execution_time_seconds": 1.25,
|
|
131
|
+
"entities_redacted": {
|
|
132
|
+
"EMAIL": 142,
|
|
133
|
+
"CREDIT_CARD": 12,
|
|
134
|
+
"PHONE_GENERIC": 5
|
|
135
|
+
},
|
|
136
|
+
"style": "tag"
|
|
137
|
+
}
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### Stream Processing
|
|
141
|
+
For extremely large files (e.g. LLM corpus data logs):
|
|
142
|
+
```bash
|
|
143
|
+
piiscrub scrub --file huge_dataset.jsonl --stream > scrubbed.jsonl
|
|
144
|
+
piiscrub extract --file huge_dataset.jsonl --stream > entities.json
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Library Usage
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from piiscrub.core import PiiScrub
|
|
151
|
+
import re
|
|
152
|
+
|
|
153
|
+
# Initialize with custom generic entities or pattern injection!
|
|
154
|
+
custom_patterns = {
|
|
155
|
+
"INTERNAL_ID": re.compile(r"\bEMP-\d{5}\b")
|
|
156
|
+
}
|
|
157
|
+
cs = PiiScrub(
|
|
158
|
+
entities=["EMAIL", "CREDIT_CARD", "INTERNAL_ID"],
|
|
159
|
+
custom_patterns=custom_patterns,
|
|
160
|
+
allowlist=["public@example.com"]
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
code = "Contact test@example.com for info on EMP-12345."
|
|
164
|
+
|
|
165
|
+
# Extract entities
|
|
166
|
+
extracted = cs.extract_entities(code)
|
|
167
|
+
print(extracted)
|
|
168
|
+
# {'EMAIL': ['test@example.com'], 'INTERNAL_ID': ['EMP-12345']}
|
|
169
|
+
|
|
170
|
+
# Scrub entities using hashing
|
|
171
|
+
scrubbed_code = cs.scrub_text(code, replacement_style="hash")
|
|
172
|
+
print(scrubbed_code)
|
|
173
|
+
# Contact <EMAIL_a1517717> for info on <INTERNAL_ID_b5fb38c3>.
|
|
174
|
+
```
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import re
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
from piiscrub.core import PiiScrub
|
|
8
|
+
|
|
9
|
+
def get_text_from_args(args) -> str:
    """Return the input text selected by the parsed CLI arguments.

    ``args.text`` is returned when it is not ``None``; otherwise the file
    named by ``args.file`` is read in full as UTF-8.

    Args:
        args: Parsed arguments exposing ``text`` and ``file`` attributes.

    Returns:
        The raw text to process.

    Raises:
        SystemExit: With status 1 (after printing a message to stderr) when
            the file cannot be opened, read, or decoded as UTF-8, or when
            neither ``text`` nor ``file`` is set.
    """
    if args.text is not None:
        return args.text

    if args.file is None:
        print("Error: Must provide either --text or --file.", file=sys.stderr)
        sys.exit(1)

    try:
        with open(args.file, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly or a non-UTF-8 file would surface as a traceback.
        print(f"Error reading file {args.file}: {e}", file=sys.stderr)
        sys.exit(1)
|
|
22
|
+
|
|
23
|
+
def load_config(config_path=None):
    """Load configuration from a JSON file.

    Args:
        config_path: Path to the configuration file. When falsy, the file
            ``piiscrub.json`` in the current working directory is used.

    Returns:
        The parsed configuration as a ``dict``. An empty dict is returned
        when the file does not exist, cannot be read or parsed, or does not
        contain a JSON object at the top level. Every failure except a
        missing *default* file is reported as a warning on stderr.
    """
    path = config_path or "piiscrub.json"

    if not os.path.exists(path):
        # A missing default file is normal; a missing file the caller asked
        # for explicitly is worth a warning rather than being ignored.
        if config_path:
            print(f"Warning: Config file {path} not found.", file=sys.stderr)
        return {}

    try:
        with open(path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: Could not load config file {path}: {e}", file=sys.stderr)
        return {}

    if not isinstance(config, dict):
        # Callers use config.get(...), so anything but an object is unusable.
        print(
            f"Warning: Could not load config file {path}: "
            "top-level JSON value must be an object",
            file=sys.stderr,
        )
        return {}

    return config
|
|
33
|
+
|
|
34
|
+
def _write_report(report_path, report):
    """Write the audit *report* dict to *report_path* as indented JSON."""
    with open(report_path, "w", encoding="utf-8") as f_rep:
        json.dump(report, f_rep, indent=4)


def main():
    """Entry point for the ``piiscrub`` command-line interface.

    Parses the ``extract`` / ``scrub`` subcommands, merges the options with
    the JSON config file (CLI values take precedence), builds a ``PiiScrub``
    engine and then runs one of three paths:

    * parallel scrub (``--parallel`` with ``scrub``): delegates to
      ``scrub_file_parallel`` and writes to ``--output`` or
      ``<file>.scrubbed``;
    * streaming (``--stream``): processes the open input file through
      ``extract_stream`` / ``scrub_stream``;
    * in-memory: loads the whole text and calls ``extract_entities`` /
      ``scrub_text``.

    When ``--report`` is given, a JSON audit report is written afterwards.

    Raises:
        SystemExit: With status 1 on an invalid CLI regex, on
            ``--stream``/``--parallel`` without ``--file``, or on an
            unreadable input file.
    """
    parser = argparse.ArgumentParser(description="PiiScrub - PII Scrubbing and Extraction Tool")
    subparsers = parser.add_subparsers(dest="command", help="Command to run: 'scrub' or 'extract'")
    subparsers.required = True

    # Common arguments for both scrub and extract
    parent_parser = argparse.ArgumentParser(add_help=False)
    group = parent_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--text", type=str, help="Raw text string to process")
    group.add_argument("--file", type=str, help="Path to text file to process")
    parent_parser.add_argument("--entities", type=str, nargs="+", help="Specific entities to target (e.g., EMAIL CREDIT_CARD)")
    parent_parser.add_argument("--allowlist", type=str, nargs="+", help="Specific strings to bypass scrubbing (e.g., support@example.com)")
    parent_parser.add_argument("--custom-pattern", nargs=2, action="append", metavar=("NAME", "REGEX"), help="Inject a custom regex pattern. Can be used multiple times.")
    parent_parser.add_argument("--stream", action="store_true", help="Process the file chunk-by-chunk.")
    parent_parser.add_argument("--parallel", action="store_true", help="Process the file in parallel using multiple cores.")
    parent_parser.add_argument("--config", type=str, help="Path to piiscrub.json configuration file.")
    parent_parser.add_argument("--report", type=str, help="Path to save the JSON audit report.")
    parent_parser.add_argument("--profile", type=str, help="Compliance profile to use (e.g., pci-dss, hipaa, gdpr, strict)")

    # Extract subcommand (no options beyond the shared ones)
    subparsers.add_parser("extract", parents=[parent_parser], help="Extract PII entities from text")

    # Scrub subcommand
    parser_scrub = subparsers.add_parser("scrub", parents=[parent_parser], help="Scrub PII entities from text")
    parser_scrub.add_argument("--style", type=str, choices=["tag", "redacted", "hash", "synthetic"], help="Replacement style: 'tag', 'redacted', 'hash', or 'synthetic'")
    parser_scrub.add_argument("--output", type=str, help="Output file path (recommended for large files or parallel mode)")

    args = parser.parse_args()

    # Load config file if present
    config = load_config(args.config)

    # Merge config with args (CLI args take precedence)
    entities = args.entities or config.get("entities")
    allowlist = args.allowlist or config.get("allowlist")
    parallel = args.parallel or config.get("parallel", False)
    profile = args.profile or config.get("profile")
    # `extract` has no --style option, hence getattr with a default.
    style = getattr(args, "style", None) or config.get("style", "tag")

    # Custom patterns from the CLI: an invalid regex is fatal.
    custom_patterns_dict = {}
    for name, pattern_str in args.custom_pattern or []:
        try:
            custom_patterns_dict[name] = re.compile(pattern_str)
        except re.error as e:
            print(f"Error compiling regex for {name}: {e}", file=sys.stderr)
            sys.exit(1)

    # Custom patterns from the config: CLI definitions of the same name win,
    # and an invalid regex is reported but skipped rather than aborting.
    for name, pattern_str in config.get("custom_patterns", {}).items():
        if name in custom_patterns_dict:
            continue
        try:
            custom_patterns_dict[name] = re.compile(pattern_str)
        except re.error as e:
            print(f"Error compiling config regex for {name}: {e}", file=sys.stderr)

    # Initialize Core Engine
    cs = PiiScrub(
        entities=entities,
        profile=profile,
        allowlist=allowlist,
        custom_patterns=custom_patterns_dict if custom_patterns_dict else None
    )

    if (args.stream or parallel) and not args.file:
        print("Error: --stream or --parallel requires --file.", file=sys.stderr)
        sys.exit(1)

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    total_lines = 0

    if parallel and args.command == "scrub":
        output_path = args.output or (args.file + ".scrubbed")
        print(f"Processing in parallel... saving to {output_path}")
        cs.scrub_file_parallel(args.file, output_path, replacement_style=style)
        # Line counts are not available on this path, so the report only
        # carries execution time and entity statistics.
        execution_time = time.perf_counter() - start_time
        if args.report:
            _write_report(args.report, {
                "command": args.command,
                "execution_time_seconds": round(execution_time, 4),
                "entities_redacted": cs.get_stats(),
                "style": style
            })
        return

    if args.stream:
        # Streaming logic for files
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                if args.command == "extract":
                    # NOTE: total_lines is not updated on this path, so the
                    # report shows 0 lines for streamed extraction.
                    results = cs.extract_stream(f)
                    print(json.dumps(results, indent=2))
                elif args.command == "scrub":
                    scrubbed_lines = cs.scrub_stream(f, replacement_style=style)
                    if args.output:
                        # Open the destination once and truncate it, matching
                        # the in-memory path, instead of re-opening it in
                        # append mode for every line.
                        with open(args.output, "w", encoding="utf-8") as f_out:
                            for scrubbed_line in scrubbed_lines:
                                total_lines += 1
                                f_out.write(scrubbed_line)
                    else:
                        for scrubbed_line in scrubbed_lines:
                            total_lines += 1
                            sys.stdout.write(scrubbed_line)
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is not an OSError; without it a non-UTF-8
            # input would end in a traceback.
            print(f"Error reading file {args.file}: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # Traditional in-memory logic
        text = get_text_from_args(args)
        total_lines = len(text.splitlines())

        if args.command == "extract":
            results = cs.extract_entities(text)
            print(json.dumps(results, indent=2))
        elif args.command == "scrub":
            result = cs.scrub_text(text, replacement_style=style)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f_out:
                    f_out.write(result)
            else:
                print(result)

    execution_time = time.perf_counter() - start_time
    if args.report:
        stats_key = "entities_found" if args.command == "extract" else "entities_redacted"
        report = {
            "command": args.command,
            "total_lines_processed": total_lines,
            "execution_time_seconds": round(execution_time, 4),
            stats_key: cs.get_stats(),
        }
        if args.command == "scrub":
            report["style"] = style
        _write_report(args.report, report)


if __name__ == "__main__":
    main()
|