piiscrub 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
piiscrub-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Omkar Pathak
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: piiscrub
3
+ Version: 0.1.0
4
+ Summary: A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII)
5
+ Author: Omkar Pathak
6
+ Project-URL: Homepage, https://github.com/OmkarPathak/cleanslate
7
+ Project-URL: Issues, https://github.com/OmkarPathak/cleanslate/issues
8
+ Keywords: pii,scrub,anonymization,privacy,llm,rag
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
18
+ Classifier: Topic :: Security
19
+ Requires-Python: >=3.9
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: faker
23
+ Dynamic: license-file
24
+
25
+ # PiiScrub
26
+
27
+ A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII) from datasets for LLM training and RAG pipelines.
28
+
29
+ ## Features
30
+
31
+ - **Maximum Speed & Zero Dependencies:** Relies exclusively on Python's standard library. No `pandas`, `spaCy`, or other heavy external packages.
32
+ - **Deterministic Validation:** Raw regex matches for high-risk entities (like credit cards and IPs) pass algorithmic checksums (e.g., Luhn algorithm, octet range checks) before being flagged to eliminate false positives.
33
+ - **Pre-compiled Regex:** All regular expressions are compiled at the module level using `re.compile()` for O(1) setup time during execution.
34
+ - **Large Dataset Streaming:** Features `scrub_stream` and `extract_stream` to process massive datasets chunk-by-chunk without hitting Out-Of-Memory limits.
35
+ - **Multi-Core Parallel Processing:** Leverage multiple CPU cores to scrub large files at blazing speed using `--parallel`.
36
+ - **Pre-Bundled Compliance Profiles:** Quickly target specific standards like `hipaa`, `pci-dss`, or `gdpr` using the `--profile` flag.
37
+ - **Compliance Auditing & Metric Reports:** Generate detailed JSON reports with statistics on redacted entities and execution time using `--report`.
38
+ - **High-Value Secret Detection:** Added parsing to locate critical assets like AWS Access Keys, GitHub Tokens, and RSA Private Keys out of the box.
39
+ - **Deterministic Hashing:** Replace PII with deterministic SHA-256 hashes instead of generic tags to track uniqueness without leaking data.
40
+ - **Synthetic Data Generation:** Replace real PII with realistic "fake" data using the `faker` library (beta).
41
+ - **Configuration File Support:** Manage complex settings via `piiscrub.json` instead of long CLI commands.
42
+ - **Custom Pattern Injection:** Dynamically inject your own regex patterns and validators directly into the engine without modifying the core library.
43
+ - **Allowlist Support:** Explicitly bypass scrubbing for public figures, system emails, or company identifiers to prevent false positives.
44
+
45
+ ## Supported Entities
46
+
47
+ - **Global:**
48
+ - `EMAIL`
49
+ - `PHONE_GENERIC` (international)
50
+ - `CREDIT_CARD` (13-16 digits with Luhn algorithm validation)
51
+ - `IPV4` (validation ensuring all octets <= 255)
52
+ - `IPV6`
53
+ - **US Specific:**
54
+ - `US_SSN`
55
+ - **India Specific:**
56
+ - `IN_AADHAAR` (12 digits, cannot start with 0 or 1)
57
+ - `IN_PAN` (5 uppercase letters, 4 digits, 1 uppercase letter)
58
+ - **Secrets & Credentials (V2):**
59
+ - `AWS_ACCESS_KEY`
60
+ - `GITHUB_TOKEN`
61
+ - `RSA_PRIVATE_KEY`
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install .
67
+ ```
68
+
69
+ ## CLI Usage
70
+
71
+ ### Extract PII
72
+ ```bash
73
+ piiscrub extract --text "My email is test@example.com"
74
+ piiscrub extract --file text.txt
75
+ ```
76
+
77
+ ### Scrub PII
78
+ ```bash
79
+ piiscrub scrub --text "My email is test@example.com"
80
+ piiscrub scrub --file text.txt
81
+
82
+ # Use deterministic hashing instead of standard tags
83
+ piiscrub scrub --text "My email is test@example.com" --style hash
84
+ # Output: My email is <EMAIL_a1517717>
85
+
86
+ # Bypass scrubbing for specific public strings
87
+ piiscrub scrub --text "Contact support@example.com or user@example.com" --allowlist support@example.com
88
+ # Output: Contact support@example.com or <EMAIL>
89
+
90
+ # Inject Custom Pattern from the CLI
91
+ piiscrub scrub --text "This is employee EMP-99881 and email a@b.com" --custom-pattern EMP_ID "\bEMP-\d{5}\b" --entities EMP_ID EMAIL
92
+ # Output: This is employee <EMP_ID> and email <EMAIL>
93
+
94
+ # Synthetic Data Generation
95
+ piiscrub scrub --text "Contact me at omkar@example.com" --style synthetic
96
+ # Output: Contact me at victoria12@gmail.com
97
+ ```
98
+
99
+ ### Advanced Features
100
+
101
+ #### 1. Configuration File (`piiscrub.json`)
102
+ You can define a `piiscrub.json` file in your working directory to simplify your commands:
103
+
104
+ ```json
105
+ {
106
+ "style": "hash",
107
+ "entities": ["EMAIL", "PHONE_GENERIC"],
108
+ "allowlist": ["support@mycompany.com"],
109
+ "custom_patterns": {
110
+ "ORDER_ID": "ORD-\\d{5}"
111
+ }
112
+ }
113
+ ```
114
+
115
+ Now just run:
116
+ ```bash
117
+ piiscrub scrub --file data.txt
118
+ ```
119
+
120
+ #### 2. Parallel Processing
121
+ For large files, use multi-core processing:
122
+
123
+ ```bash
124
+ piiscrub scrub --file large_dataset.txt --parallel --output cleaned.txt
125
+ ```
126
+ > [!TIP]
127
+ > Parallel mode automatically handles file I/O efficiently and defaults to using all available CPU cores.
128
+
129
+ #### 3. Pre-Bundled Compliance Profiles
130
+ Quickly target common privacy standards without remembering every entity name:
131
+
132
+ ```bash
133
+ # Scrub only PCI-DSS related data (Credit Cards)
134
+ piiscrub scrub --file transactions.txt --profile pci-dss
135
+
136
+ # Scrub HIPAA related data (SSN, Phone, Email, IP)
137
+ piiscrub scrub --file medical_records.txt --profile hipaa
138
+ ```
139
+
140
+ Available profiles: `pci-dss`, `hipaa`, `gdpr`, `strict`.
141
+
142
+ #### 4. Compliance Auditing & Metric Reports
143
+ Data compliance teams can generate a statistical summary of the scrubbing process as proof of redaction:
144
+
145
+ ```bash
146
+ piiscrub scrub --file sensitive_data.txt --report audit.json
147
+ ```
148
+
149
+ **Sample `audit.json` output:**
150
+ ```json
151
+ {
152
+ "command": "scrub",
153
+ "total_lines_processed": 5000,
154
+ "execution_time_seconds": 1.25,
155
+ "entities_redacted": {
156
+ "EMAIL": 142,
157
+ "CREDIT_CARD": 12,
158
+ "PHONE_GENERIC": 5
159
+ },
160
+ "style": "tag"
161
+ }
162
+ ```
163
+
164
+ ### Stream Processing
165
+ For extremely large files (e.g. LLM corpus data logs):
166
+ ```bash
167
+ piiscrub scrub --file huge_dataset.jsonl --stream > scrubbed.jsonl
168
+ piiscrub extract --file huge_dataset.jsonl --stream > entities.json
169
+ ```
170
+
171
+ ## Library Usage
172
+
173
+ ```python
174
+ from piiscrub.core import PiiScrub
175
+ import re
176
+
177
+ # Initialize with custom generic entities or pattern injection!
178
+ custom_patterns = {
179
+ "INTERNAL_ID": re.compile(r"\bEMP-\d{5}\b")
180
+ }
181
+ cs = PiiScrub(
182
+ entities=["EMAIL", "CREDIT_CARD", "INTERNAL_ID"],
183
+ custom_patterns=custom_patterns,
184
+ allowlist=["public@example.com"]
185
+ )
186
+
187
+ code = "Contact test@example.com for info on EMP-12345."
188
+
189
+ # Extract entities
190
+ extracted = cs.extract_entities(code)
191
+ print(extracted)
192
+ # {'EMAIL': ['test@example.com'], 'INTERNAL_ID': ['EMP-12345']}
193
+
194
+ # Scrub entities using hashing
195
+ scrubbed_code = cs.scrub_text(code, replacement_style="hash")
196
+ print(scrubbed_code)
197
+ # Contact <EMAIL_a1517717> for info on <INTERNAL_ID_b5fb38c3>.
198
+ ```
@@ -0,0 +1,174 @@
1
+ # PiiScrub
2
+
3
+ A blazing-fast, lightweight Python library and CLI tool designed to scrub Personally Identifiable Information (PII) from datasets for LLM training and RAG pipelines.
4
+
5
+ ## Features
6
+
7
+ - **Maximum Speed & Zero Dependencies:** Relies exclusively on Python's standard library. No `pandas`, `spaCy`, or other heavy external packages.
8
+ - **Deterministic Validation:** Raw regex matches for high-risk entities (like credit cards and IPs) pass algorithmic checksums (e.g., Luhn algorithm, octet range checks) before being flagged to eliminate false positives.
9
+ - **Pre-compiled Regex:** All regular expressions are compiled at the module level using `re.compile()` for O(1) setup time during execution.
10
+ - **Large Dataset Streaming:** Features `scrub_stream` and `extract_stream` to process massive datasets chunk-by-chunk without hitting Out-Of-Memory limits.
11
+ - **Multi-Core Parallel Processing:** Leverage multiple CPU cores to scrub large files at blazing speed using `--parallel`.
12
+ - **Pre-Bundled Compliance Profiles:** Quickly target specific standards like `hipaa`, `pci-dss`, or `gdpr` using the `--profile` flag.
13
+ - **Compliance Auditing & Metric Reports:** Generate detailed JSON reports with statistics on redacted entities and execution time using `--report`.
14
+ - **High-Value Secret Detection:** Added parsing to locate critical assets like AWS Access Keys, GitHub Tokens, and RSA Private Keys out of the box.
15
+ - **Deterministic Hashing:** Replace PII with deterministic SHA-256 hashes instead of generic tags to track uniqueness without leaking data.
16
+ - **Synthetic Data Generation:** Replace real PII with realistic "fake" data using the `faker` library (beta).
17
+ - **Configuration File Support:** Manage complex settings via `piiscrub.json` instead of long CLI commands.
18
+ - **Custom Pattern Injection:** Dynamically inject your own regex patterns and validators directly into the engine without modifying the core library.
19
+ - **Allowlist Support:** Explicitly bypass scrubbing for public figures, system emails, or company identifiers to prevent false positives.
20
+
21
+ ## Supported Entities
22
+
23
+ - **Global:**
24
+ - `EMAIL`
25
+ - `PHONE_GENERIC` (international)
26
+ - `CREDIT_CARD` (13-16 digits with Luhn algorithm validation)
27
+ - `IPV4` (validation ensuring all octets <= 255)
28
+ - `IPV6`
29
+ - **US Specific:**
30
+ - `US_SSN`
31
+ - **India Specific:**
32
+ - `IN_AADHAAR` (12 digits, cannot start with 0 or 1)
33
+ - `IN_PAN` (5 uppercase letters, 4 digits, 1 uppercase letter)
34
+ - **Secrets & Credentials (V2):**
35
+ - `AWS_ACCESS_KEY`
36
+ - `GITHUB_TOKEN`
37
+ - `RSA_PRIVATE_KEY`
38
+
39
+ ## Installation
40
+
41
+ ```bash
42
+ pip install .
43
+ ```
44
+
45
+ ## CLI Usage
46
+
47
+ ### Extract PII
48
+ ```bash
49
+ piiscrub extract --text "My email is test@example.com"
50
+ piiscrub extract --file text.txt
51
+ ```
52
+
53
+ ### Scrub PII
54
+ ```bash
55
+ piiscrub scrub --text "My email is test@example.com"
56
+ piiscrub scrub --file text.txt
57
+
58
+ # Use deterministic hashing instead of standard tags
59
+ piiscrub scrub --text "My email is test@example.com" --style hash
60
+ # Output: My email is <EMAIL_a1517717>
61
+
62
+ # Bypass scrubbing for specific public strings
63
+ piiscrub scrub --text "Contact support@example.com or user@example.com" --allowlist support@example.com
64
+ # Output: Contact support@example.com or <EMAIL>
65
+
66
+ # Inject Custom Pattern from the CLI
67
+ piiscrub scrub --text "This is employee EMP-99881 and email a@b.com" --custom-pattern EMP_ID "\bEMP-\d{5}\b" --entities EMP_ID EMAIL
68
+ # Output: This is employee <EMP_ID> and email <EMAIL>
69
+
70
+ # Synthetic Data Generation
71
+ piiscrub scrub --text "Contact me at omkar@example.com" --style synthetic
72
+ # Output: Contact me at victoria12@gmail.com
73
+ ```
74
+
75
+ ### Advanced Features
76
+
77
+ #### 1. Configuration File (`piiscrub.json`)
78
+ You can define a `piiscrub.json` file in your working directory to simplify your commands:
79
+
80
+ ```json
81
+ {
82
+ "style": "hash",
83
+ "entities": ["EMAIL", "PHONE_GENERIC"],
84
+ "allowlist": ["support@mycompany.com"],
85
+ "custom_patterns": {
86
+ "ORDER_ID": "ORD-\\d{5}"
87
+ }
88
+ }
89
+ ```
90
+
91
+ Now just run:
92
+ ```bash
93
+ piiscrub scrub --file data.txt
94
+ ```
95
+
96
+ #### 2. Parallel Processing
97
+ For large files, use multi-core processing:
98
+
99
+ ```bash
100
+ piiscrub scrub --file large_dataset.txt --parallel --output cleaned.txt
101
+ ```
102
+ > [!TIP]
103
+ > Parallel mode automatically handles file I/O efficiently and defaults to using all available CPU cores.
104
+
105
+ #### 3. Pre-Bundled Compliance Profiles
106
+ Quickly target common privacy standards without remembering every entity name:
107
+
108
+ ```bash
109
+ # Scrub only PCI-DSS related data (Credit Cards)
110
+ piiscrub scrub --file transactions.txt --profile pci-dss
111
+
112
+ # Scrub HIPAA related data (SSN, Phone, Email, IP)
113
+ piiscrub scrub --file medical_records.txt --profile hipaa
114
+ ```
115
+
116
+ Available profiles: `pci-dss`, `hipaa`, `gdpr`, `strict`.
117
+
118
+ #### 4. Compliance Auditing & Metric Reports
119
+ Data compliance teams can generate a statistical summary of the scrubbing process as proof of redaction:
120
+
121
+ ```bash
122
+ piiscrub scrub --file sensitive_data.txt --report audit.json
123
+ ```
124
+
125
+ **Sample `audit.json` output:**
126
+ ```json
127
+ {
128
+ "command": "scrub",
129
+ "total_lines_processed": 5000,
130
+ "execution_time_seconds": 1.25,
131
+ "entities_redacted": {
132
+ "EMAIL": 142,
133
+ "CREDIT_CARD": 12,
134
+ "PHONE_GENERIC": 5
135
+ },
136
+ "style": "tag"
137
+ }
138
+ ```
139
+
140
+ ### Stream Processing
141
+ For extremely large files (e.g. LLM corpus data logs):
142
+ ```bash
143
+ piiscrub scrub --file huge_dataset.jsonl --stream > scrubbed.jsonl
144
+ piiscrub extract --file huge_dataset.jsonl --stream > entities.json
145
+ ```
146
+
147
+ ## Library Usage
148
+
149
+ ```python
150
+ from piiscrub.core import PiiScrub
151
+ import re
152
+
153
+ # Initialize with custom generic entities or pattern injection!
154
+ custom_patterns = {
155
+ "INTERNAL_ID": re.compile(r"\bEMP-\d{5}\b")
156
+ }
157
+ cs = PiiScrub(
158
+ entities=["EMAIL", "CREDIT_CARD", "INTERNAL_ID"],
159
+ custom_patterns=custom_patterns,
160
+ allowlist=["public@example.com"]
161
+ )
162
+
163
+ code = "Contact test@example.com for info on EMP-12345."
164
+
165
+ # Extract entities
166
+ extracted = cs.extract_entities(code)
167
+ print(extracted)
168
+ # {'EMAIL': ['test@example.com'], 'INTERNAL_ID': ['EMP-12345']}
169
+
170
+ # Scrub entities using hashing
171
+ scrubbed_code = cs.scrub_text(code, replacement_style="hash")
172
+ print(scrubbed_code)
173
+ # Contact <EMAIL_a1517717> for info on <INTERNAL_ID_b5fb38c3>.
174
+ ```
@@ -0,0 +1,4 @@
1
+ """
2
+ PiiScrub - Fast, lightweight PII scrubbing library.
3
+ """
4
+ __version__ = "0.1.0"
@@ -0,0 +1,175 @@
1
+ import argparse
2
+ import sys
3
+ import re
4
+ import json
5
+ import os
6
+ import time
7
+ from piiscrub.core import PiiScrub
8
+
9
def get_text_from_args(args) -> str:
    """Return the input text for a command.

    Uses ``args.text`` verbatim when given; otherwise reads the whole file
    named by ``args.file``. Exits the process with status 1 on a read error
    or when neither source was supplied.
    """
    if args.text is not None:
        return args.text
    if args.file is None:
        print("Error: Must provide either --text or --file.", file=sys.stderr)
        sys.exit(1)
    try:
        with open(args.file, "r", encoding="utf-8") as f:
            return f.read()
    except IOError as e:
        print(f"Error reading file {args.file}: {e}", file=sys.stderr)
        sys.exit(1)
22
+
23
def load_config(config_path=None):
    """Load configuration from a JSON file.

    Falls back to ``piiscrub.json`` in the current directory when no path is
    given. Returns an empty dict when the file is absent or cannot be parsed
    (a warning is printed to stderr in the unparseable case).
    """
    path = config_path if config_path else "piiscrub.json"
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        print(f"Warning: Could not load config file {path}: {e}", file=sys.stderr)
        return {}
33
+
34
def main():
    """CLI entry point.

    Parses arguments for the ``scrub``/``extract`` subcommands, merges them
    with any ``piiscrub.json`` config (CLI flags win), builds a PiiScrub
    engine, and dispatches to the parallel, streaming, or in-memory path.
    Optionally writes a JSON audit report via ``--report``.
    """
    parser = argparse.ArgumentParser(description="PiiScrub - PII Scrubbing and Extraction Tool")
    subparsers = parser.add_subparsers(dest="command", help="Command to run: 'scrub' or 'extract'")
    subparsers.required = True

    # Common arguments shared by both subcommands.
    parent_parser = argparse.ArgumentParser(add_help=False)
    group = parent_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--text", type=str, help="Raw text string to process")
    group.add_argument("--file", type=str, help="Path to text file to process")
    parent_parser.add_argument("--entities", type=str, nargs="+", help="Specific entities to target (e.g., EMAIL CREDIT_CARD)")
    parent_parser.add_argument("--allowlist", type=str, nargs="+", help="Specific strings to bypass scrubbing (e.g., support@example.com)")
    parent_parser.add_argument("--custom-pattern", nargs=2, action="append", metavar=("NAME", "REGEX"), help="Inject a custom regex pattern. Can be used multiple times.")
    parent_parser.add_argument("--stream", action="store_true", help="Process the file chunk-by-chunk.")
    parent_parser.add_argument("--parallel", action="store_true", help="Process the file in parallel using multiple cores.")
    parent_parser.add_argument("--config", type=str, help="Path to piiscrub.json configuration file.")
    parent_parser.add_argument("--report", type=str, help="Path to save the JSON audit report.")
    parent_parser.add_argument("--profile", type=str, help="Compliance profile to use (e.g., pci-dss, hipaa, gdpr, strict)")

    # Extract subcommand (common arguments only).
    subparsers.add_parser("extract", parents=[parent_parser], help="Extract PII entities from text")

    # Scrub subcommand adds replacement style and output destination.
    parser_scrub = subparsers.add_parser("scrub", parents=[parent_parser], help="Scrub PII entities from text")
    parser_scrub.add_argument("--style", type=str, choices=["tag", "redacted", "hash", "synthetic"], help="Replacement style: 'tag', 'redacted', 'hash', or 'synthetic'")
    parser_scrub.add_argument("--output", type=str, help="Output file path (recommended for large files or parallel mode)")

    args = parser.parse_args()

    # Load config file if present.
    config = load_config(args.config)

    # Merge config with args (CLI args take precedence).
    entities = args.entities or config.get("entities")
    allowlist = args.allowlist or config.get("allowlist")
    parallel = args.parallel or config.get("parallel", False)
    profile = args.profile or config.get("profile")
    # --style only exists on the scrub subparser, hence getattr.
    style = (getattr(args, "style", None) or config.get("style", "tag"))

    # Compile custom patterns supplied on the CLI; a bad regex is fatal.
    custom_patterns_dict = {}
    if args.custom_pattern:
        for name, pattern_str in args.custom_pattern:
            try:
                custom_patterns_dict[name] = re.compile(pattern_str)
            except re.error as e:
                print(f"Error compiling regex for {name}: {e}", file=sys.stderr)
                sys.exit(1)

    # Merge custom patterns from config; CLI-provided names win, and a bad
    # config regex is only a warning (the CLI ones were already validated).
    config_patterns = config.get("custom_patterns", {})
    for name, pattern_str in config_patterns.items():
        if name not in custom_patterns_dict:
            try:
                custom_patterns_dict[name] = re.compile(pattern_str)
            except re.error as e:
                print(f"Error compiling config regex for {name}: {e}", file=sys.stderr)

    # Initialize Core Engine.
    cs = PiiScrub(
        entities=entities,
        profile=profile,
        allowlist=allowlist,
        custom_patterns=custom_patterns_dict if custom_patterns_dict else None
    )

    if (args.stream or parallel) and not args.file:
        print("Error: --stream or --parallel requires --file.", file=sys.stderr)
        sys.exit(1)

    start_time = time.time()
    total_lines = 0

    if parallel and args.command == "scrub":
        output_path = args.output or (args.file + ".scrubbed")
        print(f"Processing in parallel... saving to {output_path}")
        cs.scrub_file_parallel(args.file, output_path, replacement_style=style)
        # Line count is not tracked in parallel mode; the report carries
        # execution time and redaction stats only.
        execution_time = time.time() - start_time
        if args.report:
            report = {
                "command": args.command,
                "execution_time_seconds": round(execution_time, 4),
                "entities_redacted": cs.get_stats(),
                "style": style
            }
            with open(args.report, "w", encoding="utf-8") as f_rep:
                json.dump(report, f_rep, indent=4)
        return

    if args.stream:
        # Streaming logic for files.
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                if args.command == "extract":
                    results = cs.extract_stream(f)
                    print(json.dumps(results, indent=2))
                elif args.command == "scrub":
                    # Fix: open the destination ONCE in write mode. The
                    # previous code re-opened it in append mode per line,
                    # which both accumulated stale output across runs and
                    # paid an open/close per scrubbed line.
                    out = open(args.output, "w", encoding="utf-8") if args.output else sys.stdout
                    try:
                        for scrubbed_line in cs.scrub_stream(f, replacement_style=style):
                            total_lines += 1
                            out.write(scrubbed_line)
                    finally:
                        if out is not sys.stdout:
                            out.close()
        except IOError as e:
            print(f"Error reading file {args.file}: {e}", file=sys.stderr)
            sys.exit(1)
    else:
        # Traditional in-memory logic.
        text = get_text_from_args(args)
        total_lines = len(text.splitlines())

        if args.command == "extract":
            results = cs.extract_entities(text)
            print(json.dumps(results, indent=2))

        elif args.command == "scrub":
            result = cs.scrub_text(text, replacement_style=style)
            if args.output:
                with open(args.output, "w", encoding="utf-8") as f_out:
                    f_out.write(result)
            else:
                print(result)

    execution_time = time.time() - start_time
    if args.report:
        # Key name reflects the command: extraction "finds", scrubbing "redacts".
        report = {
            "command": args.command,
            "total_lines_processed": total_lines,
            "execution_time_seconds": round(execution_time, 4),
            "entities_found" if args.command == "extract" else "entities_redacted": cs.get_stats(),
        }
        if args.command == "scrub":
            report["style"] = style

        with open(args.report, "w", encoding="utf-8") as f_rep:
            json.dump(report, f_rep, indent=4)
173
+
174
+ if __name__ == "__main__":
175
+ main()