lethe-cli 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lethe_cli-0.1.1/.gitignore +11 -0
- lethe_cli-0.1.1/LICENSE +21 -0
- lethe_cli-0.1.1/PKG-INFO +78 -0
- lethe_cli-0.1.1/README.md +45 -0
- lethe_cli-0.1.1/docs/architecture.md +157 -0
- lethe_cli-0.1.1/docs/cli.md +111 -0
- lethe_cli-0.1.1/docs/index.html +1854 -0
- lethe_cli-0.1.1/docs/use-cases.md +122 -0
- lethe_cli-0.1.1/pyproject.toml +51 -0
- lethe_cli-0.1.1/spec/architecture-overview.svg +90 -0
- lethe_cli-0.1.1/spec/diagrams.html +357 -0
- lethe_cli-0.1.1/spec/mapping-engine.svg +159 -0
- lethe_cli-0.1.1/spec/pii-detection.svg +88 -0
- lethe_cli-0.1.1/src/lethe/__init__.py +3 -0
- lethe_cli-0.1.1/src/lethe/cli.py +138 -0
- lethe_cli-0.1.1/src/lethe/config.py +27 -0
- lethe_cli-0.1.1/src/lethe/mapping/__init__.py +0 -0
- lethe_cli-0.1.1/src/lethe/mapping/session_index.py +65 -0
- lethe_cli-0.1.1/src/lethe/multiplier.py +102 -0
- lethe_cli-0.1.1/src/lethe/multiply_pipeline.py +69 -0
- lethe_cli-0.1.1/src/lethe/parsers/__init__.py +96 -0
- lethe_cli-0.1.1/src/lethe/parsers/base.py +17 -0
- lethe_cli-0.1.1/src/lethe/parsers/csv_parser.py +39 -0
- lethe_cli-0.1.1/src/lethe/parsers/txt_parser.py +108 -0
- lethe_cli-0.1.1/src/lethe/pipeline.py +89 -0
- lethe_cli-0.1.1/src/lethe/replacer/__init__.py +0 -0
- lethe_cli-0.1.1/src/lethe/replacer/engine.py +33 -0
- lethe_cli-0.1.1/src/lethe/replacer/freeform.py +57 -0
- lethe_cli-0.1.1/src/lethe/sanitizer.py +100 -0
- lethe_cli-0.1.1/src/lethe/scanner/__init__.py +0 -0
- lethe_cli-0.1.1/src/lethe/scanner/column_heuristics.py +52 -0
- lethe_cli-0.1.1/src/lethe/scanner/confidence.py +46 -0
- lethe_cli-0.1.1/src/lethe/scanner/engine.py +103 -0
- lethe_cli-0.1.1/src/lethe/scanner/pattern_recognizers.py +50 -0
- lethe_cli-0.1.1/tests/__init__.py +0 -0
- lethe_cli-0.1.1/tests/conftest.py +32 -0
- lethe_cli-0.1.1/tests/fixtures/emails.txt +5 -0
- lethe_cli-0.1.1/tests/fixtures/freeform.txt +13 -0
- lethe_cli-0.1.1/tests/fixtures/sample.tsv +6 -0
- lethe_cli-0.1.1/tests/fixtures/sample_customers.csv +6 -0
- lethe_cli-0.1.1/tests/fixtures/sample_customers_multiplied.csv +21 -0
- lethe_cli-0.1.1/tests/fixtures/sample_orders.csv +5 -0
- lethe_cli-0.1.1/tests/fixtures/single.csv +2 -0
- lethe_cli-0.1.1/tests/test_cli.py +104 -0
- lethe_cli-0.1.1/tests/test_format_detection.py +51 -0
- lethe_cli-0.1.1/tests/test_freeform_replacer.py +65 -0
- lethe_cli-0.1.1/tests/test_mapping/__init__.py +0 -0
- lethe_cli-0.1.1/tests/test_mapping/test_session_index.py +40 -0
- lethe_cli-0.1.1/tests/test_multiply.py +204 -0
- lethe_cli-0.1.1/tests/test_pipeline.py +37 -0
- lethe_cli-0.1.1/tests/test_replacer/__init__.py +0 -0
- lethe_cli-0.1.1/tests/test_replacer/test_engine.py +60 -0
- lethe_cli-0.1.1/tests/test_scanner/__init__.py +0 -0
- lethe_cli-0.1.1/tests/test_scanner/test_column_heuristics.py +51 -0
- lethe_cli-0.1.1/tests/test_scanner/test_confidence.py +49 -0
- lethe_cli-0.1.1/tests/test_txt_parser.py +138 -0
lethe_cli-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Marco Kotrotsos
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
lethe_cli-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lethe-cli
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Data anonymization CLI tool
|
|
5
|
+
Author: Marco Kotrotsos
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Environment :: Console
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Security
|
|
18
|
+
Classifier: Topic :: Utilities
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Requires-Dist: faker<35,>=25.0
|
|
21
|
+
Requires-Dist: pandas<3,>=2.0
|
|
22
|
+
Requires-Dist: presidio-analyzer<3,>=2.2
|
|
23
|
+
Requires-Dist: presidio-anonymizer<3,>=2.2
|
|
24
|
+
Requires-Dist: rich<14,>=13.0
|
|
25
|
+
Requires-Dist: spacy<4,>=3.7
|
|
26
|
+
Requires-Dist: typer[all]<1,>=0.12
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
30
|
+
Provides-Extra: trf
|
|
31
|
+
Requires-Dist: spacy[transformers]; extra == 'trf'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Lethe
|
|
35
|
+
|
|
36
|
+
Data anonymization CLI for structured files. Detect and replace PII in CSV, TSV, and plain text using Presidio and spaCy NER, with Faker-generated replacements that stay consistent across your dataset.
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install lethe-cli
|
|
42
|
+
python -m spacy download en_core_web_trf
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
For a faster, lighter model instead of the transformer:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
python -m spacy download en_core_web_sm
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Usage
|
|
52
|
+
|
|
53
|
+
### Anonymize
|
|
54
|
+
|
|
55
|
+
Replace detected PII with consistent fake values:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
lethe anonymize data.csv -o anonymized.csv
|
|
59
|
+
lethe anonymize data.csv --model sm --threshold 0.7
|
|
60
|
+
lethe anonymize notes.txt -o clean.txt --locale nl_NL
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Multiply
|
|
64
|
+
|
|
65
|
+
Generate synthetic rows from an existing dataset:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
lethe multiply data.csv --factor 5 -o expanded.csv
|
|
69
|
+
lethe multiply data.csv --factor 10 --sanitize --seed 42
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Options
|
|
73
|
+
|
|
74
|
+
Run `lethe anonymize --help` or `lethe multiply --help` for the full list of options.
|
|
75
|
+
|
|
76
|
+
## License
|
|
77
|
+
|
|
78
|
+
MIT
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Lethe
|
|
2
|
+
|
|
3
|
+
Data anonymization CLI for structured files. Detect and replace PII in CSV, TSV, and plain text using Presidio and spaCy NER, with Faker-generated replacements that stay consistent across your dataset.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install lethe-cli
|
|
9
|
+
python -m spacy download en_core_web_trf
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
For a faster, lighter model instead of the transformer:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
python -m spacy download en_core_web_sm
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
### Anonymize
|
|
21
|
+
|
|
22
|
+
Replace detected PII with consistent fake values:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
lethe anonymize data.csv -o anonymized.csv
|
|
26
|
+
lethe anonymize data.csv --model sm --threshold 0.7
|
|
27
|
+
lethe anonymize notes.txt -o clean.txt --locale nl_NL
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Multiply
|
|
31
|
+
|
|
32
|
+
Generate synthetic rows from an existing dataset:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
lethe multiply data.csv --factor 5 -o expanded.csv
|
|
36
|
+
lethe multiply data.csv --factor 10 --sanitize --seed 42
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Options
|
|
40
|
+
|
|
41
|
+
Run `lethe anonymize --help` or `lethe multiply --help` for the full list of options.
|
|
42
|
+
|
|
43
|
+
## License
|
|
44
|
+
|
|
45
|
+
MIT
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Architecture and Inner Workings
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
Lethe processes CSV files through a four-stage pipeline: **Parse**, **Scan**, **Replace**, **Write**. Each stage is designed around a single principle: process columns, not rows. Structured data has PII concentrated in specific columns (names, emails, SSNs), while other columns (IDs, timestamps, statuses) can be skipped entirely. This column-first approach avoids wasting NLP resources on data that is obviously not PII.
|
|
6
|
+
|
|
7
|
+
## Pipeline
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
Input CSV
|
|
11
|
+
|
|
|
12
|
+
v
|
|
13
|
+
CsvReader.read_chunks() # Stream rows in chunks of N (default 5000)
|
|
14
|
+
|
|
|
15
|
+
v (for each chunk)
|
|
16
|
+
PiiScanner.scan_chunk() # Analyze each column independently
|
|
17
|
+
|
|
|
18
|
+
v
|
|
19
|
+
Replacer.replace_chunk() # Swap PII cells with consistent fakes
|
|
20
|
+
|
|
|
21
|
+
v
|
|
22
|
+
CsvWriter.write_chunk() # Append to output file
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
The pipeline is orchestrated by `pipeline.py`, which wires these stages together and displays a progress bar via Rich.
|
|
26
|
+
|
|
27
|
+
## Chunked Streaming
|
|
28
|
+
|
|
29
|
+
Large files are never loaded entirely into memory. The `CsvReader` uses pandas' `chunksize` parameter to yield DataFrames of N rows at a time. The `CsvWriter` appends each processed chunk to the output file. This means Lethe can handle files larger than available RAM; the memory footprint is roughly proportional to `chunk_size * number_of_columns`.
|
|
30
|
+
|
|
31
|
+
## PII Scanner
|
|
32
|
+
|
|
33
|
+
The scanner (`scanner/engine.py`) is the core of Lethe. It wraps Presidio's `AnalyzerEngine` and adds two layers on top: column heuristics and confidence boosting. For each column in a chunk, the scanner runs three strategies:
|
|
34
|
+
|
|
35
|
+
### Strategy 1: Column Heuristics
|
|
36
|
+
|
|
37
|
+
Before running any NLP, the scanner examines the column name. A column named `first_name` is almost certainly a person's name. A column named `id` or `created_at` is almost certainly not PII.
|
|
38
|
+
|
|
39
|
+
The heuristic classifier (`scanner/column_heuristics.py`) returns one of three outcomes:
|
|
40
|
+
|
|
41
|
+
- **SKIP**: The column is definitely not PII (IDs, timestamps, booleans, statuses, amounts). The scanner skips the entire column, saving all NLP computation.
|
|
42
|
+
- **PII type hint**: The column name matches a known PII pattern (e.g., `email` -> `EMAIL_ADDRESS`, `phone` -> `PHONE_NUMBER`). This hint is used later to boost confidence scores.
|
|
43
|
+
- **None**: The column name is ambiguous. The scanner relies entirely on NLP and regex detection.
|
|
44
|
+
|
|
45
|
+
**Skip patterns** cover:
|
|
46
|
+
- Identifiers: `id`, `uuid`, `pk`, `key`, `guid`
|
|
47
|
+
- Timestamps: `created_at`, `updated_on`, `timestamp`, `modified_date`
|
|
48
|
+
- Booleans: `is_active`, `has_permission`, `enabled`
|
|
49
|
+
- Numerics: `count`, `total`, `amount`, `price`, `score`, `age`
|
|
50
|
+
- Statuses: `status`, `state`, `type`, `category`, `role`
|
|
51
|
+
|
|
52
|
+
**PII hint patterns** cover:
|
|
53
|
+
- `PERSON`: name, first_name, last_name, surname, customer_name
|
|
54
|
+
- `EMAIL_ADDRESS`: email, mail, email_addr
|
|
55
|
+
- `PHONE_NUMBER`: phone, mobile, cell, tel, fax
|
|
56
|
+
- `LOCATION`: address, street, city, zip, postal, country
|
|
57
|
+
- `US_SSN`: ssn, social_security
|
|
58
|
+
- `CREDIT_CARD`: credit_card, card_num, cc_num
|
|
59
|
+
- `IBAN_CODE`: iban
|
|
60
|
+
- `IP_ADDRESS`: ip_addr, ip, remote_ip, client_ip
|
|
61
|
+
- And more (driver's licenses, passports, dates of birth)
|
|
62
|
+
|
|
63
|
+
### Strategy 2: Presidio Analysis
|
|
64
|
+
|
|
65
|
+
For each non-skipped cell, the scanner calls Presidio's `AnalyzerEngine.analyze()`. Presidio combines two detection methods internally:
|
|
66
|
+
|
|
67
|
+
1. **Regex pattern recognizers**: Built-in patterns for emails, phone numbers, credit cards, SSNs, and more. Lethe also registers three custom recognizers for IBAN codes, UK National Insurance Numbers (NINO), and Dutch BSN numbers.
|
|
68
|
+
|
|
69
|
+
2. **NER (Named Entity Recognition)**: A spaCy NLP model identifies entities like person names and locations that cannot be caught by regex. The model choice (`trf` vs `sm`) affects accuracy here:
|
|
70
|
+
- `en_core_web_trf`: Transformer-based, highest accuracy, slower. Best for production use where you cannot afford to miss PII.
|
|
71
|
+
- `en_core_web_sm`: Small CNN-based model, faster, slightly less accurate on ambiguous names.
|
|
72
|
+
|
|
73
|
+
Each recognizer returns results with a confidence score between 0 and 1.
|
|
74
|
+
|
|
75
|
+
### Strategy 3: Confidence Boosting
|
|
76
|
+
|
|
77
|
+
After Presidio returns its results, the confidence module (`scanner/confidence.py`) applies two adjustments based on the column heuristic:
|
|
78
|
+
|
|
79
|
+
1. **Boost matching results**: If the heuristic said this column is `PHONE_NUMBER` and Presidio also detected `PHONE_NUMBER`, the score gets boosted by 0.25 (capped at 1.0). This pushes borderline detections over the threshold.
|
|
80
|
+
|
|
81
|
+
2. **Synthesize missing results**: If the heuristic said `PERSON` but Presidio returned nothing (common for unusual names), the system synthesizes a result with a score of 0.4. The column name itself is strong evidence, this prevents names from slipping through because the NER model did not recognize them.
|
|
82
|
+
|
|
83
|
+
### Result Selection
|
|
84
|
+
|
|
85
|
+
After boosting, the scanner picks the highest-scoring result per cell. If that score meets or exceeds the configured threshold (default 0.35), the cell is marked as PII with its detected entity type.
|
|
86
|
+
|
|
87
|
+
## Session Index and Mapping
|
|
88
|
+
|
|
89
|
+
The `SessionIndex` (`mapping/session_index.py`) is a dictionary that maps `(entity_type, original_value)` tuples to fake replacements. When the replacer encounters "John Smith" in a `PERSON` column, it checks the index:
|
|
90
|
+
|
|
91
|
+
- **First encounter**: Generate a fake name via Faker (e.g., "Nicholas Quinn"), store the mapping, return the fake.
|
|
92
|
+
- **Subsequent encounters**: Return "Nicholas Quinn" immediately — an O(1) lookup.
|
|
93
|
+
|
|
94
|
+
This guarantees **cross-row and cross-table consistency**. If "John Smith" appears in both `customers.csv` and `orders.csv` (processed in the same session), it maps to the same fake name. This preserves referential integrity: foreign key relationships still work in the anonymized data.
|
|
95
|
+
|
|
96
|
+
Each entity type maps to a specific Faker generator:
|
|
97
|
+
|
|
98
|
+
| Entity Type | Faker Method | Example Output |
|
|
99
|
+
|---|---|---|
|
|
100
|
+
| `PERSON` | `name()` | "Nicholas Quinn" |
|
|
101
|
+
| `EMAIL_ADDRESS` | `email()` | "sanchezmichelle@example.net" |
|
|
102
|
+
| `PHONE_NUMBER` | `phone_number()` | "+1-555-384-2918" |
|
|
103
|
+
| `LOCATION` | `address()` | "742 Pine Street, Apt 3" |
|
|
104
|
+
| `US_SSN` | `ssn()` | "746-44-8807" |
|
|
105
|
+
| `CREDIT_CARD` | `credit_card_number()` | "4532015112830366" |
|
|
106
|
+
| `IP_ADDRESS` | `ipv4()` | "192.168.42.7" |
|
|
107
|
+
| `IBAN_CODE` | `iban()` | "DE89370400440532013000" |
|
|
108
|
+
| `DATE_TIME` | `date()` | "1994-07-15" |
|
|
109
|
+
|
|
110
|
+
The `--locale` option controls the style of generated data. With `nl_NL`, Faker produces Dutch names and addresses. With `de_DE`, German ones.
|
|
111
|
+
|
|
112
|
+
The `--seed` option makes the Faker output deterministic. Same seed + same input = identical output, useful for reproducible test pipelines.
|
|
113
|
+
|
|
114
|
+
## Replacer
|
|
115
|
+
|
|
116
|
+
The `Replacer` (`replacer/engine.py`) is intentionally simple. For each cell flagged as PII by the scanner, it:
|
|
117
|
+
|
|
118
|
+
1. Reads the original cell value
|
|
119
|
+
2. Calls `SessionIndex.get_or_create(entity_type, original)`
|
|
120
|
+
3. Writes the fake value back
|
|
121
|
+
|
|
122
|
+
This is **whole-cell replacement**: the entire cell content is swapped. This design is correct for structured CSV data where PII typically occupies the full cell (a name is the whole cell, not embedded in a sentence). Free-text span replacement, where PII is embedded in longer text, is planned for a future phase.
|
|
123
|
+
|
|
124
|
+
## Module Map
|
|
125
|
+
|
|
126
|
+
```
|
|
127
|
+
src/lethe/
|
|
128
|
+
config.py # LetheConfig frozen dataclass
|
|
129
|
+
pipeline.py # Orchestrator: ties all stages together
|
|
130
|
+
cli.py # Typer CLI, thin wrapper over pipeline
|
|
131
|
+
parsers/
|
|
132
|
+
base.py # ChunkedReader / ChunkedWriter protocols
|
|
133
|
+
csv_parser.py # pandas-based CSV I/O with chunking
|
|
134
|
+
scanner/
|
|
135
|
+
engine.py # PiiScanner: Presidio + heuristics + boosting
|
|
136
|
+
column_heuristics.py # Column name -> SKIP / PII type / None
|
|
137
|
+
pattern_recognizers.py # Custom regex: IBAN, UK NINO, NL BSN
|
|
138
|
+
confidence.py # Score boosting and result synthesis
|
|
139
|
+
mapping/
|
|
140
|
+
session_index.py # (entity_type, original) -> fake, via Faker
|
|
141
|
+
replacer/
|
|
142
|
+
engine.py # Whole-cell swap using session index
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Design Decisions
|
|
146
|
+
|
|
147
|
+
**Why build on Presidio, not around it?**
|
|
148
|
+
Presidio already handles entity resolution, deduplication, and spaCy NER integration. Reimplementing these would be error-prone and hard to maintain. Instead, Lethe registers custom recognizers into Presidio's `RecognizerRegistry` and lets Presidio handle the heavy lifting.
|
|
149
|
+
|
|
150
|
+
**Why not use Presidio's AnonymizerEngine?**
|
|
151
|
+
Presidio's anonymizer is designed for free-text spans, replacing "John" within "Dear John, your order is ready." Our data is structured CSV where PII is the entire cell. Our `Replacer` + `SessionIndex` approach is simpler, gives cross-table consistency for free, and avoids span-offset arithmetic.
|
|
152
|
+
|
|
153
|
+
**Why column-first processing?**
|
|
154
|
+
In structured data, PII type is determined by the column, not the row. All values in a `first_name` column are names. By processing column-by-column, we can skip entire columns via heuristics (saving all NLP cost for that column) and use the column name as a confidence signal.
|
|
155
|
+
|
|
156
|
+
**Why `en_core_web_trf` by default?**
|
|
157
|
+
Accuracy is the top priority. Missing PII in production data is worse than being slow. The transformer model catches ambiguous names and locations that the small model misses. For development and testing, `--model sm` provides a fast alternative.
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# CLI Reference
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install -e .
|
|
7
|
+
python -m spacy download en_core_web_sm
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
For the transformer model (higher accuracy, slower):
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install -e ".[trf]"
|
|
14
|
+
python -m spacy download en_core_web_trf
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Commands
|
|
18
|
+
|
|
19
|
+
### `lethe anonymize`
|
|
20
|
+
|
|
21
|
+
Anonymize PII in a CSV file.
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
lethe anonymize <INPUT_FILE> [OPTIONS]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**Arguments:**
|
|
28
|
+
|
|
29
|
+
| Argument | Description |
|
|
30
|
+
|---|---|
|
|
31
|
+
| `INPUT_FILE` | Path to the CSV file to anonymize. Must exist and be readable. |
|
|
32
|
+
|
|
33
|
+
**Options:**
|
|
34
|
+
|
|
35
|
+
| Option | Short | Default | Description |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| `--output` | `-o` | `<input>_anonymized.csv` | Output file path. When omitted, appends `_anonymized` to the input filename. |
|
|
38
|
+
| `--model` | `-m` | `trf` | NLP model to use. `trf` = transformer (accurate), `sm` = small (fast). |
|
|
39
|
+
| `--threshold` | `-t` | `0.35` | Minimum confidence score to classify a cell as PII. Lower values catch more PII but risk false positives. Higher values are more conservative. |
|
|
40
|
+
| `--chunk-size` | | `5000` | Number of rows per processing chunk. Controls memory usage for large files. |
|
|
41
|
+
| `--locale` | | `en_US` | Faker locale for generating replacement values. Affects name styles, address formats, phone patterns. |
|
|
42
|
+
| `--seed` | | None | Random seed for reproducible output. Same seed + same input = identical anonymized output. |
|
|
43
|
+
|
|
44
|
+
## Examples
|
|
45
|
+
|
|
46
|
+
### Basic usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
lethe anonymize customers.csv
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Output: `customers_anonymized.csv` in the same directory.
|
|
53
|
+
|
|
54
|
+
### Specify output path
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
lethe anonymize customers.csv -o /tmp/safe_customers.csv
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Fast mode with small model
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
lethe anonymize customers.csv --model sm
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Uses `en_core_web_sm` instead of the transformer model. Significantly faster, slightly less accurate on ambiguous names and locations.
|
|
67
|
+
|
|
68
|
+
### Reproducible output
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
lethe anonymize customers.csv --seed 42
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Running this twice on the same input produces identical output. Useful for deterministic test pipelines.
|
|
75
|
+
|
|
76
|
+
### Tuning detection sensitivity
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Aggressive: catch anything that might be PII
|
|
80
|
+
lethe anonymize customers.csv --threshold 0.2
|
|
81
|
+
|
|
82
|
+
# Conservative: only replace high-confidence PII
|
|
83
|
+
lethe anonymize customers.csv --threshold 0.7
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Large files with limited memory
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
lethe anonymize huge_dataset.csv --chunk-size 1000
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Processes 1000 rows at a time instead of the default 5000. Lower values use less memory; higher values are slightly faster.
|
|
93
|
+
|
|
94
|
+
### Dutch locale for replacement data
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
lethe anonymize klanten.csv --locale nl_NL
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Generated fake names, addresses, and phone numbers will follow Dutch conventions.
|
|
101
|
+
|
|
102
|
+
### Combining options
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
lethe anonymize data/export.csv \
|
|
106
|
+
-o data/export_safe.csv \
|
|
107
|
+
--model sm \
|
|
108
|
+
--threshold 0.3 \
|
|
109
|
+
--locale de_DE \
|
|
110
|
+
--seed 123
|
|
111
|
+
```
|