nophi 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nophi-0.1.0/LICENSE +21 -0
- nophi-0.1.0/PKG-INFO +263 -0
- nophi-0.1.0/README.md +235 -0
- nophi-0.1.0/nophi/__init__.py +17 -0
- nophi-0.1.0/nophi/__main__.py +7 -0
- nophi-0.1.0/nophi/analyzer.py +164 -0
- nophi-0.1.0/nophi/cli.py +422 -0
- nophi-0.1.0/nophi/data/__init__.py +0 -0
- nophi-0.1.0/nophi/data/medical_terms.py +203 -0
- nophi-0.1.0/nophi/data/rxnorm_names.txt.gz +0 -0
- nophi-0.1.0/nophi/handlers/__init__.py +0 -0
- nophi-0.1.0/nophi/handlers/common.py +29 -0
- nophi-0.1.0/nophi/handlers/docx.py +116 -0
- nophi-0.1.0/nophi/handlers/pdf.py +247 -0
- nophi-0.1.0/nophi/handlers/text.py +117 -0
- nophi-0.1.0/nophi/handlers/xlsx.py +80 -0
- nophi-0.1.0/nophi/models.py +247 -0
- nophi-0.1.0/nophi/recognizers.py +272 -0
- nophi-0.1.0/nophi/redactor.py +97 -0
- nophi-0.1.0/nophi/reporter.py +105 -0
- nophi-0.1.0/nophi.egg-info/PKG-INFO +263 -0
- nophi-0.1.0/nophi.egg-info/SOURCES.txt +26 -0
- nophi-0.1.0/nophi.egg-info/dependency_links.txt +1 -0
- nophi-0.1.0/nophi.egg-info/entry_points.txt +2 -0
- nophi-0.1.0/nophi.egg-info/requires.txt +11 -0
- nophi-0.1.0/nophi.egg-info/top_level.txt +1 -0
- nophi-0.1.0/pyproject.toml +44 -0
- nophi-0.1.0/setup.cfg +4 -0
nophi-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 no-phi contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
nophi-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nophi
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Detect and redact PHI/PII from documents (.txt, .csv, .docx, .xlsx, .pdf)
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/kshen3778/no-phi
|
|
7
|
+
Project-URL: Repository, https://github.com/kshen3778/no-phi
|
|
8
|
+
Keywords: phi,pii,redaction,anonymization,healthcare,privacy
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Topic :: Text Processing
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: typer[all]<1.0,>=0.26
|
|
17
|
+
Requires-Dist: click<9.0,>=8.4
|
|
18
|
+
Requires-Dist: rich<16.0,>=15.0
|
|
19
|
+
Requires-Dist: presidio-analyzer>=2.2.362
|
|
20
|
+
Requires-Dist: presidio-anonymizer>=2.2.362
|
|
21
|
+
Requires-Dist: spacy<4.0,>=3.8
|
|
22
|
+
Requires-Dist: python-docx<2.0,>=1.2
|
|
23
|
+
Requires-Dist: openpyxl<4.0,>=3.1
|
|
24
|
+
Requires-Dist: pymupdf<2.0,>=1.27
|
|
25
|
+
Requires-Dist: certifi>=2024.0
|
|
26
|
+
Requires-Dist: drug-named-entity-recognition<3.0,>=2.0.9
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# no-phi
|
|
30
|
+
|
|
31
|
+
A command-line tool for detecting and redacting **PHI/PII** (protected health
|
|
32
|
+
information / personally identifiable information) from documents. It reads
|
|
33
|
+
`.txt`, `.csv`, `.docx`, `.xlsx`, and `.pdf` files, finds personal data, writes
|
|
34
|
+
redacted copies, and produces an Excel findings report.
|
|
35
|
+
|
|
36
|
+
It is tuned for **healthcare documents**: a layer of biomedical recognizers
|
|
37
|
+
suppresses the false positives that general-purpose NER produces on clinical
|
|
38
|
+
text (e.g. tagging a drug name like *Perindopril* as a PERSON, or *Cardiology*
|
|
39
|
+
as an ORGANIZATION).
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Scan a file or folder, write redacted copies + phi_report.xlsx
|
|
43
|
+
python main.py scan report.pdf
|
|
44
|
+
python main.py scan ./records/ --output ./records_cleaned/
|
|
45
|
+
|
|
46
|
+
# Detect only, don't write redacted files
|
|
47
|
+
python main.py scan report.docx --dry-run
|
|
48
|
+
|
|
49
|
+
# Restrict to specific entity types
|
|
50
|
+
python main.py scan data.csv --entities PERSON,PHONE_NUMBER,US_SSN
|
|
51
|
+
|
|
52
|
+
# Map detected values to stable IDs instead of <ENTITY_TYPE> (CSV cols: id,mapped_id)
|
|
53
|
+
python main.py scan notes.txt --mappings mappings.csv
|
|
54
|
+
|
|
55
|
+
# Ignore known-safe values (.txt/.csv/.xlsx/.json) — not redacted or reported
|
|
56
|
+
python main.py scan ./records/ --exclude allowlist.txt
|
|
57
|
+
|
|
58
|
+
# Pre-download all NLP models (otherwise downloaded on first scan)
|
|
59
|
+
python main.py download-models
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Pipeline
|
|
65
|
+
|
|
66
|
+
Every file flows through four stages: **extract → recognize → redact →
|
|
67
|
+
report**. The tools used at each stage are listed below.
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
71
|
+
file ─────► │ 1. EXTRACT ├──►│ 2. RECOGNIZE├──►│ 3. REDACT ├──►│ 4. REPORT │
|
|
72
|
+
│ text + │ │ PII spans │ │ anonymize/ │ │ Excel │
|
|
73
|
+
│ positions │ │ (Presidio) │ │ black-box │ │ findings │
|
|
74
|
+
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
The CLI orchestration lives in [nophi/cli.py](nophi/cli.py): it collects input
|
|
78
|
+
files ([`_collect_files`](nophi/cli.py)), dispatches each by extension to a
|
|
79
|
+
handler in [nophi/handlers/](nophi/handlers/), aggregates findings, and prints a
|
|
80
|
+
Rich summary table. ([main.py](main.py) is a thin shim that calls into it.)
|
|
81
|
+
|
|
82
|
+
| Layer | Package / tool |
|
|
83
|
+
| --- | --- |
|
|
84
|
+
| CLI, options, sub-commands | [Typer](https://typer.tiangolo.com/) |
|
|
85
|
+
| Terminal progress bars & tables | [Rich](https://rich.readthedocs.io/) |
|
|
86
|
+
| Entity detection engine | [Presidio Analyzer](https://microsoft.github.io/presidio/) |
|
|
87
|
+
| Anonymization engine | [Presidio Anonymizer](https://microsoft.github.io/presidio/) |
|
|
88
|
+
| General NER backend | [spaCy](https://spacy.io/) `en_core_web_lg` |
|
|
89
|
+
| Biomedical NER | [scispaCy](https://allenai.github.io/scispacy/) `en_ner_bc5cdr_md`, `en_ner_bionlp13cg_md` |
|
|
90
|
+
| Drug-name matching | [drug-named-entity-recognition](https://pypi.org/project/drug-named-entity-recognition/) (DrugBank) + bundled [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/) name list |
|
|
91
|
+
| Report output | [openpyxl](https://openpyxl.readthedocs.io/) |
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
### 1. Extract — text + positions
|
|
96
|
+
|
|
97
|
+
Each file type has a handler in [nophi/handlers/](nophi/handlers/) that pulls out
|
|
98
|
+
the text to scan. For formats with layout (PDF), it also tracks where each piece
|
|
99
|
+
of text sits so redactions can be placed precisely.
|
|
100
|
+
|
|
101
|
+
| Type | Handler | Library | Notes |
|
|
102
|
+
| --- | --- | --- | --- |
|
|
103
|
+
| `.txt` | [text.py](nophi/handlers/text.py) | stdlib | Whole file read as one string. |
|
|
104
|
+
| `.csv` | [text.py](nophi/handlers/text.py) | `csv` | Dialect auto-sniffed; scanned **per cell**. |
|
|
105
|
+
| `.docx` | [docx.py](nophi/handlers/docx.py) | `python-docx` | Each paragraph and table cell. |
|
|
106
|
+
| `.xlsx` | [xlsx.py](nophi/handlers/xlsx.py) | `openpyxl` | Every string cell across all sheets. |
|
|
107
|
+
| `.pdf` | [pdf.py](nophi/handlers/pdf.py) | `PyMuPDF` | Words + bounding boxes via `get_text("words")`, reassembled into text with a char-offset → word-box map. |
|
|
108
|
+
|
|
109
|
+
### 2. Recognize — find PII spans
|
|
110
|
+
|
|
111
|
+
[nophi/analyzer.py](nophi/analyzer.py) builds a Presidio `AnalyzerEngine` (backed
|
|
112
|
+
by the spaCy `en_core_web_lg` model) and exposes [`scan_text`](nophi/analyzer.py),
|
|
113
|
+
which returns the detected entities with character offsets and confidence scores.
|
|
114
|
+
|
|
115
|
+
Detection comes from three sources working together:
|
|
116
|
+
|
|
117
|
+
- **Presidio built-ins** — spaCy NER for `PERSON`, `ORGANIZATION`, `LOCATION`,
|
|
118
|
+
`DATE_TIME`, `NRP`, plus pattern/checksum recognizers for `PHONE_NUMBER`,
|
|
119
|
+
`EMAIL_ADDRESS`, `CREDIT_CARD`, `US_SSN`, `IBAN_CODE`, `IP_ADDRESS`, `URL`,
|
|
120
|
+
`MEDICAL_LICENSE`, and other structured identifiers.
|
|
121
|
+
|
|
122
|
+
- **Custom biomedical recognizers** ([nophi/recognizers.py](nophi/recognizers.py)) — these
|
|
123
|
+
do **not** add PII. They recognize medical vocabulary and tag it with the
|
|
124
|
+
internal type `MEDICAL_TERM`, which is used to *protect* that text from being
|
|
125
|
+
scrubbed — **not** to redact it.
|
|
126
|
+
|
|
127
|
+
They form **four complementary layers**, each catching what the others miss
|
|
128
|
+
(a curated deny-list, two drug-name lists, and an ML model). All four emit
|
|
129
|
+
`MEDICAL_TERM`:
|
|
130
|
+
|
|
131
|
+
| # | Recognizer | Backed by | Catches | Matching |
|
|
132
|
+
| --- | --- | --- | --- | --- |
|
|
133
|
+
| 1 | `MedicalTermRecognizer` | deny-list in [nophi/data/medical_terms.py](nophi/data/medical_terms.py) | Hospital departments, specialties, wards, symptoms, diagnoses, procedures, labs/imaging, shorthand. | Exact (case-insensitive) |
|
|
134
|
+
| 2 | `MedicationRecognizer` | `drug-named-entity-recognition` (DrugBank) | Drug names, incl. common misspellings. | Fuzzy |
|
|
135
|
+
| 3 | `RxNormRecognizer` | bundled RxNorm name list ([nophi/data/rxnorm_names.txt.gz](nophi/data/rxnorm_names.txt.gz)) | Drug brand + ingredient names from RxNorm (incl. many vitamins/minerals under their ingredient names). | Exact n-gram |
|
|
136
|
+
| 4 | `BiomedicalNerRecognizer` | scispaCy `en_ner_bc5cdr_md` + `en_ner_bionlp13cg_md` | Chemicals, diseases, anatomy, genes, organisms, tissues — recognized **by ML context**, so it catches substances in no list. | ML model |
|
|
137
|
+
|
|
138
|
+
Layers 2–4 overlap on purpose: the two drug lists give high-precision exact/fuzzy
|
|
139
|
+
hits, and the ML layer is the backstop for substances not in any list. Coverage
|
|
140
|
+
of supplements/vitamins is therefore good for clinical/ingredient names
|
|
141
|
+
(e.g. *ascorbic acid*, *cholecalciferol*) but thinner for lay/botanical names
|
|
142
|
+
(e.g. *fish oil*, *ginkgo biloba*); the ML layer is the main net for those.
|
|
143
|
+
|
|
144
|
+
**Suppression logic** in [`scan_text`](nophi/analyzer.py): any
|
|
145
|
+
`PERSON` / `ORGANIZATION` / `NRP` / `LOCATION` detection that overlaps a
|
|
146
|
+
`MEDICAL_TERM` span is dropped, and the `MEDICAL_TERM` spans themselves are
|
|
147
|
+
removed from the output (they are not PII). The net effect is that genuine
|
|
148
|
+
names/places survive while clinical vocabulary stops being mislabeled as
|
|
149
|
+
identifiers.
|
|
150
|
+
|
|
151
|
+
- **`StreetAddressRecognizer`** ([nophi/recognizers.py](nophi/recognizers.py)) — unlike the
|
|
152
|
+
biomedical recognizers, this one *adds* PII that Presidio's defaults miss.
|
|
153
|
+
spaCy NER tags cities/regions (`Scarborough`) but not street lines, so a
|
|
154
|
+
regex matches a house number + 1–3 street-name words + a known street-type
|
|
155
|
+
suffix (`Rd`, `Street`, `Ave`, `Blvd`, `Dr`, …) and reports it as `LOCATION`.
|
|
156
|
+
It handles bare addresses (`2867 Ellesmere Rd`) as well as full ones, plus
|
|
157
|
+
alphanumeric house numbers (`221B`) and ordinal street names (`350 5th
|
|
158
|
+
Avenue`). Requiring a leading number keeps it from matching a `Dr.` title, and
|
|
159
|
+
requiring a street-type suffix keeps it from matching dosages like `100 mg tablet`.
|
|
160
|
+
|
|
161
|
+
### 3. Redact — anonymize or black-box
|
|
162
|
+
|
|
163
|
+
[nophi/redactor.py](nophi/redactor.py) builds a Presidio `AnonymizerEngine` and the
|
|
164
|
+
operator set used to replace each entity. By default an entity becomes
|
|
165
|
+
`<ENTITY_TYPE>`; with `--mappings` (CSV columns `id,mapped_id`), a detection whose
|
|
166
|
+
text matches an `id` is replaced by its `mapped_id` instead (token-overlap match,
|
|
167
|
+
applied across all entity types so it works regardless of how Presidio classified
|
|
168
|
+
the value). The `--exclude` option takes a `.txt`/`.csv`/`.xlsx`/`.json` list of
|
|
169
|
+
values to ignore — any detection matching one (case-insensitive) is dropped in
|
|
170
|
+
[`scan_text`](nophi/analyzer.py) before redaction or reporting.
|
|
171
|
+
|
|
172
|
+
How the replacement is applied depends on the format:
|
|
173
|
+
|
|
174
|
+
- **`.txt` / `.csv`** — Presidio rewrites the string in place
|
|
175
|
+
([`anonymize_text`](nophi/redactor.py)).
|
|
176
|
+
- **`.docx`** — the anonymized text is written back into the paragraph/cell,
|
|
177
|
+
preserving document structure.
|
|
178
|
+
- **`.xlsx`** — matching cell values are overwritten.
|
|
179
|
+
- **`.pdf`** — each detected span is mapped back to the exact word bounding
|
|
180
|
+
boxes it covers; `page.add_redact_annot()` draws a filled black box with a
|
|
181
|
+
short white label, and `page.apply_redactions()` **permanently removes** the
|
|
182
|
+
underlying text from the PDF content stream (a true irreversible redaction,
|
|
183
|
+
not just a visual cover).
|
|
184
|
+
|
|
185
|
+
### 4. Report — Excel findings
|
|
186
|
+
|
|
187
|
+
[nophi/reporter.py](nophi/reporter.py) writes an `openpyxl` workbook (default
|
|
188
|
+
`phi_report.xlsx`) with two sheets:
|
|
189
|
+
|
|
190
|
+
- **Findings** — one row per detection: file, entity type, original text,
|
|
191
|
+
replacement, character position.
|
|
192
|
+
- **Summary** — entity-type counts and per-file PHI counts.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Models & first run
|
|
197
|
+
|
|
198
|
+
The NLP models are **downloaded on first use and cached** under
|
|
199
|
+
`~/.cache/no-phi/models/` — they are not bundled into the program.
|
|
200
|
+
[nophi/models.py](nophi/models.py) handles fetching, extracting, and
|
|
201
|
+
(for the scispaCy models) patching them to load under the installed spaCy
|
|
202
|
+
version.
|
|
203
|
+
|
|
204
|
+
| Model | Size | Source |
|
|
205
|
+
| --- | --- | --- |
|
|
206
|
+
| `en_core_web_lg` (base NER) | ~560 MB | spaCy GitHub releases (pip wheel) |
|
|
207
|
+
| `en_ner_bc5cdr_md` | ~115 MB | scispaCy S3 release (`.tar.gz`) |
|
|
208
|
+
| `en_ner_bionlp13cg_md` | ~120 MB | scispaCy S3 release (`.tar.gz`) |
|
|
209
|
+
|
|
210
|
+
> The scispaCy biomedical models load with **plain spaCy** — the heavyweight
|
|
211
|
+
> `scispacy` package (and its `nmslib`/`scipy`/`scikit-learn` dependencies) is
|
|
212
|
+
> **not** required. [nophi/models.py](nophi/models.py) rewrites a stale boolean
|
|
213
|
+
> in each model's `config.cfg` during extraction so it validates under spaCy 3.8.
|
|
214
|
+
|
|
215
|
+
Run `python main.py download-models` to fetch everything ahead of time, or just
|
|
216
|
+
run a scan and the models download automatically on first invocation.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Project layout
|
|
221
|
+
|
|
222
|
+
```
|
|
223
|
+
pyproject.toml # packaging + dependencies + `nophi` console entry point
|
|
224
|
+
main.py # thin entry-point shim → nophi.cli:main (used by Nuitka build)
|
|
225
|
+
nophi/ # the package
|
|
226
|
+
├── __main__.py # enables `python -m nophi`
|
|
227
|
+
├── cli.py # Typer app + orchestration
|
|
228
|
+
├── analyzer.py # build_analyzer() + scan_text() (detection)
|
|
229
|
+
├── recognizers.py # custom recognizers (MEDICAL_TERM layers + StreetAddressRecognizer)
|
|
230
|
+
├── redactor.py # anonymization
|
|
231
|
+
├── reporter.py # Excel findings report
|
|
232
|
+
├── models.py # model download / cache
|
|
233
|
+
├── handlers/ # per-format read/write/redact (text, docx, xlsx, pdf)
|
|
234
|
+
└── data/ # medical_terms.py + bundled rxnorm_names.txt.gz
|
|
235
|
+
scripts/ # build_rxnorm_list.py (refreshes the bundled RxNorm list)
|
|
236
|
+
docs/ # expansion_notes.md (user guide lives in the repo-root docs/)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Install
|
|
242
|
+
|
|
243
|
+
### As a pip package (recommended)
|
|
244
|
+
|
|
245
|
+
Installs a `nophi` command on your PATH:
|
|
246
|
+
|
|
247
|
+
```bash
|
|
248
|
+
pip install . # or `pip install nophi` once published to PyPI
|
|
249
|
+
nophi download-models # one-time: fetch all NLP models (~800 MB total)
|
|
250
|
+
nophi scan report.pdf
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
You can also run it without installing the script via `python -m nophi scan ...`.
|
|
254
|
+
|
|
255
|
+
### For development
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
pip install -e . # editable install (deps come from pyproject.toml)
|
|
259
|
+
# or: pip install -r requirements.txt
|
|
260
|
+
python main.py download-models # optional: pre-fetch models
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
See [the user guide](../../docs/nophi-user-guide.md) for end-user instructions.
|
nophi-0.1.0/README.md
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# no-phi
|
|
2
|
+
|
|
3
|
+
A command-line tool for detecting and redacting **PHI/PII** (protected health
|
|
4
|
+
information / personally identifiable information) from documents. It reads
|
|
5
|
+
`.txt`, `.csv`, `.docx`, `.xlsx`, and `.pdf` files, finds personal data, writes
|
|
6
|
+
redacted copies, and produces an Excel findings report.
|
|
7
|
+
|
|
8
|
+
It is tuned for **healthcare documents**: a layer of biomedical recognizers
|
|
9
|
+
suppresses the false positives that general-purpose NER produces on clinical
|
|
10
|
+
text (e.g. tagging a drug name like *Perindopril* as a PERSON, or *Cardiology*
|
|
11
|
+
as an ORGANIZATION).
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Scan a file or folder, write redacted copies + phi_report.xlsx
|
|
15
|
+
python main.py scan report.pdf
|
|
16
|
+
python main.py scan ./records/ --output ./records_cleaned/
|
|
17
|
+
|
|
18
|
+
# Detect only, don't write redacted files
|
|
19
|
+
python main.py scan report.docx --dry-run
|
|
20
|
+
|
|
21
|
+
# Restrict to specific entity types
|
|
22
|
+
python main.py scan data.csv --entities PERSON,PHONE_NUMBER,US_SSN
|
|
23
|
+
|
|
24
|
+
# Map detected values to stable IDs instead of <ENTITY_TYPE> (CSV cols: id,mapped_id)
|
|
25
|
+
python main.py scan notes.txt --mappings mappings.csv
|
|
26
|
+
|
|
27
|
+
# Ignore known-safe values (.txt/.csv/.xlsx/.json) — not redacted or reported
|
|
28
|
+
python main.py scan ./records/ --exclude allowlist.txt
|
|
29
|
+
|
|
30
|
+
# Pre-download all NLP models (otherwise downloaded on first scan)
|
|
31
|
+
python main.py download-models
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Pipeline
|
|
37
|
+
|
|
38
|
+
Every file flows through four stages: **extract → recognize → redact →
|
|
39
|
+
report**. The tools used at each stage are listed below.
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
43
|
+
file ─────► │ 1. EXTRACT ├──►│ 2. RECOGNIZE├──►│ 3. REDACT ├──►│ 4. REPORT │
|
|
44
|
+
│ text + │ │ PII spans │ │ anonymize/ │ │ Excel │
|
|
45
|
+
│ positions │ │ (Presidio) │ │ black-box │ │ findings │
|
|
46
|
+
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
The CLI orchestration lives in [nophi/cli.py](nophi/cli.py): it collects input
|
|
50
|
+
files ([`_collect_files`](nophi/cli.py)), dispatches each by extension to a
|
|
51
|
+
handler in [nophi/handlers/](nophi/handlers/), aggregates findings, and prints a
|
|
52
|
+
Rich summary table. ([main.py](main.py) is a thin shim that calls into it.)
|
|
53
|
+
|
|
54
|
+
| Layer | Package / tool |
|
|
55
|
+
| --- | --- |
|
|
56
|
+
| CLI, options, sub-commands | [Typer](https://typer.tiangolo.com/) |
|
|
57
|
+
| Terminal progress bars & tables | [Rich](https://rich.readthedocs.io/) |
|
|
58
|
+
| Entity detection engine | [Presidio Analyzer](https://microsoft.github.io/presidio/) |
|
|
59
|
+
| Anonymization engine | [Presidio Anonymizer](https://microsoft.github.io/presidio/) |
|
|
60
|
+
| General NER backend | [spaCy](https://spacy.io/) `en_core_web_lg` |
|
|
61
|
+
| Biomedical NER | [scispaCy](https://allenai.github.io/scispacy/) `en_ner_bc5cdr_md`, `en_ner_bionlp13cg_md` |
|
|
62
|
+
| Drug-name matching | [drug-named-entity-recognition](https://pypi.org/project/drug-named-entity-recognition/) (DrugBank) + bundled [RxNorm](https://www.nlm.nih.gov/research/umls/rxnorm/) name list |
|
|
63
|
+
| Report output | [openpyxl](https://openpyxl.readthedocs.io/) |
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
### 1. Extract — text + positions
|
|
68
|
+
|
|
69
|
+
Each file type has a handler in [nophi/handlers/](nophi/handlers/) that pulls out
|
|
70
|
+
the text to scan. For formats with layout (PDF), it also tracks where each piece
|
|
71
|
+
of text sits so redactions can be placed precisely.
|
|
72
|
+
|
|
73
|
+
| Type | Handler | Library | Notes |
|
|
74
|
+
| --- | --- | --- | --- |
|
|
75
|
+
| `.txt` | [text.py](nophi/handlers/text.py) | stdlib | Whole file read as one string. |
|
|
76
|
+
| `.csv` | [text.py](nophi/handlers/text.py) | `csv` | Dialect auto-sniffed; scanned **per cell**. |
|
|
77
|
+
| `.docx` | [docx.py](nophi/handlers/docx.py) | `python-docx` | Each paragraph and table cell. |
|
|
78
|
+
| `.xlsx` | [xlsx.py](nophi/handlers/xlsx.py) | `openpyxl` | Every string cell across all sheets. |
|
|
79
|
+
| `.pdf` | [pdf.py](nophi/handlers/pdf.py) | `PyMuPDF` | Words + bounding boxes via `get_text("words")`, reassembled into text with a char-offset → word-box map. |
|
|
80
|
+
|
|
81
|
+
### 2. Recognize — find PII spans
|
|
82
|
+
|
|
83
|
+
[nophi/analyzer.py](nophi/analyzer.py) builds a Presidio `AnalyzerEngine` (backed
|
|
84
|
+
by the spaCy `en_core_web_lg` model) and exposes [`scan_text`](nophi/analyzer.py),
|
|
85
|
+
which returns the detected entities with character offsets and confidence scores.
|
|
86
|
+
|
|
87
|
+
Detection comes from three sources working together:
|
|
88
|
+
|
|
89
|
+
- **Presidio built-ins** — spaCy NER for `PERSON`, `ORGANIZATION`, `LOCATION`,
|
|
90
|
+
`DATE_TIME`, `NRP`, plus pattern/checksum recognizers for `PHONE_NUMBER`,
|
|
91
|
+
`EMAIL_ADDRESS`, `CREDIT_CARD`, `US_SSN`, `IBAN_CODE`, `IP_ADDRESS`, `URL`,
|
|
92
|
+
`MEDICAL_LICENSE`, and other structured identifiers.
|
|
93
|
+
|
|
94
|
+
- **Custom biomedical recognizers** ([nophi/recognizers.py](nophi/recognizers.py)) — these
|
|
95
|
+
do **not** add PII. They recognize medical vocabulary and tag it with the
|
|
96
|
+
internal type `MEDICAL_TERM`, which is used to *protect* that text from being
|
|
97
|
+
scrubbed — **not** to redact it.
|
|
98
|
+
|
|
99
|
+
They form **four complementary layers**, each catching what the others miss
|
|
100
|
+
(a curated deny-list, two drug-name lists, and an ML model). All four emit
|
|
101
|
+
`MEDICAL_TERM`:
|
|
102
|
+
|
|
103
|
+
| # | Recognizer | Backed by | Catches | Matching |
|
|
104
|
+
| --- | --- | --- | --- | --- |
|
|
105
|
+
| 1 | `MedicalTermRecognizer` | deny-list in [nophi/data/medical_terms.py](nophi/data/medical_terms.py) | Hospital departments, specialties, wards, symptoms, diagnoses, procedures, labs/imaging, shorthand. | Exact (case-insensitive) |
|
|
106
|
+
| 2 | `MedicationRecognizer` | `drug-named-entity-recognition` (DrugBank) | Drug names, incl. common misspellings. | Fuzzy |
|
|
107
|
+
| 3 | `RxNormRecognizer` | bundled RxNorm name list ([nophi/data/rxnorm_names.txt.gz](nophi/data/rxnorm_names.txt.gz)) | Drug brand + ingredient names from RxNorm (incl. many vitamins/minerals under their ingredient names). | Exact n-gram |
|
|
108
|
+
| 4 | `BiomedicalNerRecognizer` | scispaCy `en_ner_bc5cdr_md` + `en_ner_bionlp13cg_md` | Chemicals, diseases, anatomy, genes, organisms, tissues — recognized **by ML context**, so it catches substances in no list. | ML model |
|
|
109
|
+
|
|
110
|
+
Layers 2–4 overlap on purpose: the two drug lists give high-precision exact/fuzzy
|
|
111
|
+
hits, and the ML layer is the backstop for substances not in any list. Coverage
|
|
112
|
+
of supplements/vitamins is therefore good for clinical/ingredient names
|
|
113
|
+
(e.g. *ascorbic acid*, *cholecalciferol*) but thinner for lay/botanical names
|
|
114
|
+
(e.g. *fish oil*, *ginkgo biloba*); the ML layer is the main net for those.
|
|
115
|
+
|
|
116
|
+
**Suppression logic** in [`scan_text`](nophi/analyzer.py): any
|
|
117
|
+
`PERSON` / `ORGANIZATION` / `NRP` / `LOCATION` detection that overlaps a
|
|
118
|
+
`MEDICAL_TERM` span is dropped, and the `MEDICAL_TERM` spans themselves are
|
|
119
|
+
removed from the output (they are not PII). The net effect is that genuine
|
|
120
|
+
names/places survive while clinical vocabulary stops being mislabeled as
|
|
121
|
+
identifiers.
|
|
122
|
+
|
|
123
|
+
- **`StreetAddressRecognizer`** ([nophi/recognizers.py](nophi/recognizers.py)) — unlike the
|
|
124
|
+
biomedical recognizers, this one *adds* PII that Presidio's defaults miss.
|
|
125
|
+
spaCy NER tags cities/regions (`Scarborough`) but not street lines, so a
|
|
126
|
+
regex matches a house number + 1–3 street-name words + a known street-type
|
|
127
|
+
suffix (`Rd`, `Street`, `Ave`, `Blvd`, `Dr`, …) and reports it as `LOCATION`.
|
|
128
|
+
It handles bare addresses (`2867 Ellesmere Rd`) as well as full ones, plus
|
|
129
|
+
alphanumeric house numbers (`221B`) and ordinal street names (`350 5th
|
|
130
|
+
Avenue`). Requiring a leading number keeps it from matching a `Dr.` title, and
|
|
131
|
+
requiring a street-type suffix keeps it from matching dosages like `100 mg tablet`.
|
|
132
|
+
|
|
133
|
+
### 3. Redact — anonymize or black-box
|
|
134
|
+
|
|
135
|
+
[nophi/redactor.py](nophi/redactor.py) builds a Presidio `AnonymizerEngine` and the
|
|
136
|
+
operator set used to replace each entity. By default an entity becomes
|
|
137
|
+
`<ENTITY_TYPE>`; with `--mappings` (CSV columns `id,mapped_id`), a detection whose
|
|
138
|
+
text matches an `id` is replaced by its `mapped_id` instead (token-overlap match,
|
|
139
|
+
applied across all entity types so it works regardless of how Presidio classified
|
|
140
|
+
the value). The `--exclude` option takes a `.txt`/`.csv`/`.xlsx`/`.json` list of
|
|
141
|
+
values to ignore — any detection matching one (case-insensitive) is dropped in
|
|
142
|
+
[`scan_text`](nophi/analyzer.py) before redaction or reporting.
|
|
143
|
+
|
|
144
|
+
How the replacement is applied depends on the format:
|
|
145
|
+
|
|
146
|
+
- **`.txt` / `.csv`** — Presidio rewrites the string in place
|
|
147
|
+
([`anonymize_text`](nophi/redactor.py)).
|
|
148
|
+
- **`.docx`** — the anonymized text is written back into the paragraph/cell,
|
|
149
|
+
preserving document structure.
|
|
150
|
+
- **`.xlsx`** — matching cell values are overwritten.
|
|
151
|
+
- **`.pdf`** — each detected span is mapped back to the exact word bounding
|
|
152
|
+
boxes it covers; `page.add_redact_annot()` draws a filled black box with a
|
|
153
|
+
short white label, and `page.apply_redactions()` **permanently removes** the
|
|
154
|
+
underlying text from the PDF content stream (a true irreversible redaction,
|
|
155
|
+
not just a visual cover).
|
|
156
|
+
|
|
157
|
+
### 4. Report — Excel findings
|
|
158
|
+
|
|
159
|
+
[nophi/reporter.py](nophi/reporter.py) writes an `openpyxl` workbook (default
|
|
160
|
+
`phi_report.xlsx`) with two sheets:
|
|
161
|
+
|
|
162
|
+
- **Findings** — one row per detection: file, entity type, original text,
|
|
163
|
+
replacement, character position.
|
|
164
|
+
- **Summary** — entity-type counts and per-file PHI counts.
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Models & first run
|
|
169
|
+
|
|
170
|
+
The NLP models are **downloaded on first use and cached** under
|
|
171
|
+
`~/.cache/no-phi/models/` — they are not bundled into the program.
|
|
172
|
+
[nophi/models.py](nophi/models.py) handles fetching, extracting, and
|
|
173
|
+
(for the scispaCy models) patching them to load under the installed spaCy
|
|
174
|
+
version.
|
|
175
|
+
|
|
176
|
+
| Model | Size | Source |
|
|
177
|
+
| --- | --- | --- |
|
|
178
|
+
| `en_core_web_lg` (base NER) | ~560 MB | spaCy GitHub releases (pip wheel) |
|
|
179
|
+
| `en_ner_bc5cdr_md` | ~115 MB | scispaCy S3 release (`.tar.gz`) |
|
|
180
|
+
| `en_ner_bionlp13cg_md` | ~120 MB | scispaCy S3 release (`.tar.gz`) |
|
|
181
|
+
|
|
182
|
+
> The scispaCy biomedical models load with **plain spaCy** — the heavyweight
|
|
183
|
+
> `scispacy` package (and its `nmslib`/`scipy`/`scikit-learn` dependencies) is
|
|
184
|
+
> **not** required. [nophi/models.py](nophi/models.py) rewrites a stale boolean
|
|
185
|
+
> in each model's `config.cfg` during extraction so it validates under spaCy 3.8.
|
|
186
|
+
|
|
187
|
+
Run `python main.py download-models` to fetch everything ahead of time, or just
|
|
188
|
+
run a scan and the models download automatically on first invocation.
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Project layout
|
|
193
|
+
|
|
194
|
+
```
|
|
195
|
+
pyproject.toml # packaging + dependencies + `nophi` console entry point
|
|
196
|
+
main.py # thin entry-point shim → nophi.cli:main (used by Nuitka build)
|
|
197
|
+
nophi/ # the package
|
|
198
|
+
├── __main__.py # enables `python -m nophi`
|
|
199
|
+
├── cli.py # Typer app + orchestration
|
|
200
|
+
├── analyzer.py # build_analyzer() + scan_text() (detection)
|
|
201
|
+
├── recognizers.py # custom recognizers (MEDICAL_TERM layers + StreetAddressRecognizer)
|
|
202
|
+
├── redactor.py # anonymization
|
|
203
|
+
├── reporter.py # Excel findings report
|
|
204
|
+
├── models.py # model download / cache
|
|
205
|
+
├── handlers/ # per-format read/write/redact (text, docx, xlsx, pdf)
|
|
206
|
+
└── data/ # medical_terms.py + bundled rxnorm_names.txt.gz
|
|
207
|
+
scripts/ # build_rxnorm_list.py (refreshes the bundled RxNorm list)
|
|
208
|
+
docs/ # expansion_notes.md (user guide lives in the repo-root docs/)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Install
|
|
214
|
+
|
|
215
|
+
### As a pip package (recommended)
|
|
216
|
+
|
|
217
|
+
Installs a `nophi` command on your PATH:
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
pip install . # or `pip install nophi` once published to PyPI
|
|
221
|
+
nophi download-models # one-time: fetch all NLP models (~800 MB total)
|
|
222
|
+
nophi scan report.pdf
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
You can also run it without installing the script via `python -m nophi scan ...`.
|
|
226
|
+
|
|
227
|
+
### For development
|
|
228
|
+
|
|
229
|
+
```bash
|
|
230
|
+
pip install -e . # editable install (deps come from pyproject.toml)
|
|
231
|
+
# or: pip install -r requirements.txt
|
|
232
|
+
python main.py download-models # optional: pre-fetch models
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
See [the user guide](../../docs/nophi-user-guide.md) for end-user instructions.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# no-phi: detect and redact PHI/PII from documents.
|
|
2
|
+
#
|
|
3
|
+
# Public API. `scan_text` and `build_analyzer` are imported lazily (PEP 562
|
|
4
|
+
# __getattr__) so `import nophi` stays cheap — the heavy Presidio/spaCy stack is
|
|
5
|
+
# only loaded when these are actually accessed. Downstream packages (e.g.
|
|
6
|
+
# nophi-av) should depend on these names rather than internal modules.
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
__all__ = ["scan_text", "build_analyzer", "__version__"]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def __getattr__(name):
    """Resolve the package's lazily exported names (PEP 562 module hook).

    Python calls this only when normal module attribute lookup fails.

    Args:
        name: The attribute being looked up on the package.

    Returns:
        The attribute called ``name`` taken from ``nophi.analyzer`` when
        ``name`` is ``"scan_text"`` or ``"build_analyzer"``.

    Raises:
        AttributeError: For every other name, using the same message shape
            as a regular missing-module-attribute error.
    """
    # Guard clause: anything outside the lazy public API is a plain miss.
    if name not in ("scan_text", "build_analyzer"):
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred on purpose: importing the analyzer here, rather than at module
    # top level, keeps a bare `import nophi` from loading it.
    from nophi import analyzer

    return getattr(analyzer, name)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# Enables `python -m nophi ...`. The console entry point declared in
|
|
2
|
+
# pyproject.toml (nophi = "nophi.cli:main") and the Nuitka build both call
|
|
3
|
+
# nophi.cli:main directly; this module just routes module execution there too.
|
|
4
|
+
from nophi.cli import main
|
|
5
|
+
|
|
6
|
+
if __name__ == "__main__":
|
|
7
|
+
main()
|