radreport 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- radreport/__init__.py +60 -0
- radreport/cli.py +148 -0
- radreport/critical_findings.py +221 -0
- radreport/deidentifier.py +237 -0
- radreport/fhir_exporter.py +181 -0
- radreport/recommendation_extractor.py +133 -0
- radreport/report_parser.py +245 -0
- radreport/report_schema.py +220 -0
- radreport-0.4.0.dist-info/METADATA +459 -0
- radreport-0.4.0.dist-info/RECORD +14 -0
- radreport-0.4.0.dist-info/WHEEL +5 -0
- radreport-0.4.0.dist-info/entry_points.txt +2 -0
- radreport-0.4.0.dist-info/licenses/LICENSE +21 -0
- radreport-0.4.0.dist-info/top_level.txt +1 -0
radreport/__init__.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""
|
|
2
|
+
radreport
|
|
3
|
+
~~~~~~~~~
|
|
4
|
+
Parse, de-identify, structure, and export radiology free-text reports.
|
|
5
|
+
|
|
6
|
+
Quick start:
|
|
7
|
+
from radreport import ReportParser, CriticalFindingsDetector, FHIRExporter
|
|
8
|
+
from radreport import RecommendationExtractor, Deidentifier
|
|
9
|
+
|
|
10
|
+
parser = ReportParser()
|
|
11
|
+
detector = CriticalFindingsDetector()
|
|
12
|
+
extractor = RecommendationExtractor()
|
|
13
|
+
exporter = FHIRExporter()
|
|
14
|
+
|
|
15
|
+
# Optional: strip PHI first so downstream output is safe to share.
|
|
16
|
+
raw_text = Deidentifier().deidentify(raw_text).text
|
|
17
|
+
|
|
18
|
+
report = parser.parse(raw_text, modality="CT")
|
|
19
|
+
report = detector.detect(report)
|
|
20
|
+
report = extractor.extract(report)
|
|
21
|
+
fhir = exporter.export(report, patient_id="pt-001")
|
|
22
|
+
|
|
23
|
+
# Flat dict for CSV/pandas:
|
|
24
|
+
row = report.to_flat_dict()
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from .report_parser import ReportParser
|
|
28
|
+
from .critical_findings import CriticalFindingsDetector
|
|
29
|
+
from .recommendation_extractor import RecommendationExtractor
|
|
30
|
+
from .fhir_exporter import FHIRExporter
|
|
31
|
+
from .deidentifier import Deidentifier, deidentify
|
|
32
|
+
from .report_schema import (
|
|
33
|
+
ParsedReport,
|
|
34
|
+
ReportSection,
|
|
35
|
+
Finding,
|
|
36
|
+
Measurement,
|
|
37
|
+
CriticalFinding,
|
|
38
|
+
FollowUpRecommendation,
|
|
39
|
+
Redaction,
|
|
40
|
+
DeidentificationResult,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
__version__ = "0.4.0"
|
|
44
|
+
__all__ = [
|
|
45
|
+
"ReportParser",
|
|
46
|
+
"CriticalFindingsDetector",
|
|
47
|
+
"RecommendationExtractor",
|
|
48
|
+
"FHIRExporter",
|
|
49
|
+
"Deidentifier",
|
|
50
|
+
"deidentify",
|
|
51
|
+
"ParsedReport",
|
|
52
|
+
"ReportSection",
|
|
53
|
+
"Finding",
|
|
54
|
+
"Measurement",
|
|
55
|
+
"CriticalFinding",
|
|
56
|
+
"FollowUpRecommendation",
|
|
57
|
+
"Redaction",
|
|
58
|
+
"DeidentificationResult",
|
|
59
|
+
"__version__",
|
|
60
|
+
]
|
radreport/cli.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Command-line interface for radreport.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
radreport report.txt
|
|
6
|
+
radreport report.txt --fhir --patient-id pt-001 --modality CT
|
|
7
|
+
radreport reports/*.txt --critical --recommend --format csv -o batch.csv
|
|
8
|
+
radreport report.txt --deidentify
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import csv
|
|
13
|
+
import io
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from . import (
|
|
19
|
+
ReportParser, CriticalFindingsDetector, FHIRExporter,
|
|
20
|
+
RecommendationExtractor, Deidentifier,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
_parser = ReportParser()
|
|
24
|
+
_detector = CriticalFindingsDetector()
|
|
25
|
+
_extractor = RecommendationExtractor()
|
|
26
|
+
_exporter = FHIRExporter()
|
|
27
|
+
_deidentifier = Deidentifier()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _process(path: Path, modality, run_critical, run_recommend, as_fhir, patient_id,
             run_deidentify=False):
    """
    Run the requested pipeline stages on a single report file.

    Args:
        path: Report file to read (decoded as UTF-8).
        modality: Modality passed through to the parser.
        run_critical: Run the critical-findings detector on the parsed report.
        run_recommend: Run the follow-up recommendation extractor.
        as_fhir: Return the FHIR export as the second element instead of the
            plain report dict.
        patient_id: Patient id forwarded to the FHIR exporter.
        run_deidentify: Redact PHI from the text before it is parsed.

    Returns:
        ``(report, output)``. ``output`` is the exporter's result when
        ``as_fhir`` is true; otherwise it is ``report.to_dict()`` with a
        ``"source_file"`` key holding the file name.
    """
    raw = path.read_text(encoding="utf-8")

    if run_deidentify:
        deid = _deidentifier.deidentify(raw)
        if deid.redaction_count:
            # Audit line goes to stderr so it never mixes with the data output.
            print(f"[deid] {path.name}: redacted {deid.redaction_count} span(s) "
                  f"({deid.category_counts()})", file=sys.stderr)
        raw = deid.text

    parsed = _parser.parse(raw, modality=modality)
    if run_critical:
        parsed = _detector.detect(parsed)
    if run_recommend:
        parsed = _extractor.extract(parsed)

    if not as_fhir:
        payload = parsed.to_dict()
        payload["source_file"] = path.name
        return parsed, payload

    return parsed, _exporter.export(parsed, patient_id=patient_id)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _to_csv(rows: list[dict]) -> str:
    """
    Render a list of flat dicts as CSV text (header row + one line per dict).

    The header is the union of all keys in first-seen order, so a row that
    carries a key the first row lacks no longer makes ``csv.DictWriter`` raise
    ``ValueError``; cells missing from a row are written as empty strings.
    When every row has the same keys the output is unchanged.

    Args:
        rows: Row dicts; values are stringified by the csv module.

    Returns:
        The CSV document using ``"\\n"`` line endings, or ``""`` for no rows.
    """
    if not rows:
        return ""

    # dict.fromkeys de-duplicates while keeping insertion (first-seen) order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=fieldnames, restval="", lineterminator="\n")
    writer.writeheader()
    writer.writerows(rows)
    return buf.getvalue()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def main(argv=None):
    """
    Entry point for the ``radreport`` command-line tool.

    Parses the arguments, runs every FILE through `_process`, and emits the
    combined result — JSON (one object for a single report, a list for
    several), FHIR JSON, or CSV — to ``--output`` or to stdout.

    Per-file failures are collected and printed to stderr as ``[error] ...``
    lines; they do not stop the remaining files from being processed.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: status 2 from argparse for invalid arguments (including
            ``--format csv`` together with ``--fhir``); status 1 when no input
            file could be processed.
    """
    ap = argparse.ArgumentParser(
        prog="radreport",
        description="Parse radiology free-text reports into structured JSON, FHIR, or CSV.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
examples:
  radreport report.txt
  radreport report.txt --fhir --patient-id pt-001 --modality CT
  radreport reports/*.txt --critical --recommend --format csv -o batch.csv
  radreport report.txt --deidentify --critical
""",
    )
    ap.add_argument("files", nargs="+", metavar="FILE",
                    help="Report .txt file(s) to parse")
    ap.add_argument("--modality", "-m", metavar="MOD",
                    help="Imaging modality: CT, MRI, XR, US, NM, PET …")
    ap.add_argument("--critical", "-c", action="store_true",
                    help="Run critical findings detection")
    ap.add_argument("--recommend", "-r", action="store_true",
                    help="Extract follow-up imaging recommendations")
    ap.add_argument("--deidentify", "-d", action="store_true",
                    help="Redact PHI (dates, MRN, names, phone, …) before parsing")
    ap.add_argument("--fhir", "-f", action="store_true",
                    help="Export as FHIR R4 DiagnosticReport (implies --critical; not compatible with --format csv)")
    ap.add_argument("--patient-id", metavar="ID",
                    help="FHIR Patient resource ID (used with --fhir)")
    ap.add_argument("--format", "--fmt", dest="fmt", metavar="FMT",
                    choices=["json", "csv"], default="json",
                    help="Output format: json (default) or csv (flat one-row-per-report)")
    ap.add_argument("--output", "-o", metavar="FILE",
                    help="Write output to FILE instead of stdout")

    args = ap.parse_args(argv)

    if args.fmt == "csv" and args.fhir:
        ap.error("--format csv is not compatible with --fhir")

    run_critical = args.critical or args.fhir
    run_recommend = args.recommend

    csv_rows = []   # flat dicts (source_file + to_flat_dict()) for CSV mode
    json_rows = []  # dicts for JSON mode
    errors = []

    for f in args.files:
        p = Path(f)
        if not p.is_file():
            errors.append(f"not found: {f}")
            continue
        try:
            report_obj, out = _process(p, args.modality, run_critical, run_recommend,
                                       args.fhir, args.patient_id, args.deidentify)
            if args.fmt == "csv":
                flat = report_obj.to_flat_dict()
                csv_rows.append({"source_file": p.name, **flat})
            else:
                json_rows.append(out)
        except Exception as e:
            # Top-level boundary: one bad report must not abort a batch run.
            errors.append(f"{p.name}: {e}")

    for err in errors:
        print(f"[error] {err}", file=sys.stderr)

    if not csv_rows and not json_rows:
        sys.exit(1)

    if args.fmt == "csv":
        output_str = _to_csv(csv_rows)
    else:
        output = json_rows[0] if len(json_rows) == 1 else json_rows
        output_str = json.dumps(output, indent=2)

    if args.output:
        Path(args.output).write_text(output_str, encoding="utf-8")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        # CSV text already ends with a newline; a plain print() would append a
        # second one and leave a blank trailing record on stdout.
        print(output_str, end="" if output_str.endswith("\n") else "\n")
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
if __name__ == "__main__":
|
|
148
|
+
main()
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Critical findings detector for radiology reports.
|
|
3
|
+
|
|
4
|
+
Rule-based and fully auditable — no ML, no external dependencies.
|
|
5
|
+
Designed to be safe to use in clinical alerting pipelines.
|
|
6
|
+
|
|
7
|
+
IMPORTANT: This library flags *potential* critical findings for human review.
|
|
8
|
+
It is NOT a substitute for radiologist interpretation.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from .report_schema import ParsedReport, CriticalFinding
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Critical findings dictionary
|
|
17
|
+
# Format: term -> (category, severity)
|
|
18
|
+
# severity levels: "critical" | "urgent" | "significant"
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
CRITICAL_TERMS: dict[str, tuple[str, str]] = {
|
|
22
|
+
# Vascular emergencies
|
|
23
|
+
"aortic dissection": ("vascular", "critical"),
|
|
24
|
+
"aortic rupture": ("vascular", "critical"),
|
|
25
|
+
"aortic aneurysm": ("vascular", "urgent"),
|
|
26
|
+
"pulmonary embolism": ("pulmonary", "critical"),
|
|
27
|
+
"pe": ("pulmonary", "critical"), # abbreviation
|
|
28
|
+
"saddle embolus": ("pulmonary", "critical"),
|
|
29
|
+
"deep vein thrombosis": ("vascular", "urgent"),
|
|
30
|
+
"dvt": ("vascular", "urgent"),
|
|
31
|
+
"venous thrombosis": ("vascular", "urgent"),
|
|
32
|
+
|
|
33
|
+
# Pulmonary
|
|
34
|
+
"pneumothorax": ("pulmonary", "critical"),
|
|
35
|
+
"tension pneumothorax": ("pulmonary", "critical"),
|
|
36
|
+
"hemothorax": ("pulmonary", "critical"),
|
|
37
|
+
"pneumomediastinum": ("pulmonary", "urgent"),
|
|
38
|
+
"airway obstruction": ("pulmonary", "critical"),
|
|
39
|
+
"pulmonary edema": ("pulmonary", "urgent"),
|
|
40
|
+
"consolidation": ("pulmonary", "significant"),
|
|
41
|
+
|
|
42
|
+
# Neurological
|
|
43
|
+
"intracranial hemorrhage": ("neuro", "critical"),
|
|
44
|
+
"subdural hematoma": ("neuro", "critical"),
|
|
45
|
+
"subdural hemorrhage": ("neuro", "critical"),
|
|
46
|
+
"epidural hematoma": ("neuro", "critical"),
|
|
47
|
+
"subarachnoid hemorrhage": ("neuro", "critical"),
|
|
48
|
+
"cerebral infarction": ("neuro", "critical"),
|
|
49
|
+
"stroke": ("neuro", "critical"),
|
|
50
|
+
"brain herniation": ("neuro", "critical"),
|
|
51
|
+
"midline shift": ("neuro", "critical"),
|
|
52
|
+
"hydrocephalus": ("neuro", "urgent"),
|
|
53
|
+
"mass effect": ("neuro", "urgent"),
|
|
54
|
+
|
|
55
|
+
# Abdominal
|
|
56
|
+
"bowel perforation": ("abdominal", "critical"),
|
|
57
|
+
"free air": ("abdominal", "critical"),
|
|
58
|
+
"pneumoperitoneum": ("abdominal", "critical"),
|
|
59
|
+
"mesenteric ischemia": ("abdominal", "critical"),
|
|
60
|
+
"bowel obstruction": ("abdominal", "urgent"),
|
|
61
|
+
"intussusception": ("abdominal", "urgent"),
|
|
62
|
+
"appendicitis": ("abdominal", "urgent"),
|
|
63
|
+
"ruptured ectopic": ("abdominal", "critical"),
|
|
64
|
+
"splenic laceration": ("abdominal", "critical"),
|
|
65
|
+
"hepatic laceration": ("abdominal", "critical"),
|
|
66
|
+
|
|
67
|
+
# Cardiac
|
|
68
|
+
"pericardial effusion": ("cardiac", "urgent"),
|
|
69
|
+
"cardiac tamponade": ("cardiac", "critical"),
|
|
70
|
+
"myocardial infarction": ("cardiac", "critical"),
|
|
71
|
+
|
|
72
|
+
# Spinal / trauma
|
|
73
|
+
"spinal cord compression": ("spinal", "critical"),
|
|
74
|
+
"cervical fracture": ("spinal", "critical"),
|
|
75
|
+
"unstable fracture": ("spinal", "critical"),
|
|
76
|
+
"cord compression": ("spinal", "critical"),
|
|
77
|
+
|
|
78
|
+
# Oncologic
|
|
79
|
+
"malignancy": ("oncologic", "significant"),
|
|
80
|
+
"metastasis": ("oncologic", "significant"),
|
|
81
|
+
"metastases": ("oncologic", "significant"),
|
|
82
|
+
"lymphoma": ("oncologic", "significant"),
|
|
83
|
+
"carcinoma": ("oncologic", "significant"),
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
# Negation phrases — if a finding is preceded by these, mark as negated.
# Each entry is a regular-expression fragment; `_is_negated` anchors it at a
# word boundary and matches it case-insensitively.
NEGATION_PHRASES = [
    r"no\s+",
    r"no\s+evidence\s+of\s+",
    r"no\s+acute\s+",
    r"without\s+",
    r"absence\s+of\s+",
    r"negative\s+for\s+",
    r"unremarkable\s+for\s+",
    r"ruled\s+out",
    r"not\s+seen",
    r"not\s+identified",
    r"not\s+present",
]

NEGATION_WINDOW = 60  # max characters to look back for negation context

# Sentence boundary just before the match. Negation is scoped to the current
# sentence so a negation in a *previous* sentence cannot suppress a finding in
# the current one (e.g. "No acute hemorrhage. Large subdural hematoma present.").
_SENTENCE_BOUNDARY = re.compile(r"[.!?\n]")


def _is_negated(text: str, match_start: int) -> bool:
    """
    Check if a matched term is preceded by a negation phrase.

    The look-back window is bounded by both NEGATION_WINDOW characters *and* the
    start of the current sentence, whichever is closer. Scoping to the sentence
    prevents a negation from a previous sentence from wrongly negating a real
    finding — a false-negative that would suppress a critical alert.

    Every phrase must begin at a word boundary, so ``no\\s+`` can no longer fire
    on the tail of a longer word ("amino ", "piano ") and silently negate a
    real finding.

    Args:
        text: The text that was scanned for the term.
        match_start: Offset in `text` where the matched term begins.

    Returns:
        True if any entry of NEGATION_PHRASES occurs inside the look-back window.
    """
    char_window_start = max(0, match_start - NEGATION_WINDOW)

    # Only a boundary inside the character window can move the window start, so
    # scan that slice instead of everything before the match.
    sentence_start = 0
    for boundary in _SENTENCE_BOUNDARY.finditer(text, char_window_start, match_start):
        sentence_start = boundary.end()

    window_start = max(char_window_start, sentence_start)

    for phrase in NEGATION_PHRASES:
        # Search the original string with pos/endpos rather than a slice: \b
        # then sees the character just before the window, so a window that
        # happens to start mid-word cannot fake a word boundary.
        anchored = re.compile(r"\b(?:" + phrase + r")", re.IGNORECASE)
        if anchored.search(text, window_start, match_start):
            return True
    return False
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _get_sentence_context(text: str, match_start: int, match_end: int) -> str:
    """
    Extract the sentence containing the matched term.

    A sentence ends at ``.``, ``!``, ``?`` or a newline — the same characters
    `_SENTENCE_BOUNDARY` uses for negation scoping — so the reported context
    and the negation decision refer to the same span, and text from a
    neighbouring line or section is not pulled into the context.

    Args:
        text: The text that was scanned.
        match_start: Offset where the matched term begins.
        match_end: Offset just past the matched term.

    Returns:
        The enclosing sentence (closing terminator included), stripped of
        surrounding whitespace.
    """
    terminators = ".!?\n"

    # Walk back to sentence start: rfind gives -1 when a terminator is absent,
    # so the max is -1 (start at offset 0) when none precedes the match.
    start = max(text.rfind(ch, 0, match_start) for ch in terminators) + 1

    # Walk forward to sentence end: the nearest terminator after the match.
    following = [pos for pos in (text.find(ch, match_end) for ch in terminators) if pos != -1]
    end = min(following) + 1 if following else len(text)

    return text[start:end].strip()
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class CriticalFindingsDetector:
    """
    Rule-based scanner that flags critical, urgent, and significant terms in a
    ParsedReport.

    Usage:
        detector = CriticalFindingsDetector()
        report = detector.detect(parsed_report)
        # report.critical_findings is now populated
    """

    def detect(self, report: ParsedReport) -> ParsedReport:
        """
        Populate ``report.critical_findings`` and return the same report object.

        The "findings", "impression" and "preamble" sections are joined with
        newlines and scanned; if the report has none of them, its full raw text
        is scanned instead. Each term in CRITICAL_TERMS yields at most one
        CriticalFinding.

        Args:
            report: A ParsedReport from ReportParser.parse()

        Returns:
            The report passed in, mutated in place.
        """
        wanted = {"findings", "impression", "preamble"}
        parts = [sec.raw_text for sec in report.sections if sec.name in wanted]
        scan_text = "\n".join(parts) if parts else report.raw_text

        rank = {"critical": 0, "urgent": 1, "significant": 2}
        hits: list[CriticalFinding] = []

        for term, (category, severity) in CRITICAL_TERMS.items():
            term_re = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
            occurrences = list(term_re.finditer(scan_text))
            if not occurrences:
                continue

            # The term counts as negated only when EVERY mention is negated; one
            # affirmative mention anywhere must win so an alert is never dropped.
            live = next(
                (m for m in occurrences if not _is_negated(scan_text, m.start())),
                None,
            )
            if live is None:
                shown, negated = occurrences[0], True
            else:
                shown, negated = live, False

            hits.append(CriticalFinding(
                term=term.strip(),
                category=category,
                severity=severity,
                context=_get_sentence_context(scan_text, shown.start(), shown.end()),
                negated=negated,
            ))

        # Affirmative findings first, ordered critical > urgent > significant;
        # negated ones follow in the same severity order.
        report.critical_findings = sorted(
            hits, key=lambda f: (f.negated, rank.get(f.severity, 9))
        )
        return report

    @property
    def supported_terms(self) -> list[str]:
        """List every term the detector looks for."""
        return list(CRITICAL_TERMS)
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Rule-based de-identification (PHI redaction) for radiology reports.
|
|
3
|
+
|
|
4
|
+
Removes Protected Health Information so that reports can be shared with research
|
|
5
|
+
collaborators, stored in analytics warehouses, or processed off-site. Every
|
|
6
|
+
decision is a traceable regular-expression rule — no ML, no GPU, no external
|
|
7
|
+
service — so the transformation can be audited line by line and re-run
|
|
8
|
+
deterministically. This is what makes it acceptable in environments where
|
|
9
|
+
sending text to a cloud NER API is not an option.
|
|
10
|
+
|
|
11
|
+
The categories map to the HIPAA Safe Harbor identifier list where they are
|
|
12
|
+
reliably matchable from text alone: dates, telephone/fax numbers, email
|
|
13
|
+
addresses, SSNs, medical-record and accession numbers, URLs, IP addresses, ZIP
|
|
14
|
+
codes, ages over 89, and names that follow an explicit title or header label.
|
|
15
|
+
|
|
16
|
+
IMPORTANT — read before relying on this for compliance:
|
|
17
|
+
Free-text de-identification is inherently imperfect. Names that appear in
|
|
18
|
+
narrative prose *without* a title or label (e.g. a surname mentioned mid
|
|
19
|
+
sentence) are NOT caught by a rule-based system. Treat the output as a
|
|
20
|
+
strong first pass that must still be reviewed before any PHI leaves a
|
|
21
|
+
controlled environment. This tool does not certify Safe Harbor compliance.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import re
|
|
25
|
+
from typing import Iterable, Optional
|
|
26
|
+
|
|
27
|
+
from .report_schema import DeidentificationResult, Redaction
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Replacement placeholders, one per category.
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
PLACEHOLDERS: dict[str, str] = {
|
|
34
|
+
"date": "[DATE]",
|
|
35
|
+
"age": "[AGE]",
|
|
36
|
+
"ssn": "[SSN]",
|
|
37
|
+
"mrn": "[MRN]",
|
|
38
|
+
"accession": "[ACCESSION]",
|
|
39
|
+
"phone": "[PHONE]",
|
|
40
|
+
"email": "[EMAIL]",
|
|
41
|
+
"url": "[URL]",
|
|
42
|
+
"ipv4": "[IP]",
|
|
43
|
+
"zip": "[ZIP]",
|
|
44
|
+
"name": "[NAME]",
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
# All categories, in the order they are applied. Order is only a tie-breaker
|
|
48
|
+
# for overlapping matches (earlier categories win); non-overlapping matches are
|
|
49
|
+
# unaffected. More specific / higher-confidence patterns come first.
|
|
50
|
+
DEFAULT_CATEGORIES: tuple[str, ...] = (
|
|
51
|
+
"ssn", "mrn", "accession", "phone", "email", "url", "ipv4",
|
|
52
|
+
"date", "age", "zip", "name",
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Rules. Each category maps to a list of compiled patterns. The full match
|
|
58
|
+
# (group 0) is what gets redacted, so patterns use lookahead/label groups to
|
|
59
|
+
# anchor context without consuming text that must be kept.
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
_MONTHS = (
    r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
    r"Jul(?:y)?|Aug(?:ust)?|Sep(?:t|tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
)

_RULES: dict[str, list[re.Pattern]] = {
    "ssn": [
        re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    ],
    "mrn": [
        # "MRN: 12345678", "Medical Record Number 12345678", "MRN# 1234567"
        re.compile(
            r"\b(?:MRN|medical\s+record\s+(?:number|no|#))\s*[:#]?\s*[A-Z]?\d[\d-]{4,}\b",
            re.IGNORECASE,
        ),
    ],
    "accession": [
        # "Accession: A12345678", "Acc # 12345678", "Accession Number 12345678"
        re.compile(
            r"\b(?:accession(?:\s+(?:number|no|#))?|acc\s*#)\s*[:#]?\s*[A-Z]{0,3}\d[\d-]{4,}\b",
            re.IGNORECASE,
        ),
    ],
    "phone": [
        # (555) 123-4567 / 555-123-4567 / 555.123.4567 / +1 555 123 4567
        re.compile(
            r"(?<!\d)(?:\+?1[\s.-]?)?(?:\(\d{3}\)|\d{3})[\s.-]\d{3}[\s.-]\d{4}(?!\d)"
        ),
    ],
    "email": [
        re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
    ],
    "url": [
        re.compile(r"\bhttps?://[^\s<>\"')]+", re.IGNORECASE),
    ],
    "ipv4": [
        re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
    ],
    "date": [
        # 03/10/2024, 3-10-24, 2024-03-10
        re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b"),
        re.compile(r"\b\d{4}-\d{1,2}-\d{1,2}\b"),
        # March 5, 2024 / 5 March 2024 / Mar 2024
        re.compile(rf"\b(?:{_MONTHS})\.?\s+\d{{1,2}}(?:st|nd|rd|th)?,?\s+\d{{4}}\b", re.IGNORECASE),
        re.compile(rf"\b\d{{1,2}}(?:st|nd|rd|th)?\s+(?:{_MONTHS})\.?\s+\d{{4}}\b", re.IGNORECASE),
        re.compile(rf"\b(?:{_MONTHS})\.?\s+\d{{4}}\b", re.IGNORECASE),
    ],
    "age": [
        # HIPAA: ages 90+ must not be reported in the clear. Require an age cue
        # so ordinary measurements ("90 mm") are never mistaken for an age.
        re.compile(
            r"\b(?:9\d|1\d\d)\s*[- ]?\s*(?:years?|yrs?|y)[- ]?(?:old|/o|o)\b",
            re.IGNORECASE,
        ),
        re.compile(r"\bage[d]?\s*[:#]?\s*(?:9\d|1\d\d)\b", re.IGNORECASE),
    ],
    "zip": [
        # Only when preceded by a 2-letter state code, to avoid nuking any 5-digit number.
        re.compile(r"\b[A-Z]{2}\s+\d{5}(?:-\d{4})?\b"),
    ],
    "name": [
        # Titled names: Dr. Jane Smith, Mr. John Q. Doe
        # The middle initial must be a stand-alone letter ([A-Z]\b). Without
        # the boundary the optional group greedily took the first letter of
        # the surname, so "Dr. Jane Smith" matched as "Dr. Jane S" and "mith"
        # was left in the output.
        re.compile(
            r"\b(?:Dr|Doctor|Mr|Mrs|Ms|Miss|Prof)\.?\s+"
            r"[A-Z][a-z]+(?:\s+[A-Z]\b\.?)?(?:\s+[A-Z][a-z]+)?"
        ),
        # Header label followed by a value, e.g. "Patient Name: John Doe".
        # Only the value (group 1) is redacted; the label is preserved. The
        # value stops at a column break (2+ spaces) or end of line so that a
        # multi-field header line ("Name: Doe   MRN: 123") does not swallow the
        # following fields — those are matched by their own category rules.
        re.compile(
            r"(?im)^[ \t]*(?:patient(?:\s+name)?|name|physician|"
            r"referring(?:\s+physician)?|referred\s+by|dictated\s+by|"
            r"signed\s+by|attending|resident|technologist)\s*:[ \t]*"
            r"([^\n]+?)(?=\s{2,}|$)"
        ),
    ],
}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class Deidentifier:
    """
    Redact PHI from radiology report text.

    Usage:
        deid = Deidentifier()
        result = deid.deidentify(raw_text)
        clean_text = result.text
        print(result.category_counts())   # {"date": 2, "mrn": 1, ...}

    Args:
        categories: Iterable of category names to enable. Defaults to all.
        placeholders: Optional overrides for the replacement tokens, e.g.
            {"name": "XXXX"}. Categories not listed keep their default.

    Raises:
        ValueError: If `categories` contains a name that has no rule set.
    """

    def __init__(
        self,
        categories: Optional[Iterable[str]] = None,
        placeholders: Optional[dict[str, str]] = None,
    ):
        requested = tuple(categories) if categories is not None else DEFAULT_CATEGORIES
        unknown = [c for c in requested if c not in _RULES]
        if unknown:
            raise ValueError(
                f"Unknown de-identification categor{'y' if len(unknown) == 1 else 'ies'}: "
                f"{', '.join(unknown)}. Valid options: {', '.join(sorted(_RULES))}."
            )
        # Preserve the canonical ordering (tie-break priority) among requested cats.
        wanted = set(requested)
        self.categories = tuple(c for c in DEFAULT_CATEGORIES if c in wanted)
        self.placeholders = {**PLACEHOLDERS, **(placeholders or {})}

    @staticmethod
    def _resolve_overlaps(
        candidates: list[tuple[int, int, str]],
        priority: dict[str, int],
    ) -> list[tuple[int, int, str]]:
        """
        Reduce candidate spans to an ordered, non-overlapping list.

        Candidates are ordered by start offset, then longer span, then category
        priority. A span that overlaps the one kept before it is merged into
        it: the kept span keeps its category and is extended to cover the later
        span's end. Merging (rather than dropping the later span) guarantees
        that no character of any detected identifier survives — previously the
        tail of a partially overlapping match, e.g. the date that follows a
        titled name, was left in the clear.

        Args:
            candidates: ``(start, end, category)`` tuples with ``start < end``.
            priority: Category name -> rank (lower wins ties); unknown names
                rank last.

        Returns:
            Non-overlapping ``(start, end, category)`` tuples sorted by start.
        """
        ordered = sorted(
            candidates,
            key=lambda c: (c[0], -(c[1] - c[0]), priority.get(c[2], 99)),
        )
        resolved: list[tuple[int, int, str]] = []
        for start, end, category in ordered:
            if resolved and start < resolved[-1][1]:
                kept_start, kept_end, kept_category = resolved[-1]
                if end > kept_end:
                    resolved[-1] = (kept_start, end, kept_category)
                continue
            resolved.append((start, end, category))
        return resolved

    def deidentify(self, text: str) -> "DeidentificationResult":
        """
        Scrub PHI from `text` and return a DeidentificationResult.

        The returned `text` has every detected identifier replaced by a category
        placeholder. `redactions` is the audit trail, ordered by position, with
        offsets into the ORIGINAL text. Overlapping detections are merged into
        one redaction that carries the category of the earliest span.

        Raises:
            ValueError: If `text` is None.
        """
        if text is None:
            raise ValueError("Text cannot be None.")

        candidates: list[tuple[int, int, str]] = []  # (start, end, category)
        for category in self.categories:
            for pattern in _RULES[category]:
                for m in pattern.finditer(text):
                    # If the pattern captured a value group (label fields), redact
                    # only that group; otherwise redact the whole match.
                    start, end = m.span(1) if m.groups() else m.span(0)
                    if start < end:
                        candidates.append((start, end, category))

        priority = {c: i for i, c in enumerate(self.categories)}
        chosen = self._resolve_overlaps(candidates, priority)

        # Rebuild the scrubbed text and the redaction records in one pass.
        out_parts: list[str] = []
        redactions: list[Redaction] = []
        cursor = 0
        for start, end, category in chosen:
            out_parts.append(text[cursor:start])
            replacement = self.placeholders[category]
            out_parts.append(replacement)
            redactions.append(Redaction(
                category=category,
                original=text[start:end],
                replacement=replacement,
                start=start,
                end=end,
            ))
            cursor = end
        out_parts.append(text[cursor:])

        return DeidentificationResult(text="".join(out_parts), redactions=redactions)

    @property
    def supported_categories(self) -> list[str]:
        """All PHI categories this de-identifier can detect."""
        return list(_RULES.keys())
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def deidentify(text: str, **kwargs) -> DeidentificationResult:
    """One-shot helper: build a `Deidentifier` from `kwargs` and scrub `text` with it."""
    scrubber = Deidentifier(**kwargs)
    return scrubber.deidentify(text)
|