radreport 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
radreport/__init__.py ADDED
@@ -0,0 +1,60 @@
1
+ """
2
+ radreport
3
+ ~~~~~~~~~
4
+ Parse, de-identify, structure, and export radiology free-text reports.
5
+
6
+ Quick start:
7
+ from radreport import ReportParser, CriticalFindingsDetector, FHIRExporter
8
+ from radreport import RecommendationExtractor, Deidentifier
9
+
10
+ parser = ReportParser()
11
+ detector = CriticalFindingsDetector()
12
+ extractor = RecommendationExtractor()
13
+ exporter = FHIRExporter()
14
+
15
+ # Optional: strip PHI first so downstream output is safe to share.
16
+ raw_text = Deidentifier().deidentify(raw_text).text
17
+
18
+ report = parser.parse(raw_text, modality="CT")
19
+ report = detector.detect(report)
20
+ report = extractor.extract(report)
21
+ fhir = exporter.export(report, patient_id="pt-001")
22
+
23
+ # Flat dict for CSV/pandas:
24
+ row = report.to_flat_dict()
25
+ """
26
+
27
+ from .report_parser import ReportParser
28
+ from .critical_findings import CriticalFindingsDetector
29
+ from .recommendation_extractor import RecommendationExtractor
30
+ from .fhir_exporter import FHIRExporter
31
+ from .deidentifier import Deidentifier, deidentify
32
+ from .report_schema import (
33
+ ParsedReport,
34
+ ReportSection,
35
+ Finding,
36
+ Measurement,
37
+ CriticalFinding,
38
+ FollowUpRecommendation,
39
+ Redaction,
40
+ DeidentificationResult,
41
+ )
42
+
43
# Version of this package release.
__version__ = "0.4.0"

# Public API re-exported at package level: the pipeline classes, the
# `deidentify` convenience function, the schema types, and the version string.
__all__ = [
    "ReportParser",
    "CriticalFindingsDetector",
    "RecommendationExtractor",
    "FHIRExporter",
    "Deidentifier",
    "deidentify",
    "ParsedReport",
    "ReportSection",
    "Finding",
    "Measurement",
    "CriticalFinding",
    "FollowUpRecommendation",
    "Redaction",
    "DeidentificationResult",
    "__version__",
]
radreport/cli.py ADDED
@@ -0,0 +1,148 @@
1
+ """
2
+ Command-line interface for radreport.
3
+
4
+ Usage:
5
+ radreport report.txt
6
+ radreport report.txt --fhir --patient-id pt-001 --modality CT
7
+ radreport reports/*.txt --critical --recommend --format csv -o batch.csv
8
+ radreport report.txt --deidentify
9
+ """
10
+
11
+ import argparse
12
+ import csv
13
+ import io
14
+ import json
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ from . import (
19
+ ReportParser, CriticalFindingsDetector, FHIRExporter,
20
+ RecommendationExtractor, Deidentifier,
21
+ )
22
+
23
# Pipeline components are instantiated once at import time and shared by every
# file handled in _process().
_parser = ReportParser()
_detector = CriticalFindingsDetector()
_extractor = RecommendationExtractor()
_exporter = FHIRExporter()
_deidentifier = Deidentifier()
28
+
29
+
30
def _process(path: Path, modality, run_critical, run_recommend, as_fhir, patient_id,
             run_deidentify=False):
    """
    Run the pipeline on one report file.

    Reads `path` as UTF-8, optionally redacts PHI, parses the text, then applies
    the optional critical-findings and recommendation stages.

    Returns:
        (ParsedReport, output_dict). output_dict is the FHIR export when
        as_fhir is true; otherwise it is report.to_dict() with an added
        "source_file" key holding the file name.
    """
    text = path.read_text(encoding="utf-8")

    if run_deidentify:
        result = _deidentifier.deidentify(text)
        # Only counts per category are logged, never the redacted values.
        if result.redaction_count:
            print(f"[deid] {path.name}: redacted {result.redaction_count} span(s) "
                  f"({result.category_counts()})", file=sys.stderr)
        text = result.text

    report = _parser.parse(text, modality=modality)

    # Optional enrichment stages, applied in a fixed order.
    stages = (
        (run_critical, _detector.detect),
        (run_recommend, _extractor.extract),
    )
    for enabled, stage in stages:
        if enabled:
            report = stage(report)

    if not as_fhir:
        payload = report.to_dict()
        payload["source_file"] = path.name
        return report, payload

    return report, _exporter.export(report, patient_id=patient_id)
55
+
56
+
57
+ def _to_csv(rows: list[dict]) -> str:
58
+ if not rows:
59
+ return ""
60
+ buf = io.StringIO()
61
+ writer = csv.DictWriter(buf, fieldnames=list(rows[0].keys()), lineterminator="\n")
62
+ writer.writeheader()
63
+ writer.writerows(rows)
64
+ return buf.getvalue()
65
+
66
+
67
def main(argv=None):
    """
    Entry point for the `radreport` command.

    Parses the command line, runs each input file through _process(), and
    writes the combined result as JSON (default) or CSV to stdout or to the
    file given with --output. Per-file failures are reported on stderr and do
    not stop the remaining files from being processed.

    Args:
        argv: Argument list to parse; None lets argparse read sys.argv.

    Exit status (raised via SystemExit):
        2 on invalid arguments (argparse), 1 if any input file failed or was
        not found — including when other files succeeded and their output was
        still written — and a normal return when every file succeeded.
    """
    ap = argparse.ArgumentParser(
        prog="radreport",
        description="Parse radiology free-text reports into structured JSON, FHIR, or CSV.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
examples:
  radreport report.txt
  radreport report.txt --fhir --patient-id pt-001 --modality CT
  radreport reports/*.txt --critical --recommend --format csv -o batch.csv
  radreport report.txt --deidentify --critical
""",
    )
    ap.add_argument("files", nargs="+", metavar="FILE",
                    help="Report .txt file(s) to parse")
    ap.add_argument("--modality", "-m", metavar="MOD",
                    help="Imaging modality: CT, MRI, XR, US, NM, PET …")
    ap.add_argument("--critical", "-c", action="store_true",
                    help="Run critical findings detection")
    ap.add_argument("--recommend", "-r", action="store_true",
                    help="Extract follow-up imaging recommendations")
    ap.add_argument("--deidentify", "-d", action="store_true",
                    help="Redact PHI (dates, MRN, names, phone, …) before parsing")
    ap.add_argument("--fhir", "-f", action="store_true",
                    help="Export as FHIR R4 DiagnosticReport (implies --critical; not compatible with --format csv)")
    ap.add_argument("--patient-id", metavar="ID",
                    help="FHIR Patient resource ID (used with --fhir)")
    ap.add_argument("--format", "--fmt", dest="fmt", metavar="FMT",
                    choices=["json", "csv"], default="json",
                    help="Output format: json (default) or csv (flat one-row-per-report)")
    ap.add_argument("--output", "-o", metavar="FILE",
                    help="Write output to FILE instead of stdout")

    args = ap.parse_args(argv)

    if args.fmt == "csv" and args.fhir:
        ap.error("--format csv is not compatible with --fhir")

    # --fhir implies critical-findings detection.
    run_critical = args.critical or args.fhir
    run_recommend = args.recommend

    reports = []    # flat row dicts (source_file + to_flat_dict()) for CSV mode
    json_rows = []  # dicts for JSON mode
    errors = []

    for f in args.files:
        p = Path(f)
        if not p.is_file():
            errors.append(f"not found: {f}")
            continue
        try:
            report_obj, out = _process(p, args.modality, run_critical, run_recommend,
                                       args.fhir, args.patient_id, args.deidentify)
            if args.fmt == "csv":
                flat = report_obj.to_flat_dict()
                reports.append({"source_file": p.name, **flat})
            else:
                json_rows.append(out)
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole batch.
            errors.append(f"{p.name}: {e}")

    for err in errors:
        print(f"[error] {err}", file=sys.stderr)

    if not reports and not json_rows:
        sys.exit(1)

    if args.fmt == "csv":
        output_str = _to_csv(reports)
    else:
        # A single result is emitted as an object, several as an array.
        output = json_rows[0] if len(json_rows) == 1 else json_rows
        output_str = json.dumps(output, indent=2)

    if args.output:
        Path(args.output).write_text(output_str, encoding="utf-8")
        print(f"Written to {args.output}", file=sys.stderr)
    else:
        # CSV text already ends with a newline; print() would append a second
        # one and leave a blank trailing record on stdout.
        sys.stdout.write(output_str if output_str.endswith("\n") else output_str + "\n")

    # Partial failure: the successful results were written above, but the
    # exit status must still tell batch callers that some inputs failed.
    if errors:
        sys.exit(1)


if __name__ == "__main__":
    main()
radreport/critical_findings.py ADDED
@@ -0,0 +1,221 @@
1
+ """
2
+ Critical findings detector for radiology reports.
3
+
4
+ Rule-based and fully auditable — no ML, no external dependencies.
5
+ Designed to be safe to use in clinical alerting pipelines.
6
+
7
+ IMPORTANT: This library flags *potential* critical findings for human review.
8
+ It is NOT a substitute for radiologist interpretation.
9
+ """
10
+
11
+ import re
12
+ from .report_schema import ParsedReport, CriticalFinding
13
+
14
+
15
# ---------------------------------------------------------------------------
# Critical findings dictionary
# Format: term -> (category, severity)
# severity levels: "critical" | "urgent" | "significant"
#
# Each key is matched as a whole word/phrase, case-insensitively, by
# CriticalFindingsDetector.detect(). Overlapping keys (e.g. "pneumothorax" and
# "tension pneumothorax") each produce their own finding.
# ---------------------------------------------------------------------------

CRITICAL_TERMS: dict[str, tuple[str, str]] = {
    # Vascular emergencies
    "aortic dissection":        ("vascular", "critical"),
    "aortic rupture":           ("vascular", "critical"),
    "aortic aneurysm":          ("vascular", "urgent"),
    "pulmonary embolism":       ("pulmonary", "critical"),
    # NOTE(review): matching is case-insensitive, so any standalone "pe"/"PE"
    # token (e.g. in a protocol name) is flagged — confirm this is acceptable.
    "pe":                       ("pulmonary", "critical"),  # abbreviation
    "saddle embolus":           ("pulmonary", "critical"),
    "deep vein thrombosis":     ("vascular", "urgent"),
    "dvt":                      ("vascular", "urgent"),
    "venous thrombosis":        ("vascular", "urgent"),

    # Pulmonary
    "pneumothorax":             ("pulmonary", "critical"),
    "tension pneumothorax":     ("pulmonary", "critical"),
    "hemothorax":               ("pulmonary", "critical"),
    "pneumomediastinum":        ("pulmonary", "urgent"),
    "airway obstruction":       ("pulmonary", "critical"),
    "pulmonary edema":          ("pulmonary", "urgent"),
    "consolidation":            ("pulmonary", "significant"),

    # Neurological
    "intracranial hemorrhage":  ("neuro", "critical"),
    "subdural hematoma":        ("neuro", "critical"),
    "subdural hemorrhage":      ("neuro", "critical"),
    "epidural hematoma":        ("neuro", "critical"),
    "subarachnoid hemorrhage":  ("neuro", "critical"),
    "cerebral infarction":      ("neuro", "critical"),
    "stroke":                   ("neuro", "critical"),
    "brain herniation":         ("neuro", "critical"),
    "midline shift":            ("neuro", "critical"),
    "hydrocephalus":            ("neuro", "urgent"),
    "mass effect":              ("neuro", "urgent"),

    # Abdominal
    "bowel perforation":        ("abdominal", "critical"),
    "free air":                 ("abdominal", "critical"),
    "pneumoperitoneum":         ("abdominal", "critical"),
    "mesenteric ischemia":      ("abdominal", "critical"),
    "bowel obstruction":        ("abdominal", "urgent"),
    "intussusception":          ("abdominal", "urgent"),
    "appendicitis":             ("abdominal", "urgent"),
    "ruptured ectopic":         ("abdominal", "critical"),
    "splenic laceration":       ("abdominal", "critical"),
    "hepatic laceration":       ("abdominal", "critical"),

    # Cardiac
    "pericardial effusion":     ("cardiac", "urgent"),
    "cardiac tamponade":        ("cardiac", "critical"),
    "myocardial infarction":    ("cardiac", "critical"),

    # Spinal / trauma
    "spinal cord compression":  ("spinal", "critical"),
    "cervical fracture":        ("spinal", "critical"),
    "unstable fracture":        ("spinal", "critical"),
    "cord compression":         ("spinal", "critical"),

    # Oncologic
    "malignancy":               ("oncologic", "significant"),
    "metastasis":               ("oncologic", "significant"),
    "metastases":               ("oncologic", "significant"),
    "lymphoma":                 ("oncologic", "significant"),
    "carcinoma":                ("oncologic", "significant"),
}
85
+
86
+ # Negation phrases — if a finding is preceded by these, mark as negated
87
+ NEGATION_PHRASES = [
88
+ r"no\s+",
89
+ r"no\s+evidence\s+of\s+",
90
+ r"no\s+acute\s+",
91
+ r"without\s+",
92
+ r"absence\s+of\s+",
93
+ r"negative\s+for\s+",
94
+ r"unremarkable\s+for\s+",
95
+ r"ruled\s+out",
96
+ r"not\s+seen",
97
+ r"not\s+identified",
98
+ r"not\s+present",
99
+ ]
100
+
101
+ NEGATION_WINDOW = 60 # max characters to look back for negation context
102
+
103
+ # Sentence boundary just before the match. Negation is scoped to the current
104
+ # sentence so a negation in a *previous* sentence cannot suppress a finding in
105
+ # the current one (e.g. "No acute hemorrhage. Large subdural hematoma present.").
106
+ _SENTENCE_BOUNDARY = re.compile(r"[.!?\n]")
107
+
108
+
109
+ def _is_negated(text: str, match_start: int) -> bool:
110
+ """
111
+ Check if a matched term is preceded by a negation phrase.
112
+
113
+ The look-back window is bounded by both NEGATION_WINDOW characters *and* the
114
+ start of the current sentence, whichever is closer. Scoping to the sentence
115
+ prevents a negation from a previous sentence from wrongly negating a real
116
+ finding — a false-negative that would suppress a critical alert.
117
+ """
118
+ char_window_start = max(0, match_start - NEGATION_WINDOW)
119
+
120
+ # Find the start of the current sentence (after the last boundary char).
121
+ sentence_start = 0
122
+ for m in _SENTENCE_BOUNDARY.finditer(text, 0, match_start):
123
+ sentence_start = m.end()
124
+
125
+ window_start = max(char_window_start, sentence_start)
126
+ preceding = text[window_start:match_start].lower()
127
+
128
+ for phrase in NEGATION_PHRASES:
129
+ if re.search(phrase, preceding):
130
+ return True
131
+ return False
132
+
133
+
134
+ def _get_sentence_context(text: str, match_start: int, match_end: int) -> str:
135
+ """Extract the sentence containing the matched term."""
136
+ # Walk back to sentence start
137
+ start = text.rfind('.', 0, match_start)
138
+ start = start + 1 if start != -1 else 0
139
+
140
+ # Walk forward to sentence end
141
+ end = text.find('.', match_end)
142
+ end = end + 1 if end != -1 else len(text)
143
+
144
+ return text[start:end].strip()
145
+
146
+
147
class CriticalFindingsDetector:
    """
    Flags critical, urgent, and significant findings in a ParsedReport.

    Usage:
        detector = CriticalFindingsDetector()
        report = detector.detect(parsed_report)
        # report.critical_findings is now populated
    """

    def detect(self, report: ParsedReport) -> ParsedReport:
        """
        Search the report for every monitored term and record the results.

        The "findings", "impression" and "preamble" sections are scanned; when
        the report has none of them, the full raw text is scanned instead. At
        most one CriticalFinding is emitted per term.

        Args:
            report: A ParsedReport from ReportParser.parse()

        Returns:
            The same ParsedReport object, mutated in place, with
            critical_findings replaced by the sorted list of findings.
        """
        wanted_sections = {"findings", "impression", "preamble"}
        parts = [sec.raw_text for sec in report.sections if sec.name in wanted_sections]

        # No structured sections recognised: fall back to the whole report.
        haystack = "\n".join(parts) if parts else report.raw_text

        detected: list[CriticalFinding] = []

        for term, (category, severity) in CRITICAL_TERMS.items():
            term_re = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
            hits = list(term_re.finditer(haystack))
            if not hits:
                continue

            # The first non-negated occurrence decides the finding. The term is
            # reported as negated only when EVERY occurrence is negated, so a
            # real mention can never be masked by a negated one elsewhere.
            live_hit = next(
                (hit for hit in hits if not _is_negated(haystack, hit.start())),
                None,
            )
            all_negated = live_hit is None
            anchor = hits[0] if all_negated else live_hit

            detected.append(CriticalFinding(
                term=term.strip(),
                category=category,
                severity=severity,
                context=_get_sentence_context(haystack, anchor.start(), anchor.end()),
                negated=all_negated,
            ))

        # Active findings first, ordered critical > urgent > significant;
        # negated findings follow in the same severity order.
        severity_rank = {"critical": 0, "urgent": 1, "significant": 2}
        detected.sort(key=lambda item: (item.negated, severity_rank.get(item.severity, 9)))

        report.critical_findings = detected
        return report

    @property
    def supported_terms(self) -> list[str]:
        """List every term this detector looks for."""
        return list(CRITICAL_TERMS)
radreport/deidentifier.py ADDED
@@ -0,0 +1,237 @@
1
+ """
2
+ Rule-based de-identification (PHI redaction) for radiology reports.
3
+
4
+ Removes Protected Health Information so that reports can be shared with research
5
+ collaborators, stored in analytics warehouses, or processed off-site. Every
6
+ decision is a traceable regular-expression rule — no ML, no GPU, no external
7
+ service — so the transformation can be audited line by line and re-run
8
+ deterministically. This is what makes it acceptable in environments where
9
+ sending text to a cloud NER API is not an option.
10
+
11
+ The categories map to the HIPAA Safe Harbor identifier list where they are
12
+ reliably matchable from text alone: dates, telephone/fax numbers, email
13
+ addresses, SSNs, medical-record and accession numbers, URLs, IP addresses, ZIP
14
+ codes, ages over 89, and names that follow an explicit title or header label.
15
+
16
+ IMPORTANT — read before relying on this for compliance:
17
+ Free-text de-identification is inherently imperfect. Names that appear in
18
+ narrative prose *without* a title or label (e.g. a surname mentioned mid
19
+ sentence) are NOT caught by a rule-based system. Treat the output as a
20
+ strong first pass that must still be reviewed before any PHI leaves a
21
+ controlled environment. This tool does not certify Safe Harbor compliance.
22
+ """
23
+
24
+ import re
25
+ from typing import Iterable, Optional
26
+
27
+ from .report_schema import DeidentificationResult, Redaction
28
+
29
+
30
# ---------------------------------------------------------------------------
# Replacement placeholders, one per category.
# Individual entries can be overridden per Deidentifier instance through its
# `placeholders` argument.
# ---------------------------------------------------------------------------
PLACEHOLDERS: dict[str, str] = {
    "date": "[DATE]",
    "age": "[AGE]",
    "ssn": "[SSN]",
    "mrn": "[MRN]",
    "accession": "[ACCESSION]",
    "phone": "[PHONE]",
    "email": "[EMAIL]",
    "url": "[URL]",
    "ipv4": "[IP]",
    "zip": "[ZIP]",
    "name": "[NAME]",
}

# All categories, in the order they are applied. Order is only a tie-breaker
# for overlapping matches: candidates are ranked by start offset, then by
# longer span, and only then by position in this tuple (earlier categories
# win). Non-overlapping matches are unaffected. More specific /
# higher-confidence patterns come first.
DEFAULT_CATEGORIES: tuple[str, ...] = (
    "ssn", "mrn", "accession", "phone", "email", "url", "ipv4",
    "date", "age", "zip", "name",
)
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Rules. Each category maps to a list of compiled patterns. The full match
58
+ # (group 0) is what gets redacted, so patterns use lookahead/label groups to
59
+ # anchor context without consuming text that must be kept.
60
+ # ---------------------------------------------------------------------------
61
+ _MONTHS = (
62
+ r"Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|"
63
+ r"Jul(?:y)?|Aug(?:ust)?|Sep(?:t|tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?"
64
+ )
65
+
66
+ _RULES: dict[str, list[re.Pattern]] = {
67
+ "ssn": [
68
+ re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
69
+ ],
70
+ "mrn": [
71
+ # "MRN: 12345678", "Medical Record Number 12345678", "MRN# 1234567"
72
+ re.compile(
73
+ r"\b(?:MRN|medical\s+record\s+(?:number|no|#))\s*[:#]?\s*[A-Z]?\d[\d-]{4,}\b",
74
+ re.IGNORECASE,
75
+ ),
76
+ ],
77
+ "accession": [
78
+ # "Accession: A12345678", "Acc # 12345678", "Accession Number 12345678"
79
+ re.compile(
80
+ r"\b(?:accession(?:\s+(?:number|no|#))?|acc\s*#)\s*[:#]?\s*[A-Z]{0,3}\d[\d-]{4,}\b",
81
+ re.IGNORECASE,
82
+ ),
83
+ ],
84
+ "phone": [
85
+ # (555) 123-4567 / 555-123-4567 / 555.123.4567 / +1 555 123 4567
86
+ re.compile(
87
+ r"(?<!\d)(?:\+?1[\s.-]?)?(?:\(\d{3}\)|\d{3})[\s.-]\d{3}[\s.-]\d{4}(?!\d)"
88
+ ),
89
+ ],
90
+ "email": [
91
+ re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"),
92
+ ],
93
+ "url": [
94
+ re.compile(r"\bhttps?://[^\s<>\"')]+", re.IGNORECASE),
95
+ ],
96
+ "ipv4": [
97
+ re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b"),
98
+ ],
99
+ "date": [
100
+ # 03/10/2024, 3-10-24, 2024-03-10
101
+ re.compile(r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b"),
102
+ re.compile(r"\b\d{4}-\d{1,2}-\d{1,2}\b"),
103
+ # March 5, 2024 / 5 March 2024 / Mar 2024
104
+ re.compile(rf"\b(?:{_MONTHS})\.?\s+\d{{1,2}}(?:st|nd|rd|th)?,?\s+\d{{4}}\b", re.IGNORECASE),
105
+ re.compile(rf"\b\d{{1,2}}(?:st|nd|rd|th)?\s+(?:{_MONTHS})\.?\s+\d{{4}}\b", re.IGNORECASE),
106
+ re.compile(rf"\b(?:{_MONTHS})\.?\s+\d{{4}}\b", re.IGNORECASE),
107
+ ],
108
+ "age": [
109
+ # HIPAA: ages 90+ must not be reported in the clear. Require an age cue
110
+ # so ordinary measurements ("90 mm") are never mistaken for an age.
111
+ re.compile(
112
+ r"\b(?:9\d|1\d\d)\s*[- ]?\s*(?:years?|yrs?|y)[- ]?(?:old|/o|o)\b",
113
+ re.IGNORECASE,
114
+ ),
115
+ re.compile(r"\bage[d]?\s*[:#]?\s*(?:9\d|1\d\d)\b", re.IGNORECASE),
116
+ ],
117
+ "zip": [
118
+ # Only when preceded by a 2-letter state code, to avoid nuking any 5-digit number.
119
+ re.compile(r"\b[A-Z]{2}\s+\d{5}(?:-\d{4})?\b"),
120
+ ],
121
+ "name": [
122
+ # Titled names: Dr. Jane Smith, Mr. John Q. Doe
123
+ re.compile(
124
+ r"\b(?:Dr|Doctor|Mr|Mrs|Ms|Miss|Prof)\.?\s+"
125
+ r"[A-Z][a-z]+(?:\s+[A-Z]\.?)?(?:\s+[A-Z][a-z]+)?"
126
+ ),
127
+ # Header label followed by a value, e.g. "Patient Name: John Doe".
128
+ # Only the value (group 1) is redacted; the label is preserved. The
129
+ # value stops at a column break (2+ spaces) or end of line so that a
130
+ # multi-field header line ("Name: Doe MRN: 123") does not swallow the
131
+ # following fields — those are matched by their own category rules.
132
+ re.compile(
133
+ r"(?im)^[ \t]*(?:patient(?:\s+name)?|name|physician|"
134
+ r"referring(?:\s+physician)?|referred\s+by|dictated\s+by|"
135
+ r"signed\s+by|attending|resident|technologist)\s*:[ \t]*"
136
+ r"([^\n]+?)(?=\s{2,}|$)"
137
+ ),
138
+ ],
139
+ }
140
+
141
+
142
class Deidentifier:
    """
    Redact PHI from radiology report text.

    Usage:
        deid = Deidentifier()
        result = deid.deidentify(raw_text)
        clean_text = result.text
        print(result.category_counts())   # {"date": 2, "mrn": 1, ...}

    Args:
        categories: Iterable of category names to enable. Defaults to all.
        placeholders: Optional overrides for the replacement tokens, e.g.
            {"name": "XXXX"}. Categories not listed keep their default.

    Raises:
        ValueError: If `categories` contains a name with no rule set.
    """

    def __init__(
        self,
        categories: Optional[Iterable[str]] = None,
        placeholders: Optional[dict[str, str]] = None,
    ):
        requested = tuple(categories) if categories is not None else DEFAULT_CATEGORIES
        unknown = [c for c in requested if c not in _RULES]
        if unknown:
            raise ValueError(
                f"Unknown de-identification categor{'y' if len(unknown) == 1 else 'ies'}: "
                f"{', '.join(unknown)}. Valid options: {', '.join(sorted(_RULES))}."
            )
        # Preserve the canonical ordering (tie-break priority) among requested cats.
        wanted = set(requested)
        self.categories = tuple(c for c in DEFAULT_CATEGORIES if c in wanted)
        self.placeholders = {**PLACEHOLDERS, **(placeholders or {})}

    @staticmethod
    def _resolve_overlaps(
        candidates: list[tuple[int, int, str]],
    ) -> list[tuple[int, int, str]]:
        """
        Collapse overlapping (start, end, category) spans into disjoint ones.

        `candidates` must already be sorted by start offset. A span that
        overlaps the previously kept span is merged into it: the kept span
        retains its category and is extended to the later end. Dropping the
        overlapping span outright would leave its tail — text a rule
        identified as PHI — unredacted.
        """
        merged: list[tuple[int, int, str]] = []
        for start, end, category in candidates:
            if merged and start < merged[-1][1]:
                kept_start, kept_end, kept_category = merged[-1]
                if end > kept_end:
                    merged[-1] = (kept_start, end, kept_category)
                continue
            merged.append((start, end, category))
        return merged

    def deidentify(self, text: str) -> "DeidentificationResult":
        """
        Scrub PHI from `text` and return a DeidentificationResult.

        The returned `text` has every detected identifier replaced by a category
        placeholder. `redactions` is the audit trail, ordered by position, with
        offsets into the ORIGINAL text. When two matches partially overlap they
        are reported as a single redaction covering both, labelled with the
        category of the match that starts first.

        Raises:
            ValueError: If `text` is None.
        """
        if text is None:
            raise ValueError("Text cannot be None.")

        candidates: list[tuple[int, int, str]] = []  # (start, end, category)
        for category in self.categories:
            for pattern in _RULES[category]:
                for m in pattern.finditer(text):
                    # If the pattern captured a value group (label fields), redact
                    # only that group; otherwise redact the whole match.
                    start, end = m.span(1) if m.groups() else m.span(0)
                    # Skips empty spans and groups that did not participate (-1, -1).
                    if start < end:
                        candidates.append((start, end, category))

        # Rank candidates: by start, then longer span, then category priority.
        priority = {c: i for i, c in enumerate(self.categories)}
        candidates.sort(key=lambda c: (c[0], -(c[1] - c[0]), priority.get(c[2], 99)))

        chosen = self._resolve_overlaps(candidates)

        # Rebuild the scrubbed text and the redaction records in one pass.
        out_parts: list[str] = []
        redactions: list[Redaction] = []
        cursor = 0
        for start, end, category in chosen:
            out_parts.append(text[cursor:start])
            replacement = self.placeholders[category]
            out_parts.append(replacement)
            redactions.append(Redaction(
                category=category,
                original=text[start:end],
                replacement=replacement,
                start=start,
                end=end,
            ))
            cursor = end
        out_parts.append(text[cursor:])

        return DeidentificationResult(text="".join(out_parts), redactions=redactions)

    @property
    def supported_categories(self) -> list[str]:
        """All PHI categories this de-identifier can detect."""
        return list(_RULES.keys())
233
+
234
+
235
def deidentify(text: str, **kwargs) -> DeidentificationResult:
    """One-shot helper: build a `Deidentifier(**kwargs)` and run it on `text`."""
    scrubber = Deidentifier(**kwargs)
    return scrubber.deidentify(text)