@pharmatools/redacta 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +1 -0
- package/dist/redact.d.ts +55 -0
- package/dist/redact.js +377 -0
- package/package.json +42 -0
package/README.md
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# @pharmatools/redacta
|
|
2
|
+
|
|
3
|
+
Pseudonymise patient identifiers and PII in text — and restore them. A
|
|
4
|
+
dependency-free TypeScript engine you can embed in any Node or browser pipeline.
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
npm install @pharmatools/redacta
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
```ts
|
|
11
|
+
import { Redactor, reinstate, selfCheck } from "@pharmatools/redacta";
|
|
12
|
+
|
|
13
|
+
const redactor = new Redactor(["clinical", "general"]);
|
|
14
|
+
const { text } = redactor.redactText(
|
|
15
|
+
"Dear Mrs Patricia Hartley, NHS Number: 943 476 5919"
|
|
16
|
+
);
|
|
17
|
+
// text -> "Dear [PATIENT_NAME_1], NHS Number: [NHS_NUMBER_1]"
|
|
18
|
+
|
|
19
|
+
// same Redactor keeps a token map across many strings (consistent tokens)
|
|
20
|
+
const original = reinstate(text, redactor.tokenMap).text;
|
|
21
|
+
// original -> "Dear Mrs Patricia Hartley, NHS Number: 943 476 5919"
|
|
22
|
+
|
|
23
|
+
// second-pass safety check on already-redacted text
|
|
24
|
+
const leftovers = selfCheck(text); // ResidualFinding[]
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## What it detects
|
|
28
|
+
|
|
29
|
+
Deterministic patterns, checksum-validated where a check digit exists — NHS numbers (Modulus-11), UK
|
|
30
|
+
National Insurance numbers, dates of birth (keyword-anchored; appointment dates
|
|
31
|
+
preserved), UK postcodes, US SSN/ZIP, hospital/MRN numbers, emails, phones —
|
|
32
|
+
plus general PII (URLs, IPs, Luhn-validated payment cards, IBANs, account
|
|
33
|
+
numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
|
|
34
|
+
(clinician names preserved by design). Names in free prose are not caught.
|
|
35
|
+
|
|
36
|
+
Same value → same token across a `Redactor` instance; the `tokenMap` reverses
|
|
37
|
+
the redaction. No DOM, no network, no storage.
|
|
38
|
+
|
|
39
|
+
## API
|
|
40
|
+
|
|
41
|
+
- `new Redactor(categories: ("clinical" | "general")[])` — `.redactText(s)`,
|
|
42
|
+
`.report`, `.tokenMap`
|
|
43
|
+
- `reinstate(text, tokenMap)` → `{ text, changed }`
|
|
44
|
+
- `selfCheck(text)` → `ResidualFinding[]`
|
|
45
|
+
- `isValidNhs`, `isValidNi`, `isValidLuhn`, `isValidTokenMap`
|
|
46
|
+
|
|
47
|
+
This is the same engine that powers the
|
|
48
|
+
[Redacta for Miro app](https://www.pharmatools.ai/redacta) and the
|
|
49
|
+
[`redacta-mcp` server](https://www.npmjs.com/package/redacta-mcp). For an
|
|
50
|
+
agent-skill build with LLM reasoning over free-text names, see the
|
|
51
|
+
[Redacta skill](https://clawhub.ai/nickjlamb/redacta).
|
|
52
|
+
|
|
53
|
+
## Limits
|
|
54
|
+
|
|
55
|
+
Deterministic + keyword-anchored detection only — not a guarantee, not a
|
|
56
|
+
substitute for formal data-protection processes. Review the output, and treat the
|
|
57
|
+
token map as the key that reverses the redaction.
|
|
58
|
+
|
|
59
|
+
## License
|
|
60
|
+
|
|
61
|
+
MIT-0. Built by [PharmaTools.AI](https://www.pharmatools.ai/redacta).
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./redact.js";
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from "./redact.js";
|
package/dist/redact.d.ts
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Redacta — deterministic pattern engine.
|
|
3
|
+
*
|
|
4
|
+
* Pure TypeScript: no DOM, no network, no storage. Replaces fixed-format
|
|
5
|
+
* identifiers and PII with labelled tokens, catches keyword-anchored names
|
|
6
|
+
* (patients, relatives, carers — clinician names preserved), self-checks the
|
|
7
|
+
* output, and reverses the process from a token map.
|
|
8
|
+
*/
|
|
9
|
+
/** Pattern families a `Redactor` can be asked to apply. */
export type Category = "clinical" | "general";
/**
 * Validate a 10-digit NHS number using the Modulus-11 check digit.
 *
 * @param digits - The candidate number as ten digits with no separators.
 */
export declare function isValidNhs(digits: string): boolean;
/**
 * Validate the two-letter prefix of a UK National Insurance number.
 *
 * @param prefix - The two leading letters only (case-insensitive), not the whole number.
 */
export declare function isValidNi(prefix: string): boolean;
/**
 * Luhn checksum for payment card numbers.
 *
 * @param digits - The candidate number as 13-19 digits with no separators.
 */
export declare function isValidLuhn(digits: string): boolean;
|
|
16
|
+
/** Outcome of `Redactor.redactText` or `reinstate`. */
export interface RedactionResult {
    /** The processed text. */
    text: string;
    /** `true` when the returned text differs from the text that was passed in. */
    changed: boolean;
}
/** One possible leftover identifier reported by `selfCheck`. */
export interface ResidualFinding {
    /** Human-readable name of the check that matched (e.g. "email address"). */
    label: string;
    /** The matched text, trimmed of surrounding whitespace. */
    sample: string;
}
|
|
24
|
+
/**
 * A Redactor keeps one Tokeniser across many texts, so the same identifier
 * gets the same token on every sticky note on the board.
 */
export declare class Redactor {
    private tok;
    private passes;
    /**
     * @param categories - Which pattern families to run. Clinical passes run
     *   before general ones; a pass shared by both families runs once.
     */
    constructor(categories: Category[]);
    /**
     * Run every selected pass over `input` and return the tokenised text.
     * `changed` is `true` when the result differs from `input`.
     */
    redactText(input: string): RedactionResult;
    /** {token_type: number_of_distinct_values} */
    get report(): Record<string, number>;
    /** {token: original_value} — for review / re-identification. Handle with care. */
    get tokenMap(): Record<string, string>;
}
|
|
38
|
+
/**
 * Re-identification: replace tokens with their original values, using a token
 * map produced by an earlier redaction. The inverse of redaction — for putting
 * real data back into AI output before it returns to the board.
 *
 * Tokens always end in "]", so "[NAME_1]" never matches inside "[NAME_10]";
 * plain string replacement is safe.
 */
export declare function reinstate(text: string, tokenMap: Record<string, string>): RedactionResult;
/** Validate that a parsed object is a usable token map ([TOKEN] -> string). */
export declare function isValidTokenMap(value: unknown): value is Record<string, string>;
/**
 * Re-scan already-redacted text for anything that still looks like an
 * identifier, so the UI can warn the user to check manually. Returns one
 * finding per distinct sample (deduplicated, capped). A clean result is not a
 * guarantee — it's a second pair of eyes, not a proof.
 */
export declare function selfCheck(redactedText: string): ResidualFinding[];
|
package/dist/redact.js
ADDED
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Redacta — deterministic pattern engine.
|
|
3
|
+
*
|
|
4
|
+
* Pure TypeScript: no DOM, no network, no storage. Replaces fixed-format
|
|
5
|
+
* identifiers and PII with labelled tokens, catches keyword-anchored names
|
|
6
|
+
* (patients, relatives, carers — clinician names preserved), self-checks the
|
|
7
|
+
* output, and reverses the process from a token map.
|
|
8
|
+
*/
|
|
9
|
+
// ---------------------------------------------------------------------------
|
|
10
|
+
// Validators
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
/**
 * Check a candidate NHS number against the Modulus-11 check-digit rule.
 *
 * @param {string} digits - Exactly ten digits, no spaces or dashes.
 * @returns {boolean} `true` only when the tenth digit equals the check digit
 *   computed from the first nine.
 */
export function isValidNhs(digits) {
    const wellFormed = /^\d{10}$/.test(digits);
    // One digit repeated ten times (e.g. "0000000000") is rejected outright.
    if (!wellFormed || digits === digits[0].repeat(10)) {
        return false;
    }
    // Weighted sum of the first nine digits; the weights run from 10 down to 2.
    let weighted = 0;
    for (let pos = 0; pos < 9; pos++) {
        weighted += Number(digits[pos]) * (10 - pos);
    }
    const remainder = 11 - (weighted % 11);
    // A computed value of 10 cannot be a single check digit: always invalid.
    if (remainder === 10) {
        return false;
    }
    // A computed value of 11 is written as check digit 0.
    const expected = remainder === 11 ? 0 : remainder;
    return expected === Number(digits[9]);
}
|
|
27
|
+
// Two-letter prefixes that are rejected outright, whatever their individual letters.
const NI_INVALID_PREFIX = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);
// Letters rejected in the first prefix position.
const NI_PREFIX1_BAD = new Set("DFIQUV");
// Letters rejected in the second prefix position (the first-position set plus "O").
const NI_PREFIX2_BAD = new Set("DFIOQUV");
/**
 * Validate the two-letter prefix of a UK National Insurance number.
 *
 * @param {string} prefix - The two leading letters only; case is ignored.
 * @returns {boolean} `false` for anything that is not exactly two ASCII
 *   letters, for a rejected pair, or when either letter is disallowed in its
 *   position; `true` otherwise.
 */
export function isValidNi(prefix) {
    // Check the shape on the raw input: digits, punctuation or non-ASCII
    // letters are not in the bad-letter sets and would otherwise pass.
    if (typeof prefix !== "string" || !/^[A-Za-z]{2}$/.test(prefix))
        return false;
    const p = prefix.toUpperCase();
    if (NI_INVALID_PREFIX.has(p))
        return false;
    return !NI_PREFIX1_BAD.has(p[0]) && !NI_PREFIX2_BAD.has(p[1]);
}
|
|
37
|
+
/**
 * Luhn checksum for payment card numbers.
 *
 * @param {string} digits - 13 to 19 digits with no separators.
 * @returns {boolean} `true` when the Luhn sum of the digits is a multiple of ten.
 */
export function isValidLuhn(digits) {
    if (!/^\d{13,19}$/.test(digits)) {
        return false;
    }
    let total = 0;
    // `offset` counts from the rightmost digit; every second digit is doubled.
    for (let offset = 0; offset < digits.length; offset++) {
        let value = Number(digits[digits.length - 1 - offset]);
        if (offset % 2 === 1) {
            value = value * 2;
            // Equivalent to adding the two digits of the doubled value.
            if (value > 9) {
                value = value - 9;
            }
        }
        total = total + value;
    }
    return total % 10 === 0;
}
|
|
55
|
+
// ---------------------------------------------------------------------------
|
|
56
|
+
// Tokeniser: same value -> same token, distinct values -> new numbers
|
|
57
|
+
// ---------------------------------------------------------------------------
|
|
58
|
+
/**
 * Issues placeholder tokens of the form `[TYPE_n]` and remembers what each
 * one stands for. The same (type, key) pair always gets the same token; a new
 * pair gets the next number for that type.
 */
class Tokeniser {
    /** "TYPE::key" -> token already issued for that pair. */
    byKey = new Map();
    /** Token type -> highest sequence number issued so far. */
    counters = new Map();
    /** Token -> original text recorded when the token was first issued. */
    tokenMap = {};
    /**
     * @param {string} type - Token family, e.g. "NHS_NUMBER".
     * @param {string} original - Text stored in `tokenMap` for a new token.
     * @param {string} [key] - Normalised form used for de-duplication;
     *   falls back to `original` when omitted.
     * @returns {string} The existing or newly issued token.
     */
    tokenFor(type, original, key) {
        const dedupKey = `${type}::${key ?? original}`;
        if (this.byKey.has(dedupKey)) {
            return this.byKey.get(dedupKey);
        }
        const next = (this.counters.get(type) ?? 0) + 1;
        const token = "[" + type + "_" + next + "]";
        this.counters.set(type, next);
        this.byKey.set(dedupKey, token);
        // Only the first-seen spelling of a value is kept as its original.
        this.tokenMap[token] = original;
        return token;
    }
}
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
// Patterns
|
|
77
|
+
// ---------------------------------------------------------------------------
|
|
78
|
+
// Month names as a regex alternation: full names first, then abbreviations.
const MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sept?", "Oct", "Nov", "Dec",
].join("|");
// Regex source matching one date in any of four shapes; each shape is wrapped
// in its own non-capturing group.
const DATE = "(?:" + [
    // 12/03/1985, 12.03.85, 12-03-1985
    String.raw `\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}`,
    // 1985-03-12
    String.raw `\d{4}-\d{2}-\d{2}`,
    // 12th March 1985
    String.raw `\d{1,2}(?:st|nd|rd|th)?\s+(?:${MONTHS})\s+\d{4}`,
    // March 12th, 1985
    String.raw `(?:${MONTHS})\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}`,
].join(")|(?:") + ")";
|
|
88
|
+
// A date only counts as a DOB when anchored to a DOB keyword, so clinical and
// appointment dates are left intact.
// Group 1: keyword plus trailing separators (kept). Group 2: the date (replaced).
const DOB_RE = new RegExp(String.raw `(\b(?:date\s+of\s+birth|d\.?o\.?b\.?|born(?:\s+on)?)[\s:.]*)((?:${DATE}))`, "gi");
// Candidate NHS number: 3-3-4 digits with optional space/dash separators.
// The pattern alone does not prove validity; redactNhs applies the checksum.
const NHS_RE = /\b(\d{3}[\s-]?\d{3}[\s-]?\d{4})\b/g;
// Candidate NI number: two letters, three digit pairs, suffix letter A-D.
// Group 1 is the prefix that redactNi passes to isValidNi.
const NI_RE = /\b([A-Za-z]{2})\s?(\d{2})\s?(\d{2})\s?(\d{2})\s?([A-Da-d])\b/g;
// Formatted US SSN (AAA-GG-SSSS) using the same separator twice (back-reference \2);
// area numbers 000, 666 and 9xx are excluded by the lookahead.
const SSN_FMT_RE = /(?<!\d)(?!000|666|9\d\d)(\d{3})([-\s])(\d{2})\2(\d{4})(?!\d)/g;
// Unformatted nine-digit SSN, accepted only directly after an SSN keyword.
const SSN_KW_RE = /((?:SSN|Social\s*Security(?:\s*(?:Number|No\.?|#))?)[\s:]*)((?!000|666|9\d\d)\d{9})(?!\d)/gi;
const EMAIL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g;
// Keyword-anchored hospital identifiers. Group 1: keyword (kept). Group 2: the id.
// NOTE(review): unlike ACCOUNT_KW_RE, the id is not required to contain a digit,
// so an ordinary 4-15 letter word after the keyword also matches — confirm intended.
const MRN_RE = /((?:MRN|Hospital\s*(?:No\.?|Number)|Hosp\.?\s*(?:No\.?|Number)|Patient\s*ID|Unit\s*(?:No\.?|Number))[\s:]*)([A-Z0-9-]{4,15})/gi;
// UK postcode shape (plus the special case GIR 0AA); case-insensitive.
const POSTCODE_RE = /\b(GIR\s?0AA|[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2})\b/gi;
// Two-letter US state abbreviations (plus DC) as a regex alternation.
const US_STATES = "AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|" +
    "MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC";
// ZIP / ZIP+4 directly after a ZIP keyword. Group 1: keyword. Group 2: the ZIP.
const ZIP_KW_RE = /((?:ZIP|Zip\s*Code|Postal\s*Code)[\s:]*)(\d{5}(?:-\d{4})?)(?!\d)/gi;
// ZIP / ZIP+4 directly after an upper-case state abbreviation (case-sensitive).
const ZIP_STATE_RE = new RegExp(String.raw `((?:,?\s)(?:${US_STATES})\s+)(\d{5}(?:-\d{4})?)(?!\d)`, "g");
|
|
102
|
+
// --- General-PII additions -------------------------------------------------
|
|
103
|
+
// http(s):// or www. URLs; the match stops at whitespace, quotes, angle
// brackets, "]" or ")".
const URL_RE = /\b(?:https?:\/\/|www\.)[^\s<>"'\])]+/gi;
// Dotted-quad IPv4 address with every octet limited to 0-255.
const IP_RE = /\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)\b/g;
// Candidate card numbers (13-19 digits, optionally space/dash separated),
// confirmed with the Luhn checksum before redacting.
const CARD_RE = /(?<![\d-])(?:\d[ -]?){12,18}\d(?![\d-])/g;
// IBAN shape: two upper-case letters, two digits, then groups of four
// alphanumerics. Shape only; redactIban adds a length check, no checksum.
const IBAN_RE = /\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){2,7}(?:\s?[A-Z0-9]{1,3})?\b/g;
// Keyword-anchored account-style identifiers. Group 1: keyword (kept).
// Group 2: 5-17 characters that must include at least one digit (lookahead).
const ACCOUNT_KW_RE = /((?:Account|Acct\.?|Member\s*ID|Policy\s*(?:No\.?|Number)|Insurance\s*ID)\s*(?:No\.?|Number|#)?[\s:]*)((?=[A-Z0-9-]*\d)[A-Z0-9-]{5,17})/gi;
// UK registration shape "AB12 CDE", upper-case only.
const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
|
|
111
|
+
// --- Names (keyword-anchored) ----------------------------------------------
|
|
112
|
+
// Names need contextual judgement, which a client-side deterministic engine
|
|
113
|
+
// can't fully do. We catch the high-confidence cases — names introduced by a
|
|
114
|
+
// courtesy title, a salutation, or a label — and deliberately PRESERVE names
|
|
115
|
+
// carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
|
|
116
|
+
// skill's "don't redact the treating clinician" rule. Names buried in free
|
|
117
|
+
// prose are NOT caught; the UI tells users to review.
|
|
118
|
+
// Regex source for one to three capitalised name words separated by
// spaces/tabs. A word is:
//   - an optional single-capital + apostrophe prefix        ("O'" in "O'Brien"),
//   - a capital followed by lower-case letters              ("Mc", "Hartley"),
//   - any number of further capital + lower-case segments   ("Donald" in "McDonald"),
//   - an optional apostrophe/hyphen tail                    ("-Jane" in "Mary-Jane").
// Without the prefix and the extra segments, "McDonald" matched only "Mc"
// (leaving "Donald" in the output next to the token) and "O'Brien" did not
// match at all.
const NAME = (() => {
    const word = String.raw `(?:[A-Z]['’])?[A-Z][a-z]+(?:[A-Z][a-z]+)*(?:['’\-][A-Za-z]+)?`;
    return String.raw `${word}(?:[ \t]+${word}){0,2}`;
})();
// Case-sensitive, anchored version. Used to trim a loosely-captured name down
// to its leading run of properly capitalised words — necessary because the
// label/relative regexes carry the `i` flag (for the keyword), which would
// otherwise let a name match swallow trailing lowercase words ("Sarah is the").
const STRICT_NAME_RE = new RegExp("^" + NAME);
/**
 * Split a loosely-captured name into its real leading name and the remainder.
 *
 * @param {string} s - Text captured by a case-insensitive name pattern.
 * @returns {{ name: string, rest: string } | null} The leading capitalised
 *   name and whatever follows it, or `null` when `s` does not start with a
 *   capitalised name word.
 */
function leadingName(s) {
    const m = s.match(STRICT_NAME_RE);
    if (!m)
        return null;
    return { name: m[0], rest: s.slice(m[0].length) };
}
|
|
131
|
+
// Titles that introduce a name to redact.
const COURTESY_TITLE = "Mr|Mrs|Ms|Miss|Mx";
// Titles whose bearer is treated as a clinician and left in place after "Dear".
const CLINICAL_TITLE = "Dr|Doctor|Prof|Professor|Consultant|Nurse|Sister|Matron|Surgeon|Registrar";
// "Mrs Patricia Hartley" → redact title + name together.
// Case-sensitive. Group 1 is the name without the title.
const NAME_TITLE_RE = new RegExp(String.raw `\b(?:${COURTESY_TITLE})\.?\s+(${NAME})`, "g");
// "Dear Patricia Hartley" → keep "Dear", redact the name — unless a clinical title follows.
const NAME_SALUTATION_RE = new RegExp(String.raw `\b(Dear)\s+(?!(?:${CLINICAL_TITLE})\b)(${NAME})`, "g");
// "Patient: ...", "Name - ...", "Re: ..." → keep the label, redact the name.
// Case-insensitive, so group 2 may include lower-case words; callers trim it
// with leadingName.
const NAME_LABEL_RE = new RegExp(String.raw `\b((?:Patient(?:\s+Name)?|Name|Client|Re)\s*[:\-]\s*)(${NAME})`, "gi");
// Relatives and carers: a relationship word followed by a name. HIPAA Safe
// Harbor treats relatives' names as identifiers, so "her daughter Sarah" or
// "NOK: John Hartley" should be redacted too.
// NOTE(review): "sister" is also in CLINICAL_TITLE; because RELATIVE_NAME_RE is
// case-insensitive, "Sister Jones" matches here as a relative — confirm intended.
const RELATION = "daughter|son|wife|husband|partner|spouse|mother|father|mum|mom|dad|" +
    "sister|brother|sibling|grandson|granddaughter|grandmother|grandfather|" +
    "grandparent|aunt|uncle|niece|nephew|cousin|carer|caregiver|guardian|" +
    "parent|next\\s+of\\s+kin|nok|relative|widow|widower";
// Group 1: relationship word. Group 2: separator. Group 3: loosely matched name.
const RELATIVE_NAME_RE = new RegExp(String.raw `\b(${RELATION})([:,\-]?[ \t]+)(${NAME})`, "gi");
|
|
147
|
+
/** Return `s` with every character other than 0-9 removed. */
const digitsOf = (s) => {
    return s.replace(/\D/g, "");
};
|
|
148
|
+
/** Tokenise keyword-anchored hospital identifiers; the keyword itself is kept. */
const redactMrn = (text, tok) => {
    return text.replace(MRN_RE, (_whole, keyword, identifier) => {
        // De-duplicate case-insensitively while storing the spelling as written.
        const token = tok.tokenFor("MRN", identifier, identifier.toUpperCase());
        return keyword + token;
    });
};
/** Tokenise keyword-anchored account-style identifiers; the keyword itself is kept. */
const redactAccount = (text, tok) => {
    return text.replace(ACCOUNT_KW_RE, (_whole, keyword, identifier) => {
        const token = tok.tokenFor("ACCOUNT_NUMBER", identifier, identifier.toUpperCase());
        return keyword + token;
    });
};
/** Tokenise dates that directly follow a date-of-birth keyword; the keyword is kept. */
const redactDob = (text, tok) => {
    return text.replace(DOB_RE, (_whole, keyword, date) => {
        const token = tok.tokenFor("DATE_OF_BIRTH", date);
        return keyword + token;
    });
};
|
|
151
|
+
/**
 * Tokenise NHS-number candidates that pass the Modulus-11 check. Candidates
 * that fail are left untouched. Tokens are keyed on the bare digits, so
 * differently spaced spellings of one number share a token.
 */
const redactNhs = (text, tok) => text.replace(NHS_RE, (whole, candidate) => {
    const compact = digitsOf(candidate);
    const genuine = compact.length === 10 && isValidNhs(compact);
    return genuine ? tok.tokenFor("NHS_NUMBER", candidate, compact) : whole;
});
|
|
157
|
+
/**
 * Tokenise National Insurance numbers whose two-letter prefix is valid.
 * Tokens are keyed on the upper-cased number without spaces.
 */
const redactNi = (text, tok) => {
    const substitute = (whole, prefix, pairA, pairB, pairC, suffix) => {
        if (isValidNi(prefix)) {
            const canonical = `${prefix}${pairA}${pairB}${pairC}${suffix}`.toUpperCase();
            return tok.tokenFor("NI_NUMBER", whole.trim(), canonical);
        }
        return whole;
    };
    return text.replace(NI_RE, substitute);
};
|
|
163
|
+
/**
 * Tokenise US Social Security numbers in two steps: formatted numbers
 * anywhere in the text, then bare nine-digit numbers that follow an SSN
 * keyword. A group part of "00" or a serial part of "0000" is left untouched.
 */
const redactSsn = (text, tok) => {
    const afterFormatted = text.replace(SSN_FMT_RE, (whole, area, _sep, group, serial) => {
        const rejected = group === "00" || serial === "0000";
        return rejected ? whole : tok.tokenFor("SSN", whole, area + group + serial);
    });
    return afterFormatted.replace(SSN_KW_RE, (whole, keyword, nine) => {
        const group = nine.slice(3, 5);
        const serial = nine.slice(5, 9);
        if (group === "00" || serial === "0000") {
            return whole;
        }
        return keyword + tok.tokenFor("SSN", nine, nine);
    });
};
|
|
176
|
+
/**
 * Tokenise payment card candidates that contain 13-19 digits and pass the
 * Luhn check. Tokens are keyed on the bare digits.
 */
const redactCard = (text, tok) => text.replace(CARD_RE, (candidate) => {
    const compact = digitsOf(candidate);
    const plausibleLength = compact.length >= 13 && compact.length <= 19;
    if (!plausibleLength || !isValidLuhn(compact)) {
        return candidate;
    }
    return tok.tokenFor("CARD_NUMBER", candidate.trim(), compact);
});
|
|
183
|
+
/**
 * Tokenise IBAN-shaped strings whose length without whitespace is 15-34
 * characters. Tokens are keyed on the upper-cased, space-free form.
 */
const redactIban = (text, tok) => text.replace(IBAN_RE, (candidate) => {
    const compact = candidate.replace(/\s/g, "");
    if (compact.length < 15 || compact.length > 34) {
        return candidate;
    }
    return tok.tokenFor("IBAN", candidate, compact.toUpperCase());
});
|
|
190
|
+
/** Tokenise URLs; tokens are keyed case-insensitively. */
const redactUrl = (text, tok) => {
    return text.replace(URL_RE, (url) => tok.tokenFor("URL", url, url.toLowerCase()));
};
/** Tokenise e-mail addresses; tokens are keyed case-insensitively. */
const redactEmail = (text, tok) => {
    return text.replace(EMAIL_RE, (address) => tok.tokenFor("EMAIL", address, address.toLowerCase()));
};
|
|
192
|
+
/**
 * Tokenise phone numbers in four stages, applied in order: +44 numbers,
 * +1 numbers, UK national numbers starting with 0 (only when they contain
 * 10-11 digits), then separated 3-3-4 numbers. Tokens are keyed on the bare
 * digits of the match.
 */
const redactPhone = (text, tok) => {
    const toToken = (hit) => tok.tokenFor("PHONE", hit.trim(), digitsOf(hit));
    const ukNational = (hit) => {
        const count = digitsOf(hit).length;
        return count < 10 || count > 11 ? hit : toToken(hit);
    };
    const stages = [
        { pattern: /(?<!\d)\+44[\s-]?(?:\(0\))?[\s-]?\d{2,5}[\s-]?\d{3,4}[\s-]?\d{3,4}(?!\d)/g, handler: toToken },
        { pattern: /(?<!\d)\+1[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}(?!\d)/g, handler: toToken },
        { pattern: /(?<!\d)\(?0\d{2,4}\)?[\s-]?\d{3,4}[\s-]?\d{3,4}(?!\d)/g, handler: ukNational },
        { pattern: /(?<!\d)\(?[2-9]\d{2}\)?[\s\-.][2-9]\d{2}[\s\-.]\d{4}(?!\d)/g, handler: toToken },
    ];
    let out = text;
    for (const stage of stages) {
        out = out.replace(stage.pattern, (hit) => stage.handler(hit));
    }
    return out;
};
|
|
203
|
+
/**
 * Tokenise UK postcodes whose space-free form is 5-7 characters long.
 * Tokens are keyed on the upper-cased, space-free form.
 */
const redactPostcode = (text, tok) => text.replace(POSTCODE_RE, (candidate) => {
    const compact = candidate.replace(/\s/g, "");
    if (compact.length < 5 || compact.length > 7) {
        return candidate;
    }
    return tok.tokenFor("POSTCODE", candidate, compact.toUpperCase());
});
|
|
210
|
+
/**
 * Tokenise US ZIP codes that follow a ZIP keyword, then those that follow a
 * state abbreviation. The leading keyword or state text is kept.
 */
const redactZip = (text, tok) => {
    const swap = (_whole, lead, zip) => lead + tok.tokenFor("ZIP", zip);
    const afterKeyword = text.replace(ZIP_KW_RE, swap);
    return afterKeyword.replace(ZIP_STATE_RE, swap);
};
|
|
215
|
+
/** Tokenise IPv4 addresses. */
const redactIp = (text, tok) => {
    return text.replace(IP_RE, (address) => tok.tokenFor("IP_ADDRESS", address));
};
/** Tokenise UK vehicle registrations; tokens are keyed on the space-free, upper-cased form. */
const redactPlate = (text, tok) => {
    return text.replace(UK_PLATE_RE, (plate) => {
        const canonical = plate.replace(/\s/g, "").toUpperCase();
        return tok.tokenFor("VEHICLE_REG", plate, canonical);
    });
};
|
|
217
|
+
/**
 * Tokenise names that directly follow a relationship word ("daughter Sarah",
 * "NOK: John Hartley"). The relationship word and separator are kept.
 *
 * The `i` flag (for the relationship word) relaxes the name's capitalisation,
 * so the loosely matched name is trimmed to its leading capitalised run —
 * this both rejects "daughter and two sons" and stops "Sarah is the"
 * over-capturing.
 *
 * The scan is driven by hand rather than by `String.replace` so that it can
 * resume immediately after the text actually used. With `replace`, the loose
 * match also consumed the following one or two words, so a second relation
 * inside them ("daughter Sarah and son John") was never examined and its name
 * stayed in the output.
 */
const redactRelative = (text, tok) => {
    // Private copy: the scan position (lastIndex) is not shared with any
    // other user of the module-level regex.
    const scanner = new RegExp(RELATIVE_NAME_RE.source, RELATIVE_NAME_RE.flags);
    let out = "";
    let cursor = 0;
    let hit;
    while ((hit = scanner.exec(text)) !== null) {
        const [, rel, sep, looseName] = hit;
        const split = leadingName(looseName);
        // The relationship word is always kept as written.
        out += text.slice(cursor, hit.index) + rel;
        cursor = hit.index + rel.length;
        if (split) {
            out += sep + tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase());
            cursor += sep.length + split.name.length;
        }
        // Resume right after what was consumed (always past the relationship
        // word, so the loop makes progress).
        scanner.lastIndex = cursor;
    }
    return out + text.slice(cursor);
};
|
|
228
|
+
/**
 * Tokenise patient names in three steps: courtesy-titled names, names after
 * "Dear", then names after a label such as "Patient:" or "Re:". All three
 * share the PATIENT_NAME token type and key on the lower-cased name with
 * whitespace collapsed, so the same person de-duplicates across contexts.
 */
const redactName = (text, tok) => {
    const keyOf = (raw) => raw.trim().toLowerCase().replace(/\s+/g, " ");
    const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), keyOf(raw));
    // Courtesy-titled names first. Store the full match (title + name) as the
    // original so re-identification restores "Mrs Patricia Hartley" verbatim,
    // but key on the name alone so the same person dedupes across contexts.
    let out = text.replace(NAME_TITLE_RE, (m, name) => tok.tokenFor("PATIENT_NAME", m.trim(), keyOf(name)));
    // Salutations without a courtesy title (clinical titles already excluded).
    // The whitespace between "Dear" and the name is copied through as written
    // (it may be a tab or line break) instead of being replaced by one space.
    out = out.replace(NAME_SALUTATION_RE, (m, dear, name) => {
        const gap = m.slice(dear.length, m.length - name.length);
        return dear + gap + nameToken(name);
    });
    // Labelled names — preserve the original label + separator. This regex
    // carries the `i` flag (for the label word), so the loosely matched name is
    // trimmed with leadingName. The scan is driven by hand so it can resume
    // right after the text actually used; with `String.replace` the loose match
    // also consumed following words, hiding any label inside them
    // ("Re: your client - John Smith").
    const scanner = new RegExp(NAME_LABEL_RE.source, NAME_LABEL_RE.flags);
    let result = "";
    let cursor = 0;
    let hit;
    while ((hit = scanner.exec(out)) !== null) {
        const [, prefix, looseName] = hit;
        const split = leadingName(looseName);
        result += out.slice(cursor, hit.index) + prefix;
        cursor = hit.index + prefix.length;
        if (split) {
            result += nameToken(split.name);
            cursor += split.name.length;
        }
        // Always past the label, so the loop makes progress.
        scanner.lastIndex = cursor;
    }
    return result + out.slice(cursor);
};
|
|
246
|
+
// Order matters: keyword-anchored and checksum-validated patterns first,
// weaker heuristics last, so high-confidence matches win any overlap.
// Each entry is a function (text, tokeniser) => text; a Redactor applies them
// in list order, each one working on the previous one's output.
const CLINICAL_PASSES = [
    redactMrn,
    redactDob,
    redactNhs,
    redactNi,
    redactSsn,
    redactEmail,
    redactPhone,
    redactPostcode,
    redactZip,
    redactRelative,
    redactName,
];
// Passes for the "general" category. Several also appear in CLINICAL_PASSES;
// the Redactor constructor runs a shared pass only once.
const GENERAL_PASSES = [
    redactAccount,
    redactCard,
    redactIban,
    redactUrl,
    redactEmail,
    redactPhone,
    redactPostcode,
    redactZip,
    redactIp,
    redactPlate,
    redactRelative,
    redactName,
];
|
|
275
|
+
// Self-check: patterns that should NOT remain in already-redacted text. These
// are intentionally broad — they flag *possible* leftovers for human review,
// not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
// Each entry pairs the label reported to the caller with the regex that
// triggers it.
const RESIDUAL_CHECKS = [
    // A digit, then 8+ digits/spaces/dashes, then a digit: the separators
    // count towards the length, so fewer than ten actual digits can match.
    { label: "long number (10+ digits)", re: /(?<![\d-])\d[\d\s-]{8,}\d(?![\d-])/g },
    { label: "email address", re: /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g },
    { label: "UK postcode", re: /\b[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}\b/gi },
    { label: "URL", re: /\b(?:https?:\/\/|www\.)\S+/gi },
];
|
|
284
|
+
/**
 * A Redactor keeps one Tokeniser across many texts, so the same identifier
 * gets the same token on every sticky note on the board.
 */
export class Redactor {
    /** Shared tokeniser; its token map grows with every redactText call. */
    tok = new Tokeniser();
    /** Ordered list of pass functions selected by the constructor. */
    passes;
    /**
     * @param {("clinical" | "general")[]} categories - Pattern families to
     *   apply. Clinical passes are queued first, then general ones; a pass
     *   listed in both families is queued once, at its first position. With
     *   neither category present no pass runs and nothing is redacted.
     */
    constructor(categories) {
        const seen = new Set();
        const passes = [];
        if (categories.includes("clinical")) {
            for (const p of CLINICAL_PASSES)
                if (!seen.has(p))
                    (seen.add(p), passes.push(p));
        }
        if (categories.includes("general")) {
            for (const p of GENERAL_PASSES)
                if (!seen.has(p))
                    (seen.add(p), passes.push(p));
        }
        this.passes = passes;
    }
    /**
     * Run every selected pass over `input`.
     *
     * @param {string} input - Text to redact.
     * @returns {{ text: string, changed: boolean }} The tokenised text and
     *   whether it differs from `input` (space normalisation alone counts as
     *   a change).
     */
    redactText(input) {
        // Normalise non-breaking spaces so spaced identifiers still match.
        // The code points are written as escapes (no-break space, figure
        // space, narrow no-break space) because the literal characters look
        // the same as an ordinary space in source.
        let text = input.replace(/[\u00A0\u2007\u202F]/g, " ");
        for (const pass of this.passes)
            text = pass(text, this.tok);
        return { text, changed: text !== input };
    }
    /** {token_type: number_of_distinct_values} */
    get report() {
        const report = {};
        for (const token of Object.keys(this.tok.tokenMap)) {
            // "[NHS_NUMBER_12]" -> "NHS_NUMBER"
            const type = token.slice(1, -1).replace(/_\d+$/, "");
            report[type] = (report[type] ?? 0) + 1;
        }
        return report;
    }
    /** {token: original_value} — for review / re-identification. Handle with care. */
    get tokenMap() {
        // Shallow copy, so callers cannot mutate the tokeniser's own map.
        return { ...this.tok.tokenMap };
    }
}
|
|
327
|
+
/**
|
|
328
|
+
* Re-scan already-redacted text for anything that still looks like an
|
|
329
|
+
* identifier, so the UI can warn the user to check manually. Returns one
|
|
330
|
+
* finding per distinct sample (deduplicated, capped). A clean result is not a
|
|
331
|
+
* guarantee — it's a second pair of eyes, not a proof.
|
|
332
|
+
*/
|
|
333
|
+
/**
 * Re-identification: replace tokens with their original values, using a token
 * map produced by an earlier redaction. The inverse of redaction — for putting
 * real data back into AI output before it returns to the board.
 *
 * Tokens always end in "]", so "[NAME_1]" never matches inside "[NAME_10]";
 * plain string replacement is safe.
 *
 * Entries are applied newest-first (reverse key order). A later redaction pass
 * can capture text that already contains an earlier token — e.g. a URL matched
 * after a number inside it was tokenised, which stores
 * "https://…/[NHS_NUMBER_1" as the URL's original. Restoring the outer token
 * first puts the inner token back into the text, where the next iteration
 * restores it; applied oldest-first, the inner token was left in the output.
 *
 * @param {string} text - Text containing tokens.
 * @param {Record<string, string>} tokenMap - Token -> original value, in the
 *   order the tokens were created.
 * @returns {{ text: string, changed: boolean }} The restored text and whether
 *   any replacement changed it.
 */
export function reinstate(text, tokenMap) {
    let out = text;
    const entries = Object.entries(tokenMap);
    for (let i = entries.length - 1; i >= 0; i--) {
        const [token, original] = entries[i];
        // An empty key would split the text into single characters; skip it.
        if (token)
            out = out.split(token).join(original);
    }
    return { text: out, changed: out !== text };
}
|
|
349
|
+
/**
 * Validate that a parsed object is a usable token map ([TOKEN] -> string).
 *
 * @param {unknown} value - Typically the result of parsing stored JSON.
 * @returns {boolean} `true` only for a non-array object with at least one
 *   entry, where every key looks like "[TYPE_n]" and every value is a string.
 */
export function isValidTokenMap(value) {
    const isObjectLike = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isObjectLike) {
        return false;
    }
    const pairs = Object.entries(value);
    // An empty map is treated as unusable.
    if (pairs.length === 0) {
        return false;
    }
    for (const [token, original] of pairs) {
        const tokenShaped = /^\[[A-Z_]+_\d+\]$/.test(token);
        if (!tokenShaped || typeof original !== "string") {
            return false;
        }
    }
    return true;
}
|
|
358
|
+
/**
 * Re-scan already-redacted text for anything that still looks like an
 * identifier, so a caller can warn the user to check manually.
 *
 * @param {string} redactedText - Output of a previous redaction.
 * @returns {{ label: string, sample: string }[]} One finding per distinct
 *   (label, lower-cased sample) pair, in check order, stopping once 20
 *   findings have been collected. An empty list is not proof that nothing
 *   was missed.
 */
export function selfCheck(redactedText) {
    const reported = new Set();
    const findings = [];
    for (const check of RESIDUAL_CHECKS) {
        for (const hit of redactedText.matchAll(check.re)) {
            const sample = hit[0].trim();
            // Ignore our own tokens, e.g. [NHS_NUMBER_1].
            const isOwnToken = /^\[[A-Z_]+_\d+\]$/.test(sample);
            const dedupKey = `${check.label}:${sample.toLowerCase()}`;
            if (isOwnToken || reported.has(dedupKey)) {
                continue;
            }
            reported.add(dedupKey);
            findings.push({ label: check.label, sample });
            if (findings.length >= 20) {
                return findings;
            }
        }
    }
    return findings;
}
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@pharmatools/redacta",
|
|
3
|
+
"version": "1.1.1",
|
|
4
|
+
"description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
|
|
5
|
+
"license": "MIT-0",
|
|
6
|
+
"author": "Nick Lamb (PharmaTools.AI)",
|
|
7
|
+
"homepage": "https://www.pharmatools.ai/redacta",
|
|
8
|
+
"repository": { "type": "git", "url": "https://github.com/nickjlamb/redacta" },
|
|
9
|
+
"type": "module",
|
|
10
|
+
"main": "dist/index.js",
|
|
11
|
+
"types": "dist/index.d.ts",
|
|
12
|
+
"exports": {
|
|
13
|
+
".": {
|
|
14
|
+
"types": "./dist/index.d.ts",
|
|
15
|
+
"import": "./dist/index.js"
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
"files": [
|
|
19
|
+
"dist"
|
|
20
|
+
],
|
|
21
|
+
"keywords": [
|
|
22
|
+
"redaction",
|
|
23
|
+
"pseudonymisation",
|
|
24
|
+
"de-identification",
|
|
25
|
+
"anonymisation",
|
|
26
|
+
"pii",
|
|
27
|
+
"phi",
|
|
28
|
+
"healthcare",
|
|
29
|
+
"nhs",
|
|
30
|
+
"privacy",
|
|
31
|
+
"typescript"
|
|
32
|
+
],
|
|
33
|
+
"scripts": {
|
|
34
|
+
"build": "tsc",
|
|
35
|
+
"test": "vitest run",
|
|
36
|
+
"prepublishOnly": "npm run build"
|
|
37
|
+
},
|
|
38
|
+
"devDependencies": {
|
|
39
|
+
"typescript": "^5.4.0",
|
|
40
|
+
"vitest": "^1.6.0"
|
|
41
|
+
}
|
|
42
|
+
}
|