@pharmatools/redacta 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,61 @@
1
+ # @pharmatools/redacta
2
+
3
+ Pseudonymise patient identifiers and PII in text — and restore them. A
4
+ dependency-free TypeScript engine you can embed in any Node or browser pipeline.
5
+
6
+ ```bash
7
+ npm install @pharmatools/redacta
8
+ ```
9
+
10
+ ```ts
11
+ import { Redactor, reinstate, selfCheck } from "@pharmatools/redacta";
12
+
13
+ const redactor = new Redactor(["clinical", "general"]);
14
+ const { text } = redactor.redactText(
15
+ "Dear Mrs Patricia Hartley, NHS Number: 943 476 5919"
16
+ );
17
+ // text -> "Dear [PATIENT_NAME_1], NHS Number: [NHS_NUMBER_1]"
18
+
19
+ // the same Redactor keeps one token map across many strings (consistent tokens); reinstate() uses that map to restore the originals
20
+ const original = reinstate(text, redactor.tokenMap).text;
21
+ // original -> "Dear Mrs Patricia Hartley, NHS Number: 943 476 5919"
22
+
23
+ // second-pass safety check on already-redacted text
24
+ const leftovers = selfCheck(text); // ResidualFinding[]
25
+ ```
26
+
27
+ ## What it detects
28
+
29
+ Deterministic patterns, checksum-validated where the format has a checksum — NHS numbers (Modulus-11), UK
30
+ National Insurance numbers, dates of birth (keyword-anchored; appointment dates
31
+ preserved), UK postcodes, US SSN/ZIP, hospital/MRN numbers, emails, phones —
32
+ plus general PII (URLs, IPs, Luhn-validated payment cards, IBANs, account
33
+ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
34
+ (clinician names preserved by design). Names in free prose are not caught.
35
+
36
+ Same value → same token across a `Redactor` instance; the `tokenMap` reverses
37
+ the redaction. No DOM, no network, no storage.
38
+
39
+ ## API
40
+
41
+ - `new Redactor(categories: ("clinical" | "general")[])` — `.redactText(s)`,
42
+ `.report`, `.tokenMap`
43
+ - `reinstate(text, tokenMap)` → `{ text, changed }`
44
+ - `selfCheck(text)` → `ResidualFinding[]`
45
+ - `isValidNhs(digits)` (10 bare digits), `isValidNi(prefix)` (checks the two-letter prefix only), `isValidLuhn(digits)`, `isValidTokenMap(value)`
46
+
47
+ This is the same engine that powers the
48
+ [Redacta for Miro app](https://www.pharmatools.ai/redacta) and the
49
+ [`redacta-mcp` server](https://www.npmjs.com/package/redacta-mcp). For an
50
+ agent-skill build with LLM reasoning over free-text names, see the
51
+ [Redacta skill](https://clawhub.ai/nickjlamb/redacta).
52
+
53
+ ## Limits
54
+
55
+ Deterministic + keyword-anchored detection only — not a guarantee, not a
56
+ substitute for formal data-protection processes. Review the output, and treat the
57
+ token map as the key that reverses the redaction.
58
+
59
+ ## License
60
+
61
+ MIT-0. Built by [PharmaTools.AI](https://www.pharmatools.ai/redacta).
@@ -0,0 +1 @@
1
+ export * from "./redact.js";
package/dist/index.js ADDED
@@ -0,0 +1 @@
1
+ export * from "./redact.js";
@@ -0,0 +1,55 @@
1
+ /**
2
+ * Redacta — deterministic pattern engine.
3
+ *
4
+ * Pure TypeScript: no DOM, no network, no storage. Replaces fixed-format
5
+ * identifiers and PII with labelled tokens, catches keyword-anchored names
6
+ * (patients, relatives, carers — clinician names preserved), self-checks the
7
+ * output, and reverses the process from a token map.
8
+ */
9
+ export type Category = "clinical" | "general";
10
+ /** Validate a 10-digit NHS number using the Modulus-11 check digit. */
11
+ export declare function isValidNhs(digits: string): boolean;
12
+ /** Validate the two-letter prefix of a UK National Insurance number. */
13
+ export declare function isValidNi(prefix: string): boolean;
14
+ /** Luhn checksum for payment card numbers. */
15
+ export declare function isValidLuhn(digits: string): boolean;
16
+ export interface RedactionResult {
17
+ text: string;
18
+ changed: boolean;
19
+ }
20
+ export interface ResidualFinding {
21
+ label: string;
22
+ sample: string;
23
+ }
24
+ /**
25
+ * A Redactor keeps one Tokeniser across many texts, so the same identifier
26
+ * gets the same token on every sticky note on the board.
27
+ */
28
+ export declare class Redactor {
29
+ private tok;
30
+ private passes;
31
+ constructor(categories: Category[]);
32
+ redactText(input: string): RedactionResult;
33
+ /** {token_type: number_of_distinct_values} */
34
+ get report(): Record<string, number>;
35
+ /** {token: original_value} — for review / re-identification. Handle with care. */
36
+ get tokenMap(): Record<string, string>;
37
+ }
38
+ /**
39
+ * `selfCheck` (declared at the end of this file): re-scan already-redacted text for anything that still looks like an
40
+ * identifier, so the UI can warn the user to check manually. Returns one
41
+ * finding per distinct sample (deduplicated, capped). A clean result is not a
42
+ * guarantee — it's a second pair of eyes, not a proof.
43
+ */
44
+ /**
45
+ * Re-identification: replace tokens with their original values, using a token
46
+ * map produced by an earlier redaction. The inverse of redaction — for putting
47
+ * real data back into AI output before it returns to the board.
48
+ *
49
+ * Tokens always end in "]", so "[NAME_1]" never matches inside "[NAME_10]";
50
+ * plain string replacement is safe.
51
+ */
52
+ export declare function reinstate(text: string, tokenMap: Record<string, string>): RedactionResult;
53
+ /** Validate that a parsed object is a usable token map ([TOKEN] -> string). */
54
+ export declare function isValidTokenMap(value: unknown): value is Record<string, string>;
55
+ export declare function selfCheck(redactedText: string): ResidualFinding[];
package/dist/redact.js ADDED
@@ -0,0 +1,377 @@
1
+ /**
2
+ * Redacta — deterministic pattern engine.
3
+ *
4
+ * Pure TypeScript: no DOM, no network, no storage. Replaces fixed-format
5
+ * identifiers and PII with labelled tokens, catches keyword-anchored names
6
+ * (patients, relatives, carers — clinician names preserved), self-checks the
7
+ * output, and reverses the process from a token map.
8
+ */
9
+ // ---------------------------------------------------------------------------
10
+ // Validators
11
+ // ---------------------------------------------------------------------------
12
/**
 * Check a candidate NHS number against the Modulus-11 check digit.
 *
 * @param digits Exactly ten ASCII digits, with no spaces or hyphens.
 * @returns `true` only when the tenth digit equals the check digit computed
 *   from the first nine; `false` for malformed input, for ten identical
 *   digits, and for numbers whose computed check digit would be 10.
 */
export function isValidNhs(digits) {
    const wellFormed = /^\d{10}$/.test(digits);
    if (!wellFormed) {
        return false;
    }
    // Ten repeats of one digit (e.g. "0000000000") are rejected outright,
    // even where the checksum alone would accept them.
    if (digits[0].repeat(10) === digits) {
        return false;
    }
    // Weights run 10, 9, ..., 2 across the first nine digits.
    let weighted = 0;
    for (let pos = 0; pos < 9; pos++) {
        weighted += Number(digits[pos]) * (10 - pos);
    }
    const remainder = weighted % 11;
    const expected = remainder === 0 ? 0 : 11 - remainder;
    // A computed check digit of 10 can never match a single digit.
    if (expected === 10) {
        return false;
    }
    return Number(digits[9]) === expected;
}
27
// Two-letter prefixes that are rejected as a pair.
const NI_INVALID_PREFIX = new Set(["BG", "GB", "NK", "KN", "TN", "NT", "ZZ"]);
// Letters rejected in the first prefix position.
const NI_PREFIX1_BAD = new Set("DFIQUV");
// Letters rejected in the second prefix position (adds "O" to the first-position set).
const NI_PREFIX2_BAD = new Set("DFIOQUV");
/**
 * Check the two-letter prefix of a UK National Insurance number.
 *
 * Only the prefix is examined; the digits and suffix letter are not passed in.
 *
 * @param prefix The two leading letters, in any case.
 * @returns `true` when the upper-cased prefix is two characters long, is not a
 *   rejected pair, and neither letter is rejected for its position.
 */
export function isValidNi(prefix) {
    const upper = prefix.toUpperCase();
    if (upper.length !== 2) {
        return false;
    }
    if (NI_INVALID_PREFIX.has(upper)) {
        return false;
    }
    if (NI_PREFIX1_BAD.has(upper[0])) {
        return false;
    }
    return !NI_PREFIX2_BAD.has(upper[1]);
}
37
/**
 * Luhn checksum for payment card numbers.
 *
 * @param digits 13 to 19 ASCII digits with no separators.
 * @returns `true` when the input has the right shape and its Luhn sum is a
 *   multiple of 10; `false` otherwise.
 */
export function isValidLuhn(digits) {
    if (!/^\d{13,19}$/.test(digits)) {
        return false;
    }
    // Walk from the rightmost digit: every second digit is doubled, and a
    // doubled value above 9 has 9 subtracted (same as summing its digits).
    const total = Array.from(digits)
        .reverse()
        .reduce((acc, ch, idx) => {
        const value = Number(ch);
        if (idx % 2 === 0) {
            return acc + value;
        }
        const doubled = value * 2;
        return acc + (doubled > 9 ? doubled - 9 : doubled);
    }, 0);
    return total % 10 === 0;
}
55
+ // ---------------------------------------------------------------------------
56
+ // Tokeniser: same value -> same token, distinct values -> new numbers
57
+ // ---------------------------------------------------------------------------
58
/**
 * Hands out `[TYPE_n]` tokens and remembers which original each one stands for.
 *
 * A repeated (type, key) pair always yields the token issued the first time;
 * a new pair takes the next number in that type's own sequence.
 */
class Tokeniser {
    /** "type::key" -> token already issued for that pair. */
    byKey = new Map();
    /** type -> highest number issued so far for that type. */
    counters = new Map();
    /** token -> original text recorded when the token was first issued. */
    tokenMap = {};
    /**
     * Return the token for a value, creating one on first sight.
     *
     * @param type Token type, used as the label inside the brackets.
     * @param original Text stored in `tokenMap` when a new token is created.
     * @param key Optional normalised form used for de-duplication; when
     *   `null`/`undefined` the original itself is the key.
     * @returns The existing or newly created token.
     */
    tokenFor(type, original, key) {
        const lookup = `${type}::${key ?? original}`;
        const known = this.byKey.get(lookup);
        if (known) {
            return known;
        }
        const next = (this.counters.get(type) ?? 0) + 1;
        const token = `[${type}_${next}]`;
        this.counters.set(type, next);
        this.byKey.set(lookup, token);
        // Only the first spelling seen for a key is kept as the original.
        this.tokenMap[token] = original;
        return token;
    }
}
75
+ // ---------------------------------------------------------------------------
76
+ // Patterns
77
+ // ---------------------------------------------------------------------------
78
+ const MONTHS = "January|February|March|April|May|June|July|August|September|" +
79
+ "October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec";
80
+ const DATE = [
81
+ String.raw `\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4}`,
82
+ String.raw `\d{4}-\d{2}-\d{2}`,
83
+ String.raw `\d{1,2}(?:st|nd|rd|th)?\s+(?:${MONTHS})\s+\d{4}`,
84
+ String.raw `(?:${MONTHS})\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}`,
85
+ ]
86
+ .map((s) => `(?:${s})`)
87
+ .join("|");
88
+ // A date only counts as a DOB when anchored to a DOB keyword, so clinical and
89
+ // appointment dates are left intact.
90
+ const DOB_RE = new RegExp(String.raw `(\b(?:date\s+of\s+birth|d\.?o\.?b\.?|born(?:\s+on)?)[\s:.]*)((?:${DATE}))`, "gi");
91
+ const NHS_RE = /\b(\d{3}[\s-]?\d{3}[\s-]?\d{4})\b/g;
92
+ const NI_RE = /\b([A-Za-z]{2})\s?(\d{2})\s?(\d{2})\s?(\d{2})\s?([A-Da-d])\b/g;
93
+ const SSN_FMT_RE = /(?<!\d)(?!000|666|9\d\d)(\d{3})([-\s])(\d{2})\2(\d{4})(?!\d)/g;
94
+ const SSN_KW_RE = /((?:SSN|Social\s*Security(?:\s*(?:Number|No\.?|#))?)[\s:]*)((?!000|666|9\d\d)\d{9})(?!\d)/gi;
95
+ const EMAIL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g;
96
+ const MRN_RE = /((?:MRN|Hospital\s*(?:No\.?|Number)|Hosp\.?\s*(?:No\.?|Number)|Patient\s*ID|Unit\s*(?:No\.?|Number))[\s:]*)([A-Z0-9-]{4,15})/gi;
97
+ const POSTCODE_RE = /\b(GIR\s?0AA|[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2})\b/gi;
98
+ const US_STATES = "AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|" +
99
+ "MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY|DC";
100
+ const ZIP_KW_RE = /((?:ZIP|Zip\s*Code|Postal\s*Code)[\s:]*)(\d{5}(?:-\d{4})?)(?!\d)/gi;
101
+ const ZIP_STATE_RE = new RegExp(String.raw `((?:,?\s)(?:${US_STATES})\s+)(\d{5}(?:-\d{4})?)(?!\d)`, "g");
102
+ // --- General-PII additions -------------------------------------------------
103
+ const URL_RE = /\b(?:https?:\/\/|www\.)[^\s<>"'\])]+/gi;
104
+ const IP_RE = /\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|1?\d?\d)\b/g;
105
+ // Candidate card numbers (13-19 digits, optionally space/dash separated),
106
+ // confirmed with the Luhn checksum before redacting.
107
+ const CARD_RE = /(?<![\d-])(?:\d[ -]?){12,18}\d(?![\d-])/g;
108
+ const IBAN_RE = /\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){2,7}(?:\s?[A-Z0-9]{1,3})?\b/g;
109
+ const ACCOUNT_KW_RE = /((?:Account|Acct\.?|Member\s*ID|Policy\s*(?:No\.?|Number)|Insurance\s*ID)\s*(?:No\.?|Number|#)?[\s:]*)((?=[A-Z0-9-]*\d)[A-Z0-9-]{5,17})/gi;
110
+ const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
111
+ // --- Names (keyword-anchored) ----------------------------------------------
112
+ // Names need contextual judgement, which a client-side deterministic engine
113
+ // can't fully do. We catch the high-confidence cases — names introduced by a
114
+ // courtesy title, a salutation, or a label — and deliberately PRESERVE names
115
+ // carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
116
+ // skill's "don't redact the treating clinician" rule. Names buried in free
117
+ // prose are NOT caught; the UI tells users to review.
118
+ const NAME = String.raw `[A-Z][a-z]+(?:['’\-][A-Za-z]+)?(?:[ \t]+[A-Z][a-z]+(?:['’\-][A-Za-z]+)?){0,2}`;
119
+ // Case-sensitive, anchored version. Used to trim a loosely-captured name down
120
+ // to its leading run of properly capitalised words — necessary because the
121
+ // label/relative regexes carry the `i` flag (for the keyword), which would
122
+ // otherwise let a name match swallow trailing lowercase words ("Sarah is the").
123
+ const STRICT_NAME_RE = new RegExp("^" + NAME);
124
+ /** Split a loosely-captured name into its real leading name and the remainder. */
125
+ function leadingName(s) {
126
+ const m = s.match(STRICT_NAME_RE);
127
+ if (!m)
128
+ return null;
129
+ return { name: m[0], rest: s.slice(m[0].length) };
130
+ }
131
+ const COURTESY_TITLE = "Mr|Mrs|Ms|Miss|Mx";
132
+ const CLINICAL_TITLE = "Dr|Doctor|Prof|Professor|Consultant|Nurse|Sister|Matron|Surgeon|Registrar";
133
+ // "Mrs Patricia Hartley" → redact title + name together.
134
+ const NAME_TITLE_RE = new RegExp(String.raw `\b(?:${COURTESY_TITLE})\.?\s+(${NAME})`, "g");
135
+ // "Dear Patricia Hartley" → keep "Dear", redact the name — unless a clinical title follows.
136
+ const NAME_SALUTATION_RE = new RegExp(String.raw `\b(Dear)\s+(?!(?:${CLINICAL_TITLE})\b)(${NAME})`, "g");
137
+ // "Patient: ...", "Name - ...", "Re: ..." → keep the label, redact the name.
138
+ const NAME_LABEL_RE = new RegExp(String.raw `\b((?:Patient(?:\s+Name)?|Name|Client|Re)\s*[:\-]\s*)(${NAME})`, "gi");
139
+ // Relatives and carers: a relationship word followed by a name. HIPAA Safe
140
+ // Harbor treats relatives' names as identifiers, so "her daughter Sarah" or
141
+ // "NOK: John Hartley" should be redacted too.
142
+ const RELATION = "daughter|son|wife|husband|partner|spouse|mother|father|mum|mom|dad|" +
143
+ "sister|brother|sibling|grandson|granddaughter|grandmother|grandfather|" +
144
+ "grandparent|aunt|uncle|niece|nephew|cousin|carer|caregiver|guardian|" +
145
+ "parent|next\\s+of\\s+kin|nok|relative|widow|widower";
146
+ const RELATIVE_NAME_RE = new RegExp(String.raw `\b(${RELATION})([:,\-]?[ \t]+)(${NAME})`, "gi");
147
+ const digitsOf = (s) => s.replace(/\D/g, "");
148
+ const redactMrn = (text, tok) => text.replace(MRN_RE, (_m, kw, id) => kw + tok.tokenFor("MRN", id, id.toUpperCase()));
149
+ const redactAccount = (text, tok) => text.replace(ACCOUNT_KW_RE, (_m, kw, id) => kw + tok.tokenFor("ACCOUNT_NUMBER", id, id.toUpperCase()));
150
+ const redactDob = (text, tok) => text.replace(DOB_RE, (_m, kw, date) => kw + tok.tokenFor("DATE_OF_BIRTH", date));
151
+ const redactNhs = (text, tok) => text.replace(NHS_RE, (m, raw) => {
152
+ const d = digitsOf(raw);
153
+ if (d.length === 10 && isValidNhs(d))
154
+ return tok.tokenFor("NHS_NUMBER", raw, d);
155
+ return m;
156
+ });
157
+ const redactNi = (text, tok) => text.replace(NI_RE, (m, p1, p2, p3, p4, p5) => {
158
+ if (!isValidNi(p1))
159
+ return m;
160
+ const key = (p1 + p2 + p3 + p4 + p5).toUpperCase();
161
+ return tok.tokenFor("NI_NUMBER", m.trim(), key);
162
+ });
163
+ const redactSsn = (text, tok) => {
164
+ let out = text.replace(SSN_FMT_RE, (m, a, _sep, b, c) => {
165
+ if (b === "00" || c === "0000")
166
+ return m;
167
+ return tok.tokenFor("SSN", m, a + b + c);
168
+ });
169
+ out = out.replace(SSN_KW_RE, (m, kw, num) => {
170
+ if (num.slice(3, 5) === "00" || num.slice(5, 9) === "0000")
171
+ return m;
172
+ return kw + tok.tokenFor("SSN", num, num);
173
+ });
174
+ return out;
175
+ };
176
+ const redactCard = (text, tok) => text.replace(CARD_RE, (m) => {
177
+ const d = digitsOf(m);
178
+ if (d.length >= 13 && d.length <= 19 && isValidLuhn(d)) {
179
+ return tok.tokenFor("CARD_NUMBER", m.trim(), d);
180
+ }
181
+ return m;
182
+ });
183
+ const redactIban = (text, tok) => text.replace(IBAN_RE, (m) => {
184
+ const clean = m.replace(/\s/g, "");
185
+ if (clean.length >= 15 && clean.length <= 34) {
186
+ return tok.tokenFor("IBAN", m, clean.toUpperCase());
187
+ }
188
+ return m;
189
+ });
190
+ const redactUrl = (text, tok) => text.replace(URL_RE, (m) => tok.tokenFor("URL", m, m.toLowerCase()));
191
+ const redactEmail = (text, tok) => text.replace(EMAIL_RE, (m) => tok.tokenFor("EMAIL", m, m.toLowerCase()));
192
+ const redactPhone = (text, tok) => {
193
+ const mk = (m) => tok.tokenFor("PHONE", m.trim(), digitsOf(m));
194
+ let out = text.replace(/(?<!\d)\+44[\s-]?(?:\(0\))?[\s-]?\d{2,5}[\s-]?\d{3,4}[\s-]?\d{3,4}(?!\d)/g, mk);
195
+ out = out.replace(/(?<!\d)\+1[\s\-.]?\(?\d{3}\)?[\s\-.]?\d{3}[\s\-.]?\d{4}(?!\d)/g, mk);
196
+ out = out.replace(/(?<!\d)\(?0\d{2,4}\)?[\s-]?\d{3,4}[\s-]?\d{3,4}(?!\d)/g, (m) => {
197
+ const len = digitsOf(m).length;
198
+ return len >= 10 && len <= 11 ? mk(m) : m;
199
+ });
200
+ out = out.replace(/(?<!\d)\(?[2-9]\d{2}\)?[\s\-.][2-9]\d{2}[\s\-.]\d{4}(?!\d)/g, mk);
201
+ return out;
202
+ };
203
+ const redactPostcode = (text, tok) => text.replace(POSTCODE_RE, (m) => {
204
+ const clean = m.replace(/\s/g, "");
205
+ if (clean.length >= 5 && clean.length <= 7) {
206
+ return tok.tokenFor("POSTCODE", m, clean.toUpperCase());
207
+ }
208
+ return m;
209
+ });
210
+ const redactZip = (text, tok) => {
211
+ let out = text.replace(ZIP_KW_RE, (_m, kw, zip) => kw + tok.tokenFor("ZIP", zip));
212
+ out = out.replace(ZIP_STATE_RE, (_m, pre, zip) => pre + tok.tokenFor("ZIP", zip));
213
+ return out;
214
+ };
215
+ const redactIp = (text, tok) => text.replace(IP_RE, (m) => tok.tokenFor("IP_ADDRESS", m));
216
+ const redactPlate = (text, tok) => text.replace(UK_PLATE_RE, (m) => tok.tokenFor("VEHICLE_REG", m, m.replace(/\s/g, "").toUpperCase()));
217
+ const redactRelative = (text, tok) => text.replace(RELATIVE_NAME_RE, (m, rel, sep, name) => {
218
+ // The `i` flag (for the relationship word) relaxes the name's
219
+ // capitalisation, so trim to the leading capitalised run — this both
220
+ // rejects "daughter and two sons" and stops "Sarah is the" over-capturing.
221
+ const split = leadingName(name);
222
+ if (!split)
223
+ return m;
224
+ return (rel + sep +
225
+ tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase()) +
226
+ split.rest);
227
+ });
228
+ const redactName = (text, tok) => {
229
+ const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), raw.trim().toLowerCase().replace(/\s+/g, " "));
230
+ // Courtesy-titled names first. Store the full match (title + name) as the
231
+ // original so re-identification restores "Mrs Patricia Hartley" verbatim,
232
+ // but key on the name alone so the same person dedupes across contexts.
233
+ let out = text.replace(NAME_TITLE_RE, (m, name) => tok.tokenFor("PATIENT_NAME", m.trim(), name.trim().toLowerCase().replace(/\s+/g, " ")));
234
+ // Salutations without a courtesy title (clinical titles already excluded).
235
+ out = out.replace(NAME_SALUTATION_RE, (_m, dear, name) => `${dear} ${nameToken(name)}`);
236
+ // Labelled names — preserve the original label + separator. This regex also
237
+ // carries the `i` flag (for the label word), so trim the name the same way.
238
+ out = out.replace(NAME_LABEL_RE, (m, prefix, name) => {
239
+ const split = leadingName(name);
240
+ if (!split)
241
+ return m;
242
+ return prefix + nameToken(split.name) + split.rest;
243
+ });
244
+ return out;
245
+ };
246
+ // Order matters: keyword-anchored and checksum-validated patterns first,
247
+ // weaker heuristics last, so high-confidence matches win any overlap.
248
+ const CLINICAL_PASSES = [
249
+ redactMrn,
250
+ redactDob,
251
+ redactNhs,
252
+ redactNi,
253
+ redactSsn,
254
+ redactEmail,
255
+ redactPhone,
256
+ redactPostcode,
257
+ redactZip,
258
+ redactRelative,
259
+ redactName,
260
+ ];
261
+ const GENERAL_PASSES = [
262
+ redactAccount,
263
+ redactCard,
264
+ redactIban,
265
+ redactUrl,
266
+ redactEmail,
267
+ redactPhone,
268
+ redactPostcode,
269
+ redactZip,
270
+ redactIp,
271
+ redactPlate,
272
+ redactRelative,
273
+ redactName,
274
+ ];
275
+ // Self-check: patterns that should NOT remain in already-redacted text. These
276
+ // are intentionally broad — they flag *possible* leftovers for human review,
277
+ // not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
278
+ const RESIDUAL_CHECKS = [
279
+ { label: "long number (10+ digits)", re: /(?<![\d-])\d[\d\s-]{8,}\d(?![\d-])/g },
280
+ { label: "email address", re: /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g },
281
+ { label: "UK postcode", re: /\b[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}\b/gi },
282
+ { label: "URL", re: /\b(?:https?:\/\/|www\.)\S+/gi },
283
+ ];
284
+ /**
285
+ * A Redactor keeps one Tokeniser across many texts, so the same identifier
286
+ * gets the same token on every sticky note on the board.
287
+ */
288
+ export class Redactor {
289
+ tok = new Tokeniser();
290
+ passes;
291
+ constructor(categories) {
292
+ const seen = new Set();
293
+ const passes = [];
294
+ if (categories.includes("clinical")) {
295
+ for (const p of CLINICAL_PASSES)
296
+ if (!seen.has(p))
297
+ (seen.add(p), passes.push(p));
298
+ }
299
+ if (categories.includes("general")) {
300
+ for (const p of GENERAL_PASSES)
301
+ if (!seen.has(p))
302
+ (seen.add(p), passes.push(p));
303
+ }
304
+ this.passes = passes;
305
+ }
306
+ redactText(input) {
307
+ // Normalise non-breaking spaces so spaced identifiers still match.
308
+ let text = input.replace(/[   ]/g, " ");
309
+ for (const pass of this.passes)
310
+ text = pass(text, this.tok);
311
+ return { text, changed: text !== input };
312
+ }
313
+ /** {token_type: number_of_distinct_values} */
314
+ get report() {
315
+ const report = {};
316
+ for (const token of Object.keys(this.tok.tokenMap)) {
317
+ const type = token.slice(1, -1).replace(/_\d+$/, "");
318
+ report[type] = (report[type] ?? 0) + 1;
319
+ }
320
+ return report;
321
+ }
322
+ /** {token: original_value} — for review / re-identification. Handle with care. */
323
+ get tokenMap() {
324
+ return { ...this.tok.tokenMap };
325
+ }
326
+ }
327
+ /**
328
+ * selfCheck (defined at the end of this file): re-scan already-redacted text for anything that still looks like an
329
+ * identifier, so the UI can warn the user to check manually. Returns one
330
+ * finding per distinct sample (deduplicated, capped). A clean result is not a
331
+ * guarantee — it's a second pair of eyes, not a proof.
332
+ */
333
/**
 * Re-identification: replace tokens with their original values, using a token
 * map produced by an earlier redaction. The inverse of redaction — for putting
 * real data back into AI output before it returns to the board.
 *
 * Tokens always end in "]", so "[NAME_1]" never matches inside "[NAME_10]";
 * plain string replacement of a single token is safe.
 *
 * Tokens are restored in REVERSE map order. Redaction passes run one after
 * another over the same text, so a later pass can capture an earlier token
 * inside the value it stores (e.g. a URL matched after an email inside it was
 * already tokenised is stored as "https://host/?u=[EMAIL_1"). Restoring the
 * most recently created tokens first exposes those nested tokens in time for
 * their own replacement; forward order would leave them in the output.
 *
 * @param text Text containing tokens such as "[NHS_NUMBER_1]".
 * @param tokenMap {token: original_value}, in the order the tokens were created.
 * @returns The restored text, and whether anything was replaced.
 */
export function reinstate(text, tokenMap) {
    let out = text;
    const newestFirst = Object.entries(tokenMap).reverse();
    for (const [token, original] of newestFirst) {
        // Skip empty keys, and skip non-string values: join(undefined) would
        // silently insert commas instead of leaving the token visible.
        if (!token || typeof original !== "string")
            continue;
        out = out.split(token).join(original);
    }
    return { text: out, changed: out !== text };
}
349
/**
 * Decide whether a parsed value can be used as a token map.
 *
 * @param value Anything, typically the result of parsing user-supplied JSON.
 * @returns `true` only for a non-array object with at least one entry where
 *   every key looks like "[TYPE_1]" (upper-case letters/underscores, then an
 *   underscore and a number, in square brackets) and every value is a string.
 */
export function isValidTokenMap(value) {
    const objectLike = typeof value === "object" && value !== null && !Array.isArray(value);
    if (!objectLike) {
        return false;
    }
    const pairs = Object.entries(value);
    // An empty map is treated as unusable rather than vacuously valid.
    if (pairs.length === 0) {
        return false;
    }
    const tokenShape = /^\[[A-Z_]+_\d+\]$/;
    for (const [token, original] of pairs) {
        if (!tokenShape.test(token)) {
            return false;
        }
        if (typeof original !== "string") {
            return false;
        }
    }
    return true;
}
358
/**
 * Second-pass scan of already-redacted text for possible leftovers.
 *
 * Runs every pattern in RESIDUAL_CHECKS over the text and reports each
 * distinct hit once per label (samples are compared case-insensitively).
 * Scanning stops as soon as 20 findings have been collected.
 *
 * @param redactedText Output of a redaction run.
 * @returns Findings as `{ label, sample }`, in the order they were found.
 */
export function selfCheck(redactedText) {
    const reported = new Set();
    const findings = [];
    for (const check of RESIDUAL_CHECKS) {
        for (const hit of redactedText.matchAll(check.re)) {
            const sample = hit[0].trim();
            // A sample that is exactly one of our own tokens, e.g.
            // [NHS_NUMBER_1], is not a leftover.
            const isOwnToken = /^\[[A-Z_]+_\d+\]$/.test(sample);
            const dedupeKey = `${check.label}:${sample.toLowerCase()}`;
            if (isOwnToken || reported.has(dedupeKey)) {
                continue;
            }
            reported.add(dedupeKey);
            findings.push({ label: check.label, sample });
            if (findings.length >= 20) {
                return findings;
            }
        }
    }
    return findings;
}
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "@pharmatools/redacta",
3
+ "version": "1.1.1",
4
+ "description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
5
+ "license": "MIT-0",
6
+ "author": "Nick Lamb (PharmaTools.AI)",
7
+ "homepage": "https://www.pharmatools.ai/redacta",
8
+ "repository": { "type": "git", "url": "https://github.com/nickjlamb/redacta" },
9
+ "type": "module",
10
+ "main": "dist/index.js",
11
+ "types": "dist/index.d.ts",
12
+ "exports": {
13
+ ".": {
14
+ "types": "./dist/index.d.ts",
15
+ "import": "./dist/index.js"
16
+ }
17
+ },
18
+ "files": [
19
+ "dist"
20
+ ],
21
+ "keywords": [
22
+ "redaction",
23
+ "pseudonymisation",
24
+ "de-identification",
25
+ "anonymisation",
26
+ "pii",
27
+ "phi",
28
+ "healthcare",
29
+ "nhs",
30
+ "privacy",
31
+ "typescript"
32
+ ],
33
+ "scripts": {
34
+ "build": "tsc",
35
+ "test": "vitest run",
36
+ "prepublishOnly": "npm run build"
37
+ },
38
+ "devDependencies": {
39
+ "typescript": "^5.4.0",
40
+ "vitest": "^1.6.0"
41
+ }
42
+ }