@pharmatools/redacta 1.1.1 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/redact.d.ts +1 -1
- package/dist/redact.js +90 -14
- package/package.json +6 -3
package/README.md
CHANGED
|
@@ -36,6 +36,25 @@ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
|
|
|
36
36
|
Same value → same token across a `Redactor` instance; the `tokenMap` reverses
|
|
37
37
|
the redaction. No DOM, no network, no storage.
|
|
38
38
|
|
|
39
|
+
### Safe Harbor mode
|
|
40
|
+
|
|
41
|
+
`new Redactor(["safeharbor"])` applies a stricter, US-focused pass aligned with
|
|
42
|
+
the HIPAA Safe Harbor method (§164.514). It implies `clinical` + `general` and
|
|
43
|
+
adds: **all** dates (not just DOB — appointment dates included), specific ages,
|
|
44
|
+
fax numbers, certificate/licence numbers, device serial numbers, VINs, and
|
|
45
|
+
health-plan/beneficiary numbers.
|
|
46
|
+
|
|
47
|
+
```ts
|
|
48
|
+
const r = new Redactor(["safeharbor"]);
|
|
49
|
+
r.redactText("73-year-old, appointment 15 March 2026, fax 0113 496 1234").text;
|
|
50
|
+
// "[AGE_1], appointment [DATE_1], fax [FAX_1]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Note: this over-redacts slightly versus the letter of the standard (it removes
|
|
54
|
+
all specific ages and full dates rather than only ages 90+ and date elements
|
|
55
|
+
beyond the year) — deliberately, on the safe side. Biometric and photographic
|
|
56
|
+
identifiers are out of scope for a text engine. Not legal advice; review output.
|
|
57
|
+
|
|
39
58
|
## API
|
|
40
59
|
|
|
41
60
|
- `new Redactor(categories: ("clinical" | "general" | "safeharbor")[])` — `.redactText(s)`,
|
package/dist/redact.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* (patients, relatives, carers — clinician names preserved), self-checks the
|
|
7
7
|
* output, and reverses the process from a token map.
|
|
8
8
|
*/
|
|
9
|
-
export type Category = "clinical" | "general";
|
|
9
|
+
export type Category = "clinical" | "general" | "safeharbor";
|
|
10
10
|
/** Validate a 10-digit NHS number using the Modulus-11 check digit. */
|
|
11
11
|
export declare function isValidNhs(digits: string): boolean;
|
|
12
12
|
/** Validate the two-letter prefix of a UK National Insurance number. */
|
package/dist/redact.js
CHANGED
|
@@ -115,7 +115,10 @@ const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
|
|
|
115
115
|
// carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
|
|
116
116
|
// skill's "don't redact the treating clinician" rule. Names buried in free
|
|
117
117
|
// prose are NOT caught; the UI tells users to review.
|
|
118
|
-
|
|
118
|
+
// A single name word: "Eileen", "O'Brien" (apostrophe directly after the
|
|
119
|
+
// initial capital), "Kowalski-Nowak", "O'Brien-Smith".
|
|
120
|
+
// Regex source for a single name word: "Eileen", "O'Brien" (apostrophe
// directly after the initial capital), "Kowalski-Nowak", "O'Brien-Smith".
// One internal capitalised segment is also accepted ("McDonald", "MacLeod",
// "FitzGerald"); without it the pattern stops after "Mc"/"Mac", so a match
// built on this source ends in the middle of the surname.
// The optional tail allows one apostrophe- or hyphen-joined part.
const NAME_WORD = String.raw `[A-Z](?:[a-z]+|['’][A-Z][a-z]+)(?:[A-Z][a-z]+)?(?:['’\-][A-Za-z]+)?`;
|
|
121
|
+
// Regex source for a full name: one NAME_WORD followed by up to two more,
// each separated by a run of spaces/tabs (the separator class has no newline).
const NAME = [NAME_WORD, String.raw `(?:[ \t]+`, NAME_WORD, "){0,2}"].join("");
|
|
119
122
|
// Case-sensitive, anchored version. Used to trim a loosely-captured name down
|
|
120
123
|
// to its leading run of properly capitalised words — necessary because the
|
|
121
124
|
// label/relative regexes carry the `i` flag (for the keyword), which would
|
|
@@ -214,17 +217,34 @@ const redactZip = (text, tok) => {
|
|
|
214
217
|
};
|
|
215
218
|
// Replace every IP_RE match in `text` with the token that
// tok.tokenFor("IP_ADDRESS", match) returns.
const redactIp = (text, tok) => {
    const toToken = (address) => tok.tokenFor("IP_ADDRESS", address);
    return text.replace(IP_RE, toToken);
};
|
|
216
219
|
// Replace every UK_PLATE_RE match with a VEHICLE_REG token. The third
// tokenFor argument is the match with whitespace removed and upper-cased
// (presumably the lookup key, so spacing variants share a token; confirm
// against Tokeniser).
const redactPlate = (text, tok) => {
    return text.replace(UK_PLATE_RE, (plate) => {
        const compact = plate.replace(/\s/g, "").toUpperCase();
        return tok.tokenFor("VEHICLE_REG", plate, compact);
    });
};
|
|
217
|
-
const redactRelative = (text, tok) =>
|
|
218
|
-
//
|
|
219
|
-
//
|
|
220
|
-
//
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
220
|
+
const redactRelative = (text, tok) => {
|
|
221
|
+
// Manual exec loop rather than String.replace: when the strict-name trim
|
|
222
|
+
// fails (the `i` flag lets the loose capture open with lowercase words, e.g.
|
|
223
|
+
// "Next of kin: her daughter Anita" captured after "next of kin"), a replace
|
|
224
|
+
// callback would consume the whole region and swallow the inner
|
|
225
|
+
// "daughter Anita" match. Here we roll the scan back to just after the
|
|
226
|
+
// relation word instead, so nested relation phrases still match.
|
|
227
|
+
let out = "";
|
|
228
|
+
let last = 0;
|
|
229
|
+
RELATIVE_NAME_RE.lastIndex = 0;
|
|
230
|
+
let m;
|
|
231
|
+
while ((m = RELATIVE_NAME_RE.exec(text)) !== null) {
|
|
232
|
+
const [, rel, sep, name] = m;
|
|
233
|
+
// Trim to the leading capitalised run — this both rejects "daughter and
|
|
234
|
+
// two sons" and stops "Sarah is the" over-capturing.
|
|
235
|
+
const split = leadingName(name);
|
|
236
|
+
if (!split) {
|
|
237
|
+
RELATIVE_NAME_RE.lastIndex = m.index + rel.length;
|
|
238
|
+
continue;
|
|
239
|
+
}
|
|
240
|
+
out +=
|
|
241
|
+
text.slice(last, m.index) + rel + sep +
|
|
242
|
+
tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase());
|
|
243
|
+
last = m.index + rel.length + sep.length + split.name.length;
|
|
244
|
+
RELATIVE_NAME_RE.lastIndex = last;
|
|
245
|
+
}
|
|
246
|
+
return out + text.slice(last);
|
|
247
|
+
};
|
|
228
248
|
const redactName = (text, tok) => {
|
|
229
249
|
const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), raw.trim().toLowerCase().replace(/\s+/g, " "));
|
|
230
250
|
// Courtesy-titled names first. Store the full match (title + name) as the
|
|
@@ -243,6 +263,35 @@ const redactName = (text, tok) => {
|
|
|
243
263
|
});
|
|
244
264
|
return out;
|
|
245
265
|
};
|
|
266
|
+
// --- Safe Harbor extras (HIPAA §164.514(b)(2)) -----------------------------
|
|
267
|
+
// Stricter passes layered on top of clinical + general for full Safe Harbor
|
|
268
|
+
// de-identification: ALL dates (not just DOB), specific ages, fax numbers,
|
|
269
|
+
// certificate/licence numbers, device serials, VINs, and health-plan numbers.
|
|
270
|
+
// Global matcher for any date: the DATE pattern source (defined elsewhere in
// this file) wrapped in a non-capturing group, with no keyword anchor.
const ANY_DATE_RE = new RegExp("(?:" + DATE + ")", "g");
|
|
271
|
+
// Age written as a phrase: 1-3 digits, an optional space or hyphen, then
// "year old" / "years old" (space or hyphen optional before "old"), "yo" or
// "y/o". Case-insensitive, global. Examples: "73-year-old", "73 years old",
// "5yo".
const AGE_PHRASE_RE = /\b\d{1,3}[\s-]?(?:years?[\s-]?old|y\/?o)\b/gi;
|
|
272
|
+
// Keyword-labelled age such as "Age: 73" or "aged 73". Captures:
//   1 = the keyword ("aged" or "age"),
//   2 = the separator (one or more colons / whitespace characters),
//   3 = the 1-3 digit age.
// Case-insensitive, global.
const AGE_LABEL_RE = /\b(aged|age)([:\s]+)(\d{1,3})\b/gi;
|
|
273
|
+
// Keyword-anchored fax number. Captures:
//   1 = "fax", an optional "no" / "no." / "number" / "#", and the separator
//       (one or more colons / whitespace characters),
//   2 = the number: optional "+", a digit or "(", at least six characters
//       from digits, parentheses, dots, whitespace and hyphens, then a final
//       digit.
// Case-insensitive, global.
const FAX_RE = /\b(fax(?:\s*(?:no\.?|number|#))?[:\s]+)(\+?[\d(][\d().\s-]{6,}\d)/gi;
|
|
274
|
+
// Keyword-anchored licence / certificate / registration number. Captures:
//   1 = keyword ("licence", "license", "certificate", "cert", "cert." or
//       "registration"), an optional "no" / "no." / "number" / "#", and the
//       separator (one or more colons / whitespace characters),
//   2 = the identifier: an alphanumeric followed by at least three
//       alphanumerics or hyphens.
// NOTE(review): the `i` flag also makes the identifier classes match
// lowercase letters, so an ordinary word of four or more letters directly
// after the keyword (e.g. "registration required") matches as an identifier.
const LICENSE_RE = /\b((?:licen[cs]e|certificate|cert\.?|registration)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{3,})/gi;
|
|
275
|
+
// Keyword-anchored device identifier. Captures:
//   1 = keyword ("serial", "device id" / "device identifier" / "device no" /
//       "device number", or "imei"), an optional "no" / "no." / "number" /
//       "#", and the separator (one or more colons / whitespace characters),
//   2 = the identifier: an alphanumeric followed by at least four
//       alphanumerics or hyphens.
// NOTE(review): under the `i` flag the identifier classes match lowercase
// too, so a plain word of five or more letters after the keyword also matches.
const DEVICE_RE = /\b((?:serial|device\s*(?:id|identifier|no\.?|number)|imei)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
|
|
276
|
+
// Candidate VIN: exactly 17 word-bounded characters drawn from digits and
// uppercase letters, with I, O and Q excluded by the character class.
// Case-sensitive (no `i` flag), global.
const VIN_RE = /\b[A-HJ-NPR-Z0-9]{17}\b/g;
|
|
277
|
+
// Keyword-anchored health-plan / beneficiary number. Captures:
//   1 = keyword ("health plan", "beneficiary", "medicare" or "medicaid"), an
//       optional "id" / "no" / "no." / "number" / "#", and the separator
//       (one or more colons / whitespace characters),
//   2 = the identifier: an alphanumeric followed by at least four
//       alphanumerics or hyphens.
// NOTE(review): under the `i` flag the identifier classes match lowercase
// too, so a plain word of five or more letters after the keyword also matches.
const HEALTH_PLAN_RE = /\b((?:health\s*plan|beneficiary|medicare|medicaid)\s*(?:id|no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
|
|
278
|
+
// Safe Harbor date pass: every ANY_DATE_RE match in `text` is replaced by the
// token returned from tok.tokenFor("DATE", match).
const redactAllDates = (text, tok) => {
    return text.replace(ANY_DATE_RE, (dateText) => tok.tokenFor("DATE", dateText));
};
|
|
279
|
+
const redactAge = (text, tok) => {
|
|
280
|
+
let out = text.replace(AGE_PHRASE_RE, (m) => tok.tokenFor("AGE", m.trim(), m.replace(/\D/g, "")));
|
|
281
|
+
out = out.replace(AGE_LABEL_RE, (_m, kw, sep, num) => kw + sep + tok.tokenFor("AGE", num));
|
|
282
|
+
return out;
|
|
283
|
+
};
|
|
284
|
+
// Replace the number part of each FAX_RE match with a FAX token, keeping the
// captured keyword prefix. tokenFor receives the trimmed number and
// digitsOf(number) (presumably a digits-only lookup key; confirm against
// digitsOf).
const redactFax = (text, tok) => {
    return text.replace(FAX_RE, (_whole, label, number) => {
        const token = tok.tokenFor("FAX", number.trim(), digitsOf(number));
        return label + token;
    });
};
|
|
285
|
+
// Replace the identifier part of each LICENSE_RE match with a LICENSE token,
// keeping the captured keyword prefix. The upper-cased identifier is passed
// as tokenFor's third argument.
const redactLicense = (text, tok) => {
    return text.replace(LICENSE_RE, (_whole, label, ident) => {
        const token = tok.tokenFor("LICENSE", ident, ident.toUpperCase());
        return label + token;
    });
};
|
|
286
|
+
// Replace the identifier part of each DEVICE_RE match with a DEVICE_ID token,
// keeping the captured keyword prefix. The upper-cased identifier is passed
// as tokenFor's third argument.
const redactDevice = (text, tok) => {
    return text.replace(DEVICE_RE, (_whole, label, ident) => {
        const token = tok.tokenFor("DEVICE_ID", ident, ident.toUpperCase());
        return label + token;
    });
};
|
|
287
|
+
const redactVin = (text, tok) => text.replace(VIN_RE, (m) => {
|
|
288
|
+
// Require both a digit and a letter, so we don't grab a 17-char all-alpha
|
|
289
|
+
// word or an all-digit run.
|
|
290
|
+
if (!/\d/.test(m) || !/[A-Z]/.test(m))
|
|
291
|
+
return m;
|
|
292
|
+
return tok.tokenFor("VIN", m, m.toUpperCase());
|
|
293
|
+
});
|
|
294
|
+
// Replace the identifier part of each HEALTH_PLAN_RE match with a
// HEALTH_PLAN_NUMBER token, keeping the captured keyword prefix. The
// upper-cased identifier is passed as tokenFor's third argument.
const redactHealthPlan = (text, tok) => {
    return text.replace(HEALTH_PLAN_RE, (_whole, label, ident) => {
        const token = tok.tokenFor("HEALTH_PLAN_NUMBER", ident, ident.toUpperCase());
        return label + token;
    });
};
|
|
246
295
|
// Order matters: keyword-anchored and checksum-validated patterns first,
|
|
247
296
|
// weaker heuristics last, so high-confidence matches win any overlap.
|
|
248
297
|
const CLINICAL_PASSES = [
|
|
@@ -272,6 +321,19 @@ const GENERAL_PASSES = [
|
|
|
272
321
|
redactRelative,
|
|
273
322
|
redactName,
|
|
274
323
|
];
|
|
324
|
+
// redactFax must run BEFORE the generic phone pass, or a fax number is claimed
|
|
325
|
+
// as [PHONE]. It's keyword-anchored ("Fax: ...") so running first is safe.
|
|
326
|
+
const SAFE_HARBOR_PRE_PASSES = [redactFax];
|
|
327
|
+
// Layered after clinical + general. redactAllDates runs last so keyword DOBs are
|
|
328
|
+
// already [DATE_OF_BIRTH] and only the remaining dates (appointments) → [DATE].
|
|
329
|
+
const SAFE_HARBOR_EXTRA_PASSES = [
|
|
330
|
+
redactAge,
|
|
331
|
+
redactLicense,
|
|
332
|
+
redactDevice,
|
|
333
|
+
redactVin,
|
|
334
|
+
redactHealthPlan,
|
|
335
|
+
redactAllDates,
|
|
336
|
+
];
|
|
275
337
|
// Self-check: patterns that should NOT remain in already-redacted text. These
|
|
276
338
|
// are intentionally broad — they flag *possible* leftovers for human review,
|
|
277
339
|
// not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
|
|
@@ -289,18 +351,32 @@ export class Redactor {
|
|
|
289
351
|
tok = new Tokeniser();
|
|
290
352
|
passes;
|
|
291
353
|
constructor(categories) {
|
|
354
|
+
// Safe Harbor is the strictest mode and implies clinical + general plus the
|
|
355
|
+
// extra Safe Harbor passes (all dates, ages, fax, licence, device, VIN,
|
|
356
|
+
// health-plan numbers).
|
|
357
|
+
const safeHarbor = categories.includes("safeharbor");
|
|
292
358
|
const seen = new Set();
|
|
293
359
|
const passes = [];
|
|
294
|
-
if (categories.includes("clinical")) {
|
|
360
|
+
if (safeHarbor) {
|
|
361
|
+
for (const p of SAFE_HARBOR_PRE_PASSES)
|
|
362
|
+
if (!seen.has(p))
|
|
363
|
+
(seen.add(p), passes.push(p));
|
|
364
|
+
}
|
|
365
|
+
if (categories.includes("clinical") || safeHarbor) {
|
|
295
366
|
for (const p of CLINICAL_PASSES)
|
|
296
367
|
if (!seen.has(p))
|
|
297
368
|
(seen.add(p), passes.push(p));
|
|
298
369
|
}
|
|
299
|
-
if (categories.includes("general")) {
|
|
370
|
+
if (categories.includes("general") || safeHarbor) {
|
|
300
371
|
for (const p of GENERAL_PASSES)
|
|
301
372
|
if (!seen.has(p))
|
|
302
373
|
(seen.add(p), passes.push(p));
|
|
303
374
|
}
|
|
375
|
+
if (safeHarbor) {
|
|
376
|
+
for (const p of SAFE_HARBOR_EXTRA_PASSES)
|
|
377
|
+
if (!seen.has(p))
|
|
378
|
+
(seen.add(p), passes.push(p));
|
|
379
|
+
}
|
|
304
380
|
this.passes = passes;
|
|
305
381
|
}
|
|
306
382
|
redactText(input) {
|
package/package.json
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pharmatools/redacta",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
|
|
5
5
|
"license": "MIT-0",
|
|
6
6
|
"author": "Nick Lamb (PharmaTools.AI)",
|
|
7
7
|
"homepage": "https://www.pharmatools.ai/redacta",
|
|
8
|
-
"repository": {
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "https://github.com/nickjlamb/redacta"
|
|
11
|
+
},
|
|
9
12
|
"type": "module",
|
|
10
13
|
"main": "dist/index.js",
|
|
11
14
|
"types": "dist/index.d.ts",
|
|
@@ -37,6 +40,6 @@
|
|
|
37
40
|
},
|
|
38
41
|
"devDependencies": {
|
|
39
42
|
"typescript": "^5.4.0",
|
|
40
|
-
"vitest": "^1.
|
|
43
|
+
"vitest": "^4.1.8"
|
|
41
44
|
}
|
|
42
45
|
}
|