@pharmatools/redacta 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -0
- package/dist/redact.d.ts +1 -1
- package/dist/redact.js +58 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -36,6 +36,25 @@ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
|
|
|
36
36
|
Same value → same token across a `Redactor` instance; the `tokenMap` reverses
|
|
37
37
|
the redaction. No DOM, no network, no storage.
|
|
38
38
|
|
|
39
|
+
### Safe Harbor mode
|
|
40
|
+
|
|
41
|
+
`new Redactor(["safeharbor"])` applies a stricter, US-focused pass aligned with
|
|
42
|
+
the HIPAA Safe Harbor method (§164.514). It implies `clinical` + `general` and
|
|
43
|
+
adds: **all** dates (not just DOB — appointment dates included), specific ages,
|
|
44
|
+
fax numbers, certificate/licence numbers, device serial numbers, VINs, and
|
|
45
|
+
health-plan/beneficiary numbers.
|
|
46
|
+
|
|
47
|
+
```ts
|
|
48
|
+
const r = new Redactor(["safeharbor"]);
|
|
49
|
+
r.redactText("73-year-old, appointment 15 March 2026, fax 0113 496 1234").text;
|
|
50
|
+
// "[AGE_1], appointment [DATE_1], fax [FAX_1]"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Note: this over-redacts slightly versus the letter of the standard (it removes
|
|
54
|
+
all specific ages and full dates rather than only ages 90+ and date elements
|
|
55
|
+
beyond the year) — deliberately, on the safe side. Biometric and photographic
|
|
56
|
+
identifiers are out of scope for a text engine. Not legal advice; review output.
|
|
57
|
+
|
|
39
58
|
## API
|
|
40
59
|
|
|
41
60
|
- `new Redactor(categories: ("clinical" | "general" | "safeharbor")[])` — `.redactText(s)`,
|
package/dist/redact.d.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
* (patients, relatives, carers — clinician names preserved), self-checks the
|
|
7
7
|
* output, and reverses the process from a token map.
|
|
8
8
|
*/
|
|
9
|
-
export type Category = "clinical" | "general";
|
|
9
|
+
export type Category = "clinical" | "general" | "safeharbor";
|
|
10
10
|
/** Validate a 10-digit NHS number using the Modulus-11 check digit. */
|
|
11
11
|
export declare function isValidNhs(digits: string): boolean;
|
|
12
12
|
/** Validate the two-letter prefix of a UK National Insurance number. */
|
package/dist/redact.js
CHANGED
|
@@ -243,6 +243,35 @@ const redactName = (text, tok) => {
|
|
|
243
243
|
});
|
|
244
244
|
return out;
|
|
245
245
|
};
|
|
246
|
+
// --- Safe Harbor extras (HIPAA §164.514(b)(2)) -----------------------------
|
|
247
|
+
// Stricter passes layered on top of clinical + general for full Safe Harbor
|
|
248
|
+
// de-identification: ALL dates (not just DOB), specific ages, fax numbers,
|
|
249
|
+
// certificate/licence numbers, device serials, VINs, and health-plan numbers.
|
|
250
|
+
// Global matcher built from the shared DATE pattern source (a string defined
// outside this hunk), wrapped in a non-capturing group. Used to replace every
// match, with no keyword anchor of its own.
const ANY_DATE_RE = new RegExp("(?:" + DATE + ")", "g");
|
|
251
|
+
// Age written as a phrase: 1–3 digits, an optional space or hyphen, then
// "year old" / "years old" (space or hyphen optional between the words) or
// "y/o" / "yo". Case-insensitive, global. The whole phrase is the match.
const AGE_PHRASE_RE = /\b\d{1,3}[\s-]?(?:years?[\s-]?old|y\/?o)\b/gi;
|
|
252
|
+
// Age written after a label: group 1 is the keyword ("aged" or "age"), group 2
// the separator (one or more colons / whitespace), group 3 the 1–3 digit age.
// Case-insensitive, global.
const AGE_LABEL_RE = /\b(aged|age)([:\s]+)(\d{1,3})\b/gi;
|
|
253
|
+
// Keyword-anchored fax number. Group 1 is the label: "fax", optionally followed
// by "no", "no.", "number" or "#", then one or more colons / whitespace.
// Group 2 is the number: optional "+", a digit or "(", at least six characters
// drawn from digits, parentheses, dots, whitespace and hyphens, ending in a digit.
// Case-insensitive, global.
const FAX_RE = /\b(fax(?:\s*(?:no\.?|number|#))?[:\s]+)(\+?[\d(][\d().\s-]{6,}\d)/gi;
|
|
254
|
+
// Keyword-anchored certificate/licence number. Group 1 is the label: "licence",
// "license", "certificate", "cert", "cert." or "registration", optionally
// followed by "no", "no.", "number" or "#", then colons / whitespace.
// Group 2 is the identifier: an alphanumeric followed by at least three
// alphanumerics or hyphens (four characters minimum).
// NOTE(review): because of the `i` flag the identifier class also accepts
// lowercase letters, so an ordinary word of 4+ letters after the keyword
// (e.g. "licence holder") is treated as an identifier — confirm this
// over-redaction is intended.
const LICENSE_RE = /\b((?:licen[cs]e|certificate|cert\.?|registration)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{3,})/gi;
|
|
255
|
+
// Keyword-anchored device identifier. Group 1 is the label: "serial",
// "device" + ("id" | "identifier" | "no" | "no." | "number"), or "imei",
// optionally followed by "no", "no.", "number" or "#", then colons / whitespace.
// Group 2 is the identifier: an alphanumeric followed by at least four
// alphanumerics or hyphens (five characters minimum). Case-insensitive, global.
const DEVICE_RE = /\b((?:serial|device\s*(?:id|identifier|no\.?|number)|imei)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
|
|
256
|
+
// VIN candidate: exactly 17 word-bounded characters drawn from digits and the
// uppercase letters A–H, J–N, P and R–Z (I, O and Q are excluded). There is no
// `i` flag, so lowercase runs do not match. Not keyword-anchored.
const VIN_RE = /\b[A-HJ-NPR-Z0-9]{17}\b/g;
|
|
257
|
+
// Keyword-anchored health-plan / beneficiary number. Group 1 is the label:
// "health plan" (whitespace optional), "beneficiary", "medicare" or "medicaid",
// optionally followed by "id", "no", "no.", "number" or "#", then colons /
// whitespace. Group 2 is the identifier: an alphanumeric followed by at least
// four alphanumerics or hyphens (five characters minimum).
// Case-insensitive, global.
const HEALTH_PLAN_RE = /\b((?:health\s*plan|beneficiary|medicare|medicaid)\s*(?:id|no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
|
|
258
|
+
const redactAllDates = (text, tok) => text.replace(ANY_DATE_RE, (m) => tok.tokenFor("DATE", m));
|
|
259
|
+
// Safe Harbor pass: replace specific ages with "AGE" tokens. Handles two
// shapes in sequence — the phrase form (AGE_PHRASE_RE) and then the labelled
// form (AGE_LABEL_RE) on the result of the first replacement.
const redactAge = (text, tok) => {
|
|
260
|
+
// Phrase form: the whole match is replaced. The trimmed match is passed as the
// value and its digits-only form as a third argument.
// NOTE(review): the labelled form below passes only the number. If the
// tokeniser uses the third argument as a dedupe key that defaults to the value,
// "73-year-old" and "aged 73" would share one token — confirm that restoring
// from the token map is acceptable in that case.
let out = text.replace(AGE_PHRASE_RE, (m) => tok.tokenFor("AGE", m.trim(), m.replace(/\D/g, "")));
|
|
261
|
+
// Labelled form: keyword and separator are kept verbatim; only the number is
// replaced by a token.
out = out.replace(AGE_LABEL_RE, (_m, kw, sep, num) => kw + sep + tok.tokenFor("AGE", num));
|
|
262
|
+
return out;
|
|
263
|
+
};
|
|
264
|
+
const redactFax = (text, tok) => text.replace(FAX_RE, (_m, kw, num) => kw + tok.tokenFor("FAX", num.trim(), digitsOf(num)));
|
|
265
|
+
const redactLicense = (text, tok) => text.replace(LICENSE_RE, (_m, kw, id) => kw + tok.tokenFor("LICENSE", id, id.toUpperCase()));
|
|
266
|
+
const redactDevice = (text, tok) => text.replace(DEVICE_RE, (_m, kw, id) => kw + tok.tokenFor("DEVICE_ID", id, id.toUpperCase()));
|
|
267
|
+
// Safe Harbor pass: replace 17-character VIN candidates (VIN_RE) with "VIN"
// tokens. Candidates that are all letters or all digits are returned
// unchanged. VIN_RE only matches uppercase letters and digits, so the
// `toUpperCase()` below does not change the matched text.
const redactVin = (text, tok) => text.replace(VIN_RE, (m) => {
|
|
268
|
+
// Require both a digit and a letter, so we don't grab a 17-char all-alpha
|
|
269
|
+
// word or an all-digit run.
|
|
270
|
+
if (!/\d/.test(m) || !/[A-Z]/.test(m))
|
|
271
|
+
return m;
|
|
272
|
+
return tok.tokenFor("VIN", m, m.toUpperCase());
|
|
273
|
+
});
|
|
274
|
+
const redactHealthPlan = (text, tok) => text.replace(HEALTH_PLAN_RE, (_m, kw, id) => kw + tok.tokenFor("HEALTH_PLAN_NUMBER", id, id.toUpperCase()));
|
|
246
275
|
// Order matters: keyword-anchored and checksum-validated patterns first,
|
|
247
276
|
// weaker heuristics last, so high-confidence matches win any overlap.
|
|
248
277
|
const CLINICAL_PASSES = [
|
|
@@ -272,6 +301,19 @@ const GENERAL_PASSES = [
|
|
|
272
301
|
redactRelative,
|
|
273
302
|
redactName,
|
|
274
303
|
];
|
|
304
|
+
// redactFax must run BEFORE the generic phone pass, or a fax number is claimed
|
|
305
|
+
// as [PHONE]. It's keyword-anchored ("Fax: ...") so running first is safe.
|
|
306
|
+
// Passes the Redactor constructor queues ahead of the clinical and general
// passes when "safeharbor" is requested.
const SAFE_HARBOR_PRE_PASSES = [redactFax];
|
|
307
|
+
// Layered after clinical + general. redactAllDates runs last so keyword DOBs are
|
|
308
|
+
// already [DATE_OF_BIRTH] and only the remaining dates (appointments) → [DATE].
|
|
309
|
+
// Passes the Redactor constructor appends after the clinical and general
// passes when "safeharbor" is requested; they run in the order listed.
const SAFE_HARBOR_EXTRA_PASSES = [
|
|
310
|
+
redactAge,
|
|
311
|
+
redactLicense,
|
|
312
|
+
redactDevice,
|
|
313
|
+
redactVin,
|
|
314
|
+
redactHealthPlan,
|
|
315
|
+
// Generic date pass — keep as the final entry.
redactAllDates,
|
|
316
|
+
];
|
|
275
317
|
// Self-check: patterns that should NOT remain in already-redacted text. These
|
|
276
318
|
// are intentionally broad — they flag *possible* leftovers for human review,
|
|
277
319
|
// not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
|
|
@@ -289,18 +331,32 @@ export class Redactor {
|
|
|
289
331
|
tok = new Tokeniser();
|
|
290
332
|
passes;
|
|
291
333
|
constructor(categories) {
|
|
334
|
+
// Safe Harbor is the strictest mode and implies clinical + general plus the
|
|
335
|
+
// extra Safe Harbor passes (all dates, ages, fax, licence, device, VIN,
|
|
336
|
+
// health-plan numbers).
|
|
337
|
+
const safeHarbor = categories.includes("safeharbor");
|
|
292
338
|
const seen = new Set();
|
|
293
339
|
const passes = [];
|
|
294
|
-
if (categories.includes("clinical")) {
|
|
340
|
+
if (safeHarbor) {
|
|
341
|
+
for (const p of SAFE_HARBOR_PRE_PASSES)
|
|
342
|
+
if (!seen.has(p))
|
|
343
|
+
(seen.add(p), passes.push(p));
|
|
344
|
+
}
|
|
345
|
+
if (categories.includes("clinical") || safeHarbor) {
|
|
295
346
|
for (const p of CLINICAL_PASSES)
|
|
296
347
|
if (!seen.has(p))
|
|
297
348
|
(seen.add(p), passes.push(p));
|
|
298
349
|
}
|
|
299
|
-
if (categories.includes("general")) {
|
|
350
|
+
if (categories.includes("general") || safeHarbor) {
|
|
300
351
|
for (const p of GENERAL_PASSES)
|
|
301
352
|
if (!seen.has(p))
|
|
302
353
|
(seen.add(p), passes.push(p));
|
|
303
354
|
}
|
|
355
|
+
if (safeHarbor) {
|
|
356
|
+
for (const p of SAFE_HARBOR_EXTRA_PASSES)
|
|
357
|
+
if (!seen.has(p))
|
|
358
|
+
(seen.add(p), passes.push(p));
|
|
359
|
+
}
|
|
304
360
|
this.passes = passes;
|
|
305
361
|
}
|
|
306
362
|
redactText(input) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@pharmatools/redacta",
|
|
3
|
-
"version": "1.1.1",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
|
|
5
5
|
"license": "MIT-0",
|
|
6
6
|
"author": "Nick Lamb (PharmaTools.AI)",
|