@pharmatools/redacta 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -36,6 +36,25 @@ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
36
36
  Same value → same token across a `Redactor` instance; the `tokenMap` reverses
37
37
  the redaction. No DOM, no network, no storage.
38
38
 
39
+ ### Safe Harbor mode
40
+
41
+ `new Redactor(["safeharbor"])` applies a stricter, US-focused pass aligned with
42
+ the HIPAA Safe Harbor method (45 CFR §164.514(b)(2)). It implies `clinical` + `general` and
43
+ adds: **all** dates (not just DOB — appointment dates included), specific ages,
44
+ fax numbers, certificate/licence numbers, device serial numbers, VINs, and
45
+ health-plan/beneficiary numbers.
46
+
47
+ ```ts
48
+ const r = new Redactor(["safeharbor"]);
49
+ r.redactText("73-year-old, appointment 15 March 2026, fax 0113 496 1234").text;
50
+ // "[AGE_1], appointment [DATE_1], fax [FAX_1]"
51
+ ```
52
+
53
+ Note: this over-redacts slightly versus the letter of the standard (it removes
54
+ all specific ages and full dates rather than only ages 90+ and date elements
55
+ beyond the year) — deliberately, on the safe side. Biometric and photographic
56
+ identifiers are out of scope for a text engine. Not legal advice; review output.
57
+
39
58
  ## API
40
59
 
41
60
  - `new Redactor(categories: ("clinical" | "general" | "safeharbor")[])` — `.redactText(s)`,
package/dist/redact.d.ts CHANGED
@@ -6,7 +6,7 @@
6
6
  * (patients, relatives, carers — clinician names preserved), self-checks the
7
7
  * output, and reverses the process from a token map.
8
8
  */
9
- export type Category = "clinical" | "general";
9
+ export type Category = "clinical" | "general" | "safeharbor";
10
10
  /** Validate a 10-digit NHS number using the Modulus-11 check digit. */
11
11
  export declare function isValidNhs(digits: string): boolean;
12
12
  /** Validate the two-letter prefix of a UK National Insurance number. */
package/dist/redact.js CHANGED
@@ -243,6 +243,35 @@ const redactName = (text, tok) => {
243
243
  });
244
244
  return out;
245
245
  };
246
+ // --- Safe Harbor extras (HIPAA §164.514(b)(2)) -----------------------------
247
+ // Stricter passes layered on top of clinical + general for full Safe Harbor
248
+ // de-identification: ALL dates (not just DOB), specific ages, fax numbers,
249
+ // certificate/licence numbers, device serials, VINs, and health-plan numbers.
250
+ const ANY_DATE_RE = new RegExp("(?:" + DATE + ")", "g");
251
// Age written as a phrase: "73-year-old", "73 years old", "73 y/o", "73yo",
// plus the abbreviated "73 yrs old" / "73-yr-old" forms.
const AGE_PHRASE_RE = /\b\d{1,3}[\s-]?(?:(?:years?|yrs?)[\s-]?old|y\/?o)\b/gi;
// Age introduced by a label: "age 73", "aged: 73".
// Groups: 1 = keyword, 2 = separator, 3 = the number.
const AGE_LABEL_RE = /\b(aged|age)([:\s]+)(\d{1,3})\b/gi;
// Keyword-anchored identifiers. In each pattern group 1 is the keyword plus
// its separator (kept in the output) and group 2 is the value to tokenise.
const FAX_RE = /\b(fax(?:\s*(?:no\.?|number|#))?[:\s]+)(\+?[\d(][\d().\s-]{6,}\d)/gi;
const LICENSE_RE = /\b((?:licen[cs]e|certificate|cert\.?|registration)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{3,})/gi;
const DEVICE_RE = /\b((?:serial|device\s*(?:id|identifier|no\.?|number)|imei)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
// Unanchored: any 17-character run over the alphabet that excludes I, O and Q.
// Case-insensitive so a VIN typed in lower case is caught as well; redactVin
// applies the extra "has a digit and a letter" filter.
const VIN_RE = /\b[A-HJ-NPR-Z0-9]{17}\b/gi;
const HEALTH_PLAN_RE = /\b((?:health\s*plan|beneficiary|medicare|medicaid)\s*(?:id|no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;

// NOTE(review): every pass takes (text, tok) and returns the rewritten text.
// `tok.tokenFor(kind, value, key?)` is defined elsewhere; the optional third
// argument looks like a normalised lookup key so that differently formatted
// spellings of one value share a token — confirm against the Tokeniser.

// Replace every match of ANY_DATE_RE with a DATE token.
const redactAllDates = (text, tok) => text.replace(ANY_DATE_RE, (m) => tok.tokenFor("DATE", m));
// Replace ages: whole phrases first (keyed by their digits), then bare numbers
// that follow an "age"/"aged" label, keeping the label and separator.
const redactAge = (text, tok) => {
    const phrasesDone = text.replace(AGE_PHRASE_RE, (m) => tok.tokenFor("AGE", m.trim(), m.replace(/\D/g, "")));
    return phrasesDone.replace(AGE_LABEL_RE, (_m, kw, sep, num) => kw + sep + tok.tokenFor("AGE", num));
};
// Replace the number after a "fax" keyword, keyed by its digits only.
const redactFax = (text, tok) => text.replace(FAX_RE, (_m, kw, num) => kw + tok.tokenFor("FAX", num.trim(), digitsOf(num)));
// Replace licence / certificate / registration identifiers, keyed upper-case.
const redactLicense = (text, tok) => text.replace(LICENSE_RE, (_m, kw, id) => kw + tok.tokenFor("LICENSE", id, id.toUpperCase()));
// Replace device serials / device IDs / IMEIs, keyed upper-case.
const redactDevice = (text, tok) => text.replace(DEVICE_RE, (_m, kw, id) => kw + tok.tokenFor("DEVICE_ID", id, id.toUpperCase()));
// Replace VIN-shaped runs, keyed upper-case so "1hg…" and "1HG…" share a token.
const redactVin = (text, tok) => text.replace(VIN_RE, (m) => {
    // Require both a digit and a letter (either case), so we don't grab a
    // 17-char all-alpha word or an all-digit run.
    if (!/\d/.test(m) || !/[A-Z]/i.test(m))
        return m;
    return tok.tokenFor("VIN", m, m.toUpperCase());
});
// Replace health-plan / beneficiary / Medicare / Medicaid identifiers, keyed upper-case.
const redactHealthPlan = (text, tok) => text.replace(HEALTH_PLAN_RE, (_m, kw, id) => kw + tok.tokenFor("HEALTH_PLAN_NUMBER", id, id.toUpperCase()));
246
275
  // Order matters: keyword-anchored and checksum-validated patterns first,
247
276
  // weaker heuristics last, so high-confidence matches win any overlap.
248
277
  const CLINICAL_PASSES = [
@@ -272,6 +301,19 @@ const GENERAL_PASSES = [
272
301
  redactRelative,
273
302
  redactName,
274
303
  ];
304
// Fax is keyword-anchored ("Fax: ..."), so it is safe to run it ahead of
// everything else — and it has to go first: once the generic phone pass has
// run, a fax number would already have been claimed as [PHONE].
const SAFE_HARBOR_PRE_PASSES = [
    redactFax,
];
// Applied after the clinical + general passes.
const SAFE_HARBOR_EXTRA_PASSES = [
    redactAge, redactLicense, redactDevice, redactVin, redactHealthPlan,
    // Keep last: by now keyword DOBs are already [DATE_OF_BIRTH], so only the
    // remaining dates (appointments) become [DATE].
    redactAllDates,
];
275
317
  // Self-check: patterns that should NOT remain in already-redacted text. These
276
318
  // are intentionally broad — they flag *possible* leftovers for human review,
277
319
  // not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
@@ -289,18 +331,32 @@ export class Redactor {
289
331
  tok = new Tokeniser();
290
332
  passes;
291
333
  constructor(categories) {
334
+ // Safe Harbor is the strictest mode and implies clinical + general plus the
335
+ // extra Safe Harbor passes (all dates, ages, fax, licence, device, VIN,
336
+ // health-plan numbers).
337
+ const safeHarbor = categories.includes("safeharbor");
292
338
  const seen = new Set();
293
339
  const passes = [];
294
- if (categories.includes("clinical")) {
340
+ if (safeHarbor) {
341
+ for (const p of SAFE_HARBOR_PRE_PASSES)
342
+ if (!seen.has(p))
343
+ (seen.add(p), passes.push(p));
344
+ }
345
+ if (categories.includes("clinical") || safeHarbor) {
295
346
  for (const p of CLINICAL_PASSES)
296
347
  if (!seen.has(p))
297
348
  (seen.add(p), passes.push(p));
298
349
  }
299
- if (categories.includes("general")) {
350
+ if (categories.includes("general") || safeHarbor) {
300
351
  for (const p of GENERAL_PASSES)
301
352
  if (!seen.has(p))
302
353
  (seen.add(p), passes.push(p));
303
354
  }
355
+ if (safeHarbor) {
356
+ for (const p of SAFE_HARBOR_EXTRA_PASSES)
357
+ if (!seen.has(p))
358
+ (seen.add(p), passes.push(p));
359
+ }
304
360
  this.passes = passes;
305
361
  }
306
362
  redactText(input) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pharmatools/redacta",
3
- "version": "1.1.1",
3
+ "version": "1.2.0",
4
4
  "description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
5
5
  "license": "MIT-0",
6
6
  "author": "Nick Lamb (PharmaTools.AI)",