npm - @pharmatools/redacta - Versions diffs - 1.1.1 → 1.2.1 - Mend

@pharmatools/redacta 1.1.1 → 1.2.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -36,6 +36,25 @@ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
 Same value → same token across a `Redactor` instance; the `tokenMap` reverses
 the redaction. No DOM, no network, no storage.
+### Safe Harbor mode
+`new Redactor(["safeharbor"])` applies a stricter, US-focused pass aligned with
+the HIPAA Safe Harbor method (45 CFR §164.514(b)(2)). It implies `clinical` + `general` and
+adds: **all** dates (not just DOB — appointment dates included), specific ages,
+fax numbers, certificate/licence numbers, device serial numbers, VINs, and
+health-plan/beneficiary numbers.
+```ts
+const r = new Redactor(["safeharbor"]);
+r.redactText("73-year-old, appointment 15 March 2026, fax 0113 496 1234").text;
+// "[AGE_1], appointment [DATE_1], fax [FAX_1]"
+```
+Note: this over-redacts slightly versus the letter of the standard (it removes
+all specific ages and full dates rather than only ages 90+ and date elements
+other than the year) — deliberately, on the safe side. Biometric and photographic
+identifiers are out of scope for a text engine. Not legal advice; review output.
 ## API
 - `new Redactor(categories: ("clinical" | "general")[])` — `.redactText(s)`,

package/dist/redact.d.ts CHANGED Viewed

@@ -6,7 +6,7 @@
  * (patients, relatives, carers — clinician names preserved), self-checks the
  * output, and reverses the process from a token map.
  */
-export type Category = "clinical" | "general";
+export type Category = "clinical" | "general" | "safeharbor";
 /** Validate a 10-digit NHS number using the Modulus-11 check digit. */
 export declare function isValidNhs(digits: string): boolean;
 /** Validate the two-letter prefix of a UK National Insurance number. */

package/dist/redact.js CHANGED Viewed

@@ -115,7 +115,10 @@ const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
 // carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
 // skill's "don't redact the treating clinician" rule. Names buried in free
 // prose are NOT caught; the UI tells users to review.
-const NAME = String.raw `[A-Z][a-z]+(?:['’\-][A-Za-z]+)?(?:[ \t]+[A-Z][a-z]+(?:['’\-][A-Za-z]+)?){0,2}`;
+// A single name word: "Eileen", "O'Brien" (apostrophe directly after the
+// initial capital), "Kowalski-Nowak", "O'Brien-Smith".
+const NAME_WORD = String.raw `[A-Z](?:[a-z]+|['’][A-Z][a-z]+)(?:['’\-][A-Za-z]+)?`;
+const NAME = String.raw `${NAME_WORD}(?:[ \t]+${NAME_WORD}){0,2}`;
 // Case-sensitive, anchored version. Used to trim a loosely-captured name down
 // to its leading run of properly capitalised words — necessary because the
 // label/relative regexes carry the `i` flag (for the keyword), which would
@@ -214,17 +217,34 @@ const redactZip = (text, tok) => {
 };
 const redactIp = (text, tok) => text.replace(IP_RE, (m) => tok.tokenFor("IP_ADDRESS", m));
 const redactPlate = (text, tok) => text.replace(UK_PLATE_RE, (m) => tok.tokenFor("VEHICLE_REG", m, m.replace(/\s/g, "").toUpperCase()));
-const redactRelative = (text, tok) => text.replace(RELATIVE_NAME_RE, (m, rel, sep, name) => {
-    // The `i` flag (for the relationship word) relaxes the name's
-    // capitalisation, so trim to the leading capitalised run — this both
-    // rejects "daughter and two sons" and stops "Sarah is the" over-capturing.
-    const split = leadingName(name);
-    if (!split)
-        return m;
-    return (rel + sep +
-        tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase()) +
-        split.rest);
-});
+const redactRelative = (text, tok) => {
+    // Manual exec loop rather than String.replace: when the strict-name trim
+    // fails (the `i` flag lets the loose capture open with lowercase words, e.g.
+    // "Next of kin: her daughter Anita" captured after "next of kin"), a replace
+    // callback would consume the whole region and swallow the inner
+    // "daughter Anita" match. Here we roll the scan back to just after the
+    // relation word instead, so nested relation phrases still match.
+    let out = "";
+    let last = 0;
+    RELATIVE_NAME_RE.lastIndex = 0;
+    let m;
+    while ((m = RELATIVE_NAME_RE.exec(text)) !== null) {
+        const [, rel, sep, name] = m;
+        // Trim to the leading capitalised run — this both rejects "daughter and
+        // two sons" and stops "Sarah is the" over-capturing.
+        const split = leadingName(name);
+        if (!split) {
+            RELATIVE_NAME_RE.lastIndex = m.index + rel.length;
+            continue;
+        }
+        out +=
+            text.slice(last, m.index) + rel + sep +
+                tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase());
+        last = m.index + rel.length + sep.length + split.name.length;
+        RELATIVE_NAME_RE.lastIndex = last;
+    }
+    return out + text.slice(last);
+};
 const redactName = (text, tok) => {
     const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), raw.trim().toLowerCase().replace(/\s+/g, " "));
     // Courtesy-titled names first. Store the full match (title + name) as the
@@ -243,6 +263,35 @@ const redactName = (text, tok) => {
     });
     return out;
 };
+// --- Safe Harbor extras (HIPAA §164.514(b)(2)) -----------------------------
+// Stricter passes layered on top of clinical + general for full Safe Harbor
+// de-identification: ALL dates (not just DOB), specific ages, fax numbers,
+// certificate/licence numbers, device serials, VINs, and health-plan numbers.
+const ANY_DATE_RE = new RegExp("(?:" + DATE + ")", "g");
+const AGE_PHRASE_RE = /\b\d{1,3}[\s-]?(?:years?[\s-]?old|y\/?o)\b/gi;
+const AGE_LABEL_RE = /\b(aged|age)([:\s]+)(\d{1,3})\b/gi;
+const FAX_RE = /\b(fax(?:\s*(?:no\.?|number|#))?[:\s]+)(\+?[\d(][\d().\s-]{6,}\d)/gi;
+const LICENSE_RE = /\b((?:licen[cs]e|certificate|cert\.?|registration)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{3,})/gi;
+const DEVICE_RE = /\b((?:serial|device\s*(?:id|identifier|no\.?|number)|imei)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
+const VIN_RE = /\b[A-HJ-NPR-Z0-9]{17}\b/g;
+const HEALTH_PLAN_RE = /\b((?:health\s*plan|beneficiary|medicare|medicaid)\s*(?:id|no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
+const redactAllDates = (text, tok) => text.replace(ANY_DATE_RE, (m) => tok.tokenFor("DATE", m));
+const redactAge = (text, tok) => {
+    let out = text.replace(AGE_PHRASE_RE, (m) => tok.tokenFor("AGE", m.trim(), m.replace(/\D/g, "")));
+    out = out.replace(AGE_LABEL_RE, (_m, kw, sep, num) => kw + sep + tok.tokenFor("AGE", num));
+    return out;
+};
+const redactFax = (text, tok) => text.replace(FAX_RE, (_m, kw, num) => kw + tok.tokenFor("FAX", num.trim(), digitsOf(num)));
+const redactLicense = (text, tok) => text.replace(LICENSE_RE, (_m, kw, id) => kw + tok.tokenFor("LICENSE", id, id.toUpperCase()));
+const redactDevice = (text, tok) => text.replace(DEVICE_RE, (_m, kw, id) => kw + tok.tokenFor("DEVICE_ID", id, id.toUpperCase()));
+const redactVin = (text, tok) => text.replace(VIN_RE, (m) => {
+    // Require both a digit and a letter, so we don't grab a 17-char all-alpha
+    // word or an all-digit run.
+    if (!/\d/.test(m) || !/[A-Z]/.test(m))
+        return m;
+    return tok.tokenFor("VIN", m, m.toUpperCase());
+});
+const redactHealthPlan = (text, tok) => text.replace(HEALTH_PLAN_RE, (_m, kw, id) => kw + tok.tokenFor("HEALTH_PLAN_NUMBER", id, id.toUpperCase()));
 // Order matters: keyword-anchored and checksum-validated patterns first,
 // weaker heuristics last, so high-confidence matches win any overlap.
 const CLINICAL_PASSES = [
@@ -272,6 +321,19 @@ const GENERAL_PASSES = [
     redactRelative,
     redactName,
 ];
+// redactFax must run BEFORE the generic phone pass, or a fax number is claimed
+// as [PHONE]. It's keyword-anchored ("Fax: ...") so running first is safe.
+const SAFE_HARBOR_PRE_PASSES = [redactFax];
+// Layered after clinical + general. redactAllDates runs last so keyword DOBs are
+// already [DATE_OF_BIRTH] and only the remaining dates (appointments) → [DATE].
+const SAFE_HARBOR_EXTRA_PASSES = [
+    redactAge,
+    redactLicense,
+    redactDevice,
+    redactVin,
+    redactHealthPlan,
+    redactAllDates,
+];
 // Self-check: patterns that should NOT remain in already-redacted text. These
 // are intentionally broad — they flag *possible* leftovers for human review,
 // not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
@@ -289,18 +351,32 @@ export class Redactor {
     tok = new Tokeniser();
     passes;
     constructor(categories) {
+        // Safe Harbor is the strictest mode and implies clinical + general plus the
+        // extra Safe Harbor passes (all dates, ages, fax, licence, device, VIN,
+        // health-plan numbers).
+        const safeHarbor = categories.includes("safeharbor");
         const seen = new Set();
         const passes = [];
-        if (categories.includes("clinical")) {
+        if (safeHarbor) {
+            for (const p of SAFE_HARBOR_PRE_PASSES)
+                if (!seen.has(p))
+                    (seen.add(p), passes.push(p));
+        }
+        if (categories.includes("clinical") || safeHarbor) {
             for (const p of CLINICAL_PASSES)
                 if (!seen.has(p))
                     (seen.add(p), passes.push(p));
         }
-        if (categories.includes("general")) {
+        if (categories.includes("general") || safeHarbor) {
             for (const p of GENERAL_PASSES)
                 if (!seen.has(p))
                     (seen.add(p), passes.push(p));
         }
+        if (safeHarbor) {
+            for (const p of SAFE_HARBOR_EXTRA_PASSES)
+                if (!seen.has(p))
+                    (seen.add(p), passes.push(p));
+        }
         this.passes = passes;
     }
     redactText(input) {

package/package.json CHANGED Viewed

@@ -1,11 +1,14 @@
 {
   "name": "@pharmatools/redacta",
-  "version": "1.1.1",
+  "version": "1.2.1",
   "description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
   "license": "MIT-0",
   "author": "Nick Lamb (PharmaTools.AI)",
   "homepage": "https://www.pharmatools.ai/redacta",
-  "repository": { "type": "git", "url": "https://github.com/nickjlamb/redacta" },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/nickjlamb/redacta"
+  },
   "type": "module",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -37,6 +40,6 @@
   },
   "devDependencies": {
     "typescript": "^5.4.0",
-    "vitest": "^1.6.0"
+    "vitest": "^4.1.8"
   }
 }