@pharmatools/redacta 1.1.1 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -36,6 +36,25 @@ numbers, UK vehicle regs) and keyword-anchored patient / relative / carer names
36
36
  Same value → same token across a `Redactor` instance; the `tokenMap` reverses
37
37
  the redaction. No DOM, no network, no storage.
38
38
 
39
+ ### Safe Harbor mode
40
+
41
+ `new Redactor(["safeharbor"])` applies a stricter, US-focused pass aligned with
42
+ the HIPAA Safe Harbor method (45 CFR §164.514(b)(2)). It implies `clinical` + `general` and
43
+ adds: **all** dates (not just DOB — appointment dates included), specific ages,
44
+ fax numbers, certificate/licence numbers, device serial numbers, VINs, and
45
+ health-plan/beneficiary numbers.
46
+
47
+ ```ts
48
+ const r = new Redactor(["safeharbor"]);
49
+ r.redactText("73-year-old, appointment 15 March 2026, fax 0113 496 1234").text;
50
+ // "[AGE_1], appointment [DATE_1], fax [FAX_1]"
51
+ ```
52
+
53
+ Note: this over-redacts slightly versus the letter of the standard (it removes
54
+ all specific ages and full dates rather than only ages 90+ and date elements
55
+ more specific than the year) — deliberately, on the safe side. Biometric and photographic
56
+ identifiers are out of scope for a text engine. Not legal advice; review output.
57
+
39
58
  ## API
40
59
 
41
60
  - `new Redactor(categories: ("clinical" | "general" | "safeharbor")[])` — `.redactText(s)`,
package/dist/redact.d.ts CHANGED
@@ -6,7 +6,7 @@
6
6
  * (patients, relatives, carers — clinician names preserved), self-checks the
7
7
  * output, and reverses the process from a token map.
8
8
  */
9
- export type Category = "clinical" | "general";
9
+ export type Category = "clinical" | "general" | "safeharbor";
10
10
  /** Validate a 10-digit NHS number using the Modulus-11 check digit. */
11
11
  export declare function isValidNhs(digits: string): boolean;
12
12
  /** Validate the two-letter prefix of a UK National Insurance number. */
package/dist/redact.js CHANGED
@@ -115,7 +115,10 @@ const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
115
115
  // carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
116
116
  // skill's "don't redact the treating clinician" rule. Names buried in free
117
117
  // prose are NOT caught; the UI tells users to review.
118
- const NAME = String.raw `[A-Z][a-z]+(?:['’\-][A-Za-z]+)?(?:[ \t]+[A-Z][a-z]+(?:['’\-][A-Za-z]+)?){0,2}`;
118
+ // A single name word: "Eileen", "O'Brien" (apostrophe directly after the
119
+ // initial capital), "Kowalski-Nowak", "O'Brien-Smith".
120
+ const NAME_WORD = String.raw `[A-Z](?:[a-z]+|['’][A-Z][a-z]+)(?:['’\-][A-Za-z]+)?`;
121
+ const NAME = String.raw `${NAME_WORD}(?:[ \t]+${NAME_WORD}){0,2}`;
119
122
  // Case-sensitive, anchored version. Used to trim a loosely-captured name down
120
123
  // to its leading run of properly capitalised words — necessary because the
121
124
  // label/relative regexes carry the `i` flag (for the keyword), which would
@@ -214,17 +217,34 @@ const redactZip = (text, tok) => {
214
217
  };
215
218
  const redactIp = (text, tok) => text.replace(IP_RE, (m) => tok.tokenFor("IP_ADDRESS", m));
216
219
  const redactPlate = (text, tok) => text.replace(UK_PLATE_RE, (m) => tok.tokenFor("VEHICLE_REG", m, m.replace(/\s/g, "").toUpperCase()));
217
- const redactRelative = (text, tok) => text.replace(RELATIVE_NAME_RE, (m, rel, sep, name) => {
218
- // The `i` flag (for the relationship word) relaxes the name's
219
- // capitalisation, so trim to the leading capitalised run — this both
220
- // rejects "daughter and two sons" and stops "Sarah is the" over-capturing.
221
- const split = leadingName(name);
222
- if (!split)
223
- return m;
224
- return (rel + sep +
225
- tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase()) +
226
- split.rest);
227
- });
220
+ const redactRelative = (text, tok) => {
221
+ // Manual exec loop rather than String.replace: when the strict-name trim
222
+ // fails (the `i` flag lets the loose capture open with lowercase words, e.g.
223
+ // "Next of kin: her daughter Anita" captured after "next of kin"), a replace
224
+ // callback would consume the whole region and swallow the inner
225
+ // "daughter Anita" match. Here we roll the scan back to just after the
226
+ // relation word instead, so nested relation phrases still match.
227
+ let out = "";
228
+ let last = 0;
229
+ RELATIVE_NAME_RE.lastIndex = 0;
230
+ let m;
231
+ while ((m = RELATIVE_NAME_RE.exec(text)) !== null) {
232
+ const [, rel, sep, name] = m;
233
+ // Trim to the leading capitalised run — this both rejects "daughter and
234
+ // two sons" and stops "Sarah is the" over-capturing.
235
+ const split = leadingName(name);
236
+ if (!split) {
237
+ RELATIVE_NAME_RE.lastIndex = m.index + rel.length;
238
+ continue;
239
+ }
240
+ out +=
241
+ text.slice(last, m.index) + rel + sep +
242
+ tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase());
243
+ last = m.index + rel.length + sep.length + split.name.length;
244
+ RELATIVE_NAME_RE.lastIndex = last;
245
+ }
246
+ return out + text.slice(last);
247
+ };
228
248
  const redactName = (text, tok) => {
229
249
  const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), raw.trim().toLowerCase().replace(/\s+/g, " "));
230
250
  // Courtesy-titled names first. Store the full match (title + name) as the
@@ -243,6 +263,35 @@ const redactName = (text, tok) => {
243
263
  });
244
264
  return out;
245
265
  };
266
+ // --- Safe Harbor extras (HIPAA §164.514(b)(2)) -----------------------------
267
+ // Stricter passes layered on top of clinical + general for full Safe Harbor
268
+ // de-identification: ALL dates (not just DOB), specific ages, fax numbers,
269
+ // certificate/licence numbers, device serials, VINs, and health-plan numbers.
270
+ const ANY_DATE_RE = new RegExp("(?:" + DATE + ")", "g");
271
+ const AGE_PHRASE_RE = /\b\d{1,3}[\s-]?(?:years?[\s-]?old|y\/?o)\b/gi;
272
+ const AGE_LABEL_RE = /\b(aged|age)([:\s]+)(\d{1,3})\b/gi;
273
+ const FAX_RE = /\b(fax(?:\s*(?:no\.?|number|#))?[:\s]+)(\+?[\d(][\d().\s-]{6,}\d)/gi;
274
+ const LICENSE_RE = /\b((?:licen[cs]e|certificate|cert\.?|registration)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{3,})/gi;
275
+ const DEVICE_RE = /\b((?:serial|device\s*(?:id|identifier|no\.?|number)|imei)\s*(?:no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
276
+ const VIN_RE = /\b[A-HJ-NPR-Z0-9]{17}\b/g;
277
+ const HEALTH_PLAN_RE = /\b((?:health\s*plan|beneficiary|medicare|medicaid)\s*(?:id|no\.?|number|#)?[:\s]+)([A-Z0-9][A-Z0-9-]{4,})/gi;
278
+ const redactAllDates = (text, tok) => text.replace(ANY_DATE_RE, (m) => tok.tokenFor("DATE", m));
279
+ const redactAge = (text, tok) => {
280
+ let out = text.replace(AGE_PHRASE_RE, (m) => tok.tokenFor("AGE", m.trim(), m.replace(/\D/g, "")));
281
+ out = out.replace(AGE_LABEL_RE, (_m, kw, sep, num) => kw + sep + tok.tokenFor("AGE", num));
282
+ return out;
283
+ };
284
+ const redactFax = (text, tok) => text.replace(FAX_RE, (_m, kw, num) => kw + tok.tokenFor("FAX", num.trim(), digitsOf(num)));
285
+ const redactLicense = (text, tok) => text.replace(LICENSE_RE, (_m, kw, id) => kw + tok.tokenFor("LICENSE", id, id.toUpperCase()));
286
+ const redactDevice = (text, tok) => text.replace(DEVICE_RE, (_m, kw, id) => kw + tok.tokenFor("DEVICE_ID", id, id.toUpperCase()));
287
+ const redactVin = (text, tok) => text.replace(VIN_RE, (m) => {
288
+ // Require both a digit and a letter, so we don't grab a 17-char all-alpha
289
+ // word or an all-digit run.
290
+ if (!/\d/.test(m) || !/[A-Z]/.test(m))
291
+ return m;
292
+ return tok.tokenFor("VIN", m, m.toUpperCase());
293
+ });
294
+ const redactHealthPlan = (text, tok) => text.replace(HEALTH_PLAN_RE, (_m, kw, id) => kw + tok.tokenFor("HEALTH_PLAN_NUMBER", id, id.toUpperCase()));
246
295
  // Order matters: keyword-anchored and checksum-validated patterns first,
247
296
  // weaker heuristics last, so high-confidence matches win any overlap.
248
297
  const CLINICAL_PASSES = [
@@ -272,6 +321,19 @@ const GENERAL_PASSES = [
272
321
  redactRelative,
273
322
  redactName,
274
323
  ];
324
+ // redactFax must run BEFORE the generic phone pass, or a fax number is claimed
325
+ // as [PHONE]. It's keyword-anchored ("Fax: ...") so running first is safe.
326
+ const SAFE_HARBOR_PRE_PASSES = [redactFax];
327
+ // Layered after clinical + general. redactAllDates runs last so keyword DOBs are
328
+ // already [DATE_OF_BIRTH] and only the remaining dates (e.g. appointments) become [DATE].
329
+ const SAFE_HARBOR_EXTRA_PASSES = [
330
+ redactAge,
331
+ redactLicense,
332
+ redactDevice,
333
+ redactVin,
334
+ redactHealthPlan,
335
+ redactAllDates,
336
+ ];
275
337
  // Self-check: patterns that should NOT remain in already-redacted text. These
276
338
  // are intentionally broad — they flag *possible* leftovers for human review,
277
339
  // not confirmed identifiers. Tokens like [NHS_NUMBER_1] are excluded.
@@ -289,18 +351,32 @@ export class Redactor {
289
351
  tok = new Tokeniser();
290
352
  passes;
291
353
  constructor(categories) {
354
+ // Safe Harbor is the strictest mode and implies clinical + general plus the
355
+ // extra Safe Harbor passes (all dates, ages, fax, licence, device, VIN,
356
+ // health-plan numbers).
357
+ const safeHarbor = categories.includes("safeharbor");
292
358
  const seen = new Set();
293
359
  const passes = [];
294
- if (categories.includes("clinical")) {
360
+ if (safeHarbor) {
361
+ for (const p of SAFE_HARBOR_PRE_PASSES)
362
+ if (!seen.has(p))
363
+ (seen.add(p), passes.push(p));
364
+ }
365
+ if (categories.includes("clinical") || safeHarbor) {
295
366
  for (const p of CLINICAL_PASSES)
296
367
  if (!seen.has(p))
297
368
  (seen.add(p), passes.push(p));
298
369
  }
299
- if (categories.includes("general")) {
370
+ if (categories.includes("general") || safeHarbor) {
300
371
  for (const p of GENERAL_PASSES)
301
372
  if (!seen.has(p))
302
373
  (seen.add(p), passes.push(p));
303
374
  }
375
+ if (safeHarbor) {
376
+ for (const p of SAFE_HARBOR_EXTRA_PASSES)
377
+ if (!seen.has(p))
378
+ (seen.add(p), passes.push(p));
379
+ }
304
380
  this.passes = passes;
305
381
  }
306
382
  redactText(input) {
package/package.json CHANGED
@@ -1,11 +1,14 @@
1
1
  {
2
2
  "name": "@pharmatools/redacta",
3
- "version": "1.1.1",
3
+ "version": "1.2.1",
4
4
  "description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
5
5
  "license": "MIT-0",
6
6
  "author": "Nick Lamb (PharmaTools.AI)",
7
7
  "homepage": "https://www.pharmatools.ai/redacta",
8
- "repository": { "type": "git", "url": "https://github.com/nickjlamb/redacta" },
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/nickjlamb/redacta"
11
+ },
9
12
  "type": "module",
10
13
  "main": "dist/index.js",
11
14
  "types": "dist/index.d.ts",
@@ -37,6 +40,6 @@
37
40
  },
38
41
  "devDependencies": {
39
42
  "typescript": "^5.4.0",
40
- "vitest": "^1.6.0"
43
+ "vitest": "^4.1.8"
41
44
  }
42
45
  }