@pharmatools/redacta 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/redact.js +32 -12
  2. package/package.json +6 -3
package/dist/redact.js CHANGED
@@ -115,7 +115,10 @@ const UK_PLATE_RE = /\b[A-Z]{2}\d{2}\s?[A-Z]{3}\b/g;
115
115
  // carrying a clinical title (Dr, Consultant, Nurse, ...), matching the Redacta
116
116
  // skill's "don't redact the treating clinician" rule. Names buried in free
117
117
  // prose are NOT caught; the UI tells users to review.
118
- const NAME = String.raw `[A-Z][a-z]+(?:['’\-][A-Za-z]+)?(?:[ \t]+[A-Z][a-z]+(?:['’\-][A-Za-z]+)?){0,2}`;
118
+ // A single name word: "Eileen", "O'Brien" (apostrophe directly after the
119
+ // initial capital), "Kowalski-Nowak", "O'Brien-Smith".
120
+ const NAME_WORD = String.raw `[A-Z](?:[a-z]+|['’][A-Z][a-z]+)(?:['’\-][A-Za-z]+)?`;
121
+ const NAME = String.raw `${NAME_WORD}(?:[ \t]+${NAME_WORD}){0,2}`;
119
122
  // Case-sensitive, anchored version. Used to trim a loosely-captured name down
120
123
  // to its leading run of properly capitalised words — necessary because the
121
124
  // label/relative regexes carry the `i` flag (for the keyword), which would
@@ -214,17 +217,34 @@ const redactZip = (text, tok) => {
214
217
  };
215
218
  const redactIp = (text, tok) => text.replace(IP_RE, (m) => tok.tokenFor("IP_ADDRESS", m));
216
219
  const redactPlate = (text, tok) => text.replace(UK_PLATE_RE, (m) => tok.tokenFor("VEHICLE_REG", m, m.replace(/\s/g, "").toUpperCase()));
217
- const redactRelative = (text, tok) => text.replace(RELATIVE_NAME_RE, (m, rel, sep, name) => {
218
- // The `i` flag (for the relationship word) relaxes the name's
219
- // capitalisation, so trim to the leading capitalised run this both
220
- // rejects "daughter and two sons" and stops "Sarah is the" over-capturing.
221
- const split = leadingName(name);
222
- if (!split)
223
- return m;
224
- return (rel + sep +
225
- tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase()) +
226
- split.rest);
227
- });
220
+ const redactRelative = (text, tok) => {
221
+ // Manual exec loop rather than String.replace: when the strict-name trim
222
+ // fails (the `i` flag lets the loose capture open with lowercase words, e.g.
223
+ // "Next of kin: her daughter Anita" captured after "next of kin"), a replace
224
+ // callback would consume the whole region and swallow the inner
225
+ // "daughter Anita" match. Here we roll the scan back to just after the
226
+ // relation word instead, so nested relation phrases still match.
227
+ let out = "";
228
+ let last = 0;
229
+ RELATIVE_NAME_RE.lastIndex = 0;
230
+ let m;
231
+ while ((m = RELATIVE_NAME_RE.exec(text)) !== null) {
232
+ const [, rel, sep, name] = m;
233
+ // Trim to the leading capitalised run — this both rejects "daughter and
234
+ // two sons" and stops "Sarah is the" over-capturing.
235
+ const split = leadingName(name);
236
+ if (!split) {
237
+ RELATIVE_NAME_RE.lastIndex = m.index + rel.length;
238
+ continue;
239
+ }
240
+ out +=
241
+ text.slice(last, m.index) + rel + sep +
242
+ tok.tokenFor("RELATIVE_NAME", split.name, split.name.toLowerCase());
243
+ last = m.index + rel.length + sep.length + split.name.length;
244
+ RELATIVE_NAME_RE.lastIndex = last;
245
+ }
246
+ return out + text.slice(last);
247
+ };
228
248
  const redactName = (text, tok) => {
229
249
  const nameToken = (raw) => tok.tokenFor("PATIENT_NAME", raw.trim(), raw.trim().toLowerCase().replace(/\s+/g, " "));
230
250
  // Courtesy-titled names first. Store the full match (title + name) as the
package/package.json CHANGED
@@ -1,11 +1,14 @@
1
1
  {
2
2
  "name": "@pharmatools/redacta",
3
- "version": "1.2.0",
3
+ "version": "1.2.1",
4
4
  "description": "Pseudonymise patient identifiers and PII in text (and restore them) — a dependency-free TypeScript engine. Names, relatives, general PII, self-check, re-identification.",
5
5
  "license": "MIT-0",
6
6
  "author": "Nick Lamb (PharmaTools.AI)",
7
7
  "homepage": "https://www.pharmatools.ai/redacta",
8
- "repository": { "type": "git", "url": "https://github.com/nickjlamb/redacta" },
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "https://github.com/nickjlamb/redacta"
11
+ },
9
12
  "type": "module",
10
13
  "main": "dist/index.js",
11
14
  "types": "dist/index.d.ts",
@@ -37,6 +40,6 @@
37
40
  },
38
41
  "devDependencies": {
39
42
  "typescript": "^5.4.0",
40
- "vitest": "^1.6.0"
43
+ "vitest": "^4.1.8"
41
44
  }
42
45
  }