@pharmatools/opengate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ADAPTERS.md CHANGED
@@ -43,7 +43,24 @@ node src/runner.mjs --online --adapter ./src/adapters/http.mjs
43
43
 
44
44
  ## The contract
45
45
 
46
- ### Required exports
46
+ Every adapter provides two base exports, plus at least one complete **capability**:
47
+
48
+ - **`qa`** — `splitClaims(text)` + `analyzeBatch(payload)`: systems that extract claims and verify them against references (scorers: claim-extraction, verdict-accuracy).
49
+ - **`redaction`** — `redact(text)`: systems that remove identifiers from text (scorer: redaction). Returns `{ text, entities?: [{ value, type }] }`, where `text` is the redacted output and the optional `entities` list names the identifiers the system removed (used to measure over-redaction).
50
+
51
+ Scorers check `adapter.capabilities.<name>` and skip with a reason when a capability is absent — the bundled Redacta adapter (`src/adapters/redacta.mjs`) runs only the redaction scorer, the RefCheckr adapter only the QA scorers.
52
+
53
+ ### Base exports (always required)
54
+
55
+ ```js
56
+ /** True when the adapter has the config it needs (URLs, tokens, …). */
57
+ export function onlineAvailable() {}
58
+
59
+ /** Human-readable hint shown when onlineAvailable() is false. */
60
+ export function onlineConfigHint() {}
61
+ ```
62
+
63
+ ### QA capability exports
47
64
 
48
65
  ```js
49
66
  /**
@@ -67,14 +84,26 @@ export function splitClaims(text) {}
67
84
  * strong_support · partial_support · implied_by_data · overclaim · not_supported · contradicted
68
85
  */
69
86
  export function analyzeBatch(payload) {}
87
+ ```
70
88
 
71
- /** True when the adapter has the config it needs (URLs, tokens, …). */
72
- export function onlineAvailable() {}
89
+ ### Redaction capability exports
73
90
 
74
- /** Human-readable hint shown when onlineAvailable() is false. */
75
- export function onlineConfigHint() {}
91
+ ```js
92
+ /**
93
+ * Remove identifiers from text.
94
+ * @param {string} text — the source document (e.g. a clinical note)
95
+ * @returns {Promise<{ text: string, entities?: [{ value, type }] }>}
96
+ * `text` is the redacted output; `entities` are the identifiers the system
97
+ * removed (used to measure over-redaction).
98
+ */
99
+ export function redact(text) {}
76
100
  ```
77
101
 
102
+ The bundled `src/adapters/redacta.mjs` is the reference: it wraps the
103
+ `@pharmatools/redacta` engine via a dynamic import (install with
104
+ `npm install --no-save @pharmatools/redacta`), so OpenGATE itself stays
105
+ dependency-free.
106
+
78
107
  ### Optional exports
79
108
 
80
109
  If absent, the loader supplies no-op defaults, so scorers can call these unconditionally. Implement them to get latency, cost, and model-comparison columns in your scorecards.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # OpenGATE
2
2
 
3
- [![CI](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml/badge.svg)](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml)
3
+ [![CI](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml/badge.svg)](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml) [![npm](https://img.shields.io/npm/v/%40pharmatools%2Fopengate)](https://www.npmjs.com/package/@pharmatools/opengate)
4
4
 
5
5
  **Evidence over plausibility.**
6
6
 
@@ -80,8 +80,8 @@ npm run eval:ci # exit non-zero on any failure or metric regression
80
80
  │ adapters
81
81
  ┌──────────────┬───────┴──────┬──────────────┐
82
82
  ▼ ▼ ▼ ▼
83
- RefCheckr Patiently AI Redacta your system
84
- (first impl.) (planned) (planned) (write an adapter)
83
+ RefCheckr Redacta Patiently AI your system
84
+ (first impl., QA) (redaction) (planned) (write an adapter)
85
85
  ```
86
86
 
87
87
  Where it sits in the development loop:
@@ -108,6 +108,7 @@ change a prompt, model, or pipeline
108
108
  | `citation-detection` | offline | per-claim citation set exact-match & Jaccard; supported-style accuracy; tracked known-gap styles |
109
109
  | `claim-extraction` | online | precision / recall / F1 vs gold; non-claim leakage; citation agreement; **fidelity** (extracted claim is verbatim from source) |
110
110
  | `verdict-accuracy` | online | exact & adjacency accuracy on a six-point support scale; confusion matrix; **passage hallucination rate**; consistency across repeats; per-claim latency (p50/p95) and token usage for real cost/claim |
111
+ | `redaction` | online | recall on gold identifiers with **leaks as named failures** (verbatim, and word-level for names); over-redaction count; known-gap tracking for documented engine gaps |
111
112
 
112
113
  Offline scorers run with no API key — fast enough for every commit. Online scorers exercise a live system through an adapter.
113
114
 
@@ -159,6 +160,17 @@ Run against RefCheckr's gold set, OpenGATE:
159
160
 
160
161
  Full methodology and the model comparison: [how RefCheckr is evaluated](https://www.pharmatools.ai/refcheckr-eval).
161
162
 
163
+ ## Second implementation: Redacta
164
+
165
+ [Redacta](https://www.pharmatools.ai/redacta) exercises the framework's **redaction capability** — proof the methodology isn't QA-shaped. The bundled adapter wraps the `@pharmatools/redacta` engine, scored against synthetic UK clinical notes with gold-labelled identifiers:
166
+
167
+ ```bash
168
+ npm install --no-save @pharmatools/redacta
169
+ node src/runner.mjs --online --adapter ./src/adapters/redacta.mjs
170
+ ```
171
+
172
+ On its first run against the new gold set, the eval found two real engine bugs — a relative name directly followed by a parenthesis escapes the name pattern, and apostrophe surnames are dropped from titled-name capture — both now documented as tracked known gaps in the case files. Current scorecard: **100% recall on 23 gold identifiers, 0 leaks, 3 tracked gaps**.
173
+
162
174
  ## Layout
163
175
 
164
176
  ```
@@ -179,7 +191,8 @@ opengate/
179
191
 
180
192
  ## Roadmap
181
193
 
182
- - **Second adapter** — Patiently AI (faithfulness evaluation for patient-language simplification)
194
+ - **Redacta engine fixes** — two bugs found by the redaction eval (paren-adjacent relative names; apostrophe surnames in titled-name capture) are tracked as known gaps; once they are fixed in `@pharmatools/redacta`, the scorer reports them under `knownGap_closed` instead of `knownGap_open`, and the entities can then be promoted to `goldEntities`
195
+ - **Third adapter** — Patiently AI (faithfulness evaluation for patient-language simplification)
183
196
  - **Author-year in RefCheckr production** — `detectAuthorYear()` now lands "Smith 2020"-style keys in the reference implementation; adopting them in RefCheckr's numeric-keyed citation mapping is tracked separately
184
197
  - **Number-adjacent superscript** — `week 24.1` is genuinely ambiguous with decimals; remains a tracked known gap
185
198
  - **Growing gold set** — more domains, all six verdict types, real-world reference material
@@ -15,3 +15,16 @@ One JSON file per case in `datasets/cases/` (files starting with `_` are ignored
15
15
  Verdict scale (ordered, strongest support → strongest refutation): `strong_support`, `partial_support`, `implied_by_data`, `not_supported`, `contradicted`, `overclaim`.
16
16
 
17
17
  Offline scorers need only `manuscript` + `goldClaims`. Reference texts and gold verdicts are required solely for the online verdict scorer.
18
+
19
+ ## Redaction cases (`kind: "redaction"`)
20
+
21
+ Cases for the `redaction` scorer, exercising an adapter's `redact()` capability. All bundled cases are **synthetic** — test-range NHS numbers (999‑prefix, valid modulus‑11), Ofcom-reserved drama-use mobile numbers (07700 900xxx), example.com emails, and fictitious people.
22
+
23
+ | Field | Required | Meaning |
24
+ |---|---|---|
25
+ | `id` | yes | Unique slug. |
26
+ | `kind` | yes | Must be `"redaction"`. |
27
+ | `description` | no | Human context; state that identifiers are synthetic. |
28
+ | `text` | yes | The source clinical note. |
29
+ | `goldEntities[]` | yes | `{ type, value }` — identifiers that MUST be removed. Any that survive — verbatim, or, for types whose name contains `NAME`, as any individual word of the value (titles and words under three letters are ignored) — are **leaks** and fail the run. |
30
+ | `knownGapEntities[]` | no | `{ type, value, comment }` — identifiers the system does not yet catch. Reported as tracked targets (`knownGap_open` / `knownGap_closed`), not failures. Use `comment` to document the reproduction. When a gap closes, promote the entity to `goldEntities`. |
@@ -0,0 +1,38 @@
1
+ {
2
+ "id": "redact-cardio-clinic",
3
+ "kind": "redaction",
4
+ "description": "Cardiology clinic letter — titled patient name, NHS number, DOB, phone, email, postcode. SYNTHETIC: all identifiers are test-range values; no real persons.",
5
+ "text": "Mr John Smith (NHS 999 000 0018, DOB 12/03/1958) attended cardiology clinic today. He was seen by Dr Patel for review of atrial fibrillation. Blood pressure was 132/84 mmHg and apixaban was continued at 5 mg BD. Contact him on 07700 900123 or john.smith58@example.com. Address on file: 42 Maple Road, Oxford OX2 6JX. Follow-up in 12 weeks.",
6
+ "goldEntities": [
7
+ {
8
+ "type": "PATIENT_NAME",
9
+ "value": "Mr John Smith"
10
+ },
11
+ {
12
+ "type": "NHS_NUMBER",
13
+ "value": "999 000 0018"
14
+ },
15
+ {
16
+ "type": "DATE_OF_BIRTH",
17
+ "value": "12/03/1958"
18
+ },
19
+ {
20
+ "type": "PHONE",
21
+ "value": "07700 900123"
22
+ },
23
+ {
24
+ "type": "EMAIL",
25
+ "value": "john.smith58@example.com"
26
+ },
27
+ {
28
+ "type": "POSTCODE",
29
+ "value": "OX2 6JX"
30
+ }
31
+ ],
32
+ "knownGapEntities": [
33
+ {
34
+ "type": "STREET_ADDRESS",
35
+ "value": "42 Maple Road"
36
+ }
37
+ ]
38
+ }
@@ -0,0 +1,35 @@
1
+ {
2
+ "id": "redact-discharge-summary",
3
+ "kind": "redaction",
4
+ "description": "Discharge summary with a relative as next of kin. SYNTHETIC.",
5
+ "text": "Discharge summary for Mrs Priya Sharma, NHS 999 000 0026, DOB 04/11/1947. Admitted with community-acquired pneumonia; completed 5 days of co-amoxiclav and improved steadily. Next of kin: her daughter Anita (07700 900456) was updated by phone. Discharged to home address, postcode LS8 2QT, with district nurse follow-up.",
6
+ "goldEntities": [
7
+ {
8
+ "type": "PATIENT_NAME",
9
+ "value": "Mrs Priya Sharma"
10
+ },
11
+ {
12
+ "type": "NHS_NUMBER",
13
+ "value": "999 000 0026"
14
+ },
15
+ {
16
+ "type": "DATE_OF_BIRTH",
17
+ "value": "04/11/1947"
18
+ },
19
+ {
20
+ "type": "PHONE",
21
+ "value": "07700 900456"
22
+ },
23
+ {
24
+ "type": "POSTCODE",
25
+ "value": "LS8 2QT"
26
+ }
27
+ ],
28
+ "knownGapEntities": [
29
+ {
30
+ "type": "RELATIVE_NAME",
31
+ "value": "Anita",
32
+ "comment": "Engine bug (tracked): a relative name directly followed by an opening parenthesis escapes the relative-name pattern — 'her daughter Anita (07700...' leaks, while 'her daughter Anita was' is caught. The phone match also consumes the opening paren, leaving an orphan ')'."
33
+ }
34
+ ]
35
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "id": "redact-gp-referral",
3
+ "kind": "redaction",
4
+ "description": "GP referral letter — patient with title, contact details, clinician names preserved by design. SYNTHETIC.",
5
+ "text": "Dear colleague, I would be grateful if you could see Miss Chloe Davies (DOB 22/07/1996) regarding persistent iron-deficiency anaemia despite oral replacement. Ferritin 8 ug/L last month. She works night shifts and prefers contact by email at chloe.davies96@example.com or on 07700 900789. Kind regards, Dr Okafor, Riverside Surgery.",
6
+ "goldEntities": [
7
+ {
8
+ "type": "PATIENT_NAME",
9
+ "value": "Miss Chloe Davies"
10
+ },
11
+ {
12
+ "type": "DATE_OF_BIRTH",
13
+ "value": "22/07/1996"
14
+ },
15
+ {
16
+ "type": "EMAIL",
17
+ "value": "chloe.davies96@example.com"
18
+ },
19
+ {
20
+ "type": "PHONE",
21
+ "value": "07700 900789"
22
+ }
23
+ ]
24
+ }
@@ -0,0 +1,24 @@
1
+ {
2
+ "id": "redact-mdt-note",
3
+ "kind": "redaction",
4
+ "description": "MDT oncology note — hyphenated surname, DOB keyword variant, postcode. SYNTHETIC.",
5
+ "text": "MDT discussion: Mr Tomasz Kowalski-Nowak, date of birth 30/01/1969, NHS 999 000 0042, new diagnosis of stage II colorectal adenocarcinoma. Fit for laparoscopic resection; anaesthetic review booked. Patient lives alone, postcode M14 5RB; holistic needs assessment arranged. Histology to be reviewed at next meeting.",
6
+ "goldEntities": [
7
+ {
8
+ "type": "PATIENT_NAME",
9
+ "value": "Mr Tomasz Kowalski-Nowak"
10
+ },
11
+ {
12
+ "type": "DATE_OF_BIRTH",
13
+ "value": "30/01/1969"
14
+ },
15
+ {
16
+ "type": "NHS_NUMBER",
17
+ "value": "999 000 0042"
18
+ },
19
+ {
20
+ "type": "POSTCODE",
21
+ "value": "M14 5RB"
22
+ }
23
+ ]
24
+ }
@@ -0,0 +1,31 @@
1
+ {
2
+ "id": "redact-ward-round",
3
+ "kind": "redaction",
4
+ "description": "Ward round entry — apostrophe surname, NHS number without spaces, carer mention. SYNTHETIC.",
5
+ "text": "Ward round: Mrs Eileen O'Brien, NHS 9990000034, remains stable post hip hemiarthroplasty. Mobilising with frame. Her son Daniel visited and raised concerns about home support; referred to social work. Plan: continue enoxaparin, orthogeriatric review tomorrow. Contact for family updates: 07700 900321.",
6
+ "goldEntities": [
7
+ {
8
+ "type": "PATIENT_NAME",
9
+ "value": "Mrs Eileen O'Brien"
10
+ },
11
+ {
12
+ "type": "NHS_NUMBER",
13
+ "value": "9990000034"
14
+ },
15
+ {
16
+ "type": "RELATIVE_NAME",
17
+ "value": "Daniel"
18
+ },
19
+ {
20
+ "type": "PHONE",
21
+ "value": "07700 900321"
22
+ }
23
+ ],
24
+ "knownGapEntities": [
25
+ {
26
+ "type": "PATIENT_SURNAME",
27
+ "value": "O'Brien",
28
+ "comment": "Engine bug (tracked): apostrophe surnames are dropped from titled-name capture — 'Mrs Eileen O'Brien' tokenises as 'Mrs Eileen' only, leaving the surname in the text."
29
+ }
30
+ ]
31
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pharmatools/opengate",
3
- "version": "0.1.0",
3
+ "version": "0.2.0",
4
4
  "type": "module",
5
5
  "description": "OpenGATE — Open Grounded AI Testing & Evaluation. An open-source framework for evaluating evidence-grounded AI systems.",
6
6
  "license": "MIT",
@@ -0,0 +1,63 @@
1
+ // Redacta adapter — second bundled implementation, exercising the framework's
2
+ // redaction capability. Wraps @pharmatools/redacta, the dependency-free
3
+ // engine behind the Redacta app, iPhone app, CLI, and MCP server.
4
+ //
5
+ // OpenGATE itself stays zero-dependency: the engine is imported dynamically
6
+ // and this adapter reports how to install it when absent:
7
+ //
8
+ // npm install --no-save @pharmatools/redacta
9
+ // node src/runner.mjs --online --adapter ./src/adapters/redacta.mjs
10
+ //
11
+ // Config via env:
12
+ // OPENGATE_REDACTA_CATEGORIES comma-separated: clinical,general,safeharbor
13
+ // (default: clinical,general)
14
+
15
+ export const meta = { name: 'redacta' };
16
+
17
+ let engine = null;
18
+ let loadError = null;
19
+ try {
20
+ engine = await import('@pharmatools/redacta');
21
+ } catch (err) {
22
+ loadError = err;
23
+ }
24
+
25
+ export function onlineAvailable() {
26
+ return Boolean(engine);
27
+ }
28
+
29
+ export function onlineConfigHint() {
30
+ return 'Redacta engine not installed — run: npm install --no-save @pharmatools/redacta' +
31
+ (loadError ? ` (${loadError.code || loadError.message})` : '');
32
+ }
33
+
34
+ function categories() {
35
+ return (process.env.OPENGATE_REDACTA_CATEGORIES || 'clinical,general')
36
+ .split(',').map(s => s.trim()).filter(Boolean);
37
+ }
38
+
39
+ // ── Timing capture (local calls, but latency is still worth recording) ──
40
+ const _calls = [];
41
+ export function resetTiming() { _calls.length = 0; }
42
+ export function callLatencies() { return _calls.map(c => c.ms); }
43
+
44
+ /**
45
+ * Redaction capability. A fresh Redactor per call keeps token maps
46
+ * case-scoped, so identical values across cases can't mask each other.
47
+ * Returns { text, entities: [{ token, type, value }] }.
48
+ */
49
+ export async function redact(text) {
50
+ const t0 = performance.now();
51
+ try {
52
+ const r = new engine.Redactor(categories());
53
+ const { text: redacted } = r.redactText(text);
54
+ const entities = Object.entries(r.tokenMap).map(([token, value]) => ({
55
+ token,
56
+ type: token.replace(/^\[|_\d+\]$/g, ''),
57
+ value,
58
+ }));
59
+ return { text: redacted, entities };
60
+ } finally {
61
+ _calls.push({ ms: performance.now() - t0 });
62
+ }
63
+ }
@@ -16,8 +16,15 @@ import { pathToFileURL, fileURLToPath } from 'node:url';
16
16
  const __dirname = dirname(fileURLToPath(import.meta.url));
17
17
  const DEFAULT_ADAPTER = join(__dirname, '..', 'adapters', 'refcheckr.mjs');
18
18
 
19
- // Required: online scorers cannot run without these.
20
- const REQUIRED = ['splitClaims', 'analyzeBatch', 'onlineAvailable', 'onlineConfigHint'];
19
+ // Base exports every adapter must provide.
20
+ const REQUIRED_BASE = ['onlineAvailable', 'onlineConfigHint'];
21
+
22
+ // Capabilities: an adapter implements at least one, completely. Scorers check
23
+ // `adapter.capabilities.<name>` and skip (with a reason) when absent.
24
+ const CAPABILITIES = {
25
+ qa: ['splitClaims', 'analyzeBatch'], // claim extraction + verdicts against references
26
+ redaction: ['redact'], // identifier removal from text
27
+ };
21
28
 
22
29
  // Optional: validated if present; no-op defaults are supplied if absent, so
23
30
  // scorers can call them unconditionally.
@@ -33,12 +40,35 @@ const DEFAULTS = {
33
40
  }),
34
41
  };
35
42
 
43
+ /** Which capabilities does a module fully implement? */
44
+ export function detectCapabilities(mod) {
45
+ return Object.fromEntries(
46
+ Object.entries(CAPABILITIES).map(([name, fns]) =>
47
+ [name, fns.every(fn => typeof mod[fn] === 'function')])
48
+ );
49
+ }
50
+
36
51
  /** Throws with a readable message listing every problem, or returns silently. */
37
52
  export function validateAdapter(mod, source = 'adapter') {
38
53
  const problems = [];
39
- for (const fn of REQUIRED) {
54
+ for (const fn of REQUIRED_BASE) {
40
55
  if (typeof mod[fn] !== 'function') problems.push(`missing required export: ${fn}()`);
41
56
  }
57
+ const caps = detectCapabilities(mod);
58
+ if (!Object.values(caps).some(Boolean)) {
59
+ problems.push(
60
+ 'no complete capability implemented — provide ' +
61
+ Object.entries(CAPABILITIES).map(([n, fns]) => `${fns.map(f => `${f}()`).join(' + ')} (${n})`).join(' or ')
62
+ );
63
+ // Name partially implemented capabilities to make the fix obvious.
64
+ for (const [name, fns] of Object.entries(CAPABILITIES)) {
65
+ const present = fns.filter(fn => typeof mod[fn] === 'function');
66
+ if (present.length && present.length < fns.length) {
67
+ const missing = fns.filter(fn => typeof mod[fn] !== 'function');
68
+ problems.push(`capability "${name}" is incomplete — missing ${missing.map(f => `${f}()`).join(', ')}`);
69
+ }
70
+ }
71
+ }
42
72
  for (const fn of OPTIONAL) {
43
73
  if (fn in mod && typeof mod[fn] !== 'function') problems.push(`optional export ${fn} is not a function`);
44
74
  }
@@ -65,13 +95,13 @@ export async function loadAdapter(specOverride) {
65
95
  throw new Error(`Could not load adapter from ${path}: ${err.message}`);
66
96
  }
67
97
  validateAdapter(mod, path);
98
+ const allFns = [...REQUIRED_BASE, ...Object.values(CAPABILITIES).flat(), ...OPTIONAL];
68
99
  const methods = Object.fromEntries(
69
- [...REQUIRED, ...OPTIONAL]
70
- .filter(f => typeof mod[f] === 'function')
71
- .map(f => [f, mod[f]])
100
+ allFns.filter(f => typeof mod[f] === 'function').map(f => [f, mod[f]])
72
101
  );
73
102
  return {
74
103
  name: mod.meta?.name || basename(path, '.mjs'),
104
+ capabilities: detectCapabilities(mod),
75
105
  ...DEFAULTS,
76
106
  ...methods,
77
107
  };
package/src/runner.mjs CHANGED
@@ -98,6 +98,7 @@ const SCORERS = [
98
98
  './scorers/citation-detection.mjs',
99
99
  './scorers/claim-extraction.mjs',
100
100
  './scorers/verdict-accuracy.mjs',
101
+ './scorers/redaction.mjs',
101
102
  ];
102
103
 
103
104
  async function loadJsonDir(dir, { skipPrefix } = {}) {
@@ -10,13 +10,16 @@ import { precisionRecallF1, claimMatch, jaccard, normText, mean } from '../lib/m
10
10
  export const meta = { id: 'claim-extraction', mode: 'online' };
11
11
 
12
12
  export async function run({ cases, adapter }) {
13
+ if (!adapter.capabilities.qa) {
14
+ return { meta, skipped: true, reason: `adapter "${adapter.name}" has no QA capability` };
15
+ }
13
16
  if (!adapter.onlineAvailable()) {
14
17
  return { meta, skipped: true, reason: adapter.onlineConfigHint() };
15
18
  }
16
19
 
17
20
  const perCase = [];
18
21
  const splitErrors = [];
19
- for (const c of cases) {
22
+ for (const c of cases.filter(x => x.manuscript && (x.goldClaims || []).length)) {
20
23
  try {
21
24
  const resp = await adapter.splitClaims(c.manuscript);
22
25
  const extracted = (resp.claims || []).map(x => (typeof x === 'string' ? { text: x, citations: [] } : x));
@@ -0,0 +1,135 @@
1
+ // ONLINE scorer — redaction recall.
2
+ //
3
+ // Exercises the adapter's redact() capability against gold cases of kind
4
+ // "redaction": source text plus the identifiers that must be removed. The
5
+ // decisive metric is the LEAK — a gold identifier still present verbatim in
6
+ // the redacted output. For a privacy tool, a single leak is the worst failure
7
+ // there is, so every leak is a named failure and gates the run.
8
+ //
9
+ // Case schema (datasets/SCHEMA.md):
10
+ // {
11
+ // "id": "...", "kind": "redaction", "text": "...",
12
+ // "goldEntities": [{ "type": "PATIENT_NAME", "value": "Mr John Smith" }],
13
+ // "knownGapEntities": [{ "type": "STREET_ADDRESS", "value": "42 Maple Road" }]
14
+ // }
15
+ // knownGapEntities are identifiers the system does not yet catch — reported
16
+ // separately as tracked targets, not failures (same mechanism as known-gap
17
+ // citation styles).
18
+ //
19
+ // Metrics:
20
+ // • recall — share of gold identifiers removed (leaks lower it)
21
+ // • leaks — count of gold identifiers still present (each is a failure)
22
+ // • over_redactions — detected entities not in the gold or known-gap sets
23
+ // (info only: often legitimate extra catches)
24
+ // • knownGap_open / knownGap_closed — tracked targets still open vs now
25
+ // caught (a closed gap means the case can be promoted to goldEntities)
26
+
27
+ import { mean } from '../lib/metrics.mjs';
28
+
29
+ export const meta = { id: 'redaction', mode: 'online' };
30
+
31
+ const escapeRx = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
32
+ /** Is the identifier value still present in the text (case-insensitive, word-bounded)? */
33
+ function stillPresent(text, value) {
34
+ return new RegExp(`(?<![A-Za-z0-9])${escapeRx(value)}(?![A-Za-z0-9])`, 'i').test(text);
35
+ }
36
+
37
+ // For *_NAME entities a full-string check is not enough: a partially redacted
38
+ // name ("[PATIENT_NAME_1] O'Brien") removes the full value but leaks the
39
+ // surname. Scan the individual words of name values too.
40
+ const TITLES = new Set(['mr', 'mrs', 'ms', 'miss', 'mx', 'dr', 'prof']);
41
+ function nameWords(value) {
42
+ return value.split(/[^A-Za-z'’\-]+/)
43
+ .filter(w => w.replace(/[^A-Za-z]/g, '').length >= 3 && !TITLES.has(w.toLowerCase()));
44
+ }
45
+
46
+ /** Words of a name entity that survived redaction and are not excused by a known gap. */
47
+ function leakedNameWords(out, value, gapValues) {
48
+ return nameWords(value).filter(w =>
49
+ stillPresent(out, w) && !gapValues.some(g => g.toLowerCase().includes(w.toLowerCase())));
50
+ }
51
+
52
+ export async function run({ cases, adapter }) {
53
+ if (!adapter.capabilities.redaction) {
54
+ return { meta, skipped: true, reason: `adapter "${adapter.name}" has no redaction capability` };
55
+ }
56
+ if (!adapter.onlineAvailable()) {
57
+ return { meta, skipped: true, reason: adapter.onlineConfigHint() };
58
+ }
59
+ const goldCases = cases.filter(c => c.kind === 'redaction' && (c.goldEntities || []).length);
60
+ if (goldCases.length === 0) {
61
+ return { meta, skipped: true, reason: 'No cases of kind "redaction" with goldEntities.' };
62
+ }
63
+
64
+ const perCase = [];
65
+ const failures = [];
66
+ let gapOpen = 0, gapClosed = 0;
67
+
68
+ for (const c of goldCases) {
69
+ let res;
70
+ try {
71
+ res = await adapter.redact(c.text);
72
+ } catch (err) {
73
+ failures.push(`case ${c.id}: ${err.message}`);
74
+ continue;
75
+ }
76
+ const out = res.text ?? '';
77
+ const gaps = c.knownGapEntities || [];
78
+ const gapValues = gaps.map(e => e.value);
79
+
80
+ const leaked = [];
81
+ for (const e of c.goldEntities || []) {
82
+ if (stillPresent(out, e.value)) {
83
+ leaked.push({ ...e, how: 'verbatim' });
84
+ } else if (/NAME/.test(e.type)) {
85
+ const words = leakedNameWords(out, e.value, gapValues);
86
+ if (words.length) leaked.push({ ...e, how: `partial: ${words.join(', ')}` });
87
+ }
88
+ }
89
+ for (const e of leaked) {
90
+ failures.push(`LEAK in ${c.id}: ${e.type} "${e.value}" survived redaction (${e.how})`);
91
+ }
92
+
93
+ // Tracked targets: not failures, but their status is reported.
94
+ const gapsStillOpen = gaps.filter(e => stillPresent(out, e.value));
95
+ gapOpen += gapsStillOpen.length;
96
+ gapClosed += gaps.length - gapsStillOpen.length;
97
+
98
+ // Over-redaction: detected values unrelated to the gold + known-gap sets.
99
+ // Substring relation in either direction excuses partial captures
100
+ // ("Mrs Eileen" detected for gold "Mrs Eileen O'Brien").
101
+ const expectedVals = [...(c.goldEntities || []), ...gaps].map(e => e.value.toLowerCase());
102
+ const over = (res.entities || []).filter(e => {
103
+ const v = String(e.value).toLowerCase();
104
+ return !expectedVals.some(x => x.includes(v) || v.includes(x));
105
+ });
106
+
107
+ perCase.push({
108
+ case: c.id,
109
+ entities: (c.goldEntities || []).length,
110
+ leaks: leaked.length,
111
+ leakedValues: leaked.map(e => `${e.type}:${e.value}`),
112
+ recall: 1 - leaked.length / (c.goldEntities || []).length,
113
+ overRedactions: over.length,
114
+ gapsStillOpen: gapsStillOpen.map(e => `${e.type}:${e.value}`),
115
+ });
116
+ }
117
+
118
+ const totalEntities = perCase.reduce((a, p) => a + p.entities, 0);
119
+ const totalLeaks = perCase.reduce((a, p) => a + p.leaks, 0);
120
+
121
+ const metrics = {
122
+ n_cases: perCase.length,
123
+ n_entities: totalEntities,
124
+ recall: round(totalEntities ? 1 - totalLeaks / totalEntities : 0),
125
+ leaks: totalLeaks,
126
+ over_redactions: perCase.reduce((a, p) => a + p.overRedactions, 0),
127
+ knownGap_open: gapOpen,
128
+ knownGap_closed: gapClosed,
129
+ mean_case_recall: round(mean(perCase.map(p => p.recall))),
130
+ };
131
+
132
+ return { meta, metrics, detail: { perCase }, failures, passed: failures.length === 0 };
133
+ }
134
+
135
+ function round(x) { return Math.round(x * 1000) / 1000; }
@@ -53,6 +53,9 @@ function isGuardDowngrade(summary) {
53
53
  }
54
54
 
55
55
  export async function run({ cases, adapter }) {
56
+ if (!adapter.capabilities.qa) {
57
+ return { meta, skipped: true, reason: `adapter "${adapter.name}" has no QA capability` };
58
+ }
56
59
  if (!adapter.onlineAvailable()) {
57
60
  return { meta, skipped: true, reason: adapter.onlineConfigHint() };
58
61
  }