npm - @pharmatools/opengate - Versions diffs - 0.1.0 → 0.2.0 - Mend

@pharmatools/opengate 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/ADAPTERS.md +34 -5
package/README.md +17 -4
package/datasets/SCHEMA.md +13 -0
package/datasets/cases/redact-cardio-clinic.json +38 -0
package/datasets/cases/redact-discharge-summary.json +35 -0
package/datasets/cases/redact-gp-referral.json +24 -0
package/datasets/cases/redact-mdt-note.json +24 -0
package/datasets/cases/redact-ward-round.json +31 -0
package/package.json +1 -1
package/src/adapters/redacta.mjs +63 -0
package/src/lib/adapter.mjs +36 -6
package/src/runner.mjs +1 -0
package/src/scorers/claim-extraction.mjs +4 -1
package/src/scorers/redaction.mjs +135 -0
package/src/scorers/verdict-accuracy.mjs +3 -0

package/ADAPTERS.md CHANGED Viewed

@@ -43,7 +43,24 @@ node src/runner.mjs --online --adapter ./src/adapters/http.mjs
 ## The contract
-### Required exports
+Every adapter provides two base exports, plus at least one complete **capability**:
+- **`qa`** — `splitClaims(text)` + `analyzeBatch(payload)`: systems that extract claims and verify them against references (scorers: claim-extraction, verdict-accuracy).
+- **`redaction`** — `redact(text)`: systems that remove identifiers from text (scorer: redaction). Returns `{ text, entities?: [{ value, type }] }`: `text` is the redacted output, and the optional `entities` list the identifiers the system removed (used to count over-redactions).
+Scorers check `adapter.capabilities.<name>` and skip with a reason when a capability is absent — the bundled Redacta adapter (`src/adapters/redacta.mjs`) runs only the redaction scorer, the RefCheckr adapter only the QA scorers.
+### Base exports (always required)
+```js
+/** True when the adapter has the config it needs (URLs, tokens, …). */
+export function onlineAvailable() {}
+/** Human-readable hint shown when onlineAvailable() is false. */
+export function onlineConfigHint() {}
+```
+### QA capability exports
 ```js
 /**
@@ -67,14 +84,26 @@ export function splitClaims(text) {}
  *   strong_support · partial_support · implied_by_data · overclaim · not_supported · contradicted
  */
 export function analyzeBatch(payload) {}
+```
-/** True when the adapter has the config it needs (URLs, tokens, …). */
-export function onlineAvailable() {}
+### Redaction capability exports
-/** Human-readable hint shown when onlineAvailable() is false. */
-export function onlineConfigHint() {}
+```js
+/**
+ * Remove identifiers from text.
+ * @param {string} text — the source document (e.g. a clinical note)
+ * @returns {Promise<{ text: string, entities?: [{ value, type }] }>}
+ *   `text` is the redacted output; `entities` are the identifiers the system
+ *   removed (used to measure over-redaction).
+ */
+export function redact(text) {}
 ```
+The bundled `src/adapters/redacta.mjs` is the reference: it wraps the
+`@pharmatools/redacta` engine via a dynamic import (install with
+`npm install --no-save @pharmatools/redacta`), so OpenGATE itself stays
+dependency-free.
 ### Optional exports
 If absent, the loader supplies no-op defaults, so scorers can call these unconditionally. Implement them to get latency, cost, and model-comparison columns in your scorecards.

package/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # OpenGATE
-[![CI](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml/badge.svg)](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml)
+[![CI](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml/badge.svg)](https://github.com/nickjlamb/opengate/actions/workflows/ci.yml) [![npm](https://img.shields.io/npm/v/%40pharmatools%2Fopengate)](https://www.npmjs.com/package/@pharmatools/opengate)
 **Evidence over plausibility.**
@@ -80,8 +80,8 @@ npm run eval:ci         # exit non-zero on any failure or metric regression
                                         │ adapters
                  ┌──────────────┬───────┴──────┬──────────────┐
                  ▼              ▼              ▼              ▼
-             RefCheckr     Patiently AI     Redacta      your system
-          (first impl.)     (planned)      (planned)    (write an adapter)
+             RefCheckr       Redacta      Patiently AI   your system
+          (first impl., QA) (redaction)    (planned)    (write an adapter)
 ```
 Where it sits in the development loop:
@@ -108,6 +108,7 @@ change a prompt, model, or pipeline
 | `citation-detection` | offline | per-claim citation set exact-match & Jaccard; supported-style accuracy; tracked known-gap styles |
 | `claim-extraction` | online | precision / recall / F1 vs gold; non-claim leakage; citation agreement; **fidelity** (extracted claim is verbatim from source) |
 | `verdict-accuracy` | online | exact & adjacency accuracy on a six-point support scale; confusion matrix; **passage hallucination rate**; consistency across repeats; per-claim latency (p50/p95) and token usage for real cost/claim |
+| `redaction` | online | recall on gold identifiers with **leaks as named failures** (verbatim, and word-level for names); over-redaction count; known-gap tracking for documented engine gaps |
 Offline scorers run with no API key — fast enough for every commit. Online scorers exercise a live system through an adapter.
@@ -159,6 +160,17 @@ Run against RefCheckr's gold set, OpenGATE:
 Full methodology and the model comparison: [how RefCheckr is evaluated](https://www.pharmatools.ai/refcheckr-eval).
+## Second implementation: Redacta
+[Redacta](https://www.pharmatools.ai/redacta) exercises the framework's **redaction capability** — proof the methodology isn't QA-shaped. The bundled adapter wraps the `@pharmatools/redacta` engine, scored against synthetic UK clinical notes with gold-labelled identifiers:
+```bash
+npm install --no-save @pharmatools/redacta
+node src/runner.mjs --online --adapter ./src/adapters/redacta.mjs
+```
+On its first run against the new gold set, the eval found two real engine bugs — a relative name directly followed by a parenthesis escapes the name pattern, and apostrophe surnames are dropped from titled-name capture — both now documented as tracked known gaps in the case files. Current scorecard: **100% recall on 23 gold identifiers, 0 leaks, 3 tracked gaps**.
 ## Layout
 ```
@@ -179,7 +191,8 @@ opengate/
 ## Roadmap
-- **Second adapter** — Patiently AI (faithfulness evaluation for patient-language simplification)
+- **Redacta engine fixes** — two bugs found by the redaction eval (paren-adjacent relative names; apostrophe surnames in titled-name capture) are tracked as known gaps; once they are fixed in `@pharmatools/redacta`, the affected entities move from `knownGap_open` to `knownGap_closed` and can be promoted to `goldEntities`
+- **Third adapter** — Patiently AI (faithfulness evaluation for patient-language simplification)
 - **Author-year in RefCheckr production** — `detectAuthorYear()` now lands "Smith 2020"-style keys in the reference implementation; adopting them in RefCheckr's numeric-keyed citation mapping is tracked separately
 - **Number-adjacent superscript** — `week 24.1` is genuinely ambiguous with decimals; remains a tracked known gap
 - **Growing gold set** — more domains, all six verdict types, real-world reference material

package/datasets/SCHEMA.md CHANGED Viewed

@@ -15,3 +15,16 @@ One JSON file per case in `datasets/cases/` (files starting with `_` are ignored
 Verdict scale (ordered, strongest support → strongest refutation): `strong_support`, `partial_support`, `implied_by_data`, `not_supported`, `contradicted`, `overclaim`.
 Offline scorers need only `manuscript` + `goldClaims`. Reference texts and gold verdicts are required solely for the online verdict scorer.
+## Redaction cases (`kind: "redaction"`)
+Cases for the `redaction` scorer, exercising an adapter's `redact()` capability. All bundled cases are **synthetic** — test-range NHS numbers (999‑prefix, valid modulus‑11), Ofcom test phone numbers (07700 9xxxxx), example.com emails, and fictitious people.
+| Field | Required | Meaning |
+|---|---|---|
+| `id` | yes | Unique slug. |
+| `kind` | yes | Must be `"redaction"`. |
+| `description` | no | Human context; state that identifiers are synthetic. |
+| `text` | yes | The source clinical note. |
+| `goldEntities[]` | yes | `{ type, value }` — identifiers that MUST be removed. Any that survive (verbatim, or word-level for `*_NAME` types) are **leaks** and fail the run. |
+| `knownGapEntities[]` | no | `{ type, value, comment }` — identifiers the system does not yet catch. Reported as tracked targets (`knownGap_open` / `knownGap_closed`), not failures. Use `comment` to document the reproduction. When a gap closes, promote the entity to `goldEntities`. |

package/datasets/cases/redact-cardio-clinic.json ADDED Viewed

@@ -0,0 +1,38 @@
+{
+  "id": "redact-cardio-clinic",
+  "kind": "redaction",
+  "description": "Cardiology clinic letter — titled patient name, NHS number, DOB, phone, email, postcode. SYNTHETIC: all identifiers are test-range values; no real persons.",
+  "text": "Mr John Smith (NHS 999 000 0018, DOB 12/03/1958) attended cardiology clinic today. He was seen by Dr Patel for review of atrial fibrillation. Blood pressure was 132/84 mmHg and apixaban was continued at 5 mg BD. Contact him on 07700 900123 or john.smith58@example.com. Address on file: 42 Maple Road, Oxford OX2 6JX. Follow-up in 12 weeks.",
+  "goldEntities": [
+    {
+      "type": "PATIENT_NAME",
+      "value": "Mr John Smith"
+    },
+    {
+      "type": "NHS_NUMBER",
+      "value": "999 000 0018"
+    },
+    {
+      "type": "DATE_OF_BIRTH",
+      "value": "12/03/1958"
+    },
+    {
+      "type": "PHONE",
+      "value": "07700 900123"
+    },
+    {
+      "type": "EMAIL",
+      "value": "john.smith58@example.com"
+    },
+    {
+      "type": "POSTCODE",
+      "value": "OX2 6JX"
+    }
+  ],
+  "knownGapEntities": [
+    {
+      "type": "STREET_ADDRESS",
+      "value": "42 Maple Road"
+    }
+  ]
+}

package/datasets/cases/redact-discharge-summary.json ADDED Viewed

@@ -0,0 +1,35 @@
+{
+  "id": "redact-discharge-summary",
+  "kind": "redaction",
+  "description": "Discharge summary with a relative as next of kin. SYNTHETIC.",
+  "text": "Discharge summary for Mrs Priya Sharma, NHS 999 000 0026, DOB 04/11/1947. Admitted with community-acquired pneumonia; completed 5 days of co-amoxiclav and improved steadily. Next of kin: her daughter Anita (07700 900456) was updated by phone. Discharged to home address, postcode LS8 2QT, with district nurse follow-up.",
+  "goldEntities": [
+    {
+      "type": "PATIENT_NAME",
+      "value": "Mrs Priya Sharma"
+    },
+    {
+      "type": "NHS_NUMBER",
+      "value": "999 000 0026"
+    },
+    {
+      "type": "DATE_OF_BIRTH",
+      "value": "04/11/1947"
+    },
+    {
+      "type": "PHONE",
+      "value": "07700 900456"
+    },
+    {
+      "type": "POSTCODE",
+      "value": "LS8 2QT"
+    }
+  ],
+  "knownGapEntities": [
+    {
+      "type": "RELATIVE_NAME",
+      "value": "Anita",
+      "comment": "Engine bug (tracked): a relative name directly followed by an opening parenthesis escapes the relative-name pattern — 'her daughter Anita (07700...' leaks, while 'her daughter Anita was' is caught. The phone match also consumes the opening paren, leaving an orphan ')'."
+    }
+  ]
+}

package/datasets/cases/redact-gp-referral.json ADDED Viewed

@@ -0,0 +1,24 @@
+{
+  "id": "redact-gp-referral",
+  "kind": "redaction",
+  "description": "GP referral letter — patient with title, contact details, clinician names preserved by design. SYNTHETIC.",
+  "text": "Dear colleague, I would be grateful if you could see Miss Chloe Davies (DOB 22/07/1996) regarding persistent iron-deficiency anaemia despite oral replacement. Ferritin 8 ug/L last month. She works night shifts and prefers contact by email at chloe.davies96@example.com or on 07700 900789. Kind regards, Dr Okafor, Riverside Surgery.",
+  "goldEntities": [
+    {
+      "type": "PATIENT_NAME",
+      "value": "Miss Chloe Davies"
+    },
+    {
+      "type": "DATE_OF_BIRTH",
+      "value": "22/07/1996"
+    },
+    {
+      "type": "EMAIL",
+      "value": "chloe.davies96@example.com"
+    },
+    {
+      "type": "PHONE",
+      "value": "07700 900789"
+    }
+  ]
+}

package/datasets/cases/redact-mdt-note.json ADDED Viewed

@@ -0,0 +1,24 @@
+{
+  "id": "redact-mdt-note",
+  "kind": "redaction",
+  "description": "MDT oncology note — hyphenated surname, DOB keyword variant, postcode. SYNTHETIC.",
+  "text": "MDT discussion: Mr Tomasz Kowalski-Nowak, date of birth 30/01/1969, NHS 999 000 0042, new diagnosis of stage II colorectal adenocarcinoma. Fit for laparoscopic resection; anaesthetic review booked. Patient lives alone, postcode M14 5RB; holistic needs assessment arranged. Histology to be reviewed at next meeting.",
+  "goldEntities": [
+    {
+      "type": "PATIENT_NAME",
+      "value": "Mr Tomasz Kowalski-Nowak"
+    },
+    {
+      "type": "DATE_OF_BIRTH",
+      "value": "30/01/1969"
+    },
+    {
+      "type": "NHS_NUMBER",
+      "value": "999 000 0042"
+    },
+    {
+      "type": "POSTCODE",
+      "value": "M14 5RB"
+    }
+  ]
+}

package/datasets/cases/redact-ward-round.json ADDED Viewed

@@ -0,0 +1,31 @@
+{
+  "id": "redact-ward-round",
+  "kind": "redaction",
+  "description": "Ward round entry — apostrophe surname, NHS number without spaces, carer mention. SYNTHETIC.",
+  "text": "Ward round: Mrs Eileen O'Brien, NHS 9990000034, remains stable post hip hemiarthroplasty. Mobilising with frame. Her son Daniel visited and raised concerns about home support; referred to social work. Plan: continue enoxaparin, orthogeriatric review tomorrow. Contact for family updates: 07700 900321.",
+  "goldEntities": [
+    {
+      "type": "PATIENT_NAME",
+      "value": "Mrs Eileen O'Brien"
+    },
+    {
+      "type": "NHS_NUMBER",
+      "value": "9990000034"
+    },
+    {
+      "type": "RELATIVE_NAME",
+      "value": "Daniel"
+    },
+    {
+      "type": "PHONE",
+      "value": "07700 900321"
+    }
+  ],
+  "knownGapEntities": [
+    {
+      "type": "PATIENT_SURNAME",
+      "value": "O'Brien",
+      "comment": "Engine bug (tracked): apostrophe surnames are dropped from titled-name capture — 'Mrs Eileen O'Brien' tokenises as 'Mrs Eileen' only, leaving the surname in the text."
+    }
+  ]
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@pharmatools/opengate",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "type": "module",
   "description": "OpenGATE — Open Grounded AI Testing & Evaluation. An open-source framework for evaluating evidence-grounded AI systems.",
   "license": "MIT",

package/src/adapters/redacta.mjs ADDED Viewed

@@ -0,0 +1,63 @@
+// Redacta adapter — second bundled implementation, exercising the framework's
+// redaction capability. Wraps @pharmatools/redacta, the dependency-free
+// engine behind the Redacta app, iPhone app, CLI, and MCP server.
+//
+// OpenGATE itself stays zero-dependency: the engine is imported dynamically
+// and this adapter reports how to install it when absent:
+//
+//   npm install --no-save @pharmatools/redacta
+//   node src/runner.mjs --online --adapter ./src/adapters/redacta.mjs
+//
+// Config via env:
+//   OPENGATE_REDACTA_CATEGORIES   comma-separated: clinical,general,safeharbor
+//                                 (default: clinical,general)
+export const meta = { name: 'redacta' };
+let engine = null;
+let loadError = null;
+try {
+  engine = await import('@pharmatools/redacta');
+} catch (err) {
+  loadError = err;
+}
+export function onlineAvailable() {
+  return Boolean(engine);
+}
+export function onlineConfigHint() {
+  return 'Redacta engine not installed — run: npm install --no-save @pharmatools/redacta' +
+    (loadError ? ` (${loadError.code || loadError.message})` : '');
+}
+function categories() {
+  return (process.env.OPENGATE_REDACTA_CATEGORIES || 'clinical,general')
+    .split(',').map(s => s.trim()).filter(Boolean);
+}
+// ── Timing capture (local calls, but latency is still worth recording) ──
+const _calls = [];
+export function resetTiming() { _calls.length = 0; }
+export function callLatencies() { return _calls.map(c => c.ms); }
+/**
+ * Redaction capability. A fresh Redactor per call keeps token maps
+ * case-scoped, so identical values across cases can't mask each other.
+ * Returns { text, entities: [{ token, type, value }] }.
+ */
+export async function redact(text) {
+  const t0 = performance.now();
+  try {
+    const r = new engine.Redactor(categories());
+    const { text: redacted } = r.redactText(text);
+    const entities = Object.entries(r.tokenMap).map(([token, value]) => ({
+      token,
+      type: token.replace(/^\[|_\d+\]$/g, ''),
+      value,
+    }));
+    return { text: redacted, entities };
+  } finally {
+    _calls.push({ ms: performance.now() - t0 });
+  }
+}

package/src/lib/adapter.mjs CHANGED Viewed

@@ -16,8 +16,15 @@ import { pathToFileURL, fileURLToPath } from 'node:url';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const DEFAULT_ADAPTER = join(__dirname, '..', 'adapters', 'refcheckr.mjs');
-// Required: online scorers cannot run without these.
-const REQUIRED = ['splitClaims', 'analyzeBatch', 'onlineAvailable', 'onlineConfigHint'];
+// Base exports every adapter must provide.
+const REQUIRED_BASE = ['onlineAvailable', 'onlineConfigHint'];
+// Capabilities: an adapter implements at least one, completely. Scorers check
+// `adapter.capabilities.<name>` and skip (with a reason) when absent.
+const CAPABILITIES = {
+  qa: ['splitClaims', 'analyzeBatch'],       // claim extraction + verdicts against references
+  redaction: ['redact'],                      // identifier removal from text
+};
 // Optional: validated if present; no-op defaults are supplied if absent, so
 // scorers can call them unconditionally.
@@ -33,12 +40,35 @@ const DEFAULTS = {
   }),
 };
+/** Which capabilities does a module fully implement? */
+export function detectCapabilities(mod) {
+  return Object.fromEntries(
+    Object.entries(CAPABILITIES).map(([name, fns]) =>
+      [name, fns.every(fn => typeof mod[fn] === 'function')])
+  );
+}
 /** Throws with a readable message listing every problem, or returns silently. */
 export function validateAdapter(mod, source = 'adapter') {
   const problems = [];
-  for (const fn of REQUIRED) {
+  for (const fn of REQUIRED_BASE) {
     if (typeof mod[fn] !== 'function') problems.push(`missing required export: ${fn}()`);
   }
+  const caps = detectCapabilities(mod);
+  if (!Object.values(caps).some(Boolean)) {
+    problems.push(
+      'no complete capability implemented — provide ' +
+      Object.entries(CAPABILITIES).map(([n, fns]) => `${fns.map(f => `${f}()`).join(' + ')} (${n})`).join(' or ')
+    );
+    // Name partially implemented capabilities to make the fix obvious.
+    for (const [name, fns] of Object.entries(CAPABILITIES)) {
+      const present = fns.filter(fn => typeof mod[fn] === 'function');
+      if (present.length && present.length < fns.length) {
+        const missing = fns.filter(fn => typeof mod[fn] !== 'function');
+        problems.push(`capability "${name}" is incomplete — missing ${missing.map(f => `${f}()`).join(', ')}`);
+      }
+    }
+  }
   for (const fn of OPTIONAL) {
     if (fn in mod && typeof mod[fn] !== 'function') problems.push(`optional export ${fn} is not a function`);
   }
@@ -65,13 +95,13 @@ export async function loadAdapter(specOverride) {
     throw new Error(`Could not load adapter from ${path}: ${err.message}`);
   }
   validateAdapter(mod, path);
+  const allFns = [...REQUIRED_BASE, ...Object.values(CAPABILITIES).flat(), ...OPTIONAL];
   const methods = Object.fromEntries(
-    [...REQUIRED, ...OPTIONAL]
-      .filter(f => typeof mod[f] === 'function')
-      .map(f => [f, mod[f]])
+    allFns.filter(f => typeof mod[f] === 'function').map(f => [f, mod[f]])
   );
   return {
     name: mod.meta?.name || basename(path, '.mjs'),
+    capabilities: detectCapabilities(mod),
     ...DEFAULTS,
     ...methods,
   };

package/src/runner.mjs CHANGED Viewed

@@ -98,6 +98,7 @@ const SCORERS = [
   './scorers/citation-detection.mjs',
   './scorers/claim-extraction.mjs',
   './scorers/verdict-accuracy.mjs',
+  './scorers/redaction.mjs',
 ];
 async function loadJsonDir(dir, { skipPrefix } = {}) {

package/src/scorers/claim-extraction.mjs CHANGED Viewed

@@ -10,13 +10,16 @@ import { precisionRecallF1, claimMatch, jaccard, normText, mean } from '../lib/m
 export const meta = { id: 'claim-extraction', mode: 'online' };
 export async function run({ cases, adapter }) {
+  if (!adapter.capabilities.qa) {
+    return { meta, skipped: true, reason: `adapter "${adapter.name}" has no QA capability` };
+  }
   if (!adapter.onlineAvailable()) {
     return { meta, skipped: true, reason: adapter.onlineConfigHint() };
   }
   const perCase = [];
   const splitErrors = [];
-  for (const c of cases) {
+  for (const c of cases.filter(x => x.manuscript && (x.goldClaims || []).length)) {
    try {
     const resp = await adapter.splitClaims(c.manuscript);
     const extracted = (resp.claims || []).map(x => (typeof x === 'string' ? { text: x, citations: [] } : x));

package/src/scorers/redaction.mjs ADDED Viewed

@@ -0,0 +1,135 @@
+// ONLINE scorer — redaction recall.
+//
+// Exercises the adapter's redact() capability against gold cases of kind
+// "redaction": source text plus the identifiers that must be removed. The
+// decisive metric is the LEAK — a gold identifier still present in the
+// redacted output, either verbatim or (for *_NAME types) as a surviving word
+// of the name. For a privacy tool, a single leak is the worst failure there
+// is, so every leak is a named failure and gates the run.
+//
+// Case schema (datasets/SCHEMA.md):
+//   {
+//     "id": "...", "kind": "redaction", "text": "...",
+//     "goldEntities":     [{ "type": "PATIENT_NAME", "value": "Mr John Smith" }],
+//     "knownGapEntities": [{ "type": "STREET_ADDRESS", "value": "42 Maple Road" }]
+//   }
+// knownGapEntities are identifiers the system does not yet catch — reported
+// separately as tracked targets, not failures (same mechanism as known-gap
+// citation styles).
+//
+// Metrics:
+//   • recall — share of gold identifiers removed (leaks lower it)
+//   • leaks — count of gold identifiers still present (each is a failure)
+//   • over_redactions — detected entities not in the gold or known-gap sets
+//     (info only: often legitimate extra catches)
+//   • knownGap_open / knownGap_closed — tracked targets still open vs now
+//     caught (a closed gap means the entity can be promoted to goldEntities)
+import { mean } from '../lib/metrics.mjs';
+export const meta = { id: 'redaction', mode: 'online' };
/** Escape regex metacharacters so an identifier value is matched literally. */
const escapeRx = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

/**
 * Is the identifier value still present in the text?
 *
 * Matching is case-insensitive and bounded: the value must not be directly
 * preceded or followed by a letter, combining mark, or digit, so "999" is not
 * reported inside "9990000034" and "Zo" is not reported inside "Zoë".
 * Boundaries are Unicode-aware (not just A–Z / 0–9) and both sides are
 * NFC-normalised, so a composed "ë" in the gold value matches a decomposed
 * "e" + U+0308 in the output.
 *
 * @param {string} text  — redacted output to search
 * @param {string} value — identifier that should have been removed
 * @returns {boolean} true when the value survives in the text
 */
function stillPresent(text, value) {
  const wordChar = '[\\p{L}\\p{M}\\p{N}]';
  const needle = escapeRx(String(value).normalize('NFC'));
  const rx = new RegExp(`(?<!${wordChar})${needle}(?!${wordChar})`, 'iu');
  return rx.test(String(text).normalize('NFC'));
}

// For *_NAME entities a full-string check is not enough: a partially redacted
// name ("[PATIENT_NAME_1] O'Brien") removes the full value but leaks the
// surname. Scan the individual words of name values too.
const TITLES = new Set(['mr', 'mrs', 'ms', 'miss', 'mx', 'dr', 'prof']);

/**
 * Split a name value into the words worth checking individually.
 *
 * Words are runs of letters (any script), combining marks, apostrophes and
 * hyphens, so "O'Brien", "Kowalski-Nowak" and "Siân" each stay whole. Words
 * with fewer than three letters and honorific titles are dropped — they are
 * too common in clinical text to count as a leak on their own.
 *
 * @param {string} value — the full gold name, e.g. "Mrs Eileen O'Brien"
 * @returns {string[]} candidate words, in source order
 */
function nameWords(value) {
  return String(value).normalize('NFC')
    .split(/[^\p{L}\p{M}'’\-]+/u)
    .filter(w => (w.match(/\p{L}/gu) || []).length >= 3 && !TITLES.has(w.toLowerCase()));
}

/**
 * Words of a name entity that survived redaction and are not excused by a known gap.
 *
 * @param {string} out         — redacted output
 * @param {string} value       — the full gold name
 * @param {string[]} gapValues — values of the case's knownGapEntities; a word
 *   contained in any of them is a tracked gap, not a new leak
 * @returns {string[]} surviving words
 */
function leakedNameWords(out, value, gapValues) {
  const gaps = gapValues.map(g => String(g).normalize('NFC').toLowerCase());
  return nameWords(value).filter(w =>
    stillPresent(out, w) && !gaps.some(g => g.includes(w.toLowerCase())));
}
/**
 * Score an adapter's redact() capability against the gold redaction cases.
 *
 * Skips (with a reason) when the adapter has no redaction capability, is not
 * configured, or when no case of kind "redaction" carries goldEntities.
 * Otherwise each case is redacted once and checked for leaks, known-gap
 * status and over-redactions.
 *
 * A case whose redact() call throws, or whose result has no string `text`,
 * is recorded as a named failure and left out of the per-case metrics. A
 * missing `text` must never be read as "everything was removed": an empty
 * output contains no identifiers and would otherwise score as perfect recall.
 *
 * @param {{ cases: object[], adapter: object }} ctx — loaded dataset cases and
 *   the loaded adapter (with `capabilities`, `name`, `onlineAvailable()`,
 *   `onlineConfigHint()` and `redact()`)
 * @returns {Promise<object>} `{ meta, skipped, reason }` when skipped, else
 *   `{ meta, metrics, detail: { perCase }, failures, passed }`
 */
export async function run({ cases, adapter }) {
  if (!adapter.capabilities.redaction) {
    return { meta, skipped: true, reason: `adapter "${adapter.name}" has no redaction capability` };
  }
  if (!adapter.onlineAvailable()) {
    return { meta, skipped: true, reason: adapter.onlineConfigHint() };
  }
  const goldCases = cases.filter(c => c.kind === 'redaction' && (c.goldEntities || []).length);
  if (goldCases.length === 0) {
    return { meta, skipped: true, reason: 'No cases of kind "redaction" with goldEntities.' };
  }

  const perCase = [];
  const failures = [];
  let gapOpen = 0;
  let gapClosed = 0;

  for (const c of goldCases) {
    let res;
    try {
      res = await adapter.redact(c.text);
    } catch (err) {
      failures.push(`case ${c.id}: ${err?.message ?? err}`);
      continue;
    }
    // Contract check: without a string `text` there is nothing to scan, and
    // scanning '' instead would report zero leaks for a broken adapter.
    if (typeof res?.text !== 'string') {
      failures.push(`case ${c.id}: redact() returned no text (expected { text: string, entities? })`);
      continue;
    }

    const out = res.text;
    const gold = c.goldEntities || [];
    const gaps = c.knownGapEntities || [];
    const gapValues = gaps.map(e => e.value);

    // Leaks: the full value survives, or (name types only) one of its words does.
    const leaked = [];
    for (const e of gold) {
      if (stillPresent(out, e.value)) {
        leaked.push({ ...e, how: 'verbatim' });
      } else if (/NAME/.test(e.type)) {
        const words = leakedNameWords(out, e.value, gapValues);
        if (words.length) leaked.push({ ...e, how: `partial: ${words.join(', ')}` });
      }
    }
    for (const e of leaked) {
      failures.push(`LEAK in ${c.id}: ${e.type} "${e.value}" survived redaction (${e.how})`);
    }

    // Tracked targets: not failures, but their status is reported.
    const gapsStillOpen = gaps.filter(e => stillPresent(out, e.value));
    gapOpen += gapsStillOpen.length;
    gapClosed += gaps.length - gapsStillOpen.length;

    // Over-redaction: detected values unrelated to the gold + known-gap sets.
    // Substring relation in either direction excuses partial captures
    // ("Mrs Eileen" detected for gold "Mrs Eileen O'Brien").
    const expectedVals = [...gold, ...gaps].map(e => e.value.toLowerCase());
    const detected = Array.isArray(res.entities) ? res.entities : [];
    const over = detected.filter(e => {
      const v = String(e.value).toLowerCase();
      return !expectedVals.some(x => x.includes(v) || v.includes(x));
    });

    perCase.push({
      case: c.id,
      entities: gold.length,
      leaks: leaked.length,
      leakedValues: leaked.map(e => `${e.type}:${e.value}`),
      recall: 1 - leaked.length / gold.length,
      overRedactions: over.length,
      gapsStillOpen: gapsStillOpen.map(e => `${e.type}:${e.value}`),
    });
  }

  const totalEntities = perCase.reduce((a, p) => a + p.entities, 0);
  const totalLeaks = perCase.reduce((a, p) => a + p.leaks, 0);
  const metrics = {
    n_cases: perCase.length,
    n_entities: totalEntities,
    recall: round(totalEntities ? 1 - totalLeaks / totalEntities : 0),
    leaks: totalLeaks,
    over_redactions: perCase.reduce((a, p) => a + p.overRedactions, 0),
    knownGap_open: gapOpen,
    knownGap_closed: gapClosed,
    mean_case_recall: round(mean(perCase.map(p => p.recall))),
  };
  return { meta, metrics, detail: { perCase }, failures, passed: failures.length === 0 };
}
+function round(x) { return Math.round(x * 1000) / 1000; }

package/src/scorers/verdict-accuracy.mjs CHANGED Viewed

@@ -53,6 +53,9 @@ function isGuardDowngrade(summary) {
 }
 export async function run({ cases, adapter }) {
+  if (!adapter.capabilities.qa) {
+    return { meta, skipped: true, reason: `adapter "${adapter.name}" has no QA capability` };
+  }
   if (!adapter.onlineAvailable()) {
     return { meta, skipped: true, reason: adapter.onlineConfigHint() };
   }