npm - @pharmatools/opengate - Versions diffs - 0.4.0 → 0.5.0 - Mend

@pharmatools/opengate 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -116,7 +116,7 @@ Offline scorers run with no API key — fast enough for every commit. Online sco
 **Scorecards** — every run writes `results/<timestamp>.json` stamped with the git SHA, so any result is reproducible and auditable. Per-model runs carry a `run_model` label, turning the results directory into a measured model comparison (accuracy × hallucination × latency × cost).
-**Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. No change ships without proving it didn't make the system less reliable.
+**Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. Baselines are **per-adapter** (`baseline.<adapter>.json`), so a PubCrawl retrieval scorecard can't clobber a RefCheckr QA one — each adapter keeps its own reference. No change ships without proving it didn't make the system less reliable.
 ## Adapters: evaluating your own system
@@ -136,13 +136,13 @@ Full contract, minimal skeleton, and verdict-mapping notes: **[ADAPTERS.md](ADAP
 ## CI: the GitHub Action
-Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed `baseline.json` in your own tree; any metric that drops fails the build:
+Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed baseline (`baseline.<adapter>.json`) in your own tree; any metric that drops fails the build:
 ```yaml
 - uses: nickjlamb/opengate@v0
   with:
     datasets: ./evals/datasets      # your cases/ + fixtures/
-    results: ./evals/results        # where baseline.json lives
+    results: ./evals/results        # where baseline.<adapter>.json lives
     adapter: ./evals/my-adapter.mjs # or the bundled HTTP adapter
     online: 'true'
   env:
@@ -208,14 +208,14 @@ opengate/
     scorers/      one file per metric family
     adapters/     system-under-test boundary (refcheckr.mjs is the reference)
     runner.mjs    CLI: discover cases → run scorers → report → snapshot → regression-check
-  results/        timestamped run snapshots + baseline.json
+  results/        timestamped run snapshots + baseline.<adapter>.json
 ```
 ## Roadmap
-- **Per-adapter baselines** — `results/baseline.json` is a single reference; runs against different adapters (RefCheckr QA vs Patiently simplification) should baseline separately (workable today via `--results <dir>`, first-class support planned)
-- **Author-year in RefCheckr production** — `detectAuthorYear()` now lands "Smith 2020"-style keys in the reference implementation; adopting them in RefCheckr's numeric-keyed citation mapping is tracked separately
+- **Retrieval breadth** — retrieval currently scores one PubMed record type; extend to full-text, citation formatting, and trial detail across PubCrawl's other tools
+- **Retrieval coverage** — the retrieval gold set is one case; add a single-author paper (the exact array-collapse risk the capability exists to catch), a trial (NCT) record, and a full-text/citation case across PubCrawl's other tools
 - **Number-adjacent superscript** — `week 24.1` is genuinely ambiguous with decimals; remains a tracked known gap
 - **Growing gold set** — more domains, all six verdict types, real-world reference material
 - **Stable adapter surface** — the contract may still shift pre-1.0; semver will signal breaking changes

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@pharmatools/opengate",
-  "version": "0.4.0",
+  "version": "0.5.0",
   "type": "module",
   "description": "OpenGATE — Open Grounded AI Testing & Evaluation. An open-source framework for evaluating evidence-grounded AI systems.",
   "license": "MIT",

package/src/lib/baseline.mjs ADDED

@@ -0,0 +1,36 @@
+// Per-adapter baseline resolution.
+//
+// A regression baseline only means something relative to the adapter that
+// produced it — a PubCrawl retrieval scorecard and a RefCheckr QA scorecard
+// share no metrics, so a single baseline.json would let one clobber the other.
+// Baselines are therefore keyed by adapter: baseline.<adapter>.json.
+//
+// Legacy migration: an older single baseline.json (which records the adapter
+// that produced it) is still honoured — but ONLY for that same adapter, never
+// cross-adapter.
+/** Filesystem-safe per-adapter baseline filename. */
+export function baselineFileName(adapterName) {
+  const safe = String(adapterName || 'default').replace(/[^a-z0-9_-]+/gi, '-').toLowerCase();
+  return `baseline.${safe}.json`;
+}
+/**
+ * Choose which baseline file to read for a regression check.
+ * @param {string} adapterName
+ * @param {object} present
+ *   { perAdapter: boolean,          // baseline.<adapter>.json exists
+ *     legacy: boolean,              // baseline.json exists
+ *     legacyAdapter: string|null }  // the `adapter` field inside baseline.json
+ * @returns {{ file: string, source: 'per-adapter'|'legacy' } | null}
+ */
+export function resolveBaseline(adapterName, present) {
+  if (present.perAdapter) {
+    return { file: baselineFileName(adapterName), source: 'per-adapter' };
+  }
+  // A legacy baseline is trustworthy only for the adapter that wrote it.
+  if (present.legacy && present.legacyAdapter === adapterName) {
+    return { file: 'baseline.json', source: 'legacy' };
+  }
+  return null;
+}

package/src/runner.mjs CHANGED

@@ -4,11 +4,12 @@
 //   node src/runner.mjs            # offline scorers only
 //   node src/runner.mjs --online   # also run scorers that hit the API
 //   node src/runner.mjs --ci       # exit non-zero on failure or regression
-//   node src/runner.mjs --baseline # save this run as results/baseline.json
+//   node src/runner.mjs --baseline # save this run as results/baseline.<adapter>.json
 //
 // Discovers gold cases (datasets/cases/*.json) and fixtures (datasets/fixtures/*.json),
 // runs each scorer, prints a summary, writes a timestamped results file, and
-// compares headline metrics against results/baseline.json to catch regressions.
+// compares headline metrics against the per-adapter baseline
+// (results/baseline.<adapter>.json) to catch regressions.
 import { readdir, readFile, writeFile, mkdir } from 'node:fs/promises';
 import { existsSync } from 'node:fs';
@@ -16,6 +17,7 @@ import { fileURLToPath } from 'node:url';
 import { dirname, join, resolve } from 'node:path';
 import { execSync } from 'node:child_process';
 import { loadAdapter } from './lib/adapter.mjs';
+import { baselineFileName, resolveBaseline } from './lib/baseline.mjs';
 const __dirname = dirname(fileURLToPath(import.meta.url));
 const EVAL_ROOT = join(__dirname, '..');
@@ -185,31 +187,49 @@ async function main() {
   };
   await writeFile(join(RESULTS_DIR, `${stamp}.json`), JSON.stringify({ ...snapshot, results }, null, 2));
   if (SAVE_BASELINE) {
-    await writeFile(join(RESULTS_DIR, 'baseline.json'), JSON.stringify(snapshot, null, 2));
-    console.log('\n  saved baseline.json');
+    // Baselines are per-adapter — a PubCrawl scorecard must not overwrite a
+    // RefCheckr one (they share no metrics).
+    const fname = baselineFileName(adapter.name);
+    await writeFile(join(RESULTS_DIR, fname), JSON.stringify(snapshot, null, 2));
+    console.log(`\n  saved ${fname}`);
   }
-  // ── Regression check vs baseline ──
+  // ── Regression check vs the baseline for THIS adapter ──
   let regressed = false;
-  const baselinePath = join(RESULTS_DIR, 'baseline.json');
-  if (existsSync(baselinePath) && !SAVE_BASELINE) {
-    const base = JSON.parse(await readFile(baselinePath, 'utf8'));
-    const baseById = Object.fromEntries(base.results.map(r => [r.id, r]));
-    console.log('\n  vs baseline:');
-    for (const r of results) {
-      const b = baseById[r.id];
-      if (!b || !r.metrics || !b.metrics) continue;
-      for (const [k, v] of Object.entries(r.metrics)) {
-        if (typeof v !== 'number' || typeof b.metrics[k] !== 'number') continue;
-        const delta = v - b.metrics[k];
-        if (Math.abs(delta) < 1e-9) continue;
-        const arrow = delta > 0 ? '▲' : '▼';
-        // Only rate metrics gate regressions (higher = better); counts shown for info.
-        if (isRateKey(k)) {
-          if (delta < -1e-9) regressed = true;
-          console.log(`      ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${(delta * 100).toFixed(1)}pp`);
-        } else {
-          console.log(`      ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${delta}`);
+  if (!SAVE_BASELINE) {
+    const legacyPath = join(RESULTS_DIR, 'baseline.json');
+    const perAdapterPath = join(RESULTS_DIR, baselineFileName(adapter.name));
+    let legacyAdapter = null;
+    if (existsSync(legacyPath)) {
+      try { legacyAdapter = JSON.parse(await readFile(legacyPath, 'utf8')).adapter ?? null; } catch { /* ignore */ }
+    }
+    const chosen = resolveBaseline(adapter.name, {
+      perAdapter: existsSync(perAdapterPath),
+      legacy: existsSync(legacyPath),
+      legacyAdapter,
+    });
+    if (chosen) {
+      const base = JSON.parse(await readFile(join(RESULTS_DIR, chosen.file), 'utf8'));
+      const baseById = Object.fromEntries(base.results.map(r => [r.id, r]));
+      console.log(`\n  vs baseline (${chosen.file}):`);
+      if (chosen.source === 'legacy') {
+        console.log(`      note: using legacy baseline.json — re-run --baseline to migrate to ${baselineFileName(adapter.name)}`);
+      }
+      for (const r of results) {
+        const b = baseById[r.id];
+        if (!b || !r.metrics || !b.metrics) continue;
+        for (const [k, v] of Object.entries(r.metrics)) {
+          if (typeof v !== 'number' || typeof b.metrics[k] !== 'number') continue;
+          const delta = v - b.metrics[k];
+          if (Math.abs(delta) < 1e-9) continue;
+          const arrow = delta > 0 ? '▲' : '▼';
+          // Only rate metrics gate regressions (higher = better); counts shown for info.
+          if (isRateKey(k)) {
+            if (delta < -1e-9) regressed = true;
+            console.log(`      ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${(delta * 100).toFixed(1)}pp`);
+          } else {
+            console.log(`      ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${delta}`);
+          }
         }
       }
     }