@pharmatools/opengate 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -116,7 +116,7 @@ Offline scorers run with no API key — fast enough for every commit. Online sco
116
116
 
117
117
  **Scorecards** — every run writes `results/<timestamp>.json` stamped with the git SHA, so any result is reproducible and auditable. Per-model runs carry a `run_model` label, turning the results directory into a measured model comparison (accuracy × hallucination × latency × cost).
118
118
 
119
- **Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. No change ships without proving it didn't make the system less reliable.
119
+ **Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. Baselines are **per-adapter** (`baseline.<adapter>.json`), so a PubCrawl retrieval scorecard can't clobber a RefCheckr QA one — each adapter keeps its own reference. No change ships without proving it didn't make the system less reliable.
120
120
 
121
121
  ## Adapters: evaluating your own system
122
122
 
@@ -136,13 +136,13 @@ Full contract, minimal skeleton, and verdict-mapping notes: **[ADAPTERS.md](ADAP
136
136
 
137
137
  ## CI: the GitHub Action
138
138
 
139
- Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed `baseline.json` in your own tree; any metric that drops fails the build:
139
+ Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed baseline (`baseline.<adapter>.json`) in your own tree; any metric that drops fails the build:
140
140
 
141
141
  ```yaml
142
142
  - uses: nickjlamb/opengate@v0
143
143
  with:
144
144
  datasets: ./evals/datasets # your cases/ + fixtures/
145
- results: ./evals/results # where baseline.json lives
145
+ results: ./evals/results # where baseline.<adapter>.json lives
146
146
  adapter: ./evals/my-adapter.mjs # or the bundled HTTP adapter
147
147
  online: 'true'
148
148
  env:
@@ -208,14 +208,14 @@ opengate/
208
208
  scorers/ one file per metric family
209
209
  adapters/ system-under-test boundary (refcheckr.mjs is the reference)
210
210
  runner.mjs CLI: discover cases → run scorers → report → snapshot → regression-check
211
- results/ timestamped run snapshots + baseline.json
211
+ results/ timestamped run snapshots + baseline.<adapter>.json
212
212
  ```
213
213
 
214
214
  ## Roadmap
215
215
 
216
216
 
217
- - **Per-adapter baselines** — `results/baseline.json` is a single reference; runs against different adapters (RefCheckr QA vs Patiently simplification) should baseline separately (workable today via `--results <dir>`, first-class support planned)
218
- - **Author-year in RefCheckr production** — `detectAuthorYear()` now lands "Smith 2020"-style keys in the reference implementation; adopting them in RefCheckr's numeric-keyed citation mapping is tracked separately
217
+ - **Retrieval breadth** — retrieval currently scores one PubMed record type; extend to full-text, citation formatting, and trial detail across PubCrawl's other tools
218
+ - **Retrieval coverage** — the retrieval gold set is one case; add a single-author paper (the exact array-collapse risk the capability exists to catch), a trial (NCT) record, and a full-text/citation case across PubCrawl's other tools
219
219
  - **Number-adjacent superscript** — `week 24.1` is genuinely ambiguous with decimals; remains a tracked known gap
220
220
  - **Growing gold set** — more domains, all six verdict types, real-world reference material
221
221
  - **Stable adapter surface** — the contract may still shift pre-1.0; semver will signal breaking changes
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pharmatools/opengate",
3
- "version": "0.4.0",
3
+ "version": "0.5.0",
4
4
  "type": "module",
5
5
  "description": "OpenGATE — Open Grounded AI Testing & Evaluation. An open-source framework for evaluating evidence-grounded AI systems.",
6
6
  "license": "MIT",
@@ -0,0 +1,36 @@
1
+ // Per-adapter baseline resolution.
2
+ //
3
+ // A regression baseline only means something relative to the adapter that
4
+ // produced it — a PubCrawl retrieval scorecard and a RefCheckr QA scorecard
5
+ // share no metrics, so a single baseline.json would let one clobber the other.
6
+ // Baselines are therefore keyed by adapter: baseline.<adapter>.json.
7
+ //
8
+ // Legacy migration: an older single baseline.json (which records the adapter
9
+ // that produced it) is still honoured — but ONLY for that same adapter, never
10
+ // cross-adapter.
11
+
12
+ /**
+  * Filesystem-safe per-adapter baseline filename.
+  *
+  * A falsy adapter name falls back to 'default'. The name is coerced with
+  * String(), every run of characters outside [a-z0-9_-] (matched
+  * case-insensitively) is collapsed to a single '-', and the result is
+  * lower-cased. Names that differ only in case, or only in which unsafe
+  * characters they contain, therefore map to the same file.
+  *
+  * @param {*} adapterName - adapter identifier (any value; stringified).
+  * @returns {string} a filename of the form `baseline.<safe>.json`.
+  */
13
+ export function baselineFileName(adapterName) {
14
+ const safe = String(adapterName || 'default').replace(/[^a-z0-9_-]+/gi, '-').toLowerCase();
15
+ return `baseline.${safe}.json`;
16
+ }
17
+
18
+ /**
19
+ * Choose which baseline file to read for a regression check.
+ *
+ * Precedence: the per-adapter file wins whenever `present.perAdapter` is
+ * truthy; otherwise the legacy baseline.json is used only when
+ * `present.legacy` is truthy and `present.legacyAdapter` is strictly equal
+ * (===) to `adapterName`; otherwise the result is null (no baseline).
+ * The function only inspects the flags in `present`; it performs no
+ * filesystem access itself. The returned `file` is a bare filename, not a
+ * path.
+ *
20
+ * @param {string} adapterName
21
+ * @param {object} present
22
+ * { perAdapter: boolean, // baseline.<adapter>.json exists
23
+ * legacy: boolean, // baseline.json exists
24
+ * legacyAdapter: string|null } // the `adapter` field inside baseline.json
25
+ * @returns {{ file: string, source: 'per-adapter'|'legacy' } | null}
26
+ */
27
+ export function resolveBaseline(adapterName, present) {
28
+ if (present.perAdapter) {
29
+ return { file: baselineFileName(adapterName), source: 'per-adapter' };
30
+ }
31
+ // A legacy baseline is trustworthy only for the adapter that wrote it (exact, case-sensitive name match).
32
+ if (present.legacy && present.legacyAdapter === adapterName) {
33
+ return { file: 'baseline.json', source: 'legacy' };
34
+ }
35
+ return null;
36
+ }
package/src/runner.mjs CHANGED
@@ -4,11 +4,12 @@
4
4
  // node src/runner.mjs # offline scorers only
5
5
  // node src/runner.mjs --online # also run scorers that hit the API
6
6
  // node src/runner.mjs --ci # exit non-zero on failure or regression
7
- // node src/runner.mjs --baseline # save this run as results/baseline.json
7
+ // node src/runner.mjs --baseline # save this run as results/baseline.<adapter>.json
8
8
  //
9
9
  // Discovers gold cases (datasets/cases/*.json) and fixtures (datasets/fixtures/*.json),
10
10
  // runs each scorer, prints a summary, writes a timestamped results file, and
11
- // compares headline metrics against results/baseline.json to catch regressions.
11
+ // compares headline metrics against the per-adapter baseline
12
+ // (results/baseline.<adapter>.json) to catch regressions.
12
13
 
13
14
  import { readdir, readFile, writeFile, mkdir } from 'node:fs/promises';
14
15
  import { existsSync } from 'node:fs';
@@ -16,6 +17,7 @@ import { fileURLToPath } from 'node:url';
16
17
  import { dirname, join, resolve } from 'node:path';
17
18
  import { execSync } from 'node:child_process';
18
19
  import { loadAdapter } from './lib/adapter.mjs';
20
+ import { baselineFileName, resolveBaseline } from './lib/baseline.mjs';
19
21
 
20
22
  const __dirname = dirname(fileURLToPath(import.meta.url));
21
23
  const EVAL_ROOT = join(__dirname, '..');
@@ -185,31 +187,49 @@ async function main() {
185
187
  };
186
188
  await writeFile(join(RESULTS_DIR, `${stamp}.json`), JSON.stringify({ ...snapshot, results }, null, 2));
187
189
  if (SAVE_BASELINE) {
188
- await writeFile(join(RESULTS_DIR, 'baseline.json'), JSON.stringify(snapshot, null, 2));
189
- console.log('\n saved baseline.json');
190
+ // Baselines are per-adapter — a PubCrawl scorecard must not overwrite a
191
+ // RefCheckr one (they share no metrics).
192
+ const fname = baselineFileName(adapter.name);
193
+ await writeFile(join(RESULTS_DIR, fname), JSON.stringify(snapshot, null, 2));
194
+ console.log(`\n saved ${fname}`);
190
195
  }
191
196
 
192
- // ── Regression check vs baseline ──
197
+ // ── Regression check vs the baseline for THIS adapter ──
193
198
  let regressed = false;
194
- const baselinePath = join(RESULTS_DIR, 'baseline.json');
195
- if (existsSync(baselinePath) && !SAVE_BASELINE) {
196
- const base = JSON.parse(await readFile(baselinePath, 'utf8'));
197
- const baseById = Object.fromEntries(base.results.map(r => [r.id, r]));
198
- console.log('\n vs baseline:');
199
- for (const r of results) {
200
- const b = baseById[r.id];
201
- if (!b || !r.metrics || !b.metrics) continue;
202
- for (const [k, v] of Object.entries(r.metrics)) {
203
- if (typeof v !== 'number' || typeof b.metrics[k] !== 'number') continue;
204
- const delta = v - b.metrics[k];
205
- if (Math.abs(delta) < 1e-9) continue;
206
- const arrow = delta > 0 ? '▲' : '▼';
207
- // Only rate metrics gate regressions (higher = better); counts shown for info.
208
- if (isRateKey(k)) {
209
- if (delta < -1e-9) regressed = true;
210
- console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${(delta * 100).toFixed(1)}pp`);
211
- } else {
212
- console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${delta}`);
199
+ if (!SAVE_BASELINE) {
200
+ const legacyPath = join(RESULTS_DIR, 'baseline.json');
201
+ const perAdapterPath = join(RESULTS_DIR, baselineFileName(adapter.name));
202
+ let legacyAdapter = null;
203
+ if (existsSync(legacyPath)) {
204
+ try { legacyAdapter = JSON.parse(await readFile(legacyPath, 'utf8')).adapter ?? null; } catch { /* ignore */ }
205
+ }
206
+ const chosen = resolveBaseline(adapter.name, {
207
+ perAdapter: existsSync(perAdapterPath),
208
+ legacy: existsSync(legacyPath),
209
+ legacyAdapter,
210
+ });
211
+ if (chosen) {
212
+ const base = JSON.parse(await readFile(join(RESULTS_DIR, chosen.file), 'utf8'));
213
+ const baseById = Object.fromEntries(base.results.map(r => [r.id, r]));
214
+ console.log(`\n vs baseline (${chosen.file}):`);
215
+ if (chosen.source === 'legacy') {
216
+ console.log(` note: using legacy baseline.json — re-run --baseline to migrate to ${baselineFileName(adapter.name)}`);
217
+ }
218
+ for (const r of results) {
219
+ const b = baseById[r.id];
220
+ if (!b || !r.metrics || !b.metrics) continue;
221
+ for (const [k, v] of Object.entries(r.metrics)) {
222
+ if (typeof v !== 'number' || typeof b.metrics[k] !== 'number') continue;
223
+ const delta = v - b.metrics[k];
224
+ if (Math.abs(delta) < 1e-9) continue;
225
+ const arrow = delta > 0 ? '▲' : '▼';
226
+ // Only rate metrics gate regressions (higher = better); counts shown for info.
227
+ if (isRateKey(k)) {
228
+ if (delta < -1e-9) regressed = true;
229
+ console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${(delta * 100).toFixed(1)}pp`);
230
+ } else {
231
+ console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${delta}`);
232
+ }
213
233
  }
214
234
  }
215
235
  }