@pharmatools/opengate 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/package.json +1 -1
- package/src/lib/baseline.mjs +36 -0
- package/src/runner.mjs +44 -24
package/README.md
CHANGED
|
@@ -116,7 +116,7 @@ Offline scorers run with no API key — fast enough for every commit. Online sco
|
|
|
116
116
|
|
|
117
117
|
**Scorecards** — every run writes `results/<timestamp>.json` stamped with the git SHA, so any result is reproducible and auditable. Per-model runs carry a `run_model` label, turning the results directory into a measured model comparison (accuracy × hallucination × latency × cost).
|
|
118
118
|
|
|
119
|
-
**Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. No change ships without proving it didn't make the system less reliable.
|
|
119
|
+
**Regression gate** — `--baseline` saves a reference; subsequent runs print per-metric deltas (▲/▼ in percentage points) and `--ci` fails the build on any drop. Baselines are **per-adapter** (`baseline.<adapter>.json`), so a PubCrawl retrieval scorecard can't clobber a RefCheckr QA one — each adapter keeps its own reference. No change ships without proving it didn't make the system less reliable.
|
|
120
120
|
|
|
121
121
|
## Adapters: evaluating your own system
|
|
122
122
|
|
|
@@ -136,13 +136,13 @@ Full contract, minimal skeleton, and verdict-mapping notes: **[ADAPTERS.md](ADAP
|
|
|
136
136
|
|
|
137
137
|
## CI: the GitHub Action
|
|
138
138
|
|
|
139
|
-
Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed `baseline.json` in your own tree; any metric that drops fails the build:
|
|
139
|
+
Use OpenGATE as a drop-in regression gate in any repository. Keep your gold set and committed baseline (`baseline.<adapter>.json`) in your own tree; any metric that drops fails the build:
|
|
140
140
|
|
|
141
141
|
```yaml
|
|
142
142
|
- uses: nickjlamb/opengate@v0
|
|
143
143
|
with:
|
|
144
144
|
datasets: ./evals/datasets # your cases/ + fixtures/
|
|
145
|
-
results: ./evals/results # where baseline.json lives
|
|
145
|
+
results: ./evals/results # where baseline.<adapter>.json lives
|
|
146
146
|
adapter: ./evals/my-adapter.mjs # or the bundled HTTP adapter
|
|
147
147
|
online: 'true'
|
|
148
148
|
env:
|
|
@@ -208,14 +208,14 @@ opengate/
|
|
|
208
208
|
scorers/ one file per metric family
|
|
209
209
|
adapters/ system-under-test boundary (refcheckr.mjs is the reference)
|
|
210
210
|
runner.mjs CLI: discover cases → run scorers → report → snapshot → regression-check
|
|
211
|
-
results/ timestamped run snapshots + baseline.json
|
|
211
|
+
results/ timestamped run snapshots + baseline.<adapter>.json
|
|
212
212
|
```
|
|
213
213
|
|
|
214
214
|
## Roadmap
|
|
215
215
|
|
|
216
216
|
|
|
217
|
-
- **
|
|
218
|
-
- **
|
|
217
|
+
- **Retrieval breadth** — retrieval currently scores one PubMed record type; extend to full-text, citation formatting, and trial detail across PubCrawl's other tools
|
|
218
|
+
- **Retrieval coverage** — the retrieval gold set is one case; add a single-author paper (the exact array-collapse risk the capability exists to catch), a trial (NCT) record, and a full-text/citation case across PubCrawl's other tools
|
|
219
219
|
- **Number-adjacent superscript** — `week 24.1` is genuinely ambiguous with decimals; remains a tracked known gap
|
|
220
220
|
- **Growing gold set** — more domains, all six verdict types, real-world reference material
|
|
221
221
|
- **Stable adapter surface** — the contract may still shift pre-1.0; semver will signal breaking changes
|
package/package.json
CHANGED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// Per-adapter baseline resolution.
|
|
2
|
+
//
|
|
3
|
+
// A regression baseline only means something relative to the adapter that
|
|
4
|
+
// produced it — a PubCrawl retrieval scorecard and a RefCheckr QA scorecard
|
|
5
|
+
// share no metrics, so a single baseline.json would let one clobber the other.
|
|
6
|
+
// Baselines are therefore keyed by adapter: baseline.<adapter>.json.
|
|
7
|
+
//
|
|
8
|
+
// Legacy migration: an older single baseline.json (which records the adapter
|
|
9
|
+
// that produced it) is still honoured — but ONLY for that same adapter, never
|
|
10
|
+
// cross-adapter.
|
|
11
|
+
|
|
12
|
+
/**
 * Filesystem-safe per-adapter baseline filename.
 *
 * Every run of characters outside `[a-z0-9_-]` collapses to a single `-` and
 * the result is lower-cased. A missing/empty name falls back to `default`, and
 * so does a name that sanitizes to nothing but dashes (whitespace- or
 * punctuation-only input such as `'   '` or `'../'`), which would otherwise
 * yield the meaningless `baseline.-.json`.
 *
 * @param {string} [adapterName] adapter name as reported by the adapter
 * @returns {string} `baseline.<safe-name>.json`
 */
export function baselineFileName(adapterName) {
  const sanitized = String(adapterName || 'default')
    .replace(/[^a-z0-9_-]+/gi, '-')
    .toLowerCase();
  // Nothing usable survived sanitization: treat the name as absent.
  const safe = /^-+$/.test(sanitized) ? 'default' : sanitized;
  return `baseline.${safe}.json`;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Choose which baseline file to read for a regression check.
 *
 * Preference order:
 *   1. the per-adapter file (`baseline.<adapter>.json`) when it exists;
 *   2. the legacy `baseline.json`, but only when it records the same adapter
 *      name as the one being run;
 *   3. otherwise no baseline (`null`).
 *
 * @param {string} adapterName
 * @param {object} [present]
 *   { perAdapter: boolean,          // baseline.<adapter>.json exists
 *     legacy: boolean,              // baseline.json exists
 *     legacyAdapter: string|null }  // the `adapter` field inside baseline.json
 * @returns {{ file: string, source: 'per-adapter'|'legacy' } | null}
 */
export function resolveBaseline(adapterName, present = {}) {
  const { perAdapter = false, legacy = false, legacyAdapter = null } = present ?? {};

  if (perAdapter) {
    return { file: baselineFileName(adapterName), source: 'per-adapter' };
  }

  // A legacy baseline is trustworthy only for the adapter that wrote it. If it
  // records no adapter at all, its provenance is unknown, so it must never
  // match — not even when the current adapter name is also null/undefined
  // (a bare `===` would treat null === null as "same adapter").
  const writtenBySameAdapter = legacyAdapter != null && legacyAdapter === adapterName;
  if (legacy && writtenBySameAdapter) {
    return { file: 'baseline.json', source: 'legacy' };
  }

  return null;
}
|
package/src/runner.mjs
CHANGED
|
@@ -4,11 +4,12 @@
|
|
|
4
4
|
// node src/runner.mjs # offline scorers only
|
|
5
5
|
// node src/runner.mjs --online # also run scorers that hit the API
|
|
6
6
|
// node src/runner.mjs --ci # exit non-zero on failure or regression
|
|
7
|
-
// node src/runner.mjs --baseline # save this run as results/baseline.json
|
|
7
|
+
// node src/runner.mjs --baseline # save this run as results/baseline.<adapter>.json
|
|
8
8
|
//
|
|
9
9
|
// Discovers gold cases (datasets/cases/*.json) and fixtures (datasets/fixtures/*.json),
|
|
10
10
|
// runs each scorer, prints a summary, writes a timestamped results file, and
|
|
11
|
-
// compares headline metrics against
|
|
11
|
+
// compares headline metrics against the per-adapter baseline
|
|
12
|
+
// (results/baseline.<adapter>.json) to catch regressions.
|
|
12
13
|
|
|
13
14
|
import { readdir, readFile, writeFile, mkdir } from 'node:fs/promises';
|
|
14
15
|
import { existsSync } from 'node:fs';
|
|
@@ -16,6 +17,7 @@ import { fileURLToPath } from 'node:url';
|
|
|
16
17
|
import { dirname, join, resolve } from 'node:path';
|
|
17
18
|
import { execSync } from 'node:child_process';
|
|
18
19
|
import { loadAdapter } from './lib/adapter.mjs';
|
|
20
|
+
import { baselineFileName, resolveBaseline } from './lib/baseline.mjs';
|
|
19
21
|
|
|
20
22
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
21
23
|
const EVAL_ROOT = join(__dirname, '..');
|
|
@@ -185,31 +187,49 @@ async function main() {
|
|
|
185
187
|
};
|
|
186
188
|
await writeFile(join(RESULTS_DIR, `${stamp}.json`), JSON.stringify({ ...snapshot, results }, null, 2));
|
|
187
189
|
if (SAVE_BASELINE) {
|
|
188
|
-
|
|
189
|
-
|
|
190
|
+
// Baselines are per-adapter — a PubCrawl scorecard must not overwrite a
|
|
191
|
+
// RefCheckr one (they share no metrics).
|
|
192
|
+
const fname = baselineFileName(adapter.name);
|
|
193
|
+
await writeFile(join(RESULTS_DIR, fname), JSON.stringify(snapshot, null, 2));
|
|
194
|
+
console.log(`\n saved ${fname}`);
|
|
190
195
|
}
|
|
191
196
|
|
|
192
|
-
// ── Regression check vs baseline ──
|
|
197
|
+
// ── Regression check vs the baseline for THIS adapter ──
|
|
193
198
|
let regressed = false;
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
const
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
199
|
+
if (!SAVE_BASELINE) {
|
|
200
|
+
const legacyPath = join(RESULTS_DIR, 'baseline.json');
|
|
201
|
+
const perAdapterPath = join(RESULTS_DIR, baselineFileName(adapter.name));
|
|
202
|
+
let legacyAdapter = null;
|
|
203
|
+
if (existsSync(legacyPath)) {
|
|
204
|
+
try { legacyAdapter = JSON.parse(await readFile(legacyPath, 'utf8')).adapter ?? null; } catch { /* ignore */ }
|
|
205
|
+
}
|
|
206
|
+
const chosen = resolveBaseline(adapter.name, {
|
|
207
|
+
perAdapter: existsSync(perAdapterPath),
|
|
208
|
+
legacy: existsSync(legacyPath),
|
|
209
|
+
legacyAdapter,
|
|
210
|
+
});
|
|
211
|
+
if (chosen) {
|
|
212
|
+
const base = JSON.parse(await readFile(join(RESULTS_DIR, chosen.file), 'utf8'));
|
|
213
|
+
const baseById = Object.fromEntries(base.results.map(r => [r.id, r]));
|
|
214
|
+
console.log(`\n vs baseline (${chosen.file}):`);
|
|
215
|
+
if (chosen.source === 'legacy') {
|
|
216
|
+
console.log(` note: using legacy baseline.json — re-run --baseline to migrate to ${baselineFileName(adapter.name)}`);
|
|
217
|
+
}
|
|
218
|
+
for (const r of results) {
|
|
219
|
+
const b = baseById[r.id];
|
|
220
|
+
if (!b || !r.metrics || !b.metrics) continue;
|
|
221
|
+
for (const [k, v] of Object.entries(r.metrics)) {
|
|
222
|
+
if (typeof v !== 'number' || typeof b.metrics[k] !== 'number') continue;
|
|
223
|
+
const delta = v - b.metrics[k];
|
|
224
|
+
if (Math.abs(delta) < 1e-9) continue;
|
|
225
|
+
const arrow = delta > 0 ? '▲' : '▼';
|
|
226
|
+
// Only rate metrics gate regressions (higher = better); counts shown for info.
|
|
227
|
+
if (isRateKey(k)) {
|
|
228
|
+
if (delta < -1e-9) regressed = true;
|
|
229
|
+
console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${(delta * 100).toFixed(1)}pp`);
|
|
230
|
+
} else {
|
|
231
|
+
console.log(` ${r.id}.${k}: ${arrow} ${delta > 0 ? '+' : ''}${delta}`);
|
|
232
|
+
}
|
|
213
233
|
}
|
|
214
234
|
}
|
|
215
235
|
}
|