@tangle-network/agent-eval 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +928 -1
- package/dist/index.js +1608 -32
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean3 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean3,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -479,18 +479,18 @@ function mannWhitneyU(a, b) {
|
|
|
479
479
|
...a.map((v) => ({ v, group: "a" })),
|
|
480
480
|
...b.map((v) => ({ v, group: "b" }))
|
|
481
481
|
].sort((x, y) => x.v - y.v);
|
|
482
|
-
const
|
|
482
|
+
const ranks3 = new Array(combined.length);
|
|
483
483
|
let i = 0;
|
|
484
484
|
while (i < combined.length) {
|
|
485
485
|
let j = i;
|
|
486
486
|
while (j < combined.length && combined[j].v === combined[i].v) j++;
|
|
487
487
|
const avgRank = (i + 1 + j) / 2;
|
|
488
|
-
for (let k = i; k < j; k++)
|
|
488
|
+
for (let k = i; k < j; k++) ranks3[k] = avgRank;
|
|
489
489
|
i = j;
|
|
490
490
|
}
|
|
491
491
|
let r1 = 0;
|
|
492
492
|
for (let k = 0; k < combined.length; k++) {
|
|
493
|
-
if (combined[k].group === "a") r1 +=
|
|
493
|
+
if (combined[k].group === "a") r1 += ranks3[k];
|
|
494
494
|
}
|
|
495
495
|
const u1 = r1 - n1 * (n1 + 1) / 2;
|
|
496
496
|
const u2 = n1 * n2 - u1;
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean3 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -530,20 +530,20 @@ function wilcoxonSignedRank(before, after) {
|
|
|
530
530
|
const n = diffs.length;
|
|
531
531
|
if (n < 6) return { w: 0, p: 1 };
|
|
532
532
|
const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
|
|
533
|
-
const
|
|
533
|
+
const ranks3 = new Array(n);
|
|
534
534
|
let i = 0;
|
|
535
535
|
while (i < n) {
|
|
536
536
|
let j = i;
|
|
537
537
|
while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
|
|
538
538
|
const avg = (i + 1 + j) / 2;
|
|
539
|
-
for (let k = i; k < j; k++)
|
|
539
|
+
for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
|
|
540
540
|
i = j;
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
|
-
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus +=
|
|
544
|
-
const
|
|
543
|
+
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
+
const mean3 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean3) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -1531,24 +1531,24 @@ function analyzeAntiSlop(outputs, config) {
|
|
|
1531
1531
|
}
|
|
1532
1532
|
}
|
|
1533
1533
|
for (const re of config.hedgingPatterns) {
|
|
1534
|
-
const
|
|
1535
|
-
if (
|
|
1536
|
-
counts.hedging +=
|
|
1534
|
+
const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
|
|
1535
|
+
if (matches2) {
|
|
1536
|
+
counts.hedging += matches2.length;
|
|
1537
1537
|
issues.push({
|
|
1538
1538
|
category: "hedging",
|
|
1539
|
-
detail: `${
|
|
1540
|
-
example:
|
|
1539
|
+
detail: `${matches2.length}x ${re.source}`,
|
|
1540
|
+
example: matches2[0]
|
|
1541
1541
|
});
|
|
1542
1542
|
}
|
|
1543
1543
|
}
|
|
1544
1544
|
for (const re of config.apologyPatterns) {
|
|
1545
|
-
const
|
|
1546
|
-
if (
|
|
1547
|
-
counts.apology +=
|
|
1545
|
+
const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
|
|
1546
|
+
if (matches2) {
|
|
1547
|
+
counts.apology += matches2.length;
|
|
1548
1548
|
issues.push({
|
|
1549
1549
|
category: "apology",
|
|
1550
|
-
detail: `${
|
|
1551
|
-
example:
|
|
1550
|
+
detail: `${matches2.length}x ${re.source}`,
|
|
1551
|
+
example: matches2[0]
|
|
1552
1552
|
});
|
|
1553
1553
|
}
|
|
1554
1554
|
}
|
|
@@ -4076,10 +4076,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
4076
4076
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
4077
4077
|
}
|
|
4078
4078
|
const tail = values.slice(-window);
|
|
4079
|
-
const
|
|
4080
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
4079
|
+
const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
4080
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
|
|
4081
4081
|
const stdDev = Math.sqrt(variance2);
|
|
4082
|
-
const refMean = Math.abs(
|
|
4082
|
+
const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
|
|
4083
4083
|
const cv = stdDev / refMean;
|
|
4084
4084
|
const stable = tail.length >= window && cv <= stableCv;
|
|
4085
4085
|
let tailRun = 0;
|
|
@@ -4100,7 +4100,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
4100
4100
|
} else {
|
|
4101
4101
|
state = "noisy";
|
|
4102
4102
|
}
|
|
4103
|
-
return { state, windowMean:
|
|
4103
|
+
return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
|
|
4104
4104
|
}
|
|
4105
4105
|
|
|
4106
4106
|
// src/state-continuity.ts
|
|
@@ -5028,12 +5028,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
5028
5028
|
variantScores.push({ mutator: id, score, mutated });
|
|
5029
5029
|
all.push(score);
|
|
5030
5030
|
}
|
|
5031
|
-
const
|
|
5032
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
5031
|
+
const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
5032
|
+
const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
|
|
5033
5033
|
const stdDev = Math.sqrt(variance2);
|
|
5034
|
-
const ref = Math.abs(
|
|
5034
|
+
const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
|
|
5035
5035
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
5036
|
-
return { originalScore, variantScores, meanScore:
|
|
5036
|
+
return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
|
|
5037
5037
|
}
|
|
5038
5038
|
// Mutator that returns the prompt `p` converted to lower case.
var lowercaseMutator = (p) => p.toLowerCase();
|
|
5039
5039
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -5445,6 +5445,1546 @@ var ProjectRegistry = class {
|
|
|
5445
5445
|
return out;
|
|
5446
5446
|
}
|
|
5447
5447
|
};
|
|
5448
|
+
|
|
5449
|
+
// src/meta-eval/outcome-store.ts
|
|
5450
|
+
/**
 * Outcome store that keeps every record in a plain in-process array.
 *
 * Records are shallow-copied when stored and again when returned, so callers
 * cannot replace top-level fields of stored records; nested objects are still
 * shared with the caller.
 */
var InMemoryOutcomeStore = class {
  items = [];

  /** Store a shallow copy of `outcome`. */
  async append(outcome) {
    const snapshot = { ...outcome };
    this.items.push(snapshot);
  }

  /** Return shallow copies of every stored outcome whose `runId` equals `runId`. */
  async forRun(runId) {
    const found = [];
    for (const item of this.items) {
      if (item.runId === runId) found.push({ ...item });
    }
    return found;
  }

  /** Return shallow copies of every stored outcome accepted by `matches(item, filter)`. */
  async list(filter = {}) {
    const found = [];
    for (const item of this.items) {
      if (matches(item, filter)) found.push({ ...item });
    }
    return found;
  }
};
|
|
5462
|
+
/**
 * Outcome store persisted as newline-delimited JSON files inside `options.dir`.
 *
 * Writes go to `outcomes.ndjson`; once that file reaches `maxBytes` it is
 * renamed to `outcomes.<timestamp>.ndjson` before the next append. Reads load
 * every `*.ndjson` file in the directory into an in-memory store once and
 * serve all later queries from that cache.
 */
var FileSystemOutcomeStore = class {
  dir;
  maxBytes;
  memo;
  loaded = false;

  /**
   * @param {{ dir: string, maxBytes?: number }} options - Target directory and
   *   rotation threshold in bytes (defaults to 32 MiB).
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }

  /** Create the storage directory (and parents) if it does not exist yet. */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }

  /**
   * Append one outcome as a JSON line, rotating the active file first when it
   * has reached `maxBytes`. Also mirrors the record into the read cache when
   * the cache has already been built.
   */
  async append(outcome) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, "outcomes.ndjson");
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        await fs.rename(active, path.join(this.dir, `outcomes.${Date.now()}.ndjson`));
      }
    } catch {
      // Rotation is best-effort: the active file may simply not exist yet, and
      // a failed rotation must not prevent the record from being written.
    }
    await fs.appendFile(active, JSON.stringify(outcome) + "\n", "utf8");
    if (this.memo) await this.memo.append(outcome);
  }

  /**
   * Build (once) and return the in-memory view of everything on disk.
   *
   * A missing directory or a file that disappears mid-scan is treated as
   * "no data"; any other I/O error is rethrown and nothing is cached, so a
   * later call can retry. Lines that are not valid JSON objects are skipped
   * individually so one corrupt or truncated line cannot hide the records
   * that follow it.
   */
  async load() {
    if (this.loaded && this.memo) return this.memo;
    const fs = await import("fs/promises");
    const path = await import("path");
    const memo = new InMemoryOutcomeStore();
    let entries;
    try {
      entries = await fs.readdir(this.dir);
    } catch (err) {
      if (err?.code !== "ENOENT") throw err;
      entries = [];
    }
    // Sorted so the load order is deterministic; timestamped (rotated) files
    // sort before "outcomes.ndjson", i.e. older records are loaded first.
    const files = entries.filter((file) => file.endsWith(".ndjson")).sort();
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch (err) {
        if (err?.code === "ENOENT") continue;
        throw err;
      }
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        let record;
        try {
          record = JSON.parse(line);
        } catch {
          // Malformed line (e.g. a write cut short): skip it, keep the rest.
          continue;
        }
        if (record === null || typeof record !== "object") continue;
        await memo.append(record);
      }
    }
    this.memo = memo;
    this.loaded = true;
    return memo;
  }

  /** Outcomes recorded for `runId`. */
  async forRun(runId) {
    return (await this.load()).forRun(runId);
  }

  /** Outcomes accepted by `filter`; the filter is forwarded to the in-memory store. */
  async list(filter) {
    return (await this.load()).list(filter);
  }
};
|
|
5518
|
+
/**
 * Decide whether outcome `o` passes filter `f`.
 *
 * Every criterion present on the filter must hold: `runIds` membership,
 * inclusive `since` / `until` bounds on `capturedAt`, an exact `source`,
 * and an exact `label` key/value pair. An empty filter accepts everything.
 */
function matches(o, f) {
  const { runIds, since, until, source, label } = f;
  const rejected =
    (runIds && !runIds.includes(o.runId)) ||
    (since !== void 0 && o.capturedAt < since) ||
    (until !== void 0 && o.capturedAt > until) ||
    (source && o.source !== source) ||
    (label && o.labels?.[label.key] !== label.value);
  return !rejected;
}
|
|
5526
|
+
|
|
5527
|
+
// src/meta-eval/correlation-study.ts
|
|
5528
|
+
/**
 * Correlate eval-time metrics with outcome metrics across runs.
 *
 * For every run that has at least one outcome captured within
 * `options.maxCaptureLagMs` of the run start, each eval metric is extracted
 * (via `em.extract`, or the default extractor for `em.id`) and paired with
 * each outcome metric reduced to one number per run (`options.reduction`:
 * "latest" by default, "mean", or "max"). Pairs with at least three samples
 * are reported with Pearson, Spearman (Pearson over ranks), a bootstrap CI and
 * a strength verdict.
 *
 * @returns {Promise<{pairs: object[], joinedSamples: number, skippedRuns: number}>}
 */
async function correlationStudy(traceStore, outcomeStore, evalMetrics, outcomeMetricNames, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list(options.outcomeFilter);
  const outcomesByRun = new Map();
  for (const o of outcomes) {
    const arr = outcomesByRun.get(o.runId) ?? [];
    arr.push(o);
    outcomesByRun.set(o.runId, arr);
  }
  const reduction = options.reduction ?? "latest";
  const maxLag = options.maxCaptureLagMs ?? Infinity;
  const pairs = [];
  // evalMetric id -> (outcome metric name -> pair). Replaces a linear scan of
  // `pairs` for every sample; the first pair registered for a key wins.
  const pairLookup = new Map();
  for (const em of evalMetrics) {
    const byOutcome = pairLookup.get(em.id) ?? new Map();
    pairLookup.set(em.id, byOutcome);
    for (const om of outcomeMetricNames) {
      const pair = { evalMetric: em.id, outcomeMetric: om, xs: [], ys: [] };
      pairs.push(pair);
      if (!byOutcome.has(om)) byOutcome.set(om, pair);
    }
  }
  let joined = 0;
  let skipped = 0;
  for (const run of runs) {
    const os = outcomesByRun.get(run.runId);
    if (!os || os.length === 0) {
      skipped++;
      continue;
    }
    const eligible = os.filter((o) => o.capturedAt - run.startedAt <= maxLag);
    if (eligible.length === 0) {
      skipped++;
      continue;
    }
    for (const em of evalMetrics) {
      const extract = em.extract ?? defaultExtract3(em.id);
      const x = await extract(run, traceStore);
      if (x === null || !Number.isFinite(x)) continue;
      for (const om of outcomeMetricNames) {
        const values = eligible.map((o) => o.metrics[om]).filter((v) => typeof v === "number" && Number.isFinite(v));
        if (values.length === 0) continue;
        // Pass the metric name so "latest" reads this metric, not whichever
        // metric happens to be listed first on the outcome.
        const y = reduce(values, reduction, eligible, om);
        if (y === null) continue;
        const pair = pairLookup.get(em.id).get(om);
        pair.xs.push(x);
        pair.ys.push(y);
      }
    }
    joined++;
  }
  const results = pairs.filter((p) => p.xs.length >= 3).map((p) => {
    const pearson2 = pearsonR3(p.xs, p.ys);
    const spearman = pearsonR3(ranks2(p.xs), ranks2(p.ys));
    const pearsonCi95 = bootstrapPearsonCi(p.xs, p.ys, options.bootstrapIterations ?? 500);
    const verdict = Math.abs(pearson2) >= 0.7 ? "strong" : Math.abs(pearson2) >= 0.4 ? "moderate" : "weak";
    return { evalMetric: p.evalMetric, outcomeMetric: p.outcomeMetric, n: p.xs.length, pearson: pearson2, spearman, pearsonCi95, verdict };
  });
  return { pairs: results, joinedSamples: joined, skippedRuns: skipped };
}

/**
 * Collapse one run's outcome values into a single number.
 *
 * @param {number[]} values - Finite values of the metric across the outcomes.
 * @param {string} kind - "mean", "max", or anything else for "latest".
 * @param {object[]} outcomes - Outcomes the values came from (used by "latest").
 * @param {string} [metricName] - Metric to read for "latest". When omitted the
 *   first key of each outcome's `metrics` is used, as older callers assumed.
 * @returns {number|null} The reduced value, or null when nothing usable exists.
 */
function reduce(values, kind, outcomes, metricName) {
  if (values.length === 0) return null;
  if (kind === "mean") return values.reduce((a, b) => a + b, 0) / values.length;
  if (kind === "max") return Math.max(...values);
  let latest = null;
  for (const o of outcomes) {
    const key = metricName ?? Object.keys(o.metrics)[0];
    const v = o.metrics[key];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    // Strict ">" keeps the first outcome among equal timestamps.
    if (latest === null || o.capturedAt > latest.at) latest = { at: o.capturedAt, v };
  }
  return latest === null ? null : latest.v;
}
|
|
5593
|
+
/**
 * Pearson correlation coefficient of two equally long numeric arrays.
 *
 * Returns NaN when the lengths differ or fewer than two points are given.
 * When a series has zero variance the result is 1 if both are constant and
 * 0 if only one is.
 */
function pearsonR3(a, b) {
  const count = a.length;
  if (count !== b.length || count < 2) return NaN;

  let totalA = 0;
  let totalB = 0;
  for (const v of a) totalA += v;
  for (const v of b) totalB += v;
  const meanA = totalA / count;
  const meanB = totalB / count;

  let covariance = 0;
  let spreadA = 0;
  let spreadB = 0;
  a.forEach((valueA, idx) => {
    const devA = valueA - meanA;
    const devB = b[idx] - meanB;
    covariance += devA * devB;
    spreadA += devA * devA;
    spreadB += devB * devB;
  });

  const flatA = spreadA === 0;
  const flatB = spreadB === 0;
  if (flatA && flatB) return 1;
  if (flatA || flatB) return 0;
  return covariance / Math.sqrt(spreadA * spreadB);
}
|
|
5607
|
+
/**
 * Convert values to 1-based ranks, giving tied values the average of the
 * rank positions they occupy. The result is aligned with the input order.
 */
function ranks2(xs) {
  const order = xs.map((v, i) => ({ v, i })).sort((x, y) => x.v - y.v);
  const out = new Array(xs.length);
  let start = 0;
  while (start < order.length) {
    // Extend `end` over the run of values equal to order[start].
    let end = start;
    while (end + 1 < order.length && order[end + 1].v === order[start].v) end++;
    const shared = (start + end + 2) / 2;
    for (let k = start; k <= end; k++) out[order[k].i] = shared;
    start = end + 1;
  }
  return out;
}
|
|
5619
|
+
/**
 * Percentile-bootstrap 95% interval for the Pearson correlation of (xs, ys).
 *
 * Draws `iterations` resamples of the paired data with replacement using
 * `Math.random`, keeps the finite correlations, and returns the values at the
 * 2.5% and 97.5% positions. Returns NaN bounds for fewer than three points or
 * when no resample produced a finite correlation.
 */
function bootstrapPearsonCi(xs, ys, iterations) {
  const size = xs.length;
  if (size < 3) return { lower: NaN, upper: NaN };

  const estimates = [];
  for (let round = 0; round < iterations; round++) {
    // One random index per position; the same index is used for x and y so
    // pairs stay together.
    const picks = Array.from({ length: size }, () => Math.floor(Math.random() * size));
    const estimate = pearsonR3(
      picks.map((at) => xs[at]),
      picks.map((at) => ys[at])
    );
    if (Number.isFinite(estimate)) estimates.push(estimate);
  }
  if (estimates.length === 0) return { lower: NaN, upper: NaN };

  estimates.sort((a, b) => a - b);
  const lowerAt = Math.floor(0.025 * estimates.length);
  const upperAt = Math.min(estimates.length - 1, Math.floor(0.975 * estimates.length));
  return { lower: estimates[lowerAt], upper: estimates[upperAt] };
}
|
|
5638
|
+
/**
 * Build the default extractor for a named eval metric.
 *
 * Supported names: "score" / "overallScore" (run.outcome.score or null),
 * "pass" (1 when run.outcome.pass is true, else 0), "durationMs"
 * (endedAt - startedAt when both are truthy, else null), and "costUsd" /
 * "inputTokens" (taken from the aggregate over the run's LLM spans). Any
 * other name yields an extractor that resolves to null.
 */
function defaultExtract3(metric) {
  const readScore = async (run) => run.outcome?.score ?? null;
  const extractors = new Map([
    ["score", readScore],
    ["overallScore", readScore],
    ["pass", async (run) => (run.outcome?.pass === true ? 1 : 0)],
    [
      "durationMs",
      async (run) => {
        if (run.endedAt && run.startedAt) return run.endedAt - run.startedAt;
        return null;
      },
    ],
    [
      "costUsd",
      async (run, store) => {
        const spans = await llmSpans(store, run.runId);
        return aggregateLlm(spans).costUsd;
      },
    ],
    [
      "inputTokens",
      async (run, store) => {
        const spans = await llmSpans(store, run.runId);
        return aggregateLlm(spans).inputTokens;
      },
    ],
  ]);
  return extractors.get(metric) ?? (async () => null);
}
|
|
5661
|
+
|
|
5662
|
+
// src/meta-eval/calibration.ts
|
|
5663
|
+
/**
 * Build a calibration curve relating an eval metric to an outcome metric.
 *
 * Each run with outcomes contributes one (x, y) pair: x from the eval metric
 * extractor, y from the most recently captured outcome. Pairs are grouped into
 * bins ("equal-width" by default, or "equal-frequency") and summarised with
 * the expected calibration error (bin gaps weighted by bin size) and the
 * largest gap.
 *
 * @param {object} options - `bins` (default 10), `binning`, and an optional
 *   `range` `{ lo, hi }` for equal-width binning.
 * @returns {Promise<object|null>} The curve, or null when fewer than two pairs
 *   exist or the equal-width range has zero width.
 */
async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list();
  const byRun = new Map();
  for (const o of outcomes) {
    const arr = byRun.get(o.runId) ?? [];
    arr.push(o);
    byRun.set(o.runId, arr);
  }
  const extract = evalMetric.extract ?? defaultExtract4(evalMetric.id);
  const pairs = [];
  for (const run of runs) {
    const os = byRun.get(run.runId);
    if (!os?.length) continue;
    const x = await extract(run, traceStore);
    if (x === null || !Number.isFinite(x)) continue;
    const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0];
    const y = latest.metrics[outcomeMetric];
    if (typeof y !== "number" || !Number.isFinite(y)) continue;
    pairs.push({ x, y });
  }
  if (pairs.length < 2) return null;
  const numBins = options.bins ?? 10;
  const binning = options.binning ?? "equal-width";
  const bins = [];
  if (binning === "equal-frequency") {
    const sorted = [...pairs].sort((a, b) => a.x - b.x);
    // Never more bins than requested (or than there are samples); sizes
    // differ by at most one so no undersized trailing bin is produced.
    const binCount = Math.max(1, Math.min(Math.floor(numBins), sorted.length));
    for (let i = 0; i < binCount; i++) {
      const from = Math.floor(i * sorted.length / binCount);
      const to = Math.floor((i + 1) * sorted.length / binCount);
      const chunk = sorted.slice(from, to);
      if (chunk.length === 0) continue;
      bins.push(toBin(chunk));
    }
  } else {
    // Fold instead of Math.min(...xs): spreading a very large array can
    // exceed the engine's argument limit.
    const lo = options.range?.lo ?? pairs.reduce((m, p) => Math.min(m, p.x), Infinity);
    const hi = options.range?.hi ?? pairs.reduce((m, p) => Math.max(m, p.x), -Infinity);
    const width = (hi - lo) / numBins;
    if (width === 0) return null;
    for (let i = 0; i < numBins; i++) {
      const isLast = i === numBins - 1;
      const binLo = lo + i * width;
      const binHi = isLast ? hi : lo + (i + 1) * width;
      // The last bin is closed on the right. Nudging the bound by a tiny
      // epsilon is unreliable: for large x the epsilon is lost to rounding
      // and the maximum sample would fall outside every bin.
      const chunk = pairs.filter((p) => p.x >= binLo && (isLast ? p.x <= binHi : p.x < binHi));
      if (chunk.length === 0) continue;
      bins.push(toBin(chunk, binLo, binHi));
    }
  }
  const total = bins.reduce((a, b) => a + b.n, 0);
  const ece = bins.reduce((a, b) => a + b.n / total * b.gap, 0);
  const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0);
  return { evalMetric: evalMetric.id, outcomeMetric, n: pairs.length, bins, ece, maxGap };
}
|
|
5715
|
+
/**
 * Summarise a group of (x, y) pairs as one calibration bin.
 *
 * `lower` / `upper` default to the smallest / largest x in the chunk. `gap`
 * is the absolute difference between the mean outcome and the mean eval value.
 */
function toBin(chunk, lower, upper) {
  const evalValues = [];
  const outcomeValues = [];
  for (const point of chunk) {
    evalValues.push(point.x);
    outcomeValues.push(point.y);
  }
  const evalMean = mean2(evalValues);
  const outcomeMean = mean2(outcomeValues);
  const bin = {
    lower: lower ?? Math.min(...evalValues),
    upper: upper ?? Math.max(...evalValues),
    n: chunk.length,
    evalMean,
    outcomeMean,
    gap: Math.abs(outcomeMean - evalMean),
  };
  return bin;
}
|
|
5729
|
+
/** Arithmetic mean of `xs`; an empty array yields NaN (0 / 0). */
function mean2(xs) {
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / xs.length;
}
|
|
5732
|
+
/**
 * Default eval-metric extractor for calibration.
 *
 * Resolves to run.outcome.score whenever it is present, regardless of the
 * metric name. Without a score, the "pass" metric maps run.outcome.pass to
 * 1 / 0 and every other metric resolves to null.
 */
function defaultExtract4(metric) {
  return async (run) => {
    const score = run.outcome?.score;
    if (score !== null && score !== void 0) return score;
    if (metric !== "pass") return null;
    return run.outcome?.pass === true ? 1 : 0;
  };
}
|
|
5735
|
+
|
|
5736
|
+
// src/prm/rubric.ts
|
|
5737
|
+
/**
 * Process-reward grader: applies a set of rubrics to every step of a run.
 */
var PrmGrader = class {
  /**
   * @param {object[]} rubrics - Rubrics to apply.
   * @throws {Error} When `rubrics` is empty.
   */
  constructor(rubrics) {
    this.rubrics = rubrics;
    if (rubrics.length === 0) throw new Error("PrmGrader: at least 1 rubric required");
  }
  rubrics;

  /**
   * Grade every eligible step of run `runId`.
   *
   * A rubric is applied to a step unless it declares `kinds` that exclude the
   * step's span kind; a rubric may also abstain by resolving to null. Each
   * verdict is recorded through the trace emitter under the judge id
   * `prm:<rubricId>`. The aggregate is the weight-averaged score over all
   * verdicts (0 when there are none); steps with no verdict count as ungraded.
   */
  async grade(store, runId) {
    const trajectory = await buildTrajectory(store, runId);
    const emitter = new TraceEmitter(store, { runId });
    const verdicts = [];
    let ungradedCount = 0;

    for (const [position, step] of trajectory.steps.entries()) {
      const ctx = {
        trajectory,
        step,
        prior: trajectory.steps.slice(0, position),
        next: trajectory.steps.slice(position + 1),
      };
      let verdictsForStep = 0;
      for (const rubric of this.rubrics) {
        const applies = !rubric.kinds || rubric.kinds.includes(step.span.kind);
        if (!applies) continue;
        const verdict = await rubric.grade(ctx);
        if (verdict === null) continue;
        verdicts.push({
          spanId: step.span.spanId,
          rubricId: rubric.id,
          score: verdict.score,
          weight: rubric.weight ?? 1,
          rationale: verdict.rationale,
          evidence: verdict.evidence,
        });
        verdictsForStep += 1;
        const judgeName = `prm:${rubric.id}`;
        await emitter.recordJudge({
          judgeId: judgeName,
          targetSpanId: step.span.spanId,
          dimension: "step_quality",
          score: verdict.score,
          rationale: verdict.rationale,
          evidence: verdict.evidence,
          name: judgeName,
        });
      }
      if (verdictsForStep === 0) ungradedCount += 1;
    }

    let weightTotal = 0;
    for (const v of verdicts) weightTotal += v.weight;
    let aggregateScore = 0;
    if (weightTotal !== 0) {
      let weighted = 0;
      for (const v of verdicts) weighted += v.score * v.weight;
      aggregateScore = weighted / weightTotal;
    }
    return { runId, steps: verdicts, aggregateScore, gradedCount: verdicts.length, ungradedCount };
  }
};
|
|
5793
|
+
/** True when the verdict's `judgeId` carries the "prm:" prefix. */
function isPrmVerdict(verdict) {
  const { judgeId } = verdict;
  return judgeId.startsWith("prm:");
}
|
|
5796
|
+
|
|
5797
|
+
// src/prm/builtin-rubrics.ts
|
|
5798
|
+
/**
 * Rubric for LLM steps that scores output length.
 *
 * Empty output scores 0; output shorter than `minChars` (default 20) scores
 * proportionally to its length; output longer than `maxChars` (default 8000)
 * loses score linearly with the overshoot, floored at 0; anything in between
 * scores 1. Default weight is 0.5.
 */
function outputLengthRubric(args = {}) {
  const lowerBound = args.minChars ?? 20;
  const upperBound = args.maxChars ?? 8e3;
  return {
    id: "output-length",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    async grade({ step }) {
      const text = step.span.output ?? "";
      const size = text.length;
      if (size === 0) {
        return { score: 0, rationale: "empty output" };
      }
      if (size < lowerBound) {
        const partial = Math.max(0, size / lowerBound);
        return { score: partial, rationale: `below min (${size} < ${lowerBound})` };
      }
      if (size > upperBound) {
        const penalised = Math.max(0, 1 - (size - upperBound) / upperBound);
        return { score: penalised, rationale: `above max (${size} > ${upperBound})` };
      }
      return { score: 1, rationale: `${size} chars in bounds` };
    },
  };
}
|
|
5815
|
+
/**
 * Rubric for tool steps that scores whether the call produced a usable result.
 *
 * Scores 0 for an errored call, 0.3 for a null/undefined result, 0.5 when the
 * result's text form is shorter than four characters, and 1 otherwise.
 * Default weight is 1.
 */
function toolSuccessRubric(args = {}) {
  return {
    id: "tool-success",
    kinds: ["tool"],
    weight: args.weight ?? 1,
    async grade({ step }) {
      const tool = step.span;
      if (tool.status === "error") return { score: 0, rationale: `error: ${tool.error ?? "unknown"}` };
      const r = tool.result;
      if (r === null || r === void 0) return { score: 0.3, rationale: "empty result" };
      let asText;
      if (typeof r === "string") {
        asText = r;
      } else {
        try {
          asText = JSON.stringify(r);
        } catch {
          // Cyclic structures and BigInt values cannot be serialised.
          asText = void 0;
        }
        // JSON.stringify also yields undefined for functions and symbols;
        // fall back to the generic string form so grading never throws.
        asText ??= String(r);
      }
      if (asText.length < 4) return { score: 0.5, rationale: "tiny result" };
      return { score: 1, rationale: `${tool.toolName} ok` };
    },
  };
}
|
|
5831
|
+
/**
 * Rubric for tool steps that penalises repeating an earlier identical call.
 *
 * A call is a duplicate when an earlier tool step has the same tool name and
 * the same arguments (compared through their stable string form). A novel
 * call scores 1; each duplicate subtracts 0.5, floored at 0. Default weight
 * is 0.5.
 */
function toolNonRedundantRubric(args = {}) {
  const weight = args.weight ?? 0.5;
  return {
    id: "tool-non-redundant",
    kinds: ["tool"],
    weight,
    async grade({ step, prior }) {
      const tool = step.span;
      const novel = { score: 1, rationale: "novel call" };
      // Compare names first so arguments are only serialised when an earlier
      // call to the same tool exists.
      const sameTool = prior.filter((p) => p.span.kind === "tool" && p.span.toolName === tool.toolName);
      if (sameTool.length === 0) return novel;
      // Serialise the current call's arguments once, not once per prior step.
      const argsKey = stableStringify2(tool.args);
      const duplicates = sameTool.filter((p) => stableStringify2(p.span.args) === argsKey).length;
      if (duplicates === 0) return novel;
      return { score: Math.max(0, 1 - duplicates * 0.5), rationale: `${duplicates} duplicate(s)` };
    },
  };
}
|
|
5849
|
+
/**
 * Rubric for LLM steps that scores 0 when the output matches a refusal marker
 * and 1 otherwise.
 *
 * @param {{ weight?: number, markers?: RegExp[] }} [args] - Optional weight
 *   (default 1) and replacement list of refusal patterns.
 */
function nonRefusalRubric(args = {}) {
  const weight = args.weight ?? 1;
  const markers = args.markers ?? [
    /\bi\s+(?:can(?:not|'t)|won't|will\s+not)\b/i,
    /\b(?:as\s+an?\s+)?ai\b.*?\b(?:can't|cannot)\b/i
  ];
  return {
    id: "non-refusal",
    kinds: ["llm"],
    weight,
    async grade({ step }) {
      const llm = step.span;
      const out = llm.output ?? "";
      const refused = markers.some((re) => {
        // Patterns with the "g" or "y" flag resume from `lastIndex`; reset it
        // around each test so the result never depends on an earlier call.
        re.lastIndex = 0;
        const hit = re.test(out);
        re.lastIndex = 0;
        return hit;
      });
      return refused ? { score: 0, rationale: "refusal marker present" } : { score: 1, rationale: "no refusal" };
    },
  };
}
|
|
5867
|
+
/**
 * Rubric for LLM steps that checks whether the output names the tool that is
 * called next.
 *
 * Looks for the first tool step among the following steps. Abstains (null)
 * when there is none; scores 1 when the lower-cased output contains the
 * lower-cased tool name and 0.5 otherwise. Default weight is 0.5.
 */
function toolIntentAlignmentRubric(args = {}) {
  return {
    id: "tool-intent-alignment",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    async grade({ step, next }) {
      const upcoming = next.find((candidate) => candidate.span.kind === "tool");
      if (!upcoming) return null;
      const toolName = upcoming.span.toolName;
      const haystack = (step.span.output ?? "").toLowerCase();
      if (haystack.includes(toolName.toLowerCase())) {
        return { score: 1, rationale: `mentioned "${toolName}" before calling it` };
      }
      return { score: 0.5, rationale: `called "${toolName}" without announcing it` };
    },
  };
}
|
|
5883
|
+
/**
 * JSON-like serialisation with object keys emitted in sorted order, so two
 * structurally equal values give the same string regardless of the order in
 * which their keys were inserted.
 */
function stableStringify2(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) return JSON.stringify(value);

  if (Array.isArray(value)) {
    const items = value.map((item) => stableStringify2(item));
    return `[${items.join(",")}]`;
  }

  const fields = [];
  for (const key of Object.keys(value).sort()) {
    fields.push(`${JSON.stringify(key)}:${stableStringify2(value[key])}`);
  }
  return `{${fields.join(",")}}`;
}
|
|
5889
|
+
|
|
5890
|
+
// src/prm/training-export.ts
|
|
5891
|
+
/**
 * Turn graded runs into per-step training samples.
 *
 * For each graded step whose span is still present in the run's trajectory,
 * emits a sample holding the verdict (score, rationale, evidence) plus
 * context: the text of the graded step and up to `options.contextWindow`
 * (default 5) immediately preceding spans rendered as turns. Graded steps
 * whose span cannot be found are skipped.
 *
 * @returns {Promise<object[]>} One sample per resolvable graded step.
 */
async function exportTrainingData(store, graded, options = {}) {
  const contextSize = options.contextWindow ?? 5;
  const out = [];
  for (const g of graded) {
    const trajectory = await buildTrajectory(store, g.runId);
    // spanId -> position in the trajectory, built once per run so each graded
    // step resolves in constant time instead of scanning the step list.
    const indexBySpanId = new Map();
    trajectory.steps.forEach((s, position) => indexBySpanId.set(s.span.spanId, position));
    for (const gs of g.steps) {
      const idx = indexBySpanId.get(gs.spanId);
      if (idx === void 0) continue;
      const node = trajectory.steps[idx];
      const priorSpans = trajectory.steps.slice(Math.max(0, idx - contextSize), idx).map((s) => s.span);
      out.push({
        runId: g.runId,
        spanId: gs.spanId,
        rubricId: gs.rubricId,
        score: gs.score,
        context: {
          priorTurns: priorSpans.map(spanToTurn).filter((t) => t !== null),
          step: { kind: node.span.kind, text: spanToText(node.span) }
        },
        rationale: gs.rationale,
        evidence: gs.evidence
      });
    }
  }
  return out;
}
|
|
5918
|
+
/**
 * Serialise samples as newline-delimited JSON: one JSON document per line,
 * with a trailing newline after the last one.
 */
function toNdjson(samples) {
  const lines = [];
  for (const sample of samples) {
    lines.push(JSON.stringify(sample));
  }
  return `${lines.join("\n")}\n`;
}
|
|
5921
|
+
/**
 * Render a span as a chat-style turn.
 *
 * LLM spans become an "assistant" turn carrying the output, or, when there is
 * no output, the span's messages joined as "role: content" lines. Tool spans
 * become a "tool" turn of the form "name(args) → result". Any other span
 * yields null.
 */
function spanToTurn(span) {
  if (isLlmSpan(span)) {
    let content = span.output;
    if (content === null || content === void 0) {
      content = span.messages.map((m) => `${m.role}: ${m.content}`).join("\n");
    }
    return { role: "assistant", content };
  }
  if (!isToolSpan(span)) return null;
  const renderedArgs = safeStringify(span.args);
  const renderedResult = safeStringify(span.result);
  return {
    role: "tool",
    content: `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`,
  };
}
|
|
5934
|
+
/**
 * Plain-text rendering of a span: the output for LLM spans (empty string when
 * absent), "name(args) → result" for tool spans, and the span's name for
 * anything else.
 */
function spanToText(span) {
  if (isLlmSpan(span)) {
    return span.output ?? "";
  }
  if (isToolSpan(span)) {
    const renderedArgs = safeStringify(span.args);
    const renderedResult = safeStringify(span.result);
    return `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`;
  }
  return span.name;
}
|
|
5939
|
+
/**
 * Best-effort conversion of any value to a string; never throws for ordinary
 * values and always returns a string.
 *
 * null / undefined become "", strings pass through, everything else is
 * JSON-serialised. Values JSON cannot represent (cycles, BigInt, functions,
 * symbols) fall back to `String(v)`.
 */
function safeStringify(v) {
  if (v === null || v === void 0) return "";
  if (typeof v === "string") return v;
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols, so fall through to String() in that case as well.
    return JSON.stringify(v) ?? String(v);
  } catch {
    return String(v);
  }
}
|
|
5948
|
+
|
|
5949
|
+
// src/prm/inference.ts
|
|
5950
|
+
/**
 * Grade every candidate run with one grader and pick the best.
 *
 * @returns {Promise<{winner: object, ranked: object[], stdDev: number}>} The
 *   top-scoring result, all results ordered by descending aggregate score,
 *   and the population standard deviation of the aggregate scores.
 * @throws {Error} When `runIds` is empty.
 */
async function prmBestOfN(store, grader, runIds) {
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");

  const results = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
  const ranked = results.slice().sort((left, right) => right.aggregateScore - left.aggregateScore);

  let total = 0;
  for (const result of results) total += result.aggregateScore;
  const average = total / results.length;

  let squaredDeviation = 0;
  for (const result of results) squaredDeviation += (result.aggregateScore - average) ** 2;

  const [winner] = ranked;
  return { winner, ranked, stdDev: Math.sqrt(squaredDeviation / results.length) };
}
|
|
5958
|
+
/**
 * Grade every candidate run with several graders and pick the winner by
 * Borda count.
 *
 * Each grader ranks the candidates by its own aggregate score; a candidate
 * earns `candidates - rank` points per grader (rank 0 is best) and the final
 * order is by total points. The returned objects and `stdDev` come from the
 * first grader's results.
 *
 * @returns {Promise<{winner: object, ranked: object[], stdDev: number}>}
 * @throws {Error} When `graders` or `runIds` is empty.
 */
async function prmEnsembleBestOfN(store, graders, runIds) {
  if (graders.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 grader");
  // Same guard as prmBestOfN: with no candidates there is no winner and the
  // score statistics would be NaN.
  if (runIds.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 candidate required");
  const perGrader = await Promise.all(
    graders.map(async (g) => {
      const graded = await Promise.all(runIds.map((id) => g.grade(store, id)));
      return graded.sort((a, b) => b.aggregateScore - a.aggregateScore);
    })
  );
  const bordaScores = new Map();
  for (const ranking of perGrader) {
    ranking.forEach((g, rank) => {
      bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank));
    });
  }
  const canonical = perGrader[0];
  const byRun = new Map(canonical.map((g) => [g.runId, g]));
  const ranked = [...byRun.values()].sort(
    (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
  );
  const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
}
|
|
5981
|
+
|
|
5982
|
+
// src/bisector.ts
|
|
5983
|
+
/**
 * Generic bisection between a known-good and a known-bad state.
 *
 * `options` fields read here:
 * - `good`, `bad`: the two endpoint states.
 * - `runEval(state)`: resolves to a verdict object with at least `pass`.
 * - `halfway(good, bad)`: returns a state between the two, or `null` when no
 *   further split is possible.
 * - `equals(a, b)` (optional, defaults to `Object.is`).
 * - `maxIterations` (optional, defaults to 40).
 *
 * Both endpoints are always evaluated first. If `good` fails or `bad` passes
 * the result is flagged `inputInconsistent` and no search happens.
 *
 * @returns `{ culprit, path, converged, inputInconsistent }` where `path`
 *   lists every evaluated state merged with its verdict, in evaluation order.
 */
async function bisect(options) {
  const isSame = options.equals ?? ((x, y) => Object.is(x, y));
  const iterationCap = options.maxIterations ?? 40;
  const path = [];

  const goodVerdict = await options.runEval(options.good);
  path.push({ state: options.good, ...goodVerdict });
  const badVerdict = await options.runEval(options.bad);
  path.push({ state: options.bad, ...badVerdict });

  const inconsistent = (culprit) => ({ culprit, path, converged: false, inputInconsistent: true });
  if (!goodVerdict.pass) return inconsistent(options.good);
  if (badVerdict.pass) return inconsistent(options.bad);

  let passing = options.good;
  let failing = options.bad;
  for (let iteration = 0; iteration < iterationCap; iteration++) {
    const candidate = options.halfway(passing, failing);
    const exhausted = candidate === null || isSame(candidate, passing) || isSame(candidate, failing);
    if (exhausted) {
      // Nothing left between the endpoints: the failing endpoint is the culprit.
      return { culprit: failing, path, converged: true, inputInconsistent: false };
    }
    const verdict = await options.runEval(candidate);
    path.push({ state: candidate, ...verdict });
    if (verdict.pass) {
      passing = candidate;
    } else {
      failing = candidate;
    }
  }
  // Iteration budget exhausted before the interval collapsed.
  return { culprit: failing, path, converged: false, inputInconsistent: false };
}
|
|
6011
|
+
/**
 * Bisect over an ordered commit list using `bisect`.
 *
 * `options.commits` is the ordered list; `options.good` and `options.bad`
 * must both appear in it with `good` strictly before `bad`. The midpoint is
 * the commit at the floor of the average of the two positions; when the two
 * commits are adjacent the search stops.
 *
 * @throws {Error} when either SHA is absent from `commits`, or when `good`
 *   does not precede `bad`.
 * @returns the promise produced by `bisect`.
 */
async function commitBisect(options) {
  const history = options.commits;
  const positionOf = (sha) => history.indexOf(sha);

  const goodPosition = positionOf(options.good);
  const badPosition = positionOf(options.bad);
  if (goodPosition < 0 || badPosition < 0) {
    throw new Error(`commitBisect: good or bad SHA not in commit list (good=${options.good}, bad=${options.bad})`);
  }
  if (goodPosition >= badPosition) {
    throw new Error("commitBisect: good must precede bad in the commit list");
  }

  const midpoint = (g, b) => {
    const from = positionOf(g);
    const to = positionOf(b);
    // Adjacent commits leave nothing to test in between.
    return to - from <= 1 ? null : history[Math.floor((from + to) / 2)];
  };

  return bisect({
    good: options.good,
    bad: options.bad,
    runEval: options.runEval,
    maxIterations: options.maxIterations,
    halfway: midpoint
  });
}
|
|
6034
|
+
/**
 * Bisect between a passing and a failing prompt, one paragraph at a time.
 *
 * Both prompts are split into paragraphs (default: on blank lines) and must
 * have the same paragraph count (at least 2). A candidate prompt is described
 * by a mask string of length n: "0" at position i keeps the good paragraph,
 * "1" takes the bad one. `bisect` searches between the all-"0" and all-"1"
 * masks, flipping half of the still-differing positions at each step.
 *
 * `options` fields read here: `good`, `bad`, `runEval(promptText)`,
 * optional `paragraphSplitter(text)`, optional `maxIterations`
 * (default n + 5).
 *
 * @returns `{ culprit, path, converged, inputInconsistent,
 *   offendingParagraphIndex }` with `culprit` and `path[].state` rendered back
 *   to prompt text (paragraphs joined with a blank line).
 *   `offendingParagraphIndex` is the first position where the last passing
 *   mask differs from the culprit mask; it is left `undefined` when the
 *   endpoints were inconsistent, because no search took place.
 * @throws {Error} on paragraph-count mismatch or fewer than 2 paragraphs.
 */
async function promptBisect(options) {
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
  const join = (paragraphs) => paragraphs.join("\n\n");
  const goodParas = split(options.good);
  const badParas = split(options.bad);
  if (goodParas.length !== badParas.length) {
    throw new Error(`promptBisect: paragraph count mismatch (${goodParas.length} vs ${badParas.length})`);
  }
  if (goodParas.length < 2) {
    throw new Error("promptBisect: need at least 2 paragraphs to bisect");
  }
  const n = goodParas.length;
  const paragraphsFor = (mask) => mask.split("").map((c, i) => (c === "1" ? badParas[i] : goodParas[i]));

  const result = await bisect({
    good: "0".repeat(n),
    bad: "1".repeat(n),
    runEval: (mask) => options.runEval(join(paragraphsFor(mask))),
    maxIterations: options.maxIterations ?? n + 5,
    halfway: (g, b) => {
      const differing = [];
      for (let i = 0; i < g.length; i++) {
        if (g[i] !== b[i]) differing.push(i);
      }
      // Zero or one differing position: the interval cannot be split further.
      if (differing.length <= 1) return null;
      const chars = g.split("");
      for (const f of differing.slice(0, Math.ceil(differing.length / 2))) chars[f] = b[f];
      return chars.join("");
    },
    equals: (a, b) => a === b
  });

  const culprit = result.culprit;
  let offendingParagraphIndex;
  // With inconsistent endpoints the "last passing" entry may be the bad prompt
  // itself, which would point at a meaningless paragraph.
  if (!result.inputInconsistent) {
    const lastGood = result.path.filter((s) => s.pass).pop();
    if (lastGood) {
      for (let i = 0; i < n; i++) {
        if (lastGood.state[i] !== culprit[i]) {
          offendingParagraphIndex = i;
          break;
        }
      }
    }
  }

  const materializedPath = result.path.map((s) => ({
    state: join(paragraphsFor(s.state)),
    score: s.score,
    pass: s.pass
  }));
  return {
    culprit: join(paragraphsFor(culprit)),
    path: materializedPath,
    converged: result.converged,
    inputInconsistent: result.inputInconsistent,
    offendingParagraphIndex
  };
}
|
|
6097
|
+
|
|
6098
|
+
// src/counterfactual.ts
|
|
6099
|
+
/**
 * Re-run a recorded trajectory with one step mutated and report the change
 * in outcome score.
 *
 * Steps: load the original run, build its trajectory, apply `mutation` to the
 * step at `mutation.at`, start a new run through a `TraceEmitter` (tagged as
 * counterfactual, with `parentRunId` set to the original), then hand control
 * to `runner.executeFrom(context, emitter)`.
 *
 * @param store - trace store; `getRun` is called on it and it is handed to
 *   `buildTrajectory` and `TraceEmitter`.
 * @param originalRunId - id of the run to branch from.
 * @param mutation - object with `kind` and `at` (step index); other fields
 *   depend on the kind.
 * @param runner - object exposing `executeFrom(context, emitter)`.
 * @returns `{ counterfactualRunId, originalRunId, mutation, delta }`. Every
 *   field of `delta` is `null` when the corresponding score is missing
 *   (`undefined` or `null`).
 * @throws {Error} when the run is not found or `mutation.at` is out of range.
 */
async function runCounterfactual(store, originalRunId, mutation, runner) {
  const originalRun = await store.getRun(originalRunId);
  if (!originalRun) throw new Error(`counterfactual: run ${originalRunId} not found`);
  const trajectory = await buildTrajectory(store, originalRunId);
  if (mutation.at < 0 || mutation.at >= trajectory.steps.length) {
    throw new Error(`counterfactual: mutation.at=${mutation.at} out of range [0, ${trajectory.steps.length})`);
  }
  const targetStep = trajectory.steps[mutation.at];
  const mutatedStep = applyMutation(targetStep, mutation);
  const cfEmitter = new TraceEmitter(store);
  await cfEmitter.startRun({
    scenarioId: originalRun.scenarioId,
    variantId: originalRun.variantId ? `${originalRun.variantId}+cf:${mutation.kind}@${mutation.at}` : `cf:${mutation.kind}@${mutation.at}`,
    projectId: originalRun.projectId,
    parentRunId: originalRunId,
    layer: "meta",
    tags: { counterfactual: "true", mutationKind: mutation.kind, mutationAt: String(mutation.at) }
  });
  await runner.executeFrom(
    {
      originalRunId,
      originalTrajectory: trajectory,
      // Steps strictly before the mutated one are replayed as-is.
      prefix: trajectory.steps.slice(0, mutation.at),
      mutation,
      mutatedStep
    },
    cfEmitter
  );
  const counterfactual = await store.getRun(cfEmitter.runId);
  const originalScore = originalRun.outcome?.score ?? null;
  const counterfactualScore = counterfactual?.outcome?.score ?? null;
  const delta = {
    originalOutcomeScore: originalScore,
    counterfactualOutcomeScore: counterfactualScore,
    // Guard on the normalized values so a null score is never coerced to 0.
    deltaScore: originalScore !== null && counterfactualScore !== null ? counterfactualScore - originalScore : null
  };
  return { counterfactualRunId: cfEmitter.runId, originalRunId, mutation, delta };
}
|
|
6135
|
+
/**
 * Produce a mutated copy of a trajectory step.
 *
 * Supported `mutation.kind` values:
 * - "swap-model" (LLM spans): replaces `span.model` with `mutation.newModel`.
 * - "swap-tool-result" (tool spans): replaces `span.result` with
 *   `mutation.newResult`.
 * - "inject-system-message" (LLM spans): prepends a system message with
 *   `mutation.content` to `span.messages` (treated as empty when absent).
 * - "custom": returns whatever `mutation.apply(step)` returns.
 *
 * When the kind does not match the span kind, or is not recognized, the
 * original `step` object is returned unchanged. The built-in mutations never
 * modify `step` in place; they return shallow copies.
 *
 * @param step - object with a `span` carrying `kind`.
 * @param mutation - mutation descriptor, see above.
 * @returns the mutated (or original) step.
 */
function applyMutation(step, mutation) {
  const span = step.span;
  if (mutation.kind === "swap-model" && span.kind === "llm") {
    return { ...step, span: { ...span, model: mutation.newModel } };
  }
  if (mutation.kind === "swap-tool-result" && span.kind === "tool") {
    return { ...step, span: { ...span, result: mutation.newResult } };
  }
  if (mutation.kind === "inject-system-message" && span.kind === "llm") {
    return {
      ...step,
      span: {
        ...span,
        // Spreading undefined into an array throws; tolerate spans recorded without messages.
        messages: [{ role: "system", content: mutation.content }, ...(span.messages ?? [])]
      }
    };
  }
  if (mutation.kind === "custom") return mutation.apply(step);
  return step;
}
|
|
6157
|
+
/**
 * Summarize counterfactual results per mutation kind.
 *
 * Results are bucketed by `mutation.kind`; only numeric `delta.deltaScore`
 * values are counted. Kinds with no numeric delta are omitted.
 *
 * @param results - iterable of `{ mutation: { kind }, delta: { deltaScore } }`.
 * @returns array of `{ mutationKind, n, meanAbsDelta, meanSignedDelta }`
 *   sorted by descending `meanAbsDelta`.
 */
function attributeCounterfactuals(results) {
  // Register each kind on first sight so tie order follows first appearance.
  const deltasByKind = new Map();
  for (const result of results) {
    const kind = result.mutation.kind;
    let bucket = deltasByKind.get(kind);
    if (!bucket) {
      bucket = [];
      deltasByKind.set(kind, bucket);
    }
    const delta = result.delta.deltaScore;
    if (typeof delta === "number") bucket.push(delta);
  }

  const summary = [];
  for (const [mutationKind, deltas] of deltasByKind) {
    const n = deltas.length;
    if (n === 0) continue;
    let absTotal = 0;
    let signedTotal = 0;
    for (const delta of deltas) {
      absTotal += Math.abs(delta);
      signedTotal += delta;
    }
    summary.push({
      mutationKind,
      n,
      meanAbsDelta: absTotal / n,
      meanSignedDelta: signedTotal / n
    });
  }
  summary.sort((left, right) => right.meanAbsDelta - left.meanAbsDelta);
  return summary;
}
|
|
6174
|
+
|
|
6175
|
+
// src/cross-trace-diff.ts
|
|
6176
|
+
/**
 * Align the step sequences of two runs and attribute score differences to
 * individual steps.
 *
 * Builds both trajectories, aligns their steps with `align` (using
 * `options.stepEquals` or the default comparator), loads judge spans for both
 * runs, and produces one attribution per alignment op via `attributeStep`.
 *
 * @param store - trace store; `spans` and `getRun` are called on it and it
 *   is handed to `buildTrajectory`.
 * @param runA - id of the baseline run.
 * @param runB - id of the run being compared against the baseline.
 * @param options - optional `{ stepEquals(a, b) }`.
 * @returns `{ runA, runB, alignment, attributions, totalScoreDelta,
 *   prmDeltaSum }`. `totalScoreDelta` is B minus A and is `null` when either
 *   run has no outcome score (`undefined` or `null`). `prmDeltaSum` treats
 *   `null` per-step deltas as 0.
 */
async function crossTraceDiff(store, runA, runB, options = {}) {
  const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
  const eq = options.stepEquals ?? defaultStepEquals2;
  const alignment = align(a.steps, b.steps, eq);
  const [judgesA, judgesB] = await Promise.all([
    store.spans({ runId: runA, kind: "judge" }).then((s) => s.filter(isJudgeSpan)),
    store.spans({ runId: runB, kind: "judge" }).then((s) => s.filter(isJudgeSpan))
  ]);
  const prmByTargetA = indexPrmByTarget(judgesA);
  const prmByTargetB = indexPrmByTarget(judgesB);
  const attributions = alignment.map((ao) => attributeStep(ao, prmByTargetA, prmByTargetB));
  const prmDeltaSum = attributions.reduce((acc, at) => acc + (at.prmDelta ?? 0), 0);
  const [runRecA, runRecB] = await Promise.all([store.getRun(runA), store.getRun(runB)]);
  // Normalize missing scores to null first so a null score is never coerced to 0.
  const scoreA = runRecA?.outcome?.score ?? null;
  const scoreB = runRecB?.outcome?.score ?? null;
  const totalScoreDelta = scoreA !== null && scoreB !== null ? scoreB - scoreA : null;
  return { runA, runB, alignment, attributions, totalScoreDelta, prmDeltaSum };
}
|
|
6192
|
+
/**
 * Align two step sequences using a longest-common-subsequence table.
 *
 * Produces a list of ops in sequence order:
 * - `{ op: "match", a, b }` when `eq(a, b)` holds,
 * - `{ op: "replace", a, b }` for two unequal steps of the same span kind,
 * - `{ op: "delete", a }` for a step only in `a`,
 * - `{ op: "insert", b }` for a step only in `b`.
 *
 * A replace is only emitted when stepping diagonally keeps the LCS length
 * (`dp[i-1][j-1] === dp[i][j]`). Without that check a pair such as
 * [x, y] vs [y, x] collapses into two replaces and the common step is never
 * reported as a match.
 *
 * @param a - steps of the first run; each has `span.kind`.
 * @param b - steps of the second run.
 * @param eq - equality predicate `(stepA, stepB) => boolean`.
 * @returns array of alignment ops.
 */
function align(a, b, eq) {
  // dp[i][j] = LCS length of a[0..i) and b[0..j).
  const dp = Array.from({ length: a.length + 1 }, () => new Array(b.length + 1).fill(0));
  for (let row = 1; row <= a.length; row++) {
    for (let col = 1; col <= b.length; col++) {
      if (eq(a[row - 1], b[col - 1])) dp[row][col] = dp[row - 1][col - 1] + 1;
      else dp[row][col] = Math.max(dp[row - 1][col], dp[row][col - 1]);
    }
  }

  // Walk back from the bottom-right corner; ops are collected in reverse.
  const ops = [];
  let i = a.length;
  let j = b.length;
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0) {
      const stepA = a[i - 1];
      const stepB = b[j - 1];
      if (eq(stepA, stepB)) {
        ops.push({ op: "match", a: stepA, b: stepB });
        i--;
        j--;
        continue;
      }
      // dp[i-1][j-1] === dp[i][j] implies the up and left cells are equal too,
      // so pairing these two steps costs no match.
      const diagonalIsFree = dp[i - 1][j - 1] === dp[i][j];
      if (diagonalIsFree && stepA.span.kind === stepB.span.kind) {
        ops.push({ op: "replace", a: stepA, b: stepB });
        i--;
        j--;
        continue;
      }
    }
    if (i > 0 && (j === 0 || dp[i - 1][j] >= dp[i][j - 1])) {
      ops.push({ op: "delete", a: a[i - 1] });
      i--;
    } else {
      ops.push({ op: "insert", b: b[j - 1] });
      j--;
    }
  }
  return ops.reverse();
}
|
|
6230
|
+
/**
 * Default step comparator for trace alignment.
 *
 * Two steps are equal when their spans share the same `kind` and, depending
 * on that kind, the same `toolName` (tool), the same `model` (llm), or the
 * same `name` (anything else).
 *
 * @param a - step with a `span`.
 * @param b - step with a `span`.
 * @returns {boolean}
 */
function defaultStepEquals2(a, b) {
  const left = a.span;
  const right = b.span;
  if (left.kind !== right.kind) return false;
  switch (left.kind) {
    case "tool":
      return left.toolName === right.toolName;
    case "llm":
      return left.model === right.model;
    default:
      return left.name === right.name;
  }
}
|
|
6236
|
+
/**
 * Sum judge scores per judged span.
 *
 * @param judges - iterable of judge spans carrying `targetSpanId` and `score`.
 * @returns {Map} map from `targetSpanId` to the total of all scores recorded
 *   against it.
 */
function indexPrmByTarget(judges) {
  const totals = new Map();
  for (const judge of judges) {
    const target = judge.targetSpanId;
    const runningTotal = totals.get(target) ?? 0;
    totals.set(target, runningTotal + judge.score);
  }
  return totals;
}
|
|
6244
|
+
/**
 * Duration of a span as `endedAt - startedAt`.
 *
 * @param s - span with optional `startedAt` / `endedAt`.
 * @returns the difference, or `null` when either timestamp is missing
 *   (`undefined` or `null`). A timestamp of 0 is a valid value, not a
 *   missing one.
 */
function spanLatency(s) {
  // Explicit null checks: a truthiness test would discard a legitimate 0.
  if (s.startedAt == null || s.endedAt == null) return null;
  return s.endedAt - s.startedAt;
}
|
|
6247
|
+
/**
 * Total token count of an LLM span.
 *
 * @param s - span with `kind` and optional `inputTokens` / `outputTokens`.
 * @returns input plus output tokens (a missing side counts as 0), or `null`
 *   for spans whose kind is not "llm".
 */
function spanTokens(s) {
  const isLlm = s.kind === "llm";
  if (!isLlm) return null;
  const consumed = s.inputTokens ?? 0;
  const produced = s.outputTokens ?? 0;
  return consumed + produced;
}
|
|
6251
|
+
/**
 * Turn one alignment op into an attribution record.
 *
 * - match: `prmDelta` is B minus A when both sides have a PRM total, else
 *   `null`; latency and token deltas are filled when available on both sides.
 * - replace: `prmDelta` is B minus A with a missing total counted as 0.
 * - insert: `prmDelta` is B's total (0 when missing).
 * - anything else (delete): `prmDelta` is the negated A total (0 when missing).
 *
 * Latency and token deltas are `null` for every op other than match.
 *
 * @param op - alignment op with `op` and `a` and/or `b` steps.
 * @param prmA - Map from span id to PRM total for run A.
 * @param prmB - Map from span id to PRM total for run B.
 * @returns `{ op, prmDelta, latencyDeltaMs, tokenDelta, note }`.
 */
function attributeStep(op, prmA, prmB) {
  switch (op.op) {
    case "match": {
      const scoreA = prmA.get(op.a.span.spanId);
      const scoreB = prmB.get(op.b.span.spanId);
      const hasBoth = scoreA !== void 0 && scoreB !== void 0;
      const prmDelta = hasBoth ? scoreB - scoreA : null;
      const latencyA = spanLatency(op.a.span);
      const latencyB = spanLatency(op.b.span);
      const tokensA = spanTokens(op.a.span);
      const tokensB = spanTokens(op.b.span);
      let note = "matched step, PRM delta recorded";
      if (prmDelta === null) note = "matched step, no PRM coverage";
      return {
        op,
        prmDelta,
        latencyDeltaMs: latencyA !== null && latencyB !== null ? latencyB - latencyA : null,
        tokenDelta: tokensA !== null && tokensB !== null ? tokensB - tokensA : null,
        note
      };
    }
    case "replace": {
      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
      return {
        op,
        prmDelta: scoreB - scoreA,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `replaced ${op.a.span.kind}/${op.a.span.name} \u2192 ${op.b.span.kind}/${op.b.span.name}`
      };
    }
    case "insert": {
      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
      return {
        op,
        prmDelta: scoreB,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `inserted step in B (${op.b.span.kind}/${op.b.span.name})`
      };
    }
    default: {
      // Remaining op kind: a step that exists only in A.
      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
      return {
        op,
        prmDelta: -scoreA,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `deleted step from A (${op.a.span.kind}/${op.a.span.name})`
      };
    }
  }
}
|
|
6298
|
+
|
|
6299
|
+
// src/pre-registration.ts
|
|
6300
|
+
/**
 * Attach a SHA-256 content hash to a manifest.
 *
 * The manifest is canonicalized with `canonicalize2`, serialized with
 * `JSON.stringify`, UTF-8 encoded and hashed through Web Crypto.
 *
 * @param m - manifest object to sign.
 * @returns a shallow copy of `m` with `contentHash` set to the lowercase hex
 *   digest.
 */
async function signManifest(m) {
  const serialized = JSON.stringify(canonicalize2(m));
  const encoded = new TextEncoder().encode(serialized);
  const digest = await globalThis.crypto.subtle.digest("SHA-256", encoded);
  let hex = "";
  for (const byte of new Uint8Array(digest)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return { ...m, contentHash: hex };
}
|
|
6307
|
+
/**
 * Check that a manifest's `contentHash` matches its content.
 *
 * Strips `contentHash`, re-signs the remainder with `signManifest` and
 * compares the two hashes.
 *
 * @param m - signed manifest.
 * @returns {Promise<boolean>} `true` when the recomputed hash equals the
 *   stored one.
 */
async function verifyManifest(m) {
  const { contentHash: claimed, ...unsigned } = m;
  const { contentHash: recomputed } = await signManifest(unsigned);
  return recomputed === claimed;
}
|
|
6312
|
+
/**
 * Evaluate observed results against a pre-registered hypothesis manifest.
 *
 * The manifest must verify against its content hash. The hypothesis is
 * confirmed only when all of the following hold:
 * - the effect has the registered sign ("increase" means > 0, anything else
 *   means < 0),
 * - `|effect|` is at least `manifest.minEffect`,
 * - `pValue` is a finite number below `manifest.alpha`,
 * - `n` is a finite number of at least `manifest.preRegisteredN`.
 *
 * A non-finite or missing `pValue` or `n` is rejected rather than passed:
 * comparisons against NaN or undefined are always false, which would
 * otherwise let them slip through the gates.
 *
 * @param manifest - signed manifest with `direction`, `minEffect`, `alpha`,
 *   `preRegisteredN`.
 * @param observed - `{ effect, pValue, n }`.
 * @returns `{ manifest, observedN, observedEffect, observedPValue, confirmed,
 *   rejectionReasons }`.
 * @throws {Error} when the manifest hash does not verify.
 */
async function evaluateHypothesis(manifest, observed) {
  if (!await verifyManifest(manifest)) {
    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
  }
  const reasons = [];
  const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
  if (!directionOk) reasons.push("wrong_direction");
  if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
  if (!Number.isFinite(observed.pValue) || observed.pValue >= manifest.alpha) reasons.push("not_significant");
  if (!Number.isFinite(observed.n) || observed.n < manifest.preRegisteredN) reasons.push("undersampled");
  return {
    manifest,
    observedN: observed.n,
    observedEffect: observed.effect,
    observedPValue: observed.pValue,
    confirmed: reasons.length === 0,
    rejectionReasons: reasons
  };
}
|
|
6331
|
+
/**
 * Produce a structurally identical value whose object keys are in sorted
 * order, so that `JSON.stringify` output is independent of key insertion
 * order.
 *
 * Primitives and `null` are returned as-is; arrays are mapped element by
 * element (order preserved); every other object is rebuilt from its own
 * enumerable string keys.
 *
 * The rebuilt object is created with `Object.fromEntries`, which defines
 * data properties. Plain assignment would route a key named `__proto__`
 * (possible in `JSON.parse` output) to the prototype setter and silently
 * drop it from the canonical form.
 *
 * NOTE(review): objects with no enumerable own keys (e.g. `Date`) come out
 * as `{}`, so their value does not influence the result.
 *
 * @param {unknown} v - value to canonicalize.
 * @returns {unknown} canonical copy.
 */
function canonicalize2(v) {
  if (v === null || typeof v !== "object") return v;
  if (Array.isArray(v)) return v.map((item) => canonicalize2(item));
  const sortedKeys = Object.keys(v).sort();
  return Object.fromEntries(sortedKeys.map((k) => [k, canonicalize2(v[k])]));
}
|
|
6339
|
+
|
|
6340
|
+
// src/self-play.ts
|
|
6341
|
+
/**
 * Evolve a set of discriminating scenarios through proposer/scorer rounds.
 *
 * Each round asks `proposer.propose(round, priorSurvivors)` for candidates
 * and scores every candidate against all `targets` via
 * `scorer.scoreCandidate(candidate, targets)`. A candidate is rejected when
 * the scorer returns fewer than 2 results, when its best score is below
 * `minAbsoluteFloor`, or when the gap between its best and worst score is
 * below `minSpread`. Survivors are ordered by descending spread and capped
 * at `maxSurvivors`; they feed the next round and are collected into the
 * final `Dataset`.
 *
 * Each survivor carries its own spread into the sort, so ordering does not
 * depend on candidate ids being unique and needs no per-comparison lookup.
 *
 * @param proposer - object exposing `propose(round, priorSurvivors)`.
 * @param scorer - object exposing `scoreCandidate(candidate, targets)`.
 * @param targets - at least 2 targets to compare.
 * @param options - optional `{ minSpread = 0.1, minAbsoluteFloor = 0.1,
 *   maxSurvivors = 50, rounds = 1 }`.
 * @returns `{ rounds, dataset }` where each round records `proposed`,
 *   `survived`, `rejected` and `scoredBreakdown`.
 * @throws {Error} when fewer than 2 targets are given.
 */
async function runSelfPlay(proposer, scorer, targets, options = {}) {
  if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
  const minSpread = options.minSpread ?? 0.1;
  const floor = options.minAbsoluteFloor ?? 0.1;
  const maxSurvivors = options.maxSurvivors ?? 50;
  const totalRounds = options.rounds ?? 1;
  const allRounds = [];
  const datasetScenarios = [];
  let priorSurvivors = [];
  for (let r = 0; r < totalRounds; r++) {
    const proposed = await proposer.propose(r, priorSurvivors);
    const scored = [];
    const rejected = [];
    const surviving = [];
    // Candidates are scored one at a time, in proposal order.
    for (const candidate of proposed) {
      const scores = await scorer.scoreCandidate(candidate, targets);
      if (scores.length < 2) {
        rejected.push({ candidate, reason: "scorer returned <2 results" });
        continue;
      }
      const values = scores.map((s) => s.score);
      const maxScore = Math.max(...values);
      const spread = maxScore - Math.min(...values);
      scored.push({ candidate, scores, spread });
      if (maxScore < floor) {
        rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
        continue;
      }
      if (spread < minSpread) {
        rejected.push({ candidate, reason: `spread below threshold (${spread.toFixed(3)} < ${minSpread})` });
        continue;
      }
      surviving.push({ candidate, spread });
    }
    surviving.sort((a, b) => b.spread - a.spread);
    const capped = surviving.slice(0, maxSurvivors).map((entry) => entry.candidate);
    for (const s of capped) {
      datasetScenarios.push({
        id: s.id,
        payload: s.payload,
        split: "test",
        tags: { ...s.tags, evolutionRound: String(r), origin: "self-play" }
      });
    }
    allRounds.push({ round: r, proposed, survived: capped, rejected, scoredBreakdown: scored });
    priorSurvivors = capped;
  }
  const dataset = new Dataset({
    name: "self-play-survivors",
    provenance: {
      version: "1.0.0",
      createdAt: (/* @__PURE__ */ new Date()).toISOString(),
      contributor: "self-play",
      description: `Evolved across ${totalRounds} round(s), ${allRounds.reduce((a, r) => a + r.survived.length, 0)} survivors`
    },
    scenarios: datasetScenarios
  });
  return { rounds: allRounds, dataset };
}
|
|
6404
|
+
|
|
6405
|
+
// src/causal-attribution.ts
|
|
6406
|
+
/**
 * Decompose score variance across factorial cells into main effects,
 * pairwise interactions and a residual.
 *
 * Factors are the keys of the first cell's `levels`. For each factor the
 * cells are grouped by level and the (unweighted) variance of the level
 * means around the grand mean is divided by the total variance. For each
 * factor pair the same is done on level-pair means, then both main effects
 * are subtracted and the result is clamped at 0. The residual is
 * `1 - mains - interactions`, also clamped at 0.
 *
 * @param cells - at least 4 objects `{ levels: Record<string, ...>, score }`.
 * @returns `{ totalVariance, mainEffects, interactions, residualShare,
 *   sharesSum }`. When all scores are identical every main effect is 0,
 *   there are no interactions and the residual share is 1.
 * @throws {Error} with fewer than 4 cells or fewer than 2 factors.
 */
function causalAttribution(cells) {
  if (cells.length < 4) throw new Error("causalAttribution: need \u2265 4 cells to estimate effects");
  const factors = Object.keys(cells[0].levels);
  if (factors.length < 2) throw new Error("causalAttribution: need \u2265 2 factors");

  const cellCount = cells.length;
  let scoreTotal = 0;
  for (const cell of cells) scoreTotal += cell.score;
  const grandMean = scoreTotal / cellCount;
  let squaredTotal = 0;
  for (const cell of cells) squaredTotal += (cell.score - grandMean) ** 2;
  const totalVariance = squaredTotal / cellCount;

  if (totalVariance === 0) {
    const flatEffects = factors.map((f) => ({ factor: f, shareOfVariance: 0, range: 0 }));
    return { totalVariance: 0, mainEffects: flatEffects, interactions: [], residualShare: 1, sharesSum: 1 };
  }

  // Mean score of each group produced by `keyFn`, in group-creation order.
  const groupMeans = (keyFn) => {
    const means = [];
    for (const members of groupBy2(cells, keyFn).values()) {
      let total = 0;
      for (const member of members) total += member.score;
      means.push(total / members.length);
    }
    return means;
  };
  // Unweighted variance of group means around the grand mean.
  const varianceAroundGrandMean = (means) => {
    let total = 0;
    for (const m of means) total += (m - grandMean) ** 2;
    return total / means.length;
  };

  const mainEffects = factors.map((factor) => {
    const means = groupMeans((c) => c.levels[factor]);
    return {
      factor,
      shareOfVariance: varianceAroundGrandMean(means) / totalVariance,
      range: Math.max(...means) - Math.min(...means)
    };
  });

  const interactions = [];
  for (let first = 0; first < factors.length; first++) {
    for (let second = first + 1; second < factors.length; second++) {
      const pairMeans = groupMeans((c) => `${c.levels[factors[first]]}|${c.levels[factors[second]]}`);
      const pairVariance = varianceAroundGrandMean(pairMeans);
      const mainFirst = mainEffects[first].shareOfVariance * totalVariance;
      const mainSecond = mainEffects[second].shareOfVariance * totalVariance;
      const interactionVariance = Math.max(0, pairVariance - mainFirst - mainSecond);
      interactions.push({
        factors: [factors[first], factors[second]],
        shareOfVariance: interactionVariance / totalVariance
      });
    }
  }

  let mainSum = 0;
  for (const effect of mainEffects) mainSum += effect.shareOfVariance;
  let interactionSum = 0;
  for (const interaction of interactions) interactionSum += interaction.shareOfVariance;
  const residualShare = Math.max(0, 1 - mainSum - interactionSum);
  const sharesSum = mainSum + interactionSum + residualShare;
  return { totalVariance, mainEffects, interactions, residualShare, sharesSum };
}
|
|
6453
|
+
/**
 * Group items into a Map keyed by `key(item)`.
 *
 * @param items - iterable of items.
 * @param key - function returning the group key for an item.
 * @returns {Map} map from key to the array of items that produced it; keys
 *   and items keep their first-seen / input order.
 */
function groupBy2(items, key) {
  const groups = new Map();
  for (const item of items) {
    const groupKey = key(item);
    const existing = groups.get(groupKey);
    if (existing === undefined || existing === null) {
      groups.set(groupKey, [item]);
    } else {
      existing.push(item);
    }
  }
  return groups;
}
|
|
6463
|
+
|
|
6464
|
+
// src/active-learning.ts
|
|
6465
|
+
/**
 * Suggest where new evaluation scenarios would be most valuable.
 *
 * Four kinds of target are produced, then sorted by descending `priority`
 * and truncated to `topK`:
 * - "difficulty-gap": a difficulty band ("easy", "medium", "hard",
 *   "extreme") with fewer than `minPerBand` scenarios.
 * - "undersampled": a scenario whose run count is below 3 and at or below
 *   the 25th percentile of run counts.
 * - "high-variance": a scenario with at least 3 numeric outcome scores whose
 *   population variance exceeds `varianceThreshold`.
 * - "failure-cluster": a failure class (from `classifyFailure`) seen on at
 *   least 3 non-passing runs.
 *
 * Runs are indexed by scenario once instead of re-filtering the full run
 * list per scenario, and the spans and events of a run are fetched together.
 *
 * @param dataset - object exposing `all()` returning scenarios with `id`
 *   and `difficulty`.
 * @param traceStore - object exposing `listRuns()`, `spans({ runId })` and
 *   `events({ runId })`.
 * @param options - optional `{ minPerBand = 5, varianceThreshold = 0.05,
 *   topK = 10 }`.
 * @returns array of `{ reason, description, neighbors, direction, priority }`.
 */
async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
  const minPerBand = options.minPerBand ?? 5;
  const varianceThreshold = options.varianceThreshold ?? 0.05;
  const topK = options.topK ?? 10;
  const scenarios = dataset.all();
  const targets = [];

  const BANDS = ["easy", "medium", "hard", "extreme"];
  for (const band of BANDS) {
    const inBand = scenarios.filter((s) => s.difficulty === band);
    const count = inBand.length;
    if (count < minPerBand) {
      targets.push({
        reason: "difficulty-gap",
        description: `difficulty="${band}" has ${count} scenario(s) \u2014 below minimum ${minPerBand}`,
        neighbors: inBand.slice(0, 3),
        direction: `create more "${band}" scenarios; reuse domain but shift complexity`,
        priority: Math.max(0, 1 - count / minPerBand)
      });
    }
  }

  const runs = await traceStore.listRuns();
  const runsByScenario = new Map();
  for (const run of runs) {
    const bucket = runsByScenario.get(run.scenarioId);
    if (bucket) bucket.push(run);
    else runsByScenario.set(run.scenarioId, [run]);
  }
  // Only scenarios that have at least one run contribute to the percentile.
  const runCounts = [...runsByScenario.values()].map((bucket) => bucket.length);
  const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;

  for (const s of scenarios) {
    const count = runsByScenario.get(s.id)?.length ?? 0;
    if (count <= p25 && count < 3) {
      targets.push({
        reason: "undersampled",
        description: `scenario "${s.id}" has only ${count} run(s)`,
        neighbors: [s],
        direction: `create near-duplicates of "${s.id}" to stabilize its mean`,
        priority: Math.max(0, 1 - count / 3) * 0.7
      });
    }
  }

  for (const s of scenarios) {
    const scenarioRuns = runsByScenario.get(s.id) ?? [];
    const scores = scenarioRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
    if (scores.length < 3) continue;
    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
    const variance = scores.reduce((a, b) => a + (b - mean) ** 2, 0) / scores.length;
    if (variance > varianceThreshold) {
      targets.push({
        reason: "high-variance",
        description: `scenario "${s.id}" has unstable scoring (variance ${variance.toFixed(3)})`,
        neighbors: [s],
        direction: `disambiguate the scenario description \u2014 current wording admits too many valid interpretations`,
        priority: Math.min(1, variance * 5)
      });
    }
  }

  const failureByClass = new Map();
  for (const run of runs) {
    if (run.outcome?.pass === true) continue;
    const [spans, events] = await Promise.all([
      traceStore.spans({ runId: run.runId }),
      traceStore.events({ runId: run.runId })
    ]);
    const { failureClass } = classifyFailure({ run, spans, events });
    if (failureClass === "success" || failureClass === "unknown") continue;
    const bucket = failureByClass.get(failureClass);
    if (bucket) bucket.push(run);
    else failureByClass.set(failureClass, [run]);
  }
  for (const [cls, failedRuns] of failureByClass) {
    if (failedRuns.length < 3) continue;
    const affected = new Set(failedRuns.map((r) => r.scenarioId));
    const neighbors = scenarios.filter((s) => affected.has(s.id)).slice(0, 3);
    targets.push({
      reason: "failure-cluster",
      description: `failure class "${cls}" observed ${failedRuns.length}\xD7 across ${affected.size} scenario(s)`,
      neighbors,
      direction: `create scenarios that exercise "${cls}" recovery \u2014 currently a systematic weakness`,
      priority: Math.min(1, failedRuns.length / 10)
    });
  }

  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
}
|
|
6545
|
+
/**
 * Linear-interpolated quantile of a list of numbers.
 *
 * The input is copied and sorted ascending; the quantile position is
 * `p * (length - 1)` and the result interpolates between the two
 * neighbouring sorted values.
 *
 * @param xs - numbers (not modified).
 * @param p - quantile in [0, 1].
 * @returns {number} the interpolated value.
 */
function quantile(xs, p) {
  const ordered = Array.from(xs);
  ordered.sort((x, y) => x - y);
  const position = p * (ordered.length - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  const fraction = position - below;
  return ordered[below] + (ordered[above] - ordered[below]) * fraction;
}
|
|
6552
|
+
|
|
6553
|
+
// src/reward-model-export.ts
|
|
6554
|
+
/**
 * Grade a set of runs and package the result as a reward-model export.
 *
 * Every run is graded via `grader.grade(store, runId)`; the graded results
 * are converted to training samples by `exportTrainingData` and serialized
 * with `toNdjson`.
 *
 * @param store - passed through to the grader and to `exportTrainingData`.
 * @param grader - object exposing `grade(store, runId)`.
 * @param runIds - run ids to grade.
 * @returns `{ version: "1.0", metadata, trainingNdjson }` where `metadata`
 *   holds `nTraces`, `nSamples`, the distinct `rubrics`, `exportedAt` (ISO
 *   timestamp) and `meanReward` (0 when there are no samples).
 */
async function exportRewardModel(store, grader, runIds) {
  const graded = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
  const samples = await exportTrainingData(store, graded);

  const distinctRubrics = new Set();
  let rewardTotal = 0;
  for (const sample of samples) {
    distinctRubrics.add(sample.rubricId);
    rewardTotal += sample.score;
  }
  const nSamples = samples.length;
  const meanReward = nSamples > 0 ? rewardTotal / nSamples : 0;

  const metadata = {
    nTraces: graded.length,
    nSamples,
    rubrics: [...distinctRubrics],
    exportedAt: (/* @__PURE__ */ new Date()).toISOString(),
    meanReward
  };
  return { version: "1.0", metadata, trainingNdjson: toNdjson(samples) };
}
|
|
6571
|
+
/**
 * Wrap a grader as a trajectory scorer.
 *
 * @param grader - object exposing `grade(store, runId)` resolving to a
 *   record with `aggregateScore`.
 * @returns scorer `{ score(trajectory, store), metadata }`; `score` grades
 *   `trajectory.runId` and resolves to the grader's `aggregateScore`.
 */
function loadScorerFromGrader(grader) {
  const metadata = {
    rubrics: ["grader-backed"],
    deterministic: true
  };
  const score = async (trajectory, store) => {
    const { aggregateScore } = await grader.grade(store, trajectory.runId);
    return aggregateScore;
  };
  return { score, metadata };
}
|
|
6583
|
+
/**
 * Score every run in a corpus with the given scorer.
 *
 * For each run id the trajectory and the run record are loaded together,
 * then the scorer is applied to the trajectory. All runs are processed
 * concurrently.
 *
 * @param store - trace store; `getRun` is called on it and it is handed to
 *   `buildTrajectory` and to the scorer.
 * @param scorer - object exposing `score(trajectory, store)`.
 * @param runIds - run ids to replay.
 * @returns array (same order as `runIds`) of `{ runId, score, outcomeScore }`
 *   where `outcomeScore` is the recorded outcome score or `null`.
 */
async function replayScorerOverCorpus(store, scorer, runIds) {
  const replayOne = async (runId) => {
    const loaded = await Promise.all([buildTrajectory(store, runId), store.getRun(runId)]);
    const trajectory = loaded[0];
    const run = loaded[1];
    const score = await scorer.score(trajectory, store);
    return { runId, score, outcomeScore: run?.outcome?.score ?? null };
  };
  return Promise.all(runIds.map((runId) => replayOne(runId)));
}
|
|
6595
|
+
|
|
6596
|
+
// src/governance/types.ts
|
|
6597
|
+
/**
 * Render a governance report as Markdown.
 *
 * Layout: a title with framework and system name, a bullet list with
 * organization, period, owner and generation time, a summary section with
 * one bullet per severity that has at least one finding, then one section
 * per finding (summary text plus optional evidence and remediation).
 *
 * The owner bullet falls back to "(unassigned)" when `report.context.owner`
 * is absent, so a report whose findings flag a missing owner can still be
 * rendered.
 *
 * Note: the severity counts are read from `report.summary.byeverity` (sic);
 * that spelling is the key produced by `summarize`.
 *
 * @param report - `{ framework, generatedAt, context, summary, findings }`.
 * @returns {string} Markdown text with "\n" line breaks.
 */
function renderMarkdown(report) {
  const sevEmoji = {
    info: "\u2139\uFE0E",
    low: "\xB7",
    medium: "!",
    high: "!!",
    critical: "\u203C"
  };
  const { context, summary } = report;
  const owner = context.owner;
  const lines = [
    `# ${report.framework} report \u2014 ${context.systemName}`,
    "",
    `- Organization: **${context.organization}**`,
    `- Period: ${context.periodStart} \u2192 ${context.periodEnd}`,
    owner ? `- Owner: ${owner.role} ${owner.name} <${owner.email}>` : "- Owner: (unassigned)",
    `- Generated: ${report.generatedAt}`,
    "",
    `## Summary \u2014 ${summary.overall}`,
    "",
    `${summary.findings} finding(s).`
  ];
  for (const [sev, n] of Object.entries(summary.byeverity)) {
    if (n > 0) lines.push(`- ${sevEmoji[sev]} ${sev}: ${n}`);
  }
  lines.push("", "## Findings", "");
  for (const f of report.findings) {
    lines.push(`### ${sevEmoji[f.severity]} ${f.id} \u2014 ${f.control}`, "", f.summary);
    if (f.evidence) {
      lines.push("", `**Evidence:** ${f.evidence}`);
    }
    if (f.remediation) {
      lines.push("", `**Remediation:** ${f.remediation}`);
    }
    lines.push("");
  }
  return lines.join("\n");
}
|
|
6638
|
+
/**
 * Aggregate a list of findings into a report summary.
 *
 * The overall verdict is "non-compliant" when any critical or high finding is
 * present, "compliant-with-findings" when any medium or low finding is
 * present, and "compliant" otherwise (info-only findings do not change it).
 *
 * @param {Array<{severity: string}>} findings - Findings to tally.
 * @returns {{findings: number, byeverity: object, overall: string}} Total
 *   count, per-severity counts and the overall verdict.
 */
function summarize(findings) {
  const tally = { info: 0, low: 0, medium: 0, high: 0, critical: 0 };
  for (const { severity } of findings) {
    tally[severity]++;
  }

  let verdict = "compliant";
  if (tally.critical + tally.high > 0) {
    verdict = "non-compliant";
  } else if (tally.medium + tally.low > 0) {
    verdict = "compliant-with-findings";
  }

  return { findings: findings.length, byeverity: tally, overall: verdict };
}
|
|
6650
|
+
|
|
6651
|
+
// src/governance/nist-ai-rmf.ts
|
|
6652
|
+
/**
 * Build a NIST AI RMF governance report from a governance context.
 *
 * Checks performed, in the order their findings are emitted:
 *  - GOVERN-1.1: an owner with an email is recorded;
 *  - GOVERN-1.3: at least one dataset is recorded, and each dataset manifest
 *    carries a content hash of at least 16 characters;
 *  - MEASURE-2.6: a red-team report is attached and its overall pass rate is
 *    at least 0.8;
 *  - MEASURE-2.1: the trace store holds at least one run in the period;
 *  - MEASURE-2.11: judge calibration exists and no judge has a finite Pearson
 *    correlation below 0.6;
 *  - MANAGE-1.1: an outcome store is attached and holds outcomes in the period.
 *
 * @param {object} ctx - Governance context. Fields read here: `owner`,
 *   `datasets`, `redTeam`, `judgeCalibration`, `traceStore.listRuns()`,
 *   `outcomeStore.list()` (optional), `periodStart`, `periodEnd`,
 *   `organization`, `systemName`.
 * @returns {Promise<object>} Report with `framework`, `version`, `context`,
 *   `summary`, `findings`, `payload` and `generatedAt`.
 */
async function nistAiRmfReport(ctx) {
  const since = Date.parse(ctx.periodStart);
  const until = Date.parse(ctx.periodEnd);

  // The run and outcome queries do not depend on each other, so issue them
  // together instead of waiting for one before starting the other.
  const [runs, outcomes] = await Promise.all([
    ctx.traceStore.listRuns({ since, until }),
    ctx.outcomeStore ? ctx.outcomeStore.list({ since, until }) : null
  ]);

  const findings = [];

  if (!ctx.owner?.email) {
    findings.push({
      id: "G-1.1",
      severity: "high",
      control: "NIST-AI-RMF:GOVERN-1.1",
      summary: "No responsible owner recorded for the AI system.",
      remediation: "Assign an accountable individual + email in GovernanceContext.owner."
    });
  }

  if (ctx.datasets.length === 0) {
    findings.push({
      id: "G-1.3",
      severity: "high",
      control: "NIST-AI-RMF:GOVERN-1.3",
      summary: "No versioned datasets recorded for the evaluation period.",
      remediation: "Register each dataset with a Dataset manifest (content hash + provenance)."
    });
  } else {
    for (const manifest of ctx.datasets) {
      // Length-only heuristic; the strict 64-hex check is reported separately
      // in payload.datasetHashChecks below.
      if (!manifest.contentHash || manifest.contentHash.length < 16) {
        findings.push({
          id: "G-1.3-hash",
          severity: "medium",
          control: "NIST-AI-RMF:GOVERN-1.3",
          summary: `Dataset "${manifest.name}" has weak or missing content hash.`,
          evidence: `contentHash="${manifest.contentHash}"`,
          remediation: "Call dataset.manifest() to compute SHA-256; commit the manifest alongside releases."
        });
      }
    }
  }

  if (!ctx.redTeam) {
    findings.push({
      id: "M-2.6",
      severity: "high",
      control: "NIST-AI-RMF:MEASURE-2.6",
      summary: "No red-team evaluation attached to the report period.",
      remediation: "Run redTeamDataset() against the system and attach the RedTeamReport to context.redTeam."
    });
  } else if (ctx.redTeam.overallPassRate < 0.8) {
    findings.push({
      id: "M-2.6-rate",
      severity: "high",
      control: "NIST-AI-RMF:MEASURE-2.6",
      summary: `Red-team pass rate ${(ctx.redTeam.overallPassRate * 100).toFixed(1)}% below 80% threshold.`,
      evidence: JSON.stringify(ctx.redTeam.passRateByCategory),
      remediation: "Harden the failing categories; rerun the battery."
    });
  }

  if (runs.length === 0) {
    findings.push({
      id: "M-2.1",
      severity: "critical",
      control: "NIST-AI-RMF:MEASURE-2.1",
      summary: "No eval runs recorded for the reporting period.",
      remediation: "Emit traces for every deployment-relevant evaluation."
    });
  }

  if (!ctx.judgeCalibration || ctx.judgeCalibration.length === 0) {
    findings.push({
      id: "M-2.11",
      severity: "medium",
      control: "NIST-AI-RMF:MEASURE-2.11",
      summary: "No judge-vs-human calibration recorded.",
      remediation: "Build a human golden set; run calibrateJudge() before trusting LLM judge scores."
    });
  } else {
    // Entries whose Pearson value is not finite are not counted as weak.
    const weak = ctx.judgeCalibration.filter((c) => Number.isFinite(c.pearson) && c.pearson < 0.6);
    if (weak.length > 0) {
      findings.push({
        id: "M-2.11-weak",
        severity: "medium",
        control: "NIST-AI-RMF:MEASURE-2.11",
        summary: `${weak.length} judge(s) show weak agreement with humans (Pearson < 0.6).`,
        remediation: "Retrain or replace the underperforming judges."
      });
    }
  }

  if (!ctx.outcomeStore) {
    findings.push({
      id: "MN-1.1",
      severity: "medium",
      control: "NIST-AI-RMF:MANAGE-1.1",
      summary: "No deployment outcomes captured \u2014 meta-eval correlation cannot be computed.",
      remediation: "Attach an OutcomeStore and ingest production outcome metrics."
    });
  } else if (outcomes.length === 0) {
    findings.push({
      id: "MN-1.1-empty",
      severity: "medium",
      control: "NIST-AI-RMF:MANAGE-1.1",
      summary: "OutcomeStore present but no outcomes captured for the period."
    });
  }

  // Strict check: exactly 64 lowercase hex characters.
  const hashChecks = ctx.datasets.map((manifest) => ({
    name: manifest.name,
    ok: /^[0-9a-f]{64}$/.test(manifest.contentHash)
  }));

  const payload = {
    controlsEvaluated: [
      "GOVERN-1.1",
      "GOVERN-1.3",
      "MEASURE-2.1",
      "MEASURE-2.6",
      "MEASURE-2.11",
      "MANAGE-1.1"
    ],
    runCount: runs.length,
    redTeamPassRate: ctx.redTeam?.overallPassRate ?? null,
    datasetHashChecks: hashChecks
  };

  return {
    framework: "NIST-AI-RMF",
    version: "1.0.0",
    context: {
      organization: ctx.organization,
      systemName: ctx.systemName,
      periodStart: ctx.periodStart,
      periodEnd: ctx.periodEnd,
      owner: ctx.owner
    },
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: new Date().toISOString()
  };
}
|
|
6785
|
+
|
|
6786
|
+
// src/governance/soc2.ts
|
|
6787
|
+
/**
 * Build a SOC 2 (CC7.x) governance report from trace-store telemetry.
 *
 * Checks performed, in the order their findings are emitted:
 *  - CC7.1: share of runs with `outcome.pass === false` exceeds 20%, and
 *    whether any runs exist at all for the period;
 *  - CC7.2: more than 5% of runs (and at least 3) have status "aborted";
 *  - CC7.3: any "policy_violation" or "error" events were recorded;
 *  - CC7.4: runs carry a `codeSha` and a `promptSha`.
 *
 * @param {object} ctx - Governance context. Fields read here:
 *   `traceStore.listRuns()`, `traceStore.events()`, `periodStart`,
 *   `periodEnd`, `organization`, `systemName`, `owner`.
 * @returns {Promise<object>} Report with `framework`, `version`, `context`,
 *   `summary`, `findings`, `payload` and `generatedAt`.
 */
async function soc2Report(ctx) {
  const start = Date.parse(ctx.periodStart);
  const end = Date.parse(ctx.periodEnd);

  // The three trace-store queries are independent of each other, so run them
  // concurrently rather than one after another.
  const [runs, incidentEvents, errorEvents] = await Promise.all([
    ctx.traceStore.listRuns({ since: start, until: end }),
    ctx.traceStore.events({ kind: "policy_violation", since: start, until: end }),
    ctx.traceStore.events({ kind: "error", since: start, until: end })
  ]);

  const findings = [];

  // null (rather than 0) when there are no runs, so "no data" stays
  // distinguishable from "no failures" in the payload.
  const failureRate = runs.length > 0 ? runs.filter((r) => r.outcome?.pass === false).length / runs.length : null;
  if (failureRate !== null && failureRate > 0.2) {
    findings.push({
      id: "CC7.1-fail-rate",
      severity: "medium",
      control: "SOC2:CC7.1",
      summary: `System failure rate ${(failureRate * 100).toFixed(1)}% over the period exceeds 20%.`,
      remediation: "Investigate failure clusters (failureClusterView) + prioritize remediation."
    });
  }
  if (runs.length === 0) {
    findings.push({
      id: "CC7.1-coverage",
      severity: "high",
      control: "SOC2:CC7.1",
      summary: "No telemetry runs recorded for the period \u2014 monitoring regime is incomplete."
    });
  }

  const aborted = runs.filter((r) => r.status === "aborted");
  // Both a relative (>5% of runs) and an absolute (>=3) threshold must hold.
  if (aborted.length > runs.length * 0.05 && aborted.length >= 3) {
    findings.push({
      id: "CC7.2-abort",
      severity: "medium",
      control: "SOC2:CC7.2",
      summary: `${aborted.length} run(s) aborted \u2014 investigate pattern.`,
      remediation: "Use the bisector + failureClusterView to localize the trigger."
    });
  }

  const totalIncidents = incidentEvents.length + errorEvents.length;
  if (totalIncidents > 0) {
    findings.push({
      id: "CC7.3-resolution",
      severity: "low",
      control: "SOC2:CC7.3",
      summary: `${totalIncidents} incident-class event(s) recorded; resolution tracking is informal.`,
      remediation: 'Emit a resolution event (kind="log" with payload.resolves=<eventId>) per remediated incident.'
    });
  }

  // Distinct non-empty identifiers seen across the period's runs.
  const modelFingerprints = new Set(runs.map((r) => r.modelFingerprint).filter(Boolean));
  const promptHashes = new Set(runs.map((r) => r.promptSha).filter(Boolean));
  const codeSha = new Set(runs.map((r) => r.codeSha).filter(Boolean));
  if (codeSha.size === 0) {
    findings.push({
      id: "CC7.4-code",
      severity: "high",
      control: "SOC2:CC7.4",
      summary: "No codeSha recorded on runs \u2014 cannot attribute scores to a specific release.",
      remediation: "Populate Run.codeSha with the git SHA of the system at run time."
    });
  }
  if (promptHashes.size === 0) {
    findings.push({
      id: "CC7.4-prompt",
      severity: "medium",
      control: "SOC2:CC7.4",
      summary: "No promptSha recorded \u2014 prompt changes are untracked."
    });
  }

  const payload = {
    controls: ["CC7.1", "CC7.2", "CC7.3", "CC7.4"],
    runCount: runs.length,
    failureRate,
    abortedCount: aborted.length,
    incidentEventCount: totalIncidents,
    distinctReleases: {
      codeShas: codeSha.size,
      promptHashes: promptHashes.size,
      modelFingerprints: modelFingerprints.size
    }
  };

  return {
    framework: "SOC2",
    version: "2017-Common-Criteria",
    context: {
      organization: ctx.organization,
      systemName: ctx.systemName,
      periodStart: ctx.periodStart,
      periodEnd: ctx.periodEnd,
      owner: ctx.owner
    },
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: new Date().toISOString()
  };
}
|
|
6880
|
+
|
|
6881
|
+
// src/governance/eu-ai-act.ts
|
|
6882
|
+
/**
 * Map a set of boolean use-case signals to a risk class.
 *
 * Tiers are evaluated from most to least severe and the first tier with any
 * truthy signal wins; "minimal" is returned when no signal is set.
 *
 * @param {object} signals - Flags read: `biometricPublic`, `socialScoring`,
 *   `subliminal`, `annexIII`, `chatbot`, `generatesSyntheticMedia`.
 * @returns {"unacceptable"|"high"|"limited"|"minimal"} The risk class.
 */
function classifyEuAiRisk(signals) {
  const tiers = [
    ["unacceptable", ["biometricPublic", "socialScoring", "subliminal"]],
    ["high", ["annexIII"]],
    ["limited", ["chatbot", "generatesSyntheticMedia"]]
  ];
  for (const [riskClass, flags] of tiers) {
    if (flags.some((flag) => signals[flag])) {
      return riskClass;
    }
  }
  return "minimal";
}
|
|
6888
|
+
/**
 * Build an EU AI Act governance report for a system.
 *
 * The risk class is derived from `signals` via `classifyEuAiRisk`, and the
 * checks depend on it:
 *  - "unacceptable": emits a critical Article-5 finding;
 *  - "high": checks red-team evidence (Art. 9), datasets (Art. 10), recorded
 *    runs (Art. 11), transparency signals (Art. 13), owner email (Art. 14)
 *    and the presence of an outcome store (Art. 15);
 *  - "limited": emits an informational transparency finding;
 *  - "minimal": emits no findings.
 *
 * @param {object} ctx - Governance context. Fields read here: `redTeam`,
 *   `datasets`, `traceStore.listRuns()` (high-risk only), `owner`,
 *   `outcomeStore`, `periodStart`, `periodEnd`, `organization`, `systemName`.
 * @param {object} signals - Use-case flags passed to `classifyEuAiRisk`;
 *   `chatbot` and `generatesSyntheticMedia` are also read directly.
 * @returns {Promise<object>} Report with `framework`, `version`, `context`,
 *   `summary`, `findings`, `payload` and `generatedAt`.
 */
async function euAiActReport(ctx, signals) {
  const riskClass = classifyEuAiRisk(signals);
  const findings = [];

  if (riskClass === "unacceptable") {
    findings.push({
      id: "EU-ART-5",
      severity: "critical",
      control: "EU-AI-ACT:Article-5",
      summary: "Use case matches a prohibited practice under Article 5.",
      remediation: "Discontinue or substantially redesign the use case."
    });
  }

  if (riskClass === "high") {
    if (!ctx.redTeam) {
      findings.push({
        id: "EU-ART-9",
        severity: "high",
        control: "EU-AI-ACT:Article-9",
        summary: "High-risk system lacks documented adversarial-testing evidence (Art. 9 risk mgmt).",
        remediation: "Run redTeamDataset() + attach the report."
      });
    }
    if (ctx.datasets.length === 0) {
      findings.push({
        id: "EU-ART-10",
        severity: "high",
        control: "EU-AI-ACT:Article-10",
        summary: "No training/eval datasets recorded with provenance (Art. 10)."
      });
    }
    const runs = await ctx.traceStore.listRuns({
      since: Date.parse(ctx.periodStart),
      until: Date.parse(ctx.periodEnd)
    });
    if (runs.length === 0) {
      findings.push({
        id: "EU-ART-11",
        severity: "high",
        control: "EU-AI-ACT:Article-11",
        summary: "No eval runs recorded (Art. 11 technical documentation)."
      });
    }
    if (signals.chatbot || signals.generatesSyntheticMedia) {
      findings.push({
        id: "EU-ART-13",
        severity: "info",
        control: "EU-AI-ACT:Article-13",
        summary: "Chatbot/synthetic-media transparency obligations apply; verify user-facing disclosures."
      });
    }
    if (!ctx.owner?.email) {
      findings.push({
        id: "EU-ART-14",
        severity: "high",
        control: "EU-AI-ACT:Article-14",
        summary: "No designated human overseer (Art. 14).",
        remediation: "Populate GovernanceContext.owner with the responsible individual."
      });
    }
    if (!ctx.outcomeStore) {
      findings.push({
        id: "EU-ART-15",
        severity: "medium",
        control: "EU-AI-ACT:Article-15",
        summary: "No post-deployment outcome measurement; accuracy + robustness are un-attested.",
        remediation: "Attach an OutcomeStore + run correlationStudy() over the reporting period."
      });
    }
  }

  if (riskClass === "limited") {
    // NOTE(review): the report version below is "Regulation-2024-1689"; in
    // that final text the transparency obligations appear to be Article 50
    // (Article 52 was the draft numbering) — confirm before renaming, since
    // the id/control strings may be consumed downstream.
    findings.push({
      id: "EU-ART-52",
      severity: "info",
      control: "EU-AI-ACT:Article-52",
      summary: "Transparency obligations apply: disclose AI nature + synthetic content labeling.",
      remediation: "Ensure user-facing surfaces label AI-generated content."
    });
  }

  // Articles actually examined for each risk class. The "unacceptable" class
  // emits an Article-5 finding above, so it must list Article 5 here rather
  // than claiming no article was reviewed.
  let articlesReviewed;
  if (riskClass === "high") {
    articlesReviewed = ["5", "9", "10", "11", "13", "14", "15"];
  } else if (riskClass === "limited") {
    articlesReviewed = ["52"];
  } else if (riskClass === "unacceptable") {
    articlesReviewed = ["5"];
  } else {
    articlesReviewed = ["none"];
  }

  const payload = {
    riskClass,
    signals,
    articlesReviewed
  };

  return {
    framework: "EU-AI-ACT",
    version: "Regulation-2024-1689",
    context: {
      organization: ctx.organization,
      systemName: ctx.systemName,
      periodStart: ctx.periodStart,
      periodEnd: ctx.periodEnd,
      owner: ctx.owner
    },
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: new Date().toISOString()
  };
}
|
|
5448
6988
|
export {
|
|
5449
6989
|
AgentDriver,
|
|
5450
6990
|
BenchmarkRunner,
|
|
@@ -5463,15 +7003,18 @@ export {
|
|
|
5463
7003
|
DualAgentBench,
|
|
5464
7004
|
ExperimentTracker,
|
|
5465
7005
|
FAILURE_CLASSES,
|
|
7006
|
+
FileSystemOutcomeStore,
|
|
5466
7007
|
FileSystemTraceStore,
|
|
5467
7008
|
HoldoutAuditor,
|
|
5468
7009
|
HoldoutLockedError,
|
|
5469
7010
|
InMemoryExperimentStore,
|
|
7011
|
+
InMemoryOutcomeStore,
|
|
5470
7012
|
InMemoryTraceStore,
|
|
5471
7013
|
InMemoryWorkspaceInspector,
|
|
5472
7014
|
MODEL_PRICING,
|
|
5473
7015
|
MetricsCollector,
|
|
5474
7016
|
OTEL_AGENT_EVAL_SCOPE,
|
|
7017
|
+
PrmGrader,
|
|
5475
7018
|
ProductClient,
|
|
5476
7019
|
ProjectRegistry,
|
|
5477
7020
|
PromptOptimizer,
|
|
@@ -5488,20 +7031,26 @@ export {
|
|
|
5488
7031
|
analyzeAntiSlop,
|
|
5489
7032
|
analyzeSeries,
|
|
5490
7033
|
argHash,
|
|
7034
|
+
attributeCounterfactuals,
|
|
5491
7035
|
benjaminiHochberg,
|
|
7036
|
+
bisect,
|
|
5492
7037
|
bonferroni,
|
|
5493
7038
|
budgetBreachView,
|
|
5494
7039
|
buildTrajectory,
|
|
5495
7040
|
byteLengthRange,
|
|
5496
7041
|
calibrateJudge,
|
|
7042
|
+
calibrationCurve,
|
|
5497
7043
|
canaryLeakView,
|
|
7044
|
+
causalAttribution,
|
|
5498
7045
|
checkCanaries,
|
|
5499
7046
|
checkSlos,
|
|
7047
|
+
classifyEuAiRisk,
|
|
5500
7048
|
classifyFailure,
|
|
5501
7049
|
codeExecutionJudge,
|
|
5502
7050
|
cohensD,
|
|
5503
7051
|
coherenceJudge,
|
|
5504
7052
|
collectionPreserved,
|
|
7053
|
+
commitBisect,
|
|
5505
7054
|
compareToBaseline,
|
|
5506
7055
|
composeParsers,
|
|
5507
7056
|
composeValidators,
|
|
@@ -5509,18 +7058,24 @@ export {
|
|
|
5509
7058
|
confidenceInterval,
|
|
5510
7059
|
containsAll,
|
|
5511
7060
|
correlateLayers,
|
|
7061
|
+
correlationStudy,
|
|
5512
7062
|
createAntiSlopJudge,
|
|
5513
7063
|
createCustomJudge,
|
|
5514
7064
|
createDomainExpertJudge,
|
|
7065
|
+
crossTraceDiff,
|
|
5515
7066
|
defaultJudges,
|
|
5516
7067
|
dominates,
|
|
5517
7068
|
estimateCost,
|
|
5518
7069
|
estimateTokens,
|
|
7070
|
+
euAiActReport,
|
|
5519
7071
|
evaluateContract,
|
|
7072
|
+
evaluateHypothesis,
|
|
5520
7073
|
evaluateOracles,
|
|
5521
7074
|
executeScenario,
|
|
5522
7075
|
expectAgent,
|
|
7076
|
+
exportRewardModel,
|
|
5523
7077
|
exportRunAsOtlp,
|
|
7078
|
+
exportTrainingData,
|
|
5524
7079
|
failureClusterView,
|
|
5525
7080
|
fileContains,
|
|
5526
7081
|
fileExists,
|
|
@@ -5534,6 +7089,7 @@ export {
|
|
|
5534
7089
|
iqr,
|
|
5535
7090
|
isJudgeSpan,
|
|
5536
7091
|
isLlmSpan,
|
|
7092
|
+
isPrmVerdict,
|
|
5537
7093
|
isRetrievalSpan,
|
|
5538
7094
|
isSandboxSpan,
|
|
5539
7095
|
isToolSpan,
|
|
@@ -5545,10 +7101,14 @@ export {
|
|
|
5545
7101
|
keyPreserved,
|
|
5546
7102
|
llmSpanFromProvider,
|
|
5547
7103
|
llmSpans,
|
|
7104
|
+
loadScorerFromGrader,
|
|
5548
7105
|
lowercaseMutator,
|
|
5549
7106
|
mannWhitneyU,
|
|
7107
|
+
nistAiRmfReport,
|
|
7108
|
+
nonRefusalRubric,
|
|
5550
7109
|
normalizeScores,
|
|
5551
7110
|
notBlocked,
|
|
7111
|
+
outputLengthRubric,
|
|
5552
7112
|
pairedTTest,
|
|
5553
7113
|
paraphraseRobustness,
|
|
5554
7114
|
paretoFrontier,
|
|
@@ -5557,6 +7117,10 @@ export {
|
|
|
5557
7117
|
politenessPrefixMutator,
|
|
5558
7118
|
positionalBias,
|
|
5559
7119
|
printDriverSummary,
|
|
7120
|
+
prmBestOfN,
|
|
7121
|
+
prmEnsembleBestOfN,
|
|
7122
|
+
promptBisect,
|
|
7123
|
+
proposeSynthesisTargets,
|
|
5560
7124
|
pytestTestParser,
|
|
5561
7125
|
redTeamDataset,
|
|
5562
7126
|
redTeamReport,
|
|
@@ -5565,16 +7129,20 @@ export {
|
|
|
5565
7129
|
regexMatch,
|
|
5566
7130
|
regexMatches,
|
|
5567
7131
|
regressionView,
|
|
7132
|
+
renderMarkdown,
|
|
5568
7133
|
renderMarkdownReport,
|
|
7134
|
+
replayScorerOverCorpus,
|
|
5569
7135
|
replayTraceThroughJudge,
|
|
5570
7136
|
requiredSampleSize,
|
|
5571
7137
|
resumeBuilderSession,
|
|
5572
7138
|
rowCount,
|
|
5573
7139
|
rowWhere,
|
|
5574
7140
|
runAssertions,
|
|
7141
|
+
runCounterfactual,
|
|
5575
7142
|
runE2EWorkflow,
|
|
5576
7143
|
runExpectations,
|
|
5577
7144
|
runFailureClass,
|
|
7145
|
+
runSelfPlay,
|
|
5578
7146
|
runTestGradedScenario,
|
|
5579
7147
|
runsForScenario,
|
|
5580
7148
|
scoreAllProjects,
|
|
@@ -5583,17 +7151,25 @@ export {
|
|
|
5583
7151
|
scoreRedTeamOutput,
|
|
5584
7152
|
selfPreference,
|
|
5585
7153
|
sentenceReorderMutator,
|
|
7154
|
+
signManifest,
|
|
7155
|
+
soc2Report,
|
|
5586
7156
|
statusAdvanced,
|
|
5587
7157
|
stuckLoopView,
|
|
7158
|
+
summarize,
|
|
5588
7159
|
textInSnapshot,
|
|
5589
7160
|
toLangfuseEnvelope,
|
|
7161
|
+
toNdjson,
|
|
5590
7162
|
toPrometheusText,
|
|
7163
|
+
toolIntentAlignmentRubric,
|
|
5591
7164
|
toolNamesForRun,
|
|
7165
|
+
toolNonRedundantRubric,
|
|
5592
7166
|
toolSpans,
|
|
7167
|
+
toolSuccessRubric,
|
|
5593
7168
|
toolWasteView,
|
|
5594
7169
|
typoMutator,
|
|
5595
7170
|
urlContains,
|
|
5596
7171
|
verbosityBias,
|
|
7172
|
+
verifyManifest,
|
|
5597
7173
|
visualDiff,
|
|
5598
7174
|
vitestTestParser,
|
|
5599
7175
|
weightedMean,
|