@tangle-network/agent-eval 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +609 -1
- package/dist/index.js +1011 -1
- package/dist/index.js.map +1 -1
- package/package.json +2 -1
package/dist/index.js
CHANGED
|
@@ -506,6 +506,123 @@ function partialCredit(current, target) {
|
|
|
506
506
|
if (target <= 0) return 1;
|
|
507
507
|
return Math.min(1, Math.max(0, current / target));
|
|
508
508
|
}
|
|
509
|
+
/**
 * Two-sided paired Student's t-test on the per-pair differences
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, same length as `before`.
 * @returns {{ t: number, df: number, p: number }} Test statistic, degrees of
 *   freedom (`n - 1`) and two-sided p-value. Fewer than two pairs yields the
 *   neutral result `{ t: 0, df: 0, p: 1 }`.
 * @throws {Error} When the two samples have different lengths.
 */
function pairedTTest(before, after) {
  if (before.length !== after.length) {
    throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const n = before.length;
  if (n < 2) return { t: 0, df: 0, p: 1 };
  const df = n - 1;
  const diffs = before.map((b, i) => after[i] - b);
  const mean = diffs.reduce((a, b) => a + b, 0) / n;
  const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / df;
  const se = Math.sqrt(variance / n);
  if (se === 0) {
    // Every difference is identical, so the statistic degenerates.
    if (mean === 0) return { t: 0, df, p: 1 };
    // Keep the sign of the shift: a uniform decrease is -Infinity, not +Infinity.
    return { t: mean > 0 ? Infinity : -Infinity, df, p: 0 };
  }
  const t = mean / se;
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
  return { t, df, p };
}
|
|
525
|
+
/**
 * Two-sided Wilcoxon signed-rank test using the normal approximation.
 *
 * Zero differences are dropped before ranking. Tied magnitudes share the
 * average of the ranks they occupy. No tie or continuity correction is applied
 * to the variance.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, same length as `before`.
 * @returns {{ w: number, p: number }} `w` is the sum of ranks of the positive
 *   differences and `p` the two-sided p-value. With fewer than six non-zero
 *   differences the function returns `{ w: 0, p: 1 }`.
 * @throws {Error} When the two samples have different lengths.
 */
function wilcoxonSignedRank(before, after) {
  if (before.length !== after.length) {
    throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const diffs = before.map((b, idx) => after[idx] - b).filter((d) => d !== 0);
  const n = diffs.length;
  if (n < 6) return { w: 0, p: 1 };
  const byMagnitude = diffs
    .map((d, idx) => ({ abs: Math.abs(d), idx }))
    .sort((a, b) => a.abs - b.abs);
  const ranks = new Array(n);
  let start = 0;
  while (start < n) {
    // Begin the scan one past `start` so every pass consumes at least one
    // element. A NaN magnitude is never equal to itself and would otherwise
    // stall the loop forever.
    let end = start + 1;
    while (end < n && byMagnitude[end].abs === byMagnitude[start].abs) end++;
    // Ranks start+1 .. end are shared by this tie group; assign their average.
    const avg = (start + 1 + end) / 2;
    for (let k = start; k < end; k++) ranks[byMagnitude[k].idx] = avg;
    start = end;
  }
  let wPlus = 0;
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
  const mean = n * (n + 1) / 4;
  const variance = n * (n + 1) * (2 * n + 1) / 24;
  const z = (wPlus - mean) / Math.sqrt(variance);
  const p = 2 * (1 - normalCdf(Math.abs(z)));
  return { w: wPlus, p };
}
|
|
550
|
+
/**
 * Cohen's d effect size between two independent samples, using the pooled
 * sample standard deviation. Positive values mean `b` has the larger mean.
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {number} `(mean(b) - mean(a)) / pooledSd`, or 0 when either sample
 *   has fewer than two values or the pooled deviation is zero.
 */
function cohensD(a, b) {
  const sizeA = a.length;
  const sizeB = b.length;
  if (sizeA < 2 || sizeB < 2) return 0;

  // Mean and unbiased (n - 1) variance of one sample.
  const describe = (sample) => {
    let total = 0;
    for (const value of sample) total += value;
    const mean = total / sample.length;
    let squares = 0;
    for (const value of sample) squares += (value - mean) ** 2;
    return { mean, variance: squares / (sample.length - 1) };
  };

  const statsA = describe(a);
  const statsB = describe(b);
  const pooled = Math.sqrt(
    ((sizeA - 1) * statsA.variance + (sizeB - 1) * statsB.variance) / (sizeA + sizeB - 2)
  );
  if (pooled === 0) return 0;
  return (statsB.mean - statsA.mean) / pooled;
}
|
|
562
|
+
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * @param {number} t - Point at which to evaluate the CDF.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t). Returns 0.5 when `df <= 0`, and defers to
 *   `normalCdf` when `df > 100`.
 */
function studentTCdf(t, df) {
  if (df <= 0) return 0.5;
  if (df > 100) return normalCdf(t);
  // Half of I_x(df/2, 1/2) with x = df / (df + t^2) is the one-sided tail mass.
  const tail = 0.5 * incompleteBeta(df / (df + t * t), df / 2, 0.5);
  if (t >= 0) return 1 - tail;
  return tail;
}
|
|
571
|
+
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with the modified Lentz continued fraction. The fraction only
 * converges quickly for x < (a + 1) / (a + b + 2); above that threshold the
 * symmetry relation I_x(a, b) = 1 - I_{1-x}(b, a) is used so the fraction is
 * always evaluated in its fast-converging region.
 *
 * @param {number} x - Upper integration limit; values outside (0, 1) clamp to 0 or 1.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} I_x(a, b).
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  const maxIter = 200;
  const eps = 3e-7;
  const tiny = 1e-30;
  // Keep denominators away from zero, as the modified Lentz method requires.
  const nudge = (v) => (Math.abs(v) < tiny ? tiny : v);

  // Continued-fraction factor of I_xx(aa, bb), without the power prefactor.
  const continuedFraction = (xx, aa, bb) => {
    let c = 1;
    let d = 1 / nudge(1 - (aa + bb) * xx / (aa + 1));
    let f = d;
    for (let m = 1; m <= maxIter; m++) {
      const m2 = 2 * m;
      // Even step of the recurrence.
      let num = m * (bb - m) * xx / ((aa + m2 - 1) * (aa + m2));
      d = 1 / nudge(1 + num * d);
      c = nudge(1 + num / c);
      f *= d * c;
      // Odd step of the recurrence.
      num = -((aa + m) * (aa + bb + m) * xx) / ((aa + m2) * (aa + m2 + 1));
      d = 1 / nudge(1 + num * d);
      c = nudge(1 + num / c);
      const delta = d * c;
      f *= delta;
      if (Math.abs(delta - 1) < eps) break;
    }
    return f;
  };

  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  // x^a (1 - x)^b / B(a, b); unchanged by swapping (x, a, b) with (1 - x, b, a).
  const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta);
  if (x < (a + 1) / (a + b + 2)) {
    return front * continuedFraction(x, a, b) / a;
  }
  return 1 - front * continuedFraction(1 - x, b, a) / b;
}
|
|
604
|
+
/**
 * Natural logarithm of the gamma function via the Lanczos approximation
 * (g = 7, nine coefficients). Arguments below 0.5 are handled with the
 * reflection formula.
 *
 * @param {number} z - Argument of the gamma function.
 * @returns {number} ln(Gamma(z)).
 */
function lnGamma(z) {
  if (z < 0.5) {
    // Reflection: Gamma(z) * Gamma(1 - z) = pi / sin(pi * z).
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }
  const g = 7;
  const lanczos = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9984369578019572e-21,
    15056327351493116e-23
  ];
  const shifted = z - 1;
  let series = lanczos[0];
  for (const [i, coef] of lanczos.entries()) {
    if (i === 0) continue;
    series += coef / (shifted + i);
  }
  const t = shifted + g + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
|
|
509
626
|
function normalCdf(x) {
|
|
510
627
|
const a1 = 0.254829592;
|
|
511
628
|
const a2 = -0.284496736;
|
|
@@ -1261,33 +1378,926 @@ function printDriverSummary(results) {
|
|
|
1261
1378
|
const completedCount = results.filter((r) => r.completed).length;
|
|
1262
1379
|
console.log(`${completedCount}/${results.length} personas completed`);
|
|
1263
1380
|
}
|
|
1381
|
+
|
|
1382
|
+
// src/prompt-registry.ts
|
|
1383
|
+
/**
 * In-memory registry of versioned prompts. Each entry is a handle
 * `{ id, version, hash, content }` stored under the key `${id}@${version}`.
 */
var PromptRegistry = class {
  /** Registered handles keyed by `${id}@${version}`. */
  entries = /* @__PURE__ */ new Map();
  /**
   * Register a prompt. Versions are immutable: registering an existing
   * id+version with different content throws, while registering it again with
   * identical content returns the handle that is already stored.
   */
  async register(id, version, content) {
    validateId(id);
    validateVersion(version);
    const key = makeKey(id, version);
    const hash = await hashContent(content);
    const prior = this.entries.get(key);
    if (!prior) {
      const created = { id, version, hash, content };
      this.entries.set(key, created);
      return created;
    }
    if (prior.hash === hash) return prior;
    throw new Error(
      `Prompt ${key} already registered with a different hash (${prior.hash} vs ${hash}). Bump the version.`
    );
  }
  /** Fetch a registered prompt; throws when the id+version pair is unknown. */
  get(id, version) {
    const key = makeKey(id, version);
    const found = this.entries.get(key);
    if (found) return found;
    throw new Error(`Prompt ${key} not registered`);
  }
  /** All versions of one id, ordered by descending `localeCompare` on the version string. */
  listVersions(id) {
    const sameId = [];
    for (const handle of this.entries.values()) {
      if (handle.id === id) sameId.push(handle);
    }
    return sameId.sort((left, right) => right.version.localeCompare(left.version));
  }
  /** Every registered handle, as a fresh array. */
  list() {
    return Array.from(this.entries.values());
  }
  /** Compare `expectedHash` with the stored hash; `null` when the prompt is unknown. */
  verifyHash(id, version, expectedHash) {
    const handle = this.entries.get(makeKey(id, version));
    return handle ? handle.hash === expectedHash : null;
  }
};
|
|
1431
|
+
/**
 * Short content fingerprint: the first 12 hex characters (48 bits) of the
 * SHA-256 digest of the UTF-8 encoded content.
 *
 * @param {string} content - Text to hash.
 * @returns {Promise<string>} Twelve lowercase hex characters.
 */
async function hashContent(content) {
  const utf8 = new TextEncoder().encode(content);
  const digest = await crypto.subtle.digest("SHA-256", utf8);
  // Six bytes render as exactly the twelve leading hex characters.
  let hex = "";
  for (const byte of new Uint8Array(digest, 0, 6)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
1437
|
+
/** Build the registry map key `id@version` for a prompt. */
function makeKey(id, version) {
  return "".concat(id, "@", version);
}
|
|
1440
|
+
// Prompt ids start with a letter, followed by letters, digits, ".", "_" or "-".
var ID_RE = /^[a-z][a-z0-9._-]*$/i;
/**
 * Throw unless `id` matches `ID_RE`.
 *
 * @param {string} id - Candidate prompt id.
 * @throws {Error} When the id does not match the pattern.
 */
function validateId(id) {
  const wellFormed = ID_RE.test(id);
  if (wellFormed) return;
  throw new Error(`Invalid prompt id "${id}": must match ${ID_RE}`);
}
|
|
1446
|
+
/**
 * Throw unless `version` is a string of 1 to 64 characters.
 *
 * Non-string values are rejected explicitly: a value without a `length`
 * property (for example the number 1) would otherwise slip past the length
 * check and later break version sorting, which calls `localeCompare` on it.
 *
 * @param {string} version - Candidate version label.
 * @throws {Error} When the version is not a string or its length is outside 1..64.
 */
function validateVersion(version) {
  const ok = typeof version === "string" && version.length >= 1 && version.length <= 64;
  if (!ok) {
    throw new Error(`Invalid version "${String(version)}": must be 1\u201364 chars`);
  }
}
|
|
1451
|
+
|
|
1452
|
+
// src/trace-store.ts
|
|
1453
|
+
/**
 * Trace store that keeps every recorded trace in an in-process array.
 */
var MemoryTraceStore = class {
  /** Recorded traces in insertion order. */
  traces = [];
  /** Append a trace. The object is stored by reference, not copied. */
  async record(trace) {
    this.traces.push(trace);
  }
  /**
   * Return the traces matching `query`, truncated to `query.limit` when set.
   * A missing query matches everything, mirroring `count()`.
   */
  async query(query) {
    const q = query ?? {};
    const hits = this.filter(q);
    return q.limit !== void 0 ? hits.slice(0, q.limit) : hits;
  }
  /** Number of traces matching `query`, or the total when no query is given. */
  async count(query) {
    return query ? this.filter(query).length : this.traces.length;
  }
  /** Clear the store — test helper. */
  reset() {
    this.traces = [];
  }
  /**
   * Apply the query predicates. String fields are only compared when the
   * query value is truthy. With `sinceMs`, a trace is dropped only when its
   * timestamp parses to a time earlier than `sinceMs`; traces whose timestamp
   * cannot be parsed are kept.
   */
  filter(query) {
    const q = query ?? {};
    return this.traces.filter((t) => {
      if (q.runId && t.runId !== q.runId) return false;
      if (q.scenarioId && t.scenarioId !== q.scenarioId) return false;
      if (q.role && t.role !== q.role) return false;
      if (q.model && t.model !== q.model) return false;
      if (q.sinceMs !== void 0) {
        const ts = Date.parse(t.timestamp);
        if (Number.isFinite(ts) && ts < q.sinceMs) return false;
      }
      return true;
    });
  }
};
|
|
1484
|
+
/**
 * Trace store that appends newline-delimited JSON to segment files named
 * `traces-NNN.ndjson` inside `opts.dir`. A new segment is started once the
 * newest one reaches `opts.rolloverBytes` (32 MiB unless overridden). All file
 * operations (`append`, `read`, `list`, `stat`, `mkdir`) can be replaced
 * through the constructor options.
 */
var FileSystemTraceStore = class {
  opts;
  constructor(opts) {
    this.opts = {
      rolloverBytes: 32 * 1024 * 1024,
      append: defaultAppend,
      read: defaultRead,
      list: defaultList,
      stat: defaultStat,
      mkdir: defaultMkdir,
      ...opts
    };
  }
  /** Serialize the trace as one JSON line and append it to the current segment. */
  async record(trace) {
    const file = await this.currentSegment();
    await this.opts.append(file, JSON.stringify(trace) + "\n");
  }
  /**
   * Scan the segments in sorted order and return the traces matching `query`,
   * stopping once `query.limit` results are collected. A missing query matches
   * everything. Unreadable segments, lines that are not valid JSON and lines
   * that do not hold a JSON object are skipped.
   */
  async query(query) {
    const q = query ?? {};
    const out = [];
    if (q.limit !== void 0 && q.limit <= 0) return out;
    const files = await this.segments();
    for (const file of files) {
      const contents = await this.opts.read(file).catch(() => "");
      for (const line of contents.split("\n")) {
        if (!line) continue;
        let t;
        try {
          t = JSON.parse(line);
        } catch {
          // Best effort: a corrupt or half-written line must not hide the rest.
          continue;
        }
        if (t === null || typeof t !== "object") continue;
        if (!matches(t, q)) continue;
        out.push(t);
        if (q.limit !== void 0 && out.length >= q.limit) return out;
      }
    }
    return out;
  }
  /**
   * Count matching traces. Without a query this counts the non-empty lines of
   * every segment without parsing them.
   */
  async count(query) {
    if (!query) {
      const files = await this.segments();
      let total = 0;
      for (const file of files) {
        const contents = await this.opts.read(file).catch(() => "");
        total += contents.split("\n").filter(Boolean).length;
      }
      return total;
    }
    return (await this.query(query)).length;
  }
  /** Sorted list of the `.ndjson` entries in the directory; empty when listing fails. */
  async segments() {
    try {
      const all = await this.opts.list(this.opts.dir);
      return all.filter((f) => f.endsWith(".ndjson")).sort();
    } catch {
      return [];
    }
  }
  /**
   * Resolve the segment to append to: the newest one while it is below the
   * rollover size (or when its size cannot be read), otherwise a new segment
   * numbered by the current segment count.
   */
  async currentSegment() {
    await this.opts.mkdir(this.opts.dir);
    const existing = await this.segments();
    if (existing.length === 0) return pathJoin(this.opts.dir, `traces-000.ndjson`);
    const latest = existing[existing.length - 1];
    try {
      const s = await this.opts.stat(latest);
      if (s.size < this.opts.rolloverBytes) return latest;
    } catch {
      return latest;
    }
    const n = existing.length;
    return pathJoin(this.opts.dir, `traces-${String(n).padStart(3, "0")}.ndjson`);
  }
};
|
|
1554
|
+
/**
 * Decide whether trace `t` satisfies `query`.
 *
 * `runId`, `scenarioId`, `role` and `model` are compared for strict equality,
 * but only when the query value is truthy. With `sinceMs`, a trace is rejected
 * only when its timestamp parses to a time earlier than `sinceMs`; traces
 * whose timestamp cannot be parsed are accepted.
 *
 * @returns {boolean} True when the trace passes every active predicate.
 */
function matches(t, query) {
  for (const field of ["runId", "scenarioId", "role", "model"]) {
    const wanted = query[field];
    if (wanted && t[field] !== wanted) return false;
  }
  if (query.sinceMs === void 0) return true;
  const ts = Date.parse(t.timestamp);
  const tooOld = Number.isFinite(ts) && ts < query.sinceMs;
  return !tooOld;
}
|
|
1565
|
+
/** Join a directory and a file name with exactly one "/" between them. */
function pathJoin(dir, file) {
  const separator = dir.endsWith("/") ? "" : "/";
  return `${dir}${separator}${file}`;
}
|
|
1568
|
+
/** Default `append` operation: append `data` to the file at `path` via `fs/promises`. */
async function defaultAppend(path, data) {
  const { appendFile } = await import("fs/promises");
  await appendFile(path, data);
}
|
|
1572
|
+
/** Default `read` operation: resolve with the UTF-8 text of the file at `path`. */
async function defaultRead(path) {
  const { readFile } = await import("fs/promises");
  return readFile(path, "utf8");
}
|
|
1576
|
+
/**
 * Default `list` operation: resolve with the entries of `dir`, each joined
 * onto `dir`. Resolves with an empty array when the directory cannot be read.
 */
async function defaultList(dir) {
  const { readdir } = await import("fs/promises");
  const { join } = await import("path");
  try {
    const names = await readdir(dir);
    return names.map((name) => join(dir, name));
  } catch {
    return [];
  }
}
|
|
1586
|
+
/** Default `stat` operation: resolve with `{ size }`, the byte size of the file at `path`. */
async function defaultStat(path) {
  const { stat } = await import("fs/promises");
  const { size } = await stat(path);
  return { size };
}
|
|
1591
|
+
/** Default `mkdir` operation: create `dir` and any missing parent directories. */
async function defaultMkdir(dir) {
  const { mkdir } = await import("fs/promises");
  await mkdir(dir, { recursive: true });
}
|
|
1595
|
+
|
|
1596
|
+
// src/anti-slop.ts
|
|
1597
|
+
// Hedging phrases used when the judge config supplies no `hedgingPatterns`.
// All patterns are case-insensitive and tolerate any run of whitespace between words.
var DEFAULT_HEDGES = [
  // "I could be wrong"
  /\bi\s+could\s+be\s+wrong\b/i,
  // "I think maybe"
  /\bi\s+think\s+maybe\b/i,
  // "it might be that"
  /\bit\s+might\s+be\s+that\b/i,
  // "perhaps could" / "perhaps you could"
  /\bperhaps\s+(?:you\s+)?could\b/i
];
// Apology phrases used when the judge config supplies no `apologyPatterns`.
var DEFAULT_APOLOGIES = [
  // "I apologize for/if", either spelling
  /\bi\s+(?:apologize|apologise)\s+(?:for|if)\b/i,
  // "I'm sorry for/if/about", optionally intensified with "so" or "really"
  /\bi'?m\s+(?:so\s+|really\s+)?sorry\s+(?:for|if|about)\b/i,
  // "my apologies"
  /\bmy\s+apologies\b/i
];
|
|
1608
|
+
/**
 * Build a judge that scores a conversation's agent responses for "slop":
 * banned phrases and openings, hedging, apologies, repetition and length.
 *
 * Missing config fields fall back to defaults; `config.penaltyWeights` is
 * merged over the default weights key by key.
 *
 * @param {object} [config] - Partial anti-slop configuration.
 * @returns {Function} Async judge `(testCase, input)` that resolves with a
 *   one-element array holding the `anti_slop` score entry.
 */
function createAntiSlopJudge(config = {}) {
  const baseWeights = {
    banned_phrase: 1,
    banned_opening: 1,
    hedging: 0.5,
    apology: 0.5,
    repetition: 0.75,
    length: 0.5
  };
  const conf = {
    domain: config.domain ?? "general",
    bannedPhrases: config.bannedPhrases ?? [],
    bannedOpenings: config.bannedOpenings ?? [],
    hedgingPatterns: config.hedgingPatterns ?? DEFAULT_HEDGES,
    apologyPatterns: config.apologyPatterns ?? DEFAULT_APOLOGIES,
    repetitionThreshold: config.repetitionThreshold ?? 0.15,
    minLength: config.minLength ?? 20,
    maxLength: config.maxLength ?? 8e3,
    penaltyWeights: { ...baseWeights, ...config.penaltyWeights }
  };
  const judge = async (_tc, input) => {
    // Turns without an agent response are analysed as empty strings.
    const outputs = input.turns.map((turn) => turn.agentResponse ?? "");
    const report = analyzeAntiSlop(outputs, conf);
    let reasoning = "No slop patterns detected.";
    if (report.issues.length) {
      // Summarise at most the first five issues.
      reasoning = report.issues
        .slice(0, 5)
        .map((issue) => `${issue.category}: ${issue.detail}`)
        .join("; ");
    }
    const verdict = {
      judgeName: `anti-slop(${conf.domain})`,
      dimension: "anti_slop",
      score: report.score,
      reasoning,
      evidence: report.issues[0]?.example
    };
    return [verdict];
  };
  return judge;
}
|
|
1643
|
+
/**
 * Scan agent outputs for slop and turn the findings into a 0..10 score.
 *
 * Each non-empty output is checked for banned phrases (case-insensitive
 * substring, every occurrence counted), banned openings (regex), hedging and
 * apology patterns (every match counted), duplicated sentences (only when the
 * output has at least four sentences) and length bounds. The score is
 * `10 - sum(count * weight)`, clamped to [0, 10].
 *
 * @param {string[]} outputs - Agent responses; empty entries are skipped.
 * @param {object} config - Resolved anti-slop configuration.
 * @returns {{ score: number, issues: object[], counts: object }} The score,
 *   the recorded issues and the per-category occurrence counts.
 */
function analyzeAntiSlop(outputs, config) {
  const issues = [];
  const counts = {
    banned_phrase: 0,
    banned_opening: 0,
    hedging: 0,
    apology: 0,
    repetition: 0,
    length: 0
  };

  // Count every match of each pattern in `output` under `category`.
  const tallyPatterns = (output, patterns, category) => {
    for (const re of patterns) {
      // Match on a global copy so all occurrences are found and the caller's
      // regex object is never advanced.
      const flags = re.flags.includes("g") ? re.flags : re.flags + "g";
      const found = output.match(new RegExp(re, flags));
      if (!found) continue;
      counts[category] += found.length;
      issues.push({
        category,
        detail: `${found.length}x ${re.source}`,
        example: found[0]
      });
    }
  };

  for (const output of outputs) {
    if (!output) continue;
    const lower = output.toLowerCase();
    for (const phrase of config.bannedPhrases) {
      const needle = phrase.toLowerCase();
      // An empty needle matches at every index without advancing, which would
      // loop forever; it cannot identify a phrase, so skip it.
      if (needle.length === 0) continue;
      let idx = 0;
      while ((idx = lower.indexOf(needle, idx)) !== -1) {
        counts.banned_phrase += 1;
        // Cap recorded examples; the count above still covers every hit.
        if (issues.length < 20) {
          issues.push({
            category: "banned_phrase",
            detail: `"${phrase}"`,
            example: snippet(output, idx, phrase.length)
          });
        }
        idx += needle.length;
      }
    }
    for (const re of config.bannedOpenings) {
      // Global and sticky regexes resume from `lastIndex`; rewind so a match
      // in a previous output cannot hide a match in this one.
      if (re.global || re.sticky) re.lastIndex = 0;
      if (re.test(output)) {
        counts.banned_opening += 1;
        issues.push({ category: "banned_opening", detail: re.source, example: output.slice(0, 80) });
      }
    }
    tallyPatterns(output, config.hedgingPatterns, "hedging");
    tallyPatterns(output, config.apologyPatterns, "apology");
    const sentences = splitSentences(output);
    if (sentences.length >= 4) {
      const seen = /* @__PURE__ */ new Map();
      for (const s of sentences) {
        const key = normalizeForDupe(s);
        if (!key) continue;
        seen.set(key, (seen.get(key) ?? 0) + 1);
      }
      // Every repeat beyond the first occurrence counts as a duplicate.
      let dupes = 0;
      for (const n of seen.values()) if (n > 1) dupes += n - 1;
      const ratio = dupes / sentences.length;
      if (ratio > config.repetitionThreshold) {
        counts.repetition += 1;
        issues.push({
          category: "repetition",
          detail: `${(ratio * 100).toFixed(0)}% duplicated (threshold ${(config.repetitionThreshold * 100).toFixed(0)}%)`
        });
      }
    }
    if (output.length < config.minLength) {
      counts.length += 1;
      issues.push({ category: "length", detail: `too short (${output.length} < ${config.minLength})` });
    } else if (output.length > config.maxLength) {
      counts.length += 1;
      issues.push({ category: "length", detail: `too long (${output.length} > ${config.maxLength})` });
    }
  }
  let penalty = 0;
  for (const cat of Object.keys(counts)) {
    // Categories without a configured weight cost one point per occurrence.
    penalty += counts[cat] * (config.penaltyWeights[cat] ?? 1);
  }
  const score = Math.max(0, Math.min(10, 10 - penalty));
  return { score, issues, counts };
}
|
|
1733
|
+
/**
 * Cut a context window out of `source`: the `len` characters starting at
 * `at`, plus up to 24 characters on either side. An ellipsis marks each side
 * that was truncated.
 */
function snippet(source, at, len) {
  const pad = 24;
  const from = Math.max(0, at - pad);
  const to = Math.min(source.length, at + len + pad);
  const lead = from > 0 ? "\u2026" : "";
  const trail = to < source.length ? "\u2026" : "";
  return `${lead}${source.slice(from, to)}${trail}`;
}
|
|
1739
|
+
/**
 * Split text into sentences on runs of ".", "!", "?" or newlines, trimming
 * each piece and discarding empty ones.
 */
function splitSentences(text) {
  const sentences = [];
  for (const piece of text.split(/[.!?\n]+/)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) sentences.push(trimmed);
  }
  return sentences;
}
|
|
1742
|
+
/**
 * Canonical form of a sentence for duplicate detection: lowercased,
 * whitespace runs collapsed to one space, everything except a-z, 0-9 and
 * spaces removed, then trimmed.
 */
function normalizeForDupe(s) {
  const lowered = s.toLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, " ");
  const alnumOnly = singleSpaced.replace(/[^a-z0-9 ]/g, "");
  return alnumOnly.trim();
}
|
|
1745
|
+
|
|
1746
|
+
// src/artifact-validator.ts
|
|
1747
|
+
/**
 * Combine several validators into one.
 *
 * The composite passes only when every validator passes. Its score is the
 * weighted mean of the individual scores (equal weights unless
 * `options.weights` is given). Issues are concatenated, each `locus` prefixed
 * with the name of the validator that raised it, and `evidence` maps each
 * validator name to that validator's evidence.
 *
 * @param {object[]} validators - Validators exposing `name` and `validate`.
 * @param {{ weights?: number[], name?: string }} [options] - Optional weights and display name.
 * @returns {{ name: string, validate: Function }} The composite validator.
 * @throws {Error} When the number of weights differs from the number of validators.
 */
function composeValidators(validators, options) {
  const weights = options?.weights ?? validators.map(() => 1);
  if (weights.length !== validators.length) {
    throw new Error("composeValidators: weights length mismatch");
  }
  let weightSum = 0;
  for (const weight of weights) weightSum += weight;
  // A zero total would divide by zero; fall back to 1.
  const totalWeight = weightSum || 1;
  const name = options?.name ?? validators.map((v) => v.name).join("+");
  return {
    name,
    async validate(artifact, ctx) {
      const results = await Promise.all(validators.map((v) => v.validate(artifact, ctx)));
      let weighted = 0;
      const issues = [];
      results.forEach((result, i) => {
        weighted += result.score * weights[i];
        const owner = validators[i].name;
        for (const issue of result.issues) {
          issues.push({ ...issue, locus: issue.locus ? `${owner}:${issue.locus}` : owner });
        }
      });
      return {
        pass: results.every((result) => result.pass),
        score: weighted / totalWeight,
        issues,
        evidence: Object.fromEntries(results.map((result, i) => [validators[i].name, result.evidence]))
      };
    }
  };
}
|
|
1773
|
+
/**
 * Validator that passes when the artifact's text content matches `pattern`.
 *
 * @param {string} name - Validator name.
 * @param {RegExp} pattern - Pattern tested against `artifact.content`
 *   (a missing content is treated as the empty string).
 * @returns {{ name: string, validate: Function }} Validator scoring 1 on a match and 0 otherwise.
 */
function regexMatch(name, pattern) {
  return {
    name,
    async validate(artifact) {
      const body = artifact.content ?? "";
      // A global or sticky regex resumes from `lastIndex`, so reusing the
      // validator would otherwise alternate between match and miss.
      if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
      const ok = pattern.test(body);
      return {
        pass: ok,
        score: ok ? 1 : 0,
        issues: ok ? [] : [{ severity: "error", message: `Artifact content did not match ${pattern}` }]
      };
    }
  };
}
|
|
1787
|
+
/**
 * Validator that parses the artifact content as JSON and checks that every
 * dotted path in `requiredPaths` exists in the parsed value.
 *
 * @param {string} name - Validator name.
 * @param {string[]} requiredPaths - Dotted paths such as `"a.b.0.c"`.
 * @returns {{ name: string, validate: Function }} Validator whose score is the
 *   fraction of required paths present; content that is not valid JSON scores 0.
 */
function jsonHasKeys(name, requiredPaths) {
  const validate = async (artifact) => {
    let parsed;
    try {
      parsed = JSON.parse(artifact.content ?? "");
    } catch (err) {
      const reason = err instanceof Error ? err.message : err;
      return {
        pass: false,
        score: 0,
        issues: [{ severity: "error", message: `Invalid JSON: ${reason}` }]
      };
    }
    const missing = requiredPaths.filter((path) => !pathExists(parsed, path));
    return {
      pass: missing.length === 0,
      // max(1, ...) keeps an empty requirement list from dividing by zero.
      score: 1 - missing.length / Math.max(1, requiredPaths.length),
      issues: missing.map((p) => ({ severity: "error", message: `Missing path: ${p}`, locus: p }))
    };
  };
  return { name, validate };
}
|
|
1815
|
+
/**
 * Validator that checks the artifact's size in bytes against `[min, max]`.
 *
 * The size is `artifact.bytes.byteLength` when bytes are present, otherwise
 * the UTF-8 byte length of `artifact.content`. Out-of-range sizes get partial
 * credit: `size / min` when too small, `max / size` when too large.
 *
 * @param {string} name - Validator name.
 * @param {number} min - Smallest accepted size, inclusive.
 * @param {number} max - Largest accepted size, inclusive.
 * @returns {{ name: string, validate: Function }} The size validator.
 */
function byteLengthRange(name, min, max) {
  const measure = (artifact) =>
    artifact.bytes?.byteLength ?? new TextEncoder().encode(artifact.content ?? "").byteLength;
  return {
    name,
    async validate(artifact) {
      const size = measure(artifact);
      const pass = size >= min && size <= max;
      let score = 1;
      const issues = [];
      if (!pass) {
        score = size < min ? Math.max(0, size / min) : Math.max(0, max / size);
        issues.push({ severity: "error", message: `Size ${size} outside [${min}, ${max}]` });
      }
      return { pass, score, issues };
    }
  };
}
|
|
1830
|
+
/**
 * Validator that requires every string in `required` to occur in the
 * artifact content. Matching ignores case unless `options.caseSensitive` is true.
 *
 * @param {string} name - Validator name.
 * @param {string[]} required - Substrings that must all be present.
 * @param {{ caseSensitive?: boolean }} [options] - Matching options.
 * @returns {{ name: string, validate: Function }} Validator whose score is the
 *   fraction of required substrings found.
 */
function containsAll(name, required, options) {
  const cs = options?.caseSensitive ?? false;
  const fold = (text) => (cs ? text : text.toLowerCase());
  return {
    name,
    async validate(artifact) {
      const haystack = fold(artifact.content ?? "");
      const missing = required.filter((needle) => !haystack.includes(fold(needle)));
      return {
        pass: missing.length === 0,
        // max(1, ...) keeps an empty requirement list from dividing by zero.
        score: 1 - missing.length / Math.max(1, required.length),
        issues: missing.map((m) => ({ severity: "error", message: `Missing substring: ${m}` }))
      };
    }
  };
}
|
|
1850
|
+
/**
 * Check whether a dotted path such as `"a.b.0.c"` resolves to a defined
 * value inside `obj`. Segments made only of digits are used as array indices.
 *
 * Only own properties are followed, so names inherited from prototypes
 * (`constructor`, `toString`, ...) do not count as existing keys.
 *
 * @param {*} obj - Root value, typically the result of `JSON.parse`.
 * @param {string} path - Dot-separated property path.
 * @returns {boolean} True when every segment exists and the final value is not `undefined`.
 */
function pathExists(obj, path) {
  let current = obj;
  for (const part of path.split(".")) {
    if (current === null || typeof current !== "object") return false;
    const key = /^\d+$/.test(part) ? Number(part) : part;
    if (!Object.hasOwn(current, key)) return false;
    current = current[key];
    if (current === void 0) return false;
  }
  return true;
}
|
|
1861
|
+
|
|
1862
|
+
// src/workspace-inspector.ts
|
|
1863
|
+
/**
 * Workspace inspector backed by an in-process map from scope id to a
 * snapshot object.
 */
var InMemoryWorkspaceInspector = class {
  name = "in-memory";
  /** Snapshots keyed by scope id. */
  snapshots = /* @__PURE__ */ new Map();
  /** Store (or replace) the snapshot for a scope. */
  set(scopeId, snapshot) {
    this.snapshots.set(scopeId, snapshot);
  }
  /**
   * Resolve with the snapshot stored for `context.scopeId`, or with an empty
   * `{ files, rows, kv }` snapshot when nothing is stored for that scope.
   */
  async snapshot(context) {
    const stored = this.snapshots.get(context.scopeId);
    if (stored !== null && stored !== void 0) return stored;
    return { files: {}, rows: {}, kv: {} };
  }
};
|
|
1873
|
+
/**
 * Assertion that the snapshot contains a file at `path`.
 *
 * Only own keys of `snapshot.files` count, so a path that happens to equal an
 * inherited object member (for example `toString`) is not reported as present.
 *
 * @param {string} path - Key to look up in `snapshot.files`.
 * @returns {{ name: string, check: Function }} Assertion scoring 1 when the file exists, else 0.
 */
function fileExists(path) {
  return {
    name: `file_exists:${path}`,
    check(snapshot) {
      const pass = Object.hasOwn(snapshot.files, path);
      return {
        pass,
        score: pass ? 1 : 0,
        detail: pass ? void 0 : `No file at ${path}`
      };
    }
  };
}
|
|
1886
|
+
/**
 * Assertion that the file at `path` exists in the snapshot and contains `needle`.
 *
 * The file is looked up among the own keys of `snapshot.files` only. Without
 * that guard a path such as `toString` would resolve to an inherited function
 * and the `includes` call would throw.
 *
 * @param {string} path - Key to look up in `snapshot.files`.
 * @param {string} needle - Substring that must occur in the file content.
 * @returns {{ name: string, check: Function }} Assertion scoring 1 when the substring is found, else 0.
 */
function fileContains(path, needle) {
  return {
    name: `file_contains:${path}:${needle}`,
    check(snapshot) {
      const content = Object.hasOwn(snapshot.files, path) ? snapshot.files[path] : void 0;
      if (content === void 0) {
        return { pass: false, score: 0, detail: `File ${path} missing` };
      }
      const pass = content.includes(needle);
      return { pass, score: pass ? 1 : 0, detail: pass ? void 0 : `File ${path} missing substring "${needle}"` };
    }
  };
}
|
|
1899
|
+
/**
 * Assertion on the number of rows in `snapshot.rows[table]`.
 *
 * A missing table counts as zero rows and a missing `max` means no upper
 * bound. Out-of-range counts get partial credit: `count / min` when too few,
 * `max / count` when too many.
 *
 * @param {string} table - Table name.
 * @param {number} min - Smallest accepted row count, inclusive.
 * @param {number} [max] - Largest accepted row count, inclusive.
 * @returns {{ name: string, check: Function }} The row-count assertion.
 */
function rowCount(table, min, max) {
  const upper = max ?? Infinity;
  const range = `[${min},${max ?? "\u221E"}]`;
  return {
    name: `row_count:${table}:${range}`,
    check(snapshot) {
      const count = (snapshot.rows[table] ?? []).length;
      if (count >= min && count <= upper) {
        return { pass: true, score: 1, detail: void 0 };
      }
      const score = count < min ? Math.max(0, count / min) : Math.max(0, upper / count);
      return {
        pass: false,
        score,
        detail: `Table ${table} has ${count} rows, expected [${min}, ${max ?? "\u221E"}]`
      };
    }
  };
}
|
|
1916
|
+
/**
 * Assertion that at least `options.min` rows (default 1) of
 * `snapshot.rows[table]` satisfy `predicate`. A missing table counts as empty;
 * falling short earns partial credit `matching / min`.
 *
 * @param {string} table - Table name.
 * @param {Function} predicate - Row filter, called like an `Array.prototype.filter` callback.
 * @param {{ min?: number }} [options] - Minimum number of matching rows.
 * @returns {{ name: string, check: Function }} The row-predicate assertion.
 */
function rowWhere(table, predicate, options) {
  const min = options?.min ?? 1;
  const check = (snapshot) => {
    const rows = snapshot.rows[table] ?? [];
    const matching = rows.filter(predicate).length;
    if (matching >= min) {
      return { pass: true, score: 1, detail: void 0 };
    }
    return {
      pass: false,
      score: Math.max(0, matching / min),
      detail: `Table ${table} has ${matching} matching rows, expected \u2265 ${min}`
    };
  };
  return { name: `row_where:${table}`, check };
}
|
|
1932
|
+
/**
 * Run every assertion against a snapshot.
 *
 * @param {object} snapshot - Workspace snapshot handed to each assertion.
 * @param {object[]} assertions - Assertions exposing `name` and `check(snapshot)`.
 * @returns {{ pass: boolean, score: number, results: object[] }} `pass` is true
 *   when all assertions pass, `score` is the mean of the individual scores
 *   (1 for an empty list) and `results` pairs each assertion name with its result.
 */
function runAssertions(snapshot, assertions) {
  const results = [];
  let allPassed = true;
  let scoreSum = 0;
  for (const assertion of assertions) {
    const result = assertion.check(snapshot);
    results.push({ assertion: assertion.name, result });
    if (!result.pass) allPassed = false;
    scoreSum += result.score;
  }
  const score = results.length ? scoreSum / results.length : 1;
  return { pass: allPassed, score, results };
}
|
|
1938
|
+
|
|
1939
|
+
// src/experiment-tracker.ts
|
|
1940
|
+
/**
 * Experiment store that keeps experiments and runs in in-process maps.
 *
 * Everything handed out is a copy, so callers cannot change stored state by
 * mutating a returned object: experiments are copied shallowly and runs are
 * deep-cloned with `structuredClone`.
 */
var InMemoryExperimentStore = class {
  /** Experiments keyed by id. */
  experiments = /* @__PURE__ */ new Map();
  /** Runs keyed by id. */
  runs = /* @__PURE__ */ new Map();
  /** Store a shallow copy of the experiment under its id. */
  async saveExperiment(exp) {
    this.experiments.set(exp.id, { ...exp });
  }
  /** Resolve with a shallow copy of the experiment, or `null` when unknown. */
  async getExperiment(id) {
    const e = this.experiments.get(id);
    return e ? { ...e } : null;
  }
  /**
   * Resolve with shallow copies of all experiments, newest `createdAt` first.
   * Copies are returned, as in `getExperiment`, so editing a listed experiment
   * does not alter the stored one.
   */
  async listExperiments() {
    return [...this.experiments.values()]
      .sort((a, b) => b.createdAt.localeCompare(a.createdAt))
      .map((e) => ({ ...e }));
  }
  /** Store a deep clone of the run under its id. */
  async saveRun(run) {
    this.runs.set(run.id, structuredClone(run));
  }
  /** Resolve with a deep clone of the run, or `null` when unknown. */
  async getRun(id) {
    const r = this.runs.get(id);
    return r ? structuredClone(r) : null;
  }
  /** Resolve with deep clones of the experiment's runs, newest `startedAt` first. */
  async listRuns(experimentId) {
    return [...this.runs.values()]
      .filter((r) => r.experimentId === experimentId)
      .sort((a, b) => b.startedAt.localeCompare(a.startedAt))
      .map((r) => structuredClone(r));
  }
};
|
|
1964
|
+
/**
 * Records experiments and their runs through a pluggable store, and derives
 * comparisons (run-to-run diffs, score timelines) from the stored reports.
 */
var ExperimentTracker = class {
  /** Persistence backend; every method reads or writes through it. */
  store;

  /**
   * @param store Object exposing saveExperiment, getExperiment, saveRun,
   *   getRun and listRuns (the calls made by the methods below).
   */
  constructor(store) {
    this.store = store;
  }

  /**
   * Create an experiment record with a random `exp_` id, persist it, and return it.
   *
   * @param name Experiment name stored on the record.
   * @param metadata Caller-supplied value stored as-is under `metadata`.
   */
  async startExperiment(name, metadata) {
    const id = `exp_${rand(8)}`;
    const createdAt = new Date().toISOString();
    const experiment = { id, name, createdAt, metadata };
    await this.store.saveExperiment(experiment);
    return experiment;
  }

  /**
   * Create a run in "running" state under an existing experiment, persist it, and return it.
   *
   * @param config Run configuration; `experimentId` and `name` are read from it
   *   and the whole object is stored on the run.
   * @throws Error when the store has no experiment with `config.experimentId`.
   */
  async startRun(config) {
    const { experimentId } = config;
    const experiment = await this.store.getExperiment(experimentId);
    if (!experiment) {
      throw new Error(`Experiment ${experimentId} not found`);
    }
    const id = `run_${rand(10)}`;
    const startedAt = new Date().toISOString();
    const run = {
      id,
      experimentId,
      name: config.name,
      config,
      startedAt,
      status: "running"
    };
    await this.store.saveRun(run);
    return run;
  }

  /**
   * Mark a stored run as completed, attach its report, and save it back.
   *
   * @throws Error when the run does not exist.
   */
  async completeRun(runId, report) {
    const run = await this.store.getRun(runId);
    if (!run) {
      throw new Error(`Run ${runId} not found`);
    }
    Object.assign(run, {
      status: "completed",
      completedAt: new Date().toISOString(),
      report
    });
    await this.store.saveRun(run);
  }

  /**
   * Mark a stored run as failed, attach the error, and save it back.
   *
   * @throws Error when the run does not exist.
   */
  async failRun(runId, error) {
    const run = await this.store.getRun(runId);
    if (!run) {
      throw new Error(`Run ${runId} not found`);
    }
    Object.assign(run, {
      status: "failed",
      completedAt: new Date().toISOString(),
      error
    });
    await this.store.saveRun(run);
  }

  /**
   * Compare two runs that both carry a report.
   *
   * The result holds one entry per scenario seen in either report (sorted by
   * delta, largest first; added/removed scenarios count as delta 0 for
   * ordering), the difference of the two `summary.overallAvg` values, and the
   * config keys whose JSON serialization differs between the runs.
   *
   * @throws Error when either run is missing or has no report.
   */
  async diff(runIdA, runIdB) {
    const [runA, runB] = await Promise.all([
      this.store.getRun(runIdA),
      this.store.getRun(runIdB)
    ]);
    if (!(runA && runB)) {
      throw new Error("Both runs must exist");
    }
    if (!(runA.report && runB.report)) {
      throw new Error("Both runs must be completed with reports");
    }

    // scenarioId -> overallScore; a later duplicate id overwrites an earlier one.
    const indexScores = (report) => {
      const index = new Map();
      for (const result of report.results) {
        index.set(result.scenarioId, result.overallScore);
      }
      return index;
    };
    const scoresA = indexScores(runA.report);
    const scoresB = indexScores(runB.report);

    // Union of ids: run A's order first, then ids only present in run B.
    const ids = new Set(scoresA.keys());
    for (const id of scoresB.keys()) {
      ids.add(id);
    }

    const scenarios = Array.from(ids, (scenarioId) => {
      const before = scoresA.get(scenarioId);
      const after = scoresB.get(scenarioId);
      if (before === undefined) {
        return { scenarioId, before: null, after, delta: null, status: "added" };
      }
      if (after === undefined) {
        return { scenarioId, before, after: null, delta: null, status: "removed" };
      }
      let status = "unchanged";
      if (after > before) {
        status = "improved";
      } else if (after < before) {
        status = "regressed";
      }
      return { scenarioId, before, after, delta: after - before, status };
    });
    const sortKey = (entry) => entry.delta ?? 0;
    scenarios.sort((left, right) => sortKey(right) - sortKey(left));

    const aggregateDelta = runB.report.summary.overallAvg - runA.report.summary.overallAvg;

    const configA = runA.config;
    const configB = runB.config;
    const configChanges = {};
    const configKeys = new Set([...Object.keys(configA), ...Object.keys(configB)]);
    for (const key of configKeys) {
      const before = configA[key];
      const after = configB[key];
      // Values are compared by their JSON text, so nested values are compared structurally.
      if (JSON.stringify(before) === JSON.stringify(after)) {
        continue;
      }
      configChanges[key] = { before, after };
    }

    return {
      before: { runId: runIdA, name: runA.name, startedAt: runA.startedAt },
      after: { runId: runIdB, name: runB.name, startedAt: runB.startedAt },
      aggregateDelta,
      scenarios,
      configChanges
    };
  }

  /**
   * List an experiment's runs ordered by `startedAt`, each with its aggregate
   * score (`null` when the run has no report).
   */
  async timeline(experimentId) {
    const runs = await this.store.listRuns(experimentId);
    const chronological = runs.slice();
    chronological.sort((earlier, later) => earlier.startedAt.localeCompare(later.startedAt));
    return chronological.map(({ id, startedAt, report }) => ({
      runId: id,
      startedAt,
      overall: report?.summary.overallAvg ?? null
    }));
  }
};
|
|
2067
|
+
/**
 * Produce a random lowercase hex string.
 *
 * @param bytes Number of random bytes to draw; the result has two hex digits per byte.
 * @returns Hex string of length `2 * bytes`.
 */
function rand(bytes) {
  // getRandomValues fills the typed array in place and returns that same array.
  const buffer = crypto.getRandomValues(new Uint8Array(bytes));
  let hex = "";
  for (const byte of buffer) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
2072
|
+
|
|
2073
|
+
// src/prompt-optimizer.ts
|
|
2074
|
+
/**
 * Compares prompt variants by scoring each one repeatedly on a set of
 * scenarios and testing the pooled scores against each other.
 */
var PromptOptimizer = class {
  /**
   * Score every variant on every scenario, then rank and compare the variants.
   *
   * Scoring is sequential: variants in order, scenarios in order, trials in order.
   *
   * @param config.variants Candidate variants (at least 2); `id` identifies each one.
   * @param config.scenarioIds Scenario ids to score on (at least 1).
   * @param config.scoreVariant Async callback `({ variant, scenarioId, trialIndex })`
   *   that must resolve to a finite number.
   * @param config.trialsPerScenario Trials per (variant, scenario) pair; positive
   *   integer, defaults to 3.
   * @param config.significanceLevel p-value cutoff for `significant`; defaults to 0.05.
   * @param config.onScenarioComplete Optional callback invoked with
   *   `{ variantId, scenarioId, scores }` after each (variant, scenario) pair.
   * @returns `{ winner, scores, pairwise, config }`: the highest-mean variant with
   *   its significance flags, per-variant statistics, one comparison per variant
   *   pair, and the effective settings.
   * @throws Error when fewer than 2 variants or no scenarios are given, when
   *   `trialsPerScenario` is not a positive integer, or when `scoreVariant`
   *   yields a non-finite value.
   */
  async run(config) {
    const trials = config.trialsPerScenario ?? 3;
    const alpha = config.significanceLevel ?? 0.05;
    if (config.variants.length < 2) {
      throw new Error("PromptOptimizer requires at least 2 variants");
    }
    if (config.scenarioIds.length === 0) {
      throw new Error("PromptOptimizer requires at least 1 scenario");
    }
    // Without this guard a value of 0, a negative number or NaN runs zero trials,
    // and the statistics below would crown a "winner" from empty sample sets.
    if (!Number.isInteger(trials) || trials < 1) {
      throw new Error(`PromptOptimizer requires trialsPerScenario to be a positive integer; got ${trials}`);
    }

    // variant id -> (scenario id -> raw trial scores)
    const rawScores = new Map();
    for (const variant of config.variants) {
      const scenarioMap = new Map();
      rawScores.set(variant.id, scenarioMap);
      for (const scenarioId of config.scenarioIds) {
        const samples = [];
        for (let t = 0; t < trials; t++) {
          const score = await config.scoreVariant({
            variant,
            scenarioId,
            trialIndex: t
          });
          if (!Number.isFinite(score)) {
            throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${t}`);
          }
          samples.push(score);
        }
        scenarioMap.set(scenarioId, samples);
        config.onScenarioComplete?.({
          variantId: variant.id,
          scenarioId,
          scores: samples
        });
      }
    }

    // Per-variant summary: pooled mean with a 95% interval, plus per-scenario means.
    const scores = config.variants.map((variant) => {
      const scenarioMap = rawScores.get(variant.id);
      const allSamples = [];
      const perScenario = {};
      for (const scenarioId of config.scenarioIds) {
        const samples = scenarioMap.get(scenarioId) ?? [];
        allSamples.push(...samples);
        perScenario[scenarioId] = {
          mean: samples.length ? samples.reduce((a, b) => a + b, 0) / samples.length : 0,
          n: samples.length,
          samples
        };
      }
      const ci = confidenceInterval(allSamples, 0.95);
      return {
        variantId: variant.id,
        mean: ci.mean,
        ci95: { lower: ci.lower, upper: ci.upper },
        n: allSamples.length,
        perScenario
      };
    });

    // Every unordered pair of variants is compared once on its pooled samples.
    const pairwise = [];
    for (let i = 0; i < scores.length; i++) {
      for (let j = i + 1; j < scores.length; j++) {
        const a = scores[i];
        const b = scores[j];
        const samplesA = flatSamples(a);
        const samplesB = flatSamples(b);
        const { p } = mannWhitneyU(samplesA, samplesB);
        pairwise.push({
          variantA: a.variantId,
          variantB: b.variantId,
          pValue: p,
          significant: p < alpha,
          meanDelta: b.mean - a.mean
        });
      }
    }

    // The winner is the highest pooled mean; the flags say how trustworthy that lead is.
    const sorted = scores.slice().sort((x, y) => y.mean - x.mean);
    const winner = sorted[0];
    const second = sorted[1];
    const winnerComparisons = pairwise.filter(
      (c) => c.variantA === winner.variantId || c.variantB === winner.variantId
    );
    const significantOverAll = winnerComparisons.every((c) => c.significant);
    const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;
    return {
      winner: {
        variantId: winner.variantId,
        significant: significantOverAll,
        ciLowerBoundExceedsSecondMean
      },
      scores,
      pairwise,
      config: {
        trialsPerScenario: trials,
        significanceLevel: alpha,
        variants: config.variants.map((v) => v.id),
        scenarios: config.scenarioIds
      }
    };
  }
};
|
|
2173
|
+
/**
 * Pool a variant's per-scenario samples into a single flat array.
 *
 * @param score Object whose `perScenario` values each carry a `samples` list.
 * @returns All samples, concatenated in `Object.values(score.perScenario)` order.
 */
function flatSamples(score) {
  const pooled = [];
  for (const { samples } of Object.values(score.perScenario)) {
    for (const sample of samples) {
      pooled.push(sample);
    }
  }
  return pooled;
}
|
|
2178
|
+
|
|
2179
|
+
// src/dual-agent-bench.ts
|
|
2180
|
+
/**
 * Benchmarks a proposer/critic pair: for each scenario the proposer and critic
 * alternate until the critic's convergence score reaches a threshold or the
 * round budget runs out.
 */
var DualAgentBench = class {
  /**
   * Run the propose/critique loop on every scenario and summarize convergence.
   *
   * @param config.scenarios Scenarios to run (at least 1); `id` identifies each one.
   * @param config.propose Async callback `({ scenario, roundIndex, priorProposal,
   *   priorCritique })` returning the round's proposal.
   * @param config.critique Async callback `({ scenario, roundIndex, proposal })`
   *   resolving to `{ critique, convergenceScore }` with the score in [0, 1].
   * @param config.maxRounds Round budget per scenario; defaults to 5.
   * @param config.convergenceThreshold Score at or above which a scenario counts
   *   as converged; defaults to 0.85.
   * @param config.onRoundComplete Optional callback invoked with
   *   `{ scenarioId, round }` after each round.
   * @returns `{ scenarios, aggregate, config }`: per-scenario outcomes, the
   *   convergence rate / average rounds / average final score, and the
   *   effective settings.
   * @throws Error when no scenarios are given or a convergence score is not a
   *   finite number in [0, 1].
   */
  async run(config) {
    const roundLimit = config.maxRounds ?? 5;
    const targetScore = config.convergenceThreshold ?? 0.85;
    if (config.scenarios.length === 0) {
      throw new Error("DualAgentBench requires at least 1 scenario");
    }

    const outcomes = [];
    for (const scenario of config.scenarios) {
      const rounds = [];
      let roundsToConverge = null;
      let finalProposal = "";
      let finalScore = 0;
      let previousCritique;
      let roundIndex = 0;

      // Stop as soon as a round converges or the budget is spent.
      while (roundIndex < roundLimit && roundsToConverge === null) {
        const previousRound = rounds[rounds.length - 1];
        const proposal = await config.propose({
          scenario,
          roundIndex,
          priorProposal: previousRound?.proposal,
          priorCritique: previousCritique
        });
        const verdict = await config.critique({ scenario, roundIndex, proposal });
        const { critique, convergenceScore } = verdict;

        const scoreIsValid =
          Number.isFinite(convergenceScore) && convergenceScore >= 0 && convergenceScore <= 1;
        if (!scoreIsValid) {
          throw new Error(
            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${roundIndex}`
          );
        }

        const round = { roundIndex, proposal, critique, convergenceScore };
        rounds.push(round);
        config.onRoundComplete?.({ scenarioId: scenario.id, round });

        finalProposal = proposal;
        finalScore = convergenceScore;
        previousCritique = critique;
        if (convergenceScore >= targetScore) {
          // Rounds are reported 1-based.
          roundsToConverge = roundIndex + 1;
        }
        roundIndex += 1;
      }

      outcomes.push({
        scenarioId: scenario.id,
        converged: roundsToConverge !== null,
        roundsToConverge,
        finalProposal,
        history: rounds,
        finalScore
      });
    }

    const convergedOutcomes = outcomes.filter((outcome) => outcome.converged);

    let convergenceRate = 0;
    let avgFinalScore = 0;
    if (outcomes.length) {
      convergenceRate = convergedOutcomes.length / outcomes.length;
      let scoreTotal = 0;
      for (const outcome of outcomes) {
        scoreTotal = scoreTotal + outcome.finalScore;
      }
      avgFinalScore = scoreTotal / outcomes.length;
    }

    // Average rounds is only defined over scenarios that actually converged.
    let avgRoundsToConverge = null;
    if (convergedOutcomes.length) {
      let roundTotal = 0;
      for (const outcome of convergedOutcomes) {
        roundTotal = roundTotal + (outcome.roundsToConverge ?? 0);
      }
      avgRoundsToConverge = roundTotal / convergedOutcomes.length;
    }

    return {
      scenarios: outcomes,
      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
      config: { maxRounds: roundLimit, convergenceThreshold: targetScore }
    };
  }
};
|
|
1264
2250
|
export {
|
|
1265
2251
|
AgentDriver,
|
|
1266
2252
|
BenchmarkRunner,
|
|
1267
2253
|
ConvergenceTracker,
|
|
2254
|
+
DualAgentBench,
|
|
2255
|
+
ExperimentTracker,
|
|
2256
|
+
FileSystemTraceStore,
|
|
2257
|
+
InMemoryExperimentStore,
|
|
2258
|
+
InMemoryWorkspaceInspector,
|
|
1268
2259
|
MODEL_PRICING,
|
|
2260
|
+
MemoryTraceStore,
|
|
1269
2261
|
MetricsCollector,
|
|
1270
2262
|
ProductClient,
|
|
2263
|
+
PromptOptimizer,
|
|
2264
|
+
PromptRegistry,
|
|
1271
2265
|
ScenarioRegistry,
|
|
1272
2266
|
TokenCounter,
|
|
1273
2267
|
adversarialJudge,
|
|
2268
|
+
analyzeAntiSlop,
|
|
2269
|
+
byteLengthRange,
|
|
1274
2270
|
codeExecutionJudge,
|
|
2271
|
+
cohensD,
|
|
1275
2272
|
coherenceJudge,
|
|
2273
|
+
composeValidators,
|
|
1276
2274
|
confidenceInterval,
|
|
2275
|
+
containsAll,
|
|
2276
|
+
createAntiSlopJudge,
|
|
1277
2277
|
createCustomJudge,
|
|
1278
2278
|
createDomainExpertJudge,
|
|
1279
2279
|
defaultJudges,
|
|
1280
2280
|
estimateCost,
|
|
1281
2281
|
estimateTokens,
|
|
1282
2282
|
executeScenario,
|
|
2283
|
+
fileContains,
|
|
2284
|
+
fileExists,
|
|
1283
2285
|
formatBenchmarkReport,
|
|
1284
2286
|
formatDriverReport,
|
|
2287
|
+
hashContent,
|
|
1285
2288
|
interRaterReliability,
|
|
2289
|
+
jsonHasKeys,
|
|
1286
2290
|
mannWhitneyU,
|
|
1287
2291
|
normalizeScores,
|
|
2292
|
+
pairedTTest,
|
|
1288
2293
|
partialCredit,
|
|
1289
2294
|
printDriverSummary,
|
|
2295
|
+
regexMatch,
|
|
2296
|
+
rowCount,
|
|
2297
|
+
rowWhere,
|
|
2298
|
+
runAssertions,
|
|
1290
2299
|
runE2EWorkflow,
|
|
1291
|
-
weightedMean
|
|
2300
|
+
weightedMean,
|
|
2301
|
+
wilcoxonSignedRank
|
|
1292
2302
|
};
|
|
1293
2303
|
//# sourceMappingURL=index.js.map
|