@tangle-network/agent-eval 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -506,6 +506,123 @@ function partialCredit(current, target) {
506
506
  if (target <= 0) return 1;
507
507
  return Math.min(1, Math.max(0, current / target));
508
508
  }
509
/**
 * Two-sided paired-samples t-test on the differences `after[i] - before[i]`.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, paired by index with `before`.
 * @returns {{ t: number, df: number, p: number }} The t statistic, degrees of
 *   freedom (`n - 1`), and two-sided p-value. Fewer than two pairs yields the
 *   neutral result `{ t: 0, df: 0, p: 1 }`.
 * @throws {Error} When the two samples have different lengths.
 */
function pairedTTest(before, after) {
  if (before.length !== after.length) {
    throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const n = before.length;
  if (n < 2) return { t: 0, df: 0, p: 1 };
  const diffs = before.map((b, i) => after[i] - b);
  const mean = diffs.reduce((a, b) => a + b, 0) / n;
  // Sample variance (n - 1 denominator).
  const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
  const se = Math.sqrt(variance / n);
  const df = n - 1;
  if (se === 0) {
    // Every difference is identical. A zero shift is "no effect"; a non-zero
    // shift is infinitely significant and must keep the direction of the shift.
    if (mean === 0) return { t: 0, df, p: 1 };
    return { t: mean > 0 ? Infinity : -Infinity, df, p: 0 };
  }
  const t = mean / se;
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
  return { t, df, p };
}
525
/**
 * Wilcoxon signed-rank test on paired samples, using the normal approximation
 * for the two-sided p-value.
 *
 * Zero differences are discarded. With fewer than 6 non-zero differences the
 * function returns `{ w: 0, p: 1 }` instead of attempting the approximation.
 * Tied absolute differences share the average of the ranks they span.
 *
 * @param {number[]} before - Baseline measurements.
 * @param {number[]} after - Follow-up measurements, paired by index with `before`.
 * @returns {{ w: number, p: number }} `w` is the sum of ranks of the positive
 *   differences (W+); `p` is the two-sided p-value.
 * @throws {Error} When sample sizes differ, or when a difference is NaN
 *   (previously this made the tie-grouping loop spin forever).
 */
function wilcoxonSignedRank(before, after) {
  if (before.length !== after.length) {
    throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const diffs = before.map((b, i2) => after[i2] - b).filter((d) => d !== 0);
  const n = diffs.length;
  if (n < 6) return { w: 0, p: 1 };
  // NaN never equals itself, so the tie scan below could not advance past it.
  if (diffs.some((d) => Number.isNaN(d))) {
    throw new Error("wilcoxonSignedRank: differences must not be NaN");
  }
  const absRanks = diffs
    .map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 }))
    .sort((a, b) => a.abs - b.abs);
  const ranks = new Array(n);
  let i = 0;
  while (i < n) {
    // [i, j) is a run of equal absolute differences.
    let j = i;
    while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
    // Ranks are 1-based: the run covers ranks i+1 .. j, whose mean is (i+1+j)/2.
    const avg = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[absRanks[k].i] = avg;
    i = j;
  }
  let wPlus = 0;
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
  // Mean and variance of W+ under the null hypothesis.
  const mean = n * (n + 1) / 4;
  const variance = n * (n + 1) * (2 * n + 1) / 24;
  const z = (wPlus - mean) / Math.sqrt(variance);
  const p = 2 * (1 - normalCdf(Math.abs(z)));
  return { w: wPlus, p };
}
550
/**
 * Cohen's d effect size between two independent samples, using the pooled
 * sample standard deviation. Positive values mean `b` has the larger mean.
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {number} `(mean(b) - mean(a)) / pooledSd`; 0 when either sample
 *   has fewer than two values or the pooled deviation is zero.
 */
function cohensD(a, b) {
  const sizeA = a.length;
  const sizeB = b.length;
  if (sizeA < 2 || sizeB < 2) return 0;

  const average = (values) => {
    let total = 0;
    for (const v of values) total += v;
    return total / values.length;
  };
  const sampleVariance = (values, centre) => {
    let squares = 0;
    for (const v of values) squares += (v - centre) ** 2;
    return squares / (values.length - 1);
  };

  const meanA = average(a);
  const meanB = average(b);
  const weightedVariance =
    (sizeA - 1) * sampleVariance(a, meanA) + (sizeB - 1) * sampleVariance(b, meanB);
  const pooledSd = Math.sqrt(weightedVariance / (sizeA + sizeB - 2));
  return pooledSd === 0 ? 0 : (meanB - meanA) / pooledSd;
}
562
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * @param {number} t - Point at which to evaluate the CDF.
 * @param {number} df - Degrees of freedom.
 * @returns {number} P(T <= t). Returns 0.5 for `df <= 0`, and defers to
 *   `normalCdf` when `df > 100`.
 */
function studentTCdf(t, df) {
  if (df <= 0) return 0.5;
  if (df > 100) return normalCdf(t);
  // Half of I_x(df/2, 1/2) with x = df / (df + t^2) is the one-sided tail mass.
  const tail = 0.5 * incompleteBeta(df / (df + t * t), df / 2, 0.5);
  if (t >= 0) return 1 - tail;
  return tail;
}
571
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * Evaluated with a continued fraction (modified Lentz iteration). That
 * fraction only converges quickly for x < (a + 1) / (a + b + 2), so for larger
 * x the symmetry I_x(a, b) = 1 - I_{1-x}(b, a) is applied first. Without it,
 * values of x close to 1 exhaust the iteration cap and return an inaccurate
 * result.
 *
 * @param {number} x - Upper integration limit; values outside (0, 1) clamp to 0 or 1.
 * @param {number} a - First shape parameter.
 * @param {number} b - Second shape parameter.
 * @returns {number} I_x(a, b).
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  // Continued-fraction evaluation of I_xx(aa, bb); accurate for small xx.
  const viaContinuedFraction = (xx, aa, bb) => {
    const lnBeta = lnGamma(aa) + lnGamma(bb) - lnGamma(aa + bb);
    const front = Math.exp(Math.log(xx) * aa + Math.log(1 - xx) * bb - lnBeta) / aa;
    const maxIter = 200;
    const eps = 3e-7;
    const tiny = 1e-30; // guards against division by ~0 inside the Lentz steps
    let c = 1;
    let d = 1 - (aa + bb) * xx / (aa + 1);
    if (Math.abs(d) < tiny) d = tiny;
    d = 1 / d;
    let f = d;
    for (let m = 1; m <= maxIter; m++) {
      const m2 = 2 * m;
      // Even-numbered term of the fraction.
      let num = m * (bb - m) * xx / ((aa + m2 - 1) * (aa + m2));
      d = 1 + num * d;
      if (Math.abs(d) < tiny) d = tiny;
      c = 1 + num / c;
      if (Math.abs(c) < tiny) c = tiny;
      d = 1 / d;
      f *= d * c;
      // Odd-numbered term of the fraction.
      num = -((aa + m) * (aa + bb + m) * xx) / ((aa + m2) * (aa + m2 + 1));
      d = 1 + num * d;
      if (Math.abs(d) < tiny) d = tiny;
      c = 1 + num / c;
      if (Math.abs(c) < tiny) c = tiny;
      d = 1 / d;
      const delta = d * c;
      f *= delta;
      if (Math.abs(delta - 1) < eps) break;
    }
    return front * f;
  };

  if (x > (a + 1) / (a + b + 2)) {
    return 1 - viaContinuedFraction(1 - x, b, a);
  }
  return viaContinuedFraction(x, a, b);
}
604
/**
 * Natural logarithm of the gamma function via the Lanczos approximation
 * (g = 7, nine coefficients). Arguments below 0.5 go through the reflection
 * formula, which recurses on `1 - z`.
 *
 * @param {number} z - Argument.
 * @returns {number} ln Γ(z).
 */
function lnGamma(z) {
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }
  const lanczosG = 7;
  const lanczos = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9.984369578019572e-6,
    1.5056327351493116e-7
  ];
  const shifted = z - 1;
  let series = lanczos[0];
  for (let k = 1; k < lanczos.length; k += 1) {
    series += lanczos[k] / (shifted + k);
  }
  const t = shifted + lanczosG + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (shifted + 0.5) * Math.log(t) - t + Math.log(series);
}
509
626
  function normalCdf(x) {
510
627
  const a1 = 0.254829592;
511
628
  const a2 = -0.284496736;
@@ -1261,33 +1378,926 @@ function printDriverSummary(results) {
1261
1378
  const completedCount = results.filter((r) => r.completed).length;
1262
1379
  console.log(`${completedCount}/${results.length} personas completed`);
1263
1380
  }
1381
+
1382
+ // src/prompt-registry.ts
1383
/**
 * In-memory catalogue of prompt handles (`{ id, version, hash, content }`),
 * keyed by the string produced by `makeKey(id, version)`.
 */
var PromptRegistry = class {
  /** Key → handle. */
  entries = /* @__PURE__ */ new Map();

  /**
   * Register a prompt. A given id+version is immutable: registering it again
   * with content that hashes differently throws, while registering content
   * with the same hash returns the handle already stored.
   */
  async register(id, version, content) {
    validateId(id);
    validateVersion(version);
    const key = makeKey(id, version);
    const hash = await hashContent(content);
    const prior = this.entries.get(key);
    if (!prior) {
      const created = { id, version, hash, content };
      this.entries.set(key, created);
      return created;
    }
    if (prior.hash === hash) return prior;
    throw new Error(
      `Prompt ${key} already registered with a different hash (${prior.hash} vs ${hash}). Bump the version.`
    );
  }

  /** Fetch a registered prompt; throws when the id+version is unknown. */
  get(id, version) {
    const key = makeKey(id, version);
    const found = this.entries.get(key);
    if (found) return found;
    throw new Error(`Prompt ${key} not registered`);
  }

  /** Every handle for `id`, ordered by descending `localeCompare` on version. */
  listVersions(id) {
    const sameId = [];
    for (const handle of this.entries.values()) {
      if (handle.id === id) sameId.push(handle);
    }
    return sameId.sort((left, right) => right.version.localeCompare(left.version));
  }

  /** All registered handles, in insertion order, as a fresh array. */
  list() {
    return Array.from(this.entries.values());
  }

  /** Compare `expectedHash` with the stored hash; `null` when nothing is registered. */
  verifyHash(id, version, expectedHash) {
    const found = this.entries.get(makeKey(id, version));
    return found ? found.hash === expectedHash : null;
  }
};
1431
+ async function hashContent(content) {
1432
+ const bytes = new TextEncoder().encode(content);
1433
+ const digest = await crypto.subtle.digest("SHA-256", bytes);
1434
+ const full = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
1435
+ return full.slice(0, 12);
1436
+ }
1437
/** Build the registry map key for a prompt: the id and version joined by "@". */
function makeKey(id, version) {
  const key = `${id}@${version}`;
  return key;
}
1440
/** Prompt ids start with a letter, then letters, digits, ".", "_" or "-" (case-insensitive). */
var ID_RE = /^[a-z][a-z0-9._-]*$/i;

/**
 * Reject prompt ids that do not match `ID_RE`.
 *
 * @param {string} id - Candidate prompt id.
 * @throws {Error} When the id is malformed.
 */
function validateId(id) {
  const wellFormed = ID_RE.test(id);
  if (wellFormed) return;
  throw new Error(`Invalid prompt id "${id}": must match ${ID_RE}`);
}
1446
/**
 * Reject empty versions and versions longer than 64 characters.
 *
 * @param {string} version - Candidate version label.
 * @throws {Error} When the version is falsy or too long.
 */
function validateVersion(version) {
  const acceptable = Boolean(version) && !(version.length > 64);
  if (acceptable) return;
  throw new Error(`Invalid version "${version}": must be 1\u201364 chars`);
}
1451
+
1452
+ // src/trace-store.ts
1453
/**
 * Trace store backed by a plain array. Traces are kept by reference in
 * insertion order.
 */
var MemoryTraceStore = class {
  traces = [];

  /** Append one trace. */
  async record(trace) {
    this.traces.push(trace);
  }

  /** Matching traces in insertion order, truncated to `query.limit` when set. */
  async query(query) {
    const hits = this.filter(query);
    if (query.limit === undefined) return hits;
    return hits.slice(0, query.limit);
  }

  /** Number of matching traces, or of all traces when no query is given. */
  async count(query) {
    if (!query) return this.traces.length;
    return this.filter(query).length;
  }

  /** Clear the store — test helper. */
  reset() {
    this.traces = [];
  }

  /**
   * Apply the query's equality filters (runId, scenarioId, role, model) and
   * the `sinceMs` lower bound. A trace whose timestamp does not parse is not
   * excluded by `sinceMs`.
   */
  filter(query) {
    const accepts = (t) => {
      if (query.runId && t.runId !== query.runId) return false;
      if (query.scenarioId && t.scenarioId !== query.scenarioId) return false;
      if (query.role && t.role !== query.role) return false;
      if (query.model && t.model !== query.model) return false;
      if (query.sinceMs === undefined) return true;
      const ts = Date.parse(t.timestamp);
      return !(Number.isFinite(ts) && ts < query.sinceMs);
    };
    const kept = [];
    for (const trace of this.traces) {
      if (accepts(trace)) kept.push(trace);
    }
    return kept;
  }
};
1484
/**
 * Trace store that appends one JSON document per line to `.ndjson` segment
 * files inside `opts.dir`. All file access goes through the `append`, `read`,
 * `list`, `stat` and `mkdir` functions in `opts`, each of which can be
 * overridden by the caller. A new segment is started once the latest one
 * reaches `opts.rolloverBytes` (32 MiB unless overridden).
 */
var FileSystemTraceStore = class {
  opts;

  constructor(opts) {
    const defaults = {
      rolloverBytes: 32 * 1024 * 1024,
      append: defaultAppend,
      read: defaultRead,
      list: defaultList,
      stat: defaultStat,
      mkdir: defaultMkdir
    };
    this.opts = { ...defaults, ...opts };
  }

  /** Serialize the trace and append it as one line to the current segment. */
  async record(trace) {
    const target = await this.currentSegment();
    await this.opts.append(target, JSON.stringify(trace) + "\n");
  }

  /**
   * Scan segments in sorted order and collect traces accepted by `matches`.
   * Stops as soon as `query.limit` results are gathered. Unreadable files are
   * treated as empty; lines that fail to parse or match are skipped.
   */
  async query(query) {
    const collected = [];
    for (const segment of await this.segments()) {
      const text = await this.opts.read(segment).catch(() => "");
      for (const line of text.split("\n")) {
        if (!line) continue;
        try {
          const trace = JSON.parse(line);
          if (matches(trace, query)) {
            collected.push(trace);
            if (query.limit !== undefined && collected.length >= query.limit) return collected;
          }
        } catch {
          // Best-effort read: a bad line is skipped.
        }
      }
    }
    return collected;
  }

  /**
   * Without a query, count the non-empty lines of every segment (lines are
   * not parsed). With a query, count the results of `query()`.
   */
  async count(query) {
    if (query) return (await this.query(query)).length;
    let total = 0;
    for (const segment of await this.segments()) {
      const text = await this.opts.read(segment).catch(() => "");
      total += text.split("\n").filter(Boolean).length;
    }
    return total;
  }

  /** Sorted list of `.ndjson` entries reported by `opts.list`; `[]` if listing fails. */
  async segments() {
    try {
      const listed = await this.opts.list(this.opts.dir);
      return listed.filter((entry) => entry.endsWith(".ndjson")).sort();
    } catch {
      return [];
    }
  }

  /**
   * Pick the segment the next record goes to: the first segment when none
   * exist, the latest one while it is under the rollover size (or cannot be
   * stat'ed), otherwise a new segment numbered by the current segment count.
   */
  async currentSegment() {
    await this.opts.mkdir(this.opts.dir);
    const known = await this.segments();
    if (known.length === 0) return pathJoin(this.opts.dir, `traces-000.ndjson`);
    const newest = known[known.length - 1];
    let size;
    try {
      ({ size } = await this.opts.stat(newest));
    } catch {
      return newest;
    }
    if (size < this.opts.rolloverBytes) return newest;
    const ordinal = String(known.length).padStart(3, "0");
    return pathJoin(this.opts.dir, `traces-${ordinal}.ndjson`);
  }
};
1554
/**
 * Decide whether a trace satisfies a query.
 *
 * Each of `runId`, `scenarioId`, `role` and `model` is an equality filter that
 * only applies when the query value is truthy. `sinceMs` excludes traces whose
 * parsed timestamp is earlier than the bound; traces with an unparseable
 * timestamp are not excluded by it.
 *
 * @param {object} t - Trace record.
 * @param {object} query - Filter description.
 * @returns {boolean} True when the trace passes every active filter.
 */
function matches(t, query) {
  for (const field of ["runId", "scenarioId", "role", "model"]) {
    const wanted = query[field];
    if (wanted && t[field] !== wanted) return false;
  }
  if (query.sinceMs === undefined) return true;
  const ts = Date.parse(t.timestamp);
  const tooOld = Number.isFinite(ts) && ts < query.sinceMs;
  return !tooOld;
}
1565
/** Join a directory and a file name with exactly one "/" between them. */
function pathJoin(dir, file) {
  const separator = dir.endsWith("/") ? "" : "/";
  return `${dir}${separator}${file}`;
}
1568
/** Default `append` implementation: append `data` to the file at `path` via fs/promises. */
async function defaultAppend(path, data) {
  const { appendFile } = await import("fs/promises");
  await appendFile(path, data);
}
1572
/** Default `read` implementation: return the file at `path` decoded as UTF-8 text. */
async function defaultRead(path) {
  const { readFile } = await import("fs/promises");
  return readFile(path, "utf8");
}
1576
/**
 * Default `list` implementation: the entries of `dir`, each joined onto `dir`
 * with `path.join`. Returns an empty array when the directory cannot be read.
 */
async function defaultList(dir) {
  const { readdir } = await import("fs/promises");
  const { join } = await import("path");
  let names;
  try {
    names = await readdir(dir);
  } catch {
    return [];
  }
  return names.map((name) => join(dir, name));
}
1586
/** Default `stat` implementation: report only the size of the file at `path`. */
async function defaultStat(path) {
  const { stat } = await import("fs/promises");
  const { size } = await stat(path);
  return { size };
}
1591
/** Default `mkdir` implementation: create `dir` and any missing parents. */
async function defaultMkdir(dir) {
  const { mkdir } = await import("fs/promises");
  await mkdir(dir, { recursive: true });
}
1595
+
1596
+ // src/anti-slop.ts
1597
/**
 * Hedging patterns used when a judge config supplies no `hedgingPatterns`.
 * All are case-insensitive and anchored on word boundaries.
 */
var DEFAULT_HEDGES = [
  // "I could be wrong"
  /\bi\s+could\s+be\s+wrong\b/i,
  // "I think maybe"
  /\bi\s+think\s+maybe\b/i,
  // "it might be that"
  /\bit\s+might\s+be\s+that\b/i,
  // "perhaps could" / "perhaps you could"
  /\bperhaps\s+(?:you\s+)?could\b/i
];
1603
/**
 * Apology patterns used when a judge config supplies no `apologyPatterns`.
 * All are case-insensitive.
 */
var DEFAULT_APOLOGIES = [
  // "I apologize for/if" (either spelling)
  /\bi\s+(?:apologize|apologise)\s+(?:for|if)\b/i,
  // "I'm sorry for/if/about", optionally intensified with "so" or "really"
  /\bi'?m\s+(?:so\s+|really\s+)?sorry\s+(?:for|if|about)\b/i,
  // "my apologies"
  /\bmy\s+apologies\b/i
];
1608
/**
 * Build a judge function that scores the agent responses of a conversation
 * with `analyzeAntiSlop`.
 *
 * Missing config fields fall back to built-in defaults; caller-supplied
 * `penaltyWeights` override the built-in weights key by key.
 *
 * @param {object} [config] - Partial judge configuration.
 * @returns {Function} Async judge `(testCase, input) => [result]`. The single
 *   result carries the score, a reasoning string built from at most five
 *   issues, and the first issue's example as evidence.
 */
function createAntiSlopJudge(config = {}) {
  const basePenalties = {
    banned_phrase: 1,
    banned_opening: 1,
    hedging: 0.5,
    apology: 0.5,
    repetition: 0.75,
    length: 0.5
  };
  const conf = {
    domain: config.domain ?? "general",
    bannedPhrases: config.bannedPhrases ?? [],
    bannedOpenings: config.bannedOpenings ?? [],
    hedgingPatterns: config.hedgingPatterns ?? DEFAULT_HEDGES,
    apologyPatterns: config.apologyPatterns ?? DEFAULT_APOLOGIES,
    repetitionThreshold: config.repetitionThreshold ?? 0.15,
    minLength: config.minLength ?? 20,
    maxLength: config.maxLength ?? 8e3,
    penaltyWeights: { ...basePenalties, ...config.penaltyWeights }
  };

  const describe = (issues) => {
    if (!issues.length) return "No slop patterns detected.";
    return issues
      .slice(0, 5)
      .map((issue) => `${issue.category}: ${issue.detail}`)
      .join("; ");
  };

  return async (_tc, input) => {
    // Turns without an agent response are passed on as empty strings.
    const outputs = input.turns.map((turn) => turn.agentResponse ?? "");
    const report = analyzeAntiSlop(outputs, conf);
    const verdict = {
      judgeName: `anti-slop(${conf.domain})`,
      dimension: "anti_slop",
      score: report.score,
      reasoning: describe(report.issues),
      evidence: report.issues[0]?.example
    };
    return [verdict];
  };
}
1643
/**
 * Scan agent outputs for "slop" and turn the findings into a 0–10 score.
 *
 * Per non-empty output the checks run in this order: banned phrases
 * (case-insensitive substring, every occurrence counted), banned openings
 * (regex test), hedging and apology patterns (every match counted),
 * duplicated sentences (only for outputs with at least 4 sentences), and
 * length bounds. The score is `10 - Σ count × weight`, clamped to [0, 10].
 *
 * @param {string[]} outputs - Agent responses; empty entries are skipped.
 * @param {object} config - Fully populated judge configuration
 *   (`bannedPhrases`, `bannedOpenings`, `hedgingPatterns`, `apologyPatterns`,
 *   `repetitionThreshold`, `minLength`, `maxLength`, `penaltyWeights`).
 * @returns {{ score: number, issues: object[], counts: object }}
 */
function analyzeAntiSlop(outputs, config) {
  const issues = [];
  const counts = {
    banned_phrase: 0,
    banned_opening: 0,
    hedging: 0,
    apology: 0,
    repetition: 0,
    length: 0
  };

  // Shared by the hedging and apology checks: count every match of every pattern.
  const tallyPatterns = (output, patterns, category) => {
    for (const re of patterns) {
      // Matching through a fresh global copy leaves the caller's regex untouched.
      const flags = re.flags.includes("g") ? re.flags : re.flags + "g";
      const hits = output.match(new RegExp(re, flags));
      if (!hits) continue;
      counts[category] += hits.length;
      issues.push({
        category,
        detail: `${hits.length}x ${re.source}`,
        example: hits[0]
      });
    }
  };

  for (const output of outputs) {
    if (!output) continue;
    const lower = output.toLowerCase();

    for (const phrase of config.bannedPhrases) {
      const needle = phrase.toLowerCase();
      // An empty needle matches at every index without advancing, which
      // would loop forever; it cannot name a banned phrase, so skip it.
      if (needle.length === 0) continue;
      let idx = 0;
      while ((idx = lower.indexOf(needle, idx)) !== -1) {
        counts.banned_phrase += 1;
        // Only the first 20 issues overall get a banned-phrase entry; counting continues.
        if (issues.length < 20) {
          issues.push({
            category: "banned_phrase",
            detail: `"${phrase}"`,
            example: snippet(output, idx, phrase.length)
          });
        }
        idx += needle.length;
      }
    }

    for (const re of config.bannedOpenings) {
      // A /g or /y regex remembers lastIndex from the previous output; start
      // every test from the beginning so consecutive outputs are all checked.
      re.lastIndex = 0;
      if (re.test(output)) {
        counts.banned_opening += 1;
        issues.push({ category: "banned_opening", detail: re.source, example: output.slice(0, 80) });
      }
    }

    tallyPatterns(output, config.hedgingPatterns, "hedging");
    tallyPatterns(output, config.apologyPatterns, "apology");

    const sentences = splitSentences(output);
    if (sentences.length >= 4) {
      const seen = /* @__PURE__ */ new Map();
      for (const s of sentences) {
        const key = normalizeForDupe(s);
        if (!key) continue;
        seen.set(key, (seen.get(key) ?? 0) + 1);
      }
      // Each repeat beyond the first occurrence counts as one duplicate.
      let dupes = 0;
      for (const n of seen.values()) if (n > 1) dupes += n - 1;
      const ratio = dupes / sentences.length;
      if (ratio > config.repetitionThreshold) {
        counts.repetition += 1;
        issues.push({
          category: "repetition",
          detail: `${(ratio * 100).toFixed(0)}% duplicated (threshold ${(config.repetitionThreshold * 100).toFixed(0)}%)`
        });
      }
    }

    if (output.length < config.minLength) {
      counts.length += 1;
      issues.push({ category: "length", detail: `too short (${output.length} < ${config.minLength})` });
    } else if (output.length > config.maxLength) {
      counts.length += 1;
      issues.push({ category: "length", detail: `too long (${output.length} > ${config.maxLength})` });
    }
  }

  let penalty = 0;
  for (const cat of Object.keys(counts)) {
    // Categories without a configured weight cost one point per hit.
    penalty += counts[cat] * (config.penaltyWeights[cat] ?? 1);
  }
  const score = Math.max(0, Math.min(10, 10 - penalty));
  return { score, issues, counts };
}
1733
/**
 * Cut an excerpt around a match: the `len` characters at `at` plus up to 24
 * characters of context on each side. An ellipsis marks each side that was
 * truncated.
 */
function snippet(source, at, len) {
  const context = 24;
  const from = Math.max(0, at - context);
  const to = Math.min(source.length, at + len + context);
  const leading = from > 0 ? "\u2026" : "";
  const trailing = to < source.length ? "\u2026" : "";
  return leading + source.slice(from, to) + trailing;
}
1739
/**
 * Split text on runs of ".", "!", "?" or newlines, returning the trimmed,
 * non-empty pieces.
 */
function splitSentences(text) {
  const sentences = [];
  for (const piece of text.split(/[.!?\n]+/)) {
    const trimmed = piece.trim();
    if (trimmed.length > 0) sentences.push(trimmed);
  }
  return sentences;
}
1742
/**
 * Canonical form of a sentence for duplicate detection: lowercased, whitespace
 * runs collapsed to single spaces, everything except a–z, 0–9 and spaces
 * removed, then trimmed.
 */
function normalizeForDupe(s) {
  const lowered = s.toLowerCase();
  const singleSpaced = lowered.replace(/\s+/g, " ");
  const alphanumeric = singleSpaced.replace(/[^a-z0-9 ]/g, "");
  return alphanumeric.trim();
}
1745
+
1746
+ // src/artifact-validator.ts
1747
/**
 * Combine several artifact validators into one.
 *
 * The composite passes only if every member passes. Its score is the weighted
 * mean of member scores (equal weights unless `options.weights` is given).
 * Issues are concatenated, with each issue's `locus` prefixed by the name of
 * the validator that raised it. `evidence` maps validator name to that
 * validator's evidence.
 *
 * @param {Array<{name: string, validate: Function}>} validators
 * @param {{ weights?: number[], name?: string }} [options]
 * @returns {{ name: string, validate: Function }}
 * @throws {Error} When `weights` and `validators` differ in length.
 */
function composeValidators(validators, options) {
  const weights = options?.weights ?? validators.map(() => 1);
  if (weights.length !== validators.length) {
    throw new Error("composeValidators: weights length mismatch");
  }
  // A zero total falls back to 1 so the division below stays finite.
  const totalWeight = weights.reduce((sum, w) => sum + w, 0) || 1;
  const name = options?.name ?? validators.map((v) => v.name).join("+");

  const tagIssue = (issue, owner) => ({
    ...issue,
    locus: issue.locus ? `${owner}:${issue.locus}` : owner
  });

  async function validate(artifact, ctx) {
    const results = await Promise.all(validators.map((v) => v.validate(artifact, ctx)));
    let weighted = 0;
    results.forEach((result, idx) => {
      weighted += result.score * weights[idx];
    });
    return {
      pass: results.every((result) => result.pass),
      score: weighted / totalWeight,
      issues: results.flatMap((result, idx) =>
        result.issues.map((issue) => tagIssue(issue, validators[idx].name))
      ),
      evidence: Object.fromEntries(results.map((result, idx) => [validators[idx].name, result.evidence]))
    };
  }

  return { name, validate };
}
1773
/**
 * Validator that passes when the artifact's text content matches `pattern`.
 * Missing content is treated as the empty string.
 *
 * @param {string} name - Validator name.
 * @param {RegExp} pattern - Pattern to test against `artifact.content`.
 * @returns {{ name: string, validate: Function }} Validator yielding
 *   score 1 on a match and score 0 with one error issue otherwise.
 */
function regexMatch(name, pattern) {
  return {
    name,
    async validate(artifact) {
      const body = artifact.content ?? "";
      // A /g or /y pattern resumes from lastIndex on the next test(), so a
      // reused validator would alternate between pass and fail. Always start
      // from the beginning of the content.
      pattern.lastIndex = 0;
      const ok = pattern.test(body);
      return {
        pass: ok,
        score: ok ? 1 : 0,
        issues: ok ? [] : [{ severity: "error", message: `Artifact content did not match ${pattern}` }]
      };
    }
  };
}
1787
/**
 * Validator that parses the artifact content as JSON and checks that every
 * dotted path in `requiredPaths` exists (via `pathExists`).
 *
 * Unparseable content fails with score 0. Otherwise the score is the fraction
 * of required paths that are present, with one error issue per missing path.
 *
 * @param {string} name - Validator name.
 * @param {string[]} requiredPaths - Dotted paths that must exist.
 * @returns {{ name: string, validate: Function }}
 */
function jsonHasKeys(name, requiredPaths) {
  const invalidJson = (err) => ({
    pass: false,
    score: 0,
    issues: [{ severity: "error", message: `Invalid JSON: ${err instanceof Error ? err.message : err}` }]
  });

  async function validate(artifact) {
    let document;
    try {
      document = JSON.parse(artifact.content ?? "");
    } catch (err) {
      return invalidJson(err);
    }
    const absent = requiredPaths.filter((path) => !pathExists(document, path));
    return {
      pass: absent.length === 0,
      score: 1 - absent.length / Math.max(1, requiredPaths.length),
      issues: absent.map((path) => ({ severity: "error", message: `Missing path: ${path}`, locus: path }))
    };
  }

  return { name, validate };
}
1815
/**
 * Validator that checks the artifact's size in bytes against `[min, max]`.
 *
 * The size comes from `artifact.bytes.byteLength` when present, otherwise from
 * the UTF-8 encoding of `artifact.content` (empty string if missing). Out of
 * range sizes get partial credit: `size / min` when too small, `max / size`
 * when too large.
 *
 * @param {string} name - Validator name.
 * @param {number} min - Smallest accepted size, inclusive.
 * @param {number} max - Largest accepted size, inclusive.
 * @returns {{ name: string, validate: Function }}
 */
function byteLengthRange(name, min, max) {
  const measure = (artifact) =>
    artifact.bytes?.byteLength ?? new TextEncoder().encode(artifact.content ?? "").byteLength;

  async function validate(artifact) {
    const size = measure(artifact);
    const pass = size >= min && size <= max;
    let score;
    if (pass) {
      score = 1;
    } else if (size < min) {
      score = Math.max(0, size / min);
    } else {
      score = Math.max(0, max / size);
    }
    const issues = pass ? [] : [{ severity: "error", message: `Size ${size} outside [${min}, ${max}]` }];
    return { pass, score, issues };
  }

  return { name, validate };
}
1830
/**
 * Validator that requires every string in `required` to occur in the artifact
 * content. Matching is case-insensitive unless `options.caseSensitive` is true.
 * The score is the fraction of required substrings found.
 *
 * @param {string} name - Validator name.
 * @param {string[]} required - Substrings that must all be present.
 * @param {{ caseSensitive?: boolean }} [options]
 * @returns {{ name: string, validate: Function }}
 */
function containsAll(name, required, options) {
  const caseSensitive = options?.caseSensitive ?? false;
  const fold = (text) => (caseSensitive ? text : text.toLowerCase());

  async function validate(artifact) {
    const haystack = fold(artifact.content ?? "");
    const absent = required.filter((needle) => !haystack.includes(fold(needle)));
    return {
      pass: absent.length === 0,
      score: 1 - absent.length / Math.max(1, required.length),
      issues: absent.map((needle) => ({ severity: "error", message: `Missing substring: ${needle}` }))
    };
  }

  return { name, validate };
}
1850
/**
 * Check whether a dotted path (e.g. `"items.0.id"`) resolves inside a parsed
 * JSON value.
 *
 * Purely numeric segments are used as array indices. Only the value's own
 * properties count: inherited members such as `constructor` or `toString`
 * are not treated as present. A property holding `null` exists, but cannot be
 * descended into; a property holding `undefined` is treated as missing.
 *
 * @param {unknown} obj - Root value.
 * @param {string} path - Dot-separated path.
 * @returns {boolean} True when every segment resolves.
 */
function pathExists(obj, path) {
  let current = obj;
  for (const part of path.split(".")) {
    if (current === null || typeof current !== "object") return false;
    const key = /^\d+$/.test(part) ? Number(part) : part;
    // Own-property check keeps Object.prototype members from satisfying a path.
    if (!Object.hasOwn(current, key)) return false;
    current = current[key];
    if (current === undefined) return false;
  }
  return true;
}
1861
+
1862
+ // src/workspace-inspector.ts
1863
/**
 * Workspace inspector that serves snapshots previously stored with `set`,
 * keyed by scope id.
 */
var InMemoryWorkspaceInspector = class {
  name = "in-memory";
  snapshots = /* @__PURE__ */ new Map();

  /** Store (or replace) the snapshot for a scope. */
  set(scopeId, snapshot) {
    this.snapshots.set(scopeId, snapshot);
  }

  /** Snapshot stored for `context.scopeId`, or a fresh empty snapshot when none is stored. */
  async snapshot(context) {
    const stored = this.snapshots.get(context.scopeId);
    return stored ?? { files: {}, rows: {}, kv: {} };
  }
};
1873
/**
 * Workspace assertion: a file is recorded at `path` in the snapshot.
 *
 * Only the snapshot's own entries count. The previous `in` check also matched
 * members inherited from `Object.prototype`, so a path such as
 * `"constructor"` passed on an empty workspace.
 *
 * @param {string} path - File path as keyed in `snapshot.files`.
 * @returns {{ name: string, check: Function }} Assertion yielding score 1 or 0.
 */
function fileExists(path) {
  return {
    name: `file_exists:${path}`,
    check(snapshot) {
      const pass = Object.hasOwn(snapshot.files, path);
      return {
        pass,
        score: pass ? 1 : 0,
        detail: pass ? void 0 : `No file at ${path}`
      };
    }
  };
}
1886
/**
 * Workspace assertion: the file at `path` exists and its content includes
 * `needle`.
 *
 * @param {string} path - File path as keyed in `snapshot.files`.
 * @param {string} needle - Substring that must be present.
 * @returns {{ name: string, check: Function }} Assertion yielding score 1 or 0.
 */
function fileContains(path, needle) {
  const check = (snapshot) => {
    const text = snapshot.files[path];
    if (text === undefined) {
      return { pass: false, score: 0, detail: `File ${path} missing` };
    }
    if (text.includes(needle)) {
      return { pass: true, score: 1, detail: undefined };
    }
    return { pass: false, score: 0, detail: `File ${path} missing substring "${needle}"` };
  };
  return { name: `file_contains:${path}:${needle}`, check };
}
1899
/**
 * Workspace assertion: the number of rows in `table` lies in `[min, max]`.
 * Omitting `max` leaves the range open-ended. A table absent from the snapshot
 * counts as having zero rows. Out-of-range counts get partial credit:
 * `count / min` when too few, `max / count` when too many.
 *
 * @param {string} table - Table name as keyed in `snapshot.rows`.
 * @param {number} min - Minimum row count, inclusive.
 * @param {number} [max] - Maximum row count, inclusive.
 * @returns {{ name: string, check: Function }}
 */
function rowCount(table, min, max) {
  const rangeLabel = `[${min}, ${max ?? "\u221E"}]`;
  const check = (snapshot) => {
    const count = (snapshot.rows[table] ?? []).length;
    const upper = max ?? Infinity;
    const pass = count >= min && count <= upper;
    let score;
    if (pass) {
      score = 1;
    } else if (count < min) {
      score = Math.max(0, count / min);
    } else {
      score = Math.max(0, upper / count);
    }
    const detail = pass ? undefined : `Table ${table} has ${count} rows, expected ${rangeLabel}`;
    return { pass, score, detail };
  };
  return { name: `row_count:${table}:[${min},${max ?? "\u221E"}]`, check };
}
1916
/**
 * Workspace assertion: at least `options.min` rows of `table` (default 1)
 * satisfy `predicate`. A table absent from the snapshot counts as empty. When
 * too few rows match, the score is `matching / min`.
 *
 * @param {string} table - Table name as keyed in `snapshot.rows`.
 * @param {Function} predicate - Row filter, invoked like an `Array.prototype.filter` callback.
 * @param {{ min?: number }} [options]
 * @returns {{ name: string, check: Function }}
 */
function rowWhere(table, predicate, options) {
  const min = options?.min ?? 1;
  const check = (snapshot) => {
    const candidates = snapshot.rows[table] ?? [];
    const matching = candidates.filter(predicate).length;
    if (matching >= min) {
      return { pass: true, score: 1, detail: undefined };
    }
    return {
      pass: false,
      score: Math.max(0, matching / min),
      detail: `Table ${table} has ${matching} matching rows, expected \u2265 ${min}`
    };
  };
  return { name: `row_where:${table}`, check };
}
1932
/**
 * Run every assertion against a snapshot.
 *
 * @param {object} snapshot - Workspace snapshot handed to each `check`.
 * @param {Array<{name: string, check: Function}>} assertions
 * @returns {{ pass: boolean, score: number, results: object[] }} `pass` is
 *   true only if every assertion passed; `score` is the mean assertion score
 *   (1 when there are no assertions).
 */
function runAssertions(snapshot, assertions) {
  const results = [];
  let pass = true;
  let scoreSum = 0;
  for (const assertion of assertions) {
    const result = assertion.check(snapshot);
    results.push({ assertion: assertion.name, result });
  }
  for (const { result } of results) {
    if (!result.pass) pass = false;
    scoreSum += result.score;
  }
  const score = results.length ? scoreSum / results.length : 1;
  return { pass, score, results };
}
1938
+
1939
+ // src/experiment-tracker.ts
1940
/**
 * Experiment store held entirely in memory. Experiments are shallow-copied on
 * save and on single lookup; runs are deep-copied with `structuredClone` on
 * every save and read.
 */
var InMemoryExperimentStore = class {
  experiments = /* @__PURE__ */ new Map();
  runs = /* @__PURE__ */ new Map();

  /** Store a shallow copy of the experiment under its id. */
  async saveExperiment(exp) {
    const copy = { ...exp };
    this.experiments.set(exp.id, copy);
  }

  /** Shallow copy of the stored experiment, or `null` when unknown. */
  async getExperiment(id) {
    const stored = this.experiments.get(id);
    if (!stored) return null;
    return { ...stored };
  }

  /** All stored experiments, newest `createdAt` first. */
  async listExperiments() {
    const all = Array.from(this.experiments.values());
    return all.sort((left, right) => right.createdAt.localeCompare(left.createdAt));
  }

  /** Store a deep copy of the run under its id. */
  async saveRun(run) {
    this.runs.set(run.id, structuredClone(run));
  }

  /** Deep copy of the stored run, or `null` when unknown. */
  async getRun(id) {
    const stored = this.runs.get(id);
    if (!stored) return null;
    return structuredClone(stored);
  }

  /** Deep copies of the runs belonging to an experiment, newest `startedAt` first. */
  async listRuns(experimentId) {
    const owned = [];
    for (const run of this.runs.values()) {
      if (run.experimentId === experimentId) owned.push(run);
    }
    owned.sort((left, right) => right.startedAt.localeCompare(left.startedAt));
    return owned.map((run) => structuredClone(run));
  }
};
1964
/**
 * Records experiments and their runs in a pluggable store and compares runs.
 * The store must provide `saveExperiment`, `getExperiment`, `saveRun`,
 * `getRun` and `listRuns`.
 */
var ExperimentTracker = class {
  store;

  constructor(store) {
    this.store = store;
  }

  /** Create and persist a new experiment with a random `exp_…` id. */
  async startExperiment(name, metadata) {
    const createdAt = (/* @__PURE__ */ new Date()).toISOString();
    const exp = { id: `exp_${rand(8)}`, name, createdAt, metadata };
    await this.store.saveExperiment(exp);
    return exp;
  }

  /** Create and persist a run in state "running"; throws if its experiment is unknown. */
  async startRun(config) {
    const { experimentId, name } = config;
    const owner = await this.store.getExperiment(experimentId);
    if (!owner) throw new Error(`Experiment ${config.experimentId} not found`);
    const run = {
      id: `run_${rand(10)}`,
      experimentId,
      name,
      config,
      startedAt: (/* @__PURE__ */ new Date()).toISOString(),
      status: "running"
    };
    await this.store.saveRun(run);
    return run;
  }

  /** Mark a run completed, stamp `completedAt` and attach its report. */
  async completeRun(runId, report) {
    const run = await this.store.getRun(runId);
    if (!run) throw new Error(`Run ${runId} not found`);
    Object.assign(run, {
      status: "completed",
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      report
    });
    await this.store.saveRun(run);
  }

  /** Mark a run failed, stamp `completedAt` and attach the error. */
  async failRun(runId, error) {
    const run = await this.store.getRun(runId);
    if (!run) throw new Error(`Run ${runId} not found`);
    Object.assign(run, {
      status: "failed",
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      error
    });
    await this.store.saveRun(run);
  }

  /**
   * Compare two runs that both carry reports (A is "before", B is "after").
   * Returns per-scenario score deltas sorted from largest gain to largest
   * loss (added/removed scenarios sort as delta 0), the change in overall
   * average, and every top-level config key whose JSON form differs.
   */
  async diff(runIdA, runIdB) {
    const [a, b] = await Promise.all([this.store.getRun(runIdA), this.store.getRun(runIdB)]);
    if (!a || !b) throw new Error("Both runs must exist");
    if (!a.report || !b.report) throw new Error("Both runs must be completed with reports");

    const scoresOf = (run) => new Map(run.report.results.map((r) => [r.scenarioId, r.overallScore]));
    const beforeScores = scoresOf(a);
    const afterScores = scoresOf(b);

    const trend = (from, to) => {
      if (to > from) return "improved";
      if (to < from) return "regressed";
      return "unchanged";
    };

    const scenarios = [];
    const everyScenario = /* @__PURE__ */ new Set([...beforeScores.keys(), ...afterScores.keys()]);
    for (const scenarioId of everyScenario) {
      const before = beforeScores.get(scenarioId);
      const after = afterScores.get(scenarioId);
      if (before === undefined) {
        scenarios.push({ scenarioId, before: null, after, delta: null, status: "added" });
        continue;
      }
      if (after === undefined) {
        scenarios.push({ scenarioId, before, after: null, delta: null, status: "removed" });
        continue;
      }
      scenarios.push({ scenarioId, before, after, delta: after - before, status: trend(before, after) });
    }
    scenarios.sort((x, y) => (y.delta ?? 0) - (x.delta ?? 0));

    const configChanges = {};
    const aCfg = a.config;
    const bCfg = b.config;
    for (const key of /* @__PURE__ */ new Set([...Object.keys(aCfg), ...Object.keys(bCfg)])) {
      const same = JSON.stringify(aCfg[key]) === JSON.stringify(bCfg[key]);
      if (!same) configChanges[key] = { before: aCfg[key], after: bCfg[key] };
    }

    return {
      before: { runId: runIdA, name: a.name, startedAt: a.startedAt },
      after: { runId: runIdB, name: b.name, startedAt: b.startedAt },
      aggregateDelta: b.report.summary.overallAvg - a.report.summary.overallAvg,
      scenarios,
      configChanges
    };
  }

  /** Runs of an experiment, oldest first, with each run's overall average (null if no report). */
  async timeline(experimentId) {
    const runs = await this.store.listRuns(experimentId);
    const chronological = [...runs].sort((a, b) => a.startedAt.localeCompare(b.startedAt));
    return chronological.map((run) => ({
      runId: run.id,
      startedAt: run.startedAt,
      overall: run.report?.summary.overallAvg ?? null
    }));
  }
};
2067
+ function rand(bytes) {
2068
+ const arr = new Uint8Array(bytes);
2069
+ crypto.getRandomValues(arr);
2070
+ return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
2071
+ }
2072
+
2073
+ // src/prompt-optimizer.ts
2074
/**
 * Compares prompt variants by scoring each one on every scenario several
 * times, then picks the variant with the highest mean score.
 */
var PromptOptimizer = class {
  /**
   * Run the comparison.
   *
   * Scoring is strictly sequential: variant by variant, scenario by scenario,
   * trial by trial (3 trials unless `trialsPerScenario` is set). Each
   * variant's pooled samples are summarised with `confidenceInterval`; each
   * pair of variants is compared with `mannWhitneyU` at `significanceLevel`
   * (default 0.05).
   *
   * @param {object} config - `variants`, `scenarioIds`, `scoreVariant`, and
   *   optionally `trialsPerScenario`, `significanceLevel`, `onScenarioComplete`.
   * @returns {Promise<object>} `{ winner, scores, pairwise, config }`.
   * @throws {Error} With fewer than 2 variants, no scenarios, or a non-finite score.
   */
  async run(config) {
    const trials = config.trialsPerScenario ?? 3;
    const alpha = config.significanceLevel ?? 0.05;
    if (config.variants.length < 2) {
      throw new Error("PromptOptimizer requires at least 2 variants");
    }
    if (config.scenarioIds.length === 0) {
      throw new Error("PromptOptimizer requires at least 1 scenario");
    }

    // variant id → (scenario id → raw samples)
    const rawScores = /* @__PURE__ */ new Map();
    for (const variant of config.variants) {
      const byScenario = /* @__PURE__ */ new Map();
      rawScores.set(variant.id, byScenario);
      for (const scenarioId of config.scenarioIds) {
        const samples = [];
        for (let trialIndex = 0; trialIndex < trials; trialIndex += 1) {
          const score = await config.scoreVariant({ variant, scenarioId, trialIndex });
          if (!Number.isFinite(score)) {
            throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${trialIndex}`);
          }
          samples.push(score);
        }
        byScenario.set(scenarioId, samples);
        config.onScenarioComplete?.({ variantId: variant.id, scenarioId, scores: samples });
      }
    }

    const summarize = (variant) => {
      const byScenario = rawScores.get(variant.id);
      const pooled = [];
      const perScenario = {};
      for (const scenarioId of config.scenarioIds) {
        const samples = byScenario.get(scenarioId) ?? [];
        pooled.push(...samples);
        let mean = 0;
        if (samples.length) {
          mean = samples.reduce((sum, value) => sum + value, 0) / samples.length;
        }
        perScenario[scenarioId] = { mean, n: samples.length, samples };
      }
      const ci = confidenceInterval(pooled, 0.95);
      return {
        variantId: variant.id,
        mean: ci.mean,
        ci95: { lower: ci.lower, upper: ci.upper },
        n: pooled.length,
        perScenario
      };
    };
    const scores = config.variants.map(summarize);

    // Every unordered pair once, in declaration order.
    const pairwise = [];
    scores.forEach((first, index) => {
      for (const second of scores.slice(index + 1)) {
        const { p } = mannWhitneyU(flatSamples(first), flatSamples(second));
        pairwise.push({
          variantA: first.variantId,
          variantB: second.variantId,
          pValue: p,
          significant: p < alpha,
          meanDelta: second.mean - first.mean
        });
      }
    });

    const [winner, runnerUp] = [...scores].sort((x, y) => y.mean - x.mean);
    const involvingWinner = pairwise.filter(
      (comparison) => comparison.variantA === winner.variantId || comparison.variantB === winner.variantId
    );

    return {
      winner: {
        variantId: winner.variantId,
        significant: involvingWinner.every((comparison) => comparison.significant),
        ciLowerBoundExceedsSecondMean: winner.ci95.lower > runnerUp.mean
      },
      scores,
      pairwise,
      config: {
        trialsPerScenario: trials,
        significanceLevel: alpha,
        variants: config.variants.map((v) => v.id),
        scenarios: config.scenarioIds
      }
    };
  }
};
2173
/**
 * Concatenates the samples of every scenario in a variant score into one
 * flat array, in `Object.values(score.perScenario)` order.
 *
 * @param {{perScenario: Object<string, {samples: Iterable<number>}>}} score
 * @returns {number[]} A new array; the input is not modified.
 */
function flatSamples(score) {
  const out = [];
  for (const entry of Object.values(score.perScenario)) {
    // Append element by element: `out.push(...samples)` passes every sample
    // as a call argument and can throw a RangeError on very large arrays.
    for (const sample of entry.samples) out.push(sample);
  }
  return out;
}
2178
+
2179
+ // src/dual-agent-bench.ts
2180
/**
 * Benchmarks a proposer/critic pair: for each scenario the proposer and the
 * critic alternate for up to `maxRounds` rounds, stopping early once the
 * critic's convergence score reaches the threshold.
 */
var DualAgentBench = class {
  /**
   * Runs every scenario sequentially and aggregates convergence statistics.
   *
   * @param {object} config
   * @param {Array<{id: string}>} config.scenarios - At least one scenario.
   * @param {Function} config.propose - Awaited each round with
   *   `{ scenario, roundIndex, priorProposal, priorCritique }` (both "prior"
   *   values are undefined in the first round); resolves to the proposal.
   * @param {Function} config.critique - Awaited each round with
   *   `{ scenario, roundIndex, proposal }`; resolves to
   *   `{ critique, convergenceScore }` with the score in [0,1].
   * @param {number} [config.maxRounds=5] - Maximum rounds per scenario; must be >= 1.
   * @param {number} [config.convergenceThreshold=0.85] - Score at or above which
   *   a scenario counts as converged; must be a number other than NaN.
   * @param {Function} [config.onRoundComplete] - Optional callback invoked with
   *   `{ scenarioId, round }` after each round.
   * @returns {Promise<object>} `{ scenarios, aggregate, config }`.
   * @throws {Error} On invalid configuration or an out-of-range convergence score.
   */
  async run(config) {
    const maxRounds = config.maxRounds ?? 5;
    const threshold = config.convergenceThreshold ?? 0.85;
    if (config.scenarios.length === 0) {
      throw new Error("DualAgentBench requires at least 1 scenario");
    }
    // With fewer than one round no proposal is ever made, yet the scenario
    // would still be reported (empty proposal, score 0) and skew the averages.
    if (typeof maxRounds !== "number" || !(maxRounds >= 1)) {
      throw new Error(`DualAgentBench requires maxRounds >= 1; got ${maxRounds}`);
    }
    // A NaN threshold makes `score >= threshold` always false, silently
    // preventing convergence.
    if (typeof threshold !== "number" || Number.isNaN(threshold)) {
      throw new Error(`DualAgentBench requires a numeric convergenceThreshold; got ${threshold}`);
    }

    const results = [];
    for (const scenario of config.scenarios) {
      const history = [];
      let converged = false;
      let roundsToConverge = null;
      let finalProposal = "";
      let lastScore = 0;
      let priorCritique;
      for (let r = 0; r < maxRounds; r++) {
        // Feed the previous round's proposal and critique back to the proposer.
        const priorProposal = history[history.length - 1]?.proposal;
        const proposal = await config.propose({
          scenario,
          roundIndex: r,
          priorProposal,
          priorCritique
        });
        const { critique, convergenceScore } = await config.critique({
          scenario,
          roundIndex: r,
          proposal
        });
        if (!Number.isFinite(convergenceScore) || convergenceScore < 0 || convergenceScore > 1) {
          throw new Error(
            `critique must return convergenceScore in [0,1]; got ${convergenceScore} for scenario ${scenario.id} round ${r}`
          );
        }
        const round = {
          roundIndex: r,
          proposal,
          critique,
          convergenceScore
        };
        history.push(round);
        config.onRoundComplete?.({ scenarioId: scenario.id, round });
        finalProposal = proposal;
        lastScore = convergenceScore;
        priorCritique = critique;
        if (convergenceScore >= threshold) {
          converged = true;
          roundsToConverge = r + 1; // 1-based count of rounds used
          break;
        }
      }
      results.push({
        scenarioId: scenario.id,
        converged,
        roundsToConverge,
        finalProposal,
        history,
        finalScore: lastScore
      });
    }

    // Average rounds is computed over converged scenarios only and is null
    // when none converged.
    const convergedResults = results.filter((r) => r.converged);
    const convergenceRate = results.length ? convergedResults.length / results.length : 0;
    const avgRoundsToConverge = convergedResults.length ? convergedResults.reduce((acc, r) => acc + (r.roundsToConverge ?? 0), 0) / convergedResults.length : null;
    const avgFinalScore = results.length ? results.reduce((acc, r) => acc + r.finalScore, 0) / results.length : 0;
    return {
      scenarios: results,
      aggregate: { convergenceRate, avgRoundsToConverge, avgFinalScore },
      config: { maxRounds, convergenceThreshold: threshold }
    };
  }
};
1264
2250
  export {
1265
2251
  AgentDriver,
1266
2252
  BenchmarkRunner,
1267
2253
  ConvergenceTracker,
2254
+ DualAgentBench,
2255
+ ExperimentTracker,
2256
+ FileSystemTraceStore,
2257
+ InMemoryExperimentStore,
2258
+ InMemoryWorkspaceInspector,
1268
2259
  MODEL_PRICING,
2260
+ MemoryTraceStore,
1269
2261
  MetricsCollector,
1270
2262
  ProductClient,
2263
+ PromptOptimizer,
2264
+ PromptRegistry,
1271
2265
  ScenarioRegistry,
1272
2266
  TokenCounter,
1273
2267
  adversarialJudge,
2268
+ analyzeAntiSlop,
2269
+ byteLengthRange,
1274
2270
  codeExecutionJudge,
2271
+ cohensD,
1275
2272
  coherenceJudge,
2273
+ composeValidators,
1276
2274
  confidenceInterval,
2275
+ containsAll,
2276
+ createAntiSlopJudge,
1277
2277
  createCustomJudge,
1278
2278
  createDomainExpertJudge,
1279
2279
  defaultJudges,
1280
2280
  estimateCost,
1281
2281
  estimateTokens,
1282
2282
  executeScenario,
2283
+ fileContains,
2284
+ fileExists,
1283
2285
  formatBenchmarkReport,
1284
2286
  formatDriverReport,
2287
+ hashContent,
1285
2288
  interRaterReliability,
2289
+ jsonHasKeys,
1286
2290
  mannWhitneyU,
1287
2291
  normalizeScores,
2292
+ pairedTTest,
1288
2293
  partialCredit,
1289
2294
  printDriverSummary,
2295
+ regexMatch,
2296
+ rowCount,
2297
+ rowWhere,
2298
+ runAssertions,
1290
2299
  runE2EWorkflow,
1291
- weightedMean
2300
+ weightedMean,
2301
+ wilcoxonSignedRank
1292
2302
  };
1293
2303
  //# sourceMappingURL=index.js.map