@tangle-network/agent-eval 0.66.0 → 0.68.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/dist/campaign/index.d.ts +107 -4
- package/dist/campaign/index.js +17 -9
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-E24XD7A2.js +318 -0
- package/dist/chunk-E24XD7A2.js.map +1 -0
- package/dist/{chunk-Q56RRLEC.js → chunk-JFGZPUMU.js} +289 -5
- package/dist/chunk-JFGZPUMU.js.map +1 -0
- package/dist/contract/index.d.ts +4 -4
- package/dist/contract/index.js +6 -6
- package/dist/index.d.ts +120 -11
- package/dist/index.js +100 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{provenance-BZUFC1_D.d.ts → provenance-CChUqexv.d.ts} +23 -1
- package/dist/{registry-BzAEvqAt.d.ts → registry-BGKyX6bw.d.ts} +1 -1
- package/dist/release-report-CN8hJlhk.d.ts +233 -0
- package/dist/reporting.d.ts +4 -3
- package/dist/statistics-B7yCbi9i.d.ts +253 -0
- package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-Q56RRLEC.js.map +0 -1
- package/dist/chunk-RDK3P4JE.js +0 -482
- package/dist/chunk-RDK3P4JE.js.map +0 -1
- package/dist/release-report-DGoeObZT.d.ts +0 -484
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
|
-
runCampaign
|
|
2
|
+
runCampaign,
|
|
3
|
+
summarizeBackendIntegrity
|
|
3
4
|
} from "./chunk-6XQIEUQ2.js";
|
|
4
5
|
import {
|
|
5
6
|
DEFAULT_REDACTION_RULES
|
|
@@ -1362,6 +1363,7 @@ async function runImprovementLoop(opts) {
|
|
|
1362
1363
|
}
|
|
1363
1364
|
const dispatchTimeoutMs = opts.dispatchTimeoutMs ?? DEFAULT_DISPATCH_TIMEOUT_MS;
|
|
1364
1365
|
const optimization = await runOptimization({ ...opts, dispatchTimeoutMs });
|
|
1366
|
+
const winnerIsBaseline = optimization.winnerSurfaceHash === surfaceHash(opts.baselineSurface);
|
|
1365
1367
|
const { runCampaign: runCampaign2 } = await import("./run-campaign-BVY3RGAZ.js");
|
|
1366
1368
|
const baselineOnHoldout = await runCampaign2({
|
|
1367
1369
|
...opts,
|
|
@@ -1370,7 +1372,7 @@ async function runImprovementLoop(opts) {
|
|
|
1370
1372
|
dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),
|
|
1371
1373
|
runDir: `${opts.runDir}/holdout-baseline`
|
|
1372
1374
|
});
|
|
1373
|
-
const winnerOnHoldout = await runCampaign2({
|
|
1375
|
+
const winnerOnHoldout = winnerIsBaseline ? baselineOnHoldout : await runCampaign2({
|
|
1374
1376
|
...opts,
|
|
1375
1377
|
dispatchTimeoutMs,
|
|
1376
1378
|
scenarios: opts.holdoutScenarios,
|
|
@@ -1398,7 +1400,16 @@ async function runImprovementLoop(opts) {
|
|
|
1398
1400
|
baselineArtifacts.set(cell.cellId, cell.artifact);
|
|
1399
1401
|
baselineJudgeScores.set(cell.cellId, cell.judgeScores);
|
|
1400
1402
|
}
|
|
1401
|
-
const gateResult =
|
|
1403
|
+
const gateResult = winnerIsBaseline ? {
|
|
1404
|
+
decision: "hold",
|
|
1405
|
+
reasons: [
|
|
1406
|
+
"no candidate beat the training baseline \u2014 winner == baseline (empty diff); nothing to promote"
|
|
1407
|
+
],
|
|
1408
|
+
contributingGates: [
|
|
1409
|
+
{ name: "no-op-guard", passed: false, detail: { winnerIsBaseline: true } }
|
|
1410
|
+
],
|
|
1411
|
+
delta: 0
|
|
1412
|
+
} : await opts.gate.decide({
|
|
1402
1413
|
candidateArtifacts,
|
|
1403
1414
|
baselineArtifacts,
|
|
1404
1415
|
judgeScores,
|
|
@@ -1448,6 +1459,273 @@ ${fmt(winnerSurface)}`;
|
|
|
1448
1459
|
return lines.join("\n");
|
|
1449
1460
|
}
|
|
1450
1461
|
|
|
1462
|
+
// src/campaign/provenance.ts
|
|
1463
|
+
import { createHash as createHash2 } from "crypto";
|
|
1464
|
+
import { join as join2 } from "path";
|
|
1465
|
+
function surfaceContentHash(surface) {
|
|
1466
|
+
const material = typeof surface === "string" ? surface : JSON.stringify({
|
|
1467
|
+
kind: surface.kind,
|
|
1468
|
+
worktreeRef: surface.worktreeRef,
|
|
1469
|
+
baseRef: surface.baseRef ?? null
|
|
1470
|
+
});
|
|
1471
|
+
return `sha256:${createHash2("sha256").update(material).digest("hex")}`;
|
|
1472
|
+
}
|
|
1473
|
+
function meanHoldoutComposite(campaign) {
|
|
1474
|
+
const xs = [];
|
|
1475
|
+
for (const cell of campaign.cells) {
|
|
1476
|
+
if (cell.error) continue;
|
|
1477
|
+
const cs = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
1478
|
+
if (cs.length) xs.push(cs.reduce((a, b) => a + b, 0) / cs.length);
|
|
1479
|
+
}
|
|
1480
|
+
return xs.length ? xs.reduce((a, b) => a + b, 0) / xs.length : 0;
|
|
1481
|
+
}
|
|
1482
|
+
function buildLoopProvenanceRecord(args) {
|
|
1483
|
+
const integrity = summarizeBackendIntegrity(args.workerRecords);
|
|
1484
|
+
const models = [...new Set(args.workerRecords.map((r) => r.model))].sort();
|
|
1485
|
+
const candidates = [];
|
|
1486
|
+
for (const gen of args.generations) {
|
|
1487
|
+
const promotedSet = new Set(gen.promoted);
|
|
1488
|
+
const surfaceByHash = new Map(gen.surfaces.map((s) => [s.surfaceHash, s.surface]));
|
|
1489
|
+
for (const c of gen.candidates) {
|
|
1490
|
+
const surface = surfaceByHash.get(c.surfaceHash);
|
|
1491
|
+
const entry = {
|
|
1492
|
+
generation: gen.generationIndex,
|
|
1493
|
+
surfaceHash: c.surfaceHash,
|
|
1494
|
+
contentHash: surface !== void 0 ? surfaceContentHash(surface) : `sha256:${c.surfaceHash}`,
|
|
1495
|
+
composite: c.composite,
|
|
1496
|
+
promoted: promotedSet.has(c.surfaceHash)
|
|
1497
|
+
};
|
|
1498
|
+
if (c.label) entry.label = c.label;
|
|
1499
|
+
if (c.rationale) entry.rationale = c.rationale;
|
|
1500
|
+
candidates.push(entry);
|
|
1501
|
+
}
|
|
1502
|
+
}
|
|
1503
|
+
const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
|
|
1504
|
+
const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);
|
|
1505
|
+
const record = {
|
|
1506
|
+
schema: "tangle.loop-provenance.v1",
|
|
1507
|
+
runId: args.runId,
|
|
1508
|
+
runDir: args.runDir,
|
|
1509
|
+
timestamp: args.timestamp,
|
|
1510
|
+
baselineContentHash: surfaceContentHash(args.baselineSurface),
|
|
1511
|
+
winnerContentHash: surfaceContentHash(args.winnerSurface),
|
|
1512
|
+
diff: args.diff,
|
|
1513
|
+
candidates,
|
|
1514
|
+
gate: {
|
|
1515
|
+
decision: args.gate.decision,
|
|
1516
|
+
reasons: args.gate.reasons,
|
|
1517
|
+
delta: args.gate.delta,
|
|
1518
|
+
contributingGates: args.gate.contributingGates.map((g) => ({
|
|
1519
|
+
name: g.name,
|
|
1520
|
+
passed: g.passed
|
|
1521
|
+
}))
|
|
1522
|
+
},
|
|
1523
|
+
baselineHoldoutComposite,
|
|
1524
|
+
winnerHoldoutComposite,
|
|
1525
|
+
heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
|
|
1526
|
+
backend: {
|
|
1527
|
+
verdict: integrity.verdict,
|
|
1528
|
+
workerCallCount: integrity.totalRecords,
|
|
1529
|
+
models,
|
|
1530
|
+
totalInputTokens: integrity.totalInputTokens,
|
|
1531
|
+
totalOutputTokens: integrity.totalOutputTokens,
|
|
1532
|
+
totalCostUsd: integrity.totalCostUsd
|
|
1533
|
+
},
|
|
1534
|
+
totalCostUsd: args.totalCostUsd,
|
|
1535
|
+
totalDurationMs: args.totalDurationMs
|
|
1536
|
+
};
|
|
1537
|
+
if (args.winnerLabel) record.winnerLabel = args.winnerLabel;
|
|
1538
|
+
if (args.winnerRationale) record.winnerRationale = args.winnerRationale;
|
|
1539
|
+
return record;
|
|
1540
|
+
}
|
|
1541
|
+
var DECISION_OK = ["ship"];
|
|
1542
|
+
function hashId(parts) {
|
|
1543
|
+
return createHash2("sha256").update(parts.join(":")).digest("hex");
|
|
1544
|
+
}
|
|
1545
|
+
function gateStatus(decision) {
|
|
1546
|
+
return DECISION_OK.includes(decision) ? { code: "OK" } : { code: "ERROR", message: `gate decision: ${decision}` };
|
|
1547
|
+
}
|
|
1548
|
+
function loopProvenanceSpans(record, opts = {}) {
|
|
1549
|
+
const traceId = hashId(["trace", record.runId]).slice(0, 32);
|
|
1550
|
+
const baseNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1e6;
|
|
1551
|
+
const endNano = baseNano + Math.max(1, record.totalDurationMs) * 1e6;
|
|
1552
|
+
const spans = [];
|
|
1553
|
+
const rootSpanId = hashId(["root", record.runId]).slice(0, 16);
|
|
1554
|
+
spans.push({
|
|
1555
|
+
traceId,
|
|
1556
|
+
spanId: rootSpanId,
|
|
1557
|
+
name: "improvement-loop",
|
|
1558
|
+
startTimeUnixNano: baseNano,
|
|
1559
|
+
endTimeUnixNano: endNano,
|
|
1560
|
+
attributes: {
|
|
1561
|
+
"tangle.runId": record.runId,
|
|
1562
|
+
"tangle.runDir": record.runDir,
|
|
1563
|
+
"tangle.baselineContentHash": record.baselineContentHash,
|
|
1564
|
+
"tangle.winnerContentHash": record.winnerContentHash,
|
|
1565
|
+
"tangle.heldOutLift": record.heldOutLift,
|
|
1566
|
+
"tangle.gateDecision": record.gate.decision,
|
|
1567
|
+
"tangle.backendVerdict": record.backend.verdict,
|
|
1568
|
+
"tangle.workerCallCount": record.backend.workerCallCount,
|
|
1569
|
+
"tangle.totalCostUsd": record.totalCostUsd
|
|
1570
|
+
},
|
|
1571
|
+
status: gateStatus(record.gate.decision),
|
|
1572
|
+
"tangle.runId": record.runId
|
|
1573
|
+
});
|
|
1574
|
+
const byGen = /* @__PURE__ */ new Map();
|
|
1575
|
+
for (const c of record.candidates) {
|
|
1576
|
+
const arr = byGen.get(c.generation) ?? [];
|
|
1577
|
+
arr.push(c);
|
|
1578
|
+
byGen.set(c.generation, arr);
|
|
1579
|
+
}
|
|
1580
|
+
for (const [generation, cands] of [...byGen.entries()].sort((a, b) => a[0] - b[0])) {
|
|
1581
|
+
const genSpanId = hashId(["gen", record.runId, String(generation)]).slice(0, 16);
|
|
1582
|
+
const bestComposite = cands.reduce((m, c) => Math.max(m, c.composite), 0);
|
|
1583
|
+
spans.push({
|
|
1584
|
+
traceId,
|
|
1585
|
+
spanId: genSpanId,
|
|
1586
|
+
parentSpanId: rootSpanId,
|
|
1587
|
+
name: `generation-${generation}`,
|
|
1588
|
+
startTimeUnixNano: baseNano,
|
|
1589
|
+
endTimeUnixNano: endNano,
|
|
1590
|
+
attributes: {
|
|
1591
|
+
"tangle.runId": record.runId,
|
|
1592
|
+
"tangle.generation": generation,
|
|
1593
|
+
"tangle.populationSize": cands.length,
|
|
1594
|
+
"tangle.bestComposite": bestComposite
|
|
1595
|
+
},
|
|
1596
|
+
"tangle.runId": record.runId,
|
|
1597
|
+
"tangle.generation": generation
|
|
1598
|
+
});
|
|
1599
|
+
for (let i = 0; i < cands.length; i++) {
|
|
1600
|
+
const c = cands[i];
|
|
1601
|
+
const candSpanId = hashId(["cand", record.runId, String(generation), c.surfaceHash]).slice(
|
|
1602
|
+
0,
|
|
1603
|
+
16
|
|
1604
|
+
);
|
|
1605
|
+
const attributes = {
|
|
1606
|
+
"tangle.runId": record.runId,
|
|
1607
|
+
"tangle.generation": generation,
|
|
1608
|
+
"tangle.surfaceHash": c.surfaceHash,
|
|
1609
|
+
"tangle.contentHash": c.contentHash,
|
|
1610
|
+
"tangle.composite": c.composite,
|
|
1611
|
+
"tangle.promoted": c.promoted
|
|
1612
|
+
};
|
|
1613
|
+
if (c.label) attributes["tangle.candidateLabel"] = c.label;
|
|
1614
|
+
if (c.rationale) attributes["tangle.candidateRationale"] = c.rationale;
|
|
1615
|
+
spans.push({
|
|
1616
|
+
traceId,
|
|
1617
|
+
spanId: candSpanId,
|
|
1618
|
+
parentSpanId: genSpanId,
|
|
1619
|
+
name: `candidate-${c.surfaceHash}`,
|
|
1620
|
+
startTimeUnixNano: baseNano,
|
|
1621
|
+
endTimeUnixNano: endNano,
|
|
1622
|
+
attributes,
|
|
1623
|
+
"tangle.runId": record.runId,
|
|
1624
|
+
"tangle.generation": generation
|
|
1625
|
+
});
|
|
1626
|
+
}
|
|
1627
|
+
}
|
|
1628
|
+
const gateSpanId = hashId(["gate", record.runId]).slice(0, 16);
|
|
1629
|
+
spans.push({
|
|
1630
|
+
traceId,
|
|
1631
|
+
spanId: gateSpanId,
|
|
1632
|
+
parentSpanId: rootSpanId,
|
|
1633
|
+
name: "gate-decision",
|
|
1634
|
+
startTimeUnixNano: endNano,
|
|
1635
|
+
endTimeUnixNano: endNano,
|
|
1636
|
+
attributes: {
|
|
1637
|
+
"tangle.runId": record.runId,
|
|
1638
|
+
"tangle.gateDecision": record.gate.decision,
|
|
1639
|
+
"tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
|
|
1640
|
+
"tangle.gateReasons": JSON.stringify(record.gate.reasons),
|
|
1641
|
+
"tangle.heldOutLift": record.heldOutLift,
|
|
1642
|
+
"tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
|
|
1643
|
+
"tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
|
|
1644
|
+
},
|
|
1645
|
+
status: gateStatus(record.gate.decision),
|
|
1646
|
+
"tangle.runId": record.runId
|
|
1647
|
+
});
|
|
1648
|
+
return spans;
|
|
1649
|
+
}
|
|
1650
|
+
function provenanceRecordPath(runDir) {
|
|
1651
|
+
return join2(runDir, "loop-provenance.json");
|
|
1652
|
+
}
|
|
1653
|
+
function provenanceSpansPath(runDir) {
|
|
1654
|
+
return join2(runDir, "loop-provenance-spans.jsonl");
|
|
1655
|
+
}
|
|
1656
|
+
function snapshotFromHoldout(index, surfaceHash2, surface, campaign) {
|
|
1657
|
+
const cells = campaign.cells.map((cell) => {
|
|
1658
|
+
const judgeScores = Object.values(cell.judgeScores);
|
|
1659
|
+
const composite = judgeScores.length === 0 ? 0 : judgeScores.reduce((s, j) => s + j.composite, 0) / judgeScores.length;
|
|
1660
|
+
const score = {
|
|
1661
|
+
scenarioId: cell.scenarioId,
|
|
1662
|
+
rep: cell.rep,
|
|
1663
|
+
compositeMean: composite,
|
|
1664
|
+
dimensions: Object.fromEntries(
|
|
1665
|
+
Object.entries(cell.judgeScores).map(([name, s]) => [name, s.dimensions])
|
|
1666
|
+
)
|
|
1667
|
+
};
|
|
1668
|
+
if (cell.error) score.errorMessage = cell.error;
|
|
1669
|
+
return score;
|
|
1670
|
+
});
|
|
1671
|
+
const compositeMean = cells.length === 0 ? 0 : cells.reduce((s, c) => s + c.compositeMean, 0) / cells.length;
|
|
1672
|
+
return {
|
|
1673
|
+
index,
|
|
1674
|
+
surfaceHash: surfaceHash2,
|
|
1675
|
+
surface,
|
|
1676
|
+
cells,
|
|
1677
|
+
compositeMean,
|
|
1678
|
+
costUsd: campaign.aggregates.totalCostUsd,
|
|
1679
|
+
durationMs: campaign.durationMs
|
|
1680
|
+
};
|
|
1681
|
+
}
|
|
1682
|
+
function buildEvalRunEvent(args, record) {
|
|
1683
|
+
return {
|
|
1684
|
+
runId: args.runId,
|
|
1685
|
+
runDir: args.runDir,
|
|
1686
|
+
timestamp: args.timestamp,
|
|
1687
|
+
status: "finished",
|
|
1688
|
+
labels: {},
|
|
1689
|
+
baseline: snapshotFromHoldout(
|
|
1690
|
+
0,
|
|
1691
|
+
record.baselineContentHash,
|
|
1692
|
+
args.baselineSurface,
|
|
1693
|
+
args.baselineOnHoldout
|
|
1694
|
+
),
|
|
1695
|
+
generations: [
|
|
1696
|
+
snapshotFromHoldout(1, record.winnerContentHash, args.winnerSurface, args.winnerOnHoldout)
|
|
1697
|
+
],
|
|
1698
|
+
gateDecision: args.gate.decision,
|
|
1699
|
+
holdoutLift: record.heldOutLift,
|
|
1700
|
+
totalCostUsd: args.totalCostUsd,
|
|
1701
|
+
totalDurationMs: args.totalDurationMs
|
|
1702
|
+
};
|
|
1703
|
+
}
|
|
1704
|
+
async function emitLoopProvenance(args) {
|
|
1705
|
+
const record = buildLoopProvenanceRecord(args);
|
|
1706
|
+
const spans = loopProvenanceSpans(record);
|
|
1707
|
+
args.storage.ensureDir(args.runDir);
|
|
1708
|
+
const recordPath = provenanceRecordPath(args.runDir);
|
|
1709
|
+
const spansPath = provenanceSpansPath(args.runDir);
|
|
1710
|
+
args.storage.write(recordPath, JSON.stringify(record, null, 2));
|
|
1711
|
+
args.storage.write(spansPath, spans.map((s) => JSON.stringify(s)).join("\n"));
|
|
1712
|
+
if (args.hostedClient) {
|
|
1713
|
+
try {
|
|
1714
|
+
await args.hostedClient.ingestEvalRun(buildEvalRunEvent(args, record));
|
|
1715
|
+
} catch (err) {
|
|
1716
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1717
|
+
console.warn(`[agent-eval] hosted eval-run ingest failed (continuing): ${msg}`);
|
|
1718
|
+
}
|
|
1719
|
+
try {
|
|
1720
|
+
await args.hostedClient.ingestTraces(spans);
|
|
1721
|
+
} catch (err) {
|
|
1722
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
1723
|
+
console.warn(`[agent-eval] provenance span ingest failed (continuing): ${msg}`);
|
|
1724
|
+
}
|
|
1725
|
+
}
|
|
1726
|
+
return { record, spans, recordPath, spansPath };
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1451
1729
|
export {
|
|
1452
1730
|
dominates,
|
|
1453
1731
|
paretoFrontier,
|
|
@@ -1478,6 +1756,12 @@ export {
|
|
|
1478
1756
|
runOptimization,
|
|
1479
1757
|
surfaceHash,
|
|
1480
1758
|
runImprovementLoop,
|
|
1481
|
-
defaultRenderDiff
|
|
1759
|
+
defaultRenderDiff,
|
|
1760
|
+
surfaceContentHash,
|
|
1761
|
+
buildLoopProvenanceRecord,
|
|
1762
|
+
loopProvenanceSpans,
|
|
1763
|
+
provenanceRecordPath,
|
|
1764
|
+
provenanceSpansPath,
|
|
1765
|
+
emitLoopProvenance
|
|
1482
1766
|
};
|
|
1483
|
-
//# sourceMappingURL=chunk-
|
|
1767
|
+
//# sourceMappingURL=chunk-JFGZPUMU.js.map
|