@tangle-network/agent-eval 0.67.0 → 0.68.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +15 -0
- package/dist/campaign/index.js +9 -9
- package/dist/{chunk-MZ2IYGGN.js → chunk-E24XD7A2.js} +4 -278
- package/dist/chunk-E24XD7A2.js.map +1 -0
- package/dist/{chunk-NV2PF37Q.js → chunk-JFGZPUMU.js} +277 -3
- package/dist/chunk-JFGZPUMU.js.map +1 -0
- package/dist/contract/index.js +6 -6
- package/dist/index.d.ts +113 -5
- package/dist/index.js +100 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/package.json +1 -1
- package/dist/chunk-MZ2IYGGN.js.map +0 -1
- package/dist/chunk-NV2PF37Q.js.map +0 -1
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
|
-
runCampaign
|
|
2
|
+
runCampaign,
|
|
3
|
+
summarizeBackendIntegrity
|
|
3
4
|
} from "./chunk-6XQIEUQ2.js";
|
|
4
5
|
import {
|
|
5
6
|
DEFAULT_REDACTION_RULES
|
|
@@ -1458,6 +1459,273 @@ ${fmt(winnerSurface)}`;
|
|
|
1458
1459
|
return lines.join("\n");
|
|
1459
1460
|
}
|
|
1460
1461
|
|
|
1462
|
+
// src/campaign/provenance.ts
|
|
1463
|
+
import { createHash as createHash2 } from "crypto";
|
|
1464
|
+
import { join as join2 } from "path";
|
|
1465
|
+
/**
 * Derive a stable content fingerprint for a surface.
 *
 * A string surface is hashed verbatim. Any other surface is first reduced to a
 * JSON document holding only `kind`, `worktreeRef` and `baseRef` (a missing
 * `baseRef` is serialized as `null`), and that JSON text is hashed.
 *
 * @param {string | {kind: *, worktreeRef: *, baseRef?: *}} surface - Surface to fingerprint.
 * @returns {string} `sha256:` followed by the hex-encoded SHA-256 digest.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const { kind, worktreeRef, baseRef } = surface;
    material = JSON.stringify({ kind, worktreeRef, baseRef: baseRef ?? null });
  }
  const digest = createHash2("sha256").update(material).digest("hex");
  return `sha256:${digest}`;
}
|
|
1473
|
+
/**
 * Average composite score of a holdout campaign.
 *
 * Each cell contributes the mean of its judges' `composite` values; the result
 * is the mean of those per-cell means. Cells with a truthy `error` and cells
 * with no judge scores contribute nothing.
 *
 * @param {{cells: Iterable<{error?: *, judgeScores: Object<string, {composite: number}>}>}} campaign
 * @returns {number} Mean of per-cell means, or 0 when no cell contributed.
 */
function meanHoldoutComposite(campaign) {
  const average = (values) => values.reduce((sum, v) => sum + v, 0) / values.length;
  const cellMeans = [];
  for (const cell of campaign.cells) {
    if (!cell.error) {
      const composites = Object.values(cell.judgeScores).map(({ composite }) => composite);
      if (composites.length > 0) {
        cellMeans.push(average(composites));
      }
    }
  }
  return cellMeans.length === 0 ? 0 : average(cellMeans);
}
|
|
1482
|
+
/**
 * Assemble the `tangle.loop-provenance.v1` record for one improvement-loop run.
 *
 * The record combines: run identity (`runId`, `runDir`, `timestamp`), content
 * hashes of the baseline and winner surfaces, the diff, one entry per candidate
 * of every generation, the gate outcome, holdout composites and their
 * difference (`heldOutLift`), a backend summary obtained from
 * `summarizeBackendIntegrity(args.workerRecords)`, and total cost/duration.
 *
 * @param {object} args - Run description; reads `workerRecords`, `generations`,
 *   `baselineOnHoldout`, `winnerOnHoldout`, `baselineSurface`, `winnerSurface`,
 *   `runId`, `runDir`, `timestamp`, `diff`, `gate`, `totalCostUsd`,
 *   `totalDurationMs` and the optional `winnerLabel` / `winnerRationale`.
 * @returns {object} The provenance record.
 */
function buildLoopProvenanceRecord(args) {
  const integrity = summarizeBackendIntegrity(args.workerRecords);
  // Distinct model names across all worker records, in default sort order.
  const models = Array.from(new Set(args.workerRecords.map((workerRecord) => workerRecord.model))).sort();

  const candidates = [];
  for (const gen of args.generations) {
    const promotedHashes = new Set(gen.promoted);
    const surfaces = new Map(gen.surfaces.map((item) => [item.surfaceHash, item.surface]));
    for (const candidate of gen.candidates) {
      const hash = candidate.surfaceHash;
      const surface = surfaces.get(hash);
      // When the generation carries no surface for this hash, the surface hash
      // itself is reused (with a `sha256:` prefix) as the content hash.
      const contentHash = surface === undefined ? `sha256:${hash}` : surfaceContentHash(surface);
      const entry = {
        generation: gen.generationIndex,
        surfaceHash: hash,
        contentHash,
        composite: candidate.composite,
        promoted: promotedHashes.has(hash),
      };
      // Optional fields are only attached when truthy.
      if (candidate.label) {
        entry.label = candidate.label;
      }
      if (candidate.rationale) {
        entry.rationale = candidate.rationale;
      }
      candidates.push(entry);
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);
  const baselineContentHash = surfaceContentHash(args.baselineSurface);
  const winnerContentHash = surfaceContentHash(args.winnerSurface);

  const gate = {
    decision: args.gate.decision,
    reasons: args.gate.reasons,
    delta: args.gate.delta,
    // Only the name and pass/fail flag of each contributing gate are kept.
    contributingGates: args.gate.contributingGates.map(({ name, passed }) => ({ name, passed })),
  };

  const backend = {
    verdict: integrity.verdict,
    workerCallCount: integrity.totalRecords,
    models,
    totalInputTokens: integrity.totalInputTokens,
    totalOutputTokens: integrity.totalOutputTokens,
    totalCostUsd: integrity.totalCostUsd,
  };

  const record = {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash,
    winnerContentHash,
    diff: args.diff,
    candidates,
    gate,
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend,
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs,
  };
  if (args.winnerLabel) {
    record.winnerLabel = args.winnerLabel;
  }
  if (args.winnerRationale) {
    record.winnerRationale = args.winnerRationale;
  }
  return record;
}
|
|
1541
|
+
// Gate decisions that gateStatus() reports with status code "OK"; every other
// decision is reported as "ERROR".
var DECISION_OK = ["ship"];
|
|
1542
|
+
/**
 * Hash a list of parts into a hex identifier.
 *
 * The parts are joined with ":" and the joined text is hashed with SHA-256.
 *
 * @param {Array<string>} parts - Components of the identifier.
 * @returns {string} Full hex-encoded SHA-256 digest (callers slice it as needed).
 */
function hashId(parts) {
  const hasher = createHash2("sha256");
  hasher.update(parts.join(":"));
  return hasher.digest("hex");
}
|
|
1545
|
+
/**
 * Translate a gate decision into a span status object.
 *
 * @param {string} decision - Gate decision.
 * @returns {{code: "OK"} | {code: "ERROR", message: string}} `OK` when the
 *   decision is listed in `DECISION_OK`; otherwise `ERROR` with a message
 *   naming the decision.
 */
function gateStatus(decision) {
  if (DECISION_OK.includes(decision)) {
    return { code: "OK" };
  }
  return { code: "ERROR", message: `gate decision: ${decision}` };
}
|
|
1548
|
+
/**
 * Project a loop-provenance record onto a flat list of trace spans.
 *
 * Emits, in order: one root `improvement-loop` span, then for every generation
 * (ascending) a `generation-N` span followed by one `candidate-<surfaceHash>`
 * span per candidate, and finally a zero-length `gate-decision` span placed at
 * the end of the run. All ids are derived by hashing the run id (plus
 * generation / surface hash), so the same record always yields the same ids.
 *
 * Timestamps are plain numbers in nanoseconds (milliseconds * 1e6).
 * NOTE(review): for wall-clock epochs these exceed Number.MAX_SAFE_INTEGER, so
 * sub-microsecond digits are not exact — confirm consumers tolerate that.
 *
 * @param {object} record - Record shaped like the output of buildLoopProvenanceRecord.
 * @param {{baseTimeMs?: number}} [opts] - `baseTimeMs` overrides the start time;
 *   otherwise `record.timestamp` is parsed, falling back to the current time.
 * @returns {Array<object>} Spans in emission order.
 */
function loopProvenanceSpans(record, opts = {}) {
  const { runId } = record;
  const traceId = hashId(["trace", runId]).slice(0, 32);
  const baseNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1e6;
  // Spans last at least 1 ms. A missing or non-finite duration falls back to
  // that minimum instead of turning every end timestamp into NaN.
  const durationMs = Number.isFinite(record.totalDurationMs) ? Math.max(1, record.totalDurationMs) : 1;
  const endNano = baseNano + durationMs * 1e6;

  const spans = [];
  const rootSpanId = hashId(["root", runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: rootSpanId,
    name: "improvement-loop",
    startTimeUnixNano: baseNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.runDir": record.runDir,
      "tangle.baselineContentHash": record.baselineContentHash,
      "tangle.winnerContentHash": record.winnerContentHash,
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.gateDecision": record.gate.decision,
      "tangle.backendVerdict": record.backend.verdict,
      "tangle.workerCallCount": record.backend.workerCallCount,
      "tangle.totalCostUsd": record.totalCostUsd
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });

  // Group candidates by generation, preserving their order within a generation.
  const byGen = new Map();
  for (const candidate of record.candidates) {
    const group = byGen.get(candidate.generation) ?? [];
    group.push(candidate);
    byGen.set(candidate.generation, group);
  }

  const orderedGenerations = [...byGen.entries()].sort((a, b) => a[0] - b[0]);
  for (const [generation, cands] of orderedGenerations) {
    const genSpanId = hashId(["gen", runId, String(generation)]).slice(0, 16);
    // Seed with -Infinity (not 0) so a generation whose composites are all
    // negative still reports its real maximum. Every group is non-empty.
    const bestComposite = cands.reduce((best, c) => Math.max(best, c.composite), -Infinity);
    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      startTimeUnixNano: baseNano,
      endTimeUnixNano: endNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.populationSize": cands.length,
        "tangle.bestComposite": bestComposite
      },
      "tangle.runId": runId,
      "tangle.generation": generation
    });

    for (const c of cands) {
      const candSpanId = hashId(["cand", runId, String(generation), c.surfaceHash]).slice(0, 16);
      const attributes = {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": c.surfaceHash,
        "tangle.contentHash": c.contentHash,
        "tangle.composite": c.composite,
        "tangle.promoted": c.promoted
      };
      if (c.label) attributes["tangle.candidateLabel"] = c.label;
      if (c.rationale) attributes["tangle.candidateRationale"] = c.rationale;
      spans.push({
        traceId,
        spanId: candSpanId,
        parentSpanId: genSpanId,
        name: `candidate-${c.surfaceHash}`,
        startTimeUnixNano: baseNano,
        endTimeUnixNano: endNano,
        attributes,
        "tangle.runId": runId,
        "tangle.generation": generation
      });
    }
  }

  const gateSpanId = hashId(["gate", runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: gateSpanId,
    parentSpanId: rootSpanId,
    name: "gate-decision",
    // Zero-length span: the gate is recorded at the end of the run.
    startTimeUnixNano: endNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.gateDecision": record.gate.decision,
      // The gate's own delta wins; held-out lift is the fallback when absent.
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });
  return spans;
}
|
|
1650
|
+
/**
 * Location of the provenance record file inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance.json`.
 */
function provenanceRecordPath(runDir) {
  const fileName = "loop-provenance.json";
  return join2(runDir, fileName);
}
|
|
1653
|
+
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance-spans.jsonl`.
 */
function provenanceSpansPath(runDir) {
  const fileName = "loop-provenance-spans.jsonl";
  return join2(runDir, fileName);
}
|
|
1656
|
+
/**
 * Summarize one holdout campaign as a snapshot.
 *
 * Every campaign cell becomes a score entry holding its scenario id, rep, the
 * mean of its judges' composites (0 when there are no judges) and each judge's
 * `dimensions`. Unlike meanHoldoutComposite, errored cells are kept: they carry
 * `errorMessage` and still count towards the snapshot-level mean.
 *
 * @param {number} index - Position of the snapshot in the run.
 * @param {string} surfaceHash2 - Hash stored as the snapshot's `surfaceHash`.
 * @param {*} surface - Surface stored on the snapshot as-is.
 * @param {object} campaign - Reads `cells`, `aggregates.totalCostUsd` and `durationMs`.
 * @returns {object} Snapshot with `index`, `surfaceHash`, `surface`, `cells`,
 *   `compositeMean`, `costUsd` and `durationMs`.
 */
function snapshotFromHoldout(index, surfaceHash2, surface, campaign) {
  const mean = (values) =>
    values.length === 0 ? 0 : values.reduce((sum, v) => sum + v, 0) / values.length;

  const cells = campaign.cells.map((cell) => {
    // A cell without judge scores (presumably one that errored before judging
    // — not confirmed from this file) is treated as having none rather than
    // throwing inside Object.values/Object.entries.
    const scoresByJudge = cell.judgeScores ?? {};
    const score = {
      scenarioId: cell.scenarioId,
      rep: cell.rep,
      compositeMean: mean(Object.values(scoresByJudge).map((judge) => judge.composite)),
      dimensions: Object.fromEntries(
        Object.entries(scoresByJudge).map(([name, judge]) => [name, judge.dimensions])
      )
    };
    if (cell.error) score.errorMessage = cell.error;
    return score;
  });

  return {
    index,
    surfaceHash: surfaceHash2,
    surface,
    cells,
    compositeMean: mean(cells.map((c) => c.compositeMean)),
    costUsd: campaign.aggregates.totalCostUsd,
    durationMs: campaign.durationMs
  };
}
|
|
1682
|
+
/**
 * Build the eval-run event for a finished run.
 *
 * The baseline holdout campaign becomes snapshot 0 and the winner holdout
 * campaign becomes the single generation snapshot (index 1); both are keyed by
 * the content hashes already computed on `record`.
 *
 * @param {object} args - Reads `runId`, `runDir`, `timestamp`, `baselineSurface`,
 *   `baselineOnHoldout`, `winnerSurface`, `winnerOnHoldout`, `gate.decision`,
 *   `totalCostUsd` and `totalDurationMs`.
 * @param {object} record - Provenance record; reads `baselineContentHash`,
 *   `winnerContentHash` and `heldOutLift`.
 * @returns {object} Event with status "finished" and empty `labels`.
 */
function buildEvalRunEvent(args, record) {
  const { runId, runDir, timestamp, totalCostUsd, totalDurationMs } = args;
  const baseline = snapshotFromHoldout(
    0,
    record.baselineContentHash,
    args.baselineSurface,
    args.baselineOnHoldout
  );
  const winner = snapshotFromHoldout(
    1,
    record.winnerContentHash,
    args.winnerSurface,
    args.winnerOnHoldout
  );
  return {
    runId,
    runDir,
    timestamp,
    status: "finished",
    labels: {},
    baseline,
    generations: [winner],
    gateDecision: args.gate.decision,
    holdoutLift: record.heldOutLift,
    totalCostUsd,
    totalDurationMs
  };
}
|
|
1704
|
+
/**
 * Build the loop-provenance record and spans, persist both under the run
 * directory, and optionally forward them to a hosted client.
 *
 * Persistence: the record is written as pretty-printed JSON and the spans as
 * one JSON document per line. The storage calls are awaited so that a
 * promise-returning storage adapter has finished (and any rejection has
 * surfaced) before this function resolves; awaiting is a no-op for a
 * synchronous adapter. NOTE(review): whether `args.storage` is sync or async
 * is not visible from this file.
 *
 * Hosted ingest is best-effort: a failure of either call is logged with
 * `console.warn` and never rejects the returned promise.
 *
 * @param {object} args - Arguments for buildLoopProvenanceRecord, plus
 *   `storage` (`ensureDir(dir)`, `write(path, text)`) and optional
 *   `hostedClient` (`ingestEvalRun(event)`, `ingestTraces(spans)`).
 * @returns {Promise<{record: object, spans: Array<object>, recordPath: string, spansPath: string}>}
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  await args.storage.ensureDir(args.runDir);
  const recordPath = provenanceRecordPath(args.runDir);
  const spansPath = provenanceSpansPath(args.runDir);
  await args.storage.write(recordPath, JSON.stringify(record, null, 2));
  await args.storage.write(spansPath, spans.map((s) => JSON.stringify(s)).join("\n"));

  if (args.hostedClient) {
    // Run one hosted call; downgrade any failure to a warning so local
    // provenance is still returned when the hosted side is unavailable.
    const bestEffort = async (what, send) => {
      try {
        await send();
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(`[agent-eval] ${what} failed (continuing): ${msg}`);
      }
    };
    await bestEffort("hosted eval-run ingest", () =>
      args.hostedClient.ingestEvalRun(buildEvalRunEvent(args, record))
    );
    await bestEffort("provenance span ingest", () => args.hostedClient.ingestTraces(spans));
  }

  return { record, spans, recordPath, spansPath };
}
|
|
1728
|
+
|
|
1461
1729
|
export {
|
|
1462
1730
|
dominates,
|
|
1463
1731
|
paretoFrontier,
|
|
@@ -1488,6 +1756,12 @@ export {
|
|
|
1488
1756
|
runOptimization,
|
|
1489
1757
|
surfaceHash,
|
|
1490
1758
|
runImprovementLoop,
|
|
1491
|
-
defaultRenderDiff
|
|
1759
|
+
defaultRenderDiff,
|
|
1760
|
+
surfaceContentHash,
|
|
1761
|
+
buildLoopProvenanceRecord,
|
|
1762
|
+
loopProvenanceSpans,
|
|
1763
|
+
provenanceRecordPath,
|
|
1764
|
+
provenanceSpansPath,
|
|
1765
|
+
emitLoopProvenance
|
|
1492
1766
|
};
|
|
1493
|
-
//# sourceMappingURL=chunk-
|
|
1767
|
+
//# sourceMappingURL=chunk-JFGZPUMU.js.map
|