@tangle-network/agent-eval 0.67.0 → 0.68.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import {
2
- runCampaign
2
+ runCampaign,
3
+ summarizeBackendIntegrity
3
4
  } from "./chunk-6XQIEUQ2.js";
4
5
  import {
5
6
  DEFAULT_REDACTION_RULES
@@ -1458,6 +1459,273 @@ ${fmt(winnerSurface)}`;
1458
1459
  return lines.join("\n");
1459
1460
  }
1460
1461
 
1462
+ // src/campaign/provenance.ts
1463
+ import { createHash as createHash2 } from "crypto";
1464
+ import { join as join2 } from "path";
1465
/**
 * Compute a content digest for a surface.
 *
 * A string surface is hashed verbatim. Any other surface is reduced to its
 * `kind`, `worktreeRef` and `baseRef` fields (a nullish `baseRef` becomes
 * `null`), serialised with `JSON.stringify`, and that text is hashed.
 *
 * @param {string | {kind: *, worktreeRef: *, baseRef?: *}} surface
 * @returns {string} `sha256:` followed by the hex-encoded SHA-256 digest.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    // The key order below is part of the hashed text, so it must stay fixed.
    const identity = {
      kind: surface.kind,
      worktreeRef: surface.worktreeRef,
      baseRef: surface.baseRef ?? null
    };
    material = JSON.stringify(identity);
  }
  const digest = createHash2("sha256").update(material).digest("hex");
  return `sha256:${digest}`;
}
1473
/**
 * Average the per-cell composite score of a holdout campaign.
 *
 * Each cell contributes the mean of its judges' `composite` values. Cells
 * with a truthy `error`, and cells with no judge scores, are left out.
 *
 * @param {{cells: Array<{error?: *, judgeScores: Object<string, {composite: number}>}>}} campaign
 * @returns {number} Mean of the per-cell means, or 0 when no cell qualifies.
 */
function meanHoldoutComposite(campaign) {
  const average = (values) => values.reduce((sum, value) => sum + value, 0) / values.length;

  const perCellMeans = campaign.cells
    .filter((cell) => !cell.error)
    .map((cell) => Object.values(cell.judgeScores).map((score) => score.composite))
    .filter((composites) => composites.length)
    .map((composites) => average(composites));

  if (!perCellMeans.length) {
    return 0;
  }
  return average(perCellMeans);
}
1482
/**
 * Assemble the `tangle.loop-provenance.v1` record for one improvement-loop run.
 *
 * Reads from `args`: `runId`, `runDir`, `timestamp`, `diff`, `workerRecords`,
 * `generations`, `baselineSurface`, `winnerSurface`, `baselineOnHoldout`,
 * `winnerOnHoldout`, `gate`, `totalCostUsd`, `totalDurationMs`, and the
 * optional `winnerLabel` / `winnerRationale`.
 *
 * @param {object} args Loop outputs listed above.
 * @returns {object} The provenance record. `winnerLabel` and `winnerRationale`
 *   (and each candidate's `label` / `rationale`) are only present when truthy.
 */
function buildLoopProvenanceRecord(args) {
  const integrity = summarizeBackendIntegrity(args.workerRecords);

  // Distinct model names seen across worker records, in default sort order.
  const distinctModels = new Set(args.workerRecords.map((workerRecord) => workerRecord.model));
  const models = Array.from(distinctModels).sort();

  const candidates = [];
  for (const generation of args.generations) {
    const promotedHashes = new Set(generation.promoted);
    const surfacesByHash = new Map(
      generation.surfaces.map((item) => [item.surfaceHash, item.surface])
    );

    for (const candidate of generation.candidates) {
      const { surfaceHash } = candidate;
      const surface = surfacesByHash.get(surfaceHash);

      // When the generation holds no surface for this hash, the content hash
      // is derived from the surface hash itself.
      let contentHash;
      if (surface === undefined) {
        contentHash = `sha256:${surfaceHash}`;
      } else {
        contentHash = surfaceContentHash(surface);
      }

      candidates.push({
        generation: generation.generationIndex,
        surfaceHash,
        contentHash,
        composite: candidate.composite,
        promoted: promotedHashes.has(surfaceHash),
        ...(candidate.label ? { label: candidate.label } : {}),
        ...(candidate.rationale ? { rationale: candidate.rationale } : {})
      });
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);
  const { gate } = args;

  return {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash: surfaceContentHash(args.baselineSurface),
    winnerContentHash: surfaceContentHash(args.winnerSurface),
    diff: args.diff,
    candidates,
    gate: {
      decision: gate.decision,
      reasons: gate.reasons,
      delta: gate.delta,
      // Only the name and pass/fail flag of each contributing gate are kept.
      contributingGates: gate.contributingGates.map(({ name, passed }) => ({ name, passed }))
    },
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend: {
      verdict: integrity.verdict,
      workerCallCount: integrity.totalRecords,
      models,
      totalInputTokens: integrity.totalInputTokens,
      totalOutputTokens: integrity.totalOutputTokens,
      totalCostUsd: integrity.totalCostUsd
    },
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs,
    ...(args.winnerLabel ? { winnerLabel: args.winnerLabel } : {}),
    ...(args.winnerRationale ? { winnerRationale: args.winnerRationale } : {})
  };
}
1541
// Gate decisions that map to an "OK" span status; every other decision is an error.
var DECISION_OK = ["ship"];

/**
 * Hash a list of parts into a hex SHA-256 digest.
 *
 * @param {string[]} parts Joined with ":" before hashing.
 * @returns {string} Hex-encoded SHA-256 digest of the joined text.
 */
function hashId(parts) {
  const hasher = createHash2("sha256");
  hasher.update(parts.join(":"));
  return hasher.digest("hex");
}

/**
 * Translate a gate decision into a span status object.
 *
 * @param {string} decision The gate decision.
 * @returns {{code: "OK"} | {code: "ERROR", message: string}} `OK` when the
 *   decision is listed in `DECISION_OK`; otherwise `ERROR` with the decision
 *   echoed in `message`.
 */
function gateStatus(decision) {
  if (DECISION_OK.includes(decision)) {
    return { code: "OK" };
  }
  return { code: "ERROR", message: `gate decision: ${decision}` };
}
1548
/**
 * Flatten a loop-provenance record into a list of trace spans.
 *
 * Emitted spans, in order:
 *   1. one root `improvement-loop` span;
 *   2. for each generation (ascending), a `generation-<n>` span followed by one
 *      `candidate-<surfaceHash>` span per candidate of that generation;
 *   3. one `gate-decision` span.
 *
 * Trace and span ids are derived from `record.runId` via `hashId`, so the same
 * record always yields the same ids. Root, generation and candidate spans all
 * cover the whole run window; the gate span is zero-length at the end of it.
 *
 * @param {object} record A record shaped like the output of
 *   `buildLoopProvenanceRecord`.
 * @param {{baseTimeMs?: number}} [opts] `baseTimeMs` overrides the start time;
 *   otherwise `record.timestamp` is parsed, falling back to `Date.now()` when
 *   it does not parse to a non-zero time.
 * @returns {object[]} The spans described above.
 */
function loopProvenanceSpans(record, opts = {}) {
  const { runId } = record;
  const traceId = hashId(["trace", runId]).slice(0, 32);

  // NOTE: epoch nanoseconds exceed Number.MAX_SAFE_INTEGER, so these values
  // are approximate at sub-microsecond resolution.
  const baseNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1e6;

  // A missing or non-finite duration would turn the end time into NaN (which
  // serialises as null); fall back to the same 1 ms floor used for tiny runs.
  const rawDurationMs = Number(record.totalDurationMs);
  const durationMs = Number.isFinite(rawDurationMs) ? Math.max(1, rawDurationMs) : 1;
  const endNano = baseNano + durationMs * 1e6;

  const spans = [];
  const rootSpanId = hashId(["root", runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: rootSpanId,
    name: "improvement-loop",
    startTimeUnixNano: baseNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.runDir": record.runDir,
      "tangle.baselineContentHash": record.baselineContentHash,
      "tangle.winnerContentHash": record.winnerContentHash,
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.gateDecision": record.gate.decision,
      "tangle.backendVerdict": record.backend.verdict,
      "tangle.workerCallCount": record.backend.workerCallCount,
      "tangle.totalCostUsd": record.totalCostUsd
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });

  // Group candidates by generation, keeping their original relative order.
  const byGen = new Map();
  for (const c of record.candidates) {
    const arr = byGen.get(c.generation) ?? [];
    arr.push(c);
    byGen.set(c.generation, arr);
  }

  for (const [generation, cands] of [...byGen.entries()].sort((a, b) => a[0] - b[0])) {
    const genSpanId = hashId(["gen", runId, String(generation)]).slice(0, 16);
    // Seed with -Infinity rather than 0 so a generation whose composites are
    // all negative reports its true best. Every group holds at least one
    // candidate, so the seed itself is never the result.
    const bestComposite = cands.reduce((best, c) => Math.max(best, c.composite), -Infinity);
    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      startTimeUnixNano: baseNano,
      endTimeUnixNano: endNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.populationSize": cands.length,
        "tangle.bestComposite": bestComposite
      },
      "tangle.runId": runId,
      "tangle.generation": generation
    });

    for (const c of cands) {
      const candSpanId = hashId(["cand", runId, String(generation), c.surfaceHash]).slice(0, 16);
      const attributes = {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": c.surfaceHash,
        "tangle.contentHash": c.contentHash,
        "tangle.composite": c.composite,
        "tangle.promoted": c.promoted
      };
      if (c.label) attributes["tangle.candidateLabel"] = c.label;
      if (c.rationale) attributes["tangle.candidateRationale"] = c.rationale;
      spans.push({
        traceId,
        spanId: candSpanId,
        parentSpanId: genSpanId,
        name: `candidate-${c.surfaceHash}`,
        startTimeUnixNano: baseNano,
        endTimeUnixNano: endNano,
        attributes,
        "tangle.runId": runId,
        "tangle.generation": generation
      });
    }
  }

  const gateSpanId = hashId(["gate", runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: gateSpanId,
    parentSpanId: rootSpanId,
    name: "gate-decision",
    // Zero-length span placed at the end of the run window.
    startTimeUnixNano: endNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.gateDecision": record.gate.decision,
      // Fall back to the held-out lift when the gate reported no delta.
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });
  return spans;
}
1650
/**
 * Location of the provenance record file inside a run directory.
 *
 * @param {string} runDir Run directory.
 * @returns {string} `runDir` joined with `loop-provenance.json`.
 */
function provenanceRecordPath(runDir) {
  const fileName = "loop-provenance.json";
  return join2(runDir, fileName);
}
1653
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param {string} runDir Run directory.
 * @returns {string} `runDir` joined with `loop-provenance-spans.jsonl`.
 */
function provenanceSpansPath(runDir) {
  const fileName = "loop-provenance-spans.jsonl";
  return join2(runDir, fileName);
}
1656
/**
 * Summarise one holdout campaign as a snapshot for a single surface.
 *
 * Each campaign cell becomes a score entry with the mean of its judges'
 * composites (0 when it has none) and each judge's `dimensions`. A cell with a
 * truthy `error` also carries it as `errorMessage`. The snapshot's
 * `compositeMean` is the mean over ALL cells, including errored ones, and is 0
 * for an empty campaign.
 *
 * @param {number} index Position of this snapshot, stored as `index`.
 * @param {string} surfaceHash2 Hash stored as `surfaceHash`.
 * @param {*} surface Surface stored as-is.
 * @param {{cells: object[], aggregates: {totalCostUsd: number}, durationMs: number}} campaign
 * @returns {object} `{ index, surfaceHash, surface, cells, compositeMean, costUsd, durationMs }`.
 */
function snapshotFromHoldout(index, surfaceHash2, surface, campaign) {
  const cells = campaign.cells.map((cell) => {
    // NOTE(review): a cell may arrive without `judgeScores` (presumably when it
    // errored before judging — confirm against the campaign runner). Treat that
    // as "no scores" instead of letting Object.values(undefined) throw.
    const scoresByJudge = cell.judgeScores ?? {};
    const judgeScores = Object.values(scoresByJudge);
    const composite =
      judgeScores.length === 0
        ? 0
        : judgeScores.reduce((sum, judge) => sum + judge.composite, 0) / judgeScores.length;
    const score = {
      scenarioId: cell.scenarioId,
      rep: cell.rep,
      compositeMean: composite,
      dimensions: Object.fromEntries(
        Object.entries(scoresByJudge).map(([name, judge]) => [name, judge.dimensions])
      )
    };
    if (cell.error) score.errorMessage = cell.error;
    return score;
  });
  const compositeMean =
    cells.length === 0 ? 0 : cells.reduce((sum, c) => sum + c.compositeMean, 0) / cells.length;
  return {
    index,
    surfaceHash: surfaceHash2,
    surface,
    cells,
    compositeMean,
    costUsd: campaign.aggregates.totalCostUsd,
    durationMs: campaign.durationMs
  };
}
1682
/**
 * Build the eval-run event for a finished loop run.
 *
 * The event holds a baseline snapshot (index 0) and a single generation
 * snapshot for the winner (index 1), both built with `snapshotFromHoldout`
 * from the respective holdout campaigns.
 *
 * @param {object} args Loop outputs (`runId`, `runDir`, `timestamp`, surfaces,
 *   holdout campaigns, `gate`, `totalCostUsd`, `totalDurationMs`).
 * @param {object} record Provenance record; supplies the content hashes and
 *   `heldOutLift`.
 * @returns {object} Event with `status: "finished"` and empty `labels`.
 */
function buildEvalRunEvent(args, record) {
  const baselineSnapshot = snapshotFromHoldout(
    0,
    record.baselineContentHash,
    args.baselineSurface,
    args.baselineOnHoldout
  );
  const winnerSnapshot = snapshotFromHoldout(
    1,
    record.winnerContentHash,
    args.winnerSurface,
    args.winnerOnHoldout
  );
  const { runId, runDir, timestamp, totalCostUsd, totalDurationMs } = args;

  return {
    runId,
    runDir,
    timestamp,
    status: "finished",
    labels: {},
    baseline: baselineSnapshot,
    generations: [winnerSnapshot],
    gateDecision: args.gate.decision,
    holdoutLift: record.heldOutLift,
    totalCostUsd,
    totalDurationMs
  };
}
1704
/**
 * Build the provenance record and spans for a run, persist both under
 * `args.runDir`, and optionally forward them to a hosted client.
 *
 * Files written through `args.storage`:
 *   - the record as pretty-printed JSON (2-space indent);
 *   - the spans as one JSON document per line, joined with "\n".
 *
 * Storage calls are awaited, so a storage backend that returns promises has
 * finished (or rejected) before this function resolves; awaiting a
 * synchronous backend's return value is harmless.
 *
 * When `args.hostedClient` is set, the eval-run event and then the spans are
 * sent to it. Both sends are best-effort: a failure is logged with
 * `console.warn` and does not reject.
 *
 * @param {object} args Loop outputs plus `storage` (`ensureDir`, `write`) and
 *   an optional `hostedClient` (`ingestEvalRun`, `ingestTraces`).
 * @returns {Promise<{record: object, spans: object[], recordPath: string, spansPath: string}>}
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  await args.storage.ensureDir(args.runDir);
  const recordPath = provenanceRecordPath(args.runDir);
  const spansPath = provenanceSpansPath(args.runDir);
  await args.storage.write(recordPath, JSON.stringify(record, null, 2));
  await args.storage.write(spansPath, spans.map((s) => JSON.stringify(s)).join("\n"));

  const { hostedClient } = args;
  if (hostedClient) {
    // Run one hosted upload; on failure warn and carry on, because local
    // provenance has already been written.
    const bestEffort = async (what, send) => {
      try {
        await send();
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(`[agent-eval] ${what} failed (continuing): ${msg}`);
      }
    };
    // The event is built inside the guarded callback so a failure while
    // building it is also treated as best-effort.
    await bestEffort("hosted eval-run ingest", () =>
      hostedClient.ingestEvalRun(buildEvalRunEvent(args, record))
    );
    await bestEffort("provenance span ingest", () => hostedClient.ingestTraces(spans));
  }
  return { record, spans, recordPath, spansPath };
}
1728
+
1461
1729
  export {
1462
1730
  dominates,
1463
1731
  paretoFrontier,
@@ -1488,6 +1756,12 @@ export {
1488
1756
  runOptimization,
1489
1757
  surfaceHash,
1490
1758
  runImprovementLoop,
1491
- defaultRenderDiff
1759
+ defaultRenderDiff,
1760
+ surfaceContentHash,
1761
+ buildLoopProvenanceRecord,
1762
+ loopProvenanceSpans,
1763
+ provenanceRecordPath,
1764
+ provenanceSpansPath,
1765
+ emitLoopProvenance
1492
1766
  };
1493
- //# sourceMappingURL=chunk-NV2PF37Q.js.map
1767
+ //# sourceMappingURL=chunk-JFGZPUMU.js.map