@tangle-network/agent-eval 0.17.1 → 0.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1386,6 +1386,1308 @@ function printDriverSummary(results) {
1386
1386
  console.log(`${completedCount}/${results.length} personas completed`);
1387
1387
  }
1388
1388
 
1389
+ // src/trace/emitter.ts
1390
/**
 * Records the trace of one run (run record, spans, events, budget entries and
 * artifacts) by forwarding fully populated records to `store`.
 *
 * A stack of open span ids is kept so that spans, events and budget entries
 * created without an explicit span reference attach to the most recently
 * opened span that has not ended yet.
 *
 * Options:
 *  - `now`   clock returning a timestamp (defaults to `Date.now`)
 *  - `id`    id generator (defaults to `cryptoRandomId`)
 *  - `runId` id of the run (defaults to a freshly generated id)
 */
var TraceEmitter = class {
  store;
  stack = [];
  _runId;
  now;
  id;
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id shared by every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Closes the run: "failed" only when `outcome.pass` is exactly `false`, otherwise "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Closes the run with status "aborted" and a failing outcome carrying `reason` as notes. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, appends it to the store and pushes it on the open-span stack.
   * Returns a handle `{ span, end, fail }`.
   *
   * The parent is `init.parentSpanId` when the caller supplies a value, and the
   * innermost open span otherwise. An explicit `parentSpanId: undefined` is
   * treated as "not supplied" rather than erasing the inferred parent.
   */
  async span(init) {
    const spanId = this.id();
    const parentSpanId = init.parentSpanId === void 0 ? this.stack[this.stack.length - 1] : init.parentSpanId;
    const span = {
      spanId,
      runId: this._runId,
      startedAt: this.now(),
      ...init,
      parentSpanId
    };
    await this.store.appendSpan(span);
    // Push the id the span actually carries so that end()/fail() pop the same entry.
    this.stack.push(span.spanId);
    return this.handle(span);
  }
  /** Builds the `{ span, end, fail }` handle used to close `span`. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Always unwind, even when the store write fails, so later spans are
          // not parented to a span that is already over.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the last occurrence of `spanId` from the open-span stack, if present. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Appends an already-finished "judge" span (start and end share one
   * timestamp). The span is not pushed on the open-span stack.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event; it attaches to `event.spanId` or, failing that, the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /** Appends a budget ledger entry and, when it is breached, also emits a "budget_breach" event. */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record; fields in `artifact` win over the generated id and run id. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
1548
/**
 * Returns a random id string.
 *
 * Preference order:
 *  1. `crypto.randomUUID()` when available;
 *  2. a version-4 UUID assembled from `crypto.getRandomValues()` when only
 *     that part of Web Crypto is available;
 *  3. a timestamp + `Math.random()` string as a last resort (not
 *     cryptographically strong).
 */
function cryptoRandomId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") return webCrypto.randomUUID();
  if (typeof webCrypto?.getRandomValues === "function") {
    const bytes = webCrypto.getRandomValues(new Uint8Array(16));
    // Stamp the RFC 4122 version (4) and variant bits.
    bytes[6] = bytes[6] & 15 | 64;
    bytes[8] = bytes[8] & 63 | 128;
    const hex = Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
    return `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20)}`;
  }
  return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
}
1552
/**
 * Maps a provider call description onto the flat field set of an LLM span.
 *
 * @param args `{ name?, model, messages, output, usage?, costUsd, finishReason }`
 *   where `usage` may carry `inputTokens`, `outputTokens`, `cachedTokens`
 *   and `reasoningTokens`.
 * @returns an object with the span fields; `name` falls back to `model`, and
 *   token counts are `undefined` when `usage` is absent.
 */
function llmSpanFromProvider(args) {
  const usage = args.usage;
  return {
    name: args.name ?? args.model,
    model: args.model,
    messages: args.messages,
    output: args.output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd: args.costUsd,
    finishReason: args.finishReason
  };
}
1566
+
1567
+ // src/control-runtime.ts
1568
// Budget applied by runAgentControlLoop when the caller does not override it:
// at most 8 steps and 5 minutes (expressed in milliseconds) of wall time.
var DEFAULT_BUDGET = {
  maxSteps: 8,
  maxWallMs: 5 * 60 * 1e3
};
1572
/**
 * Drives an observe → validate → (stop?) → decide → act loop until a stop
 * condition is met, then resolves with a result record. It reports failures
 * of the caller's hooks through the result (`runtimeErrors`, `stoppedBy`)
 * instead of rejecting.
 *
 * `config` fields read here:
 *  - `intent` (string), `budget` (merged over DEFAULT_BUDGET), `stopPolicies`
 *  - `observe`, `validate`, `decide`, `act` (required hooks)
 *  - `shouldStop`, `getActionCostUsd`, `onStep` (optional hooks)
 *  - `actionFailure`: "continue" (default) or "stop" when `act` throws
 *  - `signal`: upstream AbortSignal, mirrored onto the internal controller
 *  - `store`: when present, a TraceEmitter writes run/span/judge/budget records
 *  - `scenarioId`, `projectId`, `variantId`: copied onto the run record
 *
 * Result fields: `intent`, `pass`, `completed`, `reason`, optional `score`,
 * `steps`, `finalState`, `finalEvals`, `wallMs`, `spentCostUsd`, `runId`
 * (null without a store), `failureClass`, `runtimeErrors`, `stoppedBy`.
 *
 * The `reason` of a runtime-error result is the message of the error that
 * actually ended the run, captured when it is recorded. It is not looked up by
 * position in `runtimeErrors`, because tracing and `onStep` failures are
 * appended to that same list.
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  const actionFailure = config.actionFailure ?? "continue";
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs
    ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs)
    : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  const runtimeErrors = [];
  let spentCostUsd = 0;
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  let state;
  let evals;

  // Builds the result from the current loop state plus `fields`, then closes the run.
  const conclude = (fields) => finish(emitter, {
    intent: config.intent,
    pass: false,
    completed: false,
    steps: history,
    finalState: state,
    finalEvals: evals,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    runtimeErrors,
    ...fields
  });
  // Records a hook failure and returns its message for use as the result reason.
  const recordError = (phase, stepIndex, err) => {
    const entry = runtimeError(phase, stepIndex, err);
    runtimeErrors.push(entry);
    return entry.message;
  };
  // Result for a run ended by a hook failure.
  const concludeRuntimeError = (reason, fields) => conclude({
    reason,
    failureClass: "unknown",
    stoppedBy: "runtime-error",
    ...fields
  });
  // Resolves to a budget result when the cost budget is spent, otherwise returns undefined.
  const exitIfOverBudget = () => {
    const verdict = budgetStopDecision(budget, spentCostUsd);
    if (!verdict.stop) return void 0;
    return conclude({
      reason: verdict.reason,
      score: averageScore(evals),
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  };
  // Consults the stop policy; resolves to a result when the run must end, otherwise undefined.
  const exitIfStopPolicyFires = async (ctx, stepIndex) => {
    let verdict;
    try {
      verdict = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
    } catch (err) {
      return concludeRuntimeError(recordError("stop-policy", stepIndex, err), { score: averageScore(evals) });
    }
    if (!verdict.stop) return void 0;
    return conclude({
      pass: verdict.pass,
      completed: true,
      reason: verdict.reason,
      score: verdict.score,
      failureClass: verdict.failureClass,
      stoppedBy: "stop-policy"
    });
  };

  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }

    // Initial observation and validation, before any action is taken.
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      return concludeRuntimeError(recordError("observe", 0, err), { finalState: void 0, finalEvals: [] });
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      return concludeRuntimeError(recordError("validate", 0, err), { finalEvals: [] });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);

    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      // ── Pre-step exits: abort, cost budget, stop policy ──────────────
      if (controller.signal.aborted) {
        return conclude({
          reason: abortReason(controller.signal),
          score: void 0,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      const preStepBudgetExit = exitIfOverBudget();
      if (preStepBudgetExit) return preStepBudgetExit;
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      const preStepPolicyExit = await exitIfStopPolicyFires(ctx, stepIndex);
      if (preStepPolicyExit) return preStepPolicyExit;

      // ── Decide ───────────────────────────────────────────────────────
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        return concludeRuntimeError(recordError("decide", stepIndex, err), { score: averageScore(evals) });
      }
      if (decision.type === "stop") {
        return conclude({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return conclude({
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }

      // ── Act ──────────────────────────────────────────────────────────
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Snapshot of this step using whatever state/evals are current when it is called.
      const snapshotStep = () => ({
        index: stepIndex,
        decision,
        beforeState,
        afterState: state,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
      // Ends the run mid-step: records the partial step, fails its span and notifies onStep.
      const abandonStep = async (reason) => {
        const partialStep = snapshotStep();
        history.push(partialStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(reason));
        await runOnStep(config.onStep, partialStep, runtimeErrors);
        return concludeRuntimeError(reason, { score: averageScore(evals) });
      };
      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only positive, finite costs are charged against the budget.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        const message = recordError("act", stepIndex, err);
        actionOutcome = {
          ok: false,
          error: message,
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") return await abandonStep(message);
        // actionFailure === "continue": fall through and re-observe after the failed action.
      }

      // ── Re-observe and re-validate ───────────────────────────────────
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        return await abandonStep(recordError("observe", stepIndex, err));
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        return await abandonStep(recordError("validate", stepIndex, err));
      }

      // ── Book-keeping for the completed step ──────────────────────────
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = snapshotStep();
      history.push(step);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        }));
      }
      await runOnStep(config.onStep, step, runtimeErrors);

      // ── Post-step exits: no progress, cost budget, stop policy ───────
      if (noProgressStop.stop) {
        return conclude({
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const postStepBudgetExit = exitIfOverBudget();
      if (postStepBudgetExit) return postStepBudgetExit;
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      const postStepPolicyExit = await exitIfStopPolicyFires(postStepCtx, stepIndex + 1);
      if (postStepPolicyExit) return postStepPolicyExit;
    }

    return conclude({
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything thrown outside the per-phase try blocks lands here and is
    // recorded under the "act" phase at the current history length.
    return concludeRuntimeError(recordError("act", history.length, err), { finalState: void 0, finalEvals: [] });
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
2093
/**
 * Builds a stop-policy object that sets `maxNoProgressSteps`.
 * Any fields in `options` are kept; `maxNoProgressSteps` always wins.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  return { ...options, maxNoProgressSteps };
}
2096
/**
 * Builds a stop-policy object that sets `maxRepeatedActions`.
 * Any fields in `options` are kept; `maxRepeatedActions` always wins.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  return { ...options, maxRepeatedActions };
}
2099
/** Returns a copy of `input` marked as an objective eval (`objective: true`). */
function objectiveEval(input) {
  return { ...input, objective: true };
}
2102
/** Returns a copy of `input` marked as a subjective eval (`objective: false`). */
function subjectiveEval(input) {
  return { ...input, objective: false };
}
2105
/**
 * True when no eval of severity "critical" or "error" is failing.
 * Evals of any other severity never block, and an empty list yields `true`.
 */
function allCriticalPassed(evals) {
  return evals.every((result) => {
    const blocking = result.severity === "critical" || result.severity === "error";
    return result.passed || !blocking;
  });
}
2108
/**
 * Assembles the context object handed to the `shouldStop`, `decide` and `act`
 * hooks.
 *
 * Besides passing its arguments through, it derives:
 *  - `wallMs`: milliseconds elapsed since `started`;
 *  - `remainingCostUsd`: `undefined` when the budget has no `maxCostUsd`,
 *    otherwise the unspent amount, never below 0.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const remainingCostUsd = budget.maxCostUsd === void 0 ? void 0 : Math.max(0, budget.maxCostUsd - spentCostUsd);
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs: Date.now() - started,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
2123
/**
 * Stop policy used when the caller supplies no `shouldStop` hook.
 *
 * - No evals: keep going ("no evals yet").
 * - Every critical/error eval passes: stop with `pass: true`.
 * - Otherwise: keep going ("critical evals still failing").
 *
 * When evals exist the decision also carries their average score.
 */
function defaultStopDecision(evals) {
  if (!evals.length) return { stop: false, pass: false, reason: "no evals yet" };
  const score = averageScore(evals);
  if (allCriticalPassed(evals)) {
    return { stop: true, pass: true, reason: "all critical evals passed", score };
  }
  return { stop: false, pass: false, reason: "critical evals still failing", score };
}
2128
/**
 * Mean of the numeric `score` fields in `evals`, rounded to three decimals.
 *
 * Evals without a finite numeric score are ignored; this includes `NaN` and
 * ±Infinity, which would otherwise turn the whole average into a non-finite
 * value. Returns `undefined` when nothing is left to average.
 */
function averageScore(evals) {
  let total = 0;
  let count = 0;
  for (const result of evals) {
    if (typeof result.score === "number" && Number.isFinite(result.score)) {
      total += result.score;
      count += 1;
    }
  }
  if (count === 0) return void 0;
  return Math.round(total / count * 1e3) / 1e3;
}
2133
/**
 * Decides whether the cost budget is spent.
 *
 * Stops once `spentCostUsd` reaches `budget.maxCostUsd` (inclusive); a budget
 * without `maxCostUsd` never stops here.
 *
 * @returns `{ stop, reason }`; `reason` is an empty string when not stopping.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const limit = budget.maxCostUsd;
  const exhausted = limit !== void 0 && spentCostUsd >= limit;
  if (!exhausted) return { stop: false, reason: "" };
  return {
    stop: true,
    reason: `budget exhausted: maxCostUsd=${limit}`
  };
}
2142
/**
 * Writes a "usd" budget ledger entry for the current spend.
 *
 * Does nothing without an emitter or without `budget.maxCostUsd`. The write
 * goes through `runTrace`, so a failing store is recorded in `runtimeErrors`
 * instead of throwing. The entry is attached to `handle`'s span when a handle
 * is given.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  const maxCostUsd = budget.maxCostUsd;
  if (!emitter || maxCostUsd === void 0) return;
  await runTrace(runtimeErrors, stepIndex, () => emitter.recordBudget({
    dimension: "usd",
    limit: maxCostUsd,
    consumed: spentCostUsd,
    remaining: Math.max(0, maxCostUsd - spentCostUsd),
    breached: spentCostUsd >= maxCostUsd,
    spanId: handle?.span.spanId
  }));
}
2154
/**
 * Records one judge span per eval result, sequentially.
 *
 * Does nothing without an emitter. Each write goes through `runTrace`, so a
 * failing store is recorded in `runtimeErrors` and the remaining evals are
 * still attempted.
 *
 * - `judgeId` is "objective-validator" for objective evals, otherwise
 *   "subjective-judge".
 * - The judged target is `targetSpanId`, falling back to the run id.
 * - `score` is the eval's numeric score, or 1/0 derived from `passed`.
 * - `phase` is stored in the span attributes.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  for (const result of evals) {
    const score = typeof result.score === "number" ? result.score : result.passed ? 1 : 0;
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge({
      judgeId: result.objective ? "objective-validator" : "subjective-judge",
      targetSpanId: targetSpanId ?? emitter.runId,
      name: `control-eval/${result.id}`,
      dimension: result.id,
      score,
      rationale: result.detail,
      evidence: result.evidence,
      attributes: {
        phase,
        passed: result.passed,
        severity: result.severity,
        objective: result.objective
      }
    }));
  }
}
2174
/**
 * Invokes the optional `onStep` callback with `step`.
 *
 * A throwing or rejecting callback does not propagate: the failure is
 * appended to `runtimeErrors` under the "on-step" phase at `step.index`.
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (!onStep) return;
  try {
    await onStep(step);
  } catch (err) {
    runtimeErrors.push(runtimeError("on-step", step.index, err));
  }
}
2182
/**
 * Runs a trace write on a best-effort basis.
 *
 * @param write function performing the write (may be sync or async)
 * @returns whatever `write` resolves to, or `undefined` when it throws or
 *   rejects; in that case the failure is appended to `runtimeErrors` under
 *   the "trace" phase at `stepIndex`.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  try {
    return await write();
  } catch (err) {
    runtimeErrors.push(runtimeError("trace", stepIndex, err));
    return void 0;
  }
}
2190
/**
 * Tracks consecutive steps that changed neither the state fingerprint nor the
 * score, and decides whether that streak should end the run.
 *
 * `args`: `{ policies, lastStateFingerprint, stateFingerprint, scoreBefore,
 * scoreAfter, currentStreak }`.
 *
 * - Disabled (streak reported as 0) unless `policies.maxNoProgressSteps` is a
 *   positive number.
 * - A step counts as "no progress" when the fingerprint equals the previous
 *   one AND the absolute score change is below `policies.minScoreDelta`
 *   (default 0.001); missing scores are treated as 0.
 * - Any progress resets the streak to 0.
 *
 * @returns `{ stop, reason, streak }`
 */
function noProgressStopDecision(args) {
  const max = args.policies?.maxNoProgressSteps;
  if (!max || max <= 0) return { stop: false, reason: "", streak: 0 };
  const minScoreDelta = args.policies?.minScoreDelta ?? 1e-3;
  const scoreDelta = Math.abs((args.scoreAfter ?? 0) - (args.scoreBefore ?? 0));
  const stateUnchanged = args.lastStateFingerprint !== void 0 && args.lastStateFingerprint === args.stateFingerprint;
  const stalled = stateUnchanged && scoreDelta < minScoreDelta;
  const streak = stalled ? args.currentStreak + 1 : 0;
  if (streak >= max) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
2200
/**
 * Decides whether the same action has been chosen too many times in a row.
 *
 * Disabled unless `policies.maxRepeatedActions` is a positive number; stops
 * once `streak` reaches that limit.
 *
 * @returns `{ stop, reason }`; `reason` is an empty string when not stopping.
 */
function repeatedActionStopDecision(policies, streak) {
  const max = policies?.maxRepeatedActions;
  const enabled = Boolean(max) && max > 0;
  if (!enabled || streak < max) return { stop: false, reason: "" };
  return {
    stop: true,
    reason: `stuck: repeated same action for ${streak} step(s)`
  };
}
2208
/**
 * Fingerprint of an observed state, used to detect steps that changed nothing.
 * Uses `policies.stateFingerprint` when the caller provides one, otherwise
 * `stableFingerprint`.
 */
function fingerprintState(state, policies) {
  const custom = policies?.stateFingerprint;
  return custom ? policies.stateFingerprint(state) : stableFingerprint(state);
}
2212
/**
 * Fingerprint of a chosen action, used to detect the same action being
 * repeated. Uses `policies.actionFingerprint` when the caller provides one,
 * otherwise `stableFingerprint`.
 */
function fingerprintAction(action, policies) {
  const custom = policies?.actionFingerprint;
  return custom ? policies.actionFingerprint(action) : stableFingerprint(action);
}
2216
/**
 * Produces a string fingerprint of `value` that does not depend on object key
 * order.
 *
 * - Strings are returned as-is; numbers, booleans, `null` and `undefined`
 *   go through `String()`.
 * - Everything else is serialised as JSON after `sortForFingerprint`.
 * - If serialisation throws (e.g. BigInt, cyclic data) or yields no text
 *   (functions, symbols), `String(value)` is used so the result is always a
 *   string.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    // JSON.stringify returns undefined for functions and symbols.
    return JSON.stringify(sortForFingerprint(value)) ?? String(value);
  } catch {
    return String(value);
  }
}
/**
 * Returns a copy of `value` whose object keys are sorted at every depth.
 *
 * Arrays keep their element order. Objects exposing `toJSON` (such as `Date`)
 * are replaced by their JSON form first, so they keep their identity instead
 * of collapsing to an empty object. Primitives are returned unchanged.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") return sortForFingerprint(value.toJSON());
  // Object.fromEntries defines own properties, so a "__proto__" key is kept
  // as data rather than being assigned as the copy's prototype.
  return Object.fromEntries(
    Object.keys(value).sort().map((key) => [key, sortForFingerprint(value[key])])
  );
}
2235
/**
 * Human-readable reason for an aborted signal: the message of an `Error`
 * reason, the string form of any other truthy reason, or "aborted" when the
 * reason is missing or falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) return reason.message;
  if (!reason) return "aborted";
  return String(reason);
}
2240
/**
 * Builds a runtime-error record `{ phase, stepIndex, message }`.
 * `message` is the `Error` message, or the string form of any other thrown value.
 */
function runtimeError(phase, stepIndex, err) {
  return {
    phase,
    stepIndex,
    message: err instanceof Error ? err.message : String(err)
  };
}
2244
/**
 * Closes the traced run (when an emitter exists) and returns `result`
 * unchanged.
 *
 * The run outcome carries `pass`, `failureClass`, the reason as `notes`, and
 * the result's score or, when that is missing, the average of `finalEvals`.
 * The write goes through `runTrace`, so a failing store is appended to
 * `result.runtimeErrors` instead of throwing.
 */
async function finish(emitter, result) {
  const outcome = {
    pass: result.pass,
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  };
  await runTrace(result.runtimeErrors, result.steps.length, () => emitter?.endRun(outcome));
  return result;
}
2253
+
2254
+ // src/feedback-trajectory.ts
2255
// Default dataset split, in percent per bucket. The four shares add up to 100.
var DEFAULT_SPLIT_POLICY = {
  trainPct: 70,
  devPct: 15,
  testPct: 10,
  holdoutPct: 5
};
2261
/**
 * Feedback-trajectory store backed by a Map keyed by trajectory id.
 *
 * Every value crossing the store boundary, in or out, goes through
 * `cloneTrajectory`.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();
  /** Stores (or replaces) a trajectory under `trajectory.id`. */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }
  /** Returns a copy of the trajectory with this id, or `null` when unknown. */
  async get(id) {
    const stored = this.trajectories.get(id);
    if (!stored) return null;
    return cloneTrajectory(stored);
  }
  /** Returns copies of all trajectories accepted by `matchesFilter(trajectory, filter)`. */
  async list(filter = {}) {
    const matches = [];
    for (const trajectory of this.trajectories.values()) {
      if (matchesFilter(trajectory, filter)) matches.push(cloneTrajectory(trajectory));
    }
    return matches;
  }
  /**
   * Appends `attempt` to the trajectory and sets `updatedAt` to the attempt's
   * `createdAt`. Returns a copy of the updated trajectory.
   * @throws Error when no trajectory with this id exists.
   */
  async appendAttempt(id, attempt) {
    const current = this.trajectories.get(id);
    if (!current) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const next = cloneTrajectory({
      ...current,
      attempts: [...current.attempts, attempt],
      updatedAt: attempt.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
  /**
   * Adds a feedback label and sets `updatedAt` to the label's `createdAt`.
   *
   * With `attemptId`, the label is appended to the `feedback` of the attempt
   * with that id and the trajectory-level `labels` stay untouched; if no
   * attempt matches, the label is not stored anywhere. Without `attemptId`,
   * the label is appended to the trajectory-level `labels`.
   *
   * Returns a copy of the updated trajectory.
   * @throws Error when no trajectory with this id exists.
   */
  async appendLabel(id, label, attemptId) {
    const current = this.trajectories.get(id);
    if (!current) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    let attempts = current.attempts;
    let labels = current.labels;
    if (attemptId) {
      attempts = current.attempts.map((attempt) => {
        if (attempt.id !== attemptId) return attempt;
        return { ...attempt, feedback: [...attempt.feedback ?? [], label] };
      });
    } else {
      labels = [...current.labels, label];
    }
    const next = cloneTrajectory({
      ...current,
      attempts,
      labels,
      updatedAt: label.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
2298
/**
 * Feedback-trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`).
 *
 * Reads are served from an in-memory store that is rebuilt once, lazily, by
 * replaying the log. Each mutation is applied to memory and then appended to
 * the log.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight (or completed) replay promise; shared so concurrent first calls
  // replay the log exactly once.
  loading = null;
  constructor(options) {
    this.dir = options.dir;
  }
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }
  /** Appends one operation record as a JSON line, creating the directory if needed. */
  async append(record) {
    const { appendFile, mkdir } = await import("fs/promises");
    const { join: join3 } = await import("path");
    await mkdir(this.dir, { recursive: true });
    await appendFile(join3(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }
  /**
   * Ensures the log has been replayed into memory. Safe to call concurrently:
   * all callers await the same replay. Without this, two overlapping first
   * calls would each replay the log and duplicate appended attempts/labels.
   */
  async load() {
    if (this.loaded) return;
    if (!this.loading) {
      this.loading = this.replayLog().then(
        () => {
          this.loaded = true;
        },
        (err) => {
          // Allow a later call to retry; the failure happens before any
          // record is applied, so memory is still untouched.
          this.loading = null;
          throw err;
        }
      );
    }
    await this.loading;
  }
  /**
   * Reads the log and applies every record to the in-memory store.
   * A missing log file means an empty store; any other read error is rethrown
   * instead of being mistaken for "no data".
   */
  async replayLog() {
    const { readFile } = await import("fs/promises");
    const { join: join3 } = await import("path");
    const file = join3(this.dir, "feedback-trajectories.ndjson");
    let raw;
    try {
      raw = await readFile(file, "utf8");
    } catch (err) {
      if (err && err.code === "ENOENT") return;
      throw err;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        if (record.op === "save") await this.memory.save(record.trajectory);
        if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
        if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
      } catch {
        // Deliberate best-effort: a corrupt or unreplayable line (for example
        // a torn final write) is skipped so the rest of the log still loads.
      }
    }
  }
};
2358
/**
 * Builds a feedback-trajectory record from partial input.
 *
 * `createdAt` defaults to the current time as an ISO string. When no `id` is
 * supplied, one is derived as `ft_<hex>` from a hash of the project id,
 * scenario id, task intent and `createdAt`. `attempts` and `labels` default
 * to empty arrays; all other fields are copied as given.
 *
 * @param {object} input - Partial trajectory; `input.task.intent` is read when no id is given.
 * @returns {object} The trajectory record.
 */
function createFeedbackTrajectory(input) {
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id === null || id === undefined) {
    const fingerprint = `${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`;
    id = `ft_${stableHash(fingerprint).toString(16)}`;
  }
  const attempts = input.attempts ?? [];
  const labels = input.labels ?? [];
  return {
    id,
    projectId: input.projectId,
    scenarioId: input.scenarioId,
    task: input.task,
    attempts,
    labels,
    outcome: input.outcome,
    split: input.split,
    tags: input.tags,
    createdAt,
    metadata: input.metadata
  };
}
2375
/**
 * Deterministically assigns a trajectory to "train", "dev", "test" or
 * "holdout" by hashing its project id, scenario id, id and task intent and
 * bucketing the hash by the policy weights.
 *
 * @param {object} trajectory - Trajectory to bucket (`projectId`, `scenarioId`, `id`, `task.intent` are read).
 * @param {object} [policy] - Optional overrides for `trainPct`, `devPct`, `testPct`, `holdoutPct`.
 *   A key that is absent or `undefined` keeps its default.
 * @returns {"train"|"dev"|"test"|"holdout"} The assigned split.
 * @throws {Error} When an override is not a finite, non-negative number, or
 *   when the weights do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  const split = { ...DEFAULT_SPLIT_POLICY };
  const overrides = policy ?? {};
  for (const key of Object.keys(split)) {
    const pct = overrides[key];
    // An explicit `undefined` must not wipe out the default: it would make the
    // total NaN, slip past the `<= 0` guard, and bucket everything as holdout.
    if (pct === undefined) continue;
    if (typeof pct !== "number" || !Number.isFinite(pct) || pct < 0) {
      throw new Error(`assignFeedbackSplit: ${key} must be a finite non-negative number, got ${String(pct)}`);
    }
    split[key] = pct;
  }
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
  if (bucket < split.trainPct) return "train";
  if (bucket < split.trainPct + split.devPct) return "dev";
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
  return "holdout";
}
2385
/**
 * Returns a shallow copy of the trajectory that is guaranteed to carry a
 * `split`: the existing one when present, otherwise one computed by
 * `assignFeedbackSplit` with the given policy.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const copy = { ...trajectory };
  copy.split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return copy;
}
2391
/**
 * Wraps a trajectory as a dataset scenario.
 *
 * The scenario id is the trajectory's `scenarioId`, falling back to its `id`;
 * the payload is the trajectory with a split assigned. Tags are the project
 * id (when truthy), then the trajectory's own tags, then a fixed `source`
 * marker that always wins.
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const payload = withAssignedFeedbackSplit(trajectory);
  const projectTag = payload.projectId ? { projectId: payload.projectId } : {};
  const ownTags = payload.tags ?? {};
  const tags = { ...projectTag, ...ownTags, source: "feedback-trajectory" };
  return {
    id: payload.scenarioId ?? payload.id,
    split: payload.split,
    payload,
    tags
  };
}
2404
/** Converts each trajectory with `feedbackTrajectoryToDatasetScenario`, preserving order. */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  const scenarios = trajectories.map(feedbackTrajectoryToDatasetScenario);
  return scenarios;
}
2407
/**
 * Flattens a trajectory into an optimizer row.
 *
 * The score is the recorded outcome score when present, otherwise one derived
 * from the labels by `scoreFromLabels`. `labelKinds` lists each distinct label
 * kind once, in first-seen order.
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const distinctKinds = new Set(labels.map((label) => label.kind));
  const score = trajectory.outcome?.score ?? scoreFromLabels(labels);
  const metadata = {
    projectId: trajectory.projectId,
    split: trajectory.split,
    intent: trajectory.task.intent,
    attempts: trajectory.attempts.length,
    outcome: trajectory.outcome,
    labels
  };
  return {
    scenarioId: trajectory.scenarioId ?? trajectory.id,
    trajectoryId: trajectory.id,
    labelKinds: Array.from(distinctKinds),
    score,
    metadata
  };
}
2424
/** Converts each trajectory with `feedbackTrajectoryToOptimizerRow`, preserving order. */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  const rows = trajectories.map(feedbackTrajectoryToOptimizerRow);
  return rows;
}
2427
/**
 * Replays one trajectory through `adapter2.replay` and tags the result with
 * the trajectory id.
 *
 * This function never rejects because of the adapter: if the replay throws,
 * the error is converted into a failed result carrying a system "reject"
 * label, a zero-score outcome and `metadata.replayError = true`.
 *
 * @param {object} trajectory - Trajectory to replay; `trajectory.id` is copied to the result.
 * @param {{replay: Function}} adapter2 - Adapter whose `replay(trajectory)` result is spread into the return value.
 * @returns {Promise<object>} The adapter result, or the synthesized failure.
 */
async function replayFeedbackTrajectory(trajectory, adapter2) {
  try {
    const replayed = await adapter2.replay(trajectory);
    return { trajectoryId: trajectory.id, ...replayed };
  } catch (err) {
    // A single timestamp is shared by the label and the outcome.
    const observedAt = (/* @__PURE__ */ new Date()).toISOString();
    const detail = err instanceof Error ? err.message : String(err);
    const rejection = {
      source: "system",
      kind: "reject",
      value: false,
      reason: detail,
      severity: "error",
      createdAt: observedAt
    };
    const outcome = {
      success: false,
      score: 0,
      detail,
      observedAt
    };
    return {
      trajectoryId: trajectory.id,
      pass: false,
      labels: [rejection],
      outcome,
      metadata: { replayError: true }
    };
  }
}
2458
/**
 * Replays trajectories strictly one after another (each replay is awaited
 * before the next starts) and returns the results in input order.
 */
async function replayFeedbackTrajectories(trajectories, adapter2) {
  const collected = [];
  for (const item of trajectories) {
    const outcome = await replayFeedbackTrajectory(item, adapter2);
    collected.push(outcome);
  }
  return collected;
}
2465
/**
 * Distils labels from many trajectories into a ranked list of preference
 * instructions.
 *
 * Labels that yield no instruction are skipped. Instructions are
 * de-duplicated case-insensitively with whitespace collapsed, keeping the
 * entry with the strictly highest weight (the first seen wins ties). The
 * result is sorted by descending weight and cut to `options.maxEntries`
 * (default 20).
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const limit = options.maxEntries ?? 20;
  const strongest = /* @__PURE__ */ new Map();
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (!instruction) continue;
      const candidate = {
        instruction,
        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
        weight: weightForLabel(label),
        sourceTrajectoryId: trajectory.id,
        sourceLabelId: label.id,
        category: label.kind
      };
      const dedupeKey = candidate.instruction.toLowerCase().replace(/\s+/g, " ").trim();
      const incumbent = strongest.get(dedupeKey);
      if (incumbent && candidate.weight <= incumbent.weight) continue;
      strongest.set(dedupeKey, candidate);
    }
  }
  const ranked = Array.from(strongest.values());
  ranked.sort((first, second) => second.weight - first.weight);
  return ranked.slice(0, limit);
}
2490
/**
 * Renders preference-memory entries as a Markdown bullet list under a
 * "# Preference Memory" heading. Each entry contributes its instruction,
 * rationale and source trajectory id. The output is trimmed and always ends
 * with exactly one newline.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const body = entries.flatMap((entry) => [
    `- ${entry.instruction}`,
    ` Rationale: ${entry.rationale}`,
    ` Source: ${entry.sourceTrajectoryId}`,
    ""
  ]);
  const document = ["# Preference Memory", "", ...body].join("\n");
  return `${document.trim()}\n`;
}
2500
/**
 * Serializes trajectories as JSON Lines: sorted by `id` (via `localeCompare`),
 * one canonicalized JSON object per line, with a trailing newline. The input
 * array is not mutated.
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = [...trajectories].sort((left, right) => left.id.localeCompare(right.id));
  const rows = ordered.map((item) => JSON.stringify(canonicalize(item)));
  return `${rows.join("\n")}\n`;
}
2503
/**
 * Parses JSON Lines text into an array of trajectories. Blank or
 * whitespace-only lines are ignored; a malformed line makes `JSON.parse`
 * throw.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  return jsonl
    .split("\n")
    .filter((row) => row.trim())
    .map((row) => JSON.parse(row));
}
2511
/**
 * Converts a finished control-loop run into a feedback trajectory.
 *
 * The trajectory id is the run id, or `ft_control_<hex>` hashed from the
 * intent and creation time when the run has none. Every step becomes one
 * attempt, and a single system label ("approve" or "reject") plus an outcome
 * summarise the run.
 *
 * @param {object} run - Control-loop result (`runId`, `intent`, `steps`, `pass`, `reason`, `score`, `spentCostUsd`, `stoppedBy`, `failureClass`).
 * @param {object} [options] - `createdAt`, `projectId`, `scenarioId`, `artifactType`,
 *   and optional `artifactFromStep` / `proposedActionFromStep` callbacks.
 * @returns {object} The trajectory built by `createFeedbackTrajectory`.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;

  // One attempt per step. The artifact falls back from the caller-supplied
  // extractor, to the action result, to the decision itself.
  const stepToAttempt = (step) => ({
    id: `${trajectoryId}_step_${step.index}`,
    stepIndex: step.index,
    artifactType: options.artifactType ?? "action",
    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
    proposedAction: options.proposedActionFromStep?.(step),
    evals: step.evalsAfter,
    createdAt: step.startedAt,
    metadata: {
      decision: step.decision,
      actionOutcome: step.actionOutcome
    }
  });
  const attempts = run.steps.map((step) => stepToAttempt(step));

  const verdict = {
    source: "system",
    kind: run.pass ? "approve" : "reject",
    value: run.pass,
    reason: run.reason,
    severity: run.pass ? "info" : "error",
    createdAt
  };
  const outcome = {
    success: run.pass,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };

  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts,
    labels: [verdict],
    outcome
  });
}
2556
/**
 * Collects trajectory-level labels followed by every attempt's feedback
 * labels, dropping duplicates.
 *
 * Two labels are duplicates when they share an `id`, or, for labels without
 * an id, when source, kind, createdAt and the JSON form of the value all
 * match. The first occurrence is kept.
 */
function allLabels(trajectory) {
  const pooled = [
    ...trajectory.labels,
    ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
  ];
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const label of pooled) {
    const identity = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seenKeys.has(identity)) continue;
    seenKeys.add(identity);
    unique.push(label);
  }
  return unique;
}
2569
/**
 * Derives a score in [0, 1] from labels, rounded to three decimals.
 *
 * "approve" and "select" count as 1, "reject" and "policy_block" as 0, and a
 * numeric "rate" is clamped to [0, 1]. All other labels are ignored. A "rate"
 * whose value is NaN is ignored too; previously it turned the whole average
 * into NaN.
 *
 * @param {Array<object>} labels - Labels with `kind` and optionally `value`.
 * @returns {number|undefined} The mean score, or `undefined` when no label is scorable.
 */
function scoreFromLabels(labels) {
  if (!labels.length) return void 0;
  const scored = labels.map((label) => {
    if (label.kind === "approve" || label.kind === "select") return 1;
    if (label.kind === "reject" || label.kind === "policy_block") return 0;
    if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) {
      return Math.max(0, Math.min(1, label.value));
    }
    return void 0;
  }).filter((value) => typeof value === "number");
  if (!scored.length) return void 0;
  return Math.round(scored.reduce((sum2, value) => sum2 + value, 0) / scored.length * 1e3) / 1e3;
}
2580
/**
 * Turns a label into a one-line preference instruction, or `undefined` when
 * the label has no reason or its kind is not one of reject, revision_request,
 * select, approve or comment. The task intent is shortened to 80 characters
 * with `compact` where it is quoted.
 */
function instructionFromLabel(trajectory, label) {
  const why = label.reason;
  if (!why) return void 0;
  switch (label.kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${why}`;
    case "revision_request":
      return `Revise similar work by applying: ${why}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${why}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${why}`;
    case "comment":
      return why;
    default:
      return void 0;
  }
}
2588
/**
 * Ranks a label by multiplying a severity factor (critical 4, error 3,
 * warning 2, anything else 1) with a source factor (user 3, metric or
 * environment 2, anything else 1).
 */
function weightForLabel(label) {
  let severityFactor;
  switch (label.severity) {
    case "critical":
      severityFactor = 4;
      break;
    case "error":
      severityFactor = 3;
      break;
    case "warning":
      severityFactor = 2;
      break;
    default:
      severityFactor = 1;
  }
  let sourceFactor;
  switch (label.source) {
    case "user":
      sourceFactor = 3;
      break;
    case "metric":
    case "environment":
      sourceFactor = 2;
      break;
    default:
      sourceFactor = 1;
  }
  return severityFactor * sourceFactor;
}
2593
/**
 * Tests a trajectory against a filter. Each of `projectId`, `scenarioId` and
 * `split` is only checked when the filter value is truthy; `tag` is a
 * `[key, value]` pair that must equal `trajectory.tags[key]`.
 */
function matchesFilter(trajectory, filter) {
  for (const field of ["projectId", "scenarioId", "split"]) {
    const wanted = filter[field];
    if (wanted && trajectory[field] !== wanted) return false;
  }
  if (!filter.tag) return true;
  const [tagKey, tagValue] = filter.tag;
  return trajectory.tags?.[tagKey] === tagValue;
}
2603
/**
 * Deep-copies a trajectory through a JSON round trip. Properties whose value
 * is `undefined` are therefore dropped from the copy.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
2606
/**
 * Collapses whitespace runs to single spaces, trims, and shortens the text to
 * at most `max` UTF-16 code units followed by "...".
 *
 * @param {string} value - Text to normalize.
 * @param {number} max - Maximum length before the "..." suffix is added.
 * @returns {string} The normalized, possibly truncated text.
 */
function compact(value, max) {
  const normalized = value.replace(/\s+/g, " ").trim();
  if (normalized.length <= max) return normalized;
  let head = normalized.slice(0, max);
  // Cutting by code units can split a surrogate pair (for example an emoji);
  // drop a dangling high surrogate so the result stays well-formed.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);
  return `${head.trim()}...`;
}
2610
/**
 * 32-bit FNV-1a hash over the string's UTF-16 code units.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit integer.
 */
function stableHash(input) {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let state = OFFSET_BASIS;
  let index = 0;
  while (index < input.length) {
    // XOR in the next code unit, then multiply with 32-bit wraparound.
    state = Math.imul(state ^ input.charCodeAt(index), PRIME);
    index += 1;
  }
  return state >>> 0;
}
2618
/**
 * Recursively rebuilds a value so that every plain object has its keys in
 * sorted order. Arrays keep their element order and primitives and `null`
 * are returned as-is.
 */
function canonicalize(value) {
  const isComposite = value !== null && typeof value === "object";
  if (!isComposite) return value;
  if (Array.isArray(value)) return value.map((item) => canonicalize(item));
  return Object.keys(value).sort().reduce((sorted, key) => {
    sorted[key] = canonicalize(value[key]);
    return sorted;
  }, {});
}
2627
+
2628
+ // src/action-policy.ts
2629
/**
 * Checks a proposed action against a policy and reports whether it is
 * allowed, blocked, or needs approval, with human-readable reasons.
 *
 * Rules are applied in a fixed order and each one that fires appends a
 * reason: allow-list, block-list, always-approve types, external side
 * effects, approval cost threshold, max action cost, remaining budget,
 * required expected outcome, required kill criteria, and finally a note when
 * an auto-approve type still needs approval. A missing `costUsd` counts as 0
 * for the comparisons.
 *
 * @param {object} action - Action under review (`type`, `costUsd`, `requiresApproval`, `externalSideEffect`, `metadata`).
 * @param {object} [policy] - Policy settings; every field is optional.
 * @param {object} [options] - `createdAt` overrides the label timestamp.
 * @returns {{allowed: boolean, blocked: boolean, requiresApproval: boolean, reasons: string[], label: (object|undefined)}}
 *   `requiresApproval` is reported as false when the action is blocked;
 *   `label` is set only when the action is blocked or needs approval.
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);
  const block = (why) => {
    blocked = true;
    reasons.push(why);
  };
  const escalate = (why) => {
    requiresApproval = true;
    reasons.push(why);
  };
  const exceeds = (limit) => limit !== void 0 && (action.costUsd ?? 0) > limit;

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    block(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    block(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    escalate(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    escalate("external side effect requires approval");
  }
  if (exceeds(policy.requireApprovalAboveCostUsd)) {
    escalate(`cost ${action.costUsd} exceeds approval threshold ${policy.requireApprovalAboveCostUsd}`);
  }
  if (exceeds(policy.maxActionCostUsd)) {
    block(`cost ${action.costUsd} exceeds max action cost ${policy.maxActionCostUsd}`);
  }
  if (exceeds(policy.remainingBudgetUsd)) {
    block(`cost ${action.costUsd} exceeds remaining budget ${policy.remainingBudgetUsd}`);
  }
  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    block("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    block("kill criteria are required");
  }
  // Auto-approval never overrides an approval requirement; it only adds a note.
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  if (reasons.length === 0) {
    reasons.push(requiresApproval ? "approval required" : "action allowed");
  }

  let label;
  if (blocked || requiresApproval) {
    label = {
      source: "policy",
      kind: blocked ? "policy_block" : "comment",
      value: { actionType: action.type, blocked, requiresApproval },
      reason: reasons.join("; "),
      severity: blocked ? "critical" : "warning",
      createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
      metadata: { action, policy }
    };
  }
  return {
    allowed: !blocked,
    blocked,
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
2690
+
1389
2691
  // src/prompt-registry.ts
1390
2692
  var PromptRegistry = class {
1391
2693
  entries = /* @__PURE__ */ new Map();
@@ -3101,184 +4403,6 @@ var FileSystemTraceStore = class {
3101
4403
  }
3102
4404
  };
3103
4405
 
3104
- // src/trace/emitter.ts
3105
- var TraceEmitter = class {
3106
- store;
3107
- stack = [];
3108
- _runId;
3109
- now;
3110
- id;
3111
- constructor(store, options = {}) {
3112
- this.store = store;
3113
- this.now = options.now ?? (() => Date.now());
3114
- this.id = options.id ?? (() => cryptoRandomId());
3115
- this._runId = options.runId ?? this.id();
3116
- }
3117
- get runId() {
3118
- return this._runId;
3119
- }
3120
- // ── Run lifecycle ──────────────────────────────────────────────────
3121
- async startRun(run) {
3122
- const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
3123
- await this.store.appendRun(full);
3124
- return full;
3125
- }
3126
- async endRun(outcome) {
3127
- const status = outcome?.pass === false ? "failed" : "completed";
3128
- await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
3129
- }
3130
- async abortRun(reason) {
3131
- await this.store.updateRun(this._runId, {
3132
- endedAt: this.now(),
3133
- status: "aborted",
3134
- outcome: { pass: false, notes: reason }
3135
- });
3136
- }
3137
- // ── Generic span ───────────────────────────────────────────────────
3138
- async span(init) {
3139
- const spanId = this.id();
3140
- const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
3141
- const span = {
3142
- spanId,
3143
- parentSpanId: parent,
3144
- runId: this._runId,
3145
- startedAt: this.now(),
3146
- ...init
3147
- };
3148
- await this.store.appendSpan(span);
3149
- this.stack.push(spanId);
3150
- return this.handle(span);
3151
- }
3152
- handle(span) {
3153
- return {
3154
- span,
3155
- end: async (patch) => {
3156
- const endedAt = this.now();
3157
- await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
3158
- this.pop(span.spanId);
3159
- },
3160
- fail: async (error, patch) => {
3161
- const endedAt = this.now();
3162
- const errStr = error instanceof Error ? error.message : error;
3163
- await this.store.updateSpan(span.spanId, {
3164
- endedAt,
3165
- status: "error",
3166
- error: errStr,
3167
- ...patch
3168
- });
3169
- this.pop(span.spanId);
3170
- }
3171
- };
3172
- }
3173
- pop(spanId) {
3174
- const idx = this.stack.lastIndexOf(spanId);
3175
- if (idx >= 0) this.stack.splice(idx, 1);
3176
- }
3177
- // ── Typed span conveniences ────────────────────────────────────────
3178
- llm(init) {
3179
- return this.span({ kind: "llm", ...init });
3180
- }
3181
- tool(init) {
3182
- return this.span({ kind: "tool", ...init });
3183
- }
3184
- retrieval(init) {
3185
- return this.span({ kind: "retrieval", ...init });
3186
- }
3187
- async recordJudge(verdict) {
3188
- const spanId = this.id();
3189
- const now = this.now();
3190
- const full = {
3191
- spanId,
3192
- runId: this._runId,
3193
- kind: "judge",
3194
- startedAt: now,
3195
- endedAt: now,
3196
- status: "ok",
3197
- ...verdict
3198
- };
3199
- await this.store.appendSpan(full);
3200
- return full;
3201
- }
3202
- sandbox(init) {
3203
- return this.span({ kind: "sandbox", ...init });
3204
- }
3205
- // ── Events ─────────────────────────────────────────────────────────
3206
- async emit(event) {
3207
- const full = {
3208
- eventId: this.id(),
3209
- runId: this._runId,
3210
- spanId: event.spanId ?? this.stack[this.stack.length - 1],
3211
- kind: event.kind,
3212
- timestamp: this.now(),
3213
- payload: event.payload ?? {}
3214
- };
3215
- await this.store.appendEvent(full);
3216
- return full;
3217
- }
3218
- // ── Budget ledger ──────────────────────────────────────────────────
3219
- async recordBudget(entry) {
3220
- const full = {
3221
- runId: this._runId,
3222
- timestamp: entry.timestamp ?? this.now(),
3223
- dimension: entry.dimension,
3224
- limit: entry.limit,
3225
- consumed: entry.consumed,
3226
- remaining: entry.remaining,
3227
- breached: entry.breached,
3228
- spanId: entry.spanId ?? this.stack[this.stack.length - 1]
3229
- };
3230
- await this.store.appendBudgetEntry(full);
3231
- if (full.breached) {
3232
- await this.emit({
3233
- kind: "budget_breach",
3234
- spanId: full.spanId,
3235
- payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
3236
- });
3237
- }
3238
- return full;
3239
- }
3240
- // ── Artifacts ──────────────────────────────────────────────────────
3241
- async recordArtifact(artifact) {
3242
- const full = { artifactId: this.id(), runId: this._runId, ...artifact };
3243
- await this.store.appendArtifact(full);
3244
- return full;
3245
- }
3246
- // ── Nested composition ─────────────────────────────────────────────
3247
- /**
3248
- * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
3249
- * Returns the fn's return value. Use this for the 95% case.
3250
- */
3251
- async within(init, fn) {
3252
- const handle = await this.span(init);
3253
- try {
3254
- const result = await fn(handle);
3255
- await handle.end();
3256
- return result;
3257
- } catch (err) {
3258
- await handle.fail(err instanceof Error ? err : String(err));
3259
- throw err;
3260
- }
3261
- }
3262
- };
3263
- function cryptoRandomId() {
3264
- if (typeof globalThis.crypto?.randomUUID === "function") return globalThis.crypto.randomUUID();
3265
- return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
3266
- }
3267
- function llmSpanFromProvider(args) {
3268
- return {
3269
- name: args.name ?? args.model,
3270
- model: args.model,
3271
- messages: args.messages,
3272
- output: args.output,
3273
- inputTokens: args.usage?.inputTokens,
3274
- outputTokens: args.usage?.outputTokens,
3275
- cachedTokens: args.usage?.cachedTokens,
3276
- reasoningTokens: args.usage?.reasoningTokens,
3277
- costUsd: args.costUsd,
3278
- finishReason: args.finishReason
3279
- };
3280
- }
3281
-
3282
4406
  // src/sandbox-harness.ts
3283
4407
  var vitestTestParser = {
3284
4408
  id: "vitest",
@@ -3887,6 +5011,157 @@ function safeJson(x) {
3887
5011
  }
3888
5012
  }
3889
5013
 
5014
+ // src/propose-review-control.ts
5015
// Instruction used for the next shot when no reviewer instruction is available.
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
5016
/**
 * Runs a propose → verify → review cycle on top of `runAgentControlLoop`.
 *
 * Each control-loop action is one "shot": `config.propose` produces a new
 * state, `config.verify` checks it, and, only when verification fails,
 * `config.review` decides whether to continue. The loop stops when
 * verification passes, when the reviewer declines to continue, or when
 * `confidenceFloorWindow` consecutive reviews have confidence at or below
 * `confidenceFloor`. If the reviewer throws, the previous review (or a
 * placeholder) is reused and the loop continues.
 *
 * Defaults: `maxShots` 10, `confidenceFloor` 0.3, `confidenceFloorWindow` 2,
 * `scenarioId` "propose-review-control", `actionFailure` "stop".
 *
 * @param {object} config - Goal, initial state, `propose`/`verify`/`review` callbacks and optional tuning.
 * @returns {Promise<object>} Whatever `runAgentControlLoop` resolves to.
 */
async function runProposeReviewAsControlLoop(config) {
  const maxShots = config.maxShots ?? 10;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;

  // Number of consecutive reviews at or below the confidence floor.
  let lowConfidenceStreak = 0;
  // Snapshot handed to the control loop by `observe`; replaced after each shot.
  let current = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await memory.load(),
    completed: false,
    reviewAvailable: false
  };

  const validate = ({ state }) => {
    const layers = state.verification.failingLayers;
    return [
      objectiveEval({
        id: "verification",
        passed: state.verification.pass,
        score: state.verification.score,
        severity: "critical",
        detail: state.verification.pass ? "verification passed" : `verification failed${layers?.length ? `: ${layers.join(", ")}` : ""}`
      })
    ];
  };

  const shouldStop = ({ state }) => {
    const { verification } = state;
    if (verification.pass) {
      return { stop: true, pass: true, reason: "verification passed", score: verification.score };
    }
    if (state.completed) {
      return {
        stop: true,
        pass: false,
        reason: "reviewer stopped continuation",
        score: verification.score,
        failureClass: failureClassFromVerification(verification)
      };
    }
    return { stop: false, pass: false, reason: "verification still failing", score: verification.score };
  };

  const decide = ({ state }) => ({
    type: "continue",
    action: { type: "propose-review-shot", shot: state.shot + 1 },
    reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
  });

  const act = async (action, ctx) => {
    const shot = action.shot;
    const proposal = await config.propose({
      shot,
      goal: config.goal,
      state: current.state,
      priorReview: current.priorReview,
      abortSignal: ctx.abortSignal,
      emitter: ctx.emitter
    });
    const nextState = proposal.state;
    const verification = await config.verify(nextState);

    let review = null;
    let reviewAvailable = false;
    let reviewError;
    let shouldContinue = !verification.pass;
    if (verification.pass) {
      // Nothing to review: record a terminal, fully confident review.
      review = {
        observations: "Verification passed.",
        diagnosis: "No further revision needed.",
        nextShotInstruction: "",
        shouldContinue: false,
        confidence: 1
      };
    } else {
      try {
        review = await config.review({
          shot,
          goal: config.goal,
          state: nextState,
          verification,
          traceSummary: proposal.traceSummary,
          memory: await memory.load()
        });
        reviewAvailable = true;
        shouldContinue = review.shouldContinue;
        lowConfidenceStreak = review.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
        if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
      } catch (err) {
        // Reviewer failure is not fatal: reuse the last review (or a
        // placeholder) and keep going.
        reviewError = err instanceof Error ? err.message : String(err);
        review = current.priorReview ?? {
          observations: "Reviewer unavailable.",
          diagnosis: reviewError,
          nextShotInstruction: fallbackInstruction,
          shouldContinue: true,
          confidence: 0
        };
        shouldContinue = true;
      }
    }

    const reviewFields = review ?? {
      observations: "No review.",
      diagnosis: "",
      nextShotInstruction: fallbackInstruction,
      shouldContinue,
      confidence: 0
    };
    const entry = {
      ...reviewFields,
      shot,
      timestamp: Date.now(),
      verification: {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers
      }
    };
    await memory.append(entry);

    current = {
      shot,
      state: nextState,
      priorReview: review,
      verification,
      traceSummary: proposal.traceSummary,
      memory: await memory.load(),
      completed: verification.pass || !shouldContinue,
      reviewAvailable,
      reviewError
    };
    return {
      state: nextState,
      verification,
      traceSummary: proposal.traceSummary,
      review,
      reviewAvailable,
      reviewError
    };
  };

  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => current,
    validate,
    shouldStop,
    decide,
    act
  });
}
5160
/**
 * Maps a verification result to a failure class: `undefined` when it passed,
 * "instruction_following" when it lists at least one failing layer, and
 * "unknown" otherwise.
 */
function controlFailureClassFromVerification(verification) {
  if (verification.pass) return void 0;
  const hasFailingLayers = Boolean(verification.failingLayers?.length);
  return hasFailingLayers ? "instruction_following" : "unknown";
}
5164
+
3890
5165
  // src/trace/schema.ts
3891
5166
  var TRACE_SCHEMA_VERSION = "1.0.0";
3892
5167
  var FAILURE_CLASSES = [
@@ -5557,7 +6832,7 @@ var Dataset = class _Dataset {
5557
6832
  * Write to disk for contamination-verifiable archives.
5558
6833
  */
5559
6834
  toJsonl() {
5560
- return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
6835
+ return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
5561
6836
  }
5562
6837
  static fromJsonl(jsonl, manifest) {
5563
6838
  const scenarios = [];
@@ -5570,18 +6845,18 @@ var Dataset = class _Dataset {
5570
6845
  }
5571
6846
  };
5572
6847
  async function hashScenarios(scenarios) {
5573
- const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
6848
+ const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
5574
6849
  const text = JSON.stringify(canonical);
5575
6850
  const bytes = new TextEncoder().encode(text);
5576
6851
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
5577
6852
  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
5578
6853
  }
5579
- function canonicalize(v) {
6854
+ function canonicalize2(v) {
5580
6855
  if (v === null || typeof v !== "object") return v;
5581
- if (Array.isArray(v)) return v.map(canonicalize);
6856
+ if (Array.isArray(v)) return v.map(canonicalize2);
5582
6857
  const keys = Object.keys(v).sort();
5583
6858
  const out = {};
5584
- for (const k of keys) out[k] = canonicalize(v[k]);
6859
+ for (const k of keys) out[k] = canonicalize2(v[k]);
5585
6860
  return out;
5586
6861
  }
5587
6862
  function seededShuffle(items, seed) {
@@ -7615,7 +8890,7 @@ function attributeStep(op, prmA, prmB) {
7615
8890
 
7616
8891
  // src/pre-registration.ts
7617
8892
  async function signManifest(m) {
7618
- const canonical = canonicalize2(m);
8893
+ const canonical = canonicalize3(m);
7619
8894
  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
7620
8895
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
7621
8896
  const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
@@ -7645,12 +8920,12 @@ async function evaluateHypothesis(manifest, observed) {
7645
8920
  rejectionReasons: reasons
7646
8921
  };
7647
8922
  }
7648
- function canonicalize2(v) {
8923
+ function canonicalize3(v) {
7649
8924
  if (v === null || typeof v !== "object") return v;
7650
- if (Array.isArray(v)) return v.map(canonicalize2);
8925
+ if (Array.isArray(v)) return v.map(canonicalize3);
7651
8926
  const keys = Object.keys(v).sort();
7652
8927
  const out = {};
7653
- for (const k of keys) out[k] = canonicalize2(v[k]);
8928
+ for (const k of keys) out[k] = canonicalize3(v[k]);
7654
8929
  return out;
7655
8930
  }
7656
8931
 
@@ -12426,6 +13701,7 @@ export {
12426
13701
  ExperimentTracker,
12427
13702
  FAILURE_CLASSES,
12428
13703
  FileSystemExperimentStore,
13704
+ FileSystemFeedbackTrajectoryStore,
12429
13705
  FileSystemOutcomeStore,
12430
13706
  FileSystemTraceStore,
12431
13707
  HeldOutGate,
@@ -12433,6 +13709,7 @@ export {
12433
13709
  HoldoutLockedError,
12434
13710
  INTENT_MATCH_JUDGE_VERSION,
12435
13711
  InMemoryExperimentStore,
13712
+ InMemoryFeedbackTrajectoryStore,
12436
13713
  InMemoryOutcomeStore,
12437
13714
  InMemoryTraceStore,
12438
13715
  InMemoryTrialCache,
@@ -12472,9 +13749,11 @@ export {
12472
13749
  adversarialJudge,
12473
13750
  aggregateLlm,
12474
13751
  aggregateRunScore,
13752
+ allCriticalPassed,
12475
13753
  analyzeAntiSlop,
12476
13754
  analyzeSeries,
12477
13755
  argHash,
13756
+ assignFeedbackSplit,
12478
13757
  attributeCounterfactuals,
12479
13758
  deterministicSplit as benchmarkDeterministicSplit,
12480
13759
  benchmarks_exports as benchmarks,
@@ -12512,6 +13791,8 @@ export {
12512
13791
  computeToolUseMetrics,
12513
13792
  confidenceInterval,
12514
13793
  containsAll,
13794
+ controlFailureClassFromVerification,
13795
+ controlRunToFeedbackTrajectory,
12515
13796
  correlateLayers,
12516
13797
  correlationStudy,
12517
13798
  createAntiSlopJudge,
@@ -12519,6 +13800,7 @@ export {
12519
13800
  createCustomJudge,
12520
13801
  createDefaultReviewer,
12521
13802
  createDomainExpertJudge,
13803
+ createFeedbackTrajectory,
12522
13804
  createIntentMatchJudge,
12523
13805
  createLlmReviewer,
12524
13806
  createSandboxCodeMutator,
@@ -12536,6 +13818,7 @@ export {
12536
13818
  estimateCost,
12537
13819
  estimateTokens,
12538
13820
  euAiActReport,
13821
+ evaluateActionPolicy,
12539
13822
  evaluateContract,
12540
13823
  evaluateHypothesis,
12541
13824
  evaluateOracles,
@@ -12547,6 +13830,10 @@ export {
12547
13830
  extractAssetUrls,
12548
13831
  extractErrorCount,
12549
13832
  failureClusterView,
13833
+ feedbackTrajectoriesToDatasetScenarios,
13834
+ feedbackTrajectoriesToOptimizerRows,
13835
+ feedbackTrajectoryToDatasetScenario,
13836
+ feedbackTrajectoryToOptimizerRow,
12550
13837
  fileContains,
12551
13838
  fileExists,
12552
13839
  findAutoMatchNoExpectation,
@@ -12601,6 +13888,7 @@ export {
12601
13888
  nonRefusalRubric,
12602
13889
  normalizeScores,
12603
13890
  notBlocked,
13891
+ objectiveEval,
12604
13892
  outputLengthRubric,
12605
13893
  pairedBootstrap,
12606
13894
  pairedTTest,
@@ -12609,6 +13897,7 @@ export {
12609
13897
  paretoChart,
12610
13898
  paretoFrontier,
12611
13899
  paretoFrontierWithCrowding,
13900
+ parseFeedbackTrajectoriesJsonl,
12612
13901
  parseReflectionResponse,
12613
13902
  parseRunRecordSafe,
12614
13903
  partialCredit,
@@ -12635,7 +13924,10 @@ export {
12635
13924
  renderMarkdown,
12636
13925
  renderMarkdownReport,
12637
13926
  renderPlaybookMarkdown,
13927
+ renderPreferenceMemoryMarkdown,
12638
13928
  renderSteeringText,
13929
+ replayFeedbackTrajectories,
13930
+ replayFeedbackTrajectory,
12639
13931
  replayScorerOverCorpus,
12640
13932
  replayTraceThroughJudge,
12641
13933
  requiredSampleSize,
@@ -12644,6 +13936,7 @@ export {
12644
13936
  roundTripRunRecord,
12645
13937
  rowCount,
12646
13938
  rowWhere,
13939
+ runAgentControlLoop,
12647
13940
  runAssertions,
12648
13941
  runCanaries,
12649
13942
  runCounterfactual,
@@ -12657,6 +13950,7 @@ export {
12657
13950
  runKeywordCoverageJudgeUrl,
12658
13951
  runPromptEvolution,
12659
13952
  runProposeReview,
13953
+ runProposeReviewAsControlLoop,
12660
13954
  runReferenceReplay,
12661
13955
  runSelfPlay,
12662
13956
  runSemanticConceptJudge,
@@ -12673,13 +13967,18 @@ export {
12673
13967
  selectHarnessVariant,
12674
13968
  selfPreference,
12675
13969
  sentenceReorderMutator,
13970
+ serializeFeedbackTrajectoriesJsonl,
12676
13971
  signManifest,
12677
13972
  soc2Report,
12678
13973
  statusAdvanced,
13974
+ stopOnNoProgress,
13975
+ stopOnRepeatedAction,
12679
13976
  stripFencedJson,
12680
13977
  stuckLoopView,
13978
+ subjectiveEval,
12681
13979
  summarize,
12682
13980
  summarizeHarnessResults,
13981
+ summarizePreferenceMemory,
12683
13982
  summaryTable,
12684
13983
  testJudge,
12685
13984
  textInSnapshot,
@@ -12705,6 +14004,7 @@ export {
12705
14004
  welchsTTest,
12706
14005
  whitespaceCollapseMutator,
12707
14006
  wilcoxonSignedRank,
14007
+ withAssignedFeedbackSplit,
12708
14008
  wranglerDeployRunner
12709
14009
  };
12710
14010
  //# sourceMappingURL=index.js.map