@tangle-network/agent-eval 0.17.1 → 0.17.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1386,6 +1386,1205 @@ function printDriverSummary(results) {
1386
1386
  console.log(`${completedCount}/${results.length} personas completed`);
1387
1387
  }
1388
1388
 
1389
+ // src/trace/emitter.ts
1390
/**
 * Records the trace of one run by forwarding fully-populated records
 * (run, spans, events, budget entries, artifacts) to a store.
 *
 * The emitter keeps a stack of the ids of spans that were opened through
 * `span()` and have not been ended/failed yet. New spans, events and budget
 * entries default to the innermost open span when no explicit span id is given.
 */
var TraceEmitter = class {
  store;
  /** Ids of currently open spans, innermost last. */
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param store   Sink exposing appendRun/updateRun/appendSpan/updateSpan/
   *                appendEvent/appendBudgetEntry/appendArtifact.
   * @param options Optional `now` (clock), `id` (id generator) and `runId`;
   *                a missing `runId` is generated with the id generator.
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id shared by every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Closes the run: "failed" only when `outcome.pass === false`, else "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Closes the run as "aborted" with a failing outcome carrying `reason` as notes. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, persists it and pushes it on the open-span stack.
   * Fields in `init` override the generated defaults.
   * @returns a handle with `span`, `end(patch)` and `fail(error, patch)`.
   */
  async span(init) {
    const generatedId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId: generatedId,
      parentSpanId: parent,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    // An explicit `parentSpanId: undefined` in init must not erase the
    // parent inferred from the stack.
    if (span.parentSpanId === undefined) span.parentSpanId = parent;
    await this.store.appendSpan(span);
    // Push the id the span actually carries (init may supply its own spanId),
    // otherwise end()/fail() could never pop it again.
    this.stack.push(span.spanId);
    return this.handle(span);
  }
  /** Builds the end/fail handle for an already persisted span. */
  handle(span) {
    // Persist the closing patch; always drop the span from the stack, even if
    // the store write rejects, so later spans are not parented to a dead span.
    const close = async (update) => {
      try {
        await this.store.updateSpan(span.spanId, update);
      } finally {
        this.pop(span.spanId);
      }
    };
    return {
      span,
      end: async (patch) => {
        await close({ endedAt: this.now(), status: "ok", ...patch });
      },
      fail: async (error, patch) => {
        const errStr = error instanceof Error ? error.message : error;
        await close({ endedAt: this.now(), status: "error", error: errStr, ...patch });
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the open-span stack. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Appends an already-finished "judge" span (start and end share one
   * timestamp). The span is never pushed on the open-span stack.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event; `spanId` defaults to the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /**
   * Appends a budget ledger entry and, when it is marked `breached`,
   * additionally emits a "budget_breach" event for the same span.
   */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record; fields in `artifact` override the generated id/runId. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   *
   * An error thrown by `fn` is always the error the caller sees, even when
   * recording the failure in the store fails as well. A rejection from
   * `end()` propagates as-is and does not mark the span as failed.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    let result;
    try {
      result = await fn(handle);
    } catch (err) {
      try {
        await handle.fail(err instanceof Error ? err : String(err));
      } catch {
        // Best-effort bookkeeping: the error from `fn` below takes precedence.
      }
      throw err;
    }
    await handle.end();
    return result;
  }
};
1548
/**
 * Produces a fresh identifier string.
 * Uses `crypto.randomUUID()` when the runtime exposes it; otherwise falls
 * back to a base-36 timestamp joined with a short base-36 random suffix.
 */
function cryptoRandomId() {
  const hasUuid = typeof globalThis.crypto?.randomUUID === "function";
  if (hasUuid) {
    return globalThis.crypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
1552
/**
 * Maps a provider call description onto the fields of an LLM span init.
 * The span name falls back to the model when no name is given; token counts
 * are read from the optional `usage` object.
 */
function llmSpanFromProvider(args) {
  const usage = args.usage;
  const spanInit = {
    name: args.name ?? args.model,
    model: args.model,
    messages: args.messages,
    output: args.output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd: args.costUsd,
    finishReason: args.finishReason
  };
  return spanInit;
}
1566
+
1567
+ // src/control-runtime.ts
1568
// Budget applied by runAgentControlLoop for any limit the caller leaves out:
// at most 8 steps and 5 minutes (300 000 ms) of wall-clock time.
var DEFAULT_BUDGET = { maxSteps: 8, maxWallMs: 300000 };
1572
/**
 * Runs an observe → validate → (stop?) → decide → act loop and returns a
 * result object describing why and how the run ended.
 *
 * Per step the loop: checks abort and cost budget, asks the stop policy
 * (`config.shouldStop`, or the default based on the evals), asks
 * `config.decide` for the next decision, applies the repeated-action policy,
 * calls `config.act`, then re-observes and re-validates, applies the
 * no-progress policy, re-checks the cost budget and asks the stop policy again.
 *
 * Errors thrown by the callbacks do not reject the returned promise: each is
 * appended to `runtimeErrors` with its phase and step index, and the run ends
 * with `stoppedBy: "runtime-error"`. The exception is a failing `act` when
 * `config.actionFailure` is not "stop" (default "continue"): the failure is
 * recorded in the step's `actionOutcome` and the loop carries on.
 *
 * When `config.store` is set, a TraceEmitter writes the run, one tool span
 * per step, judge spans per eval and cost budget entries. Trace write
 * failures are collected in `runtimeErrors` and never stop the run.
 *
 * The failure `reason` is always the message of the error that stopped the
 * run, even if trace or `onStep` errors were recorded before or after it.
 *
 * @param config Uses: intent, observe, validate, decide, act, and optionally
 *   budget, actionFailure, signal, store, scenarioId, projectId, variantId,
 *   shouldStop, stopPolicies, getActionCostUsd, onStep.
 * @returns the result object passed through `finish`.
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  const actionFailure = config.actionFailure ?? "continue";
  // Internal controller: aborted by the caller's signal or by the wall timer.
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  let spentCostUsd = 0;
  const runtimeErrors = [];
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  let state;
  let evals;

  // Records a callback failure and returns its message, so the caller reports
  // exactly this error rather than whatever sits at a given array position.
  const noteRuntimeError = (phase, stepIndex, err) => {
    const failure = runtimeError(phase, stepIndex, err);
    runtimeErrors.push(failure);
    return failure.message;
  };
  // Builds the result (fixed key order; `score` key only when supplied) and
  // closes the run. finalState/finalEvals default to the current state/evals.
  const finishRun = (fields) => finish(emitter, {
    intent: config.intent,
    pass: fields.pass,
    completed: fields.completed,
    reason: fields.reason,
    ..."score" in fields ? { score: fields.score } : {},
    steps: history,
    finalState: "finalState" in fields ? fields.finalState : state,
    finalEvals: "finalEvals" in fields ? fields.finalEvals : evals,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    failureClass: fields.failureClass,
    runtimeErrors,
    stoppedBy: fields.stoppedBy
  });
  // Ends the run because a callback threw.
  const finishOnRuntimeError = (reason, fields = {}) => finishRun({
    pass: false,
    completed: false,
    reason,
    ...fields,
    failureClass: "unknown",
    stoppedBy: "runtime-error"
  });
  // Ends the run because the cost budget is used up.
  const finishOnBudget = (reason, score) => finishRun({
    pass: false,
    completed: false,
    reason,
    score,
    failureClass: "budget_exceeded",
    stoppedBy: "budget"
  });
  // Ends the run because a stuck-detection policy fired.
  const finishOnStuck = (reason, score) => finishRun({
    pass: false,
    completed: true,
    reason,
    score,
    failureClass: "tool_recovery_failure",
    stoppedBy: "stop-policy"
  });
  // Ends the run with the verdict returned by the stop policy.
  const finishOnStopPolicy = (stop) => finishRun({
    pass: stop.pass,
    completed: true,
    reason: stop.reason,
    score: stop.score,
    failureClass: stop.failureClass,
    stoppedBy: "stop-policy"
  });
  // Caller-provided stop policy, or the default derived from the evals.
  const evaluateStop = async (ctx) => config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);

  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      return finishOnRuntimeError(noteRuntimeError("observe", 0, err), { finalState: void 0, finalEvals: [] });
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      return finishOnRuntimeError(noteRuntimeError("validate", 0, err), { finalEvals: [] });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      if (controller.signal.aborted) {
        return finishRun({
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return finishOnBudget(budgetStop.reason, averageScore(evals));
      }
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      let stop;
      try {
        stop = await evaluateStop(ctx);
      } catch (err) {
        return finishOnRuntimeError(noteRuntimeError("stop-policy", stepIndex, err), { score: averageScore(evals) });
      }
      if (stop.stop) {
        return finishOnStopPolicy(stop);
      }
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        return finishOnRuntimeError(noteRuntimeError("decide", stepIndex, err), { score: averageScore(evals) });
      }
      if (decision.type === "stop") {
        return finishRun({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return finishOnStuck(repeatedActionStop.reason, averageScore(evals));
      }
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Step record as of "now"; evalsAfter is whatever `evals` currently holds.
      const snapshotStep = (afterState) => ({
        index: stepIndex,
        decision,
        beforeState,
        afterState,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only finite, positive costs count towards the budget.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        actionOutcome = {
          ok: false,
          error: noteRuntimeError("act", stepIndex, err),
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") {
          await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
          const failedStep = snapshotStep(state);
          history.push(failedStep);
          await runOnStep(config.onStep, failedStep, runtimeErrors);
          return finishOnRuntimeError(actionOutcome.error ?? "action failed", { score: averageScore(evals) });
        }
      }
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        const message = noteRuntimeError("observe", stepIndex, err);
        const failedStep = snapshotStep(beforeState);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(message));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return finishOnRuntimeError(message, { score: averageScore(evals), finalState: beforeState });
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        const message = noteRuntimeError("validate", stepIndex, err);
        const failedStep = snapshotStep(state);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(message));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return finishOnRuntimeError(message, { score: averageScore(evals) });
      }
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = snapshotStep(state);
      history.push(step);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        }));
      }
      await runOnStep(config.onStep, step, runtimeErrors);
      if (noProgressStop.stop) {
        return finishOnStuck(noProgressStop.reason, scoreAfter);
      }
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return finishOnBudget(postStepBudgetStop.reason, scoreAfter);
      }
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      let postStepStop;
      try {
        postStepStop = await evaluateStop(postStepCtx);
      } catch (err) {
        return finishOnRuntimeError(noteRuntimeError("stop-policy", stepIndex + 1, err), { score: averageScore(evals) });
      }
      if (postStepStop.stop) {
        return finishOnStopPolicy(postStepStop);
      }
    }
    return finishRun({
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything not handled above is reported under the "act" phase.
    return finishOnRuntimeError(noteRuntimeError("act", history.length, err), { finalState: void 0, finalEvals: [] });
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
2093
/**
 * Builds a stop-policy object that sets `maxNoProgressSteps`, keeping any
 * other fields passed in `options`.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = { ...options };
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
2096
/**
 * Builds a stop-policy object that sets `maxRepeatedActions`, keeping any
 * other fields passed in `options`.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = { ...options };
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
2099
/** Returns a copy of `input` flagged as an objective eval (`objective: true`). */
function objectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = true;
  return tagged;
}
2102
/** Returns a copy of `input` flagged as a subjective eval (`objective: false`). */
function subjectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = false;
  return tagged;
}
2105
/**
 * True when no eval with severity "critical" or "error" is failing.
 * Failing evals of any other severity do not block.
 */
function allCriticalPassed(evals) {
  const blocking = (result) => {
    if (result.passed) return false;
    return result.severity === "critical" || result.severity === "error";
  };
  return !evals.some(blocking);
}
2108
/**
 * Assembles the context object handed to the stop policy, `decide` and `act`.
 * Adds the elapsed wall time since `started` and, when the budget has a cost
 * limit, the remaining cost (never below 0); without a limit it is undefined.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const wallMs = Date.now() - started;
  let remainingCostUsd;
  if (budget.maxCostUsd !== void 0) {
    remainingCostUsd = Math.max(0, budget.maxCostUsd - spentCostUsd);
  }
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
2123
/**
 * Stop policy used when the caller supplies none: never stops without evals,
 * and stops with a pass as soon as no critical/error eval is failing.
 */
function defaultStopDecision(evals) {
  if (!evals.length) {
    return { stop: false, pass: false, reason: "no evals yet" };
  }
  const pass = allCriticalPassed(evals);
  const score = averageScore(evals);
  if (pass) {
    return { stop: true, pass: true, reason: "all critical evals passed", score };
  }
  return { stop: false, pass: false, reason: "critical evals still failing", score };
}
2128
/**
 * Mean of the numeric `score` fields, rounded to three decimals.
 * Evals without a numeric score are ignored; returns undefined when none has one.
 */
function averageScore(evals) {
  let total = 0;
  let count = 0;
  evals.forEach((result) => {
    const score = result.score;
    if (typeof score === "number") {
      total += score;
      count += 1;
    }
  });
  if (!count) return void 0;
  return Math.round(total / count * 1e3) / 1e3;
}
2133
/**
 * Decides whether the cost budget is used up: stops once the spent cost has
 * reached `budget.maxCostUsd`. Without a cost limit it never stops.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const limit = budget.maxCostUsd;
  const exhausted = limit !== void 0 && spentCostUsd >= limit;
  if (!exhausted) {
    return { stop: false, reason: "" };
  }
  return { stop: true, reason: `budget exhausted: maxCostUsd=${limit}` };
}
2142
/**
 * Writes a "usd" budget ledger entry for the cost spent so far, attached to
 * the given step handle's span. Does nothing without an emitter or without a
 * cost limit. Write failures are collected through runTrace.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  if (!emitter) return;
  const limit = budget.maxCostUsd;
  if (limit === void 0) return;
  const writeEntry = () => emitter.recordBudget({
    dimension: "usd",
    limit,
    consumed: spentCostUsd,
    remaining: Math.max(0, limit - spentCostUsd),
    breached: spentCostUsd >= limit,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeEntry);
}
2154
/**
 * Records one judge span per eval result, one after another. The judge
 * targets `targetSpanId` when given, otherwise the run id. A missing numeric
 * score is recorded as 1 for a passed eval and 0 for a failed one.
 * Does nothing without an emitter; write failures go through runTrace.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  // Built lazily inside the traced callback so errors are collected by runTrace.
  const toVerdict = (result) => ({
    judgeId: result.objective ? "objective-validator" : "subjective-judge",
    targetSpanId: targetSpanId ?? emitter.runId,
    name: `control-eval/${result.id}`,
    dimension: result.id,
    score: typeof result.score === "number" ? result.score : result.passed ? 1 : 0,
    rationale: result.detail,
    evidence: result.evidence,
    attributes: {
      phase,
      passed: result.passed,
      severity: result.severity,
      objective: result.objective
    }
  });
  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(toVerdict(result)));
  }
}
2174
/**
 * Invokes the optional per-step callback. A throwing or rejecting callback
 * does not propagate: it is recorded as an "on-step" runtime error instead.
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (err) {
      const failure = runtimeError("on-step", step.index, err);
      runtimeErrors.push(failure);
    }
  }
}
2182
/**
 * Runs a trace write and returns its result. A failing write never
 * propagates: it is recorded as a "trace" runtime error and undefined is
 * returned.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let written;
  try {
    written = await write();
  } catch (err) {
    runtimeErrors.push(runtimeError("trace", stepIndex, err));
    return void 0;
  }
  return written;
}
2190
/**
 * Tracks consecutive steps without progress. A step counts as "no progress"
 * when the state fingerprint equals the previous one AND the score moved by
 * less than `minScoreDelta` (default 0.001; missing scores count as 0).
 * Stops once the streak reaches `maxNoProgressSteps`; with no positive limit
 * the policy is off and the streak stays 0.
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) {
    return { stop: false, reason: "", streak: 0 };
  }
  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const after = args.scoreAfter ?? 0;
  const before = args.scoreBefore ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  const previous = args.lastStateFingerprint;
  const stateUnchanged = previous !== void 0 && previous === args.stateFingerprint;
  let streak = 0;
  if (stateUnchanged && scoreFlat) {
    streak = args.currentStreak + 1;
  }
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
2200
/**
 * Stops once the same action has been chosen `maxRepeatedActions` times in a
 * row. With no positive limit the policy is off.
 */
function repeatedActionStopDecision(policies, streak) {
  const keepGoing = { stop: false, reason: "" };
  const limit = policies?.maxRepeatedActions;
  if (!limit || limit <= 0) return keepGoing;
  if (streak < limit) return keepGoing;
  return { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` };
}
2208
/**
 * Fingerprints a state for no-progress detection, preferring the caller's
 * `policies.stateFingerprint` over the built-in stable fingerprint.
 */
function fingerprintState(state, policies) {
  return policies?.stateFingerprint
    ? policies.stateFingerprint(state)
    : stableFingerprint(state);
}
2212
/**
 * Fingerprints an action for repeated-action detection, preferring the
 * caller's `policies.actionFingerprint` over the built-in stable fingerprint.
 */
function fingerprintAction(action, policies) {
  return policies?.actionFingerprint
    ? policies.actionFingerprint(action)
    : stableFingerprint(action);
}
2216
/**
 * Turns any value into a string that is equal for structurally equal inputs.
 * Strings are returned as-is, other primitives via `String`, and everything
 * else as JSON with object keys sorted so key order does not matter.
 *
 * Always returns a string: values JSON cannot serialise (it throws, e.g. for
 * BigInt or deep cycles, or yields undefined, e.g. for functions and symbols)
 * fall back to `String(value)`.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    // JSON.stringify returns undefined (not a string) for functions/symbols.
    return JSON.stringify(sortForFingerprint(value)) ?? String(value);
  } catch {
    return String(value);
  }
}
2225
/**
 * Returns a copy of `value` in which every plain object has its keys in
 * sorted order (recursively, including inside arrays), so that serialising
 * the result does not depend on key insertion order.
 *
 * Objects that define `toJSON` (for example Date) are replaced by their JSON
 * representation first; they have no own enumerable keys and would otherwise
 * all collapse to `{}`, making different values look identical.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // Guard against a toJSON that returns the object itself.
    if (json !== value) return sortForFingerprint(json);
  }
  const record = value;
  const sorted = {};
  for (const key of Object.keys(record).sort()) {
    sorted[key] = sortForFingerprint(record[key]);
  }
  return sorted;
}
2235
/**
 * Human-readable reason for an aborted signal: the error message for Error
 * reasons, the stringified reason otherwise, or "aborted" when the reason is
 * falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
2240
/**
 * Builds a runtime-error record for the given phase and step. Error
 * instances contribute their message; anything else is stringified.
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
2244
/**
 * Closes the traced run (when an emitter exists) with the result's outcome
 * and returns the result unchanged. The recorded score falls back to the
 * average of the final evals; a failing trace write is collected through
 * runTrace under the number of completed steps.
 */
async function finish(emitter, result) {
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
2253
+
2254
+ // src/feedback-trajectory.ts
2255
+ import { appendFile, mkdir, readFile } from "fs/promises";
2256
+ import { join } from "path";
2257
// Default percentage split across the four dataset partitions; the values
// add up to 100.
var DEFAULT_SPLIT_POLICY = { trainPct: 70, devPct: 15, testPct: 10, holdoutPct: 5 };
2263
/**
 * Feedback-trajectory store backed by a Map keyed by trajectory id.
 * Every value crossing the store boundary goes through `cloneTrajectory`,
 * both on the way in and on the way out.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();
  /** Stores (or replaces) a trajectory under its id. */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }
  /** Returns a copy of the trajectory, or null when the id is unknown. */
  async get(id) {
    const found = this.trajectories.get(id);
    if (!found) return null;
    return cloneTrajectory(found);
  }
  /** Returns copies of all trajectories accepted by `matchesFilter`. */
  async list(filter = {}) {
    const stored = [...this.trajectories.values()];
    const matching = stored.filter((trajectory) => matchesFilter(trajectory, filter));
    return matching.map(cloneTrajectory);
  }
  /**
   * Appends an attempt and sets `updatedAt` to the attempt's `createdAt`.
   * @throws Error when the trajectory id is unknown.
   */
  async appendAttempt(id, attempt) {
    const current = this.trajectories.get(id);
    if (!current) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const attempts = [...current.attempts, attempt];
    const next = cloneTrajectory({ ...current, attempts, updatedAt: attempt.createdAt });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
  /**
   * Adds a label and sets `updatedAt` to the label's `createdAt`. With an
   * `attemptId` the label is added to the `feedback` of the attempt with that
   * id; without one it is added to the trajectory-level `labels`.
   * @throws Error when the trajectory id is unknown.
   */
  async appendLabel(id, label, attemptId) {
    const current = this.trajectories.get(id);
    if (!current) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    let attempts = current.attempts;
    let labels = current.labels;
    if (attemptId) {
      attempts = current.attempts.map((attempt) => {
        if (attempt.id !== attemptId) return attempt;
        return { ...attempt, feedback: [...attempt.feedback ?? [], label] };
      });
    } else {
      labels = [...current.labels, label];
    }
    const next = cloneTrajectory({ ...current, attempts, labels, updatedAt: label.createdAt });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
2300
/**
 * Feedback-trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`).
 *
 * State is held in an `InMemoryFeedbackTrajectoryStore`; on first use the log
 * is replayed into it, and every mutation is applied in memory first and then
 * appended to the log.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight replay shared by concurrent callers so the log is replayed once.
  loading = null;

  /** @param {{ dir: string }} options - Directory that holds the log file. */
  constructor(options) {
    this.dir = options.dir;
  }

  /** Stores the trajectory in memory and appends a `save` record to the log. */
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }

  /** Returns the trajectory with the given id as reported by the in-memory store. */
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }

  /** Lists trajectories matching `filter` as reported by the in-memory store. */
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }

  /** Appends an attempt in memory, then records it in the log. */
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }

  /** Appends a label in memory, then records it in the log. */
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }

  /** Appends one JSON record as a line to the log, creating the directory if needed. */
  async append(record) {
    await mkdir(this.dir, { recursive: true });
    await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }

  /**
   * Ensures the log has been replayed into memory exactly once.
   *
   * Concurrent callers await the same replay; without this, two overlapping
   * first calls would each replay the file and duplicate attempts/labels.
   * A failed replay is not cached, so the next call retries.
   */
  async load() {
    if (this.loaded) return;
    this.loading ??= this.replay().finally(() => {
      this.loading = null;
    });
    await this.loading;
  }

  /**
   * Reads the log and re-applies each record to the in-memory store.
   * A missing file means "empty store"; any other read error is rethrown
   * rather than being mistaken for an empty log.
   */
  async replay() {
    const file = join(this.dir, "feedback-trajectories.ndjson");
    let raw = "";
    try {
      raw = await readFile(file, "utf8");
    } catch (err) {
      if (err?.code !== "ENOENT") throw err;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        if (record.op === "save") await this.memory.save(record.trajectory);
        if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
        if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
      } catch {
        // Best effort: a malformed or unappliable line (e.g. a torn write)
        // is skipped so the remaining records can still be recovered.
      }
    }
    this.loaded = true;
  }
};
2356
/**
 * Builds a feedback-trajectory record from partial input.
 *
 * `createdAt` defaults to the current time as an ISO string. When no `id` is
 * given, one is derived as `ft_<hex>` from a hash of project id, scenario id,
 * task intent and `createdAt`. `attempts` and `labels` default to empty arrays.
 *
 * @param {object} input - Partial trajectory; `task.intent` is read when `id` is absent.
 * @returns {object} The assembled trajectory.
 */
function createFeedbackTrajectory(input) {
  const { projectId, scenarioId, task, outcome, split, tags, metadata } = input;
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id === null || id === undefined) {
    const seed = `${projectId ?? ""}|${scenarioId ?? ""}|${task.intent}|${createdAt}`;
    id = `ft_${stableHash(seed).toString(16)}`;
  }
  return {
    id,
    projectId,
    scenarioId,
    task,
    attempts: input.attempts ?? [],
    labels: input.labels ?? [],
    outcome,
    split,
    tags,
    createdAt,
    metadata
  };
}
2373
/**
 * Deterministically assigns a trajectory to "train", "dev", "test" or "holdout".
 *
 * The bucket is a hash of project id, scenario id, trajectory id and task
 * intent, taken modulo the sum of the four percentages, so the same
 * trajectory always lands in the same split for a given policy.
 *
 * @param {object} trajectory - Reads `projectId`, `scenarioId`, `id`, `task.intent`.
 * @param {object} [policy] - Partial override of `trainPct`/`devPct`/`testPct`/`holdoutPct`;
 *   missing or nullish fields fall back to `DEFAULT_SPLIT_POLICY`.
 * @returns {"train"|"dev"|"test"|"holdout"}
 * @throws {Error} When a percentage is not a finite, non-negative number, or
 *   when the percentages do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  const split = {};
  for (const name of ["trainPct", "devPct", "testPct", "holdoutPct"]) {
    // `??` keeps an explicit `undefined`/`null` override from erasing the default,
    // which previously produced a NaN total and always returned "holdout".
    const pct = policy?.[name] ?? DEFAULT_SPLIT_POLICY[name];
    if (typeof pct !== "number" || !Number.isFinite(pct) || pct < 0) {
      throw new Error(`assignFeedbackSplit: ${name} must be a finite, non-negative number`);
    }
    split[name] = pct;
  }
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
  if (bucket < split.trainPct) return "train";
  if (bucket < split.trainPct + split.devPct) return "dev";
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
  return "holdout";
}
2383
/**
 * Returns a shallow copy of the trajectory that is guaranteed to carry a
 * `split`: an existing split is kept, otherwise one is assigned via
 * `assignFeedbackSplit` using the optional policy.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const copy = { ...trajectory };
  copy.split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return copy;
}
2389
/**
 * Converts a trajectory into a dataset scenario record.
 *
 * The scenario id is the trajectory's `scenarioId` (falling back to its `id`),
 * the payload is the trajectory with a split assigned, and the tags combine
 * the project id (when set), the trajectory's own tags, and a fixed
 * `source: "feedback-trajectory"` marker that always wins.
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const payload = withAssignedFeedbackSplit(trajectory);
  const projectTag = payload.projectId ? { projectId: payload.projectId } : {};
  const ownTags = payload.tags ?? {};
  const tags = { ...projectTag, ...ownTags, source: "feedback-trajectory" };
  return {
    id: payload.scenarioId ?? payload.id,
    split: payload.split,
    payload,
    tags
  };
}
2402
/** Converts each trajectory to a dataset scenario, preserving input order. */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  return trajectories.map((trajectory, index, all) => feedbackTrajectoryToDatasetScenario(trajectory, index, all));
}
2405
/**
 * Flattens a trajectory into an optimizer row.
 *
 * The score is the outcome's score when present, otherwise one derived from
 * the labels. `labelKinds` lists each distinct label kind once, in first-seen
 * order; the full label list and outcome travel along in `metadata`.
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const kinds = /* @__PURE__ */ new Set();
  for (const label of labels) kinds.add(label.kind);
  const { id, projectId, split, outcome } = trajectory;
  const metadata = {
    projectId,
    split,
    intent: trajectory.task.intent,
    attempts: trajectory.attempts.length,
    outcome,
    labels
  };
  return {
    scenarioId: trajectory.scenarioId ?? id,
    trajectoryId: id,
    labelKinds: Array.from(kinds),
    score: outcome?.score ?? scoreFromLabels(labels),
    metadata
  };
}
2422
/** Converts each trajectory to an optimizer row, preserving input order. */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  return trajectories.map((trajectory, index, all) => feedbackTrajectoryToOptimizerRow(trajectory, index, all));
}
2425
/**
 * Distils labelled trajectories into a ranked list of preference entries.
 *
 * Each label that yields an instruction becomes a candidate. Candidates whose
 * instructions are equal after lower-casing and whitespace normalisation are
 * merged, keeping the highest-weight one (the earliest wins a tie). The result
 * is sorted by descending weight and truncated to `options.maxEntries`
 * (default 20).
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const maxEntries = options.maxEntries ?? 20;
  const strongest = /* @__PURE__ */ new Map();
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (!instruction) continue;
      const candidate = {
        instruction,
        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
        weight: weightForLabel(label),
        sourceTrajectoryId: trajectory.id,
        sourceLabelId: label.id,
        category: label.kind
      };
      const key = instruction.toLowerCase().replace(/\s+/g, " ").trim();
      const incumbent = strongest.get(key);
      // Replacing a Map value keeps the key's original insertion position.
      if (incumbent === undefined || candidate.weight > incumbent.weight) strongest.set(key, candidate);
    }
  }
  const ranked = Array.from(strongest.values());
  ranked.sort((left, right) => right.weight - left.weight);
  return ranked.slice(0, maxEntries);
}
2450
/**
 * Renders preference entries as a Markdown document: a "# Preference Memory"
 * heading followed by one bullet per entry with its rationale and source
 * trajectory id. The output is trimmed and ends with a single newline.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const lines = ["# Preference Memory", ""];
  for (const { instruction, rationale, sourceTrajectoryId } of entries) {
    lines.push(
      `- ${instruction}`,
      ` Rationale: ${rationale}`,
      ` Source: ${sourceTrajectoryId}`,
      ""
    );
  }
  const document = lines.join("\n").trim();
  return document + "\n";
}
2460
/**
 * Serialises trajectories as JSONL: sorted by `id` (via `localeCompare`),
 * each passed through `canonicalize` before `JSON.stringify`, one per line,
 * with a trailing newline. The input array is not mutated.
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = trajectories.slice();
  ordered.sort((left, right) => left.id.localeCompare(right.id));
  const rows = ordered.map((trajectory) => JSON.stringify(canonicalize(trajectory)));
  return rows.join("\n") + "\n";
}
2463
/**
 * Parses JSONL text into an array of trajectories, one JSON value per
 * non-blank line. Blank and whitespace-only lines are skipped.
 *
 * @param {string} jsonl - Newline-separated JSON documents.
 * @returns {Array<object>} Parsed values in file order.
 * @throws {SyntaxError} When a line is not valid JSON; the message names the
 *   1-based line number and the original parser error is attached as `cause`.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  const trajectories = [];
  for (const [index, line] of jsonl.split("\n").entries()) {
    if (!line.trim()) continue;
    try {
      trajectories.push(JSON.parse(line));
    } catch (err) {
      // Keep the SyntaxError type callers may already catch, but say where it failed.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`parseFeedbackTrajectoriesJsonl: invalid JSON on line ${index + 1}: ${detail}`, { cause: err });
    }
  }
  return trajectories;
}
2471
/**
 * Converts a finished control-loop run into a feedback trajectory.
 *
 * Each run step becomes one attempt, a single system label records the
 * pass/fail verdict, and the outcome carries score, cost and stop metadata.
 * The trajectory id is the run's `runId`, or `ft_control_<hex>` derived from
 * a hash of intent and `createdAt` when the run has none.
 *
 * @param {object} run - Reads `runId`, `intent`, `steps`, `pass`, `reason`,
 *   `score`, `spentCostUsd`, `stoppedBy`, `failureClass`.
 * @param {object} [options] - `createdAt`, `projectId`, `scenarioId`,
 *   `artifactType` (default "action"), `artifactFromStep`, `proposedActionFromStep`.
 * @returns {object} Whatever `createFeedbackTrajectory` returns for the assembled input.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;

  // Artifact preference: caller-supplied extractor, then the action result, then the decision.
  const attemptFromStep = (step) => ({
    id: `${trajectoryId}_step_${step.index}`,
    stepIndex: step.index,
    artifactType: options.artifactType ?? "action",
    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
    proposedAction: options.proposedActionFromStep?.(step),
    evals: step.evalsAfter,
    createdAt: step.startedAt,
    metadata: {
      decision: step.decision,
      actionOutcome: step.actionOutcome
    }
  });

  const verdictLabel = {
    source: "system",
    kind: run.pass ? "approve" : "reject",
    value: run.pass,
    reason: run.reason,
    severity: run.pass ? "info" : "error",
    createdAt
  };

  const outcome = {
    success: run.pass,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };

  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts: run.steps.map((step) => attemptFromStep(step)),
    labels: [verdictLabel],
    outcome
  });
}
2516
/**
 * Collects the trajectory-level labels followed by every attempt's feedback
 * labels, dropping duplicates. Two labels are duplicates when they share an
 * `id`, or — for labels without an id — the same source, kind, creation time
 * and JSON-serialised value. The first occurrence is kept.
 */
function allLabels(trajectory) {
  const attemptFeedback = trajectory.attempts.flatMap((attempt) => attempt.feedback ?? []);
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const label of [...trajectory.labels, ...attemptFeedback]) {
    const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    unique.push(label);
  }
  return unique;
}
2529
/**
 * Derives a score in [0, 1] from labels, rounded to three decimals.
 *
 * "approve"/"select" count as 1, "reject"/"policy_block" as 0, and a "rate"
 * label with a numeric value contributes that value clamped to [0, 1].
 * Other labels — including a "rate" whose value is NaN, which previously
 * poisoned the whole average — are ignored.
 *
 * @param {Array<object>} labels
 * @returns {number|undefined} Mean of the scorable labels, or `undefined`
 *   when there are none.
 */
function scoreFromLabels(labels) {
  const scored = [];
  for (const label of labels) {
    switch (label.kind) {
      case "approve":
      case "select":
        scored.push(1);
        break;
      case "reject":
      case "policy_block":
        scored.push(0);
        break;
      case "rate":
        // NaN survives `typeof === "number"` and clamping, so exclude it explicitly.
        if (typeof label.value === "number" && !Number.isNaN(label.value)) {
          scored.push(Math.max(0, Math.min(1, label.value)));
        }
        break;
      default:
        break;
    }
  }
  if (!scored.length) return void 0;
  const mean = scored.reduce((sum, value) => sum + value, 0) / scored.length;
  return Math.round(mean * 1e3) / 1e3;
}
2540
/**
 * Turns a label into a reusable instruction sentence, or `undefined` when the
 * label has no reason or its kind is not one of reject, revision_request,
 * select, approve or comment. Task intents are shortened to 80 characters
 * via `compact` before being quoted.
 */
function instructionFromLabel(trajectory, label) {
  const { kind, reason } = label;
  if (!reason) return void 0;
  switch (kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${reason}`;
    case "revision_request":
      return `Revise similar work by applying: ${reason}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${reason}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${reason}`;
    case "comment":
      return reason;
    default:
      return void 0;
  }
}
2548
/**
 * Computes a label's ranking weight as severity factor × source factor.
 * Severity: critical 4, error 3, warning 2, anything else 1.
 * Source: user 3, metric or environment 2, anything else 1.
 */
function weightForLabel(label) {
  const severityFactors = /* @__PURE__ */ new Map([
    ["critical", 4],
    ["error", 3],
    ["warning", 2]
  ]);
  const sourceFactors = /* @__PURE__ */ new Map([
    ["user", 3],
    ["metric", 2],
    ["environment", 2]
  ]);
  const severityFactor = severityFactors.get(label.severity) ?? 1;
  const sourceFactor = sourceFactors.get(label.source) ?? 1;
  return severityFactor * sourceFactor;
}
2553
/**
 * Tests a trajectory against a filter. Each of `projectId`, `scenarioId` and
 * `split` constrains the trajectory only when set to a truthy value;
 * `tag` is a `[key, value]` pair that must equal `trajectory.tags[key]`.
 */
function matchesFilter(trajectory, filter) {
  for (const field of ["projectId", "scenarioId", "split"]) {
    const wanted = filter[field];
    if (wanted && trajectory[field] !== wanted) return false;
  }
  if (!filter.tag) return true;
  const [tagKey, tagValue] = filter.tag;
  return trajectory.tags?.[tagKey] === tagValue;
}
2563
/**
 * Deep-copies a trajectory through a JSON round trip. As with any JSON
 * serialisation, properties whose value is `undefined` are dropped.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
2566
/**
 * Collapses runs of whitespace to single spaces, trims the ends, and — when
 * the result is longer than `max` characters — cuts it to `max`, trims again
 * and appends "...".
 */
function compact(value, max) {
  const normalized = value.replace(/\s+/g, " ").trim();
  const tooLong = normalized.length > max;
  if (!tooLong) return normalized;
  const head = normalized.slice(0, max).trim();
  return `${head}...`;
}
2570
/**
 * Hashes a string to an unsigned 32-bit integer: starting from 2166136261,
 * each UTF-16 code unit is XORed in and the state multiplied by 16777619
 * using 32-bit integer multiplication.
 */
function stableHash(input) {
  const OFFSET_BASIS = 2166136261;
  const PRIME = 16777619;
  let state = OFFSET_BASIS;
  let position = 0;
  while (position < input.length) {
    state = Math.imul(state ^ input.charCodeAt(position), PRIME);
    position += 1;
  }
  // Coerce the signed 32-bit result of Math.imul to an unsigned value.
  return state >>> 0;
}
2578
/**
 * Returns a copy of a JSON-like value in which every object's keys are in
 * sorted order (recursively, including objects nested in arrays).
 * Primitives and `null` are returned as-is.
 */
function canonicalize(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) return value;
  if (Array.isArray(value)) return value.map(canonicalize);
  const sortedKeys = Object.keys(value).sort();
  return sortedKeys.reduce((out, key) => {
    out[key] = canonicalize(value[key]);
    return out;
  }, {});
}
2587
+
1389
2588
  // src/prompt-registry.ts
1390
2589
  var PromptRegistry = class {
1391
2590
  entries = /* @__PURE__ */ new Map();
@@ -3101,184 +4300,6 @@ var FileSystemTraceStore = class {
3101
4300
  }
3102
4301
  };
3103
4302
 
3104
- // src/trace/emitter.ts
3105
/**
 * Emits runs, spans, events, budget entries and artifacts for a single run
 * into a trace store.
 *
 * Open spans are tracked on an internal stack so that new spans, events and
 * budget entries default to the most recently opened span as their parent.
 *
 * @param store - Object providing `appendRun`, `updateRun`, `appendSpan`,
 *   `updateSpan`, `appendEvent`, `appendBudgetEntry` and `appendArtifact`.
 * @param options - Optional `now` (clock), `id` (id generator) and `runId`.
 */
var TraceEmitter = class {
  store;
  stack = [];
  _runId;
  now;
  id;
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Marks the run "failed" when `outcome.pass === false`, otherwise "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Marks the run "aborted" with a failing outcome carrying `reason` as notes. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, parented to `init.parentSpanId` or the innermost open span,
   * and returns a handle with `end` / `fail`.
   */
  async span(init) {
    const spanId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId,
      parentSpanId: parent,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    await this.store.appendSpan(span);
    this.stack.push(spanId);
    return this.handle(span);
  }
  /** Builds the `{ span, end, fail }` handle for an open span. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Always unwind the stack: a failed store write must not leave this
          // span as the implicit parent of every later span and event.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the open-span stack. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /** Records an already-finished "judge" span (start and end share one timestamp). */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event, attached to `event.spanId` or the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /** Appends a budget entry and, when it is breached, also emits a "budget_breach" event. */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record with a generated `artifactId`. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   *
   * Only errors thrown by `fn` mark the span as failed; an error from closing
   * the span itself propagates without a second, contradictory store update.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    let result;
    try {
      result = await fn(handle);
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
    await handle.end();
    return result;
  }
};
3263
/**
 * Generates an id: a UUID from `globalThis.crypto.randomUUID` when that
 * function exists, otherwise a base-36 timestamp joined by "-" to up to
 * eight base-36 characters from `Math.random()`.
 */
function cryptoRandomId() {
  const hasRandomUUID = typeof globalThis.crypto?.randomUUID === "function";
  if (hasRandomUUID) return globalThis.crypto.randomUUID();
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
3267
/**
 * Maps a provider call description onto LLM span fields. The span name
 * defaults to the model, and token counts are read from the optional `usage`
 * object (each is `undefined` when `usage` is absent).
 */
function llmSpanFromProvider(args) {
  const { model, messages, output, usage, costUsd, finishReason } = args;
  return {
    name: args.name ?? model,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
}
3281
-
3282
4303
  // src/sandbox-harness.ts
3283
4304
  var vitestTestParser = {
3284
4305
  id: "vitest",
@@ -3887,6 +4908,157 @@ function safeJson(x) {
3887
4908
  }
3888
4909
  }
3889
4910
 
4911
+ // src/propose-review-control.ts
4912
// Instruction handed to the next shot when no reviewer guidance is available.
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Runs a propose → verify → review cycle on top of `runAgentControlLoop`.
 *
 * Each loop step ("shot") calls `config.propose`, verifies the proposed state
 * with `config.verify`, and — when verification fails — asks `config.review`
 * whether and how to continue. The loop stops when verification passes, when
 * the reviewer declines to continue, or when reviewer confidence has been at
 * or below `confidenceFloor` for `confidenceFloorWindow` consecutive reviews.
 * A reviewer error does not stop the loop: the prior review (or a fallback)
 * is reused and the next shot proceeds.
 *
 * @param {object} config - Reads `goal`, `initialState`, `propose`, `verify`,
 *   `review`, and optional `maxShots` (10), `maxWallMs`, `confidenceFloor`
 *   (0.3), `confidenceFloorWindow` (2), `memory`, `fallbackInstruction`,
 *   `failureClassFromVerification`, `store`, `scenarioId`, `projectId`,
 *   `variantId`, `actionFailure` ("stop").
 * @returns {Promise<*>} Whatever `runAgentControlLoop` resolves to.
 */
async function runProposeReviewAsControlLoop(config) {
  const maxShots = config.maxShots ?? 10;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;

  // Mutable loop state shared by the callbacks below.
  let lowConfidenceStreak = 0;
  let current = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await memory.load(),
    completed: false,
    reviewAvailable: false
  };

  const describeVerification = (verification) => {
    if (verification.pass) return "verification passed";
    const layers = verification.failingLayers;
    return `verification failed${layers?.length ? `: ${layers.join(", ")}` : ""}`;
  };

  const validate = ({ state }) => [
    objectiveEval({
      id: "verification",
      passed: state.verification.pass,
      score: state.verification.score,
      severity: "critical",
      detail: describeVerification(state.verification)
    })
  ];

  const shouldStop = ({ state }) => {
    const { verification } = state;
    if (verification.pass) {
      return { stop: true, pass: true, reason: "verification passed", score: verification.score };
    }
    if (state.completed) {
      return {
        stop: true,
        pass: false,
        reason: "reviewer stopped continuation",
        score: verification.score,
        failureClass: failureClassFromVerification(verification)
      };
    }
    return { stop: false, pass: false, reason: "verification still failing", score: verification.score };
  };

  const decide = ({ state }) => ({
    type: "continue",
    action: { type: "propose-review-shot", shot: state.shot + 1 },
    reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
  });

  const act = async (action, ctx) => {
    const shot = action.shot;
    const proposeOut = await config.propose({
      shot,
      goal: config.goal,
      state: current.state,
      priorReview: current.priorReview,
      abortSignal: ctx.abortSignal,
      emitter: ctx.emitter
    });
    const nextState = proposeOut.state;
    const verification = await config.verify(nextState);

    let review;
    let reviewAvailable = false;
    let reviewError;
    let shouldContinue = !verification.pass;

    if (verification.pass) {
      // No reviewer call is needed once verification succeeds.
      review = {
        observations: "Verification passed.",
        diagnosis: "No further revision needed.",
        nextShotInstruction: "",
        shouldContinue: false,
        confidence: 1
      };
    } else {
      try {
        review = await config.review({
          shot,
          goal: config.goal,
          state: nextState,
          verification,
          traceSummary: proposeOut.traceSummary,
          memory: await memory.load()
        });
        reviewAvailable = true;
        shouldContinue = review.shouldContinue;
        lowConfidenceStreak = review.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
        if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
      } catch (err) {
        // Reviewer failure is non-fatal: reuse the last review (or a fallback) and keep going.
        reviewError = err instanceof Error ? err.message : String(err);
        review = current.priorReview ?? {
          observations: "Reviewer unavailable.",
          diagnosis: reviewError,
          nextShotInstruction: fallbackInstruction,
          shouldContinue: true,
          confidence: 0
        };
        shouldContinue = true;
      }
    }

    await memory.append({
      ...review,
      shot,
      timestamp: Date.now(),
      verification: {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers
      }
    });
    const refreshedMemory = await memory.load();

    current = {
      shot,
      state: nextState,
      priorReview: review,
      verification,
      traceSummary: proposeOut.traceSummary,
      memory: refreshedMemory,
      completed: verification.pass || !shouldContinue,
      reviewAvailable,
      reviewError
    };

    return {
      state: nextState,
      verification,
      traceSummary: proposeOut.traceSummary,
      review,
      reviewAvailable,
      reviewError
    };
  };

  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => current,
    validate,
    shouldStop,
    decide,
    act
  });
}
5057
/**
 * Maps a verification result to a failure class: `undefined` when it passed,
 * "instruction_following" when it failed with at least one failing layer
 * listed, and "unknown" otherwise.
 */
function controlFailureClassFromVerification(verification) {
  if (verification.pass) return void 0;
  const hasFailingLayers = Boolean(verification.failingLayers?.length);
  return hasFailingLayers ? "instruction_following" : "unknown";
}
5061
+
3890
5062
  // src/trace/schema.ts
3891
5063
  var TRACE_SCHEMA_VERSION = "1.0.0";
3892
5064
  var FAILURE_CLASSES = [
@@ -5210,7 +6382,7 @@ function assertNonNegative(n, name) {
5210
6382
 
5211
6383
  // src/muffled-gate-scanner.ts
5212
6384
  import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
5213
- import { join } from "path";
6385
+ import { join as join2 } from "path";
5214
6386
  function codeOf(line) {
5215
6387
  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
5216
6388
  }
@@ -5314,11 +6486,11 @@ var UNIVERSAL_FINDERS = [
5314
6486
  function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
5315
6487
  const matches2 = [];
5316
6488
  const walk = (rel) => {
5317
- const abs = join(repoRoot, rel);
6489
+ const abs = join2(repoRoot, rel);
5318
6490
  if (!existsSync2(abs)) return;
5319
6491
  for (const entry of readdirSync(abs)) {
5320
- const sub = join(rel, entry);
5321
- const subAbs = join(repoRoot, sub);
6492
+ const sub = join2(rel, entry);
6493
+ const subAbs = join2(repoRoot, sub);
5322
6494
  let st;
5323
6495
  try {
5324
6496
  st = statSync(subAbs);
@@ -5347,7 +6519,7 @@ function scanForMuffledGates(opts) {
5347
6519
  const findings = [];
5348
6520
  const scanned = /* @__PURE__ */ new Set();
5349
6521
  for (const file of opts.scanFiles) {
5350
- const abs = join(opts.repoRoot, file);
6522
+ const abs = join2(opts.repoRoot, file);
5351
6523
  if (!existsSync2(abs)) continue;
5352
6524
  const text = readFileSync2(abs, "utf8");
5353
6525
  for (const find of opts.finders) findings.push(...find(file, text));
@@ -5362,7 +6534,7 @@ function scanForMuffledGates(opts) {
5362
6534
  );
5363
6535
  for (const file of importers) {
5364
6536
  if (scanned.has(file)) continue;
5365
- const abs = join(opts.repoRoot, file);
6537
+ const abs = join2(opts.repoRoot, file);
5366
6538
  if (!existsSync2(abs)) continue;
5367
6539
  const text = readFileSync2(abs, "utf8");
5368
6540
  for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
@@ -5557,7 +6729,7 @@ var Dataset = class _Dataset {
5557
6729
  * Write to disk for contamination-verifiable archives.
5558
6730
  */
5559
6731
  toJsonl() {
5560
- return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
6732
+ return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
5561
6733
  }
5562
6734
  static fromJsonl(jsonl, manifest) {
5563
6735
  const scenarios = [];
@@ -5570,18 +6742,18 @@ var Dataset = class _Dataset {
5570
6742
  }
5571
6743
  };
5572
6744
  async function hashScenarios(scenarios) {
5573
- const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
6745
+ const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
5574
6746
  const text = JSON.stringify(canonical);
5575
6747
  const bytes = new TextEncoder().encode(text);
5576
6748
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
5577
6749
  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
5578
6750
  }
5579
- function canonicalize(v) {
6751
+ function canonicalize2(v) {
5580
6752
  if (v === null || typeof v !== "object") return v;
5581
- if (Array.isArray(v)) return v.map(canonicalize);
6753
+ if (Array.isArray(v)) return v.map(canonicalize2);
5582
6754
  const keys = Object.keys(v).sort();
5583
6755
  const out = {};
5584
- for (const k of keys) out[k] = canonicalize(v[k]);
6756
+ for (const k of keys) out[k] = canonicalize2(v[k]);
5585
6757
  return out;
5586
6758
  }
5587
6759
  function seededShuffle(items, seed) {
@@ -7350,7 +8522,7 @@ async function commitBisect(options) {
7350
8522
  }
7351
8523
  async function promptBisect(options) {
7352
8524
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
7353
- const join3 = (paragraphs) => paragraphs.join("\n\n");
8525
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
7354
8526
  const goodParas = split(options.good);
7355
8527
  const badParas = split(options.bad);
7356
8528
  if (goodParas.length !== badParas.length) {
@@ -7368,7 +8540,7 @@ async function promptBisect(options) {
7368
8540
  const result = await bisect({
7369
8541
  good: goodMask,
7370
8542
  bad: badMask,
7371
- runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
8543
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
7372
8544
  maxIterations: options.maxIterations ?? n + 5,
7373
8545
  halfway: (g, b) => {
7374
8546
  for (let i = 0; i < g.length; i++) {
@@ -7399,12 +8571,12 @@ async function promptBisect(options) {
7399
8571
  }
7400
8572
  }
7401
8573
  const materializedPath = result.path.map((s) => ({
7402
- state: join3(paragraphsFor(s.state)),
8574
+ state: join4(paragraphsFor(s.state)),
7403
8575
  score: s.score,
7404
8576
  pass: s.pass
7405
8577
  }));
7406
8578
  return {
7407
- culprit: join3(paragraphsFor(culprit)),
8579
+ culprit: join4(paragraphsFor(culprit)),
7408
8580
  path: materializedPath,
7409
8581
  converged: result.converged,
7410
8582
  inputInconsistent: result.inputInconsistent,
@@ -7615,7 +8787,7 @@ function attributeStep(op, prmA, prmB) {
7615
8787
 
7616
8788
  // src/pre-registration.ts
7617
8789
  async function signManifest(m) {
7618
- const canonical = canonicalize2(m);
8790
+ const canonical = canonicalize3(m);
7619
8791
  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
7620
8792
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
7621
8793
  const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
@@ -7645,12 +8817,12 @@ async function evaluateHypothesis(manifest, observed) {
7645
8817
  rejectionReasons: reasons
7646
8818
  };
7647
8819
  }
7648
- function canonicalize2(v) {
8820
+ function canonicalize3(v) {
7649
8821
  if (v === null || typeof v !== "object") return v;
7650
- if (Array.isArray(v)) return v.map(canonicalize2);
8822
+ if (Array.isArray(v)) return v.map(canonicalize3);
7651
8823
  const keys = Object.keys(v).sort();
7652
8824
  const out = {};
7653
- for (const k of keys) out[k] = canonicalize2(v[k]);
8825
+ for (const k of keys) out[k] = canonicalize3(v[k]);
7654
8826
  return out;
7655
8827
  }
7656
8828
 
@@ -8459,7 +9631,7 @@ function mergeSignals(a, b) {
8459
9631
  // src/command-runner.ts
8460
9632
  import { spawnSync } from "child_process";
8461
9633
  import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
8462
- import { join as join2 } from "path";
9634
+ import { join as join3 } from "path";
8463
9635
  var localCommandRunner = {
8464
9636
  name: "local",
8465
9637
  async run(input) {
@@ -8506,7 +9678,7 @@ var localCommandRunner = {
8506
9678
  const out = [];
8507
9679
  for (const name of entries) {
8508
9680
  try {
8509
- const st = statSync2(join2(path, name));
9681
+ const st = statSync2(join3(path, name));
8510
9682
  out.push({
8511
9683
  name,
8512
9684
  isDirectory: st.isDirectory(),
@@ -12426,6 +13598,7 @@ export {
12426
13598
  ExperimentTracker,
12427
13599
  FAILURE_CLASSES,
12428
13600
  FileSystemExperimentStore,
13601
+ FileSystemFeedbackTrajectoryStore,
12429
13602
  FileSystemOutcomeStore,
12430
13603
  FileSystemTraceStore,
12431
13604
  HeldOutGate,
@@ -12433,6 +13606,7 @@ export {
12433
13606
  HoldoutLockedError,
12434
13607
  INTENT_MATCH_JUDGE_VERSION,
12435
13608
  InMemoryExperimentStore,
13609
+ InMemoryFeedbackTrajectoryStore,
12436
13610
  InMemoryOutcomeStore,
12437
13611
  InMemoryTraceStore,
12438
13612
  InMemoryTrialCache,
@@ -12472,9 +13646,11 @@ export {
12472
13646
  adversarialJudge,
12473
13647
  aggregateLlm,
12474
13648
  aggregateRunScore,
13649
+ allCriticalPassed,
12475
13650
  analyzeAntiSlop,
12476
13651
  analyzeSeries,
12477
13652
  argHash,
13653
+ assignFeedbackSplit,
12478
13654
  attributeCounterfactuals,
12479
13655
  deterministicSplit as benchmarkDeterministicSplit,
12480
13656
  benchmarks_exports as benchmarks,
@@ -12512,6 +13688,8 @@ export {
12512
13688
  computeToolUseMetrics,
12513
13689
  confidenceInterval,
12514
13690
  containsAll,
13691
+ controlFailureClassFromVerification,
13692
+ controlRunToFeedbackTrajectory,
12515
13693
  correlateLayers,
12516
13694
  correlationStudy,
12517
13695
  createAntiSlopJudge,
@@ -12519,6 +13697,7 @@ export {
12519
13697
  createCustomJudge,
12520
13698
  createDefaultReviewer,
12521
13699
  createDomainExpertJudge,
13700
+ createFeedbackTrajectory,
12522
13701
  createIntentMatchJudge,
12523
13702
  createLlmReviewer,
12524
13703
  createSandboxCodeMutator,
@@ -12547,6 +13726,10 @@ export {
12547
13726
  extractAssetUrls,
12548
13727
  extractErrorCount,
12549
13728
  failureClusterView,
13729
+ feedbackTrajectoriesToDatasetScenarios,
13730
+ feedbackTrajectoriesToOptimizerRows,
13731
+ feedbackTrajectoryToDatasetScenario,
13732
+ feedbackTrajectoryToOptimizerRow,
12550
13733
  fileContains,
12551
13734
  fileExists,
12552
13735
  findAutoMatchNoExpectation,
@@ -12601,6 +13784,7 @@ export {
12601
13784
  nonRefusalRubric,
12602
13785
  normalizeScores,
12603
13786
  notBlocked,
13787
+ objectiveEval,
12604
13788
  outputLengthRubric,
12605
13789
  pairedBootstrap,
12606
13790
  pairedTTest,
@@ -12609,6 +13793,7 @@ export {
12609
13793
  paretoChart,
12610
13794
  paretoFrontier,
12611
13795
  paretoFrontierWithCrowding,
13796
+ parseFeedbackTrajectoriesJsonl,
12612
13797
  parseReflectionResponse,
12613
13798
  parseRunRecordSafe,
12614
13799
  partialCredit,
@@ -12635,6 +13820,7 @@ export {
12635
13820
  renderMarkdown,
12636
13821
  renderMarkdownReport,
12637
13822
  renderPlaybookMarkdown,
13823
+ renderPreferenceMemoryMarkdown,
12638
13824
  renderSteeringText,
12639
13825
  replayScorerOverCorpus,
12640
13826
  replayTraceThroughJudge,
@@ -12644,6 +13830,7 @@ export {
12644
13830
  roundTripRunRecord,
12645
13831
  rowCount,
12646
13832
  rowWhere,
13833
+ runAgentControlLoop,
12647
13834
  runAssertions,
12648
13835
  runCanaries,
12649
13836
  runCounterfactual,
@@ -12657,6 +13844,7 @@ export {
12657
13844
  runKeywordCoverageJudgeUrl,
12658
13845
  runPromptEvolution,
12659
13846
  runProposeReview,
13847
+ runProposeReviewAsControlLoop,
12660
13848
  runReferenceReplay,
12661
13849
  runSelfPlay,
12662
13850
  runSemanticConceptJudge,
@@ -12673,13 +13861,18 @@ export {
12673
13861
  selectHarnessVariant,
12674
13862
  selfPreference,
12675
13863
  sentenceReorderMutator,
13864
+ serializeFeedbackTrajectoriesJsonl,
12676
13865
  signManifest,
12677
13866
  soc2Report,
12678
13867
  statusAdvanced,
13868
+ stopOnNoProgress,
13869
+ stopOnRepeatedAction,
12679
13870
  stripFencedJson,
12680
13871
  stuckLoopView,
13872
+ subjectiveEval,
12681
13873
  summarize,
12682
13874
  summarizeHarnessResults,
13875
+ summarizePreferenceMemory,
12683
13876
  summaryTable,
12684
13877
  testJudge,
12685
13878
  textInSnapshot,
@@ -12705,6 +13898,7 @@ export {
12705
13898
  welchsTTest,
12706
13899
  whitespaceCollapseMutator,
12707
13900
  wilcoxonSignedRank,
13901
+ withAssignedFeedbackSplit,
12708
13902
  wranglerDeployRunner
12709
13903
  };
12710
13904
  //# sourceMappingURL=index.js.map