@tangle-network/agent-eval 0.17.0 → 0.17.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1386,6 +1386,1205 @@ function printDriverSummary(results) {
1386
1386
  console.log(`${completedCount}/${results.length} personas completed`);
1387
1387
  }
1388
1388
 
1389
+ // src/trace/emitter.ts
1390
/**
 * Writes run, span, event, budget and artifact records for one run to a trace
 * store. Spans opened through `span()` are tracked on a stack so that new
 * spans, events and budget entries default to the innermost open span.
 */
var TraceEmitter = class {
  store;
  /** Ids of spans opened via `span()` that have not been ended; last entry is innermost. */
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param store Target of every write (`appendRun`, `updateRun`, `appendSpan`,
   *   `updateSpan`, `appendEvent`, `appendBudgetEntry`, `appendArtifact`).
   * @param options Optional overrides: `now` (clock), `id` (id generator) and
   *   `runId` (defaults to a freshly generated id).
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id attached to every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Marks the run "failed" when `outcome.pass === false`, otherwise "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Marks the run "aborted" with a failing outcome whose notes carry `reason`. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, appends it to the store and pushes it on the open-span stack.
   * The parent is `init.parentSpanId` when given, otherwise the innermost open span.
   * @returns A handle exposing the span plus `end(patch)` / `fail(error, patch)`.
   */
  async span(init) {
    const spanId = this.id();
    const parentSpanId = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId,
      parentSpanId,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    // `init` may carry an explicit `parentSpanId: undefined`; the spread above
    // would then erase the stack-derived parent, so re-apply the resolved value.
    span.parentSpanId = parentSpanId;
    await this.store.appendSpan(span);
    this.stack.push(spanId);
    return this.handle(span);
  }
  /** Builds the end/fail handle for an already-appended span. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Pop even when the store write fails, otherwise the dead span stays
          // on the stack and becomes the default parent of later records.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the stack; no-op when absent. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Appends an already-finished "judge" span (start and end share one timestamp).
   * The span is not pushed on the stack.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event, attached to `event.spanId` or else the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /** Appends a budget entry and, when it is breached, also emits a "budget_breach" event. */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record stamped with a new id and this run's id. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
1548
/**
 * Produces a fresh identifier: a UUID from the platform's Web Crypto when
 * `randomUUID` is available, otherwise a base-36 timestamp joined to a short
 * base-36 random suffix.
 */
function cryptoRandomId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
1552
/**
 * Maps a provider-call description onto the fields of an LLM span init.
 * The span name falls back to the model, and token counts are read from
 * `args.usage` when it is present.
 */
function llmSpanFromProvider(args) {
  const { name, model, messages, output, usage, costUsd, finishReason } = args;
  const spanInit = {
    name: name ?? model,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
  return spanInit;
}
1566
+
1567
+ // src/control-runtime.ts
1568
// Budget values used for any field the caller's `config.budget` leaves out:
// at most 8 steps and 5 minutes of wall-clock time.
var DEFAULT_BUDGET = {
  maxSteps: 8,
  maxWallMs: 300000
};
1572
/**
 * Drives an observe → validate → (stop?) → decide → act loop until a stop
 * condition is met, then resolves with a result record.
 *
 * Reads from `config`:
 * - `intent`, `observe`, `validate`, `decide`, `act` — the loop callbacks and goal.
 * - `shouldStop` — optional stop policy; `defaultStopDecision` is used when absent.
 * - `budget` — merged over `DEFAULT_BUDGET` (`maxSteps`, `maxWallMs`, optional `maxCostUsd`).
 * - `actionFailure` — "continue" (default) keeps looping after a failed `act`;
 *   "stop" ends the run on the first failed action.
 * - `signal` — optional upstream AbortSignal, forwarded to an internal controller.
 * - `store` — when set, a `TraceEmitter` records the run; trace write failures
 *   are collected in `runtimeErrors` and never abort the loop.
 * - `stopPolicies`, `getActionCostUsd`, `onStep`, `scenarioId`, `projectId`, `variantId`.
 *
 * Callback failures do not reject: they are recorded in `runtimeErrors` and the
 * result's `reason` carries the message of the error that actually stopped the
 * run (not whichever error happens to be first or last in the list).
 *
 * @returns The run result (`pass`, `completed`, `reason`, `steps`, `finalState`,
 *   `finalEvals`, `wallMs`, `spentCostUsd`, `runId`, `failureClass`,
 *   `runtimeErrors`, `stoppedBy`, and `score` where one was computed).
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  const actionFailure = config.actionFailure ?? "continue";
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs
    ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs)
    : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  let spentCostUsd = 0;
  const runtimeErrors = [];
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;

  // Records a callback failure and hands back the record so the caller can
  // report exactly this error, even if trace/on-step errors are appended later.
  const noteError = (phase, stepIndex, err) => {
    const failure = runtimeError(phase, stepIndex, err);
    runtimeErrors.push(failure);
    return failure;
  };

  // Builds the result record around the fields that vary per exit and closes
  // the traced run. `extra` carries `score` only when the caller supplied it,
  // so exits that never had a `score` key still do not get one.
  const conclude = ({ pass, completed, reason, finalState, finalEvals, failureClass, stoppedBy, ...extra }) => finish(emitter, {
    intent: config.intent,
    pass,
    completed,
    reason,
    ...extra,
    steps: history,
    finalState,
    finalEvals,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    failureClass,
    runtimeErrors,
    stoppedBy
  });

  // Exit used whenever a callback threw.
  const failRuntime = (failure, finalState, finalEvals, extra = {}) => conclude({
    pass: false,
    completed: false,
    reason: failure.message,
    ...extra,
    finalState,
    finalEvals,
    failureClass: "unknown",
    stoppedBy: "runtime-error"
  });

  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }
    let state;
    let evals;
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      return failRuntime(noteError("observe", 0, err), void 0, []);
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      return failRuntime(noteError("validate", 0, err), state, []);
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      if (controller.signal.aborted) {
        return conclude({
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          finalState: state,
          finalEvals: evals,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return conclude({
          pass: false,
          completed: false,
          reason: budgetStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      let stop;
      try {
        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
      } catch (err) {
        return failRuntime(noteError("stop-policy", stepIndex, err), state, evals, { score: averageScore(evals) });
      }
      if (stop.stop) {
        return conclude({
          pass: stop.pass,
          completed: true,
          reason: stop.reason,
          score: stop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: stop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        return failRuntime(noteError("decide", stepIndex, err), state, evals, { score: averageScore(evals) });
      }
      if (decision.type === "stop") {
        return conclude({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          finalState: state,
          finalEvals: evals,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return conclude({
          pass: false,
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Snapshot of this step as of the moment of the call; `evals` and
      // `actionOutcome` are read at call time, `endedAt` is stamped then too.
      const snapshotStep = (afterState) => ({
        index: stepIndex,
        decision,
        beforeState,
        afterState,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only finite, positive costs count against the budget.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        const failure = noteError("act", stepIndex, err);
        actionOutcome = {
          ok: false,
          error: failure.message,
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") {
          await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
          const failedStep = snapshotStep(state);
          history.push(failedStep);
          await runOnStep(config.onStep, failedStep, runtimeErrors);
          return conclude({
            pass: false,
            completed: false,
            reason: actionOutcome.error ?? "action failed",
            score: averageScore(evals),
            finalState: state,
            finalEvals: evals,
            failureClass: "unknown",
            stoppedBy: "runtime-error"
          });
        }
      }
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        const failure = noteError("observe", stepIndex, err);
        // `state` was not reassigned, so the step ends where it began.
        const failedStep = snapshotStep(beforeState);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(failure.message));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return failRuntime(failure, beforeState, evals, { score: averageScore(evals) });
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        const failure = noteError("validate", stepIndex, err);
        const failedStep = snapshotStep(state);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(failure.message));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return failRuntime(failure, state, evals, { score: averageScore(evals) });
      }
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = snapshotStep(state);
      history.push(step);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        }));
      }
      await runOnStep(config.onStep, step, runtimeErrors);
      if (noProgressStop.stop) {
        return conclude({
          pass: false,
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return conclude({
          pass: false,
          completed: false,
          reason: postStepBudgetStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      let postStepStop;
      try {
        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
      } catch (err) {
        return failRuntime(noteError("stop-policy", stepIndex + 1, err), state, evals, { score: averageScore(evals) });
      }
      if (postStepStop.stop) {
        return conclude({
          pass: postStepStop.pass,
          completed: true,
          reason: postStepStop.reason,
          score: postStepStop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: postStepStop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
    }
    return conclude({
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      finalState: state,
      finalEvals: evals,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything that escaped the per-phase handlers above is filed under "act".
    return failRuntime(noteError("act", history.length, err), void 0, []);
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
2093
/**
 * Builds a stop-policy object: a copy of `options` with
 * `maxNoProgressSteps` set to the given limit.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = { ...options };
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
2096
/**
 * Builds a stop-policy object: a copy of `options` with
 * `maxRepeatedActions` set to the given limit.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = { ...options };
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
2099
/** Returns a copy of `input` flagged as an objective eval (`objective: true`). */
function objectiveEval(input) {
  const evalResult = { ...input };
  evalResult.objective = true;
  return evalResult;
}
2102
/** Returns a copy of `input` flagged as a subjective eval (`objective: false`). */
function subjectiveEval(input) {
  const evalResult = { ...input };
  evalResult.objective = false;
  return evalResult;
}
2105
/**
 * True when no eval is a blocking failure, i.e. no eval both failed and has
 * severity "critical" or "error". An empty list yields true.
 */
function allCriticalPassed(evals) {
  const isBlockingFailure = (result) => !result.passed && (result.severity === "critical" || result.severity === "error");
  return !evals.some(isBlockingFailure);
}
2108
/**
 * Assembles the context object passed to the loop callbacks. `wallMs` is the
 * time elapsed since `started`; `remainingCostUsd` is undefined when the
 * budget has no `maxCostUsd`, otherwise the unspent amount floored at 0.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const wallMs = Date.now() - started;
  let remainingCostUsd;
  if (budget.maxCostUsd !== void 0) {
    remainingCostUsd = Math.max(0, budget.maxCostUsd - spentCostUsd);
  }
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
2123
/**
 * Stop policy used when the caller supplies none: keep going while there are
 * no evals or a blocking eval is failing; stop with a pass once
 * `allCriticalPassed` holds. The averaged score is attached whenever evals exist.
 */
function defaultStopDecision(evals) {
  if (!evals.length) {
    return { stop: false, pass: false, reason: "no evals yet" };
  }
  const pass = allCriticalPassed(evals);
  if (pass) {
    return { stop: true, pass: true, reason: "all critical evals passed", score: averageScore(evals) };
  }
  return { stop: false, pass: false, reason: "critical evals still failing", score: averageScore(evals) };
}
2128
/**
 * Mean of the numeric `score` fields in `evals`, rounded to three decimals.
 * Evals without a numeric score are ignored; returns undefined when none has one.
 */
function averageScore(evals) {
  const numericScores = [];
  evals.forEach((result) => {
    const score = result.score;
    if (typeof score === "number") numericScores.push(score);
  });
  if (!numericScores.length) return void 0;
  let total = 0;
  for (const score of numericScores) {
    total = total + score;
  }
  return Math.round(total / numericScores.length * 1e3) / 1e3;
}
2133
/**
 * Decides whether the cost budget is used up: stops once `spentCostUsd`
 * reaches `budget.maxCostUsd`. A budget without `maxCostUsd` never stops.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const costCapped = budget.maxCostUsd !== void 0;
  const exhausted = costCapped && spentCostUsd >= budget.maxCostUsd;
  if (!exhausted) {
    return { stop: false, reason: "" };
  }
  return {
    stop: true,
    reason: `budget exhausted: maxCostUsd=${budget.maxCostUsd}`
  };
}
2142
/**
 * Writes a "usd" budget ledger entry for the cost spent so far, attached to
 * the given step handle's span. Does nothing without an emitter or without a
 * `maxCostUsd` limit; write failures are absorbed by `runTrace`.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  const limit = budget.maxCostUsd;
  if (!emitter || limit === void 0) return;
  const writeEntry = () => emitter.recordBudget({
    dimension: "usd",
    limit,
    consumed: spentCostUsd,
    remaining: Math.max(0, limit - spentCostUsd),
    breached: spentCostUsd >= limit,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeEntry);
}
2154
/**
 * Records one judge span per eval result, one after another. Each verdict
 * targets `targetSpanId` when given, otherwise the emitter's run id, and uses
 * the eval's numeric score or else 1/0 from `passed`. No-op without an emitter;
 * write failures are absorbed by `runTrace`.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  // Built lazily inside the runTrace callback so a malformed eval is reported
  // as a trace error instead of escaping.
  const verdictFor = (result) => {
    let score;
    if (typeof result.score === "number") score = result.score;
    else score = result.passed ? 1 : 0;
    return {
      judgeId: result.objective ? "objective-validator" : "subjective-judge",
      targetSpanId: targetSpanId ?? emitter.runId,
      name: `control-eval/${result.id}`,
      dimension: result.id,
      score,
      rationale: result.detail,
      evidence: result.evidence,
      attributes: {
        phase,
        passed: result.passed,
        severity: result.severity,
        objective: result.objective
      }
    };
  };
  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(verdictFor(result)));
  }
}
2174
/**
 * Invokes the optional `onStep` callback with the finished step. A throwing
 * or rejecting callback is recorded as an "on-step" runtime error rather than
 * propagated.
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (err) {
      runtimeErrors.push(runtimeError("on-step", step.index, err));
    }
  }
}
2182
/**
 * Runs a trace write and returns its result. If the write throws or rejects,
 * the failure is recorded as a "trace" runtime error and undefined is
 * returned, so tracing problems never interrupt the caller.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let written;
  try {
    written = await write();
  } catch (err) {
    runtimeErrors.push(runtimeError("trace", stepIndex, err));
    return void 0;
  }
  return written;
}
2190
/**
 * Tracks consecutive steps without progress. A step counts as no progress
 * when the state fingerprint equals the previous one and the score moved by
 * less than `policies.minScoreDelta` (default 0.001). Stops once the streak
 * reaches `policies.maxNoProgressSteps`; without a positive limit the check is
 * disabled and the streak is reported as 0.
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) return { stop: false, reason: "", streak: 0 };
  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const before = args.scoreBefore ?? 0;
  const after = args.scoreAfter ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  const stateUnchanged = args.lastStateFingerprint !== void 0 && args.lastStateFingerprint === args.stateFingerprint;
  let streak = 0;
  if (stateUnchanged && scoreFlat) {
    streak = args.currentStreak + 1;
  }
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
2200
/**
 * Stops once the same action has been chosen `policies.maxRepeatedActions`
 * times in a row. Disabled when no positive limit is configured.
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const disabled = !limit || limit <= 0;
  if (disabled || streak < limit) {
    return { stop: false, reason: "" };
  }
  const reason = `stuck: repeated same action for ${streak} step(s)`;
  return { stop: true, reason };
}
2208
/**
 * Fingerprints a state for no-progress detection, using the caller's
 * `policies.stateFingerprint` when provided and `stableFingerprint` otherwise.
 */
function fingerprintState(state, policies) {
  const fingerprint = policies?.stateFingerprint ? policies.stateFingerprint(state) : stableFingerprint(state);
  return fingerprint;
}
2212
/**
 * Fingerprints an action for repeated-action detection, using the caller's
 * `policies.actionFingerprint` when provided and `stableFingerprint` otherwise.
 */
function fingerprintAction(action, policies) {
  const fingerprint = policies?.actionFingerprint ? policies.actionFingerprint(action) : stableFingerprint(action);
  return fingerprint;
}
2216
/**
 * Default fingerprint of a value. Strings are returned as-is; numbers,
 * booleans, null and undefined are stringified; anything else is serialised
 * as JSON with object keys sorted, falling back to `String(value)` when
 * serialisation throws.
 */
function stableFingerprint(value) {
  if (value == null) return String(value);
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
      return String(value);
    default:
      break;
  }
  try {
    const canonical = sortForFingerprint(value);
    return JSON.stringify(canonical);
  } catch {
    return String(value);
  }
}
2225
/**
 * Returns a copy of `value` in which every plain object has its keys in
 * sorted order, so that JSON serialisation does not depend on insertion
 * order. Arrays keep their element order; primitives are returned unchanged.
 *
 * Objects that define `toJSON` (for example `Date`) are replaced by their
 * `toJSON()` result first, as `JSON.stringify` would do. Rebuilding them from
 * `Object.keys` would otherwise collapse every such object to `{}` and make
 * different values fingerprint identically.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // Guard against toJSON returning the object itself, which would recurse forever.
    if (json !== value) return sortForFingerprint(json);
  }
  const record = value;
  const sorted = {};
  for (const key of Object.keys(record).sort()) {
    sorted[key] = sortForFingerprint(record[key]);
  }
  return sorted;
}
2235
/**
 * Human-readable reason for an aborted signal: the Error's message, the
 * stringified reason, or "aborted" when the reason is missing or falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
2240
/**
 * Builds a runtime-error record for the given phase and step, taking the
 * message from an Error or stringifying any other thrown value.
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
2244
/**
 * Closes the traced run (when an emitter exists) with the result's pass flag,
 * score, failure class and reason, then hands the result back unchanged. The
 * recorded score falls back to the average of `finalEvals`. A failing trace
 * write is absorbed by `runTrace`.
 */
async function finish(emitter, result) {
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
2253
+
2254
+ // src/feedback-trajectory.ts
2255
+ import { appendFile, mkdir, readFile } from "fs/promises";
2256
+ import { join } from "path";
2257
// Default percentage assigned to each dataset split; the four values add up to 100.
var DEFAULT_SPLIT_POLICY = {
  trainPct: 70,
  devPct: 15,
  testPct: 10,
  holdoutPct: 5
};
2263
/**
 * Map-backed feedback trajectory store.
 *
 * Every value crossing the store boundary is deep-copied with
 * `cloneTrajectory`, so callers cannot mutate stored state through the
 * objects they pass in or receive.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();
  /** Inserts or replaces the trajectory stored under `trajectory.id`. */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }
  /** Returns a copy of the trajectory with this id, or null when absent. */
  async get(id) {
    const trajectory = this.trajectories.get(id);
    return trajectory ? cloneTrajectory(trajectory) : null;
  }
  /** Returns copies of all trajectories accepted by `matchesFilter`. */
  async list(filter = {}) {
    return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map((trajectory) => cloneTrajectory(trajectory));
  }
  /**
   * Appends an attempt and sets `updatedAt` to the attempt's `createdAt`.
   * @throws {Error} when no trajectory is stored under `id`.
   */
  async appendAttempt(id, attempt) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const next = cloneTrajectory({
      ...trajectory,
      attempts: [...trajectory.attempts, attempt],
      updatedAt: attempt.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
  /**
   * Appends a label. With `attemptId` the label goes onto that attempt's
   * `feedback` list; without it, onto the trajectory-level `labels`.
   * `updatedAt` is set to the label's `createdAt`.
   * @throws {Error} when the trajectory is unknown, or when `attemptId` is
   *   given but matches no attempt (previously the label was silently lost).
   */
  async appendLabel(id, label, attemptId) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    let attempts = trajectory.attempts;
    let labels = trajectory.labels;
    if (attemptId) {
      // Refuse to "succeed" when there is nowhere to attach the label.
      if (!attempts.some((attempt) => attempt.id === attemptId)) {
        throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown attempt "${attemptId}" on trajectory "${id}"`);
      }
      attempts = attempts.map((attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt);
    } else {
      labels = [...labels, label];
    }
    const next = cloneTrajectory({
      ...trajectory,
      attempts,
      labels,
      updatedAt: label.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
2300
/**
 * Feedback trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`).
 *
 * Reads are served from an in-memory store that is rebuilt once, on first
 * use, by replaying the log. Each mutation is applied in memory first (which
 * validates ids) and then appended to the log.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight replay promise; shared so concurrent first calls replay once.
  loading = null;
  constructor(options) {
    this.dir = options.dir;
  }
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }
  /** Appends one operation record to the log, creating the directory if needed. */
  async append(record) {
    await mkdir(this.dir, { recursive: true });
    await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }
  /**
   * Ensures the log has been replayed into memory exactly once.
   *
   * Callers arriving while a replay is in flight await the same promise;
   * without this, two concurrent first calls would each replay the log and
   * duplicate every appended attempt/label in memory. A failed replay clears
   * the promise so a later call can retry.
   */
  async load() {
    if (this.loaded) return;
    if (!this.loading) {
      this.loading = this.replayLog().then(
        () => {
          this.loaded = true;
          this.loading = null;
        },
        (err) => {
          this.loading = null;
          throw err;
        }
      );
    }
    await this.loading;
  }
  /**
   * Reads the log and re-applies each record to the in-memory store.
   * A missing file means an empty store; any other read error is rethrown
   * rather than being mistaken for "no data".
   */
  async replayLog() {
    const file = join(this.dir, "feedback-trajectories.ndjson");
    let raw;
    try {
      raw = await readFile(file, "utf8");
    } catch (err) {
      if (err?.code === "ENOENT") return;
      throw err;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        if (record.op === "save") await this.memory.save(record.trajectory);
        if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
        if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
      } catch {
        // Best effort: skip records that are unparseable (e.g. a torn final
        // write) or that reference ids the log never created.
      }
    }
  }
};
2356
/**
 * Builds a feedback trajectory record from partial input.
 *
 * `createdAt` defaults to the current time as an ISO string. When no `id` is
 * supplied one is derived as `ft_<hex>` from a hash of project id, scenario
 * id, task intent and `createdAt`. `attempts` and `labels` default to empty
 * arrays; every other field is copied through as given.
 *
 * @param input Partial trajectory; `task` is required when `id` is omitted.
 * @returns The trajectory object.
 */
function createFeedbackTrajectory(input) {
  const { projectId, scenarioId, task, outcome, split, tags, metadata } = input;
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id === null || id === void 0) {
    const seed = `${projectId ?? ""}|${scenarioId ?? ""}|${task.intent}|${createdAt}`;
    id = `ft_${stableHash(seed).toString(16)}`;
  }
  return {
    id,
    projectId,
    scenarioId,
    task,
    attempts: input.attempts ?? [],
    labels: input.labels ?? [],
    outcome,
    split,
    tags,
    createdAt,
    metadata
  };
}
2373
/**
 * Deterministically assigns a trajectory to "train", "dev", "test" or
 * "holdout".
 *
 * A hash of project id, scenario id, trajectory id and task intent is reduced
 * modulo the sum of the percentages, and the result is matched against the
 * cumulative train → dev → test → holdout ranges.
 *
 * @param trajectory Trajectory to bucket (reads projectId, scenarioId, id, task.intent).
 * @param policy     Optional overrides for DEFAULT_SPLIT_POLICY. Fields that
 *                   are missing, undefined or null keep their default.
 * @returns The split name.
 * @throws {Error} when a percentage is not a finite, non-negative number, or
 *   when the percentages do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  const split = { ...DEFAULT_SPLIT_POLICY };
  for (const key of Object.keys(split)) {
    // A plain spread would let `{ trainPct: undefined }` clobber the default,
    // turning the total into NaN and sending every trajectory to "holdout".
    const override = policy?.[key];
    if (override !== void 0 && override !== null) split[key] = override;
  }
  for (const [key, pct] of Object.entries(split)) {
    if (typeof pct !== "number" || !Number.isFinite(pct) || pct < 0) {
      throw new Error(`assignFeedbackSplit: ${key} must be a finite, non-negative number`);
    }
  }
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
  if (bucket < split.trainPct) return "train";
  if (bucket < split.trainPct + split.devPct) return "dev";
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
  return "holdout";
}
2383
/**
 * Returns a shallow copy of the trajectory whose `split` is filled in.
 * An existing split is kept; otherwise one is computed with
 * `assignFeedbackSplit(trajectory, policy)`.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return { ...trajectory, split };
}
2389
/**
 * Wraps a trajectory as a dataset scenario.
 *
 * The scenario id is the trajectory's `scenarioId`, falling back to its `id`.
 * The payload is the trajectory with a split assigned (default policy). Tags
 * are built in this order, later keys winning: `projectId` (when truthy),
 * the trajectory's own tags, then `source: "feedback-trajectory"`.
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const resolved = withAssignedFeedbackSplit(trajectory);
  const projectTag = resolved.projectId ? { projectId: resolved.projectId } : {};
  const tags = {
    ...projectTag,
    ...(resolved.tags ?? {}),
    source: "feedback-trajectory"
  };
  return {
    id: resolved.scenarioId ?? resolved.id,
    split: resolved.split,
    payload: resolved,
    tags
  };
}
2402
/** Converts each trajectory with `feedbackTrajectoryToDatasetScenario`, preserving order. */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToDatasetScenario(trajectory));
}
2405
/**
 * Flattens a trajectory into an optimizer row.
 *
 * `labelKinds` lists the distinct label kinds in first-seen order. `score` is
 * the recorded outcome score when present, otherwise a score derived from
 * the labels. `metadata.split` is the trajectory's stored split (no split is
 * assigned here).
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const distinctKinds = /* @__PURE__ */ new Set();
  for (const label of labels) distinctKinds.add(label.kind);
  const { id, projectId, split, outcome } = trajectory;
  const score = outcome?.score ?? scoreFromLabels(labels);
  return {
    scenarioId: trajectory.scenarioId ?? id,
    trajectoryId: id,
    labelKinds: [...distinctKinds],
    score,
    metadata: {
      projectId,
      split,
      intent: trajectory.task.intent,
      attempts: trajectory.attempts.length,
      outcome,
      labels
    }
  };
}
2422
/** Converts each trajectory with `feedbackTrajectoryToOptimizerRow`, preserving order. */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToOptimizerRow(trajectory));
}
2425
/**
 * Distills labels into a ranked list of preference-memory entries.
 *
 * Each label that yields an instruction becomes a candidate entry. Candidates
 * are de-duplicated on the lower-cased, whitespace-collapsed instruction text,
 * keeping the one with the strictly highest weight (first seen wins ties).
 * The survivors are sorted by descending weight and cut to
 * `options.maxEntries` (default 20).
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const limit = options.maxEntries ?? 20;
  const strongest = /* @__PURE__ */ new Map();
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (!instruction) continue;
      const candidate = {
        instruction,
        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
        weight: weightForLabel(label),
        sourceTrajectoryId: trajectory.id,
        sourceLabelId: label.id,
        category: label.kind
      };
      const dedupeKey = instruction.toLowerCase().replace(/\s+/g, " ").trim();
      const incumbent = strongest.get(dedupeKey);
      if (!incumbent || candidate.weight > incumbent.weight) {
        strongest.set(dedupeKey, candidate);
      }
    }
  }
  const ranked = [...strongest.values()];
  ranked.sort((left, right) => right.weight - left.weight);
  return ranked.slice(0, limit);
}
2450
/**
 * Renders preference-memory entries as a Markdown bullet list under a
 * "# Preference Memory" heading. Each entry contributes its instruction,
 * rationale and source trajectory id. The result always ends with exactly
 * one newline.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const bullets = [];
  for (const { instruction, rationale, sourceTrajectoryId } of entries) {
    // One chunk per entry; its trailing newline becomes the blank separator
    // line once the chunks are joined.
    bullets.push(`- ${instruction}\n Rationale: ${rationale}\n Source: ${sourceTrajectoryId}\n`);
  }
  const document = ["# Preference Memory", "", ...bullets].join("\n");
  return document.trim() + "\n";
}
2460
/**
 * Serializes trajectories to JSONL: one canonicalized (key-sorted) JSON
 * object per line, ordered by `id` via `localeCompare`, with a trailing
 * newline. The input array is not mutated.
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = trajectories.slice();
  ordered.sort((left, right) => left.id.localeCompare(right.id));
  const rows = ordered.map((trajectory) => JSON.stringify(canonicalize(trajectory)));
  return rows.join("\n") + "\n";
}
2463
/**
 * Parses JSONL text into an array of trajectories, one per non-blank line.
 *
 * @param jsonl Text with one JSON document per line; blank lines are skipped.
 * @returns Parsed values in file order.
 * @throws {SyntaxError} when a line is not valid JSON. The message names the
 *   1-based line number and the original error is attached as `cause`.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  const trajectories = [];
  const lines = jsonl.split("\n");
  for (let index = 0; index < lines.length; index += 1) {
    const line = lines[index];
    if (!line.trim()) continue;
    try {
      trajectories.push(JSON.parse(line));
    } catch (err) {
      // A bare JSON.parse error gives no hint which record is corrupt.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`parseFeedbackTrajectoriesJsonl: invalid JSON on line ${index + 1}: ${detail}`, { cause: err });
    }
  }
  return trajectories;
}
2471
/**
 * Converts a control-loop run into a feedback trajectory.
 *
 * - id: `run.runId`, else `ft_control_<hex>` hashed from intent and timestamp.
 * - attempts: one per run step. The artifact is the first non-nullish of
 *   `options.artifactFromStep(step)`, the step's action result, and the
 *   step's decision.
 * - labels: a single system label, approve/reject according to `run.pass`.
 * - outcome: pass flag, score, cost and reason, with `stoppedBy` and
 *   `failureClass` in metadata.
 *
 * @param run     Control-loop result.
 * @param options Optional createdAt, projectId, scenarioId, artifactType,
 *                artifactFromStep and proposedActionFromStep.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
  const toAttempt = (step) => {
    const { index, decision, actionOutcome } = step;
    return {
      id: `${trajectoryId}_step_${index}`,
      stepIndex: index,
      artifactType: options.artifactType ?? "action",
      artifact: options.artifactFromStep?.(step) ?? actionOutcome?.result ?? decision,
      proposedAction: options.proposedActionFromStep?.(step),
      evals: step.evalsAfter,
      createdAt: step.startedAt,
      metadata: { decision, actionOutcome }
    };
  };
  const attempts = run.steps.map((step) => toAttempt(step));
  const passed = run.pass;
  const verdictLabel = {
    source: "system",
    kind: passed ? "approve" : "reject",
    value: passed,
    reason: run.reason,
    severity: passed ? "info" : "error",
    createdAt
  };
  const outcome = {
    success: passed,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };
  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts,
    labels: [verdictLabel],
    outcome
  });
}
2516
/**
 * Collects trajectory-level labels followed by every attempt's feedback
 * labels, dropping duplicates. Identity is the label's `id` when set,
 * otherwise a composite of source, kind, createdAt and the JSON of its value.
 * The first occurrence is kept.
 */
function allLabels(trajectory) {
  const combined = [
    ...trajectory.labels,
    ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
  ];
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const label of combined) {
    const identity = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seenKeys.has(identity)) continue;
    seenKeys.add(identity);
    unique.push(label);
  }
  return unique;
}
2529
/**
 * Derives a score in [0, 1] from labels, rounded to three decimals.
 *
 * approve/select count as 1, reject/policy_block as 0, and a numeric `rate`
 * value is clamped to [0, 1]. All other labels are ignored, as are `rate`
 * labels whose value is NaN, which would otherwise turn the mean into NaN.
 *
 * @param labels Array of labels.
 * @returns The mean of the scorable labels, or undefined when there are none.
 */
function scoreFromLabels(labels) {
  if (!labels.length) return void 0;
  const scored = [];
  for (const label of labels) {
    if (label.kind === "approve" || label.kind === "select") {
      scored.push(1);
    } else if (label.kind === "reject" || label.kind === "policy_block") {
      scored.push(0);
    } else if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) {
      scored.push(Math.max(0, Math.min(1, label.value)));
    }
  }
  if (!scored.length) return void 0;
  const total = scored.reduce((sum2, value) => sum2 + value, 0);
  return Math.round(total / scored.length * 1e3) / 1e3;
}
2540
/**
 * Turns a label into a reusable instruction sentence.
 *
 * Only labels that carry a `reason` and are of kind reject,
 * revision_request, select, approve or comment produce text; anything else
 * yields undefined. Where the task intent is quoted it is compacted to 80
 * characters.
 */
function instructionFromLabel(trajectory, label) {
  const { kind, reason } = label;
  if (!reason) return void 0;
  switch (kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${reason}`;
    case "revision_request":
      return `Revise similar work by applying: ${reason}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${reason}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${reason}`;
    case "comment":
      return reason;
    default:
      return void 0;
  }
}
2548
/**
 * Ranks a label as severity factor × source factor.
 * Severity: critical 4, error 3, warning 2, anything else 1.
 * Source: user 3, metric or environment 2, anything else 1.
 */
function weightForLabel(label) {
  let severityFactor;
  switch (label.severity) {
    case "critical":
      severityFactor = 4;
      break;
    case "error":
      severityFactor = 3;
      break;
    case "warning":
      severityFactor = 2;
      break;
    default:
      severityFactor = 1;
  }
  let sourceFactor;
  switch (label.source) {
    case "user":
      sourceFactor = 3;
      break;
    case "metric":
    case "environment":
      sourceFactor = 2;
      break;
    default:
      sourceFactor = 1;
  }
  return severityFactor * sourceFactor;
}
2553
/**
 * Tests a trajectory against a list filter.
 * Each truthy `projectId`, `scenarioId` and `split` on the filter must equal
 * the trajectory's value; `filter.tag`, a `[key, value]` pair, must match
 * `trajectory.tags[key]`. An empty filter matches everything.
 */
function matchesFilter(trajectory, filter) {
  for (const field of ["projectId", "scenarioId", "split"]) {
    if (filter[field] && trajectory[field] !== filter[field]) return false;
  }
  if (!filter.tag) return true;
  const [tagKey, tagValue] = filter.tag;
  return trajectory.tags?.[tagKey] === tagValue;
}
2563
/**
 * Deep-copies a trajectory through a JSON round trip. Only JSON-representable
 * data survives; properties whose value is undefined are dropped.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
2566
/**
 * Collapses every whitespace run to a single space, trims, and shortens the
 * text to at most `max` characters followed by "..." when it is longer.
 */
function compact(value, max) {
  const normalized = value.replace(/\s+/g, " ").trim();
  if (normalized.length > max) {
    const head = normalized.slice(0, max).trim();
    return `${head}...`;
  }
  return normalized;
}
2570
/**
 * 32-bit FNV-1a hash over the string's UTF-16 code units.
 * @returns An unsigned 32-bit integer.
 */
function stableHash(input) {
  const OFFSET_BASIS = 0x811c9dc5; // 2166136261
  const PRIME = 0x01000193; // 16777619
  let acc = OFFSET_BASIS;
  for (let pos = 0; pos < input.length; pos++) {
    acc = Math.imul(acc ^ input.charCodeAt(pos), PRIME);
  }
  return acc >>> 0;
}
2578
/**
 * Recursively rebuilds a value so that every plain object's keys are inserted
 * in sorted order, giving `JSON.stringify` a stable output. Arrays keep their
 * order; primitives and null are returned unchanged.
 */
function canonicalize(value) {
  if (value === null || typeof value !== "object") return value;
  if (Array.isArray(value)) return value.map((item) => canonicalize(item));
  return Object.keys(value).sort().reduce((sorted, key) => {
    sorted[key] = canonicalize(value[key]);
    return sorted;
  }, {});
}
2587
+
1389
2588
  // src/prompt-registry.ts
1390
2589
  var PromptRegistry = class {
1391
2590
  entries = /* @__PURE__ */ new Map();
@@ -3053,231 +4252,53 @@ var FileSystemTraceStore = class {
3053
4252
  }
3054
4253
  }
3055
4254
  } catch {
3056
- }
3057
- this.index = store;
3058
- this.loaded = true;
3059
- return store;
3060
- }
3061
- async appendRun(run) {
3062
- await this.append("runs", run);
3063
- }
3064
- async updateRun(runId, patch) {
3065
- await this.append("runs", { runId, ...patch, _update: true });
3066
- if (this.index) await this.index.updateRun(runId, patch);
3067
- }
3068
- async appendSpan(span) {
3069
- await this.append("spans", span);
3070
- }
3071
- async updateSpan(spanId, patch) {
3072
- await this.append("spans", { spanId, ...patch, _update: true });
3073
- if (this.index) await this.index.updateSpan(spanId, patch);
3074
- }
3075
- async appendEvent(event) {
3076
- await this.append("events", event);
3077
- }
3078
- async appendArtifact(artifact) {
3079
- await this.append("artifacts", artifact);
3080
- }
3081
- async appendBudgetEntry(entry) {
3082
- await this.append("budget", entry);
3083
- }
3084
- async getRun(runId) {
3085
- return (await this.load()).getRun(runId);
3086
- }
3087
- async listRuns(filter) {
3088
- return (await this.load()).listRuns(filter);
3089
- }
3090
- async spans(filter) {
3091
- return (await this.load()).spans(filter);
3092
- }
3093
- async events(filter) {
3094
- return (await this.load()).events(filter);
3095
- }
3096
- async budget(runId) {
3097
- return (await this.load()).budget(runId);
3098
- }
3099
- async artifacts(runId) {
3100
- return (await this.load()).artifacts(runId);
3101
- }
3102
- };
3103
-
3104
- // src/trace/emitter.ts
3105
- var TraceEmitter = class {
3106
- store;
3107
- stack = [];
3108
- _runId;
3109
- now;
3110
- id;
3111
- constructor(store, options = {}) {
3112
- this.store = store;
3113
- this.now = options.now ?? (() => Date.now());
3114
- this.id = options.id ?? (() => cryptoRandomId());
3115
- this._runId = options.runId ?? this.id();
3116
- }
3117
- get runId() {
3118
- return this._runId;
3119
- }
3120
- // ── Run lifecycle ──────────────────────────────────────────────────
3121
- async startRun(run) {
3122
- const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
3123
- await this.store.appendRun(full);
3124
- return full;
3125
- }
3126
- async endRun(outcome) {
3127
- const status = outcome?.pass === false ? "failed" : "completed";
3128
- await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
4255
+ }
4256
+ this.index = store;
4257
+ this.loaded = true;
4258
+ return store;
3129
4259
  }
3130
- async abortRun(reason) {
3131
- await this.store.updateRun(this._runId, {
3132
- endedAt: this.now(),
3133
- status: "aborted",
3134
- outcome: { pass: false, notes: reason }
3135
- });
4260
+ async appendRun(run) {
4261
+ await this.append("runs", run);
3136
4262
  }
3137
- // ── Generic span ───────────────────────────────────────────────────
3138
- async span(init) {
3139
- const spanId = this.id();
3140
- const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
3141
- const span = {
3142
- spanId,
3143
- parentSpanId: parent,
3144
- runId: this._runId,
3145
- startedAt: this.now(),
3146
- ...init
3147
- };
3148
- await this.store.appendSpan(span);
3149
- this.stack.push(spanId);
3150
- return this.handle(span);
4263
+ async updateRun(runId, patch) {
4264
+ await this.append("runs", { runId, ...patch, _update: true });
4265
+ if (this.index) await this.index.updateRun(runId, patch);
3151
4266
  }
3152
- handle(span) {
3153
- return {
3154
- span,
3155
- end: async (patch) => {
3156
- const endedAt = this.now();
3157
- await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
3158
- this.pop(span.spanId);
3159
- },
3160
- fail: async (error, patch) => {
3161
- const endedAt = this.now();
3162
- const errStr = error instanceof Error ? error.message : error;
3163
- await this.store.updateSpan(span.spanId, {
3164
- endedAt,
3165
- status: "error",
3166
- error: errStr,
3167
- ...patch
3168
- });
3169
- this.pop(span.spanId);
3170
- }
3171
- };
4267
+ async appendSpan(span) {
4268
+ await this.append("spans", span);
3172
4269
  }
3173
- pop(spanId) {
3174
- const idx = this.stack.lastIndexOf(spanId);
3175
- if (idx >= 0) this.stack.splice(idx, 1);
4270
+ async updateSpan(spanId, patch) {
4271
+ await this.append("spans", { spanId, ...patch, _update: true });
4272
+ if (this.index) await this.index.updateSpan(spanId, patch);
3176
4273
  }
3177
- // ── Typed span conveniences ────────────────────────────────────────
3178
- llm(init) {
3179
- return this.span({ kind: "llm", ...init });
4274
+ async appendEvent(event) {
4275
+ await this.append("events", event);
3180
4276
  }
3181
- tool(init) {
3182
- return this.span({ kind: "tool", ...init });
4277
+ async appendArtifact(artifact) {
4278
+ await this.append("artifacts", artifact);
3183
4279
  }
3184
- retrieval(init) {
3185
- return this.span({ kind: "retrieval", ...init });
4280
+ async appendBudgetEntry(entry) {
4281
+ await this.append("budget", entry);
3186
4282
  }
3187
- async recordJudge(verdict) {
3188
- const spanId = this.id();
3189
- const now = this.now();
3190
- const full = {
3191
- spanId,
3192
- runId: this._runId,
3193
- kind: "judge",
3194
- startedAt: now,
3195
- endedAt: now,
3196
- status: "ok",
3197
- ...verdict
3198
- };
3199
- await this.store.appendSpan(full);
3200
- return full;
4283
+ async getRun(runId) {
4284
+ return (await this.load()).getRun(runId);
3201
4285
  }
3202
- sandbox(init) {
3203
- return this.span({ kind: "sandbox", ...init });
4286
+ async listRuns(filter) {
4287
+ return (await this.load()).listRuns(filter);
3204
4288
  }
3205
- // ── Events ─────────────────────────────────────────────────────────
3206
- async emit(event) {
3207
- const full = {
3208
- eventId: this.id(),
3209
- runId: this._runId,
3210
- spanId: event.spanId ?? this.stack[this.stack.length - 1],
3211
- kind: event.kind,
3212
- timestamp: this.now(),
3213
- payload: event.payload ?? {}
3214
- };
3215
- await this.store.appendEvent(full);
3216
- return full;
4289
+ async spans(filter) {
4290
+ return (await this.load()).spans(filter);
3217
4291
  }
3218
- // ── Budget ledger ──────────────────────────────────────────────────
3219
- async recordBudget(entry) {
3220
- const full = {
3221
- runId: this._runId,
3222
- timestamp: entry.timestamp ?? this.now(),
3223
- dimension: entry.dimension,
3224
- limit: entry.limit,
3225
- consumed: entry.consumed,
3226
- remaining: entry.remaining,
3227
- breached: entry.breached,
3228
- spanId: entry.spanId ?? this.stack[this.stack.length - 1]
3229
- };
3230
- await this.store.appendBudgetEntry(full);
3231
- if (full.breached) {
3232
- await this.emit({
3233
- kind: "budget_breach",
3234
- spanId: full.spanId,
3235
- payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
3236
- });
3237
- }
3238
- return full;
4292
+ async events(filter) {
4293
+ return (await this.load()).events(filter);
3239
4294
  }
3240
- // ── Artifacts ──────────────────────────────────────────────────────
3241
- async recordArtifact(artifact) {
3242
- const full = { artifactId: this.id(), runId: this._runId, ...artifact };
3243
- await this.store.appendArtifact(full);
3244
- return full;
4295
+ async budget(runId) {
4296
+ return (await this.load()).budget(runId);
3245
4297
  }
3246
- // ── Nested composition ─────────────────────────────────────────────
3247
- /**
3248
- * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
3249
- * Returns the fn's return value. Use this for the 95% case.
3250
- */
3251
- async within(init, fn) {
3252
- const handle = await this.span(init);
3253
- try {
3254
- const result = await fn(handle);
3255
- await handle.end();
3256
- return result;
3257
- } catch (err) {
3258
- await handle.fail(err instanceof Error ? err : String(err));
3259
- throw err;
3260
- }
4298
+ async artifacts(runId) {
4299
+ return (await this.load()).artifacts(runId);
3261
4300
  }
3262
4301
  };
3263
- function cryptoRandomId() {
3264
- if (typeof globalThis.crypto?.randomUUID === "function") return globalThis.crypto.randomUUID();
3265
- return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
3266
- }
3267
- function llmSpanFromProvider(args) {
3268
- return {
3269
- name: args.name ?? args.model,
3270
- model: args.model,
3271
- messages: args.messages,
3272
- output: args.output,
3273
- inputTokens: args.usage?.inputTokens,
3274
- outputTokens: args.usage?.outputTokens,
3275
- cachedTokens: args.usage?.cachedTokens,
3276
- reasoningTokens: args.usage?.reasoningTokens,
3277
- costUsd: args.costUsd,
3278
- finishReason: args.finishReason
3279
- };
3280
- }
3281
4302
 
3282
4303
  // src/sandbox-harness.ts
3283
4304
  var vitestTestParser = {
@@ -3887,6 +4908,157 @@ function safeJson(x) {
3887
4908
  }
3888
4909
  }
3889
4910
 
4911
+ // src/propose-review-control.ts
4912
// Instruction used for the next shot when no reviewer instruction is
// available and the caller supplied no `fallbackInstruction`.
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
4913
/**
 * Runs a propose → verify → review cycle on top of `runAgentControlLoop`.
 *
 * Each control-loop step ("shot") calls `config.propose`, verifies the
 * resulting state with `config.verify`, and, when verification fails, asks
 * `config.review` for guidance on the next shot. The loop stops when
 * verification passes, when the reviewer declines to continue, or when the
 * reviewer's confidence stays at or below `confidenceFloor` for
 * `confidenceFloorWindow` consecutive reviews.
 * NOTE(review): enforcement of the `maxShots` / `maxWallMs` budget happens in
 * `runAgentControlLoop`, which is defined elsewhere — confirm there.
 *
 * @param config Options read here: goal, initialState, propose, verify,
 *   review, and optionally maxShots (10), confidenceFloor (0.3),
 *   confidenceFloorWindow (2), memory, fallbackInstruction,
 *   failureClassFromVerification, maxWallMs, store, scenarioId, projectId,
 *   variantId, actionFailure ("stop").
 * @returns Whatever `runAgentControlLoop` resolves to.
 */
async function runProposeReviewAsControlLoop(config) {
  const maxShots = config.maxShots ?? 10;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;
  let lowConfidenceStreak = 0;
  // Snapshot exposed to the control loop via `observe`; replaced after every shot.
  let current = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await memory.load(),
    completed: false,
    reviewAvailable: false
  };
  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => current,
    validate: ({ state }) => [
      objectiveEval({
        id: "verification",
        passed: state.verification.pass,
        score: state.verification.score,
        severity: "critical",
        detail: state.verification.pass ? "verification passed" : `verification failed${state.verification.failingLayers?.length ? `: ${state.verification.failingLayers.join(", ")}` : ""}`
      })
    ],
    shouldStop: ({ state }) => {
      if (state.verification.pass) {
        return { stop: true, pass: true, reason: "verification passed", score: state.verification.score };
      }
      if (state.completed) {
        return {
          stop: true,
          pass: false,
          reason: "reviewer stopped continuation",
          score: state.verification.score,
          failureClass: failureClassFromVerification(state.verification)
        };
      }
      return { stop: false, pass: false, reason: "verification still failing", score: state.verification.score };
    },
    decide: ({ state }) => ({
      type: "continue",
      action: { type: "propose-review-shot", shot: state.shot + 1 },
      reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
    }),
    act: async (action, ctx) => {
      const shot = action.shot;
      const proposeOut = await config.propose({
        shot,
        goal: config.goal,
        state: current.state,
        priorReview: current.priorReview,
        abortSignal: ctx.abortSignal,
        emitter: ctx.emitter
      });
      const nextState = proposeOut.state;
      const verification = await config.verify(nextState);
      let review = null;
      let reviewAvailable = false;
      let reviewError;
      let shouldContinue = !verification.pass;
      if (!verification.pass) {
        try {
          const fresh = await config.review({
            shot,
            goal: config.goal,
            state: nextState,
            verification,
            traceSummary: proposeOut.traceSummary,
            memory: await memory.load()
          });
          // Dereference the result before marking the review as available:
          // a reviewer resolving to null/undefined throws here and is handled
          // as "unavailable" below instead of being reported as available.
          shouldContinue = fresh.shouldContinue;
          lowConfidenceStreak = fresh.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
          if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
          review = fresh;
          reviewAvailable = true;
        } catch (err) {
          // Reviewer failure is not fatal: reuse the previous review (or a
          // synthetic one) and keep going.
          reviewError = err instanceof Error ? err.message : String(err);
          review = current.priorReview ?? {
            observations: "Reviewer unavailable.",
            diagnosis: reviewError,
            nextShotInstruction: fallbackInstruction,
            shouldContinue: true,
            confidence: 0
          };
          shouldContinue = true;
        }
      } else {
        review = {
          observations: "Verification passed.",
          diagnosis: "No further revision needed.",
          nextShotInstruction: "",
          shouldContinue: false,
          confidence: 1
        };
      }
      const entry = {
        ...review ?? {
          observations: "No review.",
          diagnosis: "",
          nextShotInstruction: fallbackInstruction,
          shouldContinue,
          confidence: 0
        },
        shot,
        timestamp: Date.now(),
        verification: {
          pass: verification.pass,
          score: verification.score,
          failingLayers: verification.failingLayers
        }
      };
      await memory.append(entry);
      current = {
        shot,
        state: nextState,
        priorReview: review,
        verification,
        traceSummary: proposeOut.traceSummary,
        memory: await memory.load(),
        completed: verification.pass || !shouldContinue,
        reviewAvailable,
        reviewError
      };
      return {
        state: nextState,
        verification,
        traceSummary: proposeOut.traceSummary,
        review,
        reviewAvailable,
        reviewError
      };
    }
  });
}
5057
/**
 * Default mapping from a verification result to a failure class:
 * undefined when verification passed, "instruction_following" when failing
 * layers were reported, otherwise "unknown".
 */
function controlFailureClassFromVerification(verification) {
  if (verification.pass) return void 0;
  const hasFailingLayers = Boolean(verification.failingLayers?.length);
  return hasFailingLayers ? "instruction_following" : "unknown";
}
5061
+
3890
5062
  // src/trace/schema.ts
3891
5063
  var TRACE_SCHEMA_VERSION = "1.0.0";
3892
5064
  var FAILURE_CLASSES = [
@@ -5210,7 +6382,7 @@ function assertNonNegative(n, name) {
5210
6382
 
5211
6383
  // src/muffled-gate-scanner.ts
5212
6384
  import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
5213
- import { join } from "path";
6385
+ import { join as join2 } from "path";
5214
6386
  function codeOf(line) {
5215
6387
  return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
5216
6388
  }
@@ -5314,11 +6486,11 @@ var UNIVERSAL_FINDERS = [
5314
6486
  function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
5315
6487
  const matches2 = [];
5316
6488
  const walk = (rel) => {
5317
- const abs = join(repoRoot, rel);
6489
+ const abs = join2(repoRoot, rel);
5318
6490
  if (!existsSync2(abs)) return;
5319
6491
  for (const entry of readdirSync(abs)) {
5320
- const sub = join(rel, entry);
5321
- const subAbs = join(repoRoot, sub);
6492
+ const sub = join2(rel, entry);
6493
+ const subAbs = join2(repoRoot, sub);
5322
6494
  let st;
5323
6495
  try {
5324
6496
  st = statSync(subAbs);
@@ -5347,7 +6519,7 @@ function scanForMuffledGates(opts) {
5347
6519
  const findings = [];
5348
6520
  const scanned = /* @__PURE__ */ new Set();
5349
6521
  for (const file of opts.scanFiles) {
5350
- const abs = join(opts.repoRoot, file);
6522
+ const abs = join2(opts.repoRoot, file);
5351
6523
  if (!existsSync2(abs)) continue;
5352
6524
  const text = readFileSync2(abs, "utf8");
5353
6525
  for (const find of opts.finders) findings.push(...find(file, text));
@@ -5362,7 +6534,7 @@ function scanForMuffledGates(opts) {
5362
6534
  );
5363
6535
  for (const file of importers) {
5364
6536
  if (scanned.has(file)) continue;
5365
- const abs = join(opts.repoRoot, file);
6537
+ const abs = join2(opts.repoRoot, file);
5366
6538
  if (!existsSync2(abs)) continue;
5367
6539
  const text = readFileSync2(abs, "utf8");
5368
6540
  for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
@@ -5557,7 +6729,7 @@ var Dataset = class _Dataset {
5557
6729
  * Write to disk for contamination-verifiable archives.
5558
6730
  */
5559
6731
  toJsonl() {
5560
- return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize(s))).join("\n") + "\n";
6732
+ return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
5561
6733
  }
5562
6734
  static fromJsonl(jsonl, manifest) {
5563
6735
  const scenarios = [];
@@ -5570,18 +6742,18 @@ var Dataset = class _Dataset {
5570
6742
  }
5571
6743
  };
5572
6744
  async function hashScenarios(scenarios) {
5573
- const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize);
6745
+ const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
5574
6746
  const text = JSON.stringify(canonical);
5575
6747
  const bytes = new TextEncoder().encode(text);
5576
6748
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
5577
6749
  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
5578
6750
  }
5579
- function canonicalize(v) {
6751
+ function canonicalize2(v) {
5580
6752
  if (v === null || typeof v !== "object") return v;
5581
- if (Array.isArray(v)) return v.map(canonicalize);
6753
+ if (Array.isArray(v)) return v.map(canonicalize2);
5582
6754
  const keys = Object.keys(v).sort();
5583
6755
  const out = {};
5584
- for (const k of keys) out[k] = canonicalize(v[k]);
6756
+ for (const k of keys) out[k] = canonicalize2(v[k]);
5585
6757
  return out;
5586
6758
  }
5587
6759
  function seededShuffle(items, seed) {
@@ -7350,7 +8522,7 @@ async function commitBisect(options) {
7350
8522
  }
7351
8523
  async function promptBisect(options) {
7352
8524
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
7353
- const join3 = (paragraphs) => paragraphs.join("\n\n");
8525
+ const join4 = (paragraphs) => paragraphs.join("\n\n");
7354
8526
  const goodParas = split(options.good);
7355
8527
  const badParas = split(options.bad);
7356
8528
  if (goodParas.length !== badParas.length) {
@@ -7368,7 +8540,7 @@ async function promptBisect(options) {
7368
8540
  const result = await bisect({
7369
8541
  good: goodMask,
7370
8542
  bad: badMask,
7371
- runEval: (mask) => options.runEval(join3(paragraphsFor(mask))),
8543
+ runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
7372
8544
  maxIterations: options.maxIterations ?? n + 5,
7373
8545
  halfway: (g, b) => {
7374
8546
  for (let i = 0; i < g.length; i++) {
@@ -7399,12 +8571,12 @@ async function promptBisect(options) {
7399
8571
  }
7400
8572
  }
7401
8573
  const materializedPath = result.path.map((s) => ({
7402
- state: join3(paragraphsFor(s.state)),
8574
+ state: join4(paragraphsFor(s.state)),
7403
8575
  score: s.score,
7404
8576
  pass: s.pass
7405
8577
  }));
7406
8578
  return {
7407
- culprit: join3(paragraphsFor(culprit)),
8579
+ culprit: join4(paragraphsFor(culprit)),
7408
8580
  path: materializedPath,
7409
8581
  converged: result.converged,
7410
8582
  inputInconsistent: result.inputInconsistent,
@@ -7615,7 +8787,7 @@ function attributeStep(op, prmA, prmB) {
7615
8787
 
7616
8788
  // src/pre-registration.ts
7617
8789
  async function signManifest(m) {
7618
- const canonical = canonicalize2(m);
8790
+ const canonical = canonicalize3(m);
7619
8791
  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
7620
8792
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
7621
8793
  const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
@@ -7645,12 +8817,12 @@ async function evaluateHypothesis(manifest, observed) {
7645
8817
  rejectionReasons: reasons
7646
8818
  };
7647
8819
  }
7648
/**
 * Produce a key-sorted deep copy of a JSON-like value. `signManifest`
 * serializes the result with `JSON.stringify` and hashes it, so the output
 * must not depend on the insertion order of object keys.
 *
 * Primitives and `null` are returned unchanged; arrays keep their element
 * order and have each element canonicalized.
 *
 * @param {*} v - Value to canonicalize.
 * @returns {*} The primitive itself, or a new array/object with sorted keys.
 */
function canonicalize3(v) {
  if (v === null || typeof v !== "object") return v;
  if (Array.isArray(v)) return v.map((item) => canonicalize3(item));
  // Build via Object.fromEntries so that every key, including "__proto__",
  // becomes an own property. Assigning `out["__proto__"] = ...` on a plain
  // object hits the prototype setter instead, which would leave that field
  // out of the serialized (and hashed) form.
  return Object.fromEntries(
    Object.keys(v)
      .sort()
      .map((k) => [k, canonicalize3(v[k])])
  );
}
7656
8828
 
@@ -8459,7 +9631,7 @@ function mergeSignals(a, b) {
8459
9631
  // src/command-runner.ts
8460
9632
  import { spawnSync } from "child_process";
8461
9633
  import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
8462
- import { join as join2 } from "path";
9634
+ import { join as join3 } from "path";
8463
9635
  var localCommandRunner = {
8464
9636
  name: "local",
8465
9637
  async run(input) {
@@ -8506,7 +9678,7 @@ var localCommandRunner = {
8506
9678
  const out = [];
8507
9679
  for (const name of entries) {
8508
9680
  try {
8509
- const st = statSync2(join2(path, name));
9681
+ const st = statSync2(join3(path, name));
8510
9682
  out.push({
8511
9683
  name,
8512
9684
  isDirectory: st.isDirectory(),
@@ -12298,6 +13470,46 @@ function truncate3(s, max) {
12298
13470
  function quote(s) {
12299
13471
  return s.replace(/`/g, "\\`");
12300
13472
  }
13473
/**
 * Best-effort repair of JSON text that was cut off mid-document.
 *
 * Scans `raw` once, tracking string state and the stack of open `{` / `[`
 * containers, then appends whatever is needed to close them.
 *
 * - Already balanced and not inside a string: `raw` is returned unchanged.
 * - A closer that does not match the innermost opener (or has no opener):
 *   returns `null`, since the text is malformed rather than merely truncated.
 * - Cut inside a string: the string is terminated with `"`. A dangling escape
 *   at the cut point (a lone `\` or a partial `\uXXXX`) is dropped first,
 *   otherwise the appended quote would be escaped or the escape would be
 *   invalid.
 * - Cut right after a structural comma: the comma is removed so the result
 *   does not end in `,]` / `,}`.
 *
 * Other truncation points (after a `:`, inside a bare literal such as `tru`,
 * after an object key) are not repaired; the result may still fail to parse.
 *
 * @param {string} raw - Possibly truncated JSON text.
 * @returns {string|null} The repaired text, `raw` itself when nothing needed
 *   closing, or `null` on mismatched brackets.
 */
function autoCloseTruncatedJson(raw) {
  const stack = [];
  let inString = false;
  // Index of the backslash that starts the most recent escape in the string
  // currently being scanned; -1 when that string has had no escape yet.
  let escapeStart = -1;
  for (let i = 0; i < raw.length; i++) {
    const c = raw[i];
    if (inString) {
      if (c === "\\") {
        escapeStart = i;
        i++; // the escaped character can never terminate the string
      } else if (c === '"') {
        inString = false;
      }
      continue;
    }
    if (c === '"') {
      inString = true;
      escapeStart = -1;
    } else if (c === "{" || c === "[") {
      stack.push(c);
    } else if (c === "}") {
      if (stack.pop() !== "{") return null;
    } else if (c === "]") {
      if (stack.pop() !== "[") return null;
    }
  }
  if (stack.length === 0 && !inString) return raw;

  let body = raw;
  if (inString) {
    // A complete escape is at most 6 characters (\uXXXX); only a shorter tail
    // can be a dangling one.
    if (
      escapeStart >= 0 &&
      raw.length - escapeStart < 6 &&
      /^\\(?:u[0-9a-fA-F]{0,3})?$/.test(raw.slice(escapeStart))
    ) {
      body = raw.slice(0, escapeStart);
    }
    body += '"';
  } else {
    const trimmed = raw.trimEnd();
    if (trimmed.endsWith(",")) body = trimmed.slice(0, -1);
  }

  let suffix = "";
  while (stack.length > 0) {
    suffix += stack.pop() === "{" ? "}" : "]";
  }
  return body + suffix;
}
12301
13513
  function parseReflectionResponse(raw, maxProposals) {
12302
13514
  let text = raw.trim();
12303
13515
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
@@ -12322,6 +13534,18 @@ function parseReflectionResponse(raw, maxProposals) {
12322
13534
  } catch {
12323
13535
  }
12324
13536
  }
13537
+ if (parsed == null) {
13538
+ for (const slice of candidates) {
13539
+ const closed = autoCloseTruncatedJson(slice);
13540
+ if (closed != null && closed !== slice) {
13541
+ try {
13542
+ parsed = JSON.parse(closed);
13543
+ break;
13544
+ } catch {
13545
+ }
13546
+ }
13547
+ }
13548
+ }
12325
13549
  if (parsed == null) return [];
12326
13550
  let proposalsRaw;
12327
13551
  if (Array.isArray(parsed)) {
@@ -12374,6 +13598,7 @@ export {
12374
13598
  ExperimentTracker,
12375
13599
  FAILURE_CLASSES,
12376
13600
  FileSystemExperimentStore,
13601
+ FileSystemFeedbackTrajectoryStore,
12377
13602
  FileSystemOutcomeStore,
12378
13603
  FileSystemTraceStore,
12379
13604
  HeldOutGate,
@@ -12381,6 +13606,7 @@ export {
12381
13606
  HoldoutLockedError,
12382
13607
  INTENT_MATCH_JUDGE_VERSION,
12383
13608
  InMemoryExperimentStore,
13609
+ InMemoryFeedbackTrajectoryStore,
12384
13610
  InMemoryOutcomeStore,
12385
13611
  InMemoryTraceStore,
12386
13612
  InMemoryTrialCache,
@@ -12420,9 +13646,11 @@ export {
12420
13646
  adversarialJudge,
12421
13647
  aggregateLlm,
12422
13648
  aggregateRunScore,
13649
+ allCriticalPassed,
12423
13650
  analyzeAntiSlop,
12424
13651
  analyzeSeries,
12425
13652
  argHash,
13653
+ assignFeedbackSplit,
12426
13654
  attributeCounterfactuals,
12427
13655
  deterministicSplit as benchmarkDeterministicSplit,
12428
13656
  benchmarks_exports as benchmarks,
@@ -12460,6 +13688,8 @@ export {
12460
13688
  computeToolUseMetrics,
12461
13689
  confidenceInterval,
12462
13690
  containsAll,
13691
+ controlFailureClassFromVerification,
13692
+ controlRunToFeedbackTrajectory,
12463
13693
  correlateLayers,
12464
13694
  correlationStudy,
12465
13695
  createAntiSlopJudge,
@@ -12467,6 +13697,7 @@ export {
12467
13697
  createCustomJudge,
12468
13698
  createDefaultReviewer,
12469
13699
  createDomainExpertJudge,
13700
+ createFeedbackTrajectory,
12470
13701
  createIntentMatchJudge,
12471
13702
  createLlmReviewer,
12472
13703
  createSandboxCodeMutator,
@@ -12495,6 +13726,10 @@ export {
12495
13726
  extractAssetUrls,
12496
13727
  extractErrorCount,
12497
13728
  failureClusterView,
13729
+ feedbackTrajectoriesToDatasetScenarios,
13730
+ feedbackTrajectoriesToOptimizerRows,
13731
+ feedbackTrajectoryToDatasetScenario,
13732
+ feedbackTrajectoryToOptimizerRow,
12498
13733
  fileContains,
12499
13734
  fileExists,
12500
13735
  findAutoMatchNoExpectation,
@@ -12549,6 +13784,7 @@ export {
12549
13784
  nonRefusalRubric,
12550
13785
  normalizeScores,
12551
13786
  notBlocked,
13787
+ objectiveEval,
12552
13788
  outputLengthRubric,
12553
13789
  pairedBootstrap,
12554
13790
  pairedTTest,
@@ -12557,6 +13793,7 @@ export {
12557
13793
  paretoChart,
12558
13794
  paretoFrontier,
12559
13795
  paretoFrontierWithCrowding,
13796
+ parseFeedbackTrajectoriesJsonl,
12560
13797
  parseReflectionResponse,
12561
13798
  parseRunRecordSafe,
12562
13799
  partialCredit,
@@ -12583,6 +13820,7 @@ export {
12583
13820
  renderMarkdown,
12584
13821
  renderMarkdownReport,
12585
13822
  renderPlaybookMarkdown,
13823
+ renderPreferenceMemoryMarkdown,
12586
13824
  renderSteeringText,
12587
13825
  replayScorerOverCorpus,
12588
13826
  replayTraceThroughJudge,
@@ -12592,6 +13830,7 @@ export {
12592
13830
  roundTripRunRecord,
12593
13831
  rowCount,
12594
13832
  rowWhere,
13833
+ runAgentControlLoop,
12595
13834
  runAssertions,
12596
13835
  runCanaries,
12597
13836
  runCounterfactual,
@@ -12605,6 +13844,7 @@ export {
12605
13844
  runKeywordCoverageJudgeUrl,
12606
13845
  runPromptEvolution,
12607
13846
  runProposeReview,
13847
+ runProposeReviewAsControlLoop,
12608
13848
  runReferenceReplay,
12609
13849
  runSelfPlay,
12610
13850
  runSemanticConceptJudge,
@@ -12621,13 +13861,18 @@ export {
12621
13861
  selectHarnessVariant,
12622
13862
  selfPreference,
12623
13863
  sentenceReorderMutator,
13864
+ serializeFeedbackTrajectoriesJsonl,
12624
13865
  signManifest,
12625
13866
  soc2Report,
12626
13867
  statusAdvanced,
13868
+ stopOnNoProgress,
13869
+ stopOnRepeatedAction,
12627
13870
  stripFencedJson,
12628
13871
  stuckLoopView,
13872
+ subjectiveEval,
12629
13873
  summarize,
12630
13874
  summarizeHarnessResults,
13875
+ summarizePreferenceMemory,
12631
13876
  summaryTable,
12632
13877
  testJudge,
12633
13878
  textInSnapshot,
@@ -12653,6 +13898,7 @@ export {
12653
13898
  welchsTTest,
12654
13899
  whitespaceCollapseMutator,
12655
13900
  wilcoxonSignedRank,
13901
+ withAssignedFeedbackSplit,
12656
13902
  wranglerDeployRunner
12657
13903
  };
12658
13904
  //# sourceMappingURL=index.js.map