@tangle-network/agent-eval 0.17.1 → 0.17.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/dist/index.d.ts +1453 -1084
- package/dist/index.js +1393 -199
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1386,6 +1386,1205 @@ function printDriverSummary(results) {
|
|
|
1386
1386
|
console.log(`${completedCount}/${results.length} personas completed`);
|
|
1387
1387
|
}
|
|
1388
1388
|
|
|
1389
|
+
// src/trace/emitter.ts
|
|
1390
|
+
/**
 * Writes the records of one run (run, spans, events, budget entries and
 * artifacts) to a trace store.
 *
 * The emitter keeps a stack of currently open span ids. A new span, event or
 * budget entry that does not name its own parent/span is attached to the span
 * on top of that stack.
 *
 * The store is expected to provide `appendRun`, `updateRun`, `appendSpan`,
 * `updateSpan`, `appendEvent`, `appendBudgetEntry` and `appendArtifact`; all
 * of them are awaited.
 */
var TraceEmitter = class {
  store;
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param store   Trace store that receives every record.
   * @param options Optional overrides: `now` (timestamp source, defaults to
   *                `Date.now`), `id` (id generator, defaults to
   *                `cryptoRandomId`) and `runId` (defaults to a generated id).
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id shared by every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Closes the run; status is "failed" only when `outcome.pass` is exactly false. */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Closes the run as "aborted" with a failing outcome that carries `reason`. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, persists it and pushes it on the open-span stack.
   *
   * @param init Span fields. `parentSpanId` falls back to the innermost open
   *             span when it is missing or nullish.
   * @returns A handle with `span`, `end(patch)` and `fail(error, patch)`.
   */
  async span(init) {
    const spanId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId,
      runId: this._runId,
      startedAt: this.now(),
      ...init,
      // Set after the spread: an explicit `parentSpanId: undefined` in `init`
      // must not erase the parent derived from the stack.
      parentSpanId: parent
    };
    await this.store.appendSpan(span);
    // Push the id the handle will pop later (init may have overridden spanId).
    this.stack.push(span.spanId);
    return this.handle(span);
  }
  /** Builds the end/fail handle for an already persisted span. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Always leave the stack, even when the store write fails; a stale
          // entry would become the parent of every later span.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the open-span stack. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  /** Opens a span with `kind: "llm"` unless `init` overrides it. */
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  /** Opens a span with `kind: "tool"` unless `init` overrides it. */
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  /** Opens a span with `kind: "retrieval"` unless `init` overrides it. */
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Appends an already finished "judge" span (start and end share one
   * timestamp). The span is never pushed on the open-span stack.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  /** Opens a span with `kind: "sandbox"` unless `init` overrides it. */
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event; `spanId` defaults to the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /**
   * Appends a budget ledger entry and, when it is marked `breached`, also
   * emits a "budget_breach" event on the same span.
   */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record tagged with a fresh id and this run's id. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
|
|
1548
|
+
/**
 * Generates an id string.
 *
 * Uses `crypto.randomUUID()` when the runtime exposes it; otherwise falls back
 * to a base-36 timestamp joined with a short base-36 `Math.random()` suffix.
 */
function cryptoRandomId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const stamp = Date.now().toString(36);
  const suffix = Math.random().toString(36).slice(2, 10);
  return `${stamp}-${suffix}`;
}
|
|
1552
|
+
/**
 * Maps a provider call description onto the field names used for LLM spans.
 *
 * @param args Object with `model`, `messages`, `output` and optional `name`,
 *             `usage` (token counts), `costUsd` and `finishReason`.
 * @returns Span init fields; `name` defaults to the model when not given.
 */
function llmSpanFromProvider(args) {
  const { model, usage } = args;
  return {
    name: args.name ?? model,
    model,
    messages: args.messages,
    output: args.output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd: args.costUsd,
    finishReason: args.finishReason
  };
}
|
|
1566
|
+
|
|
1567
|
+
// src/control-runtime.ts
|
|
1568
|
+
// Budget applied by runAgentControlLoop for any field the caller leaves out:
// at most 8 steps and five minutes of wall-clock time.
var DEFAULT_BUDGET = {
  maxSteps: 8,
  maxWallMs: 300000
};
|
|
1572
|
+
/**
 * Drives an observe → validate → decide → act loop until something stops it.
 *
 * Reads from `config`:
 * - `intent` (string), `budget` (merged over DEFAULT_BUDGET), `signal`
 *   (optional upstream AbortSignal), `store` (optional trace store),
 *   `scenarioId` / `projectId` / `variantId` (trace metadata);
 * - callbacks `observe`, `validate`, `decide`, `act`, and optional
 *   `shouldStop`, `getActionCostUsd`, `onStep`;
 * - `stopPolicies` (no-progress / repeated-action settings) and
 *   `actionFailure` ("continue" by default, "stop" to end on a failed action).
 *
 * Every exit goes through `finish(emitter, result)` (defined elsewhere in this
 * file); the result carries `pass`, `completed`, `reason`, `steps`,
 * `finalState`, `finalEvals`, `wallMs`, `spentCostUsd`, `runId`,
 * `failureClass`, `runtimeErrors` and `stoppedBy`.
 *
 * Errors thrown by the callbacks are recorded in `runtimeErrors` and turned
 * into a failing result instead of being rethrown from this function's body.
 * Trace-store failures are recorded the same way and never stop the loop.
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  const actionFailure = config.actionFailure ?? "continue";
  // Private controller: both the caller's signal and the wall-clock timer abort it.
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  let spentCostUsd = 0;
  const runtimeErrors = [];
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  // Finishes the run with the fields every result shares plus the branch-specific ones.
  const stopWith = (fields) => finish(emitter, {
    intent: config.intent,
    steps: history,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    runtimeErrors,
    ...fields
  });
  // Records a runtime error and returns it, so the caller reports exactly this
  // error even if trace or onStep failures are appended afterwards.
  const recordFailure = (phase, stepIndex, err) => {
    const failure = runtimeError(phase, stepIndex, err);
    runtimeErrors.push(failure);
    return failure;
  };
  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }
    let state;
    let evals;
    // Initial observation and validation, before any action is taken.
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      const failure = recordFailure("observe", 0, err);
      return stopWith({
        pass: false,
        completed: false,
        reason: failure.message,
        finalState: void 0,
        finalEvals: [],
        failureClass: "unknown",
        stoppedBy: "runtime-error"
      });
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      const failure = recordFailure("validate", 0, err);
      return stopWith({
        pass: false,
        completed: false,
        reason: failure.message,
        finalState: state,
        finalEvals: [],
        failureClass: "unknown",
        stoppedBy: "runtime-error"
      });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      // 1. Abort (upstream signal or wall-clock timer) is checked once per iteration.
      if (controller.signal.aborted) {
        return stopWith({
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          finalState: state,
          finalEvals: evals,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      // 2. Cost budget.
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return stopWith({
          pass: false,
          completed: false,
          reason: budgetStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      // 3. Stop policy: caller-supplied, or "all critical evals passed".
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      let stop;
      try {
        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
      } catch (err) {
        const failure = recordFailure("stop-policy", stepIndex, err);
        return stopWith({
          pass: false,
          completed: false,
          reason: failure.message,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      if (stop.stop) {
        return stopWith({
          pass: stop.pass,
          completed: true,
          reason: stop.reason,
          score: stop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: stop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
      // 4. Ask the policy for the next action (or an explicit stop).
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        const failure = recordFailure("decide", stepIndex, err);
        return stopWith({
          pass: false,
          completed: false,
          reason: failure.message,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      if (decision.type === "stop") {
        // NOTE(review): an omitted `decision.pass` yields pass=false but no
        // failureClass; confirm whether that asymmetry is intended.
        return stopWith({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          finalState: state,
          finalEvals: evals,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      // 5. Repeated-action guard, evaluated before the action runs.
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return stopWith({
          pass: false,
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      // 6. Run the action inside a trace span.
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Builds the history record for this step; `evals` and `actionOutcome`
      // are read at call time, so it reflects the point where the step ended.
      const buildStep = (afterState) => ({
        index: stepIndex,
        decision,
        beforeState,
        afterState,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only finite, positive costs are charged against the budget.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        const failure = recordFailure("act", stepIndex, err);
        actionOutcome = {
          ok: false,
          error: failure.message,
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") {
          await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
          const step2 = buildStep(state);
          history.push(step2);
          await runOnStep(config.onStep, step2, runtimeErrors);
          return stopWith({
            pass: false,
            completed: false,
            reason: actionOutcome.error ?? "action failed",
            score: averageScore(evals),
            finalState: state,
            finalEvals: evals,
            failureClass: "unknown",
            stoppedBy: "runtime-error"
          });
        }
        // actionFailure === "continue": fall through and re-observe.
      }
      // 7. Re-observe and re-validate after the action.
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        const failure = recordFailure("observe", stepIndex, err);
        const step2 = buildStep(beforeState);
        history.push(step2);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(failure.message));
        await runOnStep(config.onStep, step2, runtimeErrors);
        return stopWith({
          pass: false,
          completed: false,
          reason: failure.message,
          score: averageScore(evals),
          finalState: beforeState,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        const failure = recordFailure("validate", stepIndex, err);
        const step2 = buildStep(state);
        history.push(step2);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(failure.message));
        await runOnStep(config.onStep, step2, runtimeErrors);
        return stopWith({
          pass: false,
          completed: false,
          reason: failure.message,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      // 8. No-progress bookkeeping, then close the step span and notify onStep.
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = buildStep(state);
      history.push(step);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        }));
      }
      await runOnStep(config.onStep, step, runtimeErrors);
      if (noProgressStop.stop) {
        return stopWith({
          pass: false,
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      // 9. Post-step budget and stop-policy checks on the fresh state.
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return stopWith({
          pass: false,
          completed: false,
          reason: postStepBudgetStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      let postStepStop;
      try {
        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
      } catch (err) {
        const failure = recordFailure("stop-policy", stepIndex + 1, err);
        return stopWith({
          pass: false,
          completed: false,
          reason: failure.message,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      if (postStepStop.stop) {
        return stopWith({
          pass: postStepStop.pass,
          completed: true,
          reason: postStepStop.reason,
          score: postStepStop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: postStepStop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
    }
    // The loop ran out of steps without any stop condition firing.
    return stopWith({
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      finalState: state,
      finalEvals: evals,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything not handled by a narrower catch above ends up here and is
    // recorded under the "act" phase at the current history length.
    const failure = recordFailure("act", history.length, err);
    return stopWith({
      pass: false,
      completed: false,
      reason: failure.message,
      finalState: void 0,
      finalEvals: [],
      failureClass: "unknown",
      stoppedBy: "runtime-error"
    });
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
|
|
2093
|
+
/**
 * Builds a stop-policy object that enables the no-progress guard.
 *
 * @param maxNoProgressSteps Number of consecutive no-progress steps to allow.
 * @param options            Extra policy fields copied into the result.
 * @returns A new object; `maxNoProgressSteps` wins over the same key in `options`.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = { ...options };
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
|
|
2096
|
+
/**
 * Builds a stop-policy object that enables the repeated-action guard.
 *
 * @param maxRepeatedActions Number of identical consecutive actions to allow.
 * @param options            Extra policy fields copied into the result.
 * @returns A new object; `maxRepeatedActions` wins over the same key in `options`.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = { ...options };
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
|
|
2099
|
+
/** Returns a copy of `input` with `objective` forced to `true`. */
function objectiveEval(input) {
  const result = { ...input };
  result.objective = true;
  return result;
}
|
|
2102
|
+
/** Returns a copy of `input` with `objective` forced to `false`. */
function subjectiveEval(input) {
  const result = { ...input };
  result.objective = false;
  return result;
}
|
|
2105
|
+
/**
 * Tells whether no blocking eval is failing.
 *
 * An eval blocks only when it did not pass and its severity is "critical" or
 * "error"; failing evals of any other severity are ignored.
 */
function allCriticalPassed(evals) {
  const isBlockingFailure = (result) => {
    if (result.passed) return false;
    return result.severity === "critical" || result.severity === "error";
  };
  return !evals.some(isBlockingFailure);
}
|
|
2108
|
+
/**
 * Assembles the context object handed to the `shouldStop`, `decide` and `act`
 * callbacks.
 *
 * `wallMs` is the time elapsed since `started`. `remainingCostUsd` is
 * undefined when the budget has no `maxCostUsd`, otherwise the unspent amount
 * clamped at zero.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const wallMs = Date.now() - started;
  let remainingCostUsd;
  if (budget.maxCostUsd !== void 0) {
    remainingCostUsd = Math.max(0, budget.maxCostUsd - spentCostUsd);
  }
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
|
|
2123
|
+
/**
 * Stop policy used when the caller supplies no `shouldStop`.
 *
 * With no evals it never stops. Otherwise it stops (with pass=true) exactly
 * when `allCriticalPassed` holds, and reports the average score either way.
 */
function defaultStopDecision(evals) {
  if (evals.length) {
    const pass = allCriticalPassed(evals);
    const score = averageScore(evals);
    const reason = pass ? "all critical evals passed" : "critical evals still failing";
    return { stop: pass, pass, reason, score };
  }
  return { stop: false, pass: false, reason: "no evals yet" };
}
|
|
2128
|
+
/**
 * Averages the numeric `score` fields of the given eval results.
 *
 * Only finite numbers count. Missing, non-numeric, NaN and infinite scores are
 * skipped, so a single bad score cannot turn the average into NaN (which would
 * make every later score comparison false).
 *
 * @param evals Array of eval results, each optionally carrying `score`.
 * @returns The mean rounded to three decimals, or undefined when no result has
 *          a usable score.
 */
function averageScore(evals) {
  const scored = evals.map((result) => result.score).filter((score) => Number.isFinite(score));
  if (scored.length === 0) return void 0;
  const total = scored.reduce((sum2, score) => sum2 + score, 0);
  return Math.round(total / scored.length * 1e3) / 1e3;
}
|
|
2133
|
+
/**
 * Decides whether the cost budget is used up.
 *
 * Only `budget.maxCostUsd` is considered here; the run stops once the spent
 * amount reaches or exceeds it. Without a cost cap it never stops.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const cap = budget.maxCostUsd;
  const exhausted = cap !== void 0 && spentCostUsd >= cap;
  if (!exhausted) {
    return { stop: false, reason: "" };
  }
  return { stop: true, reason: `budget exhausted: maxCostUsd=${cap}` };
}
|
|
2142
|
+
/**
 * Writes a "usd" budget ledger entry for the current spend.
 *
 * Does nothing without an emitter or without `budget.maxCostUsd`. The write
 * goes through `runTrace`, so a failing store is recorded in `runtimeErrors`
 * instead of throwing.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  if (!emitter) return;
  const cap = budget.maxCostUsd;
  if (cap === void 0) return;
  // The entry is built inside the callback so any error while building it is
  // handled by runTrace as well.
  const writeEntry = () => emitter.recordBudget({
    dimension: "usd",
    limit: cap,
    consumed: spentCostUsd,
    remaining: Math.max(0, cap - spentCostUsd),
    breached: spentCostUsd >= cap,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeEntry);
}
|
|
2154
|
+
/**
 * Records one judge span per eval result, sequentially.
 *
 * Does nothing without an emitter. Each write goes through `runTrace`, so a
 * failing store is recorded in `runtimeErrors` and the remaining results are
 * still written. Judge spans target `targetSpanId` when given, otherwise the
 * emitter's run id.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  // Invoked lazily inside the runTrace callback so errors raised while
  // building the verdict are captured as trace errors too.
  const toVerdict = (result) => ({
    judgeId: result.objective ? "objective-validator" : "subjective-judge",
    targetSpanId: targetSpanId ?? emitter.runId,
    name: `control-eval/${result.id}`,
    dimension: result.id,
    // Results without a numeric score are mapped to 1 (passed) or 0 (failed).
    score: typeof result.score === "number" ? result.score : result.passed ? 1 : 0,
    rationale: result.detail,
    evidence: result.evidence,
    attributes: {
      phase,
      passed: result.passed,
      severity: result.severity,
      objective: result.objective
    }
  });
  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(toVerdict(result)));
  }
}
|
|
2174
|
+
/**
 * Invokes the optional per-step callback. A failure in the callback is
 * recorded as an "on-step" runtime error instead of being thrown.
 *
 * @param {Function|undefined} onStep - Callback to invoke with the step.
 * @param {{ index: number }} step - Step passed to the callback.
 * @param {Array} runtimeErrors - Collector that receives the recorded failure.
 * @returns {Promise<void>}
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (failure) {
      const recorded = runtimeError("on-step", step.index, failure);
      runtimeErrors.push(recorded);
    }
  }
}
|
|
2182
|
+
/**
 * Runs a trace write. A failure is recorded as a "trace" runtime error
 * instead of being thrown.
 *
 * @param {Array} runtimeErrors - Collector that receives the recorded failure.
 * @param {number} stepIndex - Step index attached to the recorded failure.
 * @param {Function} write - Sync or async function performing the write.
 * @returns {Promise<*>} The write's result, or `undefined` when it failed.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let outcome;
  try {
    outcome = await write();
  } catch (failure) {
    runtimeErrors.push(runtimeError("trace", stepIndex, failure));
    return undefined;
  }
  return outcome;
}
|
|
2190
|
+
/**
 * Tracks how many consecutive steps left both the state fingerprint and the
 * score unchanged, and decides whether the loop is stuck.
 *
 * @param {object} args
 * @param {object} [args.policies] - `maxNoProgressSteps` enables the check; `minScoreDelta` defaults to 1e-3.
 * @param {number} [args.scoreBefore] - Score before the step (treated as 0 when nullish).
 * @param {number} [args.scoreAfter] - Score after the step (treated as 0 when nullish).
 * @param {string} [args.lastStateFingerprint] - Fingerprint from the previous step.
 * @param {string} [args.stateFingerprint] - Fingerprint of the current state.
 * @param {number} args.currentStreak - Streak carried in from the previous step.
 * @returns {{ stop: boolean, reason: string, streak: number }}
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) return { stop: false, reason: "", streak: 0 };

  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const after = args.scoreAfter ?? 0;
  const before = args.scoreBefore ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  // A missing previous fingerprint never counts as "unchanged".
  const stateUnchanged = args.lastStateFingerprint !== undefined
    && args.lastStateFingerprint === args.stateFingerprint;

  let streak = 0;
  if (stateUnchanged && scoreFlat) {
    streak = args.currentStreak + 1;
  }
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
|
|
2200
|
+
/**
 * Decides whether the loop is stuck repeating the same action.
 *
 * @param {{ maxRepeatedActions?: number }|undefined} policies - The check is disabled when the limit is missing or not positive.
 * @param {number} streak - Number of consecutive identical actions.
 * @returns {{ stop: boolean, reason: string }}
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const enabled = Boolean(limit) && !(limit <= 0);
  const tripped = enabled && !(streak < limit);
  return tripped
    ? { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` }
    : { stop: false, reason: "" };
}
|
|
2208
|
+
/**
 * Fingerprints a state, preferring the caller-supplied `policies.stateFingerprint`
 * over the default stable fingerprint.
 *
 * @param {*} state - State to fingerprint.
 * @param {{ stateFingerprint?: Function }} [policies] - Optional custom fingerprint.
 * @returns {*} Whatever the chosen fingerprint function returns.
 */
function fingerprintState(state, policies) {
  const hasCustom = Boolean(policies?.stateFingerprint);
  // Called as a method so the policy object stays bound as `this`.
  return hasCustom ? policies.stateFingerprint(state) : stableFingerprint(state);
}
|
|
2212
|
+
/**
 * Fingerprints an action, preferring the caller-supplied `policies.actionFingerprint`
 * over the default stable fingerprint.
 *
 * @param {*} action - Action to fingerprint.
 * @param {{ actionFingerprint?: Function }} [policies] - Optional custom fingerprint.
 * @returns {*} Whatever the chosen fingerprint function returns.
 */
function fingerprintAction(action, policies) {
  const hasCustom = Boolean(policies?.actionFingerprint);
  // Called as a method so the policy object stays bound as `this`.
  return hasCustom ? policies.actionFingerprint(action) : stableFingerprint(action);
}
|
|
2216
|
+
/**
 * Produces a deterministic string fingerprint for an arbitrary value.
 * Strings are returned unchanged, other primitives are stringified, and
 * objects are serialized as JSON with keys sorted at every level so that
 * property order does not affect the result.
 *
 * Always returns a string: values JSON cannot represent (functions, symbols,
 * cyclic structures, BigInt) fall back to `String(value)`.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} The fingerprint.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols; fall back so the result is always a string.
    return JSON.stringify(sortForFingerprint(value)) ?? String(value);
  } catch {
    // Cycles (stack overflow while sorting) and BigInt end up here.
    return String(value);
  }
}

/**
 * Returns a copy of `value` in which every plain object has its keys in
 * sorted order. Arrays keep their element order.
 *
 * @param {*} value - Value to normalize.
 * @returns {*} The normalized copy; primitives are returned as-is.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  // Honor toJSON the way JSON.stringify would. Without this a Date has no
  // own enumerable keys and every Date collapses to `{}`.
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    if (json !== value) return sortForFingerprint(json);
  }
  const sorted = {};
  for (const key of Object.keys(value).sort()) {
    sorted[key] = sortForFingerprint(value[key]);
  }
  return sorted;
}
|
|
2235
|
+
/**
 * Extracts a human-readable reason from an abort signal.
 *
 * @param {{ reason?: * }} signal - Signal whose `reason` is inspected.
 * @returns {string} The error message for `Error` reasons, the stringified
 *   reason for other truthy values, or "aborted" when no reason is set.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
|
|
2240
|
+
/**
 * Builds a runtime-error record for the given phase and step.
 *
 * @param {string} phase - Phase in which the failure happened.
 * @param {number} stepIndex - Step index the failure belongs to.
 * @param {*} err - Thrown value; non-Error values are stringified.
 * @returns {{ phase: string, stepIndex: number, message: string }}
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
|
|
2244
|
+
/**
 * Closes the traced run with the result's outcome and returns the result.
 * A failure while closing is recorded in `result.runtimeErrors` by `runTrace`.
 *
 * @param {object|undefined} emitter - Emitter exposing `endRun`; optional.
 * @param {object} result - Control-loop result; returned unchanged.
 * @returns {Promise<object>} The same `result` object.
 */
async function finish(emitter, result) {
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    // Fall back to the average of the final evals when no score was set.
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
|
|
2253
|
+
|
|
2254
|
+
// src/feedback-trajectory.ts
|
|
2255
|
+
import { appendFile, mkdir, readFile } from "fs/promises";
|
|
2256
|
+
import { join } from "path";
|
|
2257
|
+
// Default split weights merged under the caller's policy in assignFeedbackSplit.
// The four values sum to 100, so each reads as a percentage of trajectories.
var DEFAULT_SPLIT_POLICY = {
  trainPct: 70,
  devPct: 15,
  testPct: 10,
  holdoutPct: 5
};
|
|
2263
|
+
/**
 * Feedback-trajectory store backed by a Map. Every value entering or leaving
 * the store is cloned, so callers never share object references with it.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();

  /**
   * Stores (or replaces) a trajectory under its id.
   * @param {object} trajectory - Trajectory with an `id`.
   */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }

  /**
   * @param {string} id - Trajectory id.
   * @returns {Promise<object|null>} A copy of the trajectory, or null when unknown.
   */
  async get(id) {
    const trajectory = this.trajectories.get(id);
    return trajectory ? cloneTrajectory(trajectory) : null;
  }

  /**
   * @param {object} [filter] - Filter passed to `matchesFilter`.
   * @returns {Promise<object[]>} Copies of all matching trajectories.
   */
  async list(filter = {}) {
    return [...this.trajectories.values()]
      .filter((trajectory) => matchesFilter(trajectory, filter))
      .map((trajectory) => cloneTrajectory(trajectory));
  }

  /**
   * Appends an attempt and sets `updatedAt` to the attempt's `createdAt`.
   * @param {string} id - Trajectory id.
   * @param {object} attempt - Attempt to append.
   * @returns {Promise<object>} A copy of the updated trajectory.
   * @throws {Error} When the trajectory is unknown.
   */
  async appendAttempt(id, attempt) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const next = cloneTrajectory({
      ...trajectory,
      attempts: [...trajectory.attempts, attempt],
      updatedAt: attempt.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }

  /**
   * Appends a label to the trajectory, or to one attempt's `feedback` list
   * when `attemptId` is given, and sets `updatedAt` to the label's `createdAt`.
   * @param {string} id - Trajectory id.
   * @param {object} label - Label to append.
   * @param {string} [attemptId] - Attempt that receives the label.
   * @returns {Promise<object>} A copy of the updated trajectory.
   * @throws {Error} When the trajectory, or the referenced attempt, is unknown.
   */
  async appendLabel(id, label, attemptId) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    // Without this check a label aimed at a non-existent attempt would be
    // attached nowhere and silently lost.
    if (attemptId && !trajectory.attempts.some((attempt) => attempt.id === attemptId)) {
      throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown attempt "${attemptId}" in trajectory "${id}"`);
    }
    const attempts = attemptId
      ? trajectory.attempts.map((attempt) => attempt.id === attemptId
        ? { ...attempt, feedback: [...attempt.feedback ?? [], label] }
        : attempt)
      : trajectory.attempts;
    const next = cloneTrajectory({
      ...trajectory,
      attempts,
      labels: attemptId ? trajectory.labels : [...trajectory.labels, label],
      updatedAt: label.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
|
|
2300
|
+
/**
 * Feedback-trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`). Reads are served
 * from an in-memory store that is rebuilt by replaying the log on first use.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight replay, shared by concurrent callers so the log is replayed once.
  loading = null;

  /**
   * @param {{ dir: string }} options - `dir` is the directory holding the log file.
   */
  constructor(options) {
    this.dir = options.dir;
  }

  /** Stores a trajectory in memory, then appends a "save" record to the log. */
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }

  /** @returns {Promise<object|null>} The trajectory, or null when unknown. */
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }

  /** @returns {Promise<object[]>} Trajectories matching the filter. */
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }

  /**
   * Appends an attempt in memory first (so invalid ids throw before anything
   * is written), then appends an "appendAttempt" record to the log.
   */
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }

  /**
   * Appends a label in memory first (so invalid ids throw before anything is
   * written), then appends an "appendLabel" record to the log.
   */
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }

  /** Appends one operation record as a JSON line, creating the directory if needed. */
  async append(record) {
    await mkdir(this.dir, { recursive: true });
    await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }

  /**
   * Replays the log into memory exactly once. Concurrent callers await the
   * same replay; previously each concurrent first call replayed the file on
   * its own and applied append records more than once.
   */
  async load() {
    if (this.loaded) return;
    if (!this.loading) this.loading = this.replayLog();
    await this.loading;
    this.loaded = true;
  }

  /**
   * Reads the log and applies every record to the in-memory store.
   * Best-effort by design: a missing or unreadable file is treated as an
   * empty log, and a line that cannot be parsed or applied is skipped.
   */
  async replayLog() {
    const file = join(this.dir, "feedback-trajectories.ndjson");
    let raw;
    try {
      raw = await readFile(file, "utf8");
    } catch {
      return;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        if (record.op === "save") await this.memory.save(record.trajectory);
        if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
        if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
      } catch {
        // Skip the corrupt or inapplicable record and keep replaying.
      }
    }
  }
};
|
|
2356
|
+
/**
 * Builds a feedback trajectory from partial input, filling in defaults.
 *
 * @param {object} input - Trajectory fields. `task` is required; `createdAt`
 *   defaults to the current time (ISO string), `attempts`/`labels` default to
 *   empty arrays, and a missing `id` is derived by hashing project, scenario,
 *   task intent and creation time.
 * @returns {object} The trajectory record.
 */
function createFeedbackTrajectory(input) {
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id == null) {
    const seed = `${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`;
    id = `ft_${stableHash(seed).toString(16)}`;
  }
  const attempts = input.attempts ?? [];
  const labels = input.labels ?? [];
  return {
    id,
    projectId: input.projectId,
    scenarioId: input.scenarioId,
    task: input.task,
    attempts,
    labels,
    outcome: input.outcome,
    split: input.split,
    tags: input.tags,
    createdAt,
    metadata: input.metadata
  };
}
|
|
2373
|
+
/**
 * Deterministically assigns a trajectory to a dataset split by hashing its
 * identity and bucketing the hash by the split weights.
 *
 * @param {object} trajectory - Trajectory; `projectId`, `scenarioId`, `id` and `task.intent` feed the hash.
 * @param {object} [policy] - Overrides for the default split weights.
 * @returns {"train"|"dev"|"test"|"holdout"} The assigned split.
 * @throws {Error} When the weights do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  const { trainPct, devPct, testPct, holdoutPct } = { ...DEFAULT_SPLIT_POLICY, ...policy };
  const total = trainPct + devPct + testPct + holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");

  const identity = `${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`;
  const bucket = stableHash(identity) % total;

  // Cumulative upper bounds, checked in order; anything past the last is holdout.
  const bounds = [
    ["train", trainPct],
    ["dev", trainPct + devPct],
    ["test", trainPct + devPct + testPct]
  ];
  for (const [name, upper] of bounds) {
    if (bucket < upper) return name;
  }
  return "holdout";
}
|
|
2383
|
+
/**
 * Returns a shallow copy of the trajectory that is guaranteed to carry a
 * split: the existing one when present, otherwise a freshly assigned one.
 *
 * @param {object} trajectory - Source trajectory (not mutated).
 * @param {object} [policy] - Split policy forwarded to `assignFeedbackSplit`.
 * @returns {object} Copy of the trajectory with `split` set.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return { ...trajectory, split };
}
|
|
2389
|
+
/**
 * Converts a feedback trajectory into a dataset scenario. The trajectory
 * (with a split assigned if it had none) becomes the payload.
 *
 * @param {object} trajectory - Source trajectory.
 * @returns {{ id: string, split: string, payload: object, tags: object }}
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const payload = withAssignedFeedbackSplit(trajectory);
  const projectTag = payload.projectId ? { projectId: payload.projectId } : {};
  const ownTags = payload.tags ?? {};
  return {
    id: payload.scenarioId ?? payload.id,
    split: payload.split,
    payload,
    // `source` comes last so trajectory tags cannot override it.
    tags: { ...projectTag, ...ownTags, source: "feedback-trajectory" }
  };
}
|
|
2402
|
+
/**
 * Converts each feedback trajectory into a dataset scenario.
 *
 * @param {object[]} trajectories - Source trajectories.
 * @returns {object[]} One scenario per trajectory, in input order.
 */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToDatasetScenario(trajectory));
}
|
|
2405
|
+
/**
 * Flattens a feedback trajectory into an optimizer row.
 *
 * @param {object} trajectory - Source trajectory.
 * @returns {object} Row with scenario/trajectory ids, the distinct label
 *   kinds, a score (the outcome score, else one derived from the labels) and
 *   descriptive metadata.
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const labelKinds = [...new Set(labels.map(({ kind }) => kind))];
  const score = trajectory.outcome?.score ?? scoreFromLabels(labels);
  const metadata = {
    projectId: trajectory.projectId,
    split: trajectory.split,
    intent: trajectory.task.intent,
    attempts: trajectory.attempts.length,
    outcome: trajectory.outcome,
    labels
  };
  return {
    scenarioId: trajectory.scenarioId ?? trajectory.id,
    trajectoryId: trajectory.id,
    labelKinds,
    score,
    metadata
  };
}
|
|
2422
|
+
/**
 * Converts each feedback trajectory into an optimizer row.
 *
 * @param {object[]} trajectories - Source trajectories.
 * @returns {object[]} One row per trajectory, in input order.
 */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToOptimizerRow(trajectory));
}
|
|
2425
|
+
/**
 * Distills trajectory labels into a ranked list of preference instructions.
 * Instructions are de-duplicated case- and whitespace-insensitively, keeping
 * the highest-weight entry, then sorted by descending weight.
 *
 * @param {Iterable<object>} trajectories - Trajectories to mine for labels.
 * @param {{ maxEntries?: number }} [options] - `maxEntries` caps the result (default 20).
 * @returns {object[]} Preference-memory entries.
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const limit = options.maxEntries ?? 20;
  const strongestByKey = /* @__PURE__ */ new Map();
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (!instruction) continue;
      const candidate = {
        instruction,
        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
        weight: weightForLabel(label),
        sourceTrajectoryId: trajectory.id,
        sourceLabelId: label.id,
        category: label.kind
      };
      const key = instruction.toLowerCase().replace(/\s+/g, " ").trim();
      const current = strongestByKey.get(key);
      // Only a strictly heavier entry replaces the one already kept.
      if (!current || candidate.weight > current.weight) strongestByKey.set(key, candidate);
    }
  }
  const ranked = [...strongestByKey.values()].sort((left, right) => right.weight - left.weight);
  return ranked.slice(0, limit);
}
|
|
2450
|
+
/**
 * Renders preference-memory entries as a Markdown bullet list under a
 * "# Preference Memory" heading, ending with exactly one newline.
 *
 * @param {Iterable<object>} entries - Entries with `instruction`, `rationale` and `sourceTrajectoryId`.
 * @returns {string} The Markdown document.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const lines = ["# Preference Memory", ""];
  for (const { instruction, rationale, sourceTrajectoryId } of entries) {
    lines.push(
      `- ${instruction}`,
      `  Rationale: ${rationale}`,
      `  Source: ${sourceTrajectoryId}`,
      ""
    );
  }
  const body = lines.join("\n").trim();
  return body + "\n";
}
|
|
2460
|
+
/**
 * Serializes trajectories as JSONL: one canonicalized JSON object per line,
 * ordered by trajectory id, with a trailing newline. The input array is not
 * mutated.
 *
 * @param {object[]} trajectories - Trajectories to serialize.
 * @returns {string} The JSONL text.
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = trajectories.slice();
  ordered.sort((left, right) => left.id.localeCompare(right.id));
  const lines = ordered.map((trajectory) => JSON.stringify(canonicalize(trajectory)));
  return lines.join("\n") + "\n";
}
|
|
2463
|
+
/**
 * Parses JSONL text into trajectories, one per non-blank line.
 *
 * @param {string} jsonl - JSONL text; blank lines are ignored.
 * @returns {object[]} The parsed trajectories, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   1-based line number and the original error is attached as `cause`.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  const trajectories = [];
  for (const [index, line] of jsonl.split("\n").entries()) {
    if (!line.trim()) continue;
    try {
      trajectories.push(JSON.parse(line));
    } catch (err) {
      // A bare JSON.parse error gives no hint which record of the file is
      // broken; re-throw the same error type with the line number.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(
        `parseFeedbackTrajectoriesJsonl: invalid JSON on line ${index + 1}: ${detail}`,
        { cause: err }
      );
    }
  }
  return trajectories;
}
|
|
2471
|
+
/**
 * Converts a finished control-loop run into a feedback trajectory: one
 * attempt per step, one system approve/reject label for the overall result,
 * and an outcome summarizing pass, score and cost.
 *
 * @param {object} run - Control-loop result (`intent`, `steps`, `pass`, `reason`, `score`, ...).
 * @param {object} [options] - `createdAt`, `projectId`, `scenarioId`,
 *   `artifactType`, and the optional `artifactFromStep` / `proposedActionFromStep` mappers.
 * @returns {object} The feedback trajectory.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;

  const attemptFromStep = (step) => ({
    id: `${trajectoryId}_step_${step.index}`,
    stepIndex: step.index,
    artifactType: options.artifactType ?? "action",
    // Prefer the caller's mapper, then the action result, then the decision.
    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
    proposedAction: options.proposedActionFromStep?.(step),
    evals: step.evalsAfter,
    createdAt: step.startedAt,
    metadata: {
      decision: step.decision,
      actionOutcome: step.actionOutcome
    }
  });

  const attempts = run.steps.map((step) => attemptFromStep(step));
  const verdictLabel = {
    source: "system",
    kind: run.pass ? "approve" : "reject",
    value: run.pass,
    reason: run.reason,
    severity: run.pass ? "info" : "error",
    createdAt
  };
  const outcome = {
    success: run.pass,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };

  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts,
    labels: [verdictLabel],
    outcome
  });
}
|
|
2516
|
+
/**
 * Collects the trajectory-level labels followed by every attempt's feedback
 * labels, dropping duplicates. Labels are identified by `id` when present,
 * otherwise by source, kind, creation time and JSON-encoded value; the first
 * occurrence wins.
 *
 * @param {{ labels: object[], attempts: object[] }} trajectory - Source trajectory.
 * @returns {object[]} The de-duplicated labels, in first-seen order.
 */
function allLabels(trajectory) {
  const candidates = [
    ...trajectory.labels,
    ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
  ];
  const seenKeys = /* @__PURE__ */ new Set();
  const unique = [];
  for (const label of candidates) {
    const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    unique.push(label);
  }
  return unique;
}
|
|
2529
|
+
/**
 * Derives a 0..1 score from labels: approve/select count as 1,
 * reject/policy_block as 0, and numeric `rate` values are clamped to [0, 1].
 * Other labels, and `rate` labels whose value is NaN, are ignored.
 *
 * @param {object[]} labels - Labels to score.
 * @returns {number|undefined} The mean rounded to three decimals, or
 *   `undefined` when no label contributes a score.
 */
function scoreFromLabels(labels) {
  if (!labels.length) return undefined;
  const scored = [];
  for (const label of labels) {
    if (label.kind === "approve" || label.kind === "select") {
      scored.push(1);
    } else if (label.kind === "reject" || label.kind === "policy_block") {
      scored.push(0);
    } else if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) {
      // NaN is typeof "number" and survives clamping, so a single NaN rating
      // would otherwise turn the whole average into NaN.
      scored.push(Math.max(0, Math.min(1, label.value)));
    }
  }
  if (!scored.length) return undefined;
  const total = scored.reduce((sum, value) => sum + value, 0);
  return Math.round(total / scored.length * 1e3) / 1e3;
}
|
|
2540
|
+
/**
 * Turns a label into a natural-language preference instruction.
 *
 * @param {{ task: { intent: string } }} trajectory - Trajectory the label belongs to.
 * @param {{ kind: string, reason?: string }} label - Label to translate.
 * @returns {string|undefined} The instruction, or `undefined` when the label
 *   has no reason or its kind does not map to an instruction.
 */
function instructionFromLabel(trajectory, label) {
  // Every supported kind needs a reason to produce an instruction.
  if (!label.reason) return undefined;
  switch (label.kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${label.reason}`;
    case "revision_request":
      return `Revise similar work by applying: ${label.reason}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${label.reason}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${label.reason}`;
    case "comment":
      return label.reason;
    default:
      return undefined;
  }
}
|
|
2548
|
+
/**
 * Computes a ranking weight for a label as severity weight times source weight.
 * Severity: critical 4, error 3, warning 2, anything else 1.
 * Source: user 3, metric/environment 2, anything else 1.
 *
 * @param {{ severity?: string, source?: string }} label - Label to weigh.
 * @returns {number} The weight (1 to 12).
 */
function weightForLabel(label) {
  let severityWeight;
  switch (label.severity) {
    case "critical":
      severityWeight = 4;
      break;
    case "error":
      severityWeight = 3;
      break;
    case "warning":
      severityWeight = 2;
      break;
    default:
      severityWeight = 1;
  }
  let sourceWeight;
  switch (label.source) {
    case "user":
      sourceWeight = 3;
      break;
    case "metric":
    case "environment":
      sourceWeight = 2;
      break;
    default:
      sourceWeight = 1;
  }
  return severityWeight * sourceWeight;
}
|
|
2553
|
+
/**
 * Tests a trajectory against a list filter. Only filter fields that are set
 * (truthy) are checked.
 *
 * @param {object} trajectory - Trajectory to test.
 * @param {object} filter - Optional `projectId`, `scenarioId`, `split`, and `tag` as a `[key, value]` pair.
 * @returns {boolean} True when every set filter field matches.
 */
function matchesFilter(trajectory, filter) {
  const exactFields = ["projectId", "scenarioId", "split"];
  const mismatch = exactFields.some((field) => filter[field] && trajectory[field] !== filter[field]);
  if (mismatch) return false;
  if (!filter.tag) return true;
  const [tagKey, tagValue] = filter.tag;
  return trajectory.tags?.[tagKey] === tagValue;
}
|
|
2563
|
+
/**
 * Deep-copies a trajectory through a JSON round trip. Only JSON-representable
 * data survives: properties holding `undefined` are dropped.
 *
 * @param {object} trajectory - Trajectory to copy.
 * @returns {object} An independent copy.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
|
|
2566
|
+
/**
 * Collapses whitespace runs to single spaces, trims, and truncates to `max`
 * characters, appending "..." when text was cut.
 *
 * @param {string} value - Text to compact.
 * @param {number} max - Maximum length before the ellipsis.
 * @returns {string} The compacted text.
 */
function compact(value, max) {
  const singleSpaced = value.replace(/\s+/g, " ").trim();
  if (singleSpaced.length <= max) {
    return singleSpaced;
  }
  const head = singleSpaced.slice(0, max).trim();
  return `${head}...`;
}
|
|
2570
|
+
/**
 * Hashes a string to an unsigned 32-bit integer: starting from 2166136261,
 * each UTF-16 code unit is XORed in and the result multiplied by 16777619
 * with 32-bit wraparound.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit hash.
 */
function stableHash(input) {
  const PRIME = 16777619;
  let hash = 2166136261;
  let position = 0;
  while (position < input.length) {
    hash = Math.imul(hash ^ input.charCodeAt(position), PRIME);
    position += 1;
  }
  // Convert the signed 32-bit intermediate to an unsigned value.
  return hash >>> 0;
}
|
|
2578
|
+
/**
 * Returns a copy of `value` in which every object has its keys in sorted
 * order, so that `JSON.stringify` output does not depend on property order.
 * Arrays keep their element order; primitives are returned unchanged.
 *
 * @param {*} value - Value to canonicalize.
 * @returns {*} The canonical copy.
 */
function canonicalize(value) {
  if (value === null || typeof value !== "object") return value;
  if (Array.isArray(value)) return value.map((item) => canonicalize(item));
  // Honor toJSON the way JSON.stringify would. Without this a Date has no
  // own enumerable keys and would be serialized as `{}`.
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    if (json !== value) return canonicalize(json);
  }
  const out = {};
  for (const key of Object.keys(value).sort()) {
    out[key] = canonicalize(value[key]);
  }
  return out;
}
|
|
2587
|
+
|
|
1389
2588
|
// src/prompt-registry.ts
|
|
1390
2589
|
var PromptRegistry = class {
|
|
1391
2590
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -3101,184 +4300,6 @@ var FileSystemTraceStore = class {
|
|
|
3101
4300
|
}
|
|
3102
4301
|
};
|
|
3103
4302
|
|
|
3104
|
-
// src/trace/emitter.ts
|
|
3105
|
-
/**
 * Writes the runs, spans, events, budget entries and artifacts of a single
 * run to a trace store. Spans opened through `span()` are tracked on a stack
 * so nested spans, events and budget entries pick up the innermost open span
 * by default.
 */
var TraceEmitter = class {
  store;
  // Ids of spans opened via span() and not yet ended; the last is the implicit parent.
  stack = [];
  _runId;
  now;
  id;

  /**
   * @param {object} store - Trace store receiving all writes.
   * @param {object} [options] - Optional `now` (clock), `id` (id generator) and `runId` overrides.
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }

  get runId() {
    return this._runId;
  }

  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }

  /** Marks the run "failed" when `outcome.pass` is exactly false, otherwise "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }

  /** Marks the run "aborted" with a failing outcome carrying the reason. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }

  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, stores it, pushes it on the stack and returns its handle.
   * The parent is `init.parentSpanId` when set, otherwise the innermost open span.
   */
  async span(init) {
    const spanId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId,
      parentSpanId: parent,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    // `init` may carry an explicit `parentSpanId: undefined`, which the spread
    // above would write over the computed parent; restore it.
    span.parentSpanId = parent;
    await this.store.appendSpan(span);
    this.stack.push(spanId);
    return this.handle(span);
  }

  /** Builds the `{ span, end, fail }` handle for an open span. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Pop even when the store write fails, so a broken store cannot
          // leave a stale parent on the stack for later spans.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }

  /** Removes the most recent occurrence of `spanId` from the stack, if any. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }

  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }

  tool(init) {
    return this.span({ kind: "tool", ...init });
  }

  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }

  /** Stores a judge verdict as an already-finished span; it is not pushed on the stack. */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }

  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }

  // ── Events ─────────────────────────────────────────────────────────
  /** Stores an event, attached to `event.spanId` or the innermost open span. */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }

  // ── Budget ledger ──────────────────────────────────────────────────
  /** Stores a budget entry and, when it is breached, also emits a "budget_breach" event. */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }

  // ── Artifacts ──────────────────────────────────────────────────────
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }

  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
|
|
3263
|
-
/**
 * Generates a random id: a UUID from `globalThis.crypto.randomUUID` when
 * available, otherwise a base-36 timestamp joined to a short random suffix.
 *
 * @returns {string} The generated id.
 */
function cryptoRandomId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const stamp = Date.now().toString(36);
  const suffix = Math.random().toString(36).slice(2, 10);
  return `${stamp}-${suffix}`;
}
|
|
3267
|
-
/**
 * Maps provider call details onto the fields of an LLM span.
 *
 * @param {object} args - `model`, `messages`, `output`, and optional `name`,
 *   `usage` (token counts), `costUsd` and `finishReason`.
 * @returns {object} Span fields; `name` falls back to the model.
 */
function llmSpanFromProvider(args) {
  const { model, messages, output, usage, costUsd, finishReason } = args;
  return {
    name: args.name ?? model,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
}
|
|
3281
|
-
|
|
3282
4303
|
// src/sandbox-harness.ts
|
|
3283
4304
|
var vitestTestParser = {
|
|
3284
4305
|
id: "vitest",
|
|
@@ -3887,6 +4908,157 @@ function safeJson(x) {
|
|
|
3887
4908
|
}
|
|
3888
4909
|
}
|
|
3889
4910
|
|
|
4911
|
+
// src/propose-review-control.ts
|
|
4912
|
+
// Instruction used for the next shot when no reviewer instruction is available.
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Runs a propose → verify → review cycle on top of `runAgentControlLoop`.
 * Each step ("shot") proposes a new state, verifies it, and — when
 * verification fails — asks the reviewer whether and how to continue.
 *
 * The loop stops when verification passes, when the reviewer declines to
 * continue, when reviewer confidence stays at or below `confidenceFloor` for
 * `confidenceFloorWindow` consecutive reviews, or when the step/time budget
 * passed to the control loop runs out. A reviewer failure does not stop the
 * loop: the previous review (or a fallback) is reused and the loop continues.
 *
 * @param {object} config - `goal`, `initialState`, the `propose`/`verify`/`review`
 *   callbacks, and optional `maxShots` (10), `confidenceFloor` (0.3),
 *   `confidenceFloorWindow` (2), `memory`, `fallbackInstruction`,
 *   `failureClassFromVerification`, `maxWallMs`, `store`, `scenarioId`,
 *   `projectId`, `variantId` and `actionFailure` ("stop").
 * @returns {Promise<object>} Whatever `runAgentControlLoop` resolves to.
 */
async function runProposeReviewAsControlLoop(config) {
  const shotLimit = config.maxShots ?? 10;
  const floor = config.confidenceFloor ?? 0.3;
  const floorWindow = config.confidenceFloorWindow ?? 2;
  const reviewMemory = config.memory ?? inMemoryReviewStore();
  const defaultInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const classifyFailure = config.failureClassFromVerification ?? controlFailureClassFromVerification;

  let lowConfidenceRun = 0;
  // Latest loop snapshot; replaced after every shot and handed out by `observe`.
  let snapshot = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await reviewMemory.load(),
    completed: false,
    reviewAvailable: false
  };

  const verificationDetail = (verification) => {
    if (verification.pass) return "verification passed";
    const layers = verification.failingLayers?.length ? `: ${verification.failingLayers.join(", ")}` : "";
    return `verification failed${layers}`;
  };

  const stopDecision = ({ state }) => {
    const { verification } = state;
    if (verification.pass) {
      return { stop: true, pass: true, reason: "verification passed", score: verification.score };
    }
    if (!state.completed) {
      return { stop: false, pass: false, reason: "verification still failing", score: verification.score };
    }
    return {
      stop: true,
      pass: false,
      reason: "reviewer stopped continuation",
      score: verification.score,
      failureClass: classifyFailure(verification)
    };
  };

  const runShot = async (action, ctx) => {
    const shot = action.shot;
    const proposal = await config.propose({
      shot,
      goal: config.goal,
      state: snapshot.state,
      priorReview: snapshot.priorReview,
      abortSignal: ctx.abortSignal,
      emitter: ctx.emitter
    });
    const nextState = proposal.state;
    const verification = await config.verify(nextState);

    let review = null;
    let reviewAvailable = false;
    let reviewError;
    let shouldContinue = !verification.pass;

    if (verification.pass) {
      review = {
        observations: "Verification passed.",
        diagnosis: "No further revision needed.",
        nextShotInstruction: "",
        shouldContinue: false,
        confidence: 1
      };
    } else {
      try {
        review = await config.review({
          shot,
          goal: config.goal,
          state: nextState,
          verification,
          traceSummary: proposal.traceSummary,
          memory: await reviewMemory.load()
        });
        reviewAvailable = true;
        shouldContinue = review.shouldContinue;
        lowConfidenceRun = review.confidence <= floor ? lowConfidenceRun + 1 : 0;
        // Too many low-confidence reviews in a row ends the loop.
        if (floorWindow > 0 && lowConfidenceRun >= floorWindow) shouldContinue = false;
      } catch (err) {
        // Reviewer unavailable: reuse the last review (or a fallback) and keep going.
        reviewError = err instanceof Error ? err.message : String(err);
        review = snapshot.priorReview ?? {
          observations: "Reviewer unavailable.",
          diagnosis: reviewError,
          nextShotInstruction: defaultInstruction,
          shouldContinue: true,
          confidence: 0
        };
        shouldContinue = true;
      }
    }

    const reviewFields = review ?? {
      observations: "No review.",
      diagnosis: "",
      nextShotInstruction: defaultInstruction,
      shouldContinue,
      confidence: 0
    };
    const entry = {
      ...reviewFields,
      shot,
      timestamp: Date.now(),
      verification: {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers
      }
    };
    await reviewMemory.append(entry);

    snapshot = {
      shot,
      state: nextState,
      priorReview: review,
      verification,
      traceSummary: proposal.traceSummary,
      memory: await reviewMemory.load(),
      completed: verification.pass || !shouldContinue,
      reviewAvailable,
      reviewError
    };
    return {
      state: nextState,
      verification,
      traceSummary: proposal.traceSummary,
      review,
      reviewAvailable,
      reviewError
    };
  };

  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: shotLimit, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => snapshot,
    validate: ({ state }) => [
      objectiveEval({
        id: "verification",
        passed: state.verification.pass,
        score: state.verification.score,
        severity: "critical",
        detail: verificationDetail(state.verification)
      })
    ],
    shouldStop: stopDecision,
    decide: ({ state }) => ({
      type: "continue",
      action: { type: "propose-review-shot", shot: state.shot + 1 },
      reason: state.priorReview?.nextShotInstruction ?? defaultInstruction
    }),
    act: runShot
  });
}
|
|
5057
|
+
function controlFailureClassFromVerification(verification) {
|
|
5058
|
+
if (verification.pass) return void 0;
|
|
5059
|
+
return verification.failingLayers?.length ? "instruction_following" : "unknown";
|
|
5060
|
+
}
|
|
5061
|
+
|
|
3890
5062
|
// src/trace/schema.ts
|
|
3891
5063
|
var TRACE_SCHEMA_VERSION = "1.0.0";
|
|
3892
5064
|
var FAILURE_CLASSES = [
|
|
@@ -5210,7 +6382,7 @@ function assertNonNegative(n, name) {
|
|
|
5210
6382
|
|
|
5211
6383
|
// src/muffled-gate-scanner.ts
|
|
5212
6384
|
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
5213
|
-
import { join } from "path";
|
|
6385
|
+
import { join as join2 } from "path";
|
|
5214
6386
|
function codeOf(line) {
|
|
5215
6387
|
return line.replace(/\/\/.*$/, "").replace(/^\s*\*.*$/, "");
|
|
5216
6388
|
}
|
|
@@ -5314,11 +6486,11 @@ var UNIVERSAL_FINDERS = [
|
|
|
5314
6486
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
5315
6487
|
const matches2 = [];
|
|
5316
6488
|
const walk = (rel) => {
|
|
5317
|
-
const abs =
|
|
6489
|
+
const abs = join2(repoRoot, rel);
|
|
5318
6490
|
if (!existsSync2(abs)) return;
|
|
5319
6491
|
for (const entry of readdirSync(abs)) {
|
|
5320
|
-
const sub =
|
|
5321
|
-
const subAbs =
|
|
6492
|
+
const sub = join2(rel, entry);
|
|
6493
|
+
const subAbs = join2(repoRoot, sub);
|
|
5322
6494
|
let st;
|
|
5323
6495
|
try {
|
|
5324
6496
|
st = statSync(subAbs);
|
|
@@ -5347,7 +6519,7 @@ function scanForMuffledGates(opts) {
|
|
|
5347
6519
|
const findings = [];
|
|
5348
6520
|
const scanned = /* @__PURE__ */ new Set();
|
|
5349
6521
|
for (const file of opts.scanFiles) {
|
|
5350
|
-
const abs =
|
|
6522
|
+
const abs = join2(opts.repoRoot, file);
|
|
5351
6523
|
if (!existsSync2(abs)) continue;
|
|
5352
6524
|
const text = readFileSync2(abs, "utf8");
|
|
5353
6525
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
@@ -5362,7 +6534,7 @@ function scanForMuffledGates(opts) {
|
|
|
5362
6534
|
);
|
|
5363
6535
|
for (const file of importers) {
|
|
5364
6536
|
if (scanned.has(file)) continue;
|
|
5365
|
-
const abs =
|
|
6537
|
+
const abs = join2(opts.repoRoot, file);
|
|
5366
6538
|
if (!existsSync2(abs)) continue;
|
|
5367
6539
|
const text = readFileSync2(abs, "utf8");
|
|
5368
6540
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
@@ -5557,7 +6729,7 @@ var Dataset = class _Dataset {
|
|
|
5557
6729
|
* Write to disk for contamination-verifiable archives.
|
|
5558
6730
|
*/
|
|
5559
6731
|
toJsonl() {
|
|
5560
|
-
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(
|
|
6732
|
+
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
|
|
5561
6733
|
}
|
|
5562
6734
|
static fromJsonl(jsonl, manifest) {
|
|
5563
6735
|
const scenarios = [];
|
|
@@ -5570,18 +6742,18 @@ var Dataset = class _Dataset {
|
|
|
5570
6742
|
}
|
|
5571
6743
|
};
|
|
5572
6744
|
async function hashScenarios(scenarios) {
|
|
5573
|
-
const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(
|
|
6745
|
+
const canonical = scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map(canonicalize2);
|
|
5574
6746
|
const text = JSON.stringify(canonical);
|
|
5575
6747
|
const bytes = new TextEncoder().encode(text);
|
|
5576
6748
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
5577
6749
|
return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
5578
6750
|
}
|
|
5579
|
-
function
|
|
6751
|
+
function canonicalize2(v) {
|
|
5580
6752
|
if (v === null || typeof v !== "object") return v;
|
|
5581
|
-
if (Array.isArray(v)) return v.map(
|
|
6753
|
+
if (Array.isArray(v)) return v.map(canonicalize2);
|
|
5582
6754
|
const keys = Object.keys(v).sort();
|
|
5583
6755
|
const out = {};
|
|
5584
|
-
for (const k of keys) out[k] =
|
|
6756
|
+
for (const k of keys) out[k] = canonicalize2(v[k]);
|
|
5585
6757
|
return out;
|
|
5586
6758
|
}
|
|
5587
6759
|
function seededShuffle(items, seed) {
|
|
@@ -7350,7 +8522,7 @@ async function commitBisect(options) {
|
|
|
7350
8522
|
}
|
|
7351
8523
|
async function promptBisect(options) {
|
|
7352
8524
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7353
|
-
const
|
|
8525
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
7354
8526
|
const goodParas = split(options.good);
|
|
7355
8527
|
const badParas = split(options.bad);
|
|
7356
8528
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7368,7 +8540,7 @@ async function promptBisect(options) {
|
|
|
7368
8540
|
const result = await bisect({
|
|
7369
8541
|
good: goodMask,
|
|
7370
8542
|
bad: badMask,
|
|
7371
|
-
runEval: (mask) => options.runEval(
|
|
8543
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
7372
8544
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7373
8545
|
halfway: (g, b) => {
|
|
7374
8546
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7399,12 +8571,12 @@ async function promptBisect(options) {
|
|
|
7399
8571
|
}
|
|
7400
8572
|
}
|
|
7401
8573
|
const materializedPath = result.path.map((s) => ({
|
|
7402
|
-
state:
|
|
8574
|
+
state: join4(paragraphsFor(s.state)),
|
|
7403
8575
|
score: s.score,
|
|
7404
8576
|
pass: s.pass
|
|
7405
8577
|
}));
|
|
7406
8578
|
return {
|
|
7407
|
-
culprit:
|
|
8579
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
7408
8580
|
path: materializedPath,
|
|
7409
8581
|
converged: result.converged,
|
|
7410
8582
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7615,7 +8787,7 @@ function attributeStep(op, prmA, prmB) {
|
|
|
7615
8787
|
|
|
7616
8788
|
// src/pre-registration.ts
|
|
7617
8789
|
async function signManifest(m) {
|
|
7618
|
-
const canonical =
|
|
8790
|
+
const canonical = canonicalize3(m);
|
|
7619
8791
|
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
7620
8792
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
7621
8793
|
const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
@@ -7645,12 +8817,12 @@ async function evaluateHypothesis(manifest, observed) {
|
|
|
7645
8817
|
rejectionReasons: reasons
|
|
7646
8818
|
};
|
|
7647
8819
|
}
|
|
7648
|
-
function
|
|
8820
|
+
function canonicalize3(v) {
|
|
7649
8821
|
if (v === null || typeof v !== "object") return v;
|
|
7650
|
-
if (Array.isArray(v)) return v.map(
|
|
8822
|
+
if (Array.isArray(v)) return v.map(canonicalize3);
|
|
7651
8823
|
const keys = Object.keys(v).sort();
|
|
7652
8824
|
const out = {};
|
|
7653
|
-
for (const k of keys) out[k] =
|
|
8825
|
+
for (const k of keys) out[k] = canonicalize3(v[k]);
|
|
7654
8826
|
return out;
|
|
7655
8827
|
}
|
|
7656
8828
|
|
|
@@ -8459,7 +9631,7 @@ function mergeSignals(a, b) {
|
|
|
8459
9631
|
// src/command-runner.ts
|
|
8460
9632
|
import { spawnSync } from "child_process";
|
|
8461
9633
|
import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
|
|
8462
|
-
import { join as
|
|
9634
|
+
import { join as join3 } from "path";
|
|
8463
9635
|
var localCommandRunner = {
|
|
8464
9636
|
name: "local",
|
|
8465
9637
|
async run(input) {
|
|
@@ -8506,7 +9678,7 @@ var localCommandRunner = {
|
|
|
8506
9678
|
const out = [];
|
|
8507
9679
|
for (const name of entries) {
|
|
8508
9680
|
try {
|
|
8509
|
-
const st = statSync2(
|
|
9681
|
+
const st = statSync2(join3(path, name));
|
|
8510
9682
|
out.push({
|
|
8511
9683
|
name,
|
|
8512
9684
|
isDirectory: st.isDirectory(),
|
|
@@ -12426,6 +13598,7 @@ export {
|
|
|
12426
13598
|
ExperimentTracker,
|
|
12427
13599
|
FAILURE_CLASSES,
|
|
12428
13600
|
FileSystemExperimentStore,
|
|
13601
|
+
FileSystemFeedbackTrajectoryStore,
|
|
12429
13602
|
FileSystemOutcomeStore,
|
|
12430
13603
|
FileSystemTraceStore,
|
|
12431
13604
|
HeldOutGate,
|
|
@@ -12433,6 +13606,7 @@ export {
|
|
|
12433
13606
|
HoldoutLockedError,
|
|
12434
13607
|
INTENT_MATCH_JUDGE_VERSION,
|
|
12435
13608
|
InMemoryExperimentStore,
|
|
13609
|
+
InMemoryFeedbackTrajectoryStore,
|
|
12436
13610
|
InMemoryOutcomeStore,
|
|
12437
13611
|
InMemoryTraceStore,
|
|
12438
13612
|
InMemoryTrialCache,
|
|
@@ -12472,9 +13646,11 @@ export {
|
|
|
12472
13646
|
adversarialJudge,
|
|
12473
13647
|
aggregateLlm,
|
|
12474
13648
|
aggregateRunScore,
|
|
13649
|
+
allCriticalPassed,
|
|
12475
13650
|
analyzeAntiSlop,
|
|
12476
13651
|
analyzeSeries,
|
|
12477
13652
|
argHash,
|
|
13653
|
+
assignFeedbackSplit,
|
|
12478
13654
|
attributeCounterfactuals,
|
|
12479
13655
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
12480
13656
|
benchmarks_exports as benchmarks,
|
|
@@ -12512,6 +13688,8 @@ export {
|
|
|
12512
13688
|
computeToolUseMetrics,
|
|
12513
13689
|
confidenceInterval,
|
|
12514
13690
|
containsAll,
|
|
13691
|
+
controlFailureClassFromVerification,
|
|
13692
|
+
controlRunToFeedbackTrajectory,
|
|
12515
13693
|
correlateLayers,
|
|
12516
13694
|
correlationStudy,
|
|
12517
13695
|
createAntiSlopJudge,
|
|
@@ -12519,6 +13697,7 @@ export {
|
|
|
12519
13697
|
createCustomJudge,
|
|
12520
13698
|
createDefaultReviewer,
|
|
12521
13699
|
createDomainExpertJudge,
|
|
13700
|
+
createFeedbackTrajectory,
|
|
12522
13701
|
createIntentMatchJudge,
|
|
12523
13702
|
createLlmReviewer,
|
|
12524
13703
|
createSandboxCodeMutator,
|
|
@@ -12547,6 +13726,10 @@ export {
|
|
|
12547
13726
|
extractAssetUrls,
|
|
12548
13727
|
extractErrorCount,
|
|
12549
13728
|
failureClusterView,
|
|
13729
|
+
feedbackTrajectoriesToDatasetScenarios,
|
|
13730
|
+
feedbackTrajectoriesToOptimizerRows,
|
|
13731
|
+
feedbackTrajectoryToDatasetScenario,
|
|
13732
|
+
feedbackTrajectoryToOptimizerRow,
|
|
12550
13733
|
fileContains,
|
|
12551
13734
|
fileExists,
|
|
12552
13735
|
findAutoMatchNoExpectation,
|
|
@@ -12601,6 +13784,7 @@ export {
|
|
|
12601
13784
|
nonRefusalRubric,
|
|
12602
13785
|
normalizeScores,
|
|
12603
13786
|
notBlocked,
|
|
13787
|
+
objectiveEval,
|
|
12604
13788
|
outputLengthRubric,
|
|
12605
13789
|
pairedBootstrap,
|
|
12606
13790
|
pairedTTest,
|
|
@@ -12609,6 +13793,7 @@ export {
|
|
|
12609
13793
|
paretoChart,
|
|
12610
13794
|
paretoFrontier,
|
|
12611
13795
|
paretoFrontierWithCrowding,
|
|
13796
|
+
parseFeedbackTrajectoriesJsonl,
|
|
12612
13797
|
parseReflectionResponse,
|
|
12613
13798
|
parseRunRecordSafe,
|
|
12614
13799
|
partialCredit,
|
|
@@ -12635,6 +13820,7 @@ export {
|
|
|
12635
13820
|
renderMarkdown,
|
|
12636
13821
|
renderMarkdownReport,
|
|
12637
13822
|
renderPlaybookMarkdown,
|
|
13823
|
+
renderPreferenceMemoryMarkdown,
|
|
12638
13824
|
renderSteeringText,
|
|
12639
13825
|
replayScorerOverCorpus,
|
|
12640
13826
|
replayTraceThroughJudge,
|
|
@@ -12644,6 +13830,7 @@ export {
|
|
|
12644
13830
|
roundTripRunRecord,
|
|
12645
13831
|
rowCount,
|
|
12646
13832
|
rowWhere,
|
|
13833
|
+
runAgentControlLoop,
|
|
12647
13834
|
runAssertions,
|
|
12648
13835
|
runCanaries,
|
|
12649
13836
|
runCounterfactual,
|
|
@@ -12657,6 +13844,7 @@ export {
|
|
|
12657
13844
|
runKeywordCoverageJudgeUrl,
|
|
12658
13845
|
runPromptEvolution,
|
|
12659
13846
|
runProposeReview,
|
|
13847
|
+
runProposeReviewAsControlLoop,
|
|
12660
13848
|
runReferenceReplay,
|
|
12661
13849
|
runSelfPlay,
|
|
12662
13850
|
runSemanticConceptJudge,
|
|
@@ -12673,13 +13861,18 @@ export {
|
|
|
12673
13861
|
selectHarnessVariant,
|
|
12674
13862
|
selfPreference,
|
|
12675
13863
|
sentenceReorderMutator,
|
|
13864
|
+
serializeFeedbackTrajectoriesJsonl,
|
|
12676
13865
|
signManifest,
|
|
12677
13866
|
soc2Report,
|
|
12678
13867
|
statusAdvanced,
|
|
13868
|
+
stopOnNoProgress,
|
|
13869
|
+
stopOnRepeatedAction,
|
|
12679
13870
|
stripFencedJson,
|
|
12680
13871
|
stuckLoopView,
|
|
13872
|
+
subjectiveEval,
|
|
12681
13873
|
summarize,
|
|
12682
13874
|
summarizeHarnessResults,
|
|
13875
|
+
summarizePreferenceMemory,
|
|
12683
13876
|
summaryTable,
|
|
12684
13877
|
testJudge,
|
|
12685
13878
|
textInSnapshot,
|
|
@@ -12705,6 +13898,7 @@ export {
|
|
|
12705
13898
|
welchsTTest,
|
|
12706
13899
|
whitespaceCollapseMutator,
|
|
12707
13900
|
wilcoxonSignedRank,
|
|
13901
|
+
withAssignedFeedbackSplit,
|
|
12708
13902
|
wranglerDeployRunner
|
|
12709
13903
|
};
|
|
12710
13904
|
//# sourceMappingURL=index.js.map
|