@tangle-network/agent-eval 0.17.1 → 0.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -1
- package/dist/index.d.ts +1493 -1088
- package/dist/index.js +1487 -187
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -1386,6 +1386,1308 @@ function printDriverSummary(results) {
|
|
|
1386
1386
|
console.log(`${completedCount}/${results.length} personas completed`);
|
|
1387
1387
|
}
|
|
1388
1388
|
|
|
1389
|
+
// src/trace/emitter.ts
|
|
1390
|
+
/**
 * Writes the records of a single run (run row, spans, events, budget entries,
 * artifacts) to a trace store. Spans opened through `span()` are tracked on a
 * stack so later records can default their parent/owning span to the
 * innermost span that is still open.
 */
var TraceEmitter = class {
  store;
  // Ids of spans opened via `span()` and not yet ended/failed; last entry = innermost.
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param store   Object exposing appendRun/updateRun/appendSpan/updateSpan/
   *                appendEvent/appendBudgetEntry/appendArtifact.
   * @param options Optional overrides: `now` (timestamp source, defaults to
   *                `Date.now`), `id` (id generator, defaults to
   *                `cryptoRandomId`), `runId` (defaults to a generated id).
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id shared by every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run row with status "running" and returns the stored record. */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /** Closes the run: "failed" only when `outcome.pass === false`, otherwise "completed". */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Closes the run as "aborted" with a failing outcome carrying `reason` as notes. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, stores it, pushes it on the open-span stack and returns a
   * handle with `end` / `fail`.
   *
   * Fields in `init` override the generated `spanId`, `runId` and `startedAt`.
   * The parent is `init.parentSpanId` when that is non-nullish, otherwise the
   * innermost open span.
   */
  async span(init) {
    const generatedId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId: generatedId,
      parentSpanId: parent,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    // `...init` would otherwise clobber the computed parent when the caller
    // passes an explicit `parentSpanId: undefined` (or null).
    span.parentSpanId = parent;
    await this.store.appendSpan(span);
    // Track the id the span actually carries (init may override the generated
    // one) so that `pop(span.spanId)` in the handle can find it again.
    this.stack.push(span.spanId);
    return this.handle(span);
  }
  /** Builds the `{ span, end, fail }` handle returned by `span()`. */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          // Always unwind, even if the store write throws; a stale entry would
          // become the implicit parent of every later span/event.
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the open-span stack, if present. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  /** Opens a span with `kind: "llm"` unless `init` supplies its own kind. */
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  /** Opens a span with `kind: "tool"` unless `init` supplies its own kind. */
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  /** Opens a span with `kind: "retrieval"` unless `init` supplies its own kind. */
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Stores an already-finished "judge" span (same start and end timestamp,
   * status "ok") built from `verdict`. It is not pushed on the open-span
   * stack. Fields in `verdict` override the defaults.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  /** Opens a span with `kind: "sandbox"` unless `init` supplies its own kind. */
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /**
   * Appends an event. `spanId` defaults to the innermost open span and
   * `payload` defaults to `{}`. Returns the stored record.
   */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /**
   * Appends a budget ledger entry and, when `entry.breached` is truthy, also
   * emits a "budget_breach" event on the same span. Returns the stored entry.
   */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact; fields in `artifact` override the generated `artifactId` / `runId`. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
|
|
1548
|
+
/**
 * Produces an id string: a UUID from `globalThis.crypto.randomUUID` when that
 * function exists, otherwise a base-36 timestamp joined by "-" to up to eight
 * base-36 characters derived from `Math.random()`.
 */
function cryptoRandomId() {
  const hasNativeUuid = typeof globalThis.crypto?.randomUUID === "function";
  if (hasNativeUuid) {
    return globalThis.crypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
|
|
1552
|
+
/**
 * Maps a provider call description onto the flat field set used for LLM spans.
 * `name` falls back to the model; token counts are read from `args.usage`
 * when it is present and are `undefined` otherwise.
 */
function llmSpanFromProvider(args) {
  const { model, messages, output, usage, costUsd, finishReason } = args;
  const spanFields = {
    name: args.name ?? model,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
  return spanFields;
}
|
|
1566
|
+
|
|
1567
|
+
// src/control-runtime.ts
|
|
1568
|
+
// Budget defaults that runAgentControlLoop merges underneath `config.budget`:
// at most 8 steps and 5 minutes (300000 ms) of wall-clock time.
var DEFAULT_BUDGET = {
  maxSteps: 8,
  maxWallMs: 300000
};
|
|
1572
|
+
/**
 * Drives an observe → validate → (stop?) → decide → act loop until a stop
 * condition fires, an error occurs, the run is aborted, or the step budget is
 * used up.
 *
 * Reads from `config`:
 *  - callbacks `observe`, `validate`, `decide`, `act`, and optional
 *    `shouldStop`, `getActionCostUsd`, `onStep`;
 *  - `intent`, and optional `budget` (merged over DEFAULT_BUDGET),
 *    `stopPolicies`, `actionFailure` ("continue" by default; "stop" ends the
 *    run on the first failed action), `signal` (upstream AbortSignal),
 *    `store` (enables tracing through a TraceEmitter), `scenarioId`,
 *    `projectId`, `variantId`.
 *
 * Every exit path returns `finish(emitter, result)`. Errors thrown by the
 * callbacks are recorded in `result.runtimeErrors` and turn into a
 * `stoppedBy: "runtime-error"` result; trace and `onStep` failures are
 * recorded as well but never end the run by themselves.
 *
 * `result.reason` for a runtime error is always the message of the error
 * that ended the run, even when best-effort trace/onStep writes have appended
 * their own entries to `runtimeErrors` before or after it.
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  const actionFailure = config.actionFailure ?? "continue";
  // Internal controller: aborted by the upstream signal or by the wall timer.
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  let spentCostUsd = 0;
  const runtimeErrors = [];
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  // Records a runtime error and hands back its message, so the caller reports
  // exactly this error instead of whatever sits first/last in `runtimeErrors`.
  const noteError = (stage, index, err) => {
    const entry = runtimeError(stage, index, err);
    runtimeErrors.push(entry);
    return entry.message;
  };
  // Builds the result object (fixed key order; `score` only when the caller
  // supplied it) and passes it to `finish`.
  const finishWith = (fields) => finish(emitter, {
    intent: config.intent,
    pass: fields.pass,
    completed: fields.completed,
    reason: fields.reason,
    ..."score" in fields ? { score: fields.score } : {},
    steps: history,
    finalState: fields.finalState,
    finalEvals: fields.finalEvals,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    failureClass: fields.failureClass,
    runtimeErrors,
    stoppedBy: fields.stoppedBy
  });
  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }
    let state;
    let evals;
    // Initial observation and validation, before any step is taken.
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      const reason = noteError("observe", 0, err);
      return finishWith({
        pass: false,
        completed: false,
        reason,
        finalState: void 0,
        finalEvals: [],
        failureClass: "unknown",
        stoppedBy: "runtime-error"
      });
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      const reason = noteError("validate", 0, err);
      return finishWith({
        pass: false,
        completed: false,
        reason,
        finalState: state,
        finalEvals: [],
        failureClass: "unknown",
        stoppedBy: "runtime-error"
      });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      // Abort (upstream signal or wall timer) is only checked at step boundaries.
      if (controller.signal.aborted) {
        return finishWith({
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          finalState: state,
          finalEvals: evals,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return finishWith({
          pass: false,
          completed: false,
          reason: budgetStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      // Pre-step stop check: custom policy if provided, otherwise the default.
      let stop;
      try {
        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
      } catch (err) {
        const reason = noteError("stop-policy", stepIndex, err);
        return finishWith({
          pass: false,
          completed: false,
          reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      if (stop.stop) {
        return finishWith({
          pass: stop.pass,
          completed: true,
          reason: stop.reason,
          score: stop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: stop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        const reason = noteError("decide", stepIndex, err);
        return finishWith({
          pass: false,
          completed: false,
          reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      // The policy itself may ask to stop instead of acting.
      if (decision.type === "stop") {
        return finishWith({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          finalState: state,
          finalEvals: evals,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      // Repeated-action guard: counts consecutive identical action fingerprints.
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return finishWith({
          pass: false,
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      // One tool span per step; undefined when tracing is off or the write failed.
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Snapshot of this step for `history` / `onStep`; `evals` and
      // `actionOutcome` are read at call time.
      const stepRecord = (afterState) => ({
        index: stepIndex,
        decision,
        beforeState,
        afterState,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only positive, finite costs are added to the running total.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        actionOutcome = {
          ok: false,
          error: noteError("act", stepIndex, err),
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") {
          await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
          const failedStep = stepRecord(state);
          history.push(failedStep);
          await runOnStep(config.onStep, failedStep, runtimeErrors);
          return finishWith({
            pass: false,
            completed: false,
            reason: actionOutcome.error ?? "action failed",
            score: averageScore(evals),
            finalState: state,
            finalEvals: evals,
            failureClass: "unknown",
            stoppedBy: "runtime-error"
          });
        }
        // actionFailure === "continue": fall through and re-observe.
      }
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        const reason = noteError("observe", stepIndex, err);
        const failedStep = stepRecord(beforeState);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(reason));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return finishWith({
          pass: false,
          completed: false,
          reason,
          score: averageScore(evals),
          finalState: beforeState,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        const reason = noteError("validate", stepIndex, err);
        const failedStep = stepRecord(state);
        history.push(failedStep);
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(reason));
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return finishWith({
          pass: false,
          completed: false,
          reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      // No-progress guard: compares state fingerprints and average scores.
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = stepRecord(state);
      history.push(step);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        }));
      }
      await runOnStep(config.onStep, step, runtimeErrors);
      if (noProgressStop.stop) {
        return finishWith({
          pass: false,
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return finishWith({
          pass: false,
          completed: false,
          reason: postStepBudgetStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      // Post-step stop check, so a goal reached on the last allowed step is
      // reported as a stop-policy result rather than as step-budget exhaustion.
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      let postStepStop;
      try {
        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
      } catch (err) {
        const reason = noteError("stop-policy", stepIndex + 1, err);
        return finishWith({
          pass: false,
          completed: false,
          reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "unknown",
          stoppedBy: "runtime-error"
        });
      }
      if (postStepStop.stop) {
        return finishWith({
          pass: postStepStop.pass,
          completed: true,
          reason: postStepStop.reason,
          score: postStepStop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: postStepStop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
    }
    // Loop ran out of steps without any stop condition firing.
    return finishWith({
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      finalState: state,
      finalEvals: evals,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything not handled by a dedicated try/catch above is recorded under
    // the "act" stage, indexed by the number of completed steps.
    const reason = noteError("act", history.length, err);
    return finishWith({
      pass: false,
      completed: false,
      reason,
      finalState: void 0,
      finalEvals: [],
      failureClass: "unknown",
      stoppedBy: "runtime-error"
    });
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
|
|
2093
|
+
/**
 * Builds a stop-policy object: a shallow copy of `options` with
 * `maxNoProgressSteps` set to the given value (the argument wins over any
 * same-named key in `options`).
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = { ...options };
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
|
|
2096
|
+
/**
 * Builds a stop-policy object: a shallow copy of `options` with
 * `maxRepeatedActions` set to the given value (the argument wins over any
 * same-named key in `options`).
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = { ...options };
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
|
|
2099
|
+
/** Returns a shallow copy of `input` flagged with `objective: true`. */
function objectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = true;
  return tagged;
}
|
|
2102
|
+
/** Returns a shallow copy of `input` flagged with `objective: false`. */
function subjectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = false;
  return tagged;
}
|
|
2105
|
+
/**
 * True when no eval result is both failing and of severity "critical" or
 * "error". Failing results of any other severity do not block.
 */
function allCriticalPassed(evals) {
  const isBlocking = (result) => {
    if (result.passed) return false;
    return result.severity === "critical" || result.severity === "error";
  };
  return !evals.some(isBlocking);
}
|
|
2108
|
+
/**
 * Assembles the context object handed to the loop's callbacks.
 * `wallMs` is the time elapsed since `started`; `remainingCostUsd` is
 * `undefined` when the budget has no `maxCostUsd`, otherwise the unspent
 * amount clamped at 0.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const costCap = budget.maxCostUsd;
  let remainingCostUsd;
  if (costCap !== void 0) {
    remainingCostUsd = Math.max(0, costCap - spentCostUsd);
  }
  const wallMs = Date.now() - started;
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
|
|
2123
|
+
/**
 * Stop decision used when no custom `shouldStop` is configured: never stops
 * on an empty eval list, and otherwise stops (passing) exactly when
 * `allCriticalPassed(evals)` holds. The average score is attached whenever
 * there are evals.
 */
function defaultStopDecision(evals) {
  if (!evals.length) {
    return { stop: false, pass: false, reason: "no evals yet" };
  }
  const pass = allCriticalPassed(evals);
  const score = averageScore(evals);
  const reason = pass ? "all critical evals passed" : "critical evals still failing";
  return { stop: pass, pass, reason, score };
}
|
|
2128
|
+
/**
 * Mean of the numeric `score` fields in `evals`, rounded to three decimals.
 * Results whose score is not of type "number" are skipped; returns
 * `undefined` when none has a numeric score.
 */
function averageScore(evals) {
  let total = 0;
  let count = 0;
  for (const result of evals) {
    const value = result.score;
    if (typeof value !== "number") continue;
    total += value;
    count += 1;
  }
  if (count === 0) return void 0;
  const mean = total / count;
  return Math.round(mean * 1e3) / 1e3;
}
|
|
2133
|
+
/**
 * Cost-budget check: stops once `spentCostUsd` has reached `budget.maxCostUsd`.
 * A budget without `maxCostUsd` never stops here.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const costCap = budget.maxCostUsd;
  const exhausted = costCap !== void 0 && spentCostUsd >= costCap;
  if (!exhausted) {
    return { stop: false, reason: "" };
  }
  return {
    stop: true,
    reason: `budget exhausted: maxCostUsd=${costCap}`
  };
}
|
|
2142
|
+
/**
 * Writes a "usd" budget ledger entry through `runTrace`. Does nothing when
 * tracing is off (`emitter` falsy) or the budget has no `maxCostUsd`.
 * The entry is attached to the span of `handle` when one is given.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  if (!emitter) return;
  const costCap = budget.maxCostUsd;
  if (costCap === void 0) return;
  const writeEntry = () => emitter.recordBudget({
    dimension: "usd",
    limit: costCap,
    consumed: spentCostUsd,
    remaining: Math.max(0, costCap - spentCostUsd),
    breached: spentCostUsd >= costCap,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeEntry);
}
|
|
2154
|
+
/**
 * Records one judge span per eval result, sequentially, each through
 * `runTrace`. No-op when tracing is off (`emitter` falsy).
 *
 * The judge id is "objective-validator" for results flagged `objective`,
 * otherwise "subjective-judge". The target is `targetSpanId`, falling back to
 * the emitter's run id. A result without a numeric score is scored 1 when it
 * passed and 0 otherwise.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  // Built lazily inside the runTrace callback, so a throw while reading the
  // result goes through runTrace as well.
  const toVerdict = (result) => {
    const numericScore = typeof result.score === "number";
    return {
      judgeId: result.objective ? "objective-validator" : "subjective-judge",
      targetSpanId: targetSpanId ?? emitter.runId,
      name: `control-eval/${result.id}`,
      dimension: result.id,
      score: numericScore ? result.score : result.passed ? 1 : 0,
      rationale: result.detail,
      evidence: result.evidence,
      attributes: {
        phase,
        passed: result.passed,
        severity: result.severity,
        objective: result.objective
      }
    };
  };
  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(toVerdict(result)));
  }
}
|
|
2174
|
+
/**
 * Invokes the optional per-step callback, shielding the caller from its failures.
 *
 * @param {Function|undefined} onStep - Callback to run; skipped when falsy.
 * @param {{ index: number }} step - Step passed to the callback.
 * @param {Array} runtimeErrors - Receives an "on-step" runtime error if the callback throws or rejects.
 * @returns {Promise<void>}
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (hookError) {
      runtimeErrors.push(runtimeError("on-step", step.index, hookError));
    }
  }
}
|
|
2182
|
+
/**
 * Runs a trace write and converts any failure into a collected runtime error.
 *
 * @param {Array} runtimeErrors - Receives a "trace" runtime error when `write` fails.
 * @param {number} stepIndex - Step index attached to the collected error.
 * @param {Function} write - Performs the write; may be sync or async.
 * @returns {Promise<*>} The value produced by `write`, or `undefined` if it failed.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let written;
  try {
    written = await write();
  } catch (traceError) {
    runtimeErrors.push(runtimeError("trace", stepIndex, traceError));
    written = void 0;
  }
  return written;
}
|
|
2190
|
+
/**
 * Tracks how many consecutive steps left both the state fingerprint and the
 * score unchanged, and signals a stop once that streak reaches the limit.
 *
 * Reads from `args`: `policies.maxNoProgressSteps`, `policies.minScoreDelta`
 * (default 1e-3), `scoreBefore`/`scoreAfter` (missing values count as 0),
 * `lastStateFingerprint`, `stateFingerprint` and `currentStreak`.
 *
 * @param {object} args - Inputs described above.
 * @returns {{ stop: boolean, reason: string, streak: number }} The updated streak and
 *   the stop decision; the check is disabled (streak 0) when the limit is unset or not positive.
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) {
    return { stop: false, reason: "", streak: 0 };
  }
  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const after = args.scoreAfter ?? 0;
  const before = args.scoreBefore ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  // A missing previous fingerprint (first step) can never count as "unchanged".
  const stateUnchanged = args.lastStateFingerprint !== void 0 && args.lastStateFingerprint === args.stateFingerprint;
  let streak = 0;
  if (stateUnchanged && scoreFlat) {
    streak = args.currentStreak + 1;
  }
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
|
|
2200
|
+
/**
 * Signals a stop when the same action has been repeated too many times in a row.
 *
 * @param {{ maxRepeatedActions?: number }|undefined} policies - Stuck-detection policies.
 * @param {number} streak - Current count of consecutive identical actions.
 * @returns {{ stop: boolean, reason: string }} Stop decision; never stops when the
 *   limit is unset or not positive.
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const withinLimit = !limit || limit <= 0 || streak < limit;
  return withinLimit
    ? { stop: false, reason: "" }
    : { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` };
}
|
|
2208
|
+
/**
 * Produces the fingerprint used to detect an unchanged state.
 *
 * @param {*} state - State to fingerprint.
 * @param {{ stateFingerprint?: Function }|undefined} policies - An optional custom
 *   fingerprint function takes precedence over the default.
 * @returns {*} The custom function's result, or the default stable fingerprint.
 */
function fingerprintState(state, policies) {
  const hasCustom = Boolean(policies?.stateFingerprint);
  return hasCustom ? policies.stateFingerprint(state) : stableFingerprint(state);
}
|
|
2212
|
+
/**
 * Produces the fingerprint used to detect a repeated action.
 *
 * @param {*} action - Action to fingerprint.
 * @param {{ actionFingerprint?: Function }|undefined} policies - An optional custom
 *   fingerprint function takes precedence over the default.
 * @returns {*} The custom function's result, or the default stable fingerprint.
 */
function fingerprintAction(action, policies) {
  const hasCustom = Boolean(policies?.actionFingerprint);
  return hasCustom ? policies.actionFingerprint(action) : stableFingerprint(action);
}
|
|
2216
|
+
/**
 * Turns an arbitrary value into a string fingerprint that does not depend on
 * object key order.
 *
 * Strings are returned as-is; numbers, booleans, `null` and `undefined` are
 * stringified with `String`. Everything else is JSON-serialized after its keys
 * have been sorted.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Always a string, including for values JSON cannot represent.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    const serialized = JSON.stringify(sortForFingerprint(value));
    // JSON.stringify yields `undefined` (not a string) for functions and symbols;
    // fall back to String() so callers always get a comparable string.
    return serialized ?? String(value);
  } catch {
    // Unserializable input (e.g. BigInt, circular structure): best-effort fallback.
    return String(value);
  }
}
|
|
2225
|
+
/**
 * Returns a deep copy of `value` in which every plain object has its keys in
 * sorted order, so that serializing it is independent of insertion order.
 *
 * Arrays keep their element order; primitives and falsy values are returned as-is.
 *
 * @param {*} value - Value to normalize.
 * @returns {*} The normalized copy.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  // Object.fromEntries defines own data properties, so an own "__proto__" key
  // (possible after JSON.parse) is kept instead of silently becoming the prototype.
  return Object.fromEntries(
    Object.keys(value).sort().map((key) => [key, sortForFingerprint(value[key])])
  );
}
|
|
2235
|
+
/**
 * Extracts a human-readable reason from an abort signal.
 *
 * @param {{ reason?: * }} signal - Object carrying a `reason` (e.g. an AbortSignal).
 * @returns {string} The error message for Error reasons, the stringified reason for
 *   other truthy values, or "aborted" when the reason is falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
|
|
2240
|
+
/**
 * Builds the plain record used to report a non-fatal runtime failure.
 *
 * @param {string} phase - Where the failure happened (e.g. "trace", "on-step").
 * @param {number} stepIndex - Step during which it happened.
 * @param {*} err - The thrown value.
 * @returns {{ phase: string, stepIndex: number, message: string }} The error record;
 *   `message` is the Error's message, or the stringified value for non-Errors.
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
|
|
2244
|
+
/**
 * Closes the traced run with the final outcome and hands the result back.
 *
 * The `endRun` write goes through `runTrace`, so a failing or missing emitter
 * never prevents the result from being returned.
 *
 * @param {object|undefined} emitter - Trace emitter exposing `endRun`.
 * @param {object} result - Final run result; `pass`, `score`, `finalEvals`,
 *   `failureClass`, `reason`, `runtimeErrors` and `steps` are read.
 * @returns {Promise<object>} The same `result` object.
 */
async function finish(emitter, result) {
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    // No explicit score: fall back to the average of the final evals.
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
|
|
2253
|
+
|
|
2254
|
+
// src/feedback-trajectory.ts
|
|
2255
|
+
// Default dataset split weights. They sum to 100; assignFeedbackSplit merges
// caller overrides on top of these and buckets by `hash % total`.
var DEFAULT_SPLIT_POLICY = {
  trainPct: 70,
  devPct: 15,
  testPct: 10,
  holdoutPct: 5
};
|
|
2261
|
+
/**
 * Feedback trajectory store backed by a Map.
 *
 * Every trajectory is cloned on the way in and on the way out, so callers can
 * never mutate stored state through a reference they hold.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();
  /** Stores (or replaces) a trajectory under its `id`. */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }
  /** Returns a copy of the trajectory with the given id, or `null` when unknown. */
  async get(id) {
    const stored = this.trajectories.get(id);
    return stored ? cloneTrajectory(stored) : null;
  }
  /** Returns copies of all trajectories matching `filter` (all of them by default). */
  async list(filter = {}) {
    return [...this.trajectories.values()]
      .filter((trajectory) => matchesFilter(trajectory, filter))
      .map((trajectory) => cloneTrajectory(trajectory));
  }
  /**
   * Appends an attempt and sets `updatedAt` to the attempt's `createdAt`.
   *
   * @returns {Promise<object>} A copy of the updated trajectory.
   * @throws {Error} When the trajectory id is unknown.
   */
  async appendAttempt(id, attempt) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const next = cloneTrajectory({
      ...trajectory,
      attempts: [...trajectory.attempts, attempt],
      updatedAt: attempt.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
  /**
   * Appends a label, either to one attempt's `feedback` (when `attemptId` is
   * given) or to the trajectory-level `labels`, and sets `updatedAt` to the
   * label's `createdAt`.
   *
   * @returns {Promise<object>} A copy of the updated trajectory.
   * @throws {Error} When the trajectory id is unknown, or when `attemptId` is
   *   given but no attempt has that id (previously the label was dropped silently).
   */
  async appendLabel(id, label, attemptId) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    let attempts = trajectory.attempts;
    let labels = trajectory.labels;
    if (attemptId) {
      if (!attempts.some((attempt) => attempt.id === attemptId)) {
        // Refuse rather than lose feedback: nothing would hold this label.
        throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown attempt "${attemptId}" in trajectory "${id}"`);
      }
      attempts = attempts.map((attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt);
    } else {
      labels = [...labels, label];
    }
    const next = cloneTrajectory({
      ...trajectory,
      attempts,
      labels,
      updatedAt: label.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
|
|
2298
|
+
/**
 * Feedback trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`).
 *
 * Reads are served from an in-memory store that is rebuilt once, on first use,
 * by replaying the log. Every mutation is applied in memory first (which also
 * validates it) and then appended to the log.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight replay shared by concurrent callers so the log is replayed only once.
  loading;
  constructor(options) {
    this.dir = options.dir;
  }
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }
  /** Appends one operation record to the log, creating the directory if needed. */
  async append(record) {
    const { appendFile, mkdir } = await import("fs/promises");
    const { join: join3 } = await import("path");
    await mkdir(this.dir, { recursive: true });
    await appendFile(join3(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }
  /**
   * Replays the log into memory exactly once.
   *
   * Concurrent first calls share a single replay; without that, each caller
   * would replay the log and duplicate attempts and labels. A failed replay is
   * not cached, so a later call retries.
   *
   * @throws The read error when the log exists but cannot be read.
   */
  async load() {
    if (this.loaded) return;
    this.loading ??= this.replayLog();
    try {
      await this.loading;
      this.loaded = true;
    } finally {
      this.loading = void 0;
    }
  }
  /** Reads the log file and applies each record to the in-memory store. */
  async replayLog() {
    const { readFile } = await import("fs/promises");
    const { join: join3 } = await import("path");
    const file = join3(this.dir, "feedback-trajectories.ndjson");
    let raw;
    try {
      raw = await readFile(file, "utf8");
    } catch (err) {
      // A missing log just means an empty store; anything else (permissions,
      // I/O failure) must surface rather than masquerade as "no data".
      if (err?.code === "ENOENT") return;
      throw err;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        switch (record.op) {
          case "save":
            await this.memory.save(record.trajectory);
            break;
          case "appendAttempt":
            await this.memory.appendAttempt(record.id, record.attempt);
            break;
          case "appendLabel":
            await this.memory.appendLabel(record.id, record.label, record.attemptId);
            break;
          default:
            break;
        }
      } catch {
        // Deliberate best-effort: skip a malformed or inapplicable record
        // (e.g. a torn final line) and keep replaying the rest of the log.
      }
    }
  }
};
|
|
2358
|
+
/**
 * Builds a feedback trajectory record from partial input.
 *
 * `createdAt` defaults to the current time in ISO format. `id` defaults to
 * `ft_<hex>`, where the hex is a hash of project, scenario, task intent and
 * `createdAt`. `attempts` and `labels` default to empty arrays.
 *
 * @param {object} input - Partial trajectory; `task.intent` is read when no `id` is given.
 * @returns {object} The trajectory record.
 */
function createFeedbackTrajectory(input) {
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id === void 0 || id === null) {
    const seed = `${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`;
    id = `ft_${stableHash(seed).toString(16)}`;
  }
  const { projectId, scenarioId, task, outcome, split, tags, metadata } = input;
  return {
    id,
    projectId,
    scenarioId,
    task,
    attempts: input.attempts ?? [],
    labels: input.labels ?? [],
    outcome,
    split,
    tags,
    createdAt,
    metadata
  };
}
|
|
2375
|
+
/**
 * Deterministically assigns a trajectory to a dataset split.
 *
 * The trajectory's project, scenario, id and task intent are hashed and the
 * bucket is `hash % total`, where `total` is the sum of the four weights.
 * Because of the modulo, weights are expected to be whole numbers: fractional
 * weights summing to 1 would put every trajectory in the first bucket.
 *
 * @param {object} trajectory - Trajectory to assign; `projectId`, `scenarioId`,
 *   `id` and `task.intent` are read.
 * @param {object} [policy] - Overrides for `trainPct`, `devPct`, `testPct`,
 *   `holdoutPct`. Entries that are `undefined` keep the default weight.
 * @returns {"train"|"dev"|"test"|"holdout"} The assigned split.
 * @throws {Error} When a weight is not a finite, non-negative number, or the
 *   weights do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  // An explicit `undefined` override would otherwise erase the default and turn
  // the total into NaN, silently sending every trajectory to "holdout".
  const overrides = Object.fromEntries(
    Object.entries(policy ?? {}).filter(([, weight]) => weight !== void 0)
  );
  const split = { ...DEFAULT_SPLIT_POLICY, ...overrides };
  const weights = [split.trainPct, split.devPct, split.testPct, split.holdoutPct];
  if (!weights.every((weight) => Number.isFinite(weight) && weight >= 0)) {
    throw new Error("assignFeedbackSplit: split percentages must be finite, non-negative numbers");
  }
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
  if (bucket < split.trainPct) return "train";
  if (bucket < split.trainPct + split.devPct) return "dev";
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
  return "holdout";
}
|
|
2385
|
+
/**
 * Returns a shallow copy of the trajectory that is guaranteed to carry a split.
 *
 * @param {object} trajectory - Source trajectory; an existing `split` is kept.
 * @param {object} [policy] - Split weights used only when a split must be assigned.
 * @returns {object} The copy with `split` set.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return { ...trajectory, split };
}
|
|
2391
|
+
/**
 * Wraps a feedback trajectory as a dataset scenario.
 *
 * The scenario id is the trajectory's `scenarioId`, falling back to its `id`.
 * A split is assigned if missing. The trajectory (with split) is the payload.
 * Tags are layered: `projectId` (when set), then the trajectory's own tags,
 * then `source: "feedback-trajectory"`, which always wins.
 *
 * @param {object} trajectory - Trajectory to convert.
 * @returns {{ id: string, split: string, payload: object, tags: object }} The scenario.
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const withSplit = withAssignedFeedbackSplit(trajectory);
  const projectTag = withSplit.projectId ? { projectId: withSplit.projectId } : {};
  const ownTags = withSplit.tags ?? {};
  const tags = { ...projectTag, ...ownTags, source: "feedback-trajectory" };
  return {
    id: withSplit.scenarioId ?? withSplit.id,
    split: withSplit.split,
    payload: withSplit,
    tags
  };
}
|
|
2404
|
+
/**
 * Converts each trajectory to a dataset scenario, preserving order.
 *
 * @param {object[]} trajectories - Trajectories to convert.
 * @returns {object[]} One scenario per trajectory.
 */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToDatasetScenario(trajectory));
}
|
|
2407
|
+
/**
 * Flattens a feedback trajectory into a row for an optimizer.
 *
 * The score is the recorded outcome score when there is one, otherwise a score
 * derived from the trajectory's labels.
 *
 * @param {object} trajectory - Trajectory to flatten.
 * @returns {object} Row with `scenarioId`, `trajectoryId`, the distinct
 *   `labelKinds`, `score`, and a `metadata` summary.
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const distinctKinds = new Set(labels.map((label) => label.kind));
  const score = trajectory.outcome?.score ?? scoreFromLabels(labels);
  const metadata = {
    projectId: trajectory.projectId,
    split: trajectory.split,
    intent: trajectory.task.intent,
    attempts: trajectory.attempts.length,
    outcome: trajectory.outcome,
    labels
  };
  return {
    scenarioId: trajectory.scenarioId ?? trajectory.id,
    trajectoryId: trajectory.id,
    labelKinds: [...distinctKinds],
    score,
    metadata
  };
}
|
|
2424
|
+
/**
 * Converts each trajectory to an optimizer row, preserving order.
 *
 * @param {object[]} trajectories - Trajectories to convert.
 * @returns {object[]} One row per trajectory.
 */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  return trajectories.map((trajectory) => feedbackTrajectoryToOptimizerRow(trajectory));
}
|
|
2427
|
+
/**
 * Replays one trajectory through an adapter and never rejects.
 *
 * On success the adapter's result is returned with `trajectoryId` added (a
 * `trajectoryId` in the adapter result takes precedence). If the adapter throws,
 * a failing result is synthesized: a system "reject" label, a zero-score
 * outcome and `metadata.replayError: true`, all carrying the error message.
 *
 * @param {{ id: string }} trajectory - Trajectory to replay.
 * @param {{ replay: Function }} adapter2 - Adapter whose `replay(trajectory)` is awaited.
 * @returns {Promise<object>} The replay result.
 */
async function replayFeedbackTrajectory(trajectory, adapter2) {
  // Builds the synthetic result reported when the adapter fails.
  const failureResult = (err) => {
    const createdAt = (/* @__PURE__ */ new Date()).toISOString();
    const message = err instanceof Error ? err.message : String(err);
    const rejectLabel = {
      source: "system",
      kind: "reject",
      value: false,
      reason: message,
      severity: "error",
      createdAt
    };
    return {
      trajectoryId: trajectory.id,
      pass: false,
      labels: [rejectLabel],
      outcome: {
        success: false,
        score: 0,
        detail: message,
        observedAt: createdAt
      },
      metadata: { replayError: true }
    };
  };
  try {
    const replayed = await adapter2.replay(trajectory);
    return { trajectoryId: trajectory.id, ...replayed };
  } catch (err) {
    return failureResult(err);
  }
}
|
|
2458
|
+
/**
 * Replays trajectories one after another (not in parallel), in input order.
 *
 * @param {Iterable<object>} trajectories - Trajectories to replay.
 * @param {{ replay: Function }} adapter2 - Adapter used for every replay.
 * @returns {Promise<object[]>} One replay result per trajectory.
 */
async function replayFeedbackTrajectories(trajectories, adapter2) {
  const collected = [];
  for (const trajectory of trajectories) {
    const replayed = await replayFeedbackTrajectory(trajectory, adapter2);
    collected.push(replayed);
  }
  return collected;
}
|
|
2465
|
+
/**
 * Distills labels across trajectories into a ranked list of preference entries.
 *
 * Each label that yields an instruction becomes a candidate. Candidates whose
 * instructions match after lower-casing and whitespace normalization are
 * merged, keeping the heaviest one (the earliest wins ties). The result is
 * sorted by descending weight and truncated.
 *
 * @param {Iterable<object>} trajectories - Trajectories to mine.
 * @param {{ maxEntries?: number }} [options] - `maxEntries` caps the result (default 20).
 * @returns {object[]} Preference entries, strongest first.
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const maxEntries = options.maxEntries ?? 20;
  const strongest = /* @__PURE__ */ new Map();
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (!instruction) continue;
      const candidate = {
        instruction,
        rationale: label.reason ?? `${label.kind} label from ${label.source}`,
        weight: weightForLabel(label),
        sourceTrajectoryId: trajectory.id,
        sourceLabelId: label.id,
        category: label.kind
      };
      const key = candidate.instruction.toLowerCase().replace(/\s+/g, " ").trim();
      const current = strongest.get(key);
      if (!current || candidate.weight > current.weight) {
        strongest.set(key, candidate);
      }
    }
  }
  const ranked = [...strongest.values()];
  ranked.sort((a, b) => b.weight - a.weight);
  return ranked.slice(0, maxEntries);
}
|
|
2490
|
+
/**
 * Renders preference entries as a Markdown document.
 *
 * @param {Array<{ instruction: string, rationale: string, sourceTrajectoryId: string }>} entries
 *   Entries to render, in order.
 * @returns {string} A "# Preference Memory" heading followed by one bullet per
 *   entry (with indented rationale and source lines), ending in a single newline.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const body = [];
  for (const { instruction, rationale, sourceTrajectoryId } of entries) {
    body.push(
      `- ${instruction}`,
      `  Rationale: ${rationale}`,
      `  Source: ${sourceTrajectoryId}`,
      ""
    );
  }
  const document = ["# Preference Memory", "", ...body].join("\n");
  return `${document.trim()}\n`;
}
|
|
2500
|
+
/**
 * Serializes trajectories as canonical JSONL: one trajectory per line, lines
 * sorted by trajectory id, object keys sorted within each line.
 *
 * Ids are compared by UTF-16 code unit rather than with `localeCompare`, so the
 * output is byte-identical regardless of the runtime's locale. The input array
 * is not modified.
 *
 * @param {Array<{ id: string }>} trajectories - Trajectories to serialize.
 * @returns {string} The JSONL text, always ending in a newline ("\n" for an empty list).
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const byId = (a, b) => {
    if (a.id < b.id) return -1;
    if (a.id > b.id) return 1;
    return 0;
  };
  const lines = trajectories
    .slice()
    .sort(byId)
    .map((trajectory) => JSON.stringify(canonicalize(trajectory)));
  return lines.join("\n") + "\n";
}
|
|
2503
|
+
/**
 * Parses JSONL text into trajectories, skipping blank lines.
 *
 * @param {string} jsonl - Newline-separated JSON documents.
 * @returns {object[]} The parsed values, in line order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   1-based line number and the original error is available as `cause`.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  const trajectories = [];
  const lines = jsonl.split("\n");
  for (let index = 0; index < lines.length; index += 1) {
    const line = lines[index];
    if (!line.trim()) continue;
    try {
      trajectories.push(JSON.parse(line));
    } catch (err) {
      // Keep the SyntaxError type callers may already catch, but say where it happened.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`parseFeedbackTrajectoriesJsonl: invalid JSON on line ${index + 1}: ${detail}`, { cause: err });
    }
  }
  return trajectories;
}
|
|
2511
|
+
/**
 * Converts a finished control-loop run into a feedback trajectory.
 *
 * Each run step becomes an attempt, a single system label records the overall
 * verdict ("approve" / "reject"), and the outcome carries score, cost and stop
 * metadata. The trajectory id is the run id, or `ft_control_<hex>` hashed from
 * the intent and creation time when the run has no id.
 *
 * @param {object} run - Control run result (`intent`, `steps`, `pass`, `score`,
 *   `reason`, `spentCostUsd`, `stoppedBy`, `failureClass`, optional `runId`).
 * @param {object} [options] - `createdAt`, `projectId`, `scenarioId`,
 *   `artifactType` (default "action"), and optional `artifactFromStep` /
 *   `proposedActionFromStep` extractors.
 * @returns {object} The feedback trajectory.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;
  const toAttempt = (step) => ({
    id: `${trajectoryId}_step_${step.index}`,
    stepIndex: step.index,
    artifactType: options.artifactType ?? "action",
    // Prefer the caller's extractor, then the action result, then the raw decision.
    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
    proposedAction: options.proposedActionFromStep?.(step),
    evals: step.evalsAfter,
    createdAt: step.startedAt,
    metadata: {
      decision: step.decision,
      actionOutcome: step.actionOutcome
    }
  });
  const verdictLabel = {
    source: "system",
    kind: run.pass ? "approve" : "reject",
    value: run.pass,
    reason: run.reason,
    severity: run.pass ? "info" : "error",
    createdAt
  };
  const outcome = {
    success: run.pass,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };
  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts: run.steps.map((step) => toAttempt(step)),
    labels: [verdictLabel],
    outcome
  });
}
|
|
2556
|
+
/**
 * Collects the trajectory-level labels followed by every attempt's feedback
 * labels, dropping duplicates.
 *
 * Two labels are duplicates when they share an `id`, or — for labels without an
 * id — the same source, kind, creation time and JSON-serialized value. The
 * first occurrence is kept.
 *
 * @param {{ labels: object[], attempts: Array<{ feedback?: object[] }> }} trajectory
 * @returns {object[]} The distinct labels, in first-seen order.
 */
function allLabels(trajectory) {
  const combined = [
    ...trajectory.labels,
    ...trajectory.attempts.flatMap((attempt) => attempt.feedback ?? [])
  ];
  const seenKeys = /* @__PURE__ */ new Set();
  const distinct = [];
  for (const label of combined) {
    const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    distinct.push(label);
  }
  return distinct;
}
|
|
2569
|
+
/**
 * Derives a 0..1 score from labels.
 *
 * "approve" and "select" count as 1, "reject" and "policy_block" as 0, and a
 * numeric "rate" value is clamped to [0, 1]. Other labels are ignored. The
 * result is the mean of the counted labels, rounded to three decimals.
 *
 * @param {Array<{ kind: string, value?: * }>} labels - Labels to score.
 * @returns {number|undefined} The score, or `undefined` when no label contributes one.
 */
function scoreFromLabels(labels) {
  if (!labels.length) return void 0;
  const scored = [];
  for (const label of labels) {
    if (label.kind === "approve" || label.kind === "select") {
      scored.push(1);
    } else if (label.kind === "reject" || label.kind === "policy_block") {
      scored.push(0);
    } else if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) {
      // NaN is typeof "number" but would turn the whole average into NaN; skip it.
      scored.push(Math.max(0, Math.min(1, label.value)));
    }
  }
  if (!scored.length) return void 0;
  const total = scored.reduce((sum2, value) => sum2 + value, 0);
  return Math.round(total / scored.length * 1e3) / 1e3;
}
|
|
2580
|
+
/**
 * Turns a label into a reusable instruction sentence, when it carries a reason.
 *
 * Only "reject", "revision_request", "select", "approve" and "comment" labels
 * with a truthy `reason` produce an instruction. Where the task intent is
 * quoted, it is shortened to 80 characters.
 *
 * @param {{ task: { intent: string } }} trajectory - Trajectory the label belongs to.
 * @param {{ kind: string, reason?: string }} label - Label to convert.
 * @returns {string|undefined} The instruction, or `undefined` when none applies.
 */
function instructionFromLabel(trajectory, label) {
  const { kind, reason } = label;
  // Every supported kind requires a reason.
  if (!reason) return void 0;
  switch (kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${reason}`;
    case "revision_request":
      return `Revise similar work by applying: ${reason}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${reason}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${reason}`;
    case "comment":
      return reason;
    default:
      return void 0;
  }
}
|
|
2588
|
+
/**
 * Ranks a label's importance as severity weight times source weight.
 *
 * Severity: critical 4, error 3, warning 2, anything else 1.
 * Source: user 3, metric or environment 2, anything else 1.
 *
 * @param {{ severity?: string, source?: string }} label - Label to weigh.
 * @returns {number} The weight, between 1 and 12.
 */
function weightForLabel(label) {
  let severityWeight;
  switch (label.severity) {
    case "critical":
      severityWeight = 4;
      break;
    case "error":
      severityWeight = 3;
      break;
    case "warning":
      severityWeight = 2;
      break;
    default:
      severityWeight = 1;
  }
  let sourceWeight;
  switch (label.source) {
    case "user":
      sourceWeight = 3;
      break;
    case "metric":
    case "environment":
      sourceWeight = 2;
      break;
    default:
      sourceWeight = 1;
  }
  return severityWeight * sourceWeight;
}
|
|
2593
|
+
/**
 * Tests a trajectory against a list filter.
 *
 * Each of `projectId`, `scenarioId` and `split` is only checked when truthy in
 * the filter and must then equal the trajectory's value. `tag` is a
 * `[key, value]` pair that must match the trajectory's tag of that key.
 *
 * @param {object} trajectory - Trajectory to test.
 * @param {object} filter - Filter criteria; an empty filter matches everything.
 * @returns {boolean} Whether the trajectory satisfies every active criterion.
 */
function matchesFilter(trajectory, filter) {
  const rejects = (wanted, actual) => Boolean(wanted) && actual !== wanted;
  if (
    rejects(filter.projectId, trajectory.projectId) ||
    rejects(filter.scenarioId, trajectory.scenarioId) ||
    rejects(filter.split, trajectory.split)
  ) {
    return false;
  }
  if (!filter.tag) return true;
  const [tagKey, tagValue] = filter.tag;
  return trajectory.tags?.[tagKey] === tagValue;
}
|
|
2603
|
+
/**
 * Deep-copies a trajectory via a JSON round trip.
 *
 * As with any JSON round trip, `undefined` properties are dropped and only
 * JSON-representable data survives.
 *
 * @param {object} trajectory - Trajectory to copy.
 * @returns {object} An independent copy.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
|
|
2606
|
+
/**
 * Collapses whitespace runs to single spaces, trims, and truncates to `max`
 * characters with a trailing "..." when the text is longer.
 *
 * @param {string} value - Text to shorten.
 * @param {number} max - Maximum number of characters kept before the ellipsis.
 * @returns {string} The normalized, possibly truncated text.
 */
function compact(value, max) {
  const normalized = value.split(/\s+/).join(" ").trim();
  if (normalized.length > max) {
    const head = normalized.slice(0, max).trim();
    return `${head}...`;
  }
  return normalized;
}
|
|
2610
|
+
/**
 * Hashes a string to an unsigned 32-bit integer with FNV-1a, applied to the
 * string's UTF-16 code units.
 *
 * @param {string} input - Text to hash.
 * @returns {number} The hash, in the range 0..2^32-1.
 */
function stableHash(input) {
  const offsetBasis = 2166136261;
  const prime = 16777619;
  let hash = offsetBasis;
  let index = 0;
  while (index < input.length) {
    // XOR in the code unit, then multiply with 32-bit wraparound.
    hash = Math.imul(hash ^ input.charCodeAt(index), prime);
    index += 1;
  }
  return hash >>> 0;
}
|
|
2618
|
+
/**
 * Returns a deep copy of `value` with the keys of every object in sorted order,
 * so that serializing it gives the same text regardless of insertion order.
 *
 * Arrays keep their element order; primitives and `null` are returned as-is.
 *
 * @param {*} value - Value to canonicalize.
 * @returns {*} The canonical copy.
 */
function canonicalize(value) {
  if (value === null || typeof value !== "object") return value;
  if (Array.isArray(value)) return value.map((item) => canonicalize(item));
  // Object.fromEntries defines own data properties, so an own "__proto__" key
  // (possible after JSON.parse) is serialized instead of silently being dropped.
  return Object.fromEntries(
    Object.keys(value).sort().map((key) => [key, canonicalize(value[key])])
  );
}
|
|
2627
|
+
|
|
2628
|
+
// src/action-policy.ts
|
|
2629
|
+
/**
 * Evaluates a proposed action against a policy.
 *
 * Rules are checked in a fixed order and every rule that fires adds a reason:
 * type allow-list, type block-list, types that always need approval, external
 * side effects, approval cost threshold, per-action cost cap, remaining budget,
 * required expected outcome, required kill criteria. A missing `costUsd`
 * counts as 0. `autoApproveTypes` never waives approval; it only adds an
 * explanatory reason when approval is required anyway.
 *
 * @param {object} action - Proposed action (`type`, optional `costUsd`,
 *   `requiresApproval`, `externalSideEffect`, `metadata`).
 * @param {object} [policy] - Policy rules; an empty policy allows everything.
 * @param {{ createdAt?: string }} [options] - `createdAt` overrides the label timestamp.
 * @returns {{ allowed: boolean, blocked: boolean, requiresApproval: boolean, reasons: string[], label: object|undefined }}
 *   The decision. `requiresApproval` is reported as false for blocked actions;
 *   `label` is a policy label when the action is blocked or needs approval.
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);
  const block = (why) => {
    blocked = true;
    reasons.push(why);
  };
  const demandApproval = (why) => {
    requiresApproval = true;
    reasons.push(why);
  };
  const costExceeds = (limit) => limit !== void 0 && (action.costUsd ?? 0) > limit;

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    block(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    block(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    demandApproval(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    demandApproval("external side effect requires approval");
  }
  if (costExceeds(policy.requireApprovalAboveCostUsd)) {
    demandApproval(`cost ${action.costUsd} exceeds approval threshold ${policy.requireApprovalAboveCostUsd}`);
  }
  if (costExceeds(policy.maxActionCostUsd)) {
    block(`cost ${action.costUsd} exceeds max action cost ${policy.maxActionCostUsd}`);
  }
  if (costExceeds(policy.remainingBudgetUsd)) {
    block(`cost ${action.costUsd} exceeds remaining budget ${policy.remainingBudgetUsd}`);
  }
  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    block("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    block("kill criteria are required");
  }
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  if (reasons.length === 0) {
    reasons.push(requiresApproval ? "approval required" : "action allowed");
  }

  let label;
  if (blocked || requiresApproval) {
    label = {
      source: "policy",
      kind: blocked ? "policy_block" : "comment",
      value: { actionType: action.type, blocked, requiresApproval },
      reason: reasons.join("; "),
      severity: blocked ? "critical" : "warning",
      createdAt: options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString(),
      metadata: { action, policy }
    };
  }
  return {
    allowed: !blocked,
    blocked,
    // A blocked action is never reported as merely awaiting approval.
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
|
|
2690
|
+
|
|
1389
2691
|
// src/prompt-registry.ts
|
|
1390
2692
|
var PromptRegistry = class {
|
|
1391
2693
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -3101,184 +4403,6 @@ var FileSystemTraceStore = class {
|
|
|
3101
4403
|
}
|
|
3102
4404
|
};
|
|
3103
4405
|
|
|
3104
|
-
// src/trace/emitter.ts
|
|
3105
|
-
/**
 * Writes runs, spans, events, budget entries and artifacts for one run to a
 * trace store.
 *
 * Open spans are tracked on a stack: a new span's parent, and the span an event
 * or budget entry attaches to, default to the most recently opened span that
 * has not been closed yet.
 */
var TraceEmitter = class {
  store;
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param {object} store - Trace store receiving every write.
   * @param {{ now?: Function, id?: Function, runId?: string }} [options] - Clock,
   *   id generator and run id overrides; the run id defaults to a generated id.
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /** Appends the run record with status "running" and returns it. */
  async startRun(run) {
    const record = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(record);
    return record;
  }
  /** Marks the run "failed" when `outcome.pass` is exactly false, otherwise "completed". */
  async endRun(outcome) {
    const failed = outcome?.pass === false;
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: failed ? "failed" : "completed",
      outcome
    });
  }
  /** Marks the run "aborted" with a failing outcome carrying `reason` as notes. */
  async abortRun(reason) {
    const patch = {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    };
    await this.store.updateRun(this._runId, patch);
  }
  // ── Generic span ───────────────────────────────────────────────────
  /** Opens a span, pushes it on the stack and returns a handle to end or fail it. */
  async span(init) {
    const spanId = this.id();
    const innermostOpen = this.stack[this.stack.length - 1];
    const record = {
      spanId,
      parentSpanId: init.parentSpanId ?? innermostOpen,
      runId: this._runId,
      startedAt: this.now(),
      // Fields supplied by the caller win over the generated ones.
      ...init
    };
    await this.store.appendSpan(record);
    this.stack.push(spanId);
    return this.handle(record);
  }
  /** Builds the `{ span, end, fail }` handle for an open span. */
  handle(span) {
    const close = async (update) => {
      await this.store.updateSpan(span.spanId, update);
      this.pop(span.spanId);
    };
    return {
      span,
      end: async (patch) => close({ endedAt: this.now(), status: "ok", ...patch }),
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errorText = error instanceof Error ? error.message : error;
        return close({ endedAt, status: "error", error: errorText, ...patch });
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the open-span stack. */
  pop(spanId) {
    const position = this.stack.lastIndexOf(spanId);
    if (position < 0) return;
    this.stack.splice(position, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /** Appends an already-finished "judge" span (start and end share one timestamp). */
  async recordJudge(verdict) {
    const spanId = this.id();
    const timestamp = this.now();
    const record = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: timestamp,
      endedAt: timestamp,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(record);
    return record;
  }
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /** Appends an event, attached to `event.spanId` or the innermost open span. */
  async emit(event) {
    const record = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(record);
    return record;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /** Appends a budget entry and, when it is breached, also emits a "budget_breach" event. */
  async recordBudget(entry) {
    const { dimension, limit, consumed, remaining, breached } = entry;
    const record = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension,
      limit,
      consumed,
      remaining,
      breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(record);
    if (record.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: record.spanId,
        payload: { dimension: record.dimension, limit: record.limit, consumed: record.consumed }
      });
    }
    return record;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact with a generated id and returns the stored record. */
  async recordArtifact(artifact) {
    const record = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(record);
    return record;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    let result;
    try {
      result = await fn(handle);
      await handle.end();
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
    return result;
  }
};
|
|
3263
|
-
/**
 * Generates a unique id.
 *
 * @returns {string} A UUID from `crypto.randomUUID` when available; otherwise a
 *   base-36 timestamp joined with up to eight random base-36 characters.
 */
function cryptoRandomId() {
  const hasRandomUuid = typeof globalThis.crypto?.randomUUID === "function";
  if (hasRandomUuid) {
    return globalThis.crypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
|
|
3267
|
-
/**
 * Maps a provider call description onto the fields of an LLM span.
 *
 * @param {object} args - `model`, optional `name` (defaults to the model),
 *   `messages`, `output`, `usage` token counts, `costUsd` and `finishReason`.
 * @returns {object} The span fields; token counts are `undefined` when `usage` is absent.
 */
function llmSpanFromProvider(args) {
  const { model, messages, output, usage, costUsd, finishReason } = args;
  return {
    name: args.name ?? model,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
}
|
|
3281
|
-
|
|
3282
4406
|
// src/sandbox-harness.ts
|
|
3283
4407
|
var vitestTestParser = {
|
|
3284
4408
|
id: "vitest",
|
|
@@ -3887,6 +5011,157 @@ function safeJson(x) {
|
|
|
3887
5011
|
}
|
|
3888
5012
|
}
|
|
3889
5013
|
|
|
5014
|
+
// src/propose-review-control.ts
|
|
5015
|
+
// Instruction used by runProposeReviewAsControlLoop when config.fallbackInstruction is not set.
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
|
|
5016
|
+
/**
 * Runs a propose -> verify -> review cycle on top of `runAgentControlLoop`.
 *
 * Each control-loop step ("shot") calls `config.propose`, verifies the
 * resulting state with `config.verify`, and, when verification fails, asks
 * `config.review` for guidance on the next shot. Every shot is appended to the
 * review memory store.
 *
 * Recognised `config` fields (defaults in parentheses):
 * - `goal`, `initialState`, `propose`, `verify`, `review` - the task and its callbacks.
 * - `maxShots` (10) - passed as the control loop's `maxSteps` budget.
 * - `maxWallMs` - passed through as the wall-clock budget.
 * - `confidenceFloor` (0.3) / `confidenceFloorWindow` (2) - after this many
 *   consecutive reviews at or below the floor, continuation is stopped. A
 *   window of 0 or less disables the check.
 * - `memory` (`inMemoryReviewStore()`) - store with `load()` / `append(entry)`.
 * - `fallbackInstruction` - instruction used when no review instruction exists.
 * - `failureClassFromVerification` (`controlFailureClassFromVerification`).
 * - `store`, `scenarioId` ("propose-review-control"), `projectId`,
 *   `variantId`, `actionFailure` ("stop") - forwarded to the control loop.
 *
 * @returns whatever `runAgentControlLoop` returns for this configuration.
 */
async function runProposeReviewAsControlLoop(config) {
  const maxShots = config.maxShots ?? 10;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;
  // Number of consecutive reviews whose confidence was at or below the floor.
  let lowConfidenceStreak = 0;
  // Snapshot exposed to the control loop through `observe`; replaced after every shot.
  let current = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await memory.load(),
    completed: false,
    reviewAvailable: false
  };
  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => current,
    // A single critical objective eval that mirrors the latest verification result.
    validate: ({ state }) => [
      objectiveEval({
        id: "verification",
        passed: state.verification.pass,
        score: state.verification.score,
        severity: "critical",
        detail: state.verification.pass ? "verification passed" : `verification failed${state.verification.failingLayers?.length ? `: ${state.verification.failingLayers.join(", ")}` : ""}`
      })
    ],
    shouldStop: ({ state }) => {
      if (state.verification.pass) {
        return { stop: true, pass: true, reason: "verification passed", score: state.verification.score };
      }
      // `completed` without a pass means continuation was turned off, either by
      // the reviewer or by the low-confidence streak check in `act`.
      if (state.completed) {
        return {
          stop: true,
          pass: false,
          reason: "reviewer stopped continuation",
          score: state.verification.score,
          failureClass: failureClassFromVerification(state.verification)
        };
      }
      return { stop: false, pass: false, reason: "verification still failing", score: state.verification.score };
    },
    decide: ({ state }) => ({
      type: "continue",
      action: { type: "propose-review-shot", shot: state.shot + 1 },
      reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
    }),
    act: async (action, ctx) => {
      const shot = action.shot;
      const proposeOut = await config.propose({
        shot,
        goal: config.goal,
        state: current.state,
        priorReview: current.priorReview,
        abortSignal: ctx.abortSignal,
        emitter: ctx.emitter
      });
      const nextState = proposeOut.state;
      const verification = await config.verify(nextState);
      let review = null;
      let reviewAvailable = false;
      let reviewError;
      let shouldContinue = !verification.pass;
      if (!verification.pass) {
        try {
          review = await config.review({
            shot,
            goal: config.goal,
            state: nextState,
            verification,
            traceSummary: proposeOut.traceSummary,
            memory: await memory.load()
          });
          reviewAvailable = true;
          shouldContinue = review.shouldContinue;
          lowConfidenceStreak = review.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
          if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
        } catch (err) {
          // The reviewer may have resolved to null/undefined, in which case the
          // flag was already raised before reading `review.shouldContinue`
          // threw. Reset it so `reviewAvailable` never contradicts `reviewError`.
          reviewAvailable = false;
          reviewError = err instanceof Error ? err.message : String(err);
          // Reviewer failure is not fatal: reuse the previous review if there
          // is one, otherwise synthesize a placeholder, and keep going.
          review = current.priorReview ?? {
            observations: "Reviewer unavailable.",
            diagnosis: reviewError,
            nextShotInstruction: fallbackInstruction,
            shouldContinue: true,
            confidence: 0
          };
          shouldContinue = true;
        }
      } else {
        // Verification passed, so the reviewer is not consulted.
        review = {
          observations: "Verification passed.",
          diagnosis: "No further revision needed.",
          nextShotInstruction: "",
          shouldContinue: false,
          confidence: 1
        };
      }
      const entry = {
        ...review ?? {
          observations: "No review.",
          diagnosis: "",
          nextShotInstruction: fallbackInstruction,
          shouldContinue,
          confidence: 0
        },
        shot,
        timestamp: Date.now(),
        verification: {
          pass: verification.pass,
          score: verification.score,
          failingLayers: verification.failingLayers
        }
      };
      await memory.append(entry);
      current = {
        shot,
        state: nextState,
        priorReview: review,
        verification,
        traceSummary: proposeOut.traceSummary,
        memory: await memory.load(),
        completed: verification.pass || !shouldContinue,
        reviewAvailable,
        reviewError
      };
      return {
        state: nextState,
        verification,
        traceSummary: proposeOut.traceSummary,
        review,
        reviewAvailable,
        reviewError
      };
    }
  });
}
|
|
5160
|
+
/**
 * Maps a verification result to a failure-class label.
 *
 * @returns `undefined` when verification passed, "instruction_following" when
 *   at least one failing layer is listed, and "unknown" otherwise.
 */
function controlFailureClassFromVerification(verification) {
  if (verification.pass) {
    return undefined;
  }
  const layerCount = verification.failingLayers?.length;
  if (layerCount) {
    return "instruction_following";
  }
  return "unknown";
}
|
|
5164
|
+
|
|
3890
5165
|
// src/trace/schema.ts
|
|
3891
5166
|
var TRACE_SCHEMA_VERSION = "1.0.0";
|
|
3892
5167
|
var FAILURE_CLASSES = [
|
|
@@ -5557,7 +6832,7 @@ var Dataset = class _Dataset {
|
|
|
5557
6832
|
* Write to disk for contamination-verifiable archives.
|
|
5558
6833
|
*/
|
|
5559
6834
|
toJsonl() {
|
|
5560
|
-
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(
|
|
6835
|
+
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
|
|
5561
6836
|
}
|
|
5562
6837
|
static fromJsonl(jsonl, manifest) {
|
|
5563
6838
|
const scenarios = [];
|
|
@@ -5570,18 +6845,18 @@ var Dataset = class _Dataset {
|
|
|
5570
6845
|
}
|
|
5571
6846
|
};
|
|
5572
6847
|
/**
 * Computes a SHA-256 hex digest over a list of scenarios.
 *
 * Scenarios are copied, ordered by `id` via `localeCompare`, canonicalized,
 * serialized as one JSON array, and hashed with Web Crypto.
 *
 * NOTE(review): `localeCompare` ordering may depend on the runtime's locale
 * data; confirm that is acceptable for hashes compared across environments.
 *
 * @returns lowercase hexadecimal digest string.
 */
async function hashScenarios(scenarios) {
  const ordered = scenarios.slice().sort((left, right) => left.id.localeCompare(right.id));
  const payload = JSON.stringify(ordered.map(canonicalize2));
  const encoded = new TextEncoder().encode(payload);
  const hashBuffer = await globalThis.crypto.subtle.digest("SHA-256", encoded);
  let hex = "";
  for (const byte of new Uint8Array(hashBuffer)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
5579
|
-
function
|
|
6854
|
+
/**
 * Returns a copy of `v` in which every plain-object level has its keys in
 * sorted order, so that `JSON.stringify` output is stable regardless of the
 * insertion order of the input.
 *
 * Primitives and `null` are returned as-is; arrays are mapped element by
 * element with their order preserved.
 *
 * The result object is built with `Object.fromEntries` rather than by
 * assigning into `{}`: assignment would route an own `"__proto__"` key (which
 * `JSON.parse` can produce) through the prototype setter and silently drop it
 * from the canonical form.
 */
function canonicalize2(v) {
  if (v === null || typeof v !== "object") return v;
  if (Array.isArray(v)) return v.map((item) => canonicalize2(item));
  const sortedKeys = Object.keys(v).sort();
  return Object.fromEntries(sortedKeys.map((key) => [key, canonicalize2(v[key])]));
}
|
|
5587
6862
|
function seededShuffle(items, seed) {
|
|
@@ -7615,7 +8890,7 @@ function attributeStep(op, prmA, prmB) {
|
|
|
7615
8890
|
|
|
7616
8891
|
// src/pre-registration.ts
|
|
7617
8892
|
async function signManifest(m) {
|
|
7618
|
-
const canonical =
|
|
8893
|
+
const canonical = canonicalize3(m);
|
|
7619
8894
|
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
7620
8895
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
7621
8896
|
const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
@@ -7645,12 +8920,12 @@ async function evaluateHypothesis(manifest, observed) {
|
|
|
7645
8920
|
rejectionReasons: reasons
|
|
7646
8921
|
};
|
|
7647
8922
|
}
|
|
7648
|
-
function
|
|
8923
|
+
/**
 * Returns a copy of `v` in which every plain-object level has its keys in
 * sorted order, so that `JSON.stringify` output is stable regardless of the
 * insertion order of the input.
 *
 * Primitives and `null` are returned as-is; arrays are mapped element by
 * element with their order preserved.
 *
 * The result object is built with `Object.fromEntries` rather than by
 * assigning into `{}`: assignment would route an own `"__proto__"` key (which
 * `JSON.parse` can produce) through the prototype setter and silently drop it
 * from the canonical form.
 */
function canonicalize3(v) {
  if (v === null || typeof v !== "object") return v;
  if (Array.isArray(v)) return v.map((item) => canonicalize3(item));
  const sortedKeys = Object.keys(v).sort();
  return Object.fromEntries(sortedKeys.map((key) => [key, canonicalize3(v[key])]));
}
|
|
7656
8931
|
|
|
@@ -12426,6 +13701,7 @@ export {
|
|
|
12426
13701
|
ExperimentTracker,
|
|
12427
13702
|
FAILURE_CLASSES,
|
|
12428
13703
|
FileSystemExperimentStore,
|
|
13704
|
+
FileSystemFeedbackTrajectoryStore,
|
|
12429
13705
|
FileSystemOutcomeStore,
|
|
12430
13706
|
FileSystemTraceStore,
|
|
12431
13707
|
HeldOutGate,
|
|
@@ -12433,6 +13709,7 @@ export {
|
|
|
12433
13709
|
HoldoutLockedError,
|
|
12434
13710
|
INTENT_MATCH_JUDGE_VERSION,
|
|
12435
13711
|
InMemoryExperimentStore,
|
|
13712
|
+
InMemoryFeedbackTrajectoryStore,
|
|
12436
13713
|
InMemoryOutcomeStore,
|
|
12437
13714
|
InMemoryTraceStore,
|
|
12438
13715
|
InMemoryTrialCache,
|
|
@@ -12472,9 +13749,11 @@ export {
|
|
|
12472
13749
|
adversarialJudge,
|
|
12473
13750
|
aggregateLlm,
|
|
12474
13751
|
aggregateRunScore,
|
|
13752
|
+
allCriticalPassed,
|
|
12475
13753
|
analyzeAntiSlop,
|
|
12476
13754
|
analyzeSeries,
|
|
12477
13755
|
argHash,
|
|
13756
|
+
assignFeedbackSplit,
|
|
12478
13757
|
attributeCounterfactuals,
|
|
12479
13758
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
12480
13759
|
benchmarks_exports as benchmarks,
|
|
@@ -12512,6 +13791,8 @@ export {
|
|
|
12512
13791
|
computeToolUseMetrics,
|
|
12513
13792
|
confidenceInterval,
|
|
12514
13793
|
containsAll,
|
|
13794
|
+
controlFailureClassFromVerification,
|
|
13795
|
+
controlRunToFeedbackTrajectory,
|
|
12515
13796
|
correlateLayers,
|
|
12516
13797
|
correlationStudy,
|
|
12517
13798
|
createAntiSlopJudge,
|
|
@@ -12519,6 +13800,7 @@ export {
|
|
|
12519
13800
|
createCustomJudge,
|
|
12520
13801
|
createDefaultReviewer,
|
|
12521
13802
|
createDomainExpertJudge,
|
|
13803
|
+
createFeedbackTrajectory,
|
|
12522
13804
|
createIntentMatchJudge,
|
|
12523
13805
|
createLlmReviewer,
|
|
12524
13806
|
createSandboxCodeMutator,
|
|
@@ -12536,6 +13818,7 @@ export {
|
|
|
12536
13818
|
estimateCost,
|
|
12537
13819
|
estimateTokens,
|
|
12538
13820
|
euAiActReport,
|
|
13821
|
+
evaluateActionPolicy,
|
|
12539
13822
|
evaluateContract,
|
|
12540
13823
|
evaluateHypothesis,
|
|
12541
13824
|
evaluateOracles,
|
|
@@ -12547,6 +13830,10 @@ export {
|
|
|
12547
13830
|
extractAssetUrls,
|
|
12548
13831
|
extractErrorCount,
|
|
12549
13832
|
failureClusterView,
|
|
13833
|
+
feedbackTrajectoriesToDatasetScenarios,
|
|
13834
|
+
feedbackTrajectoriesToOptimizerRows,
|
|
13835
|
+
feedbackTrajectoryToDatasetScenario,
|
|
13836
|
+
feedbackTrajectoryToOptimizerRow,
|
|
12550
13837
|
fileContains,
|
|
12551
13838
|
fileExists,
|
|
12552
13839
|
findAutoMatchNoExpectation,
|
|
@@ -12601,6 +13888,7 @@ export {
|
|
|
12601
13888
|
nonRefusalRubric,
|
|
12602
13889
|
normalizeScores,
|
|
12603
13890
|
notBlocked,
|
|
13891
|
+
objectiveEval,
|
|
12604
13892
|
outputLengthRubric,
|
|
12605
13893
|
pairedBootstrap,
|
|
12606
13894
|
pairedTTest,
|
|
@@ -12609,6 +13897,7 @@ export {
|
|
|
12609
13897
|
paretoChart,
|
|
12610
13898
|
paretoFrontier,
|
|
12611
13899
|
paretoFrontierWithCrowding,
|
|
13900
|
+
parseFeedbackTrajectoriesJsonl,
|
|
12612
13901
|
parseReflectionResponse,
|
|
12613
13902
|
parseRunRecordSafe,
|
|
12614
13903
|
partialCredit,
|
|
@@ -12635,7 +13924,10 @@ export {
|
|
|
12635
13924
|
renderMarkdown,
|
|
12636
13925
|
renderMarkdownReport,
|
|
12637
13926
|
renderPlaybookMarkdown,
|
|
13927
|
+
renderPreferenceMemoryMarkdown,
|
|
12638
13928
|
renderSteeringText,
|
|
13929
|
+
replayFeedbackTrajectories,
|
|
13930
|
+
replayFeedbackTrajectory,
|
|
12639
13931
|
replayScorerOverCorpus,
|
|
12640
13932
|
replayTraceThroughJudge,
|
|
12641
13933
|
requiredSampleSize,
|
|
@@ -12644,6 +13936,7 @@ export {
|
|
|
12644
13936
|
roundTripRunRecord,
|
|
12645
13937
|
rowCount,
|
|
12646
13938
|
rowWhere,
|
|
13939
|
+
runAgentControlLoop,
|
|
12647
13940
|
runAssertions,
|
|
12648
13941
|
runCanaries,
|
|
12649
13942
|
runCounterfactual,
|
|
@@ -12657,6 +13950,7 @@ export {
|
|
|
12657
13950
|
runKeywordCoverageJudgeUrl,
|
|
12658
13951
|
runPromptEvolution,
|
|
12659
13952
|
runProposeReview,
|
|
13953
|
+
runProposeReviewAsControlLoop,
|
|
12660
13954
|
runReferenceReplay,
|
|
12661
13955
|
runSelfPlay,
|
|
12662
13956
|
runSemanticConceptJudge,
|
|
@@ -12673,13 +13967,18 @@ export {
|
|
|
12673
13967
|
selectHarnessVariant,
|
|
12674
13968
|
selfPreference,
|
|
12675
13969
|
sentenceReorderMutator,
|
|
13970
|
+
serializeFeedbackTrajectoriesJsonl,
|
|
12676
13971
|
signManifest,
|
|
12677
13972
|
soc2Report,
|
|
12678
13973
|
statusAdvanced,
|
|
13974
|
+
stopOnNoProgress,
|
|
13975
|
+
stopOnRepeatedAction,
|
|
12679
13976
|
stripFencedJson,
|
|
12680
13977
|
stuckLoopView,
|
|
13978
|
+
subjectiveEval,
|
|
12681
13979
|
summarize,
|
|
12682
13980
|
summarizeHarnessResults,
|
|
13981
|
+
summarizePreferenceMemory,
|
|
12683
13982
|
summaryTable,
|
|
12684
13983
|
testJudge,
|
|
12685
13984
|
textInSnapshot,
|
|
@@ -12705,6 +14004,7 @@ export {
|
|
|
12705
14004
|
welchsTTest,
|
|
12706
14005
|
whitespaceCollapseMutator,
|
|
12707
14006
|
wilcoxonSignedRank,
|
|
14007
|
+
withAssignedFeedbackSplit,
|
|
12708
14008
|
wranglerDeployRunner
|
|
12709
14009
|
};
|
|
12710
14010
|
//# sourceMappingURL=index.js.map
|