@tangle-network/agent-eval 0.17.0 → 0.17.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -0
- package/dist/index.d.ts +1453 -1088
- package/dist/index.js +1477 -231
- package/dist/index.js.map +1 -1
- package/package.json +12 -10
package/dist/index.js
CHANGED
|
@@ -1386,6 +1386,1205 @@ function printDriverSummary(results) {
|
|
|
1386
1386
|
console.log(`${completedCount}/${results.length} personas completed`);
|
|
1387
1387
|
}
|
|
1388
1388
|
|
|
1389
|
+
// src/trace/emitter.ts
|
|
1390
|
+
/**
 * Writes the trace records of a single run to a store.
 *
 * The store passed to the constructor is called through `appendRun`,
 * `updateRun`, `appendSpan`, `updateSpan`, `appendEvent`,
 * `appendBudgetEntry` and `appendArtifact`. The emitter keeps a stack of the
 * ids of spans that are currently open; the top of that stack is used as the
 * default parent for new spans and as the default span for events and budget
 * entries.
 */
var TraceEmitter = class {
  store;
  stack = [];
  _runId;
  now;
  id;
  /**
   * @param store Destination for every record written by this emitter.
   * @param options Optional overrides: `now` (clock, defaults to `Date.now`),
   *   `id` (id generator, defaults to `cryptoRandomId`) and `runId`
   *   (defaults to a freshly generated id).
   */
  constructor(store, options = {}) {
    this.store = store;
    this.now = options.now ?? (() => Date.now());
    this.id = options.id ?? (() => cryptoRandomId());
    this._runId = options.runId ?? this.id();
  }
  /** Id shared by every record this emitter writes. */
  get runId() {
    return this._runId;
  }
  // ── Run lifecycle ──────────────────────────────────────────────────
  /**
   * Appends the run record with status "running" and returns it.
   * `runId`, `startedAt` and `status` always come from the emitter, even when
   * `run` carries fields with the same names.
   */
  async startRun(run) {
    const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
    await this.store.appendRun(full);
    return full;
  }
  /**
   * Closes the run. The status is "failed" only when `outcome.pass` is
   * exactly `false`; a missing outcome counts as "completed".
   */
  async endRun(outcome) {
    const status = outcome?.pass === false ? "failed" : "completed";
    await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
  }
  /** Closes the run with status "aborted" and a failing outcome carrying `reason`. */
  async abortRun(reason) {
    await this.store.updateRun(this._runId, {
      endedAt: this.now(),
      status: "aborted",
      outcome: { pass: false, notes: reason }
    });
  }
  // ── Generic span ───────────────────────────────────────────────────
  /**
   * Opens a span, stores it, pushes it on the open-span stack and returns a
   * handle with `end` / `fail`.
   *
   * The parent is `init.parentSpanId` when that is non-nullish, otherwise the
   * span currently on top of the stack (undefined when the stack is empty).
   */
  async span(init) {
    const spanId = this.id();
    const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
    const span = {
      spanId,
      parentSpanId: parent,
      runId: this._runId,
      startedAt: this.now(),
      ...init
    };
    // `...init` above would put an explicit `parentSpanId: undefined` back
    // over the stack fallback computed in `parent`; re-apply the resolved value.
    span.parentSpanId = parent;
    await this.store.appendSpan(span);
    // Push the id the span was stored under so that `pop(span.spanId)` in the
    // handle matches even when `init` supplied its own `spanId`.
    this.stack.push(span.spanId);
    return this.handle(span);
  }
  /**
   * Builds the handle returned by `span`.
   *
   * `end(patch)` marks the span "ok"; `fail(error, patch)` marks it "error"
   * and stores the error's message (or the value itself when it is not an
   * `Error`). In both cases `patch` is spread last and can override the
   * fields set here. The span is removed from the open-span stack even when
   * the store write rejects, so a failed write cannot leave a stale parent
   * behind for later spans; the rejection still propagates to the caller.
   */
  handle(span) {
    return {
      span,
      end: async (patch) => {
        const endedAt = this.now();
        try {
          await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
        } finally {
          this.pop(span.spanId);
        }
      },
      fail: async (error, patch) => {
        const endedAt = this.now();
        const errStr = error instanceof Error ? error.message : error;
        try {
          await this.store.updateSpan(span.spanId, {
            endedAt,
            status: "error",
            error: errStr,
            ...patch
          });
        } finally {
          this.pop(span.spanId);
        }
      }
    };
  }
  /** Removes the most recent occurrence of `spanId` from the stack; no-op when absent. */
  pop(spanId) {
    const idx = this.stack.lastIndexOf(spanId);
    if (idx >= 0) this.stack.splice(idx, 1);
  }
  // ── Typed span conveniences ────────────────────────────────────────
  /** Opens a span with `kind: "llm"` unless `init` sets its own kind. */
  llm(init) {
    return this.span({ kind: "llm", ...init });
  }
  /** Opens a span with `kind: "tool"` unless `init` sets its own kind. */
  tool(init) {
    return this.span({ kind: "tool", ...init });
  }
  /** Opens a span with `kind: "retrieval"` unless `init` sets its own kind. */
  retrieval(init) {
    return this.span({ kind: "retrieval", ...init });
  }
  /**
   * Stores a judge verdict as an already-finished span (start and end share
   * one timestamp) and returns the stored record. The span is never pushed on
   * the open-span stack and gets no parent unless `verdict` supplies one.
   */
  async recordJudge(verdict) {
    const spanId = this.id();
    const now = this.now();
    const full = {
      spanId,
      runId: this._runId,
      kind: "judge",
      startedAt: now,
      endedAt: now,
      status: "ok",
      ...verdict
    };
    await this.store.appendSpan(full);
    return full;
  }
  /** Opens a span with `kind: "sandbox"` unless `init` sets its own kind. */
  sandbox(init) {
    return this.span({ kind: "sandbox", ...init });
  }
  // ── Events ─────────────────────────────────────────────────────────
  /**
   * Appends an event and returns it. `spanId` defaults to the span on top of
   * the stack and `payload` defaults to an empty object.
   */
  async emit(event) {
    const full = {
      eventId: this.id(),
      runId: this._runId,
      spanId: event.spanId ?? this.stack[this.stack.length - 1],
      kind: event.kind,
      timestamp: this.now(),
      payload: event.payload ?? {}
    };
    await this.store.appendEvent(full);
    return full;
  }
  // ── Budget ledger ──────────────────────────────────────────────────
  /**
   * Appends a budget ledger entry and returns it. When the entry is marked
   * `breached`, a "budget_breach" event is emitted for the same span.
   */
  async recordBudget(entry) {
    const full = {
      runId: this._runId,
      timestamp: entry.timestamp ?? this.now(),
      dimension: entry.dimension,
      limit: entry.limit,
      consumed: entry.consumed,
      remaining: entry.remaining,
      breached: entry.breached,
      spanId: entry.spanId ?? this.stack[this.stack.length - 1]
    };
    await this.store.appendBudgetEntry(full);
    if (full.breached) {
      await this.emit({
        kind: "budget_breach",
        spanId: full.spanId,
        payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
      });
    }
    return full;
  }
  // ── Artifacts ──────────────────────────────────────────────────────
  /** Appends an artifact record (generated id + run id, then `artifact`'s own fields) and returns it. */
  async recordArtifact(artifact) {
    const full = { artifactId: this.id(), runId: this._runId, ...artifact };
    await this.store.appendArtifact(full);
    return full;
  }
  // ── Nested composition ─────────────────────────────────────────────
  /**
   * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
   * Returns the fn's return value. Use this for the 95% case.
   */
  async within(init, fn) {
    const handle = await this.span(init);
    try {
      const result = await fn(handle);
      await handle.end();
      return result;
    } catch (err) {
      await handle.fail(err instanceof Error ? err : String(err));
      throw err;
    }
  }
};
|
|
1548
|
+
/**
 * Produces an id string for trace records.
 *
 * Uses `globalThis.crypto.randomUUID()` when that function exists; otherwise
 * falls back to a base-36 timestamp joined by "-" to up to eight base-36
 * characters derived from `Math.random()`.
 */
function cryptoRandomId() {
  const webCrypto = globalThis.crypto;
  if (typeof webCrypto?.randomUUID === "function") {
    return webCrypto.randomUUID();
  }
  const timePart = Date.now().toString(36);
  const randomPart = Math.random().toString(36).slice(2, 10);
  return `${timePart}-${randomPart}`;
}
|
|
1552
|
+
/**
 * Flattens a provider call description into the field set used for an LLM span.
 *
 * `name` falls back to the model id when `args.name` is nullish, and the four
 * token counters are lifted out of the optional `args.usage` object (each is
 * `undefined` when `usage` is missing).
 */
function llmSpanFromProvider(args) {
  const { model, messages, output, usage, costUsd, finishReason } = args;
  const spanName = args.name ?? model;
  return {
    name: spanName,
    model,
    messages,
    output,
    inputTokens: usage?.inputTokens,
    outputTokens: usage?.outputTokens,
    cachedTokens: usage?.cachedTokens,
    reasoningTokens: usage?.reasoningTokens,
    costUsd,
    finishReason
  };
}
|
|
1566
|
+
|
|
1567
|
+
// src/control-runtime.ts
|
|
1568
|
+
// Limits applied by runAgentControlLoop for any budget field the caller
// leaves out: at most 8 steps and 300000 ms (five minutes) of wall time.
var DEFAULT_BUDGET = {
  maxSteps: 8,
  maxWallMs: 300000
};
|
|
1572
|
+
/**
 * Drives an observe → validate → decide → act loop until a stop condition
 * is reached, and hands the final result record to `finish`.
 *
 * Callbacks read from `config`: `observe`, `validate`, `decide`, `act`
 * (required), plus optional `shouldStop`, `getActionCostUsd` and `onStep`.
 * Other fields read: `intent`, `budget` (merged over DEFAULT_BUDGET),
 * `actionFailure` ("continue" unless set), `signal`, `store` (enables
 * tracing through a TraceEmitter), `stopPolicies`, `scenarioId`,
 * `projectId`, `variantId`.
 *
 * The loop ends when: the internal abort signal fires (upstream signal or
 * wall timer), the cost budget is spent, the stop policy or the decision
 * asks to stop, the repeated-action or no-progress policy trips, a callback
 * throws, or `budget.maxSteps` iterations have run. Errors thrown by trace
 * writes and by `onStep` are collected in `runtimeErrors` and do not end
 * the loop. An `act` failure ends the loop only when `actionFailure` is
 * "stop".
 *
 * @param config Loop configuration as described above.
 * @returns Whatever `finish(emitter, result)` returns for the result record.
 */
async function runAgentControlLoop(config) {
  const budget = { ...DEFAULT_BUDGET, ...config.budget };
  // Spreading copies an explicit `maxSteps: undefined` over the default, and
  // `stepIndex < undefined` is always false, so the loop would never run.
  budget.maxSteps = budget.maxSteps ?? DEFAULT_BUDGET.maxSteps;
  const actionFailure = config.actionFailure ?? "continue";
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  const runtimeErrors = [];
  let spentCostUsd = 0;
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  // Declared outside the try so the outer catch can report the last state
  // and evals that were actually observed.
  let state;
  let evals;

  // Records a runtime error and returns its message. Callers keep the
  // returned message instead of re-reading `runtimeErrors` by position,
  // because trace writes and onStep may append their own errors in between.
  const noteError = (phase, stepIndex, err) => {
    const recorded = runtimeError(phase, stepIndex, err);
    runtimeErrors.push(recorded);
    return recorded.message;
  };
  // Completes a result record with the fields every exit path shares and
  // passes it to `finish`.
  const stopWith = (fields) => finish(emitter, {
    intent: config.intent,
    ...fields,
    steps: history,
    wallMs: Date.now() - started,
    spentCostUsd,
    runId: emitter?.runId ?? null,
    runtimeErrors
  });
  // Result for a callback that threw: never a pass, never completed.
  const failRuntime = (reason, fields) => stopWith({
    pass: false,
    completed: false,
    reason,
    failureClass: "unknown",
    stoppedBy: "runtime-error",
    ...fields
  });

  try {
    if (emitter) {
      await runTrace(runtimeErrors, 0, () => emitter.startRun({
        scenarioId: config.scenarioId ?? "agent-control-loop",
        projectId: config.projectId,
        variantId: config.variantId,
        layer: "meta",
        tags: {
          intent: config.intent.slice(0, 120),
          maxSteps: String(budget.maxSteps),
          ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
        }
      }));
    }
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      const message = noteError("observe", 0, err);
      return failRuntime(message, { finalState: void 0, finalEvals: [] });
    }
    try {
      evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      const message = noteError("validate", 0, err);
      return failRuntime(message, { finalState: state, finalEvals: [] });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);

    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      // ── Pre-step exits: abort, cost budget, stop policy ──
      if (controller.signal.aborted) {
        return stopWith({
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          finalState: state,
          finalEvals: evals,
          failureClass: "timeout",
          stoppedBy: "abort"
        });
      }
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return stopWith({
          pass: false,
          completed: false,
          reason: budgetStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
      let stop;
      try {
        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
      } catch (err) {
        const message = noteError("stop-policy", stepIndex, err);
        return failRuntime(message, { score: averageScore(evals), finalState: state, finalEvals: evals });
      }
      if (stop.stop) {
        return stopWith({
          pass: stop.pass,
          completed: true,
          reason: stop.reason,
          score: stop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: stop.failureClass,
          stoppedBy: "stop-policy"
        });
      }

      // ── Decide ──
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        const message = noteError("decide", stepIndex, err);
        return failRuntime(message, { score: averageScore(evals), finalState: state, finalEvals: evals });
      }
      if (decision.type === "stop") {
        return stopWith({
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          finalState: state,
          finalEvals: evals,
          failureClass: decision.pass === false ? "unknown" : void 0,
          stoppedBy: "policy"
        });
      }
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
      if (repeatedActionStop.stop) {
        return stopWith({
          pass: false,
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }

      // ── Act ──
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
        name: `control-step-${stepIndex}`,
        toolName: "agent-control-action",
        args: decision.action,
        attributes: {
          decision: decision.reason ?? "continue",
          repeatedActionStreak
        }
      })) : void 0;
      let actionOutcome;
      // Builds this step's record from the current loop variables, appends
      // it to the history and returns it.
      const recordStep = (afterState) => {
        const record = {
          index: stepIndex,
          decision,
          beforeState,
          afterState,
          evalsBefore,
          evalsAfter: evals,
          actionOutcome,
          startedAt: new Date(actionStarted).toISOString(),
          endedAt: (/* @__PURE__ */ new Date()).toISOString()
        };
        history.push(record);
        return record;
      };
      // Marks the step's trace span as failed; a no-op without a span.
      const failStepSpan = (message, patch) => runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(message, patch));

      try {
        const result = await config.act(decision.action, ctx);
        const costUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        // Only finite, positive costs count against the budget.
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        const message = noteError("act", stepIndex, err);
        actionOutcome = {
          ok: false,
          error: message,
          durationMs: Date.now() - actionStarted
        };
        if (actionFailure === "stop") {
          await failStepSpan(actionOutcome.error ?? "action failed");
          const failedStep = recordStep(state);
          await runOnStep(config.onStep, failedStep, runtimeErrors);
          return failRuntime(actionOutcome.error ?? "action failed", {
            score: averageScore(evals),
            finalState: state,
            finalEvals: evals
          });
        }
      }

      // ── Re-observe and re-validate ──
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        const message = noteError("observe", stepIndex, err);
        const failedStep = recordStep(beforeState);
        await failStepSpan(message);
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return failRuntime(message, { score: averageScore(evals), finalState: beforeState, finalEvals: evals });
      }
      try {
        evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
        await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
      } catch (err) {
        const message = noteError("validate", stepIndex, err);
        const failedStep = recordStep(state);
        await failStepSpan(message);
        await runOnStep(config.onStep, failedStep, runtimeErrors);
        return failRuntime(message, { score: averageScore(evals), finalState: state, finalEvals: evals });
      }

      // ── Progress bookkeeping and step record ──
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      const step = recordStep(state);
      if (actionOutcome.ok) {
        await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
          attributes: {
            actionCostUsd: actionOutcome.costUsd ?? null,
            spentCostUsd,
            scoreBefore: scoreBefore ?? null,
            scoreAfter: scoreAfter ?? null,
            noProgressStreak
          }
        }));
      } else {
        await failStepSpan(actionOutcome.error ?? "action failed", {
          attributes: {
            spentCostUsd,
            noProgressStreak
          }
        });
      }
      await runOnStep(config.onStep, step, runtimeErrors);

      // ── Post-step exits: no progress, cost budget, stop policy ──
      if (noProgressStop.stop) {
        return stopWith({
          pass: false,
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "tool_recovery_failure",
          stoppedBy: "stop-policy"
        });
      }
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return stopWith({
          pass: false,
          completed: false,
          reason: postStepBudgetStop.reason,
          score: scoreAfter,
          finalState: state,
          finalEvals: evals,
          failureClass: "budget_exceeded",
          stoppedBy: "budget"
        });
      }
      const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
      let postStepStop;
      try {
        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
      } catch (err) {
        const message = noteError("stop-policy", stepIndex + 1, err);
        return failRuntime(message, { score: averageScore(evals), finalState: state, finalEvals: evals });
      }
      if (postStepStop.stop) {
        return stopWith({
          pass: postStepStop.pass,
          completed: true,
          reason: postStepStop.reason,
          score: postStepStop.score,
          finalState: state,
          finalEvals: evals,
          failureClass: postStepStop.failureClass,
          stoppedBy: "stop-policy"
        });
      }
    }
    return stopWith({
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      finalState: state,
      finalEvals: evals,
      failureClass: "budget_exceeded",
      stoppedBy: "budget"
    });
  } catch (err) {
    // Anything not caught by a phase-specific handler above lands here.
    const message = noteError("act", history.length, err);
    return failRuntime(message, {
      finalState: state,
      finalEvals: Array.isArray(evals) ? evals : []
    });
  } finally {
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
|
|
2093
|
+
/**
 * Builds a stop-policy object carrying a no-progress limit.
 *
 * @param maxNoProgressSteps Stored under `maxNoProgressSteps`; overrides any
 *   same-named key in `options`.
 * @param options Extra policy fields copied into the result.
 * @returns A new object; `options` is not modified.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = { ...options };
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
|
|
2096
|
+
/**
 * Builds a stop-policy object carrying a repeated-action limit.
 *
 * @param maxRepeatedActions Stored under `maxRepeatedActions`; overrides any
 *   same-named key in `options`.
 * @param options Extra policy fields copied into the result.
 * @returns A new object; `options` is not modified.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = { ...options };
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
|
|
2099
|
+
/**
 * Returns a copy of `input` with `objective` forced to `true`.
 * The input object itself is left untouched.
 */
function objectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = true;
  return tagged;
}
|
|
2102
|
+
/**
 * Returns a copy of `input` with `objective` forced to `false`.
 * The input object itself is left untouched.
 */
function subjectiveEval(input) {
  const tagged = { ...input };
  tagged.objective = false;
  return tagged;
}
|
|
2105
|
+
/**
 * Tells whether no eval result is a blocking failure.
 *
 * A result blocks when it is not passed and its severity is "critical" or
 * "error"; failures of any other severity are tolerated. An empty list
 * yields `true`.
 */
function allCriticalPassed(evals) {
  const isBlockingFailure = (result) => !result.passed && (result.severity === "critical" || result.severity === "error");
  return !evals.some(isBlockingFailure);
}
|
|
2108
|
+
/**
 * Assembles the context object handed to the loop's callbacks.
 *
 * Besides passing its arguments through, it derives `wallMs` (milliseconds
 * elapsed since `started`) and `remainingCostUsd` (budget left, floored at
 * zero; `undefined` when the budget has no `maxCostUsd`).
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  const wallMs = Date.now() - started;
  let remainingCostUsd;
  if (budget.maxCostUsd !== void 0) {
    remainingCostUsd = Math.max(0, budget.maxCostUsd - spentCostUsd);
  }
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
|
|
2123
|
+
/**
 * Stop policy used when the caller supplies no `shouldStop`.
 *
 * With no evals it keeps going. Otherwise it stops with a pass as soon as
 * `allCriticalPassed` holds, and keeps going while it does not; in both of
 * those cases the decision carries the average eval score.
 */
function defaultStopDecision(evals) {
  if (!evals.length) {
    return { stop: false, pass: false, reason: "no evals yet" };
  }
  const pass = allCriticalPassed(evals);
  const score = averageScore(evals);
  if (pass) {
    return { stop: true, pass: true, reason: "all critical evals passed", score };
  }
  return { stop: false, pass: false, reason: "critical evals still failing", score };
}
|
|
2128
|
+
/**
 * Mean of the numeric `score` fields in `evals`, rounded to three decimals.
 *
 * Results whose `score` is not of type "number" are ignored.
 *
 * @returns The rounded mean, or `undefined` when no result has a numeric score.
 */
function averageScore(evals) {
  let total = 0;
  let count = 0;
  evals.forEach((result) => {
    const value = result.score;
    if (typeof value === "number") {
      total += value;
      count += 1;
    }
  });
  if (!count) return void 0;
  return Math.round(total / count * 1e3) / 1e3;
}
|
|
2133
|
+
/**
 * Decides whether the cost budget forces a stop.
 *
 * Stops when the budget defines `maxCostUsd` and the amount spent has
 * reached or passed it; a budget without `maxCostUsd` never stops the loop.
 *
 * @returns `{ stop, reason }`; `reason` is empty when `stop` is false.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const { maxCostUsd } = budget;
  const exhausted = maxCostUsd !== void 0 && spentCostUsd >= maxCostUsd;
  if (!exhausted) {
    return { stop: false, reason: "" };
  }
  return {
    stop: true,
    reason: `budget exhausted: maxCostUsd=${maxCostUsd}`
  };
}
|
|
2142
|
+
/**
 * Writes a "usd" budget ledger entry for the cost spent so far.
 *
 * Does nothing without an emitter or when the budget has no `maxCostUsd`.
 * The write goes through `runTrace`, so a failing trace write is collected
 * in `runtimeErrors` rather than thrown.
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  if (!emitter) return;
  const limit = budget.maxCostUsd;
  if (limit === void 0) return;
  const writeEntry = () => emitter.recordBudget({
    dimension: "usd",
    limit,
    consumed: spentCostUsd,
    remaining: Math.max(0, limit - spentCostUsd),
    breached: spentCostUsd >= limit,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeEntry);
}
|
|
2154
|
+
/**
 * Records one judge span per eval result, in order.
 *
 * Does nothing without an emitter. Each write goes through `runTrace`, so a
 * failing write is collected in `runtimeErrors` and the remaining results
 * are still recorded. The recorded score is the result's numeric `score`
 * when it has one, otherwise 1 for a passed result and 0 for a failed one.
 * The span targets `targetSpanId`, falling back to the emitter's run id.
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  const toVerdict = (result) => {
    const judgeId = result.objective ? "objective-validator" : "subjective-judge";
    const target = targetSpanId ?? emitter.runId;
    let score;
    if (typeof result.score === "number") score = result.score;
    else score = result.passed ? 1 : 0;
    return {
      judgeId,
      targetSpanId: target,
      name: `control-eval/${result.id}`,
      dimension: result.id,
      score,
      rationale: result.detail,
      evidence: result.evidence,
      attributes: {
        phase,
        passed: result.passed,
        severity: result.severity,
        objective: result.objective
      }
    };
  };
  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(toVerdict(result)));
  }
}
|
|
2174
|
+
/**
 * Invoke the optional `onStep` callback, recording a failure instead of propagating it.
 *
 * @param {Function|undefined} onStep - Callback (sync or async); skipped when falsy.
 * @param {{ index: number }} step - Step passed to the callback; `index` tags any recorded error.
 * @param {Array} runtimeErrors - Receives an "on-step" runtime error if the callback throws or rejects.
 * @returns {Promise<void>}
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (failure) {
      runtimeErrors.push(runtimeError("on-step", step.index, failure));
    }
  }
}
|
|
2182
|
+
/**
 * Run a trace write, converting any failure into a recorded "trace" runtime error.
 *
 * @param {Array} runtimeErrors - Receives the runtime error when `write` throws or rejects.
 * @param {number} stepIndex - Step index attached to a recorded error.
 * @param {Function} write - Sync or async function performing the write.
 * @returns {Promise<*>} The value produced by `write`, or `undefined` if it failed.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let outcome;
  try {
    outcome = await write();
  } catch (failure) {
    runtimeErrors.push(runtimeError("trace", stepIndex, failure));
  }
  return outcome;
}
|
|
2190
|
+
/**
 * Track consecutive steps with no progress and decide whether the run is stuck.
 *
 * A step counts as "no progress" when the state fingerprint equals the previous
 * one (and a previous one exists) AND the score moved by less than
 * `policies.minScoreDelta` (default 1e-3). Missing scores are treated as 0.
 * The check is disabled when `policies.maxNoProgressSteps` is missing or <= 0.
 *
 * @param {object} args - `policies`, `scoreBefore`, `scoreAfter`, `lastStateFingerprint`,
 *   `stateFingerprint` and `currentStreak`.
 * @returns {{ stop: boolean, reason: string, streak: number }} Updated streak and stop verdict.
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) {
    return { stop: false, reason: "", streak: 0 };
  }
  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const before = args.scoreBefore ?? 0;
  const after = args.scoreAfter ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  const previous = args.lastStateFingerprint;
  const stateUnchanged = previous !== void 0 && previous === args.stateFingerprint;
  // Any progress on either axis resets the streak.
  const streak = stateUnchanged && scoreFlat ? args.currentStreak + 1 : 0;
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
|
|
2200
|
+
/**
 * Decide whether the run is stuck repeating the same action.
 *
 * @param {{ maxRepeatedActions?: number }|undefined} policies - The check is disabled
 *   when `maxRepeatedActions` is missing or <= 0.
 * @param {number} streak - Current count of consecutive identical actions.
 * @returns {{ stop: boolean, reason: string }} Stop verdict.
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const disabled = !limit || limit <= 0;
  if (disabled || streak < limit) {
    return { stop: false, reason: "" };
  }
  return { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` };
}
|
|
2208
|
+
/**
 * Fingerprint a state, preferring the caller-supplied `policies.stateFingerprint`.
 *
 * @param {*} state - State to fingerprint.
 * @param {{ stateFingerprint?: Function }|undefined} policies - Optional custom fingerprint.
 * @returns {*} The custom fingerprint result, or `stableFingerprint(state)`.
 */
function fingerprintState(state, policies) {
  // Called as a method so a custom fingerprint keeps `policies` as its `this`.
  return policies?.stateFingerprint ? policies.stateFingerprint(state) : stableFingerprint(state);
}
|
|
2212
|
+
/**
 * Fingerprint an action, preferring the caller-supplied `policies.actionFingerprint`.
 *
 * @param {*} action - Action to fingerprint.
 * @param {{ actionFingerprint?: Function }|undefined} policies - Optional custom fingerprint.
 * @returns {*} The custom fingerprint result, or `stableFingerprint(action)`.
 */
function fingerprintAction(action, policies) {
  // Called as a method so a custom fingerprint keeps `policies` as its `this`.
  return policies?.actionFingerprint ? policies.actionFingerprint(action) : stableFingerprint(action);
}
|
|
2216
|
+
/**
 * Produce a deterministic string fingerprint for an arbitrary value.
 *
 * Strings are returned unchanged; numbers, booleans, null and undefined are
 * stringified with `String`. Everything else is serialized as JSON with object
 * keys sorted recursively, so key order does not affect the result.
 *
 * Always returns a string: values JSON cannot represent (functions, symbols)
 * and values that make serialization throw (e.g. BigInt) fall back to
 * `String(value)`.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Deterministic fingerprint.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    // JSON.stringify yields undefined (not a string) for functions and symbols.
    return JSON.stringify(sortForFingerprint(value)) ?? String(value);
  } catch {
    return String(value);
  }
}

/**
 * Recursively copy a value with object keys in sorted order.
 *
 * Objects exposing `toJSON` (for example `Date`) are replaced by their JSON
 * form first, so two different dates do not both collapse to `{}`.
 *
 * @param {*} value - Value to normalize.
 * @returns {*} Key-sorted copy; primitives are returned as-is.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // Guard against a toJSON that returns the object itself (would recurse forever).
    if (json !== value) return sortForFingerprint(json);
  }
  const sorted = {};
  for (const key of Object.keys(value).sort()) {
    sorted[key] = sortForFingerprint(value[key]);
  }
  return sorted;
}
|
|
2235
|
+
/**
 * Turn an abort signal's `reason` into a human-readable string.
 *
 * @param {{ reason?: * }} signal - Abort signal (or any object with a `reason`).
 * @returns {string} The error message, the stringified reason, or "aborted" when the reason is falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
|
|
2240
|
+
/**
 * Build a runtime-error record from a thrown value.
 *
 * @param {string} phase - Phase label for the error.
 * @param {number} stepIndex - Step index the error is associated with.
 * @param {*} err - Thrown value; an Error contributes its `message`, anything else is stringified.
 * @returns {{ phase: string, stepIndex: number, message: string }}
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
|
|
2244
|
+
/**
 * Close the traced run (when an emitter exists) and hand the result back.
 *
 * The `endRun` call goes through `runTrace`, using `result.runtimeErrors` and
 * the number of steps as the step index. The outcome score falls back to
 * `averageScore(result.finalEvals)` when `result.score` is nullish.
 *
 * @param {object|undefined} emitter - Trace emitter exposing `endRun`.
 * @param {object} result - Run result (`pass`, `score`, `finalEvals`, `failureClass`,
 *   `reason`, `steps`, `runtimeErrors`).
 * @returns {Promise<object>} The same `result` object.
 */
async function finish(emitter, result) {
  // Optional call: without an emitter the outcome object is never evaluated.
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
|
|
2253
|
+
|
|
2254
|
+
// src/feedback-trajectory.ts
|
|
2255
|
+
import { appendFile, mkdir, readFile } from "fs/promises";
|
|
2256
|
+
import { join } from "path";
|
|
2257
|
+
// Default weights for the train/dev/test/holdout buckets used by
// assignFeedbackSplit; caller-supplied policy values override these.
var DEFAULT_SPLIT_POLICY = {
  trainPct: 70,
  devPct: 15,
  testPct: 10,
  holdoutPct: 5
};
|
|
2263
|
+
/**
 * Feedback-trajectory store backed by a Map keyed on trajectory id.
 *
 * Every value is passed through `cloneTrajectory` on the way in and on the
 * way out, so callers never share object references with the store.
 */
var InMemoryFeedbackTrajectoryStore = class {
  trajectories = /* @__PURE__ */ new Map();

  /** Insert or replace a trajectory under its `id`. */
  async save(trajectory) {
    this.trajectories.set(trajectory.id, cloneTrajectory(trajectory));
  }

  /** Return a copy of the trajectory with the given id, or `null` when unknown. */
  async get(id) {
    const trajectory = this.trajectories.get(id);
    return trajectory ? cloneTrajectory(trajectory) : null;
  }

  /** Return copies of all trajectories accepted by `matchesFilter`. */
  async list(filter = {}) {
    return [...this.trajectories.values()].filter((trajectory) => matchesFilter(trajectory, filter)).map(cloneTrajectory);
  }

  /**
   * Append an attempt and set `updatedAt` to the attempt's `createdAt`.
   * @throws {Error} When the trajectory id is unknown.
   */
  async appendAttempt(id, attempt) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendAttempt: unknown trajectory "${id}"`);
    const next = cloneTrajectory({
      ...trajectory,
      attempts: [...trajectory.attempts, attempt],
      updatedAt: attempt.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }

  /**
   * Append a label, either to the trajectory (no `attemptId`) or to the
   * `feedback` list of the attempt with that id. Sets `updatedAt` to the
   * label's `createdAt`.
   *
   * @throws {Error} When the trajectory id is unknown, or when `attemptId` is
   *   given but matches no attempt (the label would otherwise be silently lost).
   */
  async appendLabel(id, label, attemptId) {
    const trajectory = this.trajectories.get(id);
    if (!trajectory) throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown trajectory "${id}"`);
    let attempts = trajectory.attempts;
    let labels = trajectory.labels;
    if (attemptId) {
      if (!attempts.some((attempt) => attempt.id === attemptId)) {
        throw new Error(`FeedbackTrajectoryStore.appendLabel: unknown attempt "${attemptId}" on trajectory "${id}"`);
      }
      attempts = attempts.map((attempt) => attempt.id === attemptId ? { ...attempt, feedback: [...attempt.feedback ?? [], label] } : attempt);
    } else {
      labels = [...labels, label];
    }
    const next = cloneTrajectory({
      ...trajectory,
      attempts,
      labels,
      updatedAt: label.createdAt
    });
    this.trajectories.set(id, next);
    return cloneTrajectory(next);
  }
};
|
|
2300
|
+
/**
 * Feedback-trajectory store persisted as an append-only NDJSON operation log
 * (`feedback-trajectories.ndjson` inside `options.dir`).
 *
 * Reads are served from an in-memory store that is rebuilt once by replaying
 * the log; each mutation is applied in memory and then appended to the log.
 */
var FileSystemFeedbackTrajectoryStore = class {
  dir;
  memory = new InMemoryFeedbackTrajectoryStore();
  loaded = false;
  // In-flight replay shared by concurrent callers so the log is replayed only once.
  loading = null;

  constructor(options) {
    this.dir = options.dir;
  }

  /** Save (insert or replace) a trajectory and log the operation. */
  async save(trajectory) {
    await this.load();
    await this.memory.save(trajectory);
    await this.append({ op: "save", trajectory });
  }

  /** Return the trajectory with the given id, or `null`. */
  async get(id) {
    await this.load();
    return this.memory.get(id);
  }

  /** Return all trajectories matching the filter. */
  async list(filter = {}) {
    await this.load();
    return this.memory.list(filter);
  }

  /** Append an attempt in memory, then log the operation. */
  async appendAttempt(id, attempt) {
    await this.load();
    const next = await this.memory.appendAttempt(id, attempt);
    await this.append({ op: "appendAttempt", id, attempt });
    return next;
  }

  /** Append a label in memory, then log the operation. */
  async appendLabel(id, label, attemptId) {
    await this.load();
    const next = await this.memory.appendLabel(id, label, attemptId);
    await this.append({ op: "appendLabel", id, label, attemptId });
    return next;
  }

  /** Append one operation record to the log, creating the directory if needed. */
  async append(record) {
    await mkdir(this.dir, { recursive: true });
    await appendFile(join(this.dir, "feedback-trajectories.ndjson"), JSON.stringify(record) + "\n", "utf8");
  }

  /**
   * Replay the log into memory exactly once.
   *
   * Concurrent callers await the same in-flight replay; replaying twice would
   * duplicate every logged attempt and label. A failed replay is not cached,
   * so a later call retries.
   *
   * @throws Any read error other than a missing log file.
   */
  async load() {
    if (this.loaded) return;
    this.loading ??= this.replay().then(() => {
      this.loaded = true;
    }).finally(() => {
      this.loading = null;
    });
    await this.loading;
  }

  /** Read the log and apply each record; a missing file means an empty store. */
  async replay() {
    let raw;
    try {
      raw = await readFile(join(this.dir, "feedback-trajectories.ndjson"), "utf8");
    } catch (err) {
      // Nothing has been written yet: start empty.
      if (err?.code === "ENOENT") return;
      // Other failures must surface; treating them as "empty" would hide existing data.
      throw err;
    }
    for (const line of raw.split("\n")) {
      if (!line.trim()) continue;
      try {
        const record = JSON.parse(line);
        if (record.op === "save") await this.memory.save(record.trajectory);
        if (record.op === "appendAttempt") await this.memory.appendAttempt(record.id, record.attempt);
        if (record.op === "appendLabel") await this.memory.appendLabel(record.id, record.label, record.attemptId);
      } catch {
        // Best-effort replay: skip malformed or inapplicable records and keep the rest.
      }
    }
  }
};
|
|
2356
|
+
/**
 * Build a feedback-trajectory record from partial input.
 *
 * `createdAt` defaults to the current time as an ISO string. When no `id` is
 * given, one is derived as `ft_<hex>` from a hash of project id, scenario id,
 * task intent and `createdAt`. `attempts` and `labels` default to empty arrays.
 *
 * @param {object} input - Partial trajectory; `task.intent` is read when `id` is missing.
 * @returns {object} The trajectory record.
 */
function createFeedbackTrajectory(input) {
  const createdAt = input.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  let id = input.id;
  if (id == null) {
    const seed = `${input.projectId ?? ""}|${input.scenarioId ?? ""}|${input.task.intent}|${createdAt}`;
    id = `ft_${stableHash(seed).toString(16)}`;
  }
  const { projectId, scenarioId, task, outcome, split, tags, metadata } = input;
  return {
    id,
    projectId,
    scenarioId,
    task,
    attempts: input.attempts ?? [],
    labels: input.labels ?? [],
    outcome,
    split,
    tags,
    createdAt,
    metadata
  };
}
|
|
2373
|
+
/**
 * Deterministically assign a trajectory to "train", "dev", "test" or "holdout".
 *
 * The bucket is a hash of project id, scenario id, trajectory id and task
 * intent, taken modulo the sum of the four weights, so the same trajectory
 * always lands in the same split for a given policy.
 *
 * @param {object} trajectory - Trajectory with `id`, `task.intent` and optional ids.
 * @param {object} [policy] - Optional `trainPct`/`devPct`/`testPct`/`holdoutPct`
 *   overrides; missing or nullish values fall back to DEFAULT_SPLIT_POLICY.
 * @returns {"train"|"dev"|"test"|"holdout"}
 * @throws {Error} When a weight is not a finite non-negative number, or when
 *   the weights do not sum above zero.
 */
function assignFeedbackSplit(trajectory, policy = {}) {
  // Nullish overrides use the default; a plain spread would let `undefined`
  // clobber it, making the total NaN and sending everything to "holdout".
  const split = {
    trainPct: policy?.trainPct ?? DEFAULT_SPLIT_POLICY.trainPct,
    devPct: policy?.devPct ?? DEFAULT_SPLIT_POLICY.devPct,
    testPct: policy?.testPct ?? DEFAULT_SPLIT_POLICY.testPct,
    holdoutPct: policy?.holdoutPct ?? DEFAULT_SPLIT_POLICY.holdoutPct
  };
  for (const [name, pct] of Object.entries(split)) {
    if (typeof pct !== "number" || !Number.isFinite(pct) || pct < 0) {
      throw new Error(`assignFeedbackSplit: ${name} must be a finite, non-negative number`);
    }
  }
  const total = split.trainPct + split.devPct + split.testPct + split.holdoutPct;
  if (total <= 0) throw new Error("assignFeedbackSplit: split percentages must sum above zero");
  const bucket = stableHash(`${trajectory.projectId ?? ""}|${trajectory.scenarioId ?? ""}|${trajectory.id}|${trajectory.task.intent}`) % total;
  if (bucket < split.trainPct) return "train";
  if (bucket < split.trainPct + split.devPct) return "dev";
  if (bucket < split.trainPct + split.devPct + split.testPct) return "test";
  return "holdout";
}
|
|
2383
|
+
/**
 * Return a shallow copy of the trajectory that is guaranteed to carry a split.
 *
 * An existing (non-nullish) `split` is kept; otherwise one is computed with
 * `assignFeedbackSplit`.
 *
 * @param {object} trajectory - Source trajectory (not mutated).
 * @param {object} [policy] - Forwarded to `assignFeedbackSplit`.
 * @returns {object} Copy of the trajectory with `split` set.
 */
function withAssignedFeedbackSplit(trajectory, policy) {
  const split = trajectory.split ?? assignFeedbackSplit(trajectory, policy);
  return { ...trajectory, split };
}
|
|
2389
|
+
/**
 * Convert a trajectory into a dataset scenario.
 *
 * The scenario id is the trajectory's `scenarioId` (or its own id). The
 * payload is the trajectory with a split assigned. Tags are the project id
 * (when set), then the trajectory's own tags, with `source` always forced to
 * "feedback-trajectory".
 *
 * @param {object} trajectory - Source trajectory.
 * @returns {{ id: string, split: string, payload: object, tags: object }}
 */
function feedbackTrajectoryToDatasetScenario(trajectory) {
  const withSplit = withAssignedFeedbackSplit(trajectory);
  const tags = {};
  if (withSplit.projectId) {
    tags.projectId = withSplit.projectId;
  }
  Object.assign(tags, withSplit.tags ?? {});
  tags.source = "feedback-trajectory";
  return {
    id: withSplit.scenarioId ?? withSplit.id,
    split: withSplit.split,
    payload: withSplit,
    tags
  };
}
|
|
2402
|
+
/**
 * Convert each trajectory into a dataset scenario, preserving order.
 *
 * @param {object[]} trajectories - Source trajectories.
 * @returns {object[]} One scenario per trajectory.
 */
function feedbackTrajectoriesToDatasetScenarios(trajectories) {
  const convert = feedbackTrajectoryToDatasetScenario;
  return trajectories.map((trajectory) => convert(trajectory));
}
|
|
2405
|
+
/**
 * Flatten a trajectory into an optimizer row.
 *
 * The score is the outcome's score when present, otherwise a score derived
 * from the labels. `labelKinds` lists each distinct label kind once.
 *
 * @param {object} trajectory - Source trajectory.
 * @returns {object} Row with `scenarioId`, `trajectoryId`, `labelKinds`, `score` and `metadata`.
 */
function feedbackTrajectoryToOptimizerRow(trajectory) {
  const labels = allLabels(trajectory);
  const distinctKinds = new Set(labels.map((label) => label.kind));
  const { projectId, split, outcome } = trajectory;
  const metadata = {
    projectId,
    split,
    intent: trajectory.task.intent,
    attempts: trajectory.attempts.length,
    outcome,
    labels
  };
  return {
    scenarioId: trajectory.scenarioId ?? trajectory.id,
    trajectoryId: trajectory.id,
    labelKinds: [...distinctKinds],
    score: outcome?.score ?? scoreFromLabels(labels),
    metadata
  };
}
|
|
2422
|
+
/**
 * Convert each trajectory into an optimizer row, preserving order.
 *
 * @param {object[]} trajectories - Source trajectories.
 * @returns {object[]} One row per trajectory.
 */
function feedbackTrajectoriesToOptimizerRows(trajectories) {
  const convert = feedbackTrajectoryToOptimizerRow;
  return trajectories.map((trajectory) => convert(trajectory));
}
|
|
2425
|
+
/**
 * Distill labels from many trajectories into a ranked list of preference entries.
 *
 * Labels that yield no instruction are skipped. Entries are de-duplicated on
 * the instruction text (lower-cased, whitespace-collapsed), keeping the
 * heaviest entry and, on ties, the first one seen. The result is sorted by
 * descending weight and cut to `options.maxEntries` (default 20).
 *
 * @param {object[]} trajectories - Source trajectories.
 * @param {{ maxEntries?: number }} [options]
 * @returns {object[]} Ranked preference entries.
 */
function summarizePreferenceMemory(trajectories, options = {}) {
  const limit = options.maxEntries ?? 20;
  const candidates = [];
  for (const trajectory of trajectories) {
    for (const label of allLabels(trajectory)) {
      const instruction = instructionFromLabel(trajectory, label);
      if (instruction) {
        candidates.push({
          instruction,
          rationale: label.reason ?? `${label.kind} label from ${label.source}`,
          weight: weightForLabel(label),
          sourceTrajectoryId: trajectory.id,
          sourceLabelId: label.id,
          category: label.kind
        });
      }
    }
  }
  const strongest = /* @__PURE__ */ new Map();
  candidates.forEach((candidate) => {
    const key = candidate.instruction.toLowerCase().replace(/\s+/g, " ").trim();
    const current = strongest.get(key);
    const replaces = !current || candidate.weight > current.weight;
    if (replaces) strongest.set(key, candidate);
  });
  const ranked = [...strongest.values()];
  ranked.sort((left, right) => right.weight - left.weight);
  return ranked.slice(0, limit);
}
|
|
2450
|
+
/**
 * Render preference entries as a Markdown document.
 *
 * The output starts with a "# Preference Memory" heading, followed by one
 * bullet per entry with its rationale and source trajectory id, and ends
 * with exactly one trailing newline.
 *
 * @param {object[]} entries - Entries with `instruction`, `rationale` and `sourceTrajectoryId`.
 * @returns {string} Markdown text.
 */
function renderPreferenceMemoryMarkdown(entries) {
  const body = entries.flatMap((entry) => [
    `- ${entry.instruction}`,
    ` Rationale: ${entry.rationale}`,
    ` Source: ${entry.sourceTrajectoryId}`,
    ""
  ]);
  const text = ["# Preference Memory", "", ...body].join("\n").trim();
  return `${text}\n`;
}
|
|
2460
|
+
/**
 * Serialize trajectories as JSONL: one key-sorted JSON object per line,
 * ordered by trajectory id, with a trailing newline. The input array is not mutated.
 *
 * @param {object[]} trajectories - Trajectories to serialize.
 * @returns {string} JSONL text (a single newline for an empty list).
 */
function serializeFeedbackTrajectoriesJsonl(trajectories) {
  const ordered = trajectories.slice();
  ordered.sort((left, right) => left.id.localeCompare(right.id));
  const rows = ordered.map((trajectory) => JSON.stringify(canonicalize(trajectory)));
  return rows.join("\n") + "\n";
}
|
|
2463
|
+
/**
 * Parse JSONL text into trajectories, ignoring blank lines.
 *
 * @param {string} jsonl - Newline-separated JSON documents.
 * @returns {object[]} Parsed values in input order.
 * @throws {SyntaxError} When a non-blank line is not valid JSON.
 */
function parseFeedbackTrajectoriesJsonl(jsonl) {
  return jsonl
    .split("\n")
    .filter((line) => line.trim())
    .map((line) => JSON.parse(line));
}
|
|
2471
|
+
/**
 * Convert a finished control-loop run into a feedback trajectory.
 *
 * Each step becomes an attempt. The run's pass/fail verdict becomes a single
 * system label ("approve" or "reject"), and the run's totals become the
 * outcome. The trajectory id is `run.runId` when set, otherwise
 * `ft_control_<hex>` derived from a hash of intent and creation time.
 *
 * @param {object} run - Control run (`runId`, `intent`, `steps`, `pass`, `reason`,
 *   `score`, `spentCostUsd`, `stoppedBy`, `failureClass`).
 * @param {object} [options] - `createdAt`, `projectId`, `scenarioId`, `artifactType`
 *   (default "action"), and optional `artifactFromStep` / `proposedActionFromStep` hooks.
 * @returns {object} The trajectory built by `createFeedbackTrajectory`.
 */
function controlRunToFeedbackTrajectory(run, options = {}) {
  const createdAt = options.createdAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const trajectoryId = run.runId ?? `ft_control_${stableHash(`${run.intent}|${createdAt}`).toString(16)}`;

  // Hooks are invoked through `options` so they keep it as their `this`.
  const toAttempt = (step) => ({
    id: `${trajectoryId}_step_${step.index}`,
    stepIndex: step.index,
    artifactType: options.artifactType ?? "action",
    artifact: options.artifactFromStep?.(step) ?? step.actionOutcome?.result ?? step.decision,
    proposedAction: options.proposedActionFromStep?.(step),
    evals: step.evalsAfter,
    createdAt: step.startedAt,
    metadata: {
      decision: step.decision,
      actionOutcome: step.actionOutcome
    }
  });

  const attempts = run.steps.map((step) => toAttempt(step));

  const verdictLabel = {
    source: "system",
    kind: run.pass ? "approve" : "reject",
    value: run.pass,
    reason: run.reason,
    severity: run.pass ? "info" : "error",
    createdAt
  };

  const outcome = {
    success: run.pass,
    score: run.score,
    costUsd: run.spentCostUsd,
    detail: run.reason,
    observedAt: createdAt,
    metadata: {
      stoppedBy: run.stoppedBy,
      failureClass: run.failureClass
    }
  };

  return createFeedbackTrajectory({
    id: trajectoryId,
    projectId: options.projectId,
    scenarioId: options.scenarioId,
    task: { intent: run.intent },
    createdAt,
    attempts,
    labels: [verdictLabel],
    outcome
  });
}
|
|
2516
|
+
/**
 * Collect trajectory-level labels followed by every attempt's feedback labels,
 * dropping duplicates (first occurrence wins).
 *
 * Two labels are duplicates when they share an `id`, or — for labels without
 * an id — the same source, kind, creation time and JSON-serialized value.
 *
 * @param {object} trajectory - Trajectory with `labels` and `attempts`.
 * @returns {object[]} Unique labels in encounter order.
 */
function allLabels(trajectory) {
  const attemptLabels = trajectory.attempts.flatMap((attempt) => attempt.feedback ?? []);
  const seen = /* @__PURE__ */ new Set();
  const unique = [];
  for (const label of [...trajectory.labels, ...attemptLabels]) {
    const key = label.id ?? `${label.source}|${label.kind}|${label.createdAt}|${JSON.stringify(label.value)}`;
    if (seen.has(key)) continue;
    seen.add(key);
    unique.push(label);
  }
  return unique;
}
|
|
2529
|
+
/**
 * Derive a score in [0, 1] from labels, rounded to three decimals.
 *
 * "approve" and "select" count as 1; "reject" and "policy_block" count as 0;
 * a "rate" label with a numeric value is clamped to [0, 1]. Other labels are
 * ignored, and so is a "rate" whose value is NaN — it would otherwise turn
 * the whole average into NaN.
 *
 * @param {object[]} labels - Labels to score.
 * @returns {number|undefined} Mean score, or `undefined` when no label is scorable.
 */
function scoreFromLabels(labels) {
  if (!labels.length) return void 0;
  const scored = labels.map((label) => {
    if (label.kind === "approve" || label.kind === "select") return 1;
    if (label.kind === "reject" || label.kind === "policy_block") return 0;
    if (label.kind === "rate" && typeof label.value === "number" && !Number.isNaN(label.value)) {
      return Math.max(0, Math.min(1, label.value));
    }
    return void 0;
  }).filter((value) => typeof value === "number");
  if (!scored.length) return void 0;
  return Math.round(scored.reduce((sum2, value) => sum2 + value, 0) / scored.length * 1e3) / 1e3;
}
|
|
2540
|
+
/**
 * Turn a label into a reusable instruction sentence.
 *
 * Only labels with a truthy `reason` and one of the kinds "reject",
 * "revision_request", "select", "approve" or "comment" produce an instruction.
 *
 * @param {object} trajectory - Owning trajectory; `task.intent` is quoted for some kinds.
 * @param {object} label - Label with `kind` and `reason`.
 * @returns {string|undefined} The instruction, or `undefined` when none applies.
 */
function instructionFromLabel(trajectory, label) {
  const { kind, reason } = label;
  if (!reason) return void 0;
  switch (kind) {
    case "reject":
      return `Avoid outputs like "${compact(trajectory.task.intent, 80)}" when: ${reason}`;
    case "revision_request":
      return `Revise similar work by applying: ${reason}`;
    case "select":
      return `Prefer selected options for "${compact(trajectory.task.intent, 80)}" because: ${reason}`;
    case "approve":
      return `Repeat the pattern approved for "${compact(trajectory.task.intent, 80)}": ${reason}`;
    case "comment":
      return reason;
    default:
      return void 0;
  }
}
|
|
2548
|
+
/**
 * Weight a label as severity rank times source rank.
 *
 * Severity: critical 4, error 3, warning 2, anything else 1.
 * Source: user 3, metric or environment 2, anything else 1.
 *
 * @param {{ severity?: string, source?: string }} label
 * @returns {number} Weight between 1 and 12.
 */
function weightForLabel(label) {
  let severityRank;
  switch (label.severity) {
    case "critical":
      severityRank = 4;
      break;
    case "error":
      severityRank = 3;
      break;
    case "warning":
      severityRank = 2;
      break;
    default:
      severityRank = 1;
  }
  let sourceRank;
  switch (label.source) {
    case "user":
      sourceRank = 3;
      break;
    case "metric":
    case "environment":
      sourceRank = 2;
      break;
    default:
      sourceRank = 1;
  }
  return severityRank * sourceRank;
}
|
|
2553
|
+
/**
 * Test a trajectory against a list filter.
 *
 * Each truthy filter field (`projectId`, `scenarioId`, `split`) must equal the
 * trajectory's field. A `tag` filter is a `[key, value]` pair that must match
 * `trajectory.tags[key]`. Falsy filter fields are ignored.
 *
 * @param {object} trajectory - Candidate trajectory.
 * @param {object} filter - Filter criteria.
 * @returns {boolean} True when every active criterion matches.
 */
function matchesFilter(trajectory, filter) {
  const scalarFields = ["projectId", "scenarioId", "split"];
  const scalarsMatch = scalarFields.every((field) => !filter[field] || trajectory[field] === filter[field]);
  if (!scalarsMatch) return false;
  if (!filter.tag) return true;
  const [key, value] = filter.tag;
  return trajectory.tags?.[key] === value;
}
|
|
2563
|
+
/**
 * Deep-copy a trajectory via a JSON round trip.
 *
 * Only JSON-representable data survives: properties holding `undefined` are
 * dropped from the copy.
 *
 * @param {object} trajectory - Value to copy.
 * @returns {object} Independent copy.
 */
function cloneTrajectory(trajectory) {
  const serialized = JSON.stringify(trajectory);
  return JSON.parse(serialized);
}
|
|
2566
|
+
/**
 * Collapse whitespace runs to single spaces, trim, and truncate to `max`
 * characters, appending "..." when truncated.
 *
 * @param {string} value - Text to compact.
 * @param {number} max - Maximum length before truncation.
 * @returns {string} Compacted text.
 */
function compact(value, max) {
  const normalized = value.replace(/\s+/g, " ").trim();
  if (normalized.length <= max) {
    return normalized;
  }
  const head = normalized.slice(0, max).trim();
  return `${head}...`;
}
|
|
2570
|
+
/**
 * 32-bit FNV-1a hash over the string's UTF-16 code units.
 *
 * @param {string} input - Text to hash.
 * @returns {number} Unsigned 32-bit integer.
 */
function stableHash(input) {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let hash = FNV_OFFSET_BASIS;
  for (let index = 0; index < input.length; index += 1) {
    // XOR the code unit in, then multiply by the prime with 32-bit wraparound.
    hash = Math.imul(hash ^ input.charCodeAt(index), FNV_PRIME);
  }
  return hash >>> 0;
}
|
|
2578
|
+
/**
 * Recursively copy a value with every object's keys in sorted order, so that
 * JSON serialization of the result does not depend on key order.
 *
 * @param {*} value - Value to canonicalize; primitives and null are returned as-is.
 * @returns {*} Key-sorted copy.
 */
function canonicalize(value) {
  if (Array.isArray(value)) {
    return value.map((item) => canonicalize(item));
  }
  if (value === null || typeof value !== "object") {
    return value;
  }
  return Object.keys(value).sort().reduce((out, key) => {
    out[key] = canonicalize(value[key]);
    return out;
  }, {});
}
|
|
2587
|
+
|
|
1389
2588
|
// src/prompt-registry.ts
|
|
1390
2589
|
var PromptRegistry = class {
|
|
1391
2590
|
entries = /* @__PURE__ */ new Map();
|
|
@@ -3053,231 +4252,53 @@ var FileSystemTraceStore = class {
|
|
|
3053
4252
|
}
|
|
3054
4253
|
}
|
|
3055
4254
|
} catch {
|
|
3056
|
-
}
|
|
3057
|
-
this.index = store;
|
|
3058
|
-
this.loaded = true;
|
|
3059
|
-
return store;
|
|
3060
|
-
}
|
|
3061
|
-
async appendRun(run) {
|
|
3062
|
-
await this.append("runs", run);
|
|
3063
|
-
}
|
|
3064
|
-
async updateRun(runId, patch) {
|
|
3065
|
-
await this.append("runs", { runId, ...patch, _update: true });
|
|
3066
|
-
if (this.index) await this.index.updateRun(runId, patch);
|
|
3067
|
-
}
|
|
3068
|
-
async appendSpan(span) {
|
|
3069
|
-
await this.append("spans", span);
|
|
3070
|
-
}
|
|
3071
|
-
async updateSpan(spanId, patch) {
|
|
3072
|
-
await this.append("spans", { spanId, ...patch, _update: true });
|
|
3073
|
-
if (this.index) await this.index.updateSpan(spanId, patch);
|
|
3074
|
-
}
|
|
3075
|
-
async appendEvent(event) {
|
|
3076
|
-
await this.append("events", event);
|
|
3077
|
-
}
|
|
3078
|
-
async appendArtifact(artifact) {
|
|
3079
|
-
await this.append("artifacts", artifact);
|
|
3080
|
-
}
|
|
3081
|
-
async appendBudgetEntry(entry) {
|
|
3082
|
-
await this.append("budget", entry);
|
|
3083
|
-
}
|
|
3084
|
-
async getRun(runId) {
|
|
3085
|
-
return (await this.load()).getRun(runId);
|
|
3086
|
-
}
|
|
3087
|
-
async listRuns(filter) {
|
|
3088
|
-
return (await this.load()).listRuns(filter);
|
|
3089
|
-
}
|
|
3090
|
-
async spans(filter) {
|
|
3091
|
-
return (await this.load()).spans(filter);
|
|
3092
|
-
}
|
|
3093
|
-
async events(filter) {
|
|
3094
|
-
return (await this.load()).events(filter);
|
|
3095
|
-
}
|
|
3096
|
-
async budget(runId) {
|
|
3097
|
-
return (await this.load()).budget(runId);
|
|
3098
|
-
}
|
|
3099
|
-
async artifacts(runId) {
|
|
3100
|
-
return (await this.load()).artifacts(runId);
|
|
3101
|
-
}
|
|
3102
|
-
};
|
|
3103
|
-
|
|
3104
|
-
// src/trace/emitter.ts
|
|
3105
|
-
var TraceEmitter = class {
|
|
3106
|
-
store;
|
|
3107
|
-
stack = [];
|
|
3108
|
-
_runId;
|
|
3109
|
-
now;
|
|
3110
|
-
id;
|
|
3111
|
-
constructor(store, options = {}) {
|
|
3112
|
-
this.store = store;
|
|
3113
|
-
this.now = options.now ?? (() => Date.now());
|
|
3114
|
-
this.id = options.id ?? (() => cryptoRandomId());
|
|
3115
|
-
this._runId = options.runId ?? this.id();
|
|
3116
|
-
}
|
|
3117
|
-
get runId() {
|
|
3118
|
-
return this._runId;
|
|
3119
|
-
}
|
|
3120
|
-
// ── Run lifecycle ──────────────────────────────────────────────────
|
|
3121
|
-
async startRun(run) {
|
|
3122
|
-
const full = { ...run, runId: this._runId, startedAt: this.now(), status: "running" };
|
|
3123
|
-
await this.store.appendRun(full);
|
|
3124
|
-
return full;
|
|
3125
|
-
}
|
|
3126
|
-
async endRun(outcome) {
|
|
3127
|
-
const status = outcome?.pass === false ? "failed" : "completed";
|
|
3128
|
-
await this.store.updateRun(this._runId, { endedAt: this.now(), status, outcome });
|
|
4255
|
+
}
|
|
4256
|
+
this.index = store;
|
|
4257
|
+
this.loaded = true;
|
|
4258
|
+
return store;
|
|
3129
4259
|
}
|
|
3130
|
-
async
|
|
3131
|
-
await this.
|
|
3132
|
-
endedAt: this.now(),
|
|
3133
|
-
status: "aborted",
|
|
3134
|
-
outcome: { pass: false, notes: reason }
|
|
3135
|
-
});
|
|
4260
|
+
async appendRun(run) {
|
|
4261
|
+
await this.append("runs", run);
|
|
3136
4262
|
}
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
const parent = init.parentSpanId ?? this.stack[this.stack.length - 1];
|
|
3141
|
-
const span = {
|
|
3142
|
-
spanId,
|
|
3143
|
-
parentSpanId: parent,
|
|
3144
|
-
runId: this._runId,
|
|
3145
|
-
startedAt: this.now(),
|
|
3146
|
-
...init
|
|
3147
|
-
};
|
|
3148
|
-
await this.store.appendSpan(span);
|
|
3149
|
-
this.stack.push(spanId);
|
|
3150
|
-
return this.handle(span);
|
|
4263
|
+
async updateRun(runId, patch) {
|
|
4264
|
+
await this.append("runs", { runId, ...patch, _update: true });
|
|
4265
|
+
if (this.index) await this.index.updateRun(runId, patch);
|
|
3151
4266
|
}
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
span,
|
|
3155
|
-
end: async (patch) => {
|
|
3156
|
-
const endedAt = this.now();
|
|
3157
|
-
await this.store.updateSpan(span.spanId, { endedAt, status: "ok", ...patch });
|
|
3158
|
-
this.pop(span.spanId);
|
|
3159
|
-
},
|
|
3160
|
-
fail: async (error, patch) => {
|
|
3161
|
-
const endedAt = this.now();
|
|
3162
|
-
const errStr = error instanceof Error ? error.message : error;
|
|
3163
|
-
await this.store.updateSpan(span.spanId, {
|
|
3164
|
-
endedAt,
|
|
3165
|
-
status: "error",
|
|
3166
|
-
error: errStr,
|
|
3167
|
-
...patch
|
|
3168
|
-
});
|
|
3169
|
-
this.pop(span.spanId);
|
|
3170
|
-
}
|
|
3171
|
-
};
|
|
4267
|
+
async appendSpan(span) {
|
|
4268
|
+
await this.append("spans", span);
|
|
3172
4269
|
}
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
if (
|
|
4270
|
+
async updateSpan(spanId, patch) {
|
|
4271
|
+
await this.append("spans", { spanId, ...patch, _update: true });
|
|
4272
|
+
if (this.index) await this.index.updateSpan(spanId, patch);
|
|
3176
4273
|
}
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
return this.span({ kind: "llm", ...init });
|
|
4274
|
+
async appendEvent(event) {
|
|
4275
|
+
await this.append("events", event);
|
|
3180
4276
|
}
|
|
3181
|
-
|
|
3182
|
-
|
|
4277
|
+
async appendArtifact(artifact) {
|
|
4278
|
+
await this.append("artifacts", artifact);
|
|
3183
4279
|
}
|
|
3184
|
-
|
|
3185
|
-
|
|
4280
|
+
async appendBudgetEntry(entry) {
|
|
4281
|
+
await this.append("budget", entry);
|
|
3186
4282
|
}
|
|
3187
|
-
async
|
|
3188
|
-
|
|
3189
|
-
const now = this.now();
|
|
3190
|
-
const full = {
|
|
3191
|
-
spanId,
|
|
3192
|
-
runId: this._runId,
|
|
3193
|
-
kind: "judge",
|
|
3194
|
-
startedAt: now,
|
|
3195
|
-
endedAt: now,
|
|
3196
|
-
status: "ok",
|
|
3197
|
-
...verdict
|
|
3198
|
-
};
|
|
3199
|
-
await this.store.appendSpan(full);
|
|
3200
|
-
return full;
|
|
4283
|
+
async getRun(runId) {
|
|
4284
|
+
return (await this.load()).getRun(runId);
|
|
3201
4285
|
}
|
|
3202
|
-
|
|
3203
|
-
return this.
|
|
4286
|
+
async listRuns(filter) {
|
|
4287
|
+
return (await this.load()).listRuns(filter);
|
|
3204
4288
|
}
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
const full = {
|
|
3208
|
-
eventId: this.id(),
|
|
3209
|
-
runId: this._runId,
|
|
3210
|
-
spanId: event.spanId ?? this.stack[this.stack.length - 1],
|
|
3211
|
-
kind: event.kind,
|
|
3212
|
-
timestamp: this.now(),
|
|
3213
|
-
payload: event.payload ?? {}
|
|
3214
|
-
};
|
|
3215
|
-
await this.store.appendEvent(full);
|
|
3216
|
-
return full;
|
|
4289
|
+
async spans(filter) {
|
|
4290
|
+
return (await this.load()).spans(filter);
|
|
3217
4291
|
}
|
|
3218
|
-
|
|
3219
|
-
|
|
3220
|
-
const full = {
|
|
3221
|
-
runId: this._runId,
|
|
3222
|
-
timestamp: entry.timestamp ?? this.now(),
|
|
3223
|
-
dimension: entry.dimension,
|
|
3224
|
-
limit: entry.limit,
|
|
3225
|
-
consumed: entry.consumed,
|
|
3226
|
-
remaining: entry.remaining,
|
|
3227
|
-
breached: entry.breached,
|
|
3228
|
-
spanId: entry.spanId ?? this.stack[this.stack.length - 1]
|
|
3229
|
-
};
|
|
3230
|
-
await this.store.appendBudgetEntry(full);
|
|
3231
|
-
if (full.breached) {
|
|
3232
|
-
await this.emit({
|
|
3233
|
-
kind: "budget_breach",
|
|
3234
|
-
spanId: full.spanId,
|
|
3235
|
-
payload: { dimension: full.dimension, limit: full.limit, consumed: full.consumed }
|
|
3236
|
-
});
|
|
3237
|
-
}
|
|
3238
|
-
return full;
|
|
4292
|
+
async events(filter) {
|
|
4293
|
+
return (await this.load()).events(filter);
|
|
3239
4294
|
}
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
const full = { artifactId: this.id(), runId: this._runId, ...artifact };
|
|
3243
|
-
await this.store.appendArtifact(full);
|
|
3244
|
-
return full;
|
|
4295
|
+
async budget(runId) {
|
|
4296
|
+
return (await this.load()).budget(runId);
|
|
3245
4297
|
}
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
* Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
|
|
3249
|
-
* Returns the fn's return value. Use this for the 95% case.
|
|
3250
|
-
*/
|
|
3251
|
-
async within(init, fn) {
|
|
3252
|
-
const handle = await this.span(init);
|
|
3253
|
-
try {
|
|
3254
|
-
const result = await fn(handle);
|
|
3255
|
-
await handle.end();
|
|
3256
|
-
return result;
|
|
3257
|
-
} catch (err) {
|
|
3258
|
-
await handle.fail(err instanceof Error ? err : String(err));
|
|
3259
|
-
throw err;
|
|
3260
|
-
}
|
|
4298
|
+
async artifacts(runId) {
|
|
4299
|
+
return (await this.load()).artifacts(runId);
|
|
3261
4300
|
}
|
|
3262
4301
|
};
|
|
3263
|
-
function cryptoRandomId() {
|
|
3264
|
-
if (typeof globalThis.crypto?.randomUUID === "function") return globalThis.crypto.randomUUID();
|
|
3265
|
-
return `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
|
|
3266
|
-
}
|
|
3267
|
-
function llmSpanFromProvider(args) {
|
|
3268
|
-
return {
|
|
3269
|
-
name: args.name ?? args.model,
|
|
3270
|
-
model: args.model,
|
|
3271
|
-
messages: args.messages,
|
|
3272
|
-
output: args.output,
|
|
3273
|
-
inputTokens: args.usage?.inputTokens,
|
|
3274
|
-
outputTokens: args.usage?.outputTokens,
|
|
3275
|
-
cachedTokens: args.usage?.cachedTokens,
|
|
3276
|
-
reasoningTokens: args.usage?.reasoningTokens,
|
|
3277
|
-
costUsd: args.costUsd,
|
|
3278
|
-
finishReason: args.finishReason
|
|
3279
|
-
};
|
|
3280
|
-
}
|
|
3281
4302
|
|
|
3282
4303
|
// src/sandbox-harness.ts
|
|
3283
4304
|
var vitestTestParser = {
|
|
@@ -3887,6 +4908,157 @@ function safeJson(x) {
|
|
|
3887
4908
|
}
|
|
3888
4909
|
}
|
|
3889
4910
|
|
|
4911
|
+
// src/propose-review-control.ts
|
|
4912
|
+
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";
|
|
4913
|
+
async function runProposeReviewAsControlLoop(config) {
|
|
4914
|
+
const maxShots = config.maxShots ?? 10;
|
|
4915
|
+
const confidenceFloor = config.confidenceFloor ?? 0.3;
|
|
4916
|
+
const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
|
|
4917
|
+
const memory = config.memory ?? inMemoryReviewStore();
|
|
4918
|
+
const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
|
|
4919
|
+
const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;
|
|
4920
|
+
let lowConfidenceStreak = 0;
|
|
4921
|
+
let current = {
|
|
4922
|
+
shot: 0,
|
|
4923
|
+
state: config.initialState,
|
|
4924
|
+
priorReview: null,
|
|
4925
|
+
verification: { pass: false },
|
|
4926
|
+
memory: await memory.load(),
|
|
4927
|
+
completed: false,
|
|
4928
|
+
reviewAvailable: false
|
|
4929
|
+
};
|
|
4930
|
+
return runAgentControlLoop({
|
|
4931
|
+
intent: config.goal,
|
|
4932
|
+
budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
|
|
4933
|
+
store: config.store,
|
|
4934
|
+
scenarioId: config.scenarioId ?? "propose-review-control",
|
|
4935
|
+
projectId: config.projectId,
|
|
4936
|
+
variantId: config.variantId,
|
|
4937
|
+
actionFailure: config.actionFailure ?? "stop",
|
|
4938
|
+
observe: () => current,
|
|
4939
|
+
validate: ({ state }) => [
|
|
4940
|
+
objectiveEval({
|
|
4941
|
+
id: "verification",
|
|
4942
|
+
passed: state.verification.pass,
|
|
4943
|
+
score: state.verification.score,
|
|
4944
|
+
severity: "critical",
|
|
4945
|
+
detail: state.verification.pass ? "verification passed" : `verification failed${state.verification.failingLayers?.length ? `: ${state.verification.failingLayers.join(", ")}` : ""}`
|
|
4946
|
+
})
|
|
4947
|
+
],
|
|
4948
|
+
shouldStop: ({ state }) => {
|
|
4949
|
+
if (state.verification.pass) {
|
|
4950
|
+
return { stop: true, pass: true, reason: "verification passed", score: state.verification.score };
|
|
4951
|
+
}
|
|
4952
|
+
if (state.completed) {
|
|
4953
|
+
return {
|
|
4954
|
+
stop: true,
|
|
4955
|
+
pass: false,
|
|
4956
|
+
reason: "reviewer stopped continuation",
|
|
4957
|
+
score: state.verification.score,
|
|
4958
|
+
failureClass: failureClassFromVerification(state.verification)
|
|
4959
|
+
};
|
|
4960
|
+
}
|
|
4961
|
+
return { stop: false, pass: false, reason: "verification still failing", score: state.verification.score };
|
|
4962
|
+
},
|
|
4963
|
+
decide: ({ state }) => ({
|
|
4964
|
+
type: "continue",
|
|
4965
|
+
action: { type: "propose-review-shot", shot: state.shot + 1 },
|
|
4966
|
+
reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
|
|
4967
|
+
}),
|
|
4968
|
+
act: async (action, ctx) => {
|
|
4969
|
+
const shot = action.shot;
|
|
4970
|
+
const proposeOut = await config.propose({
|
|
4971
|
+
shot,
|
|
4972
|
+
goal: config.goal,
|
|
4973
|
+
state: current.state,
|
|
4974
|
+
priorReview: current.priorReview,
|
|
4975
|
+
abortSignal: ctx.abortSignal,
|
|
4976
|
+
emitter: ctx.emitter
|
|
4977
|
+
});
|
|
4978
|
+
const nextState = proposeOut.state;
|
|
4979
|
+
const verification = await config.verify(nextState);
|
|
4980
|
+
let review = null;
|
|
4981
|
+
let reviewAvailable = false;
|
|
4982
|
+
let reviewError;
|
|
4983
|
+
let shouldContinue = !verification.pass;
|
|
4984
|
+
if (!verification.pass) {
|
|
4985
|
+
try {
|
|
4986
|
+
review = await config.review({
|
|
4987
|
+
shot,
|
|
4988
|
+
goal: config.goal,
|
|
4989
|
+
state: nextState,
|
|
4990
|
+
verification,
|
|
4991
|
+
traceSummary: proposeOut.traceSummary,
|
|
4992
|
+
memory: await memory.load()
|
|
4993
|
+
});
|
|
4994
|
+
reviewAvailable = true;
|
|
4995
|
+
shouldContinue = review.shouldContinue;
|
|
4996
|
+
lowConfidenceStreak = review.confidence <= confidenceFloor ? lowConfidenceStreak + 1 : 0;
|
|
4997
|
+
if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) shouldContinue = false;
|
|
4998
|
+
} catch (err) {
|
|
4999
|
+
reviewError = err instanceof Error ? err.message : String(err);
|
|
5000
|
+
review = current.priorReview ?? {
|
|
5001
|
+
observations: "Reviewer unavailable.",
|
|
5002
|
+
diagnosis: reviewError,
|
|
5003
|
+
nextShotInstruction: fallbackInstruction,
|
|
5004
|
+
shouldContinue: true,
|
|
5005
|
+
confidence: 0
|
|
5006
|
+
};
|
|
5007
|
+
shouldContinue = true;
|
|
5008
|
+
}
|
|
5009
|
+
} else {
|
|
5010
|
+
review = {
|
|
5011
|
+
observations: "Verification passed.",
|
|
5012
|
+
diagnosis: "No further revision needed.",
|
|
5013
|
+
nextShotInstruction: "",
|
|
5014
|
+
shouldContinue: false,
|
|
5015
|
+
confidence: 1
|
|
5016
|
+
};
|
|
5017
|
+
}
|
|
5018
|
+
const entry = {
|
|
5019
|
+
...review ?? {
|
|
5020
|
+
observations: "No review.",
|
|
5021
|
+
diagnosis: "",
|
|
5022
|
+
nextShotInstruction: fallbackInstruction,
|
|
5023
|
+
shouldContinue,
|
|
5024
|
+
confidence: 0
|
|
5025
|
+
},
|
|
5026
|
+
shot,
|
|
5027
|
+
timestamp: Date.now(),
|
|
5028
|
+
verification: {
|
|
5029
|
+
pass: verification.pass,
|
|
5030
|
+
score: verification.score,
|
|
5031
|
+
failingLayers: verification.failingLayers
|
|
5032
|
+
}
|
|
5033
|
+
};
|
|
5034
|
+
await memory.append(entry);
|
|
5035
|
+
current = {
|
|
5036
|
+
shot,
|
|
5037
|
+
state: nextState,
|
|
5038
|
+
priorReview: review,
|
|
5039
|
+
verification,
|
|
5040
|
+
traceSummary: proposeOut.traceSummary,
|
|
5041
|
+
memory: await memory.load(),
|
|
5042
|
+
completed: verification.pass || !shouldContinue,
|
|
5043
|
+
reviewAvailable,
|
|
5044
|
+
reviewError
|
|
5045
|
+
};
|
|
5046
|
+
return {
|
|
5047
|
+
state: nextState,
|
|
5048
|
+
verification,
|
|
5049
|
+
traceSummary: proposeOut.traceSummary,
|
|
5050
|
+
review,
|
|
5051
|
+
reviewAvailable,
|
|
5052
|
+
reviewError
|
|
5053
|
+
};
|
|
5054
|
+
}
|
|
5055
|
+
});
|
|
5056
|
+
}
|
|
5057
|
+
/**
 * Derives a failure-class label from a verification result.
 *
 * @param {object} verification - Result object; its `pass` and optional
 *   `failingLayers` properties are read.
 * @returns {"instruction_following" | "unknown" | undefined} `undefined` when
 *   `pass` is truthy; `"instruction_following"` when `failingLayers` has a
 *   truthy `length`; `"unknown"` otherwise.
 */
function controlFailureClassFromVerification(verification) {
  if (!verification.pass) {
    const failingLayerCount = verification.failingLayers?.length;
    if (failingLayerCount) {
      return "instruction_following";
    }
    return "unknown";
  }
  // A passing verification has no failure class.
  return undefined;
}
|
|
5061
|
+
|
|
3890
5062
|
// src/trace/schema.ts
|
|
3891
5063
|
var TRACE_SCHEMA_VERSION = "1.0.0";
|
|
3892
5064
|
var FAILURE_CLASSES = [
|
|
@@ -5210,7 +6382,7 @@ function assertNonNegative(n, name) {
|
|
|
5210
6382
|
|
|
5211
6383
|
// src/muffled-gate-scanner.ts
|
|
5212
6384
|
import { readFileSync as readFileSync2, existsSync as existsSync2, readdirSync, statSync } from "fs";
|
|
5213
|
-
import { join } from "path";
|
|
6385
|
+
import { join as join2 } from "path";
|
|
5214
6386
|
/**
 * Strips comment text from a single source line.
 *
 * Two textual rules are applied in order:
 *   1. everything from the first `//` to the end of the line is removed
 *      (this also cuts a `//` that sits inside a string literal);
 *   2. a line whose first non-blank character is `*` is blanked entirely.
 *
 * @param {string} line - One line of source text.
 * @returns {string} The line with the matched comment text removed.
 */
function codeOf(line) {
  const withoutLineComment = line.replace(/\/\/.*$/, "");
  const withoutStarLine = withoutLineComment.replace(/^\s*\*.*$/, "");
  return withoutStarLine;
}
|
|
@@ -5314,11 +6486,11 @@ var UNIVERSAL_FINDERS = [
|
|
|
5314
6486
|
function autoDeriveImporters(repoRoot, roots, extensions, importsContain) {
|
|
5315
6487
|
const matches2 = [];
|
|
5316
6488
|
const walk = (rel) => {
|
|
5317
|
-
const abs =
|
|
6489
|
+
const abs = join2(repoRoot, rel);
|
|
5318
6490
|
if (!existsSync2(abs)) return;
|
|
5319
6491
|
for (const entry of readdirSync(abs)) {
|
|
5320
|
-
const sub =
|
|
5321
|
-
const subAbs =
|
|
6492
|
+
const sub = join2(rel, entry);
|
|
6493
|
+
const subAbs = join2(repoRoot, sub);
|
|
5322
6494
|
let st;
|
|
5323
6495
|
try {
|
|
5324
6496
|
st = statSync(subAbs);
|
|
@@ -5347,7 +6519,7 @@ function scanForMuffledGates(opts) {
|
|
|
5347
6519
|
const findings = [];
|
|
5348
6520
|
const scanned = /* @__PURE__ */ new Set();
|
|
5349
6521
|
for (const file of opts.scanFiles) {
|
|
5350
|
-
const abs =
|
|
6522
|
+
const abs = join2(opts.repoRoot, file);
|
|
5351
6523
|
if (!existsSync2(abs)) continue;
|
|
5352
6524
|
const text = readFileSync2(abs, "utf8");
|
|
5353
6525
|
for (const find of opts.finders) findings.push(...find(file, text));
|
|
@@ -5362,7 +6534,7 @@ function scanForMuffledGates(opts) {
|
|
|
5362
6534
|
);
|
|
5363
6535
|
for (const file of importers) {
|
|
5364
6536
|
if (scanned.has(file)) continue;
|
|
5365
|
-
const abs =
|
|
6537
|
+
const abs = join2(opts.repoRoot, file);
|
|
5366
6538
|
if (!existsSync2(abs)) continue;
|
|
5367
6539
|
const text = readFileSync2(abs, "utf8");
|
|
5368
6540
|
for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text));
|
|
@@ -5557,7 +6729,7 @@ var Dataset = class _Dataset {
|
|
|
5557
6729
|
* Write to disk for contamination-verifiable archives.
|
|
5558
6730
|
*/
|
|
5559
6731
|
toJsonl() {
|
|
5560
|
-
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(
|
|
6732
|
+
return this.scenarios.slice().sort((a, b) => a.id.localeCompare(b.id)).map((s) => JSON.stringify(canonicalize2(s))).join("\n") + "\n";
|
|
5561
6733
|
}
|
|
5562
6734
|
static fromJsonl(jsonl, manifest) {
|
|
5563
6735
|
const scenarios = [];
|
|
@@ -5570,18 +6742,18 @@ var Dataset = class _Dataset {
|
|
|
5570
6742
|
}
|
|
5571
6743
|
};
|
|
5572
6744
|
/**
 * Computes a SHA-256 hex digest over a list of scenarios.
 *
 * The scenarios are copied, ordered by `id` using `localeCompare`,
 * canonicalized with `canonicalize2`, serialized as one JSON array and hashed
 * through Web Crypto.
 *
 * NOTE(review): `localeCompare` ordering may vary with the runtime's locale
 * data — confirm this is acceptable for cross-machine hash comparison.
 *
 * @param {Array<object>} scenarios - Items exposing a string `id`.
 * @returns {Promise<string>} Lowercase hexadecimal SHA-256 digest.
 */
async function hashScenarios(scenarios) {
  const byId = (left, right) => left.id.localeCompare(right.id);
  const ordered = scenarios.slice().sort(byId);
  const payload = JSON.stringify(ordered.map((scenario) => canonicalize2(scenario)));
  const encoded = new TextEncoder().encode(payload);
  const digest = await globalThis.crypto.subtle.digest("SHA-256", encoded);
  let hex = "";
  for (const byte of new Uint8Array(digest)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
|
|
5579
|
-
function
|
|
6751
|
+
/**
 * Recursively rebuilds a value so that every plain object has its keys in
 * sorted order, giving `JSON.stringify` a stable output.
 *
 * Primitives and `null` are returned as-is; arrays are mapped element by
 * element; any other object is copied key by key into a fresh `{}`.
 *
 * @param {*} v - Value to canonicalize.
 * @returns {*} The canonicalized copy (or the primitive itself).
 */
function canonicalize2(v) {
  const isContainer = v !== null && typeof v === "object";
  if (!isContainer) {
    return v;
  }
  if (Array.isArray(v)) {
    return v.map(canonicalize2);
  }
  const out = {};
  Object.keys(v)
    .sort()
    .forEach((key) => {
      out[key] = canonicalize2(v[key]);
    });
  return out;
}
|
|
5587
6759
|
function seededShuffle(items, seed) {
|
|
@@ -7350,7 +8522,7 @@ async function commitBisect(options) {
|
|
|
7350
8522
|
}
|
|
7351
8523
|
async function promptBisect(options) {
|
|
7352
8524
|
const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
|
|
7353
|
-
const
|
|
8525
|
+
const join4 = (paragraphs) => paragraphs.join("\n\n");
|
|
7354
8526
|
const goodParas = split(options.good);
|
|
7355
8527
|
const badParas = split(options.bad);
|
|
7356
8528
|
if (goodParas.length !== badParas.length) {
|
|
@@ -7368,7 +8540,7 @@ async function promptBisect(options) {
|
|
|
7368
8540
|
const result = await bisect({
|
|
7369
8541
|
good: goodMask,
|
|
7370
8542
|
bad: badMask,
|
|
7371
|
-
runEval: (mask) => options.runEval(
|
|
8543
|
+
runEval: (mask) => options.runEval(join4(paragraphsFor(mask))),
|
|
7372
8544
|
maxIterations: options.maxIterations ?? n + 5,
|
|
7373
8545
|
halfway: (g, b) => {
|
|
7374
8546
|
for (let i = 0; i < g.length; i++) {
|
|
@@ -7399,12 +8571,12 @@ async function promptBisect(options) {
|
|
|
7399
8571
|
}
|
|
7400
8572
|
}
|
|
7401
8573
|
const materializedPath = result.path.map((s) => ({
|
|
7402
|
-
state:
|
|
8574
|
+
state: join4(paragraphsFor(s.state)),
|
|
7403
8575
|
score: s.score,
|
|
7404
8576
|
pass: s.pass
|
|
7405
8577
|
}));
|
|
7406
8578
|
return {
|
|
7407
|
-
culprit:
|
|
8579
|
+
culprit: join4(paragraphsFor(culprit)),
|
|
7408
8580
|
path: materializedPath,
|
|
7409
8581
|
converged: result.converged,
|
|
7410
8582
|
inputInconsistent: result.inputInconsistent,
|
|
@@ -7615,7 +8787,7 @@ function attributeStep(op, prmA, prmB) {
|
|
|
7615
8787
|
|
|
7616
8788
|
// src/pre-registration.ts
|
|
7617
8789
|
async function signManifest(m) {
|
|
7618
|
-
const canonical =
|
|
8790
|
+
const canonical = canonicalize3(m);
|
|
7619
8791
|
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
7620
8792
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
7621
8793
|
const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
@@ -7645,12 +8817,12 @@ async function evaluateHypothesis(manifest, observed) {
|
|
|
7645
8817
|
rejectionReasons: reasons
|
|
7646
8818
|
};
|
|
7647
8819
|
}
|
|
7648
|
-
function
|
|
8820
|
+
/**
 * Produces a key-sorted deep copy of a value so its JSON serialization is
 * stable.
 *
 * Non-objects and `null` pass through untouched, arrays are canonicalized
 * element-wise, and other objects are rebuilt with their keys in sorted order.
 *
 * @param {*} v - Value to canonicalize.
 * @returns {*} Canonical copy, or the input itself when it is a primitive.
 */
function canonicalize3(v) {
  if (typeof v !== "object" || v === null) {
    return v;
  }
  if (Array.isArray(v)) {
    return v.map(canonicalize3);
  }
  const orderedKeys = Object.keys(v);
  orderedKeys.sort();
  const sorted = {};
  for (let i = 0; i < orderedKeys.length; i += 1) {
    const key = orderedKeys[i];
    sorted[key] = canonicalize3(v[key]);
  }
  return sorted;
}
|
|
7656
8828
|
|
|
@@ -8459,7 +9631,7 @@ function mergeSignals(a, b) {
|
|
|
8459
9631
|
// src/command-runner.ts
|
|
8460
9632
|
import { spawnSync } from "child_process";
|
|
8461
9633
|
import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync as readdirSync2, statSync as statSync2 } from "fs";
|
|
8462
|
-
import { join as
|
|
9634
|
+
import { join as join3 } from "path";
|
|
8463
9635
|
var localCommandRunner = {
|
|
8464
9636
|
name: "local",
|
|
8465
9637
|
async run(input) {
|
|
@@ -8506,7 +9678,7 @@ var localCommandRunner = {
|
|
|
8506
9678
|
const out = [];
|
|
8507
9679
|
for (const name of entries) {
|
|
8508
9680
|
try {
|
|
8509
|
-
const st = statSync2(
|
|
9681
|
+
const st = statSync2(join3(path, name));
|
|
8510
9682
|
out.push({
|
|
8511
9683
|
name,
|
|
8512
9684
|
isDirectory: st.isDirectory(),
|
|
@@ -12298,6 +13470,46 @@ function truncate3(s, max) {
|
|
|
12298
13470
|
/**
 * Backslash-escapes every backtick in a string.
 *
 * @param {string} s - Text to escape.
 * @returns {string} `s` with each "`" replaced by "\`".
 */
function quote(s) {
  const escapedBacktick = "\\`";
  return s.replace(/`/g, () => escapedBacktick);
}
|
|
13473
|
+
/**
 * Best-effort repair of JSON text that was cut off before it finished.
 *
 * The text is scanned once, tracking string/escape state and a stack of the
 * `{` / `[` openers that are still unclosed. The result is:
 *   - `raw` unchanged when nothing is left open;
 *   - `null` when a closing bracket does not match the innermost opener
 *     (the text is malformed, not merely truncated);
 *   - otherwise `raw` with the missing `"` and closing brackets appended.
 *
 * Before closing, the tail is tidied so the result has a chance to parse:
 *   - a dangling `\` or an incomplete `\uXXXX` escape at the end of an open
 *     string is dropped (appending `"` after a lone `\` would escape the quote
 *     instead of closing the string);
 *   - a trailing `,` outside a string is removed;
 *   - a trailing `:` outside a string gets a `null` value.
 *
 * Other truncation points (e.g. an object key with no `:` yet, or a partial
 * literal such as `tru`) are not repaired; the caller's `JSON.parse` still
 * rejects those.
 *
 * @param {string} raw - Possibly truncated JSON text.
 * @returns {string | null} Repaired text, the original text, or `null`.
 */
function autoCloseTruncatedJson(raw) {
  const stack = [];
  let inString = false;
  let escape = false;
  // Index of the most recent backslash that opened an escape in the current string.
  let escapeStart = -1;
  for (let i = 0; i < raw.length; i += 1) {
    const c = raw[i];
    if (escape) {
      escape = false;
      continue;
    }
    if (inString) {
      if (c === "\\") {
        escape = true;
        escapeStart = i;
      } else if (c === '"') {
        inString = false;
      }
      continue;
    }
    if (c === '"') {
      inString = true;
      escapeStart = -1;
      continue;
    }
    if (c === "{" || c === "[") {
      stack.push(c);
    } else if (c === "}") {
      if (stack.pop() !== "{") return null;
    } else if (c === "]") {
      if (stack.pop() !== "[") return null;
    }
  }
  if (stack.length === 0 && !inString) return raw;

  let body = raw;
  let suffix = "";
  if (inString) {
    // `\uXXXX` spans six characters; fewer means the escape was cut short.
    const unicodeEscapeCut =
      escapeStart >= 0 && raw[escapeStart + 1] === "u" && raw.length < escapeStart + 6;
    if (escape || unicodeEscapeCut) body = raw.slice(0, escapeStart);
    suffix += '"';
  } else {
    const trimmed = raw.trimEnd();
    if (trimmed.endsWith(",")) {
      body = trimmed.slice(0, -1);
    } else if (trimmed.endsWith(":")) {
      body = `${trimmed}null`;
    }
  }
  while (stack.length > 0) {
    suffix += stack.pop() === "{" ? "}" : "]";
  }
  return body + suffix;
}
|
|
12301
13513
|
function parseReflectionResponse(raw, maxProposals) {
|
|
12302
13514
|
let text = raw.trim();
|
|
12303
13515
|
if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
|
|
@@ -12322,6 +13534,18 @@ function parseReflectionResponse(raw, maxProposals) {
|
|
|
12322
13534
|
} catch {
|
|
12323
13535
|
}
|
|
12324
13536
|
}
|
|
13537
|
+
if (parsed == null) {
|
|
13538
|
+
for (const slice of candidates) {
|
|
13539
|
+
const closed = autoCloseTruncatedJson(slice);
|
|
13540
|
+
if (closed != null && closed !== slice) {
|
|
13541
|
+
try {
|
|
13542
|
+
parsed = JSON.parse(closed);
|
|
13543
|
+
break;
|
|
13544
|
+
} catch {
|
|
13545
|
+
}
|
|
13546
|
+
}
|
|
13547
|
+
}
|
|
13548
|
+
}
|
|
12325
13549
|
if (parsed == null) return [];
|
|
12326
13550
|
let proposalsRaw;
|
|
12327
13551
|
if (Array.isArray(parsed)) {
|
|
@@ -12374,6 +13598,7 @@ export {
|
|
|
12374
13598
|
ExperimentTracker,
|
|
12375
13599
|
FAILURE_CLASSES,
|
|
12376
13600
|
FileSystemExperimentStore,
|
|
13601
|
+
FileSystemFeedbackTrajectoryStore,
|
|
12377
13602
|
FileSystemOutcomeStore,
|
|
12378
13603
|
FileSystemTraceStore,
|
|
12379
13604
|
HeldOutGate,
|
|
@@ -12381,6 +13606,7 @@ export {
|
|
|
12381
13606
|
HoldoutLockedError,
|
|
12382
13607
|
INTENT_MATCH_JUDGE_VERSION,
|
|
12383
13608
|
InMemoryExperimentStore,
|
|
13609
|
+
InMemoryFeedbackTrajectoryStore,
|
|
12384
13610
|
InMemoryOutcomeStore,
|
|
12385
13611
|
InMemoryTraceStore,
|
|
12386
13612
|
InMemoryTrialCache,
|
|
@@ -12420,9 +13646,11 @@ export {
|
|
|
12420
13646
|
adversarialJudge,
|
|
12421
13647
|
aggregateLlm,
|
|
12422
13648
|
aggregateRunScore,
|
|
13649
|
+
allCriticalPassed,
|
|
12423
13650
|
analyzeAntiSlop,
|
|
12424
13651
|
analyzeSeries,
|
|
12425
13652
|
argHash,
|
|
13653
|
+
assignFeedbackSplit,
|
|
12426
13654
|
attributeCounterfactuals,
|
|
12427
13655
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
12428
13656
|
benchmarks_exports as benchmarks,
|
|
@@ -12460,6 +13688,8 @@ export {
|
|
|
12460
13688
|
computeToolUseMetrics,
|
|
12461
13689
|
confidenceInterval,
|
|
12462
13690
|
containsAll,
|
|
13691
|
+
controlFailureClassFromVerification,
|
|
13692
|
+
controlRunToFeedbackTrajectory,
|
|
12463
13693
|
correlateLayers,
|
|
12464
13694
|
correlationStudy,
|
|
12465
13695
|
createAntiSlopJudge,
|
|
@@ -12467,6 +13697,7 @@ export {
|
|
|
12467
13697
|
createCustomJudge,
|
|
12468
13698
|
createDefaultReviewer,
|
|
12469
13699
|
createDomainExpertJudge,
|
|
13700
|
+
createFeedbackTrajectory,
|
|
12470
13701
|
createIntentMatchJudge,
|
|
12471
13702
|
createLlmReviewer,
|
|
12472
13703
|
createSandboxCodeMutator,
|
|
@@ -12495,6 +13726,10 @@ export {
|
|
|
12495
13726
|
extractAssetUrls,
|
|
12496
13727
|
extractErrorCount,
|
|
12497
13728
|
failureClusterView,
|
|
13729
|
+
feedbackTrajectoriesToDatasetScenarios,
|
|
13730
|
+
feedbackTrajectoriesToOptimizerRows,
|
|
13731
|
+
feedbackTrajectoryToDatasetScenario,
|
|
13732
|
+
feedbackTrajectoryToOptimizerRow,
|
|
12498
13733
|
fileContains,
|
|
12499
13734
|
fileExists,
|
|
12500
13735
|
findAutoMatchNoExpectation,
|
|
@@ -12549,6 +13784,7 @@ export {
|
|
|
12549
13784
|
nonRefusalRubric,
|
|
12550
13785
|
normalizeScores,
|
|
12551
13786
|
notBlocked,
|
|
13787
|
+
objectiveEval,
|
|
12552
13788
|
outputLengthRubric,
|
|
12553
13789
|
pairedBootstrap,
|
|
12554
13790
|
pairedTTest,
|
|
@@ -12557,6 +13793,7 @@ export {
|
|
|
12557
13793
|
paretoChart,
|
|
12558
13794
|
paretoFrontier,
|
|
12559
13795
|
paretoFrontierWithCrowding,
|
|
13796
|
+
parseFeedbackTrajectoriesJsonl,
|
|
12560
13797
|
parseReflectionResponse,
|
|
12561
13798
|
parseRunRecordSafe,
|
|
12562
13799
|
partialCredit,
|
|
@@ -12583,6 +13820,7 @@ export {
|
|
|
12583
13820
|
renderMarkdown,
|
|
12584
13821
|
renderMarkdownReport,
|
|
12585
13822
|
renderPlaybookMarkdown,
|
|
13823
|
+
renderPreferenceMemoryMarkdown,
|
|
12586
13824
|
renderSteeringText,
|
|
12587
13825
|
replayScorerOverCorpus,
|
|
12588
13826
|
replayTraceThroughJudge,
|
|
@@ -12592,6 +13830,7 @@ export {
|
|
|
12592
13830
|
roundTripRunRecord,
|
|
12593
13831
|
rowCount,
|
|
12594
13832
|
rowWhere,
|
|
13833
|
+
runAgentControlLoop,
|
|
12595
13834
|
runAssertions,
|
|
12596
13835
|
runCanaries,
|
|
12597
13836
|
runCounterfactual,
|
|
@@ -12605,6 +13844,7 @@ export {
|
|
|
12605
13844
|
runKeywordCoverageJudgeUrl,
|
|
12606
13845
|
runPromptEvolution,
|
|
12607
13846
|
runProposeReview,
|
|
13847
|
+
runProposeReviewAsControlLoop,
|
|
12608
13848
|
runReferenceReplay,
|
|
12609
13849
|
runSelfPlay,
|
|
12610
13850
|
runSemanticConceptJudge,
|
|
@@ -12621,13 +13861,18 @@ export {
|
|
|
12621
13861
|
selectHarnessVariant,
|
|
12622
13862
|
selfPreference,
|
|
12623
13863
|
sentenceReorderMutator,
|
|
13864
|
+
serializeFeedbackTrajectoriesJsonl,
|
|
12624
13865
|
signManifest,
|
|
12625
13866
|
soc2Report,
|
|
12626
13867
|
statusAdvanced,
|
|
13868
|
+
stopOnNoProgress,
|
|
13869
|
+
stopOnRepeatedAction,
|
|
12627
13870
|
stripFencedJson,
|
|
12628
13871
|
stuckLoopView,
|
|
13872
|
+
subjectiveEval,
|
|
12629
13873
|
summarize,
|
|
12630
13874
|
summarizeHarnessResults,
|
|
13875
|
+
summarizePreferenceMemory,
|
|
12631
13876
|
summaryTable,
|
|
12632
13877
|
testJudge,
|
|
12633
13878
|
textInSnapshot,
|
|
@@ -12653,6 +13898,7 @@ export {
|
|
|
12653
13898
|
welchsTTest,
|
|
12654
13899
|
whitespaceCollapseMutator,
|
|
12655
13900
|
wilcoxonSignedRank,
|
|
13901
|
+
withAssignedFeedbackSplit,
|
|
12656
13902
|
wranglerDeployRunner
|
|
12657
13903
|
};
|
|
12658
13904
|
//# sourceMappingURL=index.js.map
|