@tangle-network/agent-eval 0.23.1 → 0.25.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +145 -0
- package/README.md +212 -79
- package/dist/baseline-4R5deP0N.d.ts +108 -0
- package/dist/benchmarks/index.d.ts +3 -2
- package/dist/benchmarks/index.js +1 -1
- package/dist/builder-eval/index.d.ts +249 -0
- package/dist/builder-eval/index.js +391 -0
- package/dist/builder-eval/index.js.map +1 -0
- package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
- package/dist/chunk-2A5XJB43.js.map +1 -0
- package/dist/chunk-47X6LRCE.js +76 -0
- package/dist/chunk-47X6LRCE.js.map +1 -0
- package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
- package/dist/chunk-4F5DQN55.js.map +1 -0
- package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
- package/dist/chunk-4S4BM3QQ.js.map +1 -0
- package/dist/chunk-5BKGXME7.js +65 -0
- package/dist/chunk-5BKGXME7.js.map +1 -0
- package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
- package/dist/chunk-6QDKWHLS.js.map +1 -0
- package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
- package/dist/chunk-EDUKQ5AM.js.map +1 -0
- package/dist/chunk-I4MBDTY5.js +272 -0
- package/dist/chunk-I4MBDTY5.js.map +1 -0
- package/dist/chunk-JLZQWFV3.js +618 -0
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/chunk-K2TPS5LB.js +569 -0
- package/dist/chunk-K2TPS5LB.js.map +1 -0
- package/dist/chunk-KKHDIONI.js +414 -0
- package/dist/chunk-KKHDIONI.js.map +1 -0
- package/dist/chunk-KMPRBJK4.js +74 -0
- package/dist/chunk-KMPRBJK4.js.map +1 -0
- package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
- package/dist/chunk-KTGTIOFD.js.map +1 -0
- package/dist/chunk-LSH4MMOZ.js +838 -0
- package/dist/chunk-LSH4MMOZ.js.map +1 -0
- package/dist/chunk-NG236HPC.js +57 -0
- package/dist/chunk-NG236HPC.js.map +1 -0
- package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
- package/dist/chunk-NLMNWKVM.js.map +1 -0
- package/dist/chunk-NU65VQ7M.js +99 -0
- package/dist/chunk-NU65VQ7M.js.map +1 -0
- package/dist/chunk-OWLAAMME.js +250 -0
- package/dist/chunk-OWLAAMME.js.map +1 -0
- package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
- package/dist/chunk-PC4UYEBM.js.map +1 -0
- package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
- package/dist/chunk-RAF443UI.js.map +1 -0
- package/dist/chunk-RZTMDUO7.js +49 -0
- package/dist/chunk-RZTMDUO7.js.map +1 -0
- package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
- package/dist/chunk-SESZDQPX.js.map +1 -0
- package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
- package/dist/chunk-TVVP3ZZQ.js.map +1 -0
- package/dist/chunk-WWYCWKUM.js +196 -0
- package/dist/chunk-WWYCWKUM.js.map +1 -0
- package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
- package/dist/chunk-YRZ4M5GS.js.map +1 -0
- package/dist/chunk-ZN274SWR.js +613 -0
- package/dist/chunk-ZN274SWR.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
- package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
- package/dist/control.d.ts +8 -6
- package/dist/control.js +10 -7
- package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
- package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
- package/dist/errors-BZ9sTdz7.d.ts +70 -0
- package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
- package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
- package/dist/governance/index.d.ts +5 -0
- package/dist/governance/index.js +18 -0
- package/dist/governance/index.js.map +1 -0
- package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
- package/dist/index-Oj9fAPPN.d.ts +270 -0
- package/dist/index.d.ts +2018 -3003
- package/dist/index.js +7443 -9102
- package/dist/index.js.map +1 -1
- package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
- package/dist/knowledge/index.d.ts +102 -0
- package/dist/knowledge/index.js +18 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/meta-eval/index.d.ts +99 -0
- package/dist/meta-eval/index.js +324 -0
- package/dist/meta-eval/index.js.map +1 -0
- package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +11 -8
- package/dist/optimization.js +11 -9
- package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
- package/dist/pipelines/index.d.ts +172 -0
- package/dist/pipelines/index.js +345 -0
- package/dist/pipelines/index.js.map +1 -0
- package/dist/prm/index.d.ts +99 -0
- package/dist/prm/index.js +222 -0
- package/dist/prm/index.js.map +1 -0
- package/dist/query-DODUYdPg.d.ts +30 -0
- package/dist/release-report-BNgMdqPF.d.ts +292 -0
- package/dist/replay-BL96gCEP.d.ts +226 -0
- package/dist/reporting.d.ts +10 -295
- package/dist/reporting.js +10 -6
- package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
- package/dist/rl.d.ts +1762 -8
- package/dist/rl.js +2035 -58
- package/dist/rl.js.map +1 -1
- package/dist/rubric-D5tjHNJQ.d.ts +72 -0
- package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
- package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
- package/dist/sequential-Dgz1n51-.d.ts +139 -0
- package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
- package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
- package/dist/telemetry/file.js +4 -1
- package/dist/telemetry/file.js.map +1 -1
- package/dist/telemetry/index.js +57 -57
- package/dist/telemetry/index.js.map +1 -1
- package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
- package/dist/traces.d.ts +142 -387
- package/dist/traces.js +1302 -40
- package/dist/traces.js.map +1 -1
- package/dist/trajectory-CnoBo-JY.d.ts +32 -0
- package/dist/wire/index.d.ts +369 -25
- package/dist/wire/index.js +22 -3
- package/package.json +44 -18
- package/dist/chunk-42I2QC2L.js.map +0 -1
- package/dist/chunk-5IIQKMD5.js.map +0 -1
- package/dist/chunk-6KQG5HAH.js.map +0 -1
- package/dist/chunk-6M774GY6.js.map +0 -1
- package/dist/chunk-7EAUOUQS.js.map +0 -1
- package/dist/chunk-AXHNWLIX.js.map +0 -1
- package/dist/chunk-EXGR4XEM.js.map +0 -1
- package/dist/chunk-IOXMGMHQ.js.map +0 -1
- package/dist/chunk-KAO3Q65R.js.map +0 -1
- package/dist/chunk-LZKIOBG2.js +0 -2026
- package/dist/chunk-LZKIOBG2.js.map +0 -1
- package/dist/chunk-QBW3YBTR.js.map +0 -1
- package/dist/chunk-QUKKGHTZ.js.map +0 -1
- package/dist/chunk-SQQLHODJ.js.map +0 -1
- package/dist/chunk-V5QSWN7L.js +0 -1310
- package/dist/chunk-V5QSWN7L.js.map +0 -1
- package/dist/chunk-VQQSPGSM.js.map +0 -1
- package/dist/chunk-XPHOZPOM.js +0 -1947
- package/dist/chunk-XPHOZPOM.js.map +0 -1
- package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
- package/dist/index-ekBXweiQ.d.ts +0 -1894
- package/dist/sequential-DgU2mFsE.d.ts +0 -304
package/dist/chunk-V5QSWN7L.js
DELETED
|
@@ -1,1310 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
validateRunRecord
|
|
3
|
-
} from "./chunk-QBW3YBTR.js";
|
|
4
|
-
import {
|
|
5
|
-
TraceEmitter
|
|
6
|
-
} from "./chunk-5IIQKMD5.js";
|
|
7
|
-
|
|
8
|
-
// src/control-runtime.ts
|
|
9
|
-
var DEFAULT_BUDGET = {
|
|
10
|
-
maxSteps: 8,
|
|
11
|
-
maxWallMs: 5 * 60 * 1e3
|
|
12
|
-
};
|
|
13
|
-
async function runAgentControlLoop(config) {
|
|
14
|
-
const budget = normalizeBudget(config.budget);
|
|
15
|
-
const actionFailure = config.actionFailure ?? "continue";
|
|
16
|
-
const controller = new AbortController();
|
|
17
|
-
const upstreamAbort = () => controller.abort(config.signal?.reason);
|
|
18
|
-
if (config.signal) {
|
|
19
|
-
if (config.signal.aborted) controller.abort(config.signal.reason);
|
|
20
|
-
else config.signal.addEventListener("abort", upstreamAbort, { once: true });
|
|
21
|
-
}
|
|
22
|
-
const started = Date.now();
|
|
23
|
-
const wallTimer = budget.maxWallMs ? setTimeout(() => controller.abort(new Error("control runtime wall timeout")), budget.maxWallMs) : void 0;
|
|
24
|
-
const history = [];
|
|
25
|
-
const emitter = config.store ? new TraceEmitter(config.store) : void 0;
|
|
26
|
-
let spentCostUsd = 0;
|
|
27
|
-
const runtimeErrors = [];
|
|
28
|
-
let lastStateFingerprint;
|
|
29
|
-
let lastActionFingerprint;
|
|
30
|
-
let noProgressStreak = 0;
|
|
31
|
-
let repeatedActionStreak = 0;
|
|
32
|
-
try {
|
|
33
|
-
if (emitter) {
|
|
34
|
-
await runTrace(runtimeErrors, 0, () => emitter.startRun({
|
|
35
|
-
scenarioId: config.scenarioId ?? "agent-control-loop",
|
|
36
|
-
projectId: config.projectId,
|
|
37
|
-
variantId: config.variantId,
|
|
38
|
-
layer: "meta",
|
|
39
|
-
tags: {
|
|
40
|
-
intent: config.intent.slice(0, 120),
|
|
41
|
-
maxSteps: String(budget.maxSteps),
|
|
42
|
-
...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
|
|
43
|
-
}
|
|
44
|
-
}));
|
|
45
|
-
}
|
|
46
|
-
let state;
|
|
47
|
-
let evals;
|
|
48
|
-
try {
|
|
49
|
-
state = await config.observe({ history, abortSignal: controller.signal });
|
|
50
|
-
} catch (err) {
|
|
51
|
-
const error = runtimeError("observe", 0, err);
|
|
52
|
-
runtimeErrors.push(error);
|
|
53
|
-
return finish(emitter, {
|
|
54
|
-
intent: config.intent,
|
|
55
|
-
pass: false,
|
|
56
|
-
completed: false,
|
|
57
|
-
reason: error.message,
|
|
58
|
-
steps: history,
|
|
59
|
-
finalState: void 0,
|
|
60
|
-
finalEvals: [],
|
|
61
|
-
wallMs: Date.now() - started,
|
|
62
|
-
spentCostUsd,
|
|
63
|
-
runId: emitter?.runId ?? null,
|
|
64
|
-
failureClass: "unknown",
|
|
65
|
-
runtimeErrors,
|
|
66
|
-
stoppedBy: "runtime-error"
|
|
67
|
-
});
|
|
68
|
-
}
|
|
69
|
-
try {
|
|
70
|
-
evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
|
|
71
|
-
await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
|
|
72
|
-
} catch (err) {
|
|
73
|
-
const error = runtimeError("validate", 0, err);
|
|
74
|
-
runtimeErrors.push(error);
|
|
75
|
-
return finish(emitter, {
|
|
76
|
-
intent: config.intent,
|
|
77
|
-
pass: false,
|
|
78
|
-
completed: false,
|
|
79
|
-
reason: error.message,
|
|
80
|
-
steps: history,
|
|
81
|
-
finalState: state,
|
|
82
|
-
finalEvals: [],
|
|
83
|
-
wallMs: Date.now() - started,
|
|
84
|
-
spentCostUsd,
|
|
85
|
-
runId: emitter?.runId ?? null,
|
|
86
|
-
failureClass: "unknown",
|
|
87
|
-
runtimeErrors,
|
|
88
|
-
stoppedBy: "runtime-error"
|
|
89
|
-
});
|
|
90
|
-
}
|
|
91
|
-
lastStateFingerprint = fingerprintState(state, config.stopPolicies);
|
|
92
|
-
for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
|
|
93
|
-
if (controller.signal.aborted) {
|
|
94
|
-
return finish(emitter, {
|
|
95
|
-
intent: config.intent,
|
|
96
|
-
pass: false,
|
|
97
|
-
completed: false,
|
|
98
|
-
reason: abortReason(controller.signal),
|
|
99
|
-
score: void 0,
|
|
100
|
-
steps: history,
|
|
101
|
-
finalState: state,
|
|
102
|
-
finalEvals: evals,
|
|
103
|
-
wallMs: Date.now() - started,
|
|
104
|
-
spentCostUsd,
|
|
105
|
-
runId: emitter?.runId ?? null,
|
|
106
|
-
failureClass: "timeout",
|
|
107
|
-
runtimeErrors,
|
|
108
|
-
stoppedBy: "abort"
|
|
109
|
-
});
|
|
110
|
-
}
|
|
111
|
-
const budgetStop = budgetStopDecision(budget, spentCostUsd);
|
|
112
|
-
if (budgetStop.stop) {
|
|
113
|
-
return finish(emitter, {
|
|
114
|
-
intent: config.intent,
|
|
115
|
-
pass: false,
|
|
116
|
-
completed: false,
|
|
117
|
-
reason: budgetStop.reason,
|
|
118
|
-
score: averageScore(evals),
|
|
119
|
-
steps: history,
|
|
120
|
-
finalState: state,
|
|
121
|
-
finalEvals: evals,
|
|
122
|
-
wallMs: Date.now() - started,
|
|
123
|
-
spentCostUsd,
|
|
124
|
-
runId: emitter?.runId ?? null,
|
|
125
|
-
failureClass: "budget_exceeded",
|
|
126
|
-
runtimeErrors,
|
|
127
|
-
stoppedBy: "budget"
|
|
128
|
-
});
|
|
129
|
-
}
|
|
130
|
-
const ctx = makeContext(config.intent, state, evals, history, budget, stepIndex, started, spentCostUsd, controller.signal, emitter);
|
|
131
|
-
let stop;
|
|
132
|
-
try {
|
|
133
|
-
stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
|
|
134
|
-
} catch (err) {
|
|
135
|
-
runtimeErrors.push(runtimeError("stop-policy", stepIndex, err));
|
|
136
|
-
return finish(emitter, {
|
|
137
|
-
intent: config.intent,
|
|
138
|
-
pass: false,
|
|
139
|
-
completed: false,
|
|
140
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
141
|
-
score: averageScore(evals),
|
|
142
|
-
steps: history,
|
|
143
|
-
finalState: state,
|
|
144
|
-
finalEvals: evals,
|
|
145
|
-
wallMs: Date.now() - started,
|
|
146
|
-
spentCostUsd,
|
|
147
|
-
runId: emitter?.runId ?? null,
|
|
148
|
-
failureClass: "unknown",
|
|
149
|
-
runtimeErrors,
|
|
150
|
-
stoppedBy: "runtime-error"
|
|
151
|
-
});
|
|
152
|
-
}
|
|
153
|
-
if (stop.stop) {
|
|
154
|
-
return finish(emitter, {
|
|
155
|
-
intent: config.intent,
|
|
156
|
-
pass: stop.pass,
|
|
157
|
-
completed: true,
|
|
158
|
-
reason: stop.reason,
|
|
159
|
-
score: stop.score,
|
|
160
|
-
steps: history,
|
|
161
|
-
finalState: state,
|
|
162
|
-
finalEvals: evals,
|
|
163
|
-
wallMs: Date.now() - started,
|
|
164
|
-
spentCostUsd,
|
|
165
|
-
runId: emitter?.runId ?? null,
|
|
166
|
-
failureClass: stop.failureClass,
|
|
167
|
-
runtimeErrors,
|
|
168
|
-
stoppedBy: "stop-policy"
|
|
169
|
-
});
|
|
170
|
-
}
|
|
171
|
-
let decision;
|
|
172
|
-
try {
|
|
173
|
-
decision = await config.decide(ctx);
|
|
174
|
-
} catch (err) {
|
|
175
|
-
runtimeErrors.push(runtimeError("decide", stepIndex, err));
|
|
176
|
-
return finish(emitter, {
|
|
177
|
-
intent: config.intent,
|
|
178
|
-
pass: false,
|
|
179
|
-
completed: false,
|
|
180
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
181
|
-
score: averageScore(evals),
|
|
182
|
-
steps: history,
|
|
183
|
-
finalState: state,
|
|
184
|
-
finalEvals: evals,
|
|
185
|
-
wallMs: Date.now() - started,
|
|
186
|
-
spentCostUsd,
|
|
187
|
-
runId: emitter?.runId ?? null,
|
|
188
|
-
failureClass: "unknown",
|
|
189
|
-
runtimeErrors,
|
|
190
|
-
stoppedBy: "runtime-error"
|
|
191
|
-
});
|
|
192
|
-
}
|
|
193
|
-
if (decision.type === "stop") {
|
|
194
|
-
return finish(emitter, {
|
|
195
|
-
intent: config.intent,
|
|
196
|
-
pass: decision.pass ?? false,
|
|
197
|
-
completed: true,
|
|
198
|
-
reason: decision.reason,
|
|
199
|
-
score: decision.score,
|
|
200
|
-
steps: history,
|
|
201
|
-
finalState: state,
|
|
202
|
-
finalEvals: evals,
|
|
203
|
-
wallMs: Date.now() - started,
|
|
204
|
-
spentCostUsd,
|
|
205
|
-
runId: emitter?.runId ?? null,
|
|
206
|
-
failureClass: decision.pass === false ? "unknown" : void 0,
|
|
207
|
-
runtimeErrors,
|
|
208
|
-
stoppedBy: "policy"
|
|
209
|
-
});
|
|
210
|
-
}
|
|
211
|
-
const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
|
|
212
|
-
repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
|
|
213
|
-
lastActionFingerprint = actionFingerprint;
|
|
214
|
-
const repeatedActionStop = repeatedActionStopDecision(config.stopPolicies, repeatedActionStreak);
|
|
215
|
-
if (repeatedActionStop.stop) {
|
|
216
|
-
return finish(emitter, {
|
|
217
|
-
intent: config.intent,
|
|
218
|
-
pass: false,
|
|
219
|
-
completed: true,
|
|
220
|
-
reason: repeatedActionStop.reason,
|
|
221
|
-
score: averageScore(evals),
|
|
222
|
-
steps: history,
|
|
223
|
-
finalState: state,
|
|
224
|
-
finalEvals: evals,
|
|
225
|
-
wallMs: Date.now() - started,
|
|
226
|
-
spentCostUsd,
|
|
227
|
-
runId: emitter?.runId ?? null,
|
|
228
|
-
failureClass: "tool_recovery_failure",
|
|
229
|
-
runtimeErrors,
|
|
230
|
-
stoppedBy: "stop-policy"
|
|
231
|
-
});
|
|
232
|
-
}
|
|
233
|
-
const beforeState = state;
|
|
234
|
-
const evalsBefore = evals;
|
|
235
|
-
const scoreBefore = averageScore(evals);
|
|
236
|
-
const actionStarted = Date.now();
|
|
237
|
-
const stepHandle = emitter ? await runTrace(runtimeErrors, stepIndex, () => emitter.tool({
|
|
238
|
-
name: `control-step-${stepIndex}`,
|
|
239
|
-
toolName: "agent-control-action",
|
|
240
|
-
args: decision.action,
|
|
241
|
-
attributes: {
|
|
242
|
-
decision: decision.reason ?? "continue",
|
|
243
|
-
repeatedActionStreak
|
|
244
|
-
}
|
|
245
|
-
})) : void 0;
|
|
246
|
-
let actionOutcome;
|
|
247
|
-
try {
|
|
248
|
-
const result = await config.act(decision.action, ctx);
|
|
249
|
-
const rawCostUsd = config.getActionCostUsd?.({
|
|
250
|
-
action: decision.action,
|
|
251
|
-
result,
|
|
252
|
-
state,
|
|
253
|
-
evals,
|
|
254
|
-
history
|
|
255
|
-
});
|
|
256
|
-
const costUsd = normalizeActionCostUsd(rawCostUsd, runtimeErrors, stepIndex);
|
|
257
|
-
if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
|
|
258
|
-
spentCostUsd += costUsd;
|
|
259
|
-
await recordCostBudget(emitter, budget, spentCostUsd, stepHandle, runtimeErrors, stepIndex);
|
|
260
|
-
}
|
|
261
|
-
actionOutcome = {
|
|
262
|
-
ok: true,
|
|
263
|
-
result,
|
|
264
|
-
...costUsd !== void 0 ? { costUsd } : {},
|
|
265
|
-
durationMs: Date.now() - actionStarted
|
|
266
|
-
};
|
|
267
|
-
} catch (err) {
|
|
268
|
-
runtimeErrors.push(runtimeError("act", stepIndex, err));
|
|
269
|
-
actionOutcome = {
|
|
270
|
-
ok: false,
|
|
271
|
-
error: runtimeErrors[runtimeErrors.length - 1].message,
|
|
272
|
-
durationMs: Date.now() - actionStarted
|
|
273
|
-
};
|
|
274
|
-
if (actionFailure === "stop") {
|
|
275
|
-
await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed"));
|
|
276
|
-
const step2 = {
|
|
277
|
-
index: stepIndex,
|
|
278
|
-
decision,
|
|
279
|
-
beforeState,
|
|
280
|
-
afterState: state,
|
|
281
|
-
evalsBefore,
|
|
282
|
-
evalsAfter: evals,
|
|
283
|
-
actionOutcome,
|
|
284
|
-
startedAt: new Date(actionStarted).toISOString(),
|
|
285
|
-
endedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
286
|
-
};
|
|
287
|
-
history.push(step2);
|
|
288
|
-
await runOnStep(config.onStep, step2, runtimeErrors);
|
|
289
|
-
return finish(emitter, {
|
|
290
|
-
intent: config.intent,
|
|
291
|
-
pass: false,
|
|
292
|
-
completed: false,
|
|
293
|
-
reason: actionOutcome.error ?? "action failed",
|
|
294
|
-
score: averageScore(evals),
|
|
295
|
-
steps: history,
|
|
296
|
-
finalState: state,
|
|
297
|
-
finalEvals: evals,
|
|
298
|
-
wallMs: Date.now() - started,
|
|
299
|
-
spentCostUsd,
|
|
300
|
-
runId: emitter?.runId ?? null,
|
|
301
|
-
failureClass: "unknown",
|
|
302
|
-
runtimeErrors,
|
|
303
|
-
stoppedBy: "runtime-error"
|
|
304
|
-
});
|
|
305
|
-
}
|
|
306
|
-
}
|
|
307
|
-
try {
|
|
308
|
-
state = await config.observe({ history, abortSignal: controller.signal });
|
|
309
|
-
} catch (err) {
|
|
310
|
-
runtimeErrors.push(runtimeError("observe", stepIndex, err));
|
|
311
|
-
const step2 = {
|
|
312
|
-
index: stepIndex,
|
|
313
|
-
decision,
|
|
314
|
-
beforeState,
|
|
315
|
-
afterState: beforeState,
|
|
316
|
-
evalsBefore,
|
|
317
|
-
evalsAfter: evals,
|
|
318
|
-
actionOutcome,
|
|
319
|
-
startedAt: new Date(actionStarted).toISOString(),
|
|
320
|
-
endedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
321
|
-
};
|
|
322
|
-
history.push(step2);
|
|
323
|
-
await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message));
|
|
324
|
-
await runOnStep(config.onStep, step2, runtimeErrors);
|
|
325
|
-
return finish(emitter, {
|
|
326
|
-
intent: config.intent,
|
|
327
|
-
pass: false,
|
|
328
|
-
completed: false,
|
|
329
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
330
|
-
score: averageScore(evals),
|
|
331
|
-
steps: history,
|
|
332
|
-
finalState: beforeState,
|
|
333
|
-
finalEvals: evals,
|
|
334
|
-
wallMs: Date.now() - started,
|
|
335
|
-
spentCostUsd,
|
|
336
|
-
runId: emitter?.runId ?? null,
|
|
337
|
-
failureClass: "unknown",
|
|
338
|
-
runtimeErrors,
|
|
339
|
-
stoppedBy: "runtime-error"
|
|
340
|
-
});
|
|
341
|
-
}
|
|
342
|
-
try {
|
|
343
|
-
evals = await config.validate({ intent: config.intent, state, history, abortSignal: controller.signal });
|
|
344
|
-
await recordEvalSpans(emitter, evals, `step-${stepIndex}`, runtimeErrors, stepIndex, stepHandle?.span.spanId);
|
|
345
|
-
} catch (err) {
|
|
346
|
-
runtimeErrors.push(runtimeError("validate", stepIndex, err));
|
|
347
|
-
const step2 = {
|
|
348
|
-
index: stepIndex,
|
|
349
|
-
decision,
|
|
350
|
-
beforeState,
|
|
351
|
-
afterState: state,
|
|
352
|
-
evalsBefore,
|
|
353
|
-
evalsAfter: evals,
|
|
354
|
-
actionOutcome,
|
|
355
|
-
startedAt: new Date(actionStarted).toISOString(),
|
|
356
|
-
endedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
357
|
-
};
|
|
358
|
-
history.push(step2);
|
|
359
|
-
await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message));
|
|
360
|
-
await runOnStep(config.onStep, step2, runtimeErrors);
|
|
361
|
-
return finish(emitter, {
|
|
362
|
-
intent: config.intent,
|
|
363
|
-
pass: false,
|
|
364
|
-
completed: false,
|
|
365
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
366
|
-
score: averageScore(evals),
|
|
367
|
-
steps: history,
|
|
368
|
-
finalState: state,
|
|
369
|
-
finalEvals: evals,
|
|
370
|
-
wallMs: Date.now() - started,
|
|
371
|
-
spentCostUsd,
|
|
372
|
-
runId: emitter?.runId ?? null,
|
|
373
|
-
failureClass: "unknown",
|
|
374
|
-
runtimeErrors,
|
|
375
|
-
stoppedBy: "runtime-error"
|
|
376
|
-
});
|
|
377
|
-
}
|
|
378
|
-
const scoreAfter = averageScore(evals);
|
|
379
|
-
const stateFingerprint = fingerprintState(state, config.stopPolicies);
|
|
380
|
-
const noProgressStop = noProgressStopDecision({
|
|
381
|
-
policies: config.stopPolicies,
|
|
382
|
-
lastStateFingerprint,
|
|
383
|
-
stateFingerprint,
|
|
384
|
-
scoreBefore,
|
|
385
|
-
scoreAfter,
|
|
386
|
-
currentStreak: noProgressStreak
|
|
387
|
-
});
|
|
388
|
-
noProgressStreak = noProgressStop.streak;
|
|
389
|
-
lastStateFingerprint = stateFingerprint;
|
|
390
|
-
const step = {
|
|
391
|
-
index: stepIndex,
|
|
392
|
-
decision,
|
|
393
|
-
beforeState,
|
|
394
|
-
afterState: state,
|
|
395
|
-
evalsBefore,
|
|
396
|
-
evalsAfter: evals,
|
|
397
|
-
actionOutcome,
|
|
398
|
-
startedAt: new Date(actionStarted).toISOString(),
|
|
399
|
-
endedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
400
|
-
};
|
|
401
|
-
history.push(step);
|
|
402
|
-
if (actionOutcome.ok) {
|
|
403
|
-
await runTrace(runtimeErrors, stepIndex, () => stepHandle?.end({
|
|
404
|
-
attributes: {
|
|
405
|
-
actionCostUsd: actionOutcome.costUsd ?? null,
|
|
406
|
-
spentCostUsd,
|
|
407
|
-
scoreBefore: scoreBefore ?? null,
|
|
408
|
-
scoreAfter: scoreAfter ?? null,
|
|
409
|
-
noProgressStreak
|
|
410
|
-
}
|
|
411
|
-
}));
|
|
412
|
-
} else {
|
|
413
|
-
await runTrace(runtimeErrors, stepIndex, () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
|
|
414
|
-
attributes: {
|
|
415
|
-
spentCostUsd,
|
|
416
|
-
noProgressStreak
|
|
417
|
-
}
|
|
418
|
-
}));
|
|
419
|
-
}
|
|
420
|
-
await runOnStep(config.onStep, step, runtimeErrors);
|
|
421
|
-
if (noProgressStop.stop) {
|
|
422
|
-
return finish(emitter, {
|
|
423
|
-
intent: config.intent,
|
|
424
|
-
pass: false,
|
|
425
|
-
completed: true,
|
|
426
|
-
reason: noProgressStop.reason,
|
|
427
|
-
score: scoreAfter,
|
|
428
|
-
steps: history,
|
|
429
|
-
finalState: state,
|
|
430
|
-
finalEvals: evals,
|
|
431
|
-
wallMs: Date.now() - started,
|
|
432
|
-
spentCostUsd,
|
|
433
|
-
runId: emitter?.runId ?? null,
|
|
434
|
-
failureClass: "tool_recovery_failure",
|
|
435
|
-
runtimeErrors,
|
|
436
|
-
stoppedBy: "stop-policy"
|
|
437
|
-
});
|
|
438
|
-
}
|
|
439
|
-
const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
|
|
440
|
-
if (postStepBudgetStop.stop) {
|
|
441
|
-
return finish(emitter, {
|
|
442
|
-
intent: config.intent,
|
|
443
|
-
pass: false,
|
|
444
|
-
completed: false,
|
|
445
|
-
reason: postStepBudgetStop.reason,
|
|
446
|
-
score: scoreAfter,
|
|
447
|
-
steps: history,
|
|
448
|
-
finalState: state,
|
|
449
|
-
finalEvals: evals,
|
|
450
|
-
wallMs: Date.now() - started,
|
|
451
|
-
spentCostUsd,
|
|
452
|
-
runId: emitter?.runId ?? null,
|
|
453
|
-
failureClass: "budget_exceeded",
|
|
454
|
-
runtimeErrors,
|
|
455
|
-
stoppedBy: "budget"
|
|
456
|
-
});
|
|
457
|
-
}
|
|
458
|
-
const postStepCtx = makeContext(config.intent, state, evals, history, budget, stepIndex + 1, started, spentCostUsd, controller.signal, emitter);
|
|
459
|
-
let postStepStop;
|
|
460
|
-
try {
|
|
461
|
-
postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
|
|
462
|
-
} catch (err) {
|
|
463
|
-
runtimeErrors.push(runtimeError("stop-policy", stepIndex + 1, err));
|
|
464
|
-
return finish(emitter, {
|
|
465
|
-
intent: config.intent,
|
|
466
|
-
pass: false,
|
|
467
|
-
completed: false,
|
|
468
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
469
|
-
score: averageScore(evals),
|
|
470
|
-
steps: history,
|
|
471
|
-
finalState: state,
|
|
472
|
-
finalEvals: evals,
|
|
473
|
-
wallMs: Date.now() - started,
|
|
474
|
-
spentCostUsd,
|
|
475
|
-
runId: emitter?.runId ?? null,
|
|
476
|
-
failureClass: "unknown",
|
|
477
|
-
runtimeErrors,
|
|
478
|
-
stoppedBy: "runtime-error"
|
|
479
|
-
});
|
|
480
|
-
}
|
|
481
|
-
if (postStepStop.stop) {
|
|
482
|
-
return finish(emitter, {
|
|
483
|
-
intent: config.intent,
|
|
484
|
-
pass: postStepStop.pass,
|
|
485
|
-
completed: true,
|
|
486
|
-
reason: postStepStop.reason,
|
|
487
|
-
score: postStepStop.score,
|
|
488
|
-
steps: history,
|
|
489
|
-
finalState: state,
|
|
490
|
-
finalEvals: evals,
|
|
491
|
-
wallMs: Date.now() - started,
|
|
492
|
-
spentCostUsd,
|
|
493
|
-
runId: emitter?.runId ?? null,
|
|
494
|
-
failureClass: postStepStop.failureClass,
|
|
495
|
-
runtimeErrors,
|
|
496
|
-
stoppedBy: "stop-policy"
|
|
497
|
-
});
|
|
498
|
-
}
|
|
499
|
-
}
|
|
500
|
-
return finish(emitter, {
|
|
501
|
-
intent: config.intent,
|
|
502
|
-
pass: false,
|
|
503
|
-
completed: false,
|
|
504
|
-
reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
|
|
505
|
-
steps: history,
|
|
506
|
-
finalState: state,
|
|
507
|
-
finalEvals: evals,
|
|
508
|
-
wallMs: Date.now() - started,
|
|
509
|
-
spentCostUsd,
|
|
510
|
-
runId: emitter?.runId ?? null,
|
|
511
|
-
failureClass: "budget_exceeded",
|
|
512
|
-
runtimeErrors,
|
|
513
|
-
stoppedBy: "budget"
|
|
514
|
-
});
|
|
515
|
-
} catch (err) {
|
|
516
|
-
runtimeErrors.push(runtimeError("act", history.length, err));
|
|
517
|
-
return finish(emitter, {
|
|
518
|
-
intent: config.intent,
|
|
519
|
-
pass: false,
|
|
520
|
-
completed: false,
|
|
521
|
-
reason: runtimeErrors[runtimeErrors.length - 1].message,
|
|
522
|
-
steps: history,
|
|
523
|
-
finalState: void 0,
|
|
524
|
-
finalEvals: [],
|
|
525
|
-
wallMs: Date.now() - started,
|
|
526
|
-
spentCostUsd,
|
|
527
|
-
runId: emitter?.runId ?? null,
|
|
528
|
-
failureClass: "unknown",
|
|
529
|
-
runtimeErrors,
|
|
530
|
-
stoppedBy: "runtime-error"
|
|
531
|
-
});
|
|
532
|
-
} finally {
|
|
533
|
-
if (wallTimer) clearTimeout(wallTimer);
|
|
534
|
-
if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
|
|
538
|
-
return { ...options, maxNoProgressSteps };
|
|
539
|
-
}
|
|
540
|
-
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
|
|
541
|
-
return { ...options, maxRepeatedActions };
|
|
542
|
-
}
|
|
543
|
-
function objectiveEval(input) {
|
|
544
|
-
return { ...input, objective: true };
|
|
545
|
-
}
|
|
546
|
-
function subjectiveEval(input) {
|
|
547
|
-
return { ...input, objective: false };
|
|
548
|
-
}
|
|
549
|
-
function normalizeBudget(input) {
|
|
550
|
-
const raw = { ...DEFAULT_BUDGET, ...input };
|
|
551
|
-
if (!Number.isInteger(raw.maxSteps) || raw.maxSteps < 1) {
|
|
552
|
-
throw new RangeError(`ControlRuntime budget.maxSteps must be an integer >= 1, got ${String(raw.maxSteps)}`);
|
|
553
|
-
}
|
|
554
|
-
const budget = { maxSteps: raw.maxSteps };
|
|
555
|
-
if (raw.maxWallMs !== void 0) {
|
|
556
|
-
if (typeof raw.maxWallMs !== "number" || !Number.isFinite(raw.maxWallMs) || raw.maxWallMs <= 0) {
|
|
557
|
-
throw new RangeError(`ControlRuntime budget.maxWallMs must be a positive finite number, got ${String(raw.maxWallMs)}`);
|
|
558
|
-
}
|
|
559
|
-
budget.maxWallMs = raw.maxWallMs;
|
|
560
|
-
}
|
|
561
|
-
if (raw.maxCostUsd !== void 0) {
|
|
562
|
-
if (typeof raw.maxCostUsd !== "number" || !Number.isFinite(raw.maxCostUsd) || raw.maxCostUsd < 0) {
|
|
563
|
-
throw new RangeError(`ControlRuntime budget.maxCostUsd must be a nonnegative finite number, got ${String(raw.maxCostUsd)}`);
|
|
564
|
-
}
|
|
565
|
-
budget.maxCostUsd = raw.maxCostUsd;
|
|
566
|
-
}
|
|
567
|
-
return budget;
|
|
568
|
-
}
|
|
569
|
-
function normalizeActionCostUsd(costUsd, runtimeErrors, stepIndex) {
|
|
570
|
-
if (costUsd === void 0) return void 0;
|
|
571
|
-
if (!Number.isFinite(costUsd) || costUsd < 0) {
|
|
572
|
-
runtimeErrors.push(runtimeError("act", stepIndex, new Error(`invalid action costUsd: ${String(costUsd)}`)));
|
|
573
|
-
return void 0;
|
|
574
|
-
}
|
|
575
|
-
return costUsd;
|
|
576
|
-
}
|
|
577
|
-
function allCriticalPassed(evals) {
|
|
578
|
-
return evals.every((result) => result.passed || result.severity !== "critical" && result.severity !== "error");
|
|
579
|
-
}
|
|
580
|
-
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
|
|
581
|
-
return {
|
|
582
|
-
intent,
|
|
583
|
-
state,
|
|
584
|
-
evals,
|
|
585
|
-
history,
|
|
586
|
-
budget,
|
|
587
|
-
stepIndex,
|
|
588
|
-
wallMs: Date.now() - started,
|
|
589
|
-
spentCostUsd,
|
|
590
|
-
remainingCostUsd: budget.maxCostUsd === void 0 ? void 0 : Math.max(0, budget.maxCostUsd - spentCostUsd),
|
|
591
|
-
abortSignal,
|
|
592
|
-
emitter
|
|
593
|
-
};
|
|
594
|
-
}
|
|
595
|
-
function defaultStopDecision(evals) {
|
|
596
|
-
if (!evals.length) return { stop: false, pass: false, reason: "no evals yet" };
|
|
597
|
-
const pass = allCriticalPassed(evals);
|
|
598
|
-
return pass ? { stop: true, pass: true, reason: "all critical evals passed", score: averageScore(evals) } : { stop: false, pass: false, reason: "critical evals still failing", score: averageScore(evals) };
|
|
599
|
-
}
|
|
600
|
-
function averageScore(evals) {
|
|
601
|
-
const scored = evals.map((result) => result.score).filter((score) => typeof score === "number");
|
|
602
|
-
if (!scored.length) return void 0;
|
|
603
|
-
return Math.round(scored.reduce((sum, score) => sum + score, 0) / scored.length * 1e3) / 1e3;
|
|
604
|
-
}
|
|
605
|
-
function budgetStopDecision(budget, spentCostUsd) {
|
|
606
|
-
if (budget.maxCostUsd !== void 0 && spentCostUsd >= budget.maxCostUsd) {
|
|
607
|
-
return {
|
|
608
|
-
stop: true,
|
|
609
|
-
reason: `budget exhausted: maxCostUsd=${budget.maxCostUsd}`
|
|
610
|
-
};
|
|
611
|
-
}
|
|
612
|
-
return { stop: false, reason: "" };
|
|
613
|
-
}
|
|
614
|
-
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
|
|
615
|
-
if (!emitter || budget.maxCostUsd === void 0) return;
|
|
616
|
-
const maxCostUsd = budget.maxCostUsd;
|
|
617
|
-
await runTrace(runtimeErrors, stepIndex, () => emitter.recordBudget({
|
|
618
|
-
dimension: "usd",
|
|
619
|
-
limit: maxCostUsd,
|
|
620
|
-
consumed: spentCostUsd,
|
|
621
|
-
remaining: Math.max(0, maxCostUsd - spentCostUsd),
|
|
622
|
-
breached: spentCostUsd >= maxCostUsd,
|
|
623
|
-
spanId: handle?.span.spanId
|
|
624
|
-
}));
|
|
625
|
-
}
|
|
626
|
-
/**
 * Records one judge span per eval result on the trace emitter.
 *
 * Evals are written sequentially through `runTrace`, so a failing write is
 * collected in `runtimeErrors` and the remaining evals are still recorded.
 * Evals without a numeric score are recorded as 1 (passed) or 0 (failed).
 *
 * @param {object|null|undefined} emitter - Trace emitter exposing `recordJudge` and `runId`.
 * @param {Array<object>} evals - Eval results to record.
 * @param {string} phase - Phase label stored in the span attributes.
 * @param {Array<object>} runtimeErrors - Collector for trace failures.
 * @param {number} stepIndex - Step the evals belong to.
 * @param {string} [targetSpanId] - Span being judged; defaults to the emitter's run id.
 * @returns {Promise<void>}
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;
  for (const item of evals) {
    const writeJudge = () => {
      const judgeId = item.objective ? "objective-validator" : "subjective-judge";
      let score;
      if (typeof item.score === "number") {
        score = item.score;
      } else {
        score = item.passed ? 1 : 0;
      }
      return emitter.recordJudge({
        judgeId,
        targetSpanId: targetSpanId ?? emitter.runId,
        name: `control-eval/${item.id}`,
        dimension: item.id,
        score,
        rationale: item.detail,
        evidence: item.evidence,
        attributes: {
          phase,
          passed: item.passed,
          severity: item.severity,
          objective: item.objective
        }
      });
    };
    await runTrace(runtimeErrors, stepIndex, writeJudge);
  }
}
|
|
646
|
-
/**
 * Invokes the optional `onStep` callback without letting it break the loop.
 *
 * A throwing or rejecting callback is converted into an "on-step" runtime
 * error entry for the step's index.
 *
 * @param {Function|undefined} onStep - Optional per-step callback.
 * @param {{index: number}} step - Step record handed to the callback.
 * @param {Array<object>} runtimeErrors - Collector for callback failures.
 * @returns {Promise<void>}
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (failure) {
      const entry = runtimeError("on-step", step.index, failure);
      runtimeErrors.push(entry);
    }
  }
}
|
|
654
|
-
/**
 * Runs a trace write, downgrading any failure to a recorded runtime error.
 *
 * @param {Array<object>} runtimeErrors - Collector that receives a "trace" entry on failure.
 * @param {number} stepIndex - Step index attached to the error entry.
 * @param {Function} write - Trace write to execute; may be sync or async.
 * @returns {Promise<*>} The write's result, or undefined when it failed.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let outcome;
  try {
    outcome = await write();
  } catch (failure) {
    runtimeErrors.push(runtimeError("trace", stepIndex, failure));
    return void 0;
  }
  return outcome;
}
|
|
662
|
-
/**
 * Stuck detection: counts consecutive steps in which neither the state
 * fingerprint nor the score moved, and stops once the streak reaches
 * `policies.maxNoProgressSteps`.
 *
 * The policy is disabled when `maxNoProgressSteps` is missing, zero or
 * negative. A score change smaller than `policies.minScoreDelta`
 * (default 0.001) counts as flat; missing scores are treated as 0.
 *
 * @param {object} args - `policies`, `scoreBefore`, `scoreAfter`,
 *   `lastStateFingerprint`, `stateFingerprint` and `currentStreak`.
 * @returns {{stop: boolean, reason: string, streak: number}}
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) {
    return { stop: false, reason: "", streak: 0 };
  }
  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const before = args.scoreBefore ?? 0;
  const after = args.scoreAfter ?? 0;
  const sameState = args.lastStateFingerprint !== void 0 && args.lastStateFingerprint === args.stateFingerprint;
  const flatScore = Math.abs(after - before) < threshold;
  let streak = 0;
  if (sameState && flatScore) {
    streak = args.currentStreak + 1;
  }
  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
|
|
672
|
-
/**
 * Stops the loop once the same action has been repeated
 * `policies.maxRepeatedActions` times in a row.
 *
 * @param {{maxRepeatedActions?: number}|undefined} policies - Loop policies; a missing or non-positive limit disables the check.
 * @param {number} streak - Current count of consecutive identical actions.
 * @returns {{stop: boolean, reason: string}}
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const enabled = Boolean(limit) && limit > 0;
  if (enabled && streak >= limit) {
    return { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` };
  }
  return { stop: false, reason: "" };
}
|
|
680
|
-
/**
 * Fingerprints a state for no-progress detection.
 *
 * @param {*} state - Observed state.
 * @param {{stateFingerprint?: Function}|undefined} policies - A custom `stateFingerprint` takes precedence over the default.
 * @returns {*} The custom fingerprint, or `stableFingerprint(state)`.
 */
function fingerprintState(state, policies) {
  const hasCustom = Boolean(policies?.stateFingerprint);
  return hasCustom ? policies.stateFingerprint(state) : stableFingerprint(state);
}
|
|
684
|
-
/**
 * Fingerprints an action for repeated-action detection.
 *
 * @param {*} action - Action chosen for the step.
 * @param {{actionFingerprint?: Function}|undefined} policies - A custom `actionFingerprint` takes precedence over the default.
 * @returns {*} The custom fingerprint, or `stableFingerprint(action)`.
 */
function fingerprintAction(action, policies) {
  const hasCustom = Boolean(policies?.actionFingerprint);
  return hasCustom ? policies.actionFingerprint(action) : stableFingerprint(action);
}
|
|
688
|
-
/**
 * Produces a deterministic string fingerprint for an arbitrary value.
 *
 * Strings are returned as-is, other primitives and null/undefined via
 * `String()`, and everything else as JSON with object keys sorted so that
 * key order does not affect the result.
 *
 * @param {*} value - Value to fingerprint.
 * @returns {string} Always a string, including for functions, symbols and
 *   values JSON cannot serialize.
 */
function stableFingerprint(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" || typeof value === "boolean" || value == null) return String(value);
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols; fall back to String() so callers always get a comparable string.
    return JSON.stringify(sortForFingerprint(value)) ?? String(value);
  } catch {
    // Cyclic structures and BigInt values cannot be serialized.
    return String(value);
  }
}

/**
 * Recursively copies a value with object keys in sorted order.
 *
 * Objects exposing `toJSON` (for example Date) are first reduced to their
 * JSON form; copying only their own enumerable keys would turn every such
 * object into `{}` and make distinct values fingerprint identically.
 *
 * @param {*} value - Value to normalize.
 * @returns {*} Key-sorted structural copy; primitives are returned unchanged.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // A toJSON that returns the object itself is handled by the key copy below.
    if (json !== value) return sortForFingerprint(json);
  }
  const sorted = {};
  for (const key of Object.keys(value).sort()) {
    sorted[key] = sortForFingerprint(value[key]);
  }
  return sorted;
}
|
|
707
|
-
/**
 * Turns an abort signal's reason into a human-readable string.
 *
 * @param {{reason?: unknown}} signal - Abort signal (or anything carrying a `reason`).
 * @returns {string} The error message for Error reasons, the stringified
 *   reason for other truthy values, and "aborted" otherwise.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
|
|
712
|
-
/**
 * Builds a serializable runtime-error entry.
 *
 * @param {string} phase - Phase in which the failure happened.
 * @param {number} stepIndex - Step index the failure belongs to.
 * @param {unknown} err - Thrown value; non-Error values are stringified.
 * @returns {{phase: string, stepIndex: number, message: string}}
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
|
|
716
|
-
/**
 * Closes the trace run (when an emitter exists) and returns the result unchanged.
 *
 * The `endRun` write goes through `runTrace`, so a failure is appended to
 * `result.runtimeErrors` rather than thrown. When the result carries no
 * score, the average of its final evals is reported instead.
 *
 * @param {object|null|undefined} emitter - Trace emitter exposing `endRun`.
 * @param {object} result - Final control-loop result.
 * @returns {Promise<object>} The same `result` object.
 */
async function finish(emitter, result) {
  const closeRun = () =>
    emitter?.endRun({
      pass: result.pass,
      score: result.score ?? averageScore(result.finalEvals),
      failureClass: result.failureClass,
      notes: result.reason
    });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
|
|
725
|
-
|
|
726
|
-
// src/run-evidence.ts
|
|
727
|
-
/**
 * Converts a finished control-loop run into a validated run record.
 *
 * The score is taken from `options.score`, then `run.score`, then the mean of
 * the final evals, then the pass flag (1/0), and is clamped to [0, 1]. It is
 * stored as `holdoutScore` for the "holdout" split and as `searchScore`
 * otherwise. `queueMs` and `judgeMetadata` are only included when provided.
 *
 * @param {object} run - Control-loop result.
 * @param {object} options - Experiment identity, provenance hashes, split tag and optional overrides.
 * @returns {*} Whatever `validateRunRecord` returns for the assembled record.
 */
function controlRunToRunRecord(run, options) {
  const score = clampScore(
    options.score ?? run.score ?? scoreFromEvals(run.finalEvals) ?? (run.pass ? 1 : 0)
  );
  const raw = normalizeRawMetrics(options.raw, run, score);
  const outcome = options.splitTag === "holdout" ? { holdoutScore: score, raw } : { searchScore: score, raw };

  // Properties are added in the same order as the record's documented shape.
  const record = {
    runId: options.runId ?? run.runId ?? `control:${options.experimentId}:${options.candidateId}:${options.seed}:${options.splitTag}`,
    experimentId: options.experimentId,
    candidateId: options.candidateId,
    seed: options.seed,
    model: options.model,
    promptHash: options.promptHash,
    configHash: options.configHash,
    commitSha: options.commitSha,
    wallMs: run.wallMs
  };
  if (options.queueMs !== void 0) {
    record.queueMs = options.queueMs;
  }
  record.costUsd = run.spentCostUsd;
  record.tokenUsage = options.tokenUsage;
  if (options.judgeMetadata) {
    record.judgeMetadata = options.judgeMetadata;
  }
  record.outcome = outcome;
  record.failureMode = options.failureMode ?? failureModeFromRun(run);
  record.splitTag = options.splitTag;
  return validateRunRecord(record);
}
|
|
749
|
-
/**
 * Mean of the finite numeric eval scores, clamped via `clampScore`.
 *
 * @param {Array<{score?: unknown}>} evals - Eval results.
 * @returns {number|undefined} Clamped mean, or undefined when no eval has a finite score.
 */
function scoreFromEvals(evals) {
  const finite = [];
  for (const item of evals) {
    const value = item.score;
    if (typeof value === "number" && Number.isFinite(value)) {
      finite.push(value);
    }
  }
  if (finite.length === 0) return void 0;
  const total = finite.reduce((sum, value) => sum + value, 0);
  return clampScore(total / finite.length);
}
|
|
754
|
-
/**
 * Builds the raw-metrics map for a run record.
 *
 * Caller-supplied metrics are filtered through `finiteOnly`; the standard
 * metrics derived from the run are written afterwards and therefore win over
 * any caller-supplied key of the same name.
 *
 * @param {object|null|undefined} raw - Optional caller-supplied metrics.
 * @param {object} run - Control-loop result.
 * @param {number} score - Final clamped score.
 * @returns {object} Metrics map with `score`, `pass`, `completed`, `steps` and `runtimeErrors`.
 */
function normalizeRawMetrics(raw, run, score) {
  const supplied = finiteOnly(raw ?? {});
  const derived = {
    score,
    pass: run.pass ? 1 : 0,
    completed: run.completed ? 1 : 0,
    steps: run.steps.length,
    runtimeErrors: run.runtimeErrors.length
  };
  return { ...supplied, ...derived };
}
|
|
764
|
-
/**
 * Returns a copy of `values` keeping only entries whose value is a finite number.
 *
 * @param {object} values - Map of candidate metrics.
 * @returns {object} New object without non-numeric, NaN or infinite entries.
 */
function finiteOnly(values) {
  const kept = {};
  Object.entries(values).forEach(([name, metric]) => {
    if (!Number.isFinite(metric)) return;
    kept[name] = metric;
  });
  return kept;
}
|
|
771
|
-
/**
 * Derives the failure mode recorded for a run.
 *
 * @param {{pass: boolean, failureClass?: string}} run - Control-loop result.
 * @returns {string|undefined} Undefined for passing runs; otherwise the run's
 *   failure class, or "unknown" when none was set.
 */
function failureModeFromRun(run) {
  return run.pass ? void 0 : run.failureClass ?? "unknown";
}
|
|
775
|
-
/**
 * Clamps a score into the [0, 1] range.
 *
 * @param {*} value - Candidate score.
 * @returns {number} 0 for anything that is not a finite number, otherwise the clamped value.
 */
function clampScore(value) {
  if (Number.isFinite(value)) {
    return Math.max(0, Math.min(1, value));
  }
  return 0;
}
|
|
779
|
-
|
|
780
|
-
// src/action-policy.ts
|
|
781
|
-
/**
 * Evaluates an action against a policy and reports whether it is allowed,
 * blocked, or needs approval, together with every reason that applied.
 *
 * Rules are checked in a fixed order (type allow/block lists, approval rules,
 * cost limits, required metadata), and the reasons are listed in that order.
 * A blocked action never reports `requiresApproval`. A `label` describing the
 * decision is produced only when the action is blocked or needs approval.
 *
 * @param {object} action - Action with `type` and optional `costUsd`,
 *   `requiresApproval`, `externalSideEffect` and `metadata`.
 * @param {object} [policy={}] - Policy rules; absent rules are not enforced.
 * @param {{createdAt?: string}} [options={}] - `createdAt` overrides the label timestamp.
 * @returns {{allowed: boolean, blocked: boolean, requiresApproval: boolean, reasons: string[], label: (object|undefined)}}
 */
function evaluateActionPolicy(action, policy = {}, options = {}) {
  const reasons = [];
  let blocked = false;
  let requiresApproval = Boolean(action.requiresApproval);

  const block = (why) => {
    blocked = true;
    reasons.push(why);
  };
  const escalate = (why) => {
    requiresApproval = true;
    reasons.push(why);
  };
  // A missing cost counts as 0 when compared against a limit.
  const overLimit = (limit) => limit !== void 0 && (action.costUsd ?? 0) > limit;

  if (policy.allowedTypes?.length && !policy.allowedTypes.includes(action.type)) {
    block(`action type "${action.type}" is not allowed`);
  }
  if (policy.blockedTypes?.includes(action.type)) {
    block(`action type "${action.type}" is blocked`);
  }
  if (policy.alwaysRequireApprovalTypes?.includes(action.type)) {
    escalate(`action type "${action.type}" requires approval`);
  }
  if (policy.requireApprovalForExternalSideEffects && action.externalSideEffect) {
    escalate("external side effect requires approval");
  }
  if (overLimit(policy.requireApprovalAboveCostUsd)) {
    escalate(`cost ${action.costUsd} exceeds approval threshold ${policy.requireApprovalAboveCostUsd}`);
  }
  if (overLimit(policy.maxActionCostUsd)) {
    block(`cost ${action.costUsd} exceeds max action cost ${policy.maxActionCostUsd}`);
  }
  if (overLimit(policy.remainingBudgetUsd)) {
    block(`cost ${action.costUsd} exceeds remaining budget ${policy.remainingBudgetUsd}`);
  }
  if (policy.expectedOutcomeRequired && !action.metadata?.expectedOutcome) {
    block("expected outcome is required");
  }
  if (policy.killCriteriaRequired && !action.metadata?.killCriteria) {
    block("kill criteria are required");
  }
  // Auto-approval never overrides an approval rule; it only adds an explanation.
  if (policy.autoApproveTypes?.includes(action.type) && requiresApproval) {
    reasons.push(`action type "${action.type}" is auto-approved only when no approval policy applies`);
  }
  if (reasons.length === 0) {
    reasons.push(requiresApproval ? "approval required" : "action allowed");
  }

  let label;
  if (blocked || requiresApproval) {
    label = {
      source: "policy",
      kind: blocked ? "policy_block" : "comment",
      value: { actionType: action.type, blocked, requiresApproval },
      reason: reasons.join("; "),
      severity: blocked ? "critical" : "warning",
      createdAt: options.createdAt ?? new Date().toISOString(),
      metadata: { action, policy }
    };
  }

  return {
    allowed: !blocked,
    blocked,
    requiresApproval: !blocked && requiresApproval,
    reasons,
    label
  };
}
|
|
842
|
-
|
|
843
|
-
// src/propose-review.ts
|
|
844
|
-
import { appendFileSync, existsSync, mkdirSync, readFileSync } from "fs";
|
|
845
|
-
import { dirname } from "path";
|
|
846
|
-
/**
 * Creates a review-memory store that lives only in process memory.
 *
 * The initial entries are copied, and `load` returns a fresh copy each time,
 * so callers cannot mutate the store's internal list.
 *
 * @param {Iterable<object>} [initial=[]] - Entries to seed the store with.
 * @returns {{load: function(): Promise<object[]>, append: function(object): Promise<void>}}
 */
function inMemoryReviewStore(initial = []) {
  const log = [...initial];
  const load = async () => log.slice();
  const append = async (entry) => {
    log.push(entry);
  };
  return { load, append };
}
|
|
857
|
-
/**
 * Creates a review-memory store backed by a JSON Lines file.
 *
 * `load` returns an empty list when the file does not exist and skips blank
 * or unparseable lines, so one corrupt line does not lose the rest of the
 * memory. `append` creates the parent directory on demand and writes one
 * JSON document per line. File access is synchronous.
 *
 * @param {string} path - Location of the JSONL file.
 * @returns {{load: function(): Promise<object[]>, append: function(object): Promise<void>}}
 */
function jsonlReviewStore(path) {
  const load = async () => {
    if (!existsSync(path)) return [];
    const parsed = [];
    const lines = readFileSync(path, "utf8").split("\n");
    lines.forEach((line) => {
      const text = line.trim();
      if (!text) return;
      try {
        parsed.push(JSON.parse(text));
      } catch {
        // Deliberate best effort: ignore lines that are not valid JSON.
      }
    });
    return parsed;
  };
  const append = async (entry) => {
    mkdirSync(dirname(path), { recursive: true });
    appendFileSync(path, `${JSON.stringify(entry)}\n`);
  };
  return { load, append };
}
|
|
879
|
-
var DEFAULT_FALLBACK_INSTRUCTION = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Runs a multi-shot propose -> verify -> review loop.
 *
 * Each shot calls `config.propose`, verifies the resulting state with
 * `config.verify`, and (unless verification passed) asks `config.review` for
 * the next instruction. Every shot is appended to the review memory. The loop
 * ends when verification passes, the reviewer asks to stop, reviewer
 * confidence stays at or below `confidenceFloor` for `confidenceFloorWindow`
 * consecutive shots, the wall-clock limit fires, or `maxShots` is reached.
 *
 * A failing reviewer does not abort the loop: the last remembered instruction
 * (or `fallbackInstruction`) is reused with a confidence of 0.3.
 *
 * Errors thrown by `propose`, `verify` or the memory store are re-thrown to
 * the caller. Before re-throwing, the trace run (if any) is closed as failed
 * so that it is not left open in the trace store.
 *
 * @param {object} config - Loop configuration: `goal`, `initialState`,
 *   `propose`, `verify`, `review`, plus optional `maxShots` (10), `maxWallMs`
 *   (10 minutes), `confidenceFloor` (0.3), `confidenceFloorWindow` (2),
 *   `memory`, `fallbackInstruction`, `store`, `scenarioId`, `projectId`,
 *   `variantId`.
 * @returns {Promise<object>} `runId`, `completed`, `shots`, `finalState`,
 *   `finalVerification`, `failureClass`, `wallMs` and `score`.
 * @throws Whatever `propose`, `verify`, the memory store or the emitter throws.
 */
async function runProposeReview(config) {
  const maxShots = config.maxShots ?? 10;
  const maxWallMs = config.maxWallMs ?? 10 * 60 * 1e3;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION;
  const emitter = config.store ? new TraceEmitter(config.store) : null;
  if (emitter) {
    await emitter.startRun({
      scenarioId: config.scenarioId ?? "propose-review",
      projectId: config.projectId,
      variantId: config.variantId,
      layer: "meta",
      tags: {
        goal: config.goal.slice(0, 120),
        maxShots: String(maxShots)
      }
    });
  }
  const abort = new AbortController();
  const wallStart = Date.now();
  const wallTimer = setTimeout(() => abort.abort(new Error("propose-review wall timeout")), maxWallMs);
  const shots = [];
  let state = config.initialState;
  let priorReview = null;
  let lastVerification = { pass: false };
  let failureClass;
  let completed = false;
  let lowConfidenceStreak = 0;
  // A passing verification always scores 1; otherwise use its numeric score, or 0.
  const currentScore = () => lastVerification.pass ? 1 : typeof lastVerification.score === "number" ? lastVerification.score : 0;
  try {
    for (let shot = 1; shot <= maxShots; shot++) {
      if (abort.signal.aborted) {
        failureClass = "timeout";
        break;
      }
      const shotStart = Date.now();
      const shotHandle = emitter ? await emitter.span({ kind: "tool", name: `shot-${shot}` }) : null;
      let proposeOut;
      try {
        proposeOut = await config.propose({
          shot,
          goal: config.goal,
          state,
          priorReview,
          abortSignal: abort.signal,
          emitter: emitter ?? void 0
        });
      } catch (err) {
        await shotHandle?.fail(err instanceof Error ? err : String(err));
        failureClass = "unknown";
        throw err;
      }
      state = proposeOut.state;
      const traceSummary = proposeOut.traceSummary;
      let verification;
      try {
        verification = await config.verify(state);
      } catch (err) {
        await shotHandle?.fail(err instanceof Error ? err : String(err));
        failureClass = "unknown";
        throw err;
      }
      lastVerification = verification;
      const memorySnapshot = await memory.load();
      const verificationDigest = {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers ?? []
      };
      let review;
      let reviewAvailable = true;
      let reviewError;
      if (verification.pass) {
        // Nothing to diagnose, so the reviewer is not consulted.
        review = {
          observations: "verification passed \u2014 skipping reviewer LLM call",
          diagnosis: "no failures to diagnose",
          nextShotInstruction: "(done)",
          shouldContinue: false,
          confidence: 1
        };
      } else {
        try {
          review = await config.review({
            shot,
            goal: config.goal,
            state,
            verification,
            traceSummary,
            memory: memorySnapshot
          });
          review = coerceReview(review);
        } catch (err) {
          // Reviewer outage or malformed output: keep going on the last known instruction.
          reviewAvailable = false;
          reviewError = err instanceof Error ? err.message : String(err);
          const lastInstruction = memorySnapshot.length > 0 ? memorySnapshot[memorySnapshot.length - 1].nextShotInstruction : fallbackInstruction;
          review = {
            observations: "(reviewer unavailable \u2014 using last-known instruction)",
            diagnosis: reviewError,
            nextShotInstruction: lastInstruction,
            shouldContinue: true,
            confidence: 0.3
          };
        }
      }
      const entry = {
        shot,
        timestamp: Date.now(),
        ...review,
        verification: verificationDigest
      };
      await memory.append(entry);
      const shotRecord = {
        shot,
        state,
        verification,
        traceSummary,
        review,
        reviewAvailable,
        reviewError,
        durationMs: Date.now() - shotStart
      };
      shots.push(shotRecord);
      await shotHandle?.end({
        attributes: {
          verificationPass: verification.pass,
          verificationScore: verification.score ?? null,
          reviewShouldContinue: review.shouldContinue,
          reviewConfidence: review.confidence,
          reviewAvailable
        }
      });
      if (verification.pass) {
        completed = true;
        break;
      }
      if (!review.shouldContinue) {
        break;
      }
      if (confidenceFloorWindow > 0 && review.confidence <= confidenceFloor) {
        lowConfidenceStreak += 1;
        if (lowConfidenceStreak >= confidenceFloorWindow) break;
      } else {
        lowConfidenceStreak = 0;
      }
      priorReview = review;
    }
    if (!completed && !failureClass) {
      failureClass = shots.length >= maxShots ? "budget_exceeded" : "unknown";
    }
  } catch (err) {
    // Close the trace run before propagating, otherwise it stays open forever.
    if (emitter) {
      const message = err instanceof Error ? err.message : String(err);
      try {
        await emitter.endRun({
          pass: false,
          score: currentScore(),
          failureClass: failureClass ?? "unknown",
          notes: `${shots.length} shot(s); aborted by error: ${message}`
        });
      } catch {
        // Best effort: a tracing failure must not mask the original error.
      }
    }
    throw err;
  } finally {
    clearTimeout(wallTimer);
  }
  const score = currentScore();
  if (emitter) {
    await emitter.endRun({
      pass: completed,
      score,
      failureClass,
      notes: `${shots.length} shot(s); final pass=${lastVerification.pass}`
    });
  }
  return {
    runId: emitter?.runId ?? null,
    completed,
    shots,
    finalState: state,
    finalVerification: lastVerification,
    failureClass,
    wallMs: Date.now() - wallStart,
    score
  };
}
|
|
1053
|
-
var REVIEWER_SYSTEM_PROMPT = [
  "You are a senior reviewer directing a multi-shot build loop.",
  "You do NOT grade \u2014 the verifier already did. Your job is to direct the worker's next shot.",
  "You are blind to the worker's inner monologue. You see what it DID, not what it thought.",
  "Return STRICT JSON matching the schema. No prose outside the JSON."
].join("\n");

/**
 * Builds a reviewer function that prompts an LLM through `cfg.callJson`.
 *
 * The returned function renders the goal, shot number, current state, trace
 * summary, verification summary and prior review memory into one user prompt,
 * sends it with the reviewer system prompt, and validates the JSON answer
 * with `coerceReview`.
 *
 * @param {object} cfg - `callJson({system, user})` is required; optional
 *   `renderState`, `renderTraceSummary` and `systemPromptAddendum`.
 * @returns {function(object): Promise<object>} Reviewer taking the review input
 *   (`goal`, `shot`, `state`, `traceSummary`, `verification`, `memory`).
 */
function createLlmReviewer(cfg) {
  const renderState = cfg.renderState ?? ((s) => safeJson(s));
  const renderTraceSummary = cfg.renderTraceSummary ?? ((s) => s === void 0 ? "(none)" : safeJson(s));
  let system = REVIEWER_SYSTEM_PROMPT;
  if (cfg.systemPromptAddendum) {
    system = `${REVIEWER_SYSTEM_PROMPT}\n\n${cfg.systemPromptAddendum}`;
  }

  // One memory entry becomes a four-line summary; long fields are cut at 400 chars.
  const describeEntry = (m) => {
    const scorePart = typeof m.verification.score === "number" ? ` score=${m.verification.score.toFixed(2)}` : "";
    const headline = `shot ${m.shot} \u2014 verification.pass=${m.verification.pass}` + scorePart + ` confidence=${m.confidence.toFixed(2)} failing=[${(m.verification.failingLayers ?? []).join(",")}]`;
    return [
      headline,
      ` observations: ${m.observations.slice(0, 400)}`,
      ` diagnosis: ${m.diagnosis.slice(0, 400)}`,
      ` instruction given: ${m.nextShotInstruction.slice(0, 400)}`
    ].join("\n");
  };

  return async (input) => {
    let memoryBlock;
    if (input.memory.length === 0) {
      memoryBlock = "(no prior shots \u2014 this is shot 1)";
    } else {
      memoryBlock = input.memory.map((m) => describeEntry(m)).join("\n\n");
    }
    const sections = [
      "=== GOAL ===",
      input.goal,
      "",
      "=== SHOT NUMBER ===",
      String(input.shot),
      "",
      "=== CURRENT STATE ===",
      renderState(input.state),
      "",
      "=== TRACE SUMMARY ===",
      renderTraceSummary(input.traceSummary),
      "",
      "=== VERIFICATION ===",
      summarizeVerification(input.verification),
      "",
      "=== REVIEWER MEMORY (prior shots) ===",
      memoryBlock,
      "",
      "=== YOUR TASK ===",
      "Return STRICT JSON:",
      "{",
      ' "observations": string (20..2000 chars, first-person worker behavior \u2014 quote counts, errors, loops)',
      ' "diagnosis": string (20..1500 chars, root cause, NOT a restatement of verification)',
      ' "nextShotInstruction": string (40..3000 chars, concrete directive to the worker)',
      ' "shouldContinue": boolean (false if verification.pass, or if thrashing, or unachievable)',
      ' "confidence": number in [0,1]',
      "}"
    ];
    const user = sections.join("\n");
    const raw = await cfg.callJson({ system, user });
    return coerceReview(raw);
  };
}
|
|
1103
|
-
/**
 * Validates a raw reviewer answer and normalizes it into a review object.
 *
 * Only the five known fields are kept, and confidence is converted with
 * `Number()` and clamped to [0, 1].
 *
 * @param {*} raw - Value returned by the reviewer.
 * @returns {{observations: string, diagnosis: string, nextShotInstruction: string, shouldContinue: boolean, confidence: number}}
 * @throws {Error} When `raw` is not an object, a required string field is
 *   missing or empty, `shouldContinue` is not a boolean, or confidence is not finite.
 */
function coerceReview(raw) {
  if (!raw || typeof raw !== "object") {
    throw new Error("reviewer returned non-object");
  }
  const asText = (field) => (typeof field === "string" ? field : "");
  const observations = asText(raw.observations);
  const diagnosis = asText(raw.diagnosis);
  const nextShotInstruction = asText(raw.nextShotInstruction);
  const complete = Boolean(observations && diagnosis && nextShotInstruction);
  if (!complete) {
    throw new Error("reviewer missing required string fields");
  }
  if (typeof raw.shouldContinue !== "boolean") {
    throw new Error("reviewer missing shouldContinue boolean");
  }
  const numericConfidence = Number(raw.confidence);
  if (!Number.isFinite(numericConfidence)) {
    throw new Error("reviewer confidence not finite");
  }
  const confidence = Math.max(0, Math.min(1, numericConfidence));
  return { observations, diagnosis, nextShotInstruction, shouldContinue: raw.shouldContinue, confidence };
}
|
|
1128
|
-
/**
 * Renders a verification result as a short text block for the reviewer prompt.
 *
 * The first line carries `pass`, the score (three decimals, when numeric) and
 * the failing layers (when any). If `details` is present it follows on a new
 * line as JSON, truncated to 1500 characters.
 *
 * @param {{pass: boolean, score?: number, failingLayers?: string[], details?: *}} v - Verification result.
 * @returns {string}
 */
function summarizeVerification(v) {
  const parts = [`pass=${v.pass}`];
  if (typeof v.score === "number") {
    parts.push(` score=${v.score.toFixed(3)}`);
  }
  if (v.failingLayers && v.failingLayers.length > 0) {
    parts.push(` failing=[${v.failingLayers.join(", ")}]`);
  }
  if (v.details !== void 0) {
    parts.push(`\n${safeJson(v.details).slice(0, 1500)}`);
  }
  return parts.join("");
}
|
|
1134
|
-
/**
 * Serializes a value as pretty-printed JSON without ever throwing.
 *
 * @param {*} x - Value to render.
 * @returns {string} Indented JSON, or `String(x)` when the value cannot be
 *   represented as JSON (cycles, BigInt, undefined, functions, symbols).
 */
function safeJson(x) {
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; callers slice/join the result, so always hand back a string.
    return JSON.stringify(x, null, 2) ?? String(x);
  } catch {
    return String(x);
  }
}
|
|
1141
|
-
|
|
1142
|
-
// src/propose-review-control.ts
|
|
1143
|
-
var DEFAULT_FALLBACK_INSTRUCTION2 = "Inspect the verification failures above. Fix the critical issues first, then the major ones. Do not restate the failures \u2014 act on them.";

/**
 * Runs the propose -> verify -> review cycle on top of `runAgentControlLoop`.
 *
 * Each control-loop step performs one shot: propose, verify, and (when
 * verification fails) review. The shot's outcome is appended to the review
 * memory and stored in a closure-level snapshot that `observe` hands back to
 * the loop. The loop stops with a pass when verification passes, and with a
 * failure when the reviewer asks to stop or reviewer confidence stays at or
 * below `confidenceFloor` for `confidenceFloorWindow` consecutive reviews.
 * A throwing reviewer does not stop the loop; the previous review (or a
 * fallback review) is reused.
 *
 * @param {object} config - `goal`, `initialState`, `propose`, `verify`,
 *   `review`, plus optional `maxShots` (10), `maxWallMs`, `confidenceFloor`
 *   (0.3), `confidenceFloorWindow` (2), `memory`, `fallbackInstruction`,
 *   `failureClassFromVerification`, `actionFailure` ("stop"), `store`,
 *   `scenarioId`, `projectId`, `variantId`.
 * @returns {Promise<object>} The result of `runAgentControlLoop`.
 */
async function runProposeReviewAsControlLoop(config) {
  const maxShots = config.maxShots ?? 10;
  const confidenceFloor = config.confidenceFloor ?? 0.3;
  const confidenceFloorWindow = config.confidenceFloorWindow ?? 2;
  const memory = config.memory ?? inMemoryReviewStore();
  const fallbackInstruction = config.fallbackInstruction ?? DEFAULT_FALLBACK_INSTRUCTION2;
  const failureClassFromVerification = config.failureClassFromVerification ?? controlFailureClassFromVerification;
  let lowConfidenceStreak = 0;
  // Latest shot snapshot; `act` replaces it and `observe` returns it.
  let current = {
    shot: 0,
    state: config.initialState,
    priorReview: null,
    verification: { pass: false },
    memory: await memory.load(),
    completed: false,
    reviewAvailable: false
  };

  const validate = ({ state }) => {
    const { verification } = state;
    let detail = "verification passed";
    if (!verification.pass) {
      detail = `verification failed${verification.failingLayers?.length ? `: ${verification.failingLayers.join(", ")}` : ""}`;
    }
    return [
      objectiveEval({
        id: "verification",
        passed: verification.pass,
        score: verification.score,
        severity: "critical",
        detail
      })
    ];
  };

  const shouldStop = ({ state }) => {
    if (state.verification.pass) {
      return { stop: true, pass: true, reason: "verification passed", score: state.verification.score };
    }
    if (!state.completed) {
      return { stop: false, pass: false, reason: "verification still failing", score: state.verification.score };
    }
    return {
      stop: true,
      pass: false,
      reason: "reviewer stopped continuation",
      score: state.verification.score,
      failureClass: failureClassFromVerification(state.verification)
    };
  };

  const decide = ({ state }) => ({
    type: "continue",
    action: { type: "propose-review-shot", shot: state.shot + 1 },
    reason: state.priorReview?.nextShotInstruction ?? fallbackInstruction
  });

  const act = async (action, ctx) => {
    const shot = action.shot;
    const proposeOut = await config.propose({
      shot,
      goal: config.goal,
      state: current.state,
      priorReview: current.priorReview,
      abortSignal: ctx.abortSignal,
      emitter: ctx.emitter
    });
    const nextState = proposeOut.state;
    const verification = await config.verify(nextState);
    let review = null;
    let reviewAvailable = false;
    let reviewError;
    let shouldContinue = !verification.pass;
    if (verification.pass) {
      review = {
        observations: "Verification passed.",
        diagnosis: "No further revision needed.",
        nextShotInstruction: "",
        shouldContinue: false,
        confidence: 1
      };
    } else {
      try {
        review = await config.review({
          shot,
          goal: config.goal,
          state: nextState,
          verification,
          traceSummary: proposeOut.traceSummary,
          memory: await memory.load()
        });
        reviewAvailable = true;
        shouldContinue = review.shouldContinue;
        if (review.confidence <= confidenceFloor) {
          lowConfidenceStreak += 1;
        } else {
          lowConfidenceStreak = 0;
        }
        if (confidenceFloorWindow > 0 && lowConfidenceStreak >= confidenceFloorWindow) {
          shouldContinue = false;
        }
      } catch (err) {
        // Reviewer failure: fall back to the previous review and keep going.
        reviewError = err instanceof Error ? err.message : String(err);
        review = current.priorReview ?? {
          observations: "Reviewer unavailable.",
          diagnosis: reviewError,
          nextShotInstruction: fallbackInstruction,
          shouldContinue: true,
          confidence: 0
        };
        shouldContinue = true;
      }
    }
    const reviewFields = review ?? {
      observations: "No review.",
      diagnosis: "",
      nextShotInstruction: fallbackInstruction,
      shouldContinue,
      confidence: 0
    };
    const entry = {
      ...reviewFields,
      shot,
      timestamp: Date.now(),
      verification: {
        pass: verification.pass,
        score: verification.score,
        failingLayers: verification.failingLayers
      }
    };
    await memory.append(entry);
    current = {
      shot,
      state: nextState,
      priorReview: review,
      verification,
      traceSummary: proposeOut.traceSummary,
      memory: await memory.load(),
      completed: verification.pass || !shouldContinue,
      reviewAvailable,
      reviewError
    };
    return {
      state: nextState,
      verification,
      traceSummary: proposeOut.traceSummary,
      review,
      reviewAvailable,
      reviewError
    };
  };

  return runAgentControlLoop({
    intent: config.goal,
    budget: { maxSteps: maxShots, maxWallMs: config.maxWallMs },
    store: config.store,
    scenarioId: config.scenarioId ?? "propose-review-control",
    projectId: config.projectId,
    variantId: config.variantId,
    actionFailure: config.actionFailure ?? "stop",
    observe: () => current,
    validate,
    shouldStop,
    decide,
    act
  });
}
|
|
1288
|
-
/**
 * Default mapping from a verification result to a failure class.
 *
 * @param {{pass: boolean, failingLayers?: string[]}} verification - Verification result.
 * @returns {string|undefined} Undefined when verification passed,
 *   "instruction_following" when failing layers are listed, otherwise "unknown".
 */
function controlFailureClassFromVerification(verification) {
  if (verification.pass) {
    return void 0;
  }
  const hasFailingLayers = Boolean(verification.failingLayers?.length);
  return hasFailingLayers ? "instruction_following" : "unknown";
}
|
|
1292
|
-
|
|
1293
|
-
export {
|
|
1294
|
-
runAgentControlLoop,
|
|
1295
|
-
stopOnNoProgress,
|
|
1296
|
-
stopOnRepeatedAction,
|
|
1297
|
-
objectiveEval,
|
|
1298
|
-
subjectiveEval,
|
|
1299
|
-
allCriticalPassed,
|
|
1300
|
-
controlRunToRunRecord,
|
|
1301
|
-
scoreFromEvals,
|
|
1302
|
-
evaluateActionPolicy,
|
|
1303
|
-
inMemoryReviewStore,
|
|
1304
|
-
jsonlReviewStore,
|
|
1305
|
-
runProposeReview,
|
|
1306
|
-
createLlmReviewer,
|
|
1307
|
-
runProposeReviewAsControlLoop,
|
|
1308
|
-
controlFailureClassFromVerification
|
|
1309
|
-
};
|
|
1310
|
-
//# sourceMappingURL=chunk-V5QSWN7L.js.map
|