@tangle-network/agent-eval 0.23.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +102 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +35 -2
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-4W4NCYM2.js +0 -1945
  129. package/dist/chunk-4W4NCYM2.js.map +0 -1
  130. package/dist/chunk-5IIQKMD5.js.map +0 -1
  131. package/dist/chunk-6KQG5HAH.js.map +0 -1
  132. package/dist/chunk-6M774GY6.js.map +0 -1
  133. package/dist/chunk-7EAUOUQS.js.map +0 -1
  134. package/dist/chunk-AXHNWLIX.js.map +0 -1
  135. package/dist/chunk-EXGR4XEM.js.map +0 -1
  136. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  137. package/dist/chunk-KAO3Q65R.js.map +0 -1
  138. package/dist/chunk-LZKIOBG2.js +0 -2026
  139. package/dist/chunk-LZKIOBG2.js.map +0 -1
  140. package/dist/chunk-QBW3YBTR.js.map +0 -1
  141. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  142. package/dist/chunk-SQQLHODJ.js.map +0 -1
  143. package/dist/chunk-V5QSWN7L.js +0 -1310
  144. package/dist/chunk-V5QSWN7L.js.map +0 -1
  145. package/dist/chunk-VQQSPGSM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,838 @@
1
+ import {
2
+ TraceEmitter
3
+ } from "./chunk-TVVP3ZZQ.js";
4
+
5
+ // src/control-runtime.ts
6
// Fallback loop limits merged underneath the caller's budget by normalizeBudget.
var DEFAULT_BUDGET = {
  // Maximum number of loop iterations.
  maxSteps: 8,
  // Wall-clock limit in milliseconds (five minutes).
  maxWallMs: 5 * 60 * 1000
};
10
/**
 * Drive an observe -> validate -> (stop?) -> decide -> act loop until a stop
 * condition fires, and return a result record describing how the run ended.
 *
 * Fields read from `config` in this function:
 * - `intent` (string; sliced for trace tags and echoed into the result)
 * - `budget` (normalized via normalizeBudget; may throw RangeError before any work starts)
 * - `actionFailure` ("stop" ends the run when `act` throws; anything else continues; default "continue")
 * - `signal` (optional upstream AbortSignal mirrored onto an internal controller)
 * - `store` (optional; when present a TraceEmitter is created over it)
 * - `scenarioId`, `projectId`, `variantId` (forwarded to `emitter.startRun`)
 * - `observe`, `validate`, `decide`, `act` (required async callbacks)
 * - `shouldStop` (optional; defaults to defaultStopDecision over the current evals)
 * - `getActionCostUsd` (optional; its value is validated by normalizeActionCostUsd)
 * - `stopPolicies` (optional; used for fingerprints, repeated-action and no-progress checks)
 * - `onStep` (optional per-step callback; its errors are recorded, not thrown)
 *
 * The returned object always carries `intent`, `pass`, `completed`, `reason`,
 * `steps`, `finalState`, `finalEvals`, `wallMs`, `spentCostUsd`, `runId`,
 * `failureClass`, `runtimeErrors` and `stoppedBy`; `score` is present on most
 * exits. `stoppedBy` is one of "runtime-error", "abort", "budget",
 * "stop-policy" or "policy".
 *
 * @param {object} config - loop configuration (see field list above).
 * @returns {Promise<object>} the result record passed through `finish`.
 * @throws {RangeError} when `config.budget` fails normalizeBudget validation.
 */
async function runAgentControlLoop(config) {
  const budget = normalizeBudget(config.budget);
  const actionFailure = config.actionFailure ?? "continue";
  // Internal controller: aborted either by the upstream signal or by the wall timer.
  const controller = new AbortController();
  const upstreamAbort = () => controller.abort(config.signal?.reason);
  if (config.signal) {
    if (config.signal.aborted) controller.abort(config.signal.reason);
    else config.signal.addEventListener("abort", upstreamAbort, { once: true });
  }
  const started = Date.now();
  // The wall timer only flips the abort signal; the loop notices it at the top of the next iteration.
  const wallTimer = budget.maxWallMs ? setTimeout(
    () => controller.abort(new Error("control runtime wall timeout")),
    budget.maxWallMs
  ) : void 0;
  const history = [];
  const emitter = config.store ? new TraceEmitter(config.store) : void 0;
  let spentCostUsd = 0;
  // Collected non-fatal and fatal errors; the same array instance is returned in the result.
  const runtimeErrors = [];
  let lastStateFingerprint;
  let lastActionFingerprint;
  let noProgressStreak = 0;
  let repeatedActionStreak = 0;
  try {
    if (emitter) {
      // Trace writes go through runTrace so a failing store is recorded instead of aborting the run.
      await runTrace(
        runtimeErrors,
        0,
        () => emitter.startRun({
          scenarioId: config.scenarioId ?? "agent-control-loop",
          projectId: config.projectId,
          variantId: config.variantId,
          layer: "meta",
          tags: {
            intent: config.intent.slice(0, 120),
            maxSteps: String(budget.maxSteps),
            ...budget.maxCostUsd !== void 0 ? { maxCostUsd: String(budget.maxCostUsd) } : {}
          }
        })
      );
    }
    let state;
    let evals;
    // Initial observation; a failure here ends the run before any step is taken.
    try {
      state = await config.observe({ history, abortSignal: controller.signal });
    } catch (err) {
      const error = runtimeError("observe", 0, err);
      runtimeErrors.push(error);
      return finish(emitter, {
        intent: config.intent,
        pass: false,
        completed: false,
        reason: error.message,
        steps: history,
        finalState: void 0,
        finalEvals: [],
        wallMs: Date.now() - started,
        spentCostUsd,
        runId: emitter?.runId ?? null,
        failureClass: "unknown",
        runtimeErrors,
        stoppedBy: "runtime-error"
      });
    }
    // Initial validation; the resulting evals are also recorded as judge spans under phase "initial".
    try {
      evals = await config.validate({
        intent: config.intent,
        state,
        history,
        abortSignal: controller.signal
      });
      await recordEvalSpans(emitter, evals, "initial", runtimeErrors, 0);
    } catch (err) {
      const error = runtimeError("validate", 0, err);
      runtimeErrors.push(error);
      return finish(emitter, {
        intent: config.intent,
        pass: false,
        completed: false,
        reason: error.message,
        steps: history,
        finalState: state,
        finalEvals: [],
        wallMs: Date.now() - started,
        spentCostUsd,
        runId: emitter?.runId ?? null,
        failureClass: "unknown",
        runtimeErrors,
        stoppedBy: "runtime-error"
      });
    }
    lastStateFingerprint = fingerprintState(state, config.stopPolicies);
    for (let stepIndex = 0; stepIndex < budget.maxSteps; stepIndex++) {
      // 1. Abort check (upstream signal or wall timer).
      // NOTE(review): failureClass is "timeout" even when the abort came from config.signal — confirm intended.
      if (controller.signal.aborted) {
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: abortReason(controller.signal),
          score: void 0,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "timeout",
          runtimeErrors,
          stoppedBy: "abort"
        });
      }
      // 2. Cost budget check before spending anything further.
      const budgetStop = budgetStopDecision(budget, spentCostUsd);
      if (budgetStop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: budgetStop.reason,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "budget_exceeded",
          runtimeErrors,
          stoppedBy: "budget"
        });
      }
      // Snapshot handed to shouldStop, decide and act for this step.
      const ctx = makeContext(
        config.intent,
        state,
        evals,
        history,
        budget,
        stepIndex,
        started,
        spentCostUsd,
        controller.signal,
        emitter
      );
      // 3. Pre-step stop policy (caller-supplied or the default "all critical evals passed").
      let stop;
      try {
        stop = config.shouldStop ? await config.shouldStop(ctx) : defaultStopDecision(evals);
      } catch (err) {
        runtimeErrors.push(runtimeError("stop-policy", stepIndex, err));
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: runtimeErrors[runtimeErrors.length - 1].message,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "unknown",
          runtimeErrors,
          stoppedBy: "runtime-error"
        });
      }
      if (stop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: stop.pass,
          completed: true,
          reason: stop.reason,
          score: stop.score,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: stop.failureClass,
          runtimeErrors,
          stoppedBy: "stop-policy"
        });
      }
      // 4. Ask the policy what to do next.
      let decision;
      try {
        decision = await config.decide(ctx);
      } catch (err) {
        runtimeErrors.push(runtimeError("decide", stepIndex, err));
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: runtimeErrors[runtimeErrors.length - 1].message,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "unknown",
          runtimeErrors,
          stoppedBy: "runtime-error"
        });
      }
      // A decision of type "stop" ends the run; a missing `pass` is treated as false.
      if (decision.type === "stop") {
        return finish(emitter, {
          intent: config.intent,
          pass: decision.pass ?? false,
          completed: true,
          reason: decision.reason,
          score: decision.score,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: decision.pass === false ? "unknown" : void 0,
          runtimeErrors,
          stoppedBy: "policy"
        });
      }
      // 5. Repeated-action guard: the streak counts consecutive identical action fingerprints (starts at 1).
      const actionFingerprint = fingerprintAction(decision.action, config.stopPolicies);
      repeatedActionStreak = actionFingerprint === lastActionFingerprint ? repeatedActionStreak + 1 : 1;
      lastActionFingerprint = actionFingerprint;
      const repeatedActionStop = repeatedActionStopDecision(
        config.stopPolicies,
        repeatedActionStreak
      );
      if (repeatedActionStop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: true,
          reason: repeatedActionStop.reason,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "tool_recovery_failure",
          runtimeErrors,
          stoppedBy: "stop-policy"
        });
      }
      // Capture the "before" view so the step record can show both sides of the action.
      const beforeState = state;
      const evalsBefore = evals;
      const scoreBefore = averageScore(evals);
      const actionStarted = Date.now();
      // Open a tool span for this step; stepHandle is undefined without an emitter or if the trace write failed.
      const stepHandle = emitter ? await runTrace(
        runtimeErrors,
        stepIndex,
        () => emitter.tool({
          name: `control-step-${stepIndex}`,
          toolName: "agent-control-action",
          args: decision.action,
          attributes: {
            decision: decision.reason ?? "continue",
            repeatedActionStreak
          }
        })
      ) : void 0;
      // 6. Execute the action and account for its cost.
      let actionOutcome;
      try {
        const result = await config.act(decision.action, ctx);
        // NOTE(review): getActionCostUsd runs inside this try, so if it throws the step is recorded
        // as a failed "act" even though config.act itself succeeded — confirm intended.
        const rawCostUsd = config.getActionCostUsd?.({
          action: decision.action,
          result,
          state,
          evals,
          history
        });
        const costUsd = normalizeActionCostUsd(rawCostUsd, runtimeErrors, stepIndex);
        if (costUsd !== void 0 && Number.isFinite(costUsd) && costUsd > 0) {
          spentCostUsd += costUsd;
          await recordCostBudget(
            emitter,
            budget,
            spentCostUsd,
            stepHandle,
            runtimeErrors,
            stepIndex
          );
        }
        actionOutcome = {
          ok: true,
          result,
          ...costUsd !== void 0 ? { costUsd } : {},
          durationMs: Date.now() - actionStarted
        };
      } catch (err) {
        runtimeErrors.push(runtimeError("act", stepIndex, err));
        actionOutcome = {
          ok: false,
          error: runtimeErrors[runtimeErrors.length - 1].message,
          durationMs: Date.now() - actionStarted
        };
        // With actionFailure === "stop" the failed step is recorded and the run ends here;
        // otherwise execution falls through to re-observe and re-validate.
        if (actionFailure === "stop") {
          await runTrace(
            runtimeErrors,
            stepIndex,
            () => stepHandle?.fail(actionOutcome.error ?? "action failed")
          );
          const step2 = {
            index: stepIndex,
            decision,
            beforeState,
            afterState: state,
            evalsBefore,
            evalsAfter: evals,
            actionOutcome,
            startedAt: new Date(actionStarted).toISOString(),
            endedAt: (/* @__PURE__ */ new Date()).toISOString()
          };
          history.push(step2);
          await runOnStep(config.onStep, step2, runtimeErrors);
          return finish(emitter, {
            intent: config.intent,
            pass: false,
            completed: false,
            reason: actionOutcome.error ?? "action failed",
            score: averageScore(evals),
            steps: history,
            finalState: state,
            finalEvals: evals,
            wallMs: Date.now() - started,
            spentCostUsd,
            runId: emitter?.runId ?? null,
            failureClass: "unknown",
            runtimeErrors,
            stoppedBy: "runtime-error"
          });
        }
      }
      // 7. Re-observe after the action; on failure the step is recorded with afterState = beforeState.
      try {
        state = await config.observe({ history, abortSignal: controller.signal });
      } catch (err) {
        runtimeErrors.push(runtimeError("observe", stepIndex, err));
        const step2 = {
          index: stepIndex,
          decision,
          beforeState,
          afterState: beforeState,
          evalsBefore,
          evalsAfter: evals,
          actionOutcome,
          startedAt: new Date(actionStarted).toISOString(),
          endedAt: (/* @__PURE__ */ new Date()).toISOString()
        };
        history.push(step2);
        await runTrace(
          runtimeErrors,
          stepIndex,
          () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message)
        );
        await runOnStep(config.onStep, step2, runtimeErrors);
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: runtimeErrors[runtimeErrors.length - 1].message,
          score: averageScore(evals),
          steps: history,
          finalState: beforeState,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "unknown",
          runtimeErrors,
          stoppedBy: "runtime-error"
        });
      }
      // 8. Re-validate the new state; on failure `evals` still holds the pre-step results.
      try {
        evals = await config.validate({
          intent: config.intent,
          state,
          history,
          abortSignal: controller.signal
        });
        await recordEvalSpans(
          emitter,
          evals,
          `step-${stepIndex}`,
          runtimeErrors,
          stepIndex,
          stepHandle?.span.spanId
        );
      } catch (err) {
        runtimeErrors.push(runtimeError("validate", stepIndex, err));
        const step2 = {
          index: stepIndex,
          decision,
          beforeState,
          afterState: state,
          evalsBefore,
          evalsAfter: evals,
          actionOutcome,
          startedAt: new Date(actionStarted).toISOString(),
          endedAt: (/* @__PURE__ */ new Date()).toISOString()
        };
        history.push(step2);
        await runTrace(
          runtimeErrors,
          stepIndex,
          () => stepHandle?.fail(runtimeErrors[runtimeErrors.length - 1].message)
        );
        await runOnStep(config.onStep, step2, runtimeErrors);
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: runtimeErrors[runtimeErrors.length - 1].message,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "unknown",
          runtimeErrors,
          stoppedBy: "runtime-error"
        });
      }
      // 9. No-progress bookkeeping: compare state fingerprints and average scores across the step.
      const scoreAfter = averageScore(evals);
      const stateFingerprint = fingerprintState(state, config.stopPolicies);
      const noProgressStop = noProgressStopDecision({
        policies: config.stopPolicies,
        lastStateFingerprint,
        stateFingerprint,
        scoreBefore,
        scoreAfter,
        currentStreak: noProgressStreak
      });
      noProgressStreak = noProgressStop.streak;
      lastStateFingerprint = stateFingerprint;
      // Record the completed step before any stop decision so it always appears in history.
      const step = {
        index: stepIndex,
        decision,
        beforeState,
        afterState: state,
        evalsBefore,
        evalsAfter: evals,
        actionOutcome,
        startedAt: new Date(actionStarted).toISOString(),
        endedAt: (/* @__PURE__ */ new Date()).toISOString()
      };
      history.push(step);
      // Close the step's trace span as ended or failed depending on the action outcome.
      if (actionOutcome.ok) {
        await runTrace(
          runtimeErrors,
          stepIndex,
          () => stepHandle?.end({
            attributes: {
              actionCostUsd: actionOutcome.costUsd ?? null,
              spentCostUsd,
              scoreBefore: scoreBefore ?? null,
              scoreAfter: scoreAfter ?? null,
              noProgressStreak
            }
          })
        );
      } else {
        await runTrace(
          runtimeErrors,
          stepIndex,
          () => stepHandle?.fail(actionOutcome.error ?? "action failed", {
            attributes: {
              spentCostUsd,
              noProgressStreak
            }
          })
        );
      }
      await runOnStep(config.onStep, step, runtimeErrors);
      // 10. Post-step exits, in order: no-progress, cost budget, stop policy.
      if (noProgressStop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: true,
          reason: noProgressStop.reason,
          score: scoreAfter,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "tool_recovery_failure",
          runtimeErrors,
          stoppedBy: "stop-policy"
        });
      }
      const postStepBudgetStop = budgetStopDecision(budget, spentCostUsd);
      if (postStepBudgetStop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: postStepBudgetStop.reason,
          score: scoreAfter,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "budget_exceeded",
          runtimeErrors,
          stoppedBy: "budget"
        });
      }
      // The post-step context uses stepIndex + 1, i.e. the index of the step that would run next.
      const postStepCtx = makeContext(
        config.intent,
        state,
        evals,
        history,
        budget,
        stepIndex + 1,
        started,
        spentCostUsd,
        controller.signal,
        emitter
      );
      // Checking the stop policy here lets a run that succeeds on its last allowed step
      // finish via the policy instead of falling out of the loop as maxSteps exhaustion.
      let postStepStop;
      try {
        postStepStop = config.shouldStop ? await config.shouldStop(postStepCtx) : defaultStopDecision(evals);
      } catch (err) {
        runtimeErrors.push(runtimeError("stop-policy", stepIndex + 1, err));
        return finish(emitter, {
          intent: config.intent,
          pass: false,
          completed: false,
          reason: runtimeErrors[runtimeErrors.length - 1].message,
          score: averageScore(evals),
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: "unknown",
          runtimeErrors,
          stoppedBy: "runtime-error"
        });
      }
      if (postStepStop.stop) {
        return finish(emitter, {
          intent: config.intent,
          pass: postStepStop.pass,
          completed: true,
          reason: postStepStop.reason,
          score: postStepStop.score,
          steps: history,
          finalState: state,
          finalEvals: evals,
          wallMs: Date.now() - started,
          spentCostUsd,
          runId: emitter?.runId ?? null,
          failureClass: postStepStop.failureClass,
          runtimeErrors,
          stoppedBy: "stop-policy"
        });
      }
    }
    // Loop ran maxSteps iterations without any stop condition firing.
    return finish(emitter, {
      intent: config.intent,
      pass: false,
      completed: false,
      reason: `budget exhausted: maxSteps=${budget.maxSteps}`,
      steps: history,
      finalState: state,
      finalEvals: evals,
      wallMs: Date.now() - started,
      spentCostUsd,
      runId: emitter?.runId ?? null,
      failureClass: "budget_exceeded",
      runtimeErrors,
      stoppedBy: "budget"
    });
  } catch (err) {
    // Safety net for anything not handled by the per-phase try/catch blocks above
    // (for example a throwing custom fingerprint function).
    // NOTE(review): the phase is always recorded as "act" and finalState/finalEvals are dropped
    // here regardless of where the error originated — confirm intended.
    runtimeErrors.push(runtimeError("act", history.length, err));
    return finish(emitter, {
      intent: config.intent,
      pass: false,
      completed: false,
      reason: runtimeErrors[runtimeErrors.length - 1].message,
      steps: history,
      finalState: void 0,
      finalEvals: [],
      wallMs: Date.now() - started,
      spentCostUsd,
      runId: emitter?.runId ?? null,
      failureClass: "unknown",
      runtimeErrors,
      stoppedBy: "runtime-error"
    });
  } finally {
    // Always release the wall timer and the upstream abort listener, whichever exit was taken.
    if (wallTimer) clearTimeout(wallTimer);
    if (config.signal) config.signal.removeEventListener("abort", upstreamAbort);
  }
}
614
/**
 * Build a stop-policy object carrying a no-progress step limit.
 *
 * @param {number} maxNoProgressSteps - value stored under `maxNoProgressSteps`.
 * @param {object} [options] - extra policy fields, copied first; a same-named key is overridden.
 * @returns {object} a new object; `options` is not mutated.
 */
function stopOnNoProgress(maxNoProgressSteps, options = {}) {
  const policy = Object.assign({}, options);
  policy.maxNoProgressSteps = maxNoProgressSteps;
  return policy;
}
617
/**
 * Build a stop-policy object carrying a repeated-action limit.
 *
 * @param {number} maxRepeatedActions - value stored under `maxRepeatedActions`.
 * @param {object} [options] - extra policy fields, copied first; a same-named key is overridden.
 * @returns {object} a new object; `options` is not mutated.
 */
function stopOnRepeatedAction(maxRepeatedActions, options = {}) {
  const policy = Object.assign({}, options);
  policy.maxRepeatedActions = maxRepeatedActions;
  return policy;
}
620
/**
 * Copy an eval result and tag it as objective.
 *
 * @param {object} input - eval fields to copy.
 * @returns {object} a new object with `objective` forced to `true`.
 */
function objectiveEval(input) {
  return Object.assign({}, input, { objective: true });
}
623
/**
 * Copy an eval result and tag it as subjective.
 *
 * @param {object} input - eval fields to copy.
 * @returns {object} a new object with `objective` forced to `false`.
 */
function subjectiveEval(input) {
  return Object.assign({}, input, { objective: false });
}
626
/**
 * Merge the caller's budget over DEFAULT_BUDGET and validate each limit.
 *
 * @param {object} [input] - partial budget (`maxSteps`, `maxWallMs`, `maxCostUsd`).
 * @returns {object} a fresh budget holding `maxSteps` plus whichever optional
 *   limits are defined after the merge.
 * @throws {RangeError} when `maxSteps` is not an integer >= 1, `maxWallMs` is not a
 *   positive finite number, or `maxCostUsd` is not a nonnegative finite number.
 */
function normalizeBudget(input) {
  const { maxSteps, maxWallMs, maxCostUsd } = Object.assign({}, DEFAULT_BUDGET, input);

  const stepsOk = Number.isInteger(maxSteps) && maxSteps >= 1;
  if (!stepsOk) {
    throw new RangeError(
      `ControlRuntime budget.maxSteps must be an integer >= 1, got ${String(maxSteps)}`
    );
  }
  const normalized = { maxSteps };

  // An explicitly undefined limit (after the merge) means "no limit of this kind".
  if (maxWallMs !== undefined) {
    const wallOk = typeof maxWallMs === "number" && Number.isFinite(maxWallMs) && maxWallMs > 0;
    if (!wallOk) {
      throw new RangeError(
        `ControlRuntime budget.maxWallMs must be a positive finite number, got ${String(maxWallMs)}`
      );
    }
    normalized.maxWallMs = maxWallMs;
  }

  if (maxCostUsd !== undefined) {
    const costOk = typeof maxCostUsd === "number" && Number.isFinite(maxCostUsd) && maxCostUsd >= 0;
    if (!costOk) {
      throw new RangeError(
        `ControlRuntime budget.maxCostUsd must be a nonnegative finite number, got ${String(maxCostUsd)}`
      );
    }
    normalized.maxCostUsd = maxCostUsd;
  }

  return normalized;
}
652
/**
 * Validate a per-action cost reported by the caller.
 *
 * @param {*} costUsd - raw cost; `undefined` means "not reported".
 * @param {Array} runtimeErrors - receives an "act" error entry when the cost is invalid.
 * @param {number} stepIndex - step the cost belongs to.
 * @returns {number|undefined} the cost when it is a finite number >= 0, otherwise `undefined`.
 */
function normalizeActionCostUsd(costUsd, runtimeErrors, stepIndex) {
  if (costUsd === undefined) return undefined;
  const usable = Number.isFinite(costUsd) && costUsd >= 0;
  if (usable) return costUsd;
  // Invalid costs are recorded rather than thrown so the step itself still counts.
  const problem = new Error(`invalid action costUsd: ${String(costUsd)}`);
  runtimeErrors.push(runtimeError("act", stepIndex, problem));
  return undefined;
}
662
/**
 * Report whether no eval with severity "critical" or "error" is failing.
 *
 * @param {Array<object>} evals - eval results with `passed` and `severity`.
 * @returns {boolean} `true` when every failing eval has some other severity
 *   (also `true` for an empty list).
 */
function allCriticalPassed(evals) {
  const isBlockingFailure = (result) =>
    !result.passed && (result.severity === "critical" || result.severity === "error");
  return !evals.some(isBlockingFailure);
}
667
/**
 * Assemble the context object handed to the loop's callbacks.
 *
 * `wallMs` is the time elapsed since `started`; `remainingCostUsd` is
 * `undefined` when the budget has no cost cap and is otherwise clamped at 0.
 *
 * @returns {object} context with intent, state, evals, history, budget,
 *   stepIndex, wallMs, spentCostUsd, remainingCostUsd, abortSignal and emitter.
 */
function makeContext(intent, state, evals, history, budget, stepIndex, started, spentCostUsd, abortSignal, emitter) {
  let remainingCostUsd;
  if (budget.maxCostUsd !== undefined) {
    remainingCostUsd = Math.max(0, budget.maxCostUsd - spentCostUsd);
  }
  const wallMs = Date.now() - started;
  return {
    intent,
    state,
    evals,
    history,
    budget,
    stepIndex,
    wallMs,
    spentCostUsd,
    remainingCostUsd,
    abortSignal,
    emitter
  };
}
682
/**
 * Stop policy used when the caller supplies no `shouldStop`.
 *
 * @param {Array<object>} evals - current eval results.
 * @returns {object} `{ stop, pass, reason }` (plus `score` when evals exist):
 *   stops with a pass once no "critical"/"error" eval is failing; never stops
 *   on an empty eval list.
 */
function defaultStopDecision(evals) {
  if (!evals.length) {
    return { stop: false, pass: false, reason: "no evals yet" };
  }
  const pass = allCriticalPassed(evals);
  const score = averageScore(evals);
  if (pass) {
    return { stop: true, pass: true, reason: "all critical evals passed", score };
  }
  return { stop: false, pass: false, reason: "critical evals still failing", score };
}
692
/**
 * Average the numeric `score` fields of a list of eval results.
 *
 * Only finite numbers count: missing scores are skipped, and so are NaN and
 * +/-Infinity. A single non-finite score would otherwise turn the average into
 * NaN/Infinity, which in turn makes score-delta comparisons always false.
 *
 * @param {Array<object>} evals - eval results, each optionally carrying `score`.
 * @returns {number|undefined} the mean rounded to three decimals, or
 *   `undefined` when no eval has a usable score.
 */
function averageScore(evals) {
  const scored = evals.map((result) => result.score).filter((score) => Number.isFinite(score));
  if (!scored.length) return void 0;
  const total = scored.reduce((sum, score) => sum + score, 0);
  // Round to 3 decimals so tiny float noise does not register as a score change.
  return Math.round(total / scored.length * 1e3) / 1e3;
}
697
/**
 * Decide whether the cost cap has been reached.
 *
 * @param {object} budget - normalized budget; `maxCostUsd` may be undefined (no cap).
 * @param {number} spentCostUsd - cost accumulated so far.
 * @returns {{stop: boolean, reason: string}} stops once spend is >= the cap.
 */
function budgetStopDecision(budget, spentCostUsd) {
  const cap = budget.maxCostUsd;
  const capReached = cap !== undefined && spentCostUsd >= cap;
  if (!capReached) {
    return { stop: false, reason: "" };
  }
  return { stop: true, reason: `budget exhausted: maxCostUsd=${cap}` };
}
706
/**
 * Write a "usd" budget record to the trace, if tracing and a cost cap are both active.
 *
 * Trace failures are captured into `runtimeErrors` by runTrace rather than thrown.
 *
 * @param {object|undefined} emitter - trace emitter, or undefined when tracing is off.
 * @param {object} budget - normalized budget; nothing is written without `maxCostUsd`.
 * @param {number} spentCostUsd - cost accumulated so far.
 * @param {object|undefined} handle - step span handle whose span id is attached when present.
 * @param {Array} runtimeErrors - sink for trace errors.
 * @param {number} stepIndex - step the record belongs to.
 * @returns {Promise<void>}
 */
async function recordCostBudget(emitter, budget, spentCostUsd, handle, runtimeErrors, stepIndex) {
  const limit = budget.maxCostUsd;
  if (!emitter || limit === undefined) return;
  // The record is built inside the callback so any failure while building it is captured too.
  const writeBudget = () => emitter.recordBudget({
    dimension: "usd",
    limit,
    consumed: spentCostUsd,
    remaining: Math.max(0, limit - spentCostUsd),
    breached: spentCostUsd >= limit,
    spanId: handle?.span.spanId
  });
  await runTrace(runtimeErrors, stepIndex, writeBudget);
}
722
/**
 * Record one judge entry per eval result on the trace, sequentially.
 *
 * Each write goes through runTrace, so a failing write is appended to
 * `runtimeErrors` and the remaining evals are still recorded.
 *
 * @param {object|undefined} emitter - trace emitter; no-op when undefined.
 * @param {Array<object>} evals - eval results to record.
 * @param {string} phase - label stored in each record's attributes.
 * @param {Array} runtimeErrors - sink for trace errors.
 * @param {number} stepIndex - step the evals belong to.
 * @param {string} [targetSpanId] - span to attach to; falls back to `emitter.runId`.
 * @returns {Promise<void>}
 */
async function recordEvalSpans(emitter, evals, phase, runtimeErrors, stepIndex, targetSpanId) {
  if (!emitter) return;

  // Invoked only inside the runTrace callback so failures while building the record are captured.
  const toJudgeRecord = (result) => ({
    judgeId: result.objective ? "objective-validator" : "subjective-judge",
    targetSpanId: targetSpanId ?? emitter.runId,
    name: `control-eval/${result.id}`,
    dimension: result.id,
    // Without a numeric score, fall back to 1 for passed and 0 for failed.
    score: typeof result.score === "number" ? result.score : result.passed ? 1 : 0,
    rationale: result.detail,
    evidence: result.evidence,
    attributes: {
      phase,
      passed: result.passed,
      severity: result.severity,
      objective: result.objective
    }
  });

  for (const result of evals) {
    await runTrace(runtimeErrors, stepIndex, () => emitter.recordJudge(toJudgeRecord(result)));
  }
}
746
/**
 * Invoke the optional per-step callback without letting it break the loop.
 *
 * @param {Function|undefined} onStep - callback receiving the step record.
 * @param {object} step - the step record; `step.index` is used when reporting an error.
 * @param {Array} runtimeErrors - receives an "on-step" entry if the callback throws or rejects.
 * @returns {Promise<void>}
 */
async function runOnStep(onStep, step, runtimeErrors) {
  if (onStep) {
    try {
      await onStep(step);
    } catch (failure) {
      runtimeErrors.push(runtimeError("on-step", step.index, failure));
    }
  }
}
754
/**
 * Run a trace write, converting any failure into a recorded "trace" error.
 *
 * @param {Array} runtimeErrors - receives an entry when `write` throws or rejects.
 * @param {number} stepIndex - step index stored on the error entry.
 * @param {Function} write - sync or async function performing the write.
 * @returns {Promise<*>} whatever `write` resolves to, or `undefined` on failure.
 */
async function runTrace(runtimeErrors, stepIndex, write) {
  let outcome;
  try {
    outcome = await write();
  } catch (failure) {
    // outcome is left undefined; the failure is only recorded.
    runtimeErrors.push(runtimeError("trace", stepIndex, failure));
  }
  return outcome;
}
762
/**
 * Update the no-progress streak and decide whether the loop is stuck.
 *
 * A step counts as "no progress" when the state fingerprint equals the
 * previous one AND the absolute score change is below `minScoreDelta`
 * (default 0.001; missing scores are treated as 0). Any progress resets the
 * streak to 0. The check is disabled when `maxNoProgressSteps` is absent or <= 0.
 *
 * @param {object} args - `policies`, `lastStateFingerprint`, `stateFingerprint`,
 *   `scoreBefore`, `scoreAfter`, `currentStreak`.
 * @returns {{stop: boolean, reason: string, streak: number}}
 */
function noProgressStopDecision(args) {
  const limit = args.policies?.maxNoProgressSteps;
  if (!limit || limit <= 0) {
    return { stop: false, reason: "", streak: 0 };
  }

  const threshold = args.policies?.minScoreDelta ?? 1e-3;
  const before = args.scoreBefore ?? 0;
  const after = args.scoreAfter ?? 0;
  const scoreFlat = Math.abs(after - before) < threshold;
  const stateUnchanged =
    args.lastStateFingerprint !== undefined && args.lastStateFingerprint === args.stateFingerprint;

  let streak = 0;
  if (stateUnchanged && scoreFlat) {
    streak = args.currentStreak + 1;
  }

  if (streak >= limit) {
    return { stop: true, reason: `stuck: no state/score progress for ${streak} step(s)`, streak };
  }
  return { stop: false, reason: "", streak };
}
772
/**
 * Decide whether the same action has been chosen too many times in a row.
 *
 * @param {object|undefined} policies - stop policies; disabled when
 *   `maxRepeatedActions` is absent or <= 0.
 * @param {number} streak - current count of consecutive identical actions.
 * @returns {{stop: boolean, reason: string}} stops once `streak` reaches the limit.
 */
function repeatedActionStopDecision(policies, streak) {
  const limit = policies?.maxRepeatedActions;
  const keepGoing = !limit || limit <= 0 || streak < limit;
  if (keepGoing) {
    return { stop: false, reason: "" };
  }
  return { stop: true, reason: `stuck: repeated same action for ${streak} step(s)` };
}
780
/**
 * Fingerprint a state for no-progress comparison.
 *
 * @param {*} state - the observed state.
 * @param {object} [policies] - when it defines `stateFingerprint`, that function is used.
 * @returns {*} the custom fingerprint, or the default stableFingerprint of `state`.
 */
function fingerprintState(state, policies) {
  const hasCustom = Boolean(policies?.stateFingerprint);
  // Called as a method of `policies` so a custom function keeps its `this`.
  return hasCustom ? policies.stateFingerprint(state) : stableFingerprint(state);
}
784
/**
 * Fingerprint an action for repeated-action comparison.
 *
 * @param {*} action - the decided action.
 * @param {object} [policies] - when it defines `actionFingerprint`, that function is used.
 * @returns {*} the custom fingerprint, or the default stableFingerprint of `action`.
 */
function fingerprintAction(action, policies) {
  const hasCustom = Boolean(policies?.actionFingerprint);
  // Called as a method of `policies` so a custom function keeps its `this`.
  return hasCustom ? policies.actionFingerprint(action) : stableFingerprint(action);
}
788
/**
 * Produce a comparison key for an arbitrary value.
 *
 * Strings are returned as-is; numbers, booleans, null and undefined go through
 * String(); everything else is JSON-serialized after key sorting, falling
 * back to String() if serialization throws.
 *
 * @param {*} value - value to fingerprint.
 * @returns {*} the fingerprint (a string for all JSON-serializable inputs).
 */
function stableFingerprint(value) {
  if (value == null) return String(value);
  switch (typeof value) {
    case "string":
      return value;
    case "number":
    case "boolean":
      return String(value);
    default:
      try {
        return JSON.stringify(sortForFingerprint(value));
      } catch {
        return String(value);
      }
  }
}
797
/**
 * Return a copy of `value` whose object keys are sorted recursively, so that
 * JSON.stringify yields the same text regardless of key insertion order.
 *
 * Objects that define `toJSON` (e.g. Date) are converted through it first.
 * Without that step they would be rebuilt from their own enumerable keys —
 * none, for a Date — and every such value would collapse to `{}`, making
 * states that differ only in a timestamp fingerprint identically.
 *
 * @param {*} value - any JSON-like value.
 * @returns {*} primitives unchanged; arrays mapped element-wise; objects
 *   rebuilt with sorted keys.
 */
function sortForFingerprint(value) {
  if (Array.isArray(value)) return value.map((item) => sortForFingerprint(item));
  if (!value || typeof value !== "object") return value;
  if (typeof value.toJSON === "function") {
    const json = value.toJSON();
    // Guard against a toJSON that returns the object itself (would recurse forever).
    if (json !== value) return sortForFingerprint(json);
  }
  const record = value;
  const sorted = {};
  for (const key of Object.keys(record).sort()) {
    sorted[key] = sortForFingerprint(record[key]);
  }
  return sorted;
}
807
/**
 * Turn an abort signal's reason into a human-readable string.
 *
 * @param {{reason: *}} signal - an AbortSignal-like object.
 * @returns {string} the Error message, the stringified reason, or "aborted"
 *   when the reason is falsy.
 */
function abortReason(signal) {
  const { reason } = signal;
  if (reason instanceof Error) {
    return reason.message;
  }
  if (!reason) {
    return "aborted";
  }
  return String(reason);
}
812
/**
 * Build a plain runtime-error record for the result's `runtimeErrors` list.
 *
 * @param {string} phase - loop phase in which the failure occurred.
 * @param {number} stepIndex - step index at the time of failure.
 * @param {*} err - the thrown value; non-Error values are stringified.
 * @returns {{phase: string, stepIndex: number, message: string}}
 */
function runtimeError(phase, stepIndex, err) {
  let message;
  if (err instanceof Error) {
    message = err.message;
  } else {
    message = String(err);
  }
  return { phase, stepIndex, message };
}
816
/**
 * Close the trace run (when tracing is on) and hand back the result unchanged.
 *
 * The end-of-run write goes through runTrace, so a failing write is appended
 * to `result.runtimeErrors` instead of being thrown.
 *
 * @param {object|undefined} emitter - trace emitter, or undefined when tracing is off.
 * @param {object} result - the loop result; `score` falls back to the average of `finalEvals`.
 * @returns {Promise<object>} the same `result` object.
 */
async function finish(emitter, result) {
  const closeRun = () => emitter?.endRun({
    pass: result.pass,
    score: result.score ?? averageScore(result.finalEvals),
    failureClass: result.failureClass,
    notes: result.reason
  });
  await runTrace(result.runtimeErrors, result.steps.length, closeRun);
  return result;
}
829
+
830
+ export {
831
+ runAgentControlLoop,
832
+ stopOnNoProgress,
833
+ stopOnRepeatedAction,
834
+ objectiveEval,
835
+ subjectiveEval,
836
+ allCriticalPassed
837
+ };
838
+ //# sourceMappingURL=chunk-LSH4MMOZ.js.map