@tangle-network/agent-eval 0.77.0 → 0.80.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. package/README.md +50 -19
  2. package/dist/adapters/http.d.ts +2 -2
  3. package/dist/adapters/langchain.d.ts +2 -2
  4. package/dist/adapters/otel.d.ts +4 -4
  5. package/dist/{agent-profile-DYRboYWu.d.ts → agent-profile-aSEaJ9Pl.d.ts} +1 -1
  6. package/dist/analyst/index.d.ts +42 -8
  7. package/dist/analyst/index.js +32 -2
  8. package/dist/analyst/index.js.map +1 -1
  9. package/dist/authenticity/index.d.ts +54 -1
  10. package/dist/authenticity/index.js +88 -1
  11. package/dist/authenticity/index.js.map +1 -1
  12. package/dist/belief-state/index.d.ts +188 -0
  13. package/dist/belief-state/index.js +486 -0
  14. package/dist/belief-state/index.js.map +1 -0
  15. package/dist/benchmarks/index.d.ts +2 -2
  16. package/dist/calibration-Cpr3WaX3.d.ts +101 -0
  17. package/dist/campaign/index.d.ts +11 -11
  18. package/dist/campaign/index.js +4 -4
  19. package/dist/chunk-4DIJWVUT.js +131 -0
  20. package/dist/chunk-4DIJWVUT.js.map +1 -0
  21. package/dist/{chunk-7W4SM7FD.js → chunk-5LVWPNS5.js} +91 -91
  22. package/dist/chunk-5LVWPNS5.js.map +1 -0
  23. package/dist/{chunk-WYIHD6EB.js → chunk-CF67I6QY.js} +1 -1
  24. package/dist/chunk-CF67I6QY.js.map +1 -0
  25. package/dist/{chunk-XPILG2CA.js → chunk-GXHLRXDI.js} +2 -2
  26. package/dist/{chunk-F3SRAAZO.js → chunk-KWRRMR3J.js} +15 -1
  27. package/dist/chunk-KWRRMR3J.js.map +1 -0
  28. package/dist/chunk-NPCTHQIO.js +91 -0
  29. package/dist/chunk-NPCTHQIO.js.map +1 -0
  30. package/dist/{chunk-JYE3WOTE.js → chunk-RPLZ4OIB.js} +10 -1
  31. package/dist/chunk-RPLZ4OIB.js.map +1 -0
  32. package/dist/{chunk-6EKXFFGQ.js → chunk-RTWFUK6A.js} +2 -2
  33. package/dist/{chunk-XGNCBAVZ.js → chunk-XQL22JDG.js} +2 -2
  34. package/dist/{chunk-GJJNJVIR.js → chunk-XXNIODOM.js} +2 -2
  35. package/dist/contract/index.d.ts +128 -15
  36. package/dist/contract/index.js +118 -2
  37. package/dist/contract/index.js.map +1 -1
  38. package/dist/{control-BgA6BYTm.d.ts → control-CehLtoET.d.ts} +1 -1
  39. package/dist/control.d.ts +2 -2
  40. package/dist/control.js +2 -2
  41. package/dist/governance/index.d.ts +1 -1
  42. package/dist/hosted/index.d.ts +4 -4
  43. package/dist/{index-DsnOpCO6.d.ts → index-B1RKber3.d.ts} +1 -1
  44. package/dist/index.d.ts +127 -26
  45. package/dist/index.js +32 -7
  46. package/dist/index.js.map +1 -1
  47. package/dist/{insight-report-Df3lxYXM.d.ts → insight-report-dlpEzQDi.d.ts} +1 -1
  48. package/dist/{kind-factory-DW9XWPvM.d.ts → kind-factory-DqV2t1Xk.d.ts} +1 -1
  49. package/dist/meta-eval/index.d.ts +6 -99
  50. package/dist/meta-eval/index.js +7 -76
  51. package/dist/meta-eval/index.js.map +1 -1
  52. package/dist/off-policy-DiwuKKg7.d.ts +132 -0
  53. package/dist/openapi.json +1 -1
  54. package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-rnXLEqSn.d.ts} +1 -1
  55. package/dist/{provenance-B-TFszPW.d.ts → provenance-jG-Gngg8.d.ts} +3 -3
  56. package/dist/{registry-DuVYiTvw.d.ts → registry-BK0Zee01.d.ts} +1 -1
  57. package/dist/{release-report-CN8hJlhk.d.ts → release-report-CXXZlR8g.d.ts} +2 -2
  58. package/dist/reporting.d.ts +5 -5
  59. package/dist/{researcher-C_KJyIGg.d.ts → researcher-rInLj9De.d.ts} +2 -2
  60. package/dist/rl.d.ts +10 -140
  61. package/dist/rl.js +8 -122
  62. package/dist/rl.js.map +1 -1
  63. package/dist/{rubric-predictive-validity-D_4BSXGV.d.ts → rubric-predictive-validity-CLPuwiUw.d.ts} +2 -2
  64. package/dist/{run-improvement-loop-BqYH2vCR.d.ts → run-improvement-loop-BAl_aVOZ.d.ts} +2 -4
  65. package/dist/{run-record-BgTFzO2r.d.ts → run-record-sItO5ftF.d.ts} +11 -0
  66. package/dist/{semantic-concept-judge-CV9Wlx4t.d.ts → semantic-concept-judge-qXEUV2w7.d.ts} +3 -3
  67. package/dist/{summary-report-ByiOUrHj.d.ts → summary-report-BTaXq1TS.d.ts} +1 -1
  68. package/dist/traces.d.ts +1 -1
  69. package/dist/traces.js +2 -2
  70. package/dist/{types-Bba0vl1V.d.ts → types-4mm2msnR.d.ts} +12 -4
  71. package/dist/{types-CRD68aH7.d.ts → types-DRvV0zRo.d.ts} +10 -1
  72. package/dist/workflow/index.d.ts +4 -4
  73. package/dist/workflow/index.js +1 -1
  74. package/docs/auto-research-loop-end-to-end.md +1 -1
  75. package/docs/feature-guide.md +4 -4
  76. package/docs/multi-shot-optimization.md +61 -115
  77. package/docs/product-eval-adoption.md +1 -1
  78. package/docs/research/belief-state-agent-eval-roadmap.md +558 -0
  79. package/docs/research/research-roadmap.md +1 -0
  80. package/docs/three-package-architecture.md +1 -1
  81. package/docs/trace-analysis.md +19 -0
  82. package/package.json +7 -2
  83. package/dist/chunk-7W4SM7FD.js.map +0 -1
  84. package/dist/chunk-F3SRAAZO.js.map +0 -1
  85. package/dist/chunk-JYE3WOTE.js.map +0 -1
  86. package/dist/chunk-WYIHD6EB.js.map +0 -1
  87. /package/dist/{chunk-XPILG2CA.js.map → chunk-GXHLRXDI.js.map} +0 -0
  88. /package/dist/{chunk-6EKXFFGQ.js.map → chunk-RTWFUK6A.js.map} +0 -0
  89. /package/dist/{chunk-XGNCBAVZ.js.map → chunk-XQL22JDG.js.map} +0 -0
  90. /package/dist/{chunk-GJJNJVIR.js.map → chunk-XXNIODOM.js.map} +0 -0
@@ -0,0 +1,486 @@
1
+ import {
2
+ calibrationFromPairs
3
+ } from "../chunk-NPCTHQIO.js";
4
+ import {
5
+ offPolicyEstimateAll
6
+ } from "../chunk-4DIJWVUT.js";
7
+ import {
8
+ confidenceInterval
9
+ } from "../chunk-ITBRCT73.js";
10
+ import {
11
+ ValidationError
12
+ } from "../chunk-3BFEG2F6.js";
13
+ import "../chunk-PZ5AY32C.js";
14
+
15
+ // src/belief-state/calibration.ts
/**
 * Builds a calibration report that compares stated belief confidence with observed outcomes.
 *
 * Points are first narrowed by `filterCalibrationRegion`, then reduced to
 * `{ evalScore, outcome }` pairs. A point contributes a pair only when its confidence is a
 * finite number, it has an outcome, and `outcomeScore` yields a finite value for it.
 *
 * @param points  Belief decision points to calibrate.
 * @param options Optional `bins` (default 5), `minPairs` (default 10), `region` and `policy`.
 * @returns The result of `calibrationFromPairs`, or `null` when fewer than `minPairs`
 *          usable pairs remain.
 */
function calibrateBeliefDecisions(points, options = {}) {
  const pairs = [];
  for (const point of filterCalibrationRegion(points, options)) {
    // `typeof NaN === "number"`, so a plain typeof check would let NaN/Infinity confidence
    // through and count it toward `minPairs`. Number.isFinite also rejects non-numbers.
    if (!Number.isFinite(point.confidence) || !point.outcome) continue;
    const outcome = outcomeScore(point);
    if (!Number.isFinite(outcome)) continue;
    pairs.push({ evalScore: point.confidence, outcome });
  }
  const minPairs = options.minPairs ?? 10;
  if (pairs.length < minPairs) return null;
  return calibrationFromPairs(pairs, "belief-confidence", "decision-outcome", {
    bins: options.bins ?? 5,
    range: { lo: 0, hi: 1 }
  });
}
/**
 * Restricts calibration input to the region selected by `options.region`.
 *
 * - `"all"` (the default): the input array is returned as-is.
 * - `"accepted"`: keeps points for which `options.policy.decide(point).action` is `"accept"`.
 * - any other value: keeps the points the policy did not accept.
 *
 * @throws ValidationError when a region other than `"all"` is requested without a policy.
 */
function filterCalibrationRegion(points, options) {
  const region = options.region ?? "all";
  if (region === "all") return points;
  const { policy } = options;
  if (!policy) {
    throw new ValidationError(
      `calibrateBeliefDecisions: policy is required when region is "${region}"`
    );
  }
  const wantAccepted = region === "accepted";
  return points.filter((point) => {
    const isAccepted = policy.decide(point).action === "accept";
    return isAccepted === wantAccepted;
  });
}
/**
 * Reduces a decision point's outcome to one number for calibration.
 *
 * Precedence: numeric `reward`, then numeric `score`, then the boolean `success`
 * (true -> 1, false -> 0). Returns NaN when none of these is available.
 */
function outcomeScore(point) {
  const outcome = point.outcome;
  if (typeof outcome?.reward === "number") return outcome.reward;
  if (typeof outcome?.score === "number") return outcome.score;
  switch (outcome?.success) {
    case true:
      return 1;
    case false:
      return 0;
    default:
      return Number.NaN;
  }
}
50
+
51
+ // src/belief-state/extract.ts
// Payload `kind` / `type` values that mark a trace event as a belief decision.
var DECISION_MARKERS = /* @__PURE__ */ new Set([
  "belief_decision",
  "belief.decision",
  "decision_point"
]);
// Accepted `decisionKind` payload values; any other value is reported as a diagnostic
// by parseDecisionEvent.
var DECISION_KINDS = /* @__PURE__ */ new Set([
  "continue", "verify", "ask", "retry", "stop",
  "memory-write", "memory-read",
  "tool-select", "skill-select", "workflow-select",
  "surface-promote"
]);
/**
 * Scans trace runs for belief-decision events and converts them into decision points.
 *
 * When `options.runIds` is given, only those runs are loaded through `store.getRun`
 * (falsy results are skipped); otherwise every run from `store.listRuns()` is scanned.
 * Each run's events are visited in ascending `timestamp` order. The running step index
 * handed to the parser advances only when an event yields a decision.
 *
 * @param store   Trace store exposing `getRun`, `listRuns`, `events` and `spans`.
 * @param options Optional `{ runIds }`.
 * @returns `{ decisions, diagnostics }` accumulated across all scanned runs.
 */
async function extractBeliefDecisionPoints(store, options = {}) {
  let runs;
  if (options.runIds) {
    const fetched = await Promise.all(options.runIds.map((runId) => store.getRun(runId)));
    runs = fetched.filter(Boolean);
  } else {
    runs = await store.listRuns();
  }
  const decisions = [];
  const diagnostics = [];
  for (const run of runs) {
    if (!run) continue;
    const events = await store.events({ runId: run.runId });
    const spans = await store.spans({ runId: run.runId });
    const knownSpanIds = new Set(spans.map((span) => span.spanId));
    // Sort a copy so the store's own array is never reordered.
    const ordered = [...events].sort((a, b) => a.timestamp - b.timestamp);
    let stepIndex = 0;
    for (const event of ordered) {
      const spanExists = event.spanId ? knownSpanIds.has(event.spanId) : false;
      const parsed = parseDecisionEvent(event, {
        scenarioId: run.scenarioId,
        stepIndex,
        spanExists
      });
      if (!parsed) continue;
      if ("diagnostic" in parsed) {
        diagnostics.push(parsed.diagnostic);
      } else {
        decisions.push(parsed.decision);
        stepIndex++;
      }
    }
  }
  return { decisions, diagnostics };
}
/**
 * Converts one trace event into a belief decision point, a diagnostic, or nothing.
 *
 * @param event   Trace event; `payload`, `eventId`, `runId` and `spanId` are read.
 * @param context `{ scenarioId, stepIndex, spanExists }` fallbacks supplied by the caller.
 * @returns `null` when the event is not a belief decision (including events that carry no
 *          object payload), `{ diagnostic }` when the event is marked as a decision but has
 *          an unsupported `decisionKind` or no `chosenAction`, otherwise `{ decision }`.
 */
function parseDecisionEvent(event, context) {
  const payload = event.payload;
  // An event without an object payload cannot carry a decision marker. Returning early
  // keeps the property reads below from throwing and aborting the whole extraction.
  if (payload === null || typeof payload !== "object") return null;
  const marker = stringField(payload, "kind") ?? stringField(payload, "type");
  if (!marker || !DECISION_MARKERS.has(marker)) return null;
  const { runId, eventId } = event;
  // Shared shape for the two "marked but malformed" outcomes.
  const warning = (reason) => ({
    diagnostic: { runId, eventId, severity: "warning", reason }
  });
  const decisionKind = stringField(payload, "decisionKind");
  if (!decisionKind || !DECISION_KINDS.has(decisionKind)) {
    return warning(
      `belief decision event has unsupported decisionKind "${decisionKind ?? ""}"`
    );
  }
  const chosenAction = stringField(payload, "chosenAction");
  if (!chosenAction) {
    return warning("belief decision event is missing chosenAction");
  }
  // The event itself is always cited; its span only when the caller confirmed it exists.
  const evidence = [{ source: "event", id: eventId, runId, eventId }];
  if (event.spanId && context.spanExists) {
    evidence.push({ source: "span", id: event.spanId, runId, spanId: event.spanId });
  }
  return {
    decision: {
      // Payload-provided values win; event/context values are fallbacks.
      id: stringField(payload, "id") ?? eventId,
      runId,
      scenarioId: stringField(payload, "scenarioId") ?? context.scenarioId,
      stepIndex: numberField(payload, "stepIndex") ?? context.stepIndex,
      kind: decisionKind,
      chosenAction,
      candidateActions: stringArrayField(payload, "candidateActions"),
      confidence: finiteUnitField(payload, "confidence"),
      behaviorProb: numberField(payload, "behaviorProb"),
      targetProb: numberField(payload, "targetProb"),
      qHat: finiteUnitField(payload, "qHat"),
      costUsd: nonNegativeNumberField(payload, "costUsd"),
      evidence,
      outcome: parseOutcome(payload),
      metadata: recordField(payload, "metadata")
    }
  };
}
/**
 * Reads the optional `outcome` record from a decision payload.
 *
 * @returns `undefined` when the payload has no object-valued `outcome`; otherwise an object
 *          whose fields are each parsed by the matching field helper (`score` and `reward`
 *          through `finiteUnitField`, `costUsd` through `nonNegativeNumberField`).
 */
function parseOutcome(payload) {
  const raw = recordField(payload, "outcome");
  if (!raw) return void 0;
  const success = typeof raw.success === "boolean" ? raw.success : void 0;
  const score = finiteUnitField(raw, "score");
  const reward = finiteUnitField(raw, "reward");
  const costUsd = nonNegativeNumberField(raw, "costUsd");
  const observedAt = stringField(raw, "observedAt");
  const metadata = recordField(raw, "metadata");
  return { success, score, reward, costUsd, observedAt, metadata };
}
/** Returns `obj[key]` when it is a non-empty string, otherwise `undefined`. */
function stringField(obj, key) {
  const candidate = obj[key];
  if (typeof candidate !== "string" || candidate.length === 0) return void 0;
  return candidate;
}
/** Returns `obj[key]` when it is a finite number, otherwise `undefined`. */
function numberField(obj, key) {
  const candidate = obj[key];
  if (typeof candidate !== "number" || !Number.isFinite(candidate)) return void 0;
  return candidate;
}
/**
 * Reads a finite numeric field via `numberField` and clamps it into [0, 1].
 * Returns `undefined` when `numberField` finds no usable number.
 */
function finiteUnitField(obj, key) {
  const parsed = numberField(obj, key);
  if (parsed === void 0) return void 0;
  const cappedAtOne = Math.min(1, parsed);
  return Math.max(0, cappedAtOne);
}
/**
 * Reads a finite numeric field via `numberField` and raises negative values to 0.
 * Returns `undefined` when `numberField` finds no usable number.
 */
function nonNegativeNumberField(obj, key) {
  const parsed = numberField(obj, key);
  if (parsed === void 0) return void 0;
  return Math.max(0, parsed);
}
/**
 * Returns the non-empty string entries of `obj[key]` when it is an array.
 * Yields `undefined` when the field is not an array or no entry qualifies.
 */
function stringArrayField(obj, key) {
  const raw = obj[key];
  if (!Array.isArray(raw)) return void 0;
  const kept = [];
  raw.forEach((item) => {
    if (typeof item === "string" && item.length > 0) kept.push(item);
  });
  return kept.length === 0 ? void 0 : kept;
}
/**
 * Returns `obj[key]` when it is a non-null, non-array object; otherwise `undefined`.
 * The object is returned by reference, not copied.
 */
function recordField(obj, key) {
  const raw = obj[key];
  const isRecord = Boolean(raw) && typeof raw === "object" && !Array.isArray(raw);
  return isRecord ? raw : void 0;
}
191
+
192
+ // src/belief-state/ope.ts
/**
 * Creates an OPE target policy that simply reads the `targetProb` and `qHat` values
 * already stored on each decision point.
 *
 * @param id Policy identifier (default `"embedded-target-prob"`).
 * @returns `{ id, targetProbOf, qHatOf }`.
 */
function embeddedBeliefOpeTargetPolicy(id = "embedded-target-prob") {
  const targetProbOf = (point) => point.targetProb;
  const qHatOf = (point) => point.qHat;
  return { id, targetProbOf, qHatOf };
}
/**
 * Converts belief decision points into off-policy-evaluation trajectories.
 *
 * A point is dropped, with a diagnostic line, when it has no outcome, its `behaviorProb`
 * fails `isBehaviorProbability`, the target policy throws, the target probability fails
 * `isTargetProbability`, or its reward is not a finite number. An invalid `qHat` does not
 * drop the point; it is replaced by `null` and reported.
 *
 * @param points       Decision points to convert.
 * @param targetPolicy Object with `id`, `targetProbOf(point)` and optional `qHatOf(point)`.
 * @param options      Optional `{ maxDiagnostics }` (default 20) capping the diagnostics.
 * @returns `{ targetPolicyId, trajectories, dropped, diagnostics }` where `dropped` is the
 *          number of input points that produced no trajectory.
 */
function beliefDecisionsToOffPolicyTrajectories(points, targetPolicy, options = {}) {
  const trajectories = [];
  const diagnostics = [];
  for (const point of points) {
    if (!point.outcome) {
      diagnostics.push(`${point.id}: missing outcome`);
      continue;
    }
    if (!isBehaviorProbability(point.behaviorProb)) {
      diagnostics.push(`${point.id}: invalid behaviorProb ${formatProbability(point.behaviorProb)}`);
      continue;
    }
    let targetProb;
    let qHat;
    try {
      targetProb = targetPolicy.targetProbOf(point);
      qHat = targetPolicy.qHatOf?.(point);
    } catch (error) {
      // A user-supplied policy may throw; record it and move on to the next point.
      diagnostics.push(
        `${point.id}: target policy ${targetPolicy.id} threw (${errorMessage(error)})`
      );
      continue;
    }
    if (!isTargetProbability(targetProb)) {
      diagnostics.push(`${point.id}: invalid targetProb ${formatProbability(targetProb)}`);
      continue;
    }
    if (qHat !== null && qHat !== void 0 && !isTargetProbability(qHat)) {
      diagnostics.push(`${point.id}: invalid qHat ${formatProbability(qHat)}; ignoring qHat`);
      qHat = null;
    }
    const reward = rewardOf(point);
    // rewardOf passes through any `typeof === "number"` value, so NaN/Infinity can reach
    // this point. Drop it here rather than let it poison the downstream estimates.
    if (!Number.isFinite(reward)) {
      diagnostics.push(`${point.id}: invalid reward ${String(reward)}`);
      continue;
    }
    trajectories.push({
      runId: point.id,
      reward,
      behaviorProb: point.behaviorProb,
      targetProb,
      qHat
    });
  }
  return {
    targetPolicyId: targetPolicy.id,
    trajectories,
    dropped: points.length - trajectories.length,
    diagnostics: compactDiagnostics(diagnostics, options.maxDiagnostics ?? 20)
  };
}
/**
 * Runs off-policy evaluation of `targetPolicy` over belief decision points.
 *
 * Points are converted by `beliefDecisionsToOffPolicyTrajectories`, estimated with
 * `offPolicyEstimateAll` (which receives the same `options` object), and the `dr` estimate
 * is then checked by `supportDiagnostics`.
 *
 * @param options Optional `minEffectiveSampleSize` (default 30) and
 *                `minEffectiveSampleRatio` (default 0.25), plus anything the helpers read.
 * @returns `{ targetPolicyId, ...estimates, support }`.
 */
function evaluateBeliefOffPolicy(points, targetPolicy, options = {}) {
  const prepared = beliefDecisionsToOffPolicyTrajectories(points, targetPolicy, options);
  const estimates = offPolicyEstimateAll(prepared.trajectories, options);
  const supportOptions = {
    minEffectiveSampleSize: options.minEffectiveSampleSize ?? 30,
    minEffectiveSampleRatio: options.minEffectiveSampleRatio ?? 0.25,
    dropped: prepared.dropped,
    diagnostics: prepared.diagnostics
  };
  const support = supportDiagnostics(estimates.dr, supportOptions);
  return { targetPolicyId: targetPolicy.id, ...estimates, support };
}
/**
 * Decides whether an off-policy estimate has enough support to be trusted.
 *
 * Starts from the caller's diagnostics and appends one reason per failed check: no
 * trajectories, dropped decisions, effective sample size below the minimum, effective
 * sample ratio below the minimum, and a maximum importance weight above 10.
 *
 * @param estimate `{ n, effectiveSampleSize, maxImportanceWeight }`.
 * @param options  `{ minEffectiveSampleSize, minEffectiveSampleRatio, dropped, diagnostics }`.
 * @returns Support summary; `supported` is true only when no reason was collected.
 */
function supportDiagnostics(estimate, options) {
  const { n, effectiveSampleSize, maxImportanceWeight } = estimate;
  const { dropped, minEffectiveSampleSize, minEffectiveSampleRatio } = options;
  // With no trajectories the ratio is defined as 0 rather than dividing by zero.
  const effectiveSampleRatio = n > 0 ? effectiveSampleSize / n : 0;
  const reasons = [...options.diagnostics];
  if (n === 0) reasons.push("no valid OPE trajectories");
  if (dropped > 0) reasons.push(`dropped ${dropped} unsupported decision(s)`);
  if (effectiveSampleSize < minEffectiveSampleSize) {
    reasons.push(
      `effective sample size ${effectiveSampleSize.toFixed(2)} below ${minEffectiveSampleSize}`
    );
  }
  if (effectiveSampleRatio < minEffectiveSampleRatio) {
    reasons.push(
      `effective sample ratio ${effectiveSampleRatio.toFixed(2)} below ${minEffectiveSampleRatio}`
    );
  }
  if (maxImportanceWeight > 10) {
    reasons.push(`max importance weight ${maxImportanceWeight.toFixed(2)} is high`);
  }
  return {
    supported: reasons.length === 0,
    n,
    dropped,
    effectiveSampleSize,
    effectiveSampleRatio,
    maxImportanceWeight,
    reasons
  };
}
/**
 * Reward used for OPE trajectories: numeric `outcome.reward`, else numeric
 * `outcome.score`, else 1 when `outcome.success` is exactly `true`, else 0.
 */
function rewardOf(point) {
  const outcome = point.outcome;
  if (typeof outcome?.reward === "number") return outcome.reward;
  if (typeof outcome?.score === "number") return outcome.score;
  return outcome?.success === true ? 1 : 0;
}
/**
 * True for a finite number in (0, 1]. Zero is excluded.
 * Number.isFinite does not coerce, so non-number inputs are rejected as well.
 */
function isBehaviorProbability(value) {
  if (!Number.isFinite(value)) return false;
  return value > 0 && value <= 1;
}
/**
 * True for a finite number in [0, 1], both ends included.
 * Number.isFinite does not coerce, so non-number inputs are rejected as well.
 */
function isTargetProbability(value) {
  if (!Number.isFinite(value)) return false;
  return value >= 0 && value <= 1;
}
/**
 * Renders a probability-like value for diagnostics: `null` and `undefined` become
 * "missing", everything else goes through `String`.
 */
function formatProbability(value) {
  // `??` only replaces null/undefined, so numbers (including 0 and NaN) are unchanged.
  const shown = value ?? "missing";
  return String(shown);
}
/** Extracts a printable message: `error.message` for Error instances, `String(error)` otherwise. */
function errorMessage(error) {
  if (error instanceof Error) return error.message;
  return String(error);
}
/**
 * Caps a diagnostics list at `maxDiagnostics` entries, appending a single summary line
 * that states how many entries were left out.
 *
 * The limit is normalised first: fractional values are floored and negative values are
 * treated as 0, so the kept count and the reported omitted count always agree. A limit
 * that is not a number (NaN, undefined) disables truncation.
 *
 * @param diagnostics    Diagnostic lines in order of occurrence.
 * @param maxDiagnostics Maximum number of lines to keep.
 * @returns The input array itself when it fits, otherwise a new truncated array.
 */
function compactDiagnostics(diagnostics, maxDiagnostics) {
  const floored = Math.floor(maxDiagnostics);
  const limit = Number.isNaN(floored) ? Infinity : Math.max(0, floored);
  if (diagnostics.length <= limit) return diagnostics;
  return [
    ...diagnostics.slice(0, limit),
    `${diagnostics.length - limit} additional OPE diagnostic(s) omitted`
  ];
}
319
+
320
+ // src/belief-state/selective.ts
// Default utility model for selective-policy evaluation. Callers of
// evaluateBeliefSelectivePolicy can override individual fields via `options.utility`.
var DEFAULT_UTILITY = {
  // Used by rewardOf2 when only a boolean `success` is available.
  successUtility: 1,
  failureUtility: -1,
  // Base utility of any non-accept action other than "stop" (see policyDecisionUtility).
  deferUtility: 0,
  // Costs subtracted from deferUtility for the matching action.
  verifyCost: 0.05,
  askCost: 0.05,
  retryCost: 0.1,
  // Utility of the "stop" action.
  stopUtility: 0,
  // Multiplier applied to the USD cost in acceptUtility.
  costWeight: 1
};
/**
 * Creates a selective policy that accepts a decision when its confidence reaches a
 * fixed threshold and otherwise falls back to another action.
 *
 * @param options `confidenceThreshold` (required, within [0, 1]), optional
 *                `belowThresholdAction` (default `"verify"`) and optional `id`
 *                (default `confidence>=<threshold>`).
 * @returns `{ id, decide(point) }`; a point without a confidence is treated as confidence 0.
 * @throws ValidationError when `confidenceThreshold` is not a finite number in [0, 1].
 */
function thresholdSelectivePolicy(options) {
  const threshold = options.confidenceThreshold;
  const isUnitInterval = Number.isFinite(threshold) && threshold >= 0 && threshold <= 1;
  if (!isUnitInterval) {
    throw new ValidationError(
      `thresholdSelectivePolicy: confidenceThreshold must be in [0, 1], got ${threshold}`
    );
  }
  const fallbackAction = options.belowThresholdAction ?? "verify";
  const id = options.id ?? `confidence>=${threshold}`;
  return {
    id,
    decide(point) {
      const confidence = point.confidence ?? 0;
      const passed = confidence >= threshold;
      return {
        action: passed ? "accept" : fallbackAction,
        confidence,
        targetProb: point.targetProb,
        qHat: point.qHat,
        reason: passed ? "confidence threshold passed" : "confidence threshold failed"
      };
    }
  };
}
/**
 * Evaluates a selective policy against the baseline of accepting every decision.
 *
 * Only points that have an outcome are scored. For each one the baseline utility is
 * `acceptUtility`, and the policy utility is `policyDecisionUtility` for the action the
 * policy chose. The per-point utility differences feed a 95% confidence interval
 * (`confidenceInterval`, seeded with `options.seed ?? 17`).
 *
 * Recommendation:
 * - `"need_more_data"` when fewer than `minN` points were scored or fewer than
 *   `minAccepted` were accepted;
 * - `"ship"` when the interval's lower bound is strictly above `minUtilityDelta`;
 * - `"hold"` otherwise.
 *
 * @param points  Belief decision points.
 * @param policy  Object with `id` and `decide(point)` returning `{ action, ... }`.
 * @param options Optional `utility` overrides, `minN` (default 30), `minAccepted`
 *                (default 5), `minUtilityDelta` (default 0) and `seed`.
 * @returns Summary with counts, coverage, utilities, the interval, the recommendation
 *          and the reasons for every check that failed.
 */
function evaluateBeliefSelectivePolicy(points, policy, options = {}) {
  const utility = { ...DEFAULT_UTILITY, ...options.utility ?? {} };
  const scored = points.filter((point) => point.outcome);
  const minN = options.minN ?? 30;
  const minAccepted = options.minAccepted ?? 5;
  const minUtilityDelta = options.minUtilityDelta ?? 0;
  const deltas = [];
  const rejectedRewards = [];
  let baselineUtility = 0;
  let policyUtility = 0;
  let accepted = 0;
  let acceptedErrors = 0;
  for (const point of scored) {
    const baseline = acceptUtility(point, utility);
    const decision = policy.decide(point);
    const candidate = policyDecisionUtility(point, decision.action, utility);
    const reward = rewardOf2(point, utility);
    baselineUtility += baseline;
    policyUtility += candidate;
    deltas.push(candidate - baseline);
    if (decision.action === "accept") {
      accepted++;
      // An accepted decision whose reward is negative counts as an accepted error.
      if (reward < 0) acceptedErrors++;
    } else {
      rejectedRewards.push(reward);
    }
  }
  const n = scored.length;
  const rejected = Math.max(0, n - accepted);
  const ci = confidenceInterval(deltas, 0.95, { seed: options.seed ?? 17 });
  const reasons = [];
  if (n < minN) reasons.push(`need at least ${minN} scored decisions, got ${n}`);
  if (accepted < minAccepted)
    reasons.push(`need at least ${minAccepted} accepted decisions, got ${accepted}`);
  if (ci.lower <= minUtilityDelta) {
    reasons.push(`utility CI lower bound ${ci.lower.toFixed(4)} does not clear ${minUtilityDelta}`);
  }
  const recommendation = n < minN || accepted < minAccepted ? "need_more_data" : ci.lower > minUtilityDelta ? "ship" : "hold";
  return {
    policyId: policy.id,
    n,
    accepted,
    rejected,
    coverage: n > 0 ? accepted / n : 0,
    acceptedErrorRate: accepted > 0 ? acceptedErrors / accepted : 0,
    baselineUtility,
    policyUtility,
    utilityDelta: policyUtility - baselineUtility,
    utilityCi95: ci,
    rejectedMeanReward: rejectedRewards.length > 0 ? mean(rejectedRewards) : null,
    recommendation,
    reasons
  };
}
/**
 * Utility of accepting a decision: its reward (`rewardOf2`) minus the weighted USD cost.
 * The cost is `point.costUsd`, falling back to `point.outcome.costUsd`, then 0.
 */
function acceptUtility(point, utility) {
  const spendUsd = point.costUsd ?? point.outcome?.costUsd ?? 0;
  const reward = rewardOf2(point, utility);
  return reward - utility.costWeight * spendUsd;
}
/**
 * Utility of the action a policy chose for a point.
 *
 * `"accept"` uses `acceptUtility`; `"verify"`, `"ask"` and `"retry"` yield the defer
 * utility minus their respective cost; `"stop"` yields the stop utility; any other action
 * yields the plain defer utility.
 */
function policyDecisionUtility(point, action, utility) {
  switch (action) {
    case "accept":
      return acceptUtility(point, utility);
    case "verify":
      return utility.deferUtility - utility.verifyCost;
    case "ask":
      return utility.deferUtility - utility.askCost;
    case "retry":
      return utility.deferUtility - utility.retryCost;
    case "stop":
      return utility.stopUtility;
    default:
      return utility.deferUtility;
  }
}
/**
 * Reward of a point on the utility scale.
 *
 * A numeric `outcome.reward` (or, failing that, a numeric `outcome.score`) is mapped with
 * `2 * x - 1`. Otherwise `outcome.success === true` yields `utility.successUtility`, and
 * every remaining case, including a missing outcome, yields `utility.failureUtility`.
 */
function rewardOf2(point, utility) {
  const outcome = point.outcome;
  let graded;
  if (typeof outcome?.reward === "number") graded = outcome.reward;
  else if (typeof outcome?.score === "number") graded = outcome.score;
  if (graded !== void 0) return 2 * graded - 1;
  return outcome?.success === true ? utility.successUtility : utility.failureUtility;
}
/** Arithmetic mean of `values`. An empty array yields NaN (0 / 0). */
function mean(values) {
  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / values.length;
}
432
+
433
+ // src/belief-state/report.ts
/**
 * Produces the combined belief-policy report: selective evaluation, calibration and,
 * when a target policy is supplied, off-policy evaluation.
 *
 * OPE counts as requested when `options.requireOpe === true` or `options.ope` is defined.
 * It runs only when `options.ope.targetPolicy` is present; a request without a target
 * policy is reported as unsupported.
 *
 * @param options `{ points, policy, selective?, calibration?, ope?, requireOpe? }`.
 * @returns Report with per-part statuses, the overall `status` from `overallStatus`, the
 *          part results that were produced, and the collected diagnostics.
 */
function analyzeBeliefPolicy(options) {
  const { points, policy } = options;
  const selective = evaluateBeliefSelectivePolicy(points, policy, options.selective);
  const calibration = calibrateBeliefDecisions(points, options.calibration);
  const opeTargetPolicy = options.ope?.targetPolicy;
  const ope = opeTargetPolicy ? evaluateBeliefOffPolicy(points, opeTargetPolicy, options.ope) : null;
  const opeRequested = options.requireOpe === true || options.ope !== void 0;

  const selectiveStatus = selective.recommendation;
  const calibrationStatus = calibration ? "supported" : "unsupported";
  let opeStatus;
  if (ope) {
    opeStatus = ope.support.supported ? "supported" : "unsupported";
  } else {
    opeStatus = opeRequested ? "unsupported" : "not_requested";
  }

  const diagnostics = [];
  if (!calibration) {
    diagnostics.push("calibration unsupported: not enough confidence/outcome pairs");
  }
  if (opeRequested && !opeTargetPolicy) {
    diagnostics.push("OPE unsupported: missing target policy");
  } else if (ope && !ope.support.supported) {
    for (const reason of ope.support.reasons) {
      diagnostics.push(`OPE unsupported: ${reason}`);
    }
  }

  const status = overallStatus({
    selectiveStatus,
    hasCalibration: calibration !== null,
    opeStatus,
    opeRequested
  });

  // Built step by step so optional keys land in the same position as before.
  const report = {
    policyId: policy.id,
    n: points.length,
    status,
    selectiveStatus,
    calibrationStatus,
    opeStatus
  };
  if (ope) report.opeTargetPolicyId = ope.targetPolicyId;
  report.selective = selective;
  if (calibration) report.calibration = calibration;
  if (ope) report.ope = ope;
  report.diagnostics = diagnostics;
  return report;
}
/**
 * Folds the per-part statuses into one verdict.
 *
 * `"need_more_data"` when the selective evaluation asks for more data or calibration is
 * missing; `"hold"` when the selective evaluation says hold, or OPE was requested but is
 * not supported; `"ship"` otherwise.
 */
function overallStatus(options) {
  const { selectiveStatus, hasCalibration, opeStatus, opeRequested } = options;
  if (selectiveStatus === "need_more_data" || !hasCalibration) {
    return "need_more_data";
  }
  const opeBlocks = opeRequested && opeStatus !== "supported";
  return selectiveStatus === "hold" || opeBlocks ? "hold" : "ship";
}
476
+ export {
477
+ analyzeBeliefPolicy,
478
+ beliefDecisionsToOffPolicyTrajectories,
479
+ calibrateBeliefDecisions,
480
+ embeddedBeliefOpeTargetPolicy,
481
+ evaluateBeliefOffPolicy,
482
+ evaluateBeliefSelectivePolicy,
483
+ extractBeliefDecisionPoints,
484
+ thresholdSelectivePolicy
485
+ };
486
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/belief-state/calibration.ts","../../src/belief-state/extract.ts","../../src/belief-state/ope.ts","../../src/belief-state/selective.ts","../../src/belief-state/report.ts"],"sourcesContent":["import { ValidationError } from '../errors'\nimport { calibrationFromPairs } from '../meta-eval/calibration'\nimport type { BeliefDecisionPoint, BeliefSelectivePolicy } from './types'\n\nexport type BeliefCalibrationRegion = 'all' | 'accepted' | 'rejected'\n\nexport interface BeliefCalibrationOptions {\n bins?: number\n minPairs?: number\n policy?: BeliefSelectivePolicy\n region?: BeliefCalibrationRegion\n}\n\nexport function calibrateBeliefDecisions(\n points: BeliefDecisionPoint[],\n options: BeliefCalibrationOptions = {},\n) {\n const filtered = filterCalibrationRegion(points, options)\n const pairs = filtered\n .filter((point) => typeof point.confidence === 'number' && point.outcome)\n .map((point) => ({\n evalScore: point.confidence!,\n outcome: outcomeScore(point),\n }))\n .filter((pair) => Number.isFinite(pair.outcome))\n const minPairs = options.minPairs ?? 10\n if (pairs.length < minPairs) return null\n return calibrationFromPairs(pairs, 'belief-confidence', 'decision-outcome', {\n bins: options.bins ?? 5,\n range: { lo: 0, hi: 1 },\n })\n}\n\nfunction filterCalibrationRegion(\n points: BeliefDecisionPoint[],\n options: BeliefCalibrationOptions,\n): BeliefDecisionPoint[] {\n const region = options.region ?? 'all'\n if (region === 'all') return points\n const policy = options.policy\n if (!policy) {\n throw new ValidationError(\n `calibrateBeliefDecisions: policy is required when region is \"${region}\"`,\n )\n }\n return points.filter((point) => {\n const accepted = policy.decide(point).action === 'accept'\n return region === 'accepted' ? 
accepted : !accepted\n })\n}\n\nfunction outcomeScore(point: BeliefDecisionPoint): number {\n if (typeof point.outcome?.reward === 'number') return point.outcome.reward\n if (typeof point.outcome?.score === 'number') return point.outcome.score\n if (point.outcome?.success === true) return 1\n if (point.outcome?.success === false) return 0\n return Number.NaN\n}\n","import type { TraceEvent } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport type {\n BeliefDecisionExtractionDiagnostic,\n BeliefDecisionExtractionReport,\n BeliefDecisionKind,\n BeliefDecisionOutcome,\n BeliefDecisionPoint,\n BeliefEvidenceRef,\n} from './types'\n\nexport interface ExtractBeliefDecisionPointsOptions {\n runIds?: string[]\n}\n\nconst DECISION_MARKERS = new Set(['belief_decision', 'belief.decision', 'decision_point'])\nconst DECISION_KINDS: ReadonlySet<string> = new Set([\n 'continue',\n 'verify',\n 'ask',\n 'retry',\n 'stop',\n 'memory-write',\n 'memory-read',\n 'tool-select',\n 'skill-select',\n 'workflow-select',\n 'surface-promote',\n])\n\nexport async function extractBeliefDecisionPoints(\n store: TraceStore,\n options: ExtractBeliefDecisionPointsOptions = {},\n): Promise<BeliefDecisionExtractionReport> {\n const runs = options.runIds\n ? (await Promise.all(options.runIds.map((runId) => store.getRun(runId)))).filter(Boolean)\n : await store.listRuns()\n const decisions: BeliefDecisionPoint[] = []\n const diagnostics: BeliefDecisionExtractionDiagnostic[] = []\n\n for (const run of runs) {\n if (!run) continue\n const events = await store.events({ runId: run.runId })\n const spans = await store.spans({ runId: run.runId })\n const spanIds = new Set(spans.map((span) => span.spanId))\n let stepIndex = 0\n for (const event of [...events].sort((a, b) => a.timestamp - b.timestamp)) {\n const parsed = parseDecisionEvent(event, {\n scenarioId: run.scenarioId,\n stepIndex,\n spanExists: event.spanId ? 
spanIds.has(event.spanId) : false,\n })\n if (!parsed) continue\n if ('diagnostic' in parsed) {\n diagnostics.push(parsed.diagnostic)\n continue\n }\n decisions.push(parsed.decision)\n stepIndex++\n }\n }\n\n return { decisions, diagnostics }\n}\n\nfunction parseDecisionEvent(\n event: TraceEvent,\n context: { scenarioId?: string; stepIndex: number; spanExists: boolean },\n): { decision: BeliefDecisionPoint } | { diagnostic: BeliefDecisionExtractionDiagnostic } | null {\n const payload = event.payload\n const marker = stringField(payload, 'kind') ?? stringField(payload, 'type')\n if (!marker || !DECISION_MARKERS.has(marker)) return null\n\n const decisionKind = stringField(payload, 'decisionKind')\n if (!decisionKind || !DECISION_KINDS.has(decisionKind)) {\n return {\n diagnostic: {\n runId: event.runId,\n eventId: event.eventId,\n severity: 'warning',\n reason: `belief decision event has unsupported decisionKind \"${decisionKind ?? ''}\"`,\n },\n }\n }\n\n const chosenAction = stringField(payload, 'chosenAction')\n if (!chosenAction) {\n return {\n diagnostic: {\n runId: event.runId,\n eventId: event.eventId,\n severity: 'warning',\n reason: 'belief decision event is missing chosenAction',\n },\n }\n }\n\n const evidence: BeliefEvidenceRef[] = [\n {\n source: 'event',\n id: event.eventId,\n runId: event.runId,\n eventId: event.eventId,\n },\n ]\n if (event.spanId && context.spanExists) {\n evidence.push({ source: 'span', id: event.spanId, runId: event.runId, spanId: event.spanId })\n }\n\n return {\n decision: {\n id: stringField(payload, 'id') ?? event.eventId,\n runId: event.runId,\n scenarioId: stringField(payload, 'scenarioId') ?? context.scenarioId,\n stepIndex: numberField(payload, 'stepIndex') ?? 
context.stepIndex,\n kind: decisionKind as BeliefDecisionKind,\n chosenAction,\n candidateActions: stringArrayField(payload, 'candidateActions'),\n confidence: finiteUnitField(payload, 'confidence'),\n behaviorProb: numberField(payload, 'behaviorProb'),\n targetProb: numberField(payload, 'targetProb'),\n qHat: finiteUnitField(payload, 'qHat'),\n costUsd: nonNegativeNumberField(payload, 'costUsd'),\n evidence,\n outcome: parseOutcome(payload),\n metadata: recordField(payload, 'metadata'),\n },\n }\n}\n\nfunction parseOutcome(payload: Record<string, unknown>): BeliefDecisionOutcome | undefined {\n const value = recordField(payload, 'outcome')\n if (!value) return undefined\n return {\n success: typeof value.success === 'boolean' ? value.success : undefined,\n score: finiteUnitField(value, 'score'),\n reward: finiteUnitField(value, 'reward'),\n costUsd: nonNegativeNumberField(value, 'costUsd'),\n observedAt: stringField(value, 'observedAt'),\n metadata: recordField(value, 'metadata'),\n }\n}\n\nfunction stringField(obj: Record<string, unknown>, key: string): string | undefined {\n const value = obj[key]\n return typeof value === 'string' && value.length > 0 ? value : undefined\n}\n\nfunction numberField(obj: Record<string, unknown>, key: string): number | undefined {\n const value = obj[key]\n return typeof value === 'number' && Number.isFinite(value) ? value : undefined\n}\n\nfunction finiteUnitField(obj: Record<string, unknown>, key: string): number | undefined {\n const value = numberField(obj, key)\n return value === undefined ? undefined : Math.max(0, Math.min(1, value))\n}\n\nfunction nonNegativeNumberField(obj: Record<string, unknown>, key: string): number | undefined {\n const value = numberField(obj, key)\n return value === undefined ? 
undefined : Math.max(0, value)\n}\n\nfunction stringArrayField(obj: Record<string, unknown>, key: string): string[] | undefined {\n const value = obj[key]\n if (!Array.isArray(value)) return undefined\n const strings = value.filter(\n (item): item is string => typeof item === 'string' && item.length > 0,\n )\n return strings.length > 0 ? strings : undefined\n}\n\nfunction recordField(\n obj: Record<string, unknown>,\n key: string,\n): Record<string, unknown> | undefined {\n const value = obj[key]\n if (!value || typeof value !== 'object' || Array.isArray(value)) return undefined\n return value as Record<string, unknown>\n}\n","import {\n type OffPolicyOptions,\n type OffPolicyTrajectory,\n offPolicyEstimateAll,\n} from '../rl/off-policy'\nimport type {\n BeliefDecisionPoint,\n BeliefOpeReport,\n BeliefOpeSupportDiagnostics,\n BeliefOpeTargetPolicy,\n} from './types'\n\nexport interface BeliefOpeOptions extends OffPolicyOptions {\n minEffectiveSampleSize?: number\n minEffectiveSampleRatio?: number\n maxDiagnostics?: number\n}\n\nexport interface BeliefOffPolicyTrajectoryReport {\n targetPolicyId: string\n trajectories: OffPolicyTrajectory[]\n dropped: number\n diagnostics: string[]\n}\n\nexport function embeddedBeliefOpeTargetPolicy(id = 'embedded-target-prob'): BeliefOpeTargetPolicy {\n return {\n id,\n targetProbOf(point) {\n return point.targetProb\n },\n qHatOf(point) {\n return point.qHat\n },\n }\n}\n\nexport function beliefDecisionsToOffPolicyTrajectories(\n points: BeliefDecisionPoint[],\n targetPolicy: BeliefOpeTargetPolicy,\n options: Pick<BeliefOpeOptions, 'maxDiagnostics'> = {},\n): BeliefOffPolicyTrajectoryReport {\n const trajectories: OffPolicyTrajectory[] = []\n const diagnostics: string[] = []\n for (const point of points) {\n if (!point.outcome) {\n diagnostics.push(`${point.id}: missing outcome`)\n continue\n }\n if (!isBehaviorProbability(point.behaviorProb)) {\n diagnostics.push(`${point.id}: invalid behaviorProb 
${formatProbability(point.behaviorProb)}`)\n continue\n }\n\n let targetProb: number | null | undefined\n let qHat: number | null | undefined\n try {\n targetProb = targetPolicy.targetProbOf(point)\n qHat = targetPolicy.qHatOf?.(point)\n } catch (error) {\n diagnostics.push(\n `${point.id}: target policy ${targetPolicy.id} threw (${errorMessage(error)})`,\n )\n continue\n }\n if (!isTargetProbability(targetProb)) {\n diagnostics.push(`${point.id}: invalid targetProb ${formatProbability(targetProb)}`)\n continue\n }\n if (qHat !== null && qHat !== undefined && !isTargetProbability(qHat)) {\n diagnostics.push(`${point.id}: invalid qHat ${formatProbability(qHat)}; ignoring qHat`)\n qHat = null\n }\n\n trajectories.push({\n runId: point.id,\n reward: rewardOf(point),\n behaviorProb: point.behaviorProb,\n targetProb,\n qHat,\n })\n }\n return {\n targetPolicyId: targetPolicy.id,\n trajectories,\n dropped: points.length - trajectories.length,\n diagnostics: compactDiagnostics(diagnostics, options.maxDiagnostics ?? 20),\n }\n}\n\nexport function evaluateBeliefOffPolicy(\n points: BeliefDecisionPoint[],\n targetPolicy: BeliefOpeTargetPolicy,\n options: BeliefOpeOptions = {},\n): BeliefOpeReport {\n const trajectoryReport = beliefDecisionsToOffPolicyTrajectories(points, targetPolicy, options)\n const { trajectories } = trajectoryReport\n const estimates = offPolicyEstimateAll(trajectories, options)\n const support = supportDiagnostics(estimates.dr, {\n minEffectiveSampleSize: options.minEffectiveSampleSize ?? 30,\n minEffectiveSampleRatio: options.minEffectiveSampleRatio ?? 
0.25,\n dropped: trajectoryReport.dropped,\n diagnostics: trajectoryReport.diagnostics,\n })\n return { targetPolicyId: targetPolicy.id, ...estimates, support }\n}\n\nfunction supportDiagnostics(\n estimate: { n: number; effectiveSampleSize: number; maxImportanceWeight: number },\n options: {\n minEffectiveSampleSize: number\n minEffectiveSampleRatio: number\n dropped: number\n diagnostics: string[]\n },\n): BeliefOpeSupportDiagnostics {\n const ratio = estimate.n > 0 ? estimate.effectiveSampleSize / estimate.n : 0\n const reasons: string[] = [...options.diagnostics]\n if (estimate.n === 0) {\n reasons.push('no valid OPE trajectories')\n }\n if (options.dropped > 0) {\n reasons.push(`dropped ${options.dropped} unsupported decision(s)`)\n }\n if (estimate.effectiveSampleSize < options.minEffectiveSampleSize) {\n reasons.push(\n `effective sample size ${estimate.effectiveSampleSize.toFixed(2)} below ${options.minEffectiveSampleSize}`,\n )\n }\n if (ratio < options.minEffectiveSampleRatio) {\n reasons.push(\n `effective sample ratio ${ratio.toFixed(2)} below ${options.minEffectiveSampleRatio}`,\n )\n }\n if (estimate.maxImportanceWeight > 10) {\n reasons.push(`max importance weight ${estimate.maxImportanceWeight.toFixed(2)} is high`)\n }\n return {\n supported: reasons.length === 0,\n n: estimate.n,\n dropped: options.dropped,\n effectiveSampleSize: estimate.effectiveSampleSize,\n effectiveSampleRatio: ratio,\n maxImportanceWeight: estimate.maxImportanceWeight,\n reasons,\n }\n}\n\nfunction rewardOf(point: BeliefDecisionPoint): number {\n if (typeof point.outcome?.reward === 'number') return point.outcome.reward\n if (typeof point.outcome?.score === 'number') return point.outcome.score\n if (point.outcome?.success === true) return 1\n return 0\n}\n\nfunction isBehaviorProbability(value: unknown): value is number {\n return typeof value === 'number' && Number.isFinite(value) && value > 0 && value <= 1\n}\n\nfunction isTargetProbability(value: unknown): value is number 
{\n return typeof value === 'number' && Number.isFinite(value) && value >= 0 && value <= 1\n}\n\nfunction formatProbability(value: unknown): string {\n return typeof value === 'number' ? String(value) : String(value ?? 'missing')\n}\n\nfunction errorMessage(error: unknown): string {\n return error instanceof Error ? error.message : String(error)\n}\n\nfunction compactDiagnostics(diagnostics: string[], maxDiagnostics: number): string[] {\n if (diagnostics.length <= maxDiagnostics) return diagnostics\n return [\n ...diagnostics.slice(0, maxDiagnostics),\n `${diagnostics.length - maxDiagnostics} additional OPE diagnostic(s) omitted`,\n ]\n}\n","import { ValidationError } from '../errors'\nimport { confidenceInterval } from '../statistics'\nimport type {\n BeliefDecisionPoint,\n BeliefPolicyAction,\n BeliefSelectivePolicy,\n BeliefSelectivePolicyMetrics,\n BeliefUtilityOptions,\n} from './types'\n\nexport interface EvaluateBeliefSelectivePolicyOptions {\n utility?: BeliefUtilityOptions\n minN?: number\n minAccepted?: number\n minUtilityDelta?: number\n seed?: number\n}\n\nconst DEFAULT_UTILITY: Required<BeliefUtilityOptions> = {\n successUtility: 1,\n failureUtility: -1,\n deferUtility: 0,\n verifyCost: 0.05,\n askCost: 0.05,\n retryCost: 0.1,\n stopUtility: 0,\n costWeight: 1,\n}\n\nexport function thresholdSelectivePolicy(options: {\n id?: string\n confidenceThreshold: number\n belowThresholdAction?: Exclude<BeliefPolicyAction, 'accept'>\n}): BeliefSelectivePolicy {\n const threshold = options.confidenceThreshold\n if (!Number.isFinite(threshold) || threshold < 0 || threshold > 1) {\n throw new ValidationError(\n `thresholdSelectivePolicy: confidenceThreshold must be in [0, 1], got ${threshold}`,\n )\n }\n const belowThresholdAction = options.belowThresholdAction ?? 'verify'\n return {\n id: options.id ?? `confidence>=${threshold}`,\n decide(point) {\n const confidence = point.confidence ?? 0\n return {\n action: confidence >= threshold ? 
'accept' : belowThresholdAction,\n confidence,\n targetProb: point.targetProb,\n qHat: point.qHat,\n reason:\n confidence >= threshold ? 'confidence threshold passed' : 'confidence threshold failed',\n }\n },\n }\n}\n\nexport function evaluateBeliefSelectivePolicy(\n points: BeliefDecisionPoint[],\n policy: BeliefSelectivePolicy,\n options: EvaluateBeliefSelectivePolicyOptions = {},\n): BeliefSelectivePolicyMetrics {\n const utility = { ...DEFAULT_UTILITY, ...(options.utility ?? {}) }\n const scored = points.filter((point) => point.outcome)\n const minN = options.minN ?? 30\n const minAccepted = options.minAccepted ?? 5\n const minUtilityDelta = options.minUtilityDelta ?? 0\n const deltas: number[] = []\n const acceptedRewards: number[] = []\n const rejectedRewards: number[] = []\n let baselineUtility = 0\n let policyUtility = 0\n let accepted = 0\n let acceptedErrors = 0\n\n for (const point of scored) {\n const baseline = acceptUtility(point, utility)\n const decision = policy.decide(point)\n const candidate = policyDecisionUtility(point, decision.action, utility)\n const reward = rewardOf(point, utility)\n baselineUtility += baseline\n policyUtility += candidate\n deltas.push(candidate - baseline)\n if (decision.action === 'accept') {\n accepted++\n acceptedRewards.push(reward)\n if (reward < 0) acceptedErrors++\n } else {\n rejectedRewards.push(reward)\n }\n }\n\n const n = scored.length\n const rejected = Math.max(0, n - accepted)\n const ci = confidenceInterval(deltas, 0.95, { seed: options.seed ?? 17 })\n const reasons: string[] = []\n if (n < minN) reasons.push(`need at least ${minN} scored decisions, got ${n}`)\n if (accepted < minAccepted)\n reasons.push(`need at least ${minAccepted} accepted decisions, got ${accepted}`)\n if (ci.lower <= minUtilityDelta) {\n reasons.push(`utility CI lower bound ${ci.lower.toFixed(4)} does not clear ${minUtilityDelta}`)\n }\n const recommendation =\n n < minN || accepted < minAccepted\n ? 
'need_more_data'\n : ci.lower > minUtilityDelta\n ? 'ship'\n : 'hold'\n\n return {\n policyId: policy.id,\n n,\n accepted,\n rejected,\n coverage: n > 0 ? accepted / n : 0,\n acceptedErrorRate: accepted > 0 ? acceptedErrors / accepted : 0,\n baselineUtility,\n policyUtility,\n utilityDelta: policyUtility - baselineUtility,\n utilityCi95: ci,\n rejectedMeanReward: rejectedRewards.length > 0 ? mean(rejectedRewards) : null,\n recommendation,\n reasons,\n }\n}\n\nfunction acceptUtility(\n point: BeliefDecisionPoint,\n utility: Required<BeliefUtilityOptions>,\n): number {\n return (\n rewardOf(point, utility) - utility.costWeight * (point.costUsd ?? point.outcome?.costUsd ?? 0)\n )\n}\n\nfunction policyDecisionUtility(\n point: BeliefDecisionPoint,\n action: BeliefPolicyAction,\n utility: Required<BeliefUtilityOptions>,\n): number {\n if (action === 'accept') return acceptUtility(point, utility)\n if (action === 'verify') return utility.deferUtility - utility.verifyCost\n if (action === 'ask') return utility.deferUtility - utility.askCost\n if (action === 'retry') return utility.deferUtility - utility.retryCost\n if (action === 'stop') return utility.stopUtility\n return utility.deferUtility\n}\n\nfunction rewardOf(point: BeliefDecisionPoint, utility: Required<BeliefUtilityOptions>): number {\n const outcome = point.outcome\n if (!outcome) return utility.failureUtility\n if (typeof outcome.reward === 'number') return 2 * outcome.reward - 1\n if (typeof outcome.score === 'number') return 2 * outcome.score - 1\n if (outcome.success === true) return utility.successUtility\n if (outcome.success === false) return utility.failureUtility\n return utility.failureUtility\n}\n\nfunction mean(values: number[]): number {\n return values.reduce((sum, value) => sum + value, 0) / values.length\n}\n","import { type BeliefCalibrationOptions, calibrateBeliefDecisions } from './calibration'\nimport { type BeliefOpeOptions, evaluateBeliefOffPolicy } from './ope'\nimport {\n type 
EvaluateBeliefSelectivePolicyOptions,\n evaluateBeliefSelectivePolicy,\n} from './selective'\nimport type {\n BeliefDecisionPoint,\n BeliefEvaluationStatus,\n BeliefOpeStatus,\n BeliefOpeTargetPolicy,\n BeliefPolicyEvaluationReport,\n BeliefSelectivePolicy,\n} from './types'\n\nexport interface AnalyzeBeliefPolicyOpeOptions extends BeliefOpeOptions {\n targetPolicy?: BeliefOpeTargetPolicy\n}\n\nexport interface AnalyzeBeliefPolicyOptions {\n points: BeliefDecisionPoint[]\n policy: BeliefSelectivePolicy\n selective?: EvaluateBeliefSelectivePolicyOptions\n calibration?: BeliefCalibrationOptions\n ope?: AnalyzeBeliefPolicyOpeOptions\n requireOpe?: boolean\n}\n\nexport function analyzeBeliefPolicy(\n options: AnalyzeBeliefPolicyOptions,\n): BeliefPolicyEvaluationReport {\n const selective = evaluateBeliefSelectivePolicy(options.points, options.policy, options.selective)\n const calibration = calibrateBeliefDecisions(options.points, options.calibration)\n const opeTargetPolicy = options.ope?.targetPolicy\n const ope = opeTargetPolicy\n ? evaluateBeliefOffPolicy(options.points, opeTargetPolicy, options.ope)\n : null\n const diagnostics: string[] = []\n const selectiveStatus = selective.recommendation\n const calibrationStatus = calibration ? 'supported' : 'unsupported'\n const opeRequested = options.requireOpe === true || options.ope !== undefined\n const opeStatus: BeliefOpeStatus = ope\n ? ope.support.supported\n ? 'supported'\n : 'unsupported'\n : opeRequested\n ? 
'unsupported'\n : 'not_requested'\n\n if (!calibration) diagnostics.push('calibration unsupported: not enough confidence/outcome pairs')\n if (opeRequested && !opeTargetPolicy) diagnostics.push('OPE unsupported: missing target policy')\n else if (ope && !ope.support.supported)\n diagnostics.push(...ope.support.reasons.map((reason) => `OPE unsupported: ${reason}`))\n\n const status = overallStatus({\n selectiveStatus,\n hasCalibration: calibration !== null,\n opeStatus,\n opeRequested,\n })\n\n return {\n policyId: options.policy.id,\n n: options.points.length,\n status,\n selectiveStatus,\n calibrationStatus,\n opeStatus,\n ...(ope ? { opeTargetPolicyId: ope.targetPolicyId } : {}),\n selective,\n ...(calibration ? { calibration } : {}),\n ...(ope ? { ope } : {}),\n diagnostics,\n }\n}\n\nfunction overallStatus(options: {\n selectiveStatus: BeliefEvaluationStatus\n hasCalibration: boolean\n opeStatus: BeliefOpeStatus\n opeRequested: boolean\n}): BeliefEvaluationStatus {\n if (options.selectiveStatus === 'need_more_data' || !options.hasCalibration) {\n return 'need_more_data'\n }\n if (options.selectiveStatus === 'hold') return 'hold'\n if (options.opeRequested && options.opeStatus !== 'supported') return 'hold'\n return 
'ship'\n}\n"],"mappings":";;;;;;;;;;;;;;;AAaO,SAAS,yBACd,QACA,UAAoC,CAAC,GACrC;AACA,QAAM,WAAW,wBAAwB,QAAQ,OAAO;AACxD,QAAM,QAAQ,SACX,OAAO,CAAC,UAAU,OAAO,MAAM,eAAe,YAAY,MAAM,OAAO,EACvE,IAAI,CAAC,WAAW;AAAA,IACf,WAAW,MAAM;AAAA,IACjB,SAAS,aAAa,KAAK;AAAA,EAC7B,EAAE,EACD,OAAO,CAAC,SAAS,OAAO,SAAS,KAAK,OAAO,CAAC;AACjD,QAAM,WAAW,QAAQ,YAAY;AACrC,MAAI,MAAM,SAAS,SAAU,QAAO;AACpC,SAAO,qBAAqB,OAAO,qBAAqB,oBAAoB;AAAA,IAC1E,MAAM,QAAQ,QAAQ;AAAA,IACtB,OAAO,EAAE,IAAI,GAAG,IAAI,EAAE;AAAA,EACxB,CAAC;AACH;AAEA,SAAS,wBACP,QACA,SACuB;AACvB,QAAM,SAAS,QAAQ,UAAU;AACjC,MAAI,WAAW,MAAO,QAAO;AAC7B,QAAM,SAAS,QAAQ;AACvB,MAAI,CAAC,QAAQ;AACX,UAAM,IAAI;AAAA,MACR,gEAAgE,MAAM;AAAA,IACxE;AAAA,EACF;AACA,SAAO,OAAO,OAAO,CAAC,UAAU;AAC9B,UAAM,WAAW,OAAO,OAAO,KAAK,EAAE,WAAW;AACjD,WAAO,WAAW,aAAa,WAAW,CAAC;AAAA,EAC7C,CAAC;AACH;AAEA,SAAS,aAAa,OAAoC;AACxD,MAAI,OAAO,MAAM,SAAS,WAAW,SAAU,QAAO,MAAM,QAAQ;AACpE,MAAI,OAAO,MAAM,SAAS,UAAU,SAAU,QAAO,MAAM,QAAQ;AACnE,MAAI,MAAM,SAAS,YAAY,KAAM,QAAO;AAC5C,MAAI,MAAM,SAAS,YAAY,MAAO,QAAO;AAC7C,SAAO,OAAO;AAChB;;;AC1CA,IAAM,mBAAmB,oBAAI,IAAI,CAAC,mBAAmB,mBAAmB,gBAAgB,CAAC;AACzF,IAAM,iBAAsC,oBAAI,IAAI;AAAA,EAClD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,eAAsB,4BACpB,OACA,UAA8C,CAAC,GACN;AACzC,QAAM,OAAO,QAAQ,UAChB,MAAM,QAAQ,IAAI,QAAQ,OAAO,IAAI,CAAC,UAAU,MAAM,OAAO,KAAK,CAAC,CAAC,GAAG,OAAO,OAAO,IACtF,MAAM,MAAM,SAAS;AACzB,QAAM,YAAmC,CAAC;AAC1C,QAAM,cAAoD,CAAC;AAE3D,aAAW,OAAO,MAAM;AACtB,QAAI,CAAC,IAAK;AACV,UAAM,SAAS,MAAM,MAAM,OAAO,EAAE,OAAO,IAAI,MAAM,CAAC;AACtD,UAAM,QAAQ,MAAM,MAAM,MAAM,EAAE,OAAO,IAAI,MAAM,CAAC;AACpD,UAAM,UAAU,IAAI,IAAI,MAAM,IAAI,CAAC,SAAS,KAAK,MAAM,CAAC;AACxD,QAAI,YAAY;AAChB,eAAW,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS,GAAG;AACzE,YAAM,SAAS,mBAAmB,OAAO;AAAA,QACvC,YAAY,IAAI;AAAA,QAChB;AAAA,QACA,YAAY,MAAM,SAAS,QAAQ,IAAI,MAAM,MAAM,IAAI;AAAA,MACzD,CAAC;AACD,UAAI,CAAC,OAAQ;AACb,UAAI,gBAAgB,QAAQ;AAC1B,oBAAY,KAAK,OAAO,UAAU;AAClC;AAAA,MACF;AACA,gBAAU,KAAK,OAAO,QAAQ;AAC9B;AAAA,IACF;AAAA,EA
CF;AAEA,SAAO,EAAE,WAAW,YAAY;AAClC;AAEA,SAAS,mBACP,OACA,SAC+F;AAC/F,QAAM,UAAU,MAAM;AACtB,QAAM,SAAS,YAAY,SAAS,MAAM,KAAK,YAAY,SAAS,MAAM;AAC1E,MAAI,CAAC,UAAU,CAAC,iBAAiB,IAAI,MAAM,EAAG,QAAO;AAErD,QAAM,eAAe,YAAY,SAAS,cAAc;AACxD,MAAI,CAAC,gBAAgB,CAAC,eAAe,IAAI,YAAY,GAAG;AACtD,WAAO;AAAA,MACL,YAAY;AAAA,QACV,OAAO,MAAM;AAAA,QACb,SAAS,MAAM;AAAA,QACf,UAAU;AAAA,QACV,QAAQ,uDAAuD,gBAAgB,EAAE;AAAA,MACnF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,eAAe,YAAY,SAAS,cAAc;AACxD,MAAI,CAAC,cAAc;AACjB,WAAO;AAAA,MACL,YAAY;AAAA,QACV,OAAO,MAAM;AAAA,QACb,SAAS,MAAM;AAAA,QACf,UAAU;AAAA,QACV,QAAQ;AAAA,MACV;AAAA,IACF;AAAA,EACF;AAEA,QAAM,WAAgC;AAAA,IACpC;AAAA,MACE,QAAQ;AAAA,MACR,IAAI,MAAM;AAAA,MACV,OAAO,MAAM;AAAA,MACb,SAAS,MAAM;AAAA,IACjB;AAAA,EACF;AACA,MAAI,MAAM,UAAU,QAAQ,YAAY;AACtC,aAAS,KAAK,EAAE,QAAQ,QAAQ,IAAI,MAAM,QAAQ,OAAO,MAAM,OAAO,QAAQ,MAAM,OAAO,CAAC;AAAA,EAC9F;AAEA,SAAO;AAAA,IACL,UAAU;AAAA,MACR,IAAI,YAAY,SAAS,IAAI,KAAK,MAAM;AAAA,MACxC,OAAO,MAAM;AAAA,MACb,YAAY,YAAY,SAAS,YAAY,KAAK,QAAQ;AAAA,MAC1D,WAAW,YAAY,SAAS,WAAW,KAAK,QAAQ;AAAA,MACxD,MAAM;AAAA,MACN;AAAA,MACA,kBAAkB,iBAAiB,SAAS,kBAAkB;AAAA,MAC9D,YAAY,gBAAgB,SAAS,YAAY;AAAA,MACjD,cAAc,YAAY,SAAS,cAAc;AAAA,MACjD,YAAY,YAAY,SAAS,YAAY;AAAA,MAC7C,MAAM,gBAAgB,SAAS,MAAM;AAAA,MACrC,SAAS,uBAAuB,SAAS,SAAS;AAAA,MAClD;AAAA,MACA,SAAS,aAAa,OAAO;AAAA,MAC7B,UAAU,YAAY,SAAS,UAAU;AAAA,IAC3C;AAAA,EACF;AACF;AAEA,SAAS,aAAa,SAAqE;AACzF,QAAM,QAAQ,YAAY,SAAS,SAAS;AAC5C,MAAI,CAAC,MAAO,QAAO;AACnB,SAAO;AAAA,IACL,SAAS,OAAO,MAAM,YAAY,YAAY,MAAM,UAAU;AAAA,IAC9D,OAAO,gBAAgB,OAAO,OAAO;AAAA,IACrC,QAAQ,gBAAgB,OAAO,QAAQ;AAAA,IACvC,SAAS,uBAAuB,OAAO,SAAS;AAAA,IAChD,YAAY,YAAY,OAAO,YAAY;AAAA,IAC3C,UAAU,YAAY,OAAO,UAAU;AAAA,EACzC;AACF;AAEA,SAAS,YAAY,KAA8B,KAAiC;AAClF,QAAM,QAAQ,IAAI,GAAG;AACrB,SAAO,OAAO,UAAU,YAAY,MAAM,SAAS,IAAI,QAAQ;AACjE;AAEA,SAAS,YAAY,KAA8B,KAAiC;AAClF,QAAM,QAAQ,IAAI,GAAG;AACrB,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK,IAAI,QAAQ;AACvE;AAEA,SAAS,gBAAgB,KAA8B,KAAiC;AACtF,QAAM,QAAQ,YAAY,KAAK,GAAG;AAClC,SAAO,UAAU,SAAY,SAAY,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,KAAK,CAAC;AACzE;AAEA,SAAS,uBAAuB,KAA8B,KAA
iC;AAC7F,QAAM,QAAQ,YAAY,KAAK,GAAG;AAClC,SAAO,UAAU,SAAY,SAAY,KAAK,IAAI,GAAG,KAAK;AAC5D;AAEA,SAAS,iBAAiB,KAA8B,KAAmC;AACzF,QAAM,QAAQ,IAAI,GAAG;AACrB,MAAI,CAAC,MAAM,QAAQ,KAAK,EAAG,QAAO;AAClC,QAAM,UAAU,MAAM;AAAA,IACpB,CAAC,SAAyB,OAAO,SAAS,YAAY,KAAK,SAAS;AAAA,EACtE;AACA,SAAO,QAAQ,SAAS,IAAI,UAAU;AACxC;AAEA,SAAS,YACP,KACA,KACqC;AACrC,QAAM,QAAQ,IAAI,GAAG;AACrB,MAAI,CAAC,SAAS,OAAO,UAAU,YAAY,MAAM,QAAQ,KAAK,EAAG,QAAO;AACxE,SAAO;AACT;;;AC1JO,SAAS,8BAA8B,KAAK,wBAA+C;AAChG,SAAO;AAAA,IACL;AAAA,IACA,aAAa,OAAO;AAClB,aAAO,MAAM;AAAA,IACf;AAAA,IACA,OAAO,OAAO;AACZ,aAAO,MAAM;AAAA,IACf;AAAA,EACF;AACF;AAEO,SAAS,uCACd,QACA,cACA,UAAoD,CAAC,GACpB;AACjC,QAAM,eAAsC,CAAC;AAC7C,QAAM,cAAwB,CAAC;AAC/B,aAAW,SAAS,QAAQ;AAC1B,QAAI,CAAC,MAAM,SAAS;AAClB,kBAAY,KAAK,GAAG,MAAM,EAAE,mBAAmB;AAC/C;AAAA,IACF;AACA,QAAI,CAAC,sBAAsB,MAAM,YAAY,GAAG;AAC9C,kBAAY,KAAK,GAAG,MAAM,EAAE,0BAA0B,kBAAkB,MAAM,YAAY,CAAC,EAAE;AAC7F;AAAA,IACF;AAEA,QAAI;AACJ,QAAI;AACJ,QAAI;AACF,mBAAa,aAAa,aAAa,KAAK;AAC5C,aAAO,aAAa,SAAS,KAAK;AAAA,IACpC,SAAS,OAAO;AACd,kBAAY;AAAA,QACV,GAAG,MAAM,EAAE,mBAAmB,aAAa,EAAE,WAAW,aAAa,KAAK,CAAC;AAAA,MAC7E;AACA;AAAA,IACF;AACA,QAAI,CAAC,oBAAoB,UAAU,GAAG;AACpC,kBAAY,KAAK,GAAG,MAAM,EAAE,wBAAwB,kBAAkB,UAAU,CAAC,EAAE;AACnF;AAAA,IACF;AACA,QAAI,SAAS,QAAQ,SAAS,UAAa,CAAC,oBAAoB,IAAI,GAAG;AACrE,kBAAY,KAAK,GAAG,MAAM,EAAE,kBAAkB,kBAAkB,IAAI,CAAC,iBAAiB;AACtF,aAAO;AAAA,IACT;AAEA,iBAAa,KAAK;AAAA,MAChB,OAAO,MAAM;AAAA,MACb,QAAQ,SAAS,KAAK;AAAA,MACtB,cAAc,MAAM;AAAA,MACpB;AAAA,MACA;AAAA,IACF,CAAC;AAAA,EACH;AACA,SAAO;AAAA,IACL,gBAAgB,aAAa;AAAA,IAC7B;AAAA,IACA,SAAS,OAAO,SAAS,aAAa;AAAA,IACtC,aAAa,mBAAmB,aAAa,QAAQ,kBAAkB,EAAE;AAAA,EAC3E;AACF;AAEO,SAAS,wBACd,QACA,cACA,UAA4B,CAAC,GACZ;AACjB,QAAM,mBAAmB,uCAAuC,QAAQ,cAAc,OAAO;AAC7F,QAAM,EAAE,aAAa,IAAI;AACzB,QAAM,YAAY,qBAAqB,cAAc,OAAO;AAC5D,QAAM,UAAU,mBAAmB,UAAU,IAAI;AAAA,IAC/C,wBAAwB,QAAQ,0BAA0B;AAAA,IAC1D,yBAAyB,QAAQ,2BAA2B;AAAA,IAC5D,SAAS,iBAAiB;AAAA,IAC1B,aAAa,iBAAiB;AAAA,EAChC,CAAC;AACD,SAAO,EAAE,gBAAgB,aAAa,IAAI,GAAG,WAAW,QAAQ;AAClE;AAEA,SAAS,mBACP,UACA,SAM6B;AAC7B,QAAM,QAAQ,SAAS,
IAAI,IAAI,SAAS,sBAAsB,SAAS,IAAI;AAC3E,QAAM,UAAoB,CAAC,GAAG,QAAQ,WAAW;AACjD,MAAI,SAAS,MAAM,GAAG;AACpB,YAAQ,KAAK,2BAA2B;AAAA,EAC1C;AACA,MAAI,QAAQ,UAAU,GAAG;AACvB,YAAQ,KAAK,WAAW,QAAQ,OAAO,0BAA0B;AAAA,EACnE;AACA,MAAI,SAAS,sBAAsB,QAAQ,wBAAwB;AACjE,YAAQ;AAAA,MACN,yBAAyB,SAAS,oBAAoB,QAAQ,CAAC,CAAC,UAAU,QAAQ,sBAAsB;AAAA,IAC1G;AAAA,EACF;AACA,MAAI,QAAQ,QAAQ,yBAAyB;AAC3C,YAAQ;AAAA,MACN,0BAA0B,MAAM,QAAQ,CAAC,CAAC,UAAU,QAAQ,uBAAuB;AAAA,IACrF;AAAA,EACF;AACA,MAAI,SAAS,sBAAsB,IAAI;AACrC,YAAQ,KAAK,yBAAyB,SAAS,oBAAoB,QAAQ,CAAC,CAAC,UAAU;AAAA,EACzF;AACA,SAAO;AAAA,IACL,WAAW,QAAQ,WAAW;AAAA,IAC9B,GAAG,SAAS;AAAA,IACZ,SAAS,QAAQ;AAAA,IACjB,qBAAqB,SAAS;AAAA,IAC9B,sBAAsB;AAAA,IACtB,qBAAqB,SAAS;AAAA,IAC9B;AAAA,EACF;AACF;AAEA,SAAS,SAAS,OAAoC;AACpD,MAAI,OAAO,MAAM,SAAS,WAAW,SAAU,QAAO,MAAM,QAAQ;AACpE,MAAI,OAAO,MAAM,SAAS,UAAU,SAAU,QAAO,MAAM,QAAQ;AACnE,MAAI,MAAM,SAAS,YAAY,KAAM,QAAO;AAC5C,SAAO;AACT;AAEA,SAAS,sBAAsB,OAAiC;AAC9D,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK,KAAK,QAAQ,KAAK,SAAS;AACtF;AAEA,SAAS,oBAAoB,OAAiC;AAC5D,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK,KAAK,SAAS,KAAK,SAAS;AACvF;AAEA,SAAS,kBAAkB,OAAwB;AACjD,SAAO,OAAO,UAAU,WAAW,OAAO,KAAK,IAAI,OAAO,SAAS,SAAS;AAC9E;AAEA,SAAS,aAAa,OAAwB;AAC5C,SAAO,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAC9D;AAEA,SAAS,mBAAmB,aAAuB,gBAAkC;AACnF,MAAI,YAAY,UAAU,eAAgB,QAAO;AACjD,SAAO;AAAA,IACL,GAAG,YAAY,MAAM,GAAG,cAAc;AAAA,IACtC,GAAG,YAAY,SAAS,cAAc;AAAA,EACxC;AACF;;;AC/JA,IAAM,kBAAkD;AAAA,EACtD,gBAAgB;AAAA,EAChB,gBAAgB;AAAA,EAChB,cAAc;AAAA,EACd,YAAY;AAAA,EACZ,SAAS;AAAA,EACT,WAAW;AAAA,EACX,aAAa;AAAA,EACb,YAAY;AACd;AAEO,SAAS,yBAAyB,SAIf;AACxB,QAAM,YAAY,QAAQ;AAC1B,MAAI,CAAC,OAAO,SAAS,SAAS,KAAK,YAAY,KAAK,YAAY,GAAG;AACjE,UAAM,IAAI;AAAA,MACR,wEAAwE,SAAS;AAAA,IACnF;AAAA,EACF;AACA,QAAM,uBAAuB,QAAQ,wBAAwB;AAC7D,SAAO;AAAA,IACL,IAAI,QAAQ,MAAM,eAAe,SAAS;AAAA,IAC1C,OAAO,OAAO;AACZ,YAAM,aAAa,MAAM,cAAc;AACvC,aAAO;AAAA,QACL,QAAQ,cAAc,YAAY,WAAW;AAAA,QAC7C;AAAA,QACA,YAAY,MAAM;AAAA,QAClB,MAAM,MAAM;AAAA,QACZ,QACE,cAAc,YAAY,gCAAgC;AAAA,MAC9D;AAAA,IACF;AAAA,EACF;AACF;AAEO,SAAS,8BACd,QACA,QACA,UAAgD,CAAC,GACn
B;AAC9B,QAAM,UAAU,EAAE,GAAG,iBAAiB,GAAI,QAAQ,WAAW,CAAC,EAAG;AACjE,QAAM,SAAS,OAAO,OAAO,CAAC,UAAU,MAAM,OAAO;AACrD,QAAM,OAAO,QAAQ,QAAQ;AAC7B,QAAM,cAAc,QAAQ,eAAe;AAC3C,QAAM,kBAAkB,QAAQ,mBAAmB;AACnD,QAAM,SAAmB,CAAC;AAC1B,QAAM,kBAA4B,CAAC;AACnC,QAAM,kBAA4B,CAAC;AACnC,MAAI,kBAAkB;AACtB,MAAI,gBAAgB;AACpB,MAAI,WAAW;AACf,MAAI,iBAAiB;AAErB,aAAW,SAAS,QAAQ;AAC1B,UAAM,WAAW,cAAc,OAAO,OAAO;AAC7C,UAAM,WAAW,OAAO,OAAO,KAAK;AACpC,UAAM,YAAY,sBAAsB,OAAO,SAAS,QAAQ,OAAO;AACvE,UAAM,SAASA,UAAS,OAAO,OAAO;AACtC,uBAAmB;AACnB,qBAAiB;AACjB,WAAO,KAAK,YAAY,QAAQ;AAChC,QAAI,SAAS,WAAW,UAAU;AAChC;AACA,sBAAgB,KAAK,MAAM;AAC3B,UAAI,SAAS,EAAG;AAAA,IAClB,OAAO;AACL,sBAAgB,KAAK,MAAM;AAAA,IAC7B;AAAA,EACF;AAEA,QAAM,IAAI,OAAO;AACjB,QAAM,WAAW,KAAK,IAAI,GAAG,IAAI,QAAQ;AACzC,QAAM,KAAK,mBAAmB,QAAQ,MAAM,EAAE,MAAM,QAAQ,QAAQ,GAAG,CAAC;AACxE,QAAM,UAAoB,CAAC;AAC3B,MAAI,IAAI,KAAM,SAAQ,KAAK,iBAAiB,IAAI,0BAA0B,CAAC,EAAE;AAC7E,MAAI,WAAW;AACb,YAAQ,KAAK,iBAAiB,WAAW,4BAA4B,QAAQ,EAAE;AACjF,MAAI,GAAG,SAAS,iBAAiB;AAC/B,YAAQ,KAAK,0BAA0B,GAAG,MAAM,QAAQ,CAAC,CAAC,mBAAmB,eAAe,EAAE;AAAA,EAChG;AACA,QAAM,iBACJ,IAAI,QAAQ,WAAW,cACnB,mBACA,GAAG,QAAQ,kBACT,SACA;AAER,SAAO;AAAA,IACL,UAAU,OAAO;AAAA,IACjB;AAAA,IACA;AAAA,IACA;AAAA,IACA,UAAU,IAAI,IAAI,WAAW,IAAI;AAAA,IACjC,mBAAmB,WAAW,IAAI,iBAAiB,WAAW;AAAA,IAC9D;AAAA,IACA;AAAA,IACA,cAAc,gBAAgB;AAAA,IAC9B,aAAa;AAAA,IACb,oBAAoB,gBAAgB,SAAS,IAAI,KAAK,eAAe,IAAI;AAAA,IACzE;AAAA,IACA;AAAA,EACF;AACF;AAEA,SAAS,cACP,OACA,SACQ;AACR,SACEA,UAAS,OAAO,OAAO,IAAI,QAAQ,cAAc,MAAM,WAAW,MAAM,SAAS,WAAW;AAEhG;AAEA,SAAS,sBACP,OACA,QACA,SACQ;AACR,MAAI,WAAW,SAAU,QAAO,cAAc,OAAO,OAAO;AAC5D,MAAI,WAAW,SAAU,QAAO,QAAQ,eAAe,QAAQ;AAC/D,MAAI,WAAW,MAAO,QAAO,QAAQ,eAAe,QAAQ;AAC5D,MAAI,WAAW,QAAS,QAAO,QAAQ,eAAe,QAAQ;AAC9D,MAAI,WAAW,OAAQ,QAAO,QAAQ;AACtC,SAAO,QAAQ;AACjB;AAEA,SAASA,UAAS,OAA4B,SAAiD;AAC7F,QAAM,UAAU,MAAM;AACtB,MAAI,CAAC,QAAS,QAAO,QAAQ;AAC7B,MAAI,OAAO,QAAQ,WAAW,SAAU,QAAO,IAAI,QAAQ,SAAS;AACpE,MAAI,OAAO,QAAQ,UAAU,SAAU,QAAO,IAAI,QAAQ,QAAQ;AAClE,MAAI,QAAQ,YAAY,KAAM,QAAO,QAAQ;AAC7C,MAAI,QAAQ,YAAY,MAAO,QAAO,QAAQ;AAC9C,SAAO,QAAQ;AAC
jB;AAEA,SAAS,KAAK,QAA0B;AACtC,SAAO,OAAO,OAAO,CAAC,KAAK,UAAU,MAAM,OAAO,CAAC,IAAI,OAAO;AAChE;;;ACpIO,SAAS,oBACd,SAC8B;AAC9B,QAAM,YAAY,8BAA8B,QAAQ,QAAQ,QAAQ,QAAQ,QAAQ,SAAS;AACjG,QAAM,cAAc,yBAAyB,QAAQ,QAAQ,QAAQ,WAAW;AAChF,QAAM,kBAAkB,QAAQ,KAAK;AACrC,QAAM,MAAM,kBACR,wBAAwB,QAAQ,QAAQ,iBAAiB,QAAQ,GAAG,IACpE;AACJ,QAAM,cAAwB,CAAC;AAC/B,QAAM,kBAAkB,UAAU;AAClC,QAAM,oBAAoB,cAAc,cAAc;AACtD,QAAM,eAAe,QAAQ,eAAe,QAAQ,QAAQ,QAAQ;AACpE,QAAM,YAA6B,MAC/B,IAAI,QAAQ,YACV,cACA,gBACF,eACE,gBACA;AAEN,MAAI,CAAC,YAAa,aAAY,KAAK,8DAA8D;AACjG,MAAI,gBAAgB,CAAC,gBAAiB,aAAY,KAAK,wCAAwC;AAAA,WACtF,OAAO,CAAC,IAAI,QAAQ;AAC3B,gBAAY,KAAK,GAAG,IAAI,QAAQ,QAAQ,IAAI,CAAC,WAAW,oBAAoB,MAAM,EAAE,CAAC;AAEvF,QAAM,SAAS,cAAc;AAAA,IAC3B;AAAA,IACA,gBAAgB,gBAAgB;AAAA,IAChC;AAAA,IACA;AAAA,EACF,CAAC;AAED,SAAO;AAAA,IACL,UAAU,QAAQ,OAAO;AAAA,IACzB,GAAG,QAAQ,OAAO;AAAA,IAClB;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAI,MAAM,EAAE,mBAAmB,IAAI,eAAe,IAAI,CAAC;AAAA,IACvD;AAAA,IACA,GAAI,cAAc,EAAE,YAAY,IAAI,CAAC;AAAA,IACrC,GAAI,MAAM,EAAE,IAAI,IAAI,CAAC;AAAA,IACrB;AAAA,EACF;AACF;AAEA,SAAS,cAAc,SAKI;AACzB,MAAI,QAAQ,oBAAoB,oBAAoB,CAAC,QAAQ,gBAAgB;AAC3E,WAAO;AAAA,EACT;AACA,MAAI,QAAQ,oBAAoB,OAAQ,QAAO;AAC/C,MAAI,QAAQ,gBAAgB,QAAQ,cAAc,YAAa,QAAO;AACtE,SAAO;AACT;","names":["rewardOf"]}
@@ -1,4 +1,4 @@
1
- export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-DsnOpCO6.js';
2
- import '../run-record-BgTFzO2r.js';
1
+ export { B as BENCHMARK_SPLIT_SEED, a as BenchmarkAdapter, b as BenchmarkDatasetItem, c as BenchmarkEvaluation, d as deterministicSplit, e as routing } from '../index-B1RKber3.js';
2
+ import '../run-record-sItO5ftF.js';
3
3
  import '../errors-Dwqw-T_m.js';
4
4
  import '../schema-m0gsnbt3.js';
@@ -0,0 +1,101 @@
1
+ import { T as TraceStore } from './store-CKUAgsJz.js';
2
+ import { R as Run } from './schema-m0gsnbt3.js';
3
+ import { O as OutcomeFilter, b as OutcomeStore } from './outcome-store-rnXLEqSn.js';
4
+
5
+ /**
6
+ * Correlation study — "does our eval score predict real-world outcomes?"
7
+ *
8
+ * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
9
+ * joins on runId, computes Pearson + Spearman + bootstrap CI for every
10
+ * (evalMetric, outcomeMetric) pair the caller declares.
11
+ *
12
+ * Without this number the framework is ornamental. With it and r > 0.6
13
+ * the framework is a moat — no other agent-eval tool publishes one.
14
+ */
15
+
16
+ interface EvalMetricSpec {
17
+ id: string;
18
+ /** Optional custom extractor returning a scalar for a run; when omitted, built-in defaults cover score/pass/durationMs/costUsd/tokens. */
19
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
20
+ }
21
+ interface OutcomePair {
22
+ evalMetric: string;
23
+ outcomeMetric: string;
24
+ }
25
+ interface CorrelationResult {
26
+ evalMetric: string;
27
+ outcomeMetric: string;
28
+ n: number;
29
+ pearson: number;
30
+ spearman: number;
31
+ /** 95% bootstrap CI for Pearson. */
32
+ pearsonCi95: {
33
+ lower: number;
34
+ upper: number;
35
+ };
36
+ /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
37
+ verdict: 'strong' | 'moderate' | 'weak';
38
+ }
39
+ interface CorrelationStudyResult {
40
+ pairs: CorrelationResult[];
41
+ joinedSamples: number;
42
+ skippedRuns: number;
43
+ }
44
+ interface CorrelationStudyOptions {
45
+ /** Only join outcomes captured within this window after run.startedAt. */
46
+ maxCaptureLagMs?: number;
47
+ /** Restrict to a subset of outcomes (cohort, region, source). */
48
+ outcomeFilter?: OutcomeFilter;
49
+ /** Which outcome per run to use when multiple exist. Default 'latest'. */
50
+ reduction?: 'latest' | 'mean' | 'max';
51
+ /** Bootstrap iterations for the CI. Default 500. */
52
+ bootstrapIterations?: number;
53
+ }
54
+ declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
55
+
56
+ /**
57
+ * Calibration curve — binned "if eval says X, what does reality show?"
58
+ *
59
+ * Companion to correlationStudy. Raw correlation is a single number;
60
+ * the calibration curve shows *where* the eval is well-calibrated vs
61
+ * overconfident / underconfident. Buckets the eval metric, computes
62
+ * mean outcome per bucket, reports expected-calibration-error (ECE).
63
+ */
64
+
65
+ interface CalibrationBin {
66
+ lower: number;
67
+ upper: number;
68
+ n: number;
69
+ evalMean: number;
70
+ outcomeMean: number;
71
+ /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
72
+ gap: number;
73
+ }
74
+ interface CalibrationReport {
75
+ evalMetric: string;
76
+ outcomeMetric: string;
77
+ n: number;
78
+ bins: CalibrationBin[];
79
+ /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
80
+ ece: number;
81
+ /** Largest single-bin gap (max of `gap` over `bins`) — an upper bound on `ece`, which is the n-weighted mean of those gaps. */
82
+ maxGap: number;
83
+ }
84
+ interface CalibrationOptions {
85
+ bins?: number;
86
+ /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
87
+ binning?: 'equal-width' | 'equal-frequency';
88
+ /** Clip eval values to [lo, hi] before binning. */
89
+ range?: {
90
+ lo: number;
91
+ hi: number;
92
+ };
93
+ }
94
+ interface CalibrationPair {
95
+ evalScore: number;
96
+ outcome: number;
97
+ }
98
+ declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
99
+ declare function calibrationFromPairs(inputPairs: CalibrationPair[], evalMetric: string, outcomeMetric: string, options?: CalibrationOptions): CalibrationReport | null;
100
+
101
+ export { type CalibrationBin as C, type EvalMetricSpec as E, type OutcomePair as O, type CalibrationOptions as a, type CalibrationPair as b, type CalibrationReport as c, type CorrelationResult as d, type CorrelationStudyOptions as e, type CorrelationStudyResult as f, calibrationCurve as g, calibrationFromPairs as h, correlationStudy as i };