@tangle-network/agent-eval 0.23.1 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +80 -0
  2. package/README.md +141 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  20. package/dist/chunk-6QDKWHLS.js.map +1 -0
  21. package/dist/chunk-I4MBDTY5.js +272 -0
  22. package/dist/chunk-I4MBDTY5.js.map +1 -0
  23. package/dist/chunk-K2TPS5LB.js +569 -0
  24. package/dist/chunk-K2TPS5LB.js.map +1 -0
  25. package/dist/chunk-KKHDIONI.js +414 -0
  26. package/dist/chunk-KKHDIONI.js.map +1 -0
  27. package/dist/chunk-KMPRBJK4.js +74 -0
  28. package/dist/chunk-KMPRBJK4.js.map +1 -0
  29. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  30. package/dist/chunk-KTGTIOFD.js.map +1 -0
  31. package/dist/chunk-LSH4MMOZ.js +838 -0
  32. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  33. package/dist/chunk-NG236HPC.js +57 -0
  34. package/dist/chunk-NG236HPC.js.map +1 -0
  35. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  36. package/dist/chunk-NLMNWKVM.js.map +1 -0
  37. package/dist/chunk-NU65VQ7M.js +99 -0
  38. package/dist/chunk-NU65VQ7M.js.map +1 -0
  39. package/dist/chunk-OHEPNJQN.js +554 -0
  40. package/dist/chunk-OHEPNJQN.js.map +1 -0
  41. package/dist/chunk-OWLAAMME.js +250 -0
  42. package/dist/chunk-OWLAAMME.js.map +1 -0
  43. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  44. package/dist/chunk-PC4UYEBM.js.map +1 -0
  45. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  46. package/dist/chunk-RAF443UI.js.map +1 -0
  47. package/dist/chunk-RZTMDUO7.js +49 -0
  48. package/dist/chunk-RZTMDUO7.js.map +1 -0
  49. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  50. package/dist/chunk-SESZDQPX.js.map +1 -0
  51. package/dist/{chunk-6KQG5HAH.js → chunk-SY6WAAAD.js} +84 -71
  52. package/dist/chunk-SY6WAAAD.js.map +1 -0
  53. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  54. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  55. package/dist/{chunk-VQQSPGSM.js → chunk-VRJVTXRV.js} +169 -111
  56. package/dist/chunk-VRJVTXRV.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +1866 -3151
  80. package/dist/index.js +5457 -7809
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +1 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +409 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-TDPn1cxq.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-CUOiGcGv.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-BXGs_9V0.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +22 -22
  125. package/dist/wire/index.js +4 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,222 @@
1
+ import {
2
+ exportTrainingData,
3
+ toNdjson
4
+ } from "../chunk-KMPRBJK4.js";
5
+ import {
6
+ buildTrajectory
7
+ } from "../chunk-RZTMDUO7.js";
8
+ import "../chunk-5BKGXME7.js";
9
+ import {
10
+ TraceEmitter
11
+ } from "../chunk-TVVP3ZZQ.js";
12
+ import "../chunk-PZ5AY32C.js";
13
+
14
+ // src/prm/builtin-rubrics.ts
15
+ function outputLengthRubric(args = {}) {
16
+ const min = args.minChars ?? 20;
17
+ const max = args.maxChars ?? 8e3;
18
+ return {
19
+ id: "output-length",
20
+ kinds: ["llm"],
21
+ weight: args.weight ?? 0.5,
22
+ async grade({ step }) {
23
+ const llm = step.span;
24
+ const len = (llm.output ?? "").length;
25
+ if (len === 0) return { score: 0, rationale: "empty output" };
26
+ if (len < min)
27
+ return { score: Math.max(0, len / min), rationale: `below min (${len} < ${min})` };
28
+ if (len > max)
29
+ return {
30
+ score: Math.max(0, 1 - (len - max) / max),
31
+ rationale: `above max (${len} > ${max})`
32
+ };
33
+ return { score: 1, rationale: `${len} chars in bounds` };
34
+ }
35
+ };
36
+ }
37
+ function toolSuccessRubric(args = {}) {
38
+ return {
39
+ id: "tool-success",
40
+ kinds: ["tool"],
41
+ weight: args.weight ?? 1,
42
+ async grade({ step }) {
43
+ const tool = step.span;
44
+ if (tool.status === "error")
45
+ return { score: 0, rationale: `error: ${tool.error ?? "unknown"}` };
46
+ const r = tool.result;
47
+ if (r === null || r === void 0) return { score: 0.3, rationale: "empty result" };
48
+ const asText = typeof r === "string" ? r : JSON.stringify(r);
49
+ if (asText.length < 4) return { score: 0.5, rationale: "tiny result" };
50
+ return { score: 1, rationale: `${tool.toolName} ok` };
51
+ }
52
+ };
53
+ }
54
+ function toolNonRedundantRubric(args = {}) {
55
+ const weight = args.weight ?? 0.5;
56
+ return {
57
+ id: "tool-non-redundant",
58
+ kinds: ["tool"],
59
+ weight,
60
+ async grade({ step, prior }) {
61
+ const tool = step.span;
62
+ const priorMatches = prior.filter((p) => {
63
+ if (p.span.kind !== "tool") return false;
64
+ const pt = p.span;
65
+ return pt.toolName === tool.toolName && stableStringify(pt.args) === stableStringify(tool.args);
66
+ });
67
+ if (priorMatches.length === 0) return { score: 1, rationale: "novel call" };
68
+ return {
69
+ score: Math.max(0, 1 - priorMatches.length * 0.5),
70
+ rationale: `${priorMatches.length} duplicate(s)`
71
+ };
72
+ }
73
+ };
74
+ }
75
+ function nonRefusalRubric(args = {}) {
76
+ const weight = args.weight ?? 1;
77
+ const markers = args.markers ?? [
78
+ /\bi\s+(?:can(?:not|'t)|won't|will\s+not)\b/i,
79
+ /\b(?:as\s+an?\s+)?ai\b.*?\b(?:can't|cannot)\b/i
80
+ ];
81
+ return {
82
+ id: "non-refusal",
83
+ kinds: ["llm"],
84
+ weight,
85
+ async grade({ step }) {
86
+ const llm = step.span;
87
+ const out = llm.output ?? "";
88
+ const refused = markers.some((re) => re.test(out));
89
+ return refused ? { score: 0, rationale: "refusal marker present" } : { score: 1, rationale: "no refusal" };
90
+ }
91
+ };
92
+ }
93
/**
 * Built-in rubric that rewards LLM outputs which name the tool that is
 * called next in the trajectory.
 *
 * The "next tool" is the first `tool` span among the steps after the graded
 * one; the name match is a case-insensitive substring check on the output.
 *
 * @param args Optional settings: `weight` (default 0.5).
 * @returns A rubric restricted to `llm` spans. `grade` resolves to `null`
 *   when no later tool span exists, score 1 when the tool name is mentioned,
 *   and 0.5 when it is not.
 */
function toolIntentAlignmentRubric(args = {}) {
  const rubricWeight = args.weight ?? 0.5;
  return {
    id: "tool-intent-alignment",
    kinds: ["llm"],
    weight: rubricWeight,
    async grade(ctx) {
      const llmSpan = ctx.step.span;
      let upcoming;
      for (const candidate of ctx.next) {
        if (candidate.span.kind === "tool") {
          upcoming = candidate;
          break;
        }
      }
      // Nothing to align with: skip grading this step.
      if (!upcoming) return null;
      const toolName = upcoming.span.toolName;
      const haystack = (llmSpan.output ?? "").toLowerCase();
      if (haystack.includes(toolName.toLowerCase())) {
        return { score: 1, rationale: `mentioned "${toolName}" before calling it` };
      }
      return { score: 0.5, rationale: `called "${toolName}" without announcing it` };
    }
  };
}
109
/**
 * Serialize a value to a string that does not depend on object key order.
 *
 * Primitives and `null` go through `JSON.stringify`; arrays keep their
 * element order; plain objects are emitted with their own enumerable keys
 * sorted. Used to compare tool arguments for equality.
 */
function stableStringify(value) {
  const isContainer = typeof value === "object" && value !== null;
  if (!isContainer) return JSON.stringify(value);

  if (Array.isArray(value)) {
    const items = [];
    for (let i = 0; i < value.length; i++) {
      items.push(stableStringify(value[i]));
    }
    return `[${items.join(",")}]`;
  }

  const entries = [];
  for (const key of Object.keys(value).sort()) {
    entries.push(`${JSON.stringify(key)}:${stableStringify(value[key])}`);
  }
  return `{${entries.join(",")}}`;
}
115
+
116
+ // src/prm/inference.ts
117
+ async function prmBestOfN(store, grader, runIds) {
118
+ if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
119
+ const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
120
+ const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
121
+ const mean = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
122
+ const variance = graded.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / graded.length;
123
+ return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance) };
124
+ }
125
+ async function prmEnsembleBestOfN(store, graders, runIds) {
126
+ if (graders.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 grader");
127
+ const perGrader = await Promise.all(
128
+ graders.map(async (g) => {
129
+ const graded = await Promise.all(runIds.map((id) => g.grade(store, id)));
130
+ return graded.sort((a, b) => b.aggregateScore - a.aggregateScore);
131
+ })
132
+ );
133
+ const bordaScores = /* @__PURE__ */ new Map();
134
+ for (const ranking of perGrader) {
135
+ ranking.forEach((g, rank) => {
136
+ bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank));
137
+ });
138
+ }
139
+ const canonical = perGrader[0];
140
+ const byRun = new Map(canonical.map((g) => [g.runId, g]));
141
+ const ranked = [...byRun.values()].sort(
142
+ (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
143
+ );
144
+ const mean = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
145
+ const variance = ranked.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / ranked.length;
146
+ return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance) };
147
+ }
148
+
149
+ // src/prm/rubric.ts
150
+ var PrmGrader = class {
151
+ constructor(rubrics) {
152
+ this.rubrics = rubrics;
153
+ if (rubrics.length === 0) throw new Error("PrmGrader: at least 1 rubric required");
154
+ }
155
+ rubrics;
156
+ /**
157
+ * Grade every eligible span in a run. Emits a JudgeVerdict span for each
158
+ * (rubric × span) verdict so the result is visible to downstream pipelines
159
+ * (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
160
+ */
161
+ async grade(store, runId) {
162
+ const trajectory = await buildTrajectory(store, runId);
163
+ const emitter = new TraceEmitter(store, { runId });
164
+ const steps = [];
165
+ let ungraded = 0;
166
+ for (let i = 0; i < trajectory.steps.length; i++) {
167
+ const step = trajectory.steps[i];
168
+ const ctx = {
169
+ trajectory,
170
+ step,
171
+ prior: trajectory.steps.slice(0, i),
172
+ next: trajectory.steps.slice(i + 1)
173
+ };
174
+ let gradedThis = false;
175
+ for (const rubric of this.rubrics) {
176
+ if (rubric.kinds && !rubric.kinds.includes(step.span.kind)) continue;
177
+ const verdict = await rubric.grade(ctx);
178
+ if (verdict === null) continue;
179
+ const weight = rubric.weight ?? 1;
180
+ steps.push({
181
+ spanId: step.span.spanId,
182
+ rubricId: rubric.id,
183
+ score: verdict.score,
184
+ weight,
185
+ rationale: verdict.rationale,
186
+ evidence: verdict.evidence
187
+ });
188
+ gradedThis = true;
189
+ await emitter.recordJudge({
190
+ judgeId: `prm:${rubric.id}`,
191
+ targetSpanId: step.span.spanId,
192
+ dimension: "step_quality",
193
+ score: verdict.score,
194
+ rationale: verdict.rationale,
195
+ evidence: verdict.evidence,
196
+ name: `prm:${rubric.id}`
197
+ });
198
+ }
199
+ if (!gradedThis) ungraded++;
200
+ }
201
+ const totalWeight = steps.reduce((a, s) => a + s.weight, 0);
202
+ const aggregateScore = totalWeight === 0 ? 0 : steps.reduce((a, s) => a + s.score * s.weight, 0) / totalWeight;
203
+ return { runId, steps, aggregateScore, gradedCount: steps.length, ungradedCount: ungraded };
204
+ }
205
+ };
206
/**
 * Tell whether a judge verdict was produced by PrmGrader, which records its
 * verdicts under judge ids of the form `prm:<rubric id>`.
 *
 * @param verdict Judge span with a string `judgeId`.
 * @returns `true` when the judge id starts with "prm:".
 */
function isPrmVerdict(verdict) {
  const { judgeId } = verdict;
  return judgeId.startsWith("prm:");
}
209
+ export {
210
+ PrmGrader,
211
+ exportTrainingData,
212
+ isPrmVerdict,
213
+ nonRefusalRubric,
214
+ outputLengthRubric,
215
+ prmBestOfN,
216
+ prmEnsembleBestOfN,
217
+ toNdjson,
218
+ toolIntentAlignmentRubric,
219
+ toolNonRedundantRubric,
220
+ toolSuccessRubric
221
+ };
222
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/prm/builtin-rubrics.ts","../../src/prm/inference.ts","../../src/prm/rubric.ts"],"sourcesContent":["/**\n * Built-in reference rubrics. Consumers combine these with domain\n * rubrics. All are deterministic, rule-based — cheap to run + easy\n * to unit-test. LLM-based rubrics are trivially authored by\n * following the StepRubric contract.\n */\n\nimport type { LlmSpan, ToolSpan } from '../trace/schema'\nimport type { StepRubric } from './rubric'\n\n/** Penalize very short or very long assistant outputs. */\nexport function outputLengthRubric(\n args: { minChars?: number; maxChars?: number; weight?: number } = {},\n): StepRubric {\n const min = args.minChars ?? 20\n const max = args.maxChars ?? 8000\n return {\n id: 'output-length',\n kinds: ['llm'],\n weight: args.weight ?? 0.5,\n async grade({ step }) {\n const llm = step.span as LlmSpan\n const len = (llm.output ?? '').length\n if (len === 0) return { score: 0, rationale: 'empty output' }\n if (len < min)\n return { score: Math.max(0, len / min), rationale: `below min (${len} < ${min})` }\n if (len > max)\n return {\n score: Math.max(0, 1 - (len - max) / max),\n rationale: `above max (${len} > ${max})`,\n }\n return { score: 1, rationale: `${len} chars in bounds` }\n },\n }\n}\n\n/** Reward tool calls that succeeded (status='ok') with an informative result. */\nexport function toolSuccessRubric(args: { weight?: number } = {}): StepRubric {\n return {\n id: 'tool-success',\n kinds: ['tool'],\n weight: args.weight ?? 1,\n async grade({ step }) {\n const tool = step.span as ToolSpan\n if (tool.status === 'error')\n return { score: 0, rationale: `error: ${tool.error ?? 'unknown'}` }\n const r = tool.result\n if (r === null || r === undefined) return { score: 0.3, rationale: 'empty result' }\n const asText = typeof r === 'string' ? 
r : JSON.stringify(r)\n if (asText.length < 4) return { score: 0.5, rationale: 'tiny result' }\n return { score: 1, rationale: `${tool.toolName} ok` }\n },\n }\n}\n\n/** Penalize tool calls that duplicate a prior call with identical args. */\nexport function toolNonRedundantRubric(args: { weight?: number } = {}): StepRubric {\n const weight = args.weight ?? 0.5\n return {\n id: 'tool-non-redundant',\n kinds: ['tool'],\n weight,\n async grade({ step, prior }) {\n const tool = step.span as ToolSpan\n const priorMatches = prior.filter((p) => {\n if (p.span.kind !== 'tool') return false\n const pt = p.span as ToolSpan\n return (\n pt.toolName === tool.toolName && stableStringify(pt.args) === stableStringify(tool.args)\n )\n })\n if (priorMatches.length === 0) return { score: 1, rationale: 'novel call' }\n return {\n score: Math.max(0, 1 - priorMatches.length * 0.5),\n rationale: `${priorMatches.length} duplicate(s)`,\n }\n },\n }\n}\n\n/** Penalize LLM outputs that contain common refusal markers when a refusal\n * is NOT expected (caller inverts weight for scenarios where refusal IS expected). */\nexport function nonRefusalRubric(args: { markers?: RegExp[]; weight?: number } = {}): StepRubric {\n const weight = args.weight ?? 1\n const markers = args.markers ?? [\n /\\bi\\s+(?:can(?:not|'t)|won't|will\\s+not)\\b/i,\n /\\b(?:as\\s+an?\\s+)?ai\\b.*?\\b(?:can't|cannot)\\b/i,\n ]\n return {\n id: 'non-refusal',\n kinds: ['llm'],\n weight,\n async grade({ step }) {\n const llm = step.span as LlmSpan\n const out = llm.output ?? ''\n const refused = markers.some((re) => re.test(out))\n return refused\n ? { score: 0, rationale: 'refusal marker present' }\n : { score: 1, rationale: 'no refusal' }\n },\n }\n}\n\n/** Reward outputs that invoke the next-step tool the trajectory actually uses\n * (i.e. the LLM span announced \"I will call X\" and the following tool span IS X). 
*/\nexport function toolIntentAlignmentRubric(args: { weight?: number } = {}): StepRubric {\n return {\n id: 'tool-intent-alignment',\n kinds: ['llm'],\n weight: args.weight ?? 0.5,\n async grade({ step, next }) {\n const llm = step.span as LlmSpan\n const nextTool = next.find((s) => s.span.kind === 'tool')\n if (!nextTool) return null\n const toolName = (nextTool.span as ToolSpan).toolName\n const out = (llm.output ?? '').toLowerCase()\n const mentioned = out.includes(toolName.toLowerCase())\n return mentioned\n ? { score: 1, rationale: `mentioned \"${toolName}\" before calling it` }\n : { score: 0.5, rationale: `called \"${toolName}\" without announcing it` }\n },\n }\n}\n\nfunction stableStringify(value: unknown): string {\n if (value === null || typeof value !== 'object') return JSON.stringify(value)\n if (Array.isArray(value)) return `[${value.map(stableStringify).join(',')}]`\n const keys = Object.keys(value as Record<string, unknown>).sort()\n return `{${keys.map((k) => `${JSON.stringify(k)}:${stableStringify((value as Record<string, unknown>)[k])}`).join(',')}}`\n}\n","/**\n * Inference-time PRM scoring — pick the best of N candidate trajectories\n * using a trained reward model (or a rule-based PRM as a proxy).\n *\n * The canonical Best-of-N pattern: generate N completions, score each\n * with a PRM, pick the winner. Here the scoring loop is framework-agnostic\n * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.\n */\n\nimport type { TraceStore } from '../trace/store'\nimport type { PrmGradedTrace, PrmGrader } from './rubric'\n\nexport interface BestOfNResult {\n winner: PrmGradedTrace\n ranked: PrmGradedTrace[]\n /** Standard deviation of aggregate scores — small = candidates were homogenous. 
*/\n stdDev: number\n}\n\nexport async function prmBestOfN(\n store: TraceStore,\n grader: PrmGrader,\n runIds: string[],\n): Promise<BestOfNResult> {\n if (runIds.length === 0) throw new Error('prmBestOfN: at least 1 candidate required')\n const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)))\n const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore)\n const mean = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length\n const variance = graded.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / graded.length\n return { winner: ranked[0]!, ranked, stdDev: Math.sqrt(variance) }\n}\n\n/**\n * Weighted vote across multiple graders — use when you want a PRM ensemble\n * (e.g. rule-based + LLM-based + trained model). Each grader produces its\n * own ranking; we aggregate via rank-sum (Borda count) so no single grader\n * dominates via a different score scale.\n */\nexport async function prmEnsembleBestOfN(\n store: TraceStore,\n graders: PrmGrader[],\n runIds: string[],\n): Promise<BestOfNResult> {\n if (graders.length === 0) throw new Error('prmEnsembleBestOfN: at least 1 grader')\n const perGrader = await Promise.all(\n graders.map(async (g) => {\n const graded = await Promise.all(runIds.map((id) => g.grade(store, id)))\n return graded.sort((a, b) => b.aggregateScore - a.aggregateScore)\n }),\n )\n // Borda: rank-sum across graders.\n const bordaScores = new Map<string, number>()\n for (const ranking of perGrader) {\n ranking.forEach((g, rank) => {\n bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank))\n })\n }\n // Return a synthesized ranking using the first grader's graded traces\n // ordered by Borda score. aggregateScore field kept for UX.\n const canonical = perGrader[0]!\n const byRun = new Map(canonical.map((g) => [g.runId, g]))\n const ranked = [...byRun.values()].sort(\n (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 
0),\n )\n const mean = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length\n const variance = ranked.reduce((a, g) => a + (g.aggregateScore - mean) ** 2, 0) / ranked.length\n return { winner: ranked[0]!, ranked, stdDev: Math.sqrt(variance) }\n}\n","/**\n * Process Reward Modeling — per-step rubric grading.\n *\n * A StepRubric inspects one span and returns a score + rationale.\n * PrmGrader applies an array of rubrics to every LLM span in a\n * trajectory (consumers can broaden to tool/retrieval spans via the\n * `kind` filter on each rubric).\n *\n * Why this matters: outcome-only eval (did the final artifact work?)\n * gives sparse reward — most agent turns are unattributable. PRMs\n * densify the signal so optimizers and RL fine-tuning can assign\n * credit per turn.\n */\n\nimport { TraceEmitter } from '../trace/emitter'\nimport type { JudgeSpan, Span } from '../trace/schema'\nimport type { TraceStore } from '../trace/store'\nimport { buildTrajectory, type Trajectory, type TrajectoryStep } from '../trajectory'\n\nexport interface StepContext {\n trajectory: Trajectory\n step: TrajectoryStep\n /** Steps preceding `step` in trajectory order. */\n prior: TrajectoryStep[]\n /** Steps following `step`. */\n next: TrajectoryStep[]\n}\n\nexport interface StepRubric {\n id: string\n /** Only grade spans of these kinds (default: all). */\n kinds?: Array<Span['kind']>\n /** Weight in the aggregate score. Default 1. */\n weight?: number\n /** Returns score in 0..1 + optional rationale/evidence. Return `null` to\n * skip grading (rubric doesn't apply to this step). */\n grade: (\n ctx: StepContext,\n ) => Promise<{ score: number; rationale?: string; evidence?: string } | null>\n}\n\nexport interface GradedStep {\n spanId: string\n rubricId: string\n score: number\n weight: number\n rationale?: string\n evidence?: string\n}\n\nexport interface PrmGradedTrace {\n runId: string\n steps: GradedStep[]\n /** Weighted mean of all graded steps; 0..1. 
*/\n aggregateScore: number\n /** Number of spans graded — useful for sanity-checking coverage. */\n gradedCount: number\n /** Number of spans in the trajectory that no rubric matched. */\n ungradedCount: number\n}\n\nexport class PrmGrader {\n constructor(private rubrics: StepRubric[]) {\n if (rubrics.length === 0) throw new Error('PrmGrader: at least 1 rubric required')\n }\n\n /**\n * Grade every eligible span in a run. Emits a JudgeVerdict span for each\n * (rubric × span) verdict so the result is visible to downstream pipelines\n * (judgeAgreementView, etc.) — PRM is just \"a judge that runs per span.\"\n */\n async grade(store: TraceStore, runId: string): Promise<PrmGradedTrace> {\n const trajectory = await buildTrajectory(store, runId)\n const emitter = new TraceEmitter(store, { runId })\n const steps: GradedStep[] = []\n let ungraded = 0\n for (let i = 0; i < trajectory.steps.length; i++) {\n const step = trajectory.steps[i]!\n const ctx: StepContext = {\n trajectory,\n step,\n prior: trajectory.steps.slice(0, i),\n next: trajectory.steps.slice(i + 1),\n }\n let gradedThis = false\n for (const rubric of this.rubrics) {\n if (rubric.kinds && !rubric.kinds.includes(step.span.kind)) continue\n const verdict = await rubric.grade(ctx)\n if (verdict === null) continue\n const weight = rubric.weight ?? 1\n steps.push({\n spanId: step.span.spanId,\n rubricId: rubric.id,\n score: verdict.score,\n weight,\n rationale: verdict.rationale,\n evidence: verdict.evidence,\n })\n gradedThis = true\n // Persist the verdict as a JudgeSpan so the query pipelines see it\n await emitter.recordJudge({\n judgeId: `prm:${rubric.id}`,\n targetSpanId: step.span.spanId,\n dimension: 'step_quality',\n score: verdict.score,\n rationale: verdict.rationale,\n evidence: verdict.evidence,\n name: `prm:${rubric.id}`,\n })\n }\n if (!gradedThis) ungraded++\n }\n\n const totalWeight = steps.reduce((a, s) => a + s.weight, 0)\n const aggregateScore =\n totalWeight === 0 ? 
0 : steps.reduce((a, s) => a + s.score * s.weight, 0) / totalWeight\n\n return { runId, steps, aggregateScore, gradedCount: steps.length, ungradedCount: ungraded }\n }\n}\n\n/** Helper: reads JudgeVerdict spans that PRM emitted so downstream pipelines\n * can distinguish PRM verdicts from human or top-level LLM judges. */\nexport function isPrmVerdict(verdict: JudgeSpan): boolean {\n return verdict.judgeId.startsWith('prm:')\n}\n"],"mappings":";;;;;;;;;;;;;;AAWO,SAAS,mBACd,OAAkE,CAAC,GACvD;AACZ,QAAM,MAAM,KAAK,YAAY;AAC7B,QAAM,MAAM,KAAK,YAAY;AAC7B,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,MAAM,KAAK;AACjB,YAAM,OAAO,IAAI,UAAU,IAAI;AAC/B,UAAI,QAAQ,EAAG,QAAO,EAAE,OAAO,GAAG,WAAW,eAAe;AAC5D,UAAI,MAAM;AACR,eAAO,EAAE,OAAO,KAAK,IAAI,GAAG,MAAM,GAAG,GAAG,WAAW,cAAc,GAAG,MAAM,GAAG,IAAI;AACnF,UAAI,MAAM;AACR,eAAO;AAAA,UACL,OAAO,KAAK,IAAI,GAAG,KAAK,MAAM,OAAO,GAAG;AAAA,UACxC,WAAW,cAAc,GAAG,MAAM,GAAG;AAAA,QACvC;AACF,aAAO,EAAE,OAAO,GAAG,WAAW,GAAG,GAAG,mBAAmB;AAAA,IACzD;AAAA,EACF;AACF;AAGO,SAAS,kBAAkB,OAA4B,CAAC,GAAe;AAC5E,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,MAAM;AAAA,IACd,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,OAAO,KAAK;AAClB,UAAI,KAAK,WAAW;AAClB,eAAO,EAAE,OAAO,GAAG,WAAW,UAAU,KAAK,SAAS,SAAS,GAAG;AACpE,YAAM,IAAI,KAAK;AACf,UAAI,MAAM,QAAQ,MAAM,OAAW,QAAO,EAAE,OAAO,KAAK,WAAW,eAAe;AAClF,YAAM,SAAS,OAAO,MAAM,WAAW,IAAI,KAAK,UAAU,CAAC;AAC3D,UAAI,OAAO,SAAS,EAAG,QAAO,EAAE,OAAO,KAAK,WAAW,cAAc;AACrE,aAAO,EAAE,OAAO,GAAG,WAAW,GAAG,KAAK,QAAQ,MAAM;AAAA,IACtD;AAAA,EACF;AACF;AAGO,SAAS,uBAAuB,OAA4B,CAAC,GAAe;AACjF,QAAM,SAAS,KAAK,UAAU;AAC9B,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,MAAM;AAAA,IACd;AAAA,IACA,MAAM,MAAM,EAAE,MAAM,MAAM,GAAG;AAC3B,YAAM,OAAO,KAAK;AAClB,YAAM,eAAe,MAAM,OAAO,CAAC,MAAM;AACvC,YAAI,EAAE,KAAK,SAAS,OAAQ,QAAO;AACnC,cAAM,KAAK,EAAE;AACb,eACE,GAAG,aAAa,KAAK,YAAY,gBAAgB,GAAG,IAAI,MAAM,gBAAgB,KAAK,IAAI;AAAA,MAE3F,CAAC;AACD,UAAI,aAAa,WAAW,EAAG,QAAO,EAAE,OAAO,GAAG,WAAW,aAAa;AAC1E,aAAO;AAAA,QACL,OAAO
,KAAK,IAAI,GAAG,IAAI,aAAa,SAAS,GAAG;AAAA,QAChD,WAAW,GAAG,aAAa,MAAM;AAAA,MACnC;AAAA,IACF;AAAA,EACF;AACF;AAIO,SAAS,iBAAiB,OAAgD,CAAC,GAAe;AAC/F,QAAM,SAAS,KAAK,UAAU;AAC9B,QAAM,UAAU,KAAK,WAAW;AAAA,IAC9B;AAAA,IACA;AAAA,EACF;AACA,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb;AAAA,IACA,MAAM,MAAM,EAAE,KAAK,GAAG;AACpB,YAAM,MAAM,KAAK;AACjB,YAAM,MAAM,IAAI,UAAU;AAC1B,YAAM,UAAU,QAAQ,KAAK,CAAC,OAAO,GAAG,KAAK,GAAG,CAAC;AACjD,aAAO,UACH,EAAE,OAAO,GAAG,WAAW,yBAAyB,IAChD,EAAE,OAAO,GAAG,WAAW,aAAa;AAAA,IAC1C;AAAA,EACF;AACF;AAIO,SAAS,0BAA0B,OAA4B,CAAC,GAAe;AACpF,SAAO;AAAA,IACL,IAAI;AAAA,IACJ,OAAO,CAAC,KAAK;AAAA,IACb,QAAQ,KAAK,UAAU;AAAA,IACvB,MAAM,MAAM,EAAE,MAAM,KAAK,GAAG;AAC1B,YAAM,MAAM,KAAK;AACjB,YAAM,WAAW,KAAK,KAAK,CAAC,MAAM,EAAE,KAAK,SAAS,MAAM;AACxD,UAAI,CAAC,SAAU,QAAO;AACtB,YAAM,WAAY,SAAS,KAAkB;AAC7C,YAAM,OAAO,IAAI,UAAU,IAAI,YAAY;AAC3C,YAAM,YAAY,IAAI,SAAS,SAAS,YAAY,CAAC;AACrD,aAAO,YACH,EAAE,OAAO,GAAG,WAAW,cAAc,QAAQ,sBAAsB,IACnE,EAAE,OAAO,KAAK,WAAW,WAAW,QAAQ,0BAA0B;AAAA,IAC5E;AAAA,EACF;AACF;AAEA,SAAS,gBAAgB,OAAwB;AAC/C,MAAI,UAAU,QAAQ,OAAO,UAAU,SAAU,QAAO,KAAK,UAAU,KAAK;AAC5E,MAAI,MAAM,QAAQ,KAAK,EAAG,QAAO,IAAI,MAAM,IAAI,eAAe,EAAE,KAAK,GAAG,CAAC;AACzE,QAAM,OAAO,OAAO,KAAK,KAAgC,EAAE,KAAK;AAChE,SAAO,IAAI,KAAK,IAAI,CAAC,MAAM,GAAG,KAAK,UAAU,CAAC,CAAC,IAAI,gBAAiB,MAAkC,CAAC,CAAC,CAAC,EAAE,EAAE,KAAK,GAAG,CAAC;AACxH;;;AC9GA,eAAsB,WACpB,OACA,QACA,QACwB;AACxB,MAAI,OAAO,WAAW,EAAG,OAAM,IAAI,MAAM,2CAA2C;AACpF,QAAM,SAAS,MAAM,QAAQ,IAAI,OAAO,IAAI,CAAC,OAAO,OAAO,MAAM,OAAO,EAAE,CAAC,CAAC;AAC5E,QAAM,SAAS,CAAC,GAAG,MAAM,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,iBAAiB,EAAE,cAAc;AAC7E,QAAM,OAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,gBAAgB,CAAC,IAAI,OAAO;AACvE,QAAM,WAAW,OAAO,OAAO,CAAC,GAAG,MAAM,KAAK,EAAE,iBAAiB,SAAS,GAAG,CAAC,IAAI,OAAO;AACzF,SAAO,EAAE,QAAQ,OAAO,CAAC,GAAI,QAAQ,QAAQ,KAAK,KAAK,QAAQ,EAAE;AACnE;AAQA,eAAsB,mBACpB,OACA,SACA,QACwB;AACxB,MAAI,QAAQ,WAAW,EAAG,OAAM,IAAI,MAAM,uCAAuC;AACjF,QAAM,YAAY,MAAM,QAAQ;AAAA,IAC9B,QAAQ,IAAI,OAAO,MAAM;AACvB,YAAM,SAAS,MAAM,QAAQ,IAAI,OAAO,IAAI,CAAC,OAAO,EAAE,MAAM,OAAO,EAAE,C
AAC,CAAC;AACvE,aAAO,OAAO,KAAK,CAAC,GAAG,MAAM,EAAE,iBAAiB,EAAE,cAAc;AAAA,IAClE,CAAC;AAAA,EACH;AAEA,QAAM,cAAc,oBAAI,IAAoB;AAC5C,aAAW,WAAW,WAAW;AAC/B,YAAQ,QAAQ,CAAC,GAAG,SAAS;AAC3B,kBAAY,IAAI,EAAE,QAAQ,YAAY,IAAI,EAAE,KAAK,KAAK,MAAM,QAAQ,SAAS,KAAK;AAAA,IACpF,CAAC;AAAA,EACH;AAGA,QAAM,YAAY,UAAU,CAAC;AAC7B,QAAM,QAAQ,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC;AACxD,QAAM,SAAS,CAAC,GAAG,MAAM,OAAO,CAAC,EAAE;AAAA,IACjC,CAAC,GAAG,OAAO,YAAY,IAAI,EAAE,KAAK,KAAK,MAAM,YAAY,IAAI,EAAE,KAAK,KAAK;AAAA,EAC3E;AACA,QAAM,OAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,gBAAgB,CAAC,IAAI,OAAO;AACvE,QAAM,WAAW,OAAO,OAAO,CAAC,GAAG,MAAM,KAAK,EAAE,iBAAiB,SAAS,GAAG,CAAC,IAAI,OAAO;AACzF,SAAO,EAAE,QAAQ,OAAO,CAAC,GAAI,QAAQ,QAAQ,KAAK,KAAK,QAAQ,EAAE;AACnE;;;ACNO,IAAM,YAAN,MAAgB;AAAA,EACrB,YAAoB,SAAuB;AAAvB;AAClB,QAAI,QAAQ,WAAW,EAAG,OAAM,IAAI,MAAM,uCAAuC;AAAA,EACnF;AAAA,EAFoB;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA,EASpB,MAAM,MAAM,OAAmB,OAAwC;AACrE,UAAM,aAAa,MAAM,gBAAgB,OAAO,KAAK;AACrD,UAAM,UAAU,IAAI,aAAa,OAAO,EAAE,MAAM,CAAC;AACjD,UAAM,QAAsB,CAAC;AAC7B,QAAI,WAAW;AACf,aAAS,IAAI,GAAG,IAAI,WAAW,MAAM,QAAQ,KAAK;AAChD,YAAM,OAAO,WAAW,MAAM,CAAC;AAC/B,YAAM,MAAmB;AAAA,QACvB;AAAA,QACA;AAAA,QACA,OAAO,WAAW,MAAM,MAAM,GAAG,CAAC;AAAA,QAClC,MAAM,WAAW,MAAM,MAAM,IAAI,CAAC;AAAA,MACpC;AACA,UAAI,aAAa;AACjB,iBAAW,UAAU,KAAK,SAAS;AACjC,YAAI,OAAO,SAAS,CAAC,OAAO,MAAM,SAAS,KAAK,KAAK,IAAI,EAAG;AAC5D,cAAM,UAAU,MAAM,OAAO,MAAM,GAAG;AACtC,YAAI,YAAY,KAAM;AACtB,cAAM,SAAS,OAAO,UAAU;AAChC,cAAM,KAAK;AAAA,UACT,QAAQ,KAAK,KAAK;AAAA,UAClB,UAAU,OAAO;AAAA,UACjB,OAAO,QAAQ;AAAA,UACf;AAAA,UACA,WAAW,QAAQ;AAAA,UACnB,UAAU,QAAQ;AAAA,QACpB,CAAC;AACD,qBAAa;AAEb,cAAM,QAAQ,YAAY;AAAA,UACxB,SAAS,OAAO,OAAO,EAAE;AAAA,UACzB,cAAc,KAAK,KAAK;AAAA,UACxB,WAAW;AAAA,UACX,OAAO,QAAQ;AAAA,UACf,WAAW,QAAQ;AAAA,UACnB,UAAU,QAAQ;AAAA,UAClB,MAAM,OAAO,OAAO,EAAE;AAAA,QACxB,CAAC;AAAA,MACH;AACA,UAAI,CAAC,WAAY;AAAA,IACnB;AAEA,UAAM,cAAc,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,QAAQ,CAAC;AAC1D,UAAM,iBACJ,gBAAgB,IAAI,IAAI,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,QAAQ,EAAE,QAAQ,CAAC,IAAI;AAE9E,WAAO,EA
AE,OAAO,OAAO,gBAAgB,aAAa,MAAM,QAAQ,eAAe,SAAS;AAAA,EAC5F;AACF;AAIO,SAAS,aAAa,SAA6B;AACxD,SAAO,QAAQ,QAAQ,WAAW,MAAM;AAC1C;","names":[]}
@@ -0,0 +1,30 @@
1
+ import { L as LlmSpan, T as TraceStore, J as JudgeSpan, R as Run, F as FailureClass, a as ToolSpan } from './store-Db2Bv8Cf.js';
2
+
3
+ /**
4
+ * Typed query helpers over TraceStore.
5
+ *
6
+ * Not a full SQL engine — a minimal, composable set of operators that
7
+ * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
8
+ * NDJSON and point DuckDB at it; the schema is stable so external SQL
9
+ * tooling works out of the box.
10
+ */
11
+
12
+ declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
13
+ declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
14
+ declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
15
+ declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
16
+ /** Group spans by any key selector. */
17
+ declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
18
+ /** Hash tool arguments to a string that is stable regardless of object key order, for de-duplication. */
19
+ declare function argHash(args: unknown): string;
20
+ /** Sum an LLM-span array into aggregate token + cost. */
21
+ declare function aggregateLlm(spans: LlmSpan[]): {
22
+ inputTokens: number;
23
+ outputTokens: number;
24
+ cachedTokens: number;
25
+ costUsd: number;
26
+ };
27
+ /** Pick the outcome's failure class when present, else derive 'success' from run status. */
28
+ declare function runFailureClass(run: Run): FailureClass;
29
+
30
+ export { aggregateLlm as a, argHash as b, runsForScenario as c, groupBy as g, judgeSpans as j, llmSpans as l, runFailureClass as r, toolSpans as t };
@@ -0,0 +1,292 @@
1
+ import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-BXGs_9V0.js';
3
+ import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
4
+
5
+ /**
6
+ * Release confidence gate.
7
+ *
8
+ * This is the production-facing composition layer over the lower-level
9
+ * primitives:
10
+ * - Dataset manifests prove corpus/version coverage.
11
+ * - RunRecord rows prove reproducible search/holdout outcomes.
12
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
13
+ * - HeldOutGate decisions remain the paired promotion authority.
14
+ *
15
+ * The gate is intentionally pure and conservative. Missing declared evidence
16
+ * fails closed instead of being treated as a neutral zero.
17
+ */
18
+
19
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
20
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
21
+ interface ReleaseTraceEvidence {
22
+ scenarioId: string;
23
+ candidateId?: string;
24
+ split?: RunSplitTag;
25
+ score?: number;
26
+ ok?: boolean;
27
+ turnCount?: number;
28
+ costUsd?: number;
29
+ durationMs?: number;
30
+ failureMode?: string;
31
+ asi?: ActionableSideInfo[];
32
+ metadata?: Record<string, unknown>;
33
+ }
34
+ interface ReleaseConfidenceThresholds {
35
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
36
+ requireCorpus?: boolean;
37
+ minScenarioCount?: number;
38
+ minSearchRuns?: number;
39
+ minHoldoutRuns?: number;
40
+ /** Require at least one holdout scenario/run. Default true. */
41
+ requireHoldout?: boolean;
42
+ minPassRate?: number;
43
+ minMeanScore?: number;
44
+ /** Search mean may exceed holdout mean by at most this much. */
45
+ maxOverfitGap?: number;
46
+ maxMeanCostUsd?: number;
47
+ maxP95WallMs?: number;
48
+ /** Low-score/failed rows must carry ASI. Default true. */
49
+ requireAsiForFailures?: boolean;
50
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
51
+ failureScoreThreshold?: number;
52
+ }
53
+ interface ReleaseConfidenceInput {
54
+ target: string;
55
+ candidateId?: string;
56
+ baselineId?: string;
57
+ dataset?: DatasetManifest;
58
+ scenarios?: readonly DatasetScenario[];
59
+ runs?: readonly RunRecord[];
60
+ traces?: readonly ReleaseTraceEvidence[];
61
+ gateDecision?: GateDecision | null;
62
+ thresholds?: ReleaseConfidenceThresholds;
63
+ }
64
+ interface ReleaseConfidenceAxis {
65
+ name: ReleaseConfidenceAxisName;
66
+ status: ReleaseConfidenceStatus;
67
+ score: number;
68
+ detail: string;
69
+ }
70
+ interface ReleaseConfidenceIssue {
71
+ axis: ReleaseConfidenceAxisName;
72
+ severity: 'critical' | 'warning';
73
+ code: string;
74
+ detail: string;
75
+ }
76
+ interface ReleaseConfidenceMetrics {
77
+ scenarioCount: number;
78
+ searchRuns: number;
79
+ holdoutRuns: number;
80
+ passRate: number;
81
+ meanScore: number;
82
+ searchMeanScore: number;
83
+ holdoutMeanScore: number;
84
+ overfitGap: number;
85
+ meanCostUsd: number;
86
+ p95WallMs: number;
87
+ failedRows: number;
88
+ failuresWithAsi: number;
89
+ singleShotTraces: number;
90
+ multiShotTraces: number;
91
+ splitCounts: Record<DatasetSplit, number>;
92
+ domainCounts: Record<string, number>;
93
+ failureModeCounts: Record<string, number>;
94
+ responsibleSurfaceCounts: Record<string, number>;
95
+ }
96
+ interface ReleaseConfidenceScorecard {
97
+ target: string;
98
+ candidateId: string | null;
99
+ baselineId: string | null;
100
+ status: ReleaseConfidenceStatus;
101
+ promote: boolean;
102
+ axes: ReleaseConfidenceAxis[];
103
+ issues: ReleaseConfidenceIssue[];
104
+ metrics: ReleaseConfidenceMetrics;
105
+ dataset: DatasetManifest | null;
106
+ gateDecision: GateDecision | null;
107
+ summary: string;
108
+ }
109
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
110
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
111
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
112
+
113
+ /**
114
+ * Paper-grade paired statistics for held-out promotion gates.
115
+ *
116
+ * The promotion gate (`HeldOutGate`) needs three things:
117
+ *
118
+ * 1. A bootstrap confidence interval on the per-item paired delta
119
+ * (`pairedBootstrap`). Median delta is the headline number; the
120
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
121
+ * 2. A non-parametric significance test on the paired deltas
122
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
123
+ * paper-style name).
124
+ * 3. False-discovery-rate correction across simultaneously-tested
125
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
126
+ *
127
+ * Why a separate file: every existing primitive lives in `statistics.ts`
128
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
129
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
130
+ * Putting it next to `statistics.ts` would require editing that file;
131
+ * the brief forbids that. New file, new exports, no surface change.
132
+ */
133
+ interface PairedBootstrapResult {
134
+ /** Number of paired observations (inputs of unequal length are rejected, not truncated). */
135
+ n: number;
136
+ /** Median of paired deltas (after − before). */
137
+ median: number;
138
+ /** Mean of paired deltas. */
139
+ mean: number;
140
+ /** Lower bound of the bootstrap CI on the chosen statistic of the paired deltas (median by default). */
141
+ low: number;
142
+ /** Upper bound of the bootstrap CI on the chosen statistic of the paired deltas (median by default). */
143
+ high: number;
144
+ /** Confidence level used (e.g. 0.95). */
145
+ confidence: number;
146
+ /** Number of bootstrap resamples used. */
147
+ resamples: number;
148
+ }
149
+ interface PairedBootstrapOptions {
150
+ /** Confidence level. Default 0.95. */
151
+ confidence?: number;
152
+ /** Bootstrap resample count. Default 2000. */
153
+ resamples?: number;
154
+ /** Statistic to bootstrap. Default 'median'. */
155
+ statistic?: 'median' | 'mean';
156
+ /** Deterministic seed. If omitted, uses Math.random(). */
157
+ seed?: number;
158
+ }
159
+ /**
160
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
161
+ * chosen statistic (median by default). Pairs are resampled with
162
+ * replacement. The lower bound is what the promotion gate checks: if
163
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
164
+ * confidence level.
165
+ *
166
+ * Throws on unequal sample sizes — caller must align pairs upstream.
167
+ */
168
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
169
+ /**
170
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
171
+ * paired deltas is the standard non-parametric significance test for
172
+ * "candidate beats baseline on matched items." Use alongside the
173
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
174
+ */
175
+ declare function pairedWilcoxon(before: number[], after: number[]): {
176
+ w: number;
177
+ p: number;
178
+ };
179
+ /**
180
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
181
+ * across multiple candidate-vs-baseline comparisons run in the same
182
+ * promotion sweep. Returns BH-adjusted q-values and significance at
183
+ * the requested FDR (default 0.05).
184
+ */
185
+ declare function bhAdjust(pValues: number[], fdr?: number): {
186
+ qValues: number[];
187
+ significant: boolean[];
188
+ };
189
+
190
+ /**
191
+ * Bootstrap-CI promotion gate.
192
+ *
193
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
194
+ * curation), the question is "did this generation actually improve, or are
195
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
196
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
197
+ * delta is real before code or prompts get promoted.
198
+ *
199
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
200
+ * and to compose into any verdict gate.
201
+ *
202
+ * Default gate:
203
+ * - Bootstrap mean baseline vs candidate (1k resamples).
204
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
205
+ * - Tunable confidence (default 95%) and resample count.
206
+ *
207
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
208
+ * vocabulary:
209
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
210
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
211
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
212
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
213
+ */
214
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
215
+ interface BootstrapResult {
216
+ baselineMean: number;
217
+ candidateMean: number;
218
+ /** candidateMean - baselineMean, point estimate. */
219
+ delta: number;
220
+ /** Lower bound of the (1 - alpha) CI on the delta. */
221
+ ciLower: number;
222
+ /** Upper bound of the (1 - alpha) CI on the delta. */
223
+ ciUpper: number;
224
+ /** Number of bootstrap resamples used. */
225
+ iterations: number;
226
+ alpha: number;
227
+ verdict: Verdict;
228
+ }
229
+ interface BootstrapOptions {
230
+ /** Significance level alpha; the CI covers 1 - alpha (default 0.05 → 95% CI). */
231
+ alpha?: number;
232
+ /** Number of resamples (default 1000). */
233
+ iterations?: number;
234
+ /**
235
+ * Minimum total samples (baseline + candidate) below which we always
236
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
237
+ * Default 6 (combined).
238
+ */
239
+ minTotalSamples?: number;
240
+ /** RNG seed for reproducibility. Default: Math.random. */
241
+ seed?: number;
242
+ }
243
+ /**
244
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
245
+ *
246
+ * Uses simple percentile bootstrap on the difference of resampled means.
247
+ * That's the standard non-parametric primitive — no distributional
248
+ * assumptions, robust to skew, easy to reason about.
249
+ */
250
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
251
+ /**
252
+ * Judge-replay promotion gate.
253
+ *
254
+ * The cheap inner-loop judge that drives an evolution run is by definition
255
+ * fast and noisy. When you're about to promote a winning variant to the
256
+ * canonical default, you want a STRONGER judge (a more expensive model, a
257
+ * human grader, a separately-trained reward model) to confirm the win
258
+ * generalises beyond the inner loop.
259
+ *
260
+ * This helper takes raw winner + baseline outputs, scores both through the
261
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
262
+ * judge agrees the winner is real with the configured confidence. Doesn't
263
+ * matter what shape your "output" is — pass a string, an object, anything
264
+ * the judge can read.
265
+ */
266
+ interface JudgeReplayGateArgs<TOutput> {
267
+ baselineOutputs: TOutput[];
268
+ candidateOutputs: TOutput[];
269
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
270
+ judge: (output: TOutput) => Promise<number> | number;
271
+ alpha?: number;
272
+ iterations?: number;
273
+ /** RNG seed for reproducibility. */
274
+ seed?: number;
275
+ /** Maximum concurrent judge calls. Default 4. */
276
+ judgeConcurrency?: number;
277
+ }
278
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
279
+ baselineSamples: number;
280
+ candidateSamples: number;
281
+ }>;
282
+
283
+ interface RenderReleaseReportOptions {
284
+ title?: string;
285
+ runs?: readonly RunRecord[];
286
+ comparator?: string;
287
+ traceAnalystFindings?: readonly string[];
288
+ nextActions?: readonly string[];
289
+ }
290
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
291
+
292
+ export { type BootstrapOptions as B, type JudgeReplayGateArgs as J, type PairedBootstrapOptions as P, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, bhAdjust as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, pairedWilcoxon as r, releaseTraceEvidenceFromMultiShotTrials as s, renderReleaseReport as t };