@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,292 @@
1
+ import { D as DatasetSplit, b as DatasetManifest, a as DatasetScenario } from './dataset-CiK_3LDr.js';
2
+ import { a3 as GateDecision, A as ActionableSideInfo, m as MultiShotTrialResult } from './summary-report-C7VPYEj2.js';
3
+ import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
4
+
5
+ /**
6
+ * Release confidence gate.
7
+ *
8
+ * This is the production-facing composition layer over the lower-level
9
+ * primitives:
10
+ * - Dataset manifests prove corpus/version coverage.
11
+ * - RunRecord rows prove reproducible search/holdout outcomes.
12
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
13
+ * - HeldOutGate decisions remain the paired promotion authority.
14
+ *
15
+ * The gate is intentionally pure and conservative. Missing declared evidence
16
+ * fails closed instead of being treated as a neutral zero.
17
+ */
18
+
19
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
20
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
21
+ interface ReleaseTraceEvidence {
22
+ scenarioId: string;
23
+ candidateId?: string;
24
+ split?: RunSplitTag;
25
+ score?: number;
26
+ ok?: boolean;
27
+ turnCount?: number;
28
+ costUsd?: number;
29
+ durationMs?: number;
30
+ failureMode?: string;
31
+ asi?: ActionableSideInfo[];
32
+ metadata?: Record<string, unknown>;
33
+ }
34
+ interface ReleaseConfidenceThresholds {
35
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
36
+ requireCorpus?: boolean;
37
+ minScenarioCount?: number;
38
+ minSearchRuns?: number;
39
+ minHoldoutRuns?: number;
40
+ /** Require at least one holdout scenario/run. Default true. */
41
+ requireHoldout?: boolean;
42
+ minPassRate?: number;
43
+ minMeanScore?: number;
44
+ /** Search mean may exceed holdout mean by at most this much. */
45
+ maxOverfitGap?: number;
46
+ maxMeanCostUsd?: number;
47
+ maxP95WallMs?: number;
48
+ /** Low-score/failed rows must carry ASI. Default true. */
49
+ requireAsiForFailures?: boolean;
50
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
51
+ failureScoreThreshold?: number;
52
+ }
53
+ interface ReleaseConfidenceInput {
54
+ target: string;
55
+ candidateId?: string;
56
+ baselineId?: string;
57
+ dataset?: DatasetManifest;
58
+ scenarios?: readonly DatasetScenario[];
59
+ runs?: readonly RunRecord[];
60
+ traces?: readonly ReleaseTraceEvidence[];
61
+ gateDecision?: GateDecision | null;
62
+ thresholds?: ReleaseConfidenceThresholds;
63
+ }
64
+ interface ReleaseConfidenceAxis {
65
+ name: ReleaseConfidenceAxisName;
66
+ status: ReleaseConfidenceStatus;
67
+ score: number;
68
+ detail: string;
69
+ }
70
+ interface ReleaseConfidenceIssue {
71
+ axis: ReleaseConfidenceAxisName;
72
+ severity: 'critical' | 'warning';
73
+ code: string;
74
+ detail: string;
75
+ }
76
+ interface ReleaseConfidenceMetrics {
77
+ scenarioCount: number;
78
+ searchRuns: number;
79
+ holdoutRuns: number;
80
+ passRate: number;
81
+ meanScore: number;
82
+ searchMeanScore: number;
83
+ holdoutMeanScore: number;
84
+ overfitGap: number;
85
+ meanCostUsd: number;
86
+ p95WallMs: number;
87
+ failedRows: number;
88
+ failuresWithAsi: number;
89
+ singleShotTraces: number;
90
+ multiShotTraces: number;
91
+ splitCounts: Record<DatasetSplit, number>;
92
+ domainCounts: Record<string, number>;
93
+ failureModeCounts: Record<string, number>;
94
+ responsibleSurfaceCounts: Record<string, number>;
95
+ }
96
+ interface ReleaseConfidenceScorecard {
97
+ target: string;
98
+ candidateId: string | null;
99
+ baselineId: string | null;
100
+ status: ReleaseConfidenceStatus;
101
+ promote: boolean;
102
+ axes: ReleaseConfidenceAxis[];
103
+ issues: ReleaseConfidenceIssue[];
104
+ metrics: ReleaseConfidenceMetrics;
105
+ dataset: DatasetManifest | null;
106
+ gateDecision: GateDecision | null;
107
+ summary: string;
108
+ }
109
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
110
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
111
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
112
+
113
+ /**
114
+ * Paper-grade paired statistics for held-out promotion gates.
115
+ *
116
+ * The promotion gate (`HeldOutGate`) needs three things:
117
+ *
118
+ * 1. A bootstrap confidence interval on the per-item paired delta
119
+ * (`pairedBootstrap`). Median delta is the headline number; the
120
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
121
+ * 2. A non-parametric significance test on the paired deltas
122
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
123
+ * paper-style name).
124
+ * 3. False-discovery-rate correction across simultaneously-tested
125
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
126
+ *
127
+ * Why a separate file: every existing primitive lives in `statistics.ts`
128
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
129
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
130
+ * Putting it next to `statistics.ts` would require editing that file;
131
+ * the brief forbids that. New file, new exports, no surface change.
132
+ */
133
+ interface PairedBootstrapResult {
134
+ /** Number of paired observations (inputs of unequal length are rejected). */
135
+ n: number;
136
+ /** Median of paired deltas (after − before). */
137
+ median: number;
138
+ /** Mean of paired deltas. */
139
+ mean: number;
140
+ /** Lower bound of the bootstrap CI on the chosen statistic (median delta by default). */
141
+ low: number;
142
+ /** Upper bound of the bootstrap CI on the chosen statistic (median delta by default). */
143
+ high: number;
144
+ /** Confidence level used (e.g. 0.95). */
145
+ confidence: number;
146
+ /** Number of bootstrap resamples used. */
147
+ resamples: number;
148
+ }
149
+ interface PairedBootstrapOptions {
150
+ /** Confidence level. Default 0.95. */
151
+ confidence?: number;
152
+ /** Bootstrap resample count. Default 2000. */
153
+ resamples?: number;
154
+ /** Statistic to bootstrap. Default 'median'. */
155
+ statistic?: 'median' | 'mean';
156
+ /** Deterministic seed. If omitted, uses Math.random(). */
157
+ seed?: number;
158
+ }
159
+ /**
160
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
161
+ * chosen statistic (median by default). Pairs are resampled with
162
+ * replacement. The lower bound is what the promotion gate checks: if
163
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
164
+ * confidence level.
165
+ *
166
+ * Throws on unequal sample sizes — caller must align pairs upstream.
167
+ */
168
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
169
+ /**
170
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
171
+ * paired deltas is the standard non-parametric significance test for
172
+ * "candidate beats baseline on matched items." Use alongside the
173
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
174
+ */
175
+ declare function pairedWilcoxon(before: number[], after: number[]): {
176
+ w: number;
177
+ p: number;
178
+ };
179
+ /**
180
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
181
+ * across multiple candidate-vs-baseline comparisons run in the same
182
+ * promotion sweep. Returns BH-adjusted q-values and significance at
183
+ * the requested FDR (default 0.05).
184
+ */
185
+ declare function bhAdjust(pValues: number[], fdr?: number): {
186
+ qValues: number[];
187
+ significant: boolean[];
188
+ };
189
+
190
+ /**
191
+ * Bootstrap-CI promotion gate.
192
+ *
193
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
194
+ * curation), the question is "did this generation actually improve, or are
195
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
196
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
197
+ * delta is real before code or prompts get promoted.
198
+ *
199
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
200
+ * and to compose into any verdict gate.
201
+ *
202
+ * Default gate:
203
+ * - Bootstrap mean baseline vs candidate (1k resamples).
204
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
205
+ * - Tunable confidence (default 95%) and resample count.
206
+ *
207
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
208
+ * vocabulary:
209
+ * - ADVANCE: candidate's CI lower bound > baseline mean (real win)
210
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
211
+ * - REVERT: candidate's CI upper bound < baseline mean (real regression)
212
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
213
+ */
214
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
215
+ interface BootstrapResult {
216
+ baselineMean: number;
217
+ candidateMean: number;
218
+ /** candidateMean - baselineMean, point estimate. */
219
+ delta: number;
220
+ /** Lower bound of the (1 - alpha) CI on the delta. */
221
+ ciLower: number;
222
+ /** Upper bound of the (1 - alpha) CI on the delta. */
223
+ ciUpper: number;
224
+ /** Number of bootstrap resamples used. */
225
+ iterations: number;
226
+ alpha: number;
227
+ verdict: Verdict;
228
+ }
229
+ interface BootstrapOptions {
230
+ /** Significance level alpha (default 0.05 → 95% CI). */
231
+ alpha?: number;
232
+ /** Number of resamples (default 1000). */
233
+ iterations?: number;
234
+ /**
235
+ * Minimum total samples (baseline + candidate) below which we always
236
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
237
+ * Default 6 (combined).
238
+ */
239
+ minTotalSamples?: number;
240
+ /** RNG seed for reproducibility. If omitted, uses Math.random. */
241
+ seed?: number;
242
+ }
243
+ /**
244
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
245
+ *
246
+ * Uses simple percentile bootstrap on the difference of resampled means.
247
+ * That's the standard non-parametric primitive — no distributional
248
+ * assumptions, robust to skew, easy to reason about.
249
+ */
250
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
251
+ /**
252
+ * Judge-replay promotion gate.
253
+ *
254
+ * The cheap inner-loop judge that drives an evolution run is by definition
255
+ * fast and noisy. When you're about to promote a winning variant to the
256
+ * canonical default, you want a STRONGER judge (a more expensive model, a
257
+ * human grader, a separately-trained reward model) to confirm the win
258
+ * generalises beyond the inner loop.
259
+ *
260
+ * This helper takes raw winner + baseline outputs, scores both through the
261
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
262
+ * judge agrees the winner is real with the configured confidence. Doesn't
263
+ * matter what shape your "output" is — pass a string, an object, anything
264
+ * the judge can read.
265
+ */
266
+ interface JudgeReplayGateArgs<TOutput> {
267
+ baselineOutputs: TOutput[];
268
+ candidateOutputs: TOutput[];
269
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
270
+ judge: (output: TOutput) => Promise<number> | number;
271
+ alpha?: number;
272
+ iterations?: number;
273
+ /** RNG seed for reproducibility. */
274
+ seed?: number;
275
+ /** Maximum concurrent judge calls. Default 4. */
276
+ judgeConcurrency?: number;
277
+ }
278
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
279
+ baselineSamples: number;
280
+ candidateSamples: number;
281
+ }>;
282
+
283
+ interface RenderReleaseReportOptions {
284
+ title?: string;
285
+ runs?: readonly RunRecord[];
286
+ comparator?: string;
287
+ traceAnalystFindings?: readonly string[];
288
+ nextActions?: readonly string[];
289
+ }
290
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
291
+
292
+ export { type BootstrapOptions as B, type JudgeReplayGateArgs as J, type PairedBootstrapOptions as P, type ReleaseConfidenceAxis as R, type Verdict as V, type BootstrapResult as a, type PairedBootstrapResult as b, type ReleaseConfidenceAxisName as c, type ReleaseConfidenceInput as d, type ReleaseConfidenceIssue as e, type ReleaseConfidenceMetrics as f, type ReleaseConfidenceScorecard as g, type ReleaseConfidenceStatus as h, type ReleaseConfidenceThresholds as i, type ReleaseTraceEvidence as j, type RenderReleaseReportOptions as k, assertReleaseConfidence as l, bhAdjust as m, bootstrapCi as n, evaluateReleaseConfidence as o, judgeReplayGate as p, pairedBootstrap as q, pairedWilcoxon as r, releaseTraceEvidenceFromMultiShotTrials as s, renderReleaseReport as t };
@@ -0,0 +1,226 @@
1
+ import { T as TraceStore } from './store-Db2Bv8Cf.js';
2
+ import { R as ReplayError } from './errors-BZ9sTdz7.js';
3
+ import { R as RawProviderSink, c as RawProviderEvent } from './integrity-DK2EBVZC.js';
4
+
5
+ /**
6
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
7
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
8
+ *
9
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
10
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
11
+ * push the JSON to their collector of choice via HTTP.
12
+ *
13
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
14
+ */
15
+
16
+ declare const OTEL_AGENT_EVAL_SCOPE: {
17
+ name: string;
18
+ version: string;
19
+ };
20
+ interface OtlpSpan {
21
+ traceId: string;
22
+ spanId: string;
23
+ parentSpanId?: string;
24
+ name: string;
25
+ kind: number;
26
+ startTimeUnixNano: string;
27
+ endTimeUnixNano: string;
28
+ attributes: Array<{
29
+ key: string;
30
+ value: {
31
+ stringValue?: string;
32
+ intValue?: string;
33
+ doubleValue?: number;
34
+ boolValue?: boolean;
35
+ };
36
+ }>;
37
+ events?: Array<{
38
+ timeUnixNano: string;
39
+ name: string;
40
+ attributes?: OtlpSpan['attributes'];
41
+ }>;
42
+ status?: {
43
+ code: number;
44
+ message?: string;
45
+ };
46
+ }
47
+ interface OtlpResourceSpans {
48
+ resource: {
49
+ attributes: OtlpSpan['attributes'];
50
+ };
51
+ scopeSpans: Array<{
52
+ scope: typeof OTEL_AGENT_EVAL_SCOPE;
53
+ spans: OtlpSpan[];
54
+ }>;
55
+ }
56
+ interface OtlpExport {
57
+ resourceSpans: OtlpResourceSpans[];
58
+ }
59
+ /** Export a single run's spans + events in OTLP/JSON. */
60
+ declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
61
+
62
+ /**
63
+ * Redaction — remove PII / secrets from trace payloads before persist.
64
+ *
65
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
66
+ * Unredacted variants (for debugging / post-mortems) live in a separate
67
+ * storage layer with stricter access controls; this module only covers
68
+ * the default scrub-then-persist path.
69
+ *
70
+ * Rules compose: pass an array of `RedactionRule`s; each is applied in
71
+ * order. Strings that match get replaced with a tagged sentinel so the
72
+ * eval framework can count how many redactions happened per run
73
+ * (surfaced via `redaction_applied` events).
74
+ */
75
+ interface RedactionRule {
76
+ id: string;
77
+ pattern: RegExp;
78
+ /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
79
+ replacement?: string;
80
+ }
81
+ interface RedactionReport {
82
+ redactionCount: number;
83
+ byRule: Record<string, number>;
84
+ }
85
+ /** OWASP / common-sense defaults — extend per-domain. */
86
+ declare const DEFAULT_REDACTION_RULES: RedactionRule[];
87
+ declare const REDACTION_VERSION = "1.0.0";
88
+ /**
89
+ * Redact a single string. Returns the new string and a per-rule count of
90
+ * how many substitutions fired.
91
+ */
92
+ declare function redactString(input: string, rules?: RedactionRule[]): {
93
+ output: string;
94
+ report: RedactionReport;
95
+ };
96
+ /**
97
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
98
+ * Arrays and plain objects are recursed; other types pass through
99
+ * untouched. Circular references throw — traces should be tree-shaped.
100
+ */
101
+ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
102
+ value: unknown;
103
+ report: RedactionReport;
104
+ };
105
+
106
+ /**
107
+ * Replay-from-raw-events — turn every captured campaign run into a
108
+ * re-runnable artifact.
109
+ *
110
+ * The premise: 0.21 made `RawProviderSink` capture every provider HTTP
111
+ * envelope. 0.22's `runEvalCampaign` makes capture the default. Together
112
+ * they mean every past run is a complete fingerprint of what happened on
113
+ * the wire — and that fingerprint is enough to replay the run without
114
+ * burning new LLM cost.
115
+ *
116
+ * Three use cases this primitive enables:
117
+ *
118
+ * 1. **Post-hoc judging** — apply a new judge / rubric / scoring callback
119
+ * to last week's runs without re-calling any LLM. The cost of trying
120
+ * a new rubric drops from "another full sweep" to a CPU-bound replay.
121
+ * 2. **Determinism audits** — replay the same campaign and verify the
122
+ * raw responses match byte-for-byte. Any drift is a non-determinism
123
+ * bug (in the harness, the prompt builder, the sandbox, …).
124
+ * 3. **Free judge calibration** — run two judges on identical responses
125
+ * and measure inter-judge agreement without doubling LLM spend.
126
+ *
127
+ * The interface is deliberately fetch-shaped. Inject the result of `createReplayFetch`
128
+ * into `LlmClientOptions.fetch` and every `callLlm` transparently reads
129
+ * from the cache instead of calling the network. No new code path through
130
+ * the LLM client is needed; the cache hit is invisible to the runner.
131
+ */
132
+
133
+ declare class ReplayCacheMissError extends ReplayError {
134
+ readonly url: string;
135
+ readonly requestKey: string;
136
+ constructor(url: string, requestKey: string, message?: string);
137
+ }
138
+ interface ReplayCacheEntry {
139
+ request: RawProviderEvent;
140
+ response: RawProviderEvent;
141
+ }
142
+ interface ReplayCacheStats {
143
+ total: number;
144
+ byProvider: Record<string, number>;
145
+ byModel: Record<string, number>;
146
+ /** Spans for which we have a request but no response (run aborted mid-call). */
147
+ orphanRequests: number;
148
+ }
149
+ /**
150
+ * In-memory deterministic cache of (request → response) keyed on a stable
151
+ * hash of the request body. Built from a `RawProviderSink` containing
152
+ * paired `request` and `response` events from a previous run.
153
+ *
154
+ * The cache is the source of truth for replay; `createReplayFetch` is a
155
+ * thin wrapper that reads from it.
156
+ */
157
+ declare class ReplayCache {
158
+ private byKey;
159
+ private orphans;
160
+ private byProvider;
161
+ private byModel;
162
+ /**
163
+ * Build a cache from a sink's events. The sink must implement `list()`.
164
+ * Filter by `runId` / `spanId` to scope to a specific replay.
165
+ */
166
+ static fromSink(sink: RawProviderSink, filter?: {
167
+ runId?: string;
168
+ spanId?: string;
169
+ }): Promise<ReplayCache>;
170
+ /** Build a cache from an in-memory event list. */
171
+ static fromEvents(events: RawProviderEvent[]): Promise<ReplayCache>;
172
+ /** Number of cacheable (request, response) pairs in the cache. */
173
+ size(): number;
174
+ stats(): ReplayCacheStats;
175
+ /** Iterate every cached `(request, response)` pair in insertion order. */
176
+ entries(): IterableIterator<ReplayCacheEntry>;
177
+ /**
178
+ * Look up a cached response by hashing the (model, messages, temperature,
179
+ * maxTokens, response_format) shape. Returns `undefined` on miss; the
180
+ * caller decides whether to throw, fall back to the network, or skip.
181
+ */
182
+ lookup(requestBody: unknown): Promise<ReplayCacheEntry | undefined>;
183
+ }
184
+ interface ReplayFetchOptions {
185
+ /**
186
+ * Behaviour on cache miss. Default `'throw'`. `'fallback'` calls the
187
+ * `fallbackFetch` (typically `globalThis.fetch`) so a partial replay can
188
+ * still complete; `'fail-closed'` returns a synthetic 599 response so the
189
+ * call site sees a non-retriable failure.
190
+ */
191
+ onMiss?: 'throw' | 'fallback' | 'fail-closed';
192
+ fallbackFetch?: typeof fetch;
193
+ /** Optional callback fired once per replayed call (for telemetry / counters). */
194
+ onHit?: (info: {
195
+ url: string;
196
+ provider: string;
197
+ model: string;
198
+ }) => void;
199
+ /** Optional callback fired on cache miss before the `onMiss` policy applies. */
200
+ onMissNotify?: (info: {
201
+ url: string;
202
+ requestBody: unknown;
203
+ }) => void;
204
+ }
205
+ /**
206
+ * Build a `fetch`-shaped function that serves cached responses out of a
207
+ * `ReplayCache` for any URL ending in `/chat/completions`. Pass the result as
208
+ * `LlmClientOptions.fetch` and `callLlm` becomes free.
209
+ *
210
+ * Non-`/chat/completions` URLs are passed straight to the fallback fetch
211
+ * (default: `globalThis.fetch`). This matters because non-LLM HTTP work
212
+ * (judge HTTP servers, sandbox callbacks) sometimes flows through the same
213
+ * `fetch` and shouldn't be intercepted.
214
+ */
215
+ declare function createReplayFetch(cache: ReplayCache, opts?: ReplayFetchOptions): typeof fetch;
216
+ /**
217
+ * Convenience iterator over `(request, response)` pairs in a sink — for
218
+ * post-hoc scoring that doesn't need a `fetch` shim. The judge or scorer
219
+ * runs purely in-process over cached LLM outputs.
220
+ */
221
+ declare function iterateRawCalls(sink: RawProviderSink, filter?: {
222
+ runId?: string;
223
+ spanId?: string;
224
+ }): AsyncGenerator<ReplayCacheEntry>;
225
+
226
+ export { DEFAULT_REDACTION_RULES as D, OTEL_AGENT_EVAL_SCOPE as O, REDACTION_VERSION as R, type OtlpExport as a, type OtlpResourceSpans as b, type OtlpSpan as c, type RedactionReport as d, type RedactionRule as e, ReplayCache as f, type ReplayCacheEntry as g, ReplayCacheMissError as h, type ReplayCacheStats as i, type ReplayFetchOptions as j, createReplayFetch as k, exportRunAsOtlp as l, iterateRawCalls as m, redactValue as n, redactString as r };