@tangle-network/agent-eval 0.20.11 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +137 -170
  3. package/dist/benchmarks/index.d.ts +2 -1
  4. package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
  5. package/dist/chunk-3GN6U53I.js.map +1 -0
  6. package/dist/chunk-3IX6QTB7.js +1349 -0
  7. package/dist/chunk-3IX6QTB7.js.map +1 -0
  8. package/dist/chunk-5IIQKMD5.js +236 -0
  9. package/dist/chunk-5IIQKMD5.js.map +1 -0
  10. package/dist/chunk-ARZ6BEV6.js +1310 -0
  11. package/dist/chunk-ARZ6BEV6.js.map +1 -0
  12. package/dist/chunk-HRZELXCR.js +1354 -0
  13. package/dist/chunk-HRZELXCR.js.map +1 -0
  14. package/dist/chunk-KRR4VMH7.js +423 -0
  15. package/dist/chunk-KRR4VMH7.js.map +1 -0
  16. package/dist/chunk-SNUHRBDL.js +154 -0
  17. package/dist/chunk-SNUHRBDL.js.map +1 -0
  18. package/dist/chunk-WOK2RTWG.js +1920 -0
  19. package/dist/chunk-WOK2RTWG.js.map +1 -0
  20. package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
  21. package/dist/chunk-YUFXO3TU.js +148 -0
  22. package/dist/chunk-YUFXO3TU.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/control-cxwMOAsy.d.ts +259 -0
  26. package/dist/control.d.ts +6 -0
  27. package/dist/control.js +30 -0
  28. package/dist/control.js.map +1 -0
  29. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  30. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  31. package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
  32. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  33. package/dist/index.d.ts +178 -2945
  34. package/dist/index.js +1066 -6185
  35. package/dist/index.js.map +1 -1
  36. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  37. package/dist/openapi.json +1 -1
  38. package/dist/optimization.d.ts +146 -0
  39. package/dist/optimization.js +60 -0
  40. package/dist/optimization.js.map +1 -0
  41. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  42. package/dist/reporting.d.ts +5 -0
  43. package/dist/reporting.js +36 -0
  44. package/dist/reporting.js.map +1 -0
  45. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  46. package/dist/store-u47QaJ9G.d.ts +297 -0
  47. package/dist/traces.d.ts +914 -0
  48. package/dist/traces.js +120 -0
  49. package/dist/traces.js.map +1 -0
  50. package/dist/wire/index.js +3 -2
  51. package/docs/concepts.md +16 -11
  52. package/docs/feature-guide.md +10 -17
  53. package/docs/integration-launch-gates.md +77 -0
  54. package/docs/product-eval-adoption.md +27 -0
  55. package/docs/research-report-methodology.md +155 -0
  56. package/docs/trace-analysis.md +75 -0
  57. package/package.json +30 -12
  58. package/dist/chunk-JAOLXRIA.js.map +0 -1
  59. package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
@@ -0,0 +1,672 @@
1
+ import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B9qvlm_o.js';
2
+ import { G as GateDecision, A as ActionableSideInfo, t as MultiShotTrialResult } from './multi-shot-optimization-Bvtz294B.js';
3
+ import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
4
+ import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
5
+
6
+ /**
7
+ * Release confidence gate.
8
+ *
9
+ * This is the production-facing composition layer over the lower-level
10
+ * primitives:
11
+ * - Dataset manifests prove corpus/version coverage.
12
+ * - RunRecord rows prove reproducible search/holdout outcomes.
13
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
14
+ * - HeldOutGate decisions remain the paired promotion authority.
15
+ *
16
+ * The gate is intentionally pure and conservative. Missing declared evidence
17
+ * fails closed instead of being treated as a neutral zero.
18
+ */
19
+
20
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
21
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
22
+ interface ReleaseTraceEvidence {
23
+ scenarioId: string;
24
+ candidateId?: string;
25
+ split?: RunSplitTag;
26
+ score?: number;
27
+ ok?: boolean;
28
+ turnCount?: number;
29
+ costUsd?: number;
30
+ durationMs?: number;
31
+ failureMode?: string;
32
+ asi?: ActionableSideInfo[];
33
+ metadata?: Record<string, unknown>;
34
+ }
35
+ interface ReleaseConfidenceThresholds {
36
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
37
+ requireCorpus?: boolean;
38
+ minScenarioCount?: number;
39
+ minSearchRuns?: number;
40
+ minHoldoutRuns?: number;
41
+ /** Require at least one holdout scenario/run. Default true. */
42
+ requireHoldout?: boolean;
43
+ minPassRate?: number;
44
+ minMeanScore?: number;
45
+ /** Search mean may exceed holdout mean by at most this much. */
46
+ maxOverfitGap?: number;
47
+ maxMeanCostUsd?: number;
48
+ maxP95WallMs?: number;
49
+ /** Low-score/failed rows must carry ASI. Default true. */
50
+ requireAsiForFailures?: boolean;
51
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
52
+ failureScoreThreshold?: number;
53
+ }
54
+ interface ReleaseConfidenceInput {
55
+ target: string;
56
+ candidateId?: string;
57
+ baselineId?: string;
58
+ dataset?: DatasetManifest;
59
+ scenarios?: readonly DatasetScenario[];
60
+ runs?: readonly RunRecord[];
61
+ traces?: readonly ReleaseTraceEvidence[];
62
+ gateDecision?: GateDecision | null;
63
+ thresholds?: ReleaseConfidenceThresholds;
64
+ }
65
+ interface ReleaseConfidenceAxis {
66
+ name: ReleaseConfidenceAxisName;
67
+ status: ReleaseConfidenceStatus;
68
+ score: number;
69
+ detail: string;
70
+ }
71
+ interface ReleaseConfidenceIssue {
72
+ axis: ReleaseConfidenceAxisName;
73
+ severity: 'critical' | 'warning';
74
+ code: string;
75
+ detail: string;
76
+ }
77
+ interface ReleaseConfidenceMetrics {
78
+ scenarioCount: number;
79
+ searchRuns: number;
80
+ holdoutRuns: number;
81
+ passRate: number;
82
+ meanScore: number;
83
+ searchMeanScore: number;
84
+ holdoutMeanScore: number;
85
+ overfitGap: number;
86
+ meanCostUsd: number;
87
+ p95WallMs: number;
88
+ failedRows: number;
89
+ failuresWithAsi: number;
90
+ singleShotTraces: number;
91
+ multiShotTraces: number;
92
+ splitCounts: Record<DatasetSplit, number>;
93
+ domainCounts: Record<string, number>;
94
+ failureModeCounts: Record<string, number>;
95
+ responsibleSurfaceCounts: Record<string, number>;
96
+ }
97
+ interface ReleaseConfidenceScorecard {
98
+ target: string;
99
+ candidateId: string | null;
100
+ baselineId: string | null;
101
+ status: ReleaseConfidenceStatus;
102
+ promote: boolean;
103
+ axes: ReleaseConfidenceAxis[];
104
+ issues: ReleaseConfidenceIssue[];
105
+ metrics: ReleaseConfidenceMetrics;
106
+ dataset: DatasetManifest | null;
107
+ gateDecision: GateDecision | null;
108
+ summary: string;
109
+ }
110
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
111
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
112
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
113
+
114
+ /**
115
+ * Failure taxonomy — canonical classes + a default classifier.
116
+ *
117
+ * Every failed run should end up in a named class. The classifier here
118
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
119
+ * the consumer for novel cases and trained into the rule base over time.
120
+ *
121
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
122
+ * returned class as `Run.outcome.failureClass`.
123
+ */
124
+
125
+ interface FailureContext {
126
+ run: Run;
127
+ spans: Span[];
128
+ events: TraceEvent[];
129
+ }
130
+ interface FailureClassification {
131
+ failureClass: FailureClass;
132
+ reason: string;
133
+ triggerSpanId?: string;
134
+ triggerEventId?: string;
135
+ }
136
+ /** Ordered rules — first match wins. */
137
+ interface FailureRule {
138
+ id: string;
139
+ match: (ctx: FailureContext) => {
140
+ failureClass: FailureClass;
141
+ reason: string;
142
+ triggerSpanId?: string;
143
+ triggerEventId?: string;
144
+ } | null;
145
+ }
146
+ declare const DEFAULT_RULES: FailureRule[];
147
+ /** Classify the failure mode of a run using an ordered rule list. */
148
+ declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
149
+
150
+ /**
151
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
152
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
153
+ *
154
+ * Each cluster includes: N runs, scenarios affected, representative
155
+ * error message, a proposed mitigation hint (rule → action table).
156
+ */
157
+
158
+ interface FailureCluster {
159
+ failureClass: FailureClass;
160
+ /** Tool name when the trigger was a tool span, else undefined. */
161
+ toolName?: string;
162
+ /** First 16 chars of argHash — clusters similar args. */
163
+ argPrefix?: string;
164
+ /**
165
+ * Source dimension when the trigger was a judge span (e.g. `'format'`,
166
+ * `'safety'`, `'correctness'`). Lets cross-template aggregators
167
+ * group failures by the dimension that fired without overloading
168
+ * `argPrefix`. Optional — legacy clusters without this field
169
+ * deserialize cleanly.
170
+ */
171
+ dimension?: string;
172
+ runCount: number;
173
+ scenarioIds: string[];
174
+ exampleError?: string;
175
+ exampleRunId: string;
176
+ }
177
+ interface FailureClusterReport {
178
+ clusters: FailureCluster[];
179
+ totalFailures: number;
180
+ totalRuns: number;
181
+ }
182
+ declare function failureClusterView(store: TraceStore, options?: {
183
+ rules?: FailureRule[];
184
+ minClusterSize?: number;
185
+ }): Promise<FailureClusterReport>;
186
+
187
+ /**
188
+ * Paper-grade paired statistics for held-out promotion gates.
189
+ *
190
+ * The promotion gate (`HeldOutGate`) needs three things:
191
+ *
192
+ * 1. A bootstrap confidence interval on the per-item paired delta
193
+ * (`pairedBootstrap`). Median delta is the headline number; the
194
+ * CI lower bound is what the gate checks against `pairedDeltaThreshold`.
195
+ * 2. A non-parametric significance test on the paired deltas
196
+ * (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
197
+ * paper-style name).
198
+ * 3. False-discovery-rate correction across simultaneously-tested
199
+ * candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
200
+ *
201
+ * Why a separate file: every existing primitive lives in `statistics.ts`
202
+ * (general) or `power-analysis.ts` (correction). Paired-bootstrap is
203
+ * paired-only, paper-grade, and load-bearing for the promotion gate.
204
+ * Putting it next to `statistics.ts` would require editing that file;
205
+ * the brief forbids that. New file, new exports, no surface change.
206
+ */
207
+ interface PairedBootstrapResult {
208
+ /** Number of paired observations (inputs of unequal length are rejected, never truncated). */
209
+ n: number;
210
+ /** Median of paired deltas (after − before). */
211
+ median: number;
212
+ /** Mean of paired deltas. */
213
+ mean: number;
214
+ /** Lower bound of the bootstrap CI on the chosen statistic (median delta by default). */
215
+ low: number;
216
+ /** Upper bound of the bootstrap CI on the chosen statistic (median delta by default). */
217
+ high: number;
218
+ /** Confidence level used (e.g. 0.95). */
219
+ confidence: number;
220
+ /** Number of bootstrap resamples used. */
221
+ resamples: number;
222
+ }
223
+ interface PairedBootstrapOptions {
224
+ /** Confidence level. Default 0.95. */
225
+ confidence?: number;
226
+ /** Bootstrap resample count. Default 2000. */
227
+ resamples?: number;
228
+ /** Statistic to bootstrap. Default 'median'. */
229
+ statistic?: 'median' | 'mean';
230
+ /** Deterministic seed. If omitted, uses Math.random(). */
231
+ seed?: number;
232
+ }
233
+ /**
234
+ * Paired bootstrap on (after - before) deltas. Returns a CI on the
235
+ * chosen statistic (median by default). Pairs are resampled with
236
+ * replacement. The lower bound is what the promotion gate checks: if
237
+ * `low > pairedDeltaThreshold`, the gain is real at the chosen
238
+ * confidence level.
239
+ *
240
+ * Throws on unequal sample sizes — caller must align pairs upstream.
241
+ */
242
+ declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
243
+ /**
244
+ * Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
245
+ * paired deltas is the standard non-parametric significance test for
246
+ * "candidate beats baseline on matched items." Use alongside the
247
+ * bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
248
+ */
249
+ declare function pairedWilcoxon(before: number[], after: number[]): {
250
+ w: number;
251
+ p: number;
252
+ };
253
+ /**
254
+ * Paper-style alias for `benjaminiHochberg`. Use to correct p-values
255
+ * across multiple candidate-vs-baseline comparisons run in the same
256
+ * promotion sweep. Returns BH-adjusted q-values and significance at
257
+ * the requested FDR (default 0.05).
258
+ */
259
+ declare function bhAdjust(pValues: number[], fdr?: number): {
260
+ qValues: number[];
261
+ significant: boolean[];
262
+ };
263
+
264
+ /**
265
+ * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
266
+ * than replacing it.
267
+ *
268
+ * Three artefacts:
269
+ *
270
+ * - `summaryTable` Markdown table of per-candidate means,
271
+ * 95% bootstrap CIs, BH-adjusted Wilcoxon
272
+ * p-values, and Cohen's d versus a
273
+ * comparator candidate.
274
+ * - `paretoChart` Abstract spec for a cost vs quality
275
+ * scatter, with gate decisions overlaid.
276
+ * Returns numbers + labels — caller
277
+ * chooses the plotting library.
278
+ * - `gainHistogram`
279
+ * Per-item paired holdout deltas as a
280
+ * histogram spec (bins + counts + median +
281
+ * CI). Same "data, not images" contract.
282
+ *
283
+ * The figure types are PlotSpecs — JSON-friendly, library-agnostic.
284
+ * They aren't React components and they aren't PNGs; they are
285
+ * what you'd hand to vega-lite, plotly, matplotlib, or your own
286
+ * Canvas renderer to draw the actual figure.
287
+ */
288
+
289
+ interface SummaryTableOptions {
290
+ /** Comparator candidate id. Wilcoxon + Cohen's d are computed
291
+ * versus this candidate. Required for paired stats columns. */
292
+ comparator?: string;
293
+ /** Which split to read scores from. Default 'holdout'. */
294
+ split?: 'search' | 'holdout';
295
+ /** Confidence level for the bootstrap CI on the mean. Default 0.95. */
296
+ confidence?: number;
297
+ /** FDR for BH adjustment of the comparison p-values. Default 0.05. */
298
+ fdr?: number;
299
+ }
300
+ interface SummaryTableRow {
301
+ candidateId: string;
302
+ n: number;
303
+ mean: number;
304
+ ciLow: number;
305
+ ciHigh: number;
306
+ /** BH-adjusted q-value vs comparator. NaN if no comparator. */
307
+ qValue: number;
308
+ /** Cohen's d vs comparator. NaN if no comparator. */
309
+ cohensD: number;
310
+ }
311
+ interface SummaryTable {
312
+ rows: SummaryTableRow[];
313
+ comparator: string | null;
314
+ split: 'search' | 'holdout';
315
+ /** Pre-rendered markdown — drop into a paper or PR. */
316
+ markdown: string;
317
+ }
318
+ /**
319
+ * Table 1 helper. Buckets runs by `candidateId`, computes mean +
320
+ * bootstrap CI on the chosen split, and (when a comparator is given)
321
+ * BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
322
+ */
323
+ declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
324
+ interface ParetoPoint {
325
+ candidateId: string;
326
+ /** Mean USD cost per run on the chosen split. */
327
+ cost: number;
328
+ /** Mean score on the chosen split. */
329
+ quality: number;
330
+ /** Number of runs that informed this point. */
331
+ n: number;
332
+ /** Whether this candidate is on the Pareto frontier — high
333
+ * quality, low cost, no dominator. */
334
+ onFrontier: boolean;
335
+ /** Optional gate verdict for this candidate, if a `GateDecision`
336
+ * for it was passed in. */
337
+ gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
338
+ }
339
+ interface ParetoFigureSpec {
340
+ kind: 'pareto-cost-quality';
341
+ split: 'search' | 'holdout';
342
+ points: ParetoPoint[];
343
+ axes: {
344
+ x: 'costUsd';
345
+ y: 'score';
346
+ };
347
+ }
348
+ /**
349
+ * Cost vs quality scatter spec. `gateDecisions` is keyed by
350
+ * candidate id; if present, every point picks up the gate verdict
351
+ * for overlay.
352
+ */
353
+ declare function paretoChart(runs: RunRecord[], opts?: {
354
+ split?: 'search' | 'holdout';
355
+ gateDecisions?: Record<string, GateDecision>;
356
+ }): ParetoFigureSpec;
357
+ interface GainDistributionBin {
358
+ /** Inclusive lower edge. */
359
+ lo: number;
360
+ /** Exclusive upper edge (or inclusive if it's the last bin). */
361
+ hi: number;
362
+ /** Number of pairs whose delta lands in this bin. */
363
+ count: number;
364
+ }
365
+ interface GainDistributionFigureSpec {
366
+ kind: 'gain-distribution';
367
+ candidateId: string;
368
+ comparator: string;
369
+ split: 'search' | 'holdout';
370
+ /** Number of pairs used. */
371
+ n: number;
372
+ bins: GainDistributionBin[];
373
+ median: number;
374
+ ci: {
375
+ low: number;
376
+ high: number;
377
+ };
378
+ }
379
+ interface GainDistributionOptions {
380
+ /** Number of histogram bins. Default 11 (so the centre is exact at 0). */
381
+ bins?: number;
382
+ /** Which split to use. Default 'holdout'. */
383
+ split?: 'search' | 'holdout';
384
+ /** Confidence level for the CI. Default 0.95. */
385
+ confidence?: number;
386
+ /** Bootstrap resamples. Default 2000. */
387
+ resamples?: number;
388
+ /** Deterministic seed. */
389
+ seed?: number;
390
+ }
391
+ /**
392
+ * Held-out improvement distribution: per-pair delta (candidate −
393
+ * comparator), histogrammed. Includes the bootstrap CI on the median
394
+ * delta — same primitive the promotion gate uses.
395
+ */
396
+ declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
397
+ type ResearchReportDecision = 'promote' | 'hold' | 'reject' | 'equivalent' | 'needs_more_data';
398
+ /**
399
+ * Hard floor below which a paired comparison is treated as uninformative
400
+ * regardless of `minPairs`. Mirrors the lower limit on Wilcoxon signed-rank
401
+ * exact tables; below this the test has no power to separate effect sizes.
402
+ */
403
+ declare const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
404
+ interface ResearchReportOptions {
405
+ /** Human-readable report title. */
406
+ title?: string;
407
+ /** Comparator candidate id. Required for statistical decision guidance. */
408
+ comparator?: string;
409
+ /** Which split to use for the primary decision. Default 'holdout'. */
410
+ split?: 'search' | 'holdout';
411
+ /** Confidence level used by lower-level report helpers. Default 0.95. */
412
+ confidence?: number;
413
+ /** FDR threshold for q-values. Default 0.05. */
414
+ fdr?: number;
415
+ /**
416
+ * Soft floor on paired observations before issuing a directional
417
+ * promote / reject. Below this we report `needs_more_data` and surface the
418
+ * minimum detectable effect at the current N. Default 20 — chosen so the
419
+ * Wilcoxon signed-rank approximation is reasonable and so the paired
420
+ * bootstrap CI has non-degenerate coverage. Hard floor is enforced at
421
+ * `RESEARCH_REPORT_HARD_PAIR_FLOOR` (6) regardless of this value.
422
+ */
423
+ minPairs?: number;
424
+ /**
425
+ * Region of Practical Equivalence on the paired delta. When a candidate's
426
+ * paired-delta CI is fully contained in `[low, high]`, the decision is
427
+ * `equivalent` rather than `hold`. Sourced from the domain owner — there is
428
+ * no statistically-defensible default.
429
+ */
430
+ rope?: {
431
+ low: number;
432
+ high: number;
433
+ };
434
+ /**
435
+ * Power for the minimum detectable effect (MDE) reported on each candidate.
436
+ * Default 0.8.
437
+ */
438
+ mdePower?: number;
439
+ /**
440
+ * Two-sided alpha for the MDE. Default matches `fdr` so the reported MDE
441
+ * lines up with the test the report actually runs.
442
+ */
443
+ mdeAlpha?: number;
444
+ /** Optional held-out gate decisions keyed by candidate id. */
445
+ gateDecisions?: Record<string, GateDecision>;
446
+ /** Optional failure clusters from failureClusterView. */
447
+ failureClusters?: FailureClusterReport;
448
+ /** Build gain histograms for these candidates. Defaults to all non-comparator candidates. */
449
+ candidateIds?: string[];
450
+ /** Deterministic bootstrap seed passed to gainHistogram and the posterior helper. */
451
+ seed?: number;
452
+ /** Report timestamp. Defaults to current time. */
453
+ generatedAt?: string;
454
+ /**
455
+ * Hash of a preregistered protocol (e.g. `signManifest({...}).contentHash`).
456
+ * Embedded verbatim in the report so the analysis can be cited as the
457
+ * preregistered one rather than a post-hoc fishing expedition.
458
+ */
459
+ preregistrationHash?: string;
460
+ }
461
+ interface ResearchReportRecommendation {
462
+ decision: ResearchReportDecision;
463
+ candidateId: string | null;
464
+ rationale: string[];
465
+ risks: string[];
466
+ nextActions: string[];
467
+ }
468
+ interface ResearchReportCandidate {
469
+ candidateId: string;
470
+ n: number;
471
+ mean: number;
472
+ ciLow: number;
473
+ ciHigh: number;
474
+ qValue: number;
475
+ cohensD: number;
476
+ meanDeltaVsComparator: number | null;
477
+ pairedN: number;
478
+ medianGain: number | null;
479
+ meanGain: number | null;
480
+ gainCi: {
481
+ low: number;
482
+ high: number;
483
+ } | null;
484
+ /**
485
+ * Bayesian-bootstrap-style posterior summaries on the paired delta. Computed
486
+ * from the same resamples that produce the gain CI; interpretable as
487
+ * "fraction of resamples in which the candidate beats the comparator on
488
+ * matched pairs."
489
+ */
490
+ prGreaterThanZero: number | null;
491
+ prInRope: number | null;
492
+ /**
493
+ * Minimum detectable effect (in score units) at the candidate's paired N,
494
+ * the configured power, and the configured alpha. Standardised by the
495
+ * observed paired-delta SD and inverted via `requiredSampleSize`. Reported
496
+ * for every candidate so a `needs_more_data` verdict is actionable.
497
+ */
498
+ mde: number | null;
499
+ onParetoFrontier: boolean;
500
+ gate?: ParetoPoint['gate'];
501
+ decision: ResearchReportDecision;
502
+ decisionReason: string;
503
+ }
504
+ interface ResearchReportMethodology {
505
+ /**
506
+ * Plain-language assumptions the report depends on. Read these first when
507
+ * deciding whether the verdict is load-bearing for a launch decision.
508
+ */
509
+ assumptions: string[];
510
+ /** Tests and estimators the verdict was computed from. */
511
+ methods: string[];
512
+ /** Alternatives the author considered and why this report didn't take them. */
513
+ alternatives: string[];
514
+ /** Failure modes — when this report should NOT drive a decision. */
515
+ whenNotToApply: string[];
516
+ /** Citations for the methodological choices above. */
517
+ citations: string[];
518
+ }
519
+ interface ResearchReport {
520
+ kind: 'agent-eval-research-report';
521
+ title: string;
522
+ generatedAt: string;
523
+ split: 'search' | 'holdout';
524
+ comparator: string | null;
525
+ /**
526
+ * SHA-256 over the canonicalised set of `(runId, candidateId, split)` triples
527
+ * the report was computed from, plus the comparator and split. Stable across
528
+ * key insertion order; recomputable by the reader to verify provenance.
529
+ */
530
+ runFingerprint: string;
531
+ preregistrationHash: string | null;
532
+ rope: {
533
+ low: number;
534
+ high: number;
535
+ } | null;
536
+ executiveSummary: string[];
537
+ recommendation: ResearchReportRecommendation;
538
+ candidates: ResearchReportCandidate[];
539
+ summary: SummaryTable;
540
+ charts: {
541
+ pareto: ParetoFigureSpec;
542
+ gains: GainDistributionFigureSpec[];
543
+ };
544
+ methodology: ResearchReportMethodology;
545
+ failureClusters?: FailureClusterReport;
546
+ markdown: string;
547
+ html: string;
548
+ }
549
+ /**
550
+ * Executive research report for CPO / AI-lead / launch-review consumption.
551
+ *
552
+ * Composes:
553
+ * - `summaryTable` marginal stats with BH-FDR-adjusted q-values
554
+ * - `paretoChart` cost-vs-quality frontier with gate overlay
555
+ * - `gainHistogram` per-candidate paired-delta distribution
556
+ * - paired posterior (this file): bootstrap CI on median, Pr(Δ>0),
557
+ * Pr(Δ∈ROPE), MDE at the configured power
558
+ *
559
+ * Decisions are made on paired evidence — never on marginal means alone —
560
+ * and respect any held-out gate decision the caller passes through. The
561
+ * report embeds a SHA-256 fingerprint of the input run set and, optionally,
562
+ * the hash of a preregistered protocol so a downstream reader can verify
563
+ * provenance and that the analysis was the preregistered one.
564
+ *
565
+ * Async because the fingerprint uses Web Crypto via `hashJson`; deterministic
566
+ * for any fixed `runs`, `seed`, ROPE, and `generatedAt` (which otherwise defaults to the current time).
567
+ */
568
+ declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
569
+
570
+ interface RenderReleaseReportOptions {
571
+ title?: string;
572
+ runs?: readonly RunRecord[];
573
+ comparator?: string;
574
+ traceAnalystFindings?: readonly string[];
575
+ nextActions?: readonly string[];
576
+ }
577
+ declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
578
+
579
+ /**
580
+ * Bootstrap-CI promotion gate.
581
+ *
582
+ * In any iterative-improvement loop (GEPA, prompt evolution, dataset
583
+ * curation), the question is "did this generation actually improve, or are
584
+ * we celebrating noise?". With small N and noisy outcomes, point-estimate
585
+ * deltas lie. Bootstrap confidence intervals tell the operator whether the
586
+ * delta is real before code or prompts get promoted.
587
+ *
588
+ * This module is pure functions — no I/O, no model calls. Easy to unit-test
589
+ * and to compose into any verdict gate.
590
+ *
591
+ * Default gate:
592
+ * - Bootstrap mean baseline vs candidate (1k resamples).
593
+ * - Compute the delta distribution; pass if the lower CI bound > 0.
594
+ * - Tunable confidence (default 95%) and resample count.
595
+ *
596
+ * Verdict semantics intentionally match the existing `experiments.jsonl`
597
+ * vocabulary:
598
+ * - ADVANCE: CI lower bound on the delta > 0 (real win)
599
+ * - KEEP: overlap, but candidate point estimate >= baseline (neutral)
600
+ * - REVERT: CI upper bound on the delta < 0 (real regression)
601
+ * - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
602
+ */
603
+ type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
604
+ interface BootstrapResult {
605
+ baselineMean: number;
606
+ candidateMean: number;
607
+ /** candidateMean - baselineMean, point estimate. */
608
+ delta: number;
609
+ /** Lower bound of the (1 - alpha) CI on the delta. */
610
+ ciLower: number;
611
+ /** Upper bound of the (1 - alpha) CI on the delta. */
612
+ ciUpper: number;
613
+ /** Number of bootstrap resamples used. */
614
+ iterations: number;
615
+ alpha: number;
616
+ verdict: Verdict;
617
+ }
618
+ interface BootstrapOptions {
619
+ /** Significance level alpha; the CI covers 1 - alpha (default 0.05 → 95% CI). */
620
+ alpha?: number;
621
+ /** Number of resamples (default 1000). */
622
+ iterations?: number;
623
+ /**
624
+ * Minimum total samples (baseline + candidate) below which we always
625
+ * return INCONCLUSIVE — bootstrap with too few samples is meaningless.
626
+ * Default 6 (combined).
627
+ */
628
+ minTotalSamples?: number;
629
+ /** RNG seed for reproducibility. If omitted, uses Math.random(). */
630
+ seed?: number;
631
+ }
632
+ /**
633
+ * Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
634
+ *
635
+ * Uses simple percentile bootstrap on the difference of resampled means.
636
+ * That's the standard non-parametric primitive — no distributional
637
+ * assumptions, robust to skew, easy to reason about.
638
+ */
639
+ declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
640
+ /**
641
+ * Judge-replay promotion gate.
642
+ *
643
+ * The cheap inner-loop judge that drives an evolution run is by definition
644
+ * fast and noisy. When you're about to promote a winning variant to the
645
+ * canonical default, you want a STRONGER judge (a more expensive model, a
646
+ * human grader, a separately-trained reward model) to confirm the win
647
+ * generalises beyond the inner loop.
648
+ *
649
+ * This helper takes raw winner + baseline outputs, scores both through the
650
+ * stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
651
+ * judge agrees the winner is real with the configured confidence. Doesn't
652
+ * matter what shape your "output" is — pass a string, an object, anything
653
+ * the judge can read.
654
+ */
655
+ interface JudgeReplayGateArgs<TOutput> {
656
+ baselineOutputs: TOutput[];
657
+ candidateOutputs: TOutput[];
658
+ /** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
659
+ judge: (output: TOutput) => Promise<number> | number;
660
+ alpha?: number;
661
+ iterations?: number;
662
+ /** RNG seed for reproducibility. */
663
+ seed?: number;
664
+ /** Maximum concurrent judge calls. Default 4. */
665
+ judgeConcurrency?: number;
666
+ }
667
+ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
668
+ baselineSamples: number;
669
+ candidateSamples: number;
670
+ }>;
671
+
672
+ export { type SummaryTableOptions as A, type BootstrapOptions as B, type SummaryTableRow as C, DEFAULT_RULES as D, assertReleaseConfidence as E, type FailureClassification as F, type GainDistributionBin as G, bhAdjust as H, bootstrapCi as I, type JudgeReplayGateArgs as J, classifyFailure as K, evaluateReleaseConfidence as L, failureClusterView as M, gainHistogram as N, judgeReplayGate as O, type PairedBootstrapOptions as P, pairedBootstrap as Q, type ReleaseConfidenceThresholds as R, type SummaryTable as S, pairedWilcoxon as T, paretoChart as U, type Verdict as V, releaseTraceEvidenceFromMultiShotTrials as W, renderReleaseReport as X, researchReport as Y, summaryTable as Z, type ReleaseConfidenceScorecard as a, type BootstrapResult as b, type FailureCluster as c, type FailureClusterReport as d, type FailureContext as e, type FailureRule as f, type GainDistributionFigureSpec as g, type GainDistributionOptions as h, type PairedBootstrapResult as i, type ParetoFigureSpec as j, type ParetoPoint as k, RESEARCH_REPORT_HARD_PAIR_FLOOR as l, type ReleaseConfidenceAxis as m, type ReleaseConfidenceAxisName as n, type ReleaseConfidenceInput as o, type ReleaseConfidenceIssue as p, type ReleaseConfidenceMetrics as q, type ReleaseConfidenceStatus as r, type ReleaseTraceEvidence as s, type RenderReleaseReportOptions as t, type ResearchReport as u, type ResearchReportCandidate as v, type ResearchReportDecision as w, type ResearchReportMethodology as x, type ResearchReportOptions as y, type ResearchReportRecommendation as z };
@@ -0,0 +1,5 @@
1
+ export { B as BootstrapOptions, b as BootstrapResult, G as GainDistributionBin, g as GainDistributionFigureSpec, h as GainDistributionOptions, J as JudgeReplayGateArgs, P as PairedBootstrapOptions, i as PairedBootstrapResult, j as ParetoFigureSpec, k as ParetoPoint, l as RESEARCH_REPORT_HARD_PAIR_FLOOR, m as ReleaseConfidenceAxis, n as ReleaseConfidenceAxisName, o as ReleaseConfidenceInput, p as ReleaseConfidenceIssue, q as ReleaseConfidenceMetrics, a as ReleaseConfidenceScorecard, r as ReleaseConfidenceStatus, R as ReleaseConfidenceThresholds, s as ReleaseTraceEvidence, t as RenderReleaseReportOptions, u as ResearchReport, v as ResearchReportCandidate, w as ResearchReportDecision, x as ResearchReportMethodology, y as ResearchReportOptions, z as ResearchReportRecommendation, S as SummaryTable, A as SummaryTableOptions, C as SummaryTableRow, V as Verdict, E as assertReleaseConfidence, H as bhAdjust, I as bootstrapCi, L as evaluateReleaseConfidence, N as gainHistogram, O as judgeReplayGate, Q as pairedBootstrap, T as pairedWilcoxon, U as paretoChart, W as releaseTraceEvidenceFromMultiShotTrials, X as renderReleaseReport, Y as researchReport, Z as summaryTable } from './reporting-Da2ihlcM.js';
2
+ import './dataset-B9qvlm_o.js';
3
+ import './multi-shot-optimization-Bvtz294B.js';
4
+ import './run-record-CX_jcAyr.js';
5
+ import './store-u47QaJ9G.js';