@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
|
@@ -0,0 +1,561 @@
|
|
|
1
|
+
import { R as RunRecord } from './run-record-BGY6bHRh.js';
|
|
2
|
+
import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
6
|
+
*
|
|
7
|
+
* Encodes the "honesty override" pattern that lived inline in
|
|
8
|
+
* `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
|
|
9
|
+
* The optimizer's best-guess is one thing; what we should actually
|
|
10
|
+
* ship is another. The gate is the line between them.
|
|
11
|
+
*
|
|
12
|
+
* A candidate is promoted iff ALL three pass:
|
|
13
|
+
*
|
|
14
|
+
* 1. **Productive runs**: the candidate has at least
|
|
15
|
+
* `minProductiveRuns` paired observations on items where BOTH
|
|
16
|
+
* candidate and baseline produced a real (non-silent) score.
|
|
17
|
+
* 2. **Paired delta**: the lower bound of the bootstrap CI on the
|
|
18
|
+
* median per-item delta (candidate − baseline) on the HOLDOUT
|
|
19
|
+
* split is strictly greater than `pairedDeltaThreshold`.
|
|
20
|
+
* 3. **Overfit gap**: the candidate's gap between search-split
|
|
21
|
+
* score and holdout-split score is no worse (more positive)
|
|
22
|
+
* than the baseline's gap by more than `overfitGapThreshold`.
|
|
23
|
+
* "Better on search, worse on holdout" is the canonical
|
|
24
|
+
* overfit pattern; this catches it.
|
|
25
|
+
*
|
|
26
|
+
* The decision carries a machine-readable `rejectionCode` plus an
|
|
27
|
+
* `evidence` block with every number the gate looked at, so the
|
|
28
|
+
* downstream researcher / paper / dashboard can re-derive the
|
|
29
|
+
* verdict without re-running.
|
|
30
|
+
*
|
|
31
|
+
* See also:
|
|
32
|
+
* - `src/statistics.ts` for `pairedBootstrap` + `wilcoxonSignedRank`
|
|
33
|
+
* - `src/run-record.ts` for the input row schema
|
|
34
|
+
* - `src/reference-replay.ts` for the older, reference-replay-
|
|
35
|
+
* specific promotion path (still useful for replay-style evals).
|
|
36
|
+
*/
|
|
37
|
+
|
|
38
|
+
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
|
|
39
|
+
interface HeldOutGateConfig {
|
|
40
|
+
/** Minimum number of paired (candidate, baseline) holdout observations
|
|
41
|
+
* required before the gate will even consider promoting. Default 3. */
|
|
42
|
+
minProductiveRuns?: number;
|
|
43
|
+
/** The bootstrap-CI lower bound on the median paired holdout delta
|
|
44
|
+
* must exceed this to promote. Default 0. */
|
|
45
|
+
pairedDeltaThreshold?: number;
|
|
46
|
+
/** Maximum allowed worsening of (search − holdout) gap relative to
|
|
47
|
+
* baseline. Default 0.15 (i.e. candidate may overfit by up to 0.15
|
|
48
|
+
* absolute score points more than baseline before rejection). */
|
|
49
|
+
overfitGapThreshold?: number;
|
|
50
|
+
/** Stable label of the baseline candidate. Required — paper-grade
|
|
51
|
+
* evaluation never compares two unlabelled candidates. */
|
|
52
|
+
baselineKey: string;
|
|
53
|
+
/** Confidence level for the bootstrap CI. Default 0.95. */
|
|
54
|
+
confidence?: number;
|
|
55
|
+
/** Bootstrap resamples. Default 2000. */
|
|
56
|
+
bootstrapResamples?: number;
|
|
57
|
+
/** Optional deterministic seed for the bootstrap. Default undefined
|
|
58
|
+
* (Math.random). */
|
|
59
|
+
seed?: number;
|
|
60
|
+
}
|
|
61
|
+
interface GateEvidence {
|
|
62
|
+
/** Number of paired (candidate, baseline) holdout observations used. */
|
|
63
|
+
productiveRuns: number;
|
|
64
|
+
/** Median of (candidate − baseline) paired holdout deltas. */
|
|
65
|
+
medianPairedDelta: number;
|
|
66
|
+
/** Bootstrap CI on the median paired holdout delta. */
|
|
67
|
+
pairedCI: {
|
|
68
|
+
low: number;
|
|
69
|
+
high: number;
|
|
70
|
+
};
|
|
71
|
+
/** Wilcoxon signed-rank p-value on the paired holdout deltas. */
|
|
72
|
+
pairedPValue: number;
|
|
73
|
+
/** Mean candidate score on the search split (NaN if none). */
|
|
74
|
+
searchScore: number;
|
|
75
|
+
/** Mean candidate score on the holdout split (NaN if none). */
|
|
76
|
+
holdoutScore: number;
|
|
77
|
+
/** Candidate (search − holdout) gap. */
|
|
78
|
+
overfitGap: number;
|
|
79
|
+
/** Baseline (search − holdout) gap. */
|
|
80
|
+
baselineOverfitGap: number;
|
|
81
|
+
}
|
|
82
|
+
interface GateDecision {
|
|
83
|
+
/** Final promote/no-promote verdict. */
|
|
84
|
+
promote: boolean;
|
|
85
|
+
/** The candidate that was evaluated. */
|
|
86
|
+
candidateId: string;
|
|
87
|
+
/** The baseline it was compared against. */
|
|
88
|
+
baselineId: string;
|
|
89
|
+
/** Every number the gate looked at, for audit + paper export. */
|
|
90
|
+
evidence: GateEvidence;
|
|
91
|
+
/** Human-readable reason. */
|
|
92
|
+
reason: string;
|
|
93
|
+
/** Machine-readable rejection code, or null on promote. */
|
|
94
|
+
rejectionCode: HeldOutGateRejectionCode | null;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Held-out paired-delta promotion gate. Construct once with config,
|
|
98
|
+
* call `evaluate(candidateRuns, baselineRuns)` per (candidate,
|
|
99
|
+
* baseline) pair. Stateless across calls.
|
|
100
|
+
*/
|
|
101
|
+
declare class HeldOutGate {
|
|
102
|
+
private readonly minProductiveRuns;
|
|
103
|
+
private readonly pairedDeltaThreshold;
|
|
104
|
+
private readonly overfitGapThreshold;
|
|
105
|
+
private readonly baselineKey;
|
|
106
|
+
private readonly confidence;
|
|
107
|
+
private readonly resamples;
|
|
108
|
+
private readonly seed?;
|
|
109
|
+
constructor(config: HeldOutGateConfig);
|
|
110
|
+
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
111
|
+
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
112
|
+
* the candidate run with the matching baseline run. Pairs without
|
|
113
|
+
* a holdout score on both sides are dropped. */
|
|
114
|
+
evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
|
|
119
|
+
* than replacing it.
|
|
120
|
+
*
|
|
121
|
+
* Three artefacts:
|
|
122
|
+
*
|
|
123
|
+
* - `summaryTable` Markdown table of per-candidate means,
|
|
124
|
+
* 95% bootstrap CIs, BH-adjusted Wilcoxon
|
|
125
|
+
* p-values, and Cohen's d versus a
|
|
126
|
+
* comparator candidate.
|
|
127
|
+
* - `paretoChart` Abstract spec for a cost vs quality
|
|
128
|
+
* scatter, with gate decisions overlaid.
|
|
129
|
+
* Returns numbers + labels — caller
|
|
130
|
+
* chooses the plotting library.
|
|
131
|
+
* - `gainHistogram`
|
|
132
|
+
* Per-item paired holdout deltas as a
|
|
133
|
+
* histogram spec (bins + counts + median +
|
|
134
|
+
* CI). Same "data, not images" contract.
|
|
135
|
+
*
|
|
136
|
+
* The figure types are PlotSpecs — JSON-friendly, library-agnostic.
|
|
137
|
+
* They aren't React components and they aren't PNGs; they are
|
|
138
|
+
* what you'd hand to vega-lite, plotly, matplotlib, or your own
|
|
139
|
+
* Canvas renderer to draw the actual figure.
|
|
140
|
+
*/
|
|
141
|
+
|
|
142
|
+
interface SummaryTableOptions {
|
|
143
|
+
/** Comparator candidate id. Wilcoxon + Cohen's d are computed
|
|
144
|
+
* versus this candidate. Required for paired stats columns. */
|
|
145
|
+
comparator?: string;
|
|
146
|
+
/** Which split to read scores from. Default 'holdout'. */
|
|
147
|
+
split?: 'search' | 'holdout';
|
|
148
|
+
/** Confidence level for the bootstrap CI on the mean. Default 0.95. */
|
|
149
|
+
confidence?: number;
|
|
150
|
+
/** FDR for BH adjustment of the comparison p-values. Default 0.05. */
|
|
151
|
+
fdr?: number;
|
|
152
|
+
}
|
|
153
|
+
interface SummaryTableRow {
|
|
154
|
+
candidateId: string;
|
|
155
|
+
n: number;
|
|
156
|
+
mean: number;
|
|
157
|
+
ciLow: number;
|
|
158
|
+
ciHigh: number;
|
|
159
|
+
/** BH-adjusted q-value vs comparator. NaN if no comparator. */
|
|
160
|
+
qValue: number;
|
|
161
|
+
/** Cohen's d vs comparator. NaN if no comparator. */
|
|
162
|
+
cohensD: number;
|
|
163
|
+
}
|
|
164
|
+
interface SummaryTable {
|
|
165
|
+
rows: SummaryTableRow[];
|
|
166
|
+
comparator: string | null;
|
|
167
|
+
split: 'search' | 'holdout';
|
|
168
|
+
/** Pre-rendered markdown — drop into a paper or PR. */
|
|
169
|
+
markdown: string;
|
|
170
|
+
}
|
|
171
|
+
/**
|
|
172
|
+
* Table 1 helper. Buckets runs by `candidateId`, computes mean +
|
|
173
|
+
* bootstrap CI on the chosen split, and (when a comparator is given)
|
|
174
|
+
* BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
|
|
175
|
+
*/
|
|
176
|
+
declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
|
|
177
|
+
interface ParetoPoint {
|
|
178
|
+
candidateId: string;
|
|
179
|
+
/** Mean USD cost per run on the chosen split. */
|
|
180
|
+
cost: number;
|
|
181
|
+
/** Mean score on the chosen split. */
|
|
182
|
+
quality: number;
|
|
183
|
+
/** Number of runs that informed this point. */
|
|
184
|
+
n: number;
|
|
185
|
+
/** Whether this candidate is on the Pareto frontier — high
|
|
186
|
+
* quality, low cost, no dominator. */
|
|
187
|
+
onFrontier: boolean;
|
|
188
|
+
/** Optional gate verdict for this candidate, if a `GateDecision`
|
|
189
|
+
* for it was passed in. */
|
|
190
|
+
gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
|
|
191
|
+
}
|
|
192
|
+
interface ParetoFigureSpec {
|
|
193
|
+
kind: 'pareto-cost-quality';
|
|
194
|
+
split: 'search' | 'holdout';
|
|
195
|
+
points: ParetoPoint[];
|
|
196
|
+
axes: {
|
|
197
|
+
x: 'costUsd';
|
|
198
|
+
y: 'score';
|
|
199
|
+
};
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* Cost vs quality scatter spec. `gateDecisions` is keyed by
|
|
203
|
+
* candidate id; if present, every point picks up the gate verdict
|
|
204
|
+
* for overlay.
|
|
205
|
+
*/
|
|
206
|
+
declare function paretoChart(runs: RunRecord[], opts?: {
|
|
207
|
+
split?: 'search' | 'holdout';
|
|
208
|
+
gateDecisions?: Record<string, GateDecision>;
|
|
209
|
+
}): ParetoFigureSpec;
|
|
210
|
+
interface GainDistributionBin {
|
|
211
|
+
/** Inclusive lower edge. */
|
|
212
|
+
lo: number;
|
|
213
|
+
/** Exclusive upper edge (or inclusive if it's the last bin). */
|
|
214
|
+
hi: number;
|
|
215
|
+
/** Number of pairs whose delta lands in this bin. */
|
|
216
|
+
count: number;
|
|
217
|
+
}
|
|
218
|
+
interface GainDistributionFigureSpec {
|
|
219
|
+
kind: 'gain-distribution';
|
|
220
|
+
candidateId: string;
|
|
221
|
+
comparator: string;
|
|
222
|
+
split: 'search' | 'holdout';
|
|
223
|
+
/** Number of pairs used. */
|
|
224
|
+
n: number;
|
|
225
|
+
bins: GainDistributionBin[];
|
|
226
|
+
median: number;
|
|
227
|
+
ci: {
|
|
228
|
+
low: number;
|
|
229
|
+
high: number;
|
|
230
|
+
};
|
|
231
|
+
}
|
|
232
|
+
interface GainDistributionOptions {
|
|
233
|
+
/** Number of histogram bins. Default 11 (so the centre is exact at 0). */
|
|
234
|
+
bins?: number;
|
|
235
|
+
/** Which split to use. Default 'holdout'. */
|
|
236
|
+
split?: 'search' | 'holdout';
|
|
237
|
+
/** Confidence level for the CI. Default 0.95. */
|
|
238
|
+
confidence?: number;
|
|
239
|
+
/** Bootstrap resamples. Default 2000. */
|
|
240
|
+
resamples?: number;
|
|
241
|
+
/** Deterministic seed. */
|
|
242
|
+
seed?: number;
|
|
243
|
+
}
|
|
244
|
+
/**
|
|
245
|
+
* Held-out improvement distribution: per-pair delta (candidate −
|
|
246
|
+
* comparator), histogrammed. Includes the bootstrap CI on the median
|
|
247
|
+
* delta — same primitive the promotion gate uses.
|
|
248
|
+
*/
|
|
249
|
+
declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
|
|
250
|
+
type ResearchReportDecision = 'promote' | 'hold' | 'reject' | 'equivalent' | 'needs_more_data';
|
|
251
|
+
/**
|
|
252
|
+
* Hard floor below which a paired comparison is treated as uninformative
|
|
253
|
+
* regardless of `minPairs`. Mirrors the lower limit on Wilcoxon signed-rank
|
|
254
|
+
* exact tables; below this the test has no power to separate effect sizes.
|
|
255
|
+
*/
|
|
256
|
+
declare const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
|
|
257
|
+
interface ResearchReportOptions {
|
|
258
|
+
/** Human-readable report title. */
|
|
259
|
+
title?: string;
|
|
260
|
+
/** Comparator candidate id. Required for statistical decision guidance. */
|
|
261
|
+
comparator?: string;
|
|
262
|
+
/** Which split to use for the primary decision. Default 'holdout'. */
|
|
263
|
+
split?: 'search' | 'holdout';
|
|
264
|
+
/** Confidence level used by lower-level report helpers. Default 0.95. */
|
|
265
|
+
confidence?: number;
|
|
266
|
+
/** FDR threshold for q-values. Default 0.05. */
|
|
267
|
+
fdr?: number;
|
|
268
|
+
/**
|
|
269
|
+
* Soft floor on paired observations before issuing a directional
|
|
270
|
+
* promote / reject. Below this we report `needs_more_data` and surface the
|
|
271
|
+
* minimum detectable effect at the current N. Default 20 — chosen so the
|
|
272
|
+
* Wilcoxon signed-rank approximation is reasonable and so the paired
|
|
273
|
+
* bootstrap CI has non-degenerate coverage. Hard floor is enforced at
|
|
274
|
+
* `RESEARCH_REPORT_HARD_PAIR_FLOOR` (6) regardless of this value.
|
|
275
|
+
*/
|
|
276
|
+
minPairs?: number;
|
|
277
|
+
/**
|
|
278
|
+
* Region of Practical Equivalence on the paired delta. When a candidate's
|
|
279
|
+
* paired-delta CI is fully contained in `[low, high]`, the decision is
|
|
280
|
+
* `equivalent` rather than `hold`. Sourced from the domain owner — there is
|
|
281
|
+
* no statistically-defensible default.
|
|
282
|
+
*/
|
|
283
|
+
rope?: {
|
|
284
|
+
low: number;
|
|
285
|
+
high: number;
|
|
286
|
+
};
|
|
287
|
+
/**
|
|
288
|
+
* Power for the minimum detectable effect (MDE) reported on each candidate.
|
|
289
|
+
* Default 0.8.
|
|
290
|
+
*/
|
|
291
|
+
mdePower?: number;
|
|
292
|
+
/**
|
|
293
|
+
* Two-sided alpha for the MDE. Default matches `fdr` so the reported MDE
|
|
294
|
+
* lines up with the test the report actually runs.
|
|
295
|
+
*/
|
|
296
|
+
mdeAlpha?: number;
|
|
297
|
+
/** Optional held-out gate decisions keyed by candidate id. */
|
|
298
|
+
gateDecisions?: Record<string, GateDecision>;
|
|
299
|
+
/** Optional failure clusters from failureClusterView. */
|
|
300
|
+
failureClusters?: FailureClusterReport;
|
|
301
|
+
/** Build gain histograms for these candidates. Defaults to all non-comparator candidates. */
|
|
302
|
+
candidateIds?: string[];
|
|
303
|
+
/** Deterministic bootstrap seed passed to gainHistogram and the posterior helper. */
|
|
304
|
+
seed?: number;
|
|
305
|
+
/** Report timestamp. Defaults to current time. */
|
|
306
|
+
generatedAt?: string;
|
|
307
|
+
/**
|
|
308
|
+
* Hash of a preregistered protocol (e.g. `signManifest({...}).contentHash`).
|
|
309
|
+
* Embedded verbatim in the report so the analysis can be cited as the
|
|
310
|
+
* preregistered one rather than a post-hoc fishing expedition.
|
|
311
|
+
*/
|
|
312
|
+
preregistrationHash?: string;
|
|
313
|
+
}
|
|
314
|
+
interface ResearchReportRecommendation {
|
|
315
|
+
decision: ResearchReportDecision;
|
|
316
|
+
candidateId: string | null;
|
|
317
|
+
rationale: string[];
|
|
318
|
+
risks: string[];
|
|
319
|
+
nextActions: string[];
|
|
320
|
+
}
|
|
321
|
+
interface ResearchReportCandidate {
|
|
322
|
+
candidateId: string;
|
|
323
|
+
n: number;
|
|
324
|
+
mean: number;
|
|
325
|
+
ciLow: number;
|
|
326
|
+
ciHigh: number;
|
|
327
|
+
qValue: number;
|
|
328
|
+
cohensD: number;
|
|
329
|
+
meanDeltaVsComparator: number | null;
|
|
330
|
+
pairedN: number;
|
|
331
|
+
medianGain: number | null;
|
|
332
|
+
meanGain: number | null;
|
|
333
|
+
gainCi: {
|
|
334
|
+
low: number;
|
|
335
|
+
high: number;
|
|
336
|
+
} | null;
|
|
337
|
+
/**
|
|
338
|
+
* Bayesian-bootstrap-style posterior summaries on the paired delta. Computed
|
|
339
|
+
* from the same resamples that produce the gain CI; interpretable as
|
|
340
|
+
* "fraction of resamples in which the candidate beats the comparator on
|
|
341
|
+
* matched pairs."
|
|
342
|
+
*/
|
|
343
|
+
prGreaterThanZero: number | null;
|
|
344
|
+
prInRope: number | null;
|
|
345
|
+
/**
|
|
346
|
+
* Minimum detectable effect (in score units) at the candidate's paired N,
|
|
347
|
+
* the configured power, and the configured alpha. Standardised by the
|
|
348
|
+
* observed paired-delta SD and inverted via `requiredSampleSize`. Reported
|
|
349
|
+
* for every candidate so a `needs_more_data` verdict is actionable.
|
|
350
|
+
*/
|
|
351
|
+
mde: number | null;
|
|
352
|
+
onParetoFrontier: boolean;
|
|
353
|
+
gate?: ParetoPoint['gate'];
|
|
354
|
+
decision: ResearchReportDecision;
|
|
355
|
+
decisionReason: string;
|
|
356
|
+
}
|
|
357
|
+
interface ResearchReportMethodology {
|
|
358
|
+
/**
|
|
359
|
+
* Plain-language assumptions the report depends on. Read these first when
|
|
360
|
+
* deciding whether the verdict is load-bearing for a launch decision.
|
|
361
|
+
*/
|
|
362
|
+
assumptions: string[];
|
|
363
|
+
/** Tests and estimators the verdict was computed from. */
|
|
364
|
+
methods: string[];
|
|
365
|
+
/** Alternatives the author considered and why this report didn't take them. */
|
|
366
|
+
alternatives: string[];
|
|
367
|
+
/** Failure modes — when this report should NOT drive a decision. */
|
|
368
|
+
whenNotToApply: string[];
|
|
369
|
+
/** Citations for the methodological choices above. */
|
|
370
|
+
citations: string[];
|
|
371
|
+
}
|
|
372
|
+
interface ResearchReport {
|
|
373
|
+
kind: 'agent-eval-research-report';
|
|
374
|
+
title: string;
|
|
375
|
+
generatedAt: string;
|
|
376
|
+
split: 'search' | 'holdout';
|
|
377
|
+
comparator: string | null;
|
|
378
|
+
/**
|
|
379
|
+
* SHA-256 over the canonicalised set of `(runId, candidateId, split)` triples
|
|
380
|
+
* the report was computed from, plus the comparator and split. Stable across
|
|
381
|
+
* key insertion order; recomputable by the reader to verify provenance.
|
|
382
|
+
*/
|
|
383
|
+
runFingerprint: string;
|
|
384
|
+
preregistrationHash: string | null;
|
|
385
|
+
rope: {
|
|
386
|
+
low: number;
|
|
387
|
+
high: number;
|
|
388
|
+
} | null;
|
|
389
|
+
executiveSummary: string[];
|
|
390
|
+
recommendation: ResearchReportRecommendation;
|
|
391
|
+
candidates: ResearchReportCandidate[];
|
|
392
|
+
summary: SummaryTable;
|
|
393
|
+
charts: {
|
|
394
|
+
pareto: ParetoFigureSpec;
|
|
395
|
+
gains: GainDistributionFigureSpec[];
|
|
396
|
+
};
|
|
397
|
+
methodology: ResearchReportMethodology;
|
|
398
|
+
failureClusters?: FailureClusterReport;
|
|
399
|
+
markdown: string;
|
|
400
|
+
html: string;
|
|
401
|
+
}
|
|
402
|
+
/**
|
|
403
|
+
* Executive research report for CPO / AI-lead / launch-review consumption.
|
|
404
|
+
*
|
|
405
|
+
* Composes:
|
|
406
|
+
* - `summaryTable` marginal stats with BH-FDR-adjusted q-values
|
|
407
|
+
* - `paretoChart` cost-vs-quality frontier with gate overlay
|
|
408
|
+
* - `gainHistogram` per-candidate paired-delta distribution
|
|
409
|
+
* - paired posterior (this file): bootstrap CI on median, Pr(Δ>0),
|
|
410
|
+
* Pr(Δ∈ROPE), MDE at the configured power
|
|
411
|
+
*
|
|
412
|
+
* Decisions are made on paired evidence — never on marginal means alone —
|
|
413
|
+
* and respect any held-out gate decision the caller passes through. The
|
|
414
|
+
* report embeds a SHA-256 fingerprint of the input run set and, optionally,
|
|
415
|
+
* the hash of a preregistered protocol so a downstream reader can verify
|
|
416
|
+
* provenance and that the analysis was the preregistered one.
|
|
417
|
+
*
|
|
418
|
+
* Async because the fingerprint uses Web Crypto via `hashJson`; deterministic
|
|
419
|
+
* for any fixed `runs`, `seed`, ROPE, and `generatedAt` (which defaults to the current time).
|
|
420
|
+
*/
|
|
421
|
+
declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
|
|
422
|
+
|
|
423
|
+
/**
|
|
424
|
+
* Always-valid sequential evaluation.
|
|
425
|
+
*
|
|
426
|
+
* `researchReport` assumes a single pre-specified analysis. Real
|
|
427
|
+
* consumers run campaigns weekly / nightly / per-PR; each new run silently
|
|
428
|
+
* inflates the false-discovery rate, because the BH-FDR guarantee is for
|
|
429
|
+
* the *first* look, not the 47th. Without time-uniform inference,
|
|
430
|
+
* launch-decision teams either (a) don't peek, which forfeits the cost
|
|
431
|
+
* advantage of stop-when-decisive, or (b) peek and pretend they didn't,
|
|
432
|
+
* which forfeits scientific validity.
|
|
433
|
+
*
|
|
434
|
+
* This module ships **e-value-based confidence sequences** for paired
|
|
435
|
+
* bounded outcomes. The methodology is the predictable plug-in betting
|
|
436
|
+
* martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
|
|
437
|
+
* stopping time. Concretely:
|
|
438
|
+
*
|
|
439
|
+
* For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
|
|
440
|
+
* a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
|
|
441
|
+
* plug-in), and the running e-value is
|
|
442
|
+
*
|
|
443
|
+
* E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
|
|
444
|
+
*
|
|
445
|
+
* E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
|
|
446
|
+
* Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
|
|
447
|
+
* at any time without inflating the type-I error.
|
|
448
|
+
*
|
|
449
|
+
* Combined with `runEvalCampaign`, every consumer running rolling
|
|
450
|
+
* campaigns gains the ability to ship the moment evidence is decisive,
|
|
451
|
+
* stop-early on dead-on-arrival variants, and accumulate evidence across
|
|
452
|
+
* partial runs without spending the FDR budget. No new sweep is wasted.
|
|
453
|
+
*
|
|
454
|
+
* References:
|
|
455
|
+
* - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
456
|
+
* Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
457
|
+
* Annals of Statistics, 49(2), 1055–1080.
|
|
458
|
+
* - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
|
|
459
|
+
* random variables by betting. JRSS B, 86(1), 1–27.
|
|
460
|
+
*/
|
|
461
|
+
type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
|
|
462
|
+
interface PairedEvalueOptions {
|
|
463
|
+
/**
|
|
464
|
+
* Bound on |delta|. Default 1 (matching most score scales). Must satisfy
|
|
465
|
+
* c > 0; deltas outside [-c, c] are clipped, with the `clipped` flag set on
|
|
466
|
+
* the return value.
|
|
467
|
+
*/
|
|
468
|
+
bound?: number;
|
|
469
|
+
/** Target Type-I error. Default 0.05. */
|
|
470
|
+
alpha?: number;
|
|
471
|
+
/**
|
|
472
|
+
* Region of Practical Equivalence on the *mean* paired delta. When
|
|
473
|
+
* supplied, the verdict can return `'equivalent'` once the running
|
|
474
|
+
* confidence sequence on the mean is fully contained in [low, high].
|
|
475
|
+
*/
|
|
476
|
+
rope?: {
|
|
477
|
+
low: number;
|
|
478
|
+
high: number;
|
|
479
|
+
};
|
|
480
|
+
/** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
|
|
481
|
+
initialBetShrinkage?: number;
|
|
482
|
+
}
|
|
483
|
+
interface PairedEvalueStep {
|
|
484
|
+
/** 1-indexed observation count. */
|
|
485
|
+
t: number;
|
|
486
|
+
delta: number;
|
|
487
|
+
/** Running e-value E_t = ∏ (1 + λ_i · D_i). */
|
|
488
|
+
evalue: number;
|
|
489
|
+
/** Time-uniform p-value at stopping time t. */
|
|
490
|
+
pValue: number;
|
|
491
|
+
/** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
|
|
492
|
+
csLow: number;
|
|
493
|
+
csHigh: number;
|
|
494
|
+
/** Verdict at this stopping time. */
|
|
495
|
+
decision: SequentialDecision;
|
|
496
|
+
}
|
|
497
|
+
interface PairedEvalueSequence {
|
|
498
|
+
steps: PairedEvalueStep[];
|
|
499
|
+
/** The decision at the final step. */
|
|
500
|
+
finalDecision: SequentialDecision;
|
|
501
|
+
/** Index (1-based) at which a non-`continue` decision first fired, or null. */
|
|
502
|
+
decisionFiredAt: number | null;
|
|
503
|
+
/** True if any deltas were clipped to [-bound, bound]. */
|
|
504
|
+
clipped: boolean;
|
|
505
|
+
}
|
|
506
|
+
/**
|
|
507
|
+
* Run the paired e-value sequence over an in-order delta stream.
|
|
508
|
+
*
|
|
509
|
+
* Use for *streaming* / interim analyses: pass the deltas you have so
|
|
510
|
+
* far, get the verdict at every prefix length. The decision is
|
|
511
|
+
* monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
|
|
512
|
+
* fires, the verdict at later steps remains decisive (the e-value is a
|
|
513
|
+
* non-negative martingale; once it crosses the threshold, it's crossed).
|
|
514
|
+
*/
|
|
515
|
+
declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
|
|
516
|
+
interface InterimReleaseConfidenceInput {
|
|
517
|
+
/**
|
|
518
|
+
* One delta series per candidate (paired deltas vs comparator). Order
|
|
519
|
+
* within a series is the order the campaigns were run.
|
|
520
|
+
*/
|
|
521
|
+
deltaSeries: Array<{
|
|
522
|
+
candidateId: string;
|
|
523
|
+
deltas: number[];
|
|
524
|
+
}>;
|
|
525
|
+
alpha?: number;
|
|
526
|
+
bound?: number;
|
|
527
|
+
rope?: {
|
|
528
|
+
low: number;
|
|
529
|
+
high: number;
|
|
530
|
+
};
|
|
531
|
+
}
|
|
532
|
+
interface InterimReleaseConfidence {
|
|
533
|
+
candidates: Array<{
|
|
534
|
+
candidateId: string;
|
|
535
|
+
decision: SequentialDecision;
|
|
536
|
+
decisionFiredAt: number | null;
|
|
537
|
+
finalEvalue: number;
|
|
538
|
+
finalPValue: number;
|
|
539
|
+
pairs: number;
|
|
540
|
+
csLow: number;
|
|
541
|
+
csHigh: number;
|
|
542
|
+
}>;
|
|
543
|
+
/**
|
|
544
|
+
* Campaign-level recommendation: pick the strongest 'promote_now', else
|
|
545
|
+
* 'continue' if any candidate is still live, else 'reject_now' if every
|
|
546
|
+
* candidate is dead, else 'equivalent'.
|
|
547
|
+
*/
|
|
548
|
+
recommendation: {
|
|
549
|
+
decision: SequentialDecision;
|
|
550
|
+
candidateId: string | null;
|
|
551
|
+
};
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Run interim sequential analyses across many candidates at once,
|
|
555
|
+
* preserving the time-uniform α guarantee for each candidate's series and
|
|
556
|
+
* synthesising a campaign-level recommendation. Designed to be called on
|
|
557
|
+
* every campaign tick — the recommendation is anytime-valid.
|
|
558
|
+
*/
|
|
559
|
+
declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
|
|
560
|
+
|
|
561
|
+
export { type GainDistributionBin as G, HeldOutGate as H, type InterimReleaseConfidence as I, type PairedEvalueOptions as P, RESEARCH_REPORT_HARD_PAIR_FLOOR as R, type SequentialDecision as S, type GainDistributionFigureSpec as a, type GainDistributionOptions as b, type InterimReleaseConfidenceInput as c, type PairedEvalueSequence as d, type PairedEvalueStep as e, type ParetoFigureSpec as f, type ParetoPoint as g, type ResearchReport as h, type ResearchReportCandidate as i, type ResearchReportDecision as j, type ResearchReportMethodology as k, type ResearchReportOptions as l, type ResearchReportRecommendation as m, type SummaryTable as n, type SummaryTableOptions as o, type SummaryTableRow as p, evaluateInterimReleaseConfidence as q, gainHistogram as r, pairedEvalueSequence as s, paretoChart as t, researchReport as u, summaryTable as v, type GateDecision as w, type GateEvidence as x, type HeldOutGateConfig as y, type HeldOutGateRejectionCode as z };
|
package/dist/traces.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
|
|
2
|
-
import { R as RawProviderSink,
|
|
3
|
-
export { F as FileSystemRawProviderSink,
|
|
2
|
+
import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
|
|
3
|
+
export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
|
|
4
4
|
import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
|
|
5
5
|
export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
|
|
6
|
+
export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
|
|
6
7
|
import { T as TraceStore } from './store-Db2Bv8Cf.js';
|
|
7
8
|
export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
|
|
8
9
|
export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
|
package/dist/traces.js
CHANGED
|
@@ -63,6 +63,11 @@ import {
|
|
|
63
63
|
assertRunCaptured,
|
|
64
64
|
throwIfRunIncomplete
|
|
65
65
|
} from "./chunk-UBPIXOC4.js";
|
|
66
|
+
import {
|
|
67
|
+
TraceEmitter,
|
|
68
|
+
llmSpanFromProvider
|
|
69
|
+
} from "./chunk-TVVP3ZZQ.js";
|
|
70
|
+
import "./chunk-VSMTAMNK.js";
|
|
66
71
|
import {
|
|
67
72
|
FileSystemRawProviderSink,
|
|
68
73
|
InMemoryRawProviderSink,
|
|
@@ -70,11 +75,6 @@ import {
|
|
|
70
75
|
defaultProviderRedactor,
|
|
71
76
|
providerFromBaseUrl
|
|
72
77
|
} from "./chunk-PC4UYEBM.js";
|
|
73
|
-
import {
|
|
74
|
-
TraceEmitter,
|
|
75
|
-
llmSpanFromProvider
|
|
76
|
-
} from "./chunk-TVVP3ZZQ.js";
|
|
77
|
-
import "./chunk-VSMTAMNK.js";
|
|
78
78
|
import "./chunk-QYJT52YW.js";
|
|
79
79
|
import "./chunk-PZ5AY32C.js";
|
|
80
80
|
export {
|