@tangle-network/agent-eval 0.22.0 → 0.23.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +156 -0
- package/README.md +13 -3
- package/dist/benchmarks/index.d.ts +2 -2
- package/dist/{chunk-UAND2LOT.js → chunk-7EAUOUQS.js} +4 -247
- package/dist/chunk-7EAUOUQS.js.map +1 -0
- package/dist/chunk-AXHNWLIX.js +246 -0
- package/dist/chunk-AXHNWLIX.js.map +1 -0
- package/dist/chunk-EXGR4XEM.js +283 -0
- package/dist/chunk-EXGR4XEM.js.map +1 -0
- package/dist/chunk-LZKIOBG2.js +2026 -0
- package/dist/chunk-LZKIOBG2.js.map +1 -0
- package/dist/{chunk-YUFXO3TU.js → chunk-QBW3YBTR.js} +1 -1
- package/dist/chunk-QBW3YBTR.js.map +1 -0
- package/dist/{chunk-ARZ6BEV6.js → chunk-V5QSWN7L.js} +2 -2
- package/dist/{chunk-USHQBPMH.js → chunk-VQQSPGSM.js} +7 -283
- package/dist/chunk-VQQSPGSM.js.map +1 -0
- package/dist/{chunk-4W4NCYM2.js → chunk-XPHOZPOM.js} +4 -2
- package/dist/chunk-XPHOZPOM.js.map +1 -0
- package/dist/{control-cxwMOAsy.d.ts → control-DvkH87qJ.d.ts} +2 -2
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{optimization-UVDNKaO6.d.ts → eval-campaign-Ds5QljIh.d.ts} +4 -5
- package/dist/{feedback-trajectory-CB0A32o3.d.ts → feedback-trajectory-c43WGtTX.d.ts} +1 -1
- package/dist/{index-c5saLbKD.d.ts → index-DDTlbHEK.d.ts} +1 -1
- package/dist/index-ekBXweiQ.d.ts +1894 -0
- package/dist/index.d.ts +18 -154
- package/dist/index.js +126 -26
- package/dist/index.js.map +1 -1
- package/dist/{integrity-K2oVlF57.d.ts → integrity-Cr5YodSY.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +7 -5
- package/dist/reporting.d.ts +294 -4
- package/dist/reporting.js +6 -4
- package/dist/rl.d.ts +8 -0
- package/dist/rl.js +113 -0
- package/dist/rl.js.map +1 -0
- package/dist/{run-record-CX_jcAyr.d.ts → run-record-DNiOMBrZ.d.ts} +10 -1
- package/dist/sequential-DgU2mFsE.d.ts +304 -0
- package/dist/{summary-report-D4p7RlDu.d.ts → summary-report-Ce1r4EYo.d.ts} +2 -2
- package/dist/traces.d.ts +2 -2
- package/dist/traces.js +6 -6
- package/docs/auto-research-loop-end-to-end.md +186 -0
- package/docs/three-package-architecture.md +180 -0
- package/package.json +22 -10
- package/dist/chunk-4W4NCYM2.js.map +0 -1
- package/dist/chunk-UAND2LOT.js.map +0 -1
- package/dist/chunk-USHQBPMH.js.map +0 -1
- package/dist/chunk-YUFXO3TU.js.map +0 -1
- package/dist/reporting-B82RSv9C.d.ts +0 -593
- /package/dist/{chunk-ARZ6BEV6.js.map → chunk-V5QSWN7L.js.map} +0 -0
|
@@ -1,593 +0,0 @@
|
|
|
1
|
-
import { D as DatasetSplit, c as DatasetManifest, a as DatasetScenario } from './dataset-B9qvlm_o.js';
|
|
2
|
-
import { k as GateDecision, A as ActionableSideInfo, C as MultiShotTrialResult } from './summary-report-D4p7RlDu.js';
|
|
3
|
-
import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
|
|
4
|
-
|
|
5
|
-
/**
|
|
6
|
-
* Release confidence gate.
|
|
7
|
-
*
|
|
8
|
-
* This is the production-facing composition layer over the lower-level
|
|
9
|
-
* primitives:
|
|
10
|
-
* - Dataset manifests prove corpus/version coverage.
|
|
11
|
-
* - RunRecord rows prove reproducible search/holdout outcomes.
|
|
12
|
-
* - Multi-shot trace evidence carries turn counts and ASI diagnostics.
|
|
13
|
-
* - HeldOutGate decisions remain the paired promotion authority.
|
|
14
|
-
*
|
|
15
|
-
* The gate is intentionally pure and conservative. Missing declared evidence
|
|
16
|
-
* fails closed instead of being treated as a neutral zero.
|
|
17
|
-
*/
|
|
18
|
-
|
|
19
|
-
type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
|
|
20
|
-
type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
|
|
21
|
-
interface ReleaseTraceEvidence {
|
|
22
|
-
scenarioId: string;
|
|
23
|
-
candidateId?: string;
|
|
24
|
-
split?: RunSplitTag;
|
|
25
|
-
score?: number;
|
|
26
|
-
ok?: boolean;
|
|
27
|
-
turnCount?: number;
|
|
28
|
-
costUsd?: number;
|
|
29
|
-
durationMs?: number;
|
|
30
|
-
failureMode?: string;
|
|
31
|
-
asi?: ActionableSideInfo[];
|
|
32
|
-
metadata?: Record<string, unknown>;
|
|
33
|
-
}
|
|
34
|
-
interface ReleaseConfidenceThresholds {
|
|
35
|
-
/** Require a Dataset manifest or explicit scenarios. Default true. */
|
|
36
|
-
requireCorpus?: boolean;
|
|
37
|
-
minScenarioCount?: number;
|
|
38
|
-
minSearchRuns?: number;
|
|
39
|
-
minHoldoutRuns?: number;
|
|
40
|
-
/** Require at least one holdout scenario/run. Default true. */
|
|
41
|
-
requireHoldout?: boolean;
|
|
42
|
-
minPassRate?: number;
|
|
43
|
-
minMeanScore?: number;
|
|
44
|
-
/** Search mean may exceed holdout mean by at most this much. */
|
|
45
|
-
maxOverfitGap?: number;
|
|
46
|
-
maxMeanCostUsd?: number;
|
|
47
|
-
maxP95WallMs?: number;
|
|
48
|
-
/** Low-score/failed rows must carry ASI. Default true. */
|
|
49
|
-
requireAsiForFailures?: boolean;
|
|
50
|
-
/** Score below this is considered a failure for ASI coverage. Default 0.5. */
|
|
51
|
-
failureScoreThreshold?: number;
|
|
52
|
-
}
|
|
53
|
-
interface ReleaseConfidenceInput {
|
|
54
|
-
target: string;
|
|
55
|
-
candidateId?: string;
|
|
56
|
-
baselineId?: string;
|
|
57
|
-
dataset?: DatasetManifest;
|
|
58
|
-
scenarios?: readonly DatasetScenario[];
|
|
59
|
-
runs?: readonly RunRecord[];
|
|
60
|
-
traces?: readonly ReleaseTraceEvidence[];
|
|
61
|
-
gateDecision?: GateDecision | null;
|
|
62
|
-
thresholds?: ReleaseConfidenceThresholds;
|
|
63
|
-
}
|
|
64
|
-
interface ReleaseConfidenceAxis {
|
|
65
|
-
name: ReleaseConfidenceAxisName;
|
|
66
|
-
status: ReleaseConfidenceStatus;
|
|
67
|
-
score: number;
|
|
68
|
-
detail: string;
|
|
69
|
-
}
|
|
70
|
-
interface ReleaseConfidenceIssue {
|
|
71
|
-
axis: ReleaseConfidenceAxisName;
|
|
72
|
-
severity: 'critical' | 'warning';
|
|
73
|
-
code: string;
|
|
74
|
-
detail: string;
|
|
75
|
-
}
|
|
76
|
-
interface ReleaseConfidenceMetrics {
|
|
77
|
-
scenarioCount: number;
|
|
78
|
-
searchRuns: number;
|
|
79
|
-
holdoutRuns: number;
|
|
80
|
-
passRate: number;
|
|
81
|
-
meanScore: number;
|
|
82
|
-
searchMeanScore: number;
|
|
83
|
-
holdoutMeanScore: number;
|
|
84
|
-
overfitGap: number;
|
|
85
|
-
meanCostUsd: number;
|
|
86
|
-
p95WallMs: number;
|
|
87
|
-
failedRows: number;
|
|
88
|
-
failuresWithAsi: number;
|
|
89
|
-
singleShotTraces: number;
|
|
90
|
-
multiShotTraces: number;
|
|
91
|
-
splitCounts: Record<DatasetSplit, number>;
|
|
92
|
-
domainCounts: Record<string, number>;
|
|
93
|
-
failureModeCounts: Record<string, number>;
|
|
94
|
-
responsibleSurfaceCounts: Record<string, number>;
|
|
95
|
-
}
|
|
96
|
-
interface ReleaseConfidenceScorecard {
|
|
97
|
-
target: string;
|
|
98
|
-
candidateId: string | null;
|
|
99
|
-
baselineId: string | null;
|
|
100
|
-
status: ReleaseConfidenceStatus;
|
|
101
|
-
promote: boolean;
|
|
102
|
-
axes: ReleaseConfidenceAxis[];
|
|
103
|
-
issues: ReleaseConfidenceIssue[];
|
|
104
|
-
metrics: ReleaseConfidenceMetrics;
|
|
105
|
-
dataset: DatasetManifest | null;
|
|
106
|
-
gateDecision: GateDecision | null;
|
|
107
|
-
summary: string;
|
|
108
|
-
}
|
|
109
|
-
declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
|
|
110
|
-
declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
111
|
-
declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
|
|
112
|
-
|
|
113
|
-
/**
|
|
114
|
-
* OutcomeStore — deployment outcomes attached to Run IDs.
|
|
115
|
-
*
|
|
116
|
-
* Outcomes arrive asynchronously from production telemetry after the
|
|
117
|
-
* eval run completed: user ratings, retention flags, conversion events,
|
|
118
|
-
* revenue, support-ticket rate, anything a product team can measure.
|
|
119
|
-
* The store is a peer to TraceStore — separate lifecycle, same runId
|
|
120
|
-
* foreign key.
|
|
121
|
-
*
|
|
122
|
-
* The whole point of this module is to make the meta-eval correlation
|
|
123
|
-
* question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
|
|
124
|
-
*/
|
|
125
|
-
interface DeploymentOutcome {
|
|
126
|
-
runId: string;
|
|
127
|
-
capturedAt: number;
|
|
128
|
-
/** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
|
|
129
|
-
metrics: Record<string, number>;
|
|
130
|
-
/** Dimensions for stratified analysis — cohort, region, user_segment. */
|
|
131
|
-
labels?: Record<string, string>;
|
|
132
|
-
/** Free-form provenance (source system, pipeline version). */
|
|
133
|
-
source?: string;
|
|
134
|
-
}
|
|
135
|
-
interface OutcomeFilter {
|
|
136
|
-
runIds?: string[];
|
|
137
|
-
since?: number;
|
|
138
|
-
until?: number;
|
|
139
|
-
label?: {
|
|
140
|
-
key: string;
|
|
141
|
-
value: string;
|
|
142
|
-
};
|
|
143
|
-
source?: string;
|
|
144
|
-
}
|
|
145
|
-
interface OutcomeStore {
|
|
146
|
-
append(outcome: DeploymentOutcome): Promise<void>;
|
|
147
|
-
/** All outcomes attached to this run (a single run can have many — multiple
|
|
148
|
-
* capture windows over deployment time). */
|
|
149
|
-
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
150
|
-
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
151
|
-
}
|
|
152
|
-
declare class InMemoryOutcomeStore implements OutcomeStore {
|
|
153
|
-
private items;
|
|
154
|
-
append(outcome: DeploymentOutcome): Promise<void>;
|
|
155
|
-
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
156
|
-
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
157
|
-
}
|
|
158
|
-
interface FileSystemOutcomeStoreOptions {
|
|
159
|
-
dir: string;
|
|
160
|
-
maxBytes?: number;
|
|
161
|
-
}
|
|
162
|
-
declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
163
|
-
private dir;
|
|
164
|
-
private maxBytes;
|
|
165
|
-
private memo?;
|
|
166
|
-
private loaded;
|
|
167
|
-
constructor(options: FileSystemOutcomeStoreOptions);
|
|
168
|
-
private ensureDir;
|
|
169
|
-
append(outcome: DeploymentOutcome): Promise<void>;
|
|
170
|
-
private load;
|
|
171
|
-
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
172
|
-
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
/**
|
|
176
|
-
* Rubric predictive validity — does our eval rubric predict deployment
|
|
177
|
-
* outcomes?
|
|
178
|
-
*
|
|
179
|
-
* `correlationStudy` (already in this package) joins a `TraceStore` to an
|
|
180
|
-
* `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
|
|
181
|
-
* (eval-metric, outcome-metric) pair. That answers "does X correlate with
|
|
182
|
-
* Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
|
|
183
|
-
* around it: take a sequence of `RunRecord`s (the canonical campaign
|
|
184
|
-
* artifact) and an `OutcomeStore`, join on `runId`, return a
|
|
185
|
-
* ranked verdict on every rubric whose dimension scores were captured in
|
|
186
|
-
* `outcome.raw`.
|
|
187
|
-
*
|
|
188
|
-
* The point — quoting the methodology doc — is that **without this loop
|
|
189
|
-
* every rubric is faith-based**. Once it's wired, you know which rubrics
|
|
190
|
-
* have earned their promotion power and which ones are decoration.
|
|
191
|
-
*
|
|
192
|
-
* const validity = await rubricPredictiveValidity({
|
|
193
|
-
* runs: lastQuarter,
|
|
194
|
-
* outcomes: shipFlagOutcomeStore,
|
|
195
|
-
* outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
|
|
196
|
-
* rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
|
|
197
|
-
* })
|
|
198
|
-
* for (const r of validity.ranked) {
|
|
199
|
-
* console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
|
|
200
|
-
* }
|
|
201
|
-
*
|
|
202
|
-
* The function is intentionally read-only. Use the verdict to deprecate
|
|
203
|
-
* decorative rubrics, re-weight composite scores, or trigger a
|
|
204
|
-
* recalibration sweep when predictive validity drops below a threshold.
|
|
205
|
-
*/
|
|
206
|
-
|
|
207
|
-
interface RubricPredictiveValidityInput {
|
|
208
|
-
/**
|
|
209
|
-
* Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
|
|
210
|
-
* provides the eval score; missing keys are silently skipped per pair.
|
|
211
|
-
*/
|
|
212
|
-
runs: RunRecord[];
|
|
213
|
-
outcomes: OutcomeStore;
|
|
214
|
-
/**
|
|
215
|
-
* Outcome metric names to evaluate against. Each must appear in at
|
|
216
|
-
* least one `DeploymentOutcome.metrics` keyspace; pairs with too few
|
|
217
|
-
* joined samples are excluded from the result.
|
|
218
|
-
*/
|
|
219
|
-
outcomeMetrics: string[];
|
|
220
|
-
/**
|
|
221
|
-
* Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
|
|
222
|
-
* If omitted, every numeric key in `outcome.raw` across the run set is
|
|
223
|
-
* treated as a rubric.
|
|
224
|
-
*/
|
|
225
|
-
rubrics?: string[];
|
|
226
|
-
/** Minimum joined-sample count before a pair is reported. Default 8. */
|
|
227
|
-
minSamples?: number;
|
|
228
|
-
/** Bootstrap resamples for CI. Default 500. */
|
|
229
|
-
bootstrapResamples?: number;
|
|
230
|
-
/** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
|
|
231
|
-
seed?: number;
|
|
232
|
-
/**
|
|
233
|
-
* Reduction when multiple outcomes attach to one runId. Default `'latest'`
|
|
234
|
-
* (most recently captured).
|
|
235
|
-
*/
|
|
236
|
-
reduction?: 'latest' | 'mean' | 'max';
|
|
237
|
-
}
|
|
238
|
-
interface RubricOutcomePair {
|
|
239
|
-
rubric: string;
|
|
240
|
-
outcome: string;
|
|
241
|
-
n: number;
|
|
242
|
-
pearson: number;
|
|
243
|
-
spearman: number;
|
|
244
|
-
ci95: {
|
|
245
|
-
low: number;
|
|
246
|
-
high: number;
|
|
247
|
-
};
|
|
248
|
-
/**
|
|
249
|
-
* Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
|
|
250
|
-
* `decorative` < 0.4 in absolute correlation. A negative correlation
|
|
251
|
-
* with a desired outcome is also `decorative` — actively misleading
|
|
252
|
-
* is worse than uninformative.
|
|
253
|
-
*/
|
|
254
|
-
verdict: 'load_bearing' | 'informative' | 'decorative';
|
|
255
|
-
}
|
|
256
|
-
interface RubricRanking {
|
|
257
|
-
rubric: string;
|
|
258
|
-
/** Outcome metric this rubric correlated best with. */
|
|
259
|
-
bestOutcome: string;
|
|
260
|
-
spearman: number;
|
|
261
|
-
pearson: number;
|
|
262
|
-
n: number;
|
|
263
|
-
verdict: RubricOutcomePair['verdict'];
|
|
264
|
-
}
|
|
265
|
-
interface RubricPredictiveValidityReport {
|
|
266
|
-
pairs: RubricOutcomePair[];
|
|
267
|
-
/** Per-rubric best pair, sorted descending by |spearman|. */
|
|
268
|
-
ranked: RubricRanking[];
|
|
269
|
-
joinedSamples: number;
|
|
270
|
-
skippedRuns: number;
|
|
271
|
-
/** Rubrics that were declared but never produced a usable score. */
|
|
272
|
-
rubricsWithoutData: string[];
|
|
273
|
-
}
|
|
274
|
-
declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
|
|
275
|
-
|
|
276
|
-
/**
|
|
277
|
-
* Paper-grade paired statistics for held-out promotion gates.
|
|
278
|
-
*
|
|
279
|
-
* The promotion gate (`HeldOutGate`) needs three things:
|
|
280
|
-
*
|
|
281
|
-
* 1. A bootstrap confidence interval on the per-item paired delta
|
|
282
|
-
* (`pairedBootstrap`). Median delta is the headline number; the
|
|
283
|
-
* CI lower bound is what the gate checks against `pairedDeltaThreshold`.
|
|
284
|
-
* 2. A non-parametric significance test on the paired deltas
|
|
285
|
-
* (`pairedWilcoxon` — re-export of `wilcoxonSignedRank` under the
|
|
286
|
-
* paper-style name).
|
|
287
|
-
* 3. False-discovery-rate correction across simultaneously-tested
|
|
288
|
-
* candidate variants (`bhAdjust` — re-export of `benjaminiHochberg`).
|
|
289
|
-
*
|
|
290
|
-
* Why a separate file: every existing primitive lives in `statistics.ts`
|
|
291
|
-
* (general) or `power-analysis.ts` (correction). Paired-bootstrap is
|
|
292
|
-
* paired-only, paper-grade, and load-bearing for the promotion gate.
|
|
293
|
-
* Putting it next to `statistics.ts` would require editing that file;
|
|
294
|
-
* the brief forbids that. New file, new exports, no surface change.
|
|
295
|
-
*/
|
|
296
|
-
interface PairedBootstrapResult {
|
|
297
|
-
/** Number of paired observations (unequal-length inputs are rejected, not dropped). */
|
|
298
|
-
n: number;
|
|
299
|
-
/** Median of paired deltas (after − before). */
|
|
300
|
-
median: number;
|
|
301
|
-
/** Mean of paired deltas. */
|
|
302
|
-
mean: number;
|
|
303
|
-
/** Lower bound of the bootstrap CI on the median delta. */
|
|
304
|
-
low: number;
|
|
305
|
-
/** Upper bound of the bootstrap CI on the median delta. */
|
|
306
|
-
high: number;
|
|
307
|
-
/** Confidence level used (e.g. 0.95). */
|
|
308
|
-
confidence: number;
|
|
309
|
-
/** Number of bootstrap resamples used. */
|
|
310
|
-
resamples: number;
|
|
311
|
-
}
|
|
312
|
-
interface PairedBootstrapOptions {
|
|
313
|
-
/** Confidence level. Default 0.95. */
|
|
314
|
-
confidence?: number;
|
|
315
|
-
/** Bootstrap resample count. Default 2000. */
|
|
316
|
-
resamples?: number;
|
|
317
|
-
/** Statistic to bootstrap. Default 'median'. */
|
|
318
|
-
statistic?: 'median' | 'mean';
|
|
319
|
-
/** Deterministic seed. If omitted, uses Math.random(). */
|
|
320
|
-
seed?: number;
|
|
321
|
-
}
|
|
322
|
-
/**
|
|
323
|
-
* Paired bootstrap on (after - before) deltas. Returns a CI on the
|
|
324
|
-
* chosen statistic (median by default). Pairs are resampled with
|
|
325
|
-
* replacement. The lower bound is what the promotion gate checks: if
|
|
326
|
-
* `low > pairedDeltaThreshold`, the gain is real at the chosen
|
|
327
|
-
* confidence level.
|
|
328
|
-
*
|
|
329
|
-
* Throws on unequal sample sizes — caller must align pairs upstream.
|
|
330
|
-
*/
|
|
331
|
-
declare function pairedBootstrap(before: number[], after: number[], opts?: PairedBootstrapOptions): PairedBootstrapResult;
|
|
332
|
-
/**
|
|
333
|
-
* Paper-style alias for `wilcoxonSignedRank`. The signed-rank test on
|
|
334
|
-
* paired deltas is the standard non-parametric significance test for
|
|
335
|
-
* "candidate beats baseline on matched items." Use alongside the
|
|
336
|
-
* bootstrap CI: bootstrap gives effect size, Wilcoxon gives p.
|
|
337
|
-
*/
|
|
338
|
-
declare function pairedWilcoxon(before: number[], after: number[]): {
|
|
339
|
-
w: number;
|
|
340
|
-
p: number;
|
|
341
|
-
};
|
|
342
|
-
/**
|
|
343
|
-
* Paper-style alias for `benjaminiHochberg`. Use to correct p-values
|
|
344
|
-
* across multiple candidate-vs-baseline comparisons run in the same
|
|
345
|
-
* promotion sweep. Returns BH-adjusted q-values and significance at
|
|
346
|
-
* the requested FDR (default 0.05).
|
|
347
|
-
*/
|
|
348
|
-
declare function bhAdjust(pValues: number[], fdr?: number): {
|
|
349
|
-
qValues: number[];
|
|
350
|
-
significant: boolean[];
|
|
351
|
-
};
|
|
352
|
-
|
|
353
|
-
/**
|
|
354
|
-
* Always-valid sequential evaluation.
|
|
355
|
-
*
|
|
356
|
-
* `researchReport` (0.21+) assumes a single pre-specified analysis. Real
|
|
357
|
-
* consumers run campaigns weekly / nightly / per-PR; each new run silently
|
|
358
|
-
* inflates the false-discovery rate, because the BH-FDR guarantee was for
|
|
359
|
-
* the *first* look, not the 47th. Without time-uniform inference,
|
|
360
|
-
* launch-decision teams either (a) don't peek, which forfeits the cost
|
|
361
|
-
* advantage of stop-when-decisive, or (b) peek and pretend they didn't,
|
|
362
|
-
* which forfeits scientific validity.
|
|
363
|
-
*
|
|
364
|
-
* This module ships **e-value-based confidence sequences** for paired
|
|
365
|
-
* bounded outcomes. The methodology is the predictable plug-in betting
|
|
366
|
-
* martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
|
|
367
|
-
* stopping time. Concretely:
|
|
368
|
-
*
|
|
369
|
-
* For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
|
|
370
|
-
* a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
|
|
371
|
-
* plug-in), and the running e-value is
|
|
372
|
-
*
|
|
373
|
-
* E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
|
|
374
|
-
*
|
|
375
|
-
* E_t is a non-negative martingale under H_0 with E[E_t] ≤ 1, so by
|
|
376
|
-
* Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
|
|
377
|
-
* at any time without inflating the type-I error.
|
|
378
|
-
*
|
|
379
|
-
* Combined with `runEvalCampaign`, every consumer running rolling
|
|
380
|
-
* campaigns gains the ability to ship the moment evidence is decisive,
|
|
381
|
-
* stop-early on dead-on-arrival variants, and accumulate evidence across
|
|
382
|
-
* partial runs without spending the FDR budget. No new sweep is wasted.
|
|
383
|
-
*
|
|
384
|
-
* References:
|
|
385
|
-
* - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
|
|
386
|
-
* Time-uniform, nonparametric, nonasymptotic confidence sequences.
|
|
387
|
-
* Annals of Statistics, 49(2), 1055–1080.
|
|
388
|
-
* - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
|
|
389
|
-
* random variables by betting. JRSS B, 86(1), 1–27.
|
|
390
|
-
*/
|
|
391
|
-
type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
|
|
392
|
-
interface PairedEvalueOptions {
|
|
393
|
-
/**
|
|
394
|
-
* Bound on |delta|. Default 1 (matching most score scales). Must satisfy
|
|
395
|
-
* c > 0; deltas outside [-c, c] are clipped with a warning attached to
|
|
396
|
-
* the return value.
|
|
397
|
-
*/
|
|
398
|
-
bound?: number;
|
|
399
|
-
/** Target Type-I error. Default 0.05. */
|
|
400
|
-
alpha?: number;
|
|
401
|
-
/**
|
|
402
|
-
* Region of Practical Equivalence on the *mean* paired delta. When
|
|
403
|
-
* supplied, the verdict can return `'equivalent'` once the running
|
|
404
|
-
* confidence sequence on the mean is fully contained in [low, high].
|
|
405
|
-
*/
|
|
406
|
-
rope?: {
|
|
407
|
-
low: number;
|
|
408
|
-
high: number;
|
|
409
|
-
};
|
|
410
|
-
/** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
|
|
411
|
-
initialBetShrinkage?: number;
|
|
412
|
-
}
|
|
413
|
-
interface PairedEvalueStep {
|
|
414
|
-
/** 1-indexed observation count. */
|
|
415
|
-
t: number;
|
|
416
|
-
delta: number;
|
|
417
|
-
/** Running e-value E_t = ∏ (1 + λ_i · D_i). */
|
|
418
|
-
evalue: number;
|
|
419
|
-
/** Time-uniform p-value at stopping time t. */
|
|
420
|
-
pValue: number;
|
|
421
|
-
/** Lower bound of the empirical Bernstein confidence sequence at level 1-α. */
|
|
422
|
-
csLow: number;
|
|
423
|
-
csHigh: number;
|
|
424
|
-
/** Verdict at this stopping time. */
|
|
425
|
-
decision: SequentialDecision;
|
|
426
|
-
}
|
|
427
|
-
interface PairedEvalueSequence {
|
|
428
|
-
steps: PairedEvalueStep[];
|
|
429
|
-
/** The decision at the final step. */
|
|
430
|
-
finalDecision: SequentialDecision;
|
|
431
|
-
/** Index (1-based) at which a non-`continue` decision first fired, or null. */
|
|
432
|
-
decisionFiredAt: number | null;
|
|
433
|
-
/** True if any deltas were clipped to [-bound, bound]. */
|
|
434
|
-
clipped: boolean;
|
|
435
|
-
}
|
|
436
|
-
/**
|
|
437
|
-
* Run the paired e-value sequence over an in-order delta stream.
|
|
438
|
-
*
|
|
439
|
-
* Use for *streaming* / interim analyses: pass the deltas you have so
|
|
440
|
-
* far, get the verdict at every prefix length. The decision is
|
|
441
|
-
* monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
|
|
442
|
-
* fires, the verdict at later steps remains decisive (the e-value is a
|
|
443
|
-
* non-negative martingale; once it crosses the threshold, it's crossed).
|
|
444
|
-
*/
|
|
445
|
-
declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
|
|
446
|
-
interface InterimReleaseConfidenceInput {
|
|
447
|
-
/**
|
|
448
|
-
* One delta series per candidate (paired deltas vs comparator). Order
|
|
449
|
-
* within a series is the order the campaigns were run.
|
|
450
|
-
*/
|
|
451
|
-
deltaSeries: Array<{
|
|
452
|
-
candidateId: string;
|
|
453
|
-
deltas: number[];
|
|
454
|
-
}>;
|
|
455
|
-
alpha?: number;
|
|
456
|
-
bound?: number;
|
|
457
|
-
rope?: {
|
|
458
|
-
low: number;
|
|
459
|
-
high: number;
|
|
460
|
-
};
|
|
461
|
-
}
|
|
462
|
-
interface InterimReleaseConfidence {
|
|
463
|
-
candidates: Array<{
|
|
464
|
-
candidateId: string;
|
|
465
|
-
decision: SequentialDecision;
|
|
466
|
-
decisionFiredAt: number | null;
|
|
467
|
-
finalEvalue: number;
|
|
468
|
-
finalPValue: number;
|
|
469
|
-
pairs: number;
|
|
470
|
-
csLow: number;
|
|
471
|
-
csHigh: number;
|
|
472
|
-
}>;
|
|
473
|
-
/**
|
|
474
|
-
* Campaign-level recommendation: pick the strongest 'promote_now', else
|
|
475
|
-
* 'continue' if any candidate is still live, else 'reject_now' if every
|
|
476
|
-
* candidate is dead, else 'equivalent'.
|
|
477
|
-
*/
|
|
478
|
-
recommendation: {
|
|
479
|
-
decision: SequentialDecision;
|
|
480
|
-
candidateId: string | null;
|
|
481
|
-
};
|
|
482
|
-
}
|
|
483
|
-
/**
|
|
484
|
-
* Run interim sequential analyses across many candidates at once,
|
|
485
|
-
* preserving the time-uniform α guarantee for each candidate's series and
|
|
486
|
-
* synthesising a campaign-level recommendation. Designed to be called on
|
|
487
|
-
* every campaign tick — the recommendation is anytime-valid.
|
|
488
|
-
*/
|
|
489
|
-
declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
|
|
490
|
-
|
|
491
|
-
interface RenderReleaseReportOptions {
|
|
492
|
-
title?: string;
|
|
493
|
-
runs?: readonly RunRecord[];
|
|
494
|
-
comparator?: string;
|
|
495
|
-
traceAnalystFindings?: readonly string[];
|
|
496
|
-
nextActions?: readonly string[];
|
|
497
|
-
}
|
|
498
|
-
declare function renderReleaseReport(scorecard: ReleaseConfidenceScorecard, options?: RenderReleaseReportOptions): string;
|
|
499
|
-
|
|
500
|
-
/**
|
|
501
|
-
* Bootstrap-CI promotion gate.
|
|
502
|
-
*
|
|
503
|
-
* In any iterative-improvement loop (GEPA, prompt evolution, dataset
|
|
504
|
-
* curation), the question is "did this generation actually improve, or are
|
|
505
|
-
* we celebrating noise?". With small N and noisy outcomes, point-estimate
|
|
506
|
-
* deltas lie. Bootstrap confidence intervals tell the operator whether the
|
|
507
|
-
* delta is real before code or prompts get promoted.
|
|
508
|
-
*
|
|
509
|
-
* This module is pure functions — no I/O, no model calls. Easy to unit-test
|
|
510
|
-
* and to compose into any verdict gate.
|
|
511
|
-
*
|
|
512
|
-
* Default gate:
|
|
513
|
-
* - Bootstrap mean baseline vs candidate (1k resamples).
|
|
514
|
-
* - Compute the delta distribution; pass if the lower CI bound > 0.
|
|
515
|
-
* - Tunable confidence (default 95%) and resample count.
|
|
516
|
-
*
|
|
517
|
-
* Verdict semantics intentionally match the existing `experiments.jsonl`
|
|
518
|
-
* vocabulary:
|
|
519
|
-
* - ADVANCE: candidate's CI lower bound > baseline mean (real win)
|
|
520
|
-
* - KEEP: overlap, but candidate point estimate >= baseline (neutral)
|
|
521
|
-
* - REVERT: candidate's CI upper bound < baseline mean (real regression)
|
|
522
|
-
* - INCONCLUSIVE: not enough samples or CI straddles zero with no signal
|
|
523
|
-
*/
|
|
524
|
-
type Verdict = 'ADVANCE' | 'KEEP' | 'REVERT' | 'INCONCLUSIVE';
|
|
525
|
-
interface BootstrapResult {
|
|
526
|
-
baselineMean: number;
|
|
527
|
-
candidateMean: number;
|
|
528
|
-
/** candidateMean - baselineMean, point estimate. */
|
|
529
|
-
delta: number;
|
|
530
|
-
/** Lower bound of the (1 - alpha) CI on the delta. */
|
|
531
|
-
ciLower: number;
|
|
532
|
-
/** Upper bound of the (1 - alpha) CI on the delta. */
|
|
533
|
-
ciUpper: number;
|
|
534
|
-
/** Number of bootstrap resamples used. */
|
|
535
|
-
iterations: number;
|
|
536
|
-
alpha: number;
|
|
537
|
-
verdict: Verdict;
|
|
538
|
-
}
|
|
539
|
-
interface BootstrapOptions {
|
|
540
|
-
/** Significance level alpha (default 0.05 → 95% CI). */
|
|
541
|
-
alpha?: number;
|
|
542
|
-
/** Number of resamples (default 1000). */
|
|
543
|
-
iterations?: number;
|
|
544
|
-
/**
|
|
545
|
-
* Minimum total samples (baseline + candidate) below which we always
|
|
546
|
-
* return INCONCLUSIVE — bootstrap with too few samples is meaningless.
|
|
547
|
-
* Default 6 (combined).
|
|
548
|
-
*/
|
|
549
|
-
minTotalSamples?: number;
|
|
550
|
-
/** RNG seed for reproducibility. Default: Math.random. */
|
|
551
|
-
seed?: number;
|
|
552
|
-
}
|
|
553
|
-
/**
|
|
554
|
-
* Compute the bootstrap CI on (candidateMean - baselineMean) and a verdict.
|
|
555
|
-
*
|
|
556
|
-
* Uses simple percentile bootstrap on the difference of resampled means.
|
|
557
|
-
* That's the standard non-parametric primitive — no distributional
|
|
558
|
-
* assumptions, robust to skew, easy to reason about.
|
|
559
|
-
*/
|
|
560
|
-
declare function bootstrapCi(baseline: number[], candidate: number[], options?: BootstrapOptions): BootstrapResult;
|
|
561
|
-
/**
|
|
562
|
-
* Judge-replay promotion gate.
|
|
563
|
-
*
|
|
564
|
-
* The cheap inner-loop judge that drives an evolution run is by definition
|
|
565
|
-
* fast and noisy. When you're about to promote a winning variant to the
|
|
566
|
-
* canonical default, you want a STRONGER judge (a more expensive model, a
|
|
567
|
-
* human grader, a separately-trained reward model) to confirm the win
|
|
568
|
-
* generalises beyond the inner loop.
|
|
569
|
-
*
|
|
570
|
-
* This helper takes raw winner + baseline outputs, scores both through the
|
|
571
|
-
* stronger judge, and applies `bootstrapCi`. ADVANCE means the stronger
|
|
572
|
-
* judge agrees the winner is real with the configured confidence. Doesn't
|
|
573
|
-
* matter what shape your "output" is — pass a string, an object, anything
|
|
574
|
-
* the judge can read.
|
|
575
|
-
*/
|
|
576
|
-
interface JudgeReplayGateArgs<TOutput> {
|
|
577
|
-
baselineOutputs: TOutput[];
|
|
578
|
-
candidateOutputs: TOutput[];
|
|
579
|
-
/** Stronger judge — async to allow LLM calls. Return a 0..N scalar score. */
|
|
580
|
-
judge: (output: TOutput) => Promise<number> | number;
|
|
581
|
-
alpha?: number;
|
|
582
|
-
iterations?: number;
|
|
583
|
-
/** RNG seed for reproducibility. */
|
|
584
|
-
seed?: number;
|
|
585
|
-
/** Maximum concurrent judge calls. Default 4. */
|
|
586
|
-
judgeConcurrency?: number;
|
|
587
|
-
}
|
|
588
|
-
declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): Promise<BootstrapResult & {
|
|
589
|
-
baselineSamples: number;
|
|
590
|
-
candidateSamples: number;
|
|
591
|
-
}>;
|
|
592
|
-
|
|
593
|
-
export { evaluateReleaseConfidence as A, type BootstrapOptions as B, judgeReplayGate as C, type DeploymentOutcome as D, pairedBootstrap as E, FileSystemOutcomeStore as F, pairedEvalueSequence as G, pairedWilcoxon as H, InMemoryOutcomeStore as I, type JudgeReplayGateArgs as J, releaseTraceEvidenceFromMultiShotTrials as K, renderReleaseReport as L, rubricPredictiveValidity as M, type OutcomeFilter as O, type PairedBootstrapOptions as P, type ReleaseConfidenceThresholds as R, type SequentialDecision as S, type Verdict as V, type ReleaseConfidenceScorecard as a, type OutcomeStore as b, type BootstrapResult as c, type FileSystemOutcomeStoreOptions as d, type InterimReleaseConfidence as e, type InterimReleaseConfidenceInput as f, type PairedBootstrapResult as g, type PairedEvalueOptions as h, type PairedEvalueSequence as i, type PairedEvalueStep as j, type ReleaseConfidenceAxis as k, type ReleaseConfidenceAxisName as l, type ReleaseConfidenceInput as m, type ReleaseConfidenceIssue as n, type ReleaseConfidenceMetrics as o, type ReleaseConfidenceStatus as p, type ReleaseTraceEvidence as q, type RenderReleaseReportOptions as r, type RubricOutcomePair as s, type RubricPredictiveValidityInput as t, type RubricPredictiveValidityReport as u, type RubricRanking as v, assertReleaseConfidence as w, bhAdjust as x, bootstrapCi as y, evaluateInterimReleaseConfidence as z };
|
|
File without changes
|