@tangle-network/agent-eval 0.40.5 → 0.42.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +48 -355
- package/dist/campaign/index.js +106 -6
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-AU2JLNSZ.js → chunk-H4TOS272.js} +1 -65
- package/dist/chunk-H4TOS272.js.map +1 -0
- package/dist/{chunk-NKLGKF2Q.js → chunk-KQ26DYTQ.js} +2 -18
- package/dist/chunk-KQ26DYTQ.js.map +1 -0
- package/dist/{chunk-EGIPWXHL.js → chunk-MNL6LXGQ.js} +98 -2
- package/dist/chunk-MNL6LXGQ.js.map +1 -0
- package/dist/{chunk-5U2DOJU4.js → chunk-N4SBKEPJ.js} +199 -2
- package/dist/chunk-N4SBKEPJ.js.map +1 -0
- package/dist/{chunk-LCIDRYGP.js → chunk-PD3MH6WU.js} +8 -8
- package/dist/{control-CmLJk3IG.d.ts → control-ojEWkMfJ.d.ts} +1 -1
- package/dist/control.d.ts +2 -2
- package/dist/{feedback-trajectory-Dvy-bt7x.d.ts → feedback-trajectory-BSxqEpu7.d.ts} +1 -1
- package/dist/index.d.ts +227 -687
- package/dist/index.js +753 -1237
- package/dist/index.js.map +1 -1
- package/dist/integrity-CTDhR1Sg.d.ts +81 -0
- package/dist/llm-client-BXVRUZyX.d.ts +234 -0
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +67 -3
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{integrity-DYR5gWlb.d.ts → raw-provider-sink-C46HDghv.d.ts} +1 -80
- package/dist/{release-report-Di84bXD7.d.ts → release-report-BtpgWRI0.d.ts} +21 -3
- package/dist/reporting.d.ts +2 -3
- package/dist/reporting.js +4 -8
- package/dist/{researcher-DeZ_EArp.d.ts → researcher-CoJMs2Iz.d.ts} +116 -205
- package/dist/rl.d.ts +103 -221
- package/dist/rl.js +44 -199
- package/dist/rl.js.map +1 -1
- package/dist/sequential-DdV5ShjT.d.ts +561 -0
- package/dist/traces.d.ts +3 -2
- package/dist/traces.js +5 -5
- package/dist/types-BLbRTxoc.d.ts +367 -0
- package/dist/wire/index.d.ts +1 -1
- package/package.json +1 -6
- package/dist/chunk-5U2DOJU4.js.map +0 -1
- package/dist/chunk-AU2JLNSZ.js.map +0 -1
- package/dist/chunk-DMW5VENN.js +0 -1412
- package/dist/chunk-DMW5VENN.js.map +0 -1
- package/dist/chunk-EGIPWXHL.js.map +0 -1
- package/dist/chunk-MAZ26DC7.js +0 -99
- package/dist/chunk-MAZ26DC7.js.map +0 -1
- package/dist/chunk-NKLGKF2Q.js.map +0 -1
- package/dist/multi-layer-verifier-BNi4-8lR.d.ts +0 -141
- package/dist/optimization.d.ts +0 -11
- package/dist/optimization.js +0 -71
- package/dist/optimization.js.map +0 -1
- package/dist/sequential-5iSVfzl2.d.ts +0 -139
- package/dist/summary-report-DuZXOk7K.d.ts +0 -917
- /package/dist/{chunk-LCIDRYGP.js.map → chunk-PD3MH6WU.js.map} +0 -0
|
@@ -1,917 +0,0 @@
|
|
|
1
|
-
import { R as RunRecord, a as RunSplitTag } from './run-record-BGY6bHRh.js';
|
|
2
|
-
import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
|
|
3
|
-
|
|
4
|
-
/**
|
|
5
|
-
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
6
|
-
*
|
|
7
|
-
* Encodes the "honesty override" pattern that lived inline in
|
|
8
|
-
* `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
|
|
9
|
-
* The optimizer's best-guess is one thing; what we should actually
|
|
10
|
-
* ship is another. The gate is the line between them.
|
|
11
|
-
*
|
|
12
|
-
* A candidate is promoted iff ALL three pass:
|
|
13
|
-
*
|
|
14
|
-
* 1. **Productive runs**: the candidate has at least
|
|
15
|
-
* `minProductiveRuns` paired observations on items where BOTH
|
|
16
|
-
* candidate and baseline produced a real (non-silent) score.
|
|
17
|
-
* 2. **Paired delta**: the lower bound of the bootstrap CI on the
|
|
18
|
-
* median per-item delta (candidate − baseline) on the HOLDOUT
|
|
19
|
-
* split is strictly greater than `pairedDeltaThreshold`.
|
|
20
|
-
* 3. **Overfit gap**: the candidate's gap between search-split
|
|
21
|
-
* score and holdout-split score is no worse (more positive)
|
|
22
|
-
* than the baseline's gap by more than `overfitGapThreshold`.
|
|
23
|
-
* "Better on search, worse on holdout" is the canonical
|
|
24
|
-
* overfit pattern; this catches it.
|
|
25
|
-
*
|
|
26
|
-
* The decision carries a machine-readable `rejectionCode` plus an
|
|
27
|
-
* `evidence` block with every number the gate looked at, so the
|
|
28
|
-
* downstream researcher / paper / dashboard can re-derive the
|
|
29
|
-
* verdict without re-running.
|
|
30
|
-
*
|
|
31
|
-
* See also:
|
|
32
|
-
* - `src/statistics.ts` for `pairedBootstrap` + `wilcoxonSignedRank`
|
|
33
|
-
* - `src/run-record.ts` for the input row schema
|
|
34
|
-
* - `src/reference-replay.ts` for the older, reference-replay-
|
|
35
|
-
* specific promotion path (still useful for replay-style evals).
|
|
36
|
-
*/
|
|
37
|
-
|
|
38
|
-
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
|
|
39
|
-
interface HeldOutGateConfig {
|
|
40
|
-
/** Minimum number of paired (candidate, baseline) holdout observations
|
|
41
|
-
* required before the gate will even consider promoting. Default 3. */
|
|
42
|
-
minProductiveRuns?: number;
|
|
43
|
-
/** The bootstrap-CI lower bound on the median paired holdout delta
|
|
44
|
-
* must exceed this to promote. Default 0. */
|
|
45
|
-
pairedDeltaThreshold?: number;
|
|
46
|
-
/** Maximum allowed worsening of (search − holdout) gap relative to
|
|
47
|
-
* baseline. Default 0.15 (i.e. candidate may overfit by up to 15
|
|
48
|
-
* absolute score points more than baseline before rejection). */
|
|
49
|
-
overfitGapThreshold?: number;
|
|
50
|
-
/** Stable label of the baseline candidate. Required — paper-grade
|
|
51
|
-
* evaluation never compares two unlabelled candidates. */
|
|
52
|
-
baselineKey: string;
|
|
53
|
-
/** Confidence level for the bootstrap CI. Default 0.95. */
|
|
54
|
-
confidence?: number;
|
|
55
|
-
/** Bootstrap resamples. Default 2000. */
|
|
56
|
-
bootstrapResamples?: number;
|
|
57
|
-
/** Optional deterministic seed for the bootstrap. Default undefined
|
|
58
|
-
* (Math.random). */
|
|
59
|
-
seed?: number;
|
|
60
|
-
}
|
|
61
|
-
interface GateEvidence {
|
|
62
|
-
/** Number of paired (candidate, baseline) holdout observations used. */
|
|
63
|
-
productiveRuns: number;
|
|
64
|
-
/** Median of (candidate − baseline) paired holdout deltas. */
|
|
65
|
-
medianPairedDelta: number;
|
|
66
|
-
/** Bootstrap CI on the median paired holdout delta. */
|
|
67
|
-
pairedCI: {
|
|
68
|
-
low: number;
|
|
69
|
-
high: number;
|
|
70
|
-
};
|
|
71
|
-
/** Wilcoxon signed-rank p-value on the paired holdout deltas. */
|
|
72
|
-
pairedPValue: number;
|
|
73
|
-
/** Mean candidate score on the search split (NaN if none). */
|
|
74
|
-
searchScore: number;
|
|
75
|
-
/** Mean candidate score on the holdout split (NaN if none). */
|
|
76
|
-
holdoutScore: number;
|
|
77
|
-
/** Candidate (search − holdout) gap. */
|
|
78
|
-
overfitGap: number;
|
|
79
|
-
/** Baseline (search − holdout) gap. */
|
|
80
|
-
baselineOverfitGap: number;
|
|
81
|
-
}
|
|
82
|
-
interface GateDecision {
|
|
83
|
-
/** Final promote/no-promote verdict. */
|
|
84
|
-
promote: boolean;
|
|
85
|
-
/** The candidate that was evaluated. */
|
|
86
|
-
candidateId: string;
|
|
87
|
-
/** The baseline it was compared against. */
|
|
88
|
-
baselineId: string;
|
|
89
|
-
/** Every number the gate looked at, for audit + paper export. */
|
|
90
|
-
evidence: GateEvidence;
|
|
91
|
-
/** Human-readable reason. */
|
|
92
|
-
reason: string;
|
|
93
|
-
/** Machine-readable rejection code, or null on promote. */
|
|
94
|
-
rejectionCode: HeldOutGateRejectionCode | null;
|
|
95
|
-
}
|
|
96
|
-
/**
|
|
97
|
-
* Held-out paired-delta promotion gate. Construct once with config,
|
|
98
|
-
* call `evaluate(candidateRuns, baselineRuns)` per (candidate,
|
|
99
|
-
* baseline) pair. Stateless across calls.
|
|
100
|
-
*/
|
|
101
|
-
declare class HeldOutGate {
|
|
102
|
-
private readonly minProductiveRuns;
|
|
103
|
-
private readonly pairedDeltaThreshold;
|
|
104
|
-
private readonly overfitGapThreshold;
|
|
105
|
-
private readonly baselineKey;
|
|
106
|
-
private readonly confidence;
|
|
107
|
-
private readonly resamples;
|
|
108
|
-
private readonly seed?;
|
|
109
|
-
constructor(config: HeldOutGateConfig);
|
|
110
|
-
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
111
|
-
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
112
|
-
* the candidate run with the matching baseline run. Pairs without
|
|
113
|
-
* a holdout score on both sides are dropped. */
|
|
114
|
-
evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
/**
|
|
118
|
-
* Pareto frontier — multi-objective optimization over candidate runs.
|
|
119
|
-
*
|
|
120
|
-
* Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
|
|
121
|
-
* trading off (cost, latency, quality) or (passRate, tokenBudget,
|
|
122
|
-
* ttfb), you rarely have a single "winner" — you have a set of
|
|
123
|
-
* non-dominated candidates. This module exposes:
|
|
124
|
-
*
|
|
125
|
-
* - `paretoFrontier`: filter a set of candidates to the non-dominated ones
|
|
126
|
-
* - `dominates`: does A dominate B across all objectives?
|
|
127
|
-
*
|
|
128
|
-
* Each objective is declared with a direction: 'maximize' (higher=better)
|
|
129
|
-
* or 'minimize' (lower=better). Candidates are any object; pass an
|
|
130
|
-
* `objective(candidate)` accessor.
|
|
131
|
-
*/
|
|
132
|
-
type Direction = 'maximize' | 'minimize';
|
|
133
|
-
interface Objective<T> {
|
|
134
|
-
/** Stable label used in reports. */
|
|
135
|
-
name: string;
|
|
136
|
-
direction: Direction;
|
|
137
|
-
value: (candidate: T) => number;
|
|
138
|
-
}
|
|
139
|
-
interface ParetoResult<T> {
|
|
140
|
-
frontier: T[];
|
|
141
|
-
dominated: T[];
|
|
142
|
-
/** Dominance map: each entry pairs a frontier member (`dominator`) with the candidates it dominates (`dominated`). */
|
|
143
|
-
dominanceMap: Array<{
|
|
144
|
-
dominator: T;
|
|
145
|
-
dominated: T[];
|
|
146
|
-
}>;
|
|
147
|
-
}
|
|
148
|
-
/** Does candidate A Pareto-dominate B — strictly better on at least one objective and no worse on any? */
|
|
149
|
-
declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
150
|
-
/**
|
|
151
|
-
* Compute the non-dominated frontier. Candidates with NaN/Infinity on any
|
|
152
|
-
* objective are excluded (can't rank them). A candidate enters the frontier
|
|
153
|
-
* iff no other candidate dominates it.
|
|
154
|
-
*/
|
|
155
|
-
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
156
|
-
/**
|
|
157
|
-
* Weighted-sum scalarisation. Use as a tie-break / single-winner selector
|
|
158
|
-
* when callers don't want to consume a frontier. Each objective contributes
|
|
159
|
-
* its normalised value (0..1 via min-max across the candidate pool) times
|
|
160
|
-
* its weight; missing weights default to 1/N.
|
|
161
|
-
*
|
|
162
|
-
* Direction is honoured automatically — `minimize` axes have their values
|
|
163
|
-
* inverted before scaling so "higher scalar = better" always holds.
|
|
164
|
-
*/
|
|
165
|
-
declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
|
|
166
|
-
weights?: Partial<Record<string, number>>;
|
|
167
|
-
}): Array<{
|
|
168
|
-
candidate: T;
|
|
169
|
-
score: number;
|
|
170
|
-
}>;
|
|
171
|
-
/**
|
|
172
|
-
* NSGA-II crowding distance — secondary sort for ties on the frontier.
|
|
173
|
-
*
|
|
174
|
-
* When the Pareto front collapses to a single point (or many candidates tie
|
|
175
|
-
* on dominance), naive selection picks arbitrarily and the population
|
|
176
|
-
* degenerates over generations. NSGA-II preserves diversity by preferring
|
|
177
|
-
* candidates with more empty space around them on the frontier.
|
|
178
|
-
*
|
|
179
|
-
* Returns an array of `{ candidate, distance }` in the SAME order as the
|
|
180
|
-
* input. Higher distance = more isolated = should be preferred when
|
|
181
|
-
* preserving diversity.
|
|
182
|
-
*/
|
|
183
|
-
declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
184
|
-
candidate: T;
|
|
185
|
-
distance: number;
|
|
186
|
-
}>;
|
|
187
|
-
/**
|
|
188
|
-
* Pareto frontier with tie-break by crowding distance — the canonical
|
|
189
|
-
* NSGA-II selection step. Returns the frontier sorted by descending crowding
|
|
190
|
-
* distance so callers can `.slice(0, k)` to pick K diverse winners.
|
|
191
|
-
*/
|
|
192
|
-
declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
193
|
-
candidate: T;
|
|
194
|
-
distance: number;
|
|
195
|
-
}>;
|
|
196
|
-
|
|
197
|
-
/**
|
|
198
|
-
* PromptEvolutionLoop — population-based reflective-mutation evolution.
|
|
199
|
-
*
|
|
200
|
-
* Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
|
|
201
|
-
* this loop GENERATES variants. Each generation:
|
|
202
|
-
* 1. Score the population across (variant × scenario × rep).
|
|
203
|
-
* 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
|
|
204
|
-
* 3. Ask the mutator for replacements until population size is restored.
|
|
205
|
-
* 4. Repeat for N generations OR until convergence.
|
|
206
|
-
*
|
|
207
|
-
* Domain-agnostic. Consumers supply:
|
|
208
|
-
* - A seed population of `EvolvableVariant`s.
|
|
209
|
-
* - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
|
|
210
|
-
* - A `MutateAdapter` that produces children given trace evidence.
|
|
211
|
-
* - Pareto `Objective<VariantAggregate>[]` defining the multi-objective vector.
|
|
212
|
-
*
|
|
213
|
-
* The loop owns: population management, parallel scheduling (concurrency-
|
|
214
|
-
* limited), Pareto selection with crowding distance, generation reporting.
|
|
215
|
-
*
|
|
216
|
-
* It does NOT own: rendering trials to a model, executing prompts, choosing
|
|
217
|
-
* mutation primitives, persisting to disk. Those are the consumer's call.
|
|
218
|
-
*/
|
|
219
|
-
|
|
220
|
-
interface EvolvableVariant<P = unknown> {
|
|
221
|
-
/** Stable id for the variant — surfaces in reports and trial results. */
|
|
222
|
-
id: string;
|
|
223
|
-
/** Variant payload — interpretation is the consumer's responsibility. */
|
|
224
|
-
payload: P;
|
|
225
|
-
/** Generation index (0 = seed, then 1, 2, ...). */
|
|
226
|
-
generation: number;
|
|
227
|
-
/** Parent variant id when produced via mutation; absent for seeds. */
|
|
228
|
-
parentId?: string;
|
|
229
|
-
/** Human label for reports. */
|
|
230
|
-
label: string;
|
|
231
|
-
/** What the mutator was trying to fix. */
|
|
232
|
-
rationale?: string;
|
|
233
|
-
}
|
|
234
|
-
interface TrialResult {
|
|
235
|
-
variantId: string;
|
|
236
|
-
scenarioId: string;
|
|
237
|
-
rep: number;
|
|
238
|
-
ok: boolean;
|
|
239
|
-
/** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
|
|
240
|
-
score: number;
|
|
241
|
-
/** Token cost (or any cost-like dimension). */
|
|
242
|
-
cost?: number;
|
|
243
|
-
/** Wall time in ms. */
|
|
244
|
-
durationMs?: number;
|
|
245
|
-
/** Free-form metric bag for objective accessors. */
|
|
246
|
-
metrics?: Record<string, number>;
|
|
247
|
-
error?: string;
|
|
248
|
-
/**
|
|
249
|
-
* Whether the judge LLM call(s) that produced this trial's score actually
|
|
250
|
-
* completed. `undefined` means "consumer didn't report"; `false` means
|
|
251
|
-
* the judge aborted/failed and the score is synthetic (typically 0 or
|
|
252
|
-
* partial). `aggregateTrials({mode: 'exclude-failed'})` skips these
|
|
253
|
-
* trials so a silent-zero judge can't pollute the composite.
|
|
254
|
-
*/
|
|
255
|
-
judgeSucceeded?: boolean;
|
|
256
|
-
/** Number of judge attempts (informational, populated by `withJudgeRetry`). */
|
|
257
|
-
judgeAttempts?: number;
|
|
258
|
-
/** Last judge error message when `judgeSucceeded === false`. */
|
|
259
|
-
judgeError?: string;
|
|
260
|
-
}
|
|
261
|
-
/** Aggregated trial summary for one (variant, scenario) pair across reps. */
|
|
262
|
-
interface ScenarioAggregate {
|
|
263
|
-
variantId: string;
|
|
264
|
-
scenarioId: string;
|
|
265
|
-
meanScore: number;
|
|
266
|
-
meanCost: number;
|
|
267
|
-
meanDurationMs: number;
|
|
268
|
-
okRate: number;
|
|
269
|
-
trials: number;
|
|
270
|
-
/** Mean of every numeric metric across reps. */
|
|
271
|
-
metrics: Record<string, number>;
|
|
272
|
-
}
|
|
273
|
-
/** Aggregated trial summary for one variant across all scenarios. */
|
|
274
|
-
interface VariantAggregate {
|
|
275
|
-
variantId: string;
|
|
276
|
-
meanScore: number;
|
|
277
|
-
meanCost: number;
|
|
278
|
-
meanDurationMs: number;
|
|
279
|
-
okRate: number;
|
|
280
|
-
scenarios: ScenarioAggregate[];
|
|
281
|
-
/** Mean of every numeric metric, averaged across scenarios. */
|
|
282
|
-
metrics: Record<string, number>;
|
|
283
|
-
}
|
|
284
|
-
interface ScoreAdapter<P = unknown> {
|
|
285
|
-
score(args: {
|
|
286
|
-
variant: EvolvableVariant<P>;
|
|
287
|
-
scenarioId: string;
|
|
288
|
-
rep: number;
|
|
289
|
-
}): Promise<TrialResult>;
|
|
290
|
-
}
|
|
291
|
-
interface MutateAdapter<P = unknown> {
|
|
292
|
-
mutate(args: {
|
|
293
|
-
parent: EvolvableVariant<P>;
|
|
294
|
-
parentAggregate: VariantAggregate;
|
|
295
|
-
topTrials: TrialResult[];
|
|
296
|
-
bottomTrials: TrialResult[];
|
|
297
|
-
childCount: number;
|
|
298
|
-
generation: number;
|
|
299
|
-
}): Promise<EvolvableVariant<P>[]>;
|
|
300
|
-
}
|
|
301
|
-
interface PromptEvolutionConfig<P = unknown> {
|
|
302
|
-
runId: string;
|
|
303
|
-
/** What component is being mutated — surfaces in reports + reflection prompts. */
|
|
304
|
-
target: string;
|
|
305
|
-
seedVariants: EvolvableVariant<P>[];
|
|
306
|
-
scenarioIds: string[];
|
|
307
|
-
reps: number;
|
|
308
|
-
generations: number;
|
|
309
|
-
populationSize: number;
|
|
310
|
-
/** Maximum concurrent score() calls. */
|
|
311
|
-
scoreConcurrency: number;
|
|
312
|
-
scoreAdapter: ScoreAdapter<P>;
|
|
313
|
-
mutateAdapter: MutateAdapter<P>;
|
|
314
|
-
/** Pareto objectives over `VariantAggregate`. Ordered by importance. */
|
|
315
|
-
objectives: Objective<VariantAggregate>[];
|
|
316
|
-
/** Optional weights for the scalar tie-break selector (by objective name). */
|
|
317
|
-
scalarWeights?: Record<string, number>;
|
|
318
|
-
/** Stop early if a generation produces no Pareto improvement. Default true. */
|
|
319
|
-
earlyStopOnNoImprovement?: boolean;
|
|
320
|
-
onProgress?: (event: PromptEvolutionEvent) => void;
|
|
321
|
-
/**
|
|
322
|
-
* Optional cache for memoising scored (variantId, scenarioId, rep)
|
|
323
|
-
* tuples. When a cache instance is passed, repeated trials
|
|
324
|
-
* skip re-scoring. Cache keys are stable across runs.
|
|
325
|
-
*/
|
|
326
|
-
cache?: TrialCache;
|
|
327
|
-
}
|
|
328
|
-
interface TrialCache {
|
|
329
|
-
get(key: string): TrialResult | undefined;
|
|
330
|
-
set(key: string, value: TrialResult): void;
|
|
331
|
-
}
|
|
332
|
-
declare class InMemoryTrialCache implements TrialCache {
|
|
333
|
-
private store;
|
|
334
|
-
get(key: string): TrialResult | undefined;
|
|
335
|
-
set(key: string, value: TrialResult): void;
|
|
336
|
-
size(): number;
|
|
337
|
-
clear(): void;
|
|
338
|
-
}
|
|
339
|
-
type PromptEvolutionEvent = {
|
|
340
|
-
type: 'generation-start';
|
|
341
|
-
generation: number;
|
|
342
|
-
populationSize: number;
|
|
343
|
-
} | {
|
|
344
|
-
type: 'trial-complete';
|
|
345
|
-
generation: number;
|
|
346
|
-
variantId: string;
|
|
347
|
-
scenarioId: string;
|
|
348
|
-
rep: number;
|
|
349
|
-
ok: boolean;
|
|
350
|
-
score: number;
|
|
351
|
-
cached: boolean;
|
|
352
|
-
} | {
|
|
353
|
-
type: 'generation-complete';
|
|
354
|
-
report: GenerationReport<unknown>;
|
|
355
|
-
} | {
|
|
356
|
-
type: 'converged';
|
|
357
|
-
generation: number;
|
|
358
|
-
reason: string;
|
|
359
|
-
};
|
|
360
|
-
interface GenerationReport<P = unknown> {
|
|
361
|
-
runId: string;
|
|
362
|
-
target: string;
|
|
363
|
-
generation: number;
|
|
364
|
-
variants: EvolvableVariant<P>[];
|
|
365
|
-
aggregates: VariantAggregate[];
|
|
366
|
-
/** Frontier candidates, sorted by descending crowding distance. */
|
|
367
|
-
paretoFrontIds: string[];
|
|
368
|
-
/** Scalar-best variant id — used for the single "winner" if callers want one. */
|
|
369
|
-
winnerId: string;
|
|
370
|
-
/** Trials that fed this generation (kept for downstream reporting). */
|
|
371
|
-
trials: TrialResult[];
|
|
372
|
-
}
|
|
373
|
-
interface PromptEvolutionResult<P = unknown> {
|
|
374
|
-
runId: string;
|
|
375
|
-
target: string;
|
|
376
|
-
generations: GenerationReport<P>[];
|
|
377
|
-
/** Best variant by scalar score in the final generation. */
|
|
378
|
-
bestVariant: EvolvableVariant<P>;
|
|
379
|
-
/** Best aggregate (matches bestVariant). */
|
|
380
|
-
bestAggregate: VariantAggregate;
|
|
381
|
-
}
|
|
382
|
-
declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
|
|
383
|
-
|
|
384
|
-
/**
|
|
385
|
-
* Reflective mutation — primitives for trace-conditioned prompt rewriting.
|
|
386
|
-
*
|
|
387
|
-
* Used by `prompt-evolution.ts` (and any consumer running iterative
|
|
388
|
-
* improvement). Given a parent prompt + concrete trace evidence (top trials,
|
|
389
|
-
* bottom trials, missed expectations), produce an LLM-ready prompt that
|
|
390
|
-
* proposes targeted mutations — not blind rephrasings.
|
|
391
|
-
*
|
|
392
|
-
* Why this lives outside `prompt-evolution.ts`: any consumer that wants to
|
|
393
|
-
* run reflective rewriting WITHOUT the population/Pareto machinery can
|
|
394
|
-
* import these primitives directly.
|
|
395
|
-
*
|
|
396
|
-
* Quality bar (vs. naive "mutate this prompt"):
|
|
397
|
-
* - Show parent ↔ children diff, not just one variant
|
|
398
|
-
* - Quote specific missed goldens with their match phrases
|
|
399
|
-
* - Surface the model's actual emitted output side-by-side with what was expected
|
|
400
|
-
* - Quote concrete mutation primitives so the model has a vocabulary
|
|
401
|
-
*/
|
|
402
|
-
interface TrialTrace {
|
|
403
|
-
/** Stable id for the trial — surfaces in the prompt for grounding. */
|
|
404
|
-
id: string;
|
|
405
|
-
/** Score the trial received on its primary metric. */
|
|
406
|
-
score: number;
|
|
407
|
-
/** Candidate inputs the agent was given (e.g., the fixture or scenario). */
|
|
408
|
-
inputName?: string;
|
|
409
|
-
/**
|
|
410
|
-
* Goldens / expectations this trial was tested against, with whether each
|
|
411
|
-
* was matched. The reflection prompt quotes the missed ones specifically.
|
|
412
|
-
*/
|
|
413
|
-
expectations?: Array<{
|
|
414
|
-
id: string;
|
|
415
|
-
phrase: string;
|
|
416
|
-
matched: boolean;
|
|
417
|
-
}>;
|
|
418
|
-
/** Free-form text — what the agent actually emitted (e.g., findings, plan). */
|
|
419
|
-
emitted?: string;
|
|
420
|
-
/** Optional structured metrics (recall, precision, cost, latency). */
|
|
421
|
-
metrics?: Record<string, number>;
|
|
422
|
-
}
|
|
423
|
-
interface ReflectionContext {
|
|
424
|
-
/** What is being mutated — appears in the system prompt for orientation. */
|
|
425
|
-
target: string;
|
|
426
|
-
/** Current variant's payload — JSON-serialised for the prompt. */
|
|
427
|
-
parentPayload: unknown;
|
|
428
|
-
/** Best-performing trials this generation. */
|
|
429
|
-
topTrials: TrialTrace[];
|
|
430
|
-
/** Worst-performing trials this generation — the missed-golden source. */
|
|
431
|
-
bottomTrials: TrialTrace[];
|
|
432
|
-
/** How many children the mutator should propose. */
|
|
433
|
-
childCount: number;
|
|
434
|
-
/** Optional: domain-specific mutation primitives the model can pick from. */
|
|
435
|
-
mutationPrimitives?: string[];
|
|
436
|
-
}
|
|
437
|
-
declare const DEFAULT_MUTATION_PRIMITIVES: string[];
|
|
438
|
-
/**
|
|
439
|
-
* Build the LLM-ready reflection prompt. Output is plain text — pass it as
|
|
440
|
-
* the user message. The system message should be small and stable (e.g.
|
|
441
|
-
* "Output ONLY a JSON object matching the schema below.").
|
|
442
|
-
*/
|
|
443
|
-
declare function buildReflectionPrompt(ctx: ReflectionContext): string;
|
|
444
|
-
interface ReflectionProposal {
|
|
445
|
-
label: string;
|
|
446
|
-
rationale: string;
|
|
447
|
-
payload: unknown;
|
|
448
|
-
}
|
|
449
|
-
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
450
|
-
|
|
451
|
-
/**
|
|
452
|
-
* Multi-shot optimization adapter.
|
|
453
|
-
*
|
|
454
|
-
* This is the canonical bridge between variable-length agent trajectories
|
|
455
|
-
* and `runPromptEvolution`. Apps provide four things:
|
|
456
|
-
*
|
|
457
|
-
* - variants: prompt/config/tool-policy candidates
|
|
458
|
-
* - runner: executes one full task trajectory for a variant
|
|
459
|
-
* - scorer: turns that trajectory into score + actionable side information
|
|
460
|
-
* - mutator: proposes new variants from top/bottom scored trials
|
|
461
|
-
*
|
|
462
|
-
* The adapter owns the boring but easy-to-get-wrong glue: stable seeds,
|
|
463
|
-
* score/cost objectives, error-to-trial conversion, ASI metric projection,
|
|
464
|
-
* and optional paired holdout gating via `HeldOutGate`.
|
|
465
|
-
*/
|
|
466
|
-
|
|
467
|
-
type MultiShotSplit = 'search' | 'dev' | 'holdout';
|
|
468
|
-
type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
469
|
-
type MultiShotVariant<P = unknown> = EvolvableVariant<P>;
|
|
470
|
-
interface ActionableSideInfo {
|
|
471
|
-
/** Stable expectation/check id when available. */
|
|
472
|
-
expectationId?: string;
|
|
473
|
-
/** Human-readable diagnosis of what happened. */
|
|
474
|
-
message: string;
|
|
475
|
-
severity?: AsiSeverity;
|
|
476
|
-
/** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
|
|
477
|
-
evidence?: string;
|
|
478
|
-
/** Prompt/tool/context surface likely responsible. */
|
|
479
|
-
responsibleSurface?: string;
|
|
480
|
-
/** Suggested fix in natural language. */
|
|
481
|
-
suggestion?: string;
|
|
482
|
-
/** Whether this expectation was satisfied. Defaults to false for ASI rows. */
|
|
483
|
-
matched?: boolean;
|
|
484
|
-
metadata?: Record<string, unknown>;
|
|
485
|
-
}
|
|
486
|
-
interface MultiShotTrace {
|
|
487
|
-
scenarioId: string;
|
|
488
|
-
/** Full turn/tool trace. Shape is intentionally app-owned. */
|
|
489
|
-
turns?: unknown[];
|
|
490
|
-
toolCalls?: unknown[];
|
|
491
|
-
artifacts?: unknown[];
|
|
492
|
-
/** Compact final output or summary used by reflection prompts. */
|
|
493
|
-
transcript?: string;
|
|
494
|
-
output?: unknown;
|
|
495
|
-
metadata?: Record<string, unknown>;
|
|
496
|
-
}
|
|
497
|
-
interface MultiShotRun {
|
|
498
|
-
trace: MultiShotTrace;
|
|
499
|
-
costUsd?: number;
|
|
500
|
-
durationMs?: number;
|
|
501
|
-
tokenUsage?: {
|
|
502
|
-
input?: number;
|
|
503
|
-
output?: number;
|
|
504
|
-
cached?: number;
|
|
505
|
-
};
|
|
506
|
-
metadata?: Record<string, unknown>;
|
|
507
|
-
}
|
|
508
|
-
interface MultiShotRunInput<P = unknown> {
|
|
509
|
-
variant: EvolvableVariant<P>;
|
|
510
|
-
scenarioId: string;
|
|
511
|
-
rep: number;
|
|
512
|
-
split: MultiShotSplit;
|
|
513
|
-
/** Stable paired seed for baseline/candidate comparisons. */
|
|
514
|
-
seed: number;
|
|
515
|
-
}
|
|
516
|
-
interface MultiShotRunner<P = unknown> {
|
|
517
|
-
run(input: MultiShotRunInput<P>): Promise<MultiShotRun> | MultiShotRun;
|
|
518
|
-
}
|
|
519
|
-
interface MultiShotScore {
|
|
520
|
-
/** Primary score in [0,1]. The adapter clamps for safety. */
|
|
521
|
-
score: number;
|
|
522
|
-
/** Pass/fail for top/bottom trial selection. Defaults to true. */
|
|
523
|
-
ok?: boolean;
|
|
524
|
-
costUsd?: number;
|
|
525
|
-
durationMs?: number;
|
|
526
|
-
metrics?: Record<string, number>;
|
|
527
|
-
asi?: ActionableSideInfo[];
|
|
528
|
-
/** Optional rich output shown to reflection mutators. */
|
|
529
|
-
emitted?: string;
|
|
530
|
-
metadata?: Record<string, unknown>;
|
|
531
|
-
}
|
|
532
|
-
interface MultiShotScorer<P = unknown> {
|
|
533
|
-
score(input: MultiShotRunInput<P> & {
|
|
534
|
-
run: MultiShotRun;
|
|
535
|
-
}): Promise<MultiShotScore> | MultiShotScore;
|
|
536
|
-
}
|
|
537
|
-
interface MultiShotTrialResult extends TrialResult {
|
|
538
|
-
split: MultiShotSplit;
|
|
539
|
-
seed: number;
|
|
540
|
-
trace?: MultiShotTrace;
|
|
541
|
-
asi?: ActionableSideInfo[];
|
|
542
|
-
emitted?: string;
|
|
543
|
-
metadata?: Record<string, unknown>;
|
|
544
|
-
}
|
|
545
|
-
interface MultiShotMutateAdapter<P = unknown> {
|
|
546
|
-
mutate(args: {
|
|
547
|
-
parent: EvolvableVariant<P>;
|
|
548
|
-
parentAggregate: VariantAggregate;
|
|
549
|
-
topTrials: MultiShotTrialResult[];
|
|
550
|
-
bottomTrials: MultiShotTrialResult[];
|
|
551
|
-
childCount: number;
|
|
552
|
-
generation: number;
|
|
553
|
-
}): Promise<EvolvableVariant<P>[]>;
|
|
554
|
-
}
|
|
555
|
-
interface MultiShotGateConfig<P = unknown> {
|
|
556
|
-
/** Search rows are optional, but enable HeldOutGate's overfit-gap check. */
|
|
557
|
-
searchScenarioIds?: string[];
|
|
558
|
-
holdoutScenarioIds: string[];
|
|
559
|
-
reps?: number;
|
|
560
|
-
gate: HeldOutGateConfig;
|
|
561
|
-
/** Convert scored trajectory runs into paper-grade RunRecords. */
|
|
562
|
-
toRunRecord(input: {
|
|
563
|
-
variant: EvolvableVariant<P>;
|
|
564
|
-
scenarioId: string;
|
|
565
|
-
rep: number;
|
|
566
|
-
split: RunSplitTag;
|
|
567
|
-
seed: number;
|
|
568
|
-
trial: MultiShotTrialResult;
|
|
569
|
-
}): RunRecord;
|
|
570
|
-
}
|
|
571
|
-
interface MultiShotOptimizationConfig<P = unknown> {
|
|
572
|
-
runId: string;
|
|
573
|
-
target: string;
|
|
574
|
-
seedVariants: EvolvableVariant<P>[];
|
|
575
|
-
searchScenarioIds: string[];
|
|
576
|
-
reps: number;
|
|
577
|
-
generations: number;
|
|
578
|
-
populationSize: number;
|
|
579
|
-
scoreConcurrency?: number;
|
|
580
|
-
runner: MultiShotRunner<P>;
|
|
581
|
-
scorer: MultiShotScorer<P>;
|
|
582
|
-
mutateAdapter: MultiShotMutateAdapter<P>;
|
|
583
|
-
objectives?: Objective<VariantAggregate>[];
|
|
584
|
-
scalarWeights?: Record<string, number>;
|
|
585
|
-
cache?: TrialCache;
|
|
586
|
-
earlyStopOnNoImprovement?: boolean;
|
|
587
|
-
seedBase?: number;
|
|
588
|
-
onProgress?: (event: PromptEvolutionEvent) => void;
|
|
589
|
-
gate?: MultiShotGateConfig<P>;
|
|
590
|
-
}
|
|
591
|
-
interface MultiShotGateResult {
|
|
592
|
-
decision: GateDecision;
|
|
593
|
-
candidateRuns: RunRecord[];
|
|
594
|
-
baselineRuns: RunRecord[];
|
|
595
|
-
}
|
|
596
|
-
interface MultiShotOptimizationResult<P = unknown> {
|
|
597
|
-
evolution: PromptEvolutionResult<P>;
|
|
598
|
-
/** Best candidate on the optimizer-visible search split. */
|
|
599
|
-
searchBestVariant: EvolvableVariant<P>;
|
|
600
|
-
searchBestAggregate: VariantAggregate;
|
|
601
|
-
/** Variant callers should actually ship after optional holdout gating. */
|
|
602
|
-
promotedVariant: EvolvableVariant<P>;
|
|
603
|
-
promotedAggregate: VariantAggregate;
|
|
604
|
-
/** Null when no gate was configured or the search-best candidate was the baseline. */
|
|
605
|
-
gate: MultiShotGateResult | null;
|
|
606
|
-
}
|
|
607
|
-
declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig<P>): Promise<MultiShotOptimizationResult<P>>;
|
|
608
|
-
declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
|
|
609
|
-
declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
|
|
610
|
-
|
|
611
|
-
/**
|
|
612
|
-
* Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
|
|
613
|
-
* than replacing it.
|
|
614
|
-
*
|
|
615
|
-
* Three artefacts:
|
|
616
|
-
*
|
|
617
|
-
* - `summaryTable` Markdown table of per-candidate means,
|
|
618
|
-
* 95% bootstrap CIs, BH-adjusted Wilcoxon
|
|
619
|
-
* p-values, and Cohen's d versus a
|
|
620
|
-
* comparator candidate.
|
|
621
|
-
* - `paretoChart` Abstract spec for a cost vs quality
|
|
622
|
-
* scatter, with gate decisions overlaid.
|
|
623
|
-
* Returns numbers + labels — caller
|
|
624
|
-
* chooses the plotting library.
|
|
625
|
-
* - `gainHistogram`
|
|
626
|
-
* Per-item paired holdout deltas as a
|
|
627
|
-
* histogram spec (bins + counts + median +
|
|
628
|
-
* CI). Same "data, not images" contract.
|
|
629
|
-
*
|
|
630
|
-
* The figure types are PlotSpecs — JSON-friendly, library-agnostic.
|
|
631
|
-
* They aren't React components and they aren't PNGs; they are
|
|
632
|
-
* what you'd hand to vega-lite, plotly, matplotlib, or your own
|
|
633
|
-
* Canvas renderer to draw the actual figure.
|
|
634
|
-
*/
|
|
635
|
-
|
|
636
|
-
interface SummaryTableOptions {
|
|
637
|
-
/** Comparator candidate id. Wilcoxon + Cohen's d are computed
|
|
638
|
-
* versus this candidate. Required for paired stats columns. */
|
|
639
|
-
comparator?: string;
|
|
640
|
-
/** Which split to read scores from. Default 'holdout'. */
|
|
641
|
-
split?: 'search' | 'holdout';
|
|
642
|
-
/** Confidence level for the bootstrap CI on the mean. Default 0.95. */
|
|
643
|
-
confidence?: number;
|
|
644
|
-
/** FDR for BH adjustment of the comparison p-values. Default 0.05. */
|
|
645
|
-
fdr?: number;
|
|
646
|
-
}
|
|
647
|
-
interface SummaryTableRow {
|
|
648
|
-
candidateId: string;
|
|
649
|
-
n: number;
|
|
650
|
-
mean: number;
|
|
651
|
-
ciLow: number;
|
|
652
|
-
ciHigh: number;
|
|
653
|
-
/** BH-adjusted q-value vs comparator. NaN if no comparator. */
|
|
654
|
-
qValue: number;
|
|
655
|
-
/** Cohen's d vs comparator. NaN if no comparator. */
|
|
656
|
-
cohensD: number;
|
|
657
|
-
}
|
|
658
|
-
interface SummaryTable {
|
|
659
|
-
rows: SummaryTableRow[];
|
|
660
|
-
comparator: string | null;
|
|
661
|
-
split: 'search' | 'holdout';
|
|
662
|
-
/** Pre-rendered markdown — drop into a paper or PR. */
|
|
663
|
-
markdown: string;
|
|
664
|
-
}
|
|
665
|
-
/**
|
|
666
|
-
* Table 1 helper. Buckets runs by `candidateId`, computes mean +
|
|
667
|
-
* bootstrap CI on the chosen split, and (when a comparator is given)
|
|
668
|
-
* BH-adjusted Wilcoxon p + Cohen's d versus that comparator.
|
|
669
|
-
*/
|
|
670
|
-
declare function summaryTable(runs: RunRecord[], opts?: SummaryTableOptions): SummaryTable;
|
|
671
|
-
interface ParetoPoint {
|
|
672
|
-
candidateId: string;
|
|
673
|
-
/** Mean USD cost per run on the chosen split. */
|
|
674
|
-
cost: number;
|
|
675
|
-
/** Mean score on the chosen split. */
|
|
676
|
-
quality: number;
|
|
677
|
-
/** Number of runs that informed this point. */
|
|
678
|
-
n: number;
|
|
679
|
-
/** Whether this candidate is on the Pareto frontier — high
|
|
680
|
-
* quality, low cost, no dominator. */
|
|
681
|
-
onFrontier: boolean;
|
|
682
|
-
/** Optional gate verdict for this candidate, if a `GateDecision`
|
|
683
|
-
* for it was passed in. */
|
|
684
|
-
gate?: 'promote' | 'reject_few_runs' | 'reject_negative_delta' | 'reject_overfit_gap' | null;
|
|
685
|
-
}
|
|
686
|
-
interface ParetoFigureSpec {
|
|
687
|
-
kind: 'pareto-cost-quality';
|
|
688
|
-
split: 'search' | 'holdout';
|
|
689
|
-
points: ParetoPoint[];
|
|
690
|
-
axes: {
|
|
691
|
-
x: 'costUsd';
|
|
692
|
-
y: 'score';
|
|
693
|
-
};
|
|
694
|
-
}
|
|
695
|
-
/**
|
|
696
|
-
* Cost vs quality scatter spec. `gateDecisions` is keyed by
|
|
697
|
-
* candidate id; if present, every point picks up the gate verdict
|
|
698
|
-
* for overlay.
|
|
699
|
-
*/
|
|
700
|
-
declare function paretoChart(runs: RunRecord[], opts?: {
|
|
701
|
-
split?: 'search' | 'holdout';
|
|
702
|
-
gateDecisions?: Record<string, GateDecision>;
|
|
703
|
-
}): ParetoFigureSpec;
|
|
704
|
-
interface GainDistributionBin {
|
|
705
|
-
/** Inclusive lower edge. */
|
|
706
|
-
lo: number;
|
|
707
|
-
/** Exclusive upper edge (or inclusive if it's the last bin). */
|
|
708
|
-
hi: number;
|
|
709
|
-
/** Number of pairs whose delta lands in this bin. */
|
|
710
|
-
count: number;
|
|
711
|
-
}
|
|
712
|
-
interface GainDistributionFigureSpec {
|
|
713
|
-
kind: 'gain-distribution';
|
|
714
|
-
candidateId: string;
|
|
715
|
-
comparator: string;
|
|
716
|
-
split: 'search' | 'holdout';
|
|
717
|
-
/** Number of pairs used. */
|
|
718
|
-
n: number;
|
|
719
|
-
bins: GainDistributionBin[];
|
|
720
|
-
median: number;
|
|
721
|
-
ci: {
|
|
722
|
-
low: number;
|
|
723
|
-
high: number;
|
|
724
|
-
};
|
|
725
|
-
}
|
|
726
|
-
interface GainDistributionOptions {
|
|
727
|
-
/** Number of histogram bins. Default 11 (so the centre is exact at 0). */
|
|
728
|
-
bins?: number;
|
|
729
|
-
/** Which split to use. Default 'holdout'. */
|
|
730
|
-
split?: 'search' | 'holdout';
|
|
731
|
-
/** Confidence level for the CI. Default 0.95. */
|
|
732
|
-
confidence?: number;
|
|
733
|
-
/** Bootstrap resamples. Default 2000. */
|
|
734
|
-
resamples?: number;
|
|
735
|
-
/** Deterministic seed. */
|
|
736
|
-
seed?: number;
|
|
737
|
-
}
|
|
738
|
-
/**
|
|
739
|
-
* Held-out improvement distribution: per-pair delta (candidate −
|
|
740
|
-
* comparator), histogrammed. Includes the bootstrap CI on the median
|
|
741
|
-
* delta — same primitive the promotion gate uses.
|
|
742
|
-
*/
|
|
743
|
-
declare function gainHistogram(runs: RunRecord[], candidateId: string, comparator: string, opts?: GainDistributionOptions): GainDistributionFigureSpec;
|
|
744
|
-
type ResearchReportDecision = 'promote' | 'hold' | 'reject' | 'equivalent' | 'needs_more_data';
|
|
745
|
-
/**
|
|
746
|
-
* Hard floor below which a paired comparison is treated as uninformative
|
|
747
|
-
* regardless of `minPairs`. Mirrors the lower limit on Wilcoxon signed-rank
|
|
748
|
-
* exact tables; below this the test has no power to separate effect sizes.
|
|
749
|
-
*/
|
|
750
|
-
declare const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
|
|
751
|
-
interface ResearchReportOptions {
|
|
752
|
-
/** Human-readable report title. */
|
|
753
|
-
title?: string;
|
|
754
|
-
/** Comparator candidate id. Required for statistical decision guidance. */
|
|
755
|
-
comparator?: string;
|
|
756
|
-
/** Which split to use for the primary decision. Default 'holdout'. */
|
|
757
|
-
split?: 'search' | 'holdout';
|
|
758
|
-
/** Confidence level used by lower-level report helpers. Default 0.95. */
|
|
759
|
-
confidence?: number;
|
|
760
|
-
/** FDR threshold for q-values. Default 0.05. */
|
|
761
|
-
fdr?: number;
|
|
762
|
-
/**
|
|
763
|
-
* Soft floor on paired observations before issuing a directional
|
|
764
|
-
* promote / reject. Below this we report `needs_more_data` and surface the
|
|
765
|
-
* minimum detectable effect at the current N. Default 20 — chosen so the
|
|
766
|
-
* Wilcoxon signed-rank approximation is reasonable and so the paired
|
|
767
|
-
* bootstrap CI has non-degenerate coverage. Hard floor is enforced at
|
|
768
|
-
* `RESEARCH_REPORT_HARD_PAIR_FLOOR` (6) regardless of this value.
|
|
769
|
-
*/
|
|
770
|
-
minPairs?: number;
|
|
771
|
-
/**
|
|
772
|
-
* Region of Practical Equivalence on the paired delta. When a candidate's
|
|
773
|
-
* paired-delta CI is fully contained in `[low, high]`, the decision is
|
|
774
|
-
* `equivalent` rather than `hold`. Sourced from the domain owner — there is
|
|
775
|
-
* no statistically-defensible default.
|
|
776
|
-
*/
|
|
777
|
-
rope?: {
|
|
778
|
-
low: number;
|
|
779
|
-
high: number;
|
|
780
|
-
};
|
|
781
|
-
/**
|
|
782
|
-
* Power for the minimum detectable effect (MDE) reported on each candidate.
|
|
783
|
-
* Default 0.8.
|
|
784
|
-
*/
|
|
785
|
-
mdePower?: number;
|
|
786
|
-
/**
|
|
787
|
-
* Two-sided alpha for the MDE. Default matches `fdr` so the reported MDE
|
|
788
|
-
* lines up with the test the report actually runs.
|
|
789
|
-
*/
|
|
790
|
-
mdeAlpha?: number;
|
|
791
|
-
/** Optional held-out gate decisions keyed by candidate id. */
|
|
792
|
-
gateDecisions?: Record<string, GateDecision>;
|
|
793
|
-
/** Optional failure clusters from failureClusterView. */
|
|
794
|
-
failureClusters?: FailureClusterReport;
|
|
795
|
-
/** Build gain histograms for these candidates. Defaults to all non-comparator candidates. */
|
|
796
|
-
candidateIds?: string[];
|
|
797
|
-
/** Deterministic bootstrap seed passed to gainHistogram and the posterior helper. */
|
|
798
|
-
seed?: number;
|
|
799
|
-
/** Report timestamp. Defaults to current time. */
|
|
800
|
-
generatedAt?: string;
|
|
801
|
-
/**
|
|
802
|
-
* Hash of a preregistered protocol (e.g. `signManifest({...}).contentHash`).
|
|
803
|
-
* Embedded verbatim in the report so the analysis can be cited as the
|
|
804
|
-
* preregistered one rather than a post-hoc fishing expedition.
|
|
805
|
-
*/
|
|
806
|
-
preregistrationHash?: string;
|
|
807
|
-
}
|
|
808
|
-
interface ResearchReportRecommendation {
|
|
809
|
-
decision: ResearchReportDecision;
|
|
810
|
-
candidateId: string | null;
|
|
811
|
-
rationale: string[];
|
|
812
|
-
risks: string[];
|
|
813
|
-
nextActions: string[];
|
|
814
|
-
}
|
|
815
|
-
interface ResearchReportCandidate {
|
|
816
|
-
candidateId: string;
|
|
817
|
-
n: number;
|
|
818
|
-
mean: number;
|
|
819
|
-
ciLow: number;
|
|
820
|
-
ciHigh: number;
|
|
821
|
-
qValue: number;
|
|
822
|
-
cohensD: number;
|
|
823
|
-
meanDeltaVsComparator: number | null;
|
|
824
|
-
pairedN: number;
|
|
825
|
-
medianGain: number | null;
|
|
826
|
-
meanGain: number | null;
|
|
827
|
-
gainCi: {
|
|
828
|
-
low: number;
|
|
829
|
-
high: number;
|
|
830
|
-
} | null;
|
|
831
|
-
/**
|
|
832
|
-
* Bayesian-bootstrap-style posterior summaries on the paired delta. Computed
|
|
833
|
-
* from the same resamples that produce the gain CI; interpretable as
|
|
834
|
-
* "fraction of resamples in which the candidate beats the comparator on
|
|
835
|
-
* matched pairs."
|
|
836
|
-
*/
|
|
837
|
-
prGreaterThanZero: number | null;
|
|
838
|
-
prInRope: number | null;
|
|
839
|
-
/**
|
|
840
|
-
* Minimum detectable effect (in score units) at the candidate's paired N,
|
|
841
|
-
* the configured power, and the configured alpha. Standardised by the
|
|
842
|
-
* observed paired-delta SD and inverted via `requiredSampleSize`. Reported
|
|
843
|
-
* for every candidate so a `needs_more_data` verdict is actionable.
|
|
844
|
-
*/
|
|
845
|
-
mde: number | null;
|
|
846
|
-
onParetoFrontier: boolean;
|
|
847
|
-
gate?: ParetoPoint['gate'];
|
|
848
|
-
decision: ResearchReportDecision;
|
|
849
|
-
decisionReason: string;
|
|
850
|
-
}
|
|
851
|
-
interface ResearchReportMethodology {
|
|
852
|
-
/**
|
|
853
|
-
* Plain-language assumptions the report depends on. Read these first when
|
|
854
|
-
* deciding whether the verdict is load-bearing for a launch decision.
|
|
855
|
-
*/
|
|
856
|
-
assumptions: string[];
|
|
857
|
-
/** Tests and estimators the verdict was computed from. */
|
|
858
|
-
methods: string[];
|
|
859
|
-
/** Alternatives the author considered and why this report didn't take them. */
|
|
860
|
-
alternatives: string[];
|
|
861
|
-
/** Failure modes — when this report should NOT drive a decision. */
|
|
862
|
-
whenNotToApply: string[];
|
|
863
|
-
/** Citations for the methodological choices above. */
|
|
864
|
-
citations: string[];
|
|
865
|
-
}
|
|
866
|
-
interface ResearchReport {
|
|
867
|
-
kind: 'agent-eval-research-report';
|
|
868
|
-
title: string;
|
|
869
|
-
generatedAt: string;
|
|
870
|
-
split: 'search' | 'holdout';
|
|
871
|
-
comparator: string | null;
|
|
872
|
-
/**
|
|
873
|
-
* SHA-256 over the canonicalised set of `(runId, candidateId, split)` triples
|
|
874
|
-
* the report was computed from, plus the comparator and split. Stable across
|
|
875
|
-
* key insertion order; recomputable by the reader to verify provenance.
|
|
876
|
-
*/
|
|
877
|
-
runFingerprint: string;
|
|
878
|
-
preregistrationHash: string | null;
|
|
879
|
-
rope: {
|
|
880
|
-
low: number;
|
|
881
|
-
high: number;
|
|
882
|
-
} | null;
|
|
883
|
-
executiveSummary: string[];
|
|
884
|
-
recommendation: ResearchReportRecommendation;
|
|
885
|
-
candidates: ResearchReportCandidate[];
|
|
886
|
-
summary: SummaryTable;
|
|
887
|
-
charts: {
|
|
888
|
-
pareto: ParetoFigureSpec;
|
|
889
|
-
gains: GainDistributionFigureSpec[];
|
|
890
|
-
};
|
|
891
|
-
methodology: ResearchReportMethodology;
|
|
892
|
-
failureClusters?: FailureClusterReport;
|
|
893
|
-
markdown: string;
|
|
894
|
-
html: string;
|
|
895
|
-
}
|
|
896
|
-
/**
|
|
897
|
-
* Executive research report for CPO / AI-lead / launch-review consumption.
|
|
898
|
-
*
|
|
899
|
-
* Composes:
|
|
900
|
-
* - `summaryTable` marginal stats with BH-FDR-adjusted q-values
|
|
901
|
-
* - `paretoChart` cost-vs-quality frontier with gate overlay
|
|
902
|
-
* - `gainHistogram` per-candidate paired-delta distribution
|
|
903
|
-
* - paired posterior (this file): bootstrap CI on median, Pr(Δ>0),
|
|
904
|
-
* Pr(Δ∈ROPE), MDE at the configured power
|
|
905
|
-
*
|
|
906
|
-
* Decisions are made on paired evidence — never on marginal means alone —
|
|
907
|
-
* and respect any held-out gate decision the caller passes through. The
|
|
908
|
-
* report embeds a SHA-256 fingerprint of the input run set and, optionally,
|
|
909
|
-
* the hash of a preregistered protocol so a downstream reader can verify
|
|
910
|
-
* provenance and that the analysis was the preregistered one.
|
|
911
|
-
*
|
|
912
|
-
* Async because the fingerprint uses Web Crypto via `hashJson`; deterministic
|
|
913
|
-
* for any fixed `runs`, `seed`, and ROPE.
|
|
914
|
-
*/
|
|
915
|
-
declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
|
|
916
|
-
|
|
917
|
-
export { gainHistogram as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GainDistributionBin as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type GainDistributionFigureSpec as F, type GenerationReport as G, type GainDistributionOptions as H, InMemoryTrialCache as I, type ParetoFigureSpec as J, type ParetoPoint as K, RESEARCH_REPORT_HARD_PAIR_FLOOR as L, type MultiShotGateConfig as M, type ResearchReport as N, type ResearchReportCandidate as O, type PromptEvolutionConfig as P, type ResearchReportDecision as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type ResearchReportMethodology as U, type VariantAggregate as V, type ResearchReportOptions as W, type ResearchReportRecommendation as X, type SummaryTable as Y, type SummaryTableOptions as Z, type SummaryTableRow as _, type AsiSeverity as a, paretoChart as a0, researchReport as a1, summaryTable as a2, type GateDecision as a3, type HeldOutGateConfig as a4, type Objective as a5, type ParetoResult as a6, type Direction as a7, type GateEvidence as a8, HeldOutGate as a9, type HeldOutGateRejectionCode as aa, crowdingDistance as ab, dominates as ac, paretoFrontier as ad, paretoFrontierWithCrowding as ae, scalarScore as af, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
|