@tangle-network/agent-eval 0.20.11 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +76 -0
- package/README.md +137 -170
- package/dist/benchmarks/index.d.ts +2 -1
- package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
- package/dist/chunk-3GN6U53I.js.map +1 -0
- package/dist/chunk-3IX6QTB7.js +1349 -0
- package/dist/chunk-3IX6QTB7.js.map +1 -0
- package/dist/chunk-5IIQKMD5.js +236 -0
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/chunk-ARZ6BEV6.js +1310 -0
- package/dist/chunk-ARZ6BEV6.js.map +1 -0
- package/dist/chunk-HRZELXCR.js +1354 -0
- package/dist/chunk-HRZELXCR.js.map +1 -0
- package/dist/chunk-KRR4VMH7.js +423 -0
- package/dist/chunk-KRR4VMH7.js.map +1 -0
- package/dist/chunk-SNUHRBDL.js +154 -0
- package/dist/chunk-SNUHRBDL.js.map +1 -0
- package/dist/chunk-WOK2RTWG.js +1920 -0
- package/dist/chunk-WOK2RTWG.js.map +1 -0
- package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
- package/dist/chunk-YUFXO3TU.js +148 -0
- package/dist/chunk-YUFXO3TU.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/control-cxwMOAsy.d.ts +259 -0
- package/dist/control.d.ts +6 -0
- package/dist/control.js +30 -0
- package/dist/control.js.map +1 -0
- package/dist/dataset-B9qvlm_o.d.ts +112 -0
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
- package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
- package/dist/index.d.ts +178 -2945
- package/dist/index.js +1066 -6185
- package/dist/index.js.map +1 -1
- package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +146 -0
- package/dist/optimization.js +60 -0
- package/dist/optimization.js.map +1 -0
- package/dist/reporting-Da2ihlcM.d.ts +672 -0
- package/dist/reporting.d.ts +5 -0
- package/dist/reporting.js +36 -0
- package/dist/reporting.js.map +1 -0
- package/dist/run-record-CX_jcAyr.d.ts +134 -0
- package/dist/store-u47QaJ9G.d.ts +297 -0
- package/dist/traces.d.ts +914 -0
- package/dist/traces.js +120 -0
- package/dist/traces.js.map +1 -0
- package/dist/wire/index.js +3 -2
- package/docs/concepts.md +16 -11
- package/docs/feature-guide.md +10 -17
- package/docs/integration-launch-gates.md +77 -0
- package/docs/product-eval-adoption.md +27 -0
- package/docs/research-report-methodology.md +155 -0
- package/docs/trace-analysis.md +75 -0
- package/package.json +30 -12
- package/dist/chunk-JAOLXRIA.js.map +0 -1
- /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* HeldOutGate — first-class held-out paired-delta promotion gate.
|
|
5
|
+
*
|
|
6
|
+
* Encodes the "honesty override" pattern that lived inline in
|
|
7
|
+
* `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
|
|
8
|
+
* The optimizer's best-guess is one thing; what we should actually
|
|
9
|
+
* ship is another. The gate is the line between them.
|
|
10
|
+
*
|
|
11
|
+
* A candidate is promoted iff ALL three pass:
|
|
12
|
+
*
|
|
13
|
+
* 1. **Productive runs**: the candidate has at least
|
|
14
|
+
* `minProductiveRuns` paired observations on items where BOTH
|
|
15
|
+
* candidate and baseline produced a real (non-silent) score.
|
|
16
|
+
* 2. **Paired delta**: the lower bound of the bootstrap CI on the
|
|
17
|
+
* median per-item delta (candidate − baseline) on the HOLDOUT
|
|
18
|
+
* split is strictly greater than `pairedDeltaThreshold`.
|
|
19
|
+
* 3. **Overfit gap**: the candidate's gap between search-split
|
|
20
|
+
* score and holdout-split score must not exceed the
|
|
21
|
+
* baseline's gap by more than `overfitGapThreshold`.
|
|
22
|
+
* "Better on search, worse on holdout" is the canonical
|
|
23
|
+
* overfit pattern; this catches it.
|
|
24
|
+
*
|
|
25
|
+
* The decision carries a machine-readable `rejectionCode` plus an
|
|
26
|
+
* `evidence` block with every number the gate looked at, so the
|
|
27
|
+
* downstream researcher / paper / dashboard can re-derive the
|
|
28
|
+
* verdict without re-running.
|
|
29
|
+
*
|
|
30
|
+
* See also:
|
|
31
|
+
* - `src/paired-stats.ts` for `pairedBootstrap` + `pairedWilcoxon`
|
|
32
|
+
* - `src/run-record.ts` for the input row schema
|
|
33
|
+
* - `src/reference-replay.ts` for the older, reference-replay-
|
|
34
|
+
* specific promotion path (still useful for replay-style evals).
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
|
|
38
|
+
interface HeldOutGateConfig {
|
|
39
|
+
/** Minimum number of paired (candidate, baseline) holdout observations
|
|
40
|
+
* required before the gate will even consider promoting. Default 3. */
|
|
41
|
+
minProductiveRuns?: number;
|
|
42
|
+
/** The bootstrap-CI lower bound on the median paired holdout delta
|
|
43
|
+
* must exceed this to promote. Default 0. */
|
|
44
|
+
pairedDeltaThreshold?: number;
|
|
45
|
+
/** Maximum allowed worsening of (search − holdout) gap relative to
|
|
46
|
+
* baseline. Default 0.15 (i.e. candidate may overfit by up to 15
|
|
47
|
+
* absolute score points more than baseline before rejection). */
|
|
48
|
+
overfitGapThreshold?: number;
|
|
49
|
+
/** Stable label of the baseline candidate. Required — paper-grade
|
|
50
|
+
* evaluation never compares two unlabelled candidates. */
|
|
51
|
+
baselineKey: string;
|
|
52
|
+
/** Confidence level for the bootstrap CI. Default 0.95. */
|
|
53
|
+
confidence?: number;
|
|
54
|
+
/** Bootstrap resamples. Default 2000. */
|
|
55
|
+
bootstrapResamples?: number;
|
|
56
|
+
/** Optional deterministic seed for the bootstrap. Default undefined
|
|
57
|
+
* (Math.random). */
|
|
58
|
+
seed?: number;
|
|
59
|
+
}
|
|
60
|
+
interface GateEvidence {
|
|
61
|
+
/** Number of paired (candidate, baseline) holdout observations used. */
|
|
62
|
+
productiveRuns: number;
|
|
63
|
+
/** Median of (candidate − baseline) paired holdout deltas. */
|
|
64
|
+
medianPairedDelta: number;
|
|
65
|
+
/** Bootstrap CI on the median paired holdout delta. */
|
|
66
|
+
pairedCI: {
|
|
67
|
+
low: number;
|
|
68
|
+
high: number;
|
|
69
|
+
};
|
|
70
|
+
/** Wilcoxon signed-rank p-value on the paired holdout deltas. */
|
|
71
|
+
pairedPValue: number;
|
|
72
|
+
/** Mean candidate score on the search split (NaN if none). */
|
|
73
|
+
searchScore: number;
|
|
74
|
+
/** Mean candidate score on the holdout split (NaN if none). */
|
|
75
|
+
holdoutScore: number;
|
|
76
|
+
/** Candidate (search − holdout) gap. */
|
|
77
|
+
overfitGap: number;
|
|
78
|
+
/** Baseline (search − holdout) gap. */
|
|
79
|
+
baselineOverfitGap: number;
|
|
80
|
+
}
|
|
81
|
+
interface GateDecision {
|
|
82
|
+
/** Final promote/no-promote verdict. */
|
|
83
|
+
promote: boolean;
|
|
84
|
+
/** The candidate that was evaluated. */
|
|
85
|
+
candidateId: string;
|
|
86
|
+
/** The baseline it was compared against. */
|
|
87
|
+
baselineId: string;
|
|
88
|
+
/** Every number the gate looked at, for audit + paper export. */
|
|
89
|
+
evidence: GateEvidence;
|
|
90
|
+
/** Human-readable reason. */
|
|
91
|
+
reason: string;
|
|
92
|
+
/** Machine-readable rejection code, or null on promote. */
|
|
93
|
+
rejectionCode: HeldOutGateRejectionCode | null;
|
|
94
|
+
}
|
|
95
|
+
/**
|
|
96
|
+
* Held-out paired-delta promotion gate. Construct once with config,
|
|
97
|
+
* call `evaluate(candidateRuns, baselineRuns)` per (candidate,
|
|
98
|
+
* baseline) pair. Stateless across calls.
|
|
99
|
+
*/
|
|
100
|
+
declare class HeldOutGate {
|
|
101
|
+
private readonly minProductiveRuns;
|
|
102
|
+
private readonly pairedDeltaThreshold;
|
|
103
|
+
private readonly overfitGapThreshold;
|
|
104
|
+
private readonly baselineKey;
|
|
105
|
+
private readonly confidence;
|
|
106
|
+
private readonly resamples;
|
|
107
|
+
private readonly seed?;
|
|
108
|
+
constructor(config: HeldOutGateConfig);
|
|
109
|
+
/** Decide whether `candidate` should replace `baseline`. Pairing
|
|
110
|
+
* is by (experimentId, seed) — identical experiment + seed pairs
|
|
111
|
+
* the candidate run with the matching baseline run. Pairs without
|
|
112
|
+
* a holdout score on both sides are dropped. */
|
|
113
|
+
evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Pareto frontier — multi-objective optimization over candidate runs.
|
|
118
|
+
*
|
|
119
|
+
* Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
|
|
120
|
+
* trading off (cost, latency, quality) or (passRate, tokenBudget,
|
|
121
|
+
* ttfb), you rarely have a single "winner" — you have a set of
|
|
122
|
+
* non-dominated candidates. This module exposes:
|
|
123
|
+
*
|
|
124
|
+
* - `paretoFrontier`: filter a set of candidates to the non-dominated ones
|
|
125
|
+
* - `dominates`: does A dominate B across all objectives?
|
|
126
|
+
*
|
|
127
|
+
* Each objective is declared with a direction: 'maximize' (higher=better)
|
|
128
|
+
* or 'minimize' (lower=better). Candidates are any object; pass an
|
|
129
|
+
* `objective(candidate)` accessor.
|
|
130
|
+
*/
|
|
131
|
+
type Direction = 'maximize' | 'minimize';
|
|
132
|
+
interface Objective<T> {
|
|
133
|
+
/** Stable label used in reports. */
|
|
134
|
+
name: string;
|
|
135
|
+
direction: Direction;
|
|
136
|
+
value: (candidate: T) => number;
|
|
137
|
+
}
|
|
138
|
+
interface ParetoResult<T> {
|
|
139
|
+
frontier: T[];
|
|
140
|
+
dominated: T[];
|
|
141
|
+
/** Dominance map: one entry per `dominator`, listing the candidates it dominates in `dominated`. */
|
|
142
|
+
dominanceMap: Array<{
|
|
143
|
+
dominator: T;
|
|
144
|
+
dominated: T[];
|
|
145
|
+
}>;
|
|
146
|
+
}
|
|
147
|
+
/** Does candidate A Pareto-dominate B — strictly better on at least one objective and no worse on any? */
|
|
148
|
+
declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
149
|
+
/**
|
|
150
|
+
* Compute the non-dominated frontier. Candidates with NaN/Infinity on any
|
|
151
|
+
* objective are excluded (can't rank them). A candidate enters the frontier
|
|
152
|
+
* iff no other candidate dominates it.
|
|
153
|
+
*/
|
|
154
|
+
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
155
|
+
/**
|
|
156
|
+
* Weighted-sum scalarisation. Use as a tie-break / single-winner selector
|
|
157
|
+
* when callers don't want to consume a frontier. Each objective contributes
|
|
158
|
+
* its normalised value (0..1 via min-max across the candidate pool) times
|
|
159
|
+
* its weight; missing weights default to 1/N.
|
|
160
|
+
*
|
|
161
|
+
* Direction is honoured automatically — `minimize` axes have their values
|
|
162
|
+
* inverted before scaling so "higher scalar = better" always holds.
|
|
163
|
+
*/
|
|
164
|
+
declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
|
|
165
|
+
weights?: Partial<Record<string, number>>;
|
|
166
|
+
}): Array<{
|
|
167
|
+
candidate: T;
|
|
168
|
+
score: number;
|
|
169
|
+
}>;
|
|
170
|
+
/**
|
|
171
|
+
* NSGA-II crowding distance — secondary sort for ties on the frontier.
|
|
172
|
+
*
|
|
173
|
+
* When the Pareto front collapses to a single point (or many candidates tie
|
|
174
|
+
* on dominance), naive selection picks arbitrarily and the population
|
|
175
|
+
* degenerates over generations. NSGA-II preserves diversity by preferring
|
|
176
|
+
* candidates with more empty space around them on the frontier.
|
|
177
|
+
*
|
|
178
|
+
* Returns an array of `{ candidate, distance }` in the SAME order as the
|
|
179
|
+
* input. Higher distance = more isolated = should be preferred when
|
|
180
|
+
* preserving diversity.
|
|
181
|
+
*/
|
|
182
|
+
declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
183
|
+
candidate: T;
|
|
184
|
+
distance: number;
|
|
185
|
+
}>;
|
|
186
|
+
/**
|
|
187
|
+
* Pareto frontier with tie-break by crowding distance — the canonical
|
|
188
|
+
* NSGA-II selection step. Returns the frontier sorted by descending crowding
|
|
189
|
+
* distance so callers can `.slice(0, k)` to pick K diverse winners.
|
|
190
|
+
*/
|
|
191
|
+
declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
|
|
192
|
+
candidate: T;
|
|
193
|
+
distance: number;
|
|
194
|
+
}>;
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* PromptEvolutionLoop — population-based reflective-mutation evolution.
|
|
198
|
+
*
|
|
199
|
+
* Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
|
|
200
|
+
* this loop GENERATES variants. Each generation:
|
|
201
|
+
* 1. Score the population across (variant × scenario × rep).
|
|
202
|
+
* 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
|
|
203
|
+
* 3. Ask the mutator for replacements until population size is restored.
|
|
204
|
+
* 4. Repeat for N generations OR until convergence.
|
|
205
|
+
*
|
|
206
|
+
* Domain-agnostic. Consumers supply:
|
|
207
|
+
* - A seed population of `EvolvableVariant`s.
|
|
208
|
+
* - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
|
|
209
|
+
* - A `MutateAdapter` that produces children given trace evidence.
|
|
210
|
+
* - Pareto `Objective<VariantAggregate>[]` defining the multi-objective vector.
|
|
211
|
+
*
|
|
212
|
+
* The loop owns: population management, parallel scheduling (concurrency-
|
|
213
|
+
* limited), Pareto selection with crowding distance, generation reporting.
|
|
214
|
+
*
|
|
215
|
+
* It does NOT own: rendering trials to a model, executing prompts, choosing
|
|
216
|
+
* mutation primitives, persisting to disk. Those are the consumer's call.
|
|
217
|
+
*/
|
|
218
|
+
|
|
219
|
+
interface EvolvableVariant<P = unknown> {
|
|
220
|
+
/** Stable id for the variant — surfaces in reports and trial results. */
|
|
221
|
+
id: string;
|
|
222
|
+
/** Variant payload — interpretation is the consumer's responsibility. */
|
|
223
|
+
payload: P;
|
|
224
|
+
/** Generation index (0 = seed, then 1, 2, ...). */
|
|
225
|
+
generation: number;
|
|
226
|
+
/** Parent variant id when produced via mutation; absent for seeds. */
|
|
227
|
+
parentId?: string;
|
|
228
|
+
/** Human label for reports. */
|
|
229
|
+
label: string;
|
|
230
|
+
/** What the mutator was trying to fix. */
|
|
231
|
+
rationale?: string;
|
|
232
|
+
}
|
|
233
|
+
interface TrialResult {
|
|
234
|
+
variantId: string;
|
|
235
|
+
scenarioId: string;
|
|
236
|
+
rep: number;
|
|
237
|
+
ok: boolean;
|
|
238
|
+
/** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
|
|
239
|
+
score: number;
|
|
240
|
+
/** Token cost (or any cost-like dimension). */
|
|
241
|
+
cost?: number;
|
|
242
|
+
/** Wall time in ms. */
|
|
243
|
+
durationMs?: number;
|
|
244
|
+
/** Free-form metric bag for objective accessors. */
|
|
245
|
+
metrics?: Record<string, number>;
|
|
246
|
+
error?: string;
|
|
247
|
+
}
|
|
248
|
+
/** Aggregated trial summary for one (variant, scenario) pair across reps. */
|
|
249
|
+
interface ScenarioAggregate {
|
|
250
|
+
variantId: string;
|
|
251
|
+
scenarioId: string;
|
|
252
|
+
meanScore: number;
|
|
253
|
+
meanCost: number;
|
|
254
|
+
meanDurationMs: number;
|
|
255
|
+
okRate: number;
|
|
256
|
+
trials: number;
|
|
257
|
+
/** Mean of every numeric metric across reps. */
|
|
258
|
+
metrics: Record<string, number>;
|
|
259
|
+
}
|
|
260
|
+
/** Aggregated trial summary for one variant across all scenarios. */
|
|
261
|
+
interface VariantAggregate {
|
|
262
|
+
variantId: string;
|
|
263
|
+
meanScore: number;
|
|
264
|
+
meanCost: number;
|
|
265
|
+
meanDurationMs: number;
|
|
266
|
+
okRate: number;
|
|
267
|
+
scenarios: ScenarioAggregate[];
|
|
268
|
+
/** Mean of every numeric metric, averaged across scenarios. */
|
|
269
|
+
metrics: Record<string, number>;
|
|
270
|
+
}
|
|
271
|
+
interface ScoreAdapter<P = unknown> {
|
|
272
|
+
score(args: {
|
|
273
|
+
variant: EvolvableVariant<P>;
|
|
274
|
+
scenarioId: string;
|
|
275
|
+
rep: number;
|
|
276
|
+
}): Promise<TrialResult>;
|
|
277
|
+
}
|
|
278
|
+
interface MutateAdapter<P = unknown> {
|
|
279
|
+
mutate(args: {
|
|
280
|
+
parent: EvolvableVariant<P>;
|
|
281
|
+
parentAggregate: VariantAggregate;
|
|
282
|
+
topTrials: TrialResult[];
|
|
283
|
+
bottomTrials: TrialResult[];
|
|
284
|
+
childCount: number;
|
|
285
|
+
generation: number;
|
|
286
|
+
}): Promise<EvolvableVariant<P>[]>;
|
|
287
|
+
}
|
|
288
|
+
interface PromptEvolutionConfig<P = unknown> {
|
|
289
|
+
runId: string;
|
|
290
|
+
/** What component is being mutated — surfaces in reports + reflection prompts. */
|
|
291
|
+
target: string;
|
|
292
|
+
seedVariants: EvolvableVariant<P>[];
|
|
293
|
+
scenarioIds: string[];
|
|
294
|
+
reps: number;
|
|
295
|
+
generations: number;
|
|
296
|
+
populationSize: number;
|
|
297
|
+
/** Maximum concurrent score() calls. */
|
|
298
|
+
scoreConcurrency: number;
|
|
299
|
+
scoreAdapter: ScoreAdapter<P>;
|
|
300
|
+
mutateAdapter: MutateAdapter<P>;
|
|
301
|
+
/** Pareto objectives over `VariantAggregate`. Ordered by importance. */
|
|
302
|
+
objectives: Objective<VariantAggregate>[];
|
|
303
|
+
/** Optional weights for the scalar tie-break selector (by objective name). */
|
|
304
|
+
scalarWeights?: Record<string, number>;
|
|
305
|
+
/** Stop early if a generation produces no Pareto improvement. Default true. */
|
|
306
|
+
earlyStopOnNoImprovement?: boolean;
|
|
307
|
+
onProgress?: (event: PromptEvolutionEvent) => void;
|
|
308
|
+
/**
|
|
309
|
+
* Optional cache for memoising scored (variantId, scenarioId, rep)
|
|
310
|
+
* tuples. When a cache instance is provided, repeated trials
|
|
311
|
+
* skip re-scoring. Cache keys are stable across runs.
|
|
312
|
+
*/
|
|
313
|
+
cache?: TrialCache;
|
|
314
|
+
}
|
|
315
|
+
interface TrialCache {
|
|
316
|
+
get(key: string): TrialResult | undefined;
|
|
317
|
+
set(key: string, value: TrialResult): void;
|
|
318
|
+
}
|
|
319
|
+
declare class InMemoryTrialCache implements TrialCache {
|
|
320
|
+
private store;
|
|
321
|
+
get(key: string): TrialResult | undefined;
|
|
322
|
+
set(key: string, value: TrialResult): void;
|
|
323
|
+
size(): number;
|
|
324
|
+
clear(): void;
|
|
325
|
+
}
|
|
326
|
+
type PromptEvolutionEvent = {
|
|
327
|
+
type: 'generation-start';
|
|
328
|
+
generation: number;
|
|
329
|
+
populationSize: number;
|
|
330
|
+
} | {
|
|
331
|
+
type: 'trial-complete';
|
|
332
|
+
generation: number;
|
|
333
|
+
variantId: string;
|
|
334
|
+
scenarioId: string;
|
|
335
|
+
rep: number;
|
|
336
|
+
ok: boolean;
|
|
337
|
+
score: number;
|
|
338
|
+
cached: boolean;
|
|
339
|
+
} | {
|
|
340
|
+
type: 'generation-complete';
|
|
341
|
+
report: GenerationReport<unknown>;
|
|
342
|
+
} | {
|
|
343
|
+
type: 'converged';
|
|
344
|
+
generation: number;
|
|
345
|
+
reason: string;
|
|
346
|
+
};
|
|
347
|
+
interface GenerationReport<P = unknown> {
|
|
348
|
+
runId: string;
|
|
349
|
+
target: string;
|
|
350
|
+
generation: number;
|
|
351
|
+
variants: EvolvableVariant<P>[];
|
|
352
|
+
aggregates: VariantAggregate[];
|
|
353
|
+
/** Frontier candidates, sorted by descending crowding distance. */
|
|
354
|
+
paretoFrontIds: string[];
|
|
355
|
+
/** Scalar-best variant id — used for the single "winner" if callers want one. */
|
|
356
|
+
winnerId: string;
|
|
357
|
+
/** Trials that fed this generation (kept for downstream reporting). */
|
|
358
|
+
trials: TrialResult[];
|
|
359
|
+
}
|
|
360
|
+
interface PromptEvolutionResult<P = unknown> {
|
|
361
|
+
runId: string;
|
|
362
|
+
target: string;
|
|
363
|
+
generations: GenerationReport<P>[];
|
|
364
|
+
/** Best variant by scalar score in the final generation. */
|
|
365
|
+
bestVariant: EvolvableVariant<P>;
|
|
366
|
+
/** Best aggregate (matches bestVariant). */
|
|
367
|
+
bestAggregate: VariantAggregate;
|
|
368
|
+
}
|
|
369
|
+
declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
|
|
370
|
+
|
|
371
|
+
/**
|
|
372
|
+
* Reflective mutation — primitives for trace-conditioned prompt rewriting.
|
|
373
|
+
*
|
|
374
|
+
* Used by `prompt-evolution.ts` (and any consumer running iterative
|
|
375
|
+
* improvement). Given a parent prompt + concrete trace evidence (top trials,
|
|
376
|
+
* bottom trials, missed expectations), produce an LLM-ready prompt that
|
|
377
|
+
* proposes targeted mutations — not blind rephrasings.
|
|
378
|
+
*
|
|
379
|
+
* Why this lives outside `prompt-evolution.ts`: any consumer that wants to
|
|
380
|
+
* run reflective rewriting WITHOUT the population/Pareto machinery can
|
|
381
|
+
* import these primitives directly.
|
|
382
|
+
*
|
|
383
|
+
* Quality bar (vs. naive "mutate this prompt"):
|
|
384
|
+
* - Show parent ↔ children diff, not just one variant
|
|
385
|
+
* - Quote specific missed goldens with their match phrases
|
|
386
|
+
* - Surface the model's actual emitted output side-by-side with what was expected
|
|
387
|
+
* - Quote concrete mutation primitives so the model has a vocabulary
|
|
388
|
+
*/
|
|
389
|
+
interface TrialTrace {
|
|
390
|
+
/** Stable id for the trial — surfaces in the prompt for grounding. */
|
|
391
|
+
id: string;
|
|
392
|
+
/** Score the trial received on its primary metric. */
|
|
393
|
+
score: number;
|
|
394
|
+
/** Candidate inputs the agent was given (e.g., the fixture or scenario). */
|
|
395
|
+
inputName?: string;
|
|
396
|
+
/**
|
|
397
|
+
* Goldens / expectations this trial was tested against, with whether each
|
|
398
|
+
* was matched. The reflection prompt quotes the missed ones specifically.
|
|
399
|
+
*/
|
|
400
|
+
expectations?: Array<{
|
|
401
|
+
id: string;
|
|
402
|
+
phrase: string;
|
|
403
|
+
matched: boolean;
|
|
404
|
+
}>;
|
|
405
|
+
/** Free-form text — what the agent actually emitted (e.g., findings, plan). */
|
|
406
|
+
emitted?: string;
|
|
407
|
+
/** Optional structured metrics (recall, precision, cost, latency). */
|
|
408
|
+
metrics?: Record<string, number>;
|
|
409
|
+
}
|
|
410
|
+
interface ReflectionContext {
|
|
411
|
+
/** What is being mutated — appears in the system prompt for orientation. */
|
|
412
|
+
target: string;
|
|
413
|
+
/** Current variant's payload — JSON-serialised for the prompt. */
|
|
414
|
+
parentPayload: unknown;
|
|
415
|
+
/** Best-performing trials this generation. */
|
|
416
|
+
topTrials: TrialTrace[];
|
|
417
|
+
/** Worst-performing trials this generation — the missed-golden source. */
|
|
418
|
+
bottomTrials: TrialTrace[];
|
|
419
|
+
/** How many children the mutator should propose. */
|
|
420
|
+
childCount: number;
|
|
421
|
+
/** Optional: domain-specific mutation primitives the model can pick from. */
|
|
422
|
+
mutationPrimitives?: string[];
|
|
423
|
+
}
|
|
424
|
+
declare const DEFAULT_MUTATION_PRIMITIVES: string[];
|
|
425
|
+
/**
|
|
426
|
+
* Build the LLM-ready reflection prompt. Output is plain text — pass it as
|
|
427
|
+
* the user message. The system message should be small and stable (e.g.
|
|
428
|
+
* "Output ONLY a JSON object matching the schema below.").
|
|
429
|
+
*/
|
|
430
|
+
declare function buildReflectionPrompt(ctx: ReflectionContext): string;
|
|
431
|
+
interface ReflectionProposal {
|
|
432
|
+
label: string;
|
|
433
|
+
rationale: string;
|
|
434
|
+
payload: unknown;
|
|
435
|
+
}
|
|
436
|
+
declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
|
|
437
|
+
|
|
438
|
+
/**
|
|
439
|
+
* Multi-shot optimization adapter.
|
|
440
|
+
*
|
|
441
|
+
* This is the canonical bridge between variable-length agent trajectories
|
|
442
|
+
* and `runPromptEvolution`. Apps provide four things:
|
|
443
|
+
*
|
|
444
|
+
* - variants: prompt/config/tool-policy candidates
|
|
445
|
+
* - runner: executes one full task trajectory for a variant
|
|
446
|
+
* - scorer: turns that trajectory into score + actionable side information
|
|
447
|
+
* - mutator: proposes new variants from top/bottom scored trials
|
|
448
|
+
*
|
|
449
|
+
* The adapter owns the boring but easy-to-get-wrong glue: stable seeds,
|
|
450
|
+
* score/cost objectives, error-to-trial conversion, ASI metric projection,
|
|
451
|
+
* and optional paired holdout gating via `HeldOutGate`.
|
|
452
|
+
*/
|
|
453
|
+
|
|
454
|
+
type MultiShotSplit = 'search' | 'dev' | 'holdout';
|
|
455
|
+
type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
|
|
456
|
+
type MultiShotVariant<P = unknown> = EvolvableVariant<P>;
|
|
457
|
+
interface ActionableSideInfo {
|
|
458
|
+
/** Stable expectation/check id when available. */
|
|
459
|
+
expectationId?: string;
|
|
460
|
+
/** Human-readable diagnosis of what happened. */
|
|
461
|
+
message: string;
|
|
462
|
+
severity?: AsiSeverity;
|
|
463
|
+
/** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
|
|
464
|
+
evidence?: string;
|
|
465
|
+
/** Prompt/tool/context surface likely responsible. */
|
|
466
|
+
responsibleSurface?: string;
|
|
467
|
+
/** Suggested fix in natural language. */
|
|
468
|
+
suggestion?: string;
|
|
469
|
+
/** Whether this expectation was satisfied. Defaults to false for ASI rows. */
|
|
470
|
+
matched?: boolean;
|
|
471
|
+
metadata?: Record<string, unknown>;
|
|
472
|
+
}
|
|
473
|
+
interface MultiShotTrace {
|
|
474
|
+
scenarioId: string;
|
|
475
|
+
/** Full turn/tool trace. Shape is intentionally app-owned. */
|
|
476
|
+
turns?: unknown[];
|
|
477
|
+
toolCalls?: unknown[];
|
|
478
|
+
artifacts?: unknown[];
|
|
479
|
+
/** Compact final output or summary used by reflection prompts. */
|
|
480
|
+
transcript?: string;
|
|
481
|
+
output?: unknown;
|
|
482
|
+
metadata?: Record<string, unknown>;
|
|
483
|
+
}
|
|
484
|
+
interface MultiShotRun {
|
|
485
|
+
trace: MultiShotTrace;
|
|
486
|
+
costUsd?: number;
|
|
487
|
+
durationMs?: number;
|
|
488
|
+
tokenUsage?: {
|
|
489
|
+
input?: number;
|
|
490
|
+
output?: number;
|
|
491
|
+
cached?: number;
|
|
492
|
+
};
|
|
493
|
+
metadata?: Record<string, unknown>;
|
|
494
|
+
}
|
|
495
|
+
interface MultiShotRunInput<P = unknown> {
|
|
496
|
+
variant: EvolvableVariant<P>;
|
|
497
|
+
scenarioId: string;
|
|
498
|
+
rep: number;
|
|
499
|
+
split: MultiShotSplit;
|
|
500
|
+
/** Stable paired seed for baseline/candidate comparisons. */
|
|
501
|
+
seed: number;
|
|
502
|
+
}
|
|
503
|
+
interface MultiShotRunner<P = unknown> {
|
|
504
|
+
run(input: MultiShotRunInput<P>): Promise<MultiShotRun> | MultiShotRun;
|
|
505
|
+
}
|
|
506
|
+
interface MultiShotScore {
|
|
507
|
+
/** Primary score in [0,1]. The adapter clamps for safety. */
|
|
508
|
+
score: number;
|
|
509
|
+
/** Pass/fail for top/bottom trial selection. Defaults to true. */
|
|
510
|
+
ok?: boolean;
|
|
511
|
+
costUsd?: number;
|
|
512
|
+
durationMs?: number;
|
|
513
|
+
metrics?: Record<string, number>;
|
|
514
|
+
asi?: ActionableSideInfo[];
|
|
515
|
+
/** Optional rich output shown to reflection mutators. */
|
|
516
|
+
emitted?: string;
|
|
517
|
+
metadata?: Record<string, unknown>;
|
|
518
|
+
}
|
|
519
|
+
interface MultiShotScorer<P = unknown> {
|
|
520
|
+
score(input: MultiShotRunInput<P> & {
|
|
521
|
+
run: MultiShotRun;
|
|
522
|
+
}): Promise<MultiShotScore> | MultiShotScore;
|
|
523
|
+
}
|
|
524
|
+
interface MultiShotTrialResult extends TrialResult {
|
|
525
|
+
split: MultiShotSplit;
|
|
526
|
+
seed: number;
|
|
527
|
+
trace?: MultiShotTrace;
|
|
528
|
+
asi?: ActionableSideInfo[];
|
|
529
|
+
emitted?: string;
|
|
530
|
+
metadata?: Record<string, unknown>;
|
|
531
|
+
}
|
|
532
|
+
interface MultiShotMutateAdapter<P = unknown> {
|
|
533
|
+
mutate(args: {
|
|
534
|
+
parent: EvolvableVariant<P>;
|
|
535
|
+
parentAggregate: VariantAggregate;
|
|
536
|
+
topTrials: MultiShotTrialResult[];
|
|
537
|
+
bottomTrials: MultiShotTrialResult[];
|
|
538
|
+
childCount: number;
|
|
539
|
+
generation: number;
|
|
540
|
+
}): Promise<EvolvableVariant<P>[]>;
|
|
541
|
+
}
|
|
542
|
+
interface MultiShotGateConfig<P = unknown> {
|
|
543
|
+
/** Search rows are optional, but enable HeldOutGate's overfit-gap check. */
|
|
544
|
+
searchScenarioIds?: string[];
|
|
545
|
+
holdoutScenarioIds: string[];
|
|
546
|
+
reps?: number;
|
|
547
|
+
gate: HeldOutGateConfig;
|
|
548
|
+
/** Convert scored trajectory runs into paper-grade RunRecords. */
|
|
549
|
+
toRunRecord(input: {
|
|
550
|
+
variant: EvolvableVariant<P>;
|
|
551
|
+
scenarioId: string;
|
|
552
|
+
rep: number;
|
|
553
|
+
split: RunSplitTag;
|
|
554
|
+
seed: number;
|
|
555
|
+
trial: MultiShotTrialResult;
|
|
556
|
+
}): RunRecord;
|
|
557
|
+
}
|
|
558
|
+
interface MultiShotOptimizationConfig<P = unknown> {
|
|
559
|
+
runId: string;
|
|
560
|
+
target: string;
|
|
561
|
+
seedVariants: EvolvableVariant<P>[];
|
|
562
|
+
searchScenarioIds: string[];
|
|
563
|
+
reps: number;
|
|
564
|
+
generations: number;
|
|
565
|
+
populationSize: number;
|
|
566
|
+
scoreConcurrency?: number;
|
|
567
|
+
runner: MultiShotRunner<P>;
|
|
568
|
+
scorer: MultiShotScorer<P>;
|
|
569
|
+
mutateAdapter: MultiShotMutateAdapter<P>;
|
|
570
|
+
objectives?: Objective<VariantAggregate>[];
|
|
571
|
+
scalarWeights?: Record<string, number>;
|
|
572
|
+
cache?: TrialCache;
|
|
573
|
+
earlyStopOnNoImprovement?: boolean;
|
|
574
|
+
seedBase?: number;
|
|
575
|
+
onProgress?: (event: PromptEvolutionEvent) => void;
|
|
576
|
+
gate?: MultiShotGateConfig<P>;
|
|
577
|
+
}
|
|
578
|
+
interface MultiShotGateResult {
|
|
579
|
+
decision: GateDecision;
|
|
580
|
+
candidateRuns: RunRecord[];
|
|
581
|
+
baselineRuns: RunRecord[];
|
|
582
|
+
}
|
|
583
|
+
interface MultiShotOptimizationResult<P = unknown> {
|
|
584
|
+
evolution: PromptEvolutionResult<P>;
|
|
585
|
+
/** Best candidate on the optimizer-visible search split. */
|
|
586
|
+
searchBestVariant: EvolvableVariant<P>;
|
|
587
|
+
searchBestAggregate: VariantAggregate;
|
|
588
|
+
/** Variant callers should actually ship after optional holdout gating. */
|
|
589
|
+
promotedVariant: EvolvableVariant<P>;
|
|
590
|
+
promotedAggregate: VariantAggregate;
|
|
591
|
+
/** Null when no gate was configured or the search-best candidate was the baseline. */
|
|
592
|
+
gate: MultiShotGateResult | null;
|
|
593
|
+
}
|
|
594
|
+
declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig<P>): Promise<MultiShotOptimizationResult<P>>;
|
|
595
|
+
declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
|
|
596
|
+
declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
|
|
597
|
+
|
|
598
|
+
export { type ActionableSideInfo as A, type TrialTrace as B, buildReflectionPrompt as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, crowdingDistance as F, type GateDecision as G, HeldOutGate as H, InMemoryTrialCache as I, defaultMultiShotObjectives as J, dominates as K, paretoFrontier as L, type MutateAdapter as M, paretoFrontierWithCrowding as N, type Objective as O, type ParetoResult as P, parseReflectionResponse as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, runMultiShotOptimization as U, type VariantAggregate as V, runPromptEvolution as W, scalarScore as X, trialTraceFromMultiShotTrial as Y, type TrialResult as a, type AsiSeverity as b, type Direction as c, type GateEvidence as d, type GenerationReport as e, type HeldOutGateConfig as f, type HeldOutGateRejectionCode as g, type MultiShotGateConfig as h, type MultiShotGateResult as i, type MultiShotMutateAdapter as j, type MultiShotOptimizationConfig as k, type MultiShotOptimizationResult as l, type MultiShotRun as m, type MultiShotRunInput as n, type MultiShotRunner as o, type MultiShotScore as p, type MultiShotScorer as q, type MultiShotSplit as r, type MultiShotTrace as s, type MultiShotTrialResult as t, type MultiShotVariant as u, type PromptEvolutionConfig as v, type PromptEvolutionEvent as w, type PromptEvolutionResult as x, type ReflectionProposal as y, type ScoreAdapter as z };
|
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.21.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|