@tangle-network/agent-eval 0.20.11 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +137 -170
  3. package/dist/benchmarks/index.d.ts +2 -1
  4. package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
  5. package/dist/chunk-3GN6U53I.js.map +1 -0
  6. package/dist/chunk-3IX6QTB7.js +1349 -0
  7. package/dist/chunk-3IX6QTB7.js.map +1 -0
  8. package/dist/chunk-5IIQKMD5.js +236 -0
  9. package/dist/chunk-5IIQKMD5.js.map +1 -0
  10. package/dist/chunk-ARZ6BEV6.js +1310 -0
  11. package/dist/chunk-ARZ6BEV6.js.map +1 -0
  12. package/dist/chunk-HRZELXCR.js +1354 -0
  13. package/dist/chunk-HRZELXCR.js.map +1 -0
  14. package/dist/chunk-KRR4VMH7.js +423 -0
  15. package/dist/chunk-KRR4VMH7.js.map +1 -0
  16. package/dist/chunk-SNUHRBDL.js +154 -0
  17. package/dist/chunk-SNUHRBDL.js.map +1 -0
  18. package/dist/chunk-WOK2RTWG.js +1920 -0
  19. package/dist/chunk-WOK2RTWG.js.map +1 -0
  20. package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
  21. package/dist/chunk-YUFXO3TU.js +148 -0
  22. package/dist/chunk-YUFXO3TU.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/control-cxwMOAsy.d.ts +259 -0
  26. package/dist/control.d.ts +6 -0
  27. package/dist/control.js +30 -0
  28. package/dist/control.js.map +1 -0
  29. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  30. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  31. package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
  32. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  33. package/dist/index.d.ts +178 -2945
  34. package/dist/index.js +1066 -6185
  35. package/dist/index.js.map +1 -1
  36. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  37. package/dist/openapi.json +1 -1
  38. package/dist/optimization.d.ts +146 -0
  39. package/dist/optimization.js +60 -0
  40. package/dist/optimization.js.map +1 -0
  41. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  42. package/dist/reporting.d.ts +5 -0
  43. package/dist/reporting.js +36 -0
  44. package/dist/reporting.js.map +1 -0
  45. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  46. package/dist/store-u47QaJ9G.d.ts +297 -0
  47. package/dist/traces.d.ts +914 -0
  48. package/dist/traces.js +120 -0
  49. package/dist/traces.js.map +1 -0
  50. package/dist/wire/index.js +3 -2
  51. package/docs/concepts.md +16 -11
  52. package/docs/feature-guide.md +10 -17
  53. package/docs/integration-launch-gates.md +77 -0
  54. package/docs/product-eval-adoption.md +27 -0
  55. package/docs/research-report-methodology.md +155 -0
  56. package/docs/trace-analysis.md +75 -0
  57. package/package.json +30 -12
  58. package/dist/chunk-JAOLXRIA.js.map +0 -1
  59. /package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
@@ -0,0 +1,598 @@
1
+ import { a as RunRecord, R as RunSplitTag } from './run-record-CX_jcAyr.js';
2
+
3
+ /**
4
+ * HeldOutGate — first-class held-out paired-delta promotion gate.
5
+ *
6
+ * Encodes the "honesty override" pattern that lived inline in
7
+ * `~/webb/redteam/scripts/agent-eval-autoresearch.ts:138–171`.
8
+ * The optimizer's best-guess is one thing; what we should actually
9
+ * ship is another. The gate is the line between them.
10
+ *
11
+ * A candidate is promoted iff ALL three pass:
12
+ *
13
+ * 1. **Productive runs**: the candidate has at least
14
+ * `minProductiveRuns` paired observations on items where BOTH
15
+ * candidate and baseline produced a real (non-silent) score.
16
+ * 2. **Paired delta**: the lower bound of the bootstrap CI on the
17
+ * median per-item delta (candidate − baseline) on the HOLDOUT
18
+ * split is strictly greater than `pairedDeltaThreshold`.
19
+ * 3. **Overfit gap**: the candidate's (search − holdout) score gap
20
+ * exceeds the baseline's (search − holdout) gap by no more than
21
+ * `overfitGapThreshold`.
22
+ * "Better on search, worse on holdout" is the canonical
23
+ * overfit pattern; this catches it.
24
+ *
25
+ * The decision carries a machine-readable `rejectionCode` plus an
26
+ * `evidence` block with every number the gate looked at, so the
27
+ * downstream researcher / paper / dashboard can re-derive the
28
+ * verdict without re-running.
29
+ *
30
+ * See also:
31
+ * - `src/paired-stats.ts` for `pairedBootstrap` + `pairedWilcoxon`
32
+ * - `src/run-record.ts` for the input row schema
33
+ * - `src/reference-replay.ts` for the older, reference-replay-
34
+ * specific promotion path (still useful for replay-style evals).
35
+ */
36
+
37
+ type HeldOutGateRejectionCode = 'few_runs' | 'negative_delta' | 'overfit_gap';
38
+ interface HeldOutGateConfig {
39
+ /** Minimum number of paired (candidate, baseline) holdout observations
40
+ * required before the gate will even consider promoting. Default 3. */
41
+ minProductiveRuns?: number;
42
+ /** The bootstrap-CI lower bound on the median paired holdout delta
43
+ * must exceed this to promote. Default 0. */
44
+ pairedDeltaThreshold?: number;
45
+ /** Maximum allowed worsening of (search − holdout) gap relative to
46
+ * baseline. Default 0.15 (i.e. candidate may overfit by up to 15
47
+ * absolute score points more than baseline before rejection). */
48
+ overfitGapThreshold?: number;
49
+ /** Stable label of the baseline candidate. Required — paper-grade
50
+ * evaluation never compares two unlabelled candidates. */
51
+ baselineKey: string;
52
+ /** Confidence level for the bootstrap CI. Default 0.95. */
53
+ confidence?: number;
54
+ /** Bootstrap resamples. Default 2000. */
55
+ bootstrapResamples?: number;
56
+ /** Optional deterministic seed for the bootstrap. Default undefined
57
+ * (Math.random). */
58
+ seed?: number;
59
+ }
60
+ interface GateEvidence {
61
+ /** Number of paired (candidate, baseline) holdout observations used. */
62
+ productiveRuns: number;
63
+ /** Median of (candidate − baseline) paired holdout deltas. */
64
+ medianPairedDelta: number;
65
+ /** Bootstrap CI on the median paired holdout delta. */
66
+ pairedCI: {
67
+ low: number;
68
+ high: number;
69
+ };
70
+ /** Wilcoxon signed-rank p-value on the paired holdout deltas. */
71
+ pairedPValue: number;
72
+ /** Mean candidate score on the search split (NaN if none). */
73
+ searchScore: number;
74
+ /** Mean candidate score on the holdout split (NaN if none). */
75
+ holdoutScore: number;
76
+ /** Candidate (search − holdout) gap. */
77
+ overfitGap: number;
78
+ /** Baseline (search − holdout) gap. */
79
+ baselineOverfitGap: number;
80
+ }
81
+ interface GateDecision {
82
+ /** Final promote/no-promote verdict. */
83
+ promote: boolean;
84
+ /** The candidate that was evaluated. */
85
+ candidateId: string;
86
+ /** The baseline it was compared against. */
87
+ baselineId: string;
88
+ /** Every number the gate looked at, for audit + paper export. */
89
+ evidence: GateEvidence;
90
+ /** Human-readable reason. */
91
+ reason: string;
92
+ /** Machine-readable rejection code, or null on promote. */
93
+ rejectionCode: HeldOutGateRejectionCode | null;
94
+ }
95
+ /**
96
+ * Held-out paired-delta promotion gate. Construct once with config,
97
+ * call `evaluate(candidateRuns, baselineRuns)` per (candidate,
98
+ * baseline) pair. Stateless across calls.
99
+ */
100
+ declare class HeldOutGate {
101
+ private readonly minProductiveRuns;
102
+ private readonly pairedDeltaThreshold;
103
+ private readonly overfitGapThreshold;
104
+ private readonly baselineKey;
105
+ private readonly confidence;
106
+ private readonly resamples;
107
+ private readonly seed?;
108
+ constructor(config: HeldOutGateConfig);
109
+ /** Decide whether `candidate` should replace `baseline`. Pairing
110
+ * is by (experimentId, seed) — identical experiment + seed pairs
111
+ * the candidate run with the matching baseline run. Pairs without
112
+ * a holdout score on both sides are dropped. */
113
+ evaluate(candidate: RunRecord[], baseline: RunRecord[]): GateDecision;
114
+ }
115
+
116
+ /**
117
+ * Pareto frontier — multi-objective optimization over candidate runs.
118
+ *
119
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
120
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
121
+ * ttfb), you rarely have a single "winner" — you have a set of
122
+ * non-dominated candidates. This module exposes:
123
+ *
124
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
125
+ * - `dominates`: does A dominate B across all objectives?
126
+ *
127
+ * Each objective is declared with a direction: 'maximize' (higher=better)
128
+ * or 'minimize' (lower=better). Candidates are any object; pass an
129
+ * `objective(candidate)` accessor.
130
+ */
131
+ type Direction = 'maximize' | 'minimize';
132
+ interface Objective<T> {
133
+ /** Stable label used in reports. */
134
+ name: string;
135
+ direction: Direction;
136
+ value: (candidate: T) => number;
137
+ }
138
+ interface ParetoResult<T> {
139
+ frontier: T[];
140
+ dominated: T[];
141
+ /** Dominance map: each entry pairs a frontier `dominator` with the candidates it dominates (`dominated`). */
142
+ dominanceMap: Array<{
143
+ dominator: T;
144
+ dominated: T[];
145
+ }>;
146
+ }
147
+ /** Does candidate A Pareto-dominate B — strictly better on at least one objective and no worse on any? */
148
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
149
+ /**
150
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
151
+ * objective are excluded (can't rank them). A candidate enters the frontier
152
+ * iff no other candidate dominates it.
153
+ */
154
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
155
+ /**
156
+ * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
157
+ * when callers don't want to consume a frontier. Each objective contributes
158
+ * its normalised value (0..1 via min-max across the candidate pool) times
159
+ * its weight; missing weights default to 1/N.
160
+ *
161
+ * Direction is honoured automatically — `minimize` axes have their values
162
+ * inverted before scaling so "higher scalar = better" always holds.
163
+ */
164
+ declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
165
+ weights?: Partial<Record<string, number>>;
166
+ }): Array<{
167
+ candidate: T;
168
+ score: number;
169
+ }>;
170
+ /**
171
+ * NSGA-II crowding distance — secondary sort for ties on the frontier.
172
+ *
173
+ * When the Pareto front collapses to a single point (or many candidates tie
174
+ * on dominance), naive selection picks arbitrarily and the population
175
+ * degenerates over generations. NSGA-II preserves diversity by preferring
176
+ * candidates with more empty space around them on the frontier.
177
+ *
178
+ * Returns an array of `{ candidate, distance }` in the SAME order as the
179
+ * input. Higher distance = more isolated = should be preferred when
180
+ * preserving diversity.
181
+ */
182
+ declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
183
+ candidate: T;
184
+ distance: number;
185
+ }>;
186
+ /**
187
+ * Pareto frontier with tie-break by crowding distance — the canonical
188
+ * NSGA-II selection step. Returns the frontier sorted by descending crowding
189
+ * distance so callers can `.slice(0, k)` to pick K diverse winners.
190
+ */
191
+ declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
192
+ candidate: T;
193
+ distance: number;
194
+ }>;
195
+
196
+ /**
197
+ * PromptEvolutionLoop — population-based reflective-mutation evolution.
198
+ *
199
+ * Above the existing `AxGepaSteeringOptimizer` (which RANKS variants),
200
+ * this loop GENERATES variants. Each generation:
201
+ * 1. Score the population across (variant × scenario × rep).
202
+ * 2. Pick survivors from the Pareto frontier (with crowding-distance tie-break).
203
+ * 3. Ask the mutator for replacements until population size is restored.
204
+ * 4. Repeat for N generations OR until convergence.
205
+ *
206
+ * Domain-agnostic. Consumers supply:
207
+ * - A seed population of `EvolvableVariant`s.
208
+ * - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
209
+ * - A `MutateAdapter` that produces children given trace evidence.
210
+ * - Pareto `Objective<VariantAggregate>[]` defining the multi-objective vector.
211
+ *
212
+ * The loop owns: population management, parallel scheduling (concurrency-
213
+ * limited), Pareto selection with crowding distance, generation reporting.
214
+ *
215
+ * It does NOT own: rendering trials to a model, executing prompts, choosing
216
+ * mutation primitives, persisting to disk. Those are the consumer's call.
217
+ */
218
+
219
+ interface EvolvableVariant<P = unknown> {
220
+ /** Stable id for the variant — surfaces in reports and trial results. */
221
+ id: string;
222
+ /** Variant payload — interpretation is the consumer's responsibility. */
223
+ payload: P;
224
+ /** Generation index (0 = seed, then 1, 2, ...). */
225
+ generation: number;
226
+ /** Parent variant id when produced via mutation; absent for seeds. */
227
+ parentId?: string;
228
+ /** Human label for reports. */
229
+ label: string;
230
+ /** What the mutator was trying to fix. */
231
+ rationale?: string;
232
+ }
233
+ interface TrialResult {
234
+ variantId: string;
235
+ scenarioId: string;
236
+ rep: number;
237
+ ok: boolean;
238
+ /** Primary scalar score the consumer cares about (e.g., recall, accuracy). */
239
+ score: number;
240
+ /** Token cost (or any cost-like dimension). */
241
+ cost?: number;
242
+ /** Wall time in ms. */
243
+ durationMs?: number;
244
+ /** Free-form metric bag for objective accessors. */
245
+ metrics?: Record<string, number>;
246
+ error?: string;
247
+ }
248
+ /** Aggregated trial summary for one (variant, scenario) pair across reps. */
249
+ interface ScenarioAggregate {
250
+ variantId: string;
251
+ scenarioId: string;
252
+ meanScore: number;
253
+ meanCost: number;
254
+ meanDurationMs: number;
255
+ okRate: number;
256
+ trials: number;
257
+ /** Mean of every numeric metric across reps. */
258
+ metrics: Record<string, number>;
259
+ }
260
+ /** Aggregated trial summary for one variant across all scenarios. */
261
+ interface VariantAggregate {
262
+ variantId: string;
263
+ meanScore: number;
264
+ meanCost: number;
265
+ meanDurationMs: number;
266
+ okRate: number;
267
+ scenarios: ScenarioAggregate[];
268
+ /** Mean of every numeric metric, averaged across scenarios. */
269
+ metrics: Record<string, number>;
270
+ }
271
+ interface ScoreAdapter<P = unknown> {
272
+ score(args: {
273
+ variant: EvolvableVariant<P>;
274
+ scenarioId: string;
275
+ rep: number;
276
+ }): Promise<TrialResult>;
277
+ }
278
+ interface MutateAdapter<P = unknown> {
279
+ mutate(args: {
280
+ parent: EvolvableVariant<P>;
281
+ parentAggregate: VariantAggregate;
282
+ topTrials: TrialResult[];
283
+ bottomTrials: TrialResult[];
284
+ childCount: number;
285
+ generation: number;
286
+ }): Promise<EvolvableVariant<P>[]>;
287
+ }
288
+ interface PromptEvolutionConfig<P = unknown> {
289
+ runId: string;
290
+ /** What component is being mutated — surfaces in reports + reflection prompts. */
291
+ target: string;
292
+ seedVariants: EvolvableVariant<P>[];
293
+ scenarioIds: string[];
294
+ reps: number;
295
+ generations: number;
296
+ populationSize: number;
297
+ /** Maximum concurrent score() calls. */
298
+ scoreConcurrency: number;
299
+ scoreAdapter: ScoreAdapter<P>;
300
+ mutateAdapter: MutateAdapter<P>;
301
+ /** Pareto objectives over `VariantAggregate`. Ordered by importance. */
302
+ objectives: Objective<VariantAggregate>[];
303
+ /** Optional weights for the scalar tie-break selector (by objective name). */
304
+ scalarWeights?: Record<string, number>;
305
+ /** Stop early if a generation produces no Pareto improvement. Default true. */
306
+ earlyStopOnNoImprovement?: boolean;
307
+ onProgress?: (event: PromptEvolutionEvent) => void;
308
+ /**
309
+ * Optional cache instance for memoising scored (variantId, scenarioId, rep)
310
+ * tuples. When a cache is provided, repeated trials are looked up in it and
311
+ * skip re-scoring. Cache keys are stable across runs.
312
+ */
313
+ cache?: TrialCache;
314
+ }
315
+ interface TrialCache {
316
+ get(key: string): TrialResult | undefined;
317
+ set(key: string, value: TrialResult): void;
318
+ }
319
+ declare class InMemoryTrialCache implements TrialCache {
320
+ private store;
321
+ get(key: string): TrialResult | undefined;
322
+ set(key: string, value: TrialResult): void;
323
+ size(): number;
324
+ clear(): void;
325
+ }
326
+ type PromptEvolutionEvent = {
327
+ type: 'generation-start';
328
+ generation: number;
329
+ populationSize: number;
330
+ } | {
331
+ type: 'trial-complete';
332
+ generation: number;
333
+ variantId: string;
334
+ scenarioId: string;
335
+ rep: number;
336
+ ok: boolean;
337
+ score: number;
338
+ cached: boolean;
339
+ } | {
340
+ type: 'generation-complete';
341
+ report: GenerationReport<unknown>;
342
+ } | {
343
+ type: 'converged';
344
+ generation: number;
345
+ reason: string;
346
+ };
347
+ interface GenerationReport<P = unknown> {
348
+ runId: string;
349
+ target: string;
350
+ generation: number;
351
+ variants: EvolvableVariant<P>[];
352
+ aggregates: VariantAggregate[];
353
+ /** Frontier candidates, sorted by descending crowding distance. */
354
+ paretoFrontIds: string[];
355
+ /** Scalar-best variant id — used for the single "winner" if callers want one. */
356
+ winnerId: string;
357
+ /** Trials that fed this generation (kept for downstream reporting). */
358
+ trials: TrialResult[];
359
+ }
360
+ interface PromptEvolutionResult<P = unknown> {
361
+ runId: string;
362
+ target: string;
363
+ generations: GenerationReport<P>[];
364
+ /** Best variant by scalar score in the final generation. */
365
+ bestVariant: EvolvableVariant<P>;
366
+ /** Best aggregate (matches bestVariant). */
367
+ bestAggregate: VariantAggregate;
368
+ }
369
+ declare function runPromptEvolution<P>(config: PromptEvolutionConfig<P>): Promise<PromptEvolutionResult<P>>;
370
+
371
+ /**
372
+ * Reflective mutation — primitives for trace-conditioned prompt rewriting.
373
+ *
374
+ * Used by `prompt-evolution.ts` (and any consumer running iterative
375
+ * improvement). Given a parent prompt + concrete trace evidence (top trials,
376
+ * bottom trials, missed expectations), produce an LLM-ready prompt that
377
+ * proposes targeted mutations — not blind rephrasings.
378
+ *
379
+ * Why this lives outside `prompt-evolution.ts`: any consumer that wants to
380
+ * run reflective rewriting WITHOUT the population/Pareto machinery can
381
+ * import these primitives directly.
382
+ *
383
+ * Quality bar (vs. naive "mutate this prompt"):
384
+ * - Show parent ↔ children diff, not just one variant
385
+ * - Quote specific missed goldens with their match phrases
386
+ * - Surface the model's actual emitted output side-by-side with what was expected
387
+ * - Quote concrete mutation primitives so the model has a vocabulary
388
+ */
389
+ interface TrialTrace {
390
+ /** Stable id for the trial — surfaces in the prompt for grounding. */
391
+ id: string;
392
+ /** Score the trial received on its primary metric. */
393
+ score: number;
394
+ /** Name of the input the agent was given (e.g., the fixture or scenario). */
395
+ inputName?: string;
396
+ /**
397
+ * Goldens / expectations this trial was tested against, with whether each
398
+ * was matched. The reflection prompt quotes the missed ones specifically.
399
+ */
400
+ expectations?: Array<{
401
+ id: string;
402
+ phrase: string;
403
+ matched: boolean;
404
+ }>;
405
+ /** Free-form text — what the agent actually emitted (e.g., findings, plan). */
406
+ emitted?: string;
407
+ /** Optional structured metrics (recall, precision, cost, latency). */
408
+ metrics?: Record<string, number>;
409
+ }
410
+ interface ReflectionContext {
411
+ /** What is being mutated — appears in the system prompt for orientation. */
412
+ target: string;
413
+ /** Current variant's payload — JSON-serialised for the prompt. */
414
+ parentPayload: unknown;
415
+ /** Best-performing trials this generation. */
416
+ topTrials: TrialTrace[];
417
+ /** Worst-performing trials this generation — the missed-golden source. */
418
+ bottomTrials: TrialTrace[];
419
+ /** How many children the mutator should propose. */
420
+ childCount: number;
421
+ /** Optional: domain-specific mutation primitives the model can pick from. */
422
+ mutationPrimitives?: string[];
423
+ }
424
+ declare const DEFAULT_MUTATION_PRIMITIVES: string[];
425
+ /**
426
+ * Build the LLM-ready reflection prompt. Output is plain text — pass it as
427
+ * the user message. The system message should be small and stable (e.g.
428
+ * "Output ONLY a JSON object matching the schema below.").
429
+ */
430
+ declare function buildReflectionPrompt(ctx: ReflectionContext): string;
431
+ interface ReflectionProposal {
432
+ label: string;
433
+ rationale: string;
434
+ payload: unknown;
435
+ }
436
+ declare function parseReflectionResponse(raw: string, maxProposals?: number): ReflectionProposal[];
437
+
438
+ /**
439
+ * Multi-shot optimization adapter.
440
+ *
441
+ * This is the canonical bridge between variable-length agent trajectories
442
+ * and `runPromptEvolution`. Apps provide four things:
443
+ *
444
+ * - variants: prompt/config/tool-policy candidates
445
+ * - runner: executes one full task trajectory for a variant
446
+ * - scorer: turns that trajectory into score + actionable side information
447
+ * - mutator: proposes new variants from top/bottom scored trials
448
+ *
449
+ * The adapter owns the boring but easy-to-get-wrong glue: stable seeds,
450
+ * score/cost objectives, error-to-trial conversion, ASI metric projection,
451
+ * and optional paired holdout gating via `HeldOutGate`.
452
+ */
453
+
454
+ type MultiShotSplit = 'search' | 'dev' | 'holdout';
455
+ type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
456
+ type MultiShotVariant<P = unknown> = EvolvableVariant<P>;
457
+ interface ActionableSideInfo {
458
+ /** Stable expectation/check id when available. */
459
+ expectationId?: string;
460
+ /** Human-readable diagnosis of what happened. */
461
+ message: string;
462
+ severity?: AsiSeverity;
463
+ /** Concrete trace excerpt, file path, tool call, screenshot id, etc. */
464
+ evidence?: string;
465
+ /** Prompt/tool/context surface likely responsible. */
466
+ responsibleSurface?: string;
467
+ /** Suggested fix in natural language. */
468
+ suggestion?: string;
469
+ /** Whether this expectation was satisfied. Defaults to false for ASI rows. */
470
+ matched?: boolean;
471
+ metadata?: Record<string, unknown>;
472
+ }
473
+ interface MultiShotTrace {
474
+ scenarioId: string;
475
+ /** Full turn/tool trace. Shape is intentionally app-owned. */
476
+ turns?: unknown[];
477
+ toolCalls?: unknown[];
478
+ artifacts?: unknown[];
479
+ /** Compact final output or summary used by reflection prompts. */
480
+ transcript?: string;
481
+ output?: unknown;
482
+ metadata?: Record<string, unknown>;
483
+ }
484
+ interface MultiShotRun {
485
+ trace: MultiShotTrace;
486
+ costUsd?: number;
487
+ durationMs?: number;
488
+ tokenUsage?: {
489
+ input?: number;
490
+ output?: number;
491
+ cached?: number;
492
+ };
493
+ metadata?: Record<string, unknown>;
494
+ }
495
+ interface MultiShotRunInput<P = unknown> {
496
+ variant: EvolvableVariant<P>;
497
+ scenarioId: string;
498
+ rep: number;
499
+ split: MultiShotSplit;
500
+ /** Stable paired seed for baseline/candidate comparisons. */
501
+ seed: number;
502
+ }
503
+ interface MultiShotRunner<P = unknown> {
504
+ run(input: MultiShotRunInput<P>): Promise<MultiShotRun> | MultiShotRun;
505
+ }
506
+ interface MultiShotScore {
507
+ /** Primary score in [0,1]. The adapter clamps for safety. */
508
+ score: number;
509
+ /** Pass/fail for top/bottom trial selection. Defaults to true. */
510
+ ok?: boolean;
511
+ costUsd?: number;
512
+ durationMs?: number;
513
+ metrics?: Record<string, number>;
514
+ asi?: ActionableSideInfo[];
515
+ /** Optional rich output shown to reflection mutators. */
516
+ emitted?: string;
517
+ metadata?: Record<string, unknown>;
518
+ }
519
+ interface MultiShotScorer<P = unknown> {
520
+ score(input: MultiShotRunInput<P> & {
521
+ run: MultiShotRun;
522
+ }): Promise<MultiShotScore> | MultiShotScore;
523
+ }
524
+ interface MultiShotTrialResult extends TrialResult {
525
+ split: MultiShotSplit;
526
+ seed: number;
527
+ trace?: MultiShotTrace;
528
+ asi?: ActionableSideInfo[];
529
+ emitted?: string;
530
+ metadata?: Record<string, unknown>;
531
+ }
532
+ interface MultiShotMutateAdapter<P = unknown> {
533
+ mutate(args: {
534
+ parent: EvolvableVariant<P>;
535
+ parentAggregate: VariantAggregate;
536
+ topTrials: MultiShotTrialResult[];
537
+ bottomTrials: MultiShotTrialResult[];
538
+ childCount: number;
539
+ generation: number;
540
+ }): Promise<EvolvableVariant<P>[]>;
541
+ }
542
+ interface MultiShotGateConfig<P = unknown> {
543
+ /** Search rows are optional, but enable HeldOutGate's overfit-gap check. */
544
+ searchScenarioIds?: string[];
545
+ holdoutScenarioIds: string[];
546
+ reps?: number;
547
+ gate: HeldOutGateConfig;
548
+ /** Convert scored trajectory runs into paper-grade RunRecords. */
549
+ toRunRecord(input: {
550
+ variant: EvolvableVariant<P>;
551
+ scenarioId: string;
552
+ rep: number;
553
+ split: RunSplitTag;
554
+ seed: number;
555
+ trial: MultiShotTrialResult;
556
+ }): RunRecord;
557
+ }
558
+ interface MultiShotOptimizationConfig<P = unknown> {
559
+ runId: string;
560
+ target: string;
561
+ seedVariants: EvolvableVariant<P>[];
562
+ searchScenarioIds: string[];
563
+ reps: number;
564
+ generations: number;
565
+ populationSize: number;
566
+ scoreConcurrency?: number;
567
+ runner: MultiShotRunner<P>;
568
+ scorer: MultiShotScorer<P>;
569
+ mutateAdapter: MultiShotMutateAdapter<P>;
570
+ objectives?: Objective<VariantAggregate>[];
571
+ scalarWeights?: Record<string, number>;
572
+ cache?: TrialCache;
573
+ earlyStopOnNoImprovement?: boolean;
574
+ seedBase?: number;
575
+ onProgress?: (event: PromptEvolutionEvent) => void;
576
+ gate?: MultiShotGateConfig<P>;
577
+ }
578
+ interface MultiShotGateResult {
579
+ decision: GateDecision;
580
+ candidateRuns: RunRecord[];
581
+ baselineRuns: RunRecord[];
582
+ }
583
+ interface MultiShotOptimizationResult<P = unknown> {
584
+ evolution: PromptEvolutionResult<P>;
585
+ /** Best candidate on the optimizer-visible search split. */
586
+ searchBestVariant: EvolvableVariant<P>;
587
+ searchBestAggregate: VariantAggregate;
588
+ /** Variant callers should actually ship after optional holdout gating. */
589
+ promotedVariant: EvolvableVariant<P>;
590
+ promotedAggregate: VariantAggregate;
591
+ /** Null when no gate was configured or the search-best candidate was the baseline. */
592
+ gate: MultiShotGateResult | null;
593
+ }
594
+ declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig<P>): Promise<MultiShotOptimizationResult<P>>;
595
+ declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
596
+ declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
597
+
598
+ export { type ActionableSideInfo as A, type TrialTrace as B, buildReflectionPrompt as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, crowdingDistance as F, type GateDecision as G, HeldOutGate as H, InMemoryTrialCache as I, defaultMultiShotObjectives as J, dominates as K, paretoFrontier as L, type MutateAdapter as M, paretoFrontierWithCrowding as N, type Objective as O, type ParetoResult as P, parseReflectionResponse as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, runMultiShotOptimization as U, type VariantAggregate as V, runPromptEvolution as W, scalarScore as X, trialTraceFromMultiShotTrial as Y, type TrialResult as a, type AsiSeverity as b, type Direction as c, type GateEvidence as d, type GenerationReport as e, type HeldOutGateConfig as f, type HeldOutGateRejectionCode as g, type MultiShotGateConfig as h, type MultiShotGateResult as i, type MultiShotMutateAdapter as j, type MultiShotOptimizationConfig as k, type MultiShotOptimizationResult as l, type MultiShotRun as m, type MultiShotRunInput as n, type MultiShotRunner as o, type MultiShotScore as p, type MultiShotScorer as q, type MultiShotSplit as r, type MultiShotTrace as s, type MultiShotTrialResult as t, type MultiShotVariant as u, type PromptEvolutionConfig as v, type PromptEvolutionEvent as w, type PromptEvolutionResult as x, type ReflectionProposal as y, type ScoreAdapter as z };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.20.11",
5
+ "version": "0.21.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",